Example #1
class XQUserInfoWeiboSpider(Spider):
    start_at = datetime.now()
    name = 'xq_user_info_weibo'
    logger = util.set_logger(name, LOG_FILE_USER_INFO)

    #handle_httpstatus_list = [404]

    def start_requests(self):
        start_url = "https://xueqiu.com/account/oauth/user/show.json?source=sina&userid="

        # get start url from MongoDB
        db = util.set_mongo_server()
        owner_ids = []

        for id in db.xq_cube_info.find({}, {'owner_id': 1, '_id': 0}):

            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))

        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            now_page_n = i
            owner_id = owner_ids[i]
            url = start_url + str(owner_id)

            # progress
            if i % 1000 == 0:
                self.logger.info(
                    '%s (%s / %s) %s%%' %
                    (owner_id, str(now_page_n), str(all_page_n),
                     str(round(float(now_page_n) / all_page_n * 100, 1))))
                #util.get_progress(now_page = i, all_page = all_page_n, logger = self.logger, spider_name = self.name, start_at = self.start_at)

            yield Request(url=url,
                          meta={'user_id': owner_id},
                          callback=self.parse)

    def parse(self, response):
        try:
            if response.status == 200 and str(
                    response.url) != "https://xueqiu.com/service/captcha":
                body = json.loads(response.body.decode('utf-8'))
                if 'id' in body:
                    item = XQItem()
                    content = {}
                    content['user_id'] = response.meta['user_id']
                    content['weibo_id'] = body['id']
                    item['url'] = response.url
                    item['content'] = content
                    item['fp'] = request_fingerprint(response.request)
                    yield item

            elif str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: UID %s' %
                                  (response.meta['user_id']))

        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' %
                             (str(ex), response.url))
Example #2
class ArtistSpider(Spider):
    #custom_settings = {'CONCURRENT_REQUESTS', 1}
    name = 'artist_id'
    #allow_domains = ['music.163.com']
    logger = util.set_logger(name, LOG_FILE_ARTIST)

    def start_requests(self):
        ls1 = [
            1001, 1002, 1003, 2001, 2002, 2003, 6001, 6002, 6003, 7001, 7002,
            7003, 4001, 4002, 4003
        ]  # id
        ls2 = [
            0, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
            81, 82, 83, 84, 85, 86, 87, 88, 89, 90
        ]  # initial
        for i in ls1:
            for j in ls2:
                start_url = 'https://music.163.com/#/discover/artist/cat?id=' + str(
                    i) + '&initial=' + str(j)
                yield Request(url=start_url,
                              callback=self.parse,
                              dont_filter=True)

    def parse(self, response):
        item = NetItem()
        hxs = response.body.decode("utf-8")
        paths = Selector(text=hxs).xpath(
            '//li//a[@class="nm nm-icn f-thide s-fc0"]').extract()
        for path in paths:
            ids = Selector(text=path).xpath('//a/@href').extract()[0]
            id = re.search("\?id=(.+)", ids).group(1)
            item['artist_id'] = id
            name = Selector(text=path).xpath('//a/text()').extract()[0]
            item['artist_name'] = name
            yield item
Example #3
class HotelaahSpider(Spider):
    name = 'CrawlerHotelaah'
    logger = util.set_logger(name, LOG_FILE_POLITICIAN)

    def start_requests(self):
        start_url = 'http://www.hotelaah.com/liaoning/dijishi.html'
        yield Request(url=start_url, callback=self.parse)

    def parse(self, response):
        citylist = response.xpath(
            '//td/a[not(contains(@href, "index")) and not(contains(@href, "dijishi")) and not(contains(@href, "ditu")) and not(contains(@href, "www"))]'
        ).extract()

        for city in citylist:
            item = HotelaahItem()
            city_name = Selector(text=city).xpath('//a/text()').extract()[0]
            item['city_name'] = city_name
            print(item)
            city_url = Selector(text=city).xpath('//a/@href').extract()[0]
            print(city_url)
            yield Request(url='http://www.hotelaah.com/liren/' + 'liaoning' +
                          '_' + city_url,
                          meta={'item': copy.deepcopy(item)},
                          callback=self.parse_mayor)

    def parse_mayor(self, response):
        pass
Example #4
    def __init__(self):
        # set logger
        self.logger = util.set_logger('pipeline', LOG_FILE_PIPELINE)

        # set up the MongoDB server
        self.db = util.set_mongo_server()

        # set up the Redis server
        self.redis_server = util.set_redis_server()
Example #5
class GubaReplyUserInfo(Spider):
    start_at = datetime.now()
    name = 'guba_reply_user_info'
    logger = util.set_logger(name, LOG_FILE_GUBA_REPLY_USER_INFO)

    def start_requests(self):
        db = util.set_mongo_server()
        reply_author_urls = []
        #replys = list(db.CrawlerGuba.aggregate([{'$project':{'_id': 0, 'reply': 1}} ,{'$unwind': '$reply'}]))
        for url in db.guba_stock_posts.find({}, {
                'reply.reply_author_url': 1,
                '_id': 0
        }):
            if 'reply' in url:
                for e in url['reply']:
                    if 'reply_author_url' in e:
                        reply_author_urls.append(e['reply_author_url'])
        reply_author_urls = list(set(reply_author_urls))
        all_page_n = len(reply_author_urls)
        for i in range(all_page_n):
            reply_author_url = reply_author_urls[i]
            url = reply_author_url

            if i % 1000 == 0:
                self.logger.info('%s / %s' % (str(i), str(all_page_n)))
                util.get_progress(all_page=all_page_n,
                                  logger=self.logger,
                                  spider_name=self.name,
                                  start_at=self.start_at)

            yield Request(url=url,
                          meta={'reply_author_url': reply_author_url},
                          callback=self.parse)

    def parse(self, response):
        try:
            if response.status == 200:
                hxs = Selector(response)
                reply_author_url = response.meta['reply_author_url']
                item = GubaItem()
                item['content'] = {}
                reply_author_name = hxs.xpath(
                    '//div[@class="taname"]/text()').extract()[0]
                item['content']['reply_author_name'] = reply_author_name.strip(
                )
                sign_up_time = hxs.xpath('//div[@id="influence"]').extract()[0]
                sign_up_time = re.search('999;">\((.+)\)<\/span',
                                         sign_up_time).group(1).strip()
                sign_up_time = datetime.strptime(sign_up_time, "%Y-%m-%d")
                item['content']['sign_up_time'] = sign_up_time
                item['content']['reply_author_url'] = reply_author_url
                yield item

        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' %
                             (str(ex), response.url))
Example #6
class citspider(Spider):
    name = "cit_info"
    logger = util.set_logger(name, LOG_FILE_CIT)

    def start_requests(self):
        start_url = "http://www.ccdi.gov.cn/special/zyxszt/"
        yield Request(url = start_url, callback = self.parse)

    def parse(self, response):
        inspt_urls = response.xpath('//div[@class="tith2"]//a/@href').extract()
        for inspt_url in inspt_urls:
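            # hrefs here are relative (e.g. "./xxxx/"); drop the dots and join them onto the section root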
            inspt_url = "http://www.ccdi.gov.cn/special/zyxszt" + re.sub("\.", "", inspt_url)
            yield Request(url = inspt_url, callback = self.parse_list, meta = {'inspt_url' : inspt_url})

    def parse_list(self, response):
        post_nums = response.xpath('//div[@class="page"]/script[@type = "text/javascript"]/text()').extract()[0]
        # the page count is the first number inside the parentheses of the pagination script
        post_num = re.search(r"\((\d+),.+\)", post_nums).group(1)
        inspt_url = response.meta['inspt_url']
        for i in range(1, int(post_num) + 1):
            if i == 1:
                page_url = inspt_url + "index.html"
            else:
                page_url = inspt_url + "index_" + str(i - 1) + ".html"
            #print(page_url)
            yield Request(url = page_url, callback = self.parse_page, meta = {'inspt_url': inspt_url})

    def parse_page(self, response):
        #pass
        post_urls = response.xpath('//li[@class="fixed"]//a/@href').extract()
        for post_url in post_urls:
            inspt_url = response.meta['inspt_url']
            post_url = inspt_url + re.search("\.\/(.+)", post_url).group(1)
            yield Request(url = post_url, callback = self.parse_post)

    def parse_post(self, response):
        item = CitItem()
        inspt = response.xpath('//div[@class="fl"]/span').extract()[0]
        inspt = re.search("专题&gt;(.+)<\/span>", inspt).group(1)
        inspt_title = re.search("(.+)&gt;(.+)", inspt).group(1)
        item['inspt_title'] = inspt_title
        inspt_tag = re.search("(.+)&gt;(.+)", inspt).group(2)
        item['inspt_tag'] = inspt_tag

        title = response.xpath('//h2[@class="tit"]/text()').extract()[0].strip()
        item['title'] = title
        
        time = response.xpath('//em[@class="e2"]/text()').extract()[0].strip()
        time = re.search("发布时间:(.+)", time).group(1).strip()
        item['time'] = time
        
        content = response.xpath('//p[@align="justify"]/text()').extract()
        item['content'] = content
        yield item
Example #7
class sodaspider(Spider):
    name = "soda_green"
    logger = util.set_logger(name, LOG_FILE_SODA)

    def start_requests(self):
        start_url = "https://music.163.com/#/artist/album?id=12707&limit=48"
        yield Request(url=start_url, callback=self.parse)

    def parse(self, response):
        print(response)
        pass
Example #8
class XQUserInfo(Spider):
    start_at=datetime.now()
    name = 'xq_user_cube'
    logger = util.set_logger(name, LOG_FILE_USER_STOCK)
    #handle_httpstatus_list = [404]

    def start_requests(self):
        #start_url="https://xueqiu.com/stock/portfolio/stocks.json?size=5000&tuid="
        start_url = "https://stock.xueqiu.com/v5/stock/portfolio/stock/list.json?size=10000&category=3&pid=-120&uid="

        db = util.set_mongo_server()
        owner_ids = []
        for id in db.xq_cube_info.find({}, {'owner_id': 1, '_id': 0}):
            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))

        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            now_page_n = i
            owner_id = owner_ids[i]
            url = start_url+str(owner_id)

            # progress
            if i%1000==0:
                self.logger.info('%s (%s / %s) %s%%' % (owner_id, str(now_page_n), str(all_page_n), str(round(float(now_page_n) / all_page_n * 100, 1))))

            yield Request(url = url, meta = {'user_id': owner_id}, callback = self.parse)

    def parse(self, response):
        try:
            if response.status == 200 and str(response.url) != "https://xueqiu.com/service/captcha":
                content = json.loads(response.body.decode('utf-8'))
                item = XQItem()
                content['user_id'] = response.meta['user_id']
                item['url'] = response.url
                item['content'] = content
                item['fp'] = request_fingerprint(response.request)
                yield item

            if str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: User ID %s' % (response.meta['user_id']))

        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' % (str(ex), response.url))
Example #9
class DTRank(Spider):
    name = 'DTRank'
    logger = util.set_logger(name, LOG_FILE_DTRank)

    def start_requests(self):
        page_num = 87
        for i in range(1, page_num + 1):
            urls = 'http://data.10jqka.com.cn/market/jgzy/field/enddate/order/desc/page/' + str(
                i)
            yield Request(url=urls, callback=self.parse)

    def parse(self, response):
        paths = response.xpath('//tbody/tr').extract()
        item = DTRankItem()
        for path in paths:
            date = Selector(
                text=path).xpath('//td[@class = "tc cur"]/text()').extract()[0]
            item['date'] = date

            stock_symbol = Selector(text=path).xpath(
                '//td[@class="tc"][position() = 1]/a[@target = "_blank"]/text()'
            ).extract()[0]
            item['stock_symbol'] = stock_symbol

            stock_name = Selector(text=path).xpath(
                '//td[@class="tc"][position() = 2]/a[@target = "_blank"]/text()'
            ).extract()[0]
            # stock_name = stock_name.decode("UTF-8")
            item['stock_name'] = stock_name

            buy_inst_num = Selector(text=path).xpath(
                '//tr/td[@class="c-rise "]/text()').extract()[0]
            item['buy_inst_num'] = buy_inst_num

            sell_inst_num = Selector(text=path).xpath(
                '//tr/td[@class="c-fall "]/text()').extract()[0]
            item['sell_inst_num'] = sell_inst_num

            rank_reason = Selector(
                text=path).xpath('//tr/td[@class = "tl "]/text()').extract()[0]
            # rank_reason = rank_reason.decode("UTF-8")
            item['rank_reason'] = rank_reason

            yield item
Example #10
class PoliticianSpider(Spider):
    name = 'CrawlerPolitician'
    logger = util.set_logger(name, LOG_FILE_POLITICIAN)

    def start_requests(self):
        with open("C:/Code/Testing/scrt.csv") as Polist:
            lines = set(Polist.readlines())
        for line in lines:
            line = line.rstrip('\n')
            start_url = "http://www.chinavitae.com/biography/"
            yield Request(url=start_url + line + "/career",
                          callback=self.parse)

    def parse(self, response):
        hxs = Selector(response)
        item = PoliticianItem()
        name = hxs.xpath('//div[@class="bioName"]/text()').extract()[0].strip()
        item['name'] = name
        try:  #  borndate
            birth = response.xpath(
                '//div[@class="bioDetails"]//text()').extract()
            if birth:
                birth = ' '.join(birth).strip()
                borndate = re.findall('\d+', birth)[0]
                item['born'] = borndate
        except Exception as ex:
            print("With no born:" + response.url)

        careers = hxs.xpath('//tr[@valign="top"]').extract()
        for career in careers:
            duration = re.search('<td width="90" class="cdCell">(.+)<\/td>',
                                 career)
            if duration:
                duration = re.sub("—", "-", duration.group(1))
                item['duration'] = duration
            occupation = re.search('<strong>(.+)<\/strong>', career)
            if occupation:
                item['occupation'] = occupation.group(1).strip()
            branches = Selector(text=career).xpath(
                '//a[contains(@class,"link11")]/text()').extract()[0].strip()
            if branches:
                item['branch'] = branches
            yield item
Example #11
class XQCubeRBSpider(Spider):
    start_at=datetime.now()
    name = 'xq_cube_rb'
    logger = util.set_logger(name, LOG_FILE_CUBE_RB)
    handle_httpstatus_list = [400]

    cube_type = 'SP'
    # time of the last maintenance run; updated before each crawl
    start_time = time.strptime("2020-01-01", "%Y-%m-%d")

    def start_requests(self):
        zh_url = 'https://xueqiu.com/cubes/rebalancing/history.json?count=50&page=1&cube_symbol='
        sp_url = 'https://xueqiu.com/service/tc/snowx/PAMID/cubes/rebalancing/history?count=20&page=1&cube_symbol='
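        # ZH and SP cubes expose their rebalancing history through different endpoints,
        # so the request URL is chosen below based on cube_type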


        # get start url from MongoDB
        db = util.set_mongo_server()
        symbols = []

        for s in db.xq_cube_info.find({'cube_type':self.cube_type}, {'symbol': 1, '_id': 0}):
            symbols.append(s['symbol'])
        symbols = list(set(symbols))

        # iterate each symbol
        all_page_n = len(symbols)
        for i in range(all_page_n):
            symbol = symbols[i].strip()
            now_page_n = i

            if self.cube_type == 'SP':
                url = sp_url + symbol
            elif self.cube_type == 'ZH':
                url = zh_url + symbol

            # progress log
            if i%500==0:
                self.logger.info('%s (%s / %s) %s%%' % (symbol, str(now_page_n), str(all_page_n), str(round(float(now_page_n) / all_page_n * 100, 1))))

            yield Request(url = url,
                      callback = self.parse, meta = {'cube_type':self.cube_type, 'symbol':symbol,'page':1})


    def parse(self, response):
        try:
            if response.status == 200 and str(response.url) != "https://xueqiu.com/service/captcha":
                
                cube_type =  response.meta['cube_type']
                symbol =  response.meta['symbol']
                page = response.meta['page']

                body = re.sub('[\s]', '', response.body.decode('utf-8'))
                body = json.loads(body)

                # default maxPage to 1 so the paging check below cannot hit an unbound name
                max_page = body.get('maxPage', 1)

                if body['list']:
                    page_first_time = body['list'][0]['updated_at']
                    page_first_time = time.gmtime(page_first_time / 1000)
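                    # updated_at is a millisecond epoch; the list is assumed newest-first, so stop
                    # once the first entry on the page predates the last maintenance time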
                    if page_first_time < self.start_time:
                        return
                    else:
                        for i in body['list']:
                            item = XQItem()
                            # i is of type dict
                            i['cube_symbol'] = symbol
                            i['cube_type'] = cube_type
                            item['url'] = response.url
                            item['content'] = i
                            item['fp'] = request_fingerprint(response.request)
                            yield item

                    # Second + page
                    if page < max_page:
                        page = page + 1
                        page_string =  '&page=' + str(page)
                        url = re.sub(r'&page=(\d+)', page_string, response.url)
                        yield Request(url = url, meta = {'cube_type':cube_type, 'symbol':symbol, 'page':page}, callback = self.parse)
            elif str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: %s' % (response.meta['symbol']))

        except Exception as ex:
            self.logger.error('Parse Exception: %s %s' % (str(ex), response.url))
Example #12
class XQUserFensi(Spider):
    start_at=datetime.now()
    name = 'xq_user_fans'
    logger = util.set_logger(name, LOG_FILE_USER_FENSI)
    #handle_httpstatus_list = [404]
    cube_type = 'SP'


    def start_requests(self):
        start_url="http://xueqiu.com/friendships/followers.json?size=1000&uid="

        # get start url from MongoDB
        db = util.set_mongo_server()
        owner_ids = []
        for id in db.xq_cube_info.find({'cube_type':self.cube_type}, {'owner_id': 1, '_id': 0}):
            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))

        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            now_page_n = i
            owner_id = owner_ids[i]
            url = start_url+str(owner_id)

            # progress
            if i%1000==0:
                self.logger.info('%s (%s / %s) %s%%' % (owner_id, str(now_page_n), str(all_page_n), str(round(float(now_page_n) / all_page_n * 100, 1))))
                #util.get_progress(now_page = i, all_page = all_page_n, logger = self.logger, spider_name = self.name, start_at = self.start_at)

            yield Request(url = url,
                        meta = {'user_id': owner_id},
                        callback = self.parse)

    def parse(self, response):
        try:
            if response.status == 200 and str(response.url) != "https://xueqiu.com/service/captcha":
                content = json.loads(response.body.decode('utf-8'))
                if content['maxPage']:
                    max_page = content['maxPage']

                    # First page, use parse_gz
                    for item in self.parse_gz(response = response):
                        yield item

                    # Second + page, use parse_gz
                    if max_page > 1:
                        for i in range(2, max_page + 1):
                            url = response.url + '&pageNo=' + str(i)
                            yield Request(url = url,
                                          meta = {'user_id': response.meta['user_id']},
                                          callback = self.parse_gz)

            if str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: User ID %s' % (response.meta['user_id']))

        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' % (str(ex), response.url))

    def parse_gz(self, response):
        try:
            body = json.loads(response.body.decode('utf-8'))
            content = {}
            content['user_id'] = response.meta['user_id']
            content['count'] = body['count']
            content['anonymous_count'] = body['anonymous_count']

            users = []
            for user in body['followers']:
                users.append(user['id'])
            content['fans'] = users
            content['lastcrawl'] = int(time.time())

            item = XQItem()
            item['url'] = response.url
            item['content'] = content
            item['fp'] = request_fingerprint(response.request)
            yield item

        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' % (str(ex), response.url))
Example #13
class GBFuture(Spider):
    name = 'GBFuture'
    logger = util.set_logger(name, LOG_FILE_GBFuture)

    def start_requests(self):
        start_url = "http://guba.eastmoney.com/remenba.aspx?type=2"
        yield Request(url=start_url, callback=self.parse)

    def parse(self, response):
        paths = response.xpath('//div[@class = "gbboxb"]//li/a').extract()
        paths = set(paths)
        # print(paths)
        for path in paths:
            item = MntItem()
            item['content'] = {}
            semi_url = Selector(text=path).xpath('//a/@href').extract()[0]
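            # hrefs look like "list,<bar>.html"; keep the part up to the dot and turn the dot into
            # a comma so "f.html" / "f_<n>.html" can be appended for the paged list URLs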
            semi_url = re.match('(.+)\.', semi_url).group(0)
            semi_url = re.sub('\.', ',', semi_url)
            bar_name = Selector(text=path).xpath('//a/text()').extract()[0]
            item['content']['bar_name'] = bar_name
            url = 'http://guba.eastmoney.com/' + semi_url + 'f.html'
            yield Request(url=url,
                          callback=self.parse_page,
                          meta={
                              'page': 1,
                              'semi_url': copy.deepcopy(semi_url),
                              'item': copy.deepcopy(item)
                          })

    def parse_page(self, response):  # determine how many pages need to be crawled
        page = response.meta['page']
        item = response.meta['item']
        semi_url = response.meta['semi_url']
        dates = response.xpath(
            '//div[contains(@class, "normal_post")]//span[contains(@class, "l5")]/text()'
        ).extract()
        md = date.today().strftime("%m-%d")

        tag_date = []  # keep the first five characters of each timestamp (the "MM-DD" part)
        for d in dates:
            tag_date.append(d[:5])
        tag_date = set(tag_date)

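        # keep paging forward while today's date still appears on the list page; once it no
        # longer does, re-request pages 1..page-1 and let parse_post keep only today's posts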
        if md in tag_date:
            yield Request(url='http://guba.eastmoney.com/' + semi_url + 'f_' +
                          str(page + 1) + '.html',
                          callback=self.parse_page,
                          dont_filter=True,
                          meta={
                              'page': page + 1,
                              'item': item,
                              'semi_url': semi_url
                          })

        elif md not in tag_date:
            for i in range(1, page):
                yield Request(url='http://guba.eastmoney.com/' + semi_url +
                              'f_' + str(i) + '.html',
                              callback=self.parse_post,
                              dont_filter=True,
                              meta={
                                  'item': item,
                                  'md': md
                              })

    def parse_post(self, response):
        item = response.meta['item']
        md = response.meta['md']
        hxs = response.xpath(
            '//div[contains(@class, "normal_post")]').extract()
        for hx in hxs:
            post_date = Selector(text=hx).xpath(
                '//span[contains(@class, "l5")]/text()').extract()[0]
            post_tag_date = post_date[:5]
            if post_tag_date == md:
                try:
                    post_date = Selector(text=hx).xpath(
                        '//span[contains(@class, "l5")]/text()').extract()[0]
                    post_date = str(date.today().year) + "-" + post_date
                    post_date = datetime.strptime(post_date, "%Y-%m-%d %H:%M")
                    item['content']['post_date'] = post_date
                except:
                    pass
                try:
                    post_title = Selector(text=hx).xpath(
                        '//span[contains(@class, "l3")]/a/@title').extract()[0]
                    item['content']['post_title'] = post_title
                except:
                    pass
                try:
                    reply_num = Selector(text=hx).xpath(
                        '//span[contains(@class, "l2")]/text()').extract()[0]
                    item['content']['reply_num'] = reply_num
                except:
                    pass
                try:
                    read_num = Selector(text=hx).xpath(
                        '//span[contains(@class, "l1")]/text()').extract()[0]
                    item['content']['read_num'] = read_num
                except:
                    pass
                try:
                    post_author = Selector(text=hx).xpath(
                        '//span[contains(@class, "l4")]/a[@target = "_blank"]/font/text()'
                    ).extract()[0]
                    item['content']['post_author'] = post_author
                except:
                    pass
                try:
                    author_title = Selector(text=hx).xpath(
                        '//span[contains(@class, "l4")]/a/em/@title').extract(
                        )[0]
                    item['content']['author_title'] = author_title
                except:
                    pass
                yield item
Example #14
# -*- coding: utf-8 -*-
import random
import redis
import time
import json
import base64
from crawler.settings import *
from scrapy.downloadermiddlewares.retry import RetryMiddleware
from twisted.web._newclient import ResponseNeverReceived
from twisted.python.failure import Failure
from twisted.internet.error import TimeoutError, ConnectionRefusedError, ConnectError, TCPTimedOutError, ConnectionDone
from datetime import datetime
from scrapy import signals
from crawler.spiders import util

logger = util.set_logger("http_proxy_middleware", LOG_FILE_MIDDLEWARE)


class RandomRequestHeaders(object):
    """Randomly rotate user agents based on a list of predefined ones"""
    def __init__(self, agents, cookies):
        self.agents = agents
        self.cookies = cookies

    @classmethod
    def from_crawler(cls, crawler):
        ua = crawler.settings.getlist('USER_AGENTS')
        ck = crawler.settings.getlist('COOKIES')
        return cls(ua, ck)

    def process_request(self, request, spider):
Example #15
class GBFuture(Spider):
    name = 'GBFuturetest'
    logger = util.set_logger(name, LOG_FILE_GBFuture)

    def start_requests(self):
        start_url = "http://guba.eastmoney.com/list,rb,f_1.html"
        item = MntItem()
        item['content'] = {}
        item['content']['bar_name'] = "螺纹钢吧"
        yield Request(url=start_url,
                      callback=self.parse,
                      dont_filter=True,
                      meta={
                          'page': 1,
                          'item': item
                      })

    def parse(self, response):  # determine how many pages need to be crawled
        page = response.meta['page']
        item = response.meta['item']
        dates = response.xpath(
            '//div[contains(@class, "normal_post")]//span[contains(@class, "l5")]/text()'
        ).extract()
        md = date.today().strftime("%m-%d")

        tag_date = []  # keep the first five characters of each timestamp (the "MM-DD" part)
        for d in dates:
            tag_date.append(d[:5])
        tag_date = set(tag_date)

        if md in tag_date:
            yield Request(url='http://guba.eastmoney.com/list,rb,f_' +
                          str(page + 1) + '.html',
                          callback=self.parse,
                          dont_filter=True,
                          meta={
                              'page': page + 1,
                              'item': item
                          })

        elif md not in tag_date:
            for i in range(1, page):
                yield Request(url='http://guba.eastmoney.com/list,rb,f_' +
                              str(i) + '.html',
                              callback=self.parse_page,
                              dont_filter=True,
                              meta={
                                  'item': item,
                                  'md': md
                              })

    def parse_page(self, response):
        item = response.meta['item']
        md = response.meta['md']
        hxs = response.xpath(
            '//div[contains(@class, "normal_post")]').extract()
        for hx in hxs:
            post_date = Selector(text=hx).xpath(
                '//span[contains(@class, "l5")]/text()').extract()[0]
            post_tag_date = post_date[:5]
            if post_tag_date == md:
                try:
                    post_date = Selector(text=hx).xpath(
                        '//span[contains(@class, "l5")]/text()').extract()[0]
                    post_date = str(date.today().year) + "-" + post_date
                    post_date = datetime.strptime(post_date, "%Y-%m-%d %H:%M")
                    item['content']['post_date'] = post_date
                except:
                    pass
                try:
                    post_title = Selector(text=hx).xpath(
                        '//span[contains(@class, "l3")]/a/@title').extract()[0]
                    item['content']['post_title'] = post_title
                except:
                    pass
                try:
                    reply_num = Selector(text=hx).xpath(
                        '//span[contains(@class, "l2")]/text()').extract()[0]
                    item['content']['reply_num'] = reply_num
                except:
                    pass
                try:
                    read_num = Selector(text=hx).xpath(
                        '//span[contains(@class, "l1")]/text()').extract()[0]
                    item['content']['read_num'] = read_num
                except:
                    pass
                try:
                    post_author = Selector(text=hx).xpath(
                        '//span[contains(@class, "l4")]/a[@target = "_blank"]/font/text()'
                    ).extract()[0]
                    item['content']['post_author'] = post_author
                except:
                    pass
                try:
                    author_title = Selector(text=hx).xpath(
                        '//span[contains(@class, "l4")]/a/em/@title').extract(
                        )[0]
                    item['content']['author_title'] = author_title
                except:
                    pass
                yield item
Example #16
class ChinaVitaeSpider(Spider):
    name = 'CrawlerChinaVitae'
    logger = util.set_logger(name, LOG_FILE_CHINAVITAE)

    def start_requests(self):
        start_url = 'http://www.chinavitae.com/biography_browse.php?l='
        for i in range(97, 123):
            start_urls = start_url + chr(i)
            yield Request(url=start_urls, callback=self.parse)

    #crawl every name and of vitae
    def parse(self, response):
        hxs = Selector(response)
        names = hxs.xpath('//a[@class="link11"]').extract()
        if names:
            for name in names:
                # build a fresh item per person so the requests scheduled below do not
                # all mutate the same shared item instance
                item = ChinaVitaeItem()
                item['content'] = {}
                item['content']['careers'] = []
                name_pinyin = Selector(text=name).xpath(
                    '//a[@class="link11"]/text()').extract()[0]
                item['content']['name_pinyin'] = name_pinyin
                name_url = Selector(text=name).xpath('//a/@href').extract()[0]
                name_url = "http://www.chinavitae.com" + name_url + "/full"
                yield Request(url=name_url,
                              meta={'item': item},
                              callback=self.parse_biog)

    def parse_biog(self, response):
        item = response.meta['item']

        try:  # Some of biography have no item with Chinese name
            name = response.xpath(
                '//span[@style="font-family:Courier New, Courier, mono;"]/text()'
            ).extract()[0]
            item['content']['name'] = name
        except Exception as ex:
            print("With no Chinese name: " + response.url)

        try:  # title of the biography
            biotitle = response.xpath(
                '//div[@class="bioTitle"]/text()').extract()[0]
            item['content']['biotitle'] = biotitle
        except Exception as ex:
            print("With no biotitle: " + response.url)

        try:  # whole of biography
            bigph = response.xpath(
                '//div[@id="dataPanel"]/p').extract()[0].strip()
            bigph = re.sub('\r\n', ' ', bigph)
            bigph = re.sub('<br>', '', bigph)
            bigph = re.search('<p>(.+)<\/p>', bigph).group(1)
            item['content']['biography'] = bigph
        except Exception as ex:
            print("With no bigph:" + response.url)

        try:  #  borndate
            birth = response.xpath(
                '//div[@class="bioDetails"]//text()').extract()
            if birth:
                birth = ' '.join(birth).strip()
                borndate = re.findall('\d+', birth)[0]
                item['content']['borndate'] = borndate
        except Exception as ex:
            print("With no borndate:" + response.url)

        try:  # birthplace
            birth = response.xpath(
                '//div[@class="bioDetails"]//text()').extract()
            if birth:
                birth = ' '.join(birth).strip()
                birthplace = re.search('Birthplace:(.+)', birth).group(1)
                birthplace = re.sub(' ', '', birthplace)
                item['content']['birthplace'] = birthplace
        except Exception as ex:
            print("With no birthplace:" + response.url)

        try:  # Careers
            careers = response.xpath('//tr[@valign="top"]').extract()
            career = {}
            for c in careers:
                duration = re.search(
                    '<td width="90" class="cdCell">(.+)<\/td>', c)
                if duration:
                    duration = re.sub("—", "-", duration.group(1))
                    career['duration'] = duration
                occupation = re.search('<strong>(.+)<\/strong>', c)
                if occupation:
                    career['occupation'] = occupation.group(1)
                branch = Selector(text=c).xpath(
                    '//a[contains(@class,"link11")]/text()').extract()
                if branch:
                    career['branch'] = branch

                item['content']['careers'].append(copy.deepcopy(career))
        except Exception as ex:
            print("With no careers:" + response.url)

        yield item
Example #17
class XQUserStatus(Spider):
    start_at = datetime.now()
    name = 'xq_user_cmt'
    logger = util.set_logger(name, LOG_FILE_USER_STATUS)

    #handle_httpstatus_list = [404]

    def start_requests(self):
        start_url = "https://xueqiu.com/v4/statuses/user_timeline.json?&count=20&user_id="

        ## get start url from MongoDB
        db = util.set_mongo_server()
        owner_ids = []
        for id in db.xq_cube_info.find({}, {'owner_id': 1, '_id': 0}):
            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))

        #owner_ids = ["1001223822"]

        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            owner_id = owner_ids[i]
            now_page_n = i
            url = start_url + str(owner_id)

            # progress
            if i % 1000 == 0:
                self.logger.info(
                    '%s (%s / %s) %s%%' %
                    (owner_id, str(now_page_n), str(all_page_n),
                     str(round(float(now_page_n) / all_page_n * 100, 1))))
            #util.get_progress(all_page = all_page_n, logger = self.logger, spider_name = self.name, start_at = self.start_at)

            yield Request(url=url,
                          meta={'user_id': owner_id},
                          callback=self.parse)

    def parse(self, response):
        try:
            if response.status == 200 and str(
                    response.url) != "https://xueqiu.com/service/captcha":
                body = json.loads(response.body.decode('utf-8'))
                if body['maxPage']:
                    max_page = body['maxPage']
                    page = body['page']

                    # First page
                    if page == 1:
                        content = {}
                        content['user_id'] = response.meta['user_id']
                        content['statuses'] = body['statuses']
                        content['total'] = body['total']
                        content['max_page'] = body['maxPage']
                        content['page'] = body['page']

                        item = XQItem()
                        item['content'] = content
                        yield item

                    # Second + page
                    if max_page > 1:
                        for i in range(2, max_page + 1):
                            url = response.url + '&page=' + str(i)
                            yield Request(
                                url=url,
                                meta={'user_id': response.meta['user_id']},
                                callback=self.parse_status)

            elif str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: User ID %s' %
                                  (response.meta['user_id']))

        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' %
                             (str(ex), response.url))

    def parse_status(self, response):
        try:
            body = json.loads(response.body.decode('utf-8'))
            content = {}
            content['user_id'] = response.meta['user_id']
            content['statuses'] = body['statuses']
            content['total'] = body['total']
            content['max_page'] = body['maxPage']
            content['page'] = body['page']

            item = XQItem()
            item['content'] = content
            item['fp'] = request_fingerprint(response.request)
            yield item

        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' %
                             (str(ex), response.url))
Example #18
class XQCubeInfoSpider(Spider):
    start_at = datetime.now()
    name = 'xq_cube_info'
    logger = util.set_logger(name, LOG_FILE_CUBE_INFO)
    handle_httpstatus_list = [404]
    website_possible_httpstatus_list = [404]

    cube_type = 'SP'

    def start_requests(self):
        start_url = "https://xueqiu.com/p/"

        # For ZH, ids range from 1,000,000 to 2,000,000 (XQ-1803, ZH: 1320315 cubes; SP: 1354210 (33895) cubes)
        # For SP, ids range from 1,000,000 to 1,100,000
        start_page = 1100000
        end_page = 1500000

        # iterate each page
        all_page_n = end_page - start_page + 1
        for i in range(start_page, end_page):
            now_page_n = i - start_page

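            # cube symbols are the type prefix plus a zero-padded id: ZH ids use 6 digits below
            # 1,000,000 and 7 digits from 1,000,000 onward, while SP ids always use 7 digits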
            if self.cube_type == 'ZH':
                if i <= 999999:
                    symbol = "ZH" + str(i).zfill(6)
                    url = start_url + symbol
                elif i >= 1000000:
                    symbol = "ZH" + str(i).zfill(7)
                    url = start_url + symbol
            elif self.cube_type == 'SP':
                symbol = "SP" + str(i).zfill(7)
                url = start_url + symbol

            # custom progress log
            if i % 500 == 0:
                self.logger.info(
                    '%s (%s / %s) %s%%' %
                    (symbol, str(now_page_n), str(all_page_n),
                     str(round(float(now_page_n) / all_page_n * 100, 1))))
                #util.get_progress(all_page = all_page_n, logger = self.logger, spider_name = self.name, start_at = self.start_at)

            yield Request(url=url,
                          callback=self.parse,
                          meta={'cube_type': self.cube_type})

    def parse(self, response):
        try:
            #print(response.url)
            #print(response.status)
            if response.status == 200 and str(
                    response.url) != "https://xueqiu.com/service/captcha":
                item = XQItem()
                hxs = Selector(response)
                info_script = ''.join(
                    hxs.xpath(
                        '//script[contains(., "cubeInfo")]//text()').extract())
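                # the cube metadata is embedded in the page JavaScript as "SNB.cubeInfo = {...};";
                # strip whitespace, pull out the JSON object and parse it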
                info_script = re.sub("[\s]", "", info_script)
                m = re.search("SNB.cubeInfo=({\S+?});SNB.cube", info_script)
                if m:
                    content = json.loads(m.group(1).strip())
                    content['lastcrawl'] = int(time.time())
                    content['cube_type'] = response.meta['cube_type']
                    item['content'] = content
                    item['fp'] = request_fingerprint(response.request)
                    item['url'] = response.url
                    yield item
            # a 404 that is not the captcha page means the cube symbol does not exist; write
            # these URLs to Redis as well so they are not crawled again next time
            elif response.status == 404 and str(
                    response.url) != "https://xueqiu.com/service/captcha":
                item = XQItem()
                item['fp'] = request_fingerprint(response.request)
                item['url'] = response.url
                yield item
                #self.logger.warn('404: %s' % (str(response.url)))

            elif str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: %s' % (response.url))

        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' %
                             (str(ex), response.url))
Example #19
class SinaNewsSpider(Spider):
    name = "CrawlerSinaNews"
    logger = util.set_logger(name, LOG_FILE_SINANEWS)
    handle_httpstatus_list = [404]

    def start_requests(self):
        # news category codes for col: 90 domestic, 91 international, 92 society, 94 sports, 95 entertainment, 93 military, 96 technology, 97 finance, 98 stock market, 99 US stocks
        col = "90"
        start_url = "http://roll.news.sina.com.cn/interface/rollnews_ch_out_interface.php?col=%s&num=5010&date=" % (col)    
        start_date = datetime.strptime("2010-01-01", "%Y-%m-%d").date()
        end_date = datetime.strptime("2018-09-01", "%Y-%m-%d").date()
        url_date = []
        
        s_d = start_date
        c_d = s_d.strftime("%Y-%m-%d")
        url_date.append(c_d)
        while s_d < end_date:
            s_d = s_d + timedelta(days=1)
            c_d = s_d.strftime("%Y-%m-%d")
            url_date.append(c_d)
        
        for i in range(len(url_date)):
            url = start_url + url_date[i]
            # print a progress log for every 15 days of dates crawled
            if i%15 == 0:
                self.logger.info("Now crawl: " + url_date[i])
            yield Request(url = url, callback = self.parse)
    
    def parse(self, response):
        body = re.sub(r"[\s]", "", response.body.decode("gbk"))
        m = re.search(r"varjsonData=({\S+?});", body)
        if m:
            js = demjson.decode(m.group(1).strip())
            for i in js['list']:
                # build a fresh item per news entry so the requests scheduled below do not
                # all share (and overwrite) the same item instance
                item = SinaNewsItem()
                item['content'] = i
                if item['content']['time']:
                    item['content']['time'] = datetime.fromtimestamp(int(item['content']['time']))
                url = i['url']
                yield Request(url = url, callback = self.parse_content, meta = {'item' : item})
    
    # crawl the article body
    def parse_content(self, response):
        if response.status == 200:
            # skip the video, blog, passport.weibo, guba, slide and survey subdomains
            if re.search("://.*video|://blog|://passport.weibo|://guba|://slide|://survey", response.url):
                return
            item = response.meta['item']
            # older pages may be encoded as gbk/gb2312 while newer pages are usually utf8, so try each encoding in turn
            try:
                filter_body = response.body.decode('utf8')
            except:
                try:
                    filter_body = response.body.decode("gbk")
                except:
                    try:
                        filter_body = response.body.decode("gb2312")
                    except Exception as ex:
                        print("Decode webpage failed: " + response.url)
                        return

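            # strip non-standard uppercase tags (presumably leftovers in older Sina pages) before
            # handing the body to the selector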
            filter_body = re.sub('<[A-Z]+[0-9]*[^>]*>|</[A-Z]+[^>]*>', '', filter_body)
            response = response.replace(body = filter_body)
            hxs =Selector(response)

            # parse news_id
            # news_id can only come from the meta tag name="publishid"; if it is missing, skip this news item
            news_id = hxs.xpath('//head/*[@name="publishid"]/@content').extract()
            if news_id:
                item['content']['news_id'] = news_id[0]
            else:
                self.logger.info("No 'news_id'! Skip: %s" % (response.url))
                return

            # parse cmt_id
            # cmt_id may come from the meta tag name="comment" (newer pages) or from a regex over the raw HTML (older pages)
            cid = hxs.xpath('//head/*[@name="comment"]/@content').extract()
            if cid:
                # newer pages mostly use this format
                d = cid[0].split(":")
                cmt_id = {"channel":d[0], "comment_id":d[1]}
                item['content']['cmt_id'] = cmt_id
                #print("cmt_id 1")
            else:
                # older pages mostly use this format
                filter_body = re.sub("[\s]", "", filter_body)
                m = re.search('''channel:["'](.+?)["'],.*newsid:["'](.+?)['"]''', filter_body)
                if m:
                    cmt_id = {"channel":m.group(1), "comment_id":m.group(2)}
                    item['content']['cmt_id'] = cmt_id
                    #print("cmt_id 2")
                else:
                    # a few special cases
                    m = re.search('channel=(.+?)&newsid', filter_body)
                    if m:
                        cmt_id = {"channel":m.group(1), "comment_id":item['content']['news_id']}
                        item['content']['cmt_id'] = cmt_id
                        #print("cmt_id 3")
                    else:
                        self.logger.info("No 'cmt_id' found: %s" % (response.url))

            # keywords / tag
            key_words = hxs.xpath('//head/*[@name = "keywords"]/@content').extract()      
            if key_words:
                item['content']['keywords'] = key_words[0]  

            tags = hxs.xpath('//head/*[@name = "tags"]/@content').extract()      
            if tags:
                item['content']['tags'] = tags[0]

            # article create / update / publish time  
            create = hxs.xpath('//head/*[@name = "weibo: article:create_at"]/@content').extract()      
            if create:
                item['content']['news_create_time'] = create[0]
            update = hxs.xpath('//head/*[@name = "weibo: article:update_at"]/@content').extract()      
            if update:
                item['content']['news_update_time'] = update[0]
            publish = hxs.xpath('//head/*[@property = "article:published_time"]/@content').extract()      
            if publish:
                item['content']['news_publish_time'] = publish[0]

            # parse content
            content = hxs.xpath('//*[@id="artibody"]/p/text()').extract()
            if content:
                item['content']['content'] = "\n".join(content)
                item['url'] = response.url
        
            # parse source / author
            source = hxs.xpath('//head/*[@name="mediaid"]/@content').extract()
            if source:
                item['content']['source'] = source[0]

            author = hxs.xpath('//head/*[@property="article:author"]/@content').extract()
            if author:
                item['content']['author'] = author[0]        

            # parse reply
            # cmt_id holds the news id and channel, which are used to build the reply URLs
            if "cmt_id" in item['content']:
                reply_url_stat = "http://comment5.news.sina.com.cn/page/info?version=1&format=json&compress=1&ie=utf-8&oe=utf-8&page=1&page_size=20&channel=" + item['content']['cmt_id']['channel'] + "&newsid=" + item['content']['cmt_id']['comment_id']

                reply_url = "http://comment5.news.sina.com.cn/page/info?version=1&format=json&compress=1&ie=utf-8&oe=utf-8&page_size=100&channel=" +  item['content']['cmt_id']['channel'] + "&newsid=" +  item['content']['cmt_id']['comment_id'] + "&page="
        
                yield Request(url = reply_url_stat, meta = {'item':item, 'cmt_url':reply_url}, callback = self.parse_reply)
        
            # if no cmt_id could be parsed, do not crawl the replies
            else:
                yield item

        elif response.status == 404:
            self.logger.error("Page 404: %s" % (response.url))
            return

    # parse_reply does not parse the reply bodies; it only determines the total reply count
    # (replynum), the number of reply pages (rptotal), etc. The bodies are parsed in parse_reply_json
    def parse_reply(self, response):
        d_json = json.loads(response.body.decode('utf8'))
        item = response.meta['item']
        cmt_url = response.meta['cmt_url']
        
        try:
            reply = {}
            if d_json['result']:
                # replies exist for this article
                if 'count' in d_json['result']:
                    reply['replynum'] = int(d_json['result']['count']['show'])
                    reply['hotness'] = int(d_json['result']['count']['total'])
                    reply['qreply'] = int(d_json['result']['count']['qreply']) # unclear what qreply represents
                    item['content']['reply'] = reply
        
                    # determine how many reply pages are needed
                    rptotal = 0
                    if reply['replynum']%100 == 0:
                        rptotal = reply['replynum']/100
                    else:
                        rptotal = int(reply['replynum']/100) + 1

                    if rptotal > 0:
                        yield Request(url = cmt_url + str(1), meta = {'item':item, 'rptotal':rptotal, 'page':1,
                                        'cmt_url':cmt_url},callback = self.parse_reply_json)
                    else:
                        yield item

        # no replies; yield the item as is
        except Exception as ex:
            yield item

    # parse_reply_json is where the reply bodies are actually parsed
    def parse_reply_json(self, response):
        item =  response.meta['item']
        cmt_url = response.meta['cmt_url']
        page = response.meta['page']
        rptotal = response.meta['rptotal']        

        d_json = json.loads(response.body.decode('utf8'))
        if d_json['result']:
            if 'cmntlist' in d_json['result']:
                # if reply_content already exists (page > 1), extend it; otherwise (page 1) assign it directly
                if "reply_content" in item['content']['reply']:
                    item['content']['reply']['reply_content'].extend(d_json['result']['cmntlist'])
                else:
                    item['content']['reply']['reply_content'] = d_json['result']['cmntlist']

        # page is the current reply page; keep requesting until it reaches the total page count
        if page == rptotal:
            yield item
        elif page < rptotal:
            yield Request(url = cmt_url + str(page+1), meta = {'item':item, 'rptotal':rptotal, 'cmt_url':response.meta['cmt_url'], 'page':page+1}, callback = self.parse_reply_json)
Example #20
class XQUserStatus(Spider):
    start_at = datetime.now()
    name = 'xq_user_cmt'
    logger = util.set_logger(name, LOG_FILE_USER_STATUS)
    #handle_httpstatus_list = [404]

    # time of the last maintenance run; updated before each crawl
    start_time = time.strptime("2020-01-01", "%Y-%m-%d")

    def start_requests(self):
        # Xueqiu returns at most 20 comments per page
        start_url = "https://xueqiu.com/v4/statuses/user_timeline.json?&count=20&page=1&user_id="

        ## get start url from MongoDB
        db = util.set_mongo_server()
        owner_ids = []
        for id in db.xq_cube_info.find({}, {'owner_id': 1, '_id': 0}):
            owner_ids.append(id['owner_id'])
        owner_ids = list(set(owner_ids))

        # iterate each symbol
        all_page_n = len(owner_ids)
        for i in range(all_page_n):
            owner_id = owner_ids[i]
            now_page_n = i
            url = start_url + str(owner_id)

            # progress
            if i % 1000 == 0:
                self.logger.info(
                    '%s (%s / %s) %s%%' %
                    (owner_id, str(now_page_n), str(all_page_n),
                     str(round(float(now_page_n) / all_page_n * 100, 1))))

            yield Request(url=url,
                          meta={'user_id': owner_id},
                          callback=self.parse)

    def parse(self, response):
        try:
            if response.status == 200 and str(
                    response.url) != "https://xueqiu.com/service/captcha":
                body = json.loads(response.body.decode('utf-8'))
                # default maxPage / page so the paging check below cannot hit an unbound name
                max_page = body.get('maxPage', 1)
                page = body.get('page', 1)

                if body['statuses']:
                    page_first_time = body['statuses'][0]['created_at']
                    page_first_time = time.gmtime(page_first_time / 1000)
                    if page_first_time < self.start_time:
                        return
                    content = {}
                    content['user_id'] = response.meta['user_id']
                    content['statuses'] = body['statuses']
                    content['total'] = body['total']
                    content['max_page'] = body['maxPage']
                    content['page'] = body['page']

                    item = XQItem()
                    item['content'] = content
                    yield item

                    # Second + page
                    if page < max_page:
                        page = page + 1
                        page_string = '&page=' + str(page)
                        url = re.sub(r'&page=(\d+)', page_string, response.url)
                        yield Request(
                            url=url,
                            meta={'user_id': response.meta['user_id']},
                            callback=self.parse)

            elif str(response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTURE ERROR: User ID %s' %
                                  (response.meta['user_id']))

        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' %
                             (str(ex), response.url))
Example #21
class VippearSpider(Spider):
    name = 'CrawlerVippear'
    logger = util.set_logger(name, LOG_FILE_VIPPEAR)

    def start_requests(self):
        start_url = 'http://www.chinavitae.com/vip/index.php?mode=officials&map=show&type=cv'
        yield Request(url=start_url, callback=self.parse)

    def parse(self, response):
        names = response.xpath(
            '//td[@align="left"]//a[@class="link12"]').extract()
        for name in names:
            item = ChinaVitaeItem()
            item['content'] = {}
            name_pinyin = Selector(
                text=name).xpath('//a[@class="link12"]/text()').extract()[0]
            item['content']['name_pinyin'] = name_pinyin
            name_url = Selector(text=name).xpath('//a/@href').extract()[0]
            res_url = 'http://www.chinavitae.com/vip/' + name_url
            yield Request(url=res_url,
                          meta={
                              'url': res_url,
                              'item': item
                          },
                          callback=self.parse_year)

    def parse_year(self, response):
        year = response.xpath('//p/a/text()').extract()
        year = ' '.join(year).strip()
        years = re.findall('\d{4}', year)
        item = response.meta['item']

        for y in years:
            res_url = response.meta['url']
            yield Request(url=res_url + '&filter_year=' + y,
                          meta={'item': item},
                          callback=self.parse_list)

    def parse_list(self, response):
        urls = response.xpath('//div[@class="link12b"]/a/@href').extract()
        item = response.meta['item']
        for url in urls:
            vippurl = 'http://www.chinavitae.com' + url
            yield Request(url=vippurl,
                          meta={'item': item},
                          callback=self.parse_vipp)

    def parse_vipp(self, response):
        item = response.meta['item']

        acti = response.xpath('//html//tr[2]/td[2]').extract()[0]
        acti = re.sub('\r\n', '', acti)
        acti = re.search('td>(.+)<\/td', acti).group(1)
        item['content']['activity'] = acti

        infos = response.xpath(
            '//*[contains(@class, "link12")]//text()').extract()
        infos = ','.join(infos).strip()
        infos = re.sub('\n', '', infos)
        infos = re.sub('\t', '', infos)

        date = re.search('Date: ,(.+),Activity', infos).group(1).strip()
        item['content']['date'] = date

        try:
            location = re.search('Location: ,(.+),Attendees',
                                 infos).group(1).strip()
            item['content']['location'] = location
        except Exception as ex:
            print("With no Location: " + response.url)

        try:
            attendees = re.search('Attendees: ,(.+),Source',
                                  infos).group(1).strip()
            item['content']['attendees'] = attendees
        except:
            try:
                attendees = re.search('Attendees: ,(.+),Topics',
                                      infos).group(1).strip()
                item['content']['attendees'] = attendees
            except:
                try:
                    attendees = re.search('Attendees: ,(.+)',
                                          infos).group(1).strip()
                    item['content']['attendees'] = attendees
                except Exception as ex:
                    print("With no Attendees: " + response.url)

        try:
            source = re.search('Source: ,(.+),Topics', infos).group(1).strip()
            item['content']['source'] = source
        except Exception as ex:
            print("With no Source: " + response.url)

        try:
            topics = re.search('Topics: ,(.+)', infos).group(1).strip()
            item['content']['topics'] = topics
        except Exception as ex:
            print("With no Topics: " + response.url)

        yield item
Example #22
class GubaSpider(Spider):
    name = 'CrawlerGuba2'
    logger = util.set_logger(name, LOG_FILE_GUBA)

    def start_requests(self):
        start_urls = "http://guba.eastmoney.com/news,v,47652005.html"
        yield Request(url=start_urls,
                      meta={'replynum': 1832},
                      callback=self.parse)

    def parse(self, response):
        hxs = Selector(response)
        posts = hxs.xpath('//div[@class="articleh"]').extract()
        for post in posts:
            item = GubaItem()
            item['content'] = {}
            readnum = Selector(
                text=post).xpath('//span[@class="l1"]/text()').extract()
            if readnum:
                readnum = readnum[0]
            replynum = Selector(
                text=post).xpath('//span[@class="l2"]/text()').extract()
            if replynum:
                replynum = replynum[0]
            url = Selector(
                text=post).xpath('//span[@class="l3"]/a/@href').extract()
            if url:
                url = url[0]
            guba_id = re.search(',(.+).html', response.url).group(1)

            if str(guba_id) in str(url):
                m_stock = re.search("(^\/.+)", url)
                if m_stock:
                    post_url = "http://guba.eastmoney.com" + m_stock.group(1)
                    post_id = re.search('\/(n.+)\.html', url).group(1)
                    item['content']['readnum'] = readnum
                    item['content']['replynum'] = replynum
                    item['content']['post_id'] = post_id
                    yield Request(url=post_url,
                                  meta={
                                      'item': item,
                                      'replynum': replynum
                                  },
                                  callback=self.parse_post)

    # crawl the post details and paginate through the replies
    def parse_post(self, response):
        if response.status == 200:
            hxs = Selector(response)
            item = response.meta['item']
            dt = hxs.xpath('//div[@class="zwfbtime"]/text()').extract()[0]
            dt = re.search('\D+(\d{4}-\d{2}-.+:\d{2})', dt).group(1)
            create_time = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
            item['content']['create_time'] = create_time

            try:
                author_url = hxs.xpath(
                    '//div[@id="zwconttbn"]/strong/a/@href').extract()[0]
                item['content']['author_url'] = author_url
            except:
                try:
                    author = hxs.xpath(
                        '//div[@id="zwconttbn"]//span/text()').extract()[0]
                    item['content']['author'] = author
                except Exception as ex:
                    print("Decode webpage failed: " + response.url)
                    return

            try:  # regular posts
                postcontent = hxs.xpath(
                    '//div[@id="zwconbody"]/div[@class="stockcodec"]/text()'
                ).extract()[0].strip()
                if postcontent:
                    item['content']['content'] = postcontent

                postitle = hxs.xpath(
                    '//div[@class="zwcontentmain"]/div[@id="zwconttbt"]/text()'
                ).extract()[0].strip()
                item['content']['title'] = postitle
            except:  # Q&A posts
                try:
                    postcontent = hxs.xpath(
                        '//div[@class="qa"]//div[contains(@class,"content")]/text()'
                    ).extract()
                    postquestion = postcontent[0]
                    postanswer = postcontent[2].strip() + postcontent[3].strip(
                    )
                    item['content']['content'] = postquestion
                    item['content']['answer'] = postanswer

                    postanswer_time = hxs.xpath(
                        '//div[@class="sign"]/text()').extract()
                    try:
                        postanswer_time = hxs.xpath(
                            '//div[@class="sign"]/text()').extract()
                        postanswer_time = re.search(
                            '\D+(\d{4}-\d{2}-.+:\d{2})',
                            postanswer_time[1].strip()).group(1)
                        answer_time = datetime.strptime(
                            postanswer_time, "%Y-%m-%d %H:%M:%S")
                        item['content']['answer_time'] = answer_time
                    except Exception as ex:
                        item['content']['answer_time'] = None

                    postitle = "Q&A"
                    item['content']['title'] = postitle
                except Exception as ex:
                    print("Parse Exception: " + response.url)
                    return

            replynum = response.meta['replynum']
            item['content']['reply'] = []
            if int(replynum) % 30 == 0:
                rptotal = int(int(replynum) / 30)

            else:
                rptotal = int(int(replynum) / 30) + 1

            if rptotal > 0:
                head = re.search('(.+)\.html', response.url).group(1)
                reply_url = head + "_" + str(1) + ".html"
                yield Request(url=reply_url,
                              meta={
                                  'item': item,
                                  'page': 1,
                                  'rptotal': rptotal,
                                  'head': head
                              },
                              callback=self.parse_reply)
            else:
                yield item

    def parse_reply(self, response):
        hxs = Selector(response)
        page = response.meta['page']
        rptotal = response.meta['rptotal']
        item = response.meta['item']
        head = response.meta['head']

        replists = hxs.xpath(
            '//div[@id="zwlist"]/div[@class="zwli clearfix"]').extract()
        for replist in replists:
            reply = {}
            try:
                reply_author = Selector(text=replist).xpath(
                    '//div[@class="zwlianame"]//a/text()').extract()[0]
                reply['reply_author'] = reply_author
                reply_author_url = Selector(text=replist).xpath(
                    '//div[@class="zwlianame"]//a/@href').extract()[0]
                reply['reply_author_url'] = reply_author_url
            except:
                try:
                    reply_author = Selector(text=replist).xpath(
                        '//span[@class="zwnick"]/span').extract()[0]
                    reply_author = re.search('"gray">(.+)<\/span>',
                                             reply_author).group(1)
                    reply['reply_author'] = reply_author
                except Exception as ex:
                    print("Decode webpage failed: " + response.url)
                    return

            reply_time = Selector(text=replist).xpath(
                '//div[@class="zwlitime"]/text()').extract()[0]
            reply_time = re.search('\D+(\d{4}-\d{2}-.+:\d{2})',
                                   reply_time).group(1)
            reply_time = datetime.strptime(reply_time, "%Y-%m-%d %H:%M:%S")
            reply['reply_time'] = reply_time

            reply_content = Selector(text=replist).xpath(
                '//div[@class="zwlitext stockcodec"]/text()').extract()
            if reply_content:
                reply['reply_content'] = reply_content[0]

            reply_quote_author = Selector(text=replist).xpath(
                '//div[@class="zwlitalkboxtext "]//a/text()').extract()
            if reply_quote_author:
                reply_quote_author = reply_quote_author[0]
                reply['reply_quote_author'] = reply_quote_author

            reply_quote_author_url = Selector(text=replist).xpath(
                '//div[@class="zwlitalkboxtext "]//a/@href').extract()
            if reply_quote_author_url:
                reply_quote_author_url = reply_quote_author_url[0]
                reply['reply_quote_author_url'] = reply_quote_author_url

            reply_quote_text = Selector(text=replist).xpath(
                '//div[@class= "zwlitalkboxtext "]/span/text()').extract()
            if reply_quote_text:
                reply_quote_content = reply_quote_text[0]
                reply['reply_quote_content'] = reply_quote_content

            item['content']['reply'].append(reply)

        # follow the remaining reply pages, then emit the completed item
        # (the same pagination pattern used by the other Guba spiders here)
        if page < rptotal:
            reply_url = head + "_" + str(page + 1) + ".html"
            yield Request(url=reply_url,
                          meta={
                              'item': item,
                              'rptotal': rptotal,
                              'page': page + 1,
                              'head': head
                          },
                          callback=self.parse_reply)
        else:
            yield item
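
# A minimal sketch of the reply-page math used in parse_post() above; the reply
# count and post URL are the same sample values used in start_requests().
import re

replynum = 1832
post_url = "http://guba.eastmoney.com/news,v,47652005.html"

# 30 replies per page, rounded up.
rptotal = replynum // 30 if replynum % 30 == 0 else replynum // 30 + 1
head = re.search(r'(.+)\.html', post_url).group(1)
print(rptotal)                        # 62
print(head + "_" + str(1) + ".html")  # first reply page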
Example #23
class MMBHistSpider(Spider):
    name = 'MMBHist'
    logger = util.set_logger(name, LOG_FILE_MMB)
    handle_httpstatus_list = [404, 460, 504]
    db = util.set_mongo_server()

    # crawl items listed by a single store
    if_crawl_onestore = True
    # crawl items listed by multiple stores
    if_crawl_multstore = False


    def start_requests(self): 

        # items listed by a single store
        if self.if_crawl_onestore:
            bjids = []
            for id in self.db["MMB"].find({'bjid': {'$exists': True}}, {'bjid': 1, '_id': 0}):
                bjids.append(id['bjid'])
            bjids = list(set(bjids))

            # iterate each bjid
            all_page_n = len(bjids)
            for i in range(all_page_n):

                bjid = bjids[i].strip()
                now_page_n = i

                url = "http://tool.manmanbuy.com/history.aspx?action=gethistory&bjid=" + str(bjid)

                # progress
                if i % 500 == 0:
                    self.logger.info('一家在售:(%s / %s) %s%%' % (str(now_page_n), str(all_page_n), str(round(float(now_page_n) / all_page_n * 100, 1))))

                yield Request(url = url, callback = self.parse)

        # items listed by multiple stores
        if self.if_crawl_multstore:
            p_infos = []
            # select distinct (spid, name, url) records
            pipeline = [
                {'$match':{'bjid':{'$exists':False}}},
                {'$group': {'_id': {'spid': '$spid', 'name': '$name', 'url': '$url'}}},
            ]
            cur = self.db.MMB.aggregate(pipeline)
            for i in cur:
                p_infos.append(i['_id'])

            all_page_n_mult = len(p_infos)
            for i in range(all_page_n_mult):
                p_info = p_infos[i]

                url = p_info['url']
                now_page_n = i

                # progress
                if i % 500 == 0:
                    self.logger.info('多家在售: (%s / %s) %s%%' % (str(now_page_n), str(all_page_n_mult), str(round(float(now_page_n) / all_page_n_mult * 100, 1))))

                yield Request(url = url, meta = {"p_info":p_info}, callback = self.parse_mult)

            #yield Request(url = 'http://www.manmanbuy.com/pb_567731.aspx', meta = {"p_info":p_info}, callback = self.parse_mult)

    def parse_mult(self, response):
        try:
            if response.status == 200:
                # pull in the item from the previous step
                p_info = response.meta['p_info']

                # parse the links to the multiple platforms selling this product
                nodes = response.xpath('//div[contains(@class, "pro-mall-list")]//ul//li//div[contains(@class, "item ")]')

                for n in nodes:
                    # seller name, which is not the same as siteName: for the same siteName (e.g. Tmall), seller_name can be "vivo旗舰店" or "vivo天诚专卖店"
                    seller_name = n.xpath('div[contains(@class, "mall")]//text()').extract()
                    seller_name = ' '.join(' '.join(seller_name).split())
            
                    # get skuid
                    skuid = n.xpath('@skuid').extract()[0]

                    # get bjid
                    bjid = n.xpath('@v').extract()[0].strip()
                    bjid = ast.literal_eval(bjid)['bjid']

                    p_info.update({"seller_name":seller_name, "skuid":skuid, "bjid":bjid})

                    # build the price-history request
                    url = "http://tool.manmanbuy.com/history.aspx?action=gethistory&bjid=" + str(bjid)

                    yield Request(url = url, meta = {"p_info":p_info}, callback = self.parse)

            else:
                self.logger.error('HTTP status not 200: %s \n %s' % (response.url, response.body))  
                
        except Exception as ex:
            self.logger.error('Parse Exception - "parse_mult": %s %s' % (str(ex), response.url))

    def parse(self, response):
        try:
            # on HTTP 200, parse normally
            if response.status == 200:
                # pull in the item from the previous step (if any)
                p_info = {}
                if "p_info" in response.meta:
                    p_info = response.meta['p_info']

                # parse the price JSON
                body = re.sub('[\s]', '', response.body.decode('gbk'))
                body = json.loads(body)

                # add basic product info to p_info
                p_info.update({k: body[k] for k in ('siteName', 'siteId', 'zouShi', 'bjid', 'spName', 'spUrl', 'spbh', 'zouShi_test')})

                # p_hist contains only the date/price series
                p_hist = body['datePrice']
                p_hist = re.findall("\[(.+?)\]", p_hist)

                # "flatten" the price list into one document per point
                docs = []
                lastcrawl = datetime.datetime.utcnow()
                for p in p_hist:
                    # date
                    m = re.search("Date.UTC\((.+?)\),([\d\.]+)", p)
                    if m:
                        date = m.group(1)
                        date = datetime.datetime.strptime(date, "%Y,%m,%d") - datetime.timedelta(hours = 8)  # convert the Beijing-time strptime result to UTC
                    
                        # price
                        price = float(m.group(2).strip())
                        
                        # create doc and add to docs
                        doc = p_info
                        doc.update({"date":date, "price":price, "lastcrawl":lastcrawl})
                        docs.append(copy.deepcopy(doc))

                item = PriceItem()
                item['content'] = docs
                yield item
                    
            else:
                self.logger.error('Got %s: %s' % (response.status, response.url))

        except Exception as ex:
            self.logger.error('Parse Exception - "parse": %s %s' % (str(ex), response.url))
            self.logger.info(str(response.body))
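
# A minimal standalone sketch of the two extraction steps above (the li/@v
# attribute handled in parse_mult() and the datePrice series handled in
# parse()); both input strings are made-up examples in the same shape as the
# real fields.
import ast
import datetime
import re

# parse_mult(): the li/@v attribute holds a dict literal; ast.literal_eval
# recovers the bjid (this attribute value is invented for illustration).
v_attr = "{'bjid': '12345678', 'price': '199.00'}"
print(ast.literal_eval(v_attr)['bjid'])  # 12345678

# parse(): datePrice is a Highcharts-style series of [Date.UTC(y,m,d),price]
# points; pull the date and price out of each point.
date_price = "[[Date.UTC(2020,1,3),199.0],[Date.UTC(2020,1,4),189.0]]"
for p in re.findall(r"\[(.+?)\]", date_price):
    m = re.search(r"Date.UTC\((.+?)\),([\d\.]+)", p)
    if m:
        # shift by 8 hours to turn the Beijing-time strptime result into UTC
        date = datetime.datetime.strptime(m.group(1), "%Y,%m,%d") - datetime.timedelta(hours=8)
        price = float(m.group(2))
        print(date, price)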
Example #24
class GubaSpider(Spider):
    name = 'CrawlerGuba2'
    logger = util.set_logger(name, LOG_FILE_GUBA)

    def start_requests(self):
        start_urls = "http://guba.eastmoney.com/news,600029,18449146.html"
        yield Request(url=start_urls,
                      meta={'replynum': 0},
                      callback=self.parse)

    # crawl the post details and paginate through the replies
    def parse(self, response):
        try:
            if response.status == 200:
                try:
                    filter_body = response.body.decode('utf8')
                except:
                    try:
                        filter_body = response.body.decode("gbk")
                    except:
                        try:
                            filter_body = response.body.decode("gb2312")
                        except Exception as ex:
                            print("Decode webpage failed: " + response.url)
                            return
                filter_body = re.sub('<[A-Z]+[0-9]*[^>]*>|</[A-Z]+[^>]*>', '',
                                     filter_body)
                response = response.replace(body=filter_body)
                hxs = Selector(response)

                item = GubaItem()
                dt = hxs.xpath('//div[@class="zwfbtime"]/text()').extract()[0]
                dt = re.search('\D+(\d{4}-\d{2}-.+:\d{2}).+', dt).group(1)
                create_time = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
                item['content'] = {}
                item['content']['create_time'] = create_time

                try:  # the poster is a registered member
                    author_url = hxs.xpath(
                        '//div[@id="zwconttbn"]/strong/a/@href').extract()[0]
                    item['content']['author_url'] = author_url

                except Exception as ex:  # the poster is not a registered member
                    author = hxs.xpath(
                        '//div[@id="zwconttbn"]//span').extract()[0]
                    author = re.search('gray">(.+)<\/span', author).group(1)
                    item['content']['author'] = author

                try:  # regular posts
                    postcontent = hxs.xpath(
                        '//div[@id="zwconbody"]/div[@class="stockcodec"]/text()'
                    ).extract()[0].strip()
                    if postcontent:
                        item['content']['content'] = postcontent

                    postitle = hxs.xpath(
                        '//div[@class="zwcontentmain"]/div[@id="zwconttbt"]/text()'
                    ).extract()[0].strip()
                    item['content']['title'] = postitle
                except:  # Q&A posts
                    try:
                        postcontent = hxs.xpath(
                            '//div[@class="qa"]//div[contains(@class,"content")]/text()'
                        ).extract()
                        postquestion = postcontent[0]
                        postanswer = postcontent[2].strip(
                        ) + postcontent[3].strip()
                        item['content']['content'] = postquestion
                        item['content']['answer'] = postanswer
                        try:
                            postanswer_time = hxs.xpath(
                                '//div[@class="sign"]/text()').extract()
                            postanswer_time = re.search(
                                '\D+(\d{4}-\d{2}-.+:\d{2})',
                                postanswer_time[1].strip()).group(1)
                            answer_time = datetime.strptime(
                                postanswer_time, "%Y-%m-%d %H:%M:%S")
                            item['content']['answer_time'] = answer_time
                        except Exception as ex:
                            item['content']['answer_time'] = None

                        postitle = "Q&A"
                        item['content']['title'] = postitle
                    except Exception as ex:
                        print("Decode webpage content failed: " + response.url)
                        return

                replynum = response.meta['replynum']
                item['content']['replynum'] = replynum
                item['content']['reply'] = []

                if int(replynum) % 30 == 0:
                    rptotal = int(int(replynum) / 30)

                else:
                    rptotal = int(int(replynum) / 30) + 1

                if rptotal > 0:
                    head = re.search('(.+)\.html', response.url).group(1)
                    reply_url = head + "_" + str(1) + ".html"
                    yield Request(url=reply_url,
                                  meta={
                                      'item': item,
                                      'page': 1,
                                      'rptotal': rptotal,
                                      'head': head
                                  },
                                  callback=self.parse_reply)
                else:
                    yield item
                    print(item)

        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' %
                             (str(ex), response.url))

    def parse_reply(self, response):
        page = response.meta['page']
        rptotal = response.meta['rptotal']
        item = response.meta['item']
        head = response.meta['head']
        hxs = Selector(response)

        replists = hxs.xpath(
            '//div[@id="zwlist"]/div[@class="zwli clearfix"]').extract()
        for replist in replists:
            reply = {}
            try:
                reply_author = Selector(text=replist).xpath(
                    '//div[@class="zwlianame"]//a/text()').extract()[0]
                reply['reply_author'] = reply_author
                reply_author_url = Selector(text=replist).xpath(
                    '//div[@class="zwlianame"]//a/@href').extract()[0]
                reply['reply_author_url'] = reply_author_url
            except:
                try:
                    reply_author = Selector(text=replist).xpath(
                        '//span[@class="zwnick"]/span').extract()[0]
                    reply_author = re.search('"gray">(.+)<\/span>',
                                             reply_author).group(1)
                    reply['reply_author'] = reply_author
                except Exception as ex:
                    print("Decode webpage reply_author failed : " +
                          response.url)
                    return

            reply_time = Selector(text=replist).xpath(
                '//div[@class="zwlitime"]/text()').extract()[0]
            reply_time = re.search('\D+(\d{4}-\d{2}-.+:\d{2})',
                                   reply_time).group(1)
            reply_time = datetime.strptime(reply_time, "%Y-%m-%d %H:%M:%S")
            reply['reply_time'] = reply_time

            reply_content = Selector(text=replist).xpath(
                '//div[contains(@class, "stockcodec")]').extract()[0]
            try:
                reply_content = re.search('stockcodec">(.+)<',
                                          reply_content).group(1).strip()
                reply['reply_content'] = reply_content
            except Exception as ex:
                reply['reply_content'] = reply_content

            reply_quote_author = Selector(text=replist).xpath(
                '//div[@class="zwlitalkboxuinfo"]//a/text()').extract()
            if reply_quote_author:
                reply_quote_author = reply_quote_author[0]
                reply['reply_quote_author'] = reply_quote_author

            reply_quote_author_url = Selector(text=replist).xpath(
                '//div[@class="zwlitalkboxuinfo"]//a/@href').extract()
            if reply_quote_author_url:
                reply_quote_author_url = reply_quote_author_url[0]
                reply['reply_quote_author_url'] = reply_quote_author_url

            reply_quote_text = Selector(text=replist).xpath(
                '//div[@class= "zwlitalkboxtext"]').extract()
            if reply_quote_text:
                reply_quote_text = reply_quote_text[0]
                reply_quote_content = re.search(
                    '"zwlitalkboxtext">(.+)<\/div>',
                    str(reply_quote_text)).group(1)
                reply['reply_quote_content'] = reply_quote_content

            reply_quote_timestamp = Selector(text=replist).xpath(
                '//div[@class="zwlitalkboxtime"]/text()').extract()
            if reply_quote_timestamp:
                reply_quote_timestamp = re.search(
                    '\D+(\d{4}.+:\d{2})', reply_quote_timestamp[0]).group(1)
                reply_quote_timestamp = re.sub("/", "-", reply_quote_timestamp)
                reply_quote_time = datetime.strptime(
                    str(reply_quote_timestamp), "%Y-%m-%d %H:%M:%S")
                reply['reply_quote_time'] = reply_quote_time
                print(reply_quote_author_url)

            item['content']['reply'].append(reply)

        if page == rptotal:
            author_url = item['content']['author_url']
            yield Request(url=author_url,
                          meta={'item': item},
                          callback=self.parse_author)

        elif page < rptotal:
            reply_url = head + "_" + str(page + 1) + ".html"
            yield Request(url=reply_url,
                          meta={
                              'item': item,
                              'rptotal': rptotal,
                              'page': page + 1,
                              'head': head
                          },
                          callback=self.parse_reply)

    def parse_author(self, response):
        item = response.meta['item']
Example #25
class XQCubeRetSpider(RedisSpider):

    name = 'xq_cube_ret'
    start_at = datetime.now()
    logger = util.set_logger(name, LOG_FILE_CUBE_RET)
    website_possible_httpstatus_list = [301, 302, 404]
    handle_httpstatus_list = [301, 302]
    cube_type = 'SP'

    # date of the last maintenance run; updated on each crawl
    start_time = '2020-01-01'

    def start_requests(self):
        zh_url = 'https://xueqiu.com/cubes/nav_daily/all.json?cube_symbol='
        sp_url = 'https://xueqiu.com/service/tc/snowx/PAMID/cubes/nav_daily/all?cube_symbol='

        # get start url from MongoDB
        db = util.set_mongo_server()

        symbols = []
        for s in db.xq_cube_info.find({'cube_type': self.cube_type}, {
                'symbol': 1,
                '_id': 0
        }):
            symbols.append(s['symbol'])
        symbols = list(set(symbols))

        for s in db.fail.find({}, {'cube_symbol': 1, '_id': 0}):
            symbols.append(s['cube_symbol'])
        symbols = list(set(symbols))

        # iterate each symbol
        all_page_n = len(symbols)
        for i in range(all_page_n):
            now_page_n = i
            symbol = symbols[i].strip()
            if self.cube_type == 'SP':
                url = sp_url + symbol
            elif self.cube_type == 'ZH':
                url = zh_url + symbol

            # progress
            if i % 1000 == 0:
                self.logger.info(
                    '%s (%s / %s) %s%%' %
                    (symbol, str(now_page_n), str(all_page_n),
                     str(round(float(now_page_n) / all_page_n * 100, 1))))

            yield Request(url=url,
                          meta={
                              'symbol': symbol,
                              'cube_type': self.cube_type
                          },
                          callback=self.parse)

    def parse(self, response):
        try:
            if response.status == 200 and str(
                    response.url) != "https://xueqiu.com/service/captcha":
                item = XQItem()
                body = re.sub('[\s]', '', response.body.decode('utf-8'))
                body = json.loads(body)

                if body:
                    total_num = len(body[0]['list'])
                    for i in range(total_num - 1, -1, -1):
                        content = body[0]['list'][i]
                        if content['date'] < self.start_time:
                            return
                        else:
                            content['cube_symbol'] = response.meta['symbol']
                            content['cube_type'] = response.meta['cube_type']
                            item['url'] = response.url
                            item['content'] = content
                            item['fp'] = request_fingerprint(response.request)
                            yield item

            if response.status == 302 or str(
                    response.url) == "https://xueqiu.com/service/captcha":
                self.logger.error('CAPTCHA ERROR: %s' %
                                  (response.meta['symbol']))
                oldmeta = response.request.meta
                oldmeta["change_proxy"] = True
                yield Request(url=response.request.url,
                              meta=oldmeta,
                              callback=self.parse)
        except Exception as ex:
            self.logger.warn('Parse Exception: %s %s' %
                             (str(ex), response.url))
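
# A minimal sketch of the reverse, cut-off-bounded walk that parse() above
# performs over the nav_daily list; this payload is invented, the real one
# comes from the cubes/nav_daily endpoints requested in start_requests().
body = [{'list': [
    {'date': '2019-12-30', 'value': 1.01},
    {'date': '2020-01-02', 'value': 1.02},
    {'date': '2020-01-03', 'value': 1.03},
]}]
start_time = '2020-01-01'

# Walk newest-first; 'YYYY-MM-DD' strings compare chronologically, so the loop
# stops at the first entry older than the last maintenance date.
for i in range(len(body[0]['list']) - 1, -1, -1):
    content = body[0]['list'][i]
    if content['date'] < start_time:
        break
    print(content['date'], content['value'])  # prints 2020-01-03, then 2020-01-02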
Example #26
class SinaNewsSpider(Spider):
    name = "sina_news"
    logger = util.set_logger(name, LOG_FILE_SINANEWS)
    handle_httpstatus_list = [404]

    def start_requests(self):
        # news channel lid values (2510: domestic, 2511: international, 2669: society, 2512: sports, 2513: entertainment, 2514: military, 2515: science & tech, 2516: finance, 2517: stock market, 2518: US stocks)
        channel_list = {
            '2510': '国内',
            '2511': '国际',
            '2669': '社会',
            '2512': '体育',
            '2513': '娱乐',
            '2514': '军事',
            '2515': '科技',
            '2516': '财经',
            '2517': '股市',
            '2518': '美股'
        }
        for lid in channel_list.keys():
            # lid = "2516"  # uncomment to restrict the crawl to the finance channel only

            # set the start and end timestamps for the query window
            etime = time.strptime("2018-10-01 00:00:00", "%Y-%m-%d %H:%M:%S")
            stime = time.strptime("2019-01-02 00:00:00", "%Y-%m-%d %H:%M:%S")
            etime = int(time.mktime(etime))
            stime = int(time.mktime(stime))
            etime = str(etime)
            stime = str(stime)
            ctime = stime

            # channel
            channel = {
                'title': channel_list[lid],
                'id': lid,
                'cType': 'lid',
                'url': ''
            }

            start_url = "https://feed.mix.sina.com.cn/api/roll/get?pageid=153&lid=%s&etime=%s&stime=%s&ctime=%s&k=&num=50&page=1" % (
                lid, etime, stime, ctime)
            yield Request(url=start_url,
                          meta={
                              'start_url': start_url,
                              'channel': channel
                          },
                          callback=self.parse_page)

    def parse_page(self, response):
        js = json.loads(response.body)
        if js['result']['total'] % 50:
            page_total = js['result']['total'] // 50 + 1
        else:
            page_total = js['result']['total'] // 50
        start_url = response.meta['start_url']
        start_url = start_url.rstrip('1')

        for i in range(1, page_total + 1):
            url = start_url + str(i)
            yield Request(url=url,
                          meta={'channel': response.meta['channel']},
                          callback=self.parse)

    def parse(self, response):
        js = json.loads(response.body.decode(response.encoding))
        for i in js['result']['data']:
            item = SinaNewsItem()
            item['content'] = {}
            # the API returns several time fields; ctime is used here
            item['content']['time'] = datetime.fromtimestamp(int(i['ctime']))
            # author, source, keywords, title, news_id, type, pic, channel
            item['content']['author'] = i['author']
            item['content']['source'] = i['media_name']
            item['content']['keywords'] = i['keywords']
            item['content']['title'] = i['title']
            item['content']['news_id'] = i['docid']
            item['content']['type'] = i['categoryid']
            item['content']['pic'] = i['images']
            item['content']['channel'] = response.meta['channel']
            # cmt_id
            cmt_id = {}
            cmtid = re.search('(.+?):(.+):', i['commentid'])
            if cmtid:
                cmt_id['channel'] = cmtid.group(1)
                cmt_id['comment_id'] = cmtid.group(2)
                item['content']['cmt_id'] = cmt_id
            # url
            url = i['url']
            item['content']['url'] = url
            # reply number
            if 'comment_show' in i:
                replynum = int(i['comment_show'])
            else:
                replynum = 0

            yield Request(url=url,
                          meta={
                              'item': item,
                              'cmt_id': cmt_id,
                              'replynum': replynum
                          },
                          callback=self.parse_content)

    def parse_content(self, response):
        if response.status == 200:
            item = response.meta['item']

            tags = response.xpath(
                '//head/*[@name = "tags"]/@content').extract()
            if tags:
                item['content']['tags'] = tags[0]

            #article create / update / publish time
            create = response.xpath(
                '//head/*[@name = "weibo: article:create_at"]/@content'
            ).extract()
            if create:
                item['content']['news_create_time'] = create[0]
            update = response.xpath(
                '//head/*[@name = "weibo: article:update_at"]/@content'
            ).extract()
            if update:
                item['content']['news_update_time'] = update[0]
            publish = response.xpath(
                '//head/*[@property = "article:published_time"]/@content'
            ).extract()
            if publish:
                item['content']['news_publish_time'] = publish[0]

            #parse content
            content = response.xpath('//*[@id="artibody"]/p/text()').extract()
            if content:
                item['content']['content'] = "\n".join(content)

            #parse reply
            replynum = response.meta['replynum']
            cmt_id = response.meta['cmt_id']
            if replynum and ('channel' in cmt_id):
                # compute the number of pages for the reply API
                if replynum % 20:
                    rptotal = replynum // 20 + 1
                else:
                    rptotal = replynum // 20

                page = 1
                cmt_url = 'http://comment5.news.sina.com.cn/page/info?format=json&channel=%s&newsid=%s&page=' % (
                    cmt_id['channel'], cmt_id['comment_id'])
                reply_url = cmt_url + str(page)
                reply = {}
                yield Request(url=reply_url,
                              meta={
                                  'item': item,
                                  'page': page,
                                  'rptotal': rptotal,
                                  'cmt_url': cmt_url,
                                  'reply': reply
                              },
                              callback=self.parse_reply)
            else:
                yield item
        elif response.status == 404:
            self.logger.error("Page 404: %s" % (response.url))
            return

    def parse_reply(self, response):
        item = response.meta['item']
        page = response.meta['page']
        rptotal = response.meta['rptotal']
        cmt_url = response.meta['cmt_url']
        reply = response.meta['reply']

        d_json = json.loads(response.body.decode(response.encoding))
        if 'cmntlist' in d_json['result']:
            if 'reply' in item['content']:
                item['content']['reply']['reply_content'].extend(
                    d_json['result']['cmntlist'])
            else:
                if 'count' in d_json['result']:
                    reply['replynum'] = d_json['result']['count']['show']
                    reply['hotness'] = d_json['result']['count']['total']
                    reply['qreply'] = d_json['result']['count']['qreply']
                item['content']['reply'] = reply
                item['content']['reply']['reply_content'] = d_json['result'][
                    'cmntlist']

        if page == rptotal:
            yield item
        else:
            reply_url = cmt_url + str(page + 1)
            yield Request(url=reply_url,
                          meta={
                              'item': item,
                              'page': page + 1,
                              'rptotal': rptotal,
                              'cmt_url': cmt_url,
                              'reply': reply
                          },
                          callback=self.parse_reply)
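
# A minimal sketch of how parse() and parse_content() above derive the comment
# API address and page count; the commentid and reply count are invented values
# in the same shape as the real fields.
import re

# commentid from the roll API looks like "<channel>:<comment id>:<suffix>".
commentid = "cj:comos-abcdefg1234567:0"  # hypothetical value
m = re.search('(.+?):(.+):', commentid)
channel, comment_id = m.group(1), m.group(2)

# 20 comments per page, rounded up.
replynum = 45
rptotal = replynum // 20 + 1 if replynum % 20 else replynum // 20
print(rptotal)  # 3

cmt_url = ('http://comment5.news.sina.com.cn/page/info?format=json'
           '&channel=%s&newsid=%s&page=' % (channel, comment_id))
print(cmt_url + str(1))  # first reply page, as requested in parse_content()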
Example #27
class GubaExFundSpider(Spider):
    name = 'guba_stock_posts'
    logger = util.set_logger(name, LOG_FILE_GUBAEXFUND)

    #handle_httpstatus_list = [404]
    #website_possible_httpstatus_list = [404]

    def start_requests(self):
        start_url = 'http://guba.eastmoney.com/remenba.aspx?type='
        for type in range(1, 5):
            start_urls = start_url + str(type)
            yield Request(url=start_urls,
                          meta={'type': type},
                          callback=self.parse)

    # parse the forum index pages
    def parse(self, response):
        type = response.meta['type']
        hxs = Selector(response)

        # individual-stock forums
        if type == 1:
            stocks = hxs.xpath(
                '//div[@class="ngbglistdiv"]/ul[@class="ngblistul2"]/li/a'
            ).extract()
            #fund_orgs = hxs.xpath('//div[@class="ngbglistdiv"]/ul[@class="ngblistul2"]/div[@class="ngbglistjjt"]/a').extract()
            #funds = hxs.xpath('//div[@class="ngbglistdiv"]/ul[@class="ngblistul2"]/ul[@class="ngblistul3"]/li/a').extract()

            # crawl the URL and name of each stock forum
            for stock in stocks:
                m_stocks = re.search('href="(.+)">(.+)<\/a', stock)
                if m_stocks:
                    item = GubaItem()
                    item['content'] = {}
                    url_stock = "http://guba.eastmoney.com/" + m_stocks.group(
                        1)
                    item['content']['guba_url'] = url_stock
                    item['content']['guba_name'] = m_stocks.group(2)

                    yield Request(url=url_stock,
                                  meta={'item': item},
                                  callback=self.parse_page_num)

        # topic forums
        elif type == 2:
            stocks = hxs.xpath(
                '//div[@class="allzhutilistb"]/ul/li/a').extract()
            for stock in stocks:
                m_stocks = re.search('href="(.+)">(.+)<\/a', stock)
                item = GubaItem()
                item['content'] = {}
                url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                item['content']['guba_url'] = url_stock
                item['content']['guba_name'] = m_stocks.group(2)

                yield Request(url=url_stock,
                              meta={'item': item},
                              callback=self.parse_page_num)

        # industry forums
        elif type == 3:
            stocks = hxs.xpath('//ul[@class="ngblistitemul"]/li/a').extract()
            for stock in stocks:
                m_stocks = re.search('href="(.+)">(.+)<\/a', stock)
                item = GubaItem()
                item['content'] = {}
                url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                item['content']['guba_url'] = url_stock
                item['content']['guba_name'] = m_stocks.group(2)

                yield Request(url=url_stock,
                              meta={'item': item},
                              callback=self.parse_page_num)

        # concept forums
        elif type == 4:
            stocks = hxs.xpath('//ul[@class="ngblistitemul"]/li/a').extract()
            for stock in stocks:
                m_stocks = re.search('href="(.+)">(.+)<\/a', stock)
                item = GubaItem()
                item['content'] = {}
                url_stock = "http://guba.eastmoney.com/" + m_stocks.group(1)
                item['content']['guba_url'] = url_stock
                item['content']['guba_name'] = m_stocks.group(2)

                yield Request(url=url_stock,
                              meta={'item': item},
                              callback=self.parse_page_num)

    # parse the number of listing pages in each forum
    def parse_page_num(self, response):
        item = response.meta['item']
        #forum_url = response.meta['forum_url']
        hxs = Selector(response)
        p = hxs.xpath('//div[@id="mainbody"]//div[@class="pager"]//@data-pager'
                      ).extract()[0]
        m = re.search('(.*_)\|(.*)\|(.+)\|(.*)', p)
        postnums = m.group(2)
        heads = m.group(1)
        #sfnums = headnums.group(1)

        item['content']['postnums'] = int(postnums)
        #item['content']['s&f_nums'] = sfnums

        if item['content']['postnums'] % 80 == 0:
            ptotal = item['content']['postnums'] / 80
        else:
            ptotal = int(item['content']['postnums'] / 80) + 1

        if int(ptotal) == 0:
            yield item
        else:
            for i in range(int(ptotal)):
                p_url = "http://guba.eastmoney.com/" + heads + str(i) + ".html"
                yield Request(p_url,
                              meta={'item': item},
                              callback=self.parse_post_list)

    # crawl each sub-forum's post counts and paginate
    def parse_post_list(self, response):
        hxs = Selector(response)
        posts = hxs.xpath('//div[@class="articleh"]').extract()
        item = response.meta['item']
        for post in posts:
            readnum = Selector(
                text=post).xpath('//span[@class="l1"]/text()').extract()
            if readnum:
                readnum = readnum[0]
                item['content']['readnum'] = readnum
            replynum = Selector(
                text=post).xpath('//span[@class="l2"]/text()').extract()
            if replynum:
                replynum = replynum[0]
                item['content']['replynum'] = replynum
            url = Selector(
                text=post).xpath('//span[@class="l3"]/a/@href').extract()
            if url:
                url = url[0]
                guba_id = re.search(',(.+)_\d+\.html', response.url).group(1)
                if guba_id in url:
                    m_stock = re.search("(^\/.+)", url)
                    if m_stock:
                        post_url = "http://guba.eastmoney.com" + m_stock.group(
                            1)
                        item['url'] = post_url
                        post_id = re.search('\/(n.+)\.html', url).group(1)
                        item['content']['post_id'] = post_id
                        yield Request(url=post_url,
                                      meta={
                                          'item': copy.deepcopy(item),
                                          'replynum': replynum
                                      },
                                      callback=self.parse_post)

    def parse_post(self, response):
        try:
            if response.status == 200:
                try:
                    filter_body = response.body.decode('utf8')
                except:
                    try:
                        filter_body = response.body.decode("gbk")
                    except:
                        try:
                            filter_body = response.body.decode("gb2312")
                        except Exception as ex:
                            print("Decode webpage failed: " + response.url)
                            return
                filter_body = re.sub('<[A-Z]+[0-9]*[^>]*>|</[A-Z]+[^>]*>', '',
                                     filter_body)
                response = response.replace(body=filter_body)
                hxs = Selector(response)
                item = response.meta['item']
                dt = hxs.xpath('//div[@class="zwfbtime"]/text()').extract()[0]
                dt = re.search('\D+(\d{4}-\d{2}-.+:\d{2})', dt).group(1)
                create_time = datetime.strptime(dt, "%Y-%m-%d %H:%M:%S")
                item['content']['create_time'] = create_time

                author_url = hxs.xpath(
                    '//div[@id="zwconttbn"]/strong/a/@href').extract()[0]
                item['content']['author_url'] = author_url
                try:  # regular posts
                    postcontent = hxs.xpath(
                        '//div[@id="zwconbody"]/div[@class="stockcodec"]/text()'
                    ).extract()[0].strip()
                    if postcontent:
                        item['content']['content'] = postcontent

                    postitle = hxs.xpath(
                        '//div[@class="zwcontentmain"]/div[@id="zwconttbt"]/text()'
                    ).extract()[0].strip()
                    item['content']['title'] = postitle
                except:  # Q&A posts
                    try:
                        postcontent = hxs.xpath(
                            '//div[@class="qa"]//div[contains(@class,"content")]/text()'
                        ).extract()
                        postquestion = postcontent[0]
                        postanswer = postcontent[2].strip(
                        ) + postcontent[3].strip()
                        item['content']['content'] = postquestion
                        item['content']['answer'] = postanswer

                        postanswer_time = hxs.xpath(
                            '//div[@class="sign"]/text()').extract()
                        try:
                            postanswer_time = hxs.xpath(
                                '//div[@class="sign"]/text()').extract()
                            postanswer_time = re.search(
                                '\D+(\d{4}-\d{2}-.+:\d{2})',
                                postanswer_time[1].strip()).group(1)
                            answer_time = datetime.strptime(
                                postanswer_time, "%Y-%m-%d %H:%M:%S")
                            item['content']['answer_time'] = answer_time
                        except Exception as ex:
                            item['content']['answer_time'] = None

                        postitle = "Q&A"
                        item['content']['title'] = postitle
                    except Exception as ex:
                        print("Parse Exception: " + response.url)
                        return

                replynum = response.meta['replynum']
                item['content']['reply'] = []
                if int(replynum) % 30 == 0:
                    rptotal = int(int(replynum) / 30)

                else:
                    rptotal = int(int(replynum) / 30) + 1

                if rptotal > 0:
                    head = re.search('(.+)\.html', response.url).group(1)
                    reply_url = head + "_" + str(1) + ".html"
                    yield Request(url=reply_url,
                                  meta={
                                      'item': item,
                                      'page': 1,
                                      'rptotal': rptotal,
                                      'head': head
                                  },
                                  callback=self.parse_reply)
                else:
                    yield item

        except Exception as ex:
            self.logger.warn('Parse Exception all: %s %s' %
                             (str(ex), response.url))

    def parse_reply(self, response):
        page = response.meta['page']
        rptotal = response.meta['rptotal']
        item = response.meta['item']
        head = response.meta['head']
        hxs = Selector(response)

        replists = hxs.xpath(
            '//div[@id="zwlist"]/div[@class="zwli clearfix"]').extract()
        for replist in replists:
            reply = {}
            try:
                reply_author = Selector(text=replist).xpath(
                    '//div[@class="zwlianame"]//a/text()').extract()[0]
                reply['reply_author'] = reply_author
                reply_author_url = Selector(text=replist).xpath(
                    '//div[@class="zwlianame"]//a/@href').extract()[0]
                reply['reply_author_url'] = reply_author_url
            except:
                try:
                    reply_author = Selector(text=replist).xpath(
                        '//span[@class="zwnick"]/span').extract()[0]
                    reply_author = re.search('"gray">(.+)<\/span>',
                                             reply_author).group(1)
                    reply['reply_author'] = reply_author
                except Exception as ex:
                    print("Decode webpage failed: " + response.url)
                    return

            reply_time = Selector(text=replist).xpath(
                '//div[@class="zwlitime"]/text()').extract()[0]
            reply_time = re.search('\D+(\d{4}-\d{2}-.+:\d{2})',
                                   reply_time).group(1)
            reply_time = datetime.strptime(reply_time, "%Y-%m-%d %H:%M:%S")
            reply['reply_time'] = reply_time

            reply_content = Selector(text=replist).xpath(
                '//div[@class="zwlitext stockcodec"]/text()').extract()
            if reply_content:
                reply['reply_content'] = reply_content[0].strip()

            reply_quote_author = Selector(text=replist).xpath(
                '//div[@class="zwlitalkboxtext "]//a/text()').extract()
            if reply_quote_author:
                reply_quote_author = reply_quote_author[0]
                reply['reply_quote_author'] = reply_quote_author

            reply_quote_author_url = Selector(text=replist).xpath(
                '//div[@class="zwlitalkboxtext "]//a/@href').extract()
            if reply_quote_author_url:
                reply_quote_author_url = reply_quote_author_url[0]
                reply['reply_quote_author_url'] = reply_quote_author_url

            reply_quote_text = Selector(text=replist).xpath(
                '//div[@class= "zwlitalkboxtext "]/span/text()').extract()
            if reply_quote_text:
                reply_quote_text = reply_quote_text[0]
                reply['reply_quote_content'] = reply_quote_text

            item['content']['reply'].append(reply)

        if page == rptotal:
            yield item
        elif page < rptotal:
            reply_url = head + "_" + str(page + 1) + ".html"
            yield Request(url=reply_url,
                          meta={
                              'item': item,
                              'rptotal': rptotal,
                              'page': page + 1,
                              'head': head
                          },
                          callback=self.parse_reply)
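
# A minimal sketch of the data-pager parsing and page math in parse_page_num()
# above; the attribute string is a made-up example in the same pipe-separated
# shape implied by the regex, not a value captured from the site.
import re

# Hypothetical @data-pager value: "<url head ending in _>|<post count>|<page size>|<current page>"
p = "list,600029_|23456|80|1"
m = re.search(r'(.*_)\|(.*)\|(.+)\|(.*)', p)
heads, postnums = m.group(1), int(m.group(2))

# 80 posts per listing page, rounded up.
ptotal = postnums // 80 if postnums % 80 == 0 else postnums // 80 + 1
print(heads, postnums, ptotal)  # list,600029_ 23456 294
print("http://guba.eastmoney.com/" + heads + "0.html")  # first listing page URL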
Example #28
#!/usr/bin/python
# -*- coding: utf-8 -*-
import os
import logging
from datetime import datetime, timedelta
from twisted.web._newclient import ResponseNeverReceived
from twisted.internet.error import TimeoutError, ConnectionRefusedError, ConnectError
from crawler import fetch_free_proxyes
from crawler.spiders import util
from crawler.settings import *

logger = util.set_logger("http_proxy", LOG_FILE_PROXY)


class HttpProxyMiddleware(object):
    # treat these error types as the proxy being unusable and drop it right away, instead of passing the request on to RetryMiddleware
    DONT_RETRY_ERRORS = (TimeoutError, ConnectionRefusedError,
                         ResponseNeverReceived, ConnectError, ValueError)

    def __init__(self, settings):
        # time of the last direct (no-proxy) connection
        self.last_no_proxy_time = datetime.now()
        # switch back to a direct connection after this many minutes, since proxies slow the crawl down
        self.recover_interval = 20
        # if a proxy keeps timing out before its use count reaches this number, remove it permanently; set to 0 to never modify the proxy file
        self.dump_count_threshold = 20
        # file holding the proxy list, one proxy per line as proto://ip:port; the file gets modified, so keep a backup
        self.proxy_file = "proxyes.dat"
        # whether to invalidate a proxy when requests through it time out
        self.invalid_proxy_flag = True
        # when the number of valid proxies (the direct connection included) drops below this, fetch new proxies from the web; it can be set to however many proxies are needed so that each IP gets enough rest after being asked for a captcha