Example #1
    def parse(self, response):
        # print(response.body)
        item = DoubanItem()
        selector = Selector(response)
        sel = selector.xpath('//div[@class="bd doulist-subject"]')
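        # each matched div is one entry on the Douban doulist page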
        for each in sel:
            title = each.xpath(
                'div[@class="title"]/a/text()').extract()[0].replace(
                    ' ', '').replace('\n', '')
            url = each.xpath(
                'div[@class="title"]/a/@href').extract()[0].replace(
                    ' ', '').replace('\n', '')
            rate = each.xpath(
                'div[@class="rating"]/span[@class="rating_nums"]/text()'
            ).extract()[0].replace(' ', '').replace('\n', '')
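            # the author line is plain text inside div.abstract, so pull it with a regex over the block's raw HTML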
            autor = re.search('<div class="abstract">(.*?)<br', each.extract(),
                              re.S).group(1).replace(' ',
                                                     '').replace('\n', '')

            item["title"] = title
            item["rate"] = rate
            item["autor"] = autor
            item["url"] = url

            # yield item
            yield http.Request(url=item["url"],
                               meta={'item': item},
                               callback=self.parseDetail,
                               dont_filter=True)

        nextPage = selector.xpath('//span[@class="next"]/link/@href').extract()
        if nextPage:
            next_url = nextPage[0]
            yield http.Request(next_url, callback=self.parse)
Example #2
    def parse(self, response):
        logger.debug("Parsing Response")
        json_response = json.loads(response.text)
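        # the response JSON carries rendered tweet HTML in items_html and a paging cursor in min_position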
        for item in parse_tweets(json_response['items_html']):
            # yield item
            try:
                yield http.Request(
                    self.user_popup_url.format(item["user_id"]),
                    callback=parse_users,
                    errback=self.errBack,
                    #    dont_filter=True
                )
            except Exception as e:
                logger.error(e)

        refresh_cursor = json_response['min_position']
        logger.debug("Cursor_Position: " + refresh_cursor)
        new_url = self.create_query(refresh_cursor)
        logger.info("New URL: %s" % new_url)

        yield http.Request(
            new_url,
            callback=self.parse,
            errback=self.errBack,
            #    dont_filter=True
        )
Example #3
    def parse(self, response):
        sel = Selector(response)
        # sites = sel.xpath('//div[@class="name"]/a')
        sites = sel.css('div.product-grid > div')
        items = []
        for site in sites:
            item = DmozItem()
            title = site.css('div.name > a::text').extract()[0]
            link = site.css('div.name > a::attr("href")').extract()[0]
            des = site.css('div.description::text').extract()[0]
            price = site.css('div.price::text').extract()[0].replace(
                ' ', '').replace('\n', '').replace('\r', '')

            item['title'] = title
            item['link'] = link
            # item['desc'] = des
            item['price'] = price
            items.append(item)
            yield http.Request(url=item["link"], meta={'item': item}, callback=self.parseDetail, dont_filter=True)
            # yield item

        pages = sel.xpath('//div[@class="links"]/a/@href').extract()
        if len(pages) >= 2:
            # the second-to-last pagination link points to the next page
            yield http.Request(pages[-2], callback=self.parse)
Example #4
    def parse_page(self, response):
        logger.info(response)
        cookies = self.http_client.cookies.get_dict()
        # parse the JSON response body
        if response.body and isinstance(response.body, bytes):
            data = json.loads(response.body.decode('utf-8'))
            rows = data['data']['table']['rows']
            for row in rows:
                published_app = PublishedApp.from_row_data(row)
                for version in self.parse_app_versions(published_app):
                    yield version
                yield published_app
            # an empty page means the end was reached; otherwise follow the
            # next page, carrying the external session's cookies along
            if rows:
                next_page_index = data['data']['table']['pagination']['current'] + 1
                url, h = self.gen_next_page(next_page_index)
                yield http.Request(url,
                                   headers=h,
                                   cookies=cookies,
                                   meta={'dont_merge_cookies': True},
                                   callback=self.parse_page)
Example #5
    def start_query_request(self, cursor=None):
        """
        Generate the search request
        """
        if cursor:
            url = self.url + "&cursor={cursor}"
            url = url.format(query=quote(self.query), cursor=quote(cursor))
        else:
            url = self.url.format(query=quote(self.query))
        request = http.Request(url,
                               callback=self.parse_result_page,
                               cookies=self.cookies,
                               headers=self.headers)
        yield request

        self.num_search_issued += 1
        if self.num_search_issued % 100 == 0:
            # get new SeleniumMiddleware
            for m in self.crawler.engine.downloader.middleware.middlewares:
                if isinstance(m, SeleniumMiddleware):
                    m.spider_closed()
            self.crawler.engine.downloader.middleware = DownloaderMiddlewareManager.from_crawler(
                self.crawler)
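            # a fresh middleware manager replaces the closed Selenium driver;
            # the request below then renews the session cookies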
            # update cookies
            yield SeleniumRequest(url="https://twitter.com/explore",
                                  callback=self.update_cookies,
                                  dont_filter=True)
Example #6
    def parse_tweet_page(self, response):
        # handle current page
        emoji = response.meta['emoji']
        try:
            data = json.loads(response.body.decode("utf-8"))
        except ValueError:
            # body was not valid JSON: retry the same URL with a fresh User-Agent
            yield http.Request(response.url,
                               headers={"User-Agent": random.choice(user_agent_list)},
                               meta={'tmpurl': response.meta['tmpurl'], 'emoji': emoji,
                                     'proxy': SETTINGS['PROXY']},
                               callback=self.parse_tweet_page)
            return

        for item in self.parse_tweets_block(data['items_html']):
            url = self.converUrl % item['url']
            parse_page = partial(self.parse_page, item)
            yield http.Request(url, callback=parse_page)

        # get next page
        min_position = data['min_position'].replace("+", "%2B")
        url = response.meta['tmpurl'] + min_position
        yield http.Request(url,
                           headers={"User-Agent": random.choice(user_agent_list)},
                           meta={'tmpurl': response.meta['tmpurl'], 'emoji': emoji,
                                 'proxy': SETTINGS['PROXY']},
                           callback=self.parse_tweet_page)
Example #7
    def start_requests(self):
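        # self.state is Scrapy's persistent spider state, kept across runs when the crawl uses JOBDIR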
        if hasattr(self, 'state'):
            self.currentIteration = self.state.get('iteration_num', 1)
            if self.currentIteration > self.totalIterations:
                print("The job has been finished, hence we do nothing here.")
                return
            self.crawled_stuidx = self.state.get('crawled_stuidx', -1)
        print("Start from iterations %d" % self.currentIteration)

        for i, url in enumerate(self.start_urls):
            if i <= self.crawled_stuidx:
                continue
            print("start requesting \n%s" % url)

            # necessary headers to get json response from twitter
            headers = {
                "Accept": "application/json, text/javascript, */*; q=0.01",
                "x-push-state-request": "true",
                "accept-encoding": "gzip, deflate, br",
                "accept-language": "en"
            }
            # dont_filter=True allows duplicate requests for this URL
            yield http.Request(url,
                               method='GET',
                               headers=headers,
                               dont_filter=True,
                               meta={"query": self.queries[i]})
            if hasattr(self, 'state'):
                self.state['crawled_stuidx'] = self.state.get(
                    'crawled_stuidx', -1) + 1
Example #8
    def parse(self, response):
        total_page_number = 177
        url = 'http://ieeexplore.ieee.org/rest/publication'
        page_number = 1
        header = {
            'Accept': 'application/json, text/plain, */*',
            'Content-Type': 'application/json;charset=UTF-8'
        }
        body = {
            "contentType": "conferences",
            "tabId": "topic",
            "publisher": "",
            "collection": "",
            "pageNumber": str(page_number),
            "selectedValue": "4291946551"
        }
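        # page through the REST endpoint by bumping pageNumber in the POSTed body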

        for _ in range(total_page_number):
            yield http.Request(url,
                               method="POST",
                               body=json.dumps(body),
                               headers=header,
                               callback=self.parse_records,
                               dont_filter=True)
            page_number += 1
            body["pageNumber"] = str(page_number)
    def parse(self, response):

        nbRes1 = Selector(response).xpath(
            '//*[@id="modalWrapper"]/div[2]/div[3]/div[3]/div[1]/div/text()'
        ).extract()
        nbRes2 = Selector(response).xpath(
            '//*[@id="modalWrapper"]/div[2]/div[4]/div[3]/div[1]/div/text()'
        ).extract()
        tmp = nbRes1 + nbRes2
        self.logger.info("\n\n\n>>>NbAnnonces=%s" % tmp)

        if tmp:
            nbRes = int(tmp[0].split(' annonces')[0])
        else:
            # no result count found: yield an empty item and stop,
            # otherwise nbRes below would be undefined
            item = SelogercrawlerItem()
            yield item
            return
        nbResPages = int(ceil(nbRes / 20))  # 20 res/page
        url = response.url
        headers = {
            'Accept':
            'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
            'Accept-Language':
            'en',
            'User-Agent':
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36'
        }

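        # one request per result page; results are paged via the LISTING-LISTpg query parameter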
        for i in range(1, nbResPages + 1):
            yield http.Request(url=url + '&LISTING-LISTpg=' + str(i),
                               dont_filter=True,
                               callback=self.parse2,
                               headers=headers)
Example #10
    def start_requests(self):
        # cannot connect the signal in __init__ because the crawler is not bound to the spider until after construction
        self.crawler.signals.connect(self.spider_closed,
                                     signal=signals.spider_closed)
        self.limit = self.settings.get('SPIDER_FOLLOWING_LIMIT', 2000)

        yield http.Request("https://twitter.com/login?lang=en", \
                        meta={'cookiejar': 1}, callback=self.pre_login)
Example #11
    def pre_login(self, response):
        script_url = "https://twitter.com/i/js_inst?c_name=ui_metrics"
        yield http.Request(script_url,
                           meta={
                               'cookiejar': 1,
                               'response': response
                           },
                           callback=self.login)
Example #12
    def parse_page(self, response):
        data = json.loads(response.body)
        for item in self.parse_tweets_block(data['items_html']):
            yield item

        min_position = data['min_position']
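        # the cursor is substituted into the next page's URL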
        url = self.url % (parse.quote(self.query), min_position)
        yield http.Request(url, callback=self.parse_page, meta=proxies)
Example #13
    def start_requests(self):
        url, h = self.gen_next_page(0)
        cookies = self.http_client.cookies.get_dict()
        yield http.Request(url,
                           headers=h,
                           cookies=cookies,
                           meta={'dont_merge_cookies': True},
                           callback=self.parse_page)
Example #14
    def start_requests(self):
        # 13 real-time stream categories
        #urls = ['https://twitter.com', 'https://twitter.com/i/streams/category/686639666771046402',]
        #urls = ['https://twitter.com/i/streams/category/686639666779394057', 'https://twitter.com/i/streams/category/686639666779426835',]
        #urls = ['https://twitter.com/i/streams/category/686639666779394055', 'https://twitter.com/i/streams/category/686639666779426842',]
        #urls = ['https://twitter.com/i/streams/category/686639666779426845', 'https://twitter.com/i/streams/category/686639666779394072']
        #urls = ['https://twitter.com/i/streams/category/690675490684678145', 'https://twitter.com/i/streams/category/692079932940259328']
        #urls = ['https://twitter.com/i/streams/category/788602775839965184', 'https://twitter.com/i/streams/category/841388582518562816']
        #urls = ['https://twitter.com/i/streams/category/841390443338309632']

        # 88 celebrity accounts
        #urls = ['https://twitter.com/BarackObama', 'https://twitter.com/BillClinton']
        #urls = ['https://twitter.com/HillaryClinton','https://twitter.com/FLOTUS']
        #urls = ['https://twitter.com/mike_pence','https://twitter.com/KellyannePolls']
        #urls = ['https://twitter.com/MichelleObama','https://twitter.com/Pontifex']
        #urls = ['https://twitter.com/Queen_Europe','https://twitter.com/BillGates']
        #urls = ['https://twitter.com/David_Cameron','https://twitter.com/JeffBezos']
        #urls = ['https://twitter.com/narendramodi','https://twitter.com/Cristiano']
        #urls = ['https://twitter.com/KingJames','https://twitter.com/rogerfederer']
        #urls = ['https://twitter.com/neymarjr','https://twitter.com/RafaelNadal']
        #urls = ['https://twitter.com/StephenCurry30','https://twitter.com/DjokerNole']
        #urls = ['https://twitter.com/RondaRousey','https://twitter.com/serenawilliams']
        #urls = ['https://twitter.com/MariaSharapova','https://twitter.com/TheNotoriousMMA']
        #urls = ['https://twitter.com/kobebryant','https://twitter.com/KDTrey5']
        #urls = ['https://twitter.com/FloydMayweather','https://twitter.com/GalGadot']
        #urls = ['https://twitter.com/EmmaWatson','https://twitter.com/lizasoberano']
        #urls = ['https://twitter.com/NargisFakhri','https://twitter.com/russellcrowe']
        #urls = ['https://twitter.com/McConaughey','https://twitter.com/LeoDiCaprio']
        #urls = ['https://twitter.com/realdepp','https://twitter.com/RobertDowneyJr']
        #urls = ['https://twitter.com/TomCruise','https://twitter.com/justinbieber']
        #urls = ['https://twitter.com/khloekardashian','https://twitter.com/kourtneykardash']
        #urls = ['https://twitter.com/KendallJenner','https://twitter.com/nytimes']
        #urls = ['https://twitter.com/cnnbrk','https://twitter.com/BBCBreaking']
        #urls = ['https://twitter.com/Google','https://twitter.com/FoxNews']
        #urls = ['https://twitter.com/WhiteHouse','https://twitter.com/ABC']
        #urls = ['https://twitter.com/ImRaina','https://twitter.com/BreakingNews']
        #urls = ['https://twitter.com/gmail','https://twitter.com/Aly_Raisman']
        #urls = ['https://twitter.com/JohnWall','https://twitter.com/JHarden13']
        #urls = ['https://twitter.com/espn','https://twitter.com/CNN']
        #urls = ['https://twitter.com/NBA','https://twitter.com/GameOfThrones']
        #urls = ['https://twitter.com/SamuelLJackson','https://twitter.com/AntDavis23']
        #urls = ['https://twitter.com/jtimberlake','https://twitter.com/tomhanks']
        #urls = ['https://twitter.com/mark_wahlberg','https://twitter.com/danielwuyanzu']
        #urls = ['https://twitter.com/TheLewisTan','https://twitter.com/adamcarolla']
        #urls = ['https://twitter.com/KyrieIrving','https://twitter.com/NiallOfficial']
        #urls = ['https://twitter.com/chelseahandler','https://twitter.com/twhiddleston']
        #urls = ['https://twitter.com/taylorswift13','https://twitter.com/chrishemsworth']
        #urls = ['https://twitter.com/MarvelStudios','https://twitter.com/Schwarzenegger']
        #urls = ['https://twitter.com/JimCameron','https://twitter.com/TheRock']
        #urls = ['https://twitter.com/OfficialKat', 'https://twitter.com/SophieT']
        #urls = ['https://twitter.com/Maisie_Williams','https://twitter.com/susmitchakrabo1']
        #urls = ['https://twitter.com/ladygaga','https://twitter.com/katyperry']
        #urls = ['https://twitter.com/KimKardashian','https://twitter.com/aplusk']
        #urls = ['https://twitter.com/rihanna','https://twitter.com/justdemi']
        #urls = ['https://twitter.com/rustyrockets','https://twitter.com/MileyCyrus']

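        # NOTE: exactly one of the `urls` lists above must be uncommented,
        # otherwise the loop below raises a NameError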
        for url in urls:
            yield http.Request(url, callback=self.parse_page)
Example #15
    def OY_page(self, response):

        data = json.loads(response.body.decode("utf-8"))
        for item in self.OY_twets_page(data['items_html']):
            yield item

        min_position = data['min_position'].replace("+","%2B")
        url = self.url % (quote(self.OYprams), min_position)
        yield http.Request(url, callback=self.OY_page)
Example #16
    def parse_page(self, response):
        data = json.loads(response.body.decode("utf-8"))
        for item in self.parse_tweets_block(data['items_html']):
            yield item

        min_position = data['min_position']
        min_position = min_position.replace("+", "%2B")
        url = self.url % (quote(self.query), min_position)
        yield http.Request(url, callback=self.parse_page)
Example #17
    def parse_page(self, response):
        data = json.loads(response.body)
        for item in self.parse_tweets_block(data['items_html']):
            yield item

        # get next page
        self.min_position = data['min_position']
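        # stored on the instance so the latest cursor is visible outside this callback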
        url = self.url % (quote(' '.join(self.query.split(','))), self.min_position)
        url += '&oldest_unread_id=0&reset_error_state=false'
        yield http.Request(url, callback=self.parse_page)
Example #18
    def gen_request(self, meta, callback, max_pos='-1'):
        data = {
            'include_available_features': '1',
            'include_entities': '0',
            'max_position': max_pos,
            'reset_error_state': 'false'
        }
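        # query parameters for the following/users timeline; max_position is the paging cursor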

        return http.Request("https://twitter.com/{}/following/users?".format(meta['name'])
                            + urlencode(data),
                            meta=meta, callback=callback)
Example #19
    def start_requests(self):
        if self.query:
            url = self.url % (quote(self.query), '')
            yield http.Request(url, callback=self.parse_page)
        else:
            for query in search_keywords:
                self.extract_more = True
                url = self.url % (quote(query), '')
                print(query)
                yield scrapy.Request(url, callback=self.parse_page, meta={'query': query})
Example #20
    def start_requests(self):
        if self.no_query:
            url = self.url % (quote(self.query), None, None)
        elif self.state:
            url = self.url % (quote(self.query), None, None)
        elif self.date:
            url = self.url % (quote(self.query), None, None)
        else:
            url = self.url % (quote(self.query), None)

        yield http.Request(url, callback=self.parse_page)
Example #21
    def start_requests(self):
        if not model_util.did_site_run(sites.HN):
            model_util.set_site_ran(sites.HN)

            yield http.Request("https://news.ycombinator.com/newest",
                               meta={
                                   'type': 'page',
                                   'num': 1
                               })
        else:
            log.debug(u"Not running HN, too soon")
Example #22
    def parse_page(self, response):
        # inspect_response(response)
        # handle current page
        data = json.loads(response.body)
        for item in self.parse_tweets_block(data['items_html']):
            yield item

        # get next page
        min_position = data['min_position']
        url = self.url % (quote(self.query), min_position)
        yield http.Request(url, callback=self.parse_page)
Example #23
    def start_requests(self):
        #yield http.Request("http://www.4chan.org", meta={'type': 'page', 'num': 1})

        for i in range(20):
            yield http.Request(
                "https://www.tumblr.com/svc/discover/posts?offset={}&askingForPage={}&limit={}&type=trending&with_form_key=true"
                .format(i * 20, i + 1, 20),
                headers={"x-requested-with": "XMLHttpRequest"},
                meta={
                    'type': 'page',
                    'num': 1
                })
Example #24
    def parse_page(self, response):
        data = json.loads(response.body.decode("utf-8"))
        logger.debug(f'parsed page from {response.request.url}')
        for item in self.parse_tweets_block(data['items_html']):
            logger.debug(f"parsed item {item}")
            yield item

        min_position = data['min_position']
        min_position = min_position.replace("+", "%2B")
        url = self.url % (quote(self.query), min_position)
        logger.debug('requesting next page')
        yield http.Request(url, callback=self.parse_page)
Example #25
    def start_requests(self):
        for url in self.start_urls:
            # start_urls look like https://twitter.com/<screen_name>/status/<conversation_id>
            conversationId_index = url.rindex("/")
            screenName_index = url.index("/", len("https://twitter.com/"))
            screenName = url[len("https://twitter.com/"):screenName_index]
            yield http.Request(url,
                               dont_filter=True,
                               meta={
                                   "conversationId": url[conversationId_index + 1:],
                                   "screenName": screenName
                               })
Example #26
    def parse_page(self, response):
        # inspect_response(response, self)
        # handle current page
        data = json.loads(response.body.decode("utf-8"))
        for item in self.parse_tweets_block(data['items_html']):
            yield item

        # get next page
        min_position = data['min_position']
        min_position = min_position.replace("+", "%2B")
        url = self.url % (quote(self.query), min_position)
        yield http.Request(url, callback=self.parse_page)
Example #27
    def start_requests(self):

        while True:
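            # page through stored tweets in _id order, 1000 at a time, resuming after the last seen _id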
            for tweet in self.tweetCollection.find({
                    '_id': {
                        '$gt': self.start_tweet_id
                    }
            }).sort('_id', pymongo.ASCENDING).limit(1000):
                url = self.url % tweet['url']
                self.start_tweet_id = tweet['_id']
                parse_page = partial(self.parse_page, tweet)
                yield http.Request(url, callback=parse_page)
Example #28
    def parse_page(self, response):
        # when this callback fires, first parse the returned JSON;
        # items_html holds this page's rendered content
        data = json.loads(response.body.decode('utf-8'))
        for item in self.prase_tweets_block(data['items_html']):
            yield item

        # min_position in the JSON is the cursor for the next page; it becomes the max_position parameter of the next URL
        min_position = data['min_position']
        url = self.url % (quote(self.query), min_position)
        logger.debug('Prepare to crawl A NEW PAGE with keywords[' +
                     self.query + '] and min_position[' + min_position + ']')
        yield http.Request(url, callback=self.parse_page)
Example #29
    def parse_page(self, response):
        # inspect_response(response, self)
        # handle current page
        data = json.loads(response.body.decode("utf-8"))
        for item in self.parse_tweets_block(data['items_html']):
            yield item

        print(self.crawler.stats.get_value('item_scraped_count', 0))
        # get next page
        min_position = data['min_position']
        url = self.url % (quote(self.query), min_position)
        print('-> to Twitter next page: {}'.format(url))
        yield http.Request(url, callback=self.parse_page)
Example #30
    def parse(self, response):
        sel = Selector(response)
        # print(response.body)
        sites = sel.xpath('//div[@class="search_right_item ml10"]/div/a')
        # print(sites.extract())
        items = []
        for site in sites:
            item = Tianyan()
            link = site.xpath('@href').extract()[0]
            name = site.xpath('span/text()').extract()[0]
            print(name)
            item['link'] = link
            item['name'] = name
            items.append(item)
            yield http.Request(url=item["link"],
                               meta={'item': item},
                               headers=self.header,
                               cookies=self.cookie,
                               callback=self.parseDetail,
                               dont_filter=True)
            # yield item
        print(sel.xpath('//div[@class="search_pager human_pager in-block"]/ul/li[@class="pagination-next ng-scope "]/a').extract())
        nextPage = sel.xpath('//div[@class="search_pager human_pager in-block"]/ul/li[@class="pagination-next ng-scope "]/a/@href').extract()
        if nextPage:
            next_url = nextPage[0]
            yield http.Request(next_url, callback=self.parse)