def parse(self, response):
    # print(response.body)
    selector = Selector(response)
    sel = selector.xpath('//div[@class="bd doulist-subject"]')
    for each in sel:
        # create a fresh item for every entry so each Request carries its own data
        item = DoubanItem()
        title = each.xpath(
            'div[@class="title"]/a/text()').extract()[0].replace(
                ' ', '').replace('\n', '')
        url = each.xpath(
            'div[@class="title"]/a/@href').extract()[0].replace(
                ' ', '').replace('\n', '')
        rate = each.xpath(
            'div[@class="rating"]/span[@class="rating_nums"]/text()'
        ).extract()[0].replace(' ', '').replace('\n', '')
        autor = re.search('<div class="abstract">(.*?)<br',
                          each.extract(),
                          re.S).group(1).replace(' ', '').replace('\n', '')
        item["title"] = title
        item["rate"] = rate
        item["autor"] = autor
        item["url"] = url
        # yield item
        yield http.Request(url=item["url"], meta={'item': item},
                           callback=self.parseDetail, dont_filter=True)
    nextPage = selector.xpath('//span[@class="next"]/link/@href').extract()
    if nextPage:
        next = nextPage[0]
        yield http.Request(next, callback=self.parse)
def parse(self, response):
    logger.debug("Parsing Response")
    json_response = json.loads(response.text)
    for item in parse_tweets(json_response['items_html']):
        # yield item
        try:
            yield http.Request(
                self.user_popup_url.format(item["user_id"]),
                callback=parse_users,
                errback=self.errBack,
                # dont_filter=True
            )
        except Exception as e:
            logger.error(e)
    refresh_cursor = json_response['min_position']
    logger.debug("Cursor_Position: " + refresh_cursor)
    new_url = self.create_query(refresh_cursor)
    logger.info("New URL: %s" % new_url)
    yield http.Request(
        new_url,
        callback=self.parse,
        errback=self.errBack,
        # dont_filter=True
    )
def parse(self, response):
    sel = Selector(response)
    # sites = sel.xpath('//div[@class="name"]/a')
    sites = sel.css('div.product-grid > div')
    items = []
    for site in sites:
        item = DmozItem()
        title = site.css('div.name > a::text').extract()[0]
        link = site.css('div.name > a::attr("href")').extract()[0]
        des = site.css('div.description::text').extract()[0]
        price = site.css('div.price::text').extract()[0].replace(
            ' ', '').replace('\n', '').replace('\r', '')
        item['title'] = title
        item['link'] = link
        # item['desc'] = des
        item['price'] = price
        items.append(item)
        yield http.Request(url=item["link"], meta={'item': item},
                           callback=self.parseDetail, dont_filter=True)
        # yield item
    nextPage = sel.xpath('//div[@class="links"]/a/@href').extract()[-2]
    if nextPage:
        next = nextPage
        yield http.Request(next, callback=self.parse)
def parse_page(self, response):
    logger.info(response)
    cookies = self.http_client.cookies.get_dict()
    # parse the response body
    if response.body and isinstance(response.body, bytes):
        data = json.loads(response.body.decode('utf-8'))
        rows = data['data']['table']['rows']
        for row in rows:
            published_app = PublishedApp.from_row_data(row)
            for version in self.parse_app_versions(published_app):
                yield version
            yield published_app
        if len(rows) == 0:
            yield None
        else:
            next_page_index = data['data']['table']['pagination']['current'] + 1
            url, h = self.gen_next_page(next_page_index)
            yield http.Request(url, headers=h, cookies=cookies,
                               meta={'dont_merge_cookies': True},
                               callback=self.parse_page)
    else:
        yield None
def start_query_request(self, cursor=None):
    """Generate the search request."""
    if cursor:
        url = self.url + "&cursor={cursor}"
        url = url.format(query=quote(self.query), cursor=quote(cursor))
    else:
        url = self.url.format(query=quote(self.query))
    request = http.Request(url,
                           callback=self.parse_result_page,
                           cookies=self.cookies,
                           headers=self.headers)
    yield request
    self.num_search_issued += 1
    if self.num_search_issued % 100 == 0:
        # get a new SeleniumMiddleware
        for m in self.crawler.engine.downloader.middleware.middlewares:
            if isinstance(m, SeleniumMiddleware):
                m.spider_closed()
        self.crawler.engine.downloader.middleware = \
            DownloaderMiddlewareManager.from_crawler(self.crawler)
        # update cookies
        yield SeleniumRequest(url="https://twitter.com/explore",
                              callback=self.update_cookies,
                              dont_filter=True)
def parse_tweet_page(self, response):
    # handle current page
    emoji = response.meta['emoji']
    try:
        data = json.loads(response.body.decode("utf-8"))
    except Exception:
        # body is not valid JSON: retry the same URL with a fresh User-Agent
        yield http.Request(response.url,
                           headers=[("User-Agent", random.choice(user_agent_list))],
                           meta={'tmpurl': response.meta['tmpurl'],
                                 'emoji': emoji,
                                 'proxy': SETTINGS['PROXY']},
                           callback=self.parse_tweet_page)
        return
    for item in self.parse_tweets_block(data['items_html']):
        url = self.converUrl % item['url']
        parse_page = partial(self.parse_page, item)
        yield http.Request(url, callback=parse_page)
    # get next page
    min_position = data['min_position'].replace("+", "%2B")
    url = response.meta['tmpurl'] + min_position
    yield http.Request(url,
                       headers=[("User-Agent", random.choice(user_agent_list))],
                       meta={'tmpurl': response.meta['tmpurl'],
                             'emoji': emoji,
                             'proxy': SETTINGS['PROXY']},
                       callback=self.parse_tweet_page)
def start_requests(self):
    if hasattr(self, 'state'):
        self.currentIteration = self.state.get('iteration_num', 1)
        if self.currentIteration > self.totalIterations:
            print("The job has finished, so there is nothing to do here.")
            return
        self.crawled_stuidx = self.state.get('crawled_stuidx', -1)
        print("Starting from iteration %d" % self.currentIteration)
    for i, url in enumerate(self.start_urls):
        if i <= self.crawled_stuidx:
            continue
        print("start requesting \n%s" % url)
        # headers required to get a JSON response from Twitter
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "x-push-state-request": "true",
            "accept-encoding": "gzip, deflate, br",
            "accept-language": "en"
        }
        # dont_filter=True allows duplicate requests to this URL
        yield http.Request(url, method='GET', headers=headers,
                           dont_filter=True, meta={"query": self.queries[i]})
        if hasattr(self, 'state'):
            self.state['crawled_stuidx'] = self.state.get(
                'crawled_stuidx', -1) + 1
def parse(self, response):
    total_page_number = 177
    url = 'http://ieeexplore.ieee.org/rest/publication'
    page_number = 1
    header = {
        'Accept': 'application/json, text/plain, */*',
        'Content-Type': 'application/json;charset=UTF-8'
    }
    body = {
        "contentType": "conferences",
        "tabId": "topic",
        "publisher": "",
        "collection": "",
        "pageNumber": str(page_number),
        "selectedValue": "4291946551"
    }
    for i in range(total_page_number):
        yield http.Request(url, method="POST", body=json.dumps(body),
                           headers=header, callback=self.parse_records,
                           dont_filter=True)
        page_number += 1
        body["pageNumber"] = str(page_number)
def parse(self, response):
    nbRes1 = Selector(response).xpath(
        '//*[@id="modalWrapper"]/div[2]/div[3]/div[3]/div[1]/div/text()'
    ).extract()
    nbRes2 = Selector(response).xpath(
        '//*[@id="modalWrapper"]/div[2]/div[4]/div[3]/div[1]/div/text()'
    ).extract()
    tmp = nbRes1 + nbRes2
    self.logger.info("\n\n\n>>>NbAnnonces=%s" % tmp)
    if tmp:
        nbRes = int(tmp[0].split(' annonces')[0])
    else:
        # no result count found: yield an empty item and stop
        item = SelogercrawlerItem()
        yield item
        return
    nbResPages = int(ceil(nbRes / 20))  # 20 results per page
    url = response.url
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'en',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36'
    }
    for i in range(1, nbResPages + 1):
        yield http.Request(url=url + '&LISTING-LISTpg=' + str(i),
                           dont_filter=True,
                           callback=self.parse2,
                           headers=headers)
def start_requests(self):
    # cannot connect in __init__ because the crawler is not bound yet at construction time
    self.crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
    self.limit = self.settings.get('SPIDER_FOLLOWING_LIMIT', 2000)
    yield http.Request("https://twitter.com/login?lang=en",
                       meta={'cookiejar': 1}, callback=self.pre_login)
def pre_login(self, response):
    script_url = "https://twitter.com/i/js_inst?c_name=ui_metrics"
    yield http.Request(script_url,
                       meta={'cookiejar': 1, 'response': response},
                       callback=self.login)
def parse_page(self, response):
    data = json.loads(response.body)
    for item in self.parse_tweets_block(data['items_html']):
        yield item
    min_position = data['min_position']
    url = self.url % (parse.quote(self.query), min_position)
    yield http.Request(url, callback=self.parse_page, meta=proxies)
def start_requests(self):
    url, h = self.gen_next_page(0)
    cookies = self.http_client.cookies.get_dict()
    yield http.Request(url, headers=h, cookies=cookies,
                       meta={'dont_merge_cookies': True},
                       callback=self.parse_page)
def start_requests(self):
    # Exactly one `urls` assignment should be active at a time; uncomment the
    # list of accounts or streams you want to crawl.
    # 13 real-time streams
    urls = ['https://twitter.com', 'https://twitter.com/i/streams/category/686639666771046402']
    #urls = ['https://twitter.com/i/streams/category/686639666779394057', 'https://twitter.com/i/streams/category/686639666779426835']
    #urls = ['https://twitter.com/i/streams/category/686639666779394055', 'https://twitter.com/i/streams/category/686639666779426842']
    #urls = ['https://twitter.com/i/streams/category/686639666779426845', 'https://twitter.com/i/streams/category/686639666779394072']
    #urls = ['https://twitter.com/i/streams/category/690675490684678145', 'https://twitter.com/i/streams/category/692079932940259328']
    #urls = ['https://twitter.com/i/streams/category/788602775839965184', 'https://twitter.com/i/streams/category/841388582518562816']
    #urls = ['https://twitter.com/i/streams/category/841390443338309632']
    # 88 celebrity accounts
    #urls = ['https://twitter.com/BarackObama', 'https://twitter.com/BillClinton']
    #urls = ['https://twitter.com/HillaryClinton', 'https://twitter.com/FLOTUS']
    #urls = ['https://twitter.com/mike_pence', 'https://twitter.com/KellyannePolls']
    #urls = ['https://twitter.com/MichelleObama', 'https://twitter.com/Pontifex']
    #urls = ['https://twitter.com/Queen_Europe', 'https://twitter.com/BillGates']
    #urls = ['https://twitter.com/David_Cameron', 'https://twitter.com/JeffBezos']
    #urls = ['https://twitter.com/narendramodi', 'https://twitter.com/Cristiano']
    #urls = ['https://twitter.com/KingJames', 'https://twitter.com/rogerfederer']
    #urls = ['https://twitter.com/neymarjr', 'https://twitter.com/RafaelNadal']
    #urls = ['https://twitter.com/StephenCurry30', 'https://twitter.com/DjokerNole']
    #urls = ['https://twitter.com/RondaRousey', 'https://twitter.com/serenawilliams']
    #urls = ['https://twitter.com/MariaSharapova', 'https://twitter.com/TheNotoriousMMA']
    #urls = ['https://twitter.com/kobebryant', 'https://twitter.com/KDTrey5']
    #urls = ['https://twitter.com/FloydMayweather', 'https://twitter.com/GalGadot']
    #urls = ['https://twitter.com/EmmaWatson', 'https://twitter.com/lizasoberano']
    #urls = ['https://twitter.com/NargisFakhri', 'https://twitter.com/russellcrowe']
    #urls = ['https://twitter.com/McConaughey', 'https://twitter.com/LeoDiCaprio']
    #urls = ['https://twitter.com/realdepp', 'https://twitter.com/RobertDowneyJr']
    #urls = ['https://twitter.com/TomCruise', 'https://twitter.com/justinbieber']
    #urls = ['https://twitter.com/khloekardashian', 'https://twitter.com/kourtneykardash']
    #urls = ['https://twitter.com/KendallJenner', 'https://twitter.com/nytimes']
    #urls = ['https://twitter.com/cnnbrk', 'https://twitter.com/BBCBreaking']
    #urls = ['https://twitter.com/Google', 'https://twitter.com/FoxNews']
    #urls = ['https://twitter.com/WhiteHouse', 'https://twitter.com/ABC']
    #urls = ['https://twitter.com/ImRaina', 'https://twitter.com/BreakingNews']
    #urls = ['https://twitter.com/gmail', 'https://twitter.com/Aly_Raisman']
    #urls = ['https://twitter.com/JohnWall', 'https://twitter.com/JHarden13']
    #urls = ['https://twitter.com/espn', 'https://twitter.com/CNN']
    #urls = ['https://twitter.com/NBA', 'https://twitter.com/GameOfThrones']
    #urls = ['https://twitter.com/SamuelLJackson', 'https://twitter.com/AntDavis23']
    #urls = ['https://twitter.com/jtimberlake', 'https://twitter.com/tomhanks']
    #urls = ['https://twitter.com/mark_wahlberg', 'https://twitter.com/danielwuyanzu']
    #urls = ['https://twitter.com/TheLewisTan', 'https://twitter.com/adamcarolla']
    #urls = ['https://twitter.com/KyrieIrving', 'https://twitter.com/NiallOfficial']
    #urls = ['https://twitter.com/chelseahandler', 'https://twitter.com/twhiddleston']
    #urls = ['https://twitter.com/taylorswift13', 'https://twitter.com/chrishemsworth']
    #urls = ['https://twitter.com/MarvelStudios', 'https://twitter.com/Schwarzenegger']
    #urls = ['https://twitter.com/JimCameron', 'https://twitter.com/TheRock']
    #urls = ['https://twitter.com/OfficialKat', 'https://twitter.com/SophieT']
    #urls = ['https://twitter.com/Maisie_Williams', 'https://twitter.com/susmitchakrabo1']
    #urls = ['https://twitter.com/ladygaga', 'https://twitter.com/katyperry']
    #urls = ['https://twitter.com/KimKardashian', 'https://twitter.com/aplusk']
    #urls = ['https://twitter.com/rihanna', 'https://twitter.com/justdemi']
    #urls = ['https://twitter.com/rustyrockets', 'https://twitter.com/MileyCyrus']
    for url in urls:
        yield http.Request(url, callback=self.parse_page)
def OY_page(self, response):
    data = json.loads(response.body.decode("utf-8"))
    for item in self.OY_twets_page(data['items_html']):
        yield item
    min_position = data['min_position'].replace("+", "%2B")
    url = self.url % (quote(self.OYprams), min_position)
    yield http.Request(url, callback=self.OY_page)
def parse_page(self, response):
    data = json.loads(response.body.decode("utf-8"))
    for item in self.parse_tweets_block(data['items_html']):
        yield item
    min_position = data['min_position']
    min_position = min_position.replace("+", "%2B")
    url = self.url % (quote(self.query), min_position)
    yield http.Request(url, callback=self.parse_page)
def parse_page(self, response):
    data = json.loads(response.body)
    for item in self.parse_tweets_block(data['items_html']):
        yield item
    # get next page
    self.min_position = data['min_position']
    url = self.url % (quote(' '.join(self.query.split(','))), self.min_position)
    url += '&oldest_unread_id=0&reset_error_state=false'
    yield http.Request(url, callback=self.parse_page)
def gen_request(self, meta, callback, max_pos='-1'):
    data = {
        'include_available_features': '1',
        'include_entities': '0',
        'max_position': max_pos,
        'reset_error_state': 'false'
    }
    return http.Request(
        "https://twitter.com/{}/following/users?".format(meta['name']) + urlencode(data),
        meta=meta, callback=callback)
def start_requests(self):
    if self.query:
        url = self.url % (quote(self.query), '')
        yield http.Request(url, callback=self.parse_page)
    else:
        for query in search_keywords:
            self.extract_more = True
            url = self.url % (quote(query), '')
            print(query)
            yield scrapy.Request(url, callback=self.parse_page,
                                 meta={'query': query})
def start_requests(self):
    if self.no_query:
        url = self.url % (quote(self.query), None, None)
    elif self.state:
        url = self.url % (quote(self.query), None, None)
    elif self.date:
        url = self.url % (quote(self.query), None, None)
    else:
        url = self.url % (quote(self.query), None)
    yield http.Request(url, callback=self.parse_page)
def start_requests(self):
    if not model_util.did_site_run(sites.HN):
        model_util.set_site_ran(sites.HN)
        yield http.Request("https://news.ycombinator.com/newest",
                           meta={'type': 'page', 'num': 1})
    else:
        log.debug(u"Not running HN, too soon")
def parse_page(self, response):
    # inspect_response(response)
    # handle current page
    data = json.loads(response.body)
    for item in self.parse_tweets_block(data['items_html']):
        yield item
    # get next page
    min_position = data['min_position']
    url = self.url % (quote(self.query), min_position)
    yield http.Request(url, callback=self.parse_page)
def start_requests(self):
    # yield http.Request("http://www.4chan.org", meta={'type': 'page', 'num': 1})
    for i in range(20):
        yield http.Request(
            "https://www.tumblr.com/svc/discover/posts?offset={}&askingForPage={}&limit={}&type=trending&with_form_key=true"
            .format(i * 20, i + 1, 20),
            headers={"x-requested-with": "XMLHttpRequest"},
            meta={'type': 'page', 'num': 1})
def parse_page(self, response):
    data = json.loads(response.body.decode("utf-8"))
    logging.debug(f'parsed page from {response.request.url}')
    for item in self.parse_tweets_block(data['items_html']):
        logging.debug(f"parsed item {item}")
        yield item
    min_position = data['min_position']
    min_position = min_position.replace("+", "%2B")
    url = self.url % (quote(self.query), min_position)
    logger.debug('requesting next page')
    yield http.Request(url, callback=self.parse_page)
def start_requests(self):
    for url in self.start_urls:
        conversationId_index = url.rindex("/")
        screenName_index = url.index("/", len("https://twitter.com/"))
        screenName = url[len("https://twitter.com/"):screenName_index]
        yield http.Request(url, dont_filter=True,
                           meta={"conversationId": url[conversationId_index + 1:],
                                 "screenName": screenName})
def parse_page(self, response):
    # inspect_response(response, self)
    # handle current page
    data = json.loads(response.body.decode("utf-8"))
    for item in self.parse_tweets_block(data['items_html']):
        yield item
    # get next page
    min_position = data['min_position']
    min_position = min_position.replace("+", "%2B")
    url = self.url % (quote(self.query), min_position)
    yield http.Request(url, callback=self.parse_page)
def start_requests(self):
    while True:
        for tweet in self.tweetCollection.find(
                {'_id': {'$gt': self.start_tweet_id}}
        ).sort('_id', pymongo.ASCENDING).limit(1000):
            url = self.url % tweet['url']
            self.start_tweet_id = tweet['_id']
            parse_page = partial(self.parse_page, tweet)
            yield http.Request(url, callback=parse_page)
def parse_page(self, response):
    # When this callback fires, parse the returned JSON first;
    # its items_html field holds the content of this page.
    data = json.loads(response.body.decode('utf-8'))
    for item in self.prase_tweets_block(data['items_html']):
        yield item
    # min_position in the JSON gives the parameter for the next page,
    # i.e. the max_position parameter of the next URL.
    min_position = data['min_position']
    url = self.url % (quote(self.query), min_position)
    logger.debug('Prepare to crawl A NEW PAGE with keywords[' + self.query +
                 '] and min_position[' + min_position + ']')
    yield http.Request(url, callback=self.parse_page)
def parse_page(self, response):
    # inspect_response(response, self)
    # handle current page
    data = json.loads(response.body.decode("utf-8"))
    for item in self.parse_tweets_block(data['items_html']):
        yield item
    print(self.crawler.stats.get_value('item_scraped_count', 0))
    # get next page
    min_position = data['min_position']
    url = self.url % (quote(self.query), min_position)
    print('-> to Twitter next page: {}'.format(url))
    yield http.Request(url, callback=self.parse_page)
def parse(self, response):
    sel = Selector(response)
    # print(response.body)
    sites = sel.xpath('//div[@class="search_right_item ml10"]/div/a')
    # print(sites.extract())
    items = []
    for site in sites:
        item = Tianyan()
        link = site.xpath('@href').extract()[0]
        name = site.xpath('span/text()').extract()[0]
        print(name)
        # 'link' must be set on the item because it is used as the request URL below
        item['link'] = link
        item['name'] = name
        items.append(item)
        yield http.Request(url=item["link"], meta={'item': item},
                           headers=self.header, cookies=self.cookie,
                           callback=self.parseDetail, dont_filter=True)
        # yield item
    print(sel.xpath('//div[@class="search_pager human_pager in-block"]/ul/li[@class="pagination-next ng-scope "]/a').extract())
    nextPage = sel.xpath('//div[@class="search_pager human_pager in-block"]/ul/li[@class="pagination-next ng-scope "]/a/@href').extract()
    if nextPage:
        next = nextPage[0]
        yield http.Request(next, callback=self.parse)