def start_requests(self):
    # The cookiejar meta key enables automatic cookie management;
    # filepath records the target directory and is passed along to the response.
    return [Request(self.target_url + str(self.page['/']),
                    meta={'cookiejar': 1, 'filepath': '/'})]

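# A minimal companion sketch (assumption, not from the source): a callback that
# reads filepath from response.meta and reuses the same cookie jar for
# follow-up requests. The 'sub/' path is purely illustrative.
def parse(self, response):
    filepath = response.meta['filepath']
    next_path = filepath + 'sub/'  # hypothetical subdirectory
    yield Request(response.urljoin(next_path),
                  meta={'cookiejar': response.meta['cookiejar'],
                        'filepath': next_path})
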
def parse_sub_categories(self, response):
    sub_categories = response.xpath("//*[contains(@href, 'dir/index?')]/@href").extract()
    for link in sub_categories:
        url = "https://br.answers.yahoo.com%s" % link
        yield Request(url, callback=self.parse_question)

def start_requests(self):
    for i in range(1, 10):  # xrange is Python 2 only
        yield Request('http://www.kuaidaili.com/free/inha/%s/' % i, callback=self.parse)

def start_requests(self):
    print("start request.", self.parse_idx)
    yield Request(
        self.start_urls[0],
        callback=self.parse_fun[self.parse_idx]
    )

def start_requests(self):
    base_url = 'http://guba.eastmoney.com/list,%s.html'
    with open('ticker_list.txt') as f:
        for ticker_id in f:
            url = base_url % ticker_id.strip()
            yield Request(url, self.parse_item)

url=SETTINGS["CRAWLERA_FETCH_URL"],
status=200,
headers={
    "Content-Type": "application/json",
    "Content-Encoding": "gzip",
    "Transfer-Encoding": "chunked",
    "Date": "Fri, 24 Apr 2020 18:06:42 GMT",
    "Proxy-Connection": "close",
    "Connection": "close",
},
request=Request(
    url=SETTINGS["CRAWLERA_FETCH_URL"],
    meta={
        "crawlera_fetch": {
            "timing": {"start_ts": mocked_time()},
            "original_request": request_to_dict(
                Request("https://fake.host.com"),
                spider=dummy_spider,
            ),
        }
    },
),
body=b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""",  # noqa: E501
),
"expected": TextResponse(
    url="https://fake.host.com",
    status=123,
    headers={"Fake-Header": "true"},
    body=b"""foobar""",  # noqa: E501
),
}
)

def start_requests(self):
    for url in self.start_urls:
        yield Request(url, cookies=self.cookie_dict)

def get_media_requests(self, item, info):
    for url in item['file_urls']:
        yield Request(url, meta={'item': item})

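# Companion sketch (not from the source): with the item carried in meta as
# above, FilesPipeline's item_completed() hook can copy the stored paths back
# onto it. The 'file_paths' field name is an assumption for illustration.
def item_completed(self, results, item, info):
    # results is a list of (success, file_info) tuples; keep successful paths.
    item['file_paths'] = [res['path'] for ok, res in results if ok]
    return item
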
def start_requests(self):
    yield Request(url='https://movie.douban.com/chart', callback=self.parse_rank)

def start_requests(self):
    # The cookiejar meta key enables automatic cookie management; multiple
    # jars can be managed at once, distinguished by the cookiejar value.
    return [Request('http://temp.wuxingxiangsheng.com/test/request', meta={'cookiejar': 1})]

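# A minimal sketch (assumed, not from the source) of managing several cookie
# sessions at once: each distinct 'cookiejar' value gets its own jar, and
# follow-ups must pass the same value to stay in that session. The '/next'
# path and parse_page name are illustrative.
def start_requests(self):
    for i, url in enumerate(self.start_urls):
        yield Request(url, meta={'cookiejar': i}, callback=self.parse_page)

def parse_page(self, response):
    # Reuse the jar id from the response to stay logged in to that session.
    yield Request(response.urljoin('/next'),
                  meta={'cookiejar': response.meta['cookiejar']})
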
def start_requests(self):
    # Kept for reference:
    # meta = {"search_url": "search_url"}
    # book:
    # yield Request("http://www.jstor.org/stable/10.7249/mg358cf.10?seq=1#page_scan_tab_contents", self.parse_issue, meta=meta, dont_filter=True)
    # journal:
    # yield Request("http://www.jstor.org/stable/40279148?Search=yes&resultItemClick=true&searchText=agriculture&searchText=OR&searchText=agricultural&searchText=OR&searchText=rural&searchUri=%2Faction%2FdoBasicSearch%3Fgroup%3Dnone%26amp%3Bsd%3D2009%252F03%26amp%3BsearchType%3DfacetSearch%26amp%3BQuery%3Dagriculture%2BOR%2Bagricultural%2BOR%2Brural%26amp%3Bpage%3D6%26amp%3Bfc%3Doff%26amp%3Bed%3D2009%252F04%26amp%3Bacc%3Don%26amp%3Bwc%3Don&seq=1#page_scan_tab_contents", self.parse_issue, meta=meta, dont_filter=True)
    if self.url_file:  # a URL file restricts the crawl to the listed URLs
        with open(self.url_file) as f:
            for line in f:
                json_data = json.loads(line)
                if "url" in json_data:
                    url = json_data["url"]
                    if url in self.crawled_url:
                        print("filter url: %s" % url)
                    else:
                        meta = {"origin_url": json_data["url"]}
                        # Debug shortcut (single fixed URL, then stop):
                        # url = "https://www.jstor.org/stable/25097205?seq=1#page_scan_tab_contents"
                        # yield Request(url, self.parse_issue, meta=meta, dont_filter=True)
                        # return
                        yield Request(json_data["url"], self.parse_issue,
                                      meta=meta, dont_filter=True)
        return

    final_page = 250  # as of 11.25 the total result count is 25028
    start_year = 2005
    end_year = 2017
    start_month = 1
    end_month = 12
    current_year = start_year
    while current_year <= end_year:
        current_month = start_month
        while current_month < end_month:
            start_date = str(current_year) + "%2F" + "%02d" % current_month
            end_date = str(current_year) + "%2F" + "%02d" % (current_month + 1)
            start_url = ("http://www.jstor.org/action/doBasicSearch?searchType=facetSearch"
                         "&page=1&sd=%s&ed=%s&wc=on&acc=on&fc=off"
                         "&Query=agriculture+OR+agricultural+OR+rural&group=none"
                         % (start_date, end_date))
            if start_url in self.crawled_url:
                print("start_url crawled, filter: %s" % start_url)
            else:
                meta = {
                    "page": 1,
                    "current_year": current_year,
                    "current_month": current_month,
                }
                yield Request(start_url, self.parse_result_of_date,
                              meta=meta, dont_filter=True)
            current_month += 1
        current_year += 1

def start_requests(self):
    for url in self.start_urls:
        yield Request(url, cookies={'xxx.com': 'true'}, callback=self.parse, dont_filter=True)

def start_requests(self): yield Request("https://kyfw.12306.cn/otn/userCommon/allProvince", callback=self.parse, meta={"turn": self.turn})
def start_requests(self):
    return [
        Request(
            "http://api.rtvslo.si/ava/getShows?client_id=82013fb3a531d5414f478747c1aca622",
            callback=self.parse_oddaje)
    ]

def list_parse(self, response):
    urls = response.xpath('//div[@class="ContentDesc"]/a/@href').extract()
    for url in urls:
        yield Request(url, meta=response.meta, callback=self.detail_parse)

def next_request(self):
    self.logger.info("length of queue %s is %s" % (
        self.queue_name, self.redis_conn.zcard(self.queue_name)))
    item = None
    # Throttle: do not pop another request until the interval has elapsed.
    if time.time() - self.request_interval < self.last_acs_time:
        return item
    if self.settings.getbool("CUSTOM_REDIS"):
        item = self.redis_conn.zpop(self.queue_name)
    else:
        # Emulate an atomic ZPOP with a MULTI/EXEC pipeline.
        pipe = self.redis_conn.pipeline()
        pipe.multi()
        pipe.zrange(self.queue_name, 0, 0).zremrangebyrank(self.queue_name, 0, 0)
        result, count = pipe.execute()
        if result:
            item = result[0]
    if item:
        self.last_acs_time = time.time()
        item = pickle.loads(item)
        self.present_item = item
        headers = item.get("headers", {})
        body = item.get("body")
        if item.get("method"):
            method = item.get("method")
        else:
            method = "GET"
        try:
            req = Request(item['url'], method=method, body=body, headers=headers)
        except ValueError:
            # The stored URL lacks a scheme; retry with an explicit one.
            req = Request('http://' + item['url'], method=method, body=body, headers=headers)
        # Callback/errback are stored as method names and resolved on the spider.
        if 'callback' in item:
            cb = item['callback']
            if cb and self.spider:
                req.callback = getattr(self.spider, cb)
        if 'errback' in item:
            eb = item['errback']
            if eb and self.spider:
                req.errback = getattr(self.spider, eb)
        if 'meta' in item:
            item = item['meta']
        # Defaults not present in the schema.
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if "retry_times" not in item:
            item['retry_times'] = 0
        for key in item.keys():
            req.meta[key] = item[key]
        if 'useragent' in item and item['useragent'] is not None:
            req.headers['User-Agent'] = item['useragent']
        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            elif isinstance(item['cookie'], (str, bytes)):
                req.cookies = parse_cookie(item['cookie'])
        return req

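# Hedged sketch of the parse_cookie() helper referenced above (assumed, not
# shown in the source): splits a raw "k1=v1; k2=v2" Cookie header string into
# the dict form that Request.cookies expects.
def parse_cookie(string):
    results = {}
    for pair in string.split(';'):
        if '=' in pair:
            key, value = pair.split('=', 1)
            results[key.strip()] = value.strip()
    return results
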
def parse(self, response):
    urls = response.xpath('//div[@class="tcat"]//a/@href').extract()
    for url in urls:
        if "10032191" not in url:
            continue
        yield Request(url, callback=self.get_all_list)

def gen_request(url, callback, item=None):
    r = Request(url, callback=callback)
    if item:
        r.meta['item'] = item
    return r

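# Illustrative usage of gen_request() (method and selector names are
# assumptions, not from the source): attach a partially filled item so the
# detail callback can finish populating it from request.meta.
def parse_listing(self, response):
    item = {'title': response.css('h1::text').get()}
    yield gen_request(response.urljoin('detail.html'),
                      callback=self.parse_detail, item=item)
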
def start_requests(self):
    # Truncate any previous results before the crawl starts.
    open('./ips.json', 'w').close()
    request = Request(url="https://free-proxy-list.net/",
                      callback=self.parse,
                      headers=self.free_list_header)
    yield request

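# Hedged sketch of a matching parse() (selectors and the ips.json layout are
# assumptions, not from the source): scrape the proxy table and append the
# entries to the file truncated above.
import json

def parse(self, response):
    proxies = []
    for row in response.xpath('//table//tbody/tr'):
        cells = row.xpath('./td/text()').getall()
        if len(cells) >= 2:
            proxies.append({'ip': cells[0], 'port': cells[1]})
    with open('./ips.json', 'a') as f:
        json.dump(proxies, f)
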
def parse(self, response):
    # Strip the JSONP wrapper (the 'var tumblr_api_read = ' prefix and the
    # trailing ';\n') to get at the JSON payload.
    data = response.text[22:-2]
    data = json.loads(data)
    posts = data['posts']
    for post in posts:
        if post['type'] == 'video':
            video_player = post['video-player']
            try:
                video_id = re.findall(r'/(tumblr_[^_]*)_[^\.]*?\.jpg', video_player)[0]
                video_url = 'https://vtt.tumblr.com/{}_480.mp4'.format(video_id)
                video_name = video_url.split('/')[-1]
                video_path = post['type'] + '/' + video_name
                item = TumblrspiderItem()
                item['file_url'] = video_url
                item['file_path'] = video_path
                item['file_type'] = post['type']
                yield item
            except IndexError:
                print(video_player)
        elif post['type'] == 'photo':
            photo_url = post['photo-url-1280']
            photo_name = photo_url.split('/')[-1]
            photo_path = post['type'] + '/' + photo_name
            item = TumblrspiderItem()
            item['file_url'] = photo_url
            item['file_path'] = photo_path
            item['file_type'] = post['type']
            yield item
        else:
            print(post['type'])
        # Follow the reblog chain to discover more users, up to max_depth.
        try:
            reblogged_url = post['reblogged-from-url']
        except KeyError:
            continue
        try:
            user_name = re.findall(r'://([^\.]*)\.tumblr\.com', reblogged_url)[0]
        except IndexError:
            continue
        print(user_name)
        url = 'https://{}.tumblr.com/api/read/json?start=0&num=200'.format(user_name)
        depth = response.meta['depth'] + 1
        if depth <= self.max_depth:
            yield Request(url, headers=self.get_headers(), callback=self.parse,
                          meta={'depth': depth})

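# A sturdier alternative to the fixed [22:-2] slice above (a sketch, assuming
# the standard 'var tumblr_api_read = {...};' JSONP wrapper): locate the JSON
# object with a regex instead of hard-coded offsets.
import json
import re

def strip_jsonp(text):
    match = re.search(r'\{.*\}', text, re.DOTALL)
    return json.loads(match.group(0)) if match else None
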
def get_media_requests(self, item, info):
    if isinstance(item, PDFItem):
        yield Request(url=item['file_urls'],
                      headers=DEFAULT_REQUEST_HEADERS,
                      meta={'file_names': item['file_names']})

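# Companion sketch (an assumption, not from the source): a file_path()
# override that honors the 'file_names' meta carried above so downloads keep
# their intended names. The signature matches the FilesPipeline hook in
# recent Scrapy versions.
def file_path(self, request, response=None, info=None, *, item=None):
    return request.meta['file_names']
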
def start_requests(self):
    for url in self.start_urls:
        yield Request(url, dont_filter=True, headers={'Host': 'jandan.net'},
                      callback=self.parse)

status=200,
headers={
    "Content-Type": "application/json",
    "Content-Encoding": "gzip",
    "Transfer-Encoding": "chunked",
    "Date": "Fri, 24 Apr 2020 18:06:42 GMT",
    "Proxy-Connection": "close",
    "Connection": "close",
},
request=Request(
    SETTINGS["CRAWLERA_FETCH_URL"],
    meta={
        "crawlera_fetch": {
            "timing": {"start_ts": mocked_time()},
            "original_request": {"url": "https://fake.host.com"},
        }
    },
),
body=b"""{"url":"https://fake.host.com","original_status":123,"headers":{"fake-header":"true"},"body":"foobar"}""",  # noqa: E501
),
"expected": TextResponse(
    url="https://fake.host.com",
    status=123,
    headers={"Fake-Header": "true"},
    body=b"""foobar""",  # noqa: E501

def parse(self, response):
    categories = response.xpath('//ul[@class="product-categories"]/li/a')
    for cat in categories:
        cat_url = cat.xpath('./@href').get()
        parent_drug = cat.xpath('./text()').get()
        yield Request(cat_url, callback=self.list_parse,
                      meta={"parent_drug": parent_drug})

def start_requests(self):
    dc = datetime.datetime.now()
    dc = dc.replace(minute=0, second=0, microsecond=0)
    self.start = dc
    dn = dc + datetime.timedelta(hours=1)
    self.d = defaultdict(int)
    c = 0
    q = self.api.getDataResources(self.snapshot, format=self.format, portalid=self.portalID)
    log.info("Querying for uris", start=dc, end=dn, query=str(q))
    schedules = [s for s in q]
    log.info("Received seed uris", count=len(schedules))
    # schedules = [Schedule(uri='http://umbrich.org/', experiment='test'),
    #              Schedule(uri='http://polleres.net/', experiment='test'),
    #              Schedule(uri='http://notavailable/', experiment='test')]
    for s in schedules:
        domain = ''
        try:
            parsed_uri = urlparse(s.uri)
            domain = '{uri.netloc}'.format(uri=parsed_uri)
        except Exception:
            domain = 'error'
        self.d[domain] += 1

        # Set a hard link to the git location.
        filename = None
        if self.git_location:
            d = self.api.getDatasetData(md5=s.md5)
            # Try to get the dataset name.
            if 'name' in d.raw:
                dir_name = d.raw['name']
            else:
                dir_name = s.id
            # Try to get the resource name.
            filename = None
            for r in d.raw.get('resources', []):
                if r.get('url', '') == s.uri:
                    filename = r.get('name')
                    if not filename:
                        filename = r.get('id')
                    break
            if not filename:
                filename = s.uri.split('/')[-1]
            if len(filename) < 4:
                filename = s.uri[:-150]
            filename = utils.helper_functions.format_filename(filename)
            res_dir = os.path.join(self.git_location, self.portalID, dir_name, 'resources')
            if not os.path.exists(res_dir):
                os.mkdir(res_dir)
            filename = os.path.join(res_dir, filename)
        yield Request(s.uri, dont_filter=True, meta={
            'handle_httpstatus_all': True,
            'domain': domain,
            'referrer': None,
            'snapshot': self.snapshot,
            'git': filename,
            'orig_url': s.uri
        })
        self.crawler.stats.inc_value('seeds')
        c += 1
    self.crawler.stats.set_value('seedPLDs', len(self.d))
    self.crawler.stats.set_value('domains', dict(self.d))
    log.info("InitScheduled", uris=c)

def parse(self, response):
    ref_urls = response.xpath('//ol/li/div//a/@href').extract()
    for ref_url in ref_urls:
        url = self.base_url + ref_url
        yield Request(url, callback=self.list_parse)

def parse_question(self, response):
    questions_links = response.xpath("//*[contains(@href, 'question/index?qid')]/@href").extract()
    for link in questions_links:
        url = "https://br.answers.yahoo.com%s" % link
        yield Request(url, callback=self.extract_question)

def parse(self, response):
    tmp = "http://www.hi-chemical.com/?s={}&post_type=product"
    parents = response.xpath('//table[contains(@id,"table")]//tr[position()>1]/td/input/@value').extract()
    for parent in parents:
        yield Request(tmp.format(parent), meta={"parent": parent}, callback=self.list_parse)

def start_requests(self):
    yield Request(self.myurl, self.parse)

def start_requests(self): yield Request("http://www.12306.cn/mormhweb/kyyyz/", callback = self.parse, meta = {"turn":self.turn})