def parse(self, response: scrapy.http.response.Response):
    base_css = "[data-test=qsp-financial] tbody "
    datetimes = response.css(
        base_css + 'tr:first-child td:not(:first-child) span::text').extract()
    labels = response.css(
        base_css + 'tr:not(:first-child) td:first-child:not([colspan]) span::text'
    ).extract()
    values = response.css(
        base_css + 'tr:not(:first-child) td:not(:first-child) ::text').extract()
    datetimes = list(map(lambda x: x.replace('/', '-'), datetimes))
    symbol = response.request.url.split('=')[1]
    target_file = os.path.join(YahooFinanceSpider.target_dir, symbol + '.csv')
    current_label_index = -1
    current_datetime_index = -1
    datetimes_len = len(datetimes)
    df = pd.DataFrame(index=labels, columns=datetimes)
    pd.options.mode.chained_assignment = None
    for i in range(0, len(values)):
        current_datetime_index += 1
        if i % datetimes_len == 0:
            current_label_index += 1
            current_datetime_index = 0
        val = str(values[i]).replace('-', '')
        val = str(val).replace(',', '')
        if str(val) != '':
            val = int(float(val) * 1000)  # TODO check if all numbers are in thousands
        df.loc[labels[current_label_index]][
            datetimes[current_datetime_index]] = val
    mode = 'w'
    header = True
    if os.path.isfile(target_file):
        mode = 'a'
        header = False
    if df.shape[0] != 0 and df.shape[1] != 0:
        with open(target_file, mode) as f:
            df.to_csv(f, header=header)
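# A minimal sketch (not part of the original spider) of the single-step .loc
# assignment that avoids the chained-assignment warning silenced above; the row
# labels and column headers below are hypothetical placeholders.
import pandas as pd

labels = ['Total Revenue', 'Gross Profit']    # hypothetical row labels
datetimes = ['12-31-2020', '12-31-2019']      # hypothetical column headers
df = pd.DataFrame(index=labels, columns=datetimes)
df.loc['Total Revenue', '12-31-2020'] = 1000  # one .loc call sets the cell directly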
def parse(self, response: scrapy.http.response.Response):
    print(response.url)
    print('gathering links', response.url)
    self.company_links[response.url] = response.css(
        '.qTableFull tr td:first-child a::attr(href)').extract()
    # Continue only when all company_links are gathered
    can_continue = True
    for start_url in self.start_urls:
        if start_url not in self.company_links:
            print('Not all company links yet gathered', response.url)
            can_continue = False
            break
    if can_continue:
        print('All links gathered. Proceeding.')
        company_links = []
        # Organize links in correct order (same as start_urls)
        for start_url in self.start_urls:
            company_links += self.company_links[start_url]
        links_len = len(company_links)
        for i, link in enumerate(company_links):
            # print(self.url_base + link + self.suffix)
            yield scrapy.Request(self.url_base + link + self.suffix,
                                 self.parse_company_page,
                                 priority=links_len - i)
        print('Scheduled all requests. Total', links_len)
def parse(self, response: scrapy.http.response.Response):
    key_words = ['望京', '望馨花园', '望馨园', '东湖渠']
    send = SendEmail()
    history = []
    with open('history.txt') as f:
        # Strip newlines so membership checks against bare links work
        tmp = [line.strip() for line in f.readlines()]
        if len(tmp):
            history.extend(tmp)
        else:
            self.log('历史记录是空', level=logging.WARNING)
    page = response.css('td.title')
    for i in page:
        title = i.css('a::text').extract_first().strip()
        link = i.css('a::attr(href)').extract_first()
        self.log('租房标题:{0}'.format(title), level=logging.WARNING)
        self.log('租房链接:{0}'.format(link), level=logging.WARNING)
        email_message = '租房标题:{0}\n租房链接:{1}'.format(title, link)
        for j in key_words:
            if j in title and link not in history:
                # QQ mail limits how often messages can be sent, so until a
                # better approach is found, just sleep between messages
                time.sleep(10)
                send.send_email('', email_message)
                history.append(link)
                with open('history.txt', 'w') as f:
                    f.writelines(line + '\n' for line in history)
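# The SendEmail helper used above is not shown in this snippet; a hypothetical
# sketch of what it might look like with smtplib and QQ mail's SMTP server (the
# host, port, credentials, and send_email signature are assumptions, not the
# original implementation).
import smtplib
from email.mime.text import MIMEText


class SendEmail:
    def __init__(self, user='user@qq.com', auth_code='auth-code', to='user@qq.com'):
        # QQ mail authenticates with an authorization code, not the account password
        self.user, self.auth_code, self.to = user, auth_code, to

    def send_email(self, subject, body):
        msg = MIMEText(body, 'plain', 'utf-8')
        msg['Subject'] = subject or '租房提醒'
        msg['From'] = self.user
        msg['To'] = self.to
        with smtplib.SMTP_SSL('smtp.qq.com', 465) as server:
            server.login(self.user, self.auth_code)
            server.sendmail(self.user, [self.to], msg.as_string())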
def parse_list(self, response: scrapy.http.response.Response):
    art_links_selector = response.xpath(
        '//*[@id="J_main-container"]//h2[@class="post-title"]/a')
    for art_link_selector in art_links_selector:
        link = art_link_selector.xpath('@href')
        title = art_link_selector.xpath('text()')
    # The "load more" button on the home page's second page
    second_page = response.xpath(
        '//*[@id="J_main-container"]'
        '//a[contains(@class, "home-browser-more-btn")]/@href').get()
    if second_page:
        yield response.follow(second_page, callback=self.parse_list)
    next_page = response.xpath(
        '//*[@id="J_main-container"]//ul[@class="pagination"]'
        '/li[not(contains(@class, "disabled"))]/a[@aria-label="Next"]/@href').get()
    if next_page:
        yield response.follow(next_page, callback=self.parse_list)
def parse_ticker_page(self, response: scrapy.http.response.Response):
    self.parse_price_page(response)
    next_page_href = response.css('.pages_right::attr(href)').extract()
    if len(next_page_href) > 0:
        time.sleep(1)
        return scrapy.Request(self.url_base + next_page_href[0],
                              self.parse_ticker_page)
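# time.sleep() in a callback blocks Scrapy's event loop; a minimal sketch of the
# built-in alternative, assuming throttling is the only goal (the spider name and
# class below are placeholders, not from the original project).
import scrapy


class TickerSpider(scrapy.Spider):
    name = 'ticker_pages'
    # Wait about one second between requests without blocking other callbacks
    custom_settings = {'DOWNLOAD_DELAY': 1}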
def GetMovieCountry(response: scrapy.http.response.Response):
    """
    Parse the production country.
    :param response: the response returned by scrapy
    :return: production country
    """
    return response.xpath(
        "//div[@id='info']/span[text()='制片国家/地区:'][1]/following-sibling::text()[1]"
    ).extract_first(default="").strip()
def GetRateInfo(response: scrapy.http.response.Response):
    """
    Parse the rating information.
    :param response: the response returned by scrapy
    :return: number of raters, rating breakdown by star level
    """
    rateNumber = int(
        response.xpath("//a[@class='rating_people']/span/text()").
        extract_first(default=0))
    rateDetails_dict = dict()
    for start_num in range(1, 6):
        rate = response.xpath(
            "//span[@class='stars{} starstop']/../span[@class='rating_per']/text()"
            .format(start_num)).extract_first(default="0")
        rateDetails_dict.update(
            {start_num: float(rate.strip('%')) / 100.0})
    return rateNumber, rateDetails_dict
def parse_subpage(self, response: scrapy.http.response.Response):
    links = response.css("#main-content a::attr(href)").extract()
    # Only follow report pages whose filename looks like R<number>.htm
    pattern = re.compile(r"^R\d+\.htm$")
    for link in links:
        filename = link.rsplit('/', 1)[-1]
        if pattern.match(filename):
            yield scrapy.Request(self.url_base + link, self.get_data)
def parse(self, response: scrapy.http.response.Response):
    next_page = response.xpath(
        '//div[@class="navigation-wrapper"]/div/a[@class="next"]/@href'
    ).get()
    if next_page:
        print(next_page)
        self.count += 1
        if self.count < 20:
            yield response.follow(next_page, callback=self.parse)
    desc = response.xpath('//meta[@name="description"]/@content').get()
    tags = response.xpath('//span[@class="tag-links"]/a/text()').getall()
    res = self.extractor.extract(response.text)
    yield MeituanArticleSpiderItem(url=response.url,
                                   title=res['title'],
                                   content=res['content'],
                                   tags=tags,
                                   author=res['author'],
                                   publish_time=res['publish_time'])
def _parse_role(response: scrapy.http.response.Response) -> Dict[str, str]:
    """
    Extract Sphinx role from a crawled page.

    Valid roles:
    - function
    - class
    - module
    - package

    Args:
        response: Crawled API documentation page.

    Returns:
        Dict with the object's name, URL, and role.
    """
    url = response.url
    name_query = "//h1/text()"
    name = response.xpath(name_query).get()
    if url in (
            "https://www.tensorflow.org/api_docs/python/tf",
            "https://www.tensorflow.org/probability/api_docs/python/tfp",
    ):
        return {"name": name, "url": url, "role": "package"}
    section_query = "//h2/text()"
    sections = response.xpath(section_query).getall()
    if "Module" in name.split(": "):
        role = "module"
        name = name.split(": ")[-1]
    elif "Attributes" in sections or "Methods" in sections:
        role = "class"
    else:
        # If the object is not a Module or a Class then it is a function.
        role = "function"
    return {"name": name, "url": url, "role": role}
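# A usage sketch (not from the original code) exercising _parse_role against a
# synthetic page built with scrapy.http.HtmlResponse; the HTML and URL below are
# made-up stand-ins for a TensorFlow API docs page, and _parse_role is assumed to
# be callable as a plain function or staticmethod.
from scrapy.http import HtmlResponse

fake_response = HtmlResponse(
    url="https://www.tensorflow.org/api_docs/python/tf/data",
    body=b"<html><body><h1>Module: tf.data</h1><h2>Modules</h2></body></html>",
    encoding="utf-8",
)
print(_parse_role(fake_response))
# {'name': 'tf.data', 'url': 'https://www.tensorflow.org/api_docs/python/tf/data', 'role': 'module'}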
def parse(self, response: scrapy.http.response.Response) -> scrapy.Request:
    """
    Main scrapy parser

    :param response: scrapy response object
    :return: new scrapy request
    """
    for url in response.xpath(
            '//ul[@class="fl_titlelist"]/li/div[@class="fl_name"]/a/@href'):
        url_val = url.extract()
        if url_val and url_val.strip('/') in self.already_harvested:
            continue
        else:
            yield scrapy.Request(
                url=url_val,
                callback=self.parse_item,
                cb_kwargs={'on_netflix': '/netflix/' in response.url})
    next_url = response.xpath(
        '//li[@class="page-item"]/a[text() = "Next"]/@href').extract_first()
    if next_url:
        yield scrapy.Request(url=next_url, callback=self.parse)
def GetActorsInfo(response: scrapy.http.response.Response):
    """
    Parse the cast information.
    :param response: the response returned by scrapy
    :return: dict mapping actor name to profile link
    """
    try:
        actor_info_list = response.xpath("//span[@class='actor']//a")
        return {
            actor_info.xpath("text()").extract_first():
            actor_info.xpath("@href").extract_first()
            for actor_info in actor_info_list
        }
    except Exception:
        return dict()
def _parse_role(response: scrapy.http.response.Response) -> Dict[str, str]:
    """
    Extract Sphinx role from a crawled page.

    Valid roles:
    - function
    - class
    - module
    - package

    Args:
        response: Crawled API documentation page.

    Returns:
        Dict with the object's name, URL, and role.
    """
    url = response.url
    name_query = "//h1/text()"
    name = response.xpath(name_query).get()
    if url == "https://www.tensorflow.org/api_docs/python/tf":
        # The tf root page corresponds to the package itself.
        return {"name": name, "url": url, "role": "package"}
    class_selector = response.xpath("//h2/text()").get()
    if "Module" in name.split(": "):
        role = "module"
        name = name.split(": ")[-1]
    elif class_selector == "Class ":
        role = "class"
    else:
        # If the object is not a Module or a Class then it is a function.
        role = "function"
    return {"name": name, "url": url, "role": role}
def GetDirectorOrAuthorInfo(key, response: scrapy.http.response.Response):
    """
    Parse director or screenwriter information.
    :param key: '导演' (director) or '编剧' (screenwriter)
    :param response: the response returned by scrapy
    :return: dict mapping name to profile link
    """
    try:
        info_list = response.xpath(
            "//div[@id='info']//span[text()='{key}']/following-sibling::span[1]/a"
            .format(key=key))
        return {
            info.xpath("text()").extract_first():
            info.xpath("@href").extract_first()
            for info in info_list
        }
    except Exception:
        return dict()
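# A usage sketch (not part of the original project) showing how the Douban helpers
# above might be combined inside a spider callback; the item keys are illustrative.
def parse_movie(self, response: scrapy.http.response.Response):
    rate_number, rate_details = GetRateInfo(response)
    yield {
        'country': GetMovieCountry(response),
        'directors': GetDirectorOrAuthorInfo('导演', response),
        'screenwriters': GetDirectorOrAuthorInfo('编剧', response),
        'actors': GetActorsInfo(response),
        'rating_people': rate_number,
        'rating_details': rate_details,
    }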
def parse(self, response: scrapy.http.response.Response, **kwargs):
    data_list = response.xpath('//*[@id="main-container"]/div[2]/ol/li')
    for data in data_list:
        item = CcspiderItem()
        item['title'] = data.xpath('.//p[1]/text()')[2].get().strip()
        item['authors'] = data.xpath('.//p[2]/a/text()').extract()
        date = utils.merge_text(data.xpath('.//p[4]/text()[last()]').get())
        date = date.split(' ')
        item['month'] = utils.month_to_int(date[0])
        item['year'] = int(date[1][:4])
        item['subjects'] = utils.deduplicate(
            data.xpath('.//div/div/span/@data-tooltip').extract())
        item['abstract'] = utils.merge_text(
            data.xpath('.//p[3]/span[3]/text()').get())
        item['citation'] = 0
        yield item
    print('已爬完{}页(共{}条)'.format(self.page, self.page * self.size))
    self.page += 1
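# The utils helpers referenced above are not included in this snippet; hypothetical
# sketches of what they plausibly do (these are assumptions, not the project's
# actual utils module).
import calendar


def merge_text(text):
    # Collapse whitespace and newlines left over from the HTML into single spaces
    return ' '.join(text.split()) if text else ''


def month_to_int(month_name):
    # e.g. 'January' -> 1; abbreviations like 'Jan' would need extra handling
    return list(calendar.month_name).index(month_name)


def deduplicate(items):
    # Drop duplicates while keeping the original order
    return list(dict.fromkeys(items))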
def get_data(self, response: scrapy.http.response.Response):
    document_type = response.css('th.tl strong').extract_first()
    period_label = response.css('th.th::text').extract_first()
    dt = response.css('th.th div::text').extract_first()
    if period_label is None or document_type is None or dt is None:
        # print(period_label)
        # print(document_type)
        # print(dt)
        return
    document_type = document_type.lower()
    period_label = period_label.lower()
    period_labels = ['12 months ended']
    document_types = {
        'income_statement': 'consolidated statements of income',
        'balance_sheet': 'consolidated balance sheets',
        'cash_flow': 'consolidated statements of cash flows'
    }
    is_period_important = False
    is_document_important = False
    for p_label in period_labels:
        if p_label in period_label:
            is_period_important = True
            break
    for slug, d_type in document_types.items():
        if d_type in document_type:
            is_document_important = True
            break
    if is_period_important and is_document_important:
        if "thousand" in document_type:
            multiplier = 1000
        elif "million" in document_type:
            multiplier = 1000000
        elif "billion" in document_type:
            multiplier = 1000000000
        else:
            raise RuntimeError('No multiplier defined in ' + response.url +
                               '. Document heading: ' + document_type)
        year = dt[-4:]
        cik = response.url.rsplit('/')[-3]
        fin_dict = {'cik': cik}
        records = response.css('tr')
        for record in records:
            record_title = record.css('td.pl a::text').extract_first()
            if record_title:
                record_title = record_title.replace(',', '')
                value = record.css('td.nump::text').extract_first()
                # print(record_title, value)
                if value:
                    digit_val = re.findall(r'[\d+,]+', value)[0]
                    # print('digit_val', digit_val)
                    if digit_val:
                        # Commas are thousands separators, so strip them before converting
                        digit_val = float(digit_val.replace(',', '')) * multiplier
                        fin_dict[record_title] = str(digit_val)
        file_path = os.path.join(self.output_dir, year + '.csv')
        mode = 'w'
        if os.path.isfile(file_path):
            mode = 'a'
        with open(file_path, mode) as f:
            print('Saving output to ' + file_path)
            # FIXME sort before saving
            w = csv.DictWriter(f, fin_dict.keys())
            # if mode == 'w':
            w.writeheader()
            w.writerow(fin_dict)
    else:
        pass
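# A standalone sketch of the number-cleaning step used above, so the comma handling
# can be checked in isolation (the values and multipliers below are made-up examples).
import re


def parse_reported_value(raw, multiplier):
    # Pull the leading digits-and-commas group and drop the thousands separators
    digits = re.findall(r'[\d,]+', raw)[0]
    return float(digits.replace(',', '')) * multiplier


print(parse_reported_value('1,234', 1000))        # 1234000.0
print(parse_reported_value('$ 12,345,678', 1))    # 12345678.0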
def parse(self, response: scrapy.http.response.Response):
    links = response.css("#main-content a::attr(href)").extract()
    for link in links:
        yield scrapy.Request(self.url_base + link, self.parse_subpage)
def parse_item(self, response: scrapy.http.response.Response,
               on_netflix) -> CritickerMoviesItem:
    """
    Extract data from given item url

    :param response: scrapy response object
    :param on_netflix: whether the item was reached via a /netflix/ listing
    :return: Criticker Movies item object
    """
    movie_data = CritickerMoviesItem()
    movie_data['on_netflix'] = int(on_netflix)
    movie_data['url'] = response.url.strip('/')
    movie_data['uid'] = self.extract_uid_from_url(movie_data['url'])
    movie_data['type'] = response.xpath(
        '//*[@id="fi_info_type"]/text()').extract_first()
    movie_data['name'] = response.xpath(
        '//h1/span[@itemprop="name"]/text()').extract_first()
    movie_data['date_published'] = response.xpath(
        '//h1/span[@itemprop="datePublished"]/text()').extract_first()
    movie_data['start_date'] = response.xpath(
        '//h1/span[@itemprop="startDate"]/text()').extract_first()
    movie_data['end_date'] = response.xpath(
        '//h1/span[@itemprop="endDate"]/text()').extract_first()
    movie_data['poster_url'] = response.xpath(
        '//div[@id="poster"]/img/@src').extract_first()
    movie_data['description'] = ' '.join([
        _.extract().strip()
        for _ in response.xpath('//span[@itemprop="description"]//text()')
    ]).strip()
    if not movie_data['description']:
        movie_data['description'] = None
    more_info_elem = response.xpath('//div[@id="fi_moreinfo"]')
    h = more_info_elem.xpath('./p')
    for i, hi in enumerate(h):
        hi_ = hi.attrib['id']
        label = self.extract_label_from_id(hi_)
        if 'aka' in label:
            movie_data[label] = response.xpath(
                '//p[@id="{}"]/text()'.format(hi_)).extract_first()
        else:
            movie_data[label] = self.extract_more_info(hi)
    movie_data['trailer_url'] = response.xpath(
        '//div[@id="fi_trailer"]/iframe/@src').extract_first()
    if movie_data['trailer_url'] == 'http://www.youtube.com/watch?v=':
        movie_data['trailer_url'] = None
    movie_data['rss_feed_url'] = response.xpath(
        '//*[@id="fi_titlerss"]/a/@href').extract_first()
    movie_data['avg_percentile'] = response.xpath(
        '//span[@itemprop="ratingValue"]/text()').extract_first()
    movie_data['n_ratings'] = response.xpath(
        '//span[@itemprop="reviewCount"]/text()').extract_first()
    return movie_data
def parse(self, response: scrapy.http.response.Response):
    links = response.css(
        '.list a[href^="/files/dera/data/financial-statement-data-sets/"]::attr(href)'
    ).extract()
    for link in links:
        yield scrapy.Request(self.url_base + link, self.get_data)