def parse(self, response):
    """Yield one ``GermanyItem`` per listing row, then request the remaining pages.

    The highest page number is read from the pager and stored on
    ``self.MaxPage``. Every table row becomes its own item carrying ``Name``,
    ``wkn``, ``isin`` and an empty ``doc_source_url``. Afterwards a request
    is yielded for each page between ``self.page`` and ``self.MaxPage``,
    with this method as the callback.

    Args:
        response: Scrapy response for one page of the listing.

    Yields:
        GermanyItem: one item per table row.
        scrapy.Request: one request per page that has not been requested yet.
    """
    page_labels = response.xpath(
        '//ul[@class="searchlist-submen"]/li[contains(@id,"page")]/a/text()'
    ).extract()
    if page_labels:
        self.MaxPage = page_labels[-1]
    else:
        # No pager found: indexing [-1] on the empty list used to raise
        # IndexError and drop every row of this page. Treat the current page
        # as the last one so the rows are still scraped.
        self.MaxPage = str(self.page)
    print(self.MaxPage)

    rows = response.xpath(
        '//div[@class="table-responsive"]/table[@class="table"]/tbody/tr')
    for row in rows:
        # A fresh item per row. Re-yielding a single shared instance makes
        # every emitted item alias the same object, so a later row would
        # overwrite the fields of items still being processed downstream.
        item = GermanyItem()

        names = row.xpath('./td[1]//a/strong/text()').extract()
        item["Name"] = names[0].strip() if names else "NULL"

        wkn_isin = row.xpath('./td[1]/div[2]/text()').extract()
        if wkn_isin:
            # The cell text is split on "/": first part is stored as wkn,
            # last part as isin.
            parts = wkn_isin[0].split("/")
            item["wkn"] = parts[0].strip()
            item["isin"] = parts[-1].strip()
        else:
            item["wkn"] = None
            item["isin"] = None

        item["doc_source_url"] = None
        yield item

    # self.page is shared spider state, so once it reaches MaxPage later
    # invocations of this callback schedule nothing more.
    while self.page < int(self.MaxPage):
        self.page += 1
        yield scrapy.Request(self.url1 + str(self.page), callback=self.parse)
def start_requests(self):
    """Yield one request per entry of ``self.results`` with a pre-filled item.

    Each entry is read positionally: element 0 is appended to ``self.url1``
    to form the request URL and stored as ``info_disclosure_id``; element 1
    is stored as ``company_id``. The partially filled ``GermanyItem`` travels
    to ``self.parse`` through ``meta["item"]``.

    Yields:
        scrapy.Request: one request per entry of ``self.results``.
    """
    for record in self.results:
        seed = GermanyItem()
        isin = record[0]
        # NOTE(review): currency_code is hard-coded to "AUD" although the
        # market is Frankfurt - confirm this is intended.
        prefilled_fields = (
            ("company_id", record[1]),
            ("info_disclosure_id", isin),
            ("country_code_listed", "DEU"),
            ("exchange_market_code", "Frankfurt"),
            ("currency_code", "AUD"),
            ("doc_source_url", None),
        )
        for field_name, field_value in prefilled_fields:
            seed[field_name] = field_value

        target_url = self.url1 + isin
        yield scrapy.Request(
            target_url,
            callback=self.parse,
            meta={"item": seed},
        )
def pdf_func(self, response):
    """Yield a ``GermanyItem`` for every report row newer than ``latest_mark``.

    ``latest_mark`` and ``company_id`` are read from ``response.meta``. For
    each table row the mark is taken from group 1 of ``self.pattern`` applied
    to the document URL. The walk stops at the first row whose mark is not
    greater than ``latest_mark`` (presumably the table is ordered newest
    first - confirm against the site).

    Rows without a link, without a mark in the link, without a title, or
    without a "<DD-MM-YYYY> till <DD-MM-YYYY>" period are logged and skipped
    instead of aborting the whole page.

    Args:
        response: Scrapy response for a company's report listing page.

    Yields:
        GermanyItem: one populated item per new report row.
    """
    # Local import: the file's import block is not visible from this block.
    import logging

    log = logging.getLogger(__name__)
    latest_mark = response.meta["latest_mark"]
    company_id = response.meta["company_id"]

    # Tried in order; the first list containing a keyword found in the title
    # decides the season code. Generic financial reports get an empty code.
    season_rules = (
        (self.Q1_list, "Q1"),
        (self.Q2_list, "Q2"),
        (self.Q3_list, "Q3"),
        (self.FY_list, "FY"),
        (self.Financial_report_list, ""),
    )

    def first_value(row, query):
        # First string matched by the XPath query, or None when nothing matches.
        values = row.xpath(query).extract()
        return values[0] if values else None

    def clean(text):
        # Remove line breaks and tabs, and turn "/" into "-".
        for control_char in ("\r", "\n", "\t"):
            text = text.replace(control_char, "")
        return text.replace("/", "-")

    def split_period(period):
        # "DD-MM-YYYY till DD-MM-YYYY" -> [[dd, mm, yyyy], [dd, mm, yyyy]],
        # every field stripped; None when the text does not have that shape.
        halves = period.split(" till ")
        if len(halves) < 2:
            return None
        bounds = []
        for half in halves[:2]:
            fields = [field.strip() for field in half.split("-")]
            if len(fields) < 3:
                return None
            bounds.append(fields[:3])
        return bounds

    def season_code_for(title):
        # Season code of the first matching rule, or None when no rule matches.
        for keywords, code in season_rules:
            if any(keyword in title for keyword in keywords):
                return code
        return None

    rows = response.xpath(
        '//div[@id="main-wrapper"]/div[5]//div[@class="table-responsive"]/table[@class="table"]/tbody/tr'
    )
    for row in rows:
        doc_url = first_value(row, './td/a/@href')
        mark = self.pattern.search(str(doc_url)) if doc_url is not None else None
        if mark is None:
            log.warning("Skipping report row without a usable document link: %r", doc_url)
            continue
        if int(mark.group(1)) <= int(latest_mark):
            # Not newer than what was already collected: stop walking the table.
            break

        raw_period = first_value(row, './td[1]/text()')
        raw_title = first_value(row, './td[2]/text()')
        bounds = split_period(clean(raw_period)) if raw_period is not None else None
        if bounds is None or raw_title is None:
            log.warning("Skipping report row with missing period or title: %s", doc_url)
            continue
        (start_day, start_month, start_year), (end_day, end_month, end_year) = bounds
        title = clean(raw_title)

        # One timestamp per row, shared by both timestamp fields.
        now = time.strftime('%Y-%m-%d %H:%M:%S')

        item = GermanyItem()
        item["doc_source_url"] = doc_url
        item["latest_mark"] = mark.group(1)
        item["country_code"] = "DEU"
        item["exchange_market_code"] = "Frankfurt"
        item["company_code"] = company_id
        item["financial_reporting_standard_code"] = "IFRS/German GAAP"
        item["language_written_code"] = "de-DE"
        item["doc_type"] = "pdf"
        item["is_doc_url_direct"] = 1
        item["doc_downloaded_timestamp"] = now
        # NOTE(review): currency is "AUD" for a Frankfurt listing - confirm.
        item["currency_code"] = "AUD"
        item["gmt_create"] = now
        item["user_create"] = "zx"
        item["is_downloaded"] = 1
        item["start_date"] = start_year + "-" + start_month + "-" + start_day + " 00:00:00"
        item["end_date"] = end_year + "-" + end_month + "-" + end_day + " 00:00:00"
        # Stripped like the date parts: the year is embedded in doc_local_path,
        # where stray whitespace would corrupt the path.
        item["fiscal_year"] = start_year
        item["origin_pdf_name"] = title

        season_code = season_code_for(title)
        if season_code is None:
            # Unclassified titles are still emitted, without a season code.
            item["announcement_type"] = "0"
        else:
            item["financial_statement_season_type_code"] = season_code
            item["announcement_type"] = "1"
        item["report_id"] = company_id + self.uniqueIDMaker()

        item["doc_local_path"] = (
            "/volume3/homes3/Germany/" + item["fiscal_year"] + "/"
            + item["report_id"] + ".pdf"
        )
        item["pdf_name"] = item["report_id"]
        yield item
def pdf_func(self, response):
    """Yield a ``GermanyItem`` for every report row whose title is recognised.

    ``num`` and ``company_id`` are read from ``response.meta``. A row is kept
    only when its title contains a keyword from one of the spider's keyword
    lists (Q1, Q2, Q3, FY, generic financial report); other rows are ignored.
    For each kept row ``num`` is passed through ``self.go_heavy_num`` and the
    result is appended to the report id, which is built as
    ``company_id + fiscal_year + "00" + <type digits> + "01" + num``.

    Rows without a link, without a title, or without a
    "<DD-MM-YYYY> till <DD-MM-YYYY>" period are logged and skipped instead of
    aborting the whole page.

    Args:
        response: Scrapy response for a company's report listing page.

    Yields:
        GermanyItem: one populated item per recognised report row.
    """
    # Local import: the file's import block is not visible from this block.
    import logging

    log = logging.getLogger(__name__)
    num = response.meta["num"]
    company_id = response.meta["company_id"]

    # (keywords, season code, two-digit type used inside report_id), tried in
    # order; the first list with a keyword found in the title wins.
    report_rules = (
        (self.Q1_list, "Q1", "01"),
        (self.Q2_list, "Q2", "02"),
        (self.Q3_list, "Q3", "03"),
        (self.FY_list, "FY", "06"),
        (self.Financial_report_list, None, "00"),
    )

    def first_value(row, query):
        # First string matched by the XPath query, or None when nothing matches.
        values = row.xpath(query).extract()
        return values[0] if values else None

    def clean(text):
        # Remove line breaks and tabs, and turn "/" into "-".
        for control_char in ("\r", "\n", "\t"):
            text = text.replace(control_char, "")
        return text.replace("/", "-")

    def split_period(period):
        # "DD-MM-YYYY till DD-MM-YYYY" -> [[dd, mm, yyyy], [dd, mm, yyyy]],
        # every field stripped; None when the text does not have that shape.
        halves = period.split(" till ")
        if len(halves) < 2:
            return None
        bounds = []
        for half in halves[:2]:
            fields = [field.strip() for field in half.split("-")]
            if len(fields) < 3:
                return None
            bounds.append(fields[:3])
        return bounds

    def classify(title):
        # (season code, type digits) of the first matching rule, else None.
        for keywords, season_code, type_digits in report_rules:
            if any(keyword in title for keyword in keywords):
                return season_code, type_digits
        return None

    rows = response.xpath(
        '//div[@id="main-wrapper"]/div[5]//div[@class="table-responsive"]/table[@class="table"]/tbody/tr'
    )
    for row in rows:
        raw_period = first_value(row, './td[1]/text()')
        doc_url = first_value(row, './td/a/@href')
        raw_title = first_value(row, './td[2]/text()')
        bounds = split_period(clean(raw_period)) if raw_period is not None else None
        if bounds is None or doc_url is None or raw_title is None:
            log.warning("Skipping report row with missing period, link or title: %r", doc_url)
            continue

        title = clean(raw_title)
        matched = classify(title)
        if matched is None:
            # Title matches none of the keyword lists: not a report to collect.
            continue
        season_code, type_digits = matched
        (start_day, start_month, start_year), (end_day, end_month, end_year) = bounds

        item = GermanyItem()
        item["country_code"] = "DEU"
        item["exchange_market_code"] = "Frankfurt"
        item["company_code"] = company_id
        item["financial_reporting_standard_code"] = "IFRS/German GAAP"
        item["language_written_code"] = "de-DE"
        item["doc_type"] = "pdf"
        item["is_doc_url_direct"] = 1
        item["doc_downloaded_timestamp"] = "20171223000000"
        # NOTE(review): currency is "AUD" for a Frankfurt listing - confirm.
        item["currency_code"] = "AUD"
        item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S')
        item["user_create"] = "root"
        item["is_downloaded"] = 1
        item["start_date"] = start_year + "-" + start_month + "-" + start_day + " 00:00:00"
        item["end_date"] = end_year + "-" + end_month + "-" + end_day + " 00:00:00"
        # Stripped like the date parts: the year is embedded in report_id and
        # doc_local_path, where stray whitespace would corrupt both.
        item["fiscal_year"] = start_year
        item["doc_source_url"] = doc_url
        item["origin_pdf_name"] = title

        # go_heavy_num is defined elsewhere; its result is concatenated into
        # the id and then converted back with int() for the next row.
        num = self.go_heavy_num(num)
        item["financial_statement_season_type_code"] = season_code
        item["report_id"] = (
            company_id + item["fiscal_year"] + "00" + type_digits + "01" + num
        )
        num = int(num)

        # NOTE(review): "/volum1/" may be a typo for "/volume1/" - confirm the
        # real mount point before changing it.
        item["doc_local_path"] = (
            "/volum1/homes/Germany/" + item["fiscal_year"] + "/"
            + item["report_id"] + ".pdf"
        )
        item["pdf_name"] = item["report_id"]
        yield item