Example #1
0
 def parse(self, response):
     """Yield one GermanyItem per table row, then request the remaining pages.

     The last entry of the pagination list is stored on ``self.MaxPage``.
     Every ``<tr>`` of the results table produces an item with ``Name``,
     ``wkn``, ``isin`` and ``doc_source_url`` set. Afterwards a request is
     yielded for each page after ``self.page`` up to ``self.MaxPage``,
     with this method as the callback.
     """
     page_labels = response.xpath(
         '//ul[@class="searchlist-submen"]/li[contains(@id,"page")]/a/text()'
     ).extract()
     if page_labels:
         self.MaxPage = page_labels[-1]
     elif not hasattr(self, "MaxPage"):
         # No pagination on this page and none recorded earlier: treat the
         # current page as the last one instead of failing with IndexError.
         self.MaxPage = self.page
     rows = response.xpath(
         '//div[@class="table-responsive"]/table[@class="table"]/tbody/tr')
     print(self.MaxPage)
     for row in rows:
         # Build a fresh item for every row. Re-using one instance would
         # make every yielded item alias the same object, so consumers that
         # keep references would only ever see the last row's values.
         item = GermanyItem()
         names = row.xpath('./td[1]//a/strong/text()').extract()
         item["Name"] = names[0].strip() if names else "NULL"
         wkn_isin = row.xpath('./td[1]/div[2]/text()').extract()
         if wkn_isin:
             # The cell text is split on "/": first piece is stored as the
             # WKN, last piece as the ISIN.
             pieces = str(wkn_isin[0]).split("/")
             item["wkn"] = pieces[0].strip()
             item["isin"] = pieces[-1].strip()
         else:
             item["wkn"] = None
             item["isin"] = None
         item["doc_source_url"] = None
         yield item
     # self.page is shared spider state, so only the first callback that
     # reaches this loop schedules the remaining pages.
     while self.page < int(self.MaxPage):
         self.page += 1
         yield scrapy.Request(self.url1 + str(self.page), callback=self.parse)
 def start_requests(self):
     """Yield one request per entry of ``self.results``.

     Each entry is read positionally: index 0 is used as the ISIN (appended
     to ``self.url1`` and stored as ``info_disclosure_id``) and index 1 is
     stored as ``company_id``. The pre-filled item travels with the request
     in ``meta["item"]``; ``self.parse`` is the callback.
     """
     for row in self.results:
         entry = GermanyItem()
         isin = row[0]
         # Assigned in a fixed order so the item's field order is stable.
         for field, value in (
                 ("company_id", row[1]),
                 ("info_disclosure_id", isin),
                 ("country_code_listed", "DEU"),
                 ("exchange_market_code", "Frankfurt"),
                 ("currency_code", "AUD"),
                 ("doc_source_url", None),
         ):
             entry[field] = value
         yield scrapy.Request(
             self.url1 + isin, callback=self.parse, meta={"item": entry})
Example #3
0
 def pdf_func(self, response):
     """Yield a GermanyItem for each report row newer than ``latest_mark``.

     ``response.meta`` must carry ``latest_mark`` and ``company_id``. For
     every table row the document link is read and ``self.pattern`` is
     used to pull a numeric mark out of it. Iteration stops at the first
     row whose mark is not greater than ``latest_mark``. Rows without a
     link, or whose link does not match ``self.pattern``, are skipped.

     Raises:
         IndexError: if a row's period or title cell is missing, or the
             period text does not have the ``<start> till <end>`` shape
             with three "-"/"/"-separated fields per date.
     """
     latest_mark = response.meta["latest_mark"]
     company_id = response.meta["company_id"]
     rows = response.xpath(
         '//div[@id="main-wrapper"]/div[5]//div[@class="table-responsive"]/table[@class="table"]/tbody/tr'
     )

     def clean(text):
         # Drop CR/LF/TAB and turn "/" into "-".
         return text.replace("\r", "").replace("\n", "").replace(
             "\t", "").replace("/", "-")

     def to_timestamp(date_text):
         # Reverse the three dash-separated fields and append midnight.
         # The third field is also returned; callers store it as the
         # fiscal year.
         parts = date_text.split("-")
         first, second, third = (parts[0].strip(), parts[1].strip(),
                                 parts[2].strip())
         return third + "-" + second + "-" + first + " 00:00:00", third

     # (title keywords, season code), checked in this order; first hit wins.
     season_rules = (
         (self.Q1_list, "Q1"),
         (self.Q2_list, "Q2"),
         (self.Q3_list, "Q3"),
         (self.FY_list, "FY"),
         (self.Financial_report_list, ""),
     )

     for row in rows:
         links = row.xpath('./td/a/@href').extract()
         if not links:
             # No document link in this row: nothing to download.
             continue
         mark = self.pattern.search(links[0])
         if mark is None:
             # Link does not carry a mark; skip instead of crashing the
             # whole callback with AttributeError on None.
             continue

         item = GermanyItem()
         item["doc_source_url"] = links[0]
         item["latest_mark"] = mark.group(1)
         if int(item["latest_mark"]) <= int(latest_mark):
             # Stop at the first row that is not newer than the stored
             # mark; the remaining rows are not examined.
             break

         # time.strftime uses the current local time when no tuple is given.
         now = time.strftime('%Y-%m-%d %H:%M:%S')
         item["country_code"] = "DEU"
         item["exchange_market_code"] = "Frankfurt"
         item["company_code"] = company_id
         item["financial_reporting_standard_code"] = "IFRS/German GAAP"
         item["language_written_code"] = "de-DE"
         item["doc_type"] = "pdf"
         item["is_doc_url_direct"] = 1
         item["doc_downloaded_timestamp"] = now
         item["currency_code"] = "AUD"
         item["gmt_create"] = now
         item["user_create"] = "zx"
         item["is_downloaded"] = 1

         period = clean(row.xpath('./td[1]/text()').extract()[0]).split(
             " till ")
         item["start_date"], fiscal_year = to_timestamp(period[0])
         item["end_date"], _ = to_timestamp(period[1])
         # Stripped like the date fields, so stray whitespace cannot end up
         # inside doc_local_path.
         item["fiscal_year"] = fiscal_year

         title = clean(row.xpath('./td[2]/text()').extract()[0])
         item["origin_pdf_name"] = title
         for keywords, season in season_rules:
             if any(word in title for word in keywords):
                 item["financial_statement_season_type_code"] = season
                 item["announcement_type"] = "1"
                 break
         else:
             # NOTE(review): the season code is left unset for titles that
             # match no keyword list, as before - confirm consumers cope.
             item["announcement_type"] = "0"
         item["report_id"] = company_id + self.uniqueIDMaker()

         item["doc_local_path"] = "/volume3/homes3/Germany/" + item[
             "fiscal_year"] + "/" + item["report_id"] + ".pdf"
         item["pdf_name"] = item["report_id"]
         yield item
Example #4
0
 def pdf_func(self, response):
     """Yield a GermanyItem for each table row whose title is a known report.

     ``response.meta`` must carry ``num`` and ``company_id``. A row's title
     is checked against the spider's keyword lists (Q1, Q2, Q3, FY, generic
     financial report, in that order). Rows matching none of them, and
     rows without a document link, are skipped. For each kept row
     ``self.go_heavy_num`` is called on the running ``num`` and its result
     is appended to the ``report_id``.

     Raises:
         IndexError: if a row's period or title cell is missing, or the
             period text does not have the ``<start> till <end>`` shape
             with three "-"/"/"-separated fields per date.
     """
     num = response.meta["num"]
     company_id = response.meta["company_id"]
     rows = response.xpath(
         '//div[@id="main-wrapper"]/div[5]//div[@class="table-responsive"]/table[@class="table"]/tbody/tr'
     )

     def clean(text):
         # Drop CR/LF/TAB and turn "/" into "-".
         return text.replace("\r", "").replace("\n", "").replace(
             "\t", "").replace("/", "-")

     def to_timestamp(date_text):
         # Reverse the three dash-separated fields and append midnight.
         # The third field is also returned; callers store it as the
         # fiscal year.
         parts = date_text.split("-")
         first, second, third = (parts[0].strip(), parts[1].strip(),
                                 parts[2].strip())
         return third + "-" + second + "-" + first + " 00:00:00", third

     # (title keywords, season code stored on the item, two-character code
     # embedded in report_id), checked in this order; first hit wins.
     season_rules = (
         (self.Q1_list, "Q1", "01"),
         (self.Q2_list, "Q2", "02"),
         (self.Q3_list, "Q3", "03"),
         (self.FY_list, "FY", "06"),
         (self.Financial_report_list, None, "00"),
     )

     for row in rows:
         # Classify first so rows that will be dropped cost nothing more.
         title = clean(row.xpath('./td[2]/text()').extract()[0])
         for keywords, season, season_digits in season_rules:
             if any(word in title for word in keywords):
                 break
         else:
             continue

         links = row.xpath('./td/a/@href').extract()
         if not links:
             # No document link in this row: nothing to download, and the
             # running number is not consumed.
             continue

         item = GermanyItem()
         item["country_code"] = "DEU"
         item["exchange_market_code"] = "Frankfurt"
         item["company_code"] = company_id
         item["financial_reporting_standard_code"] = "IFRS/German GAAP"
         item["language_written_code"] = "de-DE"
         item["doc_type"] = "pdf"
         item["is_doc_url_direct"] = 1
         # NOTE(review): fixed timestamp, presumably from a one-off
         # backfill - confirm it is still wanted.
         item["doc_downloaded_timestamp"] = "20171223000000"
         item["currency_code"] = "AUD"
         # time.strftime uses the current local time when no tuple is given.
         item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S')
         item["user_create"] = "root"
         item["is_downloaded"] = 1

         period = clean(row.xpath('./td[1]/text()').extract()[0]).split(
             " till ")
         item["start_date"], fiscal_year = to_timestamp(period[0])
         item["end_date"], _ = to_timestamp(period[1])
         # Stripped like the date fields, so stray whitespace cannot end up
         # inside report_id or doc_local_path.
         item["fiscal_year"] = fiscal_year
         item["doc_source_url"] = links[0]
         item["origin_pdf_name"] = title

         # go_heavy_num's result is concatenated as text below, then turned
         # back into an int for the next row.
         num = self.go_heavy_num(num)
         item["financial_statement_season_type_code"] = season
         item["report_id"] = (company_id + item["fiscal_year"] + "00" +
                              season_digits + "01" + num)
         num = int(num)

         item["doc_local_path"] = "/volum1/homes/Germany/" + item[
             "fiscal_year"] + "/" + item["report_id"] + ".pdf"
         item["pdf_name"] = item["report_id"]
         yield item