コード例 #1
0
 def start_requests(self):
     """Yield one request per known company found in the newest NSE company CSV.

     The first CSV row is skipped as a header.  A row is used only when its
     first column, re-encoded from GBK to UTF-8, is present in
     ``self.code_list``; the entry of ``self.company_id_list`` at the same
     index becomes the item's ``company_id``.  Every request is handled by
     ``self.parse`` and carries the pre-filled ``IndiaItem`` plus a ``link``
     string in ``meta``.

     The ``decode``/``encode`` round trip assumes byte-string CSV cells
     (Python 2 ``csv`` behaviour).
     """
     newest_name_num = self.get_newest_company_file("NSE")
     # Raw string: same value as before, but the Windows backslashes no longer
     # depend on unrecognised escape sequences being passed through.
     excel_name = r"D:\item\OPDCMS\listed company update\india\data\companyList/NSE_" + newest_name_num + ".csv"

     def to_utf8(cell):
         # Re-encode a GBK cell as UTF-8.
         return cell.decode("gbk").encode("utf-8")

     # The context manager closes the file when the generator finishes;
     # previously the handle was never closed.
     with open(excel_name, "r") as f:
         reader = csv.reader(f)
         for i, row in enumerate(reader):
             if i < 1:
                 continue  # header row
             Symbol = to_utf8(row[0])
             if Symbol not in self.code_list:
                 continue
             item = IndiaItem()
             item["security_code"] = None
             item["info_disclosure_id"] = Symbol
             iNum = self.code_list.index(Symbol)
             item["company_id"] = self.company_id_list[iNum]
             item["name_origin"] = to_utf8(row[1])
             item["name_en"] = item["name_origin"]
             item["First_Listing_Date"] = to_utf8(row[2])
             item["Face_Value"] = to_utf8(row[3])
             item["Paid_Up_Value"] = to_utf8(row[4])
             item["Market_Lot"] = to_utf8(row[5])
             item["ISIN"] = to_utf8(row[6])
             item["country_code_listed"] = "IND"
             # NOTE(review): the list read here is the NSE one, yet the code
             # stored is "BSE" -- left unchanged, confirm which is intended.
             item["exchange_market_code"] = "BSE"
             item["currency_code"] = "INR"
             item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
             item["user_create"] = "root"
             item["website_url"] = None
             item["status"] = None
             url = self.url1 + str(Symbol) + self.url2
             link = self.link + str(Symbol)
             yield scrapy.Request(url, callback=self.parse, meta={"item": item, "link": link})
コード例 #2
0
 def start_requests(self):
     """Yield one request per known company found in the newest BSE company CSV.

     The first CSV row is skipped as a header.  A row is used only when its
     first column (the security code) is present in ``self.code_list``; the
     entry of ``self.company_id_list`` at the same index becomes the item's
     ``company_id``.  Columns 1-8 are copied into the item as-is.  Every
     request targets ``self.url + code + "/"``, is handled by ``self.parse``
     and carries the pre-filled ``IndiaItem`` in ``meta``.
     """
     newest_name_num = self.get_newest_company_file("BSE")
     excel_name = "/data/OPDCMS/india/listed_company_update/company_list/BSE_" + newest_name_num + ".csv"
     # The context manager closes the file when the generator finishes;
     # previously the handle was never closed.
     with open(excel_name, "r") as f:
         reader = csv.reader(f)
         for i, row in enumerate(reader):
             if i < 1:
                 continue  # header row
             code = row[0]
             if code not in self.code_list:
                 continue
             item = IndiaItem()
             item["security_code"] = code
             iNum = self.code_list.index(code)
             item["company_id"] = self.company_id_list[iNum]
             item["Symbol"] = row[1]
             item["name_origin"] = row[2]
             item["name_en"] = item["name_origin"]
             item["status"] = row[3]
             item["Group_Num"] = row[4]
             item["Face_Value"] = row[5]
             item["ISIN"] = row[6]
             item["Industry"] = row[7]
             item["Instrument"] = row[8]
             item["country_code_listed"] = "IND"
             item["exchange_market_code"] = "BSE"
             item["currency_code"] = "INR"
             item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
             item["user_create"] = "zx"
             item["info_disclosure_id"] = None
             url = self.url + str(code) + "/"
             yield scrapy.Request(url, callback=self.parse, meta={"item": item})
コード例 #3
0
 def parse(self, response):
     """Yield one annual-report ``IndiaItem`` for every table row after the first.

     For each row the fiscal year comes from the first cell and the document
     URL from the link in the second cell.  The per-company counter in
     ``self.report_num_dict`` is incremented, passed through
     ``self.go_heavy_num`` and stored back; the stored value forms the tail
     of ``report_id``.
     """
     stamp_format = '%Y-%m-%d %H:%M:%S'
     rows = response.xpath(
         '//div[@class="content"]//table[@cellspacing="1"]//tr')
     # The first row is skipped (the original deleted it from the list).
     for row in rows[1:]:
         item = IndiaItem()
         year_cells = row.xpath('./td[1]/text()').extract()
         item["fiscal_year"] = year_cells[0]
         link_cells = row.xpath('./td[2]/a/@href').extract()
         item["doc_source_url"] = link_cells[0]
         company_code = response.meta["company_id"]
         item["company_code"] = company_code
         item["country_code"] = "IND"
         item["exchange_market_code"] = "BSE"
         item["financial_reporting_standard_code"] = "IFRS/IND AS"
         item["doc_type"] = "pdf"
         item["is_doc_url_direct"] = 1
         item["is_downloaded"] = 1
         item["currency_code"] = "INR"
         item["doc_downloaded_timestamp"] = time.strftime(
             stamp_format, time.localtime(time.time()))
         item["language_written_code"] = "en"
         # Advance the per-company report counter.
         next_num = int(self.report_num_dict[company_code]) + 1
         self.report_num_dict[company_code] = self.go_heavy_num(next_num)
         report_id = (company_code + item["fiscal_year"] + "00" + "06" + "01"
                      + self.report_num_dict[company_code])
         item["report_id"] = report_id
         item["doc_local_path"] = ("/volum1/homes/India/" + item["fiscal_year"]
                                   + "/" + report_id + ".pdf")
         item["gmt_create"] = time.strftime(
             stamp_format, time.localtime(time.time()))
         item["user_create"] = "root"
         item["financial_statement_season_type_code"] = "FY"
         item["file_name"] = report_id
         item["jud"] = 0
         yield item
コード例 #4
0
 def parse(self, response):
     """Yield annual-report items for rows newer than the known fiscal year.

     Each table row's first cell is read as the fiscal year; rows whose year
     (as an integer) is not greater than ``response.meta["max_fiscal_year"]``
     are skipped.  ``report_id`` is the company id followed by the value of
     ``self.uniqueIDMaker()``.
     """
     stamp_format = '%Y-%m-%d %H:%M:%S'
     max_fiscal_year = response.meta["max_fiscal_year"]
     for row in response.xpath('//table[@class="ng-scope"]/tbody/tr'):
         item = IndiaItem()
         item["fiscal_year"] = row.xpath('./td[1]/text()').extract()[0]
         if not int(item["fiscal_year"]) > int(max_fiscal_year):
             continue  # already collected
         relative_link = row.xpath('./td[2]/a/@href').extract()[0]
         item["doc_source_url"] = "https://www.bseindia.com" + relative_link
         item["company_code"] = response.meta["company_id"]
         item["country_code"] = "IND"
         item["exchange_market_code"] = "BSE"
         item["financial_reporting_standard_code"] = "IFRS/IND AS"
         item["doc_type"] = "pdf"
         item["is_doc_url_direct"] = 1
         item["is_downloaded"] = 1
         item["currency_code"] = "INR"
         item["doc_downloaded_timestamp"] = time.strftime(stamp_format, time.localtime(time.time()))
         item["language_written_code"] = "en"
         report_id = item["company_code"] + self.uniqueIDMaker()
         item["report_id"] = report_id
         item["doc_local_path"] = "/volume3/homes3/India/" + item["fiscal_year"] + "/" + report_id + ".pdf"
         item["gmt_create"] = time.strftime(stamp_format, time.localtime(time.time()))
         item["user_create"] = "zx"
         item["financial_statement_season_type_code"] = "FY"
         item["file_name"] = report_id
         item["jud"] = 0
         yield item
コード例 #5
0
 def download(self, response):
     """Yield a CSV statement item when the page's period ends after ``newstdate``.

     The period start and end are read from rows 2 and 3 of the
     ``ContentPlaceHolder1_tbl_typeID`` table, split on "-" and rebuilt as
     ``20<last part>-<month digits>-<first part> 00:00:00``.  When the end
     date string compares greater than ``str(response.meta["newstdate"])``
     the table rows after the first four are handed to ``self.saveExcel``
     and the item is yielded.  Any ``IndexError`` raised inside the body is
     swallowed and nothing is yielded for that response.
     """
     month_pairs = (
         ("Jan", "01"), ("Feb", "02"), ("Mar", "03"), ("Apr", "04"),
         ("May", "05"), ("Jun", "06"), ("Jul", "07"), ("Aug", "08"),
         ("Sep", "09"), ("Oct", "10"), ("Nov", "11"), ("Dec", "12"),
     )

     def month_digits(text):
         # Apply the month-name replacements in calendar order.
         for name, digits in month_pairs:
             text = text.replace(name, digits)
         return text

     stamp_format = '%Y-%m-%d %H:%M:%S'
     newstdate = response.meta["newstdate"]
     item = IndiaItem()
     try:
         start_year = response.xpath(
             '//table[@id="ContentPlaceHolder1_tbl_typeID"]//tr[2]/td[2]/text()'
         ).extract()[0]
         end_year = response.xpath(
             '//table[@id="ContentPlaceHolder1_tbl_typeID"]//tr[3]/td[2]/text()'
         ).extract()[0]
         tr_list = response.xpath(
             '//table[@id="ContentPlaceHolder1_tbl_typeID"]//tr')
         start = str(start_year).split("-")
         end = str(end_year).split("-")
         item["start_date"] = ("20" + start[-1] + "-" + month_digits(start[1])
                               + "-" + start[0] + " 00:00:00")
         item["end_date"] = ("20" + end[-1] + "-" + month_digits(end[1])
                             + "-" + end[0] + " 00:00:00")
         if item["end_date"] > str(newstdate):
             item["company_code"] = response.meta["company_id"]
             item["fiscal_year"] = str(item["end_date"]).split("-")[0]
             item["country_code"] = "IND"
             item["exchange_market_code"] = "BSE"
             item["financial_reporting_standard_code"] = "IFRS/IND AS"
             item["doc_type"] = "csv"
             item["source_url"] = response.url
             item["doc_source_url"] = None
             item["is_doc_url_direct"] = 1
             item["is_downloaded"] = 1
             item["currency_code"] = "INR"
             item["doc_downloaded_timestamp"] = time.strftime(
                 stamp_format, time.localtime(time.time()))
             item["language_written_code"] = "en"
             report_id = item["company_code"] + self.uniqueIDMaker()
             item["report_id"] = report_id
             item["doc_local_path"] = ("/volume3/homes3/India/"
                                       + item["fiscal_year"] + "/"
                                       + report_id + ".csv")
             item["gmt_create"] = time.strftime(
                 stamp_format, time.localtime(time.time()))
             item["user_create"] = "zx"
             item["financial_statement_type_code"] = "IS"
             item["financial_statement_season_type_code"] = response.meta[
                 "type"]
             # Only the rows after the first four are saved.
             del tr_list[0:4]
             self.saveExcel(tr_list, report_id)
             yield item
     except IndexError:
         pass
コード例 #6
0
 def getzip(self, response):
     """Yield a PDF report item for the first download link on the page.

     Nothing happens when the ``viewTable`` contains no link.  Otherwise the
     period dates from ``response.meta`` ("FromDate"/"ToDate", split on "-")
     are rebuilt as ``<last part>-<month digits>-<first part> 00:00:00`` and
     the per-company counter in ``self.report_num_dict`` is advanced.  The
     item is yielded only when the ``self.pattern2`` match on the new
     ``report_id`` is not already in ``response.meta["report_id_list"]``.
     """
     month_pairs = (
         ("Jan", "01"), ("Feb", "02"), ("Mar", "03"), ("Apr", "04"),
         ("May", "05"), ("Jun", "06"), ("Jul", "07"), ("Aug", "08"),
         ("Sep", "09"), ("Oct", "10"), ("Nov", "11"), ("Dec", "12"),
     )

     def month_digits(text):
         # Apply the month-name replacements in calendar order.
         for name, digits in month_pairs:
             text = text.replace(name, digits)
         return text

     stamp_format = '%Y-%m-%d %H:%M:%S'
     report_id_list = response.meta["report_id_list"]
     item = IndiaItem()
     zip_link = response.xpath(
         '//table[@class="viewTable"]//td[@class="t0"]/a/@href').extract()
     if len(zip_link) == 0:
         return

     item["doc_source_url"] = "https://www.nseindia.com" + zip_link[0]
     season_num = response.meta["season_num"]
     season_type_code = response.meta["season_type_code"]
     company_code = response.meta["company_id"]
     item["company_code"] = company_code
     start = str(response.meta["FromDate"]).split("-")
     end = str(response.meta["ToDate"]).split("-")
     item["start_date"] = (start[-1] + "-" + month_digits(start[1])
                           + "-" + start[0] + " 00:00:00")
     item["end_date"] = (end[-1] + "-" + month_digits(end[1])
                         + "-" + end[0] + " 00:00:00")
     item["fiscal_year"] = str(item["end_date"]).split("-")[0]
     item["country_code"] = "IND"
     item["exchange_market_code"] = "NSE"
     item["financial_reporting_standard_code"] = "IFRS/IND AS"
     item["doc_type"] = "pdf"
     item["is_doc_url_direct"] = 1
     item["is_downloaded"] = 1
     item["currency_code"] = "INR"
     item["doc_downloaded_timestamp"] = time.strftime(
         stamp_format, time.localtime(time.time()))
     item["language_written_code"] = "en"
     # The counter advances even when the item ends up not being yielded.
     next_num = int(self.report_num_dict[company_code]) + 1
     self.report_num_dict[company_code] = self.go_heavy_num(next_num)
     item["report_id"] = (company_code + item["fiscal_year"] + "00"
                          + season_num + "01"
                          + self.report_num_dict[company_code])
     report_key = self.pattern2.search(str(item["report_id"])).group()
     if report_key in report_id_list:
         return

     item["doc_local_path"] = ("/volum1/homes/India/" + item["fiscal_year"]
                               + "/" + item["report_id"] + ".pdf")
     item["gmt_create"] = time.strftime(
         stamp_format, time.localtime(time.time()))
     item["user_create"] = "root"
     item["financial_statement_season_type_code"] = season_type_code
     item["file_name"] = item["report_id"]
     item["jud"] = 1
     yield item
コード例 #7
0
 def getzip(self, response):
     """Yield a CSV statement item and dump the page's table to a local CSV.

     The period dates from ``response.meta`` ("FromDate"/"ToDate", split on
     "-") are rebuilt as ``<last part>-<month digits>-<first part> 00:00:00``
     and the per-company counter in ``self.report_num_dict`` is advanced to
     form ``report_id``.  After the item is yielded, every table row with a
     non-blank title is appended to ``<report_id>.csv`` as ``title,value``;
     newlines are removed from both fields and commas are replaced by
     ``|#|``.

     Rows without a title (or with a blank one) are skipped.  The previous
     ``or``-chained condition was always true, so such a row raised
     ``TypeError`` when ``None`` was concatenated with a string.
     """
     month_pairs = (
         ("Jan", "01"), ("Feb", "02"), ("Mar", "03"), ("Apr", "04"),
         ("May", "05"), ("Jun", "06"), ("Jul", "07"), ("Aug", "08"),
         ("Sep", "09"), ("Oct", "10"), ("Nov", "11"), ("Dec", "12"),
     )

     def month_digits(text):
         # Apply the month-name replacements in calendar order.
         for name, digits in month_pairs:
             text = text.replace(name, digits)
         return text

     def clean(cell):
         # Keep each field on one line and free of the CSV delimiter.
         return str(cell).replace("\n", "").replace(",", "|#|")

     stamp_format = '%Y-%m-%d %H:%M:%S'
     item = IndiaItem()
     season_num = response.meta["season_num"]
     season_type_code = response.meta["season_type_code"]
     item["company_code"] = response.meta["company_id"]
     start_year = response.meta["FromDate"]
     end_year = response.meta["ToDate"]
     start = str(start_year).split("-")
     end = str(end_year).split("-")
     item["start_date"] = start[-1] + "-" + month_digits(start[1]) + "-" + start[0] + " 00:00:00"
     item["end_date"] = end[-1] + "-" + month_digits(end[1]) + "-" + end[0] + " 00:00:00"
     item["fiscal_year"] = str(item["end_date"]).split("-")[0]
     item["country_code"] = "IND"
     item["exchange_market_code"] = "NSE"
     item["financial_reporting_standard_code"] = "IFRS/IND AS"
     item["doc_type"] = "csv"
     item["source_url"] = response.url
     item["doc_source_url"] = None
     item["is_doc_url_direct"] = 1
     item["is_downloaded"] = 1
     item["currency_code"] = "INR"
     item["doc_downloaded_timestamp"] = time.strftime(stamp_format, time.localtime(time.time()))
     item["language_written_code"] = "en"
     num = int(self.report_num_dict[item["company_code"]]) + 1
     num = self.go_heavy_num(num)
     self.report_num_dict[item["company_code"]] = num
     item["report_id"] = item["company_code"] + item["fiscal_year"] + "00" + season_num + "01" + self.report_num_dict[item["company_code"]]
     item["doc_local_path"] = "/volum1/homes/India/" + item["fiscal_year"] + "/" + item["report_id"] + ".csv"
     item["gmt_create"] = time.strftime(stamp_format, time.localtime(time.time()))
     item["user_create"] = "root"
     item["financial_statement_season_type_code"] = season_type_code
     item["file_name"] = item["report_id"]
     yield item

     csv_lines = []
     for temp in response.xpath('//td[@valign="top"]//table//tr'):
         title_cells = temp.xpath('./td[1]//text()').extract()
         if not title_cells:
             continue  # no title cell: nothing to record for this row
         title = clean(title_cells[0])
         if not title.strip():
             continue  # blank title
         value_cells = temp.xpath('./td[2]/text()').extract()
         value = clean(value_cells[0]) if value_cells else ""
         csv_lines.append(title + "," + value + "\n")

     # Open the output once (append mode, as before) instead of once per row;
     # the file is still only touched when there is something to write.
     if csv_lines:
         # Raw string: same path value, backslashes kept literal.
         with open(r"D:\item\OPDCMS\listed company update\india\data\csv/" + item["report_id"] + ".csv", "a") as f:
             f.writelines(csv_lines)
コード例 #8
0
 def download(self, response):
     """Yield a CSV statement item unless its report key is already known.

     The period start and end are read from rows 2 and 3 of the
     ``ctl00_ContentPlaceHolder1_tbl_typeID`` table, split on "-" and rebuilt
     as ``20<last part>-<month digits>-<first part> 00:00:00``.  The
     per-company counter in ``self.report_num_dict`` is advanced and used in
     ``report_id`` (season part "06" when ``meta["type"]`` is "FY", otherwise
     "05").  When the ``self.pattern2`` match on ``report_id`` is not in
     ``meta["report_id_list"]``, the table rows after the first four go to
     ``self.saveExcel`` and the item is yielded.
     """
     month_pairs = (
         ("Jan", "01"), ("Feb", "02"), ("Mar", "03"), ("Apr", "04"),
         ("May", "05"), ("Jun", "06"), ("Jul", "07"), ("Aug", "08"),
         ("Sep", "09"), ("Oct", "10"), ("Nov", "11"), ("Dec", "12"),
     )

     def month_digits(text):
         # Apply the month-name replacements in calendar order.
         for name, digits in month_pairs:
             text = text.replace(name, digits)
         return text

     stamp_format = '%Y-%m-%d %H:%M:%S'
     report_id_list = response.meta["report_id_list"]
     code = response.meta["code"]  # looked up as before; not used further
     season_type = response.meta["type"]
     item = IndiaItem()
     start_year = response.xpath('//table[@id="ctl00_ContentPlaceHolder1_tbl_typeID"]//tr[2]/td[2]/text()').extract()[0]
     end_year = response.xpath('//table[@id="ctl00_ContentPlaceHolder1_tbl_typeID"]//tr[3]/td[2]/text()').extract()[0]
     tr_list = response.xpath('//table[@id="ctl00_ContentPlaceHolder1_tbl_typeID"]//tr')
     start = str(start_year).split("-")
     end = str(end_year).split("-")
     item["start_date"] = "20" + start[-1] + "-" + month_digits(start[1]) + "-" + start[0] + " 00:00:00"
     item["end_date"] = "20" + end[-1] + "-" + month_digits(end[1]) + "-" + end[0] + " 00:00:00"
     company_code = response.meta["company_id"]
     item["company_code"] = company_code
     item["fiscal_year"] = str(item["end_date"]).split("-")[0]
     item["country_code"] = "IND"
     item["exchange_market_code"] = "BSE"
     item["financial_reporting_standard_code"] = "IFRS/IND AS"
     item["doc_type"] = "csv"
     item["source_url"] = response.url
     item["doc_source_url"] = None
     item["is_doc_url_direct"] = 1
     item["is_downloaded"] = 1
     item["currency_code"] = "INR"
     item["doc_downloaded_timestamp"] = time.strftime(stamp_format, time.localtime(time.time()))
     item["language_written_code"] = "en"
     # The counter advances even when the item ends up not being yielded.
     next_num = int(self.report_num_dict[company_code]) + 1
     self.report_num_dict[company_code] = self.go_heavy_num(next_num)
     season_part = "06" if season_type == "FY" else "05"
     item["report_id"] = company_code + item["fiscal_year"] + "03" + season_part + "01" + self.report_num_dict[company_code]
     report_key = self.pattern2.search(str(item["report_id"])).group()
     if report_key in report_id_list:
         return
     item["doc_local_path"] = "/volum1/homes/India/" + item["fiscal_year"] + "/" + item["report_id"] + ".csv"
     item["gmt_create"] = time.strftime(stamp_format, time.localtime(time.time()))
     item["user_create"] = "root"
     item["financial_statement_type_code"] = "IS"
     # Only the rows after the first four are saved.
     del tr_list[0:4]
     self.saveExcel(tr_list, item["report_id"])
     yield item
コード例 #9
0
 def parse(self, response):
     """Yield PDF report items from an announcement table, then follow paging.

     Each matched row contributes a PDF link and title (taken from the
     preceding sibling row) and a release date (from the row before that).
     The title is matched against the keyword lists ``self.Q1_list`` ...
     ``self.Financial_list`` in that order; the first list containing a
     keyword found in the title decides the season code and the two-digit
     season part of ``report_id``.  Rows without a PDF link, without any
     title text, or whose title matches no list are skipped.

     When a matched row has no usable date of its own, the date of the last
     matched row that had one is reused; if there is none yet,
     ``disclosure_date`` is ``None`` (this case used to raise ``IndexError``).

     Finally, when the page has a "next" link, a request for it is yielded
     with the same ``code`` and ``company_id`` in ``meta``.
     """
     company_id = response.meta["company_id"]
     code = response.meta["code"]
     stamp_format = '%Y-%m-%d %H:%M:%S'
     # (keyword-list attribute, season type code, season part of report_id).
     # Order matters: the first list with a keyword in the title wins.
     season_rules = (
         ("Q1_list", "Q1", "01"),
         ("Q2_list", "Q2", "02"),
         ("Q3_list", "Q3", "03"),
         ("Q4_list", "Q4", "04"),
         ("Q_list", "Q", "05"),
         ("FY_list", "FY", "06"),
         ("Financial_list", None, "00"),
     )
     month_pairs = (
         ("Jan", "01"), ("Feb", "02"), ("Mar", "03"), ("Apr", "04"),
         ("May", "05"), ("Jun", "06"), ("Jul", "07"), ("Aug", "08"),
         ("Sep", "09"), ("Oct", "10"), ("Nov", "11"), ("Dec", "12"),
     )
     # Raw date text of the most recent matched row that carried its own date.
     last_release_text = None

     data_list = response.xpath('//table[@cellspacing="1"]//tr[@style="background-color:white;height:32px;"]')
     for temp in data_list:
         item = IndiaItem()
         item["company_code"] = company_id
         pdf_url = temp.xpath('./preceding-sibling::tr[1]/td[3]/a/@href').extract()
         title = temp.xpath('./preceding-sibling::tr[1]/td[1]/text()').extract()
         title_backup = temp.xpath('./preceding-sibling::tr[1]/td[1]/a/text()').extract()
         data = temp.xpath('./preceding-sibling::tr[2]/td/text()').extract()
         data_time = self.pattern.findall(str(data))
         data_title = self.pattern.findall(str(title))
         if not pdf_url:
             continue
         item["doc_source_url"] = pdf_url[0]

         # Fall back to the linked title text when the plain text is missing
         # or matches self.pattern.
         if data_title or not title:
             if not title_backup:
                 continue  # no title text at all; previously an IndexError
             raw_title = title_backup[0]
         else:
             raw_title = title[0]
         title_text = str(raw_title).replace("(", "").replace(")", "").replace(",", "").replace("&", "")

         matched_rule = next(
             (rule for rule in season_rules
              if any(keyword in title_text for keyword in getattr(self, rule[0]))),
             None)
         if matched_rule is None:
             continue
         season_type_code, season_num = matched_rule[1], matched_rule[2]

         # get_pdf_time is now called once per title instead of twice.
         pdf_time = self.get_pdf_time(title_text)
         item["end_date"] = pdf_time[0]
         fiscal_year = pdf_time[1]
         item["fiscal_year"] = fiscal_year
         if fiscal_year is None:
             # The item keeps None; ids and paths use a placeholder year.
             fiscal_year = "0000"
         num = int(self.report_num_dict[company_id]) + 1
         num = self.go_heavy_num(num)
         self.report_num_dict[company_id] = num
         item["financial_statement_season_type_code"] = season_type_code
         item["report_id"] = company_id + fiscal_year + "00" + season_num + "01" + self.report_num_dict[company_id]

         if data_time or not data:
             release_text = last_release_text
         else:
             release_text = data[0]
             last_release_text = release_text

         item["disclosure_date"] = None
         if release_text is not None:
             release_time = str(release_text)
             for name, digits in month_pairs:
                 release_time = release_time.replace(name, digits)
             release_time = release_time.replace("  ", "").replace(" ", "")
             release_time_list = re.findall(r"(\d{2})(\d{2})(\d{4})", release_time)
             if release_time_list:
                 day, month, year = release_time_list[0]
                 item["disclosure_date"] = year + "-" + month + "-" + day + " 00:00:00"

         item["country_code"] = "IND"
         item["exchange_market_code"] = "BSE"
         item["financial_reporting_standard_code"] = "IFRS/IND AS"
         item["doc_type"] = "pdf"
         item["is_doc_url_direct"] = 1
         item["is_downloaded"] = 1
         item["currency_code"] = "INR"
         item["doc_downloaded_timestamp"] = time.strftime(stamp_format, time.localtime(time.time()))
         item["language_written_code"] = "en"
         item["doc_local_path"] = "/volum1/homes/India/" + fiscal_year + "/" + item["report_id"] + ".pdf"
         item["gmt_create"] = time.strftime(stamp_format, time.localtime(time.time()))
         item["user_create"] = "root"
         item["file_name"] = item["report_id"]
         item["jud"] = 0
         yield item

     page_link = response.xpath('//span[@id="ctl00_ContentPlaceHolder1_lblNext"]/a/@href').extract()
     if page_link:
         url = "http://www.bseindia.com/corporates/" + page_link[0]
         yield scrapy.Request(url, callback=self.parse, meta={"code": code, "company_id": company_id})
コード例 #10
0
 def parse(self, response):
     """Yield one PDF item per announcement record that has an attachment.

     Records are pulled from ``response.text`` with a regular expression
     capturing the subject, the attachment name and the dissemination
     timestamp.  Records whose attachment name is three characters or shorter
     are skipped.  The subject is matched against the keyword lists
     ``self.Q1_list`` ... ``self.Financial_list`` in that order: the first
     hit sets the season code and ``announcement = 1``; with no hit the
     season code is left unset and ``announcement = 0``.
     """
     stamp_format = '%Y-%m-%d %H:%M:%S'
     # (keyword-list attribute, season type code), checked in this order.
     season_rules = (
         ("Q1_list", "Q1"),
         ("Q2_list", "Q2"),
         ("Q3_list", "Q3"),
         ("Q4_list", "Q4"),
         ("Q_list", "Q"),
         ("FY_list", "FY"),
         ("Financial_list", None),
     )
     records = re.findall(
         '"NEWSSUB":"(.*?)",.*?"ATTACHMENTNAME":"(.*?)",.*?"DissemDT":"(.*?)",',
         response.text)
     for record in records:
         item = IndiaItem()
         item["company_code"] = response.meta["company_id"]
         if not len(record[1]) > 3:
             continue  # no usable attachment name
         attachment = str(record[1]).replace('"', "")
         item["doc_source_url"] = (
             "https://www.bseindia.com/xml-data/corpfiling/AttachLive/"
             + attachment)
         title = record[0]
         disclosed = str(record[2]).split(".")[0]
         item["disclosure_date"] = disclosed.replace("T", " ")
         item["fiscal_year"] = item["disclosure_date"].split("-")[0]

         # The lists are looked up lazily, so later ones are only touched
         # when the earlier ones did not match.
         matched_rule = next(
             (rule for rule in season_rules
              if any(keyword in title for keyword in getattr(self, rule[0]))),
             None)
         if matched_rule is not None:
             item["financial_statement_season_type_code"] = matched_rule[1]
         item["report_id"] = item["company_code"] + self.uniqueIDMaker()
         item["announcement"] = 0 if matched_rule is None else 1

         item["country_code"] = "IND"
         item["exchange_market_code"] = "BSE"
         item["financial_reporting_standard_code"] = "IFRS/IND AS"
         item["doc_type"] = "pdf"
         item["is_doc_url_direct"] = 1
         item["is_downloaded"] = 1
         item["currency_code"] = "INR"
         item["doc_downloaded_timestamp"] = time.strftime(
             stamp_format, time.localtime(time.time()))
         item["language_written_code"] = "en"
         item["doc_local_path"] = ("/volume3/homes3/India/"
                                   + item["fiscal_year"] + "/"
                                   + item["report_id"] + ".pdf")
         item["gmt_create"] = time.strftime(
             stamp_format, time.localtime(time.time()))
         item["user_create"] = "zx"
         item["file_name"] = item["report_id"]
         item["jud"] = 0
         item["pdf_name"] = title
         yield item