def start_requests(self):
    for temp in self.results:
        code = temp[0]
        company_id = temp[1]
        if int(code) >= 600000:
            #if int(code) >= 600827:
            self.report_num_dict[company_id] = "000"
            for each in self.report_type:
                eachdatelist = []
                sql_select = ("select disclosure_date from financial_statement_index "
                              "where country_code = 'CHN' and company_code = %s "
                              "and financial_statement_season_type_code = %s")
                self.cursor.execute(sql_select, [company_id, each["name"]])
                results = self.cursor.fetchall()
                for each_one in results:
                    # normalize "YYYY-MM-DD 00:00:00" to an int like 20190331
                    eachdate = int(str(each_one[0]).replace("-", "").replace(" 00:00:00", ""))
                    eachdatelist.append(eachdate)
                eachdatelist.sort()
                # guard against companies with no prior disclosures in the DB
                newstdate = eachdatelist[-1] if eachdatelist else 0
                #print("%s,%s,%s" % (company_id, each["name"], newstdate))
                time.sleep(1)
                item = ChinaIntroItem()
                item["company_code"] = company_id
                item["financial_statement_season_type_code"] = each["name"]
                item["exchange_market_code"] = "SSE"
                url = self.url1 + code + self.url2 + each["value"] + self.url3
                yield scrapy.Request(url, callback=self.parse,
                                     meta={"item": item, "newstdate": newstdate})
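# A minimal sketch of the yyyymmdd normalization above done with datetime
# instead of string replaces. Assumes disclosure_date comes back as a
# date/datetime or a "YYYY-MM-DD[ HH:MM:SS]" string; _to_yyyymmdd is a
# hypothetical helper, not part of the original spider.
from datetime import datetime

def _to_yyyymmdd(value):
    # drop any time component, then parse just the date part
    text = str(value).split(" ")[0]
    return int(datetime.strptime(text, "%Y-%m-%d").strftime("%Y%m%d"))

# usage: eachdatelist.append(_to_yyyymmdd(each_one[0]))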
def parse(self, response):
    typeCode = response.meta["typeCode"]
    if typeCode == "1":
        tr_list = response.xpath(
            '//table[@class="table_grey_border ms-rteTable-BlueTable_ENG"]'
            '//tr[starts-with(@class, "tr_normal")]')
    else:
        tr_list = response.xpath(
            '//table[@class="table_grey_border"]'
            '//tr[starts-with(@class, "tr_normal")]')
    for temp in tr_list:
        try:
            # build a fresh item per row; reusing one item across yields
            # lets later rows overwrite earlier ones in the pipeline
            item = ChinaIntroItem()
            value_length = temp.xpath('./td[1]/p/text()')
            if len(value_length) != 0:
                item["code"] = "0" + value_length.extract()[0]
            else:
                item["code"] = "0" + temp.xpath('./td[1]/text()').extract()[0]
            item["website_url"] = temp.xpath('./td[3]//a/text()').extract()[0]
            item["doc_source_url"] = None
            yield item
        except Exception as e:
            print(e)
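# A sketch of the td[1] fallback above using SelectorList.get(), which
# returns the first match (or a default) instead of raising IndexError;
# available in Scrapy 1.8+. `row` stands in for the `temp` selector and
# _row_code is a hypothetical helper, not part of the original spider.
def _row_code(row):
    text = row.xpath('./td[1]/p/text()').get()
    if text is None:
        text = row.xpath('./td[1]/text()').get(default="")
    return "0" + text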
def parse(self, response):
    url = response.meta["url"]
    code = response.meta["code"]
    item = ChinaIntroItem()
    # scraped fields -- company
    item["name_origin"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[1]/td[2]/text()').extract()[0]
    item["name_en"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[2]/td[2]/text()').extract()[0]
    item["security_code"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[4]/td[2]/text()').extract()[0]
    ipo_date = response.xpath('//table[@id="1743_detail_smetab1"]//tr[5]/td[2]/text()').extract()[0]
    item["ipo_date"] = str(ipo_date) + " 00:00:00"
    website_url = response.xpath('//table[@id="1743_detail_smetab1"]//tr[10]/td[2]/a/text()').extract()
    if len(website_url) == 0:
        item["website_url"] = None
    else:
        item["website_url"] = website_url[0]
    # hand-filled fields -- company
    item["code"] = code
    item["country_code_listed"] = "CHN"
    item["country_code_origin"] = "CHN"
    item["exchange_market_code"] = "SZSE"
    item["currency_code"] = "CNY"
    item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    item["user_create"] = "root"
    # scraped fields -- detail titles
    item["registered_address_title"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[3]/td[1]/text()').extract()[0]
    item["company_short_name_zh_title"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[4]/td[3]/text()').extract()[0]
    item["Total_share_capital_of_A_shares_title"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[5]/td[3]/text()').extract()[0]
    item["A_shares_circulating_capital_title"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[5]/td[5]/text()').extract()[0]
    item["district_belong_to_title"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[8]/td[5]/text()').extract()[0]
    item["industry_title"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[9]/td[1]/text()').extract()[0]
    # scraped fields -- detail values
    item["registered_address"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[3]/td[2]/text()').extract()[0]
    item["company_short_name_zh"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[4]/td[4]/text()').extract()[0]
    item["Total_share_capital_of_A_shares"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[5]/td[4]/text()').extract()[0]
    item["A_shares_circulating_capital"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[5]/td[6]/text()').extract()[0]
    item["district_belong_to"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[8]/td[6]/text()').extract()[0]
    item["industry"] = response.xpath('//table[@id="1743_detail_smetab1"]//tr[9]/td[2]/text()').extract()[0]
    yield item
def parse(self, response):
    item = ChinaIntroItem()
    item["code"] = response.meta["company_id"]
    try:
        # titles
        item["company_profile_HCK"] = "company_profile"
        item["Issued_Shares_HCK"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[1]/div[1]/span[1]/text()').extract()[0]
        item["Industry_HCK"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[1]/div[2]/span[1]/text()').extract()[0]
        item["Listing_Date_HCK"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[1]/div[3]/span[1]/text()').extract()[0]
        item["Financial_Year_HCK"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[1]/div[4]/span[1]/text()').extract()[0]
        item["Chairman_HCK"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[2]/div[1]/span[1]/text()').extract()[0]
        item["Principal_Office_HCK"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[2]/div[2]/span[1]/text()').extract()[0]
        item["Place_of_Incorporation_HCK"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[2]/div[3]/span[1]/text()').extract()[0]
        item["Listing_Category_HCK"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[2]/div[4]/span[1]/text()').extract()[0]
        item["Registrar_HCK"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[2]/div[8]/span[1]/text()').extract()[0]
        # values
        item["company_profile"] = response.xpath(
            '//div[@class="company_txt col_summary"]/text()').extract()[0]
        item["Issued_Shares"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[1]/div[1]/span[2]/text()').extract()[0]
        item["Industry"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[1]/div[2]/span[2]/span/span/text()').extract()[0]
        Listing_Date = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[1]/div[3]/span[2]/text()').extract()[0]
        try:
            # Listing_Date looks like "3 Mar 2004"; rebuild it as
            # "YYYY-MM-DD 00:00:00" with zero-padded day and month
            parts = str(Listing_Date).split(" ")
            month_map = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
                         "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
                         "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12"}
            item["Listing_Date"] = "%s-%s-%s 00:00:00" % (
                parts[-1], month_map[parts[1]], parts[0].zfill(2))
        except:
            item["Listing_Date"] = "-"
        item["Financial_Year"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[1]/div[4]/span[2]/text()').extract()[0]
        item["Chairman"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[2]/div[1]/span[2]/text()').extract()[0]
        # Principal_Office can span several text nodes, so keep the full list
        item["Principal_Office"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[2]/div[2]/span[2]/text()').extract()
        item["Place_of_Incorporation"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[2]/div[3]/span[2]/text()').extract()[0]
        item["Listing_Category"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[2]/div[4]/span[2]/text()').extract()[0]
        item["Registrar"] = response.xpath(
            '//div[@class="company_detail"]/div[@class="company_list"]/div[2]/div[8]/span[2]/a/text()').extract()[0]
        table_list = response.xpath('//table[@class="table_divi"]/tbody/tr')
        item_list = []
        item["entitlement_HCK"] = "entitlement"
        for temp in table_list:
            item_dict = {}
            item_dict["Date_Announced"] = temp.xpath('./td[1]/text()').extract()[0]
            item_dict["Ex_Date"] = temp.xpath('./td[2]/text()').extract()[0]
            # relative path: the original absolute xpath always read row 1
            item_dict["Details"] = temp.xpath('./td[3]/text()').extract()
            item_dict["Financial_Year_End"] = temp.xpath('./td[4]/text()').extract()[0]
            item_dict["Book_Close_Date"] = temp.xpath('./td[5]/text()').extract()[0]
            item_dict["Payment_Date"] = temp.xpath('./td[6]/text()').extract()[0]
            item_list.append(item_dict)
        item["entitlement"] = item_list
        item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        item["user_create"] = "zx"
        item["doc_source_url"] = None
        yield item
    except:
        # record companies whose detail page could not be parsed
        conn1 = pymysql.connect(host="10.100.4.100", port=3306,
                                db="Standard_database", user="******",
                                passwd="OPDATA", charset="utf8")
        cursor1 = conn1.cursor()
        sql = ("insert into HCK_information_for_loss(company_id, not_have_url) "
               "values (%s, %s)")
        cursor1.execute(sql, [item["code"], response.url])
        conn1.commit()
        cursor1.close()
        conn1.close()
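# The day/month padding above can also be handled by strptime; a minimal
# sketch assuming Listing_Date looks like "3 Mar 2004" (day, English month
# abbreviation, year), as the split/replace chain implies. _listing_date is
# a hypothetical helper, not part of the original spider.
from datetime import datetime

def _listing_date(raw):
    try:
        return datetime.strptime(str(raw).strip(), "%d %b %Y").strftime("%Y-%m-%d 00:00:00")
    except ValueError:
        return "-"   # same fallback as the original except branch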
def parse(self, response):
    company_id = response.meta["company_id"]
    item = ChinaIntroItem()
    # scraped fields -- company
    item["security_code"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[1]/td/text()').extract()[0]
    ipo_date = str(response.xpath(
        '//table[@class="table search_"]//tr[3]/td/a[@target="_blank"]/text()'
    ).extract()[0]).split("/")[0]
    item["ipo_date"] = str(ipo_date) + " 00:00:00"
    item["name_origin"] = str(response.xpath(
        '//table[@class="table search_"]/tbody/tr[6]/td/text()').extract()[0]).split("/")[0]
    item["name_en"] = str(response.xpath(
        '//table[@class="table search_"]/tbody/tr[6]/td/text()').extract()[0]).split("/")[-1]
    website_url = response.xpath(
        '//table[@class="table search_"]/tbody/tr[13]/td/a/text()').extract()
    if len(website_url) == 0:
        item["website_url"] = None
    else:
        item["website_url"] = website_url[0]
    item["status"] = str(response.xpath(
        '//table[@class="table search_"]/tbody/tr[17]/td/text()').extract()[0]).split("/")[0]
    # hand-filled fields -- company
    item["code"] = company_id
    item["country_code_listed"] = "CHN"
    item["country_code_origin"] = "CHN"
    item["exchange_market_code"] = "SSE"
    item["currency_code"] = "CNY"
    item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    item["user_create"] = "root"
    # scraped fields -- detail values
    item["convertible_bonds_for_short"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[4]/td/text()').extract()[0]
    item["company_short_name_zh"] = str(response.xpath(
        '//table[@class="table search_"]/tbody/tr[5]/td/text()').extract()[0]).split("/")[0]
    item["company_short_name_en"] = str(response.xpath(
        '//table[@class="table search_"]/tbody/tr[5]/td/text()').extract()[0]).split("/")[-1]
    item["registered_address"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[7]/td/text()').extract()[0]
    item["mailing_address"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[8]/td/text()').extract()[0]
    item["legal_representative"] = str(response.xpath(
        '//table[@class="table search_"]/tbody/tr[9]/td/text()').extract()[0]).replace(" ", "")
    item["secretary_name"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[10]/td/text()').extract()[0]
    item["e_mail"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[11]/td/a/text()').extract()[0]
    item["phone_number"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[12]/td/text()').extract()[0]
    item["CSRC_industry"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[14]/td/text()').extract()[0]
    item["SSE_industry"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[15]/td/text()').extract()[0]
    item["district_belong_to"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[16]/td/text()').extract()[0]
    item["is_SSE_180_sample_stock"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[18]/td/text()').extract()[0]
    item["is_overseas_listing"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[19]/td/text()').extract()[0]
    item["overseas_listing_land"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[20]/td/text()').extract()[0]
    # scraped fields -- detail titles
    item["convertible_bonds_for_short_title"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[4]/th/text()').extract()[0]
    item["company_short_name_zh_title"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[5]/th/text()').extract()[0]
    item["company_short_name_en_title"] = item["company_short_name_zh_title"]
    item["registered_address_title"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[7]/th/text()').extract()[0]
    item["mailing_address_title"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[8]/th/text()').extract()[0]
    item["legal_representative_title"] = str(response.xpath(
        '//table[@class="table search_"]/tbody/tr[9]/th/text()').extract()[0]).replace(" ", "")
    item["secretary_name_title"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[10]/th/text()').extract()[0]
    item["e_mail_title"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[11]/th/text()').extract()[0]
    item["phone_number_title"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[12]/th/text()').extract()[0]
    item["CSRC_industry_title"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[14]/th/text()').extract()[0]
    item["SSE_industry_title"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[15]/th/text()').extract()[0]
    item["district_belong_to_title"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[16]/th/text()').extract()[0]
    item["is_SSE_180_sample_stock_title"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[18]/th/text()').extract()[0]
    item["is_overseas_listing_title"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[19]/th/text()').extract()[0]
    item["overseas_listing_land_title"] = response.xpath(
        '//table[@class="table search_"]/tbody/tr[20]/th/text()').extract()[0]
    yield item
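# parse() above repeats the same tr[i]/td + tr[i]/th pattern for every
# field. A table-driven sketch of that idea; _fill_titled_fields and the
# row map are illustrative (indices copied from a few xpaths above, not
# exhaustive), not part of the original spider.
FIELD_ROWS = {
    "registered_address": 7,
    "mailing_address": 8,
    "secretary_name": 10,
}

def _fill_titled_fields(response, item, field_rows):
    for field, row in field_rows.items():
        base = '//table[@class="table search_"]/tbody/tr[%d]/' % row
        item[field] = response.xpath(base + 'td/text()').get()
        item[field + "_title"] = response.xpath(base + 'th/text()').get()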
def parse(self, response):
    newstdate = response.meta["newstdate"]
    link_list = response.xpath('//td[@align="left"]//tbody/tr')
    for row in link_list:
        item = ChinaIntroItem()
        item["exchange_market_code"] = response.meta["exchange_market_code"]
        item["company_code"] = response.meta["company_code"]
        item["financial_statement_season_type_code"] = response.meta[
            "financial_statement_season_type_code"]
        """
        if item["company_code"] not in self.code_list:
            self.code_list.append(item["company_code"])
            self.num = 0
        """
        title = row.xpath('./td/a/text()').extract()[0]
        # skip announcements whose title matches any exclusion keyword
        if any(i in title for i in self.Keywords):
            continue
        try:
            item["fiscal_year"] = self.pattern.search(str(title)).group()
            fiscal_year = item["fiscal_year"]
        except:
            item["fiscal_year"] = None
            fiscal_year = "0000"
        date = row.xpath('./td[@align="left"]/span/text()').extract()[0]
        disclosure_date = str(date).replace("[", "").replace("]", "")
        # only keep reports disclosed on or after the newest stored date
        if int(str(disclosure_date).replace("-", "")) >= newstdate:
            item["disclosure_date"] = disclosure_date + " 00:00:00"
            pdf_link = row.xpath('./td[@align="left"]/a/@href').extract()[0]
            item["doc_source_url"] = "http://disclosure.szse.cn/" + pdf_link
            season_num = self.jud_season_num(
                item["financial_statement_season_type_code"])
            num = int(self.report_num_dict[item["company_code"]]) + 1
            num = self.go_heavy_num(num)
            item["report_id"] = (item["company_code"] + fiscal_year
                                 + "00" + season_num + "01" + num)
            #report_num = re.search("CHN\d{15}(\d{3})", str(item["report_id"])).group(1)
            self.report_num_dict[item["company_code"]] = num
            #print(item["report_id"])
            item["doc_local_path"] = ("/volume1/homes/China/" + str(fiscal_year)
                                      + "/" + item["report_id"] + ".pdf")
            item["country_code"] = "CHN"
            item["financial_reporting_standard_code"] = "CAS"
            item["doc_type"] = "pdf"
            item["is_doc_url_direct"] = 1
            item["is_downloaded"] = 1
            item["currency_code"] = "CNY"
            item["language_written_code"] = "zh-simple"
            item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(time.time()))
            item["doc_downloaded_timestamp"] = item["gmt_create"]
            item["user_create"] = "root"
            item["file_name"] = title
            item["spiderName"] = "shenzhen_download_spider"
            yield item
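# go_heavy_num is defined elsewhere in the spider; judging by the "000"
# seed and the string counters stored in report_num_dict above, it appears
# to zero-pad a running per-company number. A sketch of that contract (an
# assumption about go_heavy_num's behavior, not its actual code):
def _next_report_num(current):
    # "000" -> "001", "041" -> "042"
    return str(int(current) + 1).zfill(3)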