# Example 1
 def start_requests(self):
     """Build one disclosure-index request per (company, report type).

     Reads (code, company_id) rows from self.results; for SSE main-board
     codes (>= 600000) it looks up the newest disclosure_date already in
     the database for each report type and forwards it via meta so
     parse() can skip filings that are already stored.
     """
     for temp in self.results:
         code, company_id = temp[0], temp[1]
         # Only SSE main-board securities (codes starting at 600000).
         if int(code) < 600000:
             continue
         self.report_num_dict[company_id] = "000"
         for each in self.report_type:
             sql_select = (
                 "select disclosure_date from financial_statement_index "
                 "where country_code = 'CHN' and company_code = %s "
                 "and financial_statement_season_type_code = %s"
             )
             self.cursor.execute(sql_select, [company_id, each["name"]])
             rows = self.cursor.fetchall()
             # Normalize DATE/DATETIME values to comparable ints (YYYYMMDD).
             dates = [
                 int(str(row[0]).replace("-", "").replace(" 00:00:00", ""))
                 for row in rows
             ]
             if not dates:
                 # No disclosure on record for this report type: the
                 # original raised IndexError here; skip it instead.
                 continue
             # max() replaces the original per-row re-sort of the list.
             newstdate = max(dates)
             time.sleep(1)  # throttle requests to the exchange site
             item = ChinaIntroItem()
             item["company_code"] = company_id
             item["financial_statement_season_type_code"] = each["name"]
             item["exchange_market_code"] = "SSE"
             url = self.url1 + code + self.url2 + each["value"] + self.url3
             yield scrapy.Request(
                 url,
                 callback=self.parse,
                 meta={"item": item, "newstdate": newstdate},
             )
 def parse(self, response):
     """Parse a listing table and yield one ChinaIntroItem per row.

     The table layout depends on meta["typeCode"]: "1" pages use the
     English "BlueTable" styling, everything else the plain grey table.
     """
     type_code = response.meta["typeCode"]
     if type_code == "1":
         tr_list = response.xpath(
             '//table[@class="table_grey_border ms-rteTable-BlueTable_ENG"]//tr[starts-with(@class, "tr_normal")]'
         )
     else:
         tr_list = response.xpath(
             '//table[@class="table_grey_border"]//tr[starts-with(@class, "tr_normal")]'
         )
     for row in tr_list:
         try:
             # Create a fresh item per row: the original reused a single
             # instance across yields, so rows still queued in the
             # pipeline could be overwritten by later mutations.
             item = ChinaIntroItem()
             first_cell = row.xpath('./td[1]/p/text()')
             if len(first_cell) != 0:
                 item["code"] = "0" + first_cell.extract()[0]
             else:
                 item["code"] = "0" + row.xpath(
                     './td[1]/text()').extract()[0]
             item["website_url"] = row.xpath(
                 './td[3]//a/text()').extract()[0]
             item["doc_source_url"] = None
             yield item
         except Exception as e:
             # Best-effort per row: malformed rows are logged and skipped.
             print(e)
    def parse(self, response):
        """Parse an SZSE company-profile table into a ChinaIntroItem.

        Every field lives in the table with id "1743_detail_smetab1"; a
        local helper builds the repetitive row/column XPath lookups.
        """
        code = response.meta["code"]

        def cell(tr, td, tail="text()"):
            # First text node of //tr[tr]/td[td] in the profile table;
            # raises IndexError when the cell is absent (same as before).
            return response.xpath(
                '//table[@id="1743_detail_smetab1"]//tr[%d]/td[%d]/%s'
                % (tr, td, tail)
            ).extract()[0]

        item = ChinaIntroItem()
        # Scraped data -- company
        item["name_origin"] = cell(1, 2)
        item["name_en"] = cell(2, 2)
        item["security_code"] = cell(4, 2)
        item["ipo_date"] = str(cell(5, 2)) + " 00:00:00"
        website_url = response.xpath(
            '//table[@id="1743_detail_smetab1"]//tr[10]/td[2]/a/text()'
        ).extract()
        item["website_url"] = website_url[0] if website_url else None
        # Hand-set data -- company
        item["code"] = code
        item["country_code_listed"] = "CHN"
        item["country_code_origin"] = "CHN"
        item["exchange_market_code"] = "SZSE"
        item["currency_code"] = "CNY"
        item["gmt_create"] = time.strftime(
            '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        item["user_create"] = "root"

        # Scraped data -- detail titles
        item["registered_address_title"] = cell(3, 1)
        item["company_short_name_zh_title"] = cell(4, 3)
        item["Total_share_capital_of_A_shares_title"] = cell(5, 3)
        item["A_shares_circulating_capital_title"] = cell(5, 5)
        item["district_belong_to_title"] = cell(8, 5)
        item["industry_title"] = cell(9, 1)
        # Scraped data -- detail values
        item["registered_address"] = cell(3, 2)
        item["company_short_name_zh"] = cell(4, 4)
        item["Total_share_capital_of_A_shares"] = cell(5, 4)
        item["A_shares_circulating_capital"] = cell(5, 6)
        item["district_belong_to"] = cell(8, 6)
        item["industry"] = cell(9, 2)
        yield item
        """
Esempio n. 4
0
    def parse(self, response):
        """Parse an HKEX company-profile page into a ChinaIntroItem.

        On any scraping error the company id and failing URL are written
        to HCK_information_for_loss so the page can be retried later.
        """
        item = ChinaIntroItem()
        item["code"] = response.meta["company_id"]
        try:

            def span(col, row, idx, tail=""):
                # Text nodes of the idx-th <span> (plus optional sub-path)
                # in company_list column `col`, entry `row`.
                return response.xpath(
                    '//div[@class="company_detail"]/div[@class="company_list"]'
                    '/div[%d]/div[%d]/span[%d]%s/text()' % (col, row, idx, tail)
                ).extract()

            # Field labels (titles)
            item["company_profile_HCK"] = "company_profile"
            item["Issued_Shares_HCK"] = span(1, 1, 1)[0]
            item["Industry_HCK"] = span(1, 2, 1)[0]
            item["Listing_Date_HCK"] = span(1, 3, 1)[0]
            item["Financial_Year_HCK"] = span(1, 4, 1)[0]
            item["Chairman_HCK"] = span(2, 1, 1)[0]
            item["Principal_Office_HCK"] = span(2, 2, 1)[0]
            item["Place_of_Incorporation_HCK"] = span(2, 3, 1)[0]
            item["Listing_Category_HCK"] = span(2, 4, 1)[0]
            item["Registrar_HCK"] = span(2, 8, 1)[0]
            # Field values
            item["company_profile"] = response.xpath(
                '//div[@class="company_txt col_summary"]/text()').extract()[0]
            item["Issued_Shares"] = span(1, 1, 2)[0]
            item["Industry"] = span(1, 2, 2, "/span/span")[0]
            listing_date = span(1, 3, 2)[0]
            try:
                # "D Mon YYYY" -> "YYYY-MM-DD 00:00:00".  zfill replaces the
                # original fragile chain of str.replace calls used for day
                # padding; an unknown month token now falls through to the
                # "-" sentinel via KeyError.
                parts = str(listing_date).split(" ")
                months = {"Jan": "01", "Feb": "02", "Mar": "03", "Apr": "04",
                          "May": "05", "Jun": "06", "Jul": "07", "Aug": "08",
                          "Sep": "09", "Oct": "10", "Nov": "11", "Dec": "12"}
                item["Listing_Date"] = "%s-%s-%s 00:00:00" % (
                    parts[-1], months[parts[1]], parts[0].zfill(2))
            except Exception:
                item["Listing_Date"] = "-"
            item["Financial_Year"] = span(1, 4, 2)[0]
            item["Chairman"] = span(2, 1, 2)[0]
            # NOTE(review): stored as a list (no [0]) in the original --
            # kept as-is in case downstream consumers expect a list.
            item["Principal_Office"] = span(2, 2, 2)
            item["Place_of_Incorporation"] = span(2, 3, 2)[0]
            item["Listing_Category"] = span(2, 4, 2)[0]
            item["Registrar"] = span(2, 8, 2, "/a")[0]

            # Dividend / entitlement history table.
            table_list = response.xpath(
                '//table[@class="table_divi"]/tbody/tr')
            item_list = []
            item["entitlement_HCK"] = "entitlement"
            for row in table_list:
                item_dict = {}
                item_dict["Date_Announced"] = row.xpath(
                    './td[1]/text()').extract()[0]
                item_dict["Ex_Date"] = row.xpath(
                    './td[2]/text()').extract()[0]
                # Fixed: the original used an absolute XPath pinned to
                # tr[1], so every row received the FIRST row's Details.
                item_dict["Details"] = row.xpath(
                    './td[3]/text()').extract()
                item_dict["Financial_Year_End"] = row.xpath(
                    './td[4]/text()').extract()[0]
                item_dict["Book_Close_Date"] = row.xpath(
                    './td[5]/text()').extract()[0]
                item_dict["Payment_Date"] = row.xpath(
                    './td[6]/text()').extract()[0]
                item_list.append(item_dict)
            item["entitlement"] = item_list
            item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S',
                                               time.localtime(time.time()))
            item["user_create"] = "zx"
            item["doc_source_url"] = None
            yield item
        except Exception:
            # Record the miss so it can be re-crawled later.
            # SECURITY: credentials are hard-coded here -- they should be
            # moved to configuration/environment, not committed to source.
            conn1 = pymysql.connect(host="10.100.4.100",
                                    port=3306,
                                    db="Standard_database",
                                    user="******",
                                    passwd="OPDATA",
                                    charset="utf8")
            try:
                cursor1 = conn1.cursor()
                sql = "insert into HCK_information_for_loss(company_id, not_have_url)value (%s,%s)"
                cursor1.execute(sql, [item["code"], response.url])
                conn1.commit()
                cursor1.close()
            finally:
                # Guarantee the connection is released even if the insert fails.
                conn1.close()
# Example 5
    def parse(self, response):
        """Parse an SSE company-profile table into a ChinaIntroItem.

        Values live in <td> cells and their labels in <th> cells of the
        same row; two local helpers replace ~30 copies of the same XPath.
        """
        company_id = response.meta["company_id"]

        def td(row):
            # First text node of the value cell in table row `row`;
            # raises IndexError if the cell is missing (same as before).
            return response.xpath(
                '//table[@class="table search_"]/tbody/tr[%d]/td/text()' % row
            ).extract()[0]

        def th(row):
            # First text node of the label cell in table row `row`.
            return response.xpath(
                '//table[@class="table search_"]/tbody/tr[%d]/th/text()' % row
            ).extract()[0]

        item = ChinaIntroItem()
        # Scraped data -- company
        item["security_code"] = td(1)
        ipo_date = str(
            response.xpath(
                '//table[@class="table search_"]//tr[3]/td/a[@target="_blank"]/text()'
            ).extract()[0]).split("/")[0]
        item["ipo_date"] = str(ipo_date) + " 00:00:00"
        # Row 6 holds "chinese name/english name".
        item["name_origin"] = str(td(6)).split("/")[0]
        item["name_en"] = str(td(6)).split("/")[-1]
        website_url = response.xpath(
            '//table[@class="table search_"]/tbody/tr[13]/td/a/text()'
        ).extract()
        item["website_url"] = website_url[0] if website_url else None
        item["status"] = str(td(17)).split("/")[0]
        # Hand-set data -- company
        item["code"] = company_id
        item["country_code_listed"] = "CHN"
        item["country_code_origin"] = "CHN"
        item["exchange_market_code"] = "SSE"
        item["currency_code"] = "CNY"
        item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S',
                                           time.localtime(time.time()))
        item["user_create"] = "root"

        # Scraped data -- detail values
        item["convertible_bonds_for_short"] = td(4)
        item["company_short_name_zh"] = str(td(5)).split("/")[0]
        item["company_short_name_en"] = str(td(5)).split("/")[-1]
        item["registered_address"] = td(7)
        item["mailing_address"] = td(8)
        item["legal_representative"] = str(td(9)).replace(" ", "")
        item["secretary_name"] = td(10)
        item["e_mail"] = response.xpath(
            '//table[@class="table search_"]/tbody/tr[11]/td/a/text()'
        ).extract()[0]
        item["phone_number"] = td(12)
        item["CSRC_industry"] = td(14)
        item["SSE_industry"] = td(15)
        item["district_belong_to"] = td(16)
        item["is_SSE_180_sample_stock"] = td(18)
        item["is_overseas_listing"] = td(19)
        item["overseas_listing_land"] = td(20)
        # Scraped data -- detail titles
        item["convertible_bonds_for_short_title"] = th(4)
        item["company_short_name_zh_title"] = th(5)
        item["company_short_name_en_title"] = item[
            "company_short_name_zh_title"]
        item["registered_address_title"] = th(7)
        item["mailing_address_title"] = th(8)
        item["legal_representative_title"] = str(th(9)).replace(" ", "")
        item["secretary_name_title"] = th(10)
        item["e_mail_title"] = th(11)
        item["phone_number_title"] = th(12)
        item["CSRC_industry_title"] = th(14)
        item["SSE_industry_title"] = th(15)
        item["district_belong_to_title"] = th(16)
        item["is_SSE_180_sample_stock_title"] = th(18)
        item["is_overseas_listing_title"] = th(19)
        item["overseas_listing_land_title"] = th(20)
        yield item
 def parse(self, response):
     """Parse a SZSE disclosure-index page.

     Yields one item per report row whose disclosure date is on or after
     meta["newstdate"] (the newest date already stored), skipping rows
     whose title matches any of self.Keywords.
     """
     newstdate = response.meta["newstdate"]
     # Each <tr> under the left-aligned cell is one report listing.
     link_list = response.xpath('//td[@align="left"]//tbody/tr')
     # 1-based index because XPath positions start at 1.
     for temp in range(1, len(link_list) + 1):
         item = ChinaIntroItem()
         item["exchange_market_code"] = response.meta[
             "exchange_market_code"]
         item["company_code"] = response.meta["company_code"]
         item["financial_statement_season_type_code"] = response.meta[
             "financial_statement_season_type_code"]
         """
         if item["company_code"] not in self.code_list:
             self.code_list.append(item["company_code"])
             self.num = 0
         """
         title = response.xpath('//td[@align="left"]//tbody/tr' + "[" +
                                str(temp) + "]" +
                                '/td/a/text()').extract()[0]
         # Skip rows whose title contains any excluded keyword.
         if any(i in title for i in self.Keywords):
             pass
         else:
             try:
                 # self.pattern presumably extracts the fiscal year from
                 # the title -- TODO confirm against the spider's pattern.
                 item["fiscal_year"] = self.pattern.search(
                     str(title)).group()
                 fiscal_year = item["fiscal_year"]
             except:
                 # No year in the title: keep None but use "0000" in ids.
                 item["fiscal_year"] = None
                 fiscal_year = "0000"
             date = response.xpath(
                 '//td[@align="left"]//tbody/tr' + "[" + str(temp) + "]" +
                 '/td[@align="left"]/span/text()').extract()[0]
             # Dates render as "[YYYY-MM-DD]"; strip the brackets.
             disclosure_date = str(date).replace("[", "").replace("]", "")
             # Only emit reports at least as new as the stored cutoff.
             if int(str(disclosure_date).replace("-", "")) >= newstdate:
                 item["disclosure_date"] = disclosure_date + " 00:00:00"
                 pdf_link = response.xpath(
                     '//td[@align="left"]//tbody/tr' + "[" + str(temp) +
                     "]" + '/td[@align="left"]/a/@href').extract()[0]
                 item[
                     "doc_source_url"] = "http://disclosure.szse.cn/" + pdf_link
                 season_num = self.jud_season_num(
                     item["financial_statement_season_type_code"])
                 # Per-company running sequence number, persisted across
                 # pages in self.report_num_dict (seeded to "000").
                 num = int(self.report_num_dict[item["company_code"]]) + 1
                 num = self.go_heavy_num(num)
                 # report_id layout: company_code + year + "00" + season + "01" + seq.
                 item["report_id"] = item[
                     "company_code"] + fiscal_year + "00" + season_num + "01" + num
                 #report_num = re.search("CHN\d{15}(\d{3})", str(item["report_id"])).group(1)
                 self.report_num_dict[item["company_code"]] = num
                 #print(item["report_id"])
                 item["doc_local_path"] = "/volume1/homes/China/" + str(
                     fiscal_year) + "/" + item["report_id"] + ".pdf"
                 item["country_code"] = "CHN"
                 item["financial_reporting_standard_code"] = "CAS"
                 item["doc_type"] = "pdf"
                 item["is_doc_url_direct"] = 1
                 item["is_downloaded"] = 1
                 item["currency_code"] = "CNY"
                 item["language_written_code"] = "zh-simple"
                 item["gmt_create"] = time.strftime(
                     '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                 item["doc_downloaded_timestamp"] = item["gmt_create"]
                 item["user_create"] = "root"
                 item["file_name"] = title
                 item["spiderName"] = "shenzhen_download_spider"
                 yield item