Example #1
class CompanylistSpider(scrapy.Spider):
    """Spider that indexes BSE "Result" announcement PDFs for companies whose id starts with IND.

    The class body runs at import time: it opens a MySQL connection and loads
    the (security_code, company_id) rows that ``start_requests`` iterates.
    """

    name = 'downloadPdf_BSE_Q'
    allowed_domains = ['bseindia.com']
    page = 0
    # Per-company running report counter, stored as a zero-padded string.
    report_num_dict = {}
    # Matches the literal text "xa0"; parse() runs it over str(<list of text nodes>).
    pattern = re.compile("xa0")
    # Announcement list URL = url1 + <security code> + url2.
    url1 = "http://www.bseindia.com/corporates/ann.aspx?curpg=1&annflag=1&dt=&dur=A&dtto=&cat=Result&scrip="
    url2 = "&anntype=C"
    # NOTE(review): connection parameters (including the password) are hard-coded here;
    # consider moving them to the project settings.
    conn = pymysql.connect(host="", port=3306, db="opd_common", user="******", passwd="OPDATA", charset="utf8")
    cursor = conn.cursor()
    sql = "select security_code,company_id from company_data_source where mark = 0 and company_id like " + "'IND%' and spider_name = 'BasicInfoBSE'"
    # The query has to be executed before its rows can be fetched.
    cursor.execute(sql)
    results = cursor.fetchall()
    # Keyword lists used by parse() to classify an announcement title; they are
    # tested in the order Q1, Q2, Q3, Q4, Q, FY, Financial and the first hit wins.
    Q1_list = ["Q1", "First Quarter", "1ST Quarter"]
    Q2_list = ["Q2", "THE QUARTER AND HALF YEAR ENDED", "The Quarter And Half Year Ended", "Half Year Ended",
               "Second Quarter", "2Nd Quarter", "Six Months", "HALF YEAR ENDED", "The Quarter And Half Year Ended",
               "HALF YEAR AND YEAR ENDED"]
    Q3_list = ["Q3", "9 months"]
    Q4_list = ["Q4", "4Th Quarter"]
    Q_list = ["Quarter Ended", "QUARTER ENDED", "Quarter", "June", "Jun", "JUNE", "Qtr. June", "September", "Sep",
              "Sept", "SEPTEMBER", "December", "Dec", "March", "Mar", "MAR", "MARCH", "Three Months Ended",
              "quarter ended"]
    FY_list = ["FY", "Annual", "period ended", "The Quarter And Year Ended", "Quarter And Year Ended", "Year Ended",
               "F.Y", "Financial Year", "FINANCIAL YEAR"]
    # Commas added after "MARCH" and "JUNE": without them Python's implicit string
    # concatenation produced the single entries "MARCHDec" and "JUNEQtr. June".
    Financial_list = ["March", "Mar", "MAR", "MARCH", "Dec", "December", "Sep", "Sept", "September", "SEPTEMBER",
                      "September", "Three Months Ended", "Jun", "JUNE", "Qtr. June", "June", "Quarter Ended",
                      "QUARTER ENDED", "Quarter", "Ended", "quarter ended", "Financial Results", "FINANCIAL RESULTS",
                      "Financial Result", "Financial Resuult", "Financial  Results", "FINANCIAL UNAUDITED RESULTS",
                      "Financials Results", "FINANCIAL RESULT", "Financials Result", "FINANCIAL REULTS",
                      "Financial Resut", "Financial Reports", "FINANACIAL RESULT"]

    def get_pdf_time(self, title):
        # Extract a period-end date from an announcement title and return the pair
        # (end_date, fiscal_year): end_date is built as "YYYY-MM-D 00:00:00" and
        # fiscal_year is its leading year part.
        # NOTE(review): the re.compile( call below is truncated in this copy — its regex
        # argument and closing parenthesis are missing, so the method is not valid Python
        # as shown. Restore the pattern from the original source.
        pattern = re.compile(
        pdf_time_list = pattern.findall(title)
        if len(pdf_time_list) != 0:
            # Turn month names into two-digit numbers and drop all spaces so that the
            # first match becomes one run of digits.
            pdf_time = pdf_time_list[0].replace("June", "06").replace("Jun", "06").replace("JUNE", "06").replace(
                "September", "09").replace("Sept", "09").replace("Sep", "09").replace("SEPTEMBER", "09").replace(
                "December", "12").replace("Dec", "12").replace("March", "03").replace("Mar", "03").replace("MARCH","03").replace(
                "MAR", "03").replace("  ", "").replace(" ", "")
            # The digit run is read as <2-digit month><2-digit day><4-digit year>.
            special_time_list = re.findall("(\d{2})(\d{2})(\d{4})", pdf_time)
            for temp in special_time_list:
                month = temp[0]
                date = int(temp[1])
                # NOTE(review): `and` binds tighter than `or`, so this is evaluated as
                # month == "06" or (month == "09" and date >= 31): every June date is
                # forced to 30. Presumably (month in ("06", "09")) and date >= 31 was meant.
                if month == "06" or month == "09" and date >= 31:
                    date = 30
                pdf_standard_time = temp[2] + "-" + temp[0] + "-" + str(date) + " 00:00:00"
                end_date = pdf_standard_time
                fiscal_year = str(pdf_standard_time).split("-")[0]
            # NOTE(review): these two assignments unconditionally discard the values computed
            # in the loop above, and nothing assigns end_date/fiscal_year when the outer `if`
            # is false. It looks like an `else:` line was lost — confirm against the original.
            end_date = None
            fiscal_year = None
        return end_date, fiscal_year

    def go_heavy_num(self, num):
        """Return ``num`` as a string left-padded with zeros to at least three characters."""
        digits = str(num)
        if num < 10:
            return "00" + digits
        if num < 100:
            return "0" + digits
        return digits

    def start_requests(self):
        """Yield one announcement-list request per row of ``self.results``.

        Each row supplies the security code (index 0) and the company id
        (index 1). The company's counter in ``report_num_dict`` is reset to
        "000" before its request is issued.
        """
        for row in self.results:
            security_code, company = row[0], row[1]
            self.report_num_dict[company] = "000"
            request_meta = {"company_id": company, "code": security_code}
            target = "".join((self.url1, str(security_code), self.url2))
            yield scrapy.Request(target, callback=self.parse, meta=request_meta)

    def parse(self, response):
        # Parse one page of a company's BSE announcement list: build an IndiaItem for each
        # announcement row, then follow the "next page" link when one is present.
        # NOTE(review): this copy of the method appears truncated in several places. The
        # report_id assignments in the Q2/Q3/Q4/Q/FY/Financial branches end in a dangling
        # backslash, and several if-blocks are immediately overwritten by the next statement
        # as if an `else:` line was lost (title_backup/title, time_list/data,
        # disclosure_date). It is not valid Python as shown; restore from the original.
        company_id = response.meta["company_id"]
        code = response.meta["code"]
        # NOTE(review): nothing visible here ever appends to time_list, so the
        # time_list[-1] fallback further down would raise IndexError.
        time_list = []
        data_list = response.xpath('//table[@cellspacing="1"]//tr[@style="background-color:white;height:32px;"]')
        for temp in data_list:
            item = IndiaItem()
            item["company_code"] = company_id
            # The link and title are read from the row just before the matched row,
            # the date text from the row two before it.
            pdf_url = temp.xpath('./preceding-sibling::tr[1]/td[3]/a/@href').extract()
            title = temp.xpath('./preceding-sibling::tr[1]/td[1]/text()').extract()
            title_backup = temp.xpath('./preceding-sibling::tr[1]/td[1]/a/text()').extract()
            data = temp.xpath('./preceding-sibling::tr[2]/td/text()').extract()
            # self.pattern looks for the text "xa0" in the stringified lists.
            data_time = self.pattern.findall(str(data))
            data_title = self.pattern.findall(str(title))
            # NOTE(review): this guard indexes pdf_url[0] only when pdf_url is empty, which
            # always raises IndexError — presumably `!= 0` was intended; verify.
            if len(pdf_url) == 0:
                item["doc_source_url"] = pdf_url[0]
                # Prefer the anchor text when the plain title is missing or contains "xa0";
                # punctuation ( ) , & is stripped from whichever title is used.
                if len(data_title) != 0 or len(title) == 0:
                    title = str(title_backup[0]).replace("(", "").replace(")", "").replace(",", "").replace("&", "")
                    title = str(title[0]).replace("(", "").replace(")", "").replace(",", "").replace("&", "")
                # Classify the title by the first keyword list that matches. Every branch
                # does the same work: take end_date/fiscal_year from get_pdf_time, bump the
                # company's report counter, set the season type code and build report_id
                # with a branch-specific two-digit season number.
                # NOTE(review): when no list matches, fiscal_year and item["report_id"] are
                # not assigned in this iteration although both are used further down.
                if any(i in title for i in self.Q1_list):
                    item["end_date"] = self.get_pdf_time(title)[0]
                    fiscal_year = self.get_pdf_time(title)[1]
                    item["fiscal_year"] = fiscal_year
                    # "0000" stands in for the year inside report_id and the local path.
                    if fiscal_year is None:
                        fiscal_year = "0000"
                    num = int(self.report_num_dict[item["company_code"]]) + 1
                    num = self.go_heavy_num(num)
                    self.report_num_dict[item["company_code"]] = num
                    item["financial_statement_season_type_code"] = "Q1"
                    item["report_id"] = item["company_code"] + fiscal_year + "00" + "01" + "01" + self.report_num_dict[item["company_code"]]
                elif any(i in title for i in self.Q2_list):
                    item["end_date"] = self.get_pdf_time(title)[0]
                    fiscal_year = self.get_pdf_time(title)[1]
                    item["fiscal_year"] = fiscal_year
                    if fiscal_year is None:
                        fiscal_year = "0000"
                    num = int(self.report_num_dict[item["company_code"]]) + 1
                    num = self.go_heavy_num(num)
                    self.report_num_dict[item["company_code"]] = num
                    item["financial_statement_season_type_code"] = "Q2"
                    item["report_id"] = item["company_code"] + fiscal_year + "00" + "02" + "01" + \
                elif any(i in title for i in self.Q3_list):
                    item["end_date"] = self.get_pdf_time(title)[0]
                    fiscal_year = self.get_pdf_time(title)[1]
                    item["fiscal_year"] = fiscal_year
                    if fiscal_year is None:
                        fiscal_year = "0000"
                    num = int(self.report_num_dict[item["company_code"]]) + 1
                    num = self.go_heavy_num(num)
                    self.report_num_dict[item["company_code"]] = num
                    item["financial_statement_season_type_code"] = "Q3"
                    item["report_id"] = item["company_code"] + fiscal_year + "00" + "03" + "01" + \
                elif any(i in title for i in self.Q4_list):
                    item["end_date"] = self.get_pdf_time(title)[0]
                    fiscal_year = self.get_pdf_time(title)[1]
                    item["fiscal_year"] = fiscal_year
                    if fiscal_year is None:
                        fiscal_year = "0000"
                    num = int(self.report_num_dict[item["company_code"]]) + 1
                    num = self.go_heavy_num(num)
                    self.report_num_dict[item["company_code"]] = num
                    item["financial_statement_season_type_code"] = "Q4"
                    item["report_id"] = item["company_code"] + fiscal_year + "00" + "04" + "01" + \
                elif any(i in title for i in self.Q_list):
                    item["end_date"] = self.get_pdf_time(title)[0]
                    fiscal_year = self.get_pdf_time(title)[1]
                    item["fiscal_year"] = fiscal_year
                    if fiscal_year is None:
                        fiscal_year = "0000"
                    num = int(self.report_num_dict[item["company_code"]]) + 1
                    num = self.go_heavy_num(num)
                    self.report_num_dict[item["company_code"]] = num
                    item["financial_statement_season_type_code"] = "Q"
                    item["report_id"] = item["company_code"] + fiscal_year + "00" + "05" + "01" + \
                elif any(i in title for i in self.FY_list):
                    item["end_date"] = self.get_pdf_time(title)[0]
                    fiscal_year = self.get_pdf_time(title)[1]
                    item["fiscal_year"] = fiscal_year
                    if fiscal_year is None:
                        fiscal_year = "0000"
                    num = int(self.report_num_dict[item["company_code"]]) + 1
                    num = self.go_heavy_num(num)
                    self.report_num_dict[item["company_code"]] = num
                    item["financial_statement_season_type_code"] = "FY"
                    item["report_id"] = item["company_code"] + fiscal_year + "00" + "06" + "01" + \
                elif any(i in title for i in self.Financial_list):
                    item["end_date"] = self.get_pdf_time(title)[0]
                    fiscal_year = self.get_pdf_time(title)[1]
                    item["fiscal_year"] = fiscal_year
                    if fiscal_year is None:
                        fiscal_year = "0000"
                    num = int(self.report_num_dict[item["company_code"]]) + 1
                    num = self.go_heavy_num(num)
                    self.report_num_dict[item["company_code"]] = num
                    item["financial_statement_season_type_code"] = None
                    item["report_id"] = item["company_code"] + fiscal_year + "00" + "00" + "01" + \
                # Pick the release-date text: fall back to the last remembered value when the
                # row's own date is missing or contains "xa0".
                if len(data_time) != 0 or len(data) == 0:
                    data = time_list[-1]
                    data = data[0]
                # Replace month abbreviations with numbers and remove spaces so the date can
                # be matched as <2 digits><2 digits><4 digits>.
                release_time = str(data).replace("Jan", "01").replace("Feb", "02").replace("Mar", "03").replace(
                    "Apr", "04").replace("May", "05").replace("Jun", "06").replace("Jul", "07").replace("Aug","08").replace(
                    "Sep", "09").replace("Oct", "10").replace("Nov", "11").replace("Dec", "12").replace("  ","").replace(" ","")
                release_time_list = re.findall("(\d{2})(\d{2})(\d{4})", release_time)
                if len(release_time_list) != 0:
                    temp = release_time_list[0]
                    # Groups are reordered to <4-digit>-<second>-<first> 00:00:00.
                    item["disclosure_date"] = temp[2] + "-" + temp[1] + "-" + temp[0] + " 00:00:00"
                    item["disclosure_date"] = None
                # Constant metadata shared by every item this spider emits.
                item["country_code"] = "IND"
                item["exchange_market_code"] = "BSE"
                item["financial_reporting_standard_code"] = "IFRS/IND AS"
                item["doc_type"] = "pdf"
                item["is_doc_url_direct"] = 1
                item["is_downloaded"] = 1
                item["currency_code"] = "INR"
                item["doc_downloaded_timestamp"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                item["language_written_code"] = "en"
                item["doc_local_path"] = "/volum1/homes/India/" + fiscal_year + "/" + item["report_id"] + ".pdf"
                item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                item["user_create"] = "root"
                item["file_name"] = item["report_id"]
                item["jud"] = 0
                yield item
        # Pagination: request the next page with the same company context.
        page_link = response.xpath('//span[@id="ctl00_ContentPlaceHolder1_lblNext"]/a/@href').extract()
        if len(page_link) != 0:
            url = "http://www.bseindia.com/corporates/" + page_link[0]
            yield scrapy.Request(url, callback=self.parse, meta={"code": code, "company_id": company_id})
Example #2
 def start_requests(self):
     """Yield a single request for the Baidu home page, handled by ``self.parse``."""
     yield scrapy.Request("https://www.baidu.com", callback=self.parse)
Example #3
class CompanylistSpider(scrapy.Spider):
    # Spider for NSE financial-result documents; the class body opens a database
    # connection and loads the company rows that start_requests iterates.
    name = 'downloadZip_NSE'
    allowed_domains = ['nseindia.com']
    # Per-company running report counter, stored as a zero-padded string.
    report_num_dict = {}
    # "IND" followed by 15 digits; used to compare report ids against already stored ones.
    pattern2 = re.compile(r"IND\d{15}")
    # Listing URL = url1_* + <symbol> + url2_*, once per broadcast period.
    url1_Last_24_Months = "https://www.nseindia.com/corporates/corpInfo/equities/getFinancialResults.jsp?broadcastPeriod=Last%2024%20Months&symbol="
    url2_Last_24_Months = "&industry=&period="
    url1_More_than_24_Months = "https://www.nseindia.com/corporates/corpInfo/equities/getFinancialResults.jsp?broadcastPeriod=More%20than%2024%20Months&symbol="
    url2_More_than_24_Months = "&industry=&period="
    # Detail-page URL pieces: link1 + <param> + link2 + <seq id> + link3 (see parse).
    link1 = "https://www.nseindia.com/corporates/corpInfo/equities/results_new.jsp?param="
    link2 = "&seq_id="
    link3 = "&industry=-&viewFlag=N&frOldNewFlag=N"
    # NOTE(review): the pymysql.connect( call below is truncated in this copy (arguments
    # after host and the closing parenthesis are missing), so this is not valid Python as
    # shown. Also, `sql` is never passed to cursor.execute() before fetchall() is called.
    conn = pymysql.connect(host="",
    cursor = conn.cursor()
    sql = "select info_disclosure_id,company_id from company_data_source where mark = 0 and company_id like " + "'IND%' and spider_name = 'BasicInfoNSE'"
    results = cursor.fetchall()

    def go_heavy_num(self, num):
        """Format ``num`` as text, zero-padded on the left to a minimum width of three."""
        if num < 10:
            padding = "00"
        elif num < 100:
            padding = "0"
        else:
            padding = ""
        return padding + str(num)

    def start_requests(self):
        # For every company row: reset its report counter, load the report ids already
        # stored for it, and request both the "Last 24 Months" and "More than 24 Months"
        # result listings.
        # NOTE(review): this copy is truncated — the innermost `if` below has no body
        # (presumably an append to report_id_list) and both scrapy.Request( calls have lost
        # their callback/meta wrapper lines. It is not valid Python as shown.
        for temp in self.results:
            code = temp[0]
            company_id = temp[1]
            # Counters for this spider start at "100".
            self.report_num_dict[company_id] = "100"
            report_id_list = []
            sql_jud = 'select report_id from financial_statement_index where country_code = "IND" and exchange_market_code = "NSE" and doc_type = "pdf" and company_code = %s'
            self.cursor.execute(sql_jud, temp[1])
            results = self.cursor.fetchall()
            # The inner loop rebinds `temp`; code/company_id were copied out beforehand.
            for temp in results:
                if temp[0] not in report_id_list:
                    # Reduce each stored report id to its pattern2 match ("IND" + 15 digits).
                    id = self.pattern2.search(temp[0]).group()
                    if id not in report_id_list:
            url1 = self.url1_Last_24_Months + str(
                code) + self.url2_Last_24_Months
            url2 = self.url1_More_than_24_Months + str(
                code) + self.url2_More_than_24_Months
            yield scrapy.Request(url1,
                                     "code": code,
                                     "company_id": company_id,
                                     "report_id_list": report_id_list
            yield scrapy.Request(url2,
                                     "code": code,
                                     "company_id": company_id,
                                     "report_id_list": report_id_list

    def parse(self, response):
        # Scan the listing response for RelatingTo/SeqNumber/FromDate/ToDate entries and
        # request the detail page of every entry whose period type is recognised.
        # NOTE(review): this copy is truncated — the scrapy.Request( call at the end has lost
        # its callback/meta wrapper lines, and the three `= None` assignments after the
        # "Fourth Quarter" branch look like the body of a lost `else:`. Not valid Python as shown.
        report_id_list = response.meta["report_id_list"]
        code = response.meta["code"]
        company_id = response.meta["company_id"]
        # NOTE(review): Scrapy's response.body is bytes; matching the str patterns below
        # against bytes fails on Python 3 — confirm the target version or use response.text.
        data = response.body
        # Each pattern captures one `Key:"value"` fragment of the payload.
        pattern_RelatingTo = re.compile('(?:RelatingTo:".*?")')
        pattern_SeqNumber = re.compile('(?:SeqNumber:".*?")')
        pattern_FromDate = re.compile('(?:FromDate:".*?")')
        pattern_ToDate = re.compile('(?:ToDate:".*?")')
        data_RelatingTo = pattern_RelatingTo.findall(data)
        data_SeqNumber = pattern_SeqNumber.findall(data)
        data_FromDate = pattern_FromDate.findall(data)
        data_ToDate = pattern_ToDate.findall(data)
        # The four result lists are read in parallel by index.
        for temp in range(len(data_RelatingTo)):
            # Keep the text after the last ":" with the quotes removed.
            RelatingTo = data_RelatingTo[temp].split(":")[-1].replace(
                '"', " ").strip()
            SeqNumber = data_SeqNumber[temp].split(":")[-1].replace(
                '"', " ").strip()
            FromDate = data_FromDate[temp].split(":")[-1].replace('"',
                                                                  " ").strip()
            ToDate = data_ToDate[temp].split(":")[-1].replace('"', " ").strip()
            # Each period type maps to a fixed 8-character tag in the link, a season type
            # code and a two-digit season number.
            if RelatingTo == "Annual":
                link = self.link1 + FromDate + ToDate + "ANANCNAE" + code + self.link2 + SeqNumber + self.link3
                season_type_code = "FY"
                season_num = "06"
            elif RelatingTo == "First Quarter":
                link = self.link1 + FromDate + ToDate + "Q1UNNNNE" + code + self.link2 + SeqNumber + self.link3
                season_type_code = "Q1"
                season_num = "01"
            elif RelatingTo == "Second Quarter":
                link = self.link1 + FromDate + ToDate + "Q2ANNCNE" + code + self.link2 + SeqNumber + self.link3
                season_type_code = "Q2"
                season_num = "02"
            elif RelatingTo == "Third Quarter":
                link = self.link1 + FromDate + ToDate + "Q3UNNCNE" + code + self.link2 + SeqNumber + self.link3
                season_type_code = "Q3"
                season_num = "03"
            elif RelatingTo == "Fourth Quarter":
                link = self.link1 + FromDate + ToDate + "Q4ANNNAE" + code + self.link2 + SeqNumber + self.link3
                season_type_code = "Q4"
                season_num = "04"
                link = None
                season_type_code = None
                season_num = None
            if link is not None:
                yield scrapy.Request(link,
                                         "company_id": company_id,
                                         "FromDate": FromDate,
                                         "ToDate": ToDate,
                                         "season_type_code": season_type_code,
                                         "season_num": season_num,
                                         "report_id_list": report_id_list

    def getzip(self, response):
        # Build the index item for one results detail page: document link, period dates,
        # fixed metadata and a freshly numbered report id. The item is yielded only when the
        # id's pattern2 match is not already in report_id_list.
        # NOTE(review): this copy is truncated — the response.xpath( call, the end of the
        # report_id expression (`self.report_num_dict[`) and the gmt_create strftime( call
        # are all cut off. It is not valid Python as shown.
        report_id_list = response.meta["report_id_list"]
        item = IndiaItem()
        zip_link = response.xpath(
        if len(zip_link) != 0:
            item["doc_source_url"] = "https://www.nseindia.com" + zip_link[0]
            season_num = response.meta["season_num"]
            season_type_code = response.meta["season_type_code"]
            item["company_code"] = response.meta["company_id"]
            start_year = response.meta["FromDate"]
            end_year = response.meta["ToDate"]
            # Dates are split on "-" and reassembled as <last>-<month number>-<first>
            # 00:00:00, with the month abbreviation replaced by its two-digit number.
            start = str(start_year).split("-")
            end = str(end_year).split("-")
            item["start_date"] = start[-1] + "-" + start[1].replace(
                "Jan", "01").replace("Feb", "02").replace("Mar", "03").replace(
                    "Apr", "04").replace("May", "05").replace(
                        "Jun", "06").replace("Jul", "07").replace(
                            "Aug", "08").replace("Sep", "09").replace(
                                "Oct", "10").replace("Nov", "11").replace(
                                    "Dec", "12") + "-" + start[0] + " 00:00:00"
            item["end_date"] = end[-1] + "-" + end[1].replace(
                "Jan", "01").replace("Feb", "02").replace("Mar", "03").replace(
                    "Apr", "04").replace("May", "05").replace(
                        "Jun", "06").replace("Jul", "07").replace(
                            "Aug", "08").replace("Sep", "09").replace(
                                "Oct", "10").replace("Nov", "11").replace(
                                    "Dec", "12") + "-" + end[0] + " 00:00:00"
            # The fiscal year is the year part of the end date.
            item["fiscal_year"] = str(item["end_date"]).split("-")[0]
            item["country_code"] = "IND"
            item["exchange_market_code"] = "NSE"
            item["financial_reporting_standard_code"] = "IFRS/IND AS"
            item["doc_type"] = "pdf"
            item["is_doc_url_direct"] = 1
            item["is_downloaded"] = 1
            item["currency_code"] = "INR"
            item["doc_downloaded_timestamp"] = time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["language_written_code"] = "en"
            # Advance the company's counter; this happens even when the item is later
            # skipped as already known.
            num = int(self.report_num_dict[item["company_code"]]) + 1
            num = self.go_heavy_num(num)
            self.report_num_dict[item["company_code"]] = num
            item["report_id"] = item["company_code"] + item[
                "fiscal_year"] + "00" + season_num + "01" + self.report_num_dict[
            # NOTE(review): .group() raises AttributeError when report_id has no
            # "IND" + 15 digits match.
            id = self.pattern2.search(str(item["report_id"])).group()
            if id not in report_id_list:
                item["doc_local_path"] = "/volum1/homes/India/" + item[
                    "fiscal_year"] + "/" + item["report_id"] + ".pdf"
                item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S',
                item["user_create"] = "root"
                item["financial_statement_season_type_code"] = season_type_code
                item["file_name"] = item["report_id"]
                item["jud"] = 1
                yield item
Example #4
class CompanylistSpider(scrapy.Spider):
    """Spider that saves BSE financial-result tables as CSV and indexes them.

    The class body runs at import time: it opens a MySQL connection and loads
    the (security_code, company_id) rows that ``start_requests`` iterates.
    """

    name = 'downloadExcel_BSE'
    allowed_domains = ['bseindia.com']
    # Per-company running report counter, stored as a zero-padded string.
    report_num_dict = {}
    # "IND" followed by 15 digits; used to compare report ids against already stored ones.
    pattern2 = re.compile(r"IND\d{15}")
    # Financials page URL = url1 + <security code> + url2.
    url1 = "http://www.bseindia.com/stock-share-price/stockreach_financials.aspx?scripcode="
    url2 = "&expandable=0"
    # NOTE(review): connection parameters (including the password) are hard-coded here;
    # consider moving them to the project settings.
    conn = pymysql.connect(host="", port=3306, db="opd_common", user="******", passwd="OPDATA", charset="utf8")
    cursor = conn.cursor()
    sql = "select security_code,company_id from company_data_source where company_id like " + "'IND%' and spider_name = 'BasicInfoBSE'"
    # The query has to be executed before its rows can be fetched.
    cursor.execute(sql)
    results = cursor.fetchall()

    def saveExcel(self, tr_list, name, csv_dir=r"D:\item\OPDCMS/report data update\india\data\csv/"):
        """Append the (title, value) pairs of table rows to ``<csv_dir><name>.csv``.

        For every row the first text node of ``td[1]`` is the title and the
        first text node of ``td[2]`` is the value. Commas inside either are
        replaced with ``|#|`` so they cannot be mistaken for the column
        separator. Rows without a title are skipped; a missing value is
        written as an empty string. Nothing is opened or written when no row
        has a title.

        Args:
            tr_list: iterable of row selectors exposing ``xpath(...).extract()``.
            name: file name without extension.
            csv_dir: directory prefix, including its trailing separator;
                defaults to the original hard-coded location.
        """
        lines = []
        for row in tr_list:
            title_cells = row.xpath('./td[1]/text()').extract()
            if not title_cells:
                # No title text: there is nothing meaningful to record for this row.
                continue
            value_cells = row.xpath('./td[2]/text()').extract()
            title = str(title_cells[0]).replace(",", "|#|")
            value = str(value_cells[0]).replace(",", "|#|") if value_cells else ""
            lines.append(title + "," + value + "\n")
        if lines:
            # Open the file once for the whole table instead of once per row.
            with open(csv_dir + name + ".csv", "a") as f:
                f.writelines(lines)

    def go_heavy_num(self, num):
        """Return ``str(num)`` prefixed with "00" below 10, "0" below 100, nothing otherwise."""
        for limit, prefix in ((10, "00"), (100, "0")):
            if num < limit:
                return prefix + str(num)
        return str(num)

    def download(self, response):
        """Build the index item for one BSE results page and save its table as CSV.

        Reads ``report_id_list``, ``type`` and ``company_id`` from
        ``response.meta``. The period start and end are taken from rows 2 and
        3 of the results table. The company's counter in ``report_num_dict``
        is advanced on every call, and the report id is assembled from the
        company code, the fiscal year, "03", a season number ("06" for type
        "FY", "05" otherwise), "01" and the counter.

        The item is yielded, and the table rows after the first four are
        written through ``saveExcel``, only when the ``pattern2`` match of the
        report id is not already in ``report_id_list``.
        """
        report_id_list = response.meta["report_id_list"]
        report_type = response.meta["type"]
        table = '//table[@id="ctl00_ContentPlaceHolder1_tbl_typeID"]'
        months = (("Jan", "01"), ("Feb", "02"), ("Mar", "03"), ("Apr", "04"),
                  ("May", "05"), ("Jun", "06"), ("Jul", "07"), ("Aug", "08"),
                  ("Sep", "09"), ("Oct", "10"), ("Nov", "11"), ("Dec", "12"))

        def to_timestamp(raw_date):
            # "<first>-<Mon>-<last>" becomes "20<last>-<MM>-<first> 00:00:00".
            # Presumably the page shows DD-Mon-YY dates — verify against the site.
            parts = str(raw_date).split("-")
            month = parts[1]
            for abbreviation, number in months:
                month = month.replace(abbreviation, number)
            return "20" + parts[-1] + "-" + month + "-" + parts[0] + " 00:00:00"

        item = IndiaItem()
        start_year = response.xpath(table + '//tr[2]/td[2]/text()').extract()[0]
        end_year = response.xpath(table + '//tr[3]/td[2]/text()').extract()[0]
        tr_list = response.xpath(table + '//tr')
        item["start_date"] = to_timestamp(start_year)
        item["end_date"] = to_timestamp(end_year)
        item["company_code"] = response.meta["company_id"]
        item["fiscal_year"] = str(item["end_date"]).split("-")[0]
        item["country_code"] = "IND"
        item["exchange_market_code"] = "BSE"
        item["financial_reporting_standard_code"] = "IFRS/IND AS"
        item["doc_type"] = "csv"
        item["source_url"] = response.url
        item["doc_source_url"] = None
        item["is_doc_url_direct"] = 1
        item["is_downloaded"] = 1
        item["currency_code"] = "INR"
        item["doc_downloaded_timestamp"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
        item["language_written_code"] = "en"
        num = int(self.report_num_dict[item["company_code"]]) + 1
        num = self.go_heavy_num(num)
        self.report_num_dict[item["company_code"]] = num
        # Annual pages get season number "06", everything else "05". Previously the
        # quarterly id was assigned unconditionally and overwrote the annual one.
        season_num = "06" if report_type == "FY" else "05"
        item["report_id"] = item["company_code"] + item["fiscal_year"] + "03" + season_num + "01" + self.report_num_dict[item["company_code"]]
        # Compare on the pattern2 match ("IND" + 15 digits) of the id rather than the full id.
        report_key = self.pattern2.search(str(item["report_id"])).group()
        if report_key not in report_id_list:
            item["doc_local_path"] = "/volum1/homes/India/" + item["fiscal_year"] + "/" + item["report_id"] + ".csv"
            item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["user_create"] = "root"
            item["financial_statement_type_code"] = "IS"
            # Drop the first four table rows before exporting the remaining ones.
            del tr_list[0:4]
            self.saveExcel(tr_list, item["report_id"])
            yield item

    def judgument_func(self, request_list):
        # Examine the text after the last "=" of every link in request_list and return
        # url_list. Spaces in the chosen links are replaced with "+".
        # NOTE(review): this copy is truncated — the first `if` has no body, and the lines
        # that would add to type_list and url_list (and presumably an `else:` before the
        # "D" loop) are missing. Nothing visible ever appends to url_list. Looks like links
        # ending in "c" are preferred over those ending in "D" — confirm against the original.
        type_list = []
        url_list = []
        for temp in request_list:
            type = str(temp).split("=")[-1]
            if type not in type_list:
        if "c" in type_list:
            for i in request_list:
                type_judgment1 = str(i).split("=")[-1]
                if type_judgment1 == "c":
                    url = i.replace(" ", "+")
            type = "C"
            for j in request_list:
                type_judgment2 = str(j).split("=")[-1]
                if type_judgment2 == "D":
                    url = j.replace(" ", "+")
            type = "D"
        del type_list[:]
        return url_list

    def start_requests(self):
        # For every company row: reset its report counter to "000", load the report ids
        # already stored for it, and request its financials page.
        # NOTE(review): the innermost `if` below has no body in this copy (presumably an
        # append to report_id_list), so the method is not valid Python as shown.
        for temp in self.results:
            code = temp[0]
            company_id = temp[1]
            self.report_num_dict[company_id] = "000"
            report_id_list = []
            sql_jud = 'select report_id from financial_statement_index where country_code = "IND" and exchange_market_code = "BSE" and doc_type = "csv" and company_code = %s'
            self.cursor.execute(sql_jud, temp[1])
            results = self.cursor.fetchall()
            # The inner loop rebinds `temp`; code/company_id were copied out beforehand.
            for temp in results:
                if temp[0] not in report_id_list:
                    # Reduce each stored report id to its pattern2 match ("IND" + 15 digits).
                    id = self.pattern2.search(temp[0]).group()
                    if id not in report_id_list:
            url = self.url1 + str(code) + self.url2
            yield scrapy.Request(url, callback=self.parse, meta={"company_id": company_id, "code": code, "report_id_list": report_id_list})

    def parse(self, response):
        """Request the detail pages linked from the quarterly and annual tables.

        Links from table ``cr`` are requested with type "Q" and links from
        table ``acr`` with type "FY". Each group is first filtered through
        ``judgument_func``, and every request is handled by ``download`` with
        the company context carried in ``meta``.
        """
        meta = response.meta
        known_ids = meta["report_id_list"]
        code = meta["code"]
        company_id = meta["company_id"]
        hrefs_by_type = (
            ("Q", response.xpath('//table[@id="cr"]//tr/td/a/@href').extract()),
            ("FY", response.xpath('//table[@id="acr"]//tr/td/a/@href').extract()),
        )
        for report_type, hrefs in hrefs_by_type:
            for link in self.judgument_func(hrefs):
                request_meta = {"code": code, "company_id": company_id, "type": report_type, "report_id_list": known_ids}
                yield scrapy.Request(link, callback=self.download, meta=request_meta)
Example #5
class CompanylistSpider(scrapy.Spider):
    # Spider for BSE annual-report PDFs; the class body opens a database connection and
    # loads the company rows that start_requests iterates.
    name = 'downloadPdf_BSE_A'
    allowed_domains = ['bseindia.com']
    # Per-company running report counter, stored as a zero-padded string.
    report_num_dict = {}
    # Annual-reports page URL = url1 + <security code> + url2.
    url1 = "http://www.bseindia.com/stock-share-price/stockreach_annualreports.aspx?scripcode="
    url2 = "&expandable=0"
    # NOTE(review): the pymysql.connect( call below is truncated in this copy (arguments
    # after host and the closing parenthesis are missing), so this is not valid Python as
    # shown. Also, `sql` is never passed to cursor.execute() before fetchall() is called.
    conn = pymysql.connect(host="",
    cursor = conn.cursor()
    sql = "select security_code,company_id from company_data_source where mark = 0 and company_id like " + "'IND%' and spider_name = 'BasicInfoBSE'"
    results = cursor.fetchall()

    def go_heavy_num(self, num):
        """Render ``num`` as a string with leading zeros up to a width of three."""
        # One zero is needed below 100 and a second one below 10.
        zeros = (num < 10) + (num < 100)
        return "0" * zeros + str(num)

    def start_requests(self):
        """Yield one annual-reports page request per row of ``self.results``.

        Each row supplies the security code (index 0) and the company id
        (index 1). The company's counter in ``report_num_dict`` is reset to
        "000" first. No callback is passed to the request.
        """
        for row in self.results:
            security_code, company = row[0], row[1]
            self.report_num_dict[company] = "000"
            target = "".join((self.url1, str(security_code), self.url2))
            yield scrapy.Request(target, meta={"company_id": company})

    def parse(self, response):
        # Build one IndiaItem per row of the annual-reports table, skipping the first row.
        # NOTE(review): this copy is truncated — the response.xpath( call, the end of the
        # report_id expression (`self.report_num_dict[`) and the gmt_create strftime( call
        # are all cut off. It is not valid Python as shown.
        tr_list = response.xpath(
        del tr_list[:1]
        for temp in tr_list:
            item = IndiaItem()
            # Column 1 holds the fiscal year, column 2 the link to the document.
            # NOTE(review): [0] raises IndexError for a row missing either cell.
            item["fiscal_year"] = temp.xpath('./td[1]/text()').extract()[0]
            item["doc_source_url"] = temp.xpath('./td[2]/a/@href').extract()[0]
            item["company_code"] = response.meta["company_id"]
            item["country_code"] = "IND"
            item["exchange_market_code"] = "BSE"
            item["financial_reporting_standard_code"] = "IFRS/IND AS"
            item["doc_type"] = "pdf"
            item["is_doc_url_direct"] = 1
            item["is_downloaded"] = 1
            item["currency_code"] = "INR"
            item["doc_downloaded_timestamp"] = time.strftime(
                '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["language_written_code"] = "en"
            # Advance the company's zero-padded report counter.
            num = int(self.report_num_dict[item["company_code"]]) + 1
            num = self.go_heavy_num(num)
            self.report_num_dict[item["company_code"]] = num
            item["report_id"] = item["company_code"] + item[
                "fiscal_year"] + "00" + "06" + "01" + self.report_num_dict[
            item["doc_local_path"] = "/volum1/homes/India/" + item[
                "fiscal_year"] + "/" + item["report_id"] + ".pdf"
            item["gmt_create"] = time.strftime('%Y-%m-%d %H:%M:%S',
            item["user_create"] = "root"
            item["financial_statement_season_type_code"] = "FY"
            item["file_name"] = item["report_id"]
            item["jud"] = 0
            yield item