Example #1
    def parse_latest(self,response):
        aLists = response.body.split("],[")
        announcementType = u"最新上市公司公告".encode('utf-8')
        today = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
        length = len(aLists)
        output = sys.stdout
        for index, aList in enumerate(aLists):
            fields = aList.split(",")
            pdfUrl = fields[1].strip('" ')
            # strip characters that are illegal in file names, then re-encode GBK -> UTF-8
            pdfTitle = re.sub(r'[":?/*<>|\\ ]', '', fields[2].strip('":?/*<>|\\ ')).decode('gbk').encode('utf-8')
            pdfTime = fields[5].strip('" ')

            if pdfTime == today:
                relativeFilePath = "CCXR_Spider\\Data\\"+"{}\\{}\\{}\\".format(today,self.tag,announcementType)
                dirPath = unicode(self.basepath+relativeFilePath)
                savePath = unicode(dirPath + "{}.pdf".format(pdfTitle))
                if not os.path.exists(dirPath):
                    os.makedirs(dirPath)
                try:
                    urllib.urlretrieve(self.baseUrl + str(pdfUrl), savePath)
                    item = SpiderItem(
                        AnnouncementTitle=pdfTitle,
                        AnnouncementFrom=self.tag,
                        ReportingDate=today,
                        AnnouncementSort=announcementType,
                        AnnouncementPath=unicode(relativeFilePath+"{}.pdf".format(pdfTitle))
                    )
                    yield item
                except Exception as e:
                    print "Exception:", pdfTitle, pdfUrl, e
            process = '#' * int((index + 1) * 100.0 / length) + '=' * int((length - index - 1) * 100.0 / length)
            output.write("\r[%s] %s complete :" % (self.tag,announcementType) + process + "%.0f%%" % int((index + 1) * 100.0 / length))
            output.flush()
        print
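
The same sanitizing regex is written inline in every example below. A minimal sketch of a shared helper (the name safe_filename is an assumption, not part of the spiders) would centralize it:

    # -*- coding: utf-8 -*-
    # Hypothetical helper: strip characters Windows forbids in file names.
    # A sketch only; the original parse methods inline this re.sub call instead.
    import re

    ILLEGAL_CHARS = re.compile(r'[":?/*<>|\\ ]')

    def safe_filename(title):
        # remove illegal characters; encoding is left to the caller,
        # exactly as in the parse methods
        return ILLEGAL_CHARS.sub('', title).strip()
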
Example #2
    def parse_common(self, response):
        """
        解析债券公告信息,下载PDF文件
        :param response:
        :return:
        """
        selector = Selector(response)
        aSorts = selector.xpath(
            '//div[@class="sse_common_wrap_cn"]/div[@class="sse_title_common"]/h2/text()'
        ).extract()  # bond announcement category
        aTimes = selector.xpath(
            '//div[@class="sse_list_1"]/dl/dd/span/text()').extract()  # announcement dates
        aTitles = selector.xpath(
            '//div[@class="sse_list_1"]/dl/dd/a/text()').extract()  # announcement titles
        #aUrls = selector.xpath('//ul[@class="dl-submenu"]/li[@class=""]/a[@target="_self"]/@href').extract()                # links to the other bond announcement pages
        #usedUrl = selector.xpath('//ul[@class="dl-submenu"]/li[@class="active"]/a[@target="_self"]/@href').extract()       # link of the currently open page
        pdfUrls = selector.xpath(
            '//div[@class="sse_list_1"]/dl/dd/a[@target="_blank"]/@href'
        ).extract()  # PDF document links

        aSort = aSorts[0].encode('utf-8').strip()
        today = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
        relativeFilePath = "ExchangeSpider\\Data\\" + "{}\\{}\\{}\\".format(
            today, self.tag, aSort)
        dirPath = unicode(self.basepath + relativeFilePath)
        length = len(pdfUrls)
        output = sys.stdout
        for index, pdfUrl in enumerate(pdfUrls):
            if aTimes[index] == today:
                aTitle = re.sub(r'[":?/*<>|\\ ]', '',
                                aTitles[index]).encode('utf-8').strip()  # strip illegal filename characters
                savePath = unicode(dirPath + "{}.pdf".format(aTitle))
                if not os.path.exists(dirPath):
                    os.makedirs(dirPath)
                try:
                    urllib.urlretrieve(self.host + str(pdfUrl), savePath)
                    item = SpiderItem(
                        AnnouncementTitle=aTitle,
                        AnnouncementFrom=self.tag,
                        ReportingDate=today,
                        AnnouncementSort=aSort,
                        AnnouncementPath=unicode(relativeFilePath +
                                                 "{}.pdf".format(aTitle)))
                    yield item
                except Exception as e:
                    print e
            process = '#' * int((index + 1) * 100.0 / length) + '=' * int(
                (length - index - 1) * 100.0 / length)
            output.write("\r[%s] %s complete :" % (self.tag, aSort.strip()) +
                         process +
                         "%.0f%%" % int((index + 1) * 100.0 / length))
            output.flush()
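
Each method redraws the same in-place progress line. A sketch of that logic factored into one function (the name draw_progress and the fixed 100-character bar are assumptions) could look like this:

    import sys

    def draw_progress(tag, sort, index, length, output=sys.stdout):
        # percentage complete, truncated the same way the inline versions do
        done = int((index + 1) * 100.0 / length)
        bar = '#' * done + '=' * (100 - done)
        output.write("\r[%s] %s complete :%s%.0f%%" % (tag, sort, bar, done))
        output.flush()

Padding with '=' up to a fixed 100 characters avoids the off-by-one gaps that the two separate int() truncations in the originals can leave.
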
Example #3
    def parse_latest(self, response):
        """
        解析最新公告页面,下载最新公告PDF文件
        :param response:
        :return:
        """
        selector = Selector(response)
        lSorts = selector.xpath(
            '//div[@class="sse_title_common"]/h2/text()').extract()
        lTimes = selector.xpath(
            '//dl[@class="modal_pdf_list"]/dd[@class="just_this_only"]/span/text()'
        ).extract()
        lTitles = selector.xpath(
            '//dl[@class="modal_pdf_list"]/dd[@class="just_this_only"]/em[@class="pdf-first"]/@title'
        ).extract()
        lPdfUrls = selector.xpath(
            '//dl[@class="modal_pdf_list"]/dd[@class="just_this_only"]/em[@class="pdf-first"]/a[@target="_blank"]/@href'
        ).extract()

        lSort = lSorts[0].encode('utf-8').strip()
        today = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
        relativeFilePath = "CCXR_Spider\\Data\\" + "{}\\{}\\{}\\".format(
            today, self.tag, lSort)
        dirPath = unicode(self.basepath + relativeFilePath)
        length = len(lPdfUrls)
        output = sys.stdout
        for index, lPdfUrl in enumerate(lPdfUrls):
            if lTimes[index] == today:
                lTitle = re.sub(r'[":?/*<>|\\ ]', '',
                                lTitles[index].strip())  # strip illegal filename characters
                savePath = unicode(dirPath + "{}.pdf".format(lTitle))
                if not os.path.exists(dirPath):
                    os.makedirs(dirPath)
                try:
                    urllib.urlretrieve(self.host + str(lPdfUrl), savePath)
                    item = SpiderItem(
                        AnnouncementTitle=lTitle,
                        AnnouncementFrom=self.tag,
                        ReportingDate=today,
                        AnnouncementSort=lSort,
                        AnnouncementPath=unicode(relativeFilePath +
                                                 "{}.pdf".format(lTitle)))
                    yield item
                except Exception as e:
                    print e
            process = '#' * int((index + 1) * 100.0 / length) + '=' * int(
                (length - index - 1) * 100.0 / length)
            output.write("\r[%s] %s complete :" % (self.tag, lSort.strip()) +
                         process +
                         "%.0f%%" % int((index + 1) * 100.0 / length))
            output.flush()
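
This parse_latest is nearly identical to parse_common in Example #2; only the XPath selectors differ. A hedged sketch of the shared download step pulled into a helper (fetch_pdf is a hypothetical name; urllib is the Python 2 module the originals use):

    import os
    import urllib

    def fetch_pdf(host, pdf_url, dir_path, save_path):
        # create the per-day directory lazily, then download,
        # exactly as each parse method does inline
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        urllib.urlretrieve(host + str(pdf_url), save_path)
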
Example #4
    def parse_stoconvertible(self, response):
        aSort = u"分离交换的可转换公司债券公告".encode('utf-8')
        aLists = response.body.split("_t.push(")
        today = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
        relativeFilePath = "CCXR_Spider\\Data\\" + "{}\\{}\\{}\\".format(
            today, self.tag, aSort)
        dirPath = unicode(self.basepath + relativeFilePath)
        length = len(aLists[1:])
        output = sys.stdout
        for index, aList in enumerate(aLists[1:]):
            fields = aList.split(",")
            # strip illegal filename characters; [7:] skips the field key in the JS payload
            aTitle = re.sub(r'[":?/*<>|\\ ]', '',
                            fields[0][7:].strip('":?/*<>|\\ ')).encode('utf-8')
            aTime = fields[1][5:].strip('" ')
            if index == length - 2:
                # this entry carries extra trailing JS in the payload, so cut 15 characters
                pdfUrl = fields[2][4:][:-15].strip('" ')
            else:
                pdfUrl = fields[2][4:][:-4].strip('" ')

            if aTime == today:
                savePath = unicode(dirPath + "{}.pdf".format(aTitle))
                if not os.path.exists(dirPath):
                    os.makedirs(dirPath)
                try:
                    urllib.urlretrieve(self.host + str(pdfUrl), savePath)
                    item = SpiderItem(
                        AnnouncementTitle=aTitle,
                        AnnouncementFrom=self.tag,
                        ReportingDate=today,
                        AnnouncementSort=aSort,
                        AnnouncementPath=unicode(relativeFilePath +
                                                 "{}.pdf".format(aTitle)))
                    yield item
                except Exception as e:
                    print e
            process = '#' * int((index + 1) * 100.0 / length) + '=' * int(
                (length - index - 1) * 100.0 / length)
            output.write("\r[%s] %s complete :" % (self.tag, aSort.strip()) +
                         process +
                         "%.0f%%" % int((index + 1) * 100.0 / length))
            output.flush()
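
The fixed offsets above (7, 5 and 4 characters) suggest the payload fields are named title, date and url. Under that assumption, which is inferred from the slicing rather than confirmed by the source, a regex sketch that avoids the magic offsets would be:

    import re

    # group layout is an assumption inferred from the character offsets above
    ENTRY = re.compile(r'_t\.push\(\{title:"([^"]*)",date:"([^"]*)",url:"([^"]*)"')

    def parse_push_entries(body):
        # returns (title, date, url) tuples from the raw JS response body
        return ENTRY.findall(body)
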
Example #5
    def parse_bond(self, response):
        """
        下载深交所债券公告,由于无法从返回页面直接解析,因此直接抓取js返回数据
        :param response:
        :return:
        """
        #aTitles = selector.xpath('//table[@class="ggnr"]/tbody/tr/td[@class="td2"]/a[@target="_blank"]/text()').extract()
        #aTimes = selector.xpath('//table[@class="ggnr"]/tbody/tr/td[@class="td2"]/span[@class="link1"]/text()').extract()
        #pdfUrls = selector.xpath('//table[@class="ggnr"]/tbody/tr/td[@class="td2"]/a[@target="_blank"]/@href').extract()

        aLists = response.body.split("],[")
        today = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
        announcementType = u"债券公告".encode('utf-8')
        length = len(aLists)
        output = sys.stdout
        for index, aList in enumerate(aLists):
            fields = aList.split(",")
            pdfUrl = fields[1].strip('" ')
            # strip characters that are illegal in file names, then re-encode GBK -> UTF-8
            pdfTitle = re.sub(r'[":?/*<>|\\ ]', '', fields[2].strip('":?/*<>|\\ ')).decode('gbk').encode('utf-8')
            pdfTime = fields[5].strip('" ')

            if pdfTime == today:
                relativeFilePath = "ExchangeSpider\\Data\\"+"{}\\{}\\{}\\".format(today,self.tag,announcementType)
                dirPath = unicode(self.basepath+relativeFilePath)
                savePath = unicode(dirPath + "{}.pdf".format(pdfTitle))
                if not os.path.exists(dirPath):
                    os.makedirs(dirPath)
                try:
                    urllib.urlretrieve(self.baseUrl + str(pdfUrl), savePath)
                    item = SpiderItem(
                        AnnouncementTitle=pdfTitle,
                        AnnouncementFrom=self.tag,
                        ReportingDate=today,
                        AnnouncementSort=announcementType,
                        AnnouncementPath=unicode(relativeFilePath+"{}.pdf".format(pdfTitle))
                    )
                    yield item
                except Exception as e:
                    print "Exception:", pdfTitle, pdfUrl, e
            process = '#' * int((index + 1) * 100.0 / length) + '=' * int((length - index - 1) * 100.0 / length)
            output.write("\r[%s] %s complete :" % (self.tag, announcementType) + process + "%.0f%%" % int((index + 1) * 100.0 / length))
            output.flush()
        print
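
Every method rebuilds the date string the same way. As a side note, time.strftime falls back to the current local time when no time tuple is passed, so a one-line equivalent (a sketch, not a change the originals make) is:

    import time

    def today_str():
        # equivalent to strftime("%Y-%m-%d", localtime(int(time.time())))
        return time.strftime("%Y-%m-%d")
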
Example #6
    def parse(self, response):
        """
        Ajax返回数据解析
        """
        jsonObjs = json.loads(response.text)['announcements']
        length = len(jsonObjs)
        output = sys.stdout
        today = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
        aSort = u"监管机构公告".encode('utf-8')
        relativeFilePath = "ExchangeSpider\\Data\\" + "{}\\{}\\{}\\".format(
            today, self.tag, aSort)
        dirPath = unicode(self.basepath + relativeFilePath)
        for index, jsonObj in enumerate(jsonObjs):
            # strip illegal filename characters and convert the encoding
            announcementTitle = re.sub(r'[":?/*<>|\\ ]', '',
                                       jsonObj['announcementTitle']).encode('utf-8')
            adjunctUrl = jsonObj['adjunctUrl']  # announcement link
            # announcementTime is a millisecond timestamp; convert to YYYY-MM-DD
            announcementTime = time.strftime(
                "%Y-%m-%d",
                time.localtime(int(jsonObj['announcementTime']) // 1000))

            adjunctType = jsonObj['adjunctType']
            if today == announcementTime:
                if not os.path.exists(dirPath):
                    os.makedirs(dirPath)
                if adjunctType == "PDF":
                    try:
                        savePath = unicode(dirPath +
                                           "{}.pdf".format(announcementTitle))
                        urllib.urlretrieve(self.host + str(adjunctUrl),
                                           savePath)
                        item = SpiderItem(
                            AnnouncementTitle=announcementTitle,
                            AnnouncementFrom=self.tag,
                            ReportingDate=today,
                            AnnouncementSort=aSort,
                            AnnouncementPath=unicode(
                                relativeFilePath +
                                "{}.pdf".format(announcementTitle)))
                        yield item
                    except Exception:
                        print "Exception:", announcementTitle, " ", adjunctUrl
                if adjunctType == "TXT":
                    try:
                        url = self.host + str(adjunctUrl)
                        html = requests.get(url)
                        list0 = re.findall(r'(?<=\[).*?(?=\])', html.text)
                        content = json.loads(list0[0])
                        contentTitle = re.sub('[":?/*<>|\ ]', '',
                                              content['Title']).encode('utf-8')
                        contentZw = content['Zw'].encode('utf-8')
                        savePath = unicode(dirPath +
                                           "{}.html".format(contentTitle))
                        with open(savePath, 'wb') as fs:
                            fs.write(contentZw)
                            item = SpiderItem(
                                AnnouncementTitle=announcementTitle,
                                AnnouncementFrom=self.tag,
                                ReportingDate=today,
                                AnnouncementSort=aSort,
                                AnnouncementPath=unicode(
                                    relativeFilePath +
                                    "{}.html".format(contentTitle)))
                            yield item
                    except Exception:
                        # malformed TXT payloads are skipped silently
                        pass
            process = '#' * int((index + 1) * 100.0 / length) + '=' * int(
                (length - index - 1) * 100.0 / length)
            output.write("\r[%s] %s complete :" % (self.tag, aSort) + process +
                         "%.0f%%" % int((index + 1) * 100.0 / length))
            output.flush()
        print
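
The TXT branch above pulls a JSON object out of a wrapper by matching everything between the first pair of square brackets. A hedged sketch of that step in isolation (extract_wrapped is a hypothetical name; the wrapper format is assumed from the regex used above, and nested brackets would break it):

    import json
    import re

    def extract_wrapped(text):
        # grab the first [...] payload and parse the JSON object inside,
        # as the TXT branch does; returns None when nothing matches
        matches = re.findall(r'(?<=\[).*?(?=\])', text)
        return json.loads(matches[0]) if matches else None
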