def parse_latest(self, response):
    """
    Parse the latest listed-company announcements and download the PDF files.
    :param response:
    :return:
    """
    aLists = response.body.split("],[")
    announcementType = u"最新上市公司公告".encode('utf-8')
    today = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
    length = len(aLists)
    output = sys.stdout
    for index, aList in enumerate(aLists):
        pdfUrl = aList.split(",")[1].strip('" ')
        pdfTitle = re.sub('[":?/*<>|\ ]', '',
                          aList.split(",")[2].strip('":?/*<>|\ ')).decode('gbk').encode('utf-8')  # strip characters that are illegal in file names
        pdfTime = aList.split(",")[5].strip('" ')
        if pdfTime == today:
            relativeFilePath = "CCXR_Spider\\Data\\" + "{}\\{}\\{}\\".format(today, self.tag, announcementType)
            dirPath = unicode(self.basepath + relativeFilePath)
            savePath = unicode(dirPath + "{}.pdf".format(pdfTitle))
            if not os.path.exists(dirPath):
                os.makedirs(dirPath)
            try:
                urllib.urlretrieve(self.baseUrl + str(pdfUrl), savePath)
                item = SpiderItem(
                    AnnouncementTitle=pdfTitle,
                    AnnouncementFrom=self.tag,
                    ReportingDate=today,
                    AnnouncementSort=announcementType,
                    AnnouncementPath=unicode(relativeFilePath + "{}.pdf".format(pdfTitle))
                )
                yield item
            except Exception:
                print "Exception:", pdfTitle, " ", pdfUrl
        process = '#' * int((index + 1) * 100.0 / length) + '=' * int((length - index - 1) * 100.0 / length)
        output.write("\r[%s] %s complete :" % (self.tag, announcementType)
                     + process + "%.0f%%" % int((index + 1) * 100.0 / length))
        output.flush()
    print
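# The text progress bar above is rebuilt inline in every parse method. As a
# minimal sketch only (a hypothetical helper, not part of the original spider),
# the same logic could be shared like this:
def render_progress(tag, sort, index, length):
    """Return the one-line progress string written to stdout after each record."""
    done = int((index + 1) * 100.0 / length)
    todo = int((length - index - 1) * 100.0 / length)
    bar = '#' * done + '=' * todo
    return "\r[%s] %s complete :%s%.0f%%" % (tag, sort, bar, done)

# e.g. output.write(render_progress(self.tag, announcementType, index, length))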
def parse_common(self, response):
    """
    Parse the bond announcement information and download the PDF files.
    :param response:
    :return:
    """
    selector = Selector(response)
    aSorts = selector.xpath(
        '//div[@class="sse_common_wrap_cn"]/div[@class="sse_title_common"]/h2/text()'
    ).extract()  # bond announcement category
    aTimes = selector.xpath(
        '//div[@class="sse_list_1"]/dl/dd/span/text()').extract()  # bond announcement dates
    aTitles = selector.xpath(
        '//div[@class="sse_list_1"]/dl/dd/a/text()').extract()  # bond announcement titles
    #aUrls = selector.xpath('//ul[@class="dl-submenu"]/li[@class=""]/a[@target="_self"]/@href').extract()  # links to the other bond announcement pages
    #usedUrl = selector.xpath('//ul[@class="dl-submenu"]/li[@class="active"]/a[@target="_self"]/@href').extract()  # link of the currently opened bond announcement page
    pdfUrls = selector.xpath(
        '//div[@class="sse_list_1"]/dl/dd/a[@target="_blank"]/@href'
    ).extract()  # PDF document links
    aSort = aSorts[0].encode('utf-8').strip()
    today = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
    relativeFilePath = "ExchangeSpider\\Data\\" + "{}\\{}\\{}\\".format(
        today, self.tag, aSort)
    dirPath = unicode(self.basepath + relativeFilePath)
    length = len(pdfUrls)
    output = sys.stdout
    for index, pdfUrl in enumerate(pdfUrls):
        if aTimes[index] == today:
            aTitle = re.sub('[":?/*<>|\ ]', '',
                            aTitles[index]).encode('utf-8').strip()  # strip characters that are illegal in file names
            savePath = unicode(dirPath + "{}.pdf".format(aTitle))
            if not os.path.exists(dirPath):
                os.makedirs(dirPath)
            try:
                urllib.urlretrieve(self.host + str(pdfUrl), savePath)
                item = SpiderItem(
                    AnnouncementTitle=aTitle,
                    AnnouncementFrom=self.tag,
                    ReportingDate=today,
                    AnnouncementSort=aSort,
                    AnnouncementPath=unicode(relativeFilePath + "{}.pdf".format(aTitle)))
                yield item
            except Exception as e:
                print e
        process = '#' * int((index + 1) * 100.0 / length) + '=' * int(
            (length - index - 1) * 100.0 / length)
        output.write("\r[%s] %s complete :" % (self.tag, aSort.strip())
                     + process + "%.0f%%" % int((index + 1) * 100.0 / length))
        output.flush()
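# Every parse method repeats the same re.sub() call to make announcement titles
# safe to use as Windows file names. A sketch of that logic as a shared helper
# (hypothetical; the character class is copied from the methods above):
import re

_ILLEGAL_FILENAME_CHARS = re.compile('[":?/*<>|\ ]')

def safe_filename(title):
    """Drop the characters the spider treats as illegal in file names."""
    return _ILLEGAL_FILENAME_CHARS.sub('', title)

# e.g. aTitle = safe_filename(aTitles[index]).encode('utf-8').strip()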
def parse_latest(self, response):
    """
    Parse the latest-announcements page and download the latest announcement PDFs.
    :param response:
    :return:
    """
    selector = Selector(response)
    lSorts = selector.xpath(
        '//div[@class="sse_title_common"]/h2/text()').extract()
    lTimes = selector.xpath(
        '//dl[@class="modal_pdf_list"]/dd[@class="just_this_only"]/span/text()'
    ).extract()
    lTitles = selector.xpath(
        '//dl[@class="modal_pdf_list"]/dd[@class="just_this_only"]/em[@class="pdf-first"]/@title'
    ).extract()
    lPdfUrls = selector.xpath(
        '//dl[@class="modal_pdf_list"]/dd[@class="just_this_only"]/em[@class="pdf-first"]/a[@target="_blank"]/@href'
    ).extract()
    lSort = lSorts[0].encode('utf-8').strip()
    today = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
    relativeFilePath = "CCXR_Spider\\Data\\" + "{}\\{}\\{}\\".format(
        today, self.tag, lSort)
    dirPath = unicode(self.basepath + relativeFilePath)
    length = len(lPdfUrls)
    output = sys.stdout
    for index, lPdfUrl in enumerate(lPdfUrls):
        if lTimes[index] == today:
            lTime = lTimes[index].strip()
            lTitle = re.sub('[":?/*<>|\ ]', '',
                            lTitles[index].strip())  # strip characters that are illegal in file names
            savePath = unicode(dirPath + "{}.pdf".format(lTitle))
            if not os.path.exists(dirPath):
                os.makedirs(dirPath)
            try:
                urllib.urlretrieve(self.host + str(lPdfUrl), savePath)
                item = SpiderItem(
                    AnnouncementTitle=lTitle,
                    AnnouncementFrom=self.tag,
                    ReportingDate=today,
                    AnnouncementSort=lSort,
                    AnnouncementPath=unicode(relativeFilePath + "{}.pdf".format(lTitle)))
                yield item
            except Exception as e:
                print e
        process = '#' * int((index + 1) * 100.0 / length) + '=' * int(
            (length - index - 1) * 100.0 / length)
        output.write("\r[%s] %s complete :" % (self.tag, lSort.strip())
                     + process + "%.0f%%" % int((index + 1) * 100.0 / length))
        output.flush()
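# parse_common() and parse_latest() above walk several parallel lists (dates,
# titles, PDF URLs) by index. A sketch of the same pairing with zip(), assuming
# the XPath queries return the lists in matching order as the code above does:
def iter_todays_entries(times, titles, urls, today):
    """Yield (title, url) pairs for the entries published today."""
    for aTime, aTitle, aUrl in zip(times, titles, urls):
        if aTime == today:
            yield aTitle, aUrl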
def parse_stoconvertible(self, response):
    """
    Parse the announcement list for separately traded convertible corporate bonds
    and download the PDF files.
    :param response:
    :return:
    """
    aSort = u"分离交换的可转换公司债券公告".encode('utf-8')
    aLists = response.body.split("_t.push(")
    today = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
    relativeFilePath = "CCXR_Spider\\Data\\" + "{}\\{}\\{}\\".format(
        today, self.tag, aSort)
    dirPath = unicode(self.basepath + relativeFilePath)
    length = len(aLists[1:])
    output = sys.stdout
    for index, aList in enumerate(aLists[1:]):
        aTitle = re.sub(
            '[":?/*<>|\ ]', '',
            aList.split(",")[0][7:].strip('":?/*<>|\ ')).encode('utf-8')  # strip characters that are illegal in file names
        aTime = aList.split(",")[1][5:].strip('" ')
        if index == length - 2:
            pdfUrl = (aList.split(",")[2][4:])[:-15:].strip('" ')
        else:
            pdfUrl = (aList.split(",")[2][4:])[:-4:].strip('" ')
        if aTime == today:
            savePath = unicode(dirPath + "{}.pdf".format(aTitle))
            if not os.path.exists(dirPath):
                os.makedirs(dirPath)
            try:
                urllib.urlretrieve(self.host + str(pdfUrl), savePath)
                item = SpiderItem(
                    AnnouncementTitle=aTitle,
                    AnnouncementFrom=self.tag,
                    ReportingDate=today,
                    AnnouncementSort=aSort,
                    AnnouncementPath=unicode(relativeFilePath + "{}.pdf".format(aTitle)))
                yield item
            except Exception as e:
                print e
        process = '#' * int((index + 1) * 100.0 / length) + '=' * int(
            (length - index - 1) * 100.0 / length)
        output.write("\r[%s] %s complete :" % (self.tag, aSort.strip())
                     + process + "%.0f%%" % int((index + 1) * 100.0 / length))
        output.flush()
def parse_bond(self, response):
    """
    Download SZSE bond announcements. The returned page cannot be parsed
    directly, so the raw JS data returned by the endpoint is scraped instead.
    :param response:
    :return:
    """
    #aTitles = selector.xpath('//table[@class="ggnr"]/tbody/tr/td[@class="td2"]/a[@target="_blank"]/text()').extract()
    #aTimes = selector.xpath('//table[@class="ggnr"]/tbody/tr/td[@class="td2"]/span[@class="link1"]/text()').extract()
    #pdfUrls = selector.xpath('//table[@class="ggnr"]/tbody/tr/td[@class="td2"]/a[@target="_blank"]/@href').extract()
    aLists = response.body.split("],[")
    today = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
    announcementType = u"债券公告".encode('utf-8')
    length = len(aLists)
    output = sys.stdout
    for index, aList in enumerate(aLists):
        pdfUrl = aList.split(",")[1].strip('" ')
        pdfTitle = re.sub('[":?/*<>|\ ]', '',
                          aList.split(",")[2].strip('":?/*<>|\ ')).decode('gbk').encode('utf-8')  # strip characters that are illegal in file names
        pdfTime = aList.split(",")[5].strip('" ')
        if pdfTime == today:
            relativeFilePath = "ExchangeSpider\\Data\\" + "{}\\{}\\{}\\".format(today, self.tag, announcementType)
            dirPath = unicode(self.basepath + relativeFilePath)
            savePath = unicode(dirPath + "{}.pdf".format(pdfTitle))
            if not os.path.exists(dirPath):
                os.makedirs(dirPath)
            try:
                urllib.urlretrieve(self.baseUrl + str(pdfUrl), savePath)
                item = SpiderItem(
                    AnnouncementTitle=pdfTitle,
                    AnnouncementFrom=self.tag,
                    ReportingDate=today,
                    AnnouncementSort=announcementType,
                    AnnouncementPath=unicode(relativeFilePath + "{}.pdf".format(pdfTitle))
                )
                yield item
            except Exception:
                print "Exception:", pdfTitle, " ", pdfUrl
        process = '#' * int((index + 1) * 100.0 / length) + '=' * int((length - index - 1) * 100.0 / length)
        output.write("\r[%s] %s complete :" % (self.tag, announcementType)
                     + process + "%.0f%%" % int((index + 1) * 100.0 / length))
        output.flush()
    print
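# parse_bond() and the "],[" variant of parse_latest() rely on fixed field
# positions inside each comma-separated JS record: index 1 holds the PDF URL,
# index 2 the GBK-encoded title and index 5 the publication date. A sketch of
# that record handling in one place (the field order is taken from the code
# above, not from any SZSE documentation):
def parse_js_record(record):
    """Split one record of the JS payload into (pdf_url, title, pub_date)."""
    fields = record.split(",")
    pdf_url = fields[1].strip('" ')
    title = fields[2].strip('":?/*<>|\ ').decode('gbk').encode('utf-8')
    pub_date = fields[5].strip('" ')
    return pdf_url, title, pub_date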
def parse(self, response):
    """
    Parse the JSON data returned by the Ajax endpoint.
    """
    jsonObjs = json.loads(response.text)['announcements']
    length = len(jsonObjs)
    output = sys.stdout
    today = time.strftime("%Y-%m-%d", time.localtime(int(time.time())))
    aSort = u"监管机构公告".encode('utf-8')
    relativeFilePath = "ExchangeSpider\\Data\\" + "{}\\{}\\{}\\".format(
        today, self.tag, aSort)
    dirPath = unicode(self.basepath + relativeFilePath)
    for index, jsonObj in enumerate(jsonObjs):
        announcementTitle = re.sub('[":?/*<>|\ ]', '',
                                   jsonObj['announcementTitle']).encode('utf-8')  # strip illegal filename characters and convert the encoding
        adjunctUrl = jsonObj['adjunctUrl']  # announcement attachment URL
        #print jsonObj['announcementTime'], time.time()
        announcementTime = time.strftime(
            "%Y-%m-%d",
            time.localtime(int(int(jsonObj['announcementTime']) * 0.001)))  # format the millisecond timestamp
        adjunctType = jsonObj['adjunctType']
        if today == announcementTime:
            if not os.path.exists(dirPath):
                os.makedirs(dirPath)
            if adjunctType == "PDF":
                try:
                    savePath = unicode(dirPath + "{}.pdf".format(announcementTitle))
                    urllib.urlretrieve(self.host + str(adjunctUrl), savePath)
                    item = SpiderItem(
                        AnnouncementTitle=announcementTitle,
                        AnnouncementFrom=self.tag,
                        ReportingDate=today,
                        AnnouncementSort=aSort,
                        AnnouncementPath=unicode(relativeFilePath + "{}.pdf".format(announcementTitle)))
                    yield item
                except Exception:
                    print "Exception:", announcementTitle, " ", adjunctUrl
            if adjunctType == "TXT":
                try:
                    url = self.host + str(adjunctUrl)
                    html = requests.get(url)
                    list0 = re.findall(r'(?<=\[).*?(?=\])', html.text)
                    content = json.loads(list0[0])
                    contentTitle = re.sub('[":?/*<>|\ ]', '',
                                          content['Title']).encode('utf-8')
                    contentZw = content['Zw'].encode('utf-8')
                    savePath = unicode(dirPath + "{}.html".format(contentTitle))
                    with open(savePath, 'wb') as fs:
                        fs.write(contentZw)
                    item = SpiderItem(
                        AnnouncementTitle=announcementTitle,
                        AnnouncementFrom=self.tag,
                        ReportingDate=today,
                        AnnouncementSort=aSort,
                        AnnouncementPath=unicode(relativeFilePath + "{}.html".format(contentTitle)))
                    yield item
                except Exception:
                    pass
        process = '#' * int((index + 1) * 100.0 / length) + '=' * int(
            (length - index - 1) * 100.0 / length)
        output.write("\r[%s] %s complete :" % (self.tag, aSort)
                     + process + "%.0f%%" % int((index + 1) * 100.0 / length))
        output.flush()
    print
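# parse() above assumes each element of the "announcements" list carries at
# least the keys it reads. The dict below is a made-up placeholder documenting
# that shape (values are illustrative only, not real data):
EXAMPLE_ANNOUNCEMENT = {
    "announcementTitle": u"示例公告标题",     # sanitized and used as the file name
    "adjunctUrl": "finalpage/example.PDF",    # appended to self.host for the download
    "announcementTime": 1514736000000,        # millisecond timestamp, compared against today
    "adjunctType": "PDF",                     # "PDF" and "TXT" are the two types handled
}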