def parse_v2(self, response):
    """Resolve an AAAI-2018 paper's frameset page to its PDF link and save it.

    The response is a frameset whose first frame points at the paper's
    landing page; that landing page contains the actual PDF href (third
    anchor under #content). Skips papers already present in
    ``self.exist_file``.

    Relies on spider state: ``self.paper_title``, ``self.paper_author``,
    ``self.path``, ``self.exist_file`` — set before this callback runs.
    """
    paper_title = self.paper_title
    paper_author = self.paper_author
    # First frame's src is the paper landing page; its trailing path
    # segment serves as the paper's identifier.
    paper_url = response.xpath('/html/frameset/frame[1]/@src').extract()[0]
    paper_number = paper_url.split('/')[-1]
    if paper_number not in self.exist_file:
        filename = 'A18-' + paper_number + 'AAAI2018' + '_' + paper_author + '_' \
                   + paper_title + '.pdf'
        filepath = os.path.join(self.path, filename)
        try:
            # Fetch the landing page outside scrapy so we can bound it
            # with a hard 30s timeout.
            paper_response = request.urlopen(paper_url, timeout=30)
            paper_response = HtmlResponse(url=paper_url,
                                          body=paper_response.read(),
                                          encoding='utf-8')
            # FIX: original did extract()[0] here, so a landing page
            # without the expected third anchor raised an uncaught
            # IndexError and killed the callback. Guard and skip instead.
            pdf_hrefs = Selector(response=paper_response)\
                .xpath('/html/body/div/div/div/div[@id="content"]/p/a[3]/@href').extract()
            if not pdf_hrefs:
                print('----------- url error: %s ----------- skip -----' % paper_url)
                return
            # Separate name for the PDF link — the original reused
            # paper_url, which made the URLError message ambiguous.
            pdf_url = pdf_hrefs[0]
            request.urlretrieve(pdf_url, filepath)
            print(paper_number, paper_title)
        except socket.timeout:
            print('----------- paper%s time out ------------- skip -----' % paper_number)
        except URLError:
            print('----------- url error: %s ----------- skip -----' % paper_url)
    else:
        # Already downloaded — just log the number.
        print('\t' + paper_number)
def process_request(self, request, spider): stockCode = StockCode service_args = ['--load-image=false', '--disk-cache=true'] url = request.url postdata = '' head = '' startYear = 0 print "Start process_request: request:", request #request.url = startUrl return response with queryUrl to parse if (self.start_urls == request.url[0:len(self.startUrl)]): print "Start URL" if(None==request.meta.get('jsonStockIndex', None)): jsonStockIndex=0 else: jsonStockIndex =request.meta['jsonStockIndex'] content='jsonStockIndex='+str(jsonStockIndex) return HtmlResponse(self.queryUrl, encoding='utf-8', status=200, body=content) if (request.queryUrl[0:len(self.queryUrl)] == self.queryUrl): heads = { 'Accept':'application/json, text/javascript, */*; q=0.01', 'Accept-Encoding':'gzip, deflate', 'Accept-Language':'zh-CN,zh;q=0.8', 'Connection':'keep-alive', 'Content-Length':'32', 'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8', 'Cookie':'JSESSIONID=DA9BC88402AD775B071E9DAF29CAB669', 'Host':'www.cninfo.com.cn', 'Origin':'http://www.cninfo.com.cn', 'Referer':'http://www.cninfo.com.cn/cninfo-new/index', 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36', 'X-Requested-With':'XMLHttpRequest' } postdata = 'keyWord=' + stockCode + '&maxNum=10&hq_or_cw=1' #postdata="K_code=&&market=sh&&type=hq&&code=600000&&orgid=gssh0600000&&minYear=1999&&maxYear=2016&&hq_code=600000&&hq_k_code=&&cw_code=&&cw_k_code=" newRequest = urllib2.Request(self.queryUrl,postdata,heads) response = urllib2.urlopen(newRequest,None, 10) content = response.read() #set_cookie = response.info()['Set-Cookie'] response = HtmlResponse(self.queryUrl, encoding='utf-8', status=200, body=content) #get Json data and startYear jsonResponse = json.loads(response.body) startYear = jsonResponse['startTime'] market = jsonResponse['market'] orgid = jsonResponse['orgid'] code = jsonResponse['code'] print "Middlewares queryUrl, stockCode: ",stockCode, ' 
startTime:',startYear if startYear != 0: boundary = '----WebKitFormBoundaryA5otDEFHCmAaF76I' heads = { 'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Encoding':'gzip, deflate', 'Accept-Language':'zh-CN,zh;q=0.8', 'Cache-Control':'max-age=0', 'Connection':'keep-alive', 'Content-Length':'1107', 'Content-Type':'multipart/form-data; boundary=----WebKitFormBoundaryA5otDEFHCmAaF76I', 'Cookie':'JSESSIONID=851F059690F838B7498EEDC42F6B5BD9', 'Host':'www.cninfo.com.cn', 'Origin':'http://www.cninfo.com.cn', 'Referer':'http://www.cninfo.com.cn/cninfo-new/index', 'Upgrade-Insecure-Requests':'1', 'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36' } #postdata = 'keyWord=' + globalvariables.StockCode + '&maxNum=10&hq_or_cw=1' postdata='K_code=&&market='+market+'&&type=hq&&code='+code+'&&orgid='+orgid+'&&minYear='+startYear+'&&maxYear='+MaxYear+'&&hq_code='+code+'&&hq_k_code=&&cw_code=&&cw_k_code=' newRequest = urllib2.Request(self.downloadUrl,postdata,heads) response = urllib2.urlopen(newRequest,None, 10) content = response.read() #set_cookie = response.info()['Set-Cookie'] response = HtmlResponse(self.queryUrl, encoding='utf-8', status=200, body=content) return response