def process_detail(self, response):
    """Parse a news detail page and emit one news record.

    Listing-page fields (category id, name, image, short description,
    create time) arrive via ``response.request.meta``; the source name
    and the long description are scraped from the page itself.

    :param response: framework response; ``m_response.content`` is the
        raw HTML, ``request.meta`` carries the listing-page fields.
    :yields: ``pipeItem`` routed to the 'database' and 'console' pipes.
    """
    soup = bs(response.m_response.content, 'lxml')

    result = dict()
    # Unique product id: second-resolution timestamp plus a sequence
    # suffix to avoid collisions within the same second.
    result['newsProductId'] = time.strftime(
        '%Y%m%d%H%M%S', time.localtime(time.time())) + SeqUtil.get_seq()
    result['newsCateId'] = response.request.meta['newsCateId']
    result['name'] = response.request.meta['name']
    result['imageUrl'] = response.request.meta['img_name']
    result['shortDes'] = response.request.meta['shortDes']
    result['createTime'] = response.request.meta['createTime']
    result['newsFromWebUrl'] = response.request.url

    # The source name ("来源:") sits in one of the header spans; fall
    # back to a generic label when no span carries it.
    span_list = soup.select('div.article-header p.text-gray-9 span')
    for span in span_list:
        if '来源:' in span.text:
            result['newsFrom'] = span.text.replace('来源:', '').strip()
            break
    else:
        result['newsFrom'] = '互联网'

    longDes = soup.select('div.article-content')[0]
    # Strip every attribute except 'src' so inline styles/classes from
    # the source site do not leak into our rendering.
    for tag in longDes.find_all():
        # Snapshot the attrs so deletion is safe while iterating.
        # BUG FIX: dict.iterkeys() is Python 2 only — iterating the
        # copied dict works on both Python 2 and 3.
        attrs = copy.copy(tag.attrs)
        for key in attrs:
            if key != 'src':
                del tag.attrs[key]
    result['longDes'] = str(longDes)
    yield pipeItem(['database', 'console'], result)
def process_detail(self, response):
    """Parse a zhue.com.cn news detail page and emit one news record.

    Listing-page fields come from ``response.request.meta``; the source
    name, publish time and long description are scraped from the page.
    A known in-article advertisement image is removed before the body
    is stored.

    :param response: framework response; ``m_response.content`` is the
        raw HTML, ``request.meta`` carries the listing-page fields.
    :yields: ``pipeItem`` routed to the 'database' and 'console' pipes.
    """
    soup = bs(response.m_response.content, 'lxml')

    result = dict()
    # Unique product id: second-resolution timestamp plus a sequence
    # suffix to avoid collisions within the same second.
    result['newsProductId'] = time.strftime(
        '%Y%m%d%H%M%S', time.localtime(time.time())) + SeqUtil.get_seq()
    result['newsCateId'] = response.request.meta['newsCateId']
    result['name'] = response.request.meta['name']
    result['imageUrl'] = response.request.meta['img_name']
    result['shortDes'] = response.request.meta['shortDes']
    result['newsFromWebUrl'] = response.request.url
    # Source and publish time are the 2nd and 3rd spans of the byline.
    result['newsFrom'] = soup.select('p.writ span')[1].text.replace(
        '来源:', '')
    result['createTime'] = soup.select('p.writ span')[2].text.replace(
        '时间:', '')

    longDes = soup.select('div#art_content')[0]
    # Remove the site's in-article advertisement banner.
    # BUG FIX: an attribute value containing '://' must be quoted in a
    # CSS selector — the unquoted form is invalid and never matches.
    adv_list = longDes.select(
        'img[src="http://www.zhue.com.cn/images/zhue888.jpg"]')
    for adv in adv_list:
        adv.decompose()

    # Strip every attribute except 'src' so inline styles/classes from
    # the source site do not leak into our rendering.
    for tag in longDes.find_all():
        # Snapshot the attrs so deletion is safe while iterating.
        # BUG FIX: dict.iterkeys() is Python 2 only — iterating the
        # copied dict works on both Python 2 and 3.
        attrs = copy.copy(tag.attrs)
        for key in attrs:
            if key != 'src':
                del tag.attrs[key]
    result['longDes'] = str(longDes)
    yield pipeItem(['database', 'console'], result)
def process_detail(self, response):
    """Parse a gengzhongbang.com detail page and emit one news record.

    Listing-page fields come from ``response.request.meta``. The body
    lives in a ``<td>``, which is renamed to ``<div>`` so it renders as
    a standalone fragment; relative image URLs are made absolute.

    :param response: framework response; ``m_response.content`` is the
        raw HTML, ``request.meta`` carries the listing-page fields.
    :yields: ``pipeItem`` routed to the 'database' and 'console' pipes.
    """
    soup = bs(response.m_response.content, 'lxml')

    result = dict()
    # Unique product id: second-resolution timestamp plus a sequence
    # suffix to avoid collisions within the same second.
    result['newsProductId'] = time.strftime(
        '%Y%m%d%H%M%S', time.localtime(time.time())) + SeqUtil.get_seq()
    result['newsCateId'] = response.request.meta['newsCateId']
    result['name'] = response.request.meta['name']
    result['imageUrl'] = response.request.meta['img_name']
    result['shortDes'] = response.request.meta['shortDes']
    result['createTime'] = response.request.meta['createTime']
    result['newsFromWebUrl'] = response.request.url
    # This site does not expose a source name; use a generic label.
    result['newsFrom'] = '互联网'

    longDes = soup.select('td#article_content')[0]
    # Rename the container so the stored fragment is a plain <div>.
    longDes.name = 'div'
    # Strip every attribute except 'src' (styles/classes must not leak)
    # and rewrite image sources against the site root.
    for tag in longDes.find_all():
        # Snapshot the attrs so deletion is safe while iterating.
        # BUG FIX: dict.iterkeys() is Python 2 only — iterating the
        # copied dict works on both Python 2 and 3.
        attrs = copy.copy(tag.attrs)
        for key in attrs:
            if key != 'src':
                del tag.attrs[key]
            else:
                src = tag.attrs[key]
                # BUG FIX: only prefix site-relative paths — blindly
                # prepending the base mangles already-absolute URLs.
                if not src.startswith('http'):
                    tag.attrs[key] = (
                        'http://www.gengzhongbang.com/' + src)
    result['longDes'] = str(longDes)
    yield pipeItem(['database', 'console'], result)
def process_paper(self, response):
    """Parse a DBLP-style journal table-of-contents page and emit one
    record per article.

    The category is the text before the first comma of the ``<header>``;
    the volume number is the trailing digits of the file stem of the
    'paperFrom' URL in the request meta (e.g. ``.../t99.html`` -> 99).

    :param response: framework response; ``m_response.content`` is the
        raw HTML, ``request.meta['paperFrom']`` is the ToC URL.
    :yields: ``pipeItem`` routed to the 'database' pipe.
    """
    soup = BeautifulSoup(response.m_response.content, 'html.parser')
    straname = soup.find('header').text.split(',')
    catory = straname[0]

    # Extract the volume number from the URL's file stem.
    parl = response.request.meta['paperFrom'].split('/')
    stem = parl[-1].split('.')[0]
    volume = int(re.findall(r"\d+", stem)[0])

    for item in soup.find_all('li', class_="entry article"):
        atag = item.find('div', class_='head').find('a')
        if atag is None:
            # BUG FIX: the original 'return 0, 0, 0' is a SyntaxError in
            # a Python 2 generator and a silently discarded value in
            # Python 3 — a bare return stops the generator cleanly.
            return
        paperurl = atag.get('href')
        articleinfo = item.find('article', class_="data").find_all('span')
        title = item.find('span', class_='title').text
        # The last two spans are not author names; drop them.
        articleinfo = articleinfo[:-2]
        authors = ""
        for author in articleinfo:
            authors = authors + author.text + ";"

        result = dict()
        result['title'] = title
        result['authors'] = authors
        result['paperUrl'] = paperurl
        result['catory'] = catory
        result['volume'] = volume
        yield pipeItem(['database'], result)
def process_detail(self, response):
    """Extract the publish date/time and the source name from a news
    detail page's byline and emit them to the 'console' pipe.

    :param response: framework response; ``m_response.content`` is the
        raw HTML of the detail page.
    :yields: ``pipeItem`` with 'date_time' and 'newsFrom' fields.
    """
    soup = bs(response.m_response.content, 'lxml')
    # Byline format: "来源: <source> <date> <time>|" — strip the label,
    # then split into fields.
    byline = soup.select('div.zxxwleft p.zxxw2')[0].text
    fields = byline.replace('来源: ', '').replace('来源:', '').split(' ')
    result = {
        'date_time': (fields[1].strip() + ' '
                      + fields[2].strip().replace('|', '')),
        'newsFrom': fields[0].strip(),
    }
    yield pipeItem(['console'], result)
def process_detail(self, response):
    """Parse a news detail page and emit one news record to the
    'console' pipe.

    Listing-page fields (category id, name, image, short description,
    create time) arrive via ``response.request.meta``; the source name
    and the long description are scraped from the page itself.
    Defect fixed: the original assigned 'newsCateId' twice.

    :param response: framework response; ``m_response.content`` is the
        raw HTML, ``request.meta`` carries the listing-page fields.
    :yields: ``pipeItem`` routed to the 'console' pipe.
    """
    soup = bs(response.m_response.content, 'lxml')

    result = dict()
    # NOTE(review): unlike the sibling handlers this id has no sequence
    # suffix, so two items in the same second collide — confirm intent.
    result['newsProductId'] = time.strftime('%Y%m%d%H%M%S',
                                            time.localtime(time.time()))
    result['newsCateId'] = response.request.meta['newsCateId']
    result['name'] = response.request.meta['name']
    result['imageUrl'] = response.request.meta['img_name']
    result['shortDes'] = response.request.meta['shortDes']
    result['createTime'] = response.request.meta['createTime']
    result['newsFromWebUrl'] = response.request.url

    # The source name ("来源:") sits in one of the header spans; fall
    # back to a generic label when no span carries it.
    span_list = soup.select('div.article-header p.text-gray-9 span')
    for span in span_list:
        if '来源:' in span.text:
            result['newsFrom'] = span.text.replace('来源:', '').strip()
            break
    else:
        result['newsFrom'] = '互联网'

    longDes = soup.select('div.article-content')[0]
    result['longDes'] = str(longDes)
    yield pipeItem(['console'], result)
def process_pic(self, response):
    """Package downloaded image bytes with their target filename and
    hand them to the 'pic' pipe.

    :param response: framework response; ``m_response.content`` is the
        raw image data and ``request.meta['img_name']`` its filename.
    :yields: ``pipeItem`` with 'content' and 'name' fields.
    """
    payload = {
        'content': response.m_response.content,
        'name': response.request.meta['img_name'],
    }
    yield pipeItem(['pic'], payload)
def process_pic(self, response):
    """Forward the raw downloaded image bytes to the 'save' pipe.

    :param response: framework response; ``m_response.content`` holds
        the downloaded bytes.
    :yields: ``pipeItem`` carrying the raw content.
    """
    yield pipeItem(['save'], response.m_response.content)