def process(self, response):
    soup = bs(response.m_response.content, 'lxml')
    gzb_div_list = soup.select('div.bm_c.xld dl.bbda.cl')
    for gzb_div in gzb_div_list:
        if gzb_div.select('a img'):
            detail_url = gzb_div.select('a')[0]['href']
            img_url = 'http://www.gengzhongbang.com/' + gzb_div.select('a img')[0]['src']
            name = gzb_div.select('dt.xs2')[0].text.strip()
            createTime = gzb_div.select('span.xg1')[0].text.strip()
            shortDes = gzb_div.select('dd.xs2.cl')[0].text.strip()
            # Generate a random file name for the downloaded image.
            md5 = hashlib.md5()
            rand_name = str(time.time()) + str(random.random())
            md5.update(rand_name.encode('utf-8'))  # hashlib.update() needs bytes
            img_name = md5.hexdigest() + '.jpg'
            request = Request(url=img_url, priority=1, callback=self.process_pic)
            request.meta['img_name'] = img_name
            yield request
            request = Request(url=detail_url, priority=1, callback=self.process_detail)
            request.meta['name'] = name
            request.meta['createTime'] = createTime
            request.meta['shortDes'] = shortDes
            request.meta['img_name'] = img_name
            request.meta['newsCateId'] = response.request.meta['newsCateId']
            yield request
def process(self, response):
    # The raw body is assumed to be bytes (Python 3), so compare with a bytes literal.
    if b'404 Not Found' not in response.m_response.content:
        soup = bs(response.m_response.content, 'lxml')
        toutiao_div_list = soup.select('div.warp_left dl.channeldl')
        for toutiao_div in toutiao_div_list:
            if toutiao_div.select('a img'):
                detail_url = toutiao_div.select('a')[0]['href']
                img_url = toutiao_div.select('a img')[0]['src']
                name = toutiao_div.select('h3')[0].text.strip()
                shortDes = toutiao_div.select('dd.shortdd')[0].text
                md5 = hashlib.md5()
                rand_name = str(time.time()) + str(random.random())
                md5.update(rand_name.encode('utf-8'))  # hashlib.update() needs bytes
                img_name = md5.hexdigest() + '.jpg'
                request = Request(url=img_url, priority=1, callback=self.process_pic)
                request.meta['img_name'] = img_name
                yield request
                request = Request(url=detail_url, priority=1, callback=self.process_detail)
                request.meta['name'] = name
                request.meta['shortDes'] = shortDes
                request.meta['img_name'] = img_name
                request.meta['newsCateId'] = response.request.meta['newsCateId']
                yield request
def init_start_requests(cls):
    cls.start_requests.extend([
        Request(url='http://www.tuliu.com/news/list-c165/%s.html' % page,
                priority=0,
                meta={'newsCateId': '20171102111907007'})
        for page in range(1, 9)
    ])
    cls.start_requests.extend([
        Request(url='http://www.tuliu.com/news/list-c163/%s.html' % page,
                priority=0,
                meta={'newsCateId': '20171102111907007'})
        for page in range(1, 30)
    ])
def process_page(self, response):
    soup = bs(response.m_response.content, 'lxml')
    zhu_div_list = soup.select('div.zxleft ul li')
    for zhu_div in zhu_div_list:
        detail_url = zhu_div.select('a')[0]['href']
        img_url = zhu_div.select('a img')[0]['src']
        title = zhu_div.select('a img')[0]['alt'].strip()
        shortDes = zhu_div.select('p.zxleft32 a')[0].text
        md5 = hashlib.md5()
        rand_name = str(time.time()) + str(random.random())
        md5.update(rand_name.encode('utf-8'))  # hashlib.update() needs bytes
        img_name = md5.hexdigest() + '.jpg'
        request = Request(url=img_url, priority=1, callback=self.process_pic)
        request.meta['img_name'] = img_name
        yield request
        request = Request(url=detail_url, priority=1, callback=self.process_detail)
        request.meta['title'] = title
        request.meta['shortDes'] = shortDes
        request.meta['img_name'] = img_name
        yield request
def init_start_requests(cls):
    cls.start_requests.extend([
        Request(url='http://www.gengzhongbang.com/14/index.php?page=%s' % page,
                priority=0,
                meta={'newsCateId': '20171102111913008'})
        for page in range(1, 9)
    ])
    cls.start_requests.extend([
        Request(url='http://www.gengzhongbang.com/10/index.php?page=%s' % page,
                priority=0,
                meta={'newsCateId': '20171102111913008'})
        for page in range(1, 9)
    ])
class ZhiHu_Processor(BaseProcessor):
    spider_id = 'weibo_spider'
    start_requests = [Request(url='', priority=0)]

    @check
    def process(self, response):
        pass
def request_from_dict(d, processor=None):
    """Create a Request object from a dict.

    If a processor is given, try to resolve the callbacks by looking up
    methods with the same names on that processor.
    """
    cb = d['callback']
    if cb and processor:
        cb = _get_method(processor, cb)
    eb = d['errback']
    if eb and processor:
        eb = _get_method(processor, eb)
    return Request(url=to_native_str(d['url']),
                   data=d['data'],
                   json=d['json'],
                   allow_redirects=d['allow_redirects'],
                   duplicate_remove=d['duplicate_remove'],
                   timeout=d['timeout'],
                   callback=cb,
                   errback=eb,
                   method=d['method'],
                   headers=d['headers'],
                   cookies=d['cookies'],
                   meta=d['meta'],
                   priority=d['priority'])
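# For context, a minimal round-trip sketch for request_from_dict. This is an
# illustrative assumption, not part of the original code: it presumes that
# request_to_dict (used by the Tuliu processor below) is the inverse helper
# and stores the callback by method name, and that Tuliu_Detail_Processor
# exposes a process_detail method that name can resolve back to.
def _request_round_trip_sketch():
    processor = Tuliu_Detail_Processor()
    request = Request(url='http://www.tuliu.com/news/example.html',
                      priority=1,
                      callback=processor.process_detail,  # assumed to exist on the processor
                      meta={'newsCateId': '20171102111907007'})
    d = request_to_dict(request, processor)        # callback serialized by name
    restored = request_from_dict(d, processor)     # name resolved to the bound method
    assert restored.url == request.url
    assert restored.meta['newsCateId'] == request.meta['newsCateId']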
def process(self, response):
    soup = bs(response.m_response.content, 'lxml')
    tuliu_div_list = soup.select('div.news_list_list ul li.list_box')
    detail_processor = Tuliu_Detail_Processor()
    for tuliu_div in tuliu_div_list[:3]:
        if tuliu_div.select('a img'):
            detail_url = tuliu_div.select('a')[0]['href']
            img_url = tuliu_div.select('a img')[0]['src']
            name = tuliu_div.select('h1.category_title nobr.l')[0].text.strip()
            createTime = tuliu_div.select('h1.category_title nobr.r')[0].text.replace('发布时间 ', '').strip()
            shortDes = tuliu_div.select('div')[0].text.replace('[查看全文]', '')
            md5 = hashlib.md5()
            rand_name = str(time.time()) + str(random.random())
            md5.update(rand_name.encode("utf8"))
            img_name = md5.hexdigest() + '.jpg'
            request = Request(url=detail_url, priority=1)
            request.meta['name'] = name
            request.meta['createTime'] = createTime
            request.meta['shortDes'] = shortDes
            request.meta['img_name'] = img_name
            request.meta['newsCateId'] = response.request.meta['newsCateId']
            d = request_to_dict(request, detail_processor)
            yield Violet(Tuliu_Detail_Processor, d)
def process(self, response):
    soup = BeautifulSoup(response.m_response.content, 'html.parser')
    link = soup.find(name='div', class_='hide-body').find_all('a')
    for ref in link:
        stranurl = ref.get('href')
        request = Request(url=stranurl,
                          priority=1,
                          callback=self.process_stran,
                          meta={'hello': 'goodlife'})
        yield request
def process(self, response):
    soup = bs(response.m_response.content, 'lxml')
    page_list = soup.select('div.zxpage a')
    # The second-to-last pager link holds the total page count.
    total_page = int(page_list[-2].text)
    for page in range(1, total_page + 1):
        yield Request(url='http://www.zhuwang.cc/list-58-%d.html' % page,
                      callback=self.process_page,
                      priority=0,
                      duplicate_remove=False)
def process(self, response):
    # The raw body is bytes, so encode the marker for the comparison (page assumed UTF-8).
    if '404错误'.encode('utf-8') not in response.m_response.content:
        soup = bs(response.m_response.content, 'lxml')
        tuliu_div_list = soup.select('div.news_list_list ul li.list_box')
        for tuliu_div in tuliu_div_list:
            if tuliu_div.select('a img'):
                detail_url = tuliu_div.select('a')[0]['href']
                img_url = tuliu_div.select('a img')[0]['src']
                name = tuliu_div.select('h1.category_title nobr.l')[0].text.strip()
                createTime = tuliu_div.select('h1.category_title nobr.r')[0].text.replace('发布时间 ', '').strip()
                shortDes = tuliu_div.select('div')[0].text.replace('[查看全文]', '')
                md5 = hashlib.md5()
                rand_name = str(time.time()) + str(random.random())
                md5.update(rand_name.encode('utf-8'))  # hashlib.update() needs bytes
                img_name = md5.hexdigest() + '.jpg'
                request = Request(url=img_url, priority=1, callback=self.process_pic)
                request.meta['img_name'] = img_name
                yield request
                request = Request(url=detail_url, priority=1, callback=self.process_detail)
                request.meta['name'] = name
                request.meta['createTime'] = createTime
                request.meta['shortDes'] = shortDes
                request.meta['img_name'] = img_name
                request.meta['newsCateId'] = response.request.meta['newsCateId']
                yield request
def process(self, response):
    rules = getattr(self, 'rules', ())
    for rule in rules:
        links = rule.link_extractor.extract_links(response)
        if not links:
            continue
        for link in links:
            request = Request(url=link,
                              callback=rule.callback,
                              priority=rule.priority)
            request = rule.process_request(request)
            yield request
        if rule.only_first:
            break
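# The rule-driven process() above only reads a handful of attributes from each
# rule. The class below is a hypothetical sketch of such a rule, using exactly
# the names the loop expects (link_extractor.extract_links, callback, priority,
# process_request, only_first); it is not the framework's actual rule class.
class SimpleRule(object):
    def __init__(self, link_extractor, callback, priority=1, only_first=False):
        self.link_extractor = link_extractor  # must expose extract_links(response)
        self.callback = callback              # processor method that handles each link
        self.priority = priority
        self.only_first = only_first          # stop after the first rule that yielded links

    def process_request(self, request):
        # Hook for last-minute adjustments; the default passes the request through.
        return request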
def process_stran(self, response):
    soup = BeautifulSoup(response.m_response.content, 'html.parser')
    link = soup.select('#main')[0]
    # Find the last <ul> child of #main; bail out if there is none.
    ullink = None
    for item in link.children:
        if item.name == 'ul':
            ullink = item
    if ullink is None:
        return
    infolist = ullink.find_all('a')
    for temp in infolist:
        paperlink = temp.get('href')
        request = Request(url=paperlink,
                          priority=1,
                          callback=self.process_paper,
                          meta={'paperFrom': paperlink})
        yield request
class Zhu_Processor(BaseProcessor):
    spider_id = 'zhu_spider'
    allowed_domains = ['zhuwang.cc']
    start_requests = [
        Request(url='http://www.zhuwang.cc/list-58-1.html', priority=0)
    ]

    @check
    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')
        page_list = soup.select('div.zxpage a')
        # The second-to-last pager link holds the total page count.
        total_page = int(page_list[-2].text)
        for page in range(1, total_page + 1):
            yield Request(url='http://www.zhuwang.cc/list-58-%d.html' % page,
                          callback=self.process_page,
                          priority=0,
                          duplicate_remove=False)

    @check
    def process_page(self, response):
        soup = bs(response.m_response.content, 'lxml')
        zhu_div_list = soup.select('div.zxleft ul li')
        for zhu_div in zhu_div_list:
            detail_url = zhu_div.select('a')[0]['href']
            img_url = zhu_div.select('a img')[0]['src']
            title = zhu_div.select('a img')[0]['alt'].strip()
            shortDes = zhu_div.select('p.zxleft32 a')[0].text
            md5 = hashlib.md5()
            rand_name = str(time.time()) + str(random.random())
            md5.update(rand_name.encode('utf-8'))  # hashlib.update() needs bytes
            img_name = md5.hexdigest() + '.jpg'
            request = Request(url=img_url, priority=1, callback=self.process_pic)
            request.meta['img_name'] = img_name
            yield request
            request = Request(url=detail_url, priority=1, callback=self.process_detail)
            request.meta['title'] = title
            request.meta['shortDes'] = shortDes
            request.meta['img_name'] = img_name
            yield request

    @check
    def process_pic(self, response):
        result = response.m_response.content
        yield pipeItem(['save'], result)

    @check
    def process_detail(self, response):
        soup = bs(response.m_response.content, 'lxml')
        dd_tail = soup.select('div.zxxwleft p.zxxw2')[0].text.replace(
            '来源: ', '').replace('来源:', '').split(' ')
        date_time = dd_tail[1].strip() + ' ' + dd_tail[2].strip().replace('|', '')
        newsFrom = dd_tail[0].strip()
        result = dict()
        result['date_time'] = date_time
        result['newsFrom'] = newsFrom
        yield pipeItem(['console'], result)
def init_start_requests(cls):
    # (url template, newsCateId) pairs; every category is crawled for pages 1-10.
    sources = [
        ('http://cj.zhue.com.cn/guoneixinwen/35-%s.html', '20171101140728002'),
        ('http://cj.zhue.com.cn/guojixinwen/36-%s.html', '20171101140728002'),
        ('http://cj.zhue.com.cn/zimeiti/677-%s.html', '20171101140728002'),
        ('http://cj.zhue.com.cn/zhongzhu/172-%s.html', '20171101140728002'),
        ('http://qx.zhue.com.cn/xingyexinwen/list_731_%s.html', '20171101140728002'),
        ('http://cj.zhue.com.cn/guojijishu/list_673_%s.html', '20171101140728002'),
        ('http://js.zhue.com.cn/zhuchangjianshe/31-%s.html', '20171101142701004'),
        ('http://js.zhue.com.cn/zhuqunbaojian/69-%s.html', '20171101142701004'),
        ('http://js.zhue.com.cn/fangyiguicheng/72-%s.html', '20171101142701004'),
        ('http://js.zhue.com.cn/yichuanyuzhong/71-%s.html', '20171101142701004'),
        ('http://js.zhue.com.cn/rengongshoujing/67-%s.html', '20171101142701004'),
        ('http://js.zhue.com.cn/yibingfangzhi/3-%s.html', '20171101142701004'),
        ('http://qx.zhue.com.cn/jishuxinwen/list_732_%s.html', '20171101142701004'),
        ('http://cj.zhue.com.cn/guoneixinwen/yangzhugushi/list_669_%s.html', '20171101142708005'),
        ('http://cj.zhue.com.cn/renwuxinwen/121-%s.html', '20171101142708005'),
        ('http://qx.zhue.com.cn/gaoduanfangtan/list_733_%s.html', '20171101142708005'),
        ('http://cj.zhue.com.cn/zhengcefagui/16-%s.html', '20171101140923003'),
        ('http://cj.zhue.com.cn/dianzishangwu/list_586_%s.html', '20171101142714006'),
        ('http://cj.zhue.com.cn/wangluoyingxiao/list_588_%s.html', '20171101142714006'),
    ]
    for url_template, cate_id in sources:
        cls.start_requests.extend([
            Request(url=url_template % page,
                    priority=0,
                    meta={'newsCateId': cate_id})
            for page in range(1, 11)
        ])
class Zhu_Processor(BaseProcessor):
    spider_id = 'zhu_spider'
    allowed_domains = ['doi.org', 'dblp.uni-trier.de']
    start_requests = [
        Request(url='https://dblp.uni-trier.de/db/journals?pos=01', priority=0)
    ]

    @check
    def process(self, response):
        soup = BeautifulSoup(response.m_response.content, 'html.parser')
        link = soup.find(name='div', class_='hide-body').find_all('a')
        for ref in link:
            stranurl = ref.get('href')
            request = Request(url=stranurl,
                              priority=1,
                              callback=self.process_stran,
                              meta={'hello': 'goodlife'})
            yield request

    @check
    def process_stran(self, response):
        soup = BeautifulSoup(response.m_response.content, 'html.parser')
        link = soup.select('#main')[0]
        # Find the last <ul> child of #main; bail out if there is none.
        ullink = None
        for item in link.children:
            if item.name == 'ul':
                ullink = item
        if ullink is None:
            return
        infolist = ullink.find_all('a')
        for temp in infolist:
            paperlink = temp.get('href')
            request = Request(url=paperlink,
                              priority=1,
                              callback=self.process_paper,
                              meta={'paperFrom': paperlink})
            yield request

    @check
    def process_paper(self, response):
        soup = BeautifulSoup(response.m_response.content, 'html.parser')
        straname = soup.find('header').text.split(',')
        catory = straname[0]
        # The volume number is parsed from the last path segment of the listing URL.
        parl = response.request.meta['paperFrom'].split('/')
        parl = parl[-1].split('.')[0]
        volume = int(re.findall(r"\d+", parl)[0])
        trasplist = soup.find_all('li', class_="entry article")
        for item in trasplist:
            atag = item.find('div', class_='head').find('a')
            if atag is None:
                # Stop processing this page if an entry has no link.
                return
            paperurl = atag.get('href')
            articleinfo = item.find('article', class_="data").find_all('span')
            title = item.find('span', class_='title').text
            # Drop the last two spans; the remaining spans are joined as the author list.
            articleinfo.pop()
            articleinfo.pop()
            authors = ""
            for author in articleinfo:
                authors = authors + author.text + ";"
            result = dict()
            result['title'] = title
            result['authors'] = authors
            result['paperUrl'] = paperurl
            result['catory'] = catory
            result['volume'] = volume
            yield pipeItem(['database'], result)