Example #1
    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')

        gzb_div_list = soup.select('div.bm_c.xld dl.bbda.cl')
        for gzb_div in gzb_div_list:
            if gzb_div.select('a img'):
                detail_url = gzb_div.select('a')[0]['href']
                img_url = 'http://www.gengzhongbang.com/' + gzb_div.select(
                    'a img')[0]['src']
                name = gzb_div.select('dt.xs2')[0].text.strip()
                createTime = gzb_div.select('span.xg1')[0].text.strip()
                shortDes = gzb_div.select('dd.xs2.cl')[0].text.strip()

                md5 = hashlib.md5()
                rand_name = str(time.time()) + str(random.random())
                md5.update(rand_name.encode('utf-8'))  # md5 needs bytes, not str, under Python 3
                img_name = md5.hexdigest() + '.jpg'

                request = Request(url=img_url,
                                  priority=1,
                                  callback=self.process_pic)
                request.meta['img_name'] = img_name
                yield request

                request = Request(url=detail_url,
                                  priority=1,
                                  callback=self.process_detail)
                request.meta['name'] = name
                request.meta['createTime'] = createTime
                request.meta['shortDes'] = shortDes
                request.meta['img_name'] = img_name
                request.meta['newsCateId'] = response.request.meta[
                    'newsCateId']
                yield request
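
Note: the timestamp-plus-random MD5 filename trick above recurs in almost every processor below; a minimal helper that factors it out could look like this (the rand_img_name name is hypothetical, not part of the original code):

import hashlib
import random
import time


def rand_img_name(ext='.jpg'):
    # Hash a timestamp plus a random float to get a collision-unlikely filename.
    seed = (str(time.time()) + str(random.random())).encode('utf-8')
    return hashlib.md5(seed).hexdigest() + ext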
Example #2
    def process(self, response):
        if '404 Not Found' not in response.m_response.content:
            soup = bs(response.m_response.content, 'lxml')

            toutiao_div_list = soup.select('div.warp_left dl.channeldl')
            for toutiao_div in toutiao_div_list:
                if toutiao_div.select('a img'):
                    detail_url = toutiao_div.select('a')[0]['href']
                    img_url = toutiao_div.select('a img')[0]['src']
                    name = toutiao_div.select('h3')[0].text.strip()
                    shortDes = toutiao_div.select('dd.shortdd')[0].text

                    md5 = hashlib.md5()
                    rand_name = str(time.time()) + str(random.random())
                    md5.update(rand_name.encode('utf-8'))  # md5 needs bytes, not str, under Python 3
                    img_name = md5.hexdigest() + '.jpg'

                    request = Request(url=img_url,
                                      priority=1,
                                      callback=self.process_pic)
                    request.meta['img_name'] = img_name
                    yield request

                    request = Request(url=detail_url,
                                      priority=1,
                                      callback=self.process_detail)
                    request.meta['name'] = name
                    request.meta['shortDes'] = shortDes
                    request.meta['img_name'] = img_name
                    request.meta['newsCateId'] = response.request.meta[
                        'newsCateId']
                    yield request
Example #3
 def init_start_requests(cls):
     cls.start_requests.extend([
         Request(url='http://www.tuliu.com/news/list-c165/%s.html' % page,
                 priority=0,
                 meta={'newsCateId': '20171102111907007'})
         for page in range(1, 9)
     ])
     cls.start_requests.extend([
         Request(url='http://www.tuliu.com/news/list-c163/%s.html' % page,
                 priority=0,
                 meta={'newsCateId': '20171102111907007'})
         for page in range(1, 30)
     ])
Example #4
    def process_page(self, response):
        soup = bs(response.m_response.content, 'lxml')

        zhu_div_list = soup.select('div.zxleft ul li')
        for zhu_div in zhu_div_list:
            detail_url = zhu_div.select('a')[0]['href']
            img_url = zhu_div.select('a img')[0]['src']
            title = zhu_div.select('a img')[0]['alt'].strip()
            shortDes = zhu_div.select('p.zxleft32 a')[0].text

            md5 = hashlib.md5()
            rand_name = str(time.time()) + str(random.random())
            md5.update(rand_name.encode('utf-8'))  # md5 needs bytes, not str, under Python 3
            img_name = md5.hexdigest() + '.jpg'

            request = Request(url=img_url,
                              priority=1,
                              callback=self.process_pic)
            request.meta['img_name'] = img_name
            yield request

            request = Request(url=detail_url,
                              priority=1,
                              callback=self.process_detail)
            request.meta['title'] = title
            request.meta['shortDes'] = shortDes
            request.meta['img_name'] = img_name
            yield request
Example #5
 def init_start_requests(cls):
     cls.start_requests.extend([
         Request(url='http://www.gengzhongbang.com/14/index.php?page=%s' %
                 page,
                 priority=0,
                 meta={'newsCateId': '20171102111913008'})
         for page in range(1, 9)
     ])
     cls.start_requests.extend([
         Request(url='http://www.gengzhongbang.com/10/index.php?page=%s' %
                 page,
                 priority=0,
                 meta={'newsCateId': '20171102111913008'})
         for page in range(1, 9)
     ])
Example #6
class ZhiHu_Processor(BaseProcessor):
    spider_id = 'weibo_spider'
    start_requests = [Request(url='', priority=0)]  # placeholder: the start URL is left empty

    @check
    def process(self, response):
        pass
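
Note: the @check decorator applied to the processor methods in these examples is never defined in the snippets; a minimal sketch, assuming it only guards against responses that failed to download (the framework's real implementation may do more):

from functools import wraps


def check(func):
    # Skip the wrapped processor method when nothing was downloaded (sketch).
    @wraps(func)
    def wrapper(self, response):
        if getattr(response, 'm_response', None) is None:
            return iter(())  # yield no requests or items
        return func(self, response)
    return wrapper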
Example #7
def request_from_dict(d, processor=None):
    """Create Request object from a dict.

    If a spider is given, it will try to resolve the callbacks looking at the
    spider for methods with the same name.
    """
    cb = d['callback']
    if cb and processor:
        cb = _get_method(processor, cb)
    eb = d['errback']
    if eb and processor:
        eb = _get_method(processor, eb)
    return Request(url=to_native_str(d['url']),
                   data=d['data'],
                   json=d['json'],
                   allow_redirects=d['allow_redirects'],
                   duplicate_remove=d['duplicate_remove'],
                   timeout=d['timeout'],
                   callback=cb,
                   errback=eb,
                   method=d['method'],
                   headers=d['headers'],
                   cookies=d['cookies'],
                   meta=d['meta'],
                   priority=d['priority'])
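
Note: request_from_dict relies on a _get_method helper that is not shown here; a minimal sketch, assuming it resolves a serialized callback name back to a bound method on the processor:

def _get_method(obj, name):
    # Look the callback up by name; fail loudly if the processor lacks it.
    name = str(name)
    try:
        return getattr(obj, name)
    except AttributeError:
        raise ValueError("Method %r not found in %s" % (name, obj))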
Example #8
    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')

        tuliu_div_list = soup.select('div.news_list_list ul li.list_box')
        detail_processor = Tuliu_Detail_Processor()
        for tuliu_div in tuliu_div_list[:3]:
            if tuliu_div.select('a img'):
                detail_url = tuliu_div.select('a')[0]['href']
                img_url = tuliu_div.select('a img')[0]['src']
                name = tuliu_div.select(
                    'h1.category_title nobr.l')[0].text.strip()
                createTime = tuliu_div.select(
                    'h1.category_title nobr.r')[0].text.replace('发布时间 ',
                                                                '').strip()
                shortDes = tuliu_div.select('div')[0].text.replace(
                    '[查看全文]', '')

                md5 = hashlib.md5()
                rand_name = str(time.time()) + str(random.random())
                md5.update(rand_name.encode("utf8"))
                img_name = md5.hexdigest() + '.jpg'

                request = Request(url=detail_url, priority=1)
                request.meta['name'] = name
                request.meta['createTime'] = createTime
                request.meta['shortDes'] = shortDes
                request.meta['img_name'] = img_name
                request.meta['newsCateId'] = response.request.meta[
                    'newsCateId']
                d = request_to_dict(request, detail_processor)
                yield Violet(Tuliu_Detail_Processor, d)
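
Note: Example #8 serializes the request with request_to_dict before handing it off; a sketch of the inverse of Example #7's request_from_dict, assuming callbacks are stored by method name:

def request_to_dict(request, processor=None):
    # Store callbacks by name so request_from_dict can resolve them later;
    # the processor argument is accepted only for symmetry with request_from_dict.
    cb = request.callback
    if callable(cb):
        cb = cb.__name__
    eb = request.errback
    if callable(eb):
        eb = eb.__name__
    return {'url': request.url,
            'data': request.data,
            'json': request.json,
            'allow_redirects': request.allow_redirects,
            'duplicate_remove': request.duplicate_remove,
            'timeout': request.timeout,
            'callback': cb,
            'errback': eb,
            'method': request.method,
            'headers': request.headers,
            'cookies': request.cookies,
            'meta': request.meta,
            'priority': request.priority}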
Example #9
 def process(self, response):
     soup = BeautifulSoup(response.m_response.content, 'html.parser')
     link = soup.find(name='div', class_='hide-body').find_all('a')
     for ref in link:
         stranurl = ref.get('href')
         request = Request(url=stranurl,
                           priority=1,
                           callback=self.process_stran,
                           meta={'hello': 'goodlife'})
         yield request
Example #10
    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')

        page_list = soup.select('div.zxpage a')
        # the second-to-last pager link holds the last page number
        total_page = int(page_list[-2].text)
        for page in range(1, total_page + 1):
            yield Request(url='http://www.zhuwang.cc/list-58-%d.html' % page,
                          callback=self.process_page,
                          priority=0,
                          duplicate_remove=False)
Example #11
    def process(self, response):
        if '404错误' not in response.m_response.content:
            soup = bs(response.m_response.content, 'lxml')

            tuliu_div_list = soup.select('div.news_list_list ul li.list_box')
            for tuliu_div in tuliu_div_list:
                if tuliu_div.select('a img'):
                    detail_url = tuliu_div.select('a')[0]['href']
                    img_url = tuliu_div.select('a img')[0]['src']
                    name = tuliu_div.select(
                        'h1.category_title nobr.l')[0].text.strip()
                    createTime = tuliu_div.select(
                        'h1.category_title nobr.r')[0].text.replace(
                            '发布时间 ', '').strip()
                    shortDes = tuliu_div.select('div')[0].text.replace(
                        '[查看全文]', '')

                    md5 = hashlib.md5()
                    rand_name = str(time.time()) + str(random.random())
                    md5.update(rand_name.encode('utf-8'))  # md5 needs bytes, not str, under Python 3
                    img_name = md5.hexdigest() + '.jpg'

                    request = Request(url=img_url,
                                      priority=1,
                                      callback=self.process_pic)
                    request.meta['img_name'] = img_name
                    yield request

                    request = Request(url=detail_url,
                                      priority=1,
                                      callback=self.process_detail)
                    request.meta['name'] = name
                    request.meta['createTime'] = createTime
                    request.meta['shortDes'] = shortDes
                    request.meta['img_name'] = img_name
                    request.meta['newsCateId'] = response.request.meta[
                        'newsCateId']
                    yield request
Example #12
0
 def process(self, response):
     # fall back to an empty tuple when the processor defines no rules
     rules = getattr(self, 'rules', ())
     for rule in rules:
         links = rule.link_extractor.extract_links(response)
         for link in links:
             request = Request(url=link,
                               callback=rule.callback,
                               priority=rule.priority)
             request = rule.process_request(request)
             yield request
             if rule.only_first:
                 break
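
Note: Example #12 assumes a Rule object exposing link_extractor, callback, priority, process_request, and only_first; a minimal container matching that interface (a sketch, not necessarily the framework's actual class):

class Rule(object):
    def __init__(self, link_extractor, callback=None, priority=0,
                 process_request=None, only_first=False):
        self.link_extractor = link_extractor
        self.callback = callback
        self.priority = priority
        # identity transform unless the caller supplies a request hook
        self.process_request = process_request or (lambda request: request)
        self.only_first = only_first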
Example #13
 def process_stran(self, response):
     soup = BeautifulSoup(response.m_response.content, 'html.parser')
     link = soup.select('#main')[0]
     ullink = None
     for item in link.children:
         if item.name == 'ul':
             ullink = item
     if ullink is None:
         return  # no <ul> under #main, nothing to extract
     infolist = ullink.find_all('a')
     for temp in infolist:
         paperlink = temp.get('href')
         request = Request(url=paperlink,
                           priority=1,
                           callback=self.process_paper,
                           meta={'paperFrom': paperlink})
         yield request
Example #14
class Zhu_Processor(BaseProcessor):
    spider_id = 'zhu_spider'
    allowed_domains = ['zhuwang.cc']
    start_requests = [
        Request(url='http://www.zhuwang.cc/list-58-1.html', priority=0)
    ]

    @check
    def process(self, response):
        soup = bs(response.m_response.content, 'lxml')

        page_list = soup.select('div.zxpage a')
        # the second-to-last pager link holds the last page number
        total_page = int(page_list[-2].text)
        for page in range(1, total_page + 1):
            yield Request(url='http://www.zhuwang.cc/list-58-%d.html' % page,
                          callback=self.process_page,
                          priority=0,
                          duplicate_remove=False)

    @check
    def process_page(self, response):
        soup = bs(response.m_response.content, 'lxml')

        zhu_div_list = soup.select('div.zxleft ul li')
        for zhu_div in zhu_div_list:
            detail_url = zhu_div.select('a')[0]['href']
            img_url = zhu_div.select('a img')[0]['src']
            title = zhu_div.select('a img')[0]['alt'].strip()
            shortDes = zhu_div.select('p.zxleft32 a')[0].text

            md5 = hashlib.md5()
            rand_name = str(time.time()) + str(random.random())
            md5.update(rand_name.encode('utf-8'))  # md5 needs bytes, not str, under Python 3
            img_name = md5.hexdigest() + '.jpg'

            request = Request(url=img_url,
                              priority=1,
                              callback=self.process_pic)
            request.meta['img_name'] = img_name
            yield request

            request = Request(url=detail_url,
                              priority=1,
                              callback=self.process_detail)
            request.meta['title'] = title
            request.meta['shortDes'] = shortDes
            request.meta['img_name'] = img_name
            yield request

    @check
    def process_pic(self, response):
        result = response.m_response.content
        yield pipeItem(['save'], result)

    @check
    def process_detail(self, response):
        soup = bs(response.m_response.content, 'lxml')

        # after stripping the '来源:' ("source:") prefix the tokens are
        # [newsFrom, date, time|...]
        dd_tail = soup.select('div.zxxwleft p.zxxw2')[0].text.replace(
            '来源: ', '').replace('来源:', '').split(' ')
        date_time = dd_tail[1].strip() + ' ' + dd_tail[2].strip().replace(
            '|', '')
        newsFrom = dd_tail[0].strip()

        result = dict()
        result['date_time'] = date_time
        result['newsFrom'] = newsFrom

        yield pipeItem(['console'], result)
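
Note: pipeItem(['save'], ...), pipeItem(['console'], ...) and pipeItem(['database'], ...) route results to pipelines registered under those names; assuming a pipeline only needs a process_item method, the console pipeline could be as small as this sketch:

class ConsolePipeline(object):
    # Print every item it receives; the simplest possible sink.
    def process_item(self, item):
        print(item)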
Example #15
 def init_start_requests(cls):
     cls.start_requests.extend([
         Request(url='http://cj.zhue.com.cn/guoneixinwen/35-%s.html' % page,
                 priority=0,
                 meta={'newsCateId': '20171101140728002'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://cj.zhue.com.cn/guojixinwen/36-%s.html' % page,
                 priority=0,
                 meta={'newsCateId': '20171101140728002'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://cj.zhue.com.cn/zimeiti/677-%s.html' % page,
                 priority=0,
                 meta={'newsCateId': '20171101140728002'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://cj.zhue.com.cn/zhongzhu/172-%s.html' % page,
                 priority=0,
                 meta={'newsCateId': '20171101140728002'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://qx.zhue.com.cn/xingyexinwen/list_731_%s.html' %
                 page,
                 priority=0,
                 meta={'newsCateId': '20171101140728002'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://cj.zhue.com.cn/guojijishu/list_673_%s.html' %
                 page,
                 priority=0,
                 meta={'newsCateId': '20171101140728002'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://js.zhue.com.cn/zhuchangjianshe/31-%s.html' %
                 page,
                 priority=0,
                 meta={'newsCateId': '20171101142701004'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://js.zhue.com.cn/zhuqunbaojian/69-%s.html' %
                 page,
                 priority=0,
                 meta={'newsCateId': '20171101142701004'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://js.zhue.com.cn/fangyiguicheng/72-%s.html' %
                 page,
                 priority=0,
                 meta={'newsCateId': '20171101142701004'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://js.zhue.com.cn/yichuanyuzhong/71-%s.html' %
                 page,
                 priority=0,
                 meta={'newsCateId': '20171101142701004'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://js.zhue.com.cn/rengongshoujing/67-%s.html' %
                 page,
                 priority=0,
                 meta={'newsCateId': '20171101142701004'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://js.zhue.com.cn/yibingfangzhi/3-%s.html' % page,
                 priority=0,
                 meta={'newsCateId': '20171101142701004'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://qx.zhue.com.cn/jishuxinwen/list_732_%s.html' %
                 page,
                 priority=0,
                 meta={'newsCateId': '20171101142701004'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://cj.zhue.com.cn/guoneixinwen/yangzhugushi/list_669_%s.html' % page,
                 priority=0,
                 meta={'newsCateId': '20171101142708005'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://cj.zhue.com.cn/renwuxinwen/121-%s.html' % page,
                 priority=0,
                 meta={'newsCateId': '20171101142708005'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(
             url='http://qx.zhue.com.cn/gaoduanfangtan/list_733_%s.html' %
             page,
             priority=0,
             meta={'newsCateId': '20171101142708005'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(url='http://cj.zhue.com.cn/zhengcefagui/16-%s.html' % page,
                 priority=0,
                 meta={'newsCateId': '20171101140923003'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(
             url='http://cj.zhue.com.cn/dianzishangwu/list_586_%s.html' %
             page,
             priority=0,
             meta={'newsCateId': '20171101142714006'})
         for page in range(1, 11)
     ])
     cls.start_requests.extend([
         Request(
             url='http://cj.zhue.com.cn/wangluoyingxiao/list_588_%s.html' %
             page,
             priority=0,
             meta={'newsCateId': '20171101142714006'})
         for page in range(1, 11)
     ])
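
Note: Example #15 repeats the same extend block for every channel; the requests could instead be built from a (url template, newsCateId, exclusive page bound) table. A refactoring sketch, using the hypothetical CHANNELS name:

CHANNELS = [
    # (url template, newsCateId, exclusive upper page bound)
    ('http://cj.zhue.com.cn/guoneixinwen/35-%s.html', '20171101140728002', 11),
    ('http://cj.zhue.com.cn/guojixinwen/36-%s.html', '20171101140728002', 11),
    # ... remaining channels elided
]


def init_start_requests(cls):
    for template, cate_id, page_bound in CHANNELS:
        cls.start_requests.extend([
            Request(url=template % page,
                    priority=0,
                    meta={'newsCateId': cate_id})
            for page in range(1, page_bound)
        ])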
Example #16
class Zhu_Processor(BaseProcessor):
    spider_id = 'zhu_spider'
    allowed_domains = ['doi.org', 'dblp.uni-trier.de']
    start_requests = [
        Request(url='https://dblp.uni-trier.de/db/journals?pos=01', priority=0)
    ]

    @check
    def process(self, response):
        soup = BeautifulSoup(response.m_response.content, 'html.parser')
        link = soup.find(name='div', class_='hide-body').find_all('a')
        for ref in link:
            stranurl = ref.get('href')
            request = Request(url=stranurl,
                              priority=1,
                              callback=self.process_stran,
                              meta={'hello': 'goodlife'})
            yield request

    @check
    def process_stran(self, response):
        soup = BeautifulSoup(response.m_response.content, 'html.parser')
        link = soup.select('#main')[0]
        ullink = None
        for item in link.children:
            if item.name == 'ul':
                ullink = item
        if ullink is None:
            return  # no <ul> under #main, nothing to extract
        infolist = ullink.find_all('a')
        for temp in infolist:
            paperlink = temp.get('href')
            request = Request(url=paperlink,
                              priority=1,
                              callback=self.process_paper,
                              meta={'paperFrom': paperlink})
            yield request

    @check
    def process_paper(self, response):
        soup = BeautifulSoup(response.m_response.content, 'html.parser')
        straname = soup.find('header').text.split(',')
        catory = straname[0]
        parl = response.request.meta['paperFrom'].split('/')
        parl = parl[-1].split('.')[0]
        volume = int(re.findall(r"\d+", parl)[0])
        trasplist = soup.find_all('li', class_="entry article")
        articleinfo = []
        for item in trasplist:
            atag = item.find('div', class_='head').find('a')
            if atag is None:
                return  # generators ignore return values; this just stops iteration
            paperurl = atag.get('href')
            articleinfo = item.find('article', class_="data").find_all('span')
            title = item.find('span', class_='title').text
            articleinfo.pop()  # drop the two trailing spans,
            articleinfo.pop()  # which are not author names
            authors = ""
            for author in articleinfo:
                authors = authors + author.text + ";"
            result = dict()
            result['title'] = title
            result['authors'] = authors
            result['paperUrl'] = paperurl
            result['catory'] = catory
            result['volume'] = volume
            yield pipeItem(['database'], result)