Example 1
    def parse(self,response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable=HBaseTest(host = self.tool.HOST_HBASE1, 
                                      table = 'origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')        
        
        #print '====start %s==' %response.url
        # query was not obtained successfully
        if response.url == self.domain_url:
            print 'error of query'
            return

        # extract and parse the news page contents
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)
        # extract the detail-page links from the search result page
        
        requests = []
        for url in sel.xpath(u'//a[@class="np"]/@href').extract():
            requests.append(self.make_requests_from_url(self.domain_url + url))

        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
                    
        #return requests
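        # note: the 'continue' below skips every loop iteration, so the
        # pagination requests collected above are never actually yielded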
        for request in requests:
            continue
            yield request
Example 2
    def parse(self, response):
        #print '====start %s==' %response.url
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1,
                                        table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')

        # extract and parse the news page contents
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)
        # extract the detail-page links from the search result page

        requests = []
        for url in sel.xpath(
                u'//div[@class="long-pages"]/a[text()="下一页"]/@href').re(
                    'go\(([\d]*?)\)'):
            tp_url = re.sub('&pn=[\d]+?', '', response.url)
            requests.append(self.make_requests_from_url(tp_url + '&pn=' + url))

        for item in items:
            yield Request(url=item['url'],
                          meta={'item': item},
                          callback=self.parse_content)

        #return requests
        for request in requests:
            continue
            yield request
Example 3
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table="origin")
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider("no thrift or hbase server!")

        # print '====start %s==' %response.url

        # extract and parse the news page contents
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)

        # try to find the next page
        requests = []
        try:
            url = sel.xpath(u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1]
            requests.append(self.make_requests_from_url(self.domain_url + url))
        except:
            pass

        for item in items:
            yield Request(url=item["url"], meta={"item": item}, callback=self.parse_content)
        # return requests
        for request in requests:
            continue
            yield request
Example 4
    def parse(self, response):
        print "====start %s==" % response.url
        # print response.body
        time.sleep(random.randint(self.time_interval, 2))

        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table="origin")
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider("no thrift or hbase server!")

        # query was not obtained successfully
        if response.url == self.domain_url:
            print "error of query"
            return

        # extract and parse the news page contents
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)

        requests = []
        for url in sel.xpath(u'//a[@class="np"]/@href').extract():
            requests.append(self.make_requests_from_url(self.domain_url + url))

        for item in items:
            yield Request(url=item["url"], meta={"item": item}, callback=self.parse_content)

        for request in requests:
            continue
            yield request
Example 5
    def parse(self,response):
        #print '====start %s==' %response.url
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable=HBaseTest(host = self.tool.HOST_HBASE1, 
                                      table = 'origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        
        # extract and parse the news page contents
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)
        # extract the detail-page links from the search result page

        requests = []
        for url in sel.xpath(u'//div[@class="long-pages"]/a[text()="下一页"]/@href').re('go\(([\d]*?)\)'):
            tp_url = re.sub('&pn=[\d]+?', '', response.url)
            requests.append(self.make_requests_from_url(tp_url + '&pn=' + url))

        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
            
        #return requests
        for request in requests:
            continue
            yield request
Example 6
    def parse(self,response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable=HBaseTest(host = self.tool.HOST_HBASE1, 
                                      table = 'origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        
        print '====start %s==' %response.url
        
        # extract and parse the news page contents
        items = self.parse_items(response)

        # try to find the next page
        requests = []
        if response.url.find('page') < 0:
            # build an XPath Selector object for extracting page elements
            sel = Selector(response)
            page_num = sel.xpath('//div[@class="pg"]/label/span')

            if page_num:
                page_num = re.sub("<.*?>", "", page_num.extract()[0])
                page_num = int(re.search("([\d]+)", page_num).group(1))
                for idx in range(2, page_num+1):
                    url = response.url + ("&page=%d" % idx)
                    requests.append(self.make_requests_from_url(url))
                    
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            yield request
Example 7
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable=HBaseTest(host = self.tool.HOST_HBASE1, 
                                      table = 'origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
                        
        #print '====start %s==' %response.url
        
        # extract and parse the news page contents
        items = self.parse_items(response)

        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
Example 8
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1,
                                        table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')

        print '====start %s==' % response.url

        # extract and parse the news page contents
        items = self.parse_items(response)

        # try to find the next page
        requests = []
        if response.url.find('page') < 0:
            # build an XPath Selector object for extracting page elements
            sel = Selector(response)
            page_num = sel.xpath('//div[@class="pg"]/label/span')

            if page_num:
                page_num = re.sub("<.*?>", "", page_num.extract()[0])
                page_num = int(re.search("([\d]+)", page_num).group(1))
                for idx in range(2, page_num + 1):
                    url = response.url + ("&page=%d" % idx)
                    requests.append(self.make_requests_from_url(url))

        for item in items:
            yield Request(url=item['url'],
                          meta={'item': item},
                          callback=self.parse_content)
        #return requests
        for request in requests:
            yield request
Example 9
class TianyaBBSSpider(Spider):
    name = "tianyabbs"
    domain_url = "http://search.tianya.cn/"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(TianyaBBSSpider, self).__init__()
        # bind the initial/finalize callbacks to the engine started/stopped events
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()
        # persist the URLs

    def getStartUrl(self):
        # initialize the query keywords from a file
        # sort by post time
        pageTag = '&s=4'
        # sort by reply time
        #pageTag = '&s=6'
        # default is relevance ranking

        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                query_url = '/bbs?q=' + urllib.quote(
                    query.encode('utf8')) + pageTag
                self.start_urls.append(self.domain_url + query_url)

    # an example of a callback returning multiple Requests and Items
    def parse(self, response):
        #print '====start %s==' %response.url
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1,
                                        table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')

        # extract and parse the news page contents
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)
        # extract the detail-page links from the search result page

        requests = []
        for url in sel.xpath(
                u'//div[@class="long-pages"]/a[text()="下一页"]/@href').re(
                    'go\(([\d]*?)\)'):
            tp_url = re.sub('&pn=[\d]+?', '', response.url)
            requests.append(self.make_requests_from_url(tp_url + '&pn=' + url))

        for item in items:
            yield Request(url=item['url'],
                          meta={'item': item},
                          callback=self.parse_content)

        #return requests
        for request in requests:
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']

        if response.body:
            bsoup = BeautifulSoup(response.body)

            item_content_list = bsoup.find_all('div', class_='bbs-content')

            #only get the first floor
            if len(item_content_list) > 0:
                item['content'] = item_content_list[0].extract().encode('utf8')
                #item['content'] = ' '.join(v.get_text().encode('utf8') for v in item_content_list)
            item['content'] = re.sub(r'\n|\t|\r', '', item['content'])
            item['content'] = self.dc.process(item['content'])
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self, response):
        if response.body:
            bsoup = BeautifulSoup(response.body)
        main_content = bsoup.select('div#main')[0]
        # the result list contains one redundant entry
        if main_content:
            if main_content.select('li#search_msg'):
                elem_list = main_content.find_all('li')[:-1]
            else:
                elem_list = main_content.find_all('li')

        items = []
        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'forum'
                item['source'] = '天涯论坛'
                item['channel'] = 'Search engine'
                try:
                    item['title'] = elem.div.h3.a.get_text()
                except:
                    continue
                item['url'] = elem.div.h3.a['href']

                author = elem.find('p', class_='source')
                if author:
                    item['medianame'] = author.a.get_text()
                    #item['author'] = author.a.get_text()
                    if author.span.get_text().find('-') > 0:
                        item['pubtime'] = author.span.get_text()
                    else:
                        item['pubtime'] = author.find_all(
                            'span')[-2].get_text()
                    if self.tool.old_news(item['pubtime']):
                        continue
                else:
                    print 'element of author not found!\n'
                    return

                if self.r.exists(item['url']):
                    #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                    continue

                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M",
                                                    time.localtime())
                item['abstract'] = elem.div.p.get_text()
                items.append(item)
        return items
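
A minimal driver sketch for running one of these spiders from a plain script. This is an assumption on my part (Scrapy >= 1.0 and the project modules such as Utools, GetQuery, HBaseTest and DataItem being importable); inside a Scrapy project the usual route is simply running "scrapy crawl tianyabbs".

    # hypothetical run script, not part of the original examples
    from scrapy.crawler import CrawlerProcess
    from scrapy.utils.project import get_project_settings

    process = CrawlerProcess(get_project_settings())
    process.crawl(TianyaBBSSpider)
    process.start()  # blocks until the crawl finishes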
Example 10
class B2bNewSpider(Spider):
    name = "b2bnew"
    domain_url = "http://b2b.10086.cn/"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True
    
    def __init__ (self):
        super(B2bNewSpider,self).__init__()
        # bind the initial/finalize callbacks to the engine started/stopped events
        dispatcher.connect(self.initial,signals.engine_started)
        dispatcher.connect(self.finalize,signals.engine_stopped)
    
    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host = self.tool.HOST_REDIS1, port = 6379, db = 3)
    
    def finalize(self):
        self.log('---stopped---')
        # persist the URLs

    def getStartUrl(self):
        # initialize the query keywords from a file
        url = 'http://b2b.10086.cn/b2b/main/listVendorNotice.html?noticeType=2'        
        self.start_urls.append(url)
        
    # an example of a callback returning multiple Requests and Items
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable=HBaseTest(host = self.tool.HOST_HBASE1, 
                                      table = 'origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
                        
        #print '====start %s==' %response.url
        
        # extract and parse the news page contents
        items = self.parse_items(response)

        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)

    def parse_content(self,response):
        item = response.meta['item']
        
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            
            try:
                content = bsoup.select('div#mobanDiv')[0]
            except:
                content = self.dc.process(str(response.body))
            item['content'] = content
            print 'url: ' + item['url'] + ' is added'
            return item

    def parse_items(self, response):
        elem_list = []        
        items = []
        url = "http://b2b.10086.cn/b2b/main/listVendorNoticeResult.html?noticeBean.noticeType=2"
        data = "&page.currentPage=1&page.perPageSize=50&noticeBean.sourceCH=&noticeBean.source=&noticeBean.title=&noticeBean.startDate=&noticeBean.endDate="
        elem_list = re.findall('<tr(.*?)</tr>', re.sub('\s', '', requests.post(url + data).text))
        
        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                
                item['dtype'] = 'news'
                item['source'] = '中国移动采购与招标'
                item['channel'] = 'Search engine'                
                
                if elem.find("onmouseout") < 0:
                    continue
                itemID = re.search("selectResult\(\'([\d]+?)\'\)", elem).group(1)                                
                item['url'] = ('http://b2b.10086.cn/b2b/main/viewNoticeContent.html?'
                    + 'noticeBean.id=' + itemID)
                
                if self.r.exists(item['url']):
                    continue
                
                res = re.findall('<td.*?</td>', elem)
                item['medianame'] = re.sub('<.*?>', '', res[0])
                item['title'] = re.sub('<.*?>', '', res[2])
                
                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                item['pubtime'] = re.sub('<.*?>', '', res[-1]) + item['collecttime'][-6:]
                if self.tool.old_news(item['pubtime']):
                    continue
                
                items.append(item)
                
        return items
Example 11
class SogouWeixinSpider(Spider):
    name = "sogouwx"
    domain_url = "http://weixin.sogou.com/weixin"
    UA = 'Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0'
    start_urls = []
    tool = Utools()
    dc = dataCleaner()
    time_interval = 0
    cookie = []
    test_hbase = True

    custom_settings = {
        "DOWNLOAD_DELAY": 0.2,
        "COOKIES_ENABLED": True,
    }

    def __init__(self):
        super(SogouWeixinSpider, self).__init__()
        # bind the initial/finalize callbacks to the engine started/stopped events
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def finalize(self):
        self.log('---stopped---')
        # persist the URLs

    def getStartUrl(self):
        # initialize the query keywords from a file
        # restrict to the past 24 hours
        timeTag = '&tsn=1'
        qlist = GetQuery().get_data()

        for query in qlist:
            if query:
                query_url = '?type=2&query=' + urllib.quote(
                    query.encode('utf8')) + timeTag
                self.start_urls.append(self.domain_url + query_url)

    def start_requests(self):
        for i in range(len(self.start_urls)):
            if i % 5 == 0:
                self.cookie = self.update_cookies()
            yield Request(self.start_urls[i], cookies=self.cookie)

    # an example of a callback returning multiple Requests and Items
    def parse(self, response):
        print '====start %s==' % response.url
        #print response.body
        time.sleep(random.randint(self.time_interval, 2))

        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1,
                                        table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')

        # query was not obtained successfully
        if response.url == self.domain_url:
            print 'error of query'
            return

        # extract and parse the news page contents
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)

        requests = []
        for url in sel.xpath(u'//a[@class="np"]/@href').extract():
            requests.append(self.make_requests_from_url(self.domain_url + url))

        for item in items:
            yield Request(url=item['url'],
                          meta={'item': item},
                          callback=self.parse_content)

        for request in requests:
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        if response.body:
            res = re.sub('\n|\r|\t', '', response.body)
            res = re.sub('<script.*?</script>', '', res)
            bsoup = BeautifulSoup(res, from_encoding='utf8')

            try:
                item['content'] = str(
                    bsoup.select('div#js_content')[0]).encode('utf8')
                print 'url:' + item['url'] + ' is added'
                return item
            except:
                print 'url:' + item['url'] + ' load failed'

    def parse_items(self, response):
        if response.body:
            # strip the interfering <!.*?> content
            res = re.sub(r'<!.*?>', '', response.body)
            bsoup = BeautifulSoup(res, from_encoding='utf8')
        main_content = bsoup.select('div#wrapper')[0]

        if main_content:
            elem_list = main_content.find_all('div', class_='txt-box')
        items = []
        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'weixin'
                item['source'] = '搜狗微信'
                item['channel'] = 'Search engine'
                if elem.h4.a.get_text():
                    item['title'] = elem.h4.a.get_text()
                else:
                    continue
                item['url'] = elem.h4.a['href']
                item['medianame'] = elem.div.a['title']
                # convert the timestamp into a formatted time string
                item['pubtime'] = time.strftime(
                    '%Y-%m-%d %H:%M', time.localtime(float(elem.div['t'])))
                if self.tool.old_news(item['pubtime']):
                    continue
                if self.r.exists(item['url']):
                    #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                    continue

                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M",
                                                    time.localtime())
                item['abstract'] = elem.p.get_text()
                items.append(item)
        return items

    def update_cookies(self):
        s = requests.Session()
        s.headers = {"User-Agent": self.UA}

        r = s.post('http://weixin.sogou.com/antispider/thank.php')
        pcontent = re.search("setCookie\('SNUID'.*?\)", r.content).group(0)
        SNUID = eval(pcontent.split(',')[1])

        suv = ''.join(
            [str(int(time.time() * 1000000) + random.randint(0, 1000))])
        s.cookies['SUV'] = suv
        s.cookies['SNUID'] = SNUID

        return dict(s.cookies)
Example 12
class SogouNewSpider(Spider):
    name = "sogounew"
    domain_url = "http://news.sogou.com/news"
    start_urls = []
    tool = Utools()
    dc = dataCleaner()
    test_hbase = True

    def __init__ (self):
        super(SogouNewSpider,self).__init__()
        # bind the initial/finalize callbacks to the engine started/stopped events
        dispatcher.connect(self.initial,signals.engine_started)
        dispatcher.connect(self.finalize,signals.engine_stopped)
        self.r = Redis(host = self.tool.HOST_REDIS1, port = 6379, db = 3)
    
    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()
        # persist the URLs

    def getStartUrl(self):
        # initialize the query keywords from a file
        #sort_by_time = '&sort=1'
        sort_by_time = ''
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                query_url = '?query=' + urllib.quote(query.encode('utf8')) + sort_by_time
                self.start_urls.append(self.domain_url + query_url)
        
    # an example of a callback returning multiple Requests and Items
    def parse(self,response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable=HBaseTest(host = self.tool.HOST_HBASE1, 
                                      table = 'origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')        
        
        #print '====start %s==' %response.url
        # query was not obtained successfully
        if response.url == self.domain_url:
            print 'error of query'
            return

        # extract and parse the news page contents
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)
        # extract the detail-page links from the search result page
        
        requests = []
        for url in sel.xpath(u'//a[@class="np"]/@href').extract():
            requests.append(self.make_requests_from_url(self.domain_url + url))

        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
                    
        #return requests
        for request in requests:
            continue
            yield request

    def parse_content(self,response):
        item = response.meta['item']
        try:
            charset = response.encoding
        except:
            charset = 'utf-8'

        if response.body:
            try:
                bsoup = BeautifulSoup(response.body, from_encoding=charset)
                item['content'] = self.dc.process(str(bsoup).decode(charset))
            except:
                bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
                item['content'] = self.dc.process(str(bsoup))
            if len(item['content'].encode('utf8')) < len(item['abstract']):
                item['content'] = item['abstract'].replace('百度快照', '')
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self,response):
        if response.body:
            # strip the interfering <!.*?> content
            res = re.sub(r'<!.*?>', '', response.body)
            bsoup = BeautifulSoup(res, from_encoding='utf8')
        main_content = bsoup.select('div#wrapper')[0]
        if main_content:
            elem_list = main_content.find_all('div', class_='rb')
        items = []
        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'news'
                item['source'] = '搜狗新闻'
                item['channel'] = 'Search engine'
                if elem.h3.a.get_text():
                    item['title'] = elem.h3.a.get_text()
                else:
                    continue
                item['url'] = elem.h3.a['href']
                
                author = elem.cite.get_text()
                if len(author.split()) > 1:
                    item['medianame'] = author.split()[0]
                    item['pubtime'] = ' '.join(author.split()[1:])
                    if self.tool.old_news(item['pubtime']):
                        continue
                else:
                    item['source'] = author.split()[0]

                if item['url'].find("html?") > 0 or item['url'].find("htm?") > 0:
                    item['url'] = "".join(item['url'].split("?")[0:-1])

                if self.r.exists(item['url']):
                    #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                    continue
                
                try:                
                    item['source'] = self.tool.get_realname(item['medianame'])
                    item['medianame'] = ' '
                except:
                    pass
                
                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                item['abstract']=elem.find('div',class_='ft').get_text()
                items.append(item)
        return items
Example 13
class TiebaBBSSpider(Spider):
    name = "tiebabbs"
    domain_url = "http://tieba.baidu.com"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(TiebaBBSSpider, self).__init__()
        # bind the initial/finalize callbacks to the engine started/stopped events
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()
        # persist the URLs

    def getStartUrl(self):
        # initialize the query keywords from a file
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                # default: sorted by time
                query_url = "/f/search/res?ie=utf-8&rn=20&qw=" + urllib.quote(
                    query.encode('utf8')) + '&ct=0'
                self.start_urls.append(self.domain_url + query_url)

    # an example of a callback returning multiple Requests and Items
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1,
                                        table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')

        #print '====start %s==' %response.url

        # extract and parse the news page contents
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)

        # try to find the next page
        requests = []
        try:
            url = sel.xpath(
                u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1]
            requests.append(self.make_requests_from_url(self.domain_url + url))
        except:
            pass

        for item in items:
            yield Request(url=item['url'],
                          meta={'item': item},
                          callback=self.parse_content)
        #return requests
        for request in requests:
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            if bsoup.find('h1', class_='core_title_txt'):
                item['title'] = bsoup.find('h1',
                                           class_='core_title_txt')['title']
            elif bsoup.find('h3', class_='core_title_txt'):
                item['title'] = bsoup.find('h3',
                                           class_='core_title_txt')['title']
            else:
                return

            timeform = '%Y-%m-%d %H:%M'
            pubtimes = [time.strptime(item['pubtime'], timeform)]
            for pubtime in re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}',
                                      str(bsoup)):
                pubtimes.append(time.strptime(pubtime, timeform))

            item['pubtime'] = time.strftime(timeform, min(pubtimes))
            if self.tool.old_news(item['pubtime']):
                print item['url'] + ' ' + item['pubtime']
                return

            item['content'] = []
            for elem in bsoup.find_all('div', class_='d_post_content'):
                item['content'].append(str(elem.extract()))
                # only get the first floor
                break

            if item:
                item['content'] = ' '.join(item['content']).encode('utf8')
                item['content'] = self.dc.process(item['content'])
                print 'url: ' + item['url'] + ' is added'
                yield item

    def parse_items(self, response):
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
        main_content = bsoup.find('div', class_='s_post_list')

        items = []
        if main_content:
            elem_list = main_content.find_all('div', class_='s_post')
        else:
            return items

        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'forum'
                item['source'] = '百度贴吧'
                item['channel'] = 'Search engine'
                try:
                    item['pubtime'] = elem.find('font',
                                                class_='p_date').get_text()
                    if self.tool.old_news(item['pubtime']):
                        continue

                    #item['title'] = elem.span.a.get_text()
                    item['medianame'] = elem.find(
                        'font', class_='p_violet').get_text()
                    item['abstract'] = elem.find(
                        'div', class_='p_content').get_text()
                except:
                    continue

                item['url'] = self.domain_url + re.findall(
                    '(/p/.*?)[^\d]', elem.span.a['href'])[0]
                if self.r.exists(item['url']):
                    #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                    continue

                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M",
                                                    time.localtime())
                items.append(item)
        # deduplicate by URL
        new_items = []
        url_list = []
        for item in items:
            if item['url'] not in url_list:
                new_items.append(item)
                url_list.append(item['url'])
        items = new_items
        return items
Example 14
class BingNewSpider(Spider):
    name = "bingnew"
    domain_url = "http://cn.bing.com"
    start_urls = []
    tool = Utools()
    dc = dataCleaner()
    test_hbase = True

    def __init__ (self):
        super(BingNewSpider,self).__init__()
        # bind the initial/finalize callbacks to the engine started/stopped events
        dispatcher.connect(self.initial,signals.engine_started)
        dispatcher.connect(self.finalize,signals.engine_stopped)
        self.r = Redis(host = self.tool.HOST_REDIS1, port = 6379, db = 3)
    
    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()
        # persist the URLs

    def getStartUrl(self):
        # initialize the query keywords from a file
        #sort_by_time = '&qft=sortbydate%3d"1"'
        sort_by_time = ''
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                query_url = '/news/search?q=' + urllib.quote(query.encode('utf8')) + sort_by_time
                self.start_urls.append(self.domain_url + query_url)
    
    # an example of a callback returning multiple Requests and Items
    def parse(self,response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable=HBaseTest(host = self.tool.HOST_HBASE1, 
                                      table = 'origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        
        #print '====start %s==' %response.url
        
        # extract and parse the news page contents
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)
        requests = []
        
        for url in sel.xpath(u'//li/a[@class="sb_pagN"]/@href').extract():
            requests.append(self.make_requests_from_url(self.domain_url+url))
            
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
  
        #return requests
        for request in requests:
            continue
            yield request

    def parse_content(self,response):
        item = response.meta['item']
        try:
            charset = response.encoding
        except:
            charset = 'utf-8'
        
        if response.body:
            try:
                bsoup = BeautifulSoup(response.body, from_encoding=charset)
                item['content'] = self.dc.process(str(bsoup).decode(charset))
            except:
                bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
                item['content'] = self.dc.process(str(bsoup))
            if len(item['content'].encode('utf8')) < len(item['abstract']):
                item['content'] = item['abstract']
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self,response):
        if response.body:
            bsoup = BeautifulSoup(response.body,from_encoding='utf-8')
        main_content = bsoup.select('div#SerpResult')[0]
        
        if main_content:
            elem_list = main_content.find_all('div', class_='sn_r')
        items = []
        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'news'
                item['source'] = '必应资讯'
                item['channel'] = 'Search engine'
                
                title = elem.find('div', 'newstitle')
                if title and title.a.get_text():
                    item['title'] = title.a.get_text()
                else:
                    continue
                item['url'] = title.a['href']
                
                author = elem.find('span',class_='sn_ST')
                if author:
                    #m = re.search('(\d{4}\/\d{1,2}\/\d{1,2})',source_time[0])
                    item['medianame'] = author.cite.get_text()
                    item['pubtime'] = self.normalize_time(str(author.span.get_text()))
                    if self.tool.old_news(item['pubtime']):
                        continue
                else:
                    print 'no element of author'
                    continue
                                
                if item['url'].find("html?") > 0 or item['url'].find("htm?") > 0:
                    item['url'] = "".join(item['url'].split("?")[0:-1])
                
                if self.r.exists(item['url']): 
                    #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                    continue
                
                try:                
                    item['source'] = self.tool.get_realname(item['medianame'])
                    item['medianame'] = ' '
                except:
                    continue
                
                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                if elem.find('span',class_='sn_snip'):
                    item['abstract'] = elem.find('span',class_='sn_snip').get_text()
                else:
                    item['abstract'] = ' '
                items.append(item)
        return items
     
    def normalize_time(self, time_text):
        time_text = time_text.encode('utf8')
        if re.match('\d{4}.*?\d{1,2}.*?\d{1,2}', time_text):
            time_text = time_text.replace('/', '-') + ' 00:00'
        else:
            # convert the non-standard (relative) time to a timestamp, then back to a standard time string
            time_digit = float(filter(str.isdigit, time_text))

            interval = 0
            if time_text.find('天'.encode('utf8')) > 0 or time_text.find('day') > 0:
                interval = 86400
            elif time_text.find('时'.encode('utf8')) > 0 or time_text.find('hour') > 0:
                interval = 3600
            elif time_text.find('分'.encode('utf8')) > 0 or time_text.find('min') > 0:
                interval = 60
            elif time_text.find('秒'.encode('utf8')) > 0 or time_text.find('second') > 0:
                interval = 1
            else:
                return time_text
            
            time_true = time.time() - time_digit*interval
            time_text = time.strftime("%Y-%m-%d %H:%M", time.localtime(time_true))

        return time_text
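
A quick standalone sketch of the relative-time arithmetic used by normalize_time above (my own illustration, not part of the spider; it only mirrors the "N hours ago" branch):

    # hypothetical check of the relative-time branch
    import time

    hours_ago = 3
    print time.strftime("%Y-%m-%d %H:%M",
                        time.localtime(time.time() - hours_ago * 3600))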
Example 15
class BaiduNewSpider(Spider):
    name = "baidunew"
    domain_url = "http://news.baidu.com"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True
    
    def __init__ (self):
        super(BaiduNewSpider,self).__init__()
        # bind the initial/finalize callbacks to the engine started/stopped events
        dispatcher.connect(self.initial,signals.engine_started)
        dispatcher.connect(self.finalize,signals.engine_stopped)
    
    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host = self.tool.HOST_REDIS1, port = 6379, db = 3)
            
    def finalize(self):
        self.log('---stopped---')
        # persist the URLs

    def getStartUrl(self):
        # initialize the query keywords from a file
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                # default: sorted by time
                query_url = "/ns?rn=20&word=" + urllib.quote(query.encode('utf8')) + '&ct=0'
                self.start_urls.append(self.domain_url + query_url)

    # an example of a callback returning multiple Requests and Items
    def parse(self,response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable=HBaseTest(host = self.tool.HOST_HBASE1, 
                                      table = 'origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        
        #print '====start %s==' %response.url
        
        # extract and parse the news page contents
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)

        # try to find the next page
        requests = []
        try:
            url = sel.xpath(u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1]
            requests.append(self.make_requests_from_url(self.domain_url+url))
        except:
            pass

        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        try:
            charset = response.encoding
        except:
            charset = 'utf-8'
        
        if response.body:
            try:
                bsoup = BeautifulSoup(response.body, from_encoding=charset)
                item['content'] = self.dc.process(str(bsoup).decode(charset))
            except:
                bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
                item['content'] = self.dc.process(str(bsoup))
            if len(item['content'].encode('utf8')) < len(item['abstract']):
                item['content'] = item['abstract'].replace('百度快照', '')
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self,response):
        if response.body:
            bsoup = BeautifulSoup(response.body,from_encoding='utf-8')
        main_content = 0
        try:
            main_content = bsoup.select('div#container')[0].select('div#content_left')[0]
        except:
            print 'url: ' + response.url + ' is empty'
            return []
        if main_content:
            elem_list = main_content.find_all('div', class_='result')
        items = []
        
        if len(elem_list)>0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'news'
                item['source'] = '百度新闻'
                item['channel'] = 'Search engine'
                try:
                    item['title'] = elem.h3.a.get_text()
                except:
                    continue
                item['url'] = elem.h3.a['href']

                author = elem.find('p',class_='c-author')
                if author:
                    source_time = author.get_text().split()
                    if re.match(r'\d{4}.*?\d{1,2}.*?\d{1,2}', source_time[0].encode('utf8')):
                        item['medianame'] = 'None'
                        item['pubtime'] = self.normalize_time(str(' '.join(source_time)))
                    elif filter(str.isdigit, source_time[0].encode('utf8')) and len(source_time) == 1:
                        item['medianame'] = 'None'
                        item['pubtime'] = self.normalize_time(str(' '.join(source_time)))
                    else:
                        item['medianame'] = source_time[0]
                        item['pubtime'] = self.normalize_time(str(' '.join(source_time[1:])))
                    if self.tool.old_news(item['pubtime']):
                        continue                        
                else:
                    print 'no element of author'
                    continue
                
                if item['url'].find("html?") > 0 or item['url'].find("htm?") > 0:
                    item['url'] = "".join(item['url'].split("?")[0:-1])

                if self.r.exists(item['url']):
                    #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                    continue

                try:
                    item['source'] = self.tool.get_realname(item['medianame'])
                    item['medianame'] = ' '
                except:
                    pass
            
                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                if elem.find('div',class_='c-summary'):
                    item['abstract'] = elem.find('div',class_='c-summary').get_text()
                items.append(item)
        return items
                
    def normalize_time(self, time_text):
        time_text = time_text.encode('utf8')
        if re.match('\d{4}.*?\d{1,2}.*?\d{1,2}.*?\d{1,2}:\d{1,2}', time_text):
            time_text = time_text.replace('年'.encode('utf8'), '-').replace('月'.encode('utf8'), '-').replace('日'.encode('utf8'), '')
        else:
            # convert the non-standard (relative) time to a timestamp, then back to a standard time string
            time_digit = float(filter(str.isdigit, time_text))

            interval = 0
            if time_text.find('天'.encode('utf8')) > 0:
                interval = 86400
            elif time_text.find('时'.encode('utf8')) > 0:
                interval = 3600
            elif time_text.find('分'.encode('utf8')) > 0:
                interval = 60
            elif time_text.find('秒'.encode('utf8')) > 0:
                interval = 1
            else:
                return time_text
            
            time_true = time.time() - time_digit*interval
            time_text = time.strftime("%Y-%m-%d %H:%M", time.localtime(time_true))

        return time_text
Example 16
class XicibbsSpider(Spider):
    name = "xicibbs1"
    domain_url = "http://www.xici.net/"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    xici_dict = dict()
    test_hbase = True

    def __init__(self):
        super(XicibbsSpider, self).__init__()
        # bind the initial/finalize callbacks to the engine started/stopped events
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()
        # persist the URLs

    def getStartUrl(self):
        # initialize the query keywords from a file
        fp = open('xici.txt', 'rb')
        for line in fp.readlines():
            keys = line.split('\t')
            self.xici_dict.setdefault(keys[1], keys[0].decode('utf8'))
        fp.close()

        tag = '?sort=date'
        for key in self.xici_dict.keys():
            self.start_urls.append(key + tag)

    # an example of a callback returning multiple Requests and Items
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1,
                                        table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')

        #print '====start %s==' %response.url

        # extract and parse the news page contents
        items = self.parse_items(response)

        for item in items:
            yield Request(url=item['url'],
                          meta={'item': item},
                          callback=self.parse_content)

    def parse_content(self, response):
        item = response.meta['item']

        main_content = response.xpath('//head').extract()[0]
        content_list = re.findall('({"del_w".*?})', main_content)

        if len(content_list) > 0:
            try:
                #store the keys
                content_list[0] = re.sub('<div.*?>', '<p>',
                                         content_list[0]).replace(
                                             '</div>', '</p>')
                tags = re.findall('<.*?>', content_list[0].encode('utf8'))
                tagdict = dict()
                for i in range(len(tags)):
                    tagdict.setdefault('&tag_' + str(i) + ';', tags[i])

                for key in tagdict.keys():
                    content_list[0] = content_list[0].replace(
                        tagdict[key], key)
                    tagdict[key] = str(tagdict[key].replace('\\"', ''))

                content_list[0] = content_list[0].replace('{',
                                                          '').replace('}', '')
                maindict = json.loads('{' + content_list[0] + '}',
                                      encoding='utf8')
                item['medianame'] = maindict['UserName']
                item['pubtime'] = maindict['really_updated_at'][:-3]
                if self.tool.old_news(item['pubtime']):
                    return

                item['content'] = []
                for content in content_list:
                    content = re.sub('<.*?>', '',
                                     content).replace('{',
                                                      '').replace('}', '')
                    content_dict = json.loads('{' + content + '}',
                                              encoding='utf8')
                    if content_dict.has_key('floorcontent'):
                        #release the tags
                        for key in tagdict.keys():
                            content_dict['floorcontent'] = content_dict[
                                'floorcontent'].replace(key, tagdict[key])

                        content_dict['floorcontent'] = content_dict[
                            'floorcontent']
                        item['content'].append(content_dict['floorcontent'])
                        #only get the first floor
                        break
                if item:
                    item['content'] = self.dc.process(
                        '<div>' + ' '.join(item['content']) + '</div>')
                    print 'url: ' + item['url'] + ' ' + str(
                        item['pubtime']) + ' is added'
                    return item
            except:
                print item['url'] + ' load failed.'
                pass
        else:
            return

    def parse_items(self, response):
        elem_list = []
        items = []
        content = re.findall(r'"docinfo":\[.*?\]', response.body)
        if self.xici_dict.has_key(response.url.replace('?sort=date', '')):
            source_name = self.xici_dict[response.url.replace(
                '?sort=date', '')]
        else:
            source_name = '西祠胡同'

        if len(content) > 0:
            elem_list = re.findall('\{\".*?visited\":[a-z]{4,5}\}', content[0])

        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'forum'

                elem = elem.decode('gb18030')
                try:
                    elem = json.loads(elem)
                except:
                    print elem
                    continue

                item['url'] = 'http://www.xici.net/d%s.htm' % elem['aDocs_i_0']
                if self.r.exists(item['url']):
                    continue
                item['title'] = elem['aDocs_i_1']

                item['source'] = source_name
                item['channel'] = 'Search engine'

                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M",
                                                    time.localtime())
                item['pubtime'] = item['collecttime'][0:4] + '-' + elem[
                    'ShortDate']
                if self.tool.old_news(item['pubtime']):
                    continue

                items.append(item)

        return items
Example 17
class TianyaBBSSpider(Spider):
    name = "tianyabbs"
    domain_url = "http://search.tianya.cn/"
    tool = Utools()    
    dc = dataCleaner()
    start_urls = []
    test_hbase = True   
    
    def __init__ (self):
        super(TianyaBBSSpider,self).__init__()
        # bind the initial/finalize callbacks to the engine started/stopped events
        dispatcher.connect(self.initial,signals.engine_started)
        dispatcher.connect(self.finalize,signals.engine_stopped)
    
    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host = self.tool.HOST_REDIS1, port = 6379, db = 3)        
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()
        # persist the URLs

    def getStartUrl(self):
        # initialize the query keywords from a file
        # sort by post time
        pageTag = '&s=4'
        # sort by reply time
        #pageTag = '&s=6'
        # default is relevance ranking
        
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                query_url = '/bbs?q=' + urllib.quote(query.encode('utf8')) + pageTag
                self.start_urls.append(self.domain_url + query_url)
        
    # an example of a callback returning multiple Requests and Items
    def parse(self,response):
        #print '====start %s==' %response.url
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable=HBaseTest(host = self.tool.HOST_HBASE1, 
                                      table = 'origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        
        # extract and parse the news page contents
        items = self.parse_items(response)
        # build an XPath Selector object for extracting page elements
        sel = Selector(response)
        # extract the detail-page links from the search result page

        requests = []
        for url in sel.xpath(u'//div[@class="long-pages"]/a[text()="下一页"]/@href').re('go\(([\d]*?)\)'):
            tp_url = re.sub('&pn=[\d]+?', '', response.url)
            requests.append(self.make_requests_from_url(tp_url + '&pn=' + url))

        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
            
        #return requests
        for request in requests:
            continue
            yield request

    def parse_content(self,response):
        item = response.meta['item']
        
        if response.body:
            bsoup = BeautifulSoup(response.body)
            
            item_content_list = bsoup.find_all('div', class_='bbs-content')
            
            #only get the first floor
            if len(item_content_list) > 0:
                item['content'] = item_content_list[0].extract().encode('utf8')
                #item['content'] = ' '.join(v.get_text().encode('utf8') for v in item_content_list)
            item['content'] = re.sub(r'\n|\t|\r', '', item['content'])
            item['content'] = self.dc.process(item['content'])
            if item['content']:
                print 'url: ' + item['url'] + ' is added' 
                return item

    def parse_items(self,response):
        if response.body:
            bsoup = BeautifulSoup(response.body)
        main_content = bsoup.select('div#main')[0]
        # the result list contains one redundant entry
        if main_content:
            if main_content.select('li#search_msg'):
                elem_list = main_content.find_all('li')[:-1]
            else:
                elem_list = main_content.find_all('li')
                
        items = []
        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'forum'
                item['source'] = '天涯论坛'
                item['channel'] = 'Search engine'
                try:
                    item['title'] = elem.div.h3.a.get_text()
                except:
                    continue
                item['url'] = elem.div.h3.a['href']         
                
                author = elem.find('p', class_='source')
                if author:
                    item['medianame'] = author.a.get_text()
                    #item['author'] = author.a.get_text()                    
                    if author.span.get_text().find('-') > 0:
                        item['pubtime'] = author.span.get_text()
                    else:
                        item['pubtime'] = author.find_all('span')[-2].get_text()
                    if self.tool.old_news(item['pubtime']):
                        continue
                else:
                    print 'element of author not found!\n'
                    return

                if self.r.exists(item['url']):  
                    #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                    continue
                
                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())                
                item['abstract']=elem.div.p.get_text()
                items.append(item)
        return items
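Every spider in these examples skips URLs for which self.r.exists(item['url']) is already true in Redis db 3, but nothing shown here ever writes those keys, so presumably an item pipeline elsewhere in the project records them. A minimal, hypothetical sketch of such a pipeline (the class name and the plain SET key are assumptions, not the project's actual code):

from redis import Redis

class RedisSeenUrlPipeline(object):
    #hypothetical pipeline: record each stored item's url so that the
    #self.r.exists(item['url']) checks in the spiders start returning True
    def __init__(self):
        self.r = Redis(host='127.0.0.1', port=6379, db=3)

    def process_item(self, item, spider):
        self.r.set(item['url'], item['collecttime'])
        return item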
Esempio n. 18
0
class BaiduNewSpider(Spider):
    name = "baidunew"
    domain_url = "http://news.baidu.com"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(BaiduNewSpider, self).__init__()
        # bind the initial/finalize callbacks to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log("---started----")
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def finalize(self):
        self.log("---stopped---")
        # persist the crawled urls

    def getStartUrl(self):
        # initialize the query keywords from file
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                # sorted by time by default
                query_url = "/ns?rn=20&word=" + urllib.quote(query.encode("utf8")) + "&ct=0"
                self.start_urls.append(self.domain_url + query_url)

    # an example of returning multiple Requests and Items from a single callback
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table="origin")
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider("no thrift or hbase server!")

        # print '====start %s==' %response.url

        # extract and parse the result entries
        items = self.parse_items(response)
        # build an XPath Selector for element extraction
        sel = Selector(response)

        # try to find the next page
        requests = []
        try:
            url = sel.xpath(u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1]
            requests.append(self.make_requests_from_url(self.domain_url + url))
        except:
            pass

        for item in items:
            yield Request(url=item["url"], meta={"item": item}, callback=self.parse_content)
        # return requests
        for request in requests:
            continue
            yield request

    def parse_content(self, response):
        item = response.meta["item"]
        try:
            charset = response.encoding
        except:
            charset = "utf-8"

        if response.body:
            try:
                bsoup = BeautifulSoup(response.body, from_encoding=charset)
                item["content"] = self.dc.process(str(bsoup).decode(charset))
            except:
                bsoup = BeautifulSoup(response.body, from_encoding="utf-8")
                item["content"] = self.dc.process(str(bsoup))
            if len(item["content"].encode("utf8")) < len(item["abstract"]):
                item["content"] = item["abstract"].replace("百度快照", "")
            if item["content"]:
                print "url: " + item["url"] + " is added"
                return item

    def parse_items(self, response):
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding="utf-8")
        main_content = 0
        try:
            main_content = bsoup.select("div#container")[0].select("div#content_left")[0]
        except:
            print "url: " + response.url + " is empty"
            return []
        if main_content:
            elem_list = main_content.find_all("div", class_="result")
        items = []

        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item["dtype"] = "news"
                item["source"] = "百度新闻"
                item["channel"] = "Search engine"
                try:
                    item["title"] = elem.h3.a.get_text()
                except:
                    continue
                item["url"] = elem.h3.a["href"]

                author = elem.find("p", class_="c-author")
                if author:
                    source_time = author.get_text().split()
                    if re.match(r"\d{4}.*?\d{1,2}.*?\d{1,2}", source_time[0].encode("utf8")):
                        item["medianame"] = "None"
                        item["pubtime"] = self.normalize_time(str(" ".join(source_time)))
                    elif filter(str.isdigit, source_time[0].encode("utf8")) and len(source_time) == 1:
                        item["medianame"] = "None"
                        item["pubtime"] = self.normalize_time(str(" ".join(source_time)))
                    else:
                        item["medianame"] = source_time[0]
                        item["pubtime"] = self.normalize_time(str(" ".join(source_time[1:])))
                    if self.tool.old_news(item["pubtime"]):
                        continue
                else:
                    print "no element of author"
                    continue

                if item["url"].find("html?") > 0 or item["url"].find("htm?") > 0:
                    item["url"] = "".join(item["url"].split("?")[0:-1])

                if self.r.exists(item["url"]):
                    # if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                    continue

                try:
                    item["source"] = self.tool.get_realname(item["medianame"])
                    item["medianame"] = " "
                except:
                    pass

                item["collecttime"] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                if elem.find("div", class_="c-summary"):
                    item["abstract"] = elem.find("div", class_="c-summary").get_text()
                items.append(item)
        return items

    def normalize_time(self, time_text):
        time_text = time_text.encode("utf8")
        if re.match("\d{4}.*?\d{1,2}.*?\d{1,2}.*?\d{1,2}:\d{1,2}", time_text):
            time_text = (
                time_text.replace("年".encode("utf8"), "-")
                .replace("月".encode("utf8"), "-")
                .replace("日".encode("utf8"), "")
            )
        else:
            # convert a relative time to a timestamp, then back to a formatted string
            time_digit = float(filter(str.isdigit, time_text))

            interval = 0
            if time_text.find("天".encode("utf8")) > 0:
                interval = 86400
            elif time_text.find("时".encode("utf8")) > 0:
                interval = 3600.0
            elif time_text.find("分".encode("utf8")) > 0:
                interval = 60
            elif time_text.find("秒".encode("utf8")) > 0:
                interval = 1
            else:
                return time_text

            time_true = time.time() - time_digit * interval
            time_text = time.strftime("%Y-%m-%d %H:%M", time.localtime(time_true))

        return time_text
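normalize_time above handles both absolute dates such as 2016年05月01日 12:30 and relative ones such as 3小时前 ("3 hours ago"). A condensed standalone sketch of the same behaviour, written with unicode literals to sidestep the str/unicode encoding pitfalls (the sample inputs are invented):

# -*- coding: utf-8 -*-
import re
import time

def normalize_time(time_text):
    #absolute dates: swap the CJK date markers for '-' separators
    if re.match(ur'\d{4}.*?\d{1,2}.*?\d{1,2}.*?\d{1,2}:\d{1,2}', time_text):
        return time_text.replace(u'年', '-').replace(u'月', '-').replace(u'日', '')
    #relative dates: pull out the digits and subtract that many units from "now"
    digits = u''.join(c for c in time_text if c.isdigit())
    if not digits:
        return time_text
    for unit, seconds in [(u'天', 86400), (u'时', 3600), (u'分', 60), (u'秒', 1)]:
        if unit in time_text:
            then = time.time() - float(digits) * seconds
            return time.strftime('%Y-%m-%d %H:%M', time.localtime(then))
    return time_text

print normalize_time(u'3小时前')               #three hours before now
print normalize_time(u'2016年05月01日 12:30')  #2016-05-01 12:30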
Esempio n. 19
0
class ToutiaoSpider(Spider):
    name = "toutiaonew"
    domain_url = "http://toutiao.com/search_content"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(ToutiaoSpider, self).__init__()
        #bind the initial/finalize callbacks to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def finalize(self):
        self.log('---stopped---')
        #persist the crawled urls

    def getStartUrl(self):
        #initialize the query keywords from file
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                #sorted by time by default
                query_url = "?offset=0&format=json&count=50&keyword=" + urllib.quote(
                    query.encode('utf8'))
                self.start_urls.append(self.domain_url + query_url)

    #an example of returning multiple Requests and Items from a single callback
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1,
                                        table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')

        #print '====start %s==' %response.url

        #extract and parse the result entries
        items = self.parse_items(response)
        #build an XPath Selector for element extraction
        sel = Selector(response)

        #try to find the next page
        requests = []
        try:
            url = sel.xpath(
                u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1]
            requests.append(self.make_requests_from_url(self.domain_url + url))
        except:
            pass

        for item in items:
            yield Request(url=item['url'],
                          meta={'item': item},
                          callback=self.parse_content)
        #return requests
        for request in requests:
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']
        charset = 'utf-8'

        try:
            for meta_item in response.xpath('//meta[@http-equiv]').extract():
                is_exist = re.search('charset=(.*?)"', meta_item)
                if is_exist:
                    charset = is_exist.group(1)
                    break
        except:
            pass

        if response.body:
            try:
                bsoup = BeautifulSoup(response.body, from_encoding=charset)
            except:
                bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            item['content'] = self.dc.process(str(bsoup))
            if len(item['content'].encode('utf8')) < len(item['abstract']):
                item['content'] = item['abstract'].replace('百度快照', '')
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self, response):
        if response.body:
            itemdatas = json.loads(response.body)['data']
        else:
            return []

        items = []
        for itemdata in itemdatas:
            item = DataItem()
            item['dtype'] = 'news'
            item['source'] = '今日头条'
            item['channel'] = 'Search engine'

            item['collecttime'] = time.strftime("%Y-%m-%d %H:%M",
                                                time.localtime())
            item['pubtime'] = itemdata['datetime']
            if self.tool.old_news(item['pubtime']):
                continue

            item['url'] = itemdata['display_url'].encode('utf8')

            if item['url'].find("html?") > 0 or item['url'].find("htm?") > 0:
                item['url'] = "".join(item['url'].split("?")[0:-1])

            if self.r.exists(item['url']):
                #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                continue

            item['title'] = itemdata['title'].encode('utf8')
            item['medianame'] = itemdata['source'].encode('utf8')
            item['abstract'] = itemdata['abstract'].encode('utf8')
            items.append(item)

        return items
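The next-page XPath in ToutiaoSpider.parse targets Baidu-style HTML and cannot match toutiao's JSON response, so only the first 50 results per keyword are ever fetched. If deeper paging were wanted, one assumed alternative is to step the offset parameter of the search_content endpoint directly (parameter names are taken from the URL above; the endpoint's real paging behaviour is not verified here):

# -*- coding: utf-8 -*-
import urllib

def toutiao_page_urls(keyword, pages=3, count=50):
    #build one search_content url per page by stepping offset in multiples of count
    base = "http://toutiao.com/search_content?format=json&count=%d&offset=%d&keyword=%s"
    quoted = urllib.quote(keyword.encode('utf8'))
    return [base % (count, page * count, quoted) for page in range(pages)]

for url in toutiao_page_urls(u'雾霾'):
    print url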
Esempio n. 20
0
class XicibbsSpider(Spider):
    name = "xicibbs1"
    domain_url = "http://www.xici.net/"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    xici_dict = dict()
    test_hbase = True
    
    def __init__ (self):
        super(XicibbsSpider,self).__init__()
        #bind the initial/finalize callbacks to the engine start/stop signals
        dispatcher.connect(self.initial,signals.engine_started)
        dispatcher.connect(self.finalize,signals.engine_stopped)
    
    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host = self.tool.HOST_REDIS1, port = 6379, db = 3)
    
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()
        #persist the crawled urls

    def getStartUrl(self):
        #initialize the board urls and names from xici.txt
        fp = open('xici.txt', 'rb')
        for line in fp.readlines():
            keys = line.split('\t')
            self.xici_dict.setdefault(keys[1], keys[0].decode('utf8'))
        fp.close()
        
        tag = '?sort=date'
        for key in self.xici_dict.keys():
            self.start_urls.append(key + tag)
            
    #an example of returning multiple Requests and Items from a single callback
    def parse(self,response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable=HBaseTest(host = self.tool.HOST_HBASE1, 
                                      table = 'origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
                        
        #print '====start %s==' %response.url
        
        #extract and parse the result entries
        items = self.parse_items(response)

        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)

    def parse_content(self,response):
        item = response.meta['item']
    
        main_content = response.xpath('//head').extract()[0]
        content_list = re.findall('({"del_w".*?})', main_content)
        
        if len(content_list) > 0:
            try:
                #store the keys
                content_list[0] = re.sub('<div.*?>', '<p>', content_list[0]).replace('</div>', '</p>')
                tags = re.findall('<.*?>', content_list[0].encode('utf8'))
                tagdict = dict()
                for i in range(len(tags)):
                    tagdict.setdefault('&tag_' + str(i) + ';', tags[i])
                
                for key in tagdict.keys():
                    content_list[0] = content_list[0].replace(tagdict[key], key)
                    tagdict[key] = str(tagdict[key].replace('\\"', ''))

                content_list[0] = content_list[0].replace('{','').replace('}', '')
                maindict = json.loads('{' + content_list[0] + '}', encoding='utf8')
                item['medianame'] = maindict['UserName']
                item['pubtime'] = maindict['really_updated_at'][:-3]
                if self.tool.old_news(item['pubtime']):
                    return
                
                item['content'] = []    
                for content in content_list:
                    content = re.sub('<.*?>', '', content).replace('{','').replace('}', '')
                    content_dict = json.loads('{' + content + '}', encoding='utf8')
                    if content_dict.has_key('floorcontent'):
                        #release the tags
                        for key in tagdict.keys():
                            content_dict['floorcontent'] = content_dict['floorcontent'].replace(key, tagdict[key])

                        content_dict['floorcontent'] = content_dict['floorcontent']                         
                        item['content'].append(content_dict['floorcontent'])
                        #only get the first floor                        
                        break
                if item:
                    item['content'] = self.dc.process('<div>' + ' '.join(item['content']) + '</div>')
                    print 'url: ' + item['url'] + ' ' + str(item['pubtime']) + ' is added'
                    return item
            except:
                print item['url'] + ' load failed.'
                pass
        else:
            return

    def parse_items(self, response):
        elem_list = []        
        items = []
        content = re.findall(r'"docinfo":\[.*?\]', response.body)
        if self.xici_dict.has_key(response.url.replace('?sort=date', '')):
            source_name = self.xici_dict[response.url.replace('?sort=date', '')]
        else:
            source_name = '西祠胡同'
            
        if len(content) > 0:
            elem_list = re.findall('\{\".*?visited\":[a-z]{4,5}\}', content[0])
        
        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'forum'
                
                elem = elem.decode('gb18030')
                try:
                    elem = json.loads(elem)
                except:
                    print elem
                    continue
                
                item['url'] = 'http://www.xici.net/d%s.htm' % elem['aDocs_i_0']
                if self.r.exists(item['url']):
                    continue
                item['title'] = elem['aDocs_i_1']
                
                item['source'] = source_name
                item['channel'] = 'Search engine'
                
                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                item['pubtime'] = item['collecttime'][0:4] + '-' + elem['ShortDate']
                if self.tool.old_news(item['pubtime']):
                    continue
                
                items.append(item)
                
        return items
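parse_content above can only json.loads the embedded {"del_w"...} fragments after hiding the HTML tags they contain, because those tags carry escaped quotes that break the JSON. A toy round-trip of that masking trick (the sample fragment is invented):

import re
import json

def mask_tags(fragment):
    #replace every html tag with a &tag_i; placeholder and remember the mapping
    tags = re.findall(r'<.*?>', fragment)
    mapping = {}
    for i, tag in enumerate(tags):
        placeholder = '&tag_%d;' % i
        fragment = fragment.replace(tag, placeholder, 1)
        mapping[placeholder] = tag.replace('\\"', '')
    return fragment, mapping

raw = '{"floorcontent":"<p class=\\"quote\\">hello</p>","del_w":0}'
masked, mapping = mask_tags(raw)
text = json.loads(masked)['floorcontent']
for placeholder, tag in mapping.items():
    text = text.replace(placeholder, tag)
print text   #<p class=quote>hello</p> (escaped quotes stripped, as in the spider)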
Esempio n. 21
0
class LuChengBBSSpider(Spider):
    name = "luchengbbs"
    domain_url = "http://www.zjxslm.com/"
    combine_url = ("forum.php?mod=forumdisplay&fid=%d&orderby=lastpost" +
                   "&filter=dateline&dateline=86400")

    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(LuChengBBSSpider, self).__init__()
        #bind the initial/finalize callbacks to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def finalize(self):
        self.log('---stopped---')
        #persist the crawled urls

    def getStartUrl(self):
        #initialize the start urls from the forum ids
        ids = range(195, 209)
        ids.append(193)
        for idx in ids:
            self.start_urls.append("http://www.zjxslm.com/forum.php?" +
                                   "mod=forumdisplay&orderby=lastpost" +
                                   "&filter=dateline&dateline=86400&fid=%d" %
                                   idx)

    #an example of returning multiple Requests and Items from a single callback
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1,
                                        table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')

        print '====start %s==' % response.url

        #extract and parse the result entries
        items = self.parse_items(response)

        #try to find the next page
        requests = []
        if response.url.find('page') < 0:
            #build an XPath Selector for element extraction
            sel = Selector(response)
            page_num = sel.xpath('//div[@class="pg"]/label/span')

            if page_num:
                page_num = re.sub("<.*?>", "", page_num.extract()[0])
                page_num = int(re.search("([\d]+)", page_num).group(1))
                for idx in range(2, page_num + 1):
                    url = response.url + ("&page=%d" % idx)
                    requests.append(self.make_requests_from_url(url))

        for item in items:
            yield Request(url=item['url'],
                          meta={'item': item},
                          callback=self.parse_content)
        #return requests
        for request in requests:
            yield request

    def parse_content(self, response):
        item = response.meta['item']

        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            item['pubtime'] = bsoup.find_all(
                'div', class_="authi")[1].em.span['title']
            if self.tool.old_news(item['pubtime'][0:-3]):
                return

            item['content'] = str(bsoup.find('div', class_='pcb'))
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self, response):
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
        main_content = bsoup.select('div#threadlist')[0]
        if main_content:
            elem_list = main_content.find_all('tbody')
        items = []

        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'forum'
                item['source'] = '鹿城论坛'
                item['channel'] = 'Search engine'

                #grab the thread id to build the detail url
                try:
                    tid = elem['id']
                except:
                    continue

                if tid.find('_') < 0:
                    continue
                else:
                    tid = tid.split('_')[1]

                item['url'] = self.domain_url + 'thread-' + tid + '-1-1.html'
                if self.r.exists(item['url']):
                    #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                    continue

                item['title'] = elem.find('th').get_text().split('\n')[2]
                item['medianame'] = elem.tr.find(
                    'td', class_='by').cite.get_text().replace('\n', '')
                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M",
                                                    time.localtime())

                items.append(item)

        return items

    def normalize_time(self, time_text):
        time_text = time_text.encode('utf8')
        if re.match('\d{4}.*?\d{1,2}.*?\d{1,2}.*?\d{1,2}:\d{1,2}', time_text):
            time_text = time_text.replace('年'.encode('utf8'), '-').replace(
                '月'.encode('utf8'), '-').replace('日'.encode('utf8'), '')
        else:
            #convert a relative time to a timestamp, then back to a formatted string
            time_digit = float(filter(str.isdigit, time_text))

            interval = 0
            if time_text.find('天'.encode('utf8')) > 0:
                interval = 86400
            elif time_text.find('时'.encode('utf8')) > 0:
                interval = 3600.
            elif time_text.find('分'.encode('utf8')) > 0:
                interval = 60
            elif time_text.find('秒'.encode('utf8')) > 0:
                interval = 1
            else:
                return time_text

            time_true = time.time() - time_digit * interval
            time_text = time.strftime("%Y-%m-%d %H:%M",
                                      time.localtime(time_true))

        return time_text
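parse_items above assumes each <tbody> id on the Discuz! board listing looks like normalthread_<tid> and splices the numeric part into a thread-<tid>-1-1.html detail URL. A minimal sketch of that step (the sample id is invented):

def thread_url(domain_url, tbody_id):
    #ids without an underscore (separator rows, ads) are skipped by the spider
    if '_' not in tbody_id:
        return None
    tid = tbody_id.split('_')[1]
    return domain_url + 'thread-' + tid + '-1-1.html'

print thread_url('http://www.zjxslm.com/', 'normalthread_123456')
#http://www.zjxslm.com/thread-123456-1-1.html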
Esempio n. 22
0
class SogouWeixinSpider(Spider):
    name = "sogouwx"
    domain_url = "http://weixin.sogou.com/weixin"
    UA = "Mozilla/5.0 (X11; Linux x86_64; rv:38.0) Gecko/20100101 Firefox/38.0"
    start_urls = []
    tool = Utools()
    dc = dataCleaner()
    time_interval = 0
    cookie = []
    test_hbase = True

    custom_settings = {"DOWNLOAD_DELAY": 0.2, "COOKIES_ENABLED": True}

    def __init__(self):
        super(SogouWeixinSpider, self).__init__()
        # bind the initial/finalize callbacks to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log("---started----")
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)

    def finalize(self):
        self.log("---stopped---")
        # persist the crawled urls

    def getStartUrl(self):
        # initialize the query keywords from file
        # past 24 hours only
        timeTag = "&tsn=1"
        qlist = GetQuery().get_data()

        for query in qlist:
            if query:
                query_url = "?type=2&query=" + urllib.quote(query.encode("utf8")) + timeTag
                self.start_urls.append(self.domain_url + query_url)

    def start_requests(self):
        for i in range(len(self.start_urls)):
            if i % 5 == 0:
                self.cookie = self.update_cookies()
            yield Request(self.start_urls[i], cookies=self.cookie)

    # an example of returning multiple Requests and Items from a single callback
    def parse(self, response):
        print "====start %s==" % response.url
        # print response.body
        time.sleep(random.randint(self.time_interval, 2))

        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1, table="origin")
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider("no thrift or hbase server!")

        # the query was not picked up successfully
        if response.url == self.domain_url:
            print "error of query"
            return

        # extract and parse the result entries
        items = self.parse_items(response)
        # build an XPath Selector for element extraction
        sel = Selector(response)

        requests = []
        for url in sel.xpath(u'//a[@class="np"]/@href').extract():
            requests.append(self.make_requests_from_url(self.domain_url + url))

        for item in items:
            yield Request(url=item["url"], meta={"item": item}, callback=self.parse_content)

        for request in requests:
            continue
            yield request

    def parse_content(self, response):
        item = response.meta["item"]
        if response.body:
            res = re.sub("\n|\r|\t", "", response.body)
            res = re.sub("<script.*?</script>", "", res)
            bsoup = BeautifulSoup(res, from_encoding="utf8")

            try:
                item["content"] = str(bsoup.select("div#js_content")[0]).encode("utf8")
                print "url:" + item["url"] + " is added"
                return item
            except:
                print "url:" + item["url"] + " load failed"

    def parse_items(self, response):
        if response.body:
            # strip interfering <!...> fragments
            res = re.sub(r"<!.*?>", "", response.body)
            bsoup = BeautifulSoup(res, from_encoding="utf8")
        main_content = bsoup.select("div#wrapper")[0]

        if main_content:
            elem_list = main_content.find_all("div", class_="txt-box")
        items = []
        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item["dtype"] = "weixin"
                item["source"] = "搜狗微信"
                item["channel"] = "Search engine"
                if elem.h4.a.get_text():
                    item["title"] = elem.h4.a.get_text()
                else:
                    continue
                item["url"] = elem.h4.a["href"]
                item["medianame"] = elem.div.a["title"]
                # convert the timestamp to a formatted time string
                item["pubtime"] = time.strftime("%Y-%m-%d %H:%M", time.localtime(float(elem.div["t"])))
                if self.tool.old_news(item["pubtime"]):
                    continue
                if self.r.exists(item["url"]):
                    # if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                    continue

                item["collecttime"] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                item["abstract"] = elem.p.get_text()
                items.append(item)
        return items

    def update_cookies(self):
        s = requests.Session()
        s.headers = {"User-Agent": self.UA}

        r = s.post("http://weixin.sogou.com/antispider/thank.php")
        pcontent = re.search("setCookie\('SNUID'.*?\)", r.content).group(0)
        SNUID = eval(pcontent.split(",")[1])

        suv = "".join([str(int(time.time() * 1000000) + random.randint(0, 1000))])
        s.cookies["SUV"] = suv
        s.cookies["SNUID"] = SNUID

        return dict(s.cookies)
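start_requests above renews the SUV/SNUID cookies once every five start URLs so Sogou's anti-spider page is less likely to trigger. The rotation pattern itself, with update_cookies() treated as a black box that returns a cookie dict, is sketched below (an illustration, not the project's code):

from scrapy import Request

def rotated_start_requests(start_urls, update_cookies, every=5):
    #yield one Request per url, refreshing the cookie jar every `every` requests
    cookies = {}
    for i, url in enumerate(start_urls):
        if i % every == 0:
            cookies = update_cookies()
        yield Request(url, cookies=cookies)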
Esempio n. 23
0
class XiciBBSSpider(Spider):
    name = "xicibbs"
    domain_url = "http://baidu.xici.net/cse"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True

    def __init__(self):
        super(XiciBBSSpider, self).__init__()
        #bind the initial/finalize callbacks to the engine start/stop signals
        dispatcher.connect(self.initial, signals.engine_started)
        dispatcher.connect(self.finalize, signals.engine_stopped)

    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host=self.tool.HOST_REDIS1, port=6379, db=3)
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()
        #persist the crawled urls

    def getStartUrl(self):
        #initialize the query keywords from file
        tag = '&s=11800334043319024933&srt=lds&sti=1440&nsid=0'
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                #sorted by time by default
                query_url = "/search?q=" + urllib.quote(
                    query.encode('utf8')) + tag
                self.start_urls.append(self.domain_url + query_url)

    #an example of returning multiple Requests and Items from a single callback
    def parse(self, response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable = HBaseTest(host=self.tool.HOST_HBASE1,
                                        table='origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')

        #print '====start %s==' %response.url

        #extract and parse the result entries
        items = self.parse_items(response)
        #build an XPath Selector for element extraction
        sel = Selector(response)

        #try to find the next page
        requests = []
        try:
            url = sel.xpath(
                u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1]
            requests.append(self.make_requests_from_url(self.domain_url + url))
        except:
            pass

        for item in items:
            yield Request(url=item['url'],
                          meta={'item': item},
                          callback=self.parse_content)

        #return requests
        for request in requests:
            continue
            yield request

    def parse_content(self, response):
        item = response.meta['item']

        main_content = response.xpath('//head').extract()[0]
        content_list = re.findall('({"del_w".*?})', main_content)

        if len(content_list) > 0:
            try:
                #store the keys
                content_list[0] = re.sub('<div.*?>', '<p>',
                                         content_list[0]).replace(
                                             '</div>', '</p>')
                tags = re.findall('<.*?>', content_list[0].encode('utf8'))
                tagdict = dict()
                for i in range(len(tags)):
                    tagdict.setdefault('&tag_' + str(i) + ';', tags[i])

                for key in tagdict.keys():
                    content_list[0] = content_list[0].replace(
                        tagdict[key], key)
                    tagdict[key] = str(tagdict[key].replace('\\"', ''))

                content_list[0] = content_list[0].replace('{',
                                                          '').replace('}', '')
                maindict = json.loads('{' + content_list[0] + '}',
                                      encoding='utf8')
                item['medianame'] = maindict['UserName']
                item['pubtime'] = maindict['really_updated_at'][:-3]
                if self.tool.old_news(item['pubtime']):
                    return

                item['content'] = []
                for content in content_list:
                    content = re.sub('<.*?>', '',
                                     content).replace('{',
                                                      '').replace('}', '')
                    content_dict = json.loads('{' + content + '}',
                                              encoding='utf8')
                    if content_dict.has_key('floorcontent'):
                        #release the tags
                        for key in tagdict.keys():
                            content_dict['floorcontent'] = content_dict[
                                'floorcontent'].replace(key, tagdict[key])

                        content_dict['floorcontent'] = content_dict[
                            'floorcontent']
                        item['content'].append(content_dict['floorcontent'])
                        #only get the first floor
                        break
                if item:
                    item['content'] = self.dc.process(
                        '<div>' + ' '.join(item['content']) + '</div>')
                    print 'url: ' + item['url'] + ' is added'
                    return item
            except:
                print item['url'] + ' load failed.'
                pass
        else:
            return

    def parse_items(self, response):
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
        main_content = bsoup.select('div#results')[0]
        if main_content:
            elem_list = main_content.find_all('div', class_='result')
        items = []

        if len(elem_list) > 0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'forum'
                item['source'] = '西祠胡同'
                item['channel'] = 'Search engine'
                try:
                    item['title'] = elem.h3.a.get_text()
                except:
                    continue
                item['url'] = elem.h3.a['href'].replace('user', 'www')

                if self.r.exists(item['url']):
                    #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                    continue

                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M",
                                                    time.localtime())
                if elem.find('div', class_='c-summary'):
                    item['abstract'] = elem.find(
                        'div', class_='c-content').get_text()
                items.append(item)

        #deduplicate by url
        new_items = []
        url_list = []
        for item in items:
            if item['url'] not in url_list:
                new_items.append(item)
                url_list.append(item['url'])
        items = new_items
        return items
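The URL de-duplication loop at the end of parse_items keeps the first occurrence of each URL but pays a linear list lookup per item; a set gives the same order-preserving result with O(1) membership tests. A small sketch:

def dedupe_by_url(items):
    #keep the first item seen for each url, preserving the original order
    seen = set()
    unique = []
    for item in items:
        if item['url'] not in seen:
            seen.add(item['url'])
            unique.append(item)
    return unique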
Esempio n. 24
0
class TiebaBBSSpider(Spider):
    name = "tiebabbs"
    domain_url = "http://tieba.baidu.com"
    tool = Utools()    
    dc = dataCleaner()
    start_urls = []
    test_hbase = True
 
    def __init__ (self):
        super(TiebaBBSSpider,self).__init__()
        #bind the initial/finalize callbacks to the engine start/stop signals
        dispatcher.connect(self.initial,signals.engine_started)
        dispatcher.connect(self.finalize,signals.engine_stopped)
    
    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host = self.tool.HOST_REDIS1, port = 6379, db = 3)
        #self.htable=HBaseTest(table = 'origin')

    def finalize(self):
        self.log('---stopped---')
        #self.htable.close_trans()
        #persist the crawled urls

    def getStartUrl(self):
        #initialize the query keywords from file
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                #sorted by time by default
                query_url = "/f/search/res?ie=utf-8&rn=20&qw=" + urllib.quote(query.encode('utf8')) + '&ct=0'
                self.start_urls.append(self.domain_url + query_url)

    #an example of returning multiple Requests and Items from a single callback
    def parse(self,response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable=HBaseTest(host = self.tool.HOST_HBASE1, 
                                      table = 'origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        
        #print '====start %s==' %response.url
        
        #extract and parse the result entries
        items = self.parse_items(response)
        #build an XPath Selector for element extraction
        sel = Selector(response)

        #try to find the next page
        requests = []
        try:
            url = sel.xpath(u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1]
            requests.append(self.make_requests_from_url(self.domain_url+url))
        except:
            pass

        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            continue
            yield request

    def parse_content(self,response):
        item = response.meta['item']
        if response.body:
            bsoup = BeautifulSoup(response.body,from_encoding='utf-8')
            if bsoup.find('h1', class_='core_title_txt'):
                item['title'] = bsoup.find('h1', class_='core_title_txt')['title']
            elif bsoup.find('h3', class_='core_title_txt'):
                item['title'] = bsoup.find('h3', class_='core_title_txt')['title']
            else:
                return
            
            timeform = '%Y-%m-%d %H:%M'
            pubtimes = [time.strptime(item['pubtime'], timeform)]
            for pubtime in re.findall(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}', str(bsoup)):
                pubtimes.append(time.strptime(pubtime, timeform))
        
            item['pubtime'] = time.strftime(timeform, min(pubtimes))
            if self.tool.old_news(item['pubtime']):
                print item['url'] + ' ' + item['pubtime']
                return           
            
            item['content'] = []
            for elem in bsoup.find_all('div', class_='d_post_content'):
                item['content'].append(str(elem.extract()))
                #only get the first floor
                break
            
            if item:
                item['content'] = ' '.join(item['content']).encode('utf8')
                item['content'] = self.dc.process(item['content'])
                print 'url: ' + item['url'] + ' is added'
                yield item

    def parse_items(self,response):
        if response.body:
            bsoup = BeautifulSoup(response.body,from_encoding='utf-8')
        main_content = bsoup.find('div', class_='s_post_list')
              
        items = []
        if main_content:
            elem_list = main_content.find_all('div', class_='s_post')
        else:
            return items
        
        if len(elem_list)>0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'forum'
                item['source'] = '百度贴吧'
                item['channel'] = 'Search engine'
                try:
                    item['pubtime'] = elem.find('font', class_='p_date').get_text()
                    if self.tool.old_news(item['pubtime']):
                        continue
                    
                    #item['title'] = elem.span.a.get_text()
                    item['medianame'] = elem.find('font', class_='p_violet').get_text()
                    item['abstract'] = elem.find('div',class_='p_content').get_text()           
                except:
                    continue
                
                item['url'] = self.domain_url + re.findall('(/p/.*?)[^\d]', elem.span.a['href'])[0]
                if self.r.exists(item['url']): 
                    #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                    continue
                
                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                items.append(item)
        #deduplicate by url
        new_items = []
        url_list = []
        for item in items:
            if item['url'] not in url_list:
                new_items.append(item)
                url_list.append(item['url'])
        items = new_items
        return items
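parse_content above collects every YYYY-MM-DD HH:MM string on the thread page, parses each with time.strptime, and keeps the earliest as the real publication time; struct_time tuples compare chronologically, so min() does the work. A minimal sketch with made-up timestamps:

import time

def earliest_pubtime(candidates, timeform='%Y-%m-%d %H:%M'):
    #parse every candidate and keep the oldest one
    parsed = [time.strptime(text, timeform) for text in candidates]
    return time.strftime(timeform, min(parsed))

print earliest_pubtime(['2016-05-03 10:00', '2016-05-01 08:30'])  #2016-05-01 08:30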
Esempio n. 25
0
class ToutiaoSpider(Spider):
    name = "toutiaonew"
    domain_url = "http://toutiao.com/search_content"
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True
    
    def __init__ (self):
        super(ToutiaoSpider,self).__init__()
        #bind the initial/finalize callbacks to the engine start/stop signals
        dispatcher.connect(self.initial,signals.engine_started)
        dispatcher.connect(self.finalize,signals.engine_stopped)
    
    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host = self.tool.HOST_REDIS1, port = 6379, db = 3)
            
    def finalize(self):
        self.log('---stopped---')
        #persist the crawled urls

    def getStartUrl(self):
        #initialize the query keywords from file
        qlist = GetQuery().get_data()
        for query in qlist:
            if query:
                #sorted by time by default
                query_url = "?offset=0&format=json&count=50&keyword=" + urllib.quote(query.encode('utf8'))
                self.start_urls.append(self.domain_url + query_url)

    #an example of returning multiple Requests and Items from a single callback
    def parse(self,response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable=HBaseTest(host = self.tool.HOST_HBASE1, 
                                      table = 'origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        
        #print '====start %s==' %response.url
        
        #extract and parse the result entries
        items = self.parse_items(response)
        #build an XPath Selector for element extraction
        sel = Selector(response)

        #try to find the next page
        requests = []
        try:
            url = sel.xpath(u'//p[@id="page"]/a[@class="n"]/@href').extract()[-1]
            requests.append(self.make_requests_from_url(self.domain_url+url))
        except:
            pass

        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            continue
            yield request

    def parse_content(self,response):
        item = response.meta['item']
        charset = 'utf-8'
        
        try:
            for meta_item in response.xpath('//meta[@http-equiv]').extract():
                is_exist = re.search('charset=(.*?)"', meta_item)
                if is_exist:
                    charset = is_exist.group(1)
                    break
        except:
            pass                 
        
        if response.body:
            try:
                bsoup = BeautifulSoup(response.body, from_encoding=charset)
            except:
                bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            item['content'] = self.dc.process(str(bsoup))
            if len(item['content'].encode('utf8')) < len(item['abstract']):
                item['content'] = item['abstract'].replace('百度快照', '')
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self,response):
        if response.body:
            itemdatas = json.loads(response.body)['data']
        else:
            return []
        
        items = []
        for itemdata in itemdatas:
            item = DataItem()
            item['dtype'] = 'news'
            item['source'] = '今日头条'
            item['channel'] = 'Search engine'

            item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())                        
            item['pubtime'] = itemdata['datetime']
            if self.tool.old_news(item['pubtime']):
                continue

            item['url'] = itemdata['display_url'].encode('utf8')

            if item['url'].find("html?") > 0 or item['url'].find("htm?") > 0:
                item['url'] = "".join(item['url'].split("?")[0:-1])         
            
            if self.r.exists(item['url']):
                #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                continue

            item['title'] = itemdata['title'].encode('utf8')
            item['medianame'] = itemdata['source'].encode('utf8')
            item['abstract'] = itemdata['abstract'].encode('utf8')
            items.append(item)
            
        return items
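The charset sniffing in parse_content above pulls charset=... out of an http-equiv meta tag and falls back to utf-8 when nothing matches. An isolated sketch of that helper (the sample tag is invented):

import re

def sniff_charset(meta_tags, default='utf-8'):
    #return the first charset=... value found in the meta tags, else the default
    for tag in meta_tags:
        match = re.search(r'charset=(.*?)"', tag)
        if match:
            return match.group(1)
    return default

print sniff_charset(['<meta http-equiv="Content-Type" content="text/html; charset=gb2312">'])
#gb2312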
Esempio n. 26
0
class LuChengBBSSpider(Spider):
    name = "luchengbbs"
    domain_url = "http://www.zjxslm.com/"
    combine_url = ("forum.php?mod=forumdisplay&fid=%d&orderby=lastpost"
                    + "&filter=dateline&dateline=86400")
    
    tool = Utools()
    dc = dataCleaner()
    start_urls = []
    test_hbase = True
    
    def __init__ (self):
        super(LuChengBBSSpider,self).__init__()
        #bind the initial/finalize callbacks to the engine start/stop signals
        dispatcher.connect(self.initial,signals.engine_started)
        dispatcher.connect(self.finalize,signals.engine_stopped)
    
    def initial(self):
        self.log('---started----')
        self.getStartUrl()
        self.r = Redis(host = self.tool.HOST_REDIS1, port = 6379, db = 3)
            
    def finalize(self):
        self.log('---stopped---')
        #persist the crawled urls

    def getStartUrl(self):
        #initialize the start urls from the forum ids
        ids = range(195, 209)
        ids.append(193)
        for idx in ids:
            self.start_urls.append("http://www.zjxslm.com/forum.php?"
                +"mod=forumdisplay&orderby=lastpost"
                +"&filter=dateline&dateline=86400&fid=%d" % idx)

    #an example of returning multiple Requests and Items from a single callback
    def parse(self,response):
        # test the status of hbase and thrift server
        if self.test_hbase:
            try:
                self.htable=HBaseTest(host = self.tool.HOST_HBASE1, 
                                      table = 'origin')
                self.htable.close_trans()
                self.test_hbase = False
            except:
                raise CloseSpider('no thrift or hbase server!')
        
        print '====start %s==' %response.url
        
        #extract and parse the result entries
        items = self.parse_items(response)

        #try to find the next page
        requests = []
        if response.url.find('page') < 0:
            #build an XPath Selector for element extraction
            sel = Selector(response)        
            page_num = sel.xpath('//div[@class="pg"]/label/span')

            if page_num:
                page_num = re.sub("<.*?>", "", page_num.extract()[0])
                page_num = int(re.search("([\d]+)", page_num).group(1))
                for idx in range(2, page_num+1):
                    url = response.url + ("&page=%d" % idx)
                    requests.append(self.make_requests_from_url(url))
                    
        for item in items:
            yield Request(url=item['url'], meta={'item': item}, callback=self.parse_content)
        #return requests
        for request in requests:
            yield request

    def parse_content(self,response):
        item = response.meta['item']
        
        if response.body:
            bsoup = BeautifulSoup(response.body, from_encoding='utf-8')
            item['pubtime'] = bsoup.find_all('div', class_="authi")[1].em.span['title']
            if self.tool.old_news(item['pubtime'][0:-3]):
                return
                
            item['content'] = str(bsoup.find('div', class_='pcb'))
            if item['content']:
                print 'url: ' + item['url'] + ' is added'
                return item

    def parse_items(self,response):
        if response.body:
            bsoup = BeautifulSoup(response.body,from_encoding='utf-8')
        main_content = bsoup.select('div#threadlist')[0]
        if main_content:
            elem_list = main_content.find_all('tbody')
        items = []
        
        if len(elem_list)>0:
            for elem in elem_list:
                item = DataItem()
                item['dtype'] = 'forum'
                item['source'] = '鹿城论坛'
                item['channel'] = 'Search engine'
                
                #grab the thread id to build the detail url
                try:
                    tid = elem['id']
                except:
                    continue
                
                if tid.find('_') < 0:
                    continue
                else:
                    tid = tid.split('_')[1]
                
                item['url'] = self.domain_url + 'thread-' + tid + '-1-1.html'
                if self.r.exists(item['url']):
                    #if self.htable.getRowByColumns(item['url'], ['indexData:url']):
                    continue
                
                item['title'] = elem.find('th').get_text().split('\n')[2]
                item['medianame'] = elem.tr.find('td', class_='by').cite.get_text().replace('\n','')
                item['collecttime'] = time.strftime("%Y-%m-%d %H:%M", time.localtime())
                
                items.append(item)

        return items
                
    def normalize_time(self, time_text):
        time_text = time_text.encode('utf8')
        if re.match('\d{4}.*?\d{1,2}.*?\d{1,2}.*?\d{1,2}:\d{1,2}', time_text):
            time_text = time_text.replace('年'.encode('utf8'), '-').replace('月'.encode('utf8'), '-').replace('日'.encode('utf8'), '')
        else:
            #convert a relative time to a timestamp, then back to a formatted string
            time_digit = float(filter(str.isdigit, time_text))

            interval = 0
            if time_text.find('天'.encode('utf8')) > 0:
                interval = 86400
            elif time_text.find('时'.encode('utf8')) > 0:
                interval = 3600.
            elif time_text.find('分'.encode('utf8')) > 0:
                interval = 60
            elif time_text.find('秒'.encode('utf8')) > 0:
                interval = 1
            else:
                return time_text
            
            time_true = time.time() - time_digit*interval
            time_text = time.strftime("%Y-%m-%d %H:%M", time.localtime(time_true))

        return time_text
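These spiders are presumably launched with scrapy crawl <name> inside the project; for a quick stand-alone experiment they could also be driven from a script with CrawlerProcess, assuming the class (here LuChengBBSSpider from the example above) and its Utools/GetQuery/dataCleaner helpers are importable:

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
process.crawl(LuChengBBSSpider)   #any of the spider classes above would do
process.start()                   #blocks until the crawl finishes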