Example 1
    def parse(self, response):
        '''
        :param response:
        :return:

        Fetch each section's URL and hand it off to parse_pages.
        '''
        #test
        # url = 'http://m.sohu.com/cl/58/?page=1'
        # yield scrapy.Request(url, self.parse_item)
        #end test
        file_name = "log-" + str(datetime.date.today()) + ".txt"
        t_file = codecs.open(file_name, 'ab', encoding='utf-8')
        if self.isFirst:
            self.isFirst = False
            line = str(datetime.datetime.now()) + " " + self.name + " spider start!\n"
            t_file.write(line.decode("unicode_escape"))

        updatetool = UpdateTool()
        self.log('Hi, this is a page! %s' % response.url)
        self.new = 0
        self.pages += 1
        root = bs(response.body.decode('utf-8'))
        div = root.find("div", attrs={"class": "bd3 pb1"})
        lis = div.findAll("p")
        for li in lis:
            item = FenghuoItem()
            iurl = 'm.sohu.com'+li.find("a").get("href")
            title = li.find("a").text
            # fixed-offset slicing: assumes the "w c2" text carries a
            # "YYYY-MM-DD HH" timestamp starting at character 11
            pubdate = root.find('p', attrs={'class': 'w c2'}).text
            year = pubdate[11:15]
            month = pubdate[16:18]
            day = pubdate[19:21]
            hour = pubdate[22:24]
            item_date = datetime.date(int(year), int(month), int(day))
            item['url'] = iurl
            item['title'] = title
            item['pubdate'] = str(item_date)
            item['snatch_time'] = datetime.datetime.now()
            item['topPost'] = 1
            item['site_name'] = '手机搜狐网'
            item['site_url'] = "m.sohu.com/"
            print item
            if (not updatetool.hasUrl(iurl)) and self.now - item_date < self.delay:
                self.new += 1
                self.total_new += 1
                fp.process_item(item, "123")
        url = 'http://m.sohu.com/cl/58/?page='+str(self.pages)
        if self.new > 3 and self.hasNext:
            yield scrapy.Request(url, self.parse)
        else:
            line = str(datetime.datetime.now()) + " Totally crawled " + str(self.total_new) + " items, " + self.name + " spider has finished!\n\n"
            t_file.write(line.decode("unicode_escape"))
        t_file.close()
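
Example 1 pins the publication date to fixed character offsets inside the "w c2" paragraph, which breaks as soon as the surrounding text shifts by a single character. A minimal sketch of a more tolerant extraction, assuming the text carries a YYYY-MM-DD timestamp somewhere (the exact shape of the live page is an assumption):

import re
import datetime

def extract_item_date(pubdate_text):
    # find the first YYYY-MM-DD run anywhere in the text instead of
    # trusting fixed slice positions; return None if the layout changed
    m = re.search(r'(\d{4})-(\d{2})-(\d{2})', pubdate_text)
    if m is None:
        return None
    year, month, day = (int(g) for g in m.groups())
    return datetime.date(year, month, day)

Returning None lets the caller skip a malformed listing entry instead of raising ValueError mid-crawl.
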
Example 2
    def parse(self, response):
        '''
        :param response:
        :return:

        Fetch each section's URL and hand it off to parse_pages.
        '''
        #test
        # url = 'http://www.gs.whu.edu.cn/index.php/index-show-tid-40-p-1.html'
        # yield scrapy.Request(url, self.parse_item)
        #end test
        file_name = "log-" + str(datetime.date.today()) + ".txt"
        t_file = codecs.open(file_name, 'ab', encoding='utf-8')
        if self.isFirst:
            self.isFirst = False
            line = str(datetime.datetime.now()) + " " + self.name + " spider start!\n"
            t_file.write(line.decode("unicode_escape"))

        updatetool = UpdateTool()
        self.log('Hi, this is a page! %s' % response.url)
        self.new = 0
        self.pages += 1
        root = bs(response.body.decode('utf-8'))
        div = root.find("div", attrs={"class": "ulnotice"})
        lis = div.findAll("li")
        for li in lis:
            item = FenghuoItem()
            iurl = li.find("a").get("href")
            if iurl[0:4] != 'http':
                iurl = 'http://gs.whu.edu.cn' + iurl  # make relative hrefs absolute
            title = li.find("a").text
            pubdate = li.find("span").text
            # fixed-offset slicing: assumes a bracketed "[YYYY-MM-DD]" date
            year = pubdate[1:5]
            month = pubdate[6:8]
            day = pubdate[9:11]
            hour = '00'  # the listing shows no time of day
            item_date = datetime.date(int(year), int(month), int(day))
            if (not updatetool.hasUrl(iurl)) and self.now - item_date < self.delay:
                self.new += 1
                self.total_new += 1
                yield scrapy.Request(iurl, self.parse_items)  # request the detail page for this url
        url = 'http://www.gs.whu.edu.cn/index.php/index-show-tid-40-p-'+str(self.pages)+'.html'
        if self.new > 10 and self.hasNext:
            yield scrapy.Request(url, self.parse)
        else:
            line = str(datetime.datetime.now()) + " Totally crawled " + str(self.total_new) + " items, " + self.name + " spider has finished!\n\n"
            t_file.write(line.decode("unicode_escape"))
        t_file.close()
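
Example 2 rebuilds absolute URLs by hand with a string prefix. A minimal sketch of the same normalization using urljoin, which also resolves "../" segments and protocol-relative hrefs (the base URL mirrors the hard-coded prefix above; urlparse is the Python 2 module name this code base runs on):

from urlparse import urljoin

def absolutize(href, base='http://gs.whu.edu.cn'):
    # absolute hrefs pass through untouched; relative ones are
    # resolved against the base
    return urljoin(base, href)
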
Example 3
    def parse(self, response):

        file_name = "log-" + str(datetime.date.today()) + ".txt"
        t_file = codecs.open(file_name, 'ab', encoding='utf-8')
        if self.isFirst:
            self.isFirst = False
            line = str(datetime.datetime.now()) + " " + self.name + " spider start!\n"
            t_file.write(line.decode("unicode_escape"))

        updatetool = UpdateTool()

        self.log('Hi, this is a page! %s' % response.url)
        self.new = 0
        self.pages += 1
        root = bs(response.body)
        div = root.find("div", "fallsFlow")
        lis = div.findAll("li")
        for li in lis:
            iurl = li.find("a").get("href")
            pubdate = li.find("h6").text
            # fixed-offset slicing: assumes the text starts with "YYYY-MM-DD"
            year = pubdate[0:4]
            month = pubdate[5:7]
            day = pubdate[8:10]
            item_date = datetime.date(int(year), int(month), int(day))

            # request the detail page for this url
            yield scrapy.Request(iurl, self.parse_items)

            # the request above is issued unconditionally; this gate only
            # counts new items for the pagination decision below
            if (not updatetool.hasUrl(iurl)) and self.now - item_date < self.delay:
                self.new += 1
                self.total_new += 1
        if self.pages >= 2:
            url = 'http://china.huanqiu.com/local/' + str(self.pages) + '.html'
        else:
            url = 'http://china.huanqiu.com/local/index.html'
        if self.new > 10 and self.hasNext:
            yield scrapy.Request(url, self.parse)  # follow the next listing page
        else:
            line = str(datetime.datetime.now()) + " Totally crawled " + str(self.total_new) + " items, " + self.name + " spider has finished!\n\n"
            t_file.write(line.decode("unicode_escape"))
        t_file.close()
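
Every parse method in this listing repeats the same two-part gate: the URL must be unseen by UpdateTool, and the item must fall inside the rolling freshness window. A minimal sketch of that gate factored into one helper (the parameters mirror the spider attributes used above):

def is_new(updatetool, iurl, item_date, now, delay):
    # unseen URL, and published inside the rolling freshness window
    return (not updatetool.hasUrl(iurl)) and (now - item_date < delay)

# usage inside parse:
#     if is_new(updatetool, iurl, item_date, self.now, self.delay): ...

Note that Example 2 only requests pages that pass this gate, while Examples 3 and 4 request every listed URL and use the gate solely to count new items for the pagination decision.
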
Example 4
    def parse(self, response):
        '''
        :param response:
        :return:

        Extract the detailed information of the news pages.
        '''
        #test
        # url = 'http://www.xcxww.com/content-19-3751-1.html'
        # yield scrapy.Request(url, self.parse_item)
        #end test
        file_name = "log-" + str(datetime.date.today()) + ".txt"
        t_file = codecs.open(file_name, 'ab', encoding='utf-8')
        if self.isFirst:
            self.isFirst = False
            line = str(datetime.datetime.now()) + " " + self.name + " spider has started!\n"
            t_file.write(line.decode("unicode_escape"))

        updatetool = UpdateTool()
        self.log('Hi, this is a page! %s' % response.url)

        self.new = 0
        self.pages += 1
        root = bs(response.body.decode('gb2312'))
        codeTexts = root.find("script", attrs={"language": "JavaScript"}).text

        # pull maxPage out of the inline script; the fixed offsets assume
        # the assignment is followed by a four-character page number
        maxPage = codeTexts[codeTexts.index("maxPage") + 10:codeTexts.index("maxPage") + 14]
        maxPage = int(maxPage)  # the next-page arithmetic below needs an int
        div = root.find("div", attrs={"class": "new-article"})
        lis = div.findAll("div", attrs={"class": "article-list"})

        for li in lis:
            iurl = li.find("h3").contents[1].get("href")
            title = li.find("h3").contents[1].text
            pubdate = li.find("div", attrs={"class": "time"}).text
            pubdate = pubdate[pubdate.index(" ") + 1:]
            year = pubdate[0:4]
            month = pubdate[5:7]
            day = pubdate[8:10]
            item_date = datetime.date(int(year), int(month), int(day))
            # request the detail page for this url
            yield scrapy.Request(iurl, self.parse_items)
            # as in Example 3, the request is issued unconditionally; the
            # gate only counts new items for the pagination decision below
            if (not updatetool.hasUrl(iurl)) and self.now - item_date < self.delay:
                self.new += 1
                self.total_new += 1

        url = 'http://news.sohu.com/guoneixinwen_' + str(maxPage - self.pages + 1) + '.shtml'
        print url
        if self.new > 10 and self.hasNext:
            yield scrapy.Request(url, self.parse)
        else:
            line = str(datetime.datetime.now()) + " Totally crawled " + str(self.total_new) + " items, " + self.name + " spider has finished!\n\n"
            t_file.write(line.decode("unicode_escape"))
        t_file.close()
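
Example 4 reads maxPage out of the inline JavaScript with fixed character offsets, which fails whenever the page count is not exactly four characters wide. A minimal sketch of the same extraction with a regex (the "maxPage = NNN" assignment shape is an assumption about the page's script):

import re

def extract_max_page(code_text, default=1):
    # match the assignment wherever it sits in the script,
    # whatever the digit count
    m = re.search(r'maxPage\s*=\s*(\d+)', code_text)
    return int(m.group(1)) if m else default
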
Example 5
from BeautifulSoup import BeautifulSoup as bs
from BeautifulSoup import Comment  # used to remove comments in text
from fenghuo.items import FenghuoItem
import HTMLParser  # used to parse HTML
import requests
import scrapy
import codecs
from scrapy.spiders import CrawlSpider
from com.crawler.dao.UpdateTool import UpdateTool
import datetime, calendar

root_domain = "http://news.sohu.com/"
import sys
reload(sys)
sys.setdefaultencoding('utf-8')

from fenghuo.pipelines import FenghuoPipeline as FP
fp = FP()
updatetool = UpdateTool()


class SoHuSpider(CrawlSpider):
    name = "sohus"
    allowed_domains = ["news.sohu.com"]

    start_urls = ['http://news.sohu.com/guoneixinwen.shtml']
    new = 11  # start above the "> 10" threshold so the first page always continues
    total_new = 0
    url = start_urls[0]
    hasNext = True
    now = datetime.date.today()
    delay = datetime.timedelta(days=3)
    pages = 1
    isFirst = True
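
Example 5 is the module header the earlier parse methods hang off: the shared pipeline, dedup tool, and the spider's pagination state all live at module or class level. With the project installed, the spider runs as "scrapy crawl sohus"; a minimal sketch of driving it from a plain script instead (the user-agent value is a placeholder):

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess({'USER_AGENT': 'Mozilla/5.0'})
process.crawl(SoHuSpider)
process.start()  # blocks until the crawl finishes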