Example #1

 def parse(self, response):
     nodelist = response.xpath('//tbody/tr/th')  # all posts on this page
     item = BaiduspiderItem()
     isHasContent = False  # whether this page holds suitable content
     NextPageUrl = ''
     timecount = 0  # counter
     for node in nodelist:  # examine each post
         childUrl = node.xpath(
             "./a[2][@class='s xst']/@href").extract_first()
         item["title"] = node.xpath(
             "./a[2][@class='s xst']/text()").extract_first()
         item["UrlId"] = node.xpath(
             "./a[2][@class='s xst']/@href").extract_first()
         if childUrl is not None:
             item["info"] = ChildPage.ChildPage(childUrl, '1')
         item["time"] = node.xpath(
             './a[2]/../../td[@class="by"]/em/span/text()').extract_first()
         if item["time"] == None:
             item["time"] = node.xpath(
                 './a[2]/../../td[@class="by"]/em/span/span/text()'
             ).extract_first()
         #处理时间为空的情况
         if item["time"] == None:
             item["time"] = ''
         else:
             item["time"] = item["time"].strip()
             item["time"] = TimeCalculate.time_calculate(
                 item["time"], self.name)
         # # handle missing info
         # if item["info"] == None:
         #     item["info"] = ''
         # check whether this post is within the time window
         if TimeMarch.time_March(item["time"], self.default_scope_day):
             item["IsLimitedTime"] = 'y'
         else:
             item["IsLimitedTime"] = 'n'
             timecount = timecount + 1
         if NextPageUrl == '':  # record the next-page link
             NextPageUrl = response.xpath(
                 '//a[@class="bm_h"]/@rel').extract_first()
         if item["UrlId"] is not None:  # guard against non-regular posts (pinned/abnormal threads)
             yield item  # hand the item to the pipeline
     if (timecount > self.allowed_timesup
             or NextPageUrl is None):  # continue crawling or stop
         # stop crawling
         item = BaiduspiderItem()
         item["IsLimitedTime"] = 'n'
         yield item
     else:
         yield scrapy.Request('http://www.huhutong315.com/' + NextPageUrl,
                              callback=self.parse)
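
The TimeCalculate and TimeMarch helpers called throughout these examples are never shown. A minimal sketch of what the two functions might look like, assuming time_calculate normalizes a raw forum timestamp to YYYY-MM-DD and time_March tests whether a date falls within the last default_scope_day days; both behaviors are inferred from the call sites, not confirmed:

# Hypothetical sketches of the two time helpers; the real modules are not shown.
import datetime

def time_calculate(raw_time, spider_name):
    # Normalize a raw timestamp such as '2019-9-23' to 'YYYY-MM-DD'.
    # spider_name is accepted because the callers pass self.name.
    try:
        parsed = datetime.datetime.strptime(raw_time.strip(), '%Y-%m-%d')
    except ValueError:
        return ''
    return parsed.strftime('%Y-%m-%d')

def time_March(date_str, scope_day):
    # True when date_str lies within the last scope_day days.
    try:
        date = datetime.datetime.strptime(date_str, '%Y-%m-%d')
    except ValueError:
        return False
    return (datetime.datetime.now() - date).days <= int(scope_day)
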
Example #2

    def parse(self, response):
        nodelist = response.xpath('//tbody/tr/th')  # all posts on this page
        item = BaiduspiderItem()
        isHasContent = False  # whether this page holds suitable content
        NextPageUrl = ''
        timecount = 0  # counter

        for node in nodelist:  # examine each post
            # first check whether the post is within the time limit
            item["time"] = node.xpath(
                './a[2]/../../td[2]/em//text()').extract_first()
            # handle missing time
            if item["time"] is None:
                item["time"] = ''
            item["time"] = item["time"].strip()
            item["time"] = TimeCalculate.time_calculate(
                item["time"], self.name)
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsLimitedTime"] = 'y'
            else:
                item["IsLimitedTime"] = 'n'
                timecount = timecount + 1
            item["title"] = node.xpath(
                "./a[2][@class='s xst']/text()").extract_first()
            item["UrlId"] = node.xpath(
                "./a[2][@class='s xst']/@href").extract_first()
            if (item["IsLimitedTime"] == 'y'):  #如果符合时间限制的话
                childUrl = node.xpath(
                    "./a[2][@class='s xst']/@href").extract_first()
                if (childUrl != None):  #判断是否已经爬过,决定是否访问子页面
                    id = item['UrlId'].split('/')[3]  # 得到urlid
                    num = id.split('-')[1]
                    if num not in self.idlist:
                        item["info"] = ChildPage.ChildPage(childUrl, '3')
                    else:
                        print("已经爬过")

            if NextPageUrl == '':  # record the next-page link
                NextPageUrl = response.xpath(
                    '//a[@class="bm_h"]/@rel').extract_first()
            if item["UrlId"] is not None:  # guard against non-regular posts (pinned/abnormal threads)
                yield item  # hand the item to the pipeline
        if (timecount > self.allowed_timesup
                or NextPageUrl is None):  # continue crawling or stop
            # stop crawling
            item = BaiduspiderItem()
            item["IsLimitedTime"] = 'n'
            yield item
        else:
            yield scrapy.Request('https://www.wszgw.net/' + NextPageUrl,
                                 callback=self.parse)
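
Example #2 skips child pages whose numeric id already appears in self.idlist, but never shows where that set comes from. One plausible way to populate it at spider start, assuming crawled ids are persisted one per line in a text file (file name and format are assumptions):

# Hypothetical loader for self.idlist; the persistence format is assumed.
def load_idlist(path='crawled_ids.txt'):
    try:
        with open(path, encoding='utf-8') as f:
            return {line.strip() for line in f if line.strip()}
    except FileNotFoundError:
        return set()  # first run: nothing crawled yet
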
Example #3

 def parse(self, response):
     nodelist = response.xpath('//tbody/tr')  # all posts on this page
     item = BaiduspiderItem()
     item = inititem(item)
     isHasContent = False  # whether this page holds suitable content
     NextPageUrl = ''
     timecount = 0  # counter
     for node in nodelist:  # examine each post
         item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
         childUrl = node.xpath("./th/a[2][@class='s xst']/@href").extract_first()
         item["title"]= node.xpath("./th/a[2][@class='s xst']/text()").extract_first()
         item["url"] = node.xpath("./th/a[2][@class='s xst']/@href").extract_first()
         item["comment"] = node.xpath("./td[@class='num']/a/text()").extract_first()
         item["read"] = node.xpath("./td[@class='num']/em/text()").extract_first()
         item["latestcomtime"] = node.xpath("./td[4]/em/a/span/@title | ./td[4]/em/a/text()").extract_first()
         if childUrl is not None:
             item["info"] = ChildPage.ChildPage(childUrl, '1')
         item["time"] = node.xpath('./th/a[2]/../../td[@class="by"]/em/span/text()').extract_first()
         if item["time"] is None:
             item["time"] = node.xpath('./th/a[2]/../../td[@class="by"]/em/span/span/text()').extract_first()
         # handle missing time
         if item["time"] is None:
             item["time"] = ''
         else:
             item["time"] = item["time"].strip()
             item["time"] = TimeCalculate.time_calculate(item["time"], self.name)
         # # handle missing info
         # if item["info"] == None:
         #     item["info"] = ''
         # check whether this post is within the time window
         if TimeMarch.time_March(item["time"], self.default_scope_day):
             item["IsFilter"] = True
         else:
             item["IsFilter"] = False
             timecount = timecount + 1
         if NextPageUrl == '':  # record the next-page link
             NextPageUrl = response.xpath('//a[@class="bm_h"]/@rel').extract_first()
         if item["url"] is not None:  # guard against non-regular posts (pinned/abnormal threads)
             item['urlId'] = item['url'].split('/')[3].split('-')[1]  # extract the numeric id
             item["urlId"] = '%s_%s' % (self.name, item["urlId"])
             yield item  # hand the item to the pipeline
     if timecount > self.allowed_timesup or NextPageUrl is None:  # continue crawling or stop
         # stop crawling
         item = BaiduspiderItem()
         item["IsFilter"]=False
         yield item
     else:
         yield scrapy.Request('http://www.huhutong315.com/' + NextPageUrl, callback=self.parse)
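
The urlId in Example #3 is cut out of a Discuz-style thread link with two splits. A worked illustration (the concrete URL shape is an assumption):

# Illustration only; real hrefs on the site may differ.
url = 'http://www.huhutong315.com/thread-12345-1-1.html'
url_id = url.split('/')[3].split('-')[1]  # 'thread-12345-1-1.html' -> '12345'
url_id = '%s_%s' % ('huhutong', url_id)   # prefixed with the spider name (name assumed)
print(url_id)                             # huhutong_12345
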
Example #4

 def parse(self, response):
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     timecount = 0  # counter
     try:
         item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.localtime(time.time()))
         item["title"] = response.xpath(
             "//div[@class='cont']/h2/text()").extract()
         item["title"] = "".join(item["title"])
         item["url"] = response.url
         item["urlId"] = item["url"].split('/')[-1].split('.')[0]
         item["urlId"] = '%s_%s' % (self.name, item["urlId"])
         item["time"] = response.xpath(
             "//table[@class='s1']/tbody/tr[2]/td[2]/text()").extract_first(
             )
         item["info"] = response.xpath(
             "//p[@class='MsoNormal']/text()").extract()
         item["info"] = "".join(item["info"])
         # check whether this post is within the time window
         if TimeMarch.time_March(item["time"], self.default_scope_day):
             item["IsFilter"] = True
         else:
             item["IsFilter"] = False
             timecount = timecount + 1
     except Exception:
         item['IsFilter'] = False
     yield item
     self.crawler.engine.close_spider(self, 'Finished')  # close the spider
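
Example #4 ends the crawl by reaching into self.crawler.engine, which is internal Scrapy API. The documented way to stop from inside a callback is to raise CloseSpider; a minimal sketch:

# Documented alternative to calling the engine directly.
from scrapy.exceptions import CloseSpider

def parse(self, response):
    if not response.xpath("//div[@class='cont']"):  # nothing left to parse
        raise CloseSpider('Finished')
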
Example #5

 def parse(self, response):
     nodelist = response.xpath(
         "//div[@class = 'search-news-mod']")  # all posts on this page
     nodelist = [] if nodelist is None else nodelist
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     timecount = 0  # counter
     for node in nodelist:  # examine each post
         try:
             item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                time.localtime(time.time()))
             item["title"] = node.xpath("./h1/a/text()").extract_first()
             item["url"] = node.xpath("./h1/a/@href").extract_first()
             item["urlId"] = item["url"].split('.')[0].split('/')[-1]
             item["urlId"] = '%s_%s' % (self.name, item["urlId"])
             item["time"] = node.xpath("./p[2]/text()").extract_first()
             item["info"] = node.xpath("./p[1]/text()").extract_first()
             # item["info"] = node.xpath("./p[1]/text()").extract_first()
             # 判断这个帖子是否符合时间
             if TimeMarch.time_March(item["time"], self.default_scope_day):
                 item["IsFilter"] = True
             else:
                 item["IsFilter"] = False
                 timecount = timecount + 1
         except Exception:
             item['IsFilter'] = False
         yield item
     self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #6

 def child_page(self, response):
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     try:
         item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.localtime(time.time()))
         item["title"] = response.xpath(
             "//tr/td/font/text()").extract_first()
         item["url"] = response.url
         item["urlId"] = item["url"].split('id=')[1]
         item["urlId"] = '%s_%s' % (self.name, item["urlId"])
         item['time'] = response.xpath(
             "//table[@class='dx']/tr/td/text()").extract_first()
         item['time'] = '%s日' % item['time'].split('发布时间:')[1].split('日')[0]  # re-append '日' so strptime below can parse it
         item['info'] = response.xpath(
             "//div/span/text() | //div/p/span/text()").extract()
         item["info"] = ("".join(item["info"])).replace('\xa0', '').replace(
             '\r\n', '')
         try:
             # check whether this post is within the time window
             item['time'] = time.strftime(
                 "%Y-%m-%d", time.strptime(item['time'], "%Y年%m月%d日"))
             if TimeMarch.time_March(item['time'], self.default_scope_day):
                 item['IsFilter'] = True
         except Exception:
             item['IsFilter'] = False
     except Exception:
         item['IsFilter'] = False
     yield item
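
Example #6 re-appends the '日' marker so the date can be parsed with the '%Y年%m月%d日' format. A standalone illustration of that strptime/strftime round trip with a sample value:

import time

raw = '2019年9月23日'  # sample value, for illustration only
iso = time.strftime('%Y-%m-%d', time.strptime(raw, '%Y年%m月%d日'))
print(iso)  # 2019-09-23
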
Example #7

 def parse(self, response):
     print(response)
     nodelist = response.xpath(
         "//td[@class = 'td_left30_right30']/table/tr/td")  # all posts on this page
     nodelist = [] if nodelist is None else nodelist
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     timecount = 0  # counter
     for node in nodelist:  # examine each post
         try:
             item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                time.localtime(time.time()))
             item["title"] = node.xpath(
                 "./span[@class = 'blue14bold']/a/text()").extract_first()
             item["url"] = node.xpath(
                 "./span[@class = 'blue14bold']/a/@href").extract_first()
             item["urlId"] = item["url"].split('_')[-1].split('.')[0]
             item["urlId"] = '%s_%s' % (self.name, item["urlId"])
             item["time"] = node.xpath(
                 "./span[@class = 'black12bold']/text()").extract_first()
             item["time"] = item["time"].split(' ')[1].split('\n')[
                 0]  # 换行符表示
             item["time"] = time.strftime(
                 "%Y-%m-%d",
                 time.strptime(item["time"].split(' ')[0], "%Y.%m.%d"))
             # 判断这个帖子是否符合时间
             if TimeMarch.time_March(item["time"], self.default_scope_day):
                 item["IsFilter"] = True
             else:
                 item["IsFilter"] = False
                 timecount = timecount + 1
             res_child = child_page(item["url"])
             item["info"] = res_child.xpath(
                 "//div[@class='cas_content']/p/text() | //div[@class='Custom_UnionStyle']/p/text() | //div[@class='Custom_UnionStyle']/span/p/text() |//div[@class='TRS_Editor']/div/div/p/text() | //div[@class='Custom_UnionStyle']/div/span/text()"
             )  # article markup varies from page to page
             item["info"] = "".join(item["info"])
         except Exception:
             item['IsFilter'] = False
         yield item
     if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
         keyword = response.url.split('searchword=')[1].split('&')[0]
         page_num = response.url.split('page=')[-1]
         print('\n*** page %s ***\n' % page_num)
         page_num = int(page_num) + 1
         NextPageUrl = "http://was.jl.gov.cn/was5/web/search?presearchword=&searchword1=&channelid=193132&StringEncoding=UTF-8&searchword=%s&page=%s" % (
             keyword, str(page_num))
         print(NextPageUrl)
         yield scrapy.Request(NextPageUrl,
                              callback=self.parse,
                              dont_filter=True)
     else:
         self.crawler.engine.close_spider(self, 'Finished')  # close the spider
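
From Example #7 onward the spiders fetch detail pages synchronously through a child_page helper that is never shown. A minimal sketch, assuming it wraps requests and returns an lxml tree, so that xpath('...text()') yields plain strings as the ''.join() calls above require:

# Hypothetical child_page helper; the original implementation is not shown.
import requests
from lxml import etree

def child_page(url):
    resp = requests.get(url, timeout=10)
    resp.encoding = resp.apparent_encoding  # many of these sites serve GBK
    return etree.HTML(resp.text)  # xpath('...text()') then returns a list of str
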
Example #8

    def parse(self, response):
        nodelist = response.xpath(
            "//div[@style='background:#FFF;padding:5px;width:100%']"
        )  # all posts on this page
        nodelist = [] if nodelist is None else nodelist
        item = BaiduspiderItem()
        item = inititem(item)
        # whether this item meets the crawl criteria
        item['IsFilter'] = False
        timecount = 0  # counter
        for node in nodelist:  # examine each post
            try:
                item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                   time.localtime(time.time()))
                item["title"] = node.xpath("./a/text()").extract()
                item["title"] = "".join(item["title"])
                item["url"] = node.xpath("./a/@href").extract_first()
                item["url"] = 'http://gdj.ah.gov.cn/%s' % item["url"]
                item["urlId"] = item["url"].split('id=')[-1]
                item["urlId"] = '%s_%s' % (self.name, item["urlId"])
                item["time"] = node.xpath(
                    "./p[2]/span[2]/text()").extract_first()
                if item["time"] is not None:
                    item["time"] = str(item["time"]).replace(' ', '')
                    item["time"] = item["time"][0:10]
                # item["time"] = item["time"][0].split(' ')[0]
                # 判断这个帖子是否符合时间
                if TimeMarch.time_March(item["time"], self.default_scope_day):
                    item["IsFilter"] = True
                else:
                    item["IsFilter"] = False
                    timecount = timecount + 1
                res_child = child_page(item["url"])
                print(res_child)
                item["info"] = res_child.xpath(
                    "//div[@id = 'Zoom']/p/text() | //div[@id = 'Zoom']/p/font/text() | //div[@id = 'Zoom']/text() | //div[@id = 'Zoom']/font/text() | //div[@id = 'Zoom']/p/span/text()"
                )
                item["info"] = "".join(item["info"])
            except Exception:
                item['IsFilter'] = False

            yield item
        if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
            keyword = response.url.split('keycontent=')[1].split('&')[0]
            page_num = response.url.split('StartPage=')[1]
            page_num = int(page_num) // 15  # 15 results per page
            print('\n*** page %s ***\n' % page_num)
            page_num = page_num * 15 + 15  # offset of the next page
            NextPageUrl = "http://gdj.ah.gov.cn/isearch.php?keytype=1&keycontent=%s&StartPage=%s" % (
                keyword, str(page_num))
            print(NextPageUrl)
            yield scrapy.Request(NextPageUrl, callback=self.parse)
        else:
            self.crawler.engine.close_spider(self, 'Finished')  # close the spider
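
The site in Example #8 pages by record offset rather than page number: the spider divides StartPage by the page size for the log banner, then multiplies back and adds one page for the next request. A worked illustration, assuming 15 results per page as the code implies:

start_offset = 30                   # offset taken from the current URL
page_index = start_offset // 15     # -> 2, logged as the page number
next_offset = page_index * 15 + 15  # -> 45, offset of the following page
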
Example #9

    def parse(self, response):
        nodelist = response.xpath(
            "//div[@class='jsearch-result-box']")  # all posts on this page
        nodelist = [] if nodelist is None else nodelist
        item = BaiduspiderItem()
        item = inititem(item)
        # whether this item meets the crawl criteria
        item['IsFilter'] = False
        timecount = 0  # counter
        for node in nodelist:  # examine each post
            try:
                item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                   time.localtime(time.time()))
                item["url"] = node.xpath(
                    "./div/div/div[@class='jsearch-result-url']/a/text()"
                ).extract_first()
                item["urlId"] = item["url"].split('/')[-1].split('.')[0]
                item["urlId"] = '%s_%s' % (self.name, item["urlId"])
                item["time"] = node.xpath(
                    "./div/div/span[@class='jsearch-result-date']/text()"
                ).extract_first()
                item["time"] = item["time"].split(' ')[0]
                item["time"] = time.strftime(
                    "%Y-%m-%d",
                    time.strptime(item["time"].split(' ')[0], "%Y年%m月%d日"))
                # check whether this post is within the time window
                if TimeMarch.time_March(item["time"], self.default_scope_day):
                    item["IsFilter"] = True
                else:
                    item["IsFilter"] = False
                    timecount = timecount + 1
                res_child = child_page(item["url"])
                item["info"] = res_child.xpath("//p/text()")
                item["info"] = "".join(item["info"])
                item["title"] = res_child.xpath("//td[@class='title']/text()")
                item["title"] = "".join(item["title"])
                item["title"] = item["title"].replace(' ', '')
                item["title"] = item["title"].replace('\r', '')
                item["title"] = item["title"].replace('\n', '')
            except Exception:
                item['IsFilter'] = False

            yield item
        if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
            keyword = response.url.split('q=')[1]
            page_num = response.url.split('p=')[1].split('&')[0]
            print('\n*** page %s ***\n' % page_num)
            page_num = int(page_num) + 1
            NextPageUrl = "http://www.nrta.gov.cn/jrobot/search.do?webid=1&pg=12&p=%s&tpl=&category=&q=%s" % (
                str(page_num), keyword)
            print(NextPageUrl)
            yield scrapy.Request(NextPageUrl, callback=self.parse)
        else:
            self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #10

 def parse(self, response):
     html = response.text
     html = str(html)
     docs = html.split("DOCPUBURL")[1:]
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     timecount = 0  # counter
     for doc in docs:
         item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.localtime(time.time()))
         url = doc.split(':\"', 1)[1].split('\"', 1)[0]
         doc = doc.split("DOCPUBTIME", 1)[1]
         create_time = doc.split(':\"', 1)[1].split('\"', 1)[0]
         doc = doc.split("DOCID", 1)[1]
         doc_id = doc.split(':', 1)[1].split(',', 1)[0]
         doc = doc.split("DOCTITLE", 1)[1]
         title = doc.split(':\"', 1)[1].split('\"', 1)[0]
         title = title.replace('<em>', '').replace('</em>', '')
         item["title"] = title
         item["url"] = url
         item["urlId"] = '%s_%s' % (self.name, doc_id)
         item["time"] = create_time.replace(' ', '')[0:10]
         print(item["time"])
         if TimeMarch.time_March(item["time"], self.default_scope_day):
             item["IsFilter"] = True
         else:
             item["IsFilter"] = False
             timecount = timecount + 1
         res_child = child_page(item["url"])
         item["info"] = res_child.xpath(
             "//font[@id='Zoom']/div/span/text() | //font[@id='Zoom']//span/text() | //font[@id='Zoom']//p/text()"
         )
         item["info"] = "".join(item["info"])
         yield item
     if (len(docs) != 0) and (timecount < self.allowed_timesup):
         keyword = response.url.split('keyword=')[1].split('&')[0]
         page_num = response.url.split('pageNumber=')[1].split('&')[0]
         page_num = int(page_num) + 1
         print('\n*** page %s ***\n' % str(page_num))
         NextPageUrl = "http://gbdsj.guizhou.gov.cn/57/front/search.jhtml?code=c10a0a56f987453cb15e6a1fe45f7b8&keyword=" + str(
             keyword
         ) + "&pageNumber=" + str(
             page_num
         ) + "&filterParam=typename%3A1%3BsiteName%3A50&timeScope=+&orderBy=time&_=1569230227733"
         print(NextPageUrl)
         yield scrapy.Request(NextPageUrl,
                              callback=self.parse,
                              dont_filter=True)
     else:
         self.crawler.engine.close_spider(self, 'Finished')  # close the spider
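
Example #10 slices fields out of the raw response with string splits. Since the payload looks like JSON, decoding it with json.loads would be more robust wherever the body parses cleanly; a hedged sketch (the container key 'datas' and the exact field layout are guesses based on the split markers above):

import json

def parse_docs(response_text):
    # Assumes a JSON body whose records carry DOCPUBURL / DOCPUBTIME /
    # DOCID / DOCTITLE keys, as the string splitting above implies.
    data = json.loads(response_text)
    for doc in data.get('datas', []):  # container key is a guess
        yield {
            'url': doc.get('DOCPUBURL', ''),
            'time': str(doc.get('DOCPUBTIME', '')).replace(' ', '')[:10],
            'id': doc.get('DOCID'),
            'title': str(doc.get('DOCTITLE', '')).replace('<em>', '').replace('</em>', ''),
        }
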
Example #11

    def parse(self, response):
        nodelist = response.xpath(
            "//div[@class='wr_body_type1 cont2']//li")  # all posts on this page
        nodelist = [] if nodelist is None else nodelist
        item = BaiduspiderItem()
        item = inititem(item)
        # whether this item meets the crawl criteria
        item['IsFilter'] = False
        timecount = 0  # counter
        for node in nodelist:  # examine each post
            try:
                item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                   time.localtime(time.time()))
                item["url"] = node.xpath("./a/@href").extract_first()
                item["urlId"] = item["url"].split('_')[-1].split('.')[0]
                item["urlId"] = '%s_%s' % (self.name, item["urlId"])
                item["title"] = node.xpath("./a/text()").extract_first()
                # item["time"] = time.strftime("%Y-%m-%d", time.strptime(item["time"].split(' ')[0], "%Y年%m月%d日"))
                res_child = child_page(item["url"])
                item["info"] = res_child.xpath(
                    "//div[@id = 'z']/p//span/text()")
                item["info"] = "".join(item["info"])
                item["time"] = res_child.xpath(
                    "//td[@align='center']/span/text()")
                item["time"] = "".join(item["time"])
                item["time"] = item["time"].split(':')[-1]
                item["time"] = item["time"].replace('年', '-')
                item["time"] = item["time"].replace('月', '-')
                item["time"] = item["time"].replace('日', '')
                # 判断这个帖子是否符合时间
                if TimeMarch.time_March(item["time"], self.default_scope_day):
                    item["IsFilter"] = True
                else:
                    item["IsFilter"] = False
                    timecount = timecount + 1
            except Exception:
                item['IsFilter'] = False

            yield item
        if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
            keyword = response.url.split('searchword=')[1].split('&')[0]
            page_num = response.url.split('page=')[1]
            print('\n*** page %s ***\n' % page_num)
            page_num = int(page_num) + 1
            NextPageUrl = "http://gdj.nx.gov.cn/was5/web/search?searchword=%s&channelid=244757&page=%s" % (
                keyword, str(page_num))
            print(NextPageUrl)
            yield scrapy.Request(NextPageUrl,
                                 callback=self.parse,
                                 dont_filter=True)
        else:
            self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #12

 def parse_html(self, response):
     item = BaiduspiderItem()
     html = json.loads(response.text)
     if html:
         results = html['data']
         for result in results:
             try:
                 item['img_url'] = result['objURL']
                 yield item
             except KeyError:  # some records lack objURL
                 pass
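
Example #12 indexes html['data'] directly, which raises KeyError whenever the key is missing and relies on the bare except to swallow it. A slightly more defensive variant with the same output when the key is present (BaiduspiderItem comes from the project's items module, as in the examples above):

import json

def parse_html(self, response):
    payload = json.loads(response.text)
    for result in ((payload or {}).get('data') or []):
        img_url = result.get('objURL')
        if img_url:
            item = BaiduspiderItem()  # from the project's items module
            item['img_url'] = img_url
            yield item
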
Example #13

    def parse(self, response):
        nodelist = response.xpath("//div[@align='left']")  # all posts on this page
        nodelist = [] if nodelist is None else nodelist
        item = BaiduspiderItem()
        item = inititem(item)
        # whether this item meets the crawl criteria
        item['IsFilter'] = False
        timecount = 0  # counter
        pagecount = 0
        for node in nodelist:  # examine each post
            try:
                item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                item["url"] = node.xpath("./a[1]/@href").extract_first()
                item["url"] = 'http://gd.shandong.gov.cn%s'%item["url"]
                item["urlId"] = item["url"].split('articles/')[-1].split('/')[0]
                item["urlId"] = '%s_%s' % (self.name, item["urlId"])
                item["time"] = node.xpath("./font[@class='filterTime']/text()").extract_first()
                item["time"] = "".join(item["time"])
                # item["time"] = time.strftime("%Y-%m-%d", time.strptime(item["time"].split(' ')[0], "%Y年%m月%d日"))
                # 判断这个帖子是否符合时间
                if TimeMarch.time_March(item["time"],self.default_scope_day):
                    item["IsFilter"] = True
                else:
                    item["IsFilter"] = False
                    timecount = timecount + 1
                res_child = child_page(item["url"])
                item["title"] = res_child.xpath("//h1[@class='title']/text() | //div[@class='editor-content editor-content-nweview']/p/text() | //div[@class='editor-content editor-content-nweview']/p/font/text() | //div[@class='editor-content editor-content-nweview']/p")
                item["info"] = res_child.xpath("//p[@class='MsoNormal']/span/text() | //font/text()  | //div[@class='editor-content editor-content-nweview']//p/text()")
                item["info"] = "".join(item["info"])
                pub_time = res_child.xpath("//div[@class='content content-view']/p[1]/span[1]/text()")
                if pub_time:  # xpath returns a list; empty means no publish time found
                    item["time"] = pub_time
                    item["time"] = "".join(item["time"])
                    item["time"] = str(item["time"])[0:10]
                title = item["title"][0]
                item["title"] = title
            except Exception:
                item['IsFilter'] = False

            yield item
        if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
            keyword = response.url.split('content=')[1].split('&')[0]
            page_num = response.url.split('currentpage=')[-1]
            print('\n*** page %s ***\n' % page_num)
            page_num = int(page_num) + 1
            NextPageUrl = "http://gd.shandong.gov.cn/gentleCMS/cmssearch/search.do?siteId=224c56cd-948a-4ac8-95bf-a44822be2f09&content=%s&currentpage=%s" % (keyword, str(page_num))
            print(NextPageUrl)
            if page_num < 6:
                yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
            else:
                self.crawler.engine.close_spider(self, 'Finished')  # close the spider
        else:
            self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #14

    def parse(self, response):
        nodelist = response.xpath("//div[@class='xwd']")  # all posts on this page
        nodelist = [] if nodelist is None else nodelist
        item = BaiduspiderItem()
        item = inititem(item)
        # whether this item meets the crawl criteria
        item['IsFilter'] = False
        timecount = 0  # counter
        for node in nodelist:  # examine each post
            try:
                item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                   time.localtime(time.time()))
                item["url"] = node.xpath("./div[1]/a/@href").extract_first()
                item["url"] = 'http://gdj.shaanxi.gov.cn/%s' % item["url"]
                item["urlId"] = item["url"].split('/')[-1].split('.')[0]
                item["urlId"] = '%s_%s' % (self.name, item["urlId"])
                item["time"] = node.xpath("./div[2]/text()").extract_first()
                item["time"] = "".join(item["time"])
                item["time"] = item["time"].split("时间:")[-1]
                # item["time"] = time.strftime("%Y-%m-%d", time.strptime(item["time"].split(' ')[0], "%Y年%m月%d日"))
                # 判断这个帖子是否符合时间
                if TimeMarch.time_March(item["time"], self.default_scope_day):
                    item["IsFilter"] = True
                else:
                    item["IsFilter"] = False
                    timecount = timecount + 1
                res_child = child_page(item["url"])
                item["title"] = res_child.xpath(
                    "//div[@class='article-title']/h1/text()")
                item["title"] = "".join(item["title"])
                item["info"] = res_child.xpath(
                    "//div[@class='v_news_content']/p/font/text()")
                item["info"] = "".join(item["info"])
            except Exception:
                item['IsFilter'] = False

            yield item
        if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
            keyword = response.url.split('keyword=')[1]
            page_num = response.url.split('currentnum=')[1].split('&')[0]
            print('\n*** page %s ***\n' % page_num)
            page_num = int(page_num) + 1
            NextPageUrl = "http://gdj.shaanxi.gov.cn/chaxunjieguo.jsp?wbtreeid=1001&searchScope=0&currentnum=%s&keyword=%s" % (
                str(page_num), keyword)
            print(NextPageUrl)
            yield scrapy.Request(NextPageUrl,
                                 callback=self.parse,
                                 dont_filter=True)
        else:
            self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #15

 def parse(self, response):
     html = response.text
     html = str(html)
     docs = html.split("documentId")[1:]
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     timecount = 0  # counter
     for doc in docs:
         item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.localtime(time.time()))
         doc_id = doc.split(':', 1)[1].split(',', 1)[0]
         doc = doc.split("documentDate", 1)[1]
         create_time = doc.split(':\"', 1)[1].split('\"', 1)[0]
         doc = doc.split("documentTitle", 1)[1]
         title = doc.split(':\"', 1)[1].split('\",', 1)[0]
         doc = doc.split("documentUrl", 1)[1]
         url = doc.split(':\"', 1)[1].split('\"', 1)[0]
         url = url.split("xwcbj2016")[1]
         url = "http://gdj.sc.gov.cn%s" % url
         item["title"] = title
         item["url"] = url
         item["urlId"] = '%s_%s' % (self.name, doc_id)
         item["time"] = create_time.replace(' ', '')
         if TimeMarch.time_March(item["time"], self.default_scope_day):
             item["IsFilter"] = True
         else:
             item["IsFilter"] = False
             timecount = timecount + 1
         res_child = child_page(item["url"])
         item["info"] = res_child.xpath(
             "//div[@class = 'Custom_UnionStyle']/p/text() | //div[@class = 'Custom_UnionStyle']/p/font/text() | //div[@class='content']//span/text() | //div[@class='content']//font/text() | //div[@class = 'Custom_UnionStyle']//span/text()"
         )
         item["info"] = "".join(item["info"])
         yield item
     if (len(docs) != 0) and (timecount < self.allowed_timesup):
         keyword = response.url.split('keyword=')[1].split('&')[0]
         page_num = response.url.split('pageIndex=')[1]
         page_num = int(page_num) + 1
         print('\n*** page %s ***\n' % str(page_num))
         NextPageUrl = "http://gdj.sc.gov.cn/scxwcbjss/search?keyword=%s&pageIndex=%s" % (
             keyword, str(page_num))
         print(NextPageUrl)
         yield scrapy.Request(NextPageUrl,
                              callback=self.parse,
                              dont_filter=True)
     else:
         self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #16

 def parse(self, response):
     nodelist = response.xpath("//li[@class='active']")  # all posts on this page
     nodelist = [] if nodelist is None else nodelist
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     timecount = 0  # counter
     for node in nodelist:  # examine each post
         try:
             item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                time.localtime(time.time()))
             item["url"] = node.xpath(
                 "./div[@class='com-title']/a/@href").extract_first()
             item["urlId"] = item["url"].split('/')[-1].split('.')[0]
             item["urlId"] = '%s_%s' % (self.name, item["urlId"])
             item["time"] = node.xpath(
                 "./div[2]/div/span[2]/text()").extract_first()
             item["time"] = item["time"].split(':')[-1]
             res_child = child_page(item["url"])
             # item["info"] = res_child.xpath("//div[@id='j-show-body']/div/div/p/span/voice/text()")
             item["info"] = res_child.xpath("//span/text()")
             item["info"] = "".join(item["info"])
             item["title"] = res_child.xpath(
                 "//div[@class='main_content']/h2/text()")
             item["title"] = "".join(item["title"])
             # check whether this post is within the time window
             if TimeMarch.time_March(item["time"], self.default_scope_day):
                 item["IsFilter"] = True
             else:
                 item["IsFilter"] = False
                 timecount = timecount + 1
         except Exception:
             item['IsFilter'] = False
         yield item
     if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
         keyword = response.url.split('q=')[1].split('&')[0]
         page_num = response.url.split('p=')[1].split('&')[0]
         page_num = int(page_num)
         print('\n*** page %s ***\n' % (page_num + 1))
         page_num = page_num + 1
         NextPageUrl = "http://searchs.hunan.gov.cn/hunan/gbdsj/news?q=%s&searchfields=&sm=0&columnCN=&p=%s&timetype=timeqb" % (
             keyword, str(page_num))
         print(NextPageUrl)
         yield scrapy.Request(NextPageUrl, callback=self.parse)
     else:
         self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #17

 def parse(self, response):
     html = response.text
     html = str(html)
     docs = html.split("\"title\":\"")[1:-1]
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     timecount = 0  # counter
     for doc in docs:
         item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.localtime(time.time()))
         title = doc.split("\",")[0]
         doc = doc.split("\",", 1)[1].split('\"url\":\"')[1]
         url = doc.split("\",")[0]
         doc = doc.split("\",", 1)[1].split('\"time\":\"')[1]
         create_time = doc.split(' ')[0]
         item["title"] = title
         item["url"] = url
         item["urlId"] = item["url"].split('_')[-1].split('.')[0]
         item["urlId"] = '%s_%s' % (self.name, item["urlId"])
         item["time"] = create_time
         if TimeMarch.time_March(item["time"], self.default_scope_day):
             item["IsFilter"] = True
         else:
             item["IsFilter"] = False
             timecount = timecount + 1
         res_child = child_page(item["url"])
         item["info"] = res_child.xpath(
             "//div[@class = 'Custom_UnionStyle']/p/text() | //div[@class = 'Custom_UnionStyle']/p/font/text() | //p/font/text() | //p/text()"
         )
         item["info"] = "".join(item["info"])
         yield item
     if (len(docs) != 0) and (timecount < self.allowed_timesup):
         keyword = response.url.split('classsql=')[1].split('&')[0]
         page_num = response.url.split('&page=')[1]
         print('\n*** page %s ***\n' % page_num)
         page_num = int(page_num) + 1
         NextPageUrl = "http://gdj.fujian.gov.cn/was5/web/search?channelid=229105&templet=advsch.jsp&sortfield=-docreltime&classsql=%s&prepage=20&page=%s" % (
             keyword, str(page_num))
         print(NextPageUrl)
         yield scrapy.Request(NextPageUrl,
                              callback=self.parse,
                              dont_filter=True)
     else:
         self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #18

 def parse(self, response):
     nodelist = response.xpath(
         "//span[@class = 'list plist rc']/a")  # all posts on this page
     nodelist = [] if nodelist is None else nodelist
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     timecount = 0  # counter
     for node in nodelist:  # examine each post
         try:
             item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                time.localtime(time.time()))
             item["title"] = node.xpath("./text()").extract_first()
             item["url"] = node.xpath("./@href").extract_first()
             item["url"] = 'http://gdj.shanxi.gov.cn/%s' % item["url"]
             item["urlId"] = item["url"].split('id=')[-1].split('.')[0]
             item["urlId"] = '%s_%s' % (self.name, item["urlId"])
             item["time"] = node.xpath("./em/text()").extract_first()
             # check whether this post is within the time window
             if TimeMarch.time_March(item["time"], self.default_scope_day):
                 item["IsFilter"] = True
             else:
                 item["IsFilter"] = False
                 timecount = timecount + 1
             res_child = child_page(item["url"])
             item["info"] = res_child.xpath(
                 "//div[@id = 'Zoom']/p/text() | //div[@id='Zoom']/text()")
             item["info"] = "".join(item["info"])
             # item["info"] = bytearray.fromhex(''.join(item["info"].split("\\x"))).decode()
         except Exception:
             item['IsFilter'] = False
         yield item
     if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
         keyword = response.url.split('title=')[1]
         page_num = response.url.split('p=')[1].split('&')[0]
         print('\n*** page %s ***\n' % page_num)
         page_num = int(page_num) + 1
         NextPageUrl = "http://gdj.shanxi.gov.cn/soso.aspx?p=%s&title=%s&type=1" % (
             str(page_num), keyword)
         print(NextPageUrl)
         yield scrapy.Request(NextPageUrl,
                              callback=self.parse,
                              dont_filter=True)
     else:
         self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #19

 def parse(self, response):
     nodelist = response.xpath("//ul[@class='search_list']/li")  # all posts on this page
     nodelist = [] if nodelist is None else nodelist
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     timecount = 0  # counter
     for node in nodelist:  # examine each post
         try:
             item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
             item["title"] = node.xpath("./a/text()").extract_first()
             item["url"] = node.xpath("./a/@href").extract_first()
             item["urlId"] = item["url"].split('/')[-1].split('.')[0]
             item["urlId"] = '%s_%s' % (self.name, item["urlId"])
             res_child = child_page(item["url"])
             item["time"] = node.xpath("./span/text()").extract_first()
             item["time"] = item["time"].replace('[','')
             item["time"] = item["time"].replace(']', '')
             # item["time"] = item["time"].split('局')[-1].split(' ')[-1]
             # 判断这个帖子是否符合时间
             if "直播卫星" in item["title"] or "中星九号" in item["title"] or "扶贫工程" in item["title"] or "扶贫" in item["title"]:
                 if TimeMarch.time_March(item["time"],self.default_scope_day):
                     item["IsFilter"] = True
                 else:
                     item["IsFilter"] = False
                     timecount = timecount + 1
                 res_child = child_page(item["url"])
                 print(res_child.text)
                 item["info"] = res_child.xpath("//div[@class='content content_article']/text()  | //div[@class='content content_article']/div/text()")
                 item["info"] = "".join(item["info"])
                 yield item
             else:
                 yield None
         except Exception:
             yield None
     if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
         page_num = response.url.split('page=')[1]
         print('\n*** page %s ***\n' % page_num)
         page_num = int(page_num) + 1
         if page_num < 30:
             NextPageUrl = "http://gbdsj.gxzf.gov.cn/index.php?m=search&c=index&a=init&page=%s" % (str(page_num))
             print(NextPageUrl)
             yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
     else:
         self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #20

    def parse(self, response):
        nodelist = response.xpath('//tbody/tr/th')  # all posts on this page
        item = BaiduspiderItem()
        isHasContent = False  # whether this page holds suitable content
        NextPageUrl = ''
        timecount = 0  # counter
        for node in nodelist:  # examine each post
            childUrl = node.xpath(
                "./a[2][@class='s xst']/@href").extract_first()
            item["title"] = node.xpath(
                "./a[2][@class='s xst']/text()").extract_first()
            item["UrlId"] = node.xpath(
                "./a[2][@class='s xst']/@href").extract_first()
            if childUrl is not None:
                item["info"] = ChildPage.ChildPage(childUrl, '2')
            item["time"] = node.xpath(
                './a[2]/../../td[@class="by"]/em/span/text()').extract_first()

            if item["time"] == None:
                item["time"] = node.xpath(
                    './a[2]/../../td[@class="by"]/em/span/span/text()'
                ).extract_first()
            #处理时间为空的情况
            if item["time"] == None:
                item["time"] = ''
            else:
                item["time"] = item["time"].strip()
                item["time"] = TimeCalculate.time_calculate(
                    item["time"], self.name)
            # check whether this post is within the time window
            if TimeMarch.time_March(item["time"], self.default_scope_day):
                item["IsLimitedTime"] = 'y'
            else:
                item["IsLimitedTime"] = 'n'
                timecount = timecount + 1

            if NextPageUrl == '':  # record the next-page link
                NextPageUrl = response.xpath(
                    '//div[@class="pg"]/a[@class="nxt"]/@href').extract_first(
                    )
            if item["UrlId"] != None:  # 非普通帖子的错误处理(置顶帖等异常的帖子)
                yield item  #返回数据到pipeline
        if (timecount > self.allowed_timesup):  #根据判断决定继续爬取还是结束
            self.crawler.engine.close_spider(self, 'Finished')  #关闭爬虫
        else:
            yield scrapy.Request(NextPageUrl, callback=self.parse)
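
Example #20 feeds the nxt href straight into scrapy.Request, which only works if the forum emits absolute links. If the href can be relative, response.urljoin resolves it against the current page URL; a small sketch:

# Sketch: resolve a possibly-relative 'nxt' href before requesting it.
import scrapy

def parse(self, response):
    next_href = response.xpath(
        '//div[@class="pg"]/a[@class="nxt"]/@href').extract_first()
    if next_href:
        yield scrapy.Request(response.urljoin(next_href), callback=self.parse)
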
Example #21

 def parse(self, response):
     nodelist = response.xpath(
         "//div[@id='items']/div[@class='resultItem']")  # all posts on this page
     nodelist = [] if nodelist is None else nodelist
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     timecount = 0  # counter
     for node in nodelist:  # examine each post
         item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.localtime(time.time()))
         item["title"] = node.xpath("./a/text()").extract()
         item["title"] = "".join(item["title"])
         item["url"] = node.xpath("./a/@href").extract_first()
         item["urlId"] = item["url"].split('/')[-1].split('.')[0]
         item["urlId"] = '%s_%s' % (self.name, item["urlId"])
         item["info"] = node.xpath("./div/text()").extract()
         item["info"] = "".join(item["info"])
         item["time"] = node.xpath("./font/text()").extract_first()
         item["time"] = item["time"].split(' ')[1]
         try:
             item["time"] = time.strftime(
                 "%Y-%m-%d",
                 time.strptime(item["time"].split(' ')[0], "%Y年%m月%d日"))
             # check whether this post is within the time window
             if TimeMarch.time_March(item["time"], self.default_scope_day):
                 item["IsFilter"] = True
             else:
                 item["IsFilter"] = False
                 timecount = timecount + 1
         except Exception:
             item['IsFilter'] = False
         yield item
     if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
         keyword = response.url.split('q=')[1].split('&')[0]
         page_num = response.url.split('page=')[1]
         print('\n*** page %s ***\n' % page_num)
         page_num = int(page_num) + 1
         NextPageUrl = "http://searchgov1.eastday.com/searchwgj/search.ashx?q=%s&page=%s" % (
             keyword, str(page_num))
         print(NextPageUrl)
         yield scrapy.Request(NextPageUrl, callback=self.parse)
     else:
         self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #22

    def parse(self, response):
        nodelist = response.xpath(
            "//div[@class='articleList_listBox']/a")  # all posts on this page
        nodelist = [] if nodelist is None else nodelist
        item = BaiduspiderItem()
        item = inititem(item)
        # whether this item meets the crawl criteria
        item['IsFilter'] = False
        timecount = 0  # counter
        for node in nodelist:  # examine each post
            try:
                item["title"] = node.xpath("./@title").extract_first()
                item["url"] = node.xpath("./@href").extract_first()
                item["url"] = 'http://www.hnppb.gov.cn%s' % item["url"]
                item["urlId"] = item["url"].split('/')[-1].split('.')[0]
                item["urlId"] = '%s_%s' % (self.name, item["urlId"])
                item["time"] = node.xpath("./span[2]/text()").extract_first()
                # check whether this post is within the time window
                if TimeMarch.time_March(item["time"], self.default_scope_day):
                    item["IsFilter"] = True
                else:
                    item["IsFilter"] = False
                    timecount = timecount + 1
                res_child = child_page(item["url"])
                item["info"] = res_child.xpath(
                    "//div[@class='article_body']//p/span/text()")
                item["info"] = "".join(item["info"])
            except Exception:
                item['IsFilter'] = False

            yield item
        if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
            keyword = response.url.split('title=')[1].split('&')[0]
            page_num = response.url.split('pageNo=')[1]
            print('\n*** page %s ***\n' % page_num)
            page_num = int(page_num) + 1
            NextPageUrl = "http://www.hnppb.gov.cn/cms/12/search.do?basic_title=%s&pageNo=%s" % (
                keyword, str(page_num))
            print(NextPageUrl)
            yield scrapy.Request(NextPageUrl,
                                 callback=self.parse,
                                 dont_filter=True)
        else:
            self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #23

 def parse(self, response):
     nodelist = response.xpath("//li[@class='wrap']")  # all posts on this page
     nodelist = [] if nodelist is None else nodelist
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     timecount = 0  # counter
     for node in nodelist:  # examine each post
         try:
             item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                time.localtime(time.time()))
             item["title"] = node.xpath("./div/h5/a").extract()
             item["title"] = xml_filter("".join(item["title"]))
             item["url"] = node.xpath("./div/h5/a/@href").extract_first()
             item["urlId"] = item["url"].split('=')[-1]
             item["urlId"] = '%s_%s' % (self.name, item["urlId"])
             item['info'] = node.xpath(
                 "./div/p").extract()  # relative to the current node
             item["info"] = xml_filter("".join(item["info"]))
             item["time"] = node.xpath(
                 "./div[@class='adds']/text()").extract_first()
             item["time"] = item["time"].split(':')[1]
             # check whether this post is within the time window
             if TimeMarch.time_March(item["time"], self.default_scope_day):
                 item["IsFilter"] = True
             else:
                 item["IsFilter"] = False
                 timecount = timecount + 1
         except Exception:
             item['IsFilter'] = False
         yield item
     if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
         keyword = response.url.split('q=')[1].split('&')[0]
         page_num = response.url.split('page=')[1]
         print('\n*** page %s ***\n' % page_num)
         page_num = int(page_num) + 1
         NextPageUrl = "http://gbdsj.nmg.gov.cn/?m=search&c=index&a=init&typeid=1&siteid=1&q=%s&page=%s" % (
             keyword, str(page_num))
         print(NextPageUrl)
         yield scrapy.Request(NextPageUrl, callback=self.parse)
     else:
         self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #24

 def parse(self, response):
     nodelist = response.xpath("//div[@class='msg discuss']")  # all posts on this page
     nodelist = [] if nodelist is None else nodelist
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     timecount = 0  # counter
     for node in nodelist:  # examine each post
         item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                            time.localtime(time.time()))
         item["title"] = node.xpath(
             "./div[@class='titleP']/a/@title").extract_first()
         item["url"] = node.xpath(
             "./div[@class='titleP']/a/@href").extract_first()
         item["url"] = 'http://so.kaipuyun.cn/%s' % item["url"]
         item["urlId"] = '12'
         item["time"] = node.xpath(
             "./div[@class='content']/span/text()").extract_first()
         item["info"] = node.xpath(
             "./div[@class='content']/p/text()").extract()
         item["info"] = "".join(item["info"])
         try:
             # check whether this post is within the time window
             if TimeMarch.time_March(item["time"], self.default_scope_day):
                 item["IsFilter"] = True
             else:
                 item["IsFilter"] = False
                 timecount = timecount + 1
         except Exception:
             item['IsFilter'] = False
         yield item
     if (len(nodelist) != 0) and (timecount < self.allowed_timesup):
         keyword = response.url.split('qt=')[1].split('&')[0]
         page_num = response.url.split('page=')[1]
         print('\n*** page %s ***\n' % page_num)
         page_num = int(page_num) + 1
         NextPageUrl = "http://so.kaipuyun.cn/s?q=1&qt=%s&siteCode=N000005664&page=%s" % (
             keyword, str(page_num))
         print(NextPageUrl)
         yield scrapy.Request(NextPageUrl, callback=self.parse)
     else:
         self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #25

 def parse(self, response):
     html = str(response.text)
     item = BaiduspiderItem()
     item = inititem(item)
     # whether this item meets the crawl criteria
     item['IsFilter'] = False
     timecount = 0  # counter
     html = html.split('list":[{', 1)[1]
     html = html.split('}],"total"', 1)[0]
     html = str(html).encode('unicode_escape').decode("unicode_escape")
     docs = html.split("},{")
     for doc in docs:
         try:
             doc = '{' + doc + '}'
             doc_dict = ast.literal_eval(doc)
             item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                time.localtime(time.time()))
             item["title"] = str(doc_dict['title'])
             item["title"] = item["title"].replace('<em>', '')
             item["title"] = item["title"].replace('</em>', '')
             item["title"] = item["title"].replace('<\\\\/em>', '')
             item["title"] = item["title"].replace('[', '')
             item["title"] = item["title"].replace(']', '')
             item["url"] = str(doc_dict['url']).replace('\\', '')
             item["urlId"] = item["url"].split('post_')[-1].split('.')[0]
             item["urlId"] = '%s_%s' % (self.name, id)
             item["time"] = doc_dict['pub_time']
             info = str(doc_dict['content'])
             if TimeMarch.time_March(item["time"], self.default_scope_day):
                 item["IsFilter"] = True
             else:
                 item["IsFilter"] = False
                 timecount = timecount + 1
             res_child = child_page(item["url"])
             item["info"] = res_child.xpath(
                 "//div[@calss='article-content']/p/text() | //div[@calss='article-content']//p/text() | //div[@id='content']//span/text()"
             )
             item["info"] = "".join(item["info"])
             if len(item["info"]) < len(info):
                 item["info"] = info
             yield item
         except Exception:
             continue
     self.crawler.engine.close_spider(self, 'Finished')  # close the spider
Example #26

    def parse(self, response):
        nodelist = response.xpath("//div[@class = 'listItem']/ul/a")  # all posts on this page
        nodelist = [] if nodelist is None else nodelist
        item = BaiduspiderItem()
        item = inititem(item)
        # whether this item meets the crawl criteria
        item['IsFilter'] = False
        timecount = 0  # counter
        for node in nodelist:  # examine each post
            try:
                item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                item["url"] = node.xpath("./@href").extract_first()
                item["url"] = "http://gdj.gansu.gov.cn" + item["url"]
                item["urlId"] = item["url"].split('/')[-1].split('.')[0]
                item["urlId"] = '%s_%s' % (self.name, item["urlId"])
                item["time"] = node.xpath(".//div[@class = 'date']/text()").extract_first()
                item["time"] = item["time"].split(' ')[0]
                # item["time"] = time.strftime("%Y-%m-%d", time.strptime(item["time"].split(' ')[0], "%Y年%m月%d日"))
                # 判断这个帖子是否符合时间
                if TimeMarch.time_March(item["time"],self.default_scope_day):
                    item["IsFilter"] = True
                else:
                    item["IsFilter"] = False
                    timecount = timecount + 1
                res_child = child_page(item["url"])
                item["info"] = res_child.xpath("//div[@class = 'notice_content']/p/span/text() | //div[@class = 'notice_content']/section//p/span/text()")
                item["info"] = "".join(item["info"])
                item["title"] = res_child.xpath("//div[@class = 'titles']/h6/text()")
                item["title"] = "".join(item["title"])
            except Exception:
                item['IsFilter'] = False

            yield item
        if (len(nodelist)!=0) and (timecount<self.allowed_timesup):
            keyword = response.url.split('keyword=')[1]
            page_num = response.url.split('p=')[1].split('&')[0]
            print('\n第***********************************%s***********************************页\n'%page_num)
            page_num = int(page_num)+1
            NextPageUrl = "http://gdj.gansu.gov.cn/home/search/index.html?keyword=%s&p=%s"%(keyword,str(page_num))
            print(NextPageUrl)
            yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
        else:
            self.crawler.engine.close_spider(self, 'Finished')  # close the spider
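
TimeMarch.time_March(time_str, scope_day) is likewise not shown. From the call sites (a 'YYYY-MM-DD' string checked against self.default_scope_day, returning a boolean), a plausible sketch is a sliding-window date check; this is an assumption, not the original code:

    from datetime import datetime, timedelta

    def time_March(time_str, scope_day):
        # True when the post date falls within the last `scope_day` days.
        try:
            post_date = datetime.strptime(str(time_str)[:10], '%Y-%m-%d')
        except ValueError:
            return False  # unparseable dates are treated as out of range
        return datetime.now() - post_date <= timedelta(days=int(scope_day))
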
예제 #27
0
    def parse(self, response):
        nodelist = response.xpath("//p")#得到一页中的所有帖子
        nodelist = [] if nodelist==None else nodelist
        item = BaiduspiderItem()
        item = inititem(item)
        # 是否符合爬取条件
        item['IsFilter'] = False
        timecount = 0  # 计数器
        for node in nodelist:#分析帖子信息
            try:
                item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                item["title"] = node.xpath("./a[@class='subject']/text()").extract_first()
                item["url"] = node.xpath("./a[@class='subject']/@href").extract_first()
                item["urlId"] = item["url"].split('/')[-1].split('.')[0]
                item["urlId"] = '%s_%s' % (self.name, item["urlId"])
                item["time"] = node.xpath("./a[@class='green']/text()").extract_first()
                item["time"] = "".join(item["time"])
                item["time"] = item["time"].split(' - ')[-1]
                # item["time"] = time.strftime("%Y-%m-%d", time.strptime(item["time"].split(' ')[0], "%Y年%m月%d日"))
                # check whether the post falls within the time window
                if TimeMarch.time_March(item["time"], self.default_scope_day):
                    item["IsFilter"] = True
                else:
                    item["IsFilter"] = False
                    timecount = timecount + 1
                res_child = child_page(item["url"])
                item["info"] = res_child.xpath("//div[@id = 'zoom']/p/text() | //div[@id = 'zoom']/p/span/text() | //font")
                item["info"] = "".join(item["info"])
            except Exception:
                item['IsFilter'] = False  # on any parse error, drop the post from the filter

            yield item
        if (len(nodelist)!=0) and (timecount<self.allowed_timesup):
            keyword = response.url.split('q=')[1].split('&')[0]
            page_num = response.url.split('p=')[1].split('&')[0]
            print('\n*********************************** page %s ***********************************\n' % page_num)
            page_num = int(page_num)+1
            NextPageUrl = "http://www.zjxwcb.gov.cn/jsearch/search?q=%s&area=1&pos=1&date=1&p=%s&pg=10&x=17&y=13"%(keyword,str(page_num))
            print(NextPageUrl)
            yield scrapy.Request(NextPageUrl, callback=self.parse, dont_filter=True)
        else:
            self.crawler.engine.close_spider(self, 'Finished')  # close the spider
예제 #28
0
    def parse(self, response):
        nodelist = response.xpath("//dl[@class = 'bbda cl']")#得到一页中的所有帖子
        nodelist = [] if nodelist==None else nodelist
        item = BaiduspiderItem()
        item = inititem(item)
        # 是否符合爬取条件
        item['IsFilter'] = False
        timecount = 0  # 计数器
        for node in nodelist:#分析帖子信息
            item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
            item["title"] = node.xpath("./dt/a/text()").extract_first()
            item["url"] = node.xpath("./dt/a/@href").extract_first()
            item["urlId"] = item["url"].split('/')[-1].split('.')[0]
            item["urlId"] = '%s_%s'%(self.name,item["urlId"])
            item["time"] = node.xpath("./dd[2]/span/text()").extract_first()
            item["time"] = item["time"].split(' ',1)[-1]

            try:
                # check whether the post falls within the time window
                if TimeMarch.time_March(item["time"], self.default_scope_day):
                    item["IsFilter"] = True
                else:
                    item["IsFilter"] = False
                    timecount = timecount + 1
            except Exception:
                item['IsFilter'] = False  # on any parse error, drop the post from the filter
            res_child = child_page(item["url"])
            item["info"] = res_child.xpath("//td[@id='article_content']/p/span/text() | //td[@id='article_content']/font/text() | //td[@id='article_content']/p/text() | //td[@id='article_content']/div/span/text() | //td[@id='article_content']/span/text()")
            item["info"] = "".join(item["info"])
            item["comment"] = res_child.xpath("//p[@class='xg1']/a[2]/em/text()")
            item["latestcomtime"] = res_child.xpath("//div[@class='bm_c']/dl[1]/dt/span[@class='xg1 xw0']/text()")
            item["latestcomtime"] = "".join(item["latestcomtime"])
            if item["latestcomtime"] == "":
                item["latestcomtime"] = None
            if item["comment"] != []:
                item["comment"] = item["comment"][0]
            else:
                item["comment"] = None
            item["read"] = res_child.xpath("//em[@id='_viewnum']/text()")[0]
            yield item
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
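
These spiders reuse one BaiduspiderItem instance for every post on the page, so inititem(item) presumably clears stale field values between iterations. A minimal sketch under that assumption (the real helper is not shown):

    def inititem(item):
        # Reset every declared field so values scraped for one post
        # cannot leak into the next yielded item.
        for field in item.fields:
            item[field] = None
        return item

Even with the reset, sharing a single mutable item across yields is fragile once pipelines run concurrently; constructing a fresh item per post would be the safer pattern.
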
예제 #29
0
    def parse(self, response):
        nodelist = response.xpath('//div[@class="col2_right j_threadlist_li_right "]')
        item = BaiduspiderItem()
        NextPageUrl = ''
        for node in nodelist:
            item["title"]= node.xpath("./div[1]/div/a[@title]/text()").extract_first()
            item["UrlId"] = node.xpath("./div[1]/div/a[@href]/@href").extract_first()
            item["info"] = node.xpath('./div[2]/div[@class="threadlist_text pull_left"]/div[1]/text()').extract_first()
            item["time"] = node.xpath('./div[1]/div[2]/span[@title="创建时间"]/text()').extract_first()

            childUrl = "https://tieba.baidu.com" + item["UrlId"]
            item["UrlId"] = childUrl
            if NextPageUrl == '':  # record the next-page link once
                rel = response.xpath('//a[@class = "next pagination-item "]/@href').extract_first()
                NextPageUrl = 'https:' + rel if rel else ''
            # Method for reading full post details. The requirements don't need it (spider 'baidu2' alone suffices); if used, enable childPage in items.
            request = scrapy.Request(childUrl,callback =self.ChildPage)
            request.meta['item'] = item
            yield request

        if NextPageUrl:  # follow the recorded next-page link rather than a hard-coded pn=50 URL
            yield scrapy.Request(NextPageUrl, callback=self.parse)
            print("Moved on to the next page!")
예제 #30
0
    def parse(self, response):
        nodelist = response.xpath("//td[@class='ta']/table/tr")  #得到一页中的所有帖子
        nodelist = [] if nodelist == None else nodelist
        item = BaiduspiderItem()
        item = inititem(item)
        # 是否符合爬取条件
        item['IsFilter'] = False
        timecount = 0  # 计数器
        for node in nodelist:  #分析帖子信息
            try:
                item['spidertime'] = time.strftime('%Y-%m-%d %H:%M:%S',
                                                   time.localtime(time.time()))
                item["title"] = node.xpath("./td[2]/a/text()").extract_first()
                item["url"] = node.xpath("./td[2]/a/@href").extract_first()
                item["url"] = 'http://gdj.qinghai.gov.cn/%s' % item["url"]
                item["urlId"] = item["url"].split('/')[-1].split('.')[0]
                item["urlId"] = '%s_%s' % (self.name, item["urlId"])
                item["time"] = node.xpath("./td[3]/text()").extract_first()
                item["time"] = "".join(item["time"])
                if item["time"] is not None:
                    item["time"] = str(item["time"]).replace(' ', '')[0:10]
                # item["time"] = item["time"].split(' - ')[-1]
                # item["time"] = time.strftime("%Y-%m-%d", time.strptime(item["time"].split(' ')[0], "%Y年%m月%d日"))
                # check whether the post falls within the time window
                if TimeMarch.time_March(item["time"], self.default_scope_day):
                    item["IsFilter"] = True
                else:
                    item["IsFilter"] = False
                    timecount = timecount + 1
                res_child = child_page(item["url"])
                item["info"] = res_child.xpath(
                    "//p/span/text() | //p/font/span/text()")
                item["info"] = "".join(item["info"])
            except Exception:
                item['IsFilter'] = False  # on any parse error, drop the post from the filter

            yield item
        self.crawler.engine.close_spider(self, 'Finished')  # close the spider
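
A closing note on how these spiders stop: self.crawler.engine.close_spider(self, 'Finished') works, but it reaches into engine internals. Scrapy's documented way to stop a spider from inside a callback is to raise CloseSpider, for example:

    from scrapy.exceptions import CloseSpider

    # inside parse(), in place of self.crawler.engine.close_spider(self, 'Finished'):
    if timecount > self.allowed_timesup:
        raise CloseSpider('Finished')  # Scrapy shuts the spider down cleanly
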