Example #1
 def crawl(self):
     homepage = "http://www.jxzj.gov.cn/jxzj/index.html"
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item, homepage)
         # Follow only article pages that live under a news path.
         text = r'^(http|https).+(news).+\.(htm|html|net)$'
         url_t = re.match(text, item)
         data = {}
         if url_t is not None:
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
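
These snippets all come from the same project and assume, at module level, import re plus the project's own helpers: _get_url, HandleUrl, HandleContent, Scheduler and ContentCrawler, none of which appear in the listing. A minimal sketch of what the fetching and URL helpers might look like, assuming requests and lxml (the real implementations may differ):

 import re

 import requests
 from lxml import html as lxml_html


 def _get_url(url, timeout=10):
     # Fetch a page; callers read the decoded body from .text.
     response = requests.get(url, timeout=timeout)
     response.raise_for_status()
     return response


 class HandleUrl(object):
     @staticmethod
     def get_url(page_text):
         # Yield every href found in the page's anchor tags.
         tree = lxml_html.fromstring(page_text)
         for href in tree.xpath('//a/@href'):
             yield href

     @staticmethod
     def judge_url(url, homepage=''):
         # Resolve relative links against the homepage; reject
         # javascript: and fragment-only links with an empty string.
         if url.startswith('javascript') or url.startswith('#'):
             return ''
         return requests.compat.urljoin(homepage, url)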
Example #2
File: cqn.py Project: xxguo/crawler
 def firstcrawler(self, homepage):
     html_stream = _get_url(homepage)
     list_url = []
     for item in HandleUrl.get_url(html_stream.text):
         # Match the per-province channel index pages (bj, tj, ..., xj).
         text = r'.+\/(bj|tj|hb|sx|nmg|ln|jl|hlj|sh|js|zj|ah|fj|jx|sd|hn|hub|hun|gd|gx|hain|cq|sc|gz|yn|xz|shx|gs|qh|nx|xj)\/Index.html$'
         url_t = re.match(text, item)
         if url_t is not None and item not in list_url:
             self.getchildurl(HandleUrl.join_url_path(homepage, item))
             list_url.append(item)
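
The alternation in firstcrawler enumerates pinyin abbreviations for the provincial channels (bj = Beijing, zj = Zhejiang, and so on), so only the per-region Index pages are followed. A quick check of what the pattern accepts, using hypothetical link values:

 import re

 PROVINCE_INDEX = re.compile(
     r'.+\/(bj|tj|hb|sx|nmg|ln|jl|hlj|sh|js|zj|ah|fj|jx|sd|hn|hub|hun|'
     r'gd|gx|hain|cq|sc|gz|yn|xz|shx|gs|qh|nx|xj)\/Index.html$'
 )

 print(bool(PROVINCE_INDEX.match('./news/zj/Index.html')))   # True: Zhejiang channel index
 print(bool(PROVINCE_INDEX.match('./news/zj/201504.html')))  # False: article page, handled later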
Example #3
 def crawl(self):
     homepage = "http://www.hzqts.gov.cn/zwpd/index.htm"
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item, homepage)
         # Follow articles in the zwpd/qypd/smpd channels, skipping Index pages.
         text = r'^(http|https).+(zwpd|qypd|smpd).+(?<!Index)\.(htm|html|net)$'
         url_t = re.match(text, item)
         data = {}
         if url_t is not None:
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
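
The (?<!Index) negative lookbehind keeps article pages in the zwpd/qypd/smpd channels while refusing the channels' own Index pages. A quick check with hypothetical URLs:

 import re

 ARTICLE = re.compile(r'^(http|https).+(zwpd|qypd|smpd).+(?<!Index)\.(htm|html|net)$')

 print(bool(ARTICLE.match('http://www.hzqts.gov.cn/zwpd/news/t123.htm')))    # True
 print(bool(ARTICLE.match('http://www.hzqts.gov.cn/zwpd/pages/Index.htm')))  # False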
Example #4
 def crawl(self):
     homepage = "http://www.hbzljd.gov.cn/"
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item, homepage)
         # Article URLs end in a digit just before the file extension.
         text = r'^(http|https).+\d\.(htm|html|net)$'
         url_t = re.match(text, item)
         data = {}
         if url_t is not None:
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
Example #5
 def crawl(self):
     homepage = "http://www.bjtsb.gov.cn/"
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item, homepage)
         # Detail pages sit under an "infoview" path and end in a numeric id.
         text = r'(http).+(infoview).+\d{3,8}$'
         url_t = re.match(text, item)
         data = {}
         if url_t is not None:
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
Example #6
 def crawl(self):
     homepage = "http://www.fsjsjd.gov.cn/"
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         # Read the publish time from the text node next to the link.
         xp_pubtime = "//a[@href='%s']/parent::*/text()" % item
         pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_pubtime)
         item = HandleUrl.judge_url(item, homepage)
         text = r'^(http|https).+\d\.(htm|html|net)$'
         url_t = re.match(text, item)
         data = {}
         if url_t is not None:
             data['pubtime'] = pubtime
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
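
HandleContent.get_pubtime is not shown in the listing; judging from the XPath, it reads the text sitting next to each link and extracts a timestamp from it. A minimal sketch under that assumption (the date format is a guess):

 import re

 from lxml import html as lxml_html


 def get_pubtime(html_stream, xpath):
     # Evaluate the XPath against the fetched page and return the first
     # thing in the matched text nodes that looks like a YYYY-MM-DD date.
     tree = lxml_html.fromstring(html_stream.text)
     for node_text in tree.xpath(xpath):
         found = re.search(r'\d{4}-\d{1,2}-\d{1,2}', node_text)
         if found:
             return found.group(0)
     return ''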
Example #7
File: cqn.py Project: xxguo/crawler
 def getchildurl(self, url, data=None):
     # A mutable default ({}) would be shared across calls; use None instead.
     data = data if data is not None else {}
     html_stream = _get_url(url)

     for item in HandleUrl.get_url(html_stream.text):
         # Follow article pages under the zjpd/xfpd/zhuanti/zgzlb news sections.
         text = r'^(http|https).+(news)\/(zjpd|xfpd|zhuanti|zgzlb).+\d\.(htm|html|net)$'
         url_t = re.match(text, item)
         if url_t is not None:
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
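
Every example hands matched URLs to Scheduler.schedule keyed by a crawler type, but the scheduler is not part of the listing either. A toy in-memory stand-in, assuming a (type, key, data) triple is enough to describe a job (the real project presumably persists its queue):

 class Scheduler(object):
     _queue = []

     @classmethod
     def schedule(cls, crawler_type, key, data):
         # Deduplicate on (type, key) so the same URL is queued only once.
         if all((t, k) != (crawler_type, key) for t, k, _ in cls._queue):
             cls._queue.append((crawler_type, key, data))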
Example #8
 def crawl(self):
     world = self.key   # the search keyword for this job
     data = self.data
     data.update({
             'type': u'元搜索',            # "meta search"
             'origin_source': u'微信搜索',  # "WeChat search"
             'key': world
     })
     # Build the Sogou WeChat search URL for the keyword.
     homepage = ("http://weixin.sogou.com/weixinwap?ie=utf8&w=&"
                 "type=2&t=1427703547684&s_t=&fr=sgsearch&"
                 "query=" + world + "&pg=webSearchList")
     homepage = clear_space(homepage)  # defensive: drop any stray whitespace
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item)
         if item == '':
             continue
         Scheduler.schedule(ContentCrawler.type, key=item, data=data)
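
clear_space is another helper that is not shown; given how it is used, it presumably does no more than strip whitespace out of the assembled query URL. A plausible one-liner:

 import re


 def clear_space(text):
     # Remove every whitespace character from the URL string.
     return re.sub(r'\s+', '', text)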