Example #1
0
 def crawl(self):
     """Fetch the JXZJ homepage and schedule news-article links for crawling.

     Extracts every link from the homepage, normalizes each against the
     homepage URL, and schedules those matching the news-URL pattern.
     """
     homepage = "http://www.jxzj.gov.cn/jxzj/index.html"
     html_stream = _get_url(homepage)
     # Raw string avoids the invalid escape sequences (\., \d) of the
     # original literal; compiled once instead of re-matched per link.
     pattern = re.compile(r'^(http|https).+(news).+\.(htm|html|net)$')
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item, homepage)
         if pattern.match(item) is not None:
             Scheduler.schedule(ContentCrawler.type, key=item, data={})
Example #2
0
 def crawl(self):
     """Fetch the GZQ homepage and schedule 'public' links for crawling.

     Extracts every link from the homepage, normalizes each against the
     homepage URL, and schedules those containing 'public' and ending
     in a digit.
     """
     homepage = "http://www.gzq.gov.cn/"
     html_stream = _get_url(homepage)
     # Raw string fixes the invalid \d escape; compiled once outside the
     # loop. NOTE: '.+.+' requires at least two characters after 'public'
     # -- kept as-is to preserve the original matching behavior.
     pattern = re.compile(r'^(http|https).+(public).+.+\d$')
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item, homepage)
         if pattern.match(item) is not None:
             Scheduler.schedule(ContentCrawler.type, key=item, data={})
Example #3
0
 def crawl(self):
     """Fetch the HZQTS portal page and schedule channel article links.

     Schedules links under the zwpd/qypd/smpd channels that end in
     .htm/.html/.net, skipping the channel index pages themselves.
     """
     homepage = "http://www.hzqts.gov.cn/zwpd/index.htm"
     html_stream = _get_url(homepage)
     # BUGFIX: the original pattern used [^(Index)], which is a character
     # class rejecting any URL whose final character before the extension
     # is one of ( ) I n d e x -- e.g. it also rejects "box.htm" -- rather
     # than excluding index pages. A negative lookbehind expresses the
     # apparent intent: skip URLs ending in "Index.<ext>"/"index.<ext>".
     # TODO(review): confirm which index-page casing the site uses.
     pattern = re.compile(
         r'^(http|https).+(zwpd|qypd|smpd).+(?<![Ii]ndex)\.(htm|html|net)$')
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item, homepage)
         if pattern.match(item) is not None:
             Scheduler.schedule(ContentCrawler.type, key=item, data={})
Example #4
0
 def crawl(self):
     """Fetch the HZQTS portal page and schedule channel article links.

     Schedules links under the zwpd/qypd/smpd channels that end in
     .htm/.html/.net, skipping the channel index pages themselves.
     """
     homepage = "http://www.hzqts.gov.cn/zwpd/index.htm"
     html_stream = _get_url(homepage)
     # BUGFIX: the original pattern used [^(Index)], which is a character
     # class rejecting any URL whose final character before the extension
     # is one of ( ) I n d e x -- e.g. it also rejects "box.htm" -- rather
     # than excluding index pages. A negative lookbehind expresses the
     # apparent intent: skip URLs ending in "Index.<ext>"/"index.<ext>".
     # TODO(review): confirm which index-page casing the site uses.
     pattern = re.compile(
         r'^(http|https).+(zwpd|qypd|smpd).+(?<![Ii]ndex)\.(htm|html|net)$')
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item, homepage)
         if pattern.match(item) is not None:
             Scheduler.schedule(ContentCrawler.type, key=item, data={})
Example #5
0
 def crawl(self):
     """Fetch the HBZLJD homepage and schedule article links for crawling.

     Extracts every link from the homepage, normalizes each against the
     homepage URL, and schedules those ending in <digit>.htm/.html/.net.
     """
     homepage = "http://www.hbzljd.gov.cn/"
     html_stream = _get_url(homepage)
     # Raw string fixes the invalid \d and \. escapes; compiled once
     # instead of being re-matched from a literal on every iteration.
     pattern = re.compile(r'^(http|https).+\d\.(htm|html|net)$')
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item, homepage)
         if pattern.match(item) is not None:
             Scheduler.schedule(ContentCrawler.type, key=item, data={})
Example #6
0
 def crawl(self):
     """Fetch the FSJSJD homepage and schedule article links with pubtime.

     For each link, reads the publish time from the text of the link's
     parent node, then schedules URLs ending in <digit>.htm/.html/.net,
     attaching the extracted publish time in the scheduled data.
     """
     homepage = "http://www.fsjsjd.gov.cn/"
     html_stream = _get_url(homepage)
     # Raw string fixes the invalid \d and \. escapes; compiled once.
     pattern = re.compile(r'^(http|https).+\d\.(htm|html|net)$')
     for item in HandleUrl.get_url(html_stream.text):
         # The XPath must use the raw href exactly as it appears in the
         # page, so it is built before judge_url() normalizes the URL.
         xp_putime = "//a[@href='%s']/parent::*/text()" % item
         pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
         item = HandleUrl.judge_url(item, homepage)
         if pattern.match(item) is not None:
             Scheduler.schedule(ContentCrawler.type, key=item,
                                data={'pubtime': pubtime})
Example #7
0
 def crawl(self):
     """Fetch the BJTSB homepage and schedule 'infoview' article links.

     Extracts every link from the homepage, normalizes each against the
     homepage URL, and schedules infoview pages ending in a 3-8 digit id.
     """
     homepage = "http://www.bjtsb.gov.cn/"
     html_stream = _get_url(homepage)
     # Plain raw string replaces the Python-2-only ur'' prefix (identical
     # ASCII pattern); compiled once outside the loop.
     pattern = re.compile(r'(http).+(infoview).+\d{3,8}$')
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item, homepage)
         if pattern.match(item) is not None:
             Scheduler.schedule(ContentCrawler.type, key=item, data={})
Example #8
0
 def crawl(self):
     """Fetch the FSJSJD homepage and schedule article links with pubtime.

     For each link, reads the publish time from the text of the link's
     parent node, then schedules URLs ending in <digit>.htm/.html/.net,
     attaching the extracted publish time in the scheduled data.
     """
     homepage = "http://www.fsjsjd.gov.cn/"
     html_stream = _get_url(homepage)
     # Raw string fixes the invalid \d and \. escapes; compiled once.
     pattern = re.compile(r'^(http|https).+\d\.(htm|html|net)$')
     for item in HandleUrl.get_url(html_stream.text):
         # The XPath must use the raw href exactly as it appears in the
         # page, so it is built before judge_url() normalizes the URL.
         xp_putime = "//a[@href='%s']/parent::*/text()" % item
         pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
         item = HandleUrl.judge_url(item, homepage)
         if pattern.match(item) is not None:
             Scheduler.schedule(ContentCrawler.type, key=item,
                                data={'pubtime': pubtime})
Example #9
0
 def crawl(self):
     """Run a Sogou WeChat meta-search for self.key and schedule results.

     Builds the search URL from the key word, tags self.data with the
     search origin, and schedules every non-empty result link for
     content crawling.
     """
     world = self.key
     data = self.data
     data.update({'type': u'元搜索', 'origin_source': u'微信搜索', 'key': world})
     homepage = "http://weixin.sogou.com/weixinwap?ie=utf8&w=&\
                 type=2&t=1427703547684&s_t=&fr=sgsearch&\
                 query=" + world + "&pg=webSearchList"
     # The line continuations above embed literal spaces in the URL;
     # clear_space() strips them before the request is made.
     homepage = clear_space(homepage)
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item)
         # judge_url() returns '' for links that should be discarded.
         if item != '':
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)
Example #10
0
 def crawl(self):
     """Run a Sogou WeChat meta-search for self.key and schedule results.

     Builds the search URL from the key word, tags self.data with the
     search origin, and schedules every non-empty result link for
     content crawling.
     """
     world = self.key
     data = self.data
     data.update({
             'type': u'元搜索',
             'origin_source': u'微信搜索',
             'key': world
     })
     homepage = "http://weixin.sogou.com/weixinwap?ie=utf8&w=&\
                 type=2&t=1427703547684&s_t=&fr=sgsearch&\
                 query=" + world + "&pg=webSearchList"
     # The line continuations above embed literal spaces in the URL;
     # clear_space() strips them before the request is made.
     homepage = clear_space(homepage)
     html_stream = _get_url(homepage)
     for item in HandleUrl.get_url(html_stream.text):
         item = HandleUrl.judge_url(item)
         # judge_url() returns '' for links that should be discarded.
         if item != '':
             Scheduler.schedule(ContentCrawler.type, key=item, data=data)