def crawl(self):
    """Crawl the jxzj.gov.cn homepage and schedule news-article pages.

    Extracts every link from the homepage, normalizes it against the
    site root, and schedules a content crawl for each URL that looks
    like a news article page.
    """
    homepage = "http://www.jxzj.gov.cn/jxzj/index.html"
    html_stream = _get_url(homepage)
    # Article pages: absolute http(s) URLs containing "news" and ending
    # in a page suffix. Compiled once, outside the loop; raw string so
    # the \. escape reaches the regex engine intact.
    article_re = re.compile(r'^(http|https).+(news).+\.(htm|html|net)$')
    for item in HandleUrl.get_url(html_stream.text):
        # Normalize relative links against the homepage; presumably
        # returns a non-matching string for unusable links — verify.
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl the gzq.gov.cn homepage and schedule public-info pages.

    Extracts every link from the homepage, normalizes it against the
    site root, and schedules a content crawl for each URL under a
    "public" path that ends in a digit.
    """
    homepage = "http://www.gzq.gov.cn/"
    html_stream = _get_url(homepage)
    # Raw string fixes the invalid '\d' escape of the original; the
    # redundant '.+.+' is kept verbatim to preserve matching behavior.
    article_re = re.compile(r'^(http|https).+(public).+.+\d$')
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl the hzqts.gov.cn government-affairs page and schedule articles.

    Extracts every link, normalizes it against the section homepage,
    and schedules a content crawl for article-looking URLs in the
    zwpd/qypd/smpd sections.
    """
    homepage = "http://www.hzqts.gov.cn/zwpd/index.htm"
    html_stream = _get_url(homepage)
    # NOTE(review): '[^(Index)]' is a character class (one char not in
    # {(,I,n,d,e,x,)}), not a "does not contain Index" exclusion — it
    # was probably meant as a negative lookahead. Kept verbatim to
    # preserve behavior; confirm intent before changing.
    article_re = re.compile(
        r'^(http|https).+(zwpd|qypd|smpd).+[^(Index)]\.(htm|html|net)$')
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl the hzqts.gov.cn government-affairs page and schedule articles.

    Extracts every link, normalizes it against the section homepage,
    and schedules a content crawl for article-looking URLs in the
    zwpd/qypd/smpd sections.
    """
    homepage = "http://www.hzqts.gov.cn/zwpd/index.htm"
    html_stream = _get_url(homepage)
    # NOTE(review): '[^(Index)]' is a character class (one char not in
    # {(,I,n,d,e,x,)}), not a "does not contain Index" exclusion — it
    # was probably meant as a negative lookahead. Kept verbatim to
    # preserve behavior; confirm intent before changing.
    article_re = re.compile(
        r'^(http|https).+(zwpd|qypd|smpd).+[^(Index)]\.(htm|html|net)$')
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl the hbzljd.gov.cn homepage and schedule article pages.

    Extracts every link from the homepage, normalizes it against the
    site root, and schedules a content crawl for each URL whose name
    ends with a digit before the page suffix.
    """
    homepage = "http://www.hbzljd.gov.cn/"
    html_stream = _get_url(homepage)
    # Compiled once outside the loop; raw string fixes the '\d' and
    # '\.' escapes of the original non-raw pattern.
    article_re = re.compile(r'^(http|https).+\d\.(htm|html|net)$')
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl the fsjsjd.gov.cn homepage and schedule article pages.

    For every link on the homepage, grabs a publish time from the text
    of the anchor's parent node, then normalizes the link and schedules
    a content crawl (carrying the publish time) for article-looking
    URLs ending in a digit before the page suffix.
    """
    homepage = "http://www.fsjsjd.gov.cn/"
    html_stream = _get_url(homepage)
    article_re = re.compile(r'^(http|https).+\d\.(htm|html|net)$')
    for item in HandleUrl.get_url(html_stream.text):
        # The xpath must use the raw href as it appears in the page —
        # build it before judge_url rewrites the link.
        xp_putime = "//a[@href='%s']/parent::*/text()" % item
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item,
                               data={'pubtime': pubtime})
def crawl(self):
    """Crawl the bjtsb.gov.cn homepage and schedule infoview pages.

    Extracts every link from the homepage, normalizes it against the
    site root, and schedules a content crawl for each "infoview" URL
    ending in a 3-8 digit id.
    """
    homepage = "http://www.bjtsb.gov.cn/"
    html_stream = _get_url(homepage)
    # r'...' replaces the original ur'...' (a SyntaxError on Python 3);
    # the pattern text is unchanged. re.match anchors at the start, so
    # the missing '^' does not change behavior.
    article_re = re.compile(r'(http).+(infoview).+\d{3,8}$')
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl the fsjsjd.gov.cn homepage and schedule article pages.

    For every link on the homepage, grabs a publish time from the text
    of the anchor's parent node, then normalizes the link and schedules
    a content crawl (carrying the publish time) for article-looking
    URLs ending in a digit before the page suffix.
    """
    homepage = "http://www.fsjsjd.gov.cn/"
    html_stream = _get_url(homepage)
    article_re = re.compile(r'^(http|https).+\d\.(htm|html|net)$')
    for item in HandleUrl.get_url(html_stream.text):
        # The xpath must use the raw href as it appears in the page —
        # build it before judge_url rewrites the link.
        xp_putime = "//a[@href='%s']/parent::*/text()" % item
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item,
                               data={'pubtime': pubtime})
def crawl(self):
    """Meta-search crawl: query sogou weixin search for self.key.

    Tags the job data with the search origin, builds the search URL
    for the keyword, and schedules a content crawl for every
    resolvable link found in the result page.
    """
    world = self.key
    data = self.data
    data.update({'type': u'元搜索',
                 'origin_source': u'微信搜索',
                 'key': world})
    # The original built this URL with backslash line-continuations
    # that leaked indentation spaces into the literal; adjacent-literal
    # concatenation avoids that. clear_space() is kept so whitespace
    # inside the keyword itself is still stripped, as before.
    homepage = ("http://weixin.sogou.com/weixinwap?ie=utf8&w=&"
                "type=2&t=1427703547684&s_t=&fr=sgsearch&"
                "query=" + world + "&pg=webSearchList")
    homepage = clear_space(homepage)
    html_stream = _get_url(homepage)
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item)
        # judge_url returns '' for unusable links — skip those.
        if item == '':
            continue
        Scheduler.schedule(ContentCrawler.type, key=item, data=data)
def crawl(self):
    """Meta-search crawl: query sogou weixin search for self.key.

    Tags the job data with the search origin, builds the search URL
    for the keyword, and schedules a content crawl for every
    resolvable link found in the result page.
    """
    world = self.key
    data = self.data
    data.update({'type': u'元搜索',
                 'origin_source': u'微信搜索',
                 'key': world})
    # The original built this URL with backslash line-continuations
    # that leaked indentation spaces into the literal; adjacent-literal
    # concatenation avoids that. clear_space() is kept so whitespace
    # inside the keyword itself is still stripped, as before.
    homepage = ("http://weixin.sogou.com/weixinwap?ie=utf8&w=&"
                "type=2&t=1427703547684&s_t=&fr=sgsearch&"
                "query=" + world + "&pg=webSearchList")
    homepage = clear_space(homepage)
    html_stream = _get_url(homepage)
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item)
        # judge_url returns '' for unusable links — skip those.
        if item == '':
            continue
        Scheduler.schedule(ContentCrawler.type, key=item, data=data)