def getchildurl(self, url, data=None):
    """Scan *url* for news article links and schedule each for crawling.

    Args:
        url: Listing page whose anchors are extracted.
        data: Optional dict passed through to the scheduled crawler task.
    """
    # Fix: ``data={}`` was a shared mutable default argument.
    if data is None:
        data = {}
    html_stream = _get_url(url)
    # Article pages live under the news/{zjpd,xfpd,zhuanti,zgzlb} sections
    # and end in a digit followed by .htm/.html/.net.
    pattern = r'^(http|https).+(news)\/(zjpd|xfpd|zhuanti|zgzlb).+\d\.(htm|html|net)$'
    for item in HandleUrl.get_url(html_stream.text):
        if re.match(pattern, item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data=data)
def crawl(self):
    """Crawl the jxzj.gov.cn homepage and schedule every news page found."""
    homepage = "http://www.jxzj.gov.cn/jxzj/index.html"
    html_stream = _get_url(homepage)
    # Hoisted out of the loop: absolute URLs containing "news" that end
    # in .htm/.html/.net.
    pattern = r'^(http|https).+(news).+\.(htm|html|net)$'
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if re.match(pattern, item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def getchildurl(self, url, data=None):
    """Collect news article links from *url* and schedule them.

    Args:
        url: Listing page whose anchors are extracted.
        data: Optional dict forwarded to the scheduled crawler task.
    """
    # Fix: ``data={}`` was a shared mutable default argument.
    if data is None:
        data = {}
    html_stream = _get_url(url)
    # Articles under the news/{zjpd,xfpd,zhuanti,zgzlb} sections ending in
    # a digit plus .htm/.html/.net.
    pattern = r'^(http|https).+(news)\/(zjpd|xfpd|zhuanti|zgzlb).+\d\.(htm|html|net)$'
    for item in HandleUrl.get_url(html_stream.text):
        if re.match(pattern, item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data=data)
def crawl(self):
    """Crawl the gzq.gov.cn homepage and schedule matching "public" pages."""
    homepage = "http://www.gzq.gov.cn/"
    html_stream = _get_url(homepage)
    # URLs containing "public" and ending in a digit. NOTE(review): the
    # ``.+.+`` in the original pattern is redundant (equivalent to ``..+``)
    # but is kept verbatim to preserve matching behavior.
    pattern = r'^(http|https).+(public).+.+\d$'
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if re.match(pattern, item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def firstcrawler(self, homepage):
    """Find provincial index pages linked from *homepage*; crawl each once.

    Args:
        homepage: Root URL whose anchors are scanned.
    """
    html_stream = _get_url(homepage)
    # Per-province index pages, e.g. ".../bj/Index.html".
    # NOTE(review): the dot in "Index.html" is unescaped, so the pattern
    # also matches e.g. "IndexXhtml" — confirm whether that is intended.
    pattern = (r'.+\/(bj|tj|hb|sx|nmg|ln|jl|hlj|sh|js|zj|ah|fj|jx|sd|hn|'
               r'hub|hun|gd|gx|hain|cq|sc|gz|yn|xz|shx|gs|qh|nx|xj)'
               r'\/Index.html$')
    seen = set()  # set gives O(1) de-duplication; the original used a list
    for item in HandleUrl.get_url(html_stream.text):
        if re.match(pattern, item) is not None and item not in seen:
            self.getchildurl(HandleUrl.join_url_path(homepage, item))
            seen.add(item)
def crawl(self):
    """Crawl the hzqts.gov.cn government-affairs index; schedule pages."""
    homepage = "http://www.hzqts.gov.cn/zwpd/index.htm"
    html_stream = _get_url(homepage)
    # NOTE(review): ``[^(Index)]`` is a character class (one char that is
    # none of ``( I n d e x )``), not a negative lookahead — it likely does
    # not exclude "Index" pages as intended. Kept verbatim to preserve
    # behavior; confirm intent before changing.
    pattern = r'^(http|https).+(zwpd|qypd|smpd).+[^(Index)]\.(htm|html|net)$'
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if re.match(pattern, item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def firstcrawler(self, homepage):
    """Scan *homepage* for provincial index links and crawl each one once.

    Args:
        homepage: Root URL whose anchors are scanned.
    """
    html_stream = _get_url(homepage)
    # Matches per-province index pages such as ".../zj/Index.html".
    # NOTE(review): the dot in "Index.html" is unescaped and matches any
    # character — confirm whether that looseness is intended.
    pattern = (r'.+\/(bj|tj|hb|sx|nmg|ln|jl|hlj|sh|js|zj|ah|fj|jx|sd|hn|'
               r'hub|hun|gd|gx|hain|cq|sc|gz|yn|xz|shx|gs|qh|nx|xj)'
               r'\/Index.html$')
    visited = set()  # O(1) membership; the original list scan was O(n)
    for item in HandleUrl.get_url(html_stream.text):
        if re.match(pattern, item) is not None and item not in visited:
            self.getchildurl(HandleUrl.join_url_path(homepage, item))
            visited.add(item)
def crawl(self):
    """Crawl the hzqts.gov.cn zwpd index page and schedule content pages."""
    homepage = "http://www.hzqts.gov.cn/zwpd/index.htm"
    html_stream = _get_url(homepage)
    # NOTE(review): ``[^(Index)]`` is a one-character class, not a negative
    # lookahead, so it probably does not exclude "Index" pages as the
    # author intended. Kept verbatim to preserve behavior.
    pattern = r'^(http|https).+(zwpd|qypd|smpd).+[^(Index)]\.(htm|html|net)$'
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if re.match(pattern, item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl the hbzljd.gov.cn homepage and schedule article pages."""
    homepage = "http://www.hbzljd.gov.cn/"
    html_stream = _get_url(homepage)
    # Any absolute URL ending in a digit plus .htm/.html/.net.
    pattern = r'^(http|https).+\d\.(htm|html|net)$'
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if re.match(pattern, item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl fsjsjd.gov.cn; schedule article pages with their publish time."""
    homepage = "http://www.fsjsjd.gov.cn/"
    html_stream = _get_url(homepage)
    # Hoisted out of the loop: article URLs end in a digit + .htm/.html/.net.
    pattern = r'^(http|https).+\d\.(htm|html|net)$'
    for item in HandleUrl.get_url(html_stream.text):
        # The publish time sits in the text of the link's parent element.
        xp_putime = "//a[@href='%s']/parent::*/text()" % item
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        item = HandleUrl.judge_url(item, homepage)
        if re.match(pattern, item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item,
                               data={'pubtime': pubtime})
def crawl(self):
    """Crawl the bjtsb.gov.cn homepage and schedule "infoview" pages."""
    homepage = "http://www.bjtsb.gov.cn/"
    html_stream = _get_url(homepage)
    # Fix: the original used a ``ur''`` prefix, which is a syntax error on
    # Python 3; the pattern is pure ASCII, so a plain raw string matches
    # identically.
    pattern = r'(http).+(infoview).+\d{3,8}$'
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if re.match(pattern, item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl fsjsjd.gov.cn, attaching each article's publish time.

    The publish time is extracted from the text of the anchor's parent
    element before the link itself is normalized and scheduled.
    """
    homepage = "http://www.fsjsjd.gov.cn/"
    html_stream = _get_url(homepage)
    # Hoisted out of the loop: article URLs end in a digit + .htm/.html/.net.
    pattern = r'^(http|https).+\d\.(htm|html|net)$'
    for item in HandleUrl.get_url(html_stream.text):
        xp_putime = "//a[@href='%s']/parent::*/text()" % item
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        item = HandleUrl.judge_url(item, homepage)
        if re.match(pattern, item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item,
                               data={'pubtime': pubtime})
def crawl(self):
    """Meta-search: query Sogou WeChat search for ``self.key``, schedule hits."""
    world = self.key
    data = self.data
    data.update({'type': u'元搜索',
                 'origin_source': u'微信搜索',
                 'key': world})
    # Implicit string concatenation replaces the original backslash line
    # continuations inside the literal; clear_space still strips any stray
    # whitespace, so the resulting URL is identical.
    homepage = ("http://weixin.sogou.com/weixinwap?ie=utf8&w=&"
                "type=2&t=1427703547684&s_t=&fr=sgsearch&"
                "query=" + world + "&pg=webSearchList")
    homepage = clear_space(homepage)
    html_stream = _get_url(homepage)
    # Fix: removed the unused ``list_url`` local.
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item)
        if item == '':
            continue
        Scheduler.schedule(ContentCrawler.type, key=item, data=data)
def crawl(self):
    """Run a Sogou WeChat meta-search for ``self.key`` and schedule results."""
    world = self.key
    data = self.data
    data.update({'type': u'元搜索',
                 'origin_source': u'微信搜索',
                 'key': world})
    # The query URL is assembled via implicit concatenation instead of the
    # original backslash continuations inside the literal; clear_space
    # strips any stray whitespace either way, so the URL is unchanged.
    homepage = ("http://weixin.sogou.com/weixinwap?ie=utf8&w=&"
                "type=2&t=1427703547684&s_t=&fr=sgsearch&"
                "query=" + world + "&pg=webSearchList")
    homepage = clear_space(homepage)
    html_stream = _get_url(homepage)
    # Fix: dropped the unused ``list_url`` local.
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item)
        if item == '':
            continue
        Scheduler.schedule(ContentCrawler.type, key=item, data=data)