def crawl(self):
    """Crawl the jxzj.gov.cn homepage and schedule news-article pages.

    Extracts every link from the homepage, normalizes it against the
    site root, and schedules a content crawl for each URL that looks
    like a news article page.
    """
    homepage = "http://www.jxzj.gov.cn/jxzj/index.html"
    html_stream = _get_url(homepage)
    # Article pages: absolute http(s) URLs containing "news" and ending
    # in a page suffix. Compiled once, outside the loop; raw string so
    # the \. escape reaches the regex engine intact.
    article_re = re.compile(r'^(http|https).+(news).+\.(htm|html|net)$')
    for item in HandleUrl.get_url(html_stream.text):
        # Normalize relative links against the homepage; presumably
        # returns a non-matching string for unusable links — verify.
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl the gzq.gov.cn homepage and schedule public-info pages.

    Extracts every link from the homepage, normalizes it against the
    site root, and schedules a content crawl for each URL under a
    "public" path that ends in a digit.
    """
    homepage = "http://www.gzq.gov.cn/"
    html_stream = _get_url(homepage)
    # Raw string fixes the invalid '\d' escape of the original; the
    # redundant '.+.+' is kept verbatim to preserve matching behavior.
    article_re = re.compile(r'^(http|https).+(public).+.+\d$')
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl the hzqts.gov.cn government-affairs page and schedule articles.

    Extracts every link, normalizes it against the section homepage,
    and schedules a content crawl for article-looking URLs in the
    zwpd/qypd/smpd sections.
    """
    homepage = "http://www.hzqts.gov.cn/zwpd/index.htm"
    html_stream = _get_url(homepage)
    # NOTE(review): '[^(Index)]' is a character class (one char not in
    # {(,I,n,d,e,x,)}), not a "does not contain Index" exclusion — it
    # was probably meant as a negative lookahead. Kept verbatim to
    # preserve behavior; confirm intent before changing.
    article_re = re.compile(
        r'^(http|https).+(zwpd|qypd|smpd).+[^(Index)]\.(htm|html|net)$')
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl the hzqts.gov.cn government-affairs page and schedule articles.

    Extracts every link, normalizes it against the section homepage,
    and schedules a content crawl for article-looking URLs in the
    zwpd/qypd/smpd sections.
    """
    homepage = "http://www.hzqts.gov.cn/zwpd/index.htm"
    html_stream = _get_url(homepage)
    # NOTE(review): '[^(Index)]' is a character class (one char not in
    # {(,I,n,d,e,x,)}), not a "does not contain Index" exclusion — it
    # was probably meant as a negative lookahead. Kept verbatim to
    # preserve behavior; confirm intent before changing.
    article_re = re.compile(
        r'^(http|https).+(zwpd|qypd|smpd).+[^(Index)]\.(htm|html|net)$')
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl the hbzljd.gov.cn homepage and schedule article pages.

    Extracts every link from the homepage, normalizes it against the
    site root, and schedules a content crawl for each URL whose name
    ends with a digit before the page suffix.
    """
    homepage = "http://www.hbzljd.gov.cn/"
    html_stream = _get_url(homepage)
    # Compiled once outside the loop; raw string fixes the '\d' and
    # '\.' escapes of the original non-raw pattern.
    article_re = re.compile(r'^(http|https).+\d\.(htm|html|net)$')
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl the fsjsjd.gov.cn homepage and schedule article pages.

    For every link on the homepage, grabs a publish time from the text
    of the anchor's parent node, then normalizes the link and schedules
    a content crawl (carrying the publish time) for article-looking
    URLs ending in a digit before the page suffix.
    """
    homepage = "http://www.fsjsjd.gov.cn/"
    html_stream = _get_url(homepage)
    article_re = re.compile(r'^(http|https).+\d\.(htm|html|net)$')
    for item in HandleUrl.get_url(html_stream.text):
        # The xpath must use the raw href as it appears in the page —
        # build it before judge_url rewrites the link.
        xp_putime = "//a[@href='%s']/parent::*/text()" % item
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item,
                               data={'pubtime': pubtime})
def crawl(self):
    """Crawl the bjtsb.gov.cn homepage and schedule infoview pages.

    Extracts every link from the homepage, normalizes it against the
    site root, and schedules a content crawl for each "infoview" URL
    ending in a 3-8 digit id.
    """
    homepage = "http://www.bjtsb.gov.cn/"
    html_stream = _get_url(homepage)
    # r'...' replaces the original ur'...' (a SyntaxError on Python 3);
    # the pattern text is unchanged. re.match anchors at the start, so
    # the missing '^' does not change behavior.
    article_re = re.compile(r'(http).+(infoview).+\d{3,8}$')
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item, data={})
def crawl(self):
    """Crawl the fsjsjd.gov.cn homepage and schedule article pages.

    For every link on the homepage, grabs a publish time from the text
    of the anchor's parent node, then normalizes the link and schedules
    a content crawl (carrying the publish time) for article-looking
    URLs ending in a digit before the page suffix.
    """
    homepage = "http://www.fsjsjd.gov.cn/"
    html_stream = _get_url(homepage)
    article_re = re.compile(r'^(http|https).+\d\.(htm|html|net)$')
    for item in HandleUrl.get_url(html_stream.text):
        # The xpath must use the raw href as it appears in the page —
        # build it before judge_url rewrites the link.
        xp_putime = "//a[@href='%s']/parent::*/text()" % item
        pubtime = HandleContent.get_pubtime(html_stream, xpath=xp_putime)
        item = HandleUrl.judge_url(item, homepage)
        if article_re.match(item) is not None:
            Scheduler.schedule(ContentCrawler.type, key=item,
                               data={'pubtime': pubtime})
def crawl(self):
    """Meta-search crawl: query sogou weixin search for self.key.

    Tags the job data with the search origin, builds the search URL
    for the keyword, and schedules a content crawl for every
    resolvable link found in the result page.
    """
    world = self.key
    data = self.data
    data.update({'type': u'元搜索',
                 'origin_source': u'微信搜索',
                 'key': world})
    # The original built this URL with backslash line-continuations
    # that leaked indentation spaces into the literal; adjacent-literal
    # concatenation avoids that. clear_space() is kept so whitespace
    # inside the keyword itself is still stripped, as before.
    homepage = ("http://weixin.sogou.com/weixinwap?ie=utf8&w=&"
                "type=2&t=1427703547684&s_t=&fr=sgsearch&"
                "query=" + world + "&pg=webSearchList")
    homepage = clear_space(homepage)
    html_stream = _get_url(homepage)
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item)
        # judge_url returns '' for unusable links — skip those.
        if item == '':
            continue
        Scheduler.schedule(ContentCrawler.type, key=item, data=data)
def crawl(self):
    """Meta-search crawl: query sogou weixin search for self.key.

    Tags the job data with the search origin, builds the search URL
    for the keyword, and schedules a content crawl for every
    resolvable link found in the result page.
    """
    world = self.key
    data = self.data
    data.update({'type': u'元搜索',
                 'origin_source': u'微信搜索',
                 'key': world})
    # The original built this URL with backslash line-continuations
    # that leaked indentation spaces into the literal; adjacent-literal
    # concatenation avoids that. clear_space() is kept so whitespace
    # inside the keyword itself is still stripped, as before.
    homepage = ("http://weixin.sogou.com/weixinwap?ie=utf8&w=&"
                "type=2&t=1427703547684&s_t=&fr=sgsearch&"
                "query=" + world + "&pg=webSearchList")
    homepage = clear_space(homepage)
    html_stream = _get_url(homepage)
    for item in HandleUrl.get_url(html_stream.text):
        item = HandleUrl.judge_url(item)
        # judge_url returns '' for unusable links — skip those.
        if item == '':
            continue
        Scheduler.schedule(ContentCrawler.type, key=item, data=data)