def work_loop(self):
    newhost = ""
    try:
        if VVQueue.CmsQueue.qsize() <= 0:  # nothing queued for fingerprinting, idle for a moment
            time.sleep(5)
            return
        newhost = VVQueue.CmsQueue.get(timeout=0.5)  # get() removes and returns the item at the head of the queue
        if not newhost:
            return
        print "[CMS][Thread:%d] [url:%s] run cms starting..." % (self.n, newhost)
        cmstmp = ''
        if self.open_url_200(newhost):  # the URL responds, so fingerprint its CMS
            cmstmp = self.CS_cms(newhost)
            return
        else:
            print "[CMS][Thread:%d] [url:%s] NO DATA RETURN!" % (self.n, newhost)
            # If the page is unreachable and the host is not already a subdomain,
            # retry it as (www.)domain.cn
            if not is_subdomain(newhost):
                newhost = 'www.' + newhost
                print "[CMS][Thread:%d] [url:%s] www run cms starting..." % (self.n, newhost)
                if self.open_url_200(newhost):  # the www. variant responds, fingerprint its CMS
                    cmstmp = self.CS_cms(newhost)
                else:
                    print "[CMS][Thread:%d] [url:%s]--www NO DATA RETURN!" % (self.n, newhost)
    except Exception, e:
        print "[CMS][Thread:%d] [url:%s] Exception:[%s]" % (self.n, newhost, e)
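# open_url_200() is called above (and again in the spider work_loop below) but its
# definition is not part of this excerpt. A minimal sketch of what such a
# reachability check could look like, assuming it only has to answer
# "did http://<host>/ come back with HTTP 200?" -- the name and signature are taken
# from the calls above, everything else is an assumption:
import urllib2

def open_url_200(self, host, timeout=10):
    # Hypothetical helper: True when http://<host>/ answers with status 200.
    try:
        req = urllib2.Request('http://' + host)
        req.add_header('User-Agent',
                       "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")
        resp = urllib2.urlopen(req, timeout=timeout)
        return resp.getcode() == 200
    except Exception:
        return False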
def get_url_lis1(self, url):
    try:
        self.list = []
        #self.list.append(data)  # append data
        req = urllib2.Request(url)
        req.add_header('User-Agent',
                       "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")
        s = urllib2.urlopen(req, timeout=10)  # 10-second timeout
        #s = urllib2.urlopen(r"http://www.163.com")
        ss = s.read()
        ##################
        # Alternative CMS fingerprinting on the raw page body
        data_cms = self.URL_CMS(ss)
        if len(data_cms) >= 1:
            hm_url = "http://webxscan.com/url_cms.php?url=%s&cms=%s" % (url, data_cms)
            self.url_post(hm_url)  # Shenlong backdoor: report the hit to webxscan.com
            print "[Spider][Thread:%d]-openurl cms-[URL:%s]-[cms:%s]" %\
                (self.n, url, data_cms)
        ##################
        # Build and compile the regular expression that matches each <a></a> tag
        p = re.compile(r'<a[\s\S]*?href=["]([\s\S]*?)["][\s\S]*?>[\s\S]*?</a>')
        sarr = p.findall(ss)
        for every in sarr:
            if not every:
                continue
            shref = every.replace("www.", "")
            if not self.startwithhttp(shref):  # check whether it is an http:// string
                shref = self.URL_TQURL(shref)  # extract the domain from the URL
                if is_legal_domain(shref):  # filter out illegal domains
                    if self.bool_2com:  # whether to collect second-level (sub)domains
                        if is_subdomain(shref):  # is the domain a subdomain: 1 yes, 0 no
                            self.list.append(shref)  # append the subdomain
                            a1 = trim_sdomain(shref)  # resolve the root domain
                            self.list.append(a1)  # append the root domain as well
                        else:
                            self.list.append(shref)  # append the domain
                    else:
                        self.list.append(shref)  # append the domain
        if len(self.list) >= 1:
            return set(s.strip() for s in self.list)
        return 0
    except Exception, e:
        return 0
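# startwithhttp() and URL_TQURL() are used by both crawlers in this excerpt but
# their bodies are not shown. A minimal sketch of what they might do, assuming
# startwithhttp() tests for an absolute http(s) link and URL_TQURL() pulls the
# bare host out of a full URL; both bodies here are guesses, only the names and
# call shapes come from the original code:
from urlparse import urlparse

def startwithhttp(self, href):
    # Hypothetical: True when the href is an absolute http:// or https:// link.
    return href.lower().startswith(('http://', 'https://'))

def URL_TQURL(self, href):
    # Hypothetical: extract the host ("example.com") from a full URL,
    # dropping scheme, path, query and port.
    host = urlparse(href).netloc
    return host.split(':')[0].strip().lower()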
def work_loop(self):
    try:
        if VVQueue.StoreQueue.qsize() >= 5000:
            # Skip this pass while the StoreQueue message queue has too many items waiting to be stored
            time.sleep(5)
            return
        starturl = VVQueue.ReadQueue.get(timeout=0.5)
        if not starturl:
            return
        urlnum = self.URL_DZ('http://' + starturl)
        # If nothing was crawled above and the host is not a subdomain, prepend www. and crawl once more
        if not urlnum and not is_subdomain(starturl):
            self.URL_DZ('http://www.' + starturl)
    except:
        pass
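# VVQueue is shared by every worker above but its definition is not part of this
# excerpt. Judging by the calls (qsize/get/put on ReadQueue, StoreQueue and
# CmsQueue), it could simply be a module exposing three thread-safe queues, for
# example; the capacities below are guesses, not values from the original code:
import Queue

ReadQueue  = Queue.Queue(maxsize=15000)  # domains waiting to be crawled (size is a guess)
StoreQueue = Queue.Queue(maxsize=6000)   # domains waiting to be written out (size is a guess)
CmsQueue   = Queue.Queue(maxsize=6000)   # domains waiting for CMS fingerprinting (size is a guess)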
def URL_DZ(self, URL):  # walk every link on the page
    self.urlset.clear()
    try:
        req = urllib2.Request(URL)
        req.add_header('User-Agent',
                       "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)")
        s = urllib2.urlopen(req, timeout=10)  # 10-second timeout
        #s = urllib2.urlopen(r"http://www.163.com")
        ss = s.read()
        # Build and compile the regular expression that matches each <a></a> tag
        p = re.compile(r'<a[\s\S]*?href=["]([\s\S]*?)["][\s\S]*?>[\s\S]*?</a>')
        sarr = p.findall(ss)
        for every in sarr:
            if self.urlset.isfull():
                print "[Spider][Thread:%d]-openurl-[URL:%s]-[time Over:%d] URL address" %\
                    (self.n, URL, self.urlset.getitemcount())
                break
            shref = every.replace("www.", "")
            if self.startwithhttp(shref):  # check whether it is an http:// string
                newdomain = self.URL_TQURL(shref)  # extract the domain from the URL
                if newdomain and is_legal_domain(newdomain):  # filter out illegal domains
                    if self.bool_2com and is_subdomain(newdomain):
                        #print newdomain
                        self.urlset.add(newdomain)  # add to the collected set
                        #continue
        for n in xrange(self.urlset.getitemcount()):
            tmpdomain = self.urlset.getitem(n)
            if not tmpdomain:
                continue
            #print tmpdomain
            if self.bool_com_cn:  # 0 no / 1 yes: whether to restrict the crawl scope
                if self.bool_for_com_cn_lis(tmpdomain):
                    # Store it for later; don't crawl it yet, crawl its subdomains first
                    VVQueue.StoreQueue.put(tmpdomain, timeout=0.1)
                #else:
                    # Don't store it, just let the spider crawl the page
                    #if VVQueue.ReadQueue.qsize() <= 14000:
                    #    VVQueue.ReadQueue.put(tmpdomain, 0.1)
            else:
                VVQueue.StoreQueue.put(tmpdomain, timeout=0.1)
        print "[Spider][Thread:%d]--[count url:%d]--[url:%s]--[time:%s]" %\
            (self.n, self.urlset.getitemcount(), URL, time.strftime('%Y.%m.%d-%H.%M.%S'))
    except Exception, e:
        print "[Spider][Thread:%d]--Exception--[url:%s]--[time:%s]\n[%s]" %\
            (self.n, URL, time.strftime('%Y.%m.%d-%H.%M.%S'), e)
    return self.urlset.getitemcount()
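# self.urlset is only touched through clear()/add()/isfull()/getitem()/getitemcount()
# above; its class is not in this excerpt. One plausible shape is a small capped,
# de-duplicating container such as the sketch below -- the class name, capacity and
# internals are assumptions, only the method names come from the calls above:
class UrlSet(object):
    # Hypothetical stand-in for the urlset object used by URL_DZ().
    def __init__(self, maxitems=100):
        self.maxitems = maxitems
        self.items = []

    def clear(self):
        self.items = []

    def isfull(self):
        return len(self.items) >= self.maxitems

    def add(self, domain):
        # Keep each domain once, up to the configured cap.
        if domain not in self.items and not self.isfull():
            self.items.append(domain)

    def getitem(self, n):
        if 0 <= n < len(self.items):
            return self.items[n]
        return None

    def getitemcount(self):
        return len(self.items)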
def work_loop(self):
    try:
        if VVQueue.StoreQueue.qsize() >= 5000:
            # Skip this pass while the StoreQueue message queue has too many items waiting to be stored
            time.sleep(5)
            return 0
        starturl = VVQueue.ReadQueue.get(timeout=0.5)
        if not starturl:
            return 0
        if not self.open_url_200(starturl):  # check whether the site opens at all, so collection can move faster
            print "[Spider][Thread:%d]--http_200--NO--[url:%s]--[time:%s]" %\
                (self.n, starturl, time.strftime('%Y.%m.%d-%H.%M.%S'))
            time.sleep(0.1)
            return 0
        urlnum = self.URL_DZ('http://' + starturl)
        # If nothing was crawled above and the host is not a subdomain, prepend www. and crawl once more
        if not urlnum and not is_subdomain(starturl):
            self.URL_DZ('http://www.' + starturl)
        return 0
    except:
        pass
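# None of the snippets above show how the work_loop() methods are driven. A minimal
# sketch of a driver, assuming each worker class takes a thread number n and is
# meant to have work_loop() called repeatedly until the program stops; run_worker,
# start_workers, SpiderWorker and CmsWorker are placeholder names, not from the
# original code:
import threading
import time

def run_worker(worker):
    while True:
        worker.work_loop()  # one pass: pull a host from its queue, crawl or fingerprint it
        time.sleep(0.01)    # small pause so an empty queue does not spin the CPU

def start_workers(worker_classes, threads_per_class=5):
    for cls in worker_classes:
        for n in xrange(threads_per_class):
            t = threading.Thread(target=run_worker, args=(cls(n),))
            t.setDaemon(True)  # let the process exit even if workers are blocked
            t.start()

# Example (assuming SpiderWorker and CmsWorker wrap the methods above):
# start_workers([SpiderWorker, CmsWorker])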