Example #1
    def work_loop(self):
        newhost = ""
        try:
            if VVQueue.CmsQueue.qsize() <= 0:        # check whether the queue is empty
                time.sleep(5)
                return

            newhost = VVQueue.CmsQueue.get(timeout=0.5)  # get() removes and returns an item from the head of the queue
            if not newhost:
                return

            print "[CMS][Thread:%d] [url:%s] run cms starting..." % (self.n, newhost)
            cmstmp = ''
            if self.open_url_200(newhost):
                # probe the URL for a CMS match
                cmstmp = self.CS_cms(newhost)
                return
            else:
                print "[CMS][Thread:%d] [url:%s] NO DATA RETURN!" % (self.n, newhost)

            # if the page is unreachable and the host is not a subdomain, also check (www.)domain.cn
            if not is_subdomain(newhost):
                newhost = 'www.' + newhost
                print "[CMS][Thread:%d] [url:%s] www run cms starting..." % (self.n, newhost)
                if self.open_url_200(newhost):
                    # probe the URL for a CMS match
                    cmstmp = self.CS_cms(newhost)
                else:
                    print "[CMS][Thread:%d] [url:%s]--www NO DATA RETURN!" % (self.n, newhost)
        except Exception, e:
            print "[CMS][Thread:%d] [url:%s] Exception:[%s]" % (self.n, newhost, e)
Example #2
    def get_url_lis1(self, url):
        try:
            self.list = []  # collected URLs; filled below with self.list.append(data)
            req = urllib2.Request(url)
            req.add_header(
                'User-Agent',
                "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.nml)"
            )
            s = urllib2.urlopen(
                req, timeout=10
            )  # 10-second timeout   #s = urllib2.urlopen(r"http://www.163.com")
            ss = s.read()

            ##################
            # alternative CMS fingerprinting
            data_cms = self.URL_CMS(ss)
            if len(data_cms) >= 1:
                hm_url = "http://webxscan.com/url_cms.php?url=%s&cms=%s" % (
                    url, data_cms)
                self.url_post(hm_url)  # "Shenlong" backdoor: report the hit back home
                print "[Spider][Thread:%d]-openurl  cms-[URL:%s]-[cms:%s]"%\
                      (self.n, url, data_cms)
            ##################
            # build and compile the regular expression
            p = re.compile(
                r'<a[\s\S]*?href=["]([\s\S]*?)["][\s\S]*?>[\s\S]*?</a>')
            # pull out the <a></a> tags one by one
            sarr = p.findall(ss)
            for every in sarr:
                if not every:
                    continue
                shref = every.replace("www.", "")
                if not self.startwithhttp(shref):  # check whether it starts with http
                    shref = self.URL_TQURL(shref)  # extract the domain from the URL
                    if is_legal_domain(shref):  # filter out disallowed domains
                        if self.bool_2com:  # whether to collect second-level (sub)domains
                            if is_subdomain(shref):  # is the domain a subdomain? 1 yes, 0 no
                                self.list.append(shref)  # add to the list

                                a1 = trim_sdomain(shref)  # derive the parent domain
                                self.list.append(a1)  # add to the list
                            else:
                                self.list.append(shref)  # add to the list
                        else:
                            self.list.append(shref)  # add to the list
            if len(self.list) >= 1:
                return set(s.strip() for s in self.list)
            return 0
        except Exception, e:
            return 0
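
The link extraction in get_url_lis1 hinges on a single non-greedy regular expression that captures whatever sits between href=" and the closing double quote of each <a></a> tag. A small self-contained sketch of how that pattern behaves (the sample markup is made up for illustration):

import re

# same pattern as used in get_url_lis1 / URL_DZ
p = re.compile(r'<a[\s\S]*?href=["]([\s\S]*?)["][\s\S]*?>[\s\S]*?</a>')

html = '''<a href="http://example.com/">home</a>
<a class="nav" href="http://news.example.org/page">news</a>
<a href='/single-quoted'>skipped</a>'''

print p.findall(html)
# ['http://example.com/', 'http://news.example.org/page']
# hrefs wrapped in single quotes are not matched, because the pattern only looks for double quotes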
Example #3
    def work_loop(self):
        try:
            if VVQueue.StoreQueue.qsize() >= 5000:
                # if the StoreQueue message queue has too many items waiting to be stored, skip this iteration
                time.sleep(5)
                return
            starturl = VVQueue.ReadQueue.get(timeout=0.5)
            if not starturl:
                return
            urlnum = self.URL_DZ('http://' + starturl)
            # if nothing was fetched above and the host is not a subdomain, prepend www. and crawl again
            if not urlnum and not is_subdomain(starturl):
                self.URL_DZ('http://www.' + starturl)
        except:
            pass
Example #4
    def URL_DZ(self, URL):
        # walk the link addresses on the page
        self.urlset.clear()
        try:
            req = urllib2.Request(URL)
            req.add_header('User-Agent', "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.nml)")
            s = urllib2.urlopen(req, timeout=10)  # 10-second timeout   #s = urllib2.urlopen(r"http://www.163.com")
            ss = s.read()
            # build and compile the regular expression
            p = re.compile(r'<a[\s\S]*?href=["]([\s\S]*?)["][\s\S]*?>[\s\S]*?</a>')
            # pull out the <a></a> tags one by one
            sarr = p.findall(ss)
            for every in sarr:
                if self.urlset.isfull():
                    print "[Spider][Thread:%d]-openurl-[URL:%s]-[time Over:%d] URL address"%\
                          (self.n, URL, self.urlset.getitemcount())
                    break
                shref = every.replace("www.", "")
                if self.startwithhttp(shref):   # check whether it starts with http
                    newdomain = self.URL_TQURL(shref)  # extract the domain from the URL
                    if newdomain and is_legal_domain(newdomain):   # filter out disallowed domains
                        if self.bool_2com and is_subdomain(newdomain):
                            #print newdomain
                            self.urlset.add(newdomain)   # add to the url set
                            #continue

            for n in xrange(self.urlset.getitemcount()):
                tmpdomain = self.urlset.getitem(n)
                if not tmpdomain:
                    continue
                #print tmpdomain
                if self.bool_com_cn:   # 0 no / 1 yes: whether to restrict the crawl scope
                    if self.bool_for_com_cn_lis(tmpdomain):
                        # store it for later; crawl its subdomains first instead of this one
                        VVQueue.StoreQueue.put(tmpdomain, timeout=0.1)
                        #else:
                        # do not store it, just let the spider crawl the page
                        #if VVQueue.ReadQueue.qsize() <= 14000:
                        #    VVQueue.ReadQueue.put(tmpdomain, 0.1)
                else:
                    VVQueue.StoreQueue.put(tmpdomain, timeout=0.1)

            print "[Spider][Thread:%d]--[count url:%d]--[url:%s]--[time:%s]" %\
                  (self.n, self.urlset.getitemcount(), URL, time.strftime('%Y.%m.%d-%H.%M.%S'))
        except Exception, e:
            print "[Spider][Thread:%d]--Exception--[url:%s]--[time:%s]\n[%s]" %\
                  (self.n, URL, time.strftime('%Y.%m.%d-%H.%M.%S'), e)
            return self.urlset.getitemcount()
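
URL_DZ leans on a small container behind self.urlset that exposes clear(), add(), isfull(), getitem() and getitemcount(). That class is not shown in these examples; a minimal stand-in, under the assumption that it is a capped, de-duplicating, order-preserving set of domains (the cap of 500 is made up), could look like this:

class UrlSet(object):
    # hypothetical stand-in for self.urlset: a capped, order-preserving set of domains

    def __init__(self, maxitems=500):
        self.maxitems = maxitems
        self.items = []

    def clear(self):
        del self.items[:]

    def isfull(self):
        return len(self.items) >= self.maxitems

    def add(self, domain):
        # ignore duplicates and anything beyond the cap
        if not self.isfull() and domain not in self.items:
            self.items.append(domain)

    def getitemcount(self):
        return len(self.items)

    def getitem(self, n):
        if 0 <= n < len(self.items):
            return self.items[n]
        return None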
Example #5
    def work_loop(self):
        try:
            if VVQueue.StoreQueue.qsize() >= 5000:
                # if the StoreQueue message queue has too many items waiting to be stored, skip this iteration
                time.sleep(5)
                return 0
            starturl = VVQueue.ReadQueue.get(timeout=0.5)
            if not starturl:
                return 0

            if not self.open_url_200(starturl):  # check whether the site responds at all, so dead hosts are skipped quickly
                print "[Spider][Thread:%d]--http_200--NO--[url:%s]--[time:%s]" %\
                      (self.n, starturl, time.strftime('%Y.%m.%d-%H.%M.%S'))
                time.sleep(0.1)
                return 0
            urlnum = self.URL_DZ('http://' + starturl)
            # if nothing was fetched above and the host is not a subdomain, prepend www. and crawl again
            if not urlnum and not is_subdomain(starturl):
                self.URL_DZ('http://www.' + starturl)

            return 0
        except:
            pass
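
open_url_200 is called in examples #1 and #5 but not defined here; judging by its name and the surrounding comments it is a quick reachability probe. A hedged sketch of such a check with urllib2 (the 5-second timeout and the standalone-function form are assumptions, not the project's actual implementation):

import urllib2

def open_url_200(host, timeout=5):
    # hypothetical probe: True only if http://<host> answers with HTTP 200
    try:
        req = urllib2.Request('http://' + host)
        req.add_header('User-Agent', "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.nml)")
        resp = urllib2.urlopen(req, timeout=timeout)
        return resp.getcode() == 200
    except Exception:
        return False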