Example no. 1
def __init__(self, urltuples, urlqueue, keys, dbfile, depth):
    threading.Thread.__init__(self)
    self.urltuples = urltuples      # (0, url) -> 0: crawl depth, url: link
    self.urlqueue = urlqueue        # queue holding the urls to crawl
    self.keys = keys                # keyword to search for
    self.savedatabase = SaveDatabase(dbfile)        # database file
    self.depth = depth              # maximum crawl depth
    self.urls = []
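
A minimal sketch of how this constructor might be driven, assuming the method belongs to the ClawThread worker shown in the next example and that the shared queue holds (depth, url) tuples; the start URL, keyword, database file, depth and thread count below are placeholders, not values from the original project.

import Queue

starturl = 'http://example.com'          # placeholder start page
urlqueue = Queue.Queue()
urlqueue.put((0, starturl))              # seed the queue at depth 0

# spawn a few workers sharing the same queue, keyword and database file
threads = [ClawThread((0, starturl), urlqueue, 'python', 'claw.db', 2)
           for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()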
Example no. 2
import re
import threading
import urllib2
from BeautifulSoup import BeautifulSoup    # with bs4: from bs4 import BeautifulSoup
# GetPage, SaveDatabase and log are project-specific helpers defined elsewhere.

class ClawThread(threading.Thread):
    '''
    Worker class for the concurrent, multithreaded crawler
    '''
    def __init__(self, urltuples, urlqueue, keys, dbfile, depth):
        threading.Thread.__init__(self)
        self.urltuples = urltuples      # (0, url) -> 0: crawl depth, url: link
        self.urlqueue = urlqueue        # queue holding the urls to crawl
        self.keys = keys                # keyword to search for
        self.savedatabase = SaveDatabase(dbfile)        # database file
        self.depth = depth              # maximum crawl depth
        self.urls = []

    def run(self):
        while not self.urlqueue.empty():
            urltuples = self.urlqueue.get()     # take one url tuple off the queue
            currentdepth, url = urltuples
            self.add_urlqueue(urltuples)        # parse all urls on that page and add them to the queue
            print 'clawing url: %s ----- %s' % (url, currentdepth)
            log.info('clawing url: %s ----- %s' % (url, currentdepth))
            self.claw_start(urltuples, url, currentdepth)       # look for the keyword and store matches in the database
            print self.urlqueue.qsize()
            print '----------------clawing-----------------------'
        self.savedatabase.close()

    def claw_start(self, urltuples, url, currentdepth):
        ''' Fetch the page and call find_key_savedb to save the data '''
        htmlsource = GetPage(urltuples).get_html()
        if htmlsource:
            self.find_key_savedb(url, currentdepth, htmlsource)
        self.urlqueue.task_done()

    def find_key_savedb(self, url, currentdepth, htmlsource):
        ''' Search for the keyword; store the url and related data of matching pages in the database '''
        if self.keys:
            soup = BeautifulSoup(htmlsource)
            if soup.findAll(text=re.compile(self.keys)):        # keyword found: store the URL and related data in the database
                self.savedatabase.insert_db(url, currentdepth, htmlsource, self.keys)
                print 'save url: %s ---- %s' % (url, currentdepth)
                log.info('save url: %s ---- %s' % (url, currentdepth))
            else:
                print "keyword not found: %s" % url
                log.info("keyword not found: %s" % url)
        else:
            self.savedatabase.insert_db(url, currentdepth, htmlsource, '')

    def add_urlqueue(self, urltuples):
        ''' Parse the page at this URL and add the extracted urls to the queue '''
        id, url = urltuples
        try:
            html = urllib2.urlopen(url).read()
        except Exception, e:
            print "error: %s'\n url: %s'" % (e, url)
            log.debug("error: %s'\n url: %s'" % (e, url))
        else:
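            # Assumed sketch of the remaining logic (not from the original snippet):
            # pull links out of the fetched page and enqueue them one level deeper,
            # stopping once the configured maximum depth is reached.
            if id < self.depth:
                soup = BeautifulSoup(html)
                for tag in soup.findAll('a', href=True):
                    newurl = tag['href']
                    if newurl.startswith('http') and newurl not in self.urls:
                        self.urls.append(newurl)               # remember links already queued
                        self.urlqueue.put((id + 1, newurl))    # crawl at the next depth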