Example #1
0
def init():
    """Ensure a unique index on ``url`` for the ``urls`` and ``text_info`` collections."""
    db = tools.getConnectedDB()
    # Create the unique index on each collection in turn
    for collection_name in ('urls', 'text_info'):
        getattr(db, collection_name).ensure_index('url', unique=True)
# encoding=utf8
import sys
sys.path.append("..")

import threading
import time
import utils.tools as tools
import base.constance as Constance
from html_parser.parsers import *
from base.collector import Collector
from utils.log import log

# Module-level database handle, obtained once when this module is imported.
db = tools.getConnectedDB()


class PaserControl(threading.Thread):
    def __init__(self):
        """Set up the thread, its URL collector and the parser settings.

        ``_urlCount`` is the number of URLs requested from the collector per
        ``getUrls`` call; ``_interval`` is read from the ``sleep_time``
        setting (presumably the pause between rounds -- confirm against the
        rest of ``run``).
        """
        super(PaserControl, self).__init__()
        self._collector = Collector()

        # Both settings come from the same configuration section
        section = "html_parser"
        self._urlCount = int(tools.getConfValue(section, "url_count"))
        self._interval = int(tools.getConfValue(section, "sleep_time"))

    def run(self):
        while True:
            try:
                urls = self._collector.getUrls(self._urlCount)
                print("取到的url大小 %d" % len(urls))
                # 判断是否结束
                if self._collector.isFinished():
                    log.debug("-------------- 结束 --------------")
                    break
Example #3
0
class Collector(threading.Thread, Singleton):
    """Background thread that buffers pending URLs from the database.

    Every ``_interval`` seconds ``run`` fetches URL documents whose status is
    ``Constance.TODO`` from ``_db.urls``, appends them to the class-level
    list ``_urls`` and marks them ``Constance.DOING``.  Consumers drain the
    list through ``getUrls``.  All state is stored on the class, so it is
    shared by every instance.  Access to ``_urls`` is guarded by the
    module-level ``mylock`` (defined elsewhere in the file).
    """

    _db = tools.getConnectedDB()
    _threadStop = False  # set to True by stop() to end the run() loop
    _urls = []  # shared buffer of URL documents waiting to be handed out
    _interval = int(tools.getConfValue("collector", "sleep_time"))

    # On start-up, put tasks that were left in the DOING state back to TODO.
    _db.urls.update({'status': Constance.DOING},
                    {'$set': {
                        'status': Constance.TODO
                    }},
                    multi=True)

    if DEBUG:
        log.debug("is debug depth = %s" % DEPTH)

    def __init__(self):
        """Initialise the underlying thread object."""
        super(Collector, self).__init__()

    def run(self):
        """Refill the buffer every ``_interval`` seconds until stopped."""
        while not Collector._threadStop:
            self.__inputData()
            time.sleep(Collector._interval)

    def stop(self):
        """Ask the ``run`` loop to exit after its current cycle."""
        # The flag must become True: the loop runs ``while not _threadStop``,
        # so assigning False here (as before) could never stop the thread.
        Collector._threadStop = True

    def _findTodoUrls(self):
        """Return a list of TODO URL documents selected per the configuration.

        In DEBUG mode only documents at exactly ``DEPTH`` are selected.
        Otherwise documents up to the configured depth are selected,
        restricted to one website unless the ``website`` setting is 'all'.
        """
        website = tools.getConfValue("collector", "website")
        depth = int(tools.getConfValue("collector", "depth"))
        urlCount = int(tools.getConfValue("collector", "url_count"))

        query = {"status": Constance.TODO}
        if DEBUG:
            query["depth"] = DEPTH
        else:
            if website != 'all':
                query["website_id"] = tools.getWebsiteId(website)
            query["depth"] = {"$lte": depth}

        fields = {
            "url": 1,
            "_id": 0,
            "depth": 1,
            "description": 1,
            "website_id": 1
        }
        # sort direction: 1 ascending, -1 descending
        cursor = Collector._db.urls.find(query, fields)
        return list(cursor.sort([("depth", 1)]).limit(urlCount))

    def __inputData(self):
        """Fetch one batch of TODO URLs into the buffer and mark them DOING.

        Does nothing when the buffer already holds more than the configured
        ``max_size`` entries.
        """
        if len(Collector._urls) > int(
                tools.getConfValue("collector", "max_size")):
            return

        mylock.acquire()
        try:
            urlsList = self._findTodoUrls()
            Collector._urls.extend(urlsList)
            # Mark the fetched URLs as DOING so they are not fetched again
            for url in urlsList:
                Collector._db.urls.update(url,
                                          {'$set': {
                                              'status': Constance.DOING
                                          }})
        finally:
            # Always release, otherwise an error here would block getUrls
            # forever.
            mylock.release()

    def getUrls(self, count):
        """Remove and return up to ``count`` URL documents from the buffer."""
        mylock.acquire()
        try:
            urls = Collector._urls[:count]
            del Collector._urls[:count]
        finally:
            mylock.release()

        return urls
class Collector(threading.Thread, Singleton):
    """Background thread that buffers pending URLs in a circular buffer.

    Every ``_interval`` seconds ``run`` fetches URL documents whose status is
    ``Constance.TODO`` from ``_db.urls``, writes them into the class-level
    list ``_urls`` (used as a ring of at most ``_maxSize`` slots) and marks
    them ``Constance.DOING``.  ``_writePos`` / ``_readPos`` hold the index of
    the last slot written / read; both start at -1.  All state is stored on
    the class, so it is shared by every instance.  Buffer updates are
    guarded by the module-level ``mylock`` (defined elsewhere in the file).

    NOTE(review): equal read and write positions are interpreted as "empty",
    so a write that fills the ring completely appears to be
    indistinguishable from an empty buffer -- confirm and handle if needed.
    """

    _db = tools.getConnectedDB()
    _threadStop = False  # set to True by stop() to end the run() loop
    _urls = []  # backing storage of the ring buffer
    _nullTimes = 0  # consecutive checks that found ``_urls`` empty
    _readPos = -1  # index of the last slot handed out by getUrls
    _writePos = -1  # index of the last slot filled by putUrls
    _maxSize = int(tools.getConfValue("collector", "max_size"))
    _interval = int(tools.getConfValue("collector", "sleep_time"))
    _allowedNullTimes = int(
        tools.getConfValue("collector", 'allowed_null_times'))
    _website = tools.getConfValue("collector", "website")
    _depth = int(tools.getConfValue("collector", "depth"))
    _urlCount = int(tools.getConfValue("collector", "url_count"))

    # The start-up reset of DOING tasks back to TODO is disabled in this
    # version; only the timing log around it remains.
    beginTime = time.time()
    # _db.urls.update({'status':Constance.DOING}, {'$set':{'status':Constance.TODO}}, multi=True)
    endTime = time.time()
    log.debug('update url time' + str(endTime - beginTime))

    if DEBUG:
        log.debug("is debug depth = %s" % DEPTH)

    def __init__(self):
        """Initialise the underlying thread object."""
        super(Collector, self).__init__()

    def run(self):
        """Refill the buffer every ``_interval`` seconds until stopped."""
        while not Collector._threadStop:
            self.__inputData()
            time.sleep(Collector._interval)

    def stop(self):
        """Ask the ``run`` loop to exit after its current cycle."""
        Collector._threadStop = True

    def _findTodoUrls(self, urlCount):
        """Return a list of at most ``urlCount`` TODO URL documents.

        In DEBUG mode only documents at exactly ``DEPTH`` are selected.
        Otherwise documents up to ``_depth`` are selected, restricted to one
        website unless ``_website`` is 'all'.
        """
        query = {"status": Constance.TODO}
        if DEBUG:
            query["depth"] = DEPTH
        else:
            if Collector._website != 'all':
                query["website_id"] = tools.getWebsiteId(Collector._website)
            query["depth"] = {"$lte": Collector._depth}

        fields = {
            "url": 1,
            "_id": 0,
            "depth": 1,
            "description": 1,
            "website_id": 1
        }
        # sort direction: 1 ascending, -1 descending
        cursor = Collector._db.urls.find(query, fields)
        return list(cursor.sort([("depth", 1)]).limit(urlCount))

    @tools.log_function_time
    def __inputData(self):
        """Fetch one batch of TODO URLs into the ring and mark them DOING.

        Returns early when the ring has no free slot.  After the batch is
        stored, stops the collector and triggers ``exportData.export()``
        once ``isAllHaveDone`` reports completion.
        """
        log.debug('buffer size %d' % self.getMaxReadSize())
        log.debug('buffer can write size = %d' % self.getMaxWriteSize())
        if self.getMaxWriteSize() == 0:
            log.debug("collector 已满 size = %d" % self.getMaxReadSize())
            return

        # Never request more documents than there are free slots
        urlCount = min(Collector._urlCount, self.getMaxWriteSize())

        # The cursor is lazy, so the timing has to include materialising it
        # into a list; otherwise the query itself would not be measured.
        beginTime = time.time()
        urlsList = self._findTodoUrls(urlCount)
        endTime = time.time()

        log.debug('get url time ' + str(endTime - beginTime) + " size " +
                  str(len(urlsList)))

        # Store the URLs in the ring
        self.putUrls(urlsList)

        # Mark the fetched URLs as DOING so they are not fetched again
        beginTime = time.time()
        for url in urlsList:
            Collector._db.urls.update(url,
                                      {'$set': {
                                          'status': Constance.DOING
                                      }})
        endTime = time.time()
        log.debug('update url time ' + str(endTime - beginTime))

        if self.isAllHaveDone():
            self.stop()
            exportData.export()

    def isFinished(self):
        """Return True once the collector has been asked to stop."""
        return Collector._threadStop

    def isAllHaveDone(self):
        """Return True after ``_allowedNullTimes`` consecutive empty checks.

        Each call that finds ``_urls`` empty increments ``_nullTimes``; any
        call that finds it non-empty resets the counter.

        NOTE(review): ``getUrls`` never removes items from ``_urls`` in this
        ring-buffer variant, so once anything has been stored this check
        looks like it can no longer succeed -- confirm the intended
        condition.
        """
        if Collector._urls == []:
            Collector._nullTimes += 1
            return Collector._nullTimes >= Collector._allowedNullTimes

        Collector._nullTimes = 0
        return False

    def getMaxWriteSize(self):
        """Return the number of free slots derived from the two positions."""
        readPos = Collector._readPos
        writePos = Collector._writePos
        if readPos == writePos:
            # Equal positions are treated as an empty ring
            return Collector._maxSize
        if readPos < writePos:
            return Collector._maxSize - (writePos - readPos)
        # The writer has wrapped around and is behind the reader
        return readPos - writePos

    def getMaxReadSize(self):
        """Return the number of slots currently holding unread URLs."""
        return Collector._maxSize - self.getMaxWriteSize()

    def putUrls(self, urlsList):
        """Write ``urlsList`` into the ring after the current write position.

        Entries that do not fit before the end of the ring wrap around and
        are written from index 0.  The caller is expected to pass no more
        items than ``getMaxWriteSize()`` allows.
        """
        urlCount = len(urlsList)
        endPos = urlCount + Collector._writePos + 1
        # Number of items that wrap around to the start (<= 0 means none)
        overflowEndPos = endPos - Collector._maxSize
        # End index of the part that fits before the end of the ring
        inPos = min(endPos, Collector._maxSize)
        # How many items of urlsList go into the non-wrapping part
        urlsListCutPos = inPos - Collector._writePos - 1

        beginTime = time.time()
        mylock.acquire()
        try:
            Collector._urls[Collector._writePos +
                            1:inPos] = urlsList[:urlsListCutPos]
            if overflowEndPos > 0:
                Collector._urls[:overflowEndPos] = urlsList[urlsListCutPos:]

            # Advance the write position while still holding the lock so
            # readers never observe it mid-update.
            Collector._writePos += urlCount
            Collector._writePos %= Collector._maxSize
        finally:
            # Always release, otherwise an error here would block getUrls
            # forever.
            mylock.release()
        log.debug('put url time ' + str(time.time() - beginTime) + " size " +
                  str(len(urlsList)))

    @tools.log_function_time
    def getUrls(self, count):
        """Return up to ``count`` unread URL documents and advance the reader.

        The items stay in ``_urls``; only ``_readPos`` moves.
        """
        mylock.acquire()
        try:
            urls = []

            # Cannot read more than what is currently buffered
            count = min(count, self.getMaxReadSize())
            endPos = Collector._readPos + count + 1
            if endPos > Collector._maxSize:
                # The requested range wraps past the end of the ring
                urls.extend(Collector._urls[Collector._readPos + 1:])
                urls.extend(Collector._urls[:endPos % Collector._maxSize])
            else:
                urls.extend(Collector._urls[Collector._readPos + 1:endPos])

            Collector._readPos += len(urls)
            Collector._readPos %= Collector._maxSize
        finally:
            mylock.release()

        return urls
class Collector(threading.Thread, Singleton):
    """Background thread that buffers pending URLs from the database.

    Every ``_interval`` seconds ``run`` fetches URL documents whose status is
    ``Constance.TODO`` from ``_db.urls``, appends them to the class-level
    list ``_urls`` and marks them ``Constance.DOING``.  Consumers drain the
    list through ``getUrls``.  When the buffer stays empty for the configured
    number of consecutive cycles the collector stops itself and triggers
    ``exportData.export()``.  All state is stored on the class, so it is
    shared by every instance.  Access to ``_urls`` is guarded by the
    module-level ``mylock`` (defined elsewhere in the file).
    """

    _db = tools.getConnectedDB()
    _threadStop = False  # set to True by stop() to end the run() loop
    _urls = []  # shared buffer of URL documents waiting to be handed out
    _nullTimes = 0  # consecutive checks that found ``_urls`` empty
    _interval = int(tools.getConfValue("collector", "sleep_time"))

    # On start-up, put tasks that were left in the DOING state back to TODO.
    beginTime = time.time()
    _db.urls.update({'status': Constance.DOING},
                    {'$set': {
                        'status': Constance.TODO
                    }},
                    multi=True)
    endTime = time.time()
    log.debug('update url time' + str(endTime - beginTime))

    if DEBUG:
        log.debug("is debug depth = %s" % DEPTH)

    def __init__(self):
        """Initialise the underlying thread object."""
        super(Collector, self).__init__()

    def run(self):
        """Refill the buffer every ``_interval`` seconds until stopped."""
        while not Collector._threadStop:
            self.__inputData()
            time.sleep(Collector._interval)

    def stop(self):
        """Ask the ``run`` loop to exit after its current cycle."""
        Collector._threadStop = True

    def _findTodoUrls(self, website, depth, urlCount):
        """Return a list of at most ``urlCount`` TODO URL documents.

        In DEBUG mode only documents at exactly ``DEPTH`` are selected.
        Otherwise documents up to ``depth`` are selected, restricted to one
        website unless ``website`` is 'all'.
        """
        query = {"status": Constance.TODO}
        if DEBUG:
            query["depth"] = DEPTH
        else:
            if website != 'all':
                query["website_id"] = tools.getWebsiteId(website)
            query["depth"] = {"$lte": depth}

        fields = {
            "url": 1,
            "_id": 0,
            "depth": 1,
            "description": 1,
            "website_id": 1
        }
        # sort direction: 1 ascending, -1 descending
        cursor = Collector._db.urls.find(query, fields)
        return list(cursor.sort([("depth", 1)]).limit(urlCount))

    @tools.log_function_time
    def __inputData(self):
        """Fetch one batch of TODO URLs into the buffer and mark them DOING.

        Does nothing when the buffer already holds more than the configured
        ``max_size`` entries.  After the batch is stored, stops the collector
        and triggers ``exportData.export()`` once ``isAllHaveDone`` reports
        completion.
        """
        if len(Collector._urls) > int(
                tools.getConfValue("collector", "max_size")):
            log.debug("collector 已满 size = %d" % len(Collector._urls))
            return

        mylock.acquire()
        try:
            website = tools.getConfValue("collector", "website")
            depth = int(tools.getConfValue("collector", "depth"))
            urlCount = int(tools.getConfValue("collector", "url_count"))

            # The cursor is lazy, so the timing has to include materialising
            # it into a list; otherwise the query itself is not measured.
            beginTime = time.time()
            urlsList = self._findTodoUrls(website, depth, urlCount)
            endTime = time.time()

            log.debug('get url time ' + str(endTime - beginTime) + " size " +
                      str(len(urlsList)))

            beginTime = time.time()
            Collector._urls.extend(urlsList)
            log.debug('put get url time ' + str(time.time() - beginTime) +
                      " size " + str(len(urlsList)))

            # Mark the fetched URLs as DOING so they are not fetched again
            beginTime = time.time()
            for url in urlsList:
                Collector._db.urls.update(url,
                                          {'$set': {
                                              'status': Constance.DOING
                                          }})
            endTime = time.time()
            log.debug('update url time ' + str(endTime - beginTime))

            if self.isAllHaveDone():
                self.stop()
                exportData.export()
        finally:
            # Always release, otherwise an error here would block getUrls
            # forever.
            mylock.release()

    def isFinished(self):
        """Return True once the collector has been asked to stop."""
        return Collector._threadStop

    def isAllHaveDone(self):
        """Return True after enough consecutive checks found the buffer empty.

        The threshold is the ``allowed_null_times`` setting.  Each call that
        finds ``_urls`` empty increments ``_nullTimes``; any call that finds
        it non-empty resets the counter.
        """
        allowedNullTimes = int(
            tools.getConfValue("collector", 'allowed_null_times'))
        if Collector._urls == []:
            Collector._nullTimes += 1
            return Collector._nullTimes >= allowedNullTimes

        Collector._nullTimes = 0
        return False

    @tools.log_function_time
    def getUrls(self, count):
        """Remove and return up to ``count`` URL documents from the buffer."""
        mylock.acquire()
        try:
            urls = Collector._urls[:count]
            del Collector._urls[:count]
        finally:
            mylock.release()

        return urls