コード例 #1
0
ファイル: root_url.py プロジェクト: sx1616039/crawl
    def run(self):
        """Seed root urls for the configured website(s).

        Reads collector.website from the config; 'all' seeds every supported
        site, otherwise only the matching site's urls are added.
        """
        website = tools.getConfValue("collector", "website")

        # One (site constant, seeding routine) pair per supported website,
        # kept in the same order the original 'all' branch used.
        seeders = [
            (Constance.YOUKU, self.addYoukuUrl),
            (Constance.TENCENT, self.addTencentUrl),
            (Constance.WANG_YI, self.addWangYiUrl),
            (Constance.PPTV, self.addPPTVUrl),
            (Constance.CCTV, self.addCCTVUrl),
            (Constance.KAN_KAN, self.addKanKanUrl),
            (Constance.TUDOU, self.addTouDouUrl),
            (Constance.V1, self.addV1Url),
            (Constance.KU6, self.addKu6Url),
        ]

        for domain, addSiteUrl in seeders:
            if website == 'all' or website == domain:
                addSiteUrl()
コード例 #2
0
    def __inputData(self):
        """Fetch one batch of TODO urls from Mongo into the shared buffer.

        Skips the fetch when the in-memory buffer already exceeds the
        configured max_size.  Fetched urls are flagged DOING so they are not
        handed out twice.

        Fix: the lock is now released in a finally block — the original
        leaked ``mylock`` forever if the Mongo round-trip raised, deadlocking
        every other thread that touches the buffer.
        """
        if len(Collector._urls) > int(
                tools.getConfValue("collector", "max_size")):
            return
        mylock.acquire()  # lock the shared url buffer
        try:
            depth = int(tools.getConfValue("collector", "depth"))
            urlCount = int(tools.getConfValue("collector", "url_count"))
            if DEBUG:
                # Debug mode: restrict the query to the fixed DEPTH level.
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "depth": DEPTH
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1
                    }).sort([("depth", 1)]).limit(urlCount)
            else:
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "depth": {
                            "$lte": depth
                        }
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1
                    }).sort([("depth", 1)]).limit(urlCount)  # sort: -1 desc, 1 asc

            urlsList = list(urlsList)
            Collector._urls.extend(urlsList)
            # Mark the fetched urls as DOING.
            for url in urlsList:
                Collector._db.urls.update(url,
                                          {'$set': {
                                              'status': Constance.DOING
                                          }})

            if self.isAllHaveDone():
                self.stop()
        finally:
            # Always release, even if the DB access raises.
            mylock.release()
コード例 #3
0
 def isAllHaveDone(self):
     """Return True once the buffer has been empty allowed_null_times in a row.

     A non-empty buffer resets the counter.

     Fix: the original fell off the end and returned None (falsy) when the
     buffer was empty but the threshold was not yet reached; every path now
     returns an explicit bool, which is backward-compatible for callers that
     test truthiness.
     """
     allowedNullTimes = int(
         tools.getConfValue("collector", 'allowed_null_times'))
     if Collector._urls == []:
         Collector._nullTimes += 1
         return Collector._nullTimes >= allowedNullTimes
     Collector._nullTimes = 0
     return False
コード例 #4
0
    def run(self):
        """Register all add-url routines, then invoke the ones that match
        the configured target website ('all' runs every routine)."""
        website = tools.getConfValue("collector", "website")
        self.registUrlFunc()

        # Each registered entry pairs a seeding routine with its domain.
        for entry in AddRootUrl._addUrlFuncs:
            addWebUrl, domain = entry[0], entry[1]
            if website == 'all' or website == domain:
                addWebUrl()
コード例 #5
0
sys.path.append("..")

import utils.tools as tools
from utils.log import log
from base.collector import Collector
from base.root_url import AddRootUrl
from html_parser.parser_control import PaserControl


def init():
    """Connect to Mongo and ensure a unique index on 'url' in both collections."""
    db = tools.getConnectedDB()
    # The unique index stops the same url from being queued twice.
    for collection in (db.urls, db.text_info):
        collection.ensure_index('url', unique=True)


if __name__ == '__main__':
    log.info("--------begin--------")
    init()

    # Seed the root urls, then start the collector thread.
    AddRootUrl().start()
    Collector().start()

    # Spin up the configured number of parser threads.
    parserThreads = int(tools.getConfValue("html_parser", "parser_count"))
    for _ in range(parserThreads):
        PaserControl().start()
コード例 #6
0
 def __init__(self):
     """Create a parser controller bound to the shared Collector, with batch
     size and sleep interval taken from the html_parser config section."""
     super(PaserControl, self).__init__()
     getConf = tools.getConfValue
     self._collector = Collector()
     self._urlCount = int(getConf("html_parser", "url_count"))
     self._interval = int(getConf("html_parser", "sleep_time"))
コード例 #7
0
class Collector(threading.Thread, Singleton):
    """Collector thread: pulls TODO urls from MongoDB into a ring buffer.

    All state is class-level (shared) since the class mixes in Singleton;
    parser threads drain the buffer through getUrls().
    """

    _db = tools.getConnectedDB()
    _threadStop = False  # set by stop() to end the run() loop
    _urls = []  # fixed-capacity ring buffer of url documents
    _nullTimes = 0  # consecutive refills that found the buffer empty
    _readPos = -1  # ring-buffer read cursor (last position read)
    _writePos = -1  # ring-buffer write cursor (last position written)
    _maxSize = int(tools.getConfValue("collector", "max_size"))
    _interval = int(tools.getConfValue("collector", "sleep_time"))
    _allowedNullTimes = int(
        tools.getConfValue("collector", 'allowed_null_times'))
    _website = tools.getConfValue("collector", "website")
    _depth = int(tools.getConfValue("collector", "depth"))
    _urlCount = int(tools.getConfValue("collector", "url_count"))

    # On startup, reset tasks stuck in DOING back to TODO (currently disabled).
    beginTime = time.time()
    # _db.urls.update({'status':Constance.DOING}, {'$set':{'status':Constance.TODO}}, multi=True)
    endTime = time.time()
    log.debug('update url time' + str(endTime - beginTime))

    if DEBUG:
        log.debug("is debug depth = %s" % DEPTH)

    def __init__(self):
        super(Collector, self).__init__()

    def run(self):
        """Refill the buffer every _interval seconds until stop() is called."""
        while not Collector._threadStop:
            self.__inputData()
            time.sleep(Collector._interval)

    def stop(self):
        """Signal the run() loop to exit after its current iteration."""
        Collector._threadStop = True

    @tools.log_function_time
    def __inputData(self):
        """Fetch one batch of TODO urls from Mongo and enqueue them.

        The batch size is capped by both the configured url_count and the
        free space left in the ring buffer.  Fetched urls are flagged DOING.
        """
        log.debug('buffer size %d' % self.getMaxReadSize())
        log.debug('buffer can write size = %d' % self.getMaxWriteSize())
        if self.getMaxWriteSize() == 0:
            log.debug("collector 已满 size = %d" % self.getMaxReadSize())
            return

        beginTime = time.time()

        # Never fetch more than the buffer can absorb.
        urlCount = Collector._urlCount if Collector._urlCount <= self.getMaxWriteSize(
        ) else self.getMaxWriteSize()

        if DEBUG:
            # Debug mode: restrict the query to the fixed DEPTH level.
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "depth": DEPTH
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)
        elif Collector._website == 'all':
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "depth": {
                        "$lte": Collector._depth
                    }
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)  # sort: -1 descending, 1 ascending
        else:
            websiteId = tools.getWebsiteId(Collector._website)
            urlsList = Collector._db.urls.find(
                {
                    "status": Constance.TODO,
                    "website_id": websiteId,
                    "depth": {
                        "$lte": Collector._depth
                    }
                }, {
                    "url": 1,
                    "_id": 0,
                    "depth": 1,
                    "description": 1,
                    "website_id": 1
                }).sort([("depth", 1)]).limit(urlCount)

        endTime = time.time()

        urlsList = list(urlsList)

        log.debug('get url time ' + str(endTime - beginTime) + " size " +
                  str(len(urlsList)))

        # Store the urls into the ring buffer.
        self.putUrls(urlsList)

        # Mark the fetched urls as DOING so they are not handed out twice.
        beginTime = time.time()
        for url in urlsList:
            Collector._db.urls.update(url,
                                      {'$set': {
                                          'status': Constance.DOING
                                      }})
        endTime = time.time()
        log.debug('update url time ' + str(endTime - beginTime))

        if self.isAllHaveDone():
            self.stop()
            exportData.export()

    def isFinished(self):
        """Return True once the collector thread has been told to stop."""
        return Collector._threadStop

    def isAllHaveDone(self):
        """Return True once the buffer has been empty _allowedNullTimes times in a row.

        NOTE(review): returns None (falsy) when the buffer is empty but the
        threshold is not yet reached — callers appear to rely on truthiness only.
        """
        if Collector._urls == []:
            Collector._nullTimes += 1
            if Collector._nullTimes >= Collector._allowedNullTimes:
                return True
        else:
            Collector._nullTimes = 0
            return False

    def getMaxWriteSize(self):
        """Return the number of free slots left in the ring buffer.

        NOTE(review): _readPos == _writePos is treated as an empty buffer
        (full capacity writable).  A completely full buffer would wrap to
        the same equality and be misread as empty — confirm the buffer is
        never filled to exactly _maxSize.
        """
        size = 0
        if Collector._readPos == Collector._writePos:
            size = Collector._maxSize
        elif Collector._readPos < Collector._writePos:
            size = Collector._maxSize - (Collector._writePos -
                                         Collector._readPos)
        else:
            size = Collector._readPos - Collector._writePos

        return size

    def getMaxReadSize(self):
        """Return the number of buffered urls currently available to readers."""
        return Collector._maxSize - self.getMaxWriteSize()

    def putUrls(self, urlsList):
        """Write urlsList into the ring buffer, wrapping at _maxSize."""
        # Append urls into _urls.
        urlCount = len((urlsList))
        endPos = urlCount + Collector._writePos + 1
        # If the write runs past capacity, the overflow wraps to the front.
        # End position of the wrapped (overflow) part:
        overflowEndPos = endPos - Collector._maxSize
        # End position of the non-wrapped part:
        inPos = endPos if endPos <= Collector._maxSize else Collector._maxSize

        # Number of items that fit before the wrap point.
        urlsListCutPos = inPos - Collector._writePos - 1

        beginTime = time.time()
        mylock.acquire()  # lock the shared buffer

        Collector._urls[Collector._writePos +
                        1:inPos] = urlsList[:urlsListCutPos]
        if overflowEndPos > 0:
            Collector._urls[:overflowEndPos] = urlsList[urlsListCutPos:]

        mylock.release()
        log.debug('put url time ' + str(time.time() - beginTime) + " size " +
                  str(len(urlsList)))

        Collector._writePos += urlCount
        Collector._writePos %= Collector._maxSize

    @tools.log_function_time
    def getUrls(self, count):
        """Pop up to `count` urls from the ring buffer (capped by what's buffered)."""
        mylock.acquire()  # lock the shared buffer
        urls = []

        count = count if count <= self.getMaxReadSize(
        ) else self.getMaxReadSize()
        endPos = Collector._readPos + count + 1
        if endPos > Collector._maxSize:
            # The read wraps past the end: take the tail, then the head.
            urls.extend(Collector._urls[Collector._readPos + 1:])
            urls.extend(Collector._urls[:endPos % Collector._maxSize])
        else:
            urls.extend(Collector._urls[Collector._readPos + 1:endPos])

        Collector._readPos += len(urls)
        Collector._readPos %= Collector._maxSize

        mylock.release()

        return urls
コード例 #8
0
    def addHeadlinesListUrl(self):
        """Seed Toutiao (headlines) feed-list urls for every time slot.

        Walks backwards from now to the configured content_released_time in
        time_interval steps and, for each slot, queues one local-news url and
        one video url.  The two per-category blocks in the original were
        verbatim duplicates; they are collapsed into a single loop.
        """
        baseUrl = 'http://is.snssdk.com/api/news/feed/v46/'
        params = {
            'version_code': '5.8.6',
            'app_name': 'news_article',
            'vid': 'B0DB5DD0-FF94-4773-85B1-EFC11132C2A4',
            'device_id': '34633749953',
            'channel': 'App Store',
            'resolution': '1242*2208',
            'aid': 13,
            'ab_version':
            '91796,89593,83095,89184,87331,93903,94158,94056,93418,93085,92848,93981,31210,94178,93319,94042,92438,93526,93357,94163,94003,92487,87496,93887,87988',
            'ab_feature': 'z1',
            'build_version': '5.9.0.5',
            'openudid': '7064ff7d773ef8efeb5d6a25f62cd3d85035674f',
            'live_sdk_version': '1.3.0',
            'idfv': 'B0DB5DD0-FF94-4773-85B1-EFC11132C2A4',
            'ac': 'WIFI',
            'os_version': '10.1.1',
            'ssmix': 'a',
            'device_platform': 'iphone',
            'iid': 6542551421,
            'ab_client': 'a1,f2,f7,e1',
            'device_type': 'iPhone9,2',
            'idfa': 'D2E02B97-0F35-486F-9CD4-A2EC13BBC8FB',
            'LBS_status': 'deny',
            'category': 'news_local',
            'city': '',
            'concern_id': '',
            'count': 20,
            'cp': '548e4d7f7b1BCq1',
            'detail': 1,
            'image': 1,
            'language': 'zh-Hans-CN',
            'last_refresh_sub_entrance_interval': 1482077184,
            'loc_mode': 0,
            'max_behot_time': 1481063762,
            'refer': 1,
            'strict': 0,
            'tt_from': 'load_more',
            'user_city': '泸州'
        }

        contentReleasedTime = tools.getConfValue('content',
                                                 'content_released_time')
        timeInterval = int(tools.getConfValue('content', 'time_interval'))
        contentReleasedTime = tools.dateToTimestamp(contentReleasedTime)
        currentTimestamp = tools.getCurrentTimestamp()

        # One (feed category, url-type constant) pair per feed we crawl,
        # in the same order as the original: local news first, then video.
        categories = (('news_local', Constance.NEWS_LOCAL),
                      ('video', Constance.VIDEO))

        maxBehotTime = currentTimestamp
        while maxBehotTime >= contentReleasedTime:
            maxBehotTime -= timeInterval

            # Jitter the refresh timestamp so requests look organic.
            currentTimestamp = currentTimestamp + random.randint(60, 300)

            for category, urlType in categories:
                params['category'] = category
                params['last_refresh_sub_entrance_interval'] = currentTimestamp
                params['max_behot_time'] = maxBehotTime

                url = tools.jointUrl(baseUrl, params)
                self.addUrl(url, urlType)
コード例 #9
0
class Collector(threading.Thread, Singleton):
    """Collector thread: pulls TODO urls from MongoDB into a shared list.

    Fixes relative to the original:
      * stop() now sets ``_threadStop = True`` — it set False, so the
        ``while not _threadStop`` loop in run() could never terminate
        (the other Collector variants in this project set True).
      * __inputData() and getUrls() release ``mylock`` in finally blocks so
        an exception during the Mongo round-trip cannot leave the lock held.
    """

    _db = tools.getConnectedDB()
    _threadStop = False  # set by stop() to end the run() loop
    _urls = []  # shared url queue
    _interval = int(tools.getConfValue("collector", "sleep_time"))

    # On startup, reset tasks stuck in DOING back to TODO.
    _db.urls.update({'status': Constance.DOING},
                    {'$set': {
                        'status': Constance.TODO
                    }},
                    multi=True)

    if DEBUG:
        log.debug("is debug depth = %s" % DEPTH)

    def __init__(self):
        super(Collector, self).__init__()

    def run(self):
        """Refill the url queue every _interval seconds until stopped."""
        while not Collector._threadStop:
            self.__inputData()
            time.sleep(Collector._interval)

    def stop(self):
        """Signal the run() loop to exit."""
        # BUGFIX: was `Collector._threadStop = False`, which made stop() a no-op.
        Collector._threadStop = True

    def __inputData(self):
        """Fetch one batch of TODO urls from Mongo and mark them DOING."""
        if len(Collector._urls) > int(
                tools.getConfValue("collector", "max_size")):
            return
        mylock.acquire()  # lock the shared queue
        try:
            website = tools.getConfValue("collector", "website")
            depth = int(tools.getConfValue("collector", "depth"))
            urlCount = int(tools.getConfValue("collector", "url_count"))
            if DEBUG:
                # Debug mode: restrict the query to the fixed DEPTH level.
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "depth": DEPTH
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)
            elif website == 'all':
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "depth": {
                            "$lte": depth
                        }
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)  # sort: -1 desc, 1 asc
            else:
                websiteId = tools.getWebsiteId(website)
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "website_id": websiteId,
                        "depth": {
                            "$lte": depth
                        }
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)

            urlsList = list(urlsList)
            Collector._urls.extend(urlsList)
            # Mark the fetched urls as DOING so they are not handed out twice.
            for url in urlsList:
                Collector._db.urls.update(url,
                                          {'$set': {
                                              'status': Constance.DOING
                                          }})
        finally:
            # Always release, even if the DB round-trip raises.
            mylock.release()

    def getUrls(self, count):
        """Pop up to `count` urls from the head of the shared queue."""
        mylock.acquire()  # lock the shared queue
        try:
            urls = Collector._urls[:count]
            del Collector._urls[:count]
        finally:
            mylock.release()
        return urls
コード例 #10
0
    def __inputData(self):
        """Fetch one batch of TODO urls from Mongo, queue them, mark them DOING.

        Fix: the lock is now released in a finally block — the original
        leaked ``mylock`` forever if the Mongo round-trip (or the export)
        raised, deadlocking every other thread that touches the queue.
        """
        if len(Collector._urls) > int(
                tools.getConfValue("collector", "max_size")):
            log.debug("collector 已满 size = %d" % len(Collector._urls))
            return
        mylock.acquire()  # lock the shared queue
        try:
            website = tools.getConfValue("collector", "website")
            depth = int(tools.getConfValue("collector", "depth"))
            urlCount = int(tools.getConfValue("collector", "url_count"))

            beginTime = time.time()

            if DEBUG:
                # Debug mode: restrict the query to the fixed DEPTH level.
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "depth": DEPTH
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)
            elif website == 'all':
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "depth": {
                            "$lte": depth
                        }
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)  # sort: -1 desc, 1 asc
            else:
                websiteId = tools.getWebsiteId(website)
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "website_id": websiteId,
                        "depth": {
                            "$lte": depth
                        }
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)

            endTime = time.time()

            urlsList = list(urlsList)

            log.debug('get url time ' + str(endTime - beginTime) + " size " +
                      str(len(urlsList)))

            beginTime = time.time()
            Collector._urls.extend(urlsList)
            log.debug('put get url time ' + str(time.time() - beginTime) +
                      " size " + str(len(urlsList)))

            # Mark the fetched urls as DOING so they are not handed out twice.
            beginTime = time.time()
            for url in urlsList:
                Collector._db.urls.update(url,
                                          {'$set': {
                                              'status': Constance.DOING
                                          }})
            endTime = time.time()
            log.debug('update url time ' + str(endTime - beginTime))

            if self.isAllHaveDone():
                self.stop()
                exportData.export()
        finally:
            # Always release, even if the DB round-trip raises.
            mylock.release()
コード例 #11
0
class Collector(threading.Thread, Singleton):
    """Collector thread: pulls TODO urls from MongoDB into a shared list.

    Fixes relative to the original:
      * __inputData() and getUrls() release ``mylock`` in finally blocks, so
        an exception during the Mongo round-trip (or the export) cannot leave
        the lock held forever.
      * isAllHaveDone() returns an explicit bool on every path instead of
        falling through with an implicit None.
    """

    _db = tools.getConnectedDB()
    _threadStop = False  # set by stop() to end the run() loop
    _urls = []  # shared url queue
    _nullTimes = 0  # consecutive refills that found the queue empty
    _interval = int(tools.getConfValue("collector", "sleep_time"))

    # On startup, reset tasks stuck in DOING back to TODO.
    beginTime = time.time()
    _db.urls.update({'status': Constance.DOING},
                    {'$set': {
                        'status': Constance.TODO
                    }},
                    multi=True)
    endTime = time.time()
    log.debug('update url time' + str(endTime - beginTime))

    if DEBUG:
        log.debug("is debug depth = %s" % DEPTH)

    def __init__(self):
        super(Collector, self).__init__()

    def run(self):
        """Refill the url queue every _interval seconds until stopped."""
        while not Collector._threadStop:
            self.__inputData()
            time.sleep(Collector._interval)

    def stop(self):
        """Signal the run() loop to exit."""
        Collector._threadStop = True

    @tools.log_function_time
    def __inputData(self):
        """Fetch one batch of TODO urls from Mongo, queue them, mark them DOING."""
        if len(Collector._urls) > int(
                tools.getConfValue("collector", "max_size")):
            log.debug("collector 已满 size = %d" % len(Collector._urls))
            return
        mylock.acquire()  # lock the shared queue
        try:
            website = tools.getConfValue("collector", "website")
            depth = int(tools.getConfValue("collector", "depth"))
            urlCount = int(tools.getConfValue("collector", "url_count"))

            beginTime = time.time()

            if DEBUG:
                # Debug mode: restrict the query to the fixed DEPTH level.
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "depth": DEPTH
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)
            elif website == 'all':
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "depth": {
                            "$lte": depth
                        }
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)  # sort: -1 desc, 1 asc
            else:
                websiteId = tools.getWebsiteId(website)
                urlsList = Collector._db.urls.find(
                    {
                        "status": Constance.TODO,
                        "website_id": websiteId,
                        "depth": {
                            "$lte": depth
                        }
                    }, {
                        "url": 1,
                        "_id": 0,
                        "depth": 1,
                        "description": 1,
                        "website_id": 1
                    }).sort([("depth", 1)]).limit(urlCount)

            endTime = time.time()

            urlsList = list(urlsList)

            log.debug('get url time ' + str(endTime - beginTime) + " size " +
                      str(len(urlsList)))

            beginTime = time.time()
            Collector._urls.extend(urlsList)
            log.debug('put get url time ' + str(time.time() - beginTime) +
                      " size " + str(len(urlsList)))

            # Mark the fetched urls as DOING so they are not handed out twice.
            beginTime = time.time()
            for url in urlsList:
                Collector._db.urls.update(url,
                                          {'$set': {
                                              'status': Constance.DOING
                                          }})
            endTime = time.time()
            log.debug('update url time ' + str(endTime - beginTime))

            if self.isAllHaveDone():
                self.stop()
                exportData.export()
        finally:
            # Always release, even if the DB round-trip raises.
            mylock.release()

    def isFinished(self):
        """Return True once the collector thread has been told to stop."""
        return Collector._threadStop

    def isAllHaveDone(self):
        """Return True once the queue has been empty allowed_null_times in a row."""
        allowedNullTimes = int(
            tools.getConfValue("collector", 'allowed_null_times'))
        if Collector._urls == []:
            Collector._nullTimes += 1
            # Explicit bool instead of falling through with an implicit None.
            return Collector._nullTimes >= allowedNullTimes
        Collector._nullTimes = 0
        return False

    @tools.log_function_time
    def getUrls(self, count):
        """Pop up to `count` urls from the head of the shared queue."""
        mylock.acquire()  # lock the shared queue
        try:
            urls = Collector._urls[:count]
            del Collector._urls[:count]
        finally:
            mylock.release()
        return urls