Example #1
    def __init__(self, tab_urls, depth, process_num = None):
        '''
        @summary:
        ---------
        @param tab_urls:
        @param depth:
        @param process_num: process number
        ---------
        @result:
        '''

        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
        self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False

        self._tab_worker_status = 'news:worker_status'
        self._worker_mark = LOCAL_HOST_IP + ('_%s'%process_num if process_num else '')
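A minimal usage sketch for the collector above (the task key, depth and process number are placeholders; Collector subclasses threading.Thread, as Example #18 below shows):

# hypothetical wiring: the values below are illustrative, not part of the original snippet
collector = Collector(tab_urls='news:news_urls', depth=3, process_num=1)
collector.start()               # background thread: pulls batches of urls from redis
batch = collector.get_urls(10)  # pop up to 10 url dicts from the local deque
collector.stop()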
Example #2
    def __init__(self):

        self._mysqldb = MysqlDB(**config.get('mysqldb'))
        self._redis = RedisDB(**config.get('redisdb'))

        self._task_root_key = config.get('spider').get(
            'redis_task_cache_root_key')

        self._account_task_key = self._task_root_key + ':z_account_task'
        self._article_task_key = self._task_root_key + ':z_article_task'
        self._last_article_publish_time = self._task_root_key + ':h_last_article_publish_time'
        self._new_last_article_publish_time = self._task_root_key + ':h_new_last_article_publish_time'

        self._ignore_haved_crawl_today_article_account = config.get(
            'spider').get('ignore_haved_crawl_today_article_account')
        self._monitor_interval = config.get('spider').get('monitor_interval')
        self._zombie_account_not_publish_article_days = config.get(
            'spider').get('zombie_account_not_publish_article_days')
        self._spider_interval_min = config.get('spider').get(
            'spider_interval').get('min_sleep_time')
        self._spider_interval_max = config.get('spider').get(
            'spider_interval').get('max_sleep_time')
        self._crawl_time_range = (config.get("spider").get("crawl_time_range")
                                  or "~").split('~')
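For reference, a sketch of the configuration shape this __init__ reads (the key names come from the calls above; the concrete values and the MysqlDB/RedisDB keyword names are assumptions):

config_example = {
    'mysqldb': {'host': '127.0.0.1', 'port': 3306, 'db': 'wechat'},  # passed as **kwargs to MysqlDB
    'redisdb': {'ip_ports': '127.0.0.1:6379', 'db': 0},              # passed as **kwargs to RedisDB
    'spider': {
        'redis_task_cache_root_key': 'wechat',
        'ignore_haved_crawl_today_article_account': True,
        'monitor_interval': 600,
        'zombie_account_not_publish_article_days': 360,
        'spider_interval': {'min_sleep_time': 5, 'max_sleep_time': 10},
        'crawl_time_range': '~',  # "<upper>~<lower>"; an empty range means no restriction
    },
}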
Example #3
    def __init__(self, table_article='articles'):
        if not hasattr(self, '_table_article'):
            super(ArticleManager, self).__init__()

            self._thread_stop = False

            self._articles_deque = collections.deque()
            self._db = RedisDB()
            self._table_article = table_article
Example #4
def monitor_proxies():
    redis_0 = RedisDB()
    config = os.path.join(os.path.dirname(__file__), '../config.conf')
    redis_key = tools.get_conf_value(config, 'redis', 'redis_key')
    redis_key2 = tools.get_conf_value(config, 'redis', 'redis_key2')
    ip_count = redis_0.count(redis_key)
    ip_count2 = redis_0.count(redis_key2)

    log.debug("douban: %d proxy IPs left in the redis pool" % ip_count)
    log.debug("weibo: %d proxy IPs left in the redis pool" % ip_count2)
Example #5
    def __init__(self, table_url = 'urls'):
        if not hasattr(self,'_table_url'):
            super(UrlManager, self).__init__()

            self._thread_stop = False

            self._urls_deque = collections.deque()
            self._db = RedisDB()
            self._table_url = table_url
            self._table_url_dupefilter = self._table_url + '_dupefilter'
            self._table_url_end_depth_dupefilter = self._table_url + '_end_depth_dupefilter'
Example #6
    def __init__(self, table_article = 'articles'):
        if not hasattr(self,'_table_article'):
            super(ArticleManager, self).__init__()

            self._thread_stop = False

            self._articles_deque = collections.deque()
            self._db = RedisDB()
            self._table_article = table_article
            self._table_article_bloomfilter = table_article + '_bloomfilter'

            self._bloomfilter = BloomFilter(redis_obj = self._db, key = self._table_article_bloomfilter)
Example #7
def random_proxy():
    try:
        redis = RedisDB()
        ip_pools = redis.sget(table=redis_key, count=1)
        proxy = random.choice(ip_pools)
        proxies = {
            "http": proxy,
            "https": proxy,
        }
    except Exception as e:
        print(e)
        proxies = {}
    return proxies
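A hedged usage sketch: feeding the picked proxy into requests (redis_key is assumed to be a module-level table name, and the stored proxy strings are assumed to carry an http:// scheme or be acceptable to requests without one):

import requests

proxies = random_proxy()
resp = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=10)
print(resp.status_code, resp.json())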
Example #8
class Detection(object):
    def __init__(self):
        self.redis = RedisDB()

    async def detection_proxy(self, proxy, semaphore):
        async with semaphore:
            con = aiohttp.TCPConnector(verify_ssl=False,
                                       family=socket.AF_INET,
                                       limit=60)
            async with aiohttp.ClientSession(connector=con) as session:
                try:
                    test_proxy = "http://" + proxy
                    log.debug("Testing proxy: " + test_proxy)
                    async with session.get(TEST_URL,
                                           proxy=test_proxy,
                                           timeout=7) as response:
                        html = await response.text()
                        # the Chinese phrase below is the target site's "abnormal request detected" banner
                        if response.status == 200 and '检测到有异常请求' not in html:
                            log.debug("\n" + proxy + " proxy is usable")
                        else:
                            self.redis.delete_value(redis_key, proxy)
                            log.debug("Removed invalid proxy: " + proxy)
                except Exception as e:
                    self.redis.delete_value(redis_key, proxy)
                    log.debug("\n" + proxy + ' proxy request failed')
                    log.debug("Removed invalid proxy: " + proxy)

    def run(self):
        try:
            proxies = self.redis.get_all(redis_key)
            for i in range(0, len(proxies), BATCH_SIZE):
                test_proxies = proxies[i:i + BATCH_SIZE]
                self.main(test_proxies)
        except Exception as e:
            log.debug("Error while testing proxies", e.args)

    def main(self, test_proxies):
        semaphore = asyncio.Semaphore(5)
        loop = asyncio.get_event_loop()
        task = [
            self.detection_proxy(proxy, semaphore) for proxy in test_proxies
        ]
        loop.run_until_complete(asyncio.wait(task))
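A minimal sketch of driving the checker above (TEST_URL, redis_key and BATCH_SIZE are assumed to be module-level constants, as the code implies):

if __name__ == '__main__':
    detection = Detection()
    detection.run()  # pull every proxy from redis and validate it in batches of BATCH_SIZE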
Example #9
    def __init__(self, tab_urls, depth):
        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector",
                                 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False
Example #10
class Detection(object):
    def __init__(self):
        self.redis = RedisDB()
        self.test_url = 'https://m.weibo.cn/'

    @tools.debug
    async def get_html(self, root_url, proxy, semaphore):
        try:
            test_proxy = "http://" + proxy
            log.debug("Testing proxy: " + test_proxy)
            async with semaphore:
                response = await requests.get(root_url,
                                              proxy=test_proxy,
                                              timeout=5)
                html = await response.text()
                return response, html
        except asyncio.TimeoutError as err:
            #log.debug(err)
            return [], []

    @tools.debug
    async def run(self, content_info):

        semaphore = asyncio.Semaphore(10)
        try:
            response, html = await self.get_html(self.test_url, content_info,
                                                 semaphore)
            if html and response:
                # the Chinese phrase below is the target site's "abnormal request detected" banner
                if response.status == 200 and '检测到有异常请求' not in html:
                    log.debug("\n" + content_info + " proxy is usable")
                else:
                    self.redis.delete_value(redis_key, content_info)
                    log.debug("Removed invalid proxy: " + content_info)
            else:
                self.redis.delete_value(redis_key, content_info)
                log.debug("Removed invalid proxy: " + content_info)
        except Exception as e:
            print(e)
            self.redis.delete_value(redis_key, content_info)
            log.debug("\n" + content_info + ' proxy request failed')
            log.debug("Removed invalid proxy: " + content_info)

    def doing_main(self):
        task_list = self.redis.get_all(redis_key)
        log.debug('Total proxies in the database: {}'.format(len(task_list)))
        tasks = [asyncio.ensure_future(self.run(data)) for data in task_list]
        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.wait(tasks, timeout=6))
Example #11
class ArticleManager(threading.Thread, Singleton):
    def __init__(self, table_article='articles'):
        if not hasattr(self, '_table_article'):
            super(ArticleManager, self).__init__()

            self._thread_stop = False

            self._articles_deque = collections.deque()
            self._db = RedisDB()
            self._table_article = table_article

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_article_to_db()
            except Exception as e:
                log.error(e)

            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_articles(self, article):
        self._articles_deque.append(article)

    def clear_article(self):
        '''
        @summary: delete the article data in redis
        ---------
        ---------
        @result:
        '''

        self._db.clear(self._table_article)

    def __add_article_to_db(self):
        article_list = []
        while self._articles_deque:
            article = self._articles_deque.popleft()
            article_list.append(article)
            if len(article_list) > 100:
                log.debug('Flushing articles to the database')
                self._db.zadd(self._table_article, article_list)
                article_list = []

        if article_list:
            log.debug('Flushing articles to the database')
            self._db.zadd(self._table_article, article_list)
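A sketch of how this manager thread is typically driven; the article fields below are illustrative only:

manager = ArticleManager()
manager.start()  # background thread flushes the deque into redis roughly once per second
manager.put_articles({'uuid': 'demo-uuid', 'title': 'demo', 'release_time': '2019-01-01 00:00:00'})
# ... once the producer is finished:
manager.stop()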
Example #12
class SyncArtice(threading.Thread):
    def __init__(self):
        super(SyncArtice, self).__init__()

        self._es = ES()
        self._redis = RedisDB()
        self._sync_count = 0

    def run(self):
        is_show_tip = False
        while True:
            try:
                datas = self.get_data_from_redis(SYNC_STEP)
                if not datas:
                    if not is_show_tip:
                        print('\n{time} no data, sleeping...    '.format(
                            time=tools.get_current_date()))
                        is_show_tip = True
                elif self.add_data_to_es(datas):
                    is_show_tip = False
                    self._sync_count += len(datas)
                    tools.print_one_line('synced %d records' % self._sync_count)
                tools.delay_time(1)
            except Exception as e:
                log.error(e)

    def get_data_from_redis(self, count):
        datas = self._redis.sget('news:news_article', count=count)
        return_datas = []
        for data in datas:
            data = eval(data)
            release_time = data.get('release_time')
            if release_time and len(release_time) == 19:
                return_datas.append(data)

        return return_datas

    def add_data_to_es(self, datas):
        return self._es.add_batch(datas,
                                  primary_key='uuid',
                                  table='news_article')
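Wiring it up is a one-liner; SYNC_STEP is assumed to be a module-level batch size used by run():

sync = SyncArtice()
sync.daemon = True
sync.start()  # loops forever: pops up to SYNC_STEP articles from 'news:news_article' and bulk-indexes them into ES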
Example #13
class UrlManager(threading.Thread, Singleton):
    def __init__(self, table_url = 'urls'):
        if not hasattr(self,'_table_url'):
            super(UrlManager, self).__init__()

            self._thread_stop = False

            self._urls_deque = collections.deque()
            self._db = RedisDB()
            self._table_url = table_url
            self._table_url_dupefilter = self._table_url + '_dupefilter'
            self._table_url_end_depth_dupefilter = self._table_url + '_end_depth_dupefilter'

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_url_to_db()
            except Exception as e:
                log.error(e)

            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_urls(self, urls):
        urls = urls if isinstance(urls, list) else [urls]
        for url in urls:
            self._urls_deque.append(url)

    def get_urls_count(self):
        return len(self._urls_deque)

    def clear_url(self):
        '''
        @summary: delete the url data in redis
        ---------
        ---------
        @result:
        '''

        self._db.clear(self._table_url)
        self._db.clear(self._table_url_dupefilter)

    def __add_url_to_db(self):
        url_list = []
        prioritys = []

        while self._urls_deque:
            url = self._urls_deque.popleft()
            url_id = tools.get_sha1(url.get('url'))
            depth = url.get('depth', 0)

            max_depth = url.get('remark', {}).get('spider_depth', 0)
            # To keep per-depth counts, fingerprint urls are stored in a zset for now; check the fingerprint set first, then the last-depth set. If per-depth counts are not needed, a plain set with the last-depth check first is recommended.
            if depth == max_depth - 1:  # last depth: stored separately so it never needs to be cleared
                if self._db.zadd(self._table_url_dupefilter, url_id, depth) and self._db.sadd(self._table_url_end_depth_dupefilter, url_id):
                    url_list.append(url)
                    prioritys.append(depth)

            elif self._db.zadd(self._table_url_dupefilter, url_id, depth):
                url_list.append(url)
                prioritys.append(depth)

            if len(url_list) > 100:
                log.debug('Flushing urls to the database')
                self._db.zadd(self._table_url, url_list, prioritys)
                url_list = []
                prioritys = []

        if url_list:
            log.debug('Flushing urls to the database')
            self._db.zadd(self._table_url, url_list, prioritys)
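The objects pushed through put_urls are dicts; a sketch of the shape __add_url_to_db expects (the field names are taken from the code above and from Example #22 below, the values are illustrative):

url_manager = UrlManager(table_url='news:news_urls')
url_manager.start()
url_manager.put_urls({
    'url': 'http://example.com/news/1.html',
    'depth': 0,
    'remark': {'spider_depth': 3},  # depth == spider_depth - 1 goes into the separate end-depth dedup set
    'retry_times': 0,
})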
Example #14
    def __init__(self):
        self.redis = RedisDB()
        self.test_url = 'https://m.weibo.cn/'
Example #15
from db.data_generator import DataGenerator
from db.redisdb import RedisDB

data_generator = DataGenerator()

# Generating data
data_generator.generate_data()

# Retrieving data
data = data_generator.get_data()

# Create Redis instance
redis = RedisDB()

redis.set(data)

print(redis.get("planes:0"))
print(redis.get("planes:1"))

# It is not necessary to indicate the "planes" prefix as all of them are "planes"

# In order to lock a nested resource use ":"
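Following the comment above, a sketch of addressing a nested field with ':' (the update signature is taken from Example #17 below; the key and field names here are hypothetical):

# update one field of the first plane, then read the whole record back
redis.update(name="planes:0", key="status", value="boarding")
print(redis.get("planes:0"))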
Example #16
class WechatService():
    _db = OracleDB()
    _es = ES()
    _redisdb = RedisDB()
    _wechat_sogou = WechatSogou()
    _wechat_public_platform = WechatPublicPlatform()

    _todo_accounts = collections.deque()
    _rownum = 1

    _is_done = False  # finished one full round
    _is_all_done = False  # all accounts' articles published today have been crawled

    # last time wechat_sogou was not blocked
    _wechat_sogou_enable = True
    _wechat_sogou_last_unenable_time = tools.get_current_timestamp()

    # last time wechat_public_platform was not blocked
    _wechat_public_platform_enable = True
    _wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

    def __init__(self):
        pass

    def __load_todo_account(self):
        accounts = WechatService._redisdb.sget('wechat:account', count=1)

        for account in accounts:
            account = eval(account)
            WechatService._todo_accounts.append(account)

    def is_have_new_article(self, account_id, account_name, __biz):
        '''
        @summary: check whether the account has newly published articles
        ---------
        @param account_id:
        @param account_name:
        @param __biz:
        ---------
        @result:
        '''

        result = ''
        if WechatService._wechat_sogou_enable:  # wechat_sogou is usable
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # new articles published, crawl them
                pass

            elif result == constance.NOT_UPDATE:
                # no new articles, pass
                pass

            elif result == constance.ERROR:
                pass

            elif result == constance.VERIFICATION_CODE:
                # blocked, request failed; record the failure time
                WechatService._wechat_sogou_enable = False
                WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        # wechat_sogou has been disabled for more than 24 hours, so it can be retried
        elif tools.get_current_timestamp() - WechatService._wechat_sogou_last_unenable_time > TIME_INTERVAL:  # wechat_sogou unavailable, but it has rested for a day; worth a try
            result = WechatService._wechat_sogou.is_have_new_article(
                account_id=account_id, account=account_name)
            if result == constance.UPDATE:
                # wechat_sogou is usable again
                WechatService._wechat_sogou_enable = True

            elif result == constance.NOT_UPDATE:
                pass

            elif result == constance.ERROR:
                pass

            elif result == constance.VERIFICATION_CODE:
                pass

            # refresh the availability timestamp
            WechatService._wechat_sogou_last_unenable_time = tools.get_current_timestamp()

        # if wechat_sogou is unavailable, fall back to wechat_public_platform to check for new articles
        if not result or result == constance.VERIFICATION_CODE:
            if WechatService._wechat_public_platform_enable:  # wechat_public_platform is usable
                result = WechatService._wechat_public_platform.is_have_new_article(
                    __biz)
                if result == constance.UPDATE:
                    # new articles published, crawl them
                    pass

                elif result == constance.NOT_UPDATE:
                    # no new articles, pass
                    pass

                elif result == constance.ERROR:
                    # blocked, request failed; record the failure time
                    WechatService._wechat_public_platform_enable = False
                    WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

            elif tools.get_current_timestamp() - WechatService._wechat_public_platform_last_unenable_time > TIME_INTERVAL:  # wechat_public_platform unavailable, but it has rested for a day; worth a try
                result = WechatService._wechat_public_platform.is_have_new_article(
                    __biz)
                if result == constance.UPDATE:
                    # new articles published, crawl them
                    WechatService._wechat_public_platform_enable = True

                elif result == constance.NOT_UPDATE:
                    # no new articles, pass
                    pass

                elif result == constance.ERROR:
                    # blocked, request failed; record the failure time
                    pass

                # refresh the availability timestamp
                WechatService._wechat_public_platform_last_unenable_time = tools.get_current_timestamp()

        return result

    def get_next_account(self):
        '''
        @summary:
        ---------
        ---------
        @result: returns biz and whether a full round has been finished, e.g. (biz, True)
        '''

        if not WechatService._todo_accounts:
            self.__load_todo_account()

        if not WechatService._todo_accounts:
            return None

        oralce_id, account_id, account_name, last_article_release_time, biz = WechatService._todo_accounts.popleft(
        )
        next_account_id = account_id
        next_account_biz = biz
        next_account_name = account_name

        next_account = next_account_id, next_account_biz

        sql = "update TAB_IOPM_SITE t set t.spider_status=602 where t.biz = '%s'" % (
            next_account_biz)
        WechatService._db.update(sql)

        return next_account

    def update_account_article_num(self, __biz):
        # query ES for counts
        # today
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "filter": {
                        "range": {
                            "record_time": {
                                "gte":
                                tools.get_current_date('%Y-%m-%d') +
                                ' 00:00:00',
                                "lte":
                                tools.get_current_date('%Y-%m-%d') +
                                ' 23:59:59'
                            }
                        }
                    },
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        today_msg = result.get('hits', {}).get('total', 0)

        # total number of historical articles
        body = {
            "size": 0,
            "query": {
                "filtered": {
                    "query": {
                        'match': {
                            "__biz": __biz
                        }
                    }
                }
            }
        }
        result = WechatService._es.search('wechat_article', body)
        total_msg = result.get('hits', {}).get('total', 0)

        if total_msg:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.total_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, total_msg, __biz)
        else:
            sql = "update TAB_IOPM_SITE t set t.today_msg = %d, t.spider_status=603 where t.biz = '%s'" % (
                today_msg, __biz)
        print(sql)
        WechatService._db.update(sql)

    def is_exist(self, table, data_id):
        if WechatService._es.get(table, data_id=data_id, doc_type=table):
            return True
        else:
            return False

    def add_article_info(self, article_info):
        '''
        @summary:
        ---------
        @param article_info:
        ---------
        @result:
        '''

        log.debug('''
            ----- article info -----
            title        %s
            release time %s
            author       %s
            account      %s
            url          %s
            ''' % (article_info['title'], article_info['release_time'],
                   article_info['author'], article_info['account'],
                   article_info['url']))

        WechatService._es.add('wechat_article', article_info,
                              article_info.get('article_id'))

    def add_account_info(self, account_info):
        log.debug('''
            ----- account info -----
            %s''' % tools.dumps_json(account_info))

        WechatService._es.add('wechat_account', account_info,
                              account_info.get('__biz'))
Example #17
        print(f"{bcolors.OKGREEN}Client 2:{bcolors.ENDC} "
              f"{bcolors.OKCYAN}Updating resource '" + RESOURCE_NAME +
              f"' {bcolors.ENDC}")

        # Update the resource
        redis.update(name=RESOURCE_NAME, key="client", value="client_2")
        redis.update(name=RESOURCE_NAME, key="random", value=random.random())
    else:
        print(
            f"{bcolors.FAIL}Client 2: Error acquiring the lock on resource '" +
            RESOURCE_NAME + f"' {bcolors.ENDC}")


# Create Redis instance
redis = RedisDB()

# Create Redlock instance
dlm = Redlock([
    {
        "host": "localhost",
        "port": 6379,
        "db": 0
    },
])

print(f"{bcolors.OKBLUE}## EXECUTING TEST 6 ##{bcolors.ENDC}")
print(
    f"{bcolors.OKBLUE} Several clients, Several locks, one resource, client blocked {bcolors.ENDC}"
)
Example #18
class Collector(threading.Thread):
    def __init__(self, tab_urls, depth, process_num = None):
        '''
        @summary:
        ---------
        @param tab_urls:
        @param depth:
        @param process_num: process number
        ---------
        @result:
        '''

        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
        self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False

        self._tab_worker_status = 'news:worker_status'
        self._worker_mark = LOCAL_HOST_IP + ('_%s'%process_num if process_num else '')

    def run(self):
        while not self._thread_stop:
            try:
                self.__input_data()
            except Exception as e:
                log.error(e)

            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        if self._urls:
            log.debug('urls not finished yet, skip fetching; url count = %s' % len(self._urls))
            return

        # report worker status
        self._db.zadd(self._tab_worker_status, self._worker_mark, 0)  # idle

        url_count = self._url_count  # default value
        # distribute urls dynamically according to the number of waiting workers
        worker_wait_count = self._db.zget_count(self._tab_worker_status, priority_min=0, priority_max=0)
        if worker_wait_count:
            # number of pending tasks
            task_count = self._db.zget_count(self._tab_urls)
            # dynamically allocated count = task count / number of idle workers
            url_count = task_count // worker_wait_count

        url_count = url_count if url_count <= self._url_count else self._url_count

        urls_list = self._db.zget(self._tab_urls, count=url_count)

        if not urls_list:
            if not self._is_show_wait:
                log.info('waiting for tasks...')
                self._is_show_wait = True
        else:
            # # record the url count (for testing)
            # url_count_record = tools.read_file('url_count.txt')
            # url_count_record = url_count_record and int(url_count_record) or 0
            # url_count_record += len(urls_list)
            # tools.write_file('url_count.txt', str(url_count_record))

            # report worker status
            self._db.zadd(self._tab_worker_status, self._worker_mark, 1)  # busy

            # store urls locally
            self.put_urls(urls_list)
            self._is_show_wait = False

        # if self.is_all_have_done():
        #     log.debug('is_all_have_done end')
        #     self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # no more urls to do
    def is_all_have_done(self):
        # log.debug('checking for pending urls: collector url size = %s | url_manager size = %s' % (len(self._urls), self._url_manager.get_urls_count()))
        if len(self._urls) == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times and self._url_manager.get_urls_count() == 0:
                return True
            else:
                return False
        else:
            self._null_times = 0
            return False


    # @tools.log_function_time
    def put_urls(self, urls_list):
        for url_info in urls_list:
            try:
                url_info = eval(url_info)
            except Exception as e:
                url_info = None

            if url_info:
                self._urls.append(url_info)

    # @tools.log_function_time
    def get_urls(self, count):
        urls = []
        count = count if count <= len(self._urls) else len(self._urls)
        while count:
            urls.append(self._urls.popleft())
            count -= 1

        return urls
Example #19
class TaskService():
    _task_ring_buff = RingBuff(TASK_BUFFER_SIZE)
    _offset = 1
    _lock = threading.RLock()
    _spider_start_timestamp = 0
    _spider_end_timestamp = 0
    _total_task_size = 0
    _db = OracleDB()
    _redisdb = RedisDB()

    def __init__(self ):
        pass

    def load_task(self):
        if TaskService._offset == 1:
            log.info('starting a new crawl round')
            TaskService._spider_start_timestamp = tools.get_current_timestamp()
            TaskService._total_task_size = 0

            # clear the url tables
            TaskService._redisdb.clear('news:news_urls')
            TaskService._redisdb.clear('news:news_urls_dupefilter')


        task_sql = '''
            select *
              from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                      from TAB_IOPM_SITE t
                     where classify = 1
                       and t.mointor_status = 701
                       and t.position != 35
                       and rownum < {page_size})
             where r >= {offset}
        '''.format(page_size = TaskService._offset + TASK_BUFFER_SIZE, offset = TaskService._offset)
        TaskService._offset += TASK_BUFFER_SIZE

        print(task_sql)
        tasks = TaskService._db.find(task_sql)
        TaskService._total_task_size += len(tasks)

        if not tasks:
            TaskService._spider_end_timestamp = tools.get_current_timestamp()
            log.info('finished one round: processed %s sites in %s' % (TaskService._total_task_size, tools.seconds_to_h_m_s(TaskService._spider_end_timestamp - TaskService._spider_start_timestamp)))
            TaskService._offset = 1
            self.load_task()

        TaskService._task_ring_buff.put_data(tasks)

    def get_task(self, count = TASK_COUNT):
        TaskService._lock.acquire()  # lock
        tasks = TaskService._task_ring_buff.get_data(count)
        if not tasks:
            self.load_task()
            tasks = TaskService._task_ring_buff.get_data(count)

        TaskService._lock.release()
        return {'tasks':tasks, 'thread_count':THREAD_COUNT}

    def update_task_status(self, tasks, status):
        TaskService._lock.acquire()  # lock
        for task in tasks:
            website_id = task[0]

            sql = "update tab_iopm_site t set t.spider_time = to_date('%s', 'yyyy-mm-dd hh24:mi:ss'), t.spider_status = %s where id = %s" % (tools.get_current_date(), status, website_id)

            TaskService._db.update(sql)
        TaskService._lock.release()
Example #20
class Collector(threading.Thread):
    def __init__(self, tab_urls, depth):
        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector",
                                 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False

    def run(self):
        while not self._thread_stop:
            self.__input_data()
            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True
        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        if self._urls:
            log.debug('urls not finished yet, skip fetching; url count = %s' % len(self._urls))
            return

        urls_list = self._db.zget(self._tab_urls, count=self._url_count)
        if not urls_list:
            if not self._is_show_wait:
                log.info('waiting for tasks...')
                self._is_show_wait = True
        else:
            # store urls locally
            self.put_urls(urls_list)
            self._is_show_wait = False

        # if self.is_all_have_done():
        #     log.debug('is_all_have_done end')
        #     self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # no more urls to do
    def is_all_have_done(self):
        # log.debug('checking for pending urls: collector url size = %s | url_manager size = %s' % (len(self._urls), self._url_manager.get_urls_count()))
        if len(self._urls) == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times and self._url_manager.get_urls_count(
            ) == 0:
                return True
            else:
                return False
        else:
            self._null_times = 0
            return False

    # @tools.log_function_time
    def put_urls(self, urls_list):
        for url_info in urls_list:
            try:
                url_info = eval(url_info)
            except Exception as e:
                url_info = None

            if url_info:
                self._urls.append(url_info)

    # @tools.log_function_time
    def get_urls(self, count):
        urls = []
        count = count if count <= len(self._urls) else len(self._urls)
        while count:
            urls.append(self._urls.popleft())
            count -= 1

        return urls
Example #21
    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._news_url_table = 'news:news_urls'
        self._news_urls_dupefilter = 'news:news_urls_dupefilter'
Example #22
class TaskManager():
    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._news_url_table = 'news:news_urls'
        self._news_urls_dupefilter = 'news:news_urls_dupefilter'

    def get_task_count(self):
        '''
        @summary: whether there are pending urls in redis
        ---------
        ---------
        @result:
        '''

        return self._redisdb.zget_count(self._news_url_table)

    def get_ever_depth_count(self, total_depth=5):
        '''
        @summary:
        ---------
        @param total_depth: exclusive; depth counted from the client's point of view
        ---------
        @result:
        '''

        depth_count_info = {}
        total_count = 0
        for depth in range(total_depth):
            key = 'urls at depth %s' % (depth + 1)
            depth_count_info[key] = self._redisdb.sget_count(
                self._news_urls_dupefilter + str(depth))
            total_count += depth_count_info[key]

        depth_count_info['total urls'] = total_count
        return depth_count_info

    def get_task_from_oracle(self):
        tasks = []

        offset = 0
        while True:
            # fetch tasks
            task_sql = '''
                select *
                  from (select t.id, t.name, t.position, t.url, t.depth, rownum r
                          from TAB_IOPM_SITE t
                         where classify = 1
                           and t.mointor_status = 701
                           and (t.position != 35 or t.position is null)
                           and rownum < {page_size})
                 where r >= {offset}
            '''.format(page_size=offset + ONE_PAGE_SIZE, offset=offset)

            results = self._oracledb.find(task_sql)
            offset += ONE_PAGE_SIZE

            if not results: break

            # assemble the urls as json-style dicts
            for task in results:
                website_id = task[0]
                website_name = task[1]
                website_position = task[2]
                website_url = task[3]
                website_domain = tools.get_domain(website_url)
                spider_depth = task[4]

                remark = {
                    'website_name': website_name,
                    'website_position': website_position,
                    'website_url': website_url,
                    'website_domain': website_domain,
                    'spider_depth': spider_depth
                }
                url_dict = {
                    'site_id': 1,
                    'url': website_url,
                    'depth': 0,
                    'remark': remark,
                    'retry_times': 0
                }

                tasks.append(url_dict)

        return tasks

    def add_task_to_redis(self, tasks):
        for task in tasks:
            url = task.get('url')
            if url:
                url_id = tools.get_sha1(url)
                if self._redisdb.sadd(self._news_urls_dupefilter, url_id):
                    self._redisdb.zadd(self._news_url_table, task, prioritys=0)
                    # the set below is used to count urls per depth
                    self._redisdb.sadd('news:news_urls_dupefilter0', url_id)

    def clear_task(self):
        # clear the url fingerprint set
        self._redisdb.sdelete('news:news_urls_dupefilter')
        # the sets below are used to count urls per depth
        self._redisdb.sdelete('news:news_urls_dupefilter0')
        self._redisdb.sdelete('news:news_urls_dupefilter1')
        self._redisdb.sdelete('news:news_urls_dupefilter2')
        self._redisdb.sdelete('news:news_urls_dupefilter3')
        self._redisdb.sdelete('news:news_urls_dupefilter4')
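A sketch of one full dispatch cycle with the class above (ONE_PAGE_SIZE is assumed to be a module-level page size used by get_task_from_oracle):

task_manager = TaskManager()

task_manager.clear_task()                    # drop the old fingerprint sets
tasks = task_manager.get_task_from_oracle()  # page through TAB_IOPM_SITE and build url dicts
task_manager.add_task_to_redis(tasks)        # dedup by sha1(url) and push into the url zset
print(task_manager.get_task_count())
print(task_manager.get_ever_depth_count(total_depth=5))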
Example #23
import socket
import os, signal
import time
from storage_module.dection_ping_proxy import check_ip
from retrying import retry
# def write(content_info):
#     f = open('D:\start_get_ip\pid.txt','a',encoding="utf-8")
#     f.write(str(content_info)+"\n")
#     f.close()
# pid=os.getpid()
# print(pid)
# write(pid)
# while True:
#     print(1)
#     time.sleep(34)
redis_0 = RedisDB()
MAX_POOL = 400
config = os.path.join(r'D:\proxy', 'config.conf')

redis_key = tools.get_conf_value(config, 'redis', 'redis_key')
redis_key2 = tools.get_conf_value(config, 'redis', 'redis_key2')
bj_ip = socket.gethostbyname(socket.gethostname())


def retry(attempt):  # note: this local definition shadows the `retry` imported from the retrying package above
    def decorator(func):
        def wrapper(*args, **kw):
            att = 0
            while att < attempt:
                try:
                    return func(*args, **kw)
                except Exception:
                    # assumed completion of the truncated snippet: count the failure and retry
                    att += 1
        return wrapper
    return decorator
Example #24
class ArticleManager(threading.Thread, Singleton):
    def __init__(self, table_article = 'articles'):
        if not hasattr(self,'_table_article'):
            super(ArticleManager, self).__init__()

            self._thread_stop = False

            self._articles_deque = collections.deque()
            self._db = RedisDB()
            self._table_article = table_article
            self._table_article_bloomfilter = table_article + '_bloomfilter'

            self._bloomfilter = BloomFilter(redis_obj = self._db, key = self._table_article_bloomfilter)

    def run(self):
        while not self._thread_stop:
            try:
                self.__add_article_to_db()
            except Exception as e:
                log.error(e)

            log.debug('articles in cache: %s' % len(self._articles_deque))
            tools.delay_time(1)

    def stop(self):
        self._thread_stop = True

    def put_articles(self, article):
        self._articles_deque.append(article)

        if self.get_articles_count() > MAX_ARTICLE_COUNT:  # cache limit exceeded, flush to the database proactively
            self.__add_article_to_db()

    def get_articles_count(self):
        return len(self._articles_deque)

    def clear_article(self):
        '''
        @summary: delete the article data in redis
        ---------
        ---------
        @result:
        '''

        self._db.clear(self._table_article)

    def __add_article_to_db(self):
        article_list = []
        while self._articles_deque:
            article = self._articles_deque.popleft()
            # deduplicate: only keep articles whose uuid has not been seen before
            if not self._bloomfilter.is_contains(article.get('uuid')):
                self._bloomfilter.insert(article.get('uuid'))
                article_list.append(article)

            if len(article_list) > 100:
                log.debug('Flushing articles to the database')
                self._db.sadd(self._table_article, article_list)
                article_list = []

        if article_list:
            log.debug('Flushing articles to the database')
            self._db.sadd(self._table_article, article_list)
Example #25
class TaskManager():
    IS_IN_TIME_RANGE = 1  # within the time range
    NOT_REACH_TIME_RANGE = 2  # time range not reached yet
    OVER_MIN_TIME_RANGE = 3  # past the lower bound of the time range

    def __init__(self):

        self._mysqldb = MysqlDB(**config.get('mysqldb'))
        self._redis = RedisDB(**config.get('redisdb'))

        self._task_root_key = config.get('spider').get('redis_task_cache_root_key')

        self._account_task_key = self._task_root_key + ':z_account_task'
        self._article_task_key = self._task_root_key + ':z_article_task'
        self._last_article_publish_time = self._task_root_key + ':h_last_article_publish_time'
        self._new_last_article_publish_time = self._task_root_key + ':h_new_last_article_publish_time'

        self._ignore_haved_crawl_today_article_account = config.get('spider').get('ignore_haved_crawl_today_article_account')
        self._monitor_interval = config.get('spider').get('monitor_interval')
        self._zombie_account_not_publish_article_days = config.get('spider').get('zombie_account_not_publish_article_days')
        self._spider_interval_min = config.get('spider').get('spider_interval').get('min_sleep_time')
        self._spider_interval_max = config.get('spider').get('spider_interval').get('max_sleep_time')
        self._crawl_time_range = (config.get("spider").get("crawl_time_range") or "~").split('~')

    def __get_task_from_redis(self, key):
        task = self._redis.zget(key, is_pop=True)
        if task:
            task = eval(task[0])
            return task

    def __random_int(self, min, max):
        pass

    def get_account_task(self):
        """
        Fetch an account task
        :return:
            {'__biz': 'Mjc1NjM3MjY2MA==', 'last_publish_time': None}
            or
            None
        """
        task = self.__get_task_from_redis(self._account_task_key)
        if not task:
            publish_time_condition = "AND last_publish_time < '{today}'".format(today=tools.get_current_date(date_format='%Y-%m-%d' + ' 00:00:00')) if self._ignore_haved_crawl_today_article_account else ''
            sql = '''
                SELECT
                    __biz,
                    last_publish_time
                FROM
                    wechat_account_task
                WHERE
                    `is_zombie` != 1
                AND (
                    (
                        (
                            UNIX_TIMESTAMP(CURRENT_TIMESTAMP) - UNIX_TIMESTAMP(last_spider_time)
                        ) > {monitor_interval}
                        {publish_time_condition}
                    )
                    OR (last_spider_time IS NULL)
                )
                '''.format(monitor_interval=self._monitor_interval, publish_time_condition=publish_time_condition)

            tasks = self._mysqldb.find(sql, to_json=True)
            if tasks:
                self._redis.zadd(self._account_task_key, tasks)
                task = self.__get_task_from_redis(self._account_task_key)

        return task

    def get_article_task(self):
        """
        Fetch an article task
        :return:
            {'article_url': 'http://mp.weixin.qq.com/s?__biz=MzIxNzg1ODQ0MQ==&mid=2247485501&idx=1&sn=92721338ddbf7d907eaf03a70a0715bd&chksm=97f220dba085a9cd2b9a922fb174c767603203d6dbd2a7d3a6dc41b3400a0c477a8d62b96396&scene=27#wechat_redirect'}
            or
            None
        """
        task = self.__get_task_from_redis(self._article_task_key)
        if not task:
            sql = 'select id, article_url from wechat_article_task where state = 0 limit 5000'
            tasks = self._mysqldb.find(sql)
            if tasks:
                # update the task states
                task_ids = str(tuple([task[0] for task in tasks])).replace(',)', ')')
                sql = 'update wechat_article_task set state = 2 where id in %s' % (task_ids)
                self._mysqldb.update(sql)

            else:
                sql = 'select id, article_url from wechat_article_task where state = 2 limit 5000'
                tasks = self._mysqldb.find(sql)

            if tasks:
                task_json = [
                    {
                        'article_url': article_url
                    }
                    for id, article_url in tasks
                ]
                self._redis.zadd(self._article_task_key, task_json)
                task = self.__get_task_from_redis(self._article_task_key)

        return task

    def update_article_task_state(self, sn, state=1):
        sql = 'update wechat_article_task set state = %s where sn = "%s"' % (state, sn)
        self._mysqldb.update(sql)

    def record_last_article_publish_time(self, __biz, last_publish_time):
        self._redis.hset(self._last_article_publish_time, __biz, last_publish_time or '')

    def is_reach_last_article_publish_time(self, __biz, publish_time):
        last_publish_time = self._redis.hget(self._last_article_publish_time, __biz)
        if not last_publish_time:
            # check whether the task exists in mysql
            sql = "select last_publish_time from wechat_account_task where __biz = '%s'" % __biz
            data = self._mysqldb.find(sql)
            if data:  # [(None,)] / []
                last_publish_time = str(data[0][0] or '')
                self.record_last_article_publish_time(__biz, last_publish_time)

        if last_publish_time is None:
            return

        if publish_time < last_publish_time:
            return True

        return False

    def is_in_crawl_time_range(self, publish_time):
        """
        Whether the publish time falls within the crawl time range
        :param publish_time:
        :return: whether the time range has been reached (one of the constants above)
        """
        if not publish_time or (not self._crawl_time_range[0] and not self._crawl_time_range[1]):
            return TaskManager.IS_IN_TIME_RANGE

        if self._crawl_time_range[0]:  # upper bound of the time range
            if publish_time > self._crawl_time_range[0]:
                return TaskManager.NOT_REACH_TIME_RANGE

            if publish_time <= self._crawl_time_range[0] and publish_time >= self._crawl_time_range[1]:
                return TaskManager.IS_IN_TIME_RANGE

        if publish_time < self._crawl_time_range[1]:  # lower bound
            return TaskManager.OVER_MIN_TIME_RANGE

        return TaskManager.IS_IN_TIME_RANGE

    def record_new_last_article_publish_time(self, __biz, new_last_publish_time):
        self._redis.hset(self._new_last_article_publish_time, __biz, new_last_publish_time)

    def get_new_last_article_publish_time(self, __biz):
        return self._redis.hget(self._new_last_article_publish_time, __biz)

    def update_account_last_publish_time(self, __biz, last_publish_time):
        sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}" where __biz="{}"'.format(
            last_publish_time, tools.get_current_date(), __biz
        )
        self._mysqldb.update(sql)

    def is_zombie_account(self, last_publish_timestamp):
        if tools.get_current_timestamp() - last_publish_timestamp > self._zombie_account_not_publish_article_days * 86400:
            return True
        return False

    def sign_account_is_zombie(self, __biz, last_publish_time=None):
        if last_publish_time:
            sql = 'update wechat_account_task set last_publish_time = "{}", last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
                last_publish_time, tools.get_current_date(), __biz
            )
        else:
            sql = 'update wechat_account_task set last_spider_time="{}", is_zombie=1 where __biz="{}"'.format(
                tools.get_current_date(), __biz
            )

        self._mysqldb.update(sql)

    def get_task(self, url=None, tip=''):
        """
        Fetch a task
        :param url: when a url is given, return it wrapped as a task; otherwise take an account task first, then an article task. If neither exists, sleep for a while before fetching again
        :return:
        """

        sleep_time = random.randint(self._spider_interval_min, self._spider_interval_max)

        if not url:
            account_task = self.get_account_task()
            if account_task:
                __biz = account_task.get('__biz')
                last_publish_time = account_task.get('last_publish_time')
                self.record_last_article_publish_time(__biz, last_publish_time)
                tip = 'crawling the article list'
                url = 'https://mp.weixin.qq.com/mp/profile_ext?action=home&__biz={}&scene=124#wechat_redirect'.format(__biz)
            else:
                article_task = self.get_article_task()
                if article_task:
                    tip = 'crawling the article detail'
                    url = article_task.get('article_url')
                else:
                    sleep_time = config.get('spider').get('no_task_sleep_time')
                    log.info('no task, sleeping for {}s'.format(sleep_time))
                    tip = 'no task '

        if url:
            next_page = "{tip} sleeping {sleep_time}s, next refresh at {begin_spider_time} <script>setTimeout(function(){{window.location.href='{url}';}},{sleep_time_msec});</script>".format(
                tip=tip and tip + ' ', sleep_time=sleep_time, begin_spider_time=tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time), url=url, sleep_time_msec=sleep_time * 1000
            )
        else:
            next_page = "{tip} sleeping {sleep_time}s, next refresh at {begin_spider_time} <script>setTimeout(function(){{window.location.reload();}},{sleep_time_msec});</script>".format(
                tip=tip and tip + ' ', sleep_time=sleep_time, begin_spider_time=tools.timestamp_to_date(tools.get_current_timestamp() + sleep_time), sleep_time_msec=sleep_time * 1000
            )

        return next_page

    def reset_task(self):
        # clear the redis cache
        keys = self._task_root_key + "*"
        keys = self._redis.getkeys(keys)
        if keys:
            for key in keys:
                self._redis.clear(key)

            # reset the tasks
            sql = "update wechat_article_task set state = 0 where state = 2"
            self._mysqldb.update(sql)
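A sketch of how a client of this manager might poll it (get_task returns an HTML snippet telling the WeChat webview which url to load next; the sn value below is the one from the docstring example):

task_manager = TaskManager()

next_page_html = task_manager.get_task()  # account task first, then article task, else a sleep page
print(next_page_html)

# mark an article as crawled once its sn is known
task_manager.update_article_task_state(sn='92721338ddbf7d907eaf03a70a0715bd', state=1)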
Example #26
    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._wechat_sogo = WechatSogou()
Example #27
    def __init__(self):
        self.redis = RedisDB()
        self.test_url = "https://movie.douban.com/"
Example #28
    def __init__(self):
        self.redis = RedisDB()
Example #29
    def __init__(self):
        super(SyncArtice, self).__init__()

        self._es = ES()
        self._redis = RedisDB()
        self._sync_count = 0
Example #30
class CheckNewArticle():
    def __init__(self):
        self._oracledb = OracleDB()
        self._redisdb = RedisDB()
        self._wechat_sogo = WechatSogou()

    def get_wait_check_account(self):
        '''
        @summary:
        ---------
        @param :
        ---------
        @result:
        '''
        # take accounts that have finished crawling and whose latest publish time is more than two hours old, and check again whether they have published new articles
        before_tow_hours = tools.timestamp_to_date(
            tools.get_current_timestamp() - 60 * 60 * 2)
        sql = '''
            select t.id,
                   t.domain,
                   t.name,
                   to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'),
                   t.biz
              from TAB_IOPM_SITE t
             where t.biz is not null
               and mointor_status = 701
               and t.spider_status = 603
               and (t.last_article_release_time is null or
                   t.last_article_release_time <=
                   to_date('{}', 'yyyy-mm-dd hh24:mi:ss'))
        '''.format(before_tow_hours)

        accounts = self._oracledb.find(sql)

        # if there are no finished accounts and redis holds no crawl tasks, the non-603 records in the database are probably lost tasks and need to be dispatched again
        if not accounts and not self._redisdb.sget_count('wechat:account'):
            sql = '''
                select t.id,
                       t.domain,
                       t.name,
                       to_char(t.last_article_release_time, 'yyyy-mm-dd hh24:mi:ss'),
                       t.biz
                  from TAB_IOPM_SITE t
                 where t.biz is not null
                   and mointor_status = 701
                   and t.spider_status != 603
            '''

            accounts = self._oracledb.find(sql)

        return accounts

    def check_new_article(self, account):
        oralce_id, account_id, account_name, last_article_release_time, biz = account

        article_release_time = self._wechat_sogo.get_article_release_time(
            account_id=account_id, account=account_name)
        print(article_release_time)
        if article_release_time:
            last_article_release_time = last_article_release_time or ''
            if article_release_time >= tools.get_current_date(
                    '%Y-%m-%d'
            ) and article_release_time > last_article_release_time:
                print('{} has new articles waiting to be crawled. Publish time: {}'.format(
                    account_name, article_release_time))

                sql = '''
                    update TAB_IOPM_SITE t set t.spider_status = 601,
                     t.last_article_release_time =
                           to_date('{}', 'yyyy-mm-dd hh24:mi:ss')
                     where id = {}
                '''.format(article_release_time, oralce_id)

                # multithreaded: each thread must hold its own database connection
                oracledb = OracleDB()
                oracledb.update(sql)
                oracledb.close()

                # push into redis as the task pool for the wechat spider
                data = (oralce_id, account_id, account_name,
                        last_article_release_time, biz)
                self._redisdb.sadd('wechat:account', data)