Example #1
 def _update_hash(self):
     """Updates information hash and if update
     of whole tree is required adds the pk to
     the update hash queue"""
     new = hash_md5(str(self))
     if new != self._info_hash:
         self._info_hash = new
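
Every example below relies on a small `hash_md5` helper that the listing never shows. A minimal sketch of what such a helper conventionally looks like (an assumption, not the projects' actual code):

    import hashlib

    def hash_md5(text):
        # 32-character hexadecimal MD5 digest of a (unicode) string
        return hashlib.md5(text.encode('utf-8')).hexdigest()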
Example #2
 def test_information_node_creation(self):
     info_node = InformationNode(randomPK, **temp_info)
     self.assertIsInstance(info_node, InformationNode)
     self.assertEqual(info_node._pk, randomPK)
     self.assertEqual(info_node._data_holder, temp_info)
     self.assertEqual(info_node._info_hash, hash_md5(str(info_node)))
     self.assertTrue(check_valid_hash(info_node._info_hash))
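
The test also calls a `check_valid_hash` helper that is not shown. Assuming `hash_md5` returns a hex digest as sketched above, a plausible validator is just a format check (hypothetical, for illustration):

    import re

    def check_valid_hash(value):
        # "valid" here means: exactly 32 lowercase hex characters
        return bool(re.fullmatch(r'[0-9a-f]{32}', value))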
Example #3
def into_url_save_dir(mysql_handle_base, job_body):
    '''
    Insert the task results into the url_save_dir table.
    '''
    for once_task in job_body['task_list']:
        url = once_task['url']
        if 'path' not in once_task:
            continue
        timestamp_dir_path = once_task['path']
        file_list = os.listdir(timestamp_dir_path)
        insert_gray_dir_tree(mysql_handle_base, url, timestamp_dir_path,
                             file_list)
        # the resource-count update reuses this task's saved path
        if 'web_save_resource_num' in once_task:
            web_save_resource_num = once_task['web_save_resource_num']
            timestamp_abs_dir_clist = timestamp_dir_path.split('/')
            update_fields = {
                'html': [int(web_save_resource_num['html_num']), 'd'],
                'css': [int(web_save_resource_num['css_num']), 'd'],
                'js': [int(web_save_resource_num['js_num']), 'd'],
                'pic': [int(web_save_resource_num['img_num']), 'd']
            }
            wheres = {
                'url_hash': [hash_md5(once_task['url']), 's'],
                'timestamp': [timestamp_abs_dir_clist[-1], 's']
            }
            result = mysql_handle_base.update('url_save_dir', update_fields,
                                              wheres)
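
Several examples hand `mysql_handle_base.update` two dicts that map a column name to a `[value, type_char]` pair ('s' for string, 'd' for integer). The handler itself is not part of the listing; a hedged sketch of how an `update` method might consume this convention (the class shape and connection handling are assumptions):

    class MysqlHandleBase:
        def __init__(self, conn):
            self.conn = conn  # a PEP 249 connection, e.g. pymysql.connect(...)

        def update(self, table, fields, wheres):
            # fields/wheres: {column: [value, type_char]}; the type character
            # mirrors printf-style %s/%d markers, but with parameter binding
            # every placeholder can safely be %s
            set_clause = ', '.join('{} = %s'.format(col) for col in fields)
            where_clause = ' AND '.join('{} = %s'.format(col) for col in wheres)
            params = [pair[0] for pair in fields.values()]
            params += [pair[0] for pair in wheres.values()]
            sql = 'UPDATE {} SET {} WHERE {}'.format(table, set_clause,
                                                     where_clause)
            with self.conn.cursor() as cursor:
                cursor.execute(sql, params)
            self.conn.commit()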
Example #4
 def _update_children_hash(self):
     """ Updates children hash.
     """
     if not self._children:
         self._set_base_attribute('_children_hash', DEFAULT_HASH_VALUE)
         return
     temp = ''.join((x.get_hash() for x in self._children))
     self._children_hash = hash_md5(temp)
Example #5
def hash_exist(mysql_handle_base, url, timestamp_dir_path):
    '''
    Check whether a record already exists in the url_save_dir table.
    '''
    wheres = {
        'url_hash': [hash_md5(url), 's'],
        'timestamp': [timestamp_dir_path.split('/')[-1], 's']
    }
    return wheres
Example #6
 def __init__(self, config_name='test_engine_conf.yaml'):
     super(Engine_Model, self).__init__()
     self.config_name = config_name
     self.CURRENT_PATH = sys.path[0]
     self.read_config_public()
     self.engine_id = hash_md5(getLocalIp() + self.engine_type +
                               self.CURRENT_PATH)
     self.mq = multiprocessing.Queue()
     self.lock = multiprocessing.Lock()
     self.start_server_heart_beat()
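
`getLocalIp` is not shown either; a common way to discover the machine's outbound address, which would make `engine_id` stable per host, engine type, and install path (an assumption about the helper, not the project's code):

    import socket

    def getLocalIp():
        # connect() on a UDP socket sends no packet; it only selects the
        # local interface that would route to the given address
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        try:
            s.connect(('8.8.8.8', 80))
            return s.getsockname()[0]
        finally:
            s.close()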
Example #7
    def _update_hash(self):
        """ Updates hash of self, as well as of the tree.
        If hash has changed. Insert self pk into _update_hash_queue
        so as to inform that I have a new updated hash, and the
        corresponding parents should be updated too.
        """

        old = self.get_hash()
        self._info._update_hash()
        self._update_children_hash() # assumes that all children have clean hash
        self._hash = hash_md5(self.get_children_hash() + self.get_info_hash())
        new = self.get_hash()

        self._touch()
        if new != old:
            # propagate the hash upwards
            self._update_hash_queue.add(self._pk)
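
Examples #1, #4, and #7 together form a Merkle-tree-style scheme: a node's hash is the MD5 of its children's concatenated hashes plus its own info hash, so any change in a leaf ripples up to the root. A self-contained sketch of the same idea (a hypothetical `Node`, not the project's class):

    import hashlib

    def hash_md5(text):
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    class Node:
        def __init__(self, info, children=()):
            self.info = info
            self.children = list(children)

        def get_hash(self):
            # hash = md5(concatenated child hashes + md5(own info));
            # recomputed on demand rather than cached, unlike the examples above
            children_hash = ''.join(c.get_hash() for c in self.children)
            return hash_md5(children_hash + hash_md5(self.info))

    leaf = Node('a')
    root = Node('root', [leaf])
    before = root.get_hash()
    leaf.info = 'b'                    # mutate a leaf...
    assert root.get_hash() != before   # ...and the root hash changes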
Example #8
def into_url_list(mysql_handle_base, job_body):
    '''
    Insert a record for the task results into the url_list table.
    '''
    engine_describe_all = engine_describe.get_engine_describe(
        mysql_handle_base)
    # fill in the table with the relevant data from task_list
    table_field_descripe = TableFieldDescripe
    insert_urls(mysql_handle_base, job_body['url_list'], job_body['add_way'])
    update_fields = add_table_feild(job_body, table_field_descripe)
    # avoid a duplicate when add_way is added again below
    if 'add_way' in update_fields:
        del update_fields['add_way']
    for once_task in job_body['task_list']:
        update_fields = add_table_feild(
            once_task, table_field_descripe, update_fields)
        # avoid a duplicate when url is added again below
        if 'url' in update_fields:
            del update_fields['url']
        run_win_engine_list = []
        run_error_engine_list = []
        # collect successful and failed engines into their respective lists
        # (failures split into page-analysis failures and engine-startup failures)
        # page-analysis failures
        for field, value in once_task.iteritems():
            if field.find('_status') != -1:
                engine_code_name = field[0:field.find('_status')]
                engine_type = engine_describe_all[engine_code_name]
                if value == True:
                    run_win_engine_list.append(engine_type)
                elif value == False:
                    run_error_engine_list.append(engine_type)
        # engine-startup failures
        run_error_engine_list.extend(job_body['run_error_engine'])
        # convert the engine results to string form
        update_fields['run_win_engine'] = ['-'.join(run_win_engine_list), 's']
        update_fields['run_error_engine'] = ['-'.join(run_error_engine_list), 's']
        update_fields['waiting_engine'] = ['-'.join(update_fields['waiting_engine'][0]), 's']
        update_fields['running_engine'] = ['-'.join(update_fields['running_engine'][0]), 's']
        # look up the IP location of the task url
        ip_location = get_ip_location(once_task['url'])
        update_fields = add_table_feild(
            ip_location, table_field_descripe, update_fields)
        wheres = {'url_hash': [hash_md5(once_task['url']), 's']}
        result = mysql_handle_base.update(
            'url_list', update_fields, wheres)
Example #9
        def __deco(mysql_handle_base, urls, add_way='test'):
            @arg_exist(table_name)
            def hash_exist(mysql_handle_base, url):
                wheres = {'url_hash': [hash_md5(url), 's']}
                return wheres

            fields = [('url_hash', 's'), ('url', 's'), ('add_way', 's'),
                      ('add_time', 's')]
            param = []
            for url in urls:
                if hash_exist(mysql_handle_base,
                              url) is not False:  # the URL already exists
                    continue
                param.append((hash_md5(url), url, add_way, get_format_time()))
            if param == []:
                return False
            param = tuple(param)
            return mysql_handle_base.batch_insert(table_name, fields, param)
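
The various `hash_exist` helpers (Examples #5, #9, #11, #12) only build and return a `wheres` dict; the actual lookup is delegated to the `@arg_exist(table_name)` decorator. Its implementation is not in the listing, but judging from the `is not False` check above, a sketch might look like this (assumed behavior):

    from functools import wraps

    def arg_exist(table_name):
        def decorator(build_wheres):
            @wraps(build_wheres)
            def wrapper(mysql_handle_base, *args, **kwargs):
                # run the wrapped builder, then perform the actual lookup
                wheres = build_wheres(mysql_handle_base, *args, **kwargs)
                rows = mysql_handle_base.select(table_name, ['*'], wheres)
                return rows if rows else False  # False signals "not present"
            return wrapper
        return decorator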
Example #10
    def run(self):
        count = 0
        while True:
            # ADD_COLLECTION: supplementary account collection; get_account: routine
            # collection; account_list accommodates both a single account and a list
            account_list = ADD_COLLECTION if ADD_COLLECTION else self.get_account()
            # length = len(threading.enumerate())  # enumerate returns a list
            log.info('Running thread count: {}'.format(threading.active_count()))
            log.info('Current process: {}'.format(
                multiprocessing.current_process().name))
            count += 1
            log.info('Round {}'.format(count))
            if account_list is None:
                log.info('Scheduling queue is empty, sleeping for 5 seconds')
                time.sleep(5)
                continue
            for account_name in account_list:
                try:
                    self.search_name = account_name
                    html_account = self.account_homepage()
                    if html_account:
                        html = html_account
                    else:
                        log.info('{}|WeChat account not found'.format(account_name))
                        continue
                    urls_article = self.urls_article(html)
                    # build the account info
                    account = Account()
                    account.name = self.name
                    account.account = account_name
                    account.tags = self.get_tags()
                    account.get_account_id()
                    # dedup: check the backend store
                    # ids = self.dedup(account_name) if JUDEG else ''
                    # dedup via redis
                    sentenced_keys = account.account + ' ' + str(
                        account.account_id)
                    keys = hash_md5(sentenced_keys)
                    log.info('keys: {}'.format(keys))
                    dedup_result = self.dedup_redis(keys)
                    post_dedup_urls = []

                    entity = None
                    backpack_list = []
                    ftp_list = []
                    ftp_info = None
                    for page_count, url in enumerate(urls_article):
                        try:
                            # if page_count > 5:
                            #     break
                            article = Article()
                            article.create(url, account, self.proxies)
                            log.info('Item {}, article title: {}'.format(
                                page_count, article.title))
                            log.info("Current article url: {}".format(url))
                            entity = JsonEntity(article, account)
                            log.info('Current article ID: {}'.format(entity.id))
                            article_date = datetime.datetime.fromtimestamp(
                                int(str(article.time)[:-3]))
                            day_diff = datetime.date.today() - article_date.date()
                            if day_diff.days > 15:
                                log.info(
                                    'Articles older than the 15-day collection interval '
                                    'are skipped; {} articles collected so far'.format(
                                        page_count))
                                self.count_articles(page_count)
                                break
                            if dedup_result:
                                # title_time_str = entity.title + str(entity.time)
                                # title_time_md5 = hash_md5(title_time_str)
                                if entity.id in dedup_result:
                                    log.info('Article already exists, skipping')
                                    continue
                                else:
                                    post_dedup_urls.append(entity.id)
                            else:
                                # title_time_str = entity.title + str(entity.time)
                                # title_time_md5 = hash_md5(title_time_str)
                                post_dedup_urls.append(entity.id)

                            # dedup_result = self.dedup_redis(entity)
                            # if dedup_result:
                            #     log.info('Article already exists, skipping')
                            # ids = ids.append({'key': entity.id, 'urls': entity.url})
                            # if entity.id in ids and JUDEG is True:
                            #     log.info('Article already exists, skipping')
                            #     continue
                            backpack = Backpack()
                            backpack.create(entity)
                            backpack_list.append(backpack.create_backpack())
                            # self.save_to_mysql(entity)
                            # self.save_to_mongo(entity.to_dict())
                            # ftp package
                            ftp_info = Ftp(entity)
                            name_xml = ftp_info.hash_md5(ftp_info.url)
                            log.info('Current article xml: {}'.format(name_xml))
                            self.create_xml(ftp_info.ftp_dict(), name_xml)
                            ftp_list.append(name_xml)
                        except Exception as run_error:
                            log.info('WeChat article parsing error: {}'.format(run_error))
                            continue

                    log.info("开始发包")
                    if entity and backpack_list:
                        # 直接发底层
                        # entity.uploads(backpack_list)
                        entity.uploads_datacenter_relay(backpack_list)
                        entity.uploads_datacenter_unity(backpack_list)
                        log.info("数据中心,三合一,发包完成")
                    else:
                        log.info('包列表为空,不发送数据')
                        continue
                    # todo 发包超时,修改MTU
                    if ftp_info is not None:
                        entity.uploads_ftp(ftp_info, ftp_list)
                        log.info("ftp发包完成")
                    if post_dedup_urls:
                        log.info('Uploading to dedup center key: {} urls: {}'.format(
                            keys, post_dedup_urls))
                        url = 'http://47.100.53.87:8008/Schedule/CacheWx'
                        data = [{
                            "key": keys,
                            "sourceNodes": "1",
                            "sourceType": "2",
                            "urls": post_dedup_urls
                        }]
                        r = requests.post(url,
                                          data=json.dumps(data),
                                          timeout=self.timeout)
                        log.info('Dedup center upload result: {}'.format(r.status_code))
                except Exception as e:
                    log.exception("Official account parsing error {}".format(e))
                    time.sleep(30)
                    if ('chrome not reachable'
                            in str(e)) or ('Message: timeout' in str(e)):
                        raise RuntimeError('chrome not reachable')
            if ADD_COLLECTION:
                break
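
Example #10 dedups articles with an MD5 key derived from the account name and account id, then tests `entity.id in dedup_result`. The crawler's `dedup_redis` is not shown; a minimal sketch of that pattern with redis-py, assuming the already-seen article ids live in a Redis set per account key:

    import hashlib
    import redis

    r = redis.Redis(host='localhost', port=6379, decode_responses=True)

    def dedup_key(account, account_id):
        # stable per-account key: md5('<account> <account_id>'),
        # mirroring the sentenced_keys construction above
        raw = '{} {}'.format(account, account_id)
        return hashlib.md5(raw.encode('utf-8')).hexdigest()

    def dedup_redis(key):
        # the set of article ids already seen for this account;
        # the caller then tests membership with `entity.id in dedup_result`
        return r.smembers(key)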
Example #11
def hash_exist(mysql_handle_base, url):
    '''
    Check whether a url already exists in the url_list table.
    '''
    wheres = {'url_hash': [hash_md5(url), 's']}
    return wheres
Example #12
 def hash_exist(mysql_handle_base, url):
     wheres = {'url_hash': [hash_md5(url), 's']}
     return wheres
Example #13
 def __deco(mysql_handle_base,
            url,
            timestamp_dir_path,
            file_list=(),  # immutable default avoids the shared mutable-list pitfall
            update_sign=True):
     timestamp_abs_dir_clist = timestamp_dir_path.split('/')
     fields = {
         'url_hash': [hash_md5(url), 's'],
         'timestamp': [timestamp_abs_dir_clist[-1], 's'],
         'url': [url, 's'],
         'update_time': [get_format_time(), 's'],
         'save_path': [
             '/'.join(timestamp_abs_dir_clist[timestamp_abs_dir_clist.
                                              index('web_info') + 1:]),
             's'
         ],
         'url_file': [0, 'd'],
         'main_html': [0, 'd'],
         'normal_html_html': [0, 'd'],
         'html': [0, 'd'],
         'css': [0, 'd'],
         'js': [0, 'd'],
         'pic': [0, 'd'],
         'text_json': [0, 'd'],
         'block_json': [0, 'd'],
         'border_json': [0, 'd'],
         'block_html': [0, 'd'],
         'cut_img': [0, 'd'],
         'vips_imgs_txt': [0, 'd'],
         'view_json': [0, 'd'],
         'webpage_jpeg': [0, 'd'],
         'blockpage_jpeg': [0, 'd'],
         'other': ['', 's']
     }
     for f in file_list:
         if f == 'images':  # v2 of the page-save engine stores resources under the images directory
             resource_nums = count_resource_num(timestamp_dir_path)
             fields['js'][0] = resource_nums[0]
             fields['css'][0] = resource_nums[1]
             fields['pic'][0] = resource_nums[2]
             fields['html'][0] = resource_nums[3]
         elif f in fields:
             # count files or directories whose names match a field; in v1 of the
             # page-save engine each resource was stored in the directory of the same name
             if os.path.isdir(pjoin(timestamp_dir_path, f)):
                 f_num = len(os.listdir(pjoin(timestamp_dir_path, f)))
                 fields[f][0] = f_num
             else:
                 fields[f][0] = 1
         elif f.replace('.', '_') in fields:  # files that carry an extension
             fields[f.replace('.', '_')][0] = 1
         else:
             fields['other'][0] += (f + '/')
     wheres = {
         'url_hash': fields['url_hash'],
         'timestamp': fields['timestamp']
     }
     if mysql_handle_base.select(table_name, ['*'], wheres):
         if update_sign:
             mysql_handle_base.update(table_name, fields, wheres)
     else:
         mysql_handle_base.insert(table_name, fields)
     return True
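
Example #13 ends with a select-then-update-or-insert round trip. On MySQL the same upsert can usually be collapsed into one statement, assuming a unique key over (url_hash, timestamp) (a sketch, not the project's handler):

    UPSERT_SQL = '''
        INSERT INTO url_save_dir (url_hash, `timestamp`, url, html, css, js, pic)
        VALUES (%s, %s, %s, %s, %s, %s, %s)
        ON DUPLICATE KEY UPDATE
            html = VALUES(html), css = VALUES(css),
            js = VALUES(js), pic = VALUES(pic)
    '''

    def upsert_save_dir(cursor, url_hash, timestamp, url, html, css, js, pic):
        # one round trip instead of SELECT + UPDATE/INSERT
        cursor.execute(UPSERT_SQL, (url_hash, timestamp, url, html, css, js, pic))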
Example #14
    def run(self):
        # self.set_name()
        # while True:
        account_list = ['大数据发布', '上海港湾集团', '绿盟365', '酌梦录', '瞄了个喵', '豪德通讯', '魔都娱乐1', '大侠的小宇宙', '澳洲梦', '盛世路跑', '佛系金融女',
                        '中卫今日热点', '金华社区居委会', '昕说法', '华农海洋研会', '尘埃一生', '革镇堡街道普法', '速度车行', '七分钟高清视频', '摘星少女酱',
                        '青海省格尔木市健桥医院', '乐用好车', '最强省钱喵喵君', '石柱港航', '荣盛物业长沙花语馨苑客服中心', '汕头超声集团', '中奥吴郡半岛', '隽永人生',
                        '飞鸿影视传媒', 'RGSE义乌雨具遮阳及防护用品展']

        articles = []
        ID = hash_md5(self.name)

        for name in account_list:
            if len(name) == 0:
                continue
            self.name = name

            html_account = self.account_homepage()
            if html_account:
                html, account_of_homepage = html_account
            else:
                continue
            log('start official account: ', self.name)
            urls_article = self.urls_article(html)

            account = Account()
            account.name = self.name
            account.account = account_of_homepage
            account.get_account_id()

            backpack_list = []
            for page_count, url in enumerate(urls_article):
                # if page_count < 35:
                #     continue
                article = Article()
                article.create(url, self.name)
                log('Article title:', article.title)
                log("Item {}".format(page_count))

                entity = JsonEntity(article, account)
                backpack = Backpack()
                backpack.create(entity)
                backpack_list.append(backpack.create_backpack())

                # all articles
                article_info = backpack.to_dict()
                articles.append({ID: article_info})
                # upload to the database
                import pymongo
                conn = pymongo.MongoClient('120.78.237.213', 27017)
                sql = '''
                        INSERT INTO
                            account_http(article_url, addon, account, account_id, author, id, title)
                        VALUES
                            (%s, %s, %s, %s, %s, %s, %s)
                '''
                _tuple = (
                    article.url, datetime.datetime.now(), entity.account, entity.account_id, entity.author, entity.id,
                    entity.title
                )
                uploads_mysql(config_mysql, sql, _tuple)
                # if page_count == 5:
                #     break

        log("发包")
        if entity:
            entity.uploads(backpack_list)