예제 #1
0
    def cluster_week_hot(self,
                         day_hot,
                         hot_value=None,
                         article_count=None,
                         vip_count=None,
                         negative_emotion_count=None,
                         weight=None):
        '''
        @summary: Merge a daily hot topic into the 7-day hot topic index
                  (tab_iopm_hot_week_info), or register it as a new weekly hot.
        ---------
        @param day_hot: daily hot topic record (dict). TITLE, RELEASE_TIME and ID
                        are always read; CONTENT is read when a new weekly hot is
                        created.
        @param hot_value: heat of a single article. When given (not None) the
                          daily hot is an update of a hot the weekly index has
                          already clustered, so only this amount is added to the
                          heat and HOT_DAY_IDS is left untouched.
        @param article_count: article-count increment; when None the daily
                              hot's own ARTICLE_COUNT is used.
        @param vip_count: mainstream-media increment; when None the daily hot's
                          own VIP_COUNT is used.
        @param negative_emotion_count: negative-article increment; when None the
                                       daily hot's own NEGATIVE_EMOTION_COUNT is
                                       used.
        @param weight: accepted for interface compatibility; currently unused
                       because the WEIGHT update is disabled.
        ---------
        @result: ID of the weekly hot topic the daily hot belongs to
        '''

        def increment(explicit, key):
            # Fall back to the daily hot's own total only when the caller passed
            # nothing at all. An explicit 0 (e.g. a non-VIP article) is a real
            # increment and must not be replaced by the whole daily total.
            return day_hot.get(key) if explicit is None else explicit

        article_text = day_hot.get("TITLE")  # + day_hot.get("CONTENT")
        release_time = day_hot.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)

        hots = self._get_week_hots(article_text, release_time)

        # Find the most similar weekly hot.
        similar_hot = None
        for hot_info in hots:
            candidate = hot_info.get('_source')
            candidate_text = tools.del_html_tag(candidate.get('TITLE'))

            similarity = compare_text(article_text, candidate_text)
            if similarity > MIN_SIMILARITY and similarity > 0:
                similar_hot = candidate

            # hots is sorted by match score, so the first hit is the most
            # similar one; there is no need to compare any further.
            break

        if similar_hot:  # a similar weekly hot exists
            # Skip the update when the weekly hot is this very record, so the
            # same item is not counted more than once.
            if similar_hot["ID"] != day_hot["ID"]:
                data = {
                    # heat and article count
                    'HOT':
                    similar_hot['HOT'] + increment(hot_value, 'HOT'),
                    'ARTICLE_COUNT':
                    similar_hot['ARTICLE_COUNT'] +
                    increment(article_count, 'ARTICLE_COUNT'),
                    # mainstream-media and negative-article counts
                    'VIP_COUNT':
                    similar_hot['VIP_COUNT'] +
                    increment(vip_count, 'VIP_COUNT'),
                    'NEGATIVE_EMOTION_COUNT':
                    similar_hot['NEGATIVE_EMOTION_COUNT'] +
                    increment(negative_emotion_count,
                              'NEGATIVE_EMOTION_COUNT'),
                }

                # Relevance update is intentionally disabled:
                # data['WEIGHT'] = similar_hot['WEIGHT'] + (weight or day_hot['WEIGHT'])

                # A daily hot seen for the first time is appended to the list
                # of daily hot ids; an update (hot_value given) is already there.
                if hot_value is None:
                    data["HOT_DAY_IDS"] = similar_hot[
                        'HOT_DAY_IDS'] + ',' + day_hot['ID']

                self._es.update_by_id("tab_iopm_hot_week_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)

            return similar_hot.get("ID")

        # No similar weekly hot: store the daily hot itself as a new weekly hot.
        hot_info = deepcopy(day_hot)

        # Resolve the event types mentioned by the title and the tag-free content.
        del_tag_content = tools.del_html_tag(hot_info['CONTENT'])
        text = hot_info['TITLE'] + del_tag_content
        contain_event_ids = self._event_filter.find_contain_event(text)
        hot_info['EVENT_IDS'] = ','.join(contain_event_ids)

        hot_info['HOT_DAY_IDS'] = day_hot.get("ID")

        self._es.add('tab_iopm_hot_week_info',
                     hot_info,
                     data_id=hot_info['ID'])

        return hot_info['ID']
예제 #2
0
def main():
    '''
    @summary: Cluster the articles of tab_iopm_article_info into hot topics,
              one page of PAGE_SIZE rows at a time. Assignments are collected
              in the module-level cluster_buffer and written out through
              deal_cluster_buffer().
    ---------
    @result: None
    '''

    def load_hots():
        # Rows are turned into lists because the title and the article count
        # of a hot are updated in place while clustering.
        rows = db.find('select id, title, hot from tab_iopm_hot_info')
        return [list(row) for row in rows]

    rownum = 0

    sql = 'select count(*) from tab_iopm_article_info'
    result = db.find(sql)
    articles_count = result[0][0]
    deal_count = 0

    while articles_count:

        # Fetch the next page of articles (info_type 3 is excluded).
        sql = '''
            select *
              from (select rownum r, id, title
                      from tab_iopm_article_info
                     where rownum <= %d and info_type != 3)
             where r > %d
        ''' % (rownum + PAGE_SIZE, rownum)

        articles = db.find(sql)
        if not articles:
            # No more pages: flush whatever is still buffered and stop.
            deal_cluster_buffer()
            break

        rownum += PAGE_SIZE

        hots = load_hots()

        # Largest hot id in use; new hots are numbered upwards from it.
        sql = 'select max(id) from tab_iopm_hot_info'
        result = db.find(sql)
        max_hot_id = result[0][0] if result[0][0] else 0

        for article in articles:
            # Column 0 is the rownum alias, so id and title are columns 1 and 2.
            article_id = article[1]
            article_text = article[2]

            # Best match so far; similarity is expected to lie in 0~1.
            best_similarity = 0
            best_hot_id = -1
            best_title = ''
            best_pos = -1  # index into hots, used to update title and count

            for i, hot in enumerate(hots):
                hot_text = hot[1]

                similarity = compare_text(hot_text, article_text)
                if similarity > best_similarity:
                    best_similarity = similarity
                    best_hot_id = hot[0]
                    # The shorter of the two titles becomes the hot's title.
                    best_title = article_text if len(hot_text) > len(
                        article_text) else hot_text
                    best_pos = i

            if best_similarity >= SIMILARITY:
                # The article belongs to an existing hot.
                hots[best_pos][1] = best_title
                hots[best_pos][2] += 1

                entry = cluster_buffer.setdefault(best_hot_id, {
                    'title': '',
                    'article_ids': [],
                    'article_count': 0
                })
                entry['title'] = best_title
                # Record the running count kept in hots; the previous code
                # stored a value that was never updated and was therefore 0.
                entry['article_count'] = hots[best_pos][2]
                entry['article_ids'].append(article_id)
            else:
                # The article starts a hot of its own.
                max_hot_id += 1
                hots.append([max_hot_id, article_text, 1])  # 1 = article count

                # An article is trivially similar to itself, so it is buffered
                # as the first member of its own hot.
                cluster_buffer[max_hot_id] = {
                    'title': article_text,
                    'article_ids': [article_id],
                    'article_count': 1
                }

            deal_count += 1
            tools.print_loading('正在聚类分析 已完成 %d/%d' %
                                (deal_count, articles_count))

            # Flush once the buffer grows past its limit, then reload the hots
            # so the in-memory copy matches what was written.
            if len(cluster_buffer) > CLUSTER_BUFFER_ZISE:
                deal_cluster_buffer()
                hots = load_hots()
예제 #3
0
def main():
    '''
    @summary: Assign each of the first 100000 articles of tab_iopm_article_info
              to its most similar hot topic in tab_iopm_hot_info, creating a
              new hot topic when none is similar enough. Every decision is
              written to the database immediately.
    ---------
    @result: None
    '''

    def sql_text(value):
        # Titles are spliced into SQL string literals; doubling embedded single
        # quotes keeps a title such as  it's  from breaking the statement.
        return str(value).replace("'", "''")

    db = OracleDB()

    # Load the articles.
    sql = '''
        select *
          from (select rownum r, id, title
                  from tab_iopm_article_info
                 where rownum >= 1)
         where r <= 100000
    '''
    articles = db.find(sql)

    # Load the current hot topics.
    sql = 'select id, title from tab_iopm_hot_info'
    hots = db.find(sql)

    for article in articles:
        # Column 0 is the rownum alias, so id and title are columns 1 and 2.
        article_id = article[1]
        article_text = article[2]

        # Best match so far; similarity is expected to lie in 0~1.
        best_similarity = 0
        best_hot_id = -1
        best_title = ''

        for hot in hots:
            hot_text = hot[1]

            similarity = compare_text(hot_text, article_text)
            if similarity > best_similarity:
                best_similarity = similarity
                best_hot_id = hot[0]
                # The shorter of the two titles becomes the hot's title.
                best_title = article_text if len(hot_text) > len(
                    article_text) else hot_text

        if best_similarity > SIMILARITY:
            # Attach the article to the matched hot and bump the hot's count.
            sql = 'update tab_iopm_article_info set hot_id = %s where id = %s' % (
                best_hot_id, article_id)
            db.update(sql)
            sql = "update tab_iopm_hot_info set hot = hot + 1, title = '%s' where id = %s" % (
                sql_text(best_title), best_hot_id)
            db.update(sql)

        else:
            # Nothing similar enough: the article becomes a hot of its own.
            sql = 'select sequence.nextval from dual'
            hot_id = db.find(sql)[0][0]
            sql = "insert into tab_iopm_hot_info (id, title, hot) values (%s, '%s', 1)" % (
                hot_id, sql_text(article_text))
            db.add(sql)
            sql = 'update tab_iopm_article_info set hot_id = %s where id = %s' % (
                hot_id, article_id)
            db.update(sql)

        # Reload the hots so the next article sees the row just written.
        sql = 'select id, title from tab_iopm_hot_info'
        hots = db.find(sql)
예제 #4
0
    def get_hot_id(self, article_info, positions, weight_factor):
        '''
        @summary: Cluster an article into today's hot topics. The article is
                  merged into the most similar existing hot, or stored as a new
                  hot; either way the 7-day hot index is synchronised.
        ---------
        @param article_info: article record (dict). TITLE, RELEASE_TIME, ID,
                             INFO_TYPE, IS_VIP, EMOTION and ZERO_ID are read.
        @param positions: value stored in the POSITIONS field of a new hot
        @param weight_factor: multiplier applied to the article's heat and to
                              the relevance weight returned by the service
        ---------
        @result: ID of the hot topic the article belongs to
        '''
        article_text = article_info.get("TITLE")  # + article_info.get("CONTENT")
        release_time = article_info.get("RELEASE_TIME")

        article_text = tools.del_html_tag(article_text)

        hots = self._get_today_hots(article_text, release_time)

        # Find the most similar hot.
        similar_hot = None
        max_similarity = 0
        for hot_info in hots:
            hot = hot_info.get('_source')
            hot_text = hot.get('TITLE')  # + hot.get('CONTENT')
            hot_text = tools.del_html_tag(hot_text)

            temp_similarity = compare_text(article_text, hot_text)
            if temp_similarity > MIN_SIMILARITY and temp_similarity > max_similarity:
                similar_hot = hot
                max_similarity = temp_similarity

            # hots is sorted by match score, so the first hit is the most
            # similar one; there is no need to compare any further.
            break

        if similar_hot:  # a similar hot exists
            # Skip the update when the hot is this very article, so the same
            # item is not counted more than once.
            if similar_hot["ID"] != article_info["ID"]:
                article_hot = INFO_WEIGHT.get(article_info["INFO_TYPE"], 0)
                negative_increment = 1 if article_info['EMOTION'] == 2 else 0

                data = {}

                # Heat and article count.
                data['HOT'] = similar_hot['HOT'] + article_hot * weight_factor
                data["ARTICLE_COUNT"] = similar_hot["ARTICLE_COUNT"] + 1

                # Mainstream-media and negative-article counts.
                data["VIP_COUNT"] = similar_hot['VIP_COUNT'] + article_info["IS_VIP"]
                data["NEGATIVE_EMOTION_COUNT"] = similar_hot[
                    'NEGATIVE_EMOTION_COUNT'] + negative_increment

                weight_temp = 0  # old weight minus the newly computed weight

                # Recompute the relevance weight when the hot matched any clues.
                if similar_hot['CLUES_IDS']:
                    url = IOPM_SERVICE_ADDRESS + 'related_sort'
                    data_args = {
                        'hot_id': similar_hot['ID'],
                        'hot_value': data['HOT'],
                        'clues_ids': similar_hot['CLUES_IDS'],
                        'article_count': data['ARTICLE_COUNT'],
                        'vip_count': data["VIP_COUNT"],
                        'negative_emotion_count': data["NEGATIVE_EMOTION_COUNT"],
                        'zero_ids': article_info['ZERO_ID']
                    }

                    result = tools.get_json_by_requests(url, data=data_args)
                    # Leave WEIGHT unchanged when the service gave no usable
                    # answer, instead of failing on a missing response.
                    if result:
                        new_weight = result.get('weight', 0)
                        weight_temp = similar_hot['WEIGHT'] - new_weight
                        data['WEIGHT'] = new_weight * weight_factor

                # Update the daily hot.
                self._es.update_by_id("tab_iopm_hot_info",
                                      data_id=similar_hot.get("ID"),
                                      data=data)
                # Synchronise the 7-day hot with this article's increments.
                self._hot_week_sync.cluster_week_hot(
                    similar_hot,
                    hot_value=article_hot,
                    article_count=1,
                    vip_count=article_info["IS_VIP"],
                    negative_emotion_count=negative_increment,
                    weight=weight_temp)

            return similar_hot.get("ID")
        else:
            # No similar hot: the article itself becomes a hot.
            hot_info = deepcopy(article_info)
            hot_info.pop('HOT_ID', None)  # the hot table has no hot_id field

            # User-behaviour counters start at zero.
            hot_info['ACCEPT_COUNT'] = 0
            hot_info['UNACCEPT_COUNT'] = 0
            hot_info['WATCH_COUNT'] = 0

            hot_info['VIP_COUNT'] = 1 if article_info["IS_VIP"] else 0
            hot_info['NEGATIVE_EMOTION_COUNT'] = 1 if article_info['EMOTION'] == 2 else 0

            hot_info['HOT'] = INFO_WEIGHT.get(article_info["INFO_TYPE"], 0) * weight_factor
            hot_info['ID'] = article_info.get("ID")
            hot_info['ARTICLE_COUNT'] = 1
            # TODO: the title was already tokenised for the similarity check;
            # reusing that result would avoid cutting it a second time.
            hot_info['HOT_KEYWORDS'] = ','.join(
                self._cut_text.cut_for_keyword(article_info["TITLE"]))
            hot_info['POSITIONS'] = positions
            # Event types are not filled in for daily hots (the weekly sync
            # computes them for weekly hots).
            hot_info['EVENT_IDS'] = ''

            self._es.add('tab_iopm_hot_info', hot_info, data_id=hot_info['ID'])
            # Synchronise the 7-day hot.
            self._hot_week_sync.cluster_week_hot(hot_info)

            return hot_info['ID']
예제 #5
0
def main():
    '''
    @summary: Incrementally cluster newly recorded articles into hot topics.
              Runs forever: each round loads the articles recorded at or after
              the newest record_time seen so far, assigns them to the hots of
              tab_iopm_hot_info_test and flushes the result through
              deal_cluster_buffer().
    ---------
    @result: never returns
    '''
    deal_count = 0

    record_time = tools.get_current_date()  # e.g. 2017-11-07 08:09:11

    while True:

        # Load the articles recorded since record_time.
        sql = '''
            select id, title, record_time
              from tab_iopm_article_info
            where record_time >= to_date('%s', 'yyyy-mm-dd hh24:mi:ss')
        '''%(record_time)

        articles = db.find(sql)
        if not articles:
            deal_cluster_buffer()
            print('''
                sql 未查到数据
                %s
                等待新数据...
                '''%sql)
            time.sleep(10)
            continue

        # Load the hots of the last day. Rows are turned into lists because
        # the title and article count are updated in place below.
        sql = 'select id, title, hot from tab_iopm_hot_info_test where record_time >= sysdate-1'
        hots = [list(hot) for hot in db.find(sql)]

        # Largest hot id in use; new hots are numbered upwards from it.
        sql = 'select max(id) from tab_iopm_hot_info_test'
        result = db.find(sql)
        max_hot_id = result[0][0] if result[0][0] else 0

        for article in articles:
            article_id = article[0]
            # Keep only the part of the title before the first '-'. A title
            # without '-' is kept whole (slicing at find() == -1 would drop
            # its last character).
            article_title = article[1].partition('-')[0] if article[1] else ''
            temp_record_time = article[2]

            article_text = article_title  # + article_content
            if not article_text:
                continue

            # Track the newest record_time seen so far.
            # NOTE(review): record_time starts as the value returned by
            # tools.get_current_date() while temp_record_time comes from the
            # database - confirm both are of a comparable type.
            if temp_record_time > record_time:
                record_time = temp_record_time

            # Best match so far; similarity is expected to lie in 0~1.
            best_similarity = 0
            best_hot_id = -1
            best_title = ''
            best_pos = -1  # index into hots, used to update title and count

            for i, hot in enumerate(hots):
                hot_text = hot[1]

                similarity = compare_text(hot_text, article_text)
                if similarity > best_similarity:
                    best_similarity = similarity
                    best_hot_id = hot[0]
                    # The shorter of the two titles becomes the hot's title.
                    best_title = article_title if len(hot_text) > len(article_title) else hot_text
                    best_pos = i

            if best_similarity >= SIMILARITY:
                # The article belongs to an existing hot.
                hots[best_pos][1] = best_title
                hots[best_pos][2] += 1

                entry = cluster_buffer.setdefault(best_hot_id, {
                    'title':'', 'article_ids':[], 'article_count':0
                })
                entry['title'] = best_title
                entry['article_count'] = hots[best_pos][2]
                entry['article_ids'].append(article_id)

            else:
                # The article starts a hot of its own.
                max_hot_id += 1
                hots.append([max_hot_id, article_title, 1])  # 1 = article count

                # An article is trivially similar to itself, so it is buffered
                # as the first member of its own hot.
                cluster_buffer[max_hot_id] = {
                    'title':article_title,
                    'article_ids':[article_id],
                    'article_count':1
                }

            deal_count += 1
            tools.print_loading('正在聚类分析 已完成 %d'%(deal_count))

        deal_cluster_buffer()
예제 #6
0
    def deal_news(self):
        '''
        @summary: Poll tab_news_csr_result for records whose csr_res_id is
                  greater than the last processed one and cluster each record
                  into tab_news_csr_hot: a record similar to an existing hot of
                  the same day raises that hot's count, otherwise the record
                  becomes a new hot.
        ---------
        ---------
        @result: never returns
        '''
        while True:
            body = {
                "query": {
                    "filtered": {
                        "filter": {
                            "range": {
                                "csr_res_id": {  # only ids above the last processed one
                                    "gt": self._current_csr_res_id
                                }
                            }
                        }
                    }
                },
                "_source": ["csr_res_id", "csr_content", "start_time"],
                "sort": [{
                    "csr_res_id": "asc"
                }]
            }

            news_json = self._es.search('tab_news_csr_result', body)
            news_list = news_json.get('hits', {}).get('hits', [])

            if not news_list:
                log.debug(
                    'tab_news_csr_result 表中无大于%s的csr_res_id\nsleep %s...' %
                    (self._current_csr_res_id, SLEEP_TIME))
                tools.delay_time(SLEEP_TIME)
                continue

            for news_info in news_list:
                news = news_info.get('_source')
                csr_res_id = news.get('csr_res_id')
                csr_content = news.get('csr_content')
                start_time = news.get('start_time')

                log.debug('''
                    处理 tab_news_csr_result
                    csr_res_id  %s
                    start_time  %s
                    csr_content %s
                    ''' % (csr_res_id, start_time, csr_content))

                # Look for a similar hot of the same day.
                similar_hot = None
                hots = self._get_same_day_hots(csr_content, start_time)

                for hot_info in hots:
                    hot = hot_info.get('_source')
                    hot_text = hot.get('csr_content')

                    temp_similarity = compare_text(csr_content, hot_text)
                    if temp_similarity > MIN_SIMILARITY:
                        similar_hot = hot

                    # hots is sorted by match score, so the first hit is the
                    # most similar one; there is no need to compare further.
                    break

                if similar_hot:  # a similar hot exists
                    log.debug('找到所属热点:%s' % similar_hot.get('csr_content'))

                    data = {
                        # One more record belongs to the hot.
                        "hot": similar_hot["hot"] + 1,
                        # Append this record's id to the comma-separated list.
                        # String formatting is used so that ids which are not
                        # strings do not make the concatenation fail.
                        "csr_res_ids": '%s,%s' % (similar_hot["csr_res_ids"],
                                                  csr_res_id),
                    }

                    self._es.update_by_id("tab_news_csr_hot",
                                          data_id=similar_hot.get("hot_id"),
                                          data=data)

                else:  # nothing similar: the record becomes a new hot
                    log.debug('无所属热点')

                    hot_info = {
                        'hot_id': csr_res_id,
                        'hot': 1,
                        'start_time': start_time,
                        'csr_res_ids': csr_res_id,
                        'csr_content': csr_content
                    }
                    self._es.add('tab_news_csr_hot',
                                 hot_info,
                                 data_id=csr_res_id)

                # Remember how far processing has got.
                self._current_csr_res_id = csr_res_id
                self._save_current_id()
예제 #7
0
    def get_hot_id(self, article_info):
        '''
        @summary: Attach an article to today's most similar hot topic, or store
                  the article itself as a new hot topic.
        ---------
        @param article_info: article record (dict). TITLE, RELEASE_TIME, ID,
                             IS_VIP and EMOTION are read; ZERO_ID is read when
                             the matched hot has CLUES_IDS.
        ---------
        @result: ID of the hot topic the article belongs to
        '''
        title = article_info.get("TITLE")  # + article_info.get("CONTENT")
        published_at = article_info.get("RELEASE_TIME")
        title = tools.del_html_tag(title)

        candidates = self._get_today_hots(title, published_at)

        # Candidates are sorted by match score, so only the first one can be
        # the most similar hot; nothing behind it is examined.
        matched = None
        nothing = object()
        top_hit = next(iter(candidates), nothing)
        if top_hit is not nothing:
            source = top_hit.get('_source')
            source_title = tools.del_html_tag(
                source.get('TITLE'))  # + source.get('CONTENT')
            score = compare_text(title, source_title)
            if score > MIN_SIMILARITY and score > 0:
                matched = source

        if not matched:
            # No similar hot: the article becomes a hot of its own.
            new_hot = deepcopy(article_info)
            del new_hot['HOT_ID']  # the hot table has no hot_id field

            new_hot.update({
                # user-behaviour counters start at zero
                'ACCEPT_COUNT': 0,
                'UNACCEPT_COUNT': 0,
                'WATCH_COUNT': 0,
                # remaining fields
                'VIP_COUNT': 1 if article_info["IS_VIP"] else 0,
                'NEGATIVE_EMOTION_COUNT':
                1 if article_info['EMOTION'] == 2 else 0,
                'HOT': 1,
                'ID': article_info.get("ID"),
            })

            self._es.add('tab_iopm_hot_info', new_hot, data_id=new_hot['ID'])
            return new_hot['ID']

        # A hot that is this very article is left alone, so the same item is
        # not counted more than once.
        if matched["ID"] != article_info["ID"]:
            changes = {
                # heat
                "HOT": matched["HOT"] + 1,
                # mainstream-media and negative-article counts
                "VIP_COUNT":
                matched['VIP_COUNT'] + (1 if article_info["IS_VIP"] else 0),
                "NEGATIVE_EMOTION_COUNT":
                matched['NEGATIVE_EMOTION_COUNT'] +
                (1 if article_info['EMOTION'] == 2 else 0),
            }

            # Recompute the relevance weight when the hot matched any clues.
            if matched['CLUES_IDS']:
                service_url = IOPM_SERVICE_ADDRESS + 'related_sort'
                payload = {
                    'hot_id': matched['ID'],
                    'hot_value': changes['HOT'],
                    'clues_id': matched['CLUES_IDS'],
                    'article_count': changes['HOT'],
                    'vip_count': changes["VIP_COUNT"],
                    'negative_emotion_count':
                    changes["NEGATIVE_EMOTION_COUNT"],
                    'zero_ids': article_info['ZERO_ID']
                }

                response = tools.get_json_by_requests(service_url,
                                                      data=payload)
                if response:
                    changes['WEIGHT'] = response.get('weight', 0)

            self._es.update_by_id("tab_iopm_hot_info",
                                  data_id=matched.get("ID"),
                                  data=changes)

        return matched.get("ID")