Example #1
    def parse_tweet(self, response):
        # logging.info('Processing --> ' + response.url)
        username = response.xpath(
            '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="username u-dir u-textTruncate"]/b/text()'
        ).get(default='')
        full_name = response.xpath(
            '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="FullNameGroup"]/strong/text()'
        ).get(default='')

        try:
            # the <title> is usually "Name on Twitter: tweet text"; split on
            # the first colon only so colons inside the tweet are preserved
            tweet_text = response.xpath('//title/text()').get(
                default='').split(':', 1)[1].strip()
        except IndexError:
            # no colon in the title; fall back to the tweet body markup
            tweet_text = ' '.join(
                response.xpath(
                    '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-text-container"]/p//text()'
                ).getall()).strip()
        image_list = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="AdaptiveMediaOuterContainer"]//img/@src'
        ).getall()
        date_time = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()'
        ).get(default='')

        # metadata reads like "7:14 PM - 3 Jun 2019"; drop the dash so
        # dateutil's parser accepts it, and skip parsing when it is absent
        if date_time:
            date_time = parser.parse(date_time.replace(
                '-', '')).strftime('%Y-%m-%d %H:%M:%S')
        retweets = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()'
        ).get(default='')

        likes = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()'
        ).get(default='')
        replies = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[contains(@id,"profile-tweet-action-reply-count")]/parent::span/@data-tweet-stat-count'
        ).get(default='')

        mentions = get_mentions(tweet_text)
        hashtags = get_hashtags(tweet_text)
        cta = get_links(tweet_text)

        result = {
            'username': username.lower(),
            'full_name': full_name,
            'twitter_url': response.url,
            'tweet_text': tweet_text,
            'tweet_time': str(date_time),
            'number_of_likes': str(likes),
            'no_of_retweets': str(retweets),
            'no_of_replies': str(replies),
            'mentions': ' | '.join(mentions),
            'no_of_mentions': str(len(mentions)),
            'hashtags': ' | '.join(hashtags),
            'no_of_hashtags': str(len(hashtags)),
            'call_to_action': ' | '.join(cta),
            'image_url': ' | '.join(image_list),
            'tag': self.tag
        }
        yield result
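
The helpers get_mentions, get_hashtags, and get_links are not shown in this example. A minimal sketch of what they could look like, assuming Twitter-style @handles, #hashtags, and bare URLs in the tweet text (the real implementations may differ):

import re

def get_mentions(text):
    # @handles: word characters after an '@'
    return re.findall(r'@(\w+)', text)

def get_hashtags(text):
    # #hashtags: word characters after a '#'
    return re.findall(r'#(\w+)', text)

def get_links(text):
    # bare http(s) URLs embedded in the tweet text
    return re.findall(r'https?://\S+', text)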
Example #2
def get_stats(tweets, scr_name, count=2):
    """ returns stats about hashtags and user mentions 
        tweets: a dict of tweet dicts from rest api
    """
    hashtags = get_hashtags(tweets)
    mentions = get_mentions(tweets)
    
    freq_hashtags =  sort_dct(histogram(hashtags))
    freq_mentions =  sort_dct(histogram(mentions))
    freq_mentions = remove_self(freq_mentions, scr_name)
    return freq_hashtags[:count], freq_mentions[:count]
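
The helpers histogram, sort_dct, and remove_self are likewise not defined in this snippet. One plausible implementation, assuming sort_dct returns a list of (item, count) pairs so that the [:count] slices above work:

from collections import Counter

def histogram(items):
    # frequency count of each item
    return Counter(items)

def sort_dct(dct):
    # list of (item, count) pairs, most frequent first
    return sorted(dct.items(), key=lambda kv: kv[1], reverse=True)

def remove_self(freq, scr_name):
    # drop the user's own screen name from the mention ranking
    return [(name, n) for name, n in freq if name.lower() != scr_name.lower()]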
Example #3
def entity_linker_with_use(title, title_tags, article):
    """
    :param title: 标题
    :param article: 正文,以段落的形式呈现(para, para_tags)
    :param title_tags: 标题NER标注序列
    :return: text中的mention以及其对应的链接结果
    """
    from Config import embed

    def _predict(_m, _k, _backup):
        """
        :param _m: 待预测的mention
        :param _k: 待预测的债券类型
        :param _backup: 使用映射表将简称转换为全称后的mention
        :return: 候选, 链接结果,链接得分(距离)
        """
        def _find_neighbor(_mention):
            """
            :param _mention: 用于寻找近邻的mention
            :return: 近邻的距离,近邻的索引(债券名库中)
            """
            def _helper(_m):
                if config.use_PCA:
                    _embed = pca.transform(embed(_m).numpy())
                else:
                    _embed = embed(_m).numpy()
                if config.use_LSH:
                    _distance, _idx = neighbor_finder.kneighbors(_embed, n_neighbors=config.knn)
                else:
                    _distance, _idx = neighbor_finder.query(_embed, k=config.knn)
                _candi_set = []
                for i in _idx[0]:
                    if _kind_idx == -1:
                        _candi_set.append(config.full_embeddings[i])
                    else:
                        _candi_set.append(config.bond_clusters[_kind_idx][i])
                _sim_matrix = cosine_similarity(embed(_mention).numpy(), _candi_set)
                _cur_ans = -2
                _pos = 0
                for i, s in enumerate(_sim_matrix[0]):
                    if s > _cur_ans:
                        _cur_ans = s
                        _pos = i
                return _cur_ans, _idx[0][_pos]
            nonlocal _flag
            nonlocal neighbor_finder
            nonlocal pca
            nonlocal _kind_idx
            sim, pos = _helper(_mention)
            if _flag:
                new_sim, new_pos = _helper(_mention + '资产支持证券')
                if new_sim > sim:
                    pos = new_pos
                    sim = new_sim
            return sim, pos
        # convert traditional Chinese characters to simplified
        _m = Converter('zh-hans').convert(_m)
        _k = Converter('zh-hans').convert(_k)
        _flag = '资产支持证券' in _m
        _candidates = []
        pca = None
        _k = '转债' if _k == '可转债' else _k
        if _k in config.bond_kind:
            _kind_idx = config.bond_kind.index(_k)
        else:
            _kind_idx = -1
            for char in config.short_character:
                if char in _m:
                    _kind_idx = config.bond_kind.index(char)
                    break
        if _kind_idx == -1:
            neighbor_finder = config.total_neighbor
            if config.use_PCA:
                pca = config.pca
        else:
            if len(config.bond_clusters[_kind_idx]) == 0:
                neighbor_finder = config.total_neighbor
                if config.use_PCA:
                    pca = config.pca
            else:
                neighbor_finder = config.neighbor_in_cluster[_kind_idx]
                if config.use_PCA:
                    pca = config.pca_in_cluster[_kind_idx]
        similarity, idx = _find_neighbor(_m)
        if _backup is not None:
            for ins in _backup:
                backup_similarity, backup_idx = _find_neighbor(ins)
                if backup_similarity > similarity:
                    similarity = backup_similarity
                    idx = backup_idx
        if _kind_idx == -1:
            result = config.names[config.full_to_id[idx]][:-1]
        else:
            result = config.names[config.cluster_to_id[_kind_idx][idx]][:-1]
        if similarity < config.thresh_hold:
            result = 'entity not found in knowledge base!'
        return [], result, similarity

    def _get_backup(_block):
        """
        :param _block: 债券要素块
        :return: 将简称映射为全称后的债券mention
        """
        _backup = None
        if '发债方' in _block['tags']:
            idx = _block['tags'].index('发债方')
            if _block['elements'][idx] in config.map_table:
                _backup = []
                for full_name in config.map_table[_block['elements'][idx]]:
                    temp = ''
                    for i, e in enumerate(_block['elements']):
                        if i == idx:
                            temp += full_name
                        else:
                            temp += e
                    _backup.append(temp)
        return _backup
    # currently the linking target is chosen by name similarity

    title_entity_set = []
    title_candidate_set = []
    title_scores = []
    article_entity_set = []
    article_candidate_set = []
    article_scores = []
    title_blocks = merge_elements(title, title_tags)
    title_mentions, title_kinds, title_missing = get_mentions(title_blocks)
    assert (len(title_mentions) == len(title_kinds))
    assert (len(title_kinds) == len(title_missing))
    article_blocks = []
    article_elements = dict()
    article_elements['年份'] = set()
    article_elements['发债方'] = set()
    article_elements['修饰语'] = set()
    article_elements['期数'] = set()
    article_elements['债券类型'] = set()
    for para, para_tags in article:
        _blocks, article_elements = process_paragraph(para, para_tags, article_elements)
        article_blocks += _blocks
    article_mentions, article_kinds, _ = get_mentions(article_blocks)
    if len(article_mentions) != len(article_kinds):
        print(article_mentions, len(article_mentions))
        print(article_kinds, len(article_kinds))
        raise Exception('mention and kind counts do not match!')
    for article_mention, article_kind, article_block in zip(article_mentions, article_kinds, article_blocks):
        _candi, predict, score = _predict(article_mention, article_kind, _get_backup(article_block))
        article_candidate_set.append(_candi)
        article_entity_set.append(predict)
        article_scores.append(score)
    bonds_in_article = list(set(article_entity_set))
    # this keeps the original order stable after deduplication
    bonds_in_article.sort(key=article_entity_set.index)
    for title_mention, title_kind, is_miss, title_block in \
            zip(title_mentions, title_kinds, title_missing, title_blocks):
        linking_result = []
        candidates = []
        scores = []
        if is_miss:
            # reuse the bonds mentioned in the body, requiring issuer, year, tranche number, and bond type to match
            if len(bonds_in_article) > 0 and ('年份' not in title_block['tags'] or '期数' not in title_block['tags']) \
                    and '资产支持' not in title_kind and '资产证券化' not in title_kind and '专项计划' not in title_kind:
                for bond in bonds_in_article:
                    flag = True
                    if flag and '发债方' in title_block['tags'] and \
                            title_block['elements'][title_block['tags'].index('发债方')] not in bond:
                        flag = False
                    if flag and '年份' in title_block['tags'] and \
                            title_block['elements'][title_block['tags'].index('年份')] not in bond:
                        flag = False
                    if flag and '期数' in title_block['tags'] and \
                            title_block['elements'][title_block['tags'].index('期数')] not in bond:
                        flag = False
                    if flag and '债券类型' in title_block['tags'] and \
                            title_block['elements'][title_block['tags'].index('债券类型')] not in bond:
                        flag = False
                    if flag:
                        linking_result.append(bond)
                if len(linking_result) == 0:
                    linking_result.append('entity not found in knowledge base!')
                    candidates.append([])
                    scores.append(0)
            # pad the missing elements
            else:
                pad_results = pad_element(title_block, article_elements, title_mention)
                for block in pad_results:
                    if '发债方' not in block['tags']:
                        linking_result.append('entity not found in knowledge base!')
                        candidates.append([])
                        scores.append(0)
                        continue
                    pad_mention = ''.join(block['elements'])
                    # print('pad mention:', pad_mention)
                    _candi, predict, score = _predict(pad_mention, title_kind, _get_backup(block))
                    linking_result.append(predict)
                    candidates.append(_candi)
                    scores.append(score)
        else:
            _candi, predict, score = _predict(title_mention, title_kind, _get_backup(title_block))
            linking_result.append(predict)
            candidates.append(_candi)
            scores.append(score)
        title_entity_set.append(list(set(linking_result)))
        title_candidate_set.append(candidates)
        title_scores.append(scores)
    return title_mentions, title_candidate_set, title_entity_set, title_scores, \
        article_mentions, article_candidate_set, article_entity_set, article_scores
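
_helper queries neighbor_finder.kneighbors(...) in the use_LSH branch and neighbor_finder.query(...) otherwise, which matches the scikit-learn NearestNeighbors and SciPy cKDTree interfaces respectively. A sketch of how the two kinds of index could be built; the actual construction lives in Config, and the names below are placeholders:

import numpy as np
from scipy.spatial import cKDTree
from sklearn.neighbors import NearestNeighbors

bond_embeddings = np.random.rand(1000, 512)  # stand-in for config.full_embeddings
query_vec = bond_embeddings[:1]              # a single embedded mention

# scikit-learn index, queried with .kneighbors(X, n_neighbors=k)
sk_index = NearestNeighbors().fit(bond_embeddings)
distances, indices = sk_index.kneighbors(query_vec, n_neighbors=5)

# SciPy KD-tree, queried with .query(X, k=k)
kd_index = cKDTree(bond_embeddings)
distances, indices = kd_index.query(query_vec, k=5)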
Example #4
for p in primarys:
    if p['id'] in [
            '5992855888', '5698108446', '5620822338', '6323177781',
            '6545274324'
    ]:
        region_rank = calc_region_rank(p['id'], followers)
        last_region_rank = calc_region_rank(p['id'], last_followers)
        calc_region_rank_offset(region_rank, last_region_rank)

        ws.append(['', p['brand_name'], '地域排名变化'])
        for r in region_rank:
            ws.append([r['province'], str(r['percentage']) + '%', r['offset']])
        ws.append([])

# fetch this month's and last month's mention counts, drop irrelevant ones, and compute the growth count and growth rate
mentions = utils.get_mentions(mention_filename)
last_metions = utils.get_mentions(last_mention_filename)
cleanup_mentions = cleanup_mention(mentions)

# create a workbook to save the cleaned mention data for manual review
m_wb = Workbook()
m_ws = m_wb.active
m_ws.title = '微博提及'
m_ws.append(['提及关键词', '作者', '内容', '链接'])
for m in cleanup_mentions:
    m_ws.append([m['keyword'], m['author'], m['content'], m['url']])
m_wb.save(os.path.join(dist_path, '微博提及表_{}.xlsx'.format(label)))

print('微博提及数据已清洗,保存为 dist 目录下“微博提及表_{}.xlsx”,请人工复核'.format(label))
cleanup_flag = input('是否清洗干净(y/n):')
if cleanup_flag != 'y':
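
calc_region_rank_offset is imported from elsewhere; judging by the usage above, it mutates region_rank in place so that each row's 'offset' is ready when the worksheet is written. A hypothetical sketch, assuming each entry has a 'province' key and list position encodes the rank:

def calc_region_rank_offset(region_rank, last_region_rank):
    # hypothetical implementation; the real one is defined elsewhere
    last_pos = {r['province']: i for i, r in enumerate(last_region_rank)}
    for i, r in enumerate(region_rank):
        prev = last_pos.get(r['province'])
        # positive offset means the province moved up since last month
        r['offset'] = '-' if prev is None else prev - i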