def test_simplify_headline(self):
        """Check `simplify_headline` against three sample headlines.

        The cases assert that a leading `<NQN>◇<東証>` marker is removed,
        that a headline joined with ideographic spaces comes back unchanged,
        and that a leading `【要チェック画面】` marker is removed.
        """
        spaced_headline = IDEOGRAPHIC_SPACE.join(
            ['日経平均、一時8600円下回る', 'TOPIXは年初来安値下回る', '円高とアジア株安で'])

        # Each entry is (raw headline, expected simplified headline).
        cases = [
            ('<NQN>◇<東証>三菱UFJが続伸 株高受けた買い戻し優勢',
             '三菱UFJが続伸 株高受けた買い戻し優勢'),
            (spaced_headline, spaced_headline),
            ('【要チェック画面】日銀追加緩和見送り', '日銀追加緩和見送り'),
        ]

        for raw, wanted in cases:
            self.assertEqual(simplify_headline(raw), wanted)
Beispiel #2
0
def test_simplify_headline():
    """Verify `simplify_headline` on three representative headlines."""

    # A leading `<NQN>◇<東証>` marker is expected to be dropped.
    tagged = '<NQN>◇<東証>三菱UFJが続伸 株高受けた買い戻し優勢'
    assert simplify_headline(tagged) == '三菱UFJが続伸 株高受けた買い戻し優勢'

    # A headline made of ideographic-space-separated parts must be returned
    # exactly as given.
    parts = ['日経平均、一時8600円下回る', 'TOPIXは年初来安値下回る', '円高とアジア株安で']
    multi_part = IDEOGRAPHIC_SPACE.join(parts)
    assert simplify_headline(multi_part) == IDEOGRAPHIC_SPACE.join(parts)

    # A leading `【要チェック画面】` marker is expected to be dropped.
    bracketed = '【要チェック画面】日銀追加緩和見送り'
    assert simplify_headline(bracketed) == '日銀追加緩和見送り'
Beispiel #3
0
def update_headlines(session: Session, user_dict: Path,
                     logger: Logger) -> None:
    """Classify and tokenize every headline whose `is_used` flag is unset.

    Each `Headline` row with `is_used IS NULL` is simplified with
    `simplify_headline`. Rows that are not interesting, or whose categories
    do not contain `DOMESTIC_INDEX`, are marked `is_used=False`. The
    remaining rows are tokenized and stored with `is_used=True` together
    with their simplified headline, tokens and price-tagged tokens.
    All changes are written in one bulk update followed by a commit.

    Args:
        session: Database session used for the query, bulk update and commit.
        user_dict: Path passed (as a string) to `Tokenizer`.
        logger: Receives the start/end progress messages.

    Returns:
        None. Returns early, without logging or committing, when there is
        nothing to process.
    """
    pending = list(
        session.query(Headline).filter(Headline.is_used.is_(None)).all())

    if not pending:
        return

    tokenizer = Tokenizer(str(user_dict))
    updates = []

    logger.info('start updating headlines')
    for record in tqdm(pending):
        simplified = simplify_headline(record.headline)

        covers_domestic_index = (
            record.categories is not None
            and DOMESTIC_INDEX in record.categories
        )

        # `is_template` is deliberately not part of this condition: filtering
        # on it shrank the dataset and made the results worse.
        if is_interesting(simplified) and covers_domestic_index:
            surfaces = [
                token.surface for token in tokenizer.tokenize(simplified)
            ]
            tokens = kansuuzi2number(surfaces)
            tag_tokens = replace_prices_with_tags(tokens)
            row = {
                'article_id': record.article_id,
                'simple_headline': simplified,
                'tokens': tokens,
                'tag_tokens': tag_tokens,
                'is_used': True,
            }
        else:
            # Rejected headlines only get their flag set; no token data.
            row = {
                'article_id': record.article_id,
                'is_used': False,
            }
        updates.append(row)

    session.bulk_update_mappings(Headline, updates)
    session.commit()
    logger.info('end updating headlines')