def test_replace_prices_with_tags(self):
        """Check that yen amounts in a tokenized headline become ``<yen val="..."/>`` tags.

        The headline is tokenized with ``self.tokenizer`` and only the token
        surfaces are handed to ``replace_prices_with_tags``.
        """
        text = '日経平均222円安、1500円割れ　円安一服で'
        surfaces = [token.surface for token in self.tokenizer.tokenize(text)]
        actual = replace_prices_with_tags(surfaces)

        # "222円" and "1500円" are expected to collapse into single tag tokens,
        # while the "円" inside "円安" stays untouched.
        wanted = [
            '日経平均',
            '<yen val="222"/>',
            '安',
            '、',
            '<yen val="1500"/>',
            '割れ',
            IDEOGRAPHIC_SPACE,
            '円安',
            '一服',
            'で',
        ]
        self.assertEqual(actual, wanted)
Exemple #2
0
def test_replace_prices_with_tags():
    """Check that yen amounts in a tokenized headline become ``<yen val="..."/>`` tags.

    A ``Tokenizer`` built from ``resources/user-dict.csv`` splits the
    headline; only the token surfaces are passed to
    ``replace_prices_with_tags``.
    """
    text = '日経平均222円安、1500円割れ　円安一服で'
    surfaces = [
        token.surface
        for token in Tokenizer('resources/user-dict.csv').tokenize(text)
    ]
    actual = replace_prices_with_tags(surfaces)

    # "222円" and "1500円" are expected to collapse into single tag tokens,
    # while the "円" inside "円安" stays untouched.
    wanted = [
        '日経平均',
        '<yen val="222"/>',
        '安',
        '、',
        '<yen val="1500"/>',
        '割れ',
        IDEOGRAPHIC_SPACE,
        '円安',
        '一服',
        'で',
    ]
    assert actual == wanted
Exemple #3
0
def update_headlines(session: Session, user_dict: Path,
                     logger: Logger) -> None:
    """Fill in the derived fields of every headline whose ``is_used`` is unset.

    Each ``Headline`` row with ``is_used IS NULL`` is simplified and then
    either marked ``is_used=False`` or tokenized and marked ``is_used=True``
    together with its simplified text, tokens and price-tagged tokens. All
    changes are written with one bulk update followed by a commit. Nothing
    is logged or written when there are no such rows.

    Args:
        session: Database session used for the query, the bulk update and
            the commit.
        user_dict: Path handed (as a string) to ``Tokenizer``.
        logger: Receives the start/end progress messages.
    """
    pending = list(
        session.query(Headline).filter(Headline.is_used.is_(None)).all())
    if not pending:
        return

    tokenizer = Tokenizer(str(user_dict))
    updates = []

    logger.info('start updating headlines')
    for record in tqdm(pending):
        simplified = simplify_headline(record.headline)

        # Rows without categories can never match the domestic index.
        covers_domestic_index = (
            False if record.categories is None
            else DOMESTIC_INDEX in record.categories
        )

        # NOTE: an additional `is_template(...)` filter used to be applied
        # here; it was dropped because it shrank the dataset and the results
        # got worse.
        if is_interesting(simplified) and covers_domestic_index:
            surfaces = [
                token.surface for token in tokenizer.tokenize(simplified)
            ]
            tokens = kansuuzi2number(surfaces)
            tag_tokens = replace_prices_with_tags(tokens)
            updates.append({
                'article_id': record.article_id,
                'simple_headline': simplified,
                'tokens': tokens,
                'tag_tokens': tag_tokens,
                'is_used': True,
            })
        else:
            updates.append({
                'article_id': record.article_id,
                'is_used': False,
            })

    session.bulk_update_mappings(Headline, updates)
    session.commit()
    logger.info('end updating headlines')