def test_kansuuzi2number(self):
        """Check `kansuuzi2number` on one converted and one unchanged input."""

        # A digit token followed by '万' is expected to collapse into a
        # single numeric token ('1', '万' -> '10000').
        tokens = ['1', '万', '円', '台']
        expected = ['10000', '円', '台']
        self.assertEqual(kansuuzi2number(tokens), expected)

        # This input is expected to come back unchanged.
        # 'は' and '500' must be separate list items: without the comma,
        # Python concatenates the adjacent literals into the single token
        # 'は500', and the digit-before-'万' case is never exercised.
        # NOTE(review): confirm that leaving '500', '万' untouched here is
        # the intended behaviour of kansuuzi2number.
        tokens = ['賞金', 'は', '500', '万']
        expected = ['賞金', 'は', '500', '万']
        self.assertEqual(kansuuzi2number(tokens), expected)
Beispiel #2
0
def test_kansuuzi2number():
    """Check `kansuuzi2number` on one converted and one unchanged input."""

    # A digit token followed by '万' is expected to collapse into a single
    # numeric token ('1', '万' -> '10000').
    tokens = ['1', '万', '円', '台']
    expected = ['10000', '円', '台']
    result = kansuuzi2number(tokens)
    assert result == expected

    # This input is expected to come back unchanged.
    # 'は' and '500' must be separate list items: without the comma, Python
    # concatenates the adjacent literals into the single token 'は500', and
    # the digit-before-'万' case is never exercised.
    # NOTE(review): confirm that leaving '500', '万' untouched here is the
    # intended behaviour of kansuuzi2number.
    tokens = ['賞金', 'は', '500', '万']
    expected = ['賞金', 'は', '500', '万']
    result = kansuuzi2number(tokens)
    assert result == expected
Beispiel #3
0
def update_headlines(session: Session, user_dict: Path,
                     logger: Logger) -> None:
    """Classify and tokenize every headline whose ``is_used`` is still NULL.

    Each pending headline is simplified with ``simplify_headline``. It is
    then either marked ``is_used=False`` (not interesting, or not
    categorised under ``DOMESTIC_INDEX``) or tokenized, price-tagged and
    marked ``is_used=True``. All rows are written back with a single bulk
    update followed by a commit.

    Args:
        session: Session used for the query, the bulk update and the commit.
        user_dict: Path passed (as a string) to ``Tokenizer``.
        logger: Receives the start/end progress messages.
    """
    pending = list(
        session.query(Headline).filter(Headline.is_used.is_(None)).all()
    )
    if not pending:
        # Nothing to classify: no tokenizer is built and nothing is logged.
        return

    tokenizer = Tokenizer(str(user_dict))
    updates = []

    logger.info('start updating headlines')
    for row in tqdm(pending):
        simplified = simplify_headline(row.headline)

        about_domestic_index = (
            row.categories is not None
            and DOMESTIC_INDEX in row.categories
        )

        # `is_template(...)` used to be part of this filter as well; it was
        # dropped because the dataset got smaller and the results got worse.
        if is_interesting(simplified) and about_domestic_index:
            surfaces = [
                piece.surface for piece in tokenizer.tokenize(simplified)
            ]
            tokens = kansuuzi2number(surfaces)
            tag_tokens = replace_prices_with_tags(tokens)
            updates.append({
                'article_id': row.article_id,
                'simple_headline': simplified,
                'tokens': tokens,
                'tag_tokens': tag_tokens,
                'is_used': True,
            })
        else:
            updates.append({
                'article_id': row.article_id,
                'is_used': False,
            })

    session.bulk_update_mappings(Headline, updates)
    session.commit()
    logger.info('end updating headlines')