def test_kansuuzi2number(self):
    # '1 万 円 台' is rewritten so the kanji numeral becomes an Arabic numeral.
    tokens = ['1', '万', '円', '台']
    expected = ['10000', '円', '台']
    self.assertEqual(kansuuzi2number(tokens), expected)
    # '賞金 は 500 万' is left unchanged (no conversion is applied here).
    tokens = ['賞金', 'は', '500', '万']
    expected = ['賞金', 'は', '500', '万']
    self.assertEqual(kansuuzi2number(tokens), expected)
def test_kansuuzi2number():
    # pytest-style variant of the same checks.
    tokens = ['1', '万', '円', '台']
    expected = ['10000', '円', '台']
    result = kansuuzi2number(tokens)
    assert result == expected

    tokens = ['賞金', 'は', '500', '万']
    expected = ['賞金', 'は', '500', '万']
    result = kansuuzi2number(tokens)
    assert result == expected
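
# For reference, a minimal sketch of what kansuuzi2number might look like,
# inferred only from the two test cases above. This is hypothetical, not the
# project's actual implementation: the assumed rule is that a digit token
# followed by '万' (or '千') is collapsed into one Arabic-numeral token, but
# only when a counter such as '円' comes right after, which would explain why
# '賞金 は 500 万' is left untouched.
from typing import List

KANSUUZI_MULTIPLIERS = {'千': 1000, '万': 10000}  # assumed multiplier table


def kansuuzi2number_sketch(tokens: List[str]) -> List[str]:
    result: List[str] = []
    i = 0
    while i < len(tokens):
        if (i + 2 < len(tokens)
                and tokens[i].isdigit()
                and tokens[i + 1] in KANSUUZI_MULTIPLIERS
                and tokens[i + 2] == '円'):
            # e.g. ['1', '万', '円'] -> ['10000', '円']
            result.append(str(int(tokens[i]) * KANSUUZI_MULTIPLIERS[tokens[i + 1]]))
            i += 2  # skip the multiplier; '円' is appended on the next pass
        else:
            result.append(tokens[i])
            i += 1
    return result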
from logging import Logger
from pathlib import Path

from sqlalchemy.orm import Session
from tqdm import tqdm

# Headline, Tokenizer, simplify_headline, is_interesting, kansuuzi2number,
# replace_prices_with_tags, and DOMESTIC_INDEX are project-internal names
# assumed to be imported elsewhere in this module.


def update_headlines(session: Session, user_dict: Path, logger: Logger) -> None:
    # Select headlines that have not yet been marked as used or unused.
    query_result = session \
        .query(Headline) \
        .filter(Headline.is_used.is_(None)) \
        .all()
    headlines = list(query_result)
    if len(headlines) == 0:
        return

    tokenizer = Tokenizer(str(user_dict))
    mappings = []
    logger.info('start updating headlines')
    for headline in tqdm(headlines):
        h = simplify_headline(headline.headline)
        is_about_di = headline.categories is not None and \
            DOMESTIC_INDEX in headline.categories
        # We stopped using `is_template` because the size of the dataset
        # decreased and the result got worse.
        # if is_template(h) or not is_interesting(h) or not is_about_di:
        if not is_interesting(h) or not is_about_di:
            mappings.append({
                'article_id': headline.article_id,
                'is_used': False
            })
            continue
        tokens = kansuuzi2number(
            [token.surface for token in tokenizer.tokenize(h)])
        tag_tokens = replace_prices_with_tags(tokens)
        mappings.append({
            'article_id': headline.article_id,
            'simple_headline': h,
            'tokens': tokens,
            'tag_tokens': tag_tokens,
            'is_used': True,
        })
    session.bulk_update_mappings(Headline, mappings)
    session.commit()
    logger.info('end updating headlines')
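
# Hypothetical wiring for update_headlines; the engine URL, user-dictionary
# path, and logger name below are placeholders, not values from the project.
# Note that update_headlines defers all writes to a single
# session.bulk_update_mappings call followed by one commit, so the loop
# itself never touches the database.
import logging
from pathlib import Path

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

engine = create_engine('sqlite:///headlines.db')  # assumed database URL
SessionFactory = sessionmaker(bind=engine)
logger = logging.getLogger('update_headlines')

update_headlines(SessionFactory(), Path('user_dict.csv'), logger)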