Example #1
0
def normalize_topic(topic):
    """
    Split a Wikipedia topic title into a normalized name and an optional
    disambiguation tag.

    Underscores in the title are converted to spaces first. A title shaped
    like "Foo (bar)" gives (normalize("Foo"), "n/bar"): the text before the
    parentheses is passed through normalize(), and the parenthesized text is
    stripped of surrounding spaces/underscores and prefixed with "n/".
    Any other title gives (normalize(title), None).
    """
    spaced = topic.replace('_', ' ')

    # Look for a leading "Foo (bar)" shape; anything else has no disambiguation.
    found = re.match(r'([^(]+) \(([^)]+)\)', spaced)
    if found is None:
        return normalize(spaced), None

    name, disambig = found.groups()
    return normalize(name), 'n/' + disambig.strip(' _')
Example #2
0
def normalize_topic(topic):
    """
    Canonicalize a Wikipedia topic title that may carry a parenthesized
    disambiguation suffix.

    Returns a pair (name, disambig). "name" is the result of normalize() on
    the title text (with underscores turned into spaces). "disambig" is
    "n/" followed by the parenthesized text when the title has the form
    "Foo (bar)", and None otherwise.
    """
    title = topic.replace('_', ' ')
    parsed = re.match(r'([^(]+) \(([^)]+)\)', title)

    if parsed:
        # Title of the form "Foo (bar)": normalize only the part before the
        # parentheses and keep the parenthesized text as the tag.
        base = parsed.group(1)
        qualifier = parsed.group(2)
        return normalize(base), 'n/' + qualifier.strip(' _')

    return normalize(title), None
Example #3
0
def test_japanese():
    """
    Check normalize() and tag_and_stem() on a short Japanese sentence.
    """
    # normalize() is expected to reduce the whole sentence to 'テスト'.
    eq_(normalize('これはテストです'), 'テスト')

    # Expected tag_and_stem() output: one 3-tuple per token
    # (presumably token, tag, stem -- confirm against tag_and_stem).
    expected_tokens = [
        ('これ', '~名詞', 'これ'),
        ('は', '~助詞', 'は'),
        ('テスト', '名詞', 'テスト'),
        ('です', '~助動詞', 'です'),
        ('。', '.', '。'),
    ]
    eq_(tag_and_stem('これはテストです。'), expected_tokens)
Example #4
0
def test_japanese():
    """
    Verify the Japanese results of normalize() and tag_and_stem().
    """
    # The sentence without punctuation should normalize to 'テスト'.
    eq_(normalize('これはテストです'), 'テスト')

    # With the trailing '。', tag_and_stem() should return these five
    # 3-tuples, in order.
    eq_(
        tag_and_stem('これはテストです。'),
        [
            ('これ', '~名詞', 'これ'),
            ('は', '~助詞', 'は'),
            ('テスト', '名詞', 'テスト'),
            ('です', '~助動詞', 'です'),
            ('。', '.', '。'),
        ],
    )