def _parse(raw_string):
    """
    Tokenise a string and label the tokens with the trained model.

    Private helper: call it through parse and the other public functions
    rather than directly.

    :param raw_string: input string to parse
    :type raw_string: str

    :return: a ``(tokens, tags)`` tuple, or an empty list when the tokeniser
             yields no tokens
    :rtype: tuple or list
    """
    token_list = tok.tokenize(raw_string)

    if token_list:
        # the tagger is fed the features derived from the tokens
        labels = TAGGER.tag(tok.tokens2features(token_list))
        return token_list, labels

    # nothing to label
    return []
Beispiel #2
0
 def test_spaces(self):
     """Repeated, leading or trailing spaces must not change the tokens."""
     expected = ['foo', 'bar']
     for text in ('foo bar', 'foo  bar', 'foo bar ', ' foo bar'):
         assert tokens.tokenize(text) == expected
Beispiel #3
0
 def test_real_addresses(self):
     """Check the tokens produced for two complete address strings."""
     cases = (
         ('CHERRY TREE HOUSING ASSOCIATION 5 TAVISTOCK AVENUE ST ALBANS AL1 2NQ',
          ['CHERRY', 'TREE', 'HOUSING', 'ASSOCIATION', '5', 'TAVISTOCK',
           'AVENUE', 'ST', 'ALBANS', 'AL1', '2NQ']),
         ('339 PERSHORE ROAD EDGBASTON BIRMINGHAM B5 7RY',
          ['339', 'PERSHORE', 'ROAD', 'EDGBASTON', 'BIRMINGHAM', 'B5',
           '7RY']),
     )
     for address, expected in cases:
         assert tokens.tokenize(address) == expected
Beispiel #4
0
 def test_split_on_punc(self):
     """A comma splits the input and stays attached to the preceding token."""
     result = tokens.tokenize('foo,bar')
     assert result == ['foo,', 'bar']
def test(raw_string='ONS LIMITED FLAT 1 12 OXFORD STREET STREET ST1 2FW',
         verbose=False):
    """
    A simple test to check that the calling mechanism from Python gives the same
    results as if CRFsuite were called directly from the command line. Requires
    a compiled version of the CRFsuite.

    The sequence probability and the per-position marginal probabilities are
    compared against hard-coded reference values, so the assertions are only
    expected to hold for the default input (presumably together with the
    shipped model - confirm). The tagged features are also written to
    ``training/test.txt`` and passed to the ``crfsuite`` command line tool so
    that its output can be compared by eye.

    :param raw_string: input string to test
    :type raw_string: str
    :param verbose: additional debugging output
    :type verbose: bool

    :return: None
    :raises AssertionError: if a probability differs from its reference value
    """
    print('Input string:', raw_string)
    print('Python Results:', tag(raw_string))

    tokens = tok.tokenize(raw_string)
    features = tok.tokens2features(tokens)

    if verbose:
        print('features:', features)

    tags = TAGGER.tag(features)
    print('Inferred tags:', tags)

    # compute once, then both report and check the same value
    sequence_prob = round(TAGGER.probability(tags), 6)
    print('Probability of the sequence:', sequence_prob)
    assert sequence_prob == 0.992256, 'Sequence probability not correct'

    # reference marginal probabilities, one per tag position
    results = [
        0.999999, 0.999999, 0.999846, 0.993642, 0.999728, 1., 1., 0.998874, 1.,
        1.
    ]
    for i, tg in enumerate(tags):
        prob = round(TAGGER.marginal(tg, i), 6)
        print('Marginal probability of', tg, 'in position', i, 'is', prob)
        assert prob == results[
            i], 'Marginal Probability of a Label not correct'

    if verbose:
        model_info = TAGGER.info()
        print(model_info.transitions)
        print(model_info.state_features)
        print(model_info.attributes)

    # convert the features to an ItemSequence once and take its rows
    rows = pycrfsuite.ItemSequence(features).items()

    # write to a text file; the context manager closes the file even if a
    # write fails part-way through
    with open('training/test.txt', 'w') as fh:
        for tg, items in zip(tags, rows):
            # each field is written as name:value, so a colon inside a name
            # is escaped with a backslash to keep the separator unambiguous
            fields = ''.join(
                str(name).replace(':', '\\:') + ':' + str(items[name]) + '\t'
                for name in sorted(items))
            fh.write(tg + '\t' + fields + '\n')

    # command line call to the C code to test the output
    print('\nCRFsuite call results:')
    os.system(
        'crfsuite tag -pit -m training/addressCRF.crfsuite training/test.txt')