def _parse(raw_string):
    """
    Tokenise a string and label every token with the module-level TAGGER.

    Private helper: it is meant to be reached through parse and the other
    public functions rather than called directly.

    :param raw_string: input string to parse
    :type raw_string: str

    :return: a ``(tokens, tags)`` tuple; when tokenisation produces no
             tokens an empty list is returned instead
    :rtype: tuple or list
    """
    token_list = tok.tokenize(raw_string)
    if token_list:
        feature_seq = tok.tokens2features(token_list)
        return token_list, TAGGER.tag(feature_seq)

    # NOTE(review): the empty-input result is a bare list, not a 2-tuple, so a
    # caller doing ``tokens, tags = _parse(...)`` would fail to unpack it.
    return []
def test_spaces(self):
    """Tokenising on spaces yields the two words, with or without padding."""
    expected = ['foo', 'bar']
    # NOTE(review): the first two inputs are identical as written; presumably
    # one of them was meant to contain repeated spaces -- confirm.
    samples = (
        'foo bar',
        'foo bar',
        'foo bar ',
        ' foo bar',
    )
    for text in samples:
        assert tokens.tokenize(text) == expected
def test_real_addresses(self):
    """Two realistic upper-case addresses are split into their space-separated parts."""
    housing_association = 'CHERRY TREE HOUSING ASSOCIATION 5 TAVISTOCK AVENUE ST ALBANS AL1 2NQ'
    housing_association_tokens = [
        'CHERRY', 'TREE', 'HOUSING', 'ASSOCIATION', '5', 'TAVISTOCK',
        'AVENUE', 'ST', 'ALBANS', 'AL1', '2NQ',
    ]
    assert tokens.tokenize(housing_association) == housing_association_tokens

    pershore_road = '339 PERSHORE ROAD EDGBASTON BIRMINGHAM B5 7RY'
    pershore_road_tokens = [
        '339', 'PERSHORE', 'ROAD', 'EDGBASTON', 'BIRMINGHAM', 'B5', '7RY',
    ]
    assert tokens.tokenize(pershore_road) == pershore_road_tokens
def test_split_on_punc(self):
    """A comma ends a token: the comma stays with the text before it."""
    produced = tokens.tokenize('foo,bar')
    assert produced == ['foo,', 'bar']
def test(raw_string='ONS LIMITED FLAT 1 12 OXFORD STREET STREET ST1 2FW', verbose=False):
    """
    A simple test to check that the calling mechanism from Python gives the
    same results as if CRFsuite were called directly from the command line.

    The function tags the input with TAGGER, asserts the sequence and marginal
    probabilities against hard-coded reference values, writes the tagged
    features to ``training/test.txt`` in CRFsuite's tab-separated text format
    and finally runs the ``crfsuite`` command line tool on that file.

    Requires a compiled version of the CRFsuite.

    NOTE(review): the reference probabilities are hard-coded and hold ten
    marginal values, so they presumably only match the default input string
    with the shipped model; an input producing more than ten tags raises
    IndexError -- confirm the intended usage.

    :param raw_string: input string to test
    :type raw_string: str
    :param verbose: additional debugging output
    :type verbose: bool

    :return: None
    """
    print('Input string:', raw_string)
    print('Python Results:', tag(raw_string))

    tokens = tok.tokenize(raw_string)
    features = tok.tokens2features(tokens)
    if verbose:
        print('features:', features)

    tags = TAGGER.tag(features)
    print('Inferred tags:', tags)

    # compute the sequence probability once and reuse it for output and check
    sequence_probability = round(TAGGER.probability(tags), 6)
    print('Probability of the sequence:', sequence_probability)
    assert sequence_probability == 0.992256, 'Sequence probability not correct'

    results = [
        0.999999, 0.999999, 0.999846, 0.993642, 0.999728,
        1., 1., 0.998874, 1., 1.,
    ]
    for i, tg in enumerate(tags):
        prob = round(TAGGER.marginal(tg, i), 6)
        print('Marginal probability of', tg, 'in position', i, 'is', prob)
        assert prob == results[i], 'Marginal Probability of a Label not correct'

    if verbose:
        model_info = TAGGER.info()
        print(model_info.transitions)
        print(model_info.state_features)
        print(model_info.attributes)

    # materialise the per-token feature dictionaries once, outside the loop
    item_dicts = pycrfsuite.ItemSequence(features).items()

    # write to a text file; the context manager closes it even if a write fails
    with open('training/test.txt', 'w') as fh:
        for tg, items in zip(tags, item_dicts):
            fields = [tg]
            # a colon separates attribute name from value in the output, so
            # colons inside the name are backslash-escaped
            fields.extend(
                str(name).replace(':', '\\:') + ':' + str(items[name])
                for name in sorted(items)
            )
            # every field, including the last one, is followed by a tab
            fh.write('\t'.join(fields) + '\t\n')

    # command line call to the C code to test the output (fixed command
    # string, nothing from raw_string is interpolated into it)
    print('\nCRFsuite call results:')
    os.system(
        'crfsuite tag -pit -m training/addressCRF.crfsuite training/test.txt')