Code example #1
# Excerpt from a spaCy 1.x gazetteer-matching example; the helpers get_bilou,
# make_matcher, get_matches and merge_matches are defined elsewhere in the same file.
from spacy.en import English
from spacy.strings import hash_string
from preshed.maps import PreshMap


def main():
    # Load spaCy with only the tokenizer and vocab (parser, tagger and NER disabled)
    nlp = English(parser=False, tagger=False, entity=False)

    gazetteer = [u'M.I.A.', u'Shiny Happy People', u'James E. Jones']
    example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.'
    pattern_ids = PreshMap()
    max_length = 0
    for pattern_str in gazetteer:
        # Tokenize the gazetteer entry and set a BILOU-position flag on each
        # word's lexeme so the matcher can recognise the phrase later.
        pattern = nlp.tokenizer(pattern_str)
        bilou_tags = get_bilou(len(pattern))
        for word, tag in zip(pattern, bilou_tags):
            lexeme = nlp.vocab[word.orth]
            lexeme.set_flag(tag, True)
        # Remember the hash of the full phrase so candidate matches can be verified.
        pattern_ids[hash_string(pattern.text)] = True
        max_length = max(max_length, len(pattern))

    matcher = make_matcher(nlp.vocab, max_length)

    doc = nlp(example_text)
    matches = get_matches(matcher, pattern_ids, doc)
    merge_matches(doc, matches)
    for token in doc:
        print(token.text, token.ent_type_)
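
The excerpt above depends on helpers from the rest of that example file. For reference only, here is a minimal sketch of the same gazetteer-matching idea using the current spaCy v3 PhraseMatcher API (this is an illustrative addition, not part of the original snippet):

import spacy
from spacy.matcher import PhraseMatcher

nlp = spacy.blank("en")                      # tokenizer-only pipeline
matcher = PhraseMatcher(nlp.vocab)
terms = ["M.I.A.", "Shiny Happy People", "James E. Jones"]
matcher.add("GAZETTEER", [nlp.make_doc(t) for t in terms])

doc = nlp("The artist M.I.A. did a cover of Shiny Happy People.")
for match_id, start, end in matcher(doc):
    print(doc[start:end].text)               # matched gazetteer phrases
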
Code example #2
File: seq2seq.py    Project: moonlightlane/GAN-general
# add special tokens to the embeddings
embeddings_index['SOS'] = SOS_token
embeddings_index['EOS'] = EOS_token
embeddings_index['UNK'] = UNK_token

######### read corpus
triplets = readSQuAD(path_to_data)

######### corpus preprocessing
# TODO: needs some work here: deal with the imprecise tokenizer,
# words that do not appear in the embeddings, etc.

## find all unique tokens in the data (should be a subset of the embedding vocabulary)
data_tokens = ['SOS', 'EOS', 'UNK']
for triple in triplets:
    c = [token.string.strip() for token in spacynlp.tokenizer(triple[0])]
    q = [token.string.strip() for token in spacynlp.tokenizer(triple[1])]
    a = [token.string.strip() for token in spacynlp.tokenizer(triple[2])]
    data_tokens += c + q + a
data_tokens = list(set(data_tokens))  # find unique

# experimental usage only: truncate to the first 10,000 tokens to keep runs fast
data_tokens = data_tokens[0:10000]

num_tokens = len(data_tokens)
# generate some index
# token_indices = random.sample(range(0, len(data_tokens)), 20)
# # debugging purpose
# token_subset = [data_tokens[i] for i in token_indices]
# print('original tokens: ' + str(token_subset))
# # extra preprocessing step to replace all tokens in data_tokens
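
The TODO above (words that are missing from the pretrained embeddings) could be handled by building an explicit embedding matrix with an out-of-vocabulary fallback. A rough sketch, assuming embeddings_index maps ordinary words to NumPy vectors and that emb_dim is the (hypothetical) vector dimensionality:

import numpy as np

emb_dim = 300  # hypothetical dimensionality of the pretrained vectors
token2index = {tok: i for i, tok in enumerate(data_tokens)}

# Rows start as small random vectors, so out-of-vocabulary tokens keep a usable value.
embedding_matrix = np.random.normal(0, 0.1, (num_tokens, emb_dim)).astype(np.float32)
for tok, i in token2index.items():
    vec = embeddings_index.get(tok)
    if isinstance(vec, np.ndarray):      # special tokens map to ids, not vectors
        embedding_matrix[i] = vec
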
Code example #3
File: test_infix.py    Project: alvations/spaCy
from spacy.en import English


def test_period():
    EN = English()
    # 'best.Known' should split on the infix period into three tokens,
    # while a URL-like string such as 'zombo.com' should stay as one token.
    tokens = EN.tokenizer('best.Known')
    assert len(tokens) == 3
    tokens = EN('zombo.com')
    assert len(tokens) == 1
Code example #4
from spacy.en import English

print("Loading English Model...")
nlp = English(entity=False, parser=False)
print("Done!")

print("Vocab. Size: ", len(nlp.vocab.strings))
print("hello" in nlp.vocab.strings)

# run the loaded pipeline (tokenizer and POS tagger here; parser and NER were disabled above)
tokens = nlp(u'Mr Anderson, welcome back, we missed you.')
tokens = [token.lemma_ for token in tokens]
print(tokens)

# the tokenizer alone returns a Doc without any linguistic annotation
tokens = nlp.tokenizer(u'Mr Anderson, welcome back, we missed you.')
print(type(tokens))
tokens = [token for token in tokens]
print(tokens)

print(type(tokens[0]))
print(tokens[0].orth_)
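
Note that from spacy.en import English is the spaCy 1.x import path. As a rough modern equivalent (a sketch, assuming spaCy v3 with the en_core_web_sm model installed), the same tokenize/lemmatize demo would look like:

import spacy

nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])
doc = nlp("Mr Anderson, welcome back, we missed you.")
print([token.lemma_ for token in doc])
print(type(doc[0]), doc[0].orth_)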