def main():
    """Demo: tag gazetteer phrases as entities via BILOU lexeme flags.

    Loads a tokenizer-only English pipeline, registers each gazetteer
    phrase's tokens with per-position BILOU flags on the vocab, records
    the phrase hashes in a PreshMap, then matches and merges the phrases
    found in an example sentence and prints each token's entity type.
    """
    nlp = English(parser=False, tagger=False, entity=False)
    gazetteer = [u'M.I.A.', 'Shiny Happy People', 'James E. Jones']
    example_text = u'The artist M.I.A. did a cover of Shiny Happy People. People is not an entity.'

    phrase_hashes = PreshMap()
    longest = 0
    for phrase in gazetteer:
        tokens = nlp.tokenizer(phrase)
        # Flag each token of the phrase with its positional BILOU tag.
        for tok, bilou_flag in zip(tokens, get_bilou(len(tokens))):
            nlp.vocab[tok.orth].set_flag(bilou_flag, True)
        phrase_hashes[hash_string(tokens.text)] = True
        if len(tokens) > longest:
            longest = len(tokens)

    matcher = make_matcher(nlp.vocab, longest)
    doc = nlp(example_text)
    merge_matches(doc, get_matches(matcher, phrase_hashes, doc))
    for token in doc:
        print(token.text, token.ent_type_)
# add special tokens to the embeddings
embeddings_index['SOS'] = SOS_token
embeddings_index['EOS'] = EOS_token
embeddings_index['UNK'] = UNK_token

######### read corpus
triplets = readSQuAD(path_to_data)

######### corpus preprocessing
# TODO: need some work here: deal with imprecise tokenizer,
# words that do not appear in embeddings, etc

## find all unique tokens in the data (should be a subset of the number of embeddings)
# Accumulate directly into a set instead of appending every duplicate token
# to a list and deduplicating at the end — avoids holding one entry per
# corpus token in memory at once.
unique_tokens = {'SOS', 'EOS', 'UNK'}
for triple in triplets:
    # triple is (context, question, answer); tokenize each field.
    for text in triple[:3]:
        unique_tokens.update(token.string.strip() for token in spacynlp.tokenizer(text))
data_tokens = list(unique_tokens)

# experimental usage only
# NOTE(review): set ordering is arbitrary, so this keeps an arbitrary 10k
# subset (the original list(set(...)) slice had the same nondeterminism).
data_tokens = data_tokens[0:10000]
num_tokens = len(data_tokens)

# generate some index
# token_indices = random.sample(range(0, len(data_tokens)), 20)
# # debugging purpose
# token_subset = [data_tokens[i] for i in token_indices]
# print('original tokens: ' + str(token_subset))
# # extra preprocessing step to replace all tokens in data_tokens
def test_period():
    """Interior periods: 'best.Known' splits into three tokens, while a
    recognised domain like 'zombo.com' stays a single token."""
    nlp = English()
    assert len(nlp.tokenizer('best.Known')) == 3
    assert len(nlp('zombo.com')) == 1
from spacy.en import English

print("Loading English Model...")
# Tokenizer + tagger only: entity recognition and parsing are disabled.
nlp = English(entity=False, parser=False)
print("Done!")

print("Vocab. Size: ", len(nlp.vocab.strings))
print("hello" in nlp.vocab.strings)

# Run the loaded pipeline on a sentence and show the lemma of every token.
doc = nlp(u'Mr Anderson, welcome back, we missed you.')
lemmas = [tok.lemma_ for tok in doc]
print(lemmas)

# Tokenizer-only pass over the same sentence: inspect the container type,
# the token list, a single token's type, and its orthographic form.
tokens = nlp.tokenizer('Mr Anderson, welcome back, we missed you.')
print(type(tokens))
tokens = list(tokens)
print(tokens)
print(type(tokens[0]))
print(tokens[0].orth_)