def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
    tagger1 = taggers[0]
    tagger1_b = tagger1.to_bytes()
    tagger1 = tagger1.from_bytes(tagger1_b)
    assert tagger1.to_bytes() == tagger1_b
    new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
    assert new_tagger1.to_bytes() == tagger1_b
def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
    tagger1, tagger2 = taggers
    with make_tempdir() as d:
        file_path1 = d / "tagger1"
        file_path2 = d / "tagger2"
        tagger1.to_disk(file_path1)
        tagger2.to_disk(file_path2)
        tagger1_d = Tagger(en_vocab).from_disk(file_path1)
        tagger2_d = Tagger(en_vocab).from_disk(file_path2)
        assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
    tagger1 = taggers[0]
    tagger1_b = tagger1.to_bytes()
    tagger1 = tagger1.from_bytes(tagger1_b)
    assert tagger1.to_bytes() == tagger1_b
    cfg = {"model": DEFAULT_TAGGER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    new_tagger1 = Tagger(en_vocab, model).from_bytes(tagger1_b)
    new_tagger1_b = new_tagger1.to_bytes()
    assert len(new_tagger1_b) == len(tagger1_b)
    assert new_tagger1_b == tagger1_b
def test_issue1727():
    """Test that models with no pretrained vectors can be deserialized
    correctly after vectors are added."""
    data = numpy.ones((3, 300), dtype="f")
    vectors = Vectors(data=data, keys=["I", "am", "Matt"])
    tagger = Tagger(Vocab())
    tagger.add_label("PRP")
    tagger.begin_training()
    assert tagger.cfg.get("pretrained_dims", 0) == 0
    tagger.vocab.vectors = vectors
    with make_tempdir() as path:
        tagger.to_disk(path)
        tagger = Tagger(Vocab()).from_disk(path)
        assert tagger.cfg.get("pretrained_dims", 0) == 0
def part_of_speech(texts: Iterable[str],
                   nlp: Optional[Language] = None,
                   model: str = 'en_core_web_sm') -> Counter:
    """Part-Of-Speech Tags from an Iterable of String Sequences."""
    added_tagger: Union[str, bool] = False
    if nlp is None:
        nlp = spacy.load(model)
    elif isinstance(nlp, Language) and 'tagger' not in nlp.pipe_names:
        nlp.add_pipe(Tagger(nlp.vocab))
        added_tagger = 'tagger'
    pos_tags = {}
    for text in texts:
        if not text:
            continue
        for tok in nlp(text):
            if tok.is_space or not tok.is_alpha or not tok.pos_:
                continue
            if tok.pos_ not in pos_tags:
                pos_tags[tok.pos_] = 1
            else:
                pos_tags[tok.pos_] += 1
    if added_tagger:
        nlp.remove_pipe(added_tagger)
    return Counter(pos_tags)
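# Note: the helper above follows the spaCy v2 convention of passing a Tagger
# instance to nlp.add_pipe. A minimal sketch of the spaCy v3 equivalent, where
# components are added by their registered string name (this sketch assumes
# en_core_web_sm is installed; a freshly added tagger would still need
# training/initialization before it can assign tags):
import spacy

nlp_v3 = spacy.load("en_core_web_sm")
if "tagger" not in nlp_v3.pipe_names:
    nlp_v3.add_pipe("tagger")  # v3: add by name, not by passing an instance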
def train_spacy(data, iterations, model=None):
    TRAIN_DATA = pickle.load(open(data, 'rb'))
    nlp = spacy.blank('en')  # create blank Language class

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    if model is None:
        optimizer = nlp.begin_training()
    else:
        move_names = list(ner.move_names)  # labels already known to the entity recognizer
        print("Existing entities in the model are:", move_names)
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                text, annotations = zip(*batch)
                nlp.update(
                    text,          # batch of texts
                    annotations,   # batch of annotations
                    drop=0.2,      # dropout - make it harder to memorise data
                    sgd=optimizer, # callable to update weights
                    losses=losses)
            print(losses)

    # custom_ner_model = spacy.load(nlp)
    nlp_core_model = spacy.load("en_core_web_lg")
    tagger = Tagger(nlp_core_model.vocab)
    nlp.add_pipe(tagger, before="ner")
    parser = DependencyParser(nlp_core_model.vocab)
    nlp.add_pipe(parser, before="ner")
    nlp.begin_training()

    return TRAIN_DATA, nlp
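# Note: train_spacy above uses the spaCy v2 training API, where nlp.update takes
# separate batches of texts and annotations. A minimal sketch of the v3-style
# inner loop (reusing nlp, TRAIN_DATA, optimizer and losses from the function
# above as assumed names; under v3, nlp.update expects Example objects):
from spacy.training import Example

for batch in minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)):
    examples = [
        Example.from_dict(nlp.make_doc(text), annotations)
        for text, annotations in batch
    ]
    nlp.update(examples, drop=0.2, sgd=optimizer, losses=losses)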
def test_serialize_tagger_strings(en_vocab, de_vocab, taggers):
    label = "SomeWeirdLabel"
    assert label not in en_vocab.strings
    assert label not in de_vocab.strings
    tagger = taggers[0]
    assert label not in tagger.vocab.strings
    with make_tempdir() as d:
        # check that custom labels are serialized as part of the component's strings.jsonl
        tagger.add_label(label)
        assert label in tagger.vocab.strings
        file_path = d / "tagger1"
        tagger.to_disk(file_path)
        # ensure that the custom strings are loaded back in when using the tagger in another pipeline
        cfg = {"model": DEFAULT_TAGGER_MODEL}
        model = registry.resolve(cfg, validate=True)["model"]
        tagger2 = Tagger(de_vocab, model).from_disk(file_path)
        assert label in tagger2.vocab.strings
def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
    tagger1, tagger2 = taggers
    with make_tempdir() as d:
        file_path1 = d / "tagger1"
        file_path2 = d / "tagger2"
        tagger1.to_disk(file_path1)
        tagger2.to_disk(file_path2)
        cfg = {"model": DEFAULT_TAGGER_MODEL}
        model = registry.resolve(cfg, validate=True)["model"]
        tagger1_d = Tagger(en_vocab, model).from_disk(file_path1)
        tagger2_d = Tagger(en_vocab, model).from_disk(file_path2)
        assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
def main(data_dir, model_dir=None, exclude_normalize_tags=None, keys={}):
    '''
    data_dir -> path to brat annotation data. searches recursively
    model_dir -> path to save spacy training model
    exclude_normalize_tags -> list of tags to exclude from normalization.
        If NONE, no normalization is performed.
    keys -> dict translating brat tags to training tags. keys not in dict
        will be preserved
    '''
    r = RepoModel(data_dir, recursive=True, cached=False)
    nlp = spacy.load('en_default')

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    normalized_train_data = []
    excludes = exclude_normalize_tags  # we have manually tagged all instances of these

    for key, data in r.documents.items():
        if exclude_normalize_tags:
            normalized_train_data.extend(
                normalize_tags(nlp, get_annotated_sents(data, keys), excludes))
        else:
            normalized_train_data.extend(get_annotated_sents(data, keys))

    # print(normalized_train_data)
    nlp = train_ner(nlp, normalized_train_data, keys.values())

    doc = nlp(
        u"Hi Adam,\nSounds great to me. I'll send through the QA department. In the invite you through Skype, and we can discuss if Applause is right for you.\nI look forward to it!\nRegards,\nAndrew"
    )
    for word in doc:
        print(word.text, word.tag_, word.ent_type_)

    if model_dir is not None:
        save_model(nlp, model_dir)
def top_60_feature_extraction(inputs):
    nlp = spacy.load('en')
    desired_labels = [
        'PERSON', 'NORP', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
        'WORK_OF_ART', 'LAW', 'LANGUAGE'
    ]
    tagger = Tagger(nlp.vocab, model=True)
    features = []
    for inp in tqdm(inputs):
        postStr = ' '.join(inp['postText'])
        parsed_post = nlp(postStr)
        parsed_title = nlp(inp['targetTitle'])
        keywords = [
            kw.strip().lower() for kw in inp['targetKeywords'].split(',')
        ]
        if len(parsed_post) == 0:
            parsed_post = nlp(inp['targetDescription'])
        # tokenize (also by punctuation)
        tokens_by_punc = word_tokenize(postStr)
        # get parts of speech
        TAG = [token.tag_ for token in parsed_post]
        # get word lengths in post
        lens = [len(token.text) for token in parsed_post]
        features.append([
            # 1 number of proper nouns
            match_tags(parsed_post, ['NNP']),
            # 2 readability of target paragraphs
            # 3 number of tokens
            len(tokens_by_punc),
            # 4 word length of post text
            len(parsed_post),
            # 5 POS 2-gram NNP NNP
            match_tags(parsed_post, ['NNP', 'NNP']),
            # 6 Whether the post starts with number
            1 if is_first_word_number(parsed_post) else 0,
            # 7 Average length of words in post
            np.mean(lens),
            # 8 Number of Prepositions / Subordinating Conjunction
            match_tags(parsed_post, ['IN']),
            # 9 POS 2-gram NNP 3rd person singular present Verb
            match_tags(parsed_post, ['NNP', 'VBZ']),
            # 10 POS 2-gram IN NNP
            match_tags(parsed_post, ['IN', 'NNP']),
            # 11 length of the longest word in post text
            max(lens),
            # 12 number of wh-adverb
            match_tags(parsed_post, ['WRB']),
            # 13 count POS pattern WRB
            # 14 number of single/mass nouns
            match_tags(parsed_post, ['NN']),
            # 15 count POS pattern NN
            # 16 whether the post text starts with 5W1H
            1 if parsed_post[0].tag_ in ['WDT', 'WP', 'WP$', 'WRB'] else 0,
            # 17 Whether exist Question Mark
            1 if '?' in postStr else 0,
            # 18 similarity between post and target title
            parsed_post.similarity(parsed_title),
            # 19 Count POS pattern this/these NN
            match_this_NN(parsed_post),
            # 20 Count POS pattern PRP
            # 21 Number of PRP
            match_tags(parsed_post, ['PRP']),
            # 22 Number of VBZ
            match_tags(parsed_post, ['VBZ']),
            # 23 POS 3-gram NNP NNP VBZ
            match_tags(parsed_post, ['NNP', 'NNP', 'VBZ']),
            # 24 POS 2-gram NN IN
            match_tags(parsed_post, ['NN', 'IN']),
            # 25 POS 3-gram NN IN NNP
            match_tags(parsed_post, ['NN', 'IN', 'NNP']),
            # 26 ratio of stop words in posttext
            len([tok for tok in parsed_post if tok.is_stop]) / len(parsed_post),
            # 27 POS 2-gram NNP
            match_NNP_period(parsed_post),
            # 28 POS 2-gram PRP VBP
            match_tags(parsed_post, ['PRP', 'VBP']),
            # 29 Count POS pattern WP
            # 30 Number of WP
            match_tags(parsed_post, ['WP']),
            # 31 Count POS pattern DT
            # 32 Number of DT
            match_tags(parsed_post, ['DT']),
            # 33 POS 2-gram NNP IN
            match_tags(parsed_post, ['NNP', 'IN']),
            # 34 POS 3-gram IN NNP NNP
            match_tags(parsed_post, ['IN', 'NNP', 'NNP']),
            # 35 Number of POS
            match_tags(parsed_post, ['POS']),
            # 36 POS 2-gram IN IN
            match_tags(parsed_post, ['IN', 'IN']),
            # 37 Match between keywords and post
            len([kw for kw in keywords if kw in postStr.lower()]),
            # 38 Number of ','
            len([ch for ch in postStr if ch == ',']),
            # 39 POS 2-gram NNP NNS
            match_tags(parsed_post, ['NNP', 'NNS']),
            # 40 POS 2-gram IN JJ
            match_tags(parsed_post, ['IN', 'JJ']),
            # 41 POS 2-gram NNP POS
            match_tags(parsed_post, ['NNP', 'POS']),
            # 42 WDT
            match_tags(parsed_post, ['WDT']),
            # 43 Count POS pattern WDT
            # 44 POS 2-gram NN NN
            match_tags(parsed_post, ['NN', 'NN']),
            # 45 POS 2-gram NN NNP
            match_tags(parsed_post, ['NN', 'NNP']),
            # 46 POS 2-gram NNP VBD
            match_tags(parsed_post, ['NN', 'VBD']),
            # 47 Similarity between post and target paragraphs
            # 48 POS pattern RB
            match_tags(parsed_post, ['RB']),
            # 49 Number of RB
            # 50 POS 3-gram NNP NNP NNP
            match_tags(parsed_post, ['NNP', 'NNP', 'NNP']),
            # 51 POS 3-gram NNP NNP NN
            match_tags(parsed_post, ['NNP', 'NNP', 'NN']),
            # 52 Readability of target paragraphs
            # 53 Number of RBS
            match_tags(parsed_post, ['RBS']),
            # 54 Number of VBN
            match_tags(parsed_post, ['VBN']),
            # 55 POS 2-gram VBN IN
            match_tags(parsed_post, ['VBN', 'IN']),
            # 56 whether exist NUMBER NP VB
            match_NUM_NP_VB(parsed_post),
            # 57 POS 2-gram JJ NNP
            match_tags(parsed_post, ['JJ', 'NNP']),
            # 58 POS 3-gram NNP NN NN
            match_tags(parsed_post, ['NNP', 'NN', 'NN']),
            # 59 POS 2-gram DT NN
            match_tags(parsed_post, ['DT', 'NN']),
            # 60 whether exist EX
            1 if match_tags(parsed_post, ['EX']) > 1 else 0
        ])
    return features
        ])
        counter = end
        parts.extend(sentences[i][counter:])
        sentences_marked.append(''.join(parts))
    else:
        # the target is a single token
        start = target_indices[i][0]
        end = target_indices[i][1]
        sentences_marked.append(''.join([
            sentences[i][0:start], '[TARGET_START] ',
            sentences[i][start:end], ' [TARGET_END]',
            sentences[i][end:]
        ]))

# SpaCy NLP tools
nlp = en_core_web_md.load()
tagger = Tagger(nlp.vocab)

# Add special case rule
special_case1 = [{ORTH: "[TARGET_START]"}]
special_case2 = [{ORTH: "[TARGET_END]"}]
nlp.tokenizer.add_special_case("[TARGET_START]", special_case1)
nlp.tokenizer.add_special_case("[TARGET_END]", special_case2)

# A list of universal POS tags used by SpaCy (except for 'SPECIAL' and 'PAD' tags).
# Tag 'SPECIAL' is used only for markers '[CLS]' and '[SEP]', and tag 'PAD' for padded tokens.
pos_tags = [
    'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
    'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE',
    'SPECIAL', 'PAD'
]
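# A quick, illustrative check of the special-case rules above (the sample
# sentence is made up): with both markers registered, the tokenizer should keep
# '[TARGET_START]' and '[TARGET_END]' as single tokens when they are surrounded
# by whitespace.
doc = nlp("He said [TARGET_START] hello [TARGET_END] to everyone")
print([token.text for token in doc])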
def taggers(en_vocab):
    cfg = {"model": DEFAULT_TAGGER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    tagger1 = Tagger(en_vocab, model)
    tagger2 = Tagger(en_vocab, model)
    return tagger1, tagger2
def __init__(self, model="fr_core_news_md"):
    self.nlp = spacy.load(model)
    self.tagger = Tagger(self.nlp.vocab)
    self.parsed_text = OrderedDict()
def taggers(en_vocab):
    tagger1 = Tagger(en_vocab)
    tagger2 = Tagger(en_vocab)
    tagger1.model = tagger1.Model(8)
    tagger2.model = tagger1.model
    return (tagger1, tagger2)