def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
    """Teach the entity recognizer the module-level ``LABEL`` and try it out.

    Args:
        model: Name or path passed to ``spacy.load``. When ``None`` a blank
            English pipeline is created instead.
        new_model_name: Stored in ``nlp.meta['name']`` before saving.
        output_dir: Directory the pipeline is written to. Nothing is saved
            when this is ``None``.
        n_iter: Number of passes over ``TRAIN_DATA``.
    """
    def report_entities(doc):
        # One "label text" line per predicted entity.
        for ent in doc.ents:
            print(ent.label_, ent.text)

    starting_fresh = model is None
    if starting_fresh:
        nlp = spacy.blank('en')
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)

    # Reuse the recognizer when the pipeline already has one, otherwise
    # create the built-in component and append it.
    if 'ner' in nlp.pipe_names:
        ner = nlp.get_pipe('ner')
    else:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    ner.add_label(LABEL)

    # 'begin_training' initializes the models, so it is only used for a
    # blank pipeline; a loaded one just gets a new optimizer.
    if starting_fresh:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()

    # Every component except the recognizer is disabled while training.
    frozen = [name for name in nlp.pipe_names if name != 'ner']
    with nlp.disable_pipes(*frozen):
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Quick check of the trained pipeline.
    test_text = 'Do you like horses?'
    trained_doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    report_entities(trained_doc)

    if output_dir is None:
        return

    # Save, then reload from disk and run the same sentence again.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    print("Loading from", output_dir)
    reloaded = spacy.load(output_dir)
    report_entities(reloaded(test_text))
def test_issue2626():
    '''Check that this sentence doesn't cause an infinite loop in the tokenizer.

    The test passes simply by returning: it only tokenizes the text with a
    blank English pipeline and makes no assertion about the resulting tokens.
    '''
    nlp = spacy.blank('en')
    # The literal is kept exactly as-is, including the \xa0 escapes; it is
    # the input this regression test exists for.
    text = """ ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume """
    # Only the tokenizer runs here (make_doc), not any pipeline components.
    doc = nlp.make_doc(text)
def main(n_iter=10):
    """Train a blank English NER with an extra multitask objective.

    The recognizer gets ``get_position_label`` as an auxiliary objective, is
    trained for ``n_iter`` passes over the module-level ``TRAIN_DATA``, and
    is then run over the training texts with its predictions printed.

    Args:
        n_iter: Number of passes over ``TRAIN_DATA``.
    """
    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    ner.add_multitask_objective(get_position_label)
    nlp.add_pipe(ner)
    print("Create data", len(TRAIN_DATA))

    def gold_tuples():
        # Looked up at call time, exactly like the lambda it replaces.
        return TRAIN_DATA

    optimizer = nlp.begin_training(get_gold_tuples=gold_tuples)
    for _ in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, (annotations, _unused) in TRAIN_DATA:
            doc = nlp.make_doc(text)
            gold = GoldParse.from_annot_tuples(doc, annotations[0])
            nlp.update(
                [doc],  # batch of docs
                [gold],  # matching batch of gold annotations
                sgd=optimizer,
                drop=0.2,
                losses=losses,
            )
        # The auxiliary loss may be missing, the NER loss must be present.
        print(losses.get('nn_labeller', 0.0), losses['ner'])

    # Run the trained pipeline over the training texts.
    for text, _unused in TRAIN_DATA:
        doc = nlp(text)
        entities = [(ent.text, ent.label_) for ent in doc.ents]
        tokens = [(t.text, t.ent_type_, t.ent_iob) for t in doc]
        print('Entities', entities)
        print('Tokens', tokens)
def main(patterns_loc, text_loc, n=10000, lang="en"):
    """Time phrase matching over a text file and print a one-line summary.

    Args:
        patterns_loc: Location handed to ``read_gazetteer``.
        text_loc: Location handed to ``read_text``.
        n: Passed to ``read_text`` as ``n`` and reported as the doc count.
        lang: Language code for the blank pipeline.
    """
    nlp = spacy.blank(lang)
    # Clear the lexical attribute getters on the vocab.
    nlp.vocab.lex_attr_getters = {}
    tokenizer = nlp.tokenizer
    phrases = read_gazetteer(tokenizer, patterns_loc)

    started = time.time()
    matches = get_matches(tokenizer, phrases, read_text(text_loc, n=n))
    # Each match is an (ent_id, text) pair; only the number of them matters.
    count = sum(1 for _ent_id, _text in matches)
    elapsed = time.time() - started
    print("%d docs in %.3f s. %d matches" % (n, elapsed, count))
def main(model=None, output_dir=None, n_iter=15):
    """Train the dependency parser on ``TRAIN_DATA`` and show a sample parse.

    Args:
        model: Name or path passed to ``spacy.load``. A blank English
            pipeline is created when this is ``None``.
        output_dir: Directory to save the pipeline to. Nothing is saved when
            this is ``None``.
        n_iter: Number of passes over ``TRAIN_DATA``.
    """
    test_text = "I like securities."

    def show_parse(pipeline):
        # Print (token, dependency label, head) for the test sentence.
        doc = pipeline(test_text)
        print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

    if model is None:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)

    # Reuse an existing parser, otherwise create one at the front of the
    # pipeline.
    if "parser" in nlp.pipe_names:
        parser = nlp.get_pipe("parser")
    else:
        parser = nlp.create_pipe("parser")
        nlp.add_pipe(parser, first=True)

    # Register every dependency label that occurs in the training data.
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # Only the parser stays enabled while training.
    frozen = [name for name in nlp.pipe_names if name != "parser"]
    with nlp.disable_pipes(*frozen):
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batch_sizes = compounding(4.0, 32.0, 1.001)
            for batch in minibatch(TRAIN_DATA, size=batch_sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    show_parse(nlp)

    if output_dir is None:
        return

    # Save, then reload from disk and parse the same sentence again.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    print("Loading from", output_dir)
    show_parse(spacy.load(output_dir))
def load_nlp(corpus, config, vectors=None):
    """Create a blank pipeline for ``corpus``, optionally loading a vocab.

    The language code is the part of ``corpus`` before the first underscore.

    Args:
        corpus: Treebank name; also stored in ``nlp.meta["treebank"]``.
        config: Object whose ``vectors`` attribute says whether vectors are
            wanted.
        vectors: Directory containing one sub-directory per corpus.

    Returns:
        The blank pipeline.

    Raises:
        ValueError: If ``config.vectors`` is truthy but ``vectors`` is not.
    """
    lang, *_ = corpus.split("_")
    nlp = spacy.blank(lang)
    if config.vectors:
        if not vectors:
            raise ValueError(
                "config asks for vectors, but no vectors "
                "directory set on command line (use -v)"
            )
        corpus_dir = Path(vectors) / corpus
        # A corpus without its own directory is silently left without vocab.
        if corpus_dir.exists():
            nlp.vocab.from_disk(corpus_dir / "vocab")
    nlp.meta["treebank"] = corpus
    return nlp
def FeatureExtracter(lang, attrs=None, tokenized=True):
    """Build a layer that turns texts into per-document attribute arrays.

    Args:
        lang: Language code for the blank pipeline used for tokenization and
            vocab.
        attrs: Attribute IDs passed to ``Doc.to_array``. Defaults to
            ``[LOWER, SHAPE, PREFIX, SUFFIX]``; a ``None`` sentinel is used
            so that no list object is shared between calls.
        tokenized: When true, each input is a sequence of words and a ``Doc``
            is built from it directly; otherwise each input is a raw string
            and is run through the pipeline.

    Returns:
        ``layerize(forward)``, where ``forward(texts, drop)`` returns the
        list of arrays together with a backward callback.
    """
    if attrs is None:
        attrs = [LOWER, SHAPE, PREFIX, SUFFIX]
    nlp = spacy.blank(lang)
    # PREFIX / SUFFIX are redefined as the first / last three characters.
    nlp.vocab.lex_attr_getters[PREFIX] = lambda string: string[:3]
    nlp.vocab.lex_attr_getters[SUFFIX] = lambda string: string[-3:]

    def forward(texts, drop=0.):
        if tokenized:
            docs = [Doc(nlp.vocab, words) for words in texts]
        else:
            docs = [nlp(text) for text in texts]
        features = [doc.to_array(attrs) for doc in docs]

        def backward(d_features, sgd=None):
            # Nothing is learned here; the gradient passes through unchanged.
            return d_features

        return features, backward

    return layerize(forward)
def main(
    lang,
    in_dir,
    out_loc,
    negative=5,
    n_workers=4,
    window=5,
    size=128,
    min_count=10,
    nr_iter=2,
):
    """Count words in a directory of texts, then train and save Word2Vec.

    Args:
        lang: Language code for the blank pipeline used to tokenize.
        in_dir: Directory wrapped in a ``Corpus``.
        out_loc: Location the trained model is saved to.
        negative, n_workers, window, size, min_count: Forwarded to
            ``Word2Vec`` (``n_workers`` as ``workers``). ``min_count`` also
            filters the vocabulary built here.
        nr_iter: Assigned to ``model.iter`` before training.
    """
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s : %(levelname)s : %(message)s",
    )
    w2v_settings = dict(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
    )
    model = Word2Vec(**w2v_settings)
    nlp = spacy.blank(lang)
    corpus = Corpus(in_dir)

    # First pass: tokenize every file and accumulate counts in the corpus.
    n_words = 0
    n_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with text_loc.open("r", encoding="utf-8") as file_:
            text = file_.read()
        # Newlines are counted as the number of sentences.
        n_sents += text.count("\n")
        n_words += corpus.count_doc(nlp(text))
        logger.info(
            "PROGRESS: at batch #%i, processed %i words, keeping %i word types",
            text_no,
            n_words,
            len(corpus.strings),
        )

    # Hand the frequency table to the model, dropping rare entries.
    model.corpus_count = n_sents
    model.raw_vocab = defaultdict(int)
    for orth, freq in corpus.counts:
        if freq < min_count:
            continue
        word = nlp.vocab.strings[orth]
        model.raw_vocab[word] = freq
    model.scale_vocab()
    model.finalize_vocab()

    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)
def main(model=None, output_dir=None, n_iter=15):
    """Train a fresh parser on ``TRAIN_DATA`` in minibatches and test it.

    Args:
        model: Name or path passed to ``spacy.load``. A blank English
            pipeline is created when this is ``None``.
        output_dir: Directory to save the pipeline to. Nothing is saved when
            this is ``None``.
        n_iter: Number of passes over ``TRAIN_DATA``.
    """
    if model is None:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)

    # Any existing parser is thrown away so training starts from a new
    # instance of the built-in component.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    # Register every dependency label that occurs in the training data.
    for _text, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # Only the parser stays enabled while training.
    frozen = [name for name in nlp.pipe_names if name != "parser"]
    with nlp.disable_pipes(*frozen):
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batch_sizes = compounding(4.0, 32.0, 1.001)
            for batch in minibatch(TRAIN_DATA, size=batch_sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    test_model(nlp)

    if output_dir is None:
        return

    # Save, then reload from disk and run the same test again.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    print("Loading from", output_dir)
    test_model(spacy.load(output_dir))
def main(model=None, output_dir=None, n_iter=5):
    """Train a fresh parser on ``TRAIN_DATA`` one example at a time.

    Args:
        model: Name or path passed to ``spacy.load``. A blank English
            pipeline is created when this is ``None``.
        output_dir: Directory to save the pipeline to. Nothing is saved when
            this is ``None``.
        n_iter: Number of passes over ``TRAIN_DATA``.
    """
    if model is None:
        nlp = spacy.blank('en')
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)

    # Any existing parser is thrown away so training starts from a new
    # instance of the built-in component.
    if 'parser' in nlp.pipe_names:
        nlp.remove_pipe('parser')
    parser = nlp.create_pipe('parser')
    nlp.add_pipe(parser, first=True)

    # Register every dependency label that occurs in the training data.
    for _text, annotations in TRAIN_DATA:
        for dep in annotations.get('deps', []):
            parser.add_label(dep)

    # Only the parser stays enabled while training.
    frozen = [name for name in nlp.pipe_names if name != 'parser']
    with nlp.disable_pipes(*frozen):
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], sgd=optimizer,
                           losses=losses)
            print(losses)

    test_model(nlp)

    if output_dir is None:
        return

    # Save, then reload from disk and run the same test again.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    print("Loading from", output_dir)
    test_model(spacy.load(output_dir))
def main(lang='en', output_dir=None, n_iter=25):
    """Train a tagger with the custom ``TAG_MAP`` on a blank pipeline.

    Args:
        lang: Language code for the blank pipeline.
        output_dir: Directory to save the pipeline to. Nothing is saved when
            this is ``None``.
        n_iter: Number of passes over ``TRAIN_DATA``.
    """
    test_text = "I like blue eggs"

    def show_tags(pipeline):
        # Print (token, fine-grained tag, coarse tag) for the test sentence.
        doc = pipeline(test_text)
        print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])

    nlp = spacy.blank(lang)
    tagger = nlp.create_pipe('tagger')
    # Labels have to be registered before training starts.
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for _ in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)

    show_tags(nlp)

    if output_dir is None:
        return

    # Save, then reload from disk and tag the same sentence again.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    print("Loading from", output_dir)
    show_tags(spacy.load(output_dir))
def main(vectors_loc, lang=None):
    """Load word vectors from a text file into a vocab and test similarity.

    The first line of the file holds two whitespace-separated fields, the
    second being the vector width. Every following line is a word followed
    by that many space-separated floats.

    Args:
        vectors_loc: Path of the vectors file (opened in binary mode).
        lang: Language code for ``spacy.blank``. A bare ``Language()`` is
            used when this is ``None``.
    """
    # A language code is needed if the pipeline is to be saved and loaded
    # again later; 'xx' selects the blank multi-language class.
    nlp = Language() if lang is None else spacy.blank(lang)

    with open(vectors_loc, 'rb') as file_:
        _nr_row, nr_dim = file_.readline().split()
        width = int(nr_dim)
        nlp.vocab.reset_vectors(width=width)
        for raw_line in file_:
            decoded = raw_line.rstrip().decode('utf8')
            # Split from the right so a word containing spaces stays whole.
            word, *values = decoded.rsplit(' ', width)
            vector = numpy.asarray([float(v) for v in values], dtype='f')
            nlp.vocab.set_vector(word, vector)

    # Compare two tokens as a quick check of the loaded vectors.
    text = 'class colspan'
    doc = nlp(text)
    print(text, doc[0].similarity(doc[1]))
def make_vocab(cmd, lang, output_dir, lexemes_loc, vectors_loc=None, prune_vectors=-1):
    """Compile a vocabulary from a lexicon jsonl file and word vectors.

    Args:
        cmd: Not used by this function.
        lang: Language code for the blank pipeline.
        output_dir: Directory the pipeline is saved to (created if missing).
        lexemes_loc: Path of the JSONL lexicon. Each non-blank line is either
            a ``{"settings": ...}`` record merged into ``nlp.vocab.cfg`` or a
            lexeme record with at least ``orth`` and ``id``.
        vectors_loc: Optional location of a vector array readable by
            ``numpy.load``.
        prune_vectors: When >= 1, passed to ``nlp.vocab.prune_vectors``.

    Returns:
        The pipeline that was written to ``output_dir``.
    """
    if not lexemes_loc.exists():
        prints(lexemes_loc, title="Can't find lexical data", exits=1)
    vectors_loc = ensure_path(vectors_loc)
    nlp = spacy.blank(lang)
    # Reset all ranks so that a non-zero rank later means "set by lexicon".
    for word in nlp.vocab:
        word.rank = 0
    lex_added = 0
    with lexemes_loc.open() as file_:
        for line in file_:
            if line.strip():
                attrs = json.loads(line)
                if 'settings' in attrs:
                    nlp.vocab.cfg.update(attrs['settings'])
                else:
                    lex = nlp.vocab[attrs['orth']]
                    lex.set_attrs(**attrs)
                    assert lex.rank == attrs['id']
                lex_added += 1
    if vectors_loc is not None:
        # Read inside a context manager so the handle is closed once the
        # array is loaded, instead of being left open.
        with vectors_loc.open('rb') as vectors_file:
            vector_data = numpy.load(vectors_file)
        nlp.vocab.vectors = Vectors(data=vector_data)
        for word in nlp.vocab:
            if word.rank:
                nlp.vocab.vectors.add(word.orth, row=word.rank)
    if prune_vectors >= 1:
        # The returned remapping is not needed here.
        nlp.vocab.prune_vectors(prune_vectors)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    vec_added = len(nlp.vocab.vectors)
    prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
           title="Sucessfully compiled vocab and vectors, and saved model")
    return nlp
def __iter__(self):
    """Iterate via ``self.iter`` using a new blank ``self.lang`` pipeline."""
    return self.iter(spacy.blank(self.lang))
import json
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin

# Exercise template: the ``____`` placeholders at the bottom are left for
# the reader to fill in.
with open("exercises/es/adidas.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = spacy.blank("es")
matcher = Matcher(nlp.vocab)
# Add patterns to the matcher
pattern1 = [{"LOWER": "adidas"}, {"LOWER": "zx"}]
pattern2 = [{"LOWER": "adidas"}, {"IS_DIGIT": True}]
matcher.add("ROPA", [pattern1, pattern2])
docs = []
for doc in nlp.pipe(TEXTS):
    matches = matcher(doc)
    # Turn every match into a Span labelled with the match ID and store the
    # spans as the doc's entities.
    spans = [
        Span(doc, start, end, label=match_id) for match_id, start, end in matches
    ]
    doc.ents = spans
    docs.append(doc)

doc_bin = ____(____=____)
doc_bin.____(____)
def __init__(self, lang:str):
    """Store a blank ``lang`` pipeline on ``self.tok``.

    The parser, tagger and NER names are passed as ``disable``.
    """
    disabled = ["parser","tagger","ner"]
    self.tok = spacy.blank(lang, disable=disabled)
def load_nlp(corpus, config):
    """Return a blank pipeline for the language prefix of ``corpus``.

    The language code is the part of ``corpus`` before the first underscore.
    When ``config.vectors`` is truthy, the vocab is loaded from its
    ``vocab`` sub-path.
    """
    lang, *_ = corpus.split("_")
    nlp = spacy.blank(lang)
    if not config.vectors:
        return nlp
    nlp.vocab.from_disk(config.vectors / "vocab")
    return nlp
def __init__(self, lang='en', special_toks=None, buf_sz=5000):
    """Set up a blank ``lang`` pipeline and keep its ``pipe`` method.

    Args:
        lang: Language code for the blank pipeline.
        special_toks: Tokens registered as tokenizer special cases. Falls
            back to ``defaults.text_spec_tok`` via ``ifnone``.
        buf_sz: Stored unchanged on ``self.buf_sz``.
    """
    special_toks = ifnone(special_toks, defaults.text_spec_tok)
    nlp = spacy.blank(lang, disable=["parser", "tagger", "ner"])
    tokenizer = nlp.tokenizer
    for tok in special_toks:
        # Each special token maps to a single token with the same text.
        tokenizer.add_special_case(tok, [{ORTH: tok}])
    self.pipe = nlp.pipe
    self.buf_sz = buf_sz
import scattertext as st
import spacy

# spacy.blank() takes a language code, not a model package name, so a blank
# English pipeline is requested with 'en'.
nlp = spacy.blank('en')
# Drop every tokenizer special-case rule whose key contains a straight or
# curly apostrophe.
nlp.tokenizer.rules = {key: value for key, value in nlp.tokenizer.rules.items()
                       if "'" not in key and "’" not in key and "‘" not in key}
nlp.add_pipe(nlp.create_pipe('sentencizer'))

# Parse each convention speech with the customised pipeline.
df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(nlp)
)

corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().compact(st.ClassPercentageCompactor(term_count=10))

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0,
    pmi_threshold_coefficient=0,
    width_in_pixels=1000,
    metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    show_diagonal=False,
    max_overlapping=3
)
# Write through a context manager so the file is flushed and closed.
with open('./demo_with_apostrophes.html', 'w') as html_file:
    html_file.write(html)
print('open ./demo_with_apostrophes.html in Chrome')
def train_new_ner(model=None, output_dir=model_dir, n_iter=100):
    """Train the entity recognizer on ``TRAIN_DATA`` and print predictions.

    Args:
        model: Name or path passed to ``spacy.load``. A blank English
            pipeline is created when this is ``None``.
        output_dir: Directory to save the pipeline to. Nothing is saved when
            this is ``None``.
        n_iter: Number of passes over ``TRAIN_DATA``.
    """
    def show_predictions(pipeline):
        # Print entities and per-token entity info for every training text.
        for text, _ in TRAIN_DATA:
            doc = pipeline(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    starting_fresh = model is None
    if starting_fresh:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)

    # Reuse an existing recognizer, otherwise create the built-in one and
    # put it last in the pipeline.
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)

    # Register every entity label found in the training data.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Only the recognizer stays enabled while training.
    frozen = [name for name in nlp.pipe_names if name != "ner"]
    with nlp.disable_pipes(*frozen):
        # Weights are reset only when training a brand-new model.
        if starting_fresh:
            nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batch_sizes = compounding(4.0, 32.0, 1.001)
            for batch in minibatch(TRAIN_DATA, size=batch_sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, drop=0.5, losses=losses)
            print("Losses", losses)

    show_predictions(nlp)

    if output_dir is None:
        return

    # Save, then reload from disk and predict again.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    print("Loading from", output_dir)
    show_predictions(spacy.load(output_dir))
def load_nlp(corpus, config):
    """Return a blank pipeline for the language prefix of ``corpus``.

    The language code is the part of ``corpus`` before the first underscore.
    When ``config.vectors`` is truthy, the vocab is loaded from its
    ``vocab`` sub-path.
    """
    lang, *_ = corpus.split('_')
    nlp = spacy.blank(lang)
    if not config.vectors:
        return nlp
    nlp.vocab.from_disk(config.vectors / 'vocab')
    return nlp
def train(train_data,iterations,modelPath):
    """Train a text categorizer on ``train_data`` and return the pipeline.

    Args:
        train_data: List of (text, annotations) pairs; shuffled in place on
            every iteration.
        iterations: Number of passes over ``train_data``.
        modelPath: Directory of an existing pipeline. It is loaded when the
            directory exists, otherwise a blank English pipeline is created.

    Returns:
        The trained pipeline.
    """
    if os.path.isdir(modelPath):
        nlp = spacy.load(modelPath)
        print("Loaded model '%s'" % modelPath)
    else:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")

    # Reuse an existing categorizer, otherwise create one with mutually
    # exclusive classes and put it last in the pipeline.
    if 'textcat' in nlp.pipe_names:
        textcat = nlp.get_pipe("textcat")
    else:
        print("create pipe 'textcat'...")
        textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True})
        nlp.add_pipe(textcat, last=True)

    # Register every label from the module-level LABEL collection.
    for label in LABEL:
        print("...adding new label '" + label + "'...")
        textcat.add_label(label)

    # Only the categorizer stays enabled while training.
    frozen = [name for name in nlp.pipe_names if name != 'textcat']
    with nlp.disable_pipes(*frozen):
        optimizer = nlp.begin_training()
        print("Training model...")
        for _ in range(iterations):
            random.shuffle(train_data)
            losses = {}
            batch_sizes = compounding(4.0, 32.0, 1.001)
            for batch in minibatch(train_data, size=batch_sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
            print("Losses", losses)
    print("Training completed")
    return nlp
def test_blank(lang):
    """Smoke test: creating a blank pipeline for ``lang`` must not raise.

    NOTE(review): ``lang`` is presumably supplied by a fixture or
    parametrization defined elsewhere -- confirm.
    """
    spacy.blank(lang)
def main(model=None, new_model_name="entity", output_dir="entityModel", n_iter=30):
    """Train the NER on three extra labels and verify the save/load cycle.

    Args:
        model: Name or path passed to ``spacy.load``. A blank English
            pipeline is created when this is ``None``.
        new_model_name: Stored in ``nlp.meta["name"]`` before saving.
        output_dir: Directory to save the pipeline to. Nothing is saved when
            this is ``None``.
        n_iter: Number of passes over ``TRAIN_DATA``.

    Raises:
        AssertionError: If the reloaded recognizer's ``move_names`` differ
            from those recorded before training.
    """
    def report_entities(doc):
        # One "label text" line per predicted entity.
        for ent in doc.ents:
            print(ent.label_, ent.text)

    random.seed(0)
    starting_fresh = model is None
    if starting_fresh:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)

    # Reuse an existing recognizer, otherwise create the built-in one.
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)

    # Adding extraneous labels shouldn't mess anything up.
    for label in (LABEL1, LABEL2, LABEL3):
        ner.add_label(label)

    if starting_fresh:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    # Remembered so the reloaded pipeline can be compared against it.
    move_names = list(ner.move_names)

    # These components stay enabled; everything else is disabled.
    keep_enabled = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    frozen = [name for name in nlp.pipe_names if name not in keep_enabled]
    with nlp.disable_pipes(*frozen):
        # One batch-size schedule shared by all iterations.
        sizes = compounding(1.0, 4.0, 1.001)
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print("Losses", losses)

    # Quick check of the trained pipeline.
    test_text = "Should I dump my lights?"
    trained_doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    report_entities(trained_doc)

    if output_dir is None:
        return

    # Save, then reload from disk.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.meta["name"] = new_model_name
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    print("Loading from", output_dir)
    reloaded = spacy.load(output_dir)
    # Check the classes have loaded back consistently.
    assert reloaded.get_pipe("ner").move_names == move_names
    report_entities(reloaded(test_text))
def main(model='/home/sangeetha/Desktop/spa/combine', new_model_name='fund', output_dir='', n_iter=0):
    """Set up the pipeline and entity recognizer, and train the new entities.

    Args:
        model: Name or path passed to ``spacy.load``. A blank English
            pipeline is created when this is ``None``. The argument is now
            honoured; previously the default path was always loaded.
        new_model_name: Stored in ``nlp.meta['name']`` before saving.
        output_dir: Directory to save the pipeline to. Nothing is saved when
            this is ``None``. The default ``''`` becomes ``Path('')``, i.e.
            the current directory.
        n_iter: Number of passes over ``TRAIN_DATA``. With the default of 0
            no training happens.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline;
    # otherwise get it, so labels can be added to it.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe('ner')

    for label in (LABEL1, LABEL2, LABEL3, LABEL4, LABEL5, LABEL6):
        ner.add_label(label)

    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero
        # out existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # Disable all other pipes so that only the NER is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Test the trained model on a fixed transcript.
    data = 'hey im from fil how can i help you let me place a buy deal of ten thousand pounds eleven cents in fund demon okay let me reconfirm your deal okay i confirm to place a buy deal amounting ten thousand pounds eleven cents in fund demon with the associated account number one two three nine okay okay thank you'
    doc = nlp(data)
    print("entities in '%s'" % data)
    for ent in doc.ents:
        print(ent.text, ent.label_)

    # Save model to output directory.
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # Test the saved model.
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(data)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
def train_CNN(model=None, output_dir=None, n_iter=20, n_texts=2000):
    """Train a text categorizer with a single ``POSITIVE`` label.

    After every pass over the training data, the loss and the evaluation
    scores returned by ``evaluate`` are printed as one table row.

    Args:
        model: Name or path passed to ``spacy.load``. A blank English
            pipeline is created when this is ``None``.
        output_dir: Directory to save the pipeline to. Nothing is saved when
            this is ``None``.
        n_iter: Number of passes over the training data.
        n_texts: Passed to ``load_data`` as ``limit``.
    """
    if model is None:
        nlp = spacy.blank('en')
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)

    # Reuse an existing categorizer, otherwise create the built-in one and
    # put it last in the pipeline.
    if 'textcat' in nlp.pipe_names:
        textcat = nlp.get_pipe('textcat')
    else:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    textcat.add_label('POSITIVE')

    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)".format(
        n_texts, len(train_texts), len(dev_texts)))
    gold = [{'cats': cats} for cats in train_cats]
    train_data = list(zip(train_texts, gold))

    # Only the categorizer stays enabled while training.
    frozen = [name for name in nlp.pipe_names if name != 'textcat']
    with nlp.disable_pipes(*frozen):
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for _ in range(n_iter):
            losses = {}
            for batch in get_batches(train_data, 'textcat'):
                texts, annotations = zip(*batch)
                # The dropout rate comes from the module-level ``dropout``
                # iterator, one value per update.
                nlp.update(texts, annotations, sgd=optimizer,
                           drop=next(dropout), losses=losses)
            # Evaluate on the dev split using the optimizer's averages.
            with textcat.model.use_params(optimizer.averages):
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            row = (losses['textcat'], scores['textcat_p'],
                   scores['textcat_r'], scores['textcat_f'])
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'.format(*row))

    # Quick check of the trained pipeline.
    test_text = "This movie sucked"
    print(test_text, nlp(test_text).cats)

    if output_dir is None:
        return

    # Save, then reload from disk and classify the same sentence again.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    print("Loading from", output_dir)
    reloaded = spacy.load(output_dir)
    print(test_text, reloaded(test_text).cats)
def DIDA_NER(model=None, output_dir="/home/wangdi498/SpaCy/models", number_iterations=50):
    """Train an entity recognizer on ``TRAIN_DATA`` and test the saved copy.

    Args:
        model: Name or path passed to ``spacy.load``. A blank Chinese
            ('zh') pipeline is created when this is ``None``.
        output_dir: Directory to save the pipeline to. When ``None``,
            nothing is saved and the ``TEST_DATA`` run is skipped.
        number_iterations: Number of passes over ``TRAIN_DATA``.
    """
    def show_entities(pipeline, examples):
        # Print the (text, label) pairs predicted for every example text.
        for text, _ in examples:
            doc = pipeline(text)
            print("\n{}".format([(ent.text, ent.label_) for ent in doc.ents]))

    if model is None:
        nlp = spacy.blank('zh')
        print(
            "In NER the goddamn language model does not exist so 'zh_core_web_sm' will be loaded."
        )
    else:
        nlp = spacy.load(model)
        print(
            "In NER the goddamn language model {} already exists so it will be loaded."
            .format(model))

    if 'ner' in nlp.pipe_names:
        ner = nlp.get_pipe('ner')
    else:
        # create_pipe() only works for components registered with spaCy.
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

    for _, annotations in TRAIN_DATA:
        # Note that .get() may return None here.
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # All other components must be disabled while the NER is trained.
    frozen = [name for name in nlp.pipe_names if name != 'ner']
    with nlp.disable_pipes(*frozen):
        optimizer = nlp.begin_training()
        for iteration in range(number_iterations):
            # Shuffling every round keeps the model from generalising over
            # the order of the examples; dropout makes it randomly give up
            # some features so it does not memorise the training data.
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], drop=0.5, sgd=optimizer,
                           losses=losses)
            print("In round {} of {} NER the loss is {}.".format(
                iteration, number_iterations, losses))

    # For now, test directly on the training data.
    show_entities(nlp, TRAIN_DATA)

    if output_dir is None:
        return

    # Save the trained model.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("\nThe NER model is saved to {}. This is the end of training.".
          format(output_dir))

    # Load the trained model back and run it on the test data.
    print(
        "\nThe NER model is loaded from {}. This is the beginning of testing."
        .format(output_dir))
    nlu = spacy.load(output_dir)
    show_entities(nlu, TEST_DATA)
def main(model=None, output_dir=None, n_iter=100):
    """Train the NER on ``TRAIN_DATA`` with timestamped progress output.

    Args:
        model: Name or path passed to ``spacy.load``. A blank English
            pipeline is created when this is ``None``.
        output_dir: Directory to save the pipeline to. Nothing is saved when
            this is ``None``.
        n_iter: Number of passes over ``TRAIN_DATA``.
    """
    def print_timestamp():
        # Current local time as 'YYYY-MM-DD HH:MM:SS'.
        now = datetime.datetime.fromtimestamp(time.time())
        print (now.strftime('%Y-%m-%d %H:%M:%S'))

    def show_predictions(pipeline):
        # Print entities and per-token entity info for every training text.
        for text, _ in TRAIN_DATA:
            doc = pipeline(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    starting_fresh = model is None
    if starting_fresh:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)

    # Reuse an existing recognizer, otherwise create one at the end.
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)

    # Register every entity label found in the training data.
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Only the recognizer stays enabled while training.
    frozen = [name for name in nlp.pipe_names if name != "ner"]
    with nlp.disable_pipes(*frozen):
        # Weights are initialised only when training a brand-new model.
        if starting_fresh:
            nlp.begin_training()
        for itn in range(n_iter):
            print_timestamp()
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batch_sizes = compounding(1000., 8000., 1.25)
            for batch in minibatch(TRAIN_DATA, size=batch_sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, drop=0.5, losses=losses)
            print("Losses", losses)

    # Mark the end of training.
    print_timestamp()
    print ("CONCLUIDO")

    show_predictions(nlp)

    if output_dir is None:
        return

    # Save, then reload from disk and predict again.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    print("Loading from", output_dir)
    show_predictions(spacy.load(output_dir))
parser = argparse.ArgumentParser() parser.add_argument( "data_dir", type=parse_path, help='Directory to the TDSA dataset e.g. ~/.Bella/Datasets') parser.add_argument("dataset_name", type=str, choices=['Laptop', 'Restaurant', 'Election']) parser.add_argument("--force_space", action='store_true', help=force_space_help) args = parser.parse_args() data_dir = args.data_dir dataset_name = args.dataset_name tokeniser = spacy.blank('en') split_names = ['Train', 'Val', 'Test'] for split_name in split_names: dataset_fp = Path(data_dir, f'{dataset_name} {split_name}') dataset = TargetCollection.load_from_json(dataset_fp) retrieve_target_from_tokens = [] cannot_retrieve_target_from_tokens = [] for target in dataset.data_dict(): text = target['text'] target_word = target['target'].strip() target_start_offset = target['spans'][0][0] target_end_offset = target['spans'][0][1] if args.force_space:
def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
    """Build a blank pipeline on a given vocab and train an entity linker.

    The vocab loaded from ``vocab_path`` should be the one that was used when
    the knowledge base at ``kb_path`` was created.

    :param kb_path: location of the serialized knowledge base.
    :param vocab_path: location of the serialized vocab.
    :param output_dir: directory the trained pipeline is saved to and reloaded
        from; nothing is saved when ``None``.
    :param n_iter: number of training passes over the prepared documents.
    """
    nlp = spacy.blank("en", vocab=Vocab().from_disk(vocab_path))
    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # Sentence boundaries come from a sentencizer; a dependency parser could
    # be used instead for higher accuracy.
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    # Rule-based stand-in for NER that marks "Russ Cochran" as a PERSON in
    # the example data. A realistic application should use a real NER model.
    person_patterns = [
        {"label": "PERSON", "pattern": [{"LOWER": "russ"}, {"LOWER": "cochran"}]},
    ]
    ruler = EntityRuler(nlp)
    ruler.add_patterns(person_patterns)
    nlp.add_pipe(ruler)

    # Create the entity linker, attach the knowledge base and append it.
    if "entity_linker" not in nlp.pipe_names:
        # Demo setting: score with the predicted EL score only, ignoring the
        # prior probability.
        entity_linker = nlp.create_pipe("entity_linker", {"incl_prior": False})
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)

    # Turn the texts into docs so that doc.ents is set, and drop annotated
    # identifiers that the knowledge base does not know.
    known_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
    train_docs = []
    for text, annotation in TRAIN_DATA:
        with nlp.disable_pipes("entity_linker"):
            doc = nlp(text)
        # The annotation dict is filtered in place; its "links" entries are
        # replaced by the cleaned mappings.
        for offset, candidates in annotation["links"].items():
            kept = {}
            for kb_id, value in candidates.items():
                if kb_id not in known_ids:
                    print("Removed", kb_id, "from training because it is not in the KB.")
                    continue
                kept[kb_id] = value
            annotation["links"][offset] = kept
        train_docs.append((doc, annotation))

    # Everything except the linker (and transformer helpers) is disabled
    # while training.
    keep_enabled = ["entity_linker", "trf_wordpiecer", "trf_tok2vec"]
    frozen_pipes = [name for name in nlp.pipe_names if name not in keep_enabled]
    with nlp.disable_pipes(*frozen_pipes):
        # Resets and randomly initialises the weights.
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(train_docs)
            losses = {}
            for batch in minibatch(train_docs, size=compounding(4.0, 32.0, 1.001)):
                docs, annotations = zip(*batch)
                nlp.update(
                    docs,
                    annotations,
                    drop=0.2,  # dropout makes it harder to memorise the data
                    losses=losses,
                    sgd=optimizer,
                )
            print(itn, "Losses", losses)

    # Try out the trained pipeline.
    _apply_model(nlp)

    if output_dir is None:
        return

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print()
    print("Saved model to", output_dir)

    # Reload from disk and try out the saved pipeline as well.
    print("Loading from", output_dir)
    _apply_model(spacy.load(output_dir))
def main(
    model=None,
    new_model_name='named_entity_recognizer',
    output_dir='./model',
    n_iter=30,
    reset_weights=False,
):
    """Set up the pipeline and entity recognizer, and train on ``TRAIN_DATA``.

    :param model: name or path of an existing spaCy model; a blank English
        pipeline is created when ``None``.
    :param new_model_name: value stored in ``nlp.meta['name']`` before saving.
    :param output_dir: directory to save to; nothing is saved when ``None``.
    :param n_iter: number of passes over ``TRAIN_DATA``.
    :param reset_weights: when true, the weights of a loaded ``model`` are
        re-initialised with ``begin_training`` instead of being resumed. The
        original body read this name without defining it, so it is now an
        explicit keyword argument defaulting to ``False``.
    """
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add the entity recognizer if the pipeline lacks one; otherwise fetch it
    # so labels can be added to it.
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # Register every annotated label except "Links", which is skipped.
    for (_, annotations) in TRAIN_DATA:
        for ent in annotations.get('entities'):
            if ent[2] == "Links":
                continue
            ner.add_label(ent[2])

    if model is None or reset_weights:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    # Remembered so the reloaded model can be checked for consistency below.
    move_names = list(ner.move_names)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # Show warnings for misaligned entity spans only once.
        warnings.filterwarnings("once", category=UserWarning, module='spacy')
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # Examples are fed one at a time rather than in minibatches.
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)

    # Test the trained model.
    test_text = 'Amity University'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # Save the model to the output directory.
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print('Saved model to', output_dir)

        # Test the saved model.
        print('Loading from', output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently.
        assert nlp2.get_pipe('ner').move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
def main(model=None, new_model_name="Music Data",
         output_dir=os.path.join(os.path.dirname(os.path.realpath(__file__)), 'ner'),
         n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entity.

    The data comes from ``DatasetAnalysis().format_traindata()``: the first
    1000 items are held out for evaluation and the rest are used for training.

    :param model: name or path of an existing spaCy model; a blank English
        pipeline is created when ``None``.
    :param new_model_name: value stored in ``nlp.meta["name"]`` before saving.
    :param output_dir: directory to save to; defaults to a ``ner`` folder next
        to this file (joined with the platform separator rather than a
        hard-coded backslash). Nothing is saved when ``None``.
    :param n_iter: number of passes over the training data.
    """
    LABEL_1 = 'Artist'
    stat = DatasetAnalysis()
    data = stat.format_traindata()
    TEST_DATA = data[0:1000]
    TRAIN_DATA = data[1000:]

    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Add the entity recognizer if the pipeline lacks one; otherwise fetch it
    # so the label can be added to it.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label(LABEL_1)  # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()

    # Disable every other pipe so that only NER is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        # One shared schedule: batch sizes keep compounding across epochs.
        sizes = compounding(1.0, 4.0, 1.001)
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print("Losses", losses)

    # Save the model to the output directory.
    if output_dir is not None:
        output_dir = Path(output_dir)
        # Create missing parents too, and tolerate an existing directory.
        output_dir.mkdir(parents=True, exist_ok=True)
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    # Test the trained model: precision, recall and F1 on the held-out data.
    # NOTE(review): stat.evaluate is not given the pipeline; presumably it
    # loads the saved model itself - confirm.
    results = stat.evaluate(TEST_DATA)
    print(f'Precision metric score is {results[0]} ')
    print(f'Recall accuracy metric score is {results[1]} ')
    print(f'F1 accuracy metric score is {results[2]} ')
import spacy
from spacy.tokens import Span

nlp = spacy.blank("zh")


# Define the method
def to_html(span, tag):
    """Return the span's text wrapped in the given HTML tag."""
    return "<{0}>{1}</{0}>".format(tag, span.text)


# Register the Span method extension "to_html" with the to_html function
Span.set_extension("to_html", method=to_html)

# Process the text, then call to_html on a span with the tag name "strong"
doc = nlp("大家好,这是一个句子。")
span = doc[0:3]
print(span._.to_html("strong"))
def batch_train(model=None, blank_model='de', output_dir=None, dropout=0.2,
                n_iter=12, eval_split=0.3, train_data=None, n_samples=None,
                new_label=None):
    """Load the model, set up the pipeline and train the entity recognizer.

    After every epoch the model is evaluated on a held-out split and written
    to ``output_dir`` whenever its F-score beats the best one seen so far.

    :param model: name or path of an existing spaCy model to continue from.
    :param blank_model: language code for a blank pipeline, used only when
        ``model`` is ``None``.
    :param output_dir: directory the best model is saved to; nothing is saved
        when ``None``.
    :param dropout: accepted for interface compatibility. NOTE(review): it is
        not used; the dropout rates are hard-coded (0.35 with ``new_label``,
        otherwise 0.5).
    :param n_iter: number of training epochs.
    :param eval_split: fraction of the data held out for evaluation.
    :param train_data: sequence of ``(text, annotations)`` pairs.
    :param n_samples: optional cap on the number of samples used.
    :param new_label: optional extra entity label to add before training.
    :return: the trained pipeline, or the string ``"nothing"`` when neither
        ``model`` nor ``blank_model`` is given.
    """
    abs_start_time = datetime.datetime.now()
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '{}'".format(model))
    elif blank_model is not None:
        nlp = spacy.blank(blank_model)  # create blank Language class
        print("Created blank model from '{}'".format(blank_model))
    else:
        print("You have to add either a model or a blank model")
        print("No training possible")
        return "nothing"

    # Add the entity recognizer if the pipeline lacks one; otherwise fetch it
    # so labels can be added.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    if new_label is not None:
        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.entity.create_optimizer()
        ner.add_label(new_label)

    TRAIN_DATA = train_data
    if n_samples is not None:
        TRAIN_DATA = TRAIN_DATA[:n_samples]

    train, test = train_test_split(TRAIN_DATA, test_size=eval_split)
    print("{} train vs {} test samples".format(len(train), len(test)))

    # Register every entity label found in the training split.
    for _, annotations in train:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Disable every other pipe so that only NER is trained.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    prev_acc = 0.0
    with nlp.disable_pipes(*other_pipes):
        # Reset and initialise the weights randomly, but only when training
        # a new model.
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            print("Iteration Number: {}".format(itn))
            start_time = datetime.datetime.now()
            random.shuffle(train)
            losses = {}
            batches = minibatch(train, size=compounding(4.0, 16.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                # With a new label the explicit optimizer and a lower
                # dropout are used; otherwise the pipeline's default.
                if new_label:
                    update_kwargs = {"sgd": optimizer, "drop": 0.35}
                else:
                    update_kwargs = {"drop": 0.5}
                try:
                    nlp.update(texts, annotations, losses=losses,
                               **update_kwargs)
                except Exception as e:
                    # Still best-effort: skip the failing batch, but report
                    # why it failed instead of discarding the error.
                    print("Skipping batch after error: {!r}".format(e))
                    print(annotations)
            print("Losses", losses)

            end_time = datetime.datetime.now()
            print("Duration: {}".format(end_time - start_time))
            results = evaluate(nlp, test)
            print("p: {}; f: {}; r: {}".format(results['ents_p'],
                                               results['ents_f'],
                                               results['ents_r']))
            acc = float(results['ents_f'])
            # Keep only the model with the best F-score so far.
            if acc > prev_acc:
                if output_dir is not None:
                    output_dir = Path(output_dir)
                    # Create missing parents; tolerate an existing directory.
                    output_dir.mkdir(parents=True, exist_ok=True)
                    nlp.to_disk(output_dir)
                    print("Saved model to: {}".format(output_dir))
                prev_acc = acc
            print("######################")

    abs_end_time = datetime.datetime.now()
    print("######################")
    print("######################")
    print("######################")
    overall_duration = str(abs_end_time - abs_start_time)
    print("Overal duration: {}".format(overall_duration))
    if output_dir is not None:
        print("model with f1 score: {} saved to location: {}".format(
            prev_acc, output_dir))
    return nlp
# NOTE(review): the save(...) calls below look like the tail of a
# preprocessing routine whose header lies outside this excerpt; each one
# passes an output path taken from args, an object, and a short description.
save(args.char_emb_file, char_emb_mat, message="char embedding")
save(args.train_eval_file, train_eval, message="train eval")
save(args.dev_eval_file, dev_eval, message="dev eval")
save(args.word2idx_file, word2idx_dict, message="word dictionary")
save(args.char2idx_file, char2idx_dict, message="char dictionary")
save(args.dev_meta_file, dev_meta, message="dev meta")
#save(args.knowledge_edges_file,edges_dict,message="knowledge graph edges")

if __name__ == '__main__':
    # Get command-line args
    args_ = get_setup_args()
    # Download resources
    download(args_)
    # Import spacy language model
    nlp = spacy.blank("en")
    # Preprocess dataset: derive the local data paths from the download URLs
    args_.train_file = url_to_data_path(args_.train_url)
    args_.dev_file = url_to_data_path(args_.dev_url)
    if args_.include_test_examples:
        args_.test_file = url_to_data_path(args_.test_url)
    # The GloVe directory is the archive path with '.zip' removed
    glove_dir = url_to_data_path(args_.glove_url.replace('.zip', ''))
    # File suffix: '.txt' when the directory name already ends in 'd',
    # otherwise the vector dimension is appended as '.<dim>d.txt'
    glove_ext = f'.txt' if glove_dir.endswith(
        'd') else f'.{args_.glove_dim}d.txt'
    args_.glove_file = os.path.join(glove_dir, os.path.basename(glove_dir) + glove_ext)
    pre_process(args_)
def train(model, train_data, dev_data, test_data, output_dir, n_iter, meta_overrides):
    """Load the model, set up the pipeline and train the entity recognizer.

    The model is saved after every epoch to ``<output_dir>/<epoch>`` and
    scored on ``dev_data``; the epoch with the highest overall F1 is copied to
    ``<output_dir>/best`` and evaluated on the dev and test data.

    :param model: name or path of an existing spaCy model; a blank English
        pipeline is created when ``None``.
    :param train_data: list of ``(text, annotations)`` pairs; shuffled in
        place every epoch.
    :param dev_data: data used to select the best epoch.
    :param test_data: data used for the final evaluation.
    :param output_dir: directory receiving per-epoch models, the best model
        and the metric dumps.
    :param n_iter: number of training epochs.
    :param meta_overrides: optional path to a JSON file whose contents are
        merged into ``nlp.meta``.
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    if meta_overrides is not None:
        # Close the file deterministically instead of leaking the handle.
        with open(meta_overrides) as meta_file:
            metadata = json.load(meta_file)
        nlp.meta.update(metadata)

    # Training runs on whitespace tokenization; the original tokenizer is
    # kept so that it can be restored when the model is saved.
    original_tokenizer = nlp.tokenizer
    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    # Add NER after the parser, else after the tagger, else last; if it is
    # already present, fetch it so labels can be added.
    if 'ner' in nlp.pipe_names:
        ner = nlp.get_pipe('ner')
    else:
        ner = nlp.create_pipe('ner')
        if "parser" in nlp.pipe_names:
            nlp.add_pipe(ner, after="parser")
        elif "tagger" in nlp.pipe_names:
            nlp.add_pipe(ner, after="tagger")
        else:
            nlp.add_pipe(ner, last=True)

    # Register every entity label found in the training data.
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # Every other pipe is disabled while training.
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    # Dropout and batch-size schedules, overridable via environment options.
    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.005))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 32),
                                   util.env_opt('batch_compound', 1.001))

    optimizer = nlp.begin_training()
    best_epoch = 0
    best_f1 = 0
    for i in range(n_iter):
        random.shuffle(train_data)
        count = 0
        losses = {}
        total = len(train_data)
        with nlp.disable_pipes(*other_pipes):  # only train NER
            with tqdm.tqdm(total=total, leave=True) as pbar:
                for batch in minibatch(train_data, size=batch_sizes):
                    docs, golds = zip(*batch)
                    nlp.update(docs, golds, sgd=optimizer,
                               losses=losses, drop=next(dropout_rates))
                    pbar.update(len(batch))
                    # Report the running loss every 100 batches.
                    if count % 100 == 0 and count > 0:
                        print('sum loss: %s' % losses['ner'])
                    count += 1

        # Save this epoch's model; create missing parents as well so a
        # not-yet-existing output_dir does not abort training.
        output_dir_path = Path(output_dir) / str(i)
        output_dir_path.mkdir(parents=True, exist_ok=True)

        with nlp.use_params(optimizer.averages):
            # NOTE(review): the original tokenizer is put back for saving
            # and never swapped out again, so epochs after the first appear
            # to train with it rather than the whitespace tokenizer. Left
            # unchanged - confirm whether this is intended.
            nlp.tokenizer = original_tokenizer
            nlp.to_disk(output_dir_path)
            print("Saved model to", output_dir_path)

        # Reload the saved model and score it on the dev data.
        print("Loading from", output_dir_path)
        nlp2 = util.load_model_from_path(output_dir_path)
        nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

        metrics = evaluate_ner(nlp2, dev_data)
        if metrics["f1-measure-overall"] > best_f1:
            best_f1 = metrics["f1-measure-overall"]
            best_epoch = i

    # Copy the best epoch to <output_dir>/best, replacing any earlier copy.
    best_model_path = Path(output_dir) / "best"
    print(f"Best Epoch: {best_epoch} of {n_iter}")
    if os.path.exists(best_model_path):
        shutil.rmtree(best_model_path)
    shutil.copytree(os.path.join(output_dir, str(best_epoch)),
                    best_model_path)

    # Evaluate the best model and dump the dev and test metrics.
    print("Loading from", best_model_path)
    nlp2 = util.load_model_from_path(best_model_path)
    nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

    evaluate_ner(nlp2, dev_data, dump_path=os.path.join(output_dir, "dev_metrics.json"))
    evaluate_ner(nlp2, test_data, dump_path=os.path.join(output_dir, "test_metrics.json"))
def _retrain_model(self):
    """
    Update an existing spaCy model with labelled training data.

    Trains the NER component on self.train, records a per-epoch loss for the
    training data (and for self.validation when present), saves the model
    under self.path + self.model and stores evaluation metrics in
    self.metrics.
    """

    def score(samples):
        # Build (doc, gold) pairs from the samples and score them.
        pairs = []
        for sample in samples:
            doc = nlp.make_doc(sample[0])
            pairs.append((doc, GoldParse(doc, entities=sample[1]["entities"])))
        return nlp.evaluate(pairs)

    # Load the model: a blank Language class (e.g. en) when blank=true,
    # otherwise an existing model. Custom models are loaded from a directory
    # under self.path.
    if self.blank:
        nlp = spacy.blank(self.base_model)
    else:
        if self.custom:
            self.base_model = self.path + self.base_model + "/"
        nlp = spacy.load(self.base_model)

    # Debug information is printed to the terminal and logs if debug = true
    if self.debug:
        self._print_log(7)

    # Fetch the NER component, creating and appending it if it is missing.
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)

    # Register every entity label found in the training data.
    for _, annotations in self.train:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Retrain with every pipe other than NER disabled.
    frozen_pipes = [name for name in nlp.pipe_names if name != "ner"]
    with nlp.disable_pipes(*frozen_pipes):
        # Per-epoch losses for the training and test data.
        self.losses_train = []
        self.losses_test = []

        # Weights are reset and randomly initialised only for a new model.
        if self.blank:
            nlp.begin_training()

        for epoch in range(self.epochs):
            epoch_label = 'Epoch {}'.format(epoch+1)

            random.shuffle(self.train)
            losses = {}
            for batch in minibatch(self.train, size=self.batch_size):
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=self.drop,  # dropout makes it harder to memorise data
                    losses=losses,
                )
            self.losses_train.append((epoch_label, losses['ner']))

            if self.debug:
                self._print_log(8)

            # When a test dataset is available, collect its losses as well.
            if self.validation is not None:
                losses = {}
                for batch in minibatch(self.validation, size=self.batch_size):
                    texts, annotations = zip(*batch)
                    # NOTE(review): sgd=None is meant to leave the weights
                    # untouched, but spaCy v2's Language.update appears to
                    # fall back to a default optimizer when sgd is None -
                    # confirm this pass does not train on validation data.
                    nlp.update(
                        texts,
                        annotations,
                        sgd = None,
                        losses=losses,
                    )
                self.losses_test.append((epoch_label, losses['ner']))

                if self.debug:
                    self._print_log(9)

    # Save the model to the output directory.
    output_dir = pathlib.Path(self.path + self.model + '/')
    if not output_dir.exists():
        output_dir.mkdir(parents=True, exist_ok=False)
    nlp.to_disk(output_dir)

    if self.debug:
        self._print_log(10)

    # Evaluate: scores for the training data come first, then the test data.
    self.metrics = self._prep_scores(score(self.train))
    if self.validation is not None:
        test_scores = self._prep_scores(score(self.validation), subset='test')
        self.metrics = pd.concat([self.metrics, test_scores], ignore_index=True)

    # Append the loss metrics in the same order.
    train_losses = self._prep_losses(self.losses_train)
    self.metrics = pd.concat([self.metrics, train_losses], ignore_index=True)
    if self.validation is not None:
        test_losses = self._prep_losses(self.losses_test, subset='test')
        self.metrics = pd.concat([self.metrics, test_losses], ignore_index=True)
def main(json_data, model=None, output_dir=None, n_iter=10, n_texts=2000,
         init_tok2vec=None):
    """Train a text classifier on the intents loaded from ``json_data``.

    :param json_data: input passed to ``load_data`` to obtain the intents and
        the train/dev splits.
    :param model: name or path of an existing spaCy model; a blank English
        pipeline is created when ``None``.
    :param output_dir: directory the trained pipeline is saved to (created up
        front when missing); nothing is saved when ``None``.
    :param n_iter: number of training epochs.
    :param n_texts: maximum number of training texts used.
    :param init_tok2vec: optional path object whose bytes initialise the
        classifier's tok2vec weights.
    :return: the trained pipeline.
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    if model is None:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)

    # Fetch the text classifier, creating and appending it when missing.
    if "textcat" in nlp.pipe_names:
        textcat = nlp.get_pipe("textcat")
    else:
        textcat_config = {
            "exclusive_classes": True,
            "architecture": "simple_cnn",
        }
        textcat = nlp.create_pipe("textcat", config=textcat_config)
        nlp.add_pipe(textcat, last=True)

    # Load the dataset and cap the number of training examples.
    print("Loading data...")
    Intents, (train_texts, train_cats), (dev_texts, dev_cats) = load_data(json_data)
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]
    print("Using {} examples ({} training, {} evaluation)".format(
        n_texts, len(train_texts), len(dev_texts)))
    cats_annotations = [{"cats": cats} for cats in train_cats]
    train_data = list(zip(train_texts, cats_annotations))

    # Every intent becomes a label of the text classifier.
    for intent in Intents:
        textcat.add_label(intent)

    # Everything except textcat (and transformer helpers) is disabled while
    # training.
    keep_enabled = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    frozen_pipes = [name for name in nlp.pipe_names if name not in keep_enabled]
    with nlp.disable_pipes(*frozen_pipes):
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        # One shared schedule: batch sizes keep compounding across epochs.
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for _ in range(n_iter):
            losses = {}
            random.shuffle(train_data)
            for batch in minibatch(train_data, size=batch_sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
            # Evaluate with the averaged parameters on the dev split that
            # load_data() set aside.
            with textcat.model.use_params(optimizer.averages):
                scores = evaluate(Intents, nlp.tokenizer, textcat, dev_texts,
                                  dev_cats)
            row = "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
                losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
            )
            print(row)  # one row of a simple results table

    # Try out the trained model.
    test_text = "add an item to todo list"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # The reload-and-retest step is commented out in the original:
        # print("Loading from", output_dir)
        # nlp2 = spacy.load(output_dir)
        # doc2 = nlp2(test_text)
        # print(test_text, doc2.cats)

    return nlp
def nlp():
    """Return a blank English spaCy pipeline."""
    blank_english = spacy.blank("en")
    return blank_english
def cargar_modelo(dim_modelo, lenguaje, maxima_longitud=None):
    """
    Load and return a spaCy language model of the size and language \
    chosen by the user. More information about these models is available \
    on the spaCy site (https://spacy.io/models/).

    :param dim_modelo: (str). Size of the spaCy model to use. It can be \
        one of the following options, case-insensitive: |ul| \
        |li| small: {'pequeño', 'pequeno', 'small', 's', 'sm'} |/li| \
        |li| medium: {'mediano', 'medio', 'md', 'medium', 'm'} |/li| \
        |li| large: {'grande', 'large', 'lg', 'gr'} |/li| |/ul| \
        Larger models may support their features for a wider vocabulary, \
        at the cost of a larger file. If a model of the given language \
        and size is not installed, the function downloads it; once it has \
        been downloaded the user must run the function again.
    :param lenguaje: (str). Language code of the spaCy model to load. \
        See https://spacy.io/models/ for the available languages.
    :param maxima_longitud: (int), default: None. Optional maximum \
        length (number of characters) accepted for an input text. When \
        left as None, spaCy's own default is kept.
    :return: spaCy model of the requested size and language. If the \
        size is not recognised, or the model is not installed, a blank \
        model of the requested language is returned instead; in the \
        latter case a download is attempted first, so that the next call \
        can return the real model.
    """
    # Imported locally so this function does not rely on file-level imports.
    import subprocess
    import sys

    tamanos = {
        "lg": ("grande", "large", "lg", "gr"),
        "md": ("mediano", "medio", "md", "medium", "m"),
        "sm": ("pequeño", "pequeno", "small", "s", "sm"),
    }
    dim_modelo = dim_modelo.lower()
    # Normalise the model size to spaCy's suffix.
    for sufijo, alias in tamanos.items():
        if dim_modelo in alias:
            dim_modelo = sufijo
            break
    else:
        # Unrecognised size: fall back to a blank model.
        return spacy.blank(lenguaje)

    # English models are named "*_core_web_*", all others "*_core_news_*".
    if lenguaje == "en":
        language_model = f"{lenguaje}_core_web_{dim_modelo}"
    else:
        language_model = f"{lenguaje}_core_news_{dim_modelo}"

    try:
        modelo = spacy.load(language_model)
    # Exception rather than BaseException, so that KeyboardInterrupt and
    # SystemExit are no longer swallowed.
    except Exception:
        print(("[INFO] Descargando modelo. Este proceso puede "
               "tardar varios minutos.\n"))
        try:
            # Use the running interpreter and an argument list (no shell);
            # check=True turns a failed download into an exception instead
            # of a silently ignored exit status.
            subprocess.run(
                [sys.executable, "-m", "spacy", "download", language_model],
                check=True,
            )
        except (OSError, subprocess.CalledProcessError):
            print(("\n[INFO] El modelo no pudo ser descargado, "
                   "se cargará un modelo vacío.\n"))
        else:
            print("\n[INFO] El modelo ha sido descargado.")
            print(("[INFO] Por favor correr de nuevo el script, "
                   "o iniciar una nueva sesión de Python para cargarlo."))
            print("[INFO] Hasta entonces, se utilizará un modelo en blanco.\n")
        # In either case a blank model is used for this call.
        modelo = spacy.blank(lenguaje)

    # Apply the maximum length, if one was given.
    if maxima_longitud is not None:
        modelo.max_length = maxima_longitud
    return modelo
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
    """Train a POSITIVE/NEGATIVE text classifier on the IMDB data.

    :param model: name or path of an existing spaCy model; a blank English
        pipeline is created when ``None``.
    :param output_dir: directory the trained pipeline is saved to and reloaded
        from (created up front when missing); nothing is saved when ``None``.
    :param n_iter: number of training epochs.
    :param n_texts: maximum number of training texts used.
    :param init_tok2vec: optional path object whose bytes initialise the
        classifier's tok2vec weights.
    """
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    if model is None:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)

    # Fetch the text classifier, creating and appending it when missing.
    if "textcat" in nlp.pipe_names:
        textcat = nlp.get_pipe("textcat")
    else:
        textcat_config = {
            "exclusive_classes": True,
            "architecture": "simple_cnn",
        }
        textcat = nlp.create_pipe("textcat", config=textcat_config)
        nlp.add_pipe(textcat, last=True)

    for label in ("POSITIVE", "NEGATIVE"):
        textcat.add_label(label)

    # Load the IMDB dataset and cap the number of training examples.
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]
    summary = "Using {} examples ({} training, {} evaluation)".format(
        n_texts, len(train_texts), len(dev_texts)
    )
    print(summary)
    cats_annotations = [{"cats": cats} for cats in train_cats]
    train_data = list(zip(train_texts, cats_annotations))

    # Every pipe other than textcat is disabled while training.
    frozen_pipes = [name for name in nlp.pipe_names if name != "textcat"]
    with nlp.disable_pipes(*frozen_pipes):
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        # One shared schedule: batch sizes keep compounding across epochs.
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            random.shuffle(train_data)
            for batch in minibatch(train_data, size=batch_sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
            # Evaluate with the averaged parameters on the dev split that
            # load_data() set aside.
            with textcat.model.use_params(optimizer.averages):
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            row = "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(
                losses["textcat"],
                scores["textcat_p"],
                scores["textcat_r"],
                scores["textcat_f"],
            )
            print(row)  # one row of a simple results table

    # Try out the trained model.
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is None:
        return

    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # Reload from disk and try out the saved model as well.
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(test_text)
    print(test_text, doc2.cats)
def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entity.

    :param model: name or path of an existing spaCy model; a blank English
        pipeline is created when ``None``.
    :param new_model_name: value stored in ``nlp.meta['name']`` before saving.
    :param output_dir: directory the trained pipeline is saved to and reloaded
        from; nothing is saved when ``None``.
    :param n_iter: number of passes over ``TRAIN_DATA``.
    """
    if model is None:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)

    # Fetch the entity recognizer, creating and adding it when missing.
    if 'ner' in nlp.pipe_names:
        ner = nlp.get_pipe('ner')
    else:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)

    ner.add_label(LABEL)  # add new entity label to entity recognizer

    # 'begin_training' initializes the models and so would zero out existing
    # entity types; a loaded model therefore only gets a fresh optimizer.
    if model is not None:
        optimizer = nlp.entity.create_optimizer()
    else:
        optimizer = nlp.begin_training()

    # Every pipe other than NER is disabled while training.
    frozen_pipes = [name for name in nlp.pipe_names if name != 'ner']
    with nlp.disable_pipes(*frozen_pipes):
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # The batch-size schedule restarts every epoch.
            batch_sizes = compounding(4., 32., 1.001)
            for batch in minibatch(TRAIN_DATA, size=batch_sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses', losses)

    # Try out the trained model.
    test_text = 'Do you like horses?'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    if output_dir is None:
        return

    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.meta['name'] = new_model_name  # rename model
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # Reload from disk and try out the saved model as well.
    print("Loading from", output_dir)
    nlp2 = spacy.load(output_dir)
    doc2 = nlp2(test_text)
    for ent in doc2.ents:
        print(ent.label_, ent.text)
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    """Train a 'POSITIVE' text classifier on the IMDB data and optionally save it.

    model: name or path handed to spacy.load; when None a blank 'en'
        pipeline is created instead.
    output_dir: directory the trained pipeline is written to (created if
        it does not exist); nothing is saved when None.
    n_iter: number of training passes over the training split.
    n_texts: limit passed to load_data().
    """
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    textcat.add_label('POSITIVE')

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        # Serialise with the averaged parameters so the model on disk is the
        # same one whose dev scores were printed during training.
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
import blingfire
import nltk
import pysbd
import spacy
import stanza
import syntok
from syntok.tokenizer import Tokenizer
import syntok.segmenter as syntok_segmenter

from english_golden_rules import GOLDEN_EN_RULES

# pySBD segmenter for English; no text cleaning, no character spans.
pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

# Blank English pipeline carrying only the rule-based sentencizer.
nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe("sentencizer"))

# Small English model loaded with its NER component disabled.
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])

# Model download is left disabled here: stanza.download('en')
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

syntok_tokenizer = Tokenizer()


def blingfire_tokenize(text):
    """Return the sentences BlingFire finds in `text` as a list of strings."""
    # text_to_sentences gives one string; it is split on newlines here
    joined = blingfire.text_to_sentences(text)
    return joined.split('\n')


def nltk_tokenize(text):
    """Return the result of nltk.sent_tokenize for `text`."""
    sentences = nltk.sent_tokenize(text)
    return sentences


def pysbd_tokenize(text):
    """Return the segments the module-level pySBD segmenter yields for `text`."""
    segments = pysbd_segmenter.segment(text)
    return segments
from ordered_set import OrderedSet from spacy.tokenizer import Tokenizer from spacy.lang.en import English import torch import ujson as json import bisect from typing import Dict import random import numpy as np import linecache # import nltk.data import time nlp = spacy.blank("en") # the tokenizer # nlp = nltk.data.load('tokenizers/punkt/english.pickle') # nlp = English() # tokenizer = Tokenizer(nlp.vocab) # tokenizer = English().Defaults.create_tokenizer(nlp) # justatest = [] wordlist = OrderedSet( ) # an Ordered Set which will contain all words of the data once def token(string): words = nlp(string) # tokenizing of the sentence index = [ ] # an array for the index numbers, which will replace that sentence for i in words: # for all words:
import spacy

nlp = spacy.blank("ja")

# NOTE(review): the `____` names below look like fill-in-the-blank
# placeholders for an exercise; they are intentionally left untouched.

# Import the Doc class
from ____ import ____

# Text to create: "spaCyは素晴らしい!"
words = ["spaCy", "は", "素晴らしい", "!"]
spaces = [False, False, False, False]

# Create a Doc from words and spaces
doc = ____(____, words=words, spaces=spaces)
print(doc.text)
import tensorflow as tf
import random
from tqdm import tqdm
import spacy
import ujson as json
from collections import Counter
import numpy as np

# Blank English pipeline, used by word_tokenize below.
nlp = spacy.blank("en")


def word_tokenize(sent):
    """Return the text of every token spaCy produces for `sent`."""
    return [tok.text for tok in nlp(sent)]


def convert_idx(text, tokens):
    """Locate each token in `text` and return its (start, end) offsets.

    Tokens are searched for left to right, each search starting where the
    previous token ended. If a token cannot be found, a message is printed
    and a bare Exception is raised.
    """
    spans = []
    cursor = 0
    for tok in tokens:
        start = text.find(tok, cursor)
        if start < 0:
            print("Token {} cannot be found".format(tok))
            raise Exception()
        end = start + len(tok)
        spans.append((start, end))
        cursor = end
    return spans


def process_file(filename, data_type, word_counter, char_counter):
    # NOTE(review): only this first statement is visible here; kept as-is.
    print("Generating {} examples...".format(data_type))
import spacy
import random
import json

# Load the training examples. The encoding is given explicitly so the JSON
# file is decoded as UTF-8 regardless of the platform's default encoding.
with open("exercises/de/gadgets.json", encoding="utf-8") as f:
    TRAINING_DATA = json.loads(f.read())

# Blank German pipeline with a new entity recognizer and the GADGET label
nlp = spacy.blank("de")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
ner.add_label("GADGET")

# Start the training
nlp.begin_training()

# Run the loop 10 times
for itn in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}

    # Split the examples into batches and iterate over the batches
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]

        # Update the model
        nlp.update(texts, annotations, losses=losses)
    print(losses)
import spacy spacy.require_gpu() from pathlib import Path import random from spacy.util import minibatch, compounding #import xx_ent_wiki_sm from spacy.lang.pt import Portuguese from ast import literal_eval import datetime import time output_dir = "./sky_ner" modelDir = Path(output_dir) nlp = spacy.blank('pt') ts = time.time() st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') print (st) if modelDir.exists() is True: # training data TRAIN_DATA = open('dataset_new.txt', 'r').read() print('Dados carregados') try: TRAIN_DATA = literal_eval(TRAIN_DATA) print('literal eval aplicado') except: print('Falha ao aplicar eval') if 'ner' not in nlp.pipe_names:
# Importe spaCy import spacy # Crée l'objet nlp français nlp = spacy.blank("fr") # Traite un texte doc = nlp("Ceci est une phrase.") # Affiche le texte du document print(doc.text)
def main(model=None, output_dir=None, n_iter=100):
    """Train the entity recognizer on TRAIN_DATA and optionally save it.

    model: name or path handed to spacy.load; when None a blank 'en'
        pipeline is created and its weights are initialized.
    output_dir: directory the trained pipeline is written to (created if
        it does not exist); nothing is saved when None.
    n_iter: number of passes over TRAIN_DATA.
    """

    def show_predictions(pipeline):
        # print entities and per-token entity info for every training text
        for text, _ in TRAIN_DATA:
            doc = pipeline(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    starting_blank = model is None
    if starting_blank:
        nlp = spacy.blank("en")  # fresh Language object
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)  # reuse an existing pipeline
        print("Loaded model '%s'" % model)

    # Fetch the entity recognizer, creating and appending it first when the
    # pipeline does not have one yet (create_pipe handles built-ins).
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)

    # register every label that occurs in the training annotations
    for _, example_annots in TRAIN_DATA:
        for entity in example_annots.get("entities"):
            ner.add_label(entity[2])

    # every component except NER is switched off while training
    frozen = [name for name in nlp.pipe_names if name != "ner"]
    with nlp.disable_pipes(*frozen):
        # weights are reset and randomly initialized only for a new model
        if starting_blank:
            nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # spaCy's minibatch with a compounding batch size
            for chunk in minibatch(TRAIN_DATA,
                                   size=compounding(4.0, 32.0, 1.001)):
                texts, annotations = zip(*chunk)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # check the trained pipeline on the training texts
    show_predictions(nlp)

    if output_dir is None:
        return

    # write the pipeline to disk
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # reload it and run the same check on the saved copy
    print("Loading from", output_dir)
    show_predictions(spacy.load(output_dir))
def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
    """Train the entity recognizer on the new LABEL and optionally save it.

    The random module is seeded with 0 first. When the pipeline is saved,
    it is reloaded and the recognizer's move names are asserted to match
    the ones recorded before training.

    model: name or path handed to spacy.load; when None a blank 'en'
        pipeline is created instead.
    new_model_name: value written to nlp.meta["name"] before saving.
    output_dir: directory the trained pipeline is written to (created if
        it does not exist); nothing is saved when None.
    n_iter: number of passes over TRAIN_DATA.
    """
    random.seed(0)

    starting_blank = model is None
    if starting_blank:
        nlp = spacy.blank("en")  # fresh Language object
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)  # reuse an existing pipeline
        print("Loaded model '%s'" % model)

    # Fetch the entity recognizer, creating and registering it first when
    # the pipeline does not have one yet (create_pipe handles built-ins).
    if "ner" in nlp.pipe_names:
        ner = nlp.get_pipe("ner")
    else:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)

    # register the new entity label, plus an extraneous one that
    # shouldn't mess anything up
    ner.add_label(LABEL)
    ner.add_label("VEGETABLE")

    optimizer = (nlp.begin_training() if starting_blank
                 else nlp.resume_training())

    # remember the move names so the saved copy can be compared later
    move_names = list(ner.move_names)

    # every component except NER is switched off while training
    frozen = [name for name in nlp.pipe_names if name != "ner"]
    with nlp.disable_pipes(*frozen):
        # one batch-size schedule, shared by all iterations
        sizes = compounding(1.0, 4.0, 1.001)
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            chunks = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for chunk in chunks:
                texts, annotations = zip(*chunk)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print("Losses", losses)

    # quick check of the trained pipeline
    test_text = "Do you like horses?"
    print("Entities in '%s'" % test_text)
    for span in nlp(test_text).ents:
        print(span.label_, span.text)

    if output_dir is None:
        return

    # write the pipeline to disk under its new name
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.meta["name"] = new_model_name
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    # reload it and run the same check on the saved copy
    print("Loading from", output_dir)
    reloaded = spacy.load(output_dir)
    # the classes must have loaded back consistently
    assert reloaded.get_pipe("ner").move_names == move_names
    for span in reloaded(test_text).ents:
        print(span.label_, span.text)