def update_model(name, train_data, n_iter):
    """Train (or keep training) the "ner" pipe of a stored model.

    The pipeline is fetched with ``load_model(name)``, gets an "ner" pipe
    appended if it has none, is updated for ``n_iter`` passes over
    ``train_data`` and is finally written back with ``save_model(nlp, name)``.

    Args:
        name: Identifier handed to ``load_model`` and ``save_model``.
        train_data: Collection of ``(text, annotations)`` pairs. It is
            iterated once to register labels and once per training pass, so
            it must be re-iterable (a list, not a one-shot generator).
            ``annotations`` is a dict; its optional "entities" entry is a
            sequence of tuples whose element at index 2 is the label
            (presumably ``(start, end, label)`` -- confirm against caller).
        n_iter: Number of training passes.

    Returns:
        A list with one ``make_stat(losses, timestamp)`` result per pass.
    """
    nlp = load_model(name)

    is_new = "ner" not in nlp.pipe_names
    if is_new:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every label that occurs in the training data. Examples that
    # carry no "entities" key simply contribute no labels instead of raising
    # a TypeError on iterating None.
    for _, example_annotations in train_data:
        for ent in example_annotations.get("entities", ()):
            ner.add_label(ent[2])

    # Every pipe that is not listed here is disabled while training.
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]

    stats = []
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # Show warnings for misaligned entity spans only once.
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # Only a freshly created "ner" pipe triggers begin_training();
        # an existing one is updated as it is.
        if is_new:
            nlp.begin_training()

        for _ in range(n_iter):
            losses = {}
            # Batch up the examples using spaCy's minibatch helper.
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            # One stat entry per pass, stamped with the time the pass ended.
            now = dt.datetime.now()
            stats.append(make_stat(losses, str(now)))

    # Saved after leaving the ``with`` block, i.e. once the temporarily
    # disabled pipes are active again.
    save_model(nlp, name)
    return stats
def plot_all_without_sum():
    """Write one "<name>_plot.pdf" for every shuffled data set.

    For each combination of read count and average length (both 100..1000 in
    steps of 100) the function changes into the data set's directory below
    the hard-coded base directory, obtains the usage counts from
    ``stats.make_stat`` and saves a plot of them as
    "shuffled_<reads>_<length>_plot.pdf" in that directory.

    Side effects: the process working directory is changed with ``os.chdir``
    and is left at the last data set's directory when the function returns.
    """
    base_dir = "/run/media/andreas/INTENSO/fastas/artdata/"
    nr_of_reads_list = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    average_length_list = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

    for nr_of_reads in nr_of_reads_list:
        for average_length in average_length_list:
            filename = "shuffled_{}_{}".format(nr_of_reads, average_length)
            # make_stat is given the bare name, so it presumably resolves it
            # relative to the current directory -- hence the chdir.
            os.chdir(base_dir + filename + "/")

            # Only the first element of the returned pair is plotted.
            nr_best_edge_used, _scores = stats.make_stat(filename)

            # Start a fresh figure per data set; drawing on the implicit
            # current figure would pile the curves of all earlier data sets
            # into every later PDF.
            plt.figure()
            try:
                # Same data twice: black line plus blue point markers.
                plt.plot(nr_best_edge_used, 'k', nr_best_edge_used, "bo")
                plt.title(filename)
                plt.grid(True)
                # ``range`` instead of the Python-2-only ``xrange``.
                plt.xticks(range(len(nr_best_edge_used)))
                plt.yticks(range(0, max(nr_best_edge_used) + 10, 5))
                plt.ylabel("How often is the n_th best edge used")
                plt.xlabel("n-th best edge")
                plt.savefig(filename + "_plot.pdf")
            finally:
                # Release the figure even if plotting or saving failed.
                plt.close()