# Example #1
# 0
def update_model(name, train_data, n_iter):
    """Train (or create and train) the "ner" pipe of a stored spaCy model.

    Args:
        name: model identifier passed through to load_model()/save_model().
        train_data: iterable of (text, annotations) pairs, where
            annotations["entities"] is a list of (start, end, label) tuples.
        n_iter: number of training epochs to run.

    Returns:
        List of per-epoch stat objects produced by make_stat(), one per epoch.
    """
    nlp = load_model(name)

    # Create the NER pipe if the model does not have one yet.  A freshly
    # created pipe needs begin_training() before its first update (below).
    is_new = "ner" not in nlp.pipe_names
    if is_new:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Register every entity label seen in the training data.
    # Default to [] so examples without an "entities" key don't crash
    # (annotations.get("entities") alone would return None here).
    for _, annotations in train_data:
        for ent in annotations.get("entities", []):
            ner.add_label(ent[2])  # ent is (start, end, label)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]

    stats = []

    # only train NER
    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        if is_new:
            nlp.begin_training()

        for itn in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )

            # Record this epoch's losses together with a timestamp.
            now = dt.datetime.now()
            stat = make_stat(losses, str(now))
            stats.append(stat)

    save_model(nlp, name)

    return stats
# Example #2
# 0
def plot_all_without_sum():
    """Plot, per generated read set, how often the n-th best edge is used.

    Iterates over every (nr_of_reads, average_length) combination, loads the
    per-dataset statistics via stats.make_stat() and saves one PDF plot into
    that dataset's own directory (via os.chdir + a relative savefig path).

    NOTE(review): runs under Python 2 (uses xrange); the hard-coded DIR is
    machine-specific — confirm before reuse.
    """
    DIR = "/run/media/andreas/INTENSO/fastas/artdata/"
    nr_of_reads_list = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    average_length_list = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]
    for nr_of_reads in nr_of_reads_list:
        for average_length in average_length_list:
            curr_DIR = DIR + "shuffled_{}_{}/".format(nr_of_reads, average_length)
            os.chdir(curr_DIR)
            filename = "shuffled_{}_{}".format(nr_of_reads, average_length)
            (nr_best_edge_used, scores) = stats.make_stat(filename)
            # Black line through the counts plus blue dots on each point.
            plt.plot(nr_best_edge_used, 'k', nr_best_edge_used, "bo")
            plt.title(filename)
            plt.grid(True)
            plt.xticks(xrange(len(nr_best_edge_used)))
            plt.yticks(xrange(0, max(nr_best_edge_used) + 10, 5))
            plt.ylabel("How often is the n_th best edge used")
            plt.xlabel("n-th best edge")
            plt.savefig(filename + "_plot.pdf")
            # Bug fix: clear the figure so the next iteration starts fresh;
            # previously every PDF accumulated all earlier curves/ticks.
            plt.clf()