import spacy
from spacy.lang.en import English
from spacy.training import Example
from spacy.util import minibatch
from thinc.api import compounding


def test_issue3611():
    """Test whether adding n-grams in the textcat works even when
    n > token length of some docs."""
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")

    # Prepare the training data.
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))

    # Add a text categorizer component with a bag-of-words model and bigrams.
    model = {
        "@architectures": "spacy.TextCatBOW.v1",
        "exclusive_classes": True,
        "ngram_size": 2,
        "no_output_layer": False,
    }
    textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
    for label in unique_classes:
        textcat.add_label(label)

    # Train the network.
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.initialize()
        for i in range(3):
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                nlp.update(examples=batch, sgd=optimizer, drop=0.1, losses=losses)

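def _predicted_cat(nlp, text):
    # Hedged follow-up sketch, not part of the original test: once the
    # textcat component is trained, running a text through the pipeline
    # fills doc.cats with one score per added label; the argmax is the
    # predicted class.
    doc = nlp(text)
    return max(doc.cats, key=doc.cats.get)
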
def test_compounding_rate():
    rates = compounding(1, 16, 1.01)
    rate0 = next(rates)
    assert rate0 == 1.0
    rate1 = next(rates)
    rate2 = next(rates)
    rate3 = next(rates)
    assert rate3 > rate2 > rate1 > rate0
    assert (rate3 - rate2) > (rate2 - rate1) > (rate1 - rate0)

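def _compounding_sketch(start, stop, compound):
    # Hedged sketch, not part of the original suite: a minimal
    # reimplementation of the schedule that test_compounding_rate asserts,
    # assuming the rate is multiplied by `compound` each step and clipped
    # at `stop` (for an increasing schedule). Because the rate grows
    # geometrically, successive differences also grow, which is what the
    # final assertion above checks.
    curr = float(start)
    while True:
        yield min(curr, stop)
        curr *= compound
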
def test_issue4348():
    """Test that training the tagger with empty data doesn't throw errors."""
    nlp = English()
    example = Example.from_dict(nlp.make_doc(""), {"tags": []})
    TRAIN_DATA = [example, example]
    tagger = nlp.add_pipe("tagger")
    tagger.add_label("A")
    optimizer = nlp.initialize()
    for i in range(5):
        losses = {}
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)

def _train_tuples(train_data):
    nlp = English()
    ner = nlp.add_pipe("ner")
    ner.add_label("ORG")
    ner.add_label("LOC")
    train_examples = []
    for t in train_data:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    optimizer = nlp.initialize()
    for i in range(5):
        losses = {}
        batches = minibatch(train_examples, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            nlp.update(batch, sgd=optimizer, losses=losses)

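def _train_tuples_usage_sketch():
    # Hedged usage sketch, not part of the original suite: illustrates the
    # (text, annotations) tuple shape that _train_tuples expects, using the
    # "entities" key with (start, end, label) character offsets understood
    # by Example.from_dict. The sample sentence and offsets are hypothetical.
    train_data = [
        (
            "Google is based in Mountain View",
            {"entities": [(0, 6, "ORG"), (19, 32, "LOC")]},
        ),
    ]
    _train_tuples(train_data)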