Example #1
def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the parser to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "parser" not in nlp.pipe_names:
        parser = nlp.create_pipe("parser")
        nlp.add_pipe(parser, first=True)
    # otherwise, get it, so we can add labels to it
    else:
        parser = nlp.get_pipe("parser")

    # add labels to the parser
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])
Example #2
def train_tensorizer(nlp, texts, dropout, n_iter):
    tensorizer = nlp.create_pipe("tensorizer")
    nlp.add_pipe(tensorizer)
    optimizer = nlp.begin_training()
    for i in range(n_iter):
        losses = {}
        for i, batch in enumerate(minibatch(tqdm.tqdm(texts))):
            docs = [nlp.make_doc(text) for text in batch]
            tensorizer.update(docs, None, losses=losses, sgd=optimizer, drop=dropout)
        print(losses)
    return optimizer
Example #3
def main(model_name, unlabelled_loc):
    n_iter = 10
    dropout = 0.2
    batch_size = 4
    nlp = spacy.load(model_name)
    nlp.get_pipe("ner").add_label(LABEL)
    raw_docs = list(read_raw_data(nlp, unlabelled_loc))
    optimizer = nlp.resume_training()
    # Avoid use of Adam when resuming training. I don't understand this well
    # yet, but I'm getting weird results from Adam. Try commenting out the
    # nlp.update(), and using Adam -- you'll find the models drift apart.
    # I guess Adam is losing precision, introducing gradient noise?
    optimizer.alpha = 0.1
    optimizer.b1 = 0.0
    optimizer.b2 = 0.0

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    sizes = compounding(1.0, 4.0, 1.001)
    with nlp.disable_pipes(*other_pipes):
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            random.shuffle(raw_docs)
            losses = {}
            r_losses = {}
            # batch up the examples using spaCy's minibatch
            raw_batches = minibatch(raw_docs, size=4)
            for batch in minibatch(TRAIN_DATA, size=sizes):
                docs, golds = zip(*batch)
                nlp.update(docs, golds, sgd=optimizer, drop=dropout, losses=losses)
                raw_batch = list(next(raw_batches))
                nlp.rehearse(raw_batch, sgd=optimizer, losses=r_losses)
            print("Losses", losses)
            print("R. Losses", r_losses)
    print(nlp.get_pipe('ner').model.unseen_classes)
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)
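read_raw_data is assumed to be defined elsewhere; a plausible sketch, assuming the unlabelled data is a JSONL file with a "text" field and that one Doc should be yielded per non-empty line for use with nlp.rehearse() above:

import srsly

def read_raw_data(nlp, jsonl_loc):
    # Hypothetical helper: yield a Doc for every non-empty "text" entry.
    for json_obj in srsly.read_jsonl(jsonl_loc):
        if json_obj.get("text", "").strip():
            yield nlp.make_doc(json_obj["text"])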
Example #4
def main(output_dir, model="en_core_web_sm", n_jobs=4, batch_size=1000, limit=10000):
    nlp = spacy.load(model)  # load spaCy model
    print("Loaded model '%s'" % model)
    if not output_dir.exists():
        output_dir.mkdir()
    # load and pre-process the IMDB dataset
    print("Loading IMDB data...")
    data, _ = thinc.extra.datasets.imdb()
    texts, _ = zip(*data[-limit:])
    print("Processing texts...")
    partitions = minibatch(texts, size=batch_size)
    executor = Parallel(n_jobs=n_jobs, backend="multiprocessing", prefer="processes")
    do = delayed(partial(transform_texts, nlp))
    tasks = (do(i, batch, output_dir) for i, batch in enumerate(partitions))
    executor(tasks)
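transform_texts is assumed to be defined elsewhere; a minimal sketch of such a worker, which runs nlp.pipe over one batch and writes the output to a file named after the batch id (the file layout and lemma output are illustrative assumptions):

from pathlib import Path

def transform_texts(nlp, batch_id, texts, output_dir):
    out_path = Path(output_dir) / ("%d.txt" % batch_id)
    if out_path.exists():  # skip batches that were already processed
        return
    with out_path.open("w", encoding="utf8") as f:
        for doc in nlp.pipe(texts):
            f.write(" ".join(tok.lemma_ for tok in doc if not tok.is_space))
            f.write("\n")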
Example #5
def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    for text, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        test_model(nlp2)
Example #6
def train_batch_model(nlp, train_data):

    random.seed(1)
    spacy.util.fix_random_seed(1)
    optimizer = nlp.begin_training()

    losses = {}
    for epoch in range(10):
        random.shuffle(train_data)
        # Create the batch generator with batch size = 8
        batches = minibatch(train_data, size=8)
        # Iterate through minibatches
        for batch in batches:
            # Each batch is a list of (text, label) but we need to
            # send separate lists for texts and labels to update().
            # This is a quick way to split a list of tuples into lists
            texts, labels = zip(*batch)
            nlp.update(texts, labels, sgd=optimizer, losses=losses)
    return nlp
Example #7
    def predict(self, data):
        data = list(data)
        # batches = minibatch(data, size=compounding(10, 128, 1.3))
        batches = minibatch(data, size=32)
        for batch in batches:
            print("batch size: ", len(batch))
            batch = list(batch)
            score = self.model_score(batch)
            for i in range(len(batch)):
                if 'label' not in batch[i].keys():
                    batch[i]['label'] = self.labels[0]
                if self.coll:
                    if 'score' in list(self.coll.find({'text': batch[i]['text']}))[0].keys():
                        current_score = list(self.coll.find({'text': batch[i]['text']}))[0]['score']
                    else:
                        current_score = []
                    current_score.append(float(score[i]))
                    self.coll.update_one({"text": batch[i]['text']}, {"$set": {"score": current_score}})
                yield (float(score[i]), batch[i])
Example #8
    def learn_embeddings(self,
                         train_texts,
                         train_cats,
                         val_texts=None,
                         val_cats=None,
                         force_restart=False,
                         epochs=10,
                         batch_size=compounding(1., 64., 1.001)):
        logger.info("Learning embeddings")
        train_texts = [utils.decode(text) for text in train_texts]
        train_cats = [{'cats': {'POS': bool(cat)}} for cat in train_cats]
        train_data = list(zip(train_texts, train_cats))
        validation = True

        if val_texts is None or val_cats is None or len(val_texts) == 0 or len(
                val_cats) == 0:
            logger.warn(
                "Validation data is either not given or incomplete so evaluation of validation data set will not be done"
            )
            validation = False

        with self.nlp.disable_pipes(*self.other_pipes):
            logger.warn('Disabled following pipes for training: ' +
                        str(self.other_pipes))
            if self.optimizer is None or force_restart:
                self.optimizer = self.nlp.begin_training()
                logger.info("New optimizer created")
            for i in range(epochs):
                losses = {}
                batches = minibatch(train_data, size=batch_size)
                for batch in batches:
                    texts, cats = zip(*batch)
                    self.nlp.update(texts,
                                    cats,
                                    sgd=self.optimizer,
                                    drop=0.2,
                                    losses=losses)
                if validation:
                    self.validate(val_texts, val_cats)
                logger.info("Epoch " + str(i) + " finished, losses: " +
                            str(losses))

        return self.vectorize(train_texts)
Example #9
def train_ner(nlp, train_data, entity_types, n_iter=1000):
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model

        # if model is None:
        nlp.begin_training()

        for itn in range(n_iter):
            random.shuffle(train_data)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)

                print(batch)
                print(texts)
                print(annotations)

                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
                print("Losses", losses)

    return nlp
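train_data here follows spaCy's entity-offset format. A minimal sketch of what such data might look like (character start, character end, and a label for each entity span; the sentences below are illustrative):

train_data = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I like London and Berlin.", {"entities": [(7, 13, "LOC"), (18, 24, "LOC")]}),
]
# nlp = train_ner(spacy.blank("en"), train_data, entity_types=["PERSON", "LOC"])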
Example #10
def main(lang="en", output_dir=None, n_iter=25):
    """Create a new model, set up the pipeline and train the tagger. In order to
    train the tagger with a custom tag map, we're creating a new Language
    instance with a custom vocab.
    """
    nlp = spacy.blank(lang)
    # add the tagger to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    tagger = nlp.create_pipe("tagger")
    # Add the tags. This needs to be done before you start training.
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for i in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, losses=losses)
        print("Losses", losses)

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Tags", [(t.text, t.tag_, t.pos_) for t in doc])
Example #11
def train_cnn(X_train,
              cats_train,
              X_val,
              cats_val,
              X_test,
              cats_test,
              n_iter=5):
    """Train CNN on training data and updates model using validation data. Finally,
    print scores on test data. It also prints training progres."""
    nlp = spacy.load('en_core_web_sm')
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe('textcat')
    textcat.add_label('POSITIVE')

    # only train textcat, disable other pipes in pipeline
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']

    train_data = list(zip(X_train, [{'cats': cats} for cats in cats_train]))

    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for i in range(n_iter):
            losses = {}
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                scores = evaluate(nlp.tokenizer, textcat, X_val, cats_val)
                print(scores)

    # save model to disk
    nlp.to_disk('../models/cnn_classifier')

    print('Scores on test data...')
    print(evaluate(nlp.tokenizer, textcat, X_test, cats_test))
Example #12
def test_train_empty():
    """Test that training an empty text does not throw errors."""
    train_data = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
        ("", {"entities": []}),
    ]

    nlp = English()
    train_examples = []
    for t in train_data:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label("PERSON")
    nlp.initialize()
    for itn in range(2):
        losses = {}
        batches = util.minibatch(train_examples, size=8)
        for batch in batches:
            nlp.update(batch, losses=losses)
Example #13
def test_train_negative_deprecated():
    """Test that the deprecated negative entity format raises a custom error."""
    train_data = [
        ("Who is Shaka Khan?", {"entities": [(7, 17, "!PERSON")]}),
    ]

    nlp = English()
    train_examples = []
    for t in train_data:
        train_examples.append(Example.from_dict(nlp.make_doc(t[0]), t[1]))
    ner = nlp.add_pipe("ner", last=True)
    ner.add_label("PERSON")
    nlp.initialize()
    for itn in range(2):
        losses = {}
        batches = util.minibatch(train_examples, size=8)
        for batch in batches:
            with pytest.raises(ValueError):
                nlp.update(batch, losses=losses)
Example #14
    def train(self, n_iter=400):
        """Load the model, set up the pipeline and train the parser.
  		Arnaud Brown"""

        # We'll use the built-in dependency parser class, but we want to create a
        # fresh instance – just in case.
        if "parser" in self.nlp_model.pipe_names:
            self.nlp_model.remove_pipe("parser")
        parser = self.nlp_model.create_pipe("parser")
        self.nlp_model.add_pipe(parser, first=True)

        for _, annotations in TRAIN_DATA:
            for dep in annotations.get("deps", []):
                parser.add_label(dep)

        pipe_exceptions = ["parser", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [
            pipe for pipe in self.nlp_model.pipe_names
            if pipe not in pipe_exceptions
        ]
        with self.nlp_model.disable_pipes(*other_pipes):  # only train parser
            optimizer = self.nlp_model.begin_training()
            for _ in range(n_iter):
                random.shuffle(TRAIN_DATA)
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = minibatch(TRAIN_DATA,
                                    size=compounding(4.0, 32.0, 1.001))
                for batch in batches:
                    texts, annotations = zip(*batch)
                    self.nlp_model.update(texts,
                                          annotations,
                                          sgd=optimizer,
                                          losses=losses)
                print("Losses", losses)

        # save model to output directory
        if self.output_dir is not None:
            self.output_dir_path = Path(self.output_dir)
            if not self.output_dir_path.exists():
                self.output_dir_path.mkdir()
            self.nlp_model.to_disk(self.output_dir_path)
            print("Saved model to", self.output_dir_path)
Example #15
def test_issue4030():
    """Test whether textcat works fine with empty doc"""
    unique_classes = ["offensive", "inoffensive"]
    x_train = [
        "This is an offensive text",
        "This is the second offensive text",
        "inoff",
    ]
    y_train = ["offensive", "offensive", "inoffensive"]
    nlp = spacy.blank("en")
    # preparing the data
    train_data = []
    for text, train_instance in zip(x_train, y_train):
        cat_dict = {label: label == train_instance for label in unique_classes}
        train_data.append(
            Example.from_dict(nlp.make_doc(text), {"cats": cat_dict}))
    # add a text categorizer component
    model = {
        "@architectures": "spacy.TextCatBOW.v1",
        "exclusive_classes": True,
        "ngram_size": 2,
        "no_output_layer": False,
    }
    textcat = nlp.add_pipe("textcat", config={"model": model}, last=True)
    for label in unique_classes:
        textcat.add_label(label)
    # training the network
    with nlp.select_pipes(enable="textcat"):
        optimizer = nlp.initialize()
        for i in range(3):
            losses = {}
            batches = util.minibatch(train_data,
                                     size=compounding(4.0, 32.0, 1.001))

            for batch in batches:
                nlp.update(examples=batch,
                           sgd=optimizer,
                           drop=0.1,
                           losses=losses)
    # processing of an empty doc should result in 0.0 for all categories
    doc = nlp("")
    assert doc.cats["offensive"] == 0.0
    assert doc.cats["inoffensive"] == 0.0
Example #16
def train_textcat(nlp, n_texts, n_iter=10):
    textcat = nlp.get_pipe("textcat")
    tok2vec_weights = textcat.model.tok2vec.to_bytes()
    (train_texts, train_cats), (dev_texts,
                                dev_cats) = load_textcat_data(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)".format(
        n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts, [{
        "cats": cats
    } for cats in train_cats]))

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        textcat.model.tok2vec.from_bytes(tok2vec_weights)
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        for i in range(n_iter):
            losses = {"textcat": 0.0}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(tqdm.tqdm(train_data), size=2)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts,
                                          dev_cats)
            print("{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".
                  format(  # print a simple table
                      losses["textcat"],
                      scores["textcat_p"],
                      scores["textcat_r"],
                      scores["textcat_f"],
                  ))
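load_textcat_data is assumed to be defined elsewhere; the annotations follow spaCy's "cats" format, one score per label. A minimal sketch of the shape of its return value (texts and labels are illustrative stand-ins):

def load_textcat_data(limit=0):
    # Hypothetical stand-in: return (train_texts, train_cats), (dev_texts, dev_cats)
    train_texts = ["This movie was great", "This movie was awful"]
    train_cats = [{"POSITIVE": 1.0, "NEGATIVE": 0.0},
                  {"POSITIVE": 0.0, "NEGATIVE": 1.0}]
    dev_texts = ["Not bad at all"]
    dev_cats = [{"POSITIVE": 1.0, "NEGATIVE": 0.0}]
    return (train_texts, train_cats), (dev_texts, dev_cats)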
Example #17
def train(subject, n_sent, n_iter):
    # Creates or updates the NER model for a subject

    train_data = []
    for doc in traindb['sentence'].find({
            'subject': subject,
            'train': {
                '$exists': True
            }
    }).limit(n_sent):
        train_data.append((doc['text'], {'entities': doc['train']['tags']}))

    path = base.joinpath(subject)
    if path.exists():
        nlp = spacy.load(path)
        ner = nlp.get_pipe('ner')
    else:
        nlp = Portuguese()
        ner = nlp.create_pipe('ner')
        ner.add_label('CARDINAL')
        ner.add_label('QUANTITY')
        ner.add_label('PRODUCT')

        nlp.add_pipe(ner, last=True)
        nlp.begin_training()

    for itn in range(n_iter):
        random.shuffle(train_data)

        losses = {}
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))

        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(
                texts,
                annotations,
                drop=0.5,
                losses=losses,
            )
            print("Losses", losses)

    nlp.to_disk(path)
Example #18
def train_spacy(data, iterations):
    TRAIN_DATA = data
    nlp = spacy.load('en_core_web_sm')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        nlp.begin_training()
        for itn in range(iterations):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)
    return nlp
Example #19
def train(train_data, output_dir, n_iter, model=None):
    """
    Train an NER model for n_iter iterations.
    """

    # Load or create NLP model
    if model is not None:
        nlp = spacy.load(output_dir)
    else:
        nlp = spacy.blank("en")

    # Pipeline components
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Label
    for _, annotations in train_data:
        for entity in annotations.get("entities"):
            ner.add_label(entity[2])

    # Disable other pipelines (only train NER)
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):
        if model is None:
            nlp.begin_training()
        else:
            nlp.resume_training()

        # Train NER
        for itn in tqdm.tqdm(range(n_iter)):
            random.shuffle(train_data)
            batches = minibatch(train_data,
                                size=compounding(4.0, 500.0, 1.001))
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, drop=0.5, losses=losses)

    save_model(output_dir, nlp, "st_ner")
Example #20
    async def train(self, sources: Sources):
        train_examples = await self._preprocess_data(sources)
        for _, entities in train_examples:
            for ent in entities.get("entities"):
                self.ner.add_label(ent[2])

        # get names of other pipes to disable them during training
        pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
        other_pipes = [
            pipe for pipe in self.nlp.pipe_names if pipe not in pipe_exceptions
        ]
        # only train NER
        with self.nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
            # show warnings for misaligned entity spans once
            warnings.filterwarnings("once",
                                    category=UserWarning,
                                    module="spacy")
            if self.parent.config.model_name_or_path is None:
                self.nlp.begin_training()
            for itn in range(self.parent.config.n_iter):
                random.shuffle(train_examples)
                losses = {}
                batches = minibatch(train_examples,
                                    size=compounding(4.0, 32.0, 1.001))
                for batch in batches:
                    examples = []
                    for doc, gold_dict in batch:
                        doc = self.nlp.make_doc(doc)
                        examples.append(Example.from_dict(doc, gold_dict))
                    self.nlp.update(
                        examples,
                        drop=self.parent.config.dropout,
                        losses=losses,
                    )
                self.logger.debug(f"Losses: {losses}")

        if self.parent.config.directory is not None:
            if not self.parent.config.directory.exists():
                self.parent.config.directory.mkdir(parents=True)
            self.nlp.to_disk(self.parent.config.directory)
            self.logger.debug(
                f"Saved model to {self.parent.config.directory.name}")
Example #21
    def _multiprocessing_pipe(self, texts, pipes, n_process, batch_size):
        # raw_texts is used later to stop iteration.
        texts, raw_texts = itertools.tee(texts)
        # for sending texts to worker
        texts_q = [mp.Queue() for _ in range(n_process)]
        # for receiving byte-encoded docs from worker
        bytedocs_recv_ch, bytedocs_send_ch = zip(
            *[mp.Pipe(False) for _ in range(n_process)])

        batch_texts = minibatch(texts, batch_size)
        # Sender sends texts to the workers.
        # This is necessary to properly handle infinite length of texts.
        # (In this case, all data cannot be sent to the workers at once)
        sender = _Sender(batch_texts, texts_q, chunk_size=n_process)
        # send twice to make process busy
        sender.send()
        sender.send()

        procs = [
            mp.Process(
                target=_apply_pipes,
                args=(self.make_doc, pipes, rch, sch, Underscore.get_state(),
                      load_nlp.VECTORS),
            ) for rch, sch in zip(texts_q, bytedocs_send_ch)
        ]
        for proc in procs:
            proc.start()

        # Cycle channels not to break the order of docs.
        # The received object is a batch of byte-encoded docs, so flatten them with chain.from_iterable.
        byte_docs = chain.from_iterable(recv.recv()
                                        for recv in cycle(bytedocs_recv_ch))
        docs = (Doc(self.vocab).from_bytes(byte_doc) for byte_doc in byte_docs)
        try:
            for i, (_, doc) in enumerate(zip(raw_texts, docs), 1):
                yield doc
                if i % batch_size == 0:
                    # tell `sender` that one batch was consumed.
                    sender.step()
        finally:
            for proc in procs:
                proc.terminate()
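_multiprocessing_pipe is the internal path behind spaCy's public nlp.pipe API when n_process > 1; a minimal usage sketch (the model name and texts are placeholders):

import spacy

nlp = spacy.load("en_core_web_sm")
texts = ["First document.", "Second document.", "Third document."]
for doc in nlp.pipe(texts, n_process=2, batch_size=2):
    print([token.text for token in doc])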
Example #22
def train_spacy_ner(model, data, iterations):
    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('vi')  # create blank Language class
        print("Created blank 'vi' model")
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    for _, annotations in data:
        for entity in annotations.get('entities'):
            ner.add_label(entity[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None:
            optimizer = nlp.begin_training()
        else:
            # optimizer = nlp.resume_training()
            optimizer = nlp.entity.create_optimizer()
        for itn in range(iterations):
            print("Statring iteration " + str(itn))
            random.shuffle(data)
            split = int(len(data) * 5/6)
            training_data = data[:split]
            test_data = data[split:]

            losses = {}
            batches = minibatch(training_data, size=compounding(2000, 5000., 1.5))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35,
                           losses=losses)
            print('Losses:', losses)
            print('Score:', evaluate(nlp, test_data))

    return nlp
Example #23
def ner_model(TRAIN_DATA, n_iter=500):
    """Load the model, set up the pipeline and train the entity recognizer."""
    print('Training started...')

    nlp = spacy.blank("en")  # create blank Language class
    print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        nlp.begin_training()
        for itn in tqdm_notebook(range(n_iter)):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
    print('Training completed.')
    return nlp
Example #24
def train_model(data: list, folder_to_save_model: str, n_iter: int,
                batch_size: int, dropout_rate: float):
    """
    Train a NER model using Spacy
    :param data: list of tuples [(text, offset)]
    :param folder_to_save_model: where to save the learned model; None to skip. Any existing model will be overwritten
    :param n_iter: number of training iterations for the CNN
    :param batch_size: larger batches train faster but give less precise updates
    :param dropout_rate: higher values slow learning but improve generalization
    """
    nlp = get_empty_model(load_labels_for_training=True)
    nlp.vocab.vectors.name = 'spacy_pretrained_vectors'
    optimizer = nlp.begin_training()
    with tqdm(total=n_iter * ceil(len(data) / batch_size),
              unit=" paragraphs",
              desc="Learn NER model") as pbar:
        for itn in range(n_iter):
            pbar.set_description(f"Learn NER model - iteration {itn + 1}")
            losses = {}
            random.shuffle(data)
            batches = util.minibatch(data, batch_size)

            for current_batch_item in batches:
                case_id, texts, annotations = zip(*current_batch_item)
                docs = [nlp.make_doc(text) for text in texts]
                gold_with_unknown_bilou = convert_unknown_bilou_bulk(
                    docs=docs, offsets=annotations)
                nlp.update(
                    docs,  # batch of texts
                    gold_with_unknown_bilou,  # batch of annotations
                    drop=dropout_rate,  # dropout - make it harder to memorise rules
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
                pbar.postfix = "loss: " + str(losses['ner'])
                pbar.update()

    # save model to output directory
    if folder_to_save_model is not None:
        folder_to_save_model = Path(folder_to_save_model)
        nlp.to_disk(folder_to_save_model)
Example #25
def train_model(training_data: list,
                test_data: list,
                iterations: int = 20) -> None:
    nlp = spacy.load("en_core_web_sm")
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe("textcat",
                                  config={"architecture": "simple_cnn"})
        nlp.add_pipe(textcat, last=True)
    else:
        textcat = nlp.get_pipe("textcat")

    textcat.add_label("pos")
    textcat.add_label("neg")

    training_excluded_pipes = [
        pipe for pipe in nlp.pipe_names if pipe != "textcat"
    ]

    with nlp.disable_pipes(training_excluded_pipes):
        optimizer = nlp.begin_training()
        print("Training...")
        batch_sizes = compounding(4.0, 32.0, 1.001)

        for i in trange(iterations):
            loss = {}
            random.shuffle(training_data)
            batches = minibatch(training_data, size=batch_sizes)
            for batch in batches:
                text, labels = zip(*batch)
                nlp.update(text, labels, drop=0.2, sgd=optimizer, losses=loss)

            with textcat.model.use_params(optimizer.averages):
                evaluation_results = evaluate_model(tokenizer=nlp.tokenizer,
                                                    textcat=textcat,
                                                    test_data=test_data)
                print(f"{loss['textcat']}\t{evaluation_results['precision']}"
                      f"\t{evaluation_results['recall']}"
                      f"\t{evaluation_results['f-score']}")

    with nlp.use_params(optimizer.averages):
        nlp.to_disk("model_artifacts")
Example #26
def train_spacy_ner(path):
    import spacy
    TRAIN_DATA = label_studio_to_spacy(path)
    nlp = spacy.load('en_core_web_sm')
    ner = nlp.get_pipe("ner")
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    unaffected_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]

    import random
    from spacy.util import minibatch, compounding
    from pathlib import Path

    # TRAINING THE MODEL
    with nlp.disable_pipes(*unaffected_pipes):

        # Training for 50 iterations
        for iteration in range(50):

            # shuffling examples before every iteration
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
                print("Losses", losses)
    output_dir = Path(r'E:\Nitin\RVCE\Projects\PDF-OCR\code\models')
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)
Example #27
def ner_trainig():
    with open(r'C:\Users\Akash\Desktop\be_proj\Data\NER_training_data.pickle',
              'rb') as fp:
        TRAIN_DATA = pickle.load(fp)

    LABEL = ['BRANCH']
    nlp = spacy.blank('en')  #created a blank model
    ner = nlp.create_pipe('ner')  #created a pipeline  ner
    nlp.add_pipe(ner)  #added NER to the spcy pipeline
    for l in LABEL:
        ner.add_label(l)  #added labels for the component
    optimizer = nlp.begin_training()

    n_iter = 10
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)  #shuffled the dataset
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(1.0, 4.0, 1.001))
            for batch in batches:

                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)
            print('Losses', losses)

    #test_text = "how many electronics students are there in IT branch ?"
    #doc = nlp(test_text)
    #print("Entities in '%s'" % test_text)
    #for ent in doc.ents:
    #    print(ent.label_, ent.text)

    nlp.meta["name"] = "NER_mod"  # rename model
    nlp.to_disk(r"D:\projects\col_chatbot - Copy\college_bot\NERdata")
Example #28
def train_model(n_iter: int = 100) -> None:

    # You can load other languages if specified in Dockerfile
    nlp = spacy.load("en_core_web_sm")

    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]

    # only train NER
    with nlp.disable_pipes(*other_pipes):
        nlp.begin_training()
        for step_n in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(1.0, 4.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.5,
                    losses=losses,
                )
            print("Losses: ", losses)
            tracking.log_metrics(step=step_n, **losses)

    nlp.to_disk("custom_spacy_model")
Example #29
def start_training(model=None, output=None, epoch=30):
    train_data = TRAIN_DATA2 + TRAIN_DATA

    # Load an existing model or create an empty one.
    if model is not None:
        nlp = spacy.load(model)
        print("Loaded model '%s'." % model)
    else:
        nlp = spacy.blank('en')
        print("Created blank model to train.")

    # Create a fresh instance of parser.
    if 'parser' in nlp.pipe_names:
        nlp.remove_pipe('parser')
    parser = nlp.create_pipe('parser')
    nlp.add_pipe(parser, first=True)

    for text, tags in train_data:
        for dep in tags.get('deps', []):
            parser.add_label(dep)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for _ in range(epoch):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=4)
            for batch in batches:
                texts, labels = zip(*batch)
                nlp.update(texts, labels, sgd=optimizer, losses=losses)
            print('Losses', losses)

    if output is not None:
        output = Path(output)
        if not output.exists():
            output.mkdir()
        nlp.to_disk(output)
        print("Saved model to directory %s." % output)

    return nlp
Example #30
    def _train_categories(self, train_categories_data):

        # get names of other pipes to disable them during training
        other_pipes = [
            pipe for pipe in self._nlp.pipe_names if pipe != 'textcat'
        ]
        with self._nlp.disable_pipes(*other_pipes):  # only train textcat
            optimizer = self._nlp.begin_training()
            # print("Training the model...")
            # print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))

            for itn in range(self._iterations):
                losses = {}
                batches = minibatch(train_categories_data,
                                    size=compounding(4., 32., 1.001))
                for batch in batches:
                    texts, annotations = zip(*batch)
                    self._nlp.update(texts,
                                     annotations,
                                     sgd=optimizer,
                                     losses=losses)
Example #31
    def _predict(self, texts: List[str], batch_size: Optional[int] = 1000) -> Iterable[str]:

        translated_texts = []
        with tqdm(total=len(texts)) as pbar:
            for batch in minibatch(texts, batch_size):
                json_body = [
                    {
                        "q": text,
                        "source": self.source_lang,
                        "target": self.target_lang,
                        "format": "text",
                    }
                    for text in batch
                ]
                res = httpx.post(self._translate_url, params=self._default_params, json=json_body)
                data = res.json()
                for doc in data["translations"]:
                    translated_texts.append(doc["text"])
                    pbar.update(1)

        return translated_texts
Example #32
    def pipe(self,
             stream: Iterable[Doc],
             *,
             batch_size: int = 128) -> Iterator[Doc]:
        """Apply the pipe to a stream of documents. This usually happens under
        the hood when the nlp object is called on a text and all components are
        applied to the Doc.

        stream (Iterable[Doc]): A stream of documents.
        batch_size (int): The number of documents to buffer.
        YIELDS (Doc): Processed documents in order.

        DOCS: https://nightly.spacy.io/api/transformer#pipe
        """
        for outer_batch in minibatch(stream, batch_size):
            outer_batch = list(outer_batch)
            for indices in batch_by_length(outer_batch,
                                           self.cfg["max_batch_items"]):
                subbatch = [outer_batch[i] for i in indices]
                self.set_annotations(subbatch, self.predict(subbatch))
            yield from outer_batch
Example #33
def main(args):

    merge_terms = load_merge_terms(args.merge_terms) \
        if args.merge_terms else {}

    nlp = get_parser(disable=args.disable,
                     merge_terms=merge_terms,
                     max_sent_len=args.max_sent_len)

    identity_preprocess = lambda x: x
    corpus = dataloader(args.inputdir, preprocess=identity_preprocess)

    partitions = minibatch(corpus, size=args.batch_size)
    executor = Parallel(n_jobs=args.n_procs,
                        backend="multiprocessing",
                        prefer="processes")
    do = delayed(partial(transform_texts, nlp))
    tasks = (do(i, batch, args.outputdir, args.disable,
                args.prefix, args.keep_whitespace) \
             for i, batch in enumerate(partitions))
    executor(tasks)
Example #34
    def _train_ner(self, train_entities_data):

        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in self._nlp.pipe_names if pipe != 'ner']
        with self._nlp.disable_pipes(*other_pipes):  # only train NER
            optimizer = self._nlp.begin_training()
            losses = {}
            for itn in range(self._iterations):
                random.shuffle(train_entities_data)

                # batch up the examples using spaCy's minibatch
                batches = minibatch(train_entities_data,
                                    size=compounding(4., 32., 1.001))
                for batch in batches:
                    texts, annotations = zip(*batch)
                    self._nlp.update(
                        texts,  # batch of texts
                        annotations,  # batch of annotations
                        sgd=optimizer,  # callable to update weights
                        losses=losses)
            self._logger.debug('Losses %s', losses)
Example #35
    def _predict(self,
                 texts: List[str],
                 batch_size: Optional[int] = 1000) -> Iterable[str]:

        translated_texts = []
        with tqdm(total=len(texts)) as pbar:
            for batch in minibatch(texts, batch_size):
                json_body = [{"text": text} for text in batch]
                res = httpx.post(
                    self._translate_url,
                    params=self._default_params,
                    headers=self._default_headers,
                    json=json_body,
                )
                data = res.json()
                for doc in data:
                    translation = doc["translations"][0]
                    translated_texts.append(translation["text"])
                    pbar.update(1)

        return translated_texts
Example #36
def train_textcat(nlp, n_texts, n_iter=10):
    textcat = nlp.get_pipe("textcat")
    tok2vec_weights = textcat.model.tok2vec.to_bytes()
    (train_texts, train_cats), (dev_texts, dev_cats) = load_textcat_data(limit=n_texts)
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        textcat.model.tok2vec.from_bytes(tok2vec_weights)
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        for i in range(n_iter):
            losses = {"textcat": 0.0}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(tqdm.tqdm(train_data), size=2)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate_textcat(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )
Example #37
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
Example #38
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    n_iter=100

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.55,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
                print('Losses', losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        #print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
Example #39
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    n_iter=10

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            ts = time.time()
            st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            print(st)
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            #batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            batches = minibatch(TRAIN_DATA, size=compounding(1000., 8000., 1.25))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,
                    annotations,
                    drop=0.3,
                    sgd=optimizer,
                    losses=losses)
                print('Losses', losses)
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
    print(st)
    print("FINISHED")

    # test the trained model
Example #40
def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label(LABEL)  # add new entity label to entity recognizer
    # Adding extraneous labels shouldn't mess anything up
    ner.add_label("VEGETABLE")
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
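
Example #40 assumes module-level LABEL and TRAIN_DATA definitions that are not shown here. A minimal sketch of the expected shape, with an illustrative label and sentences (the character offsets mark the entity span):

LABEL = "ANIMAL"  # assumed new entity label; the original script defines its own

# each item is (text, {"entities": [(start_char, end_char, label)]})
TRAIN_DATA = [
    ("Horses are too tall and they pretend to care about your feelings",
     {"entities": [(0, 6, LABEL)]}),
    ("Do they bite?", {"entities": []}),
]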
Example #41
0
def main(
    ud_dir,
    parses_dir,
    corpus,
    config=None,
    limit=0,
    gpu_device=-1,
    vectors_dir=None,
    use_oracle_segments=False,
):
    spacy.util.fix_random_seed()
    lang.zh.Chinese.Defaults.use_jieba = False
    lang.ja.Japanese.Defaults.use_janome = False

    if config is not None:
        config = Config.load(config, vectors_dir=vectors_dir)
    else:
        config = Config(vectors_dir=vectors_dir)
    paths = TreebankPaths(ud_dir, corpus)
    if not (parses_dir / corpus).exists():
        (parses_dir / corpus).mkdir()
    print("Train and evaluate", corpus, "using lang", paths.lang)
    nlp = load_nlp(paths.lang, config, vectors=vectors_dir)

    docs, golds = read_data(
        nlp,
        paths.train.conllu.open(),
        paths.train.text.open(),
        max_doc_length=config.max_doc_length,
        limit=limit,
    )

    optimizer = initialize_pipeline(nlp, docs, golds, config, gpu_device)

    batch_sizes = compounding(config.min_batch_size, config.max_batch_size, 1.001)
    beam_prob = compounding(0.2, 0.8, 1.001)
    for i in range(config.nr_epoch):
        docs, golds = read_data(
            nlp,
            paths.train.conllu.open(),
            paths.train.text.open(),
            max_doc_length=config.max_doc_length,
            limit=limit,
            oracle_segments=use_oracle_segments,
            raw_text=not use_oracle_segments,
        )
        Xs = list(zip(docs, golds))
        random.shuffle(Xs)
        if config.batch_by_words:
            batches = minibatch_by_words(Xs, size=batch_sizes)
        else:
            batches = minibatch(Xs, size=batch_sizes)
        losses = {}
        n_train_words = sum(len(doc) for doc in docs)
        with tqdm.tqdm(total=n_train_words, leave=False) as pbar:
            for batch in batches:
                batch_docs, batch_gold = zip(*batch)
                pbar.update(sum(len(doc) for doc in batch_docs))
                nlp.parser.cfg["beam_update_prob"] = next(beam_prob)
                nlp.update(
                    batch_docs,
                    batch_gold,
                    sgd=optimizer,
                    drop=config.dropout,
                    losses=losses,
                )

        out_path = parses_dir / corpus / "epoch-{i}.conllu".format(i=i)
        with nlp.use_params(optimizer.averages):
            if use_oracle_segments:
                parsed_docs, scores = evaluate(
                    nlp, paths.dev.conllu, paths.dev.conllu, out_path
                )
            else:
                parsed_docs, scores = evaluate(
                    nlp, paths.dev.text, paths.dev.conllu, out_path
                )
            print_progress(i, losses, scores)
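
Example #41 chooses between minibatch() and minibatch_by_words() via config.batch_by_words; batching by words keeps the token count per batch roughly constant when document lengths vary widely. A rough, self-contained sketch of that idea (this is not spaCy's implementation, just an illustration):

def batch_by_words(items, max_words, count=lambda pair: len(pair[0])):
    """Yield batches of (doc, gold) pairs whose total token count stays near max_words."""
    batch, n_words = [], 0
    for item in items:
        n = count(item)
        if batch and n_words + n > max_words:
            yield batch
            batch, n_words = [], 0
        batch.append(item)
        n_words += n
    if batch:
        yield batch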
Example #42
0
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat",
            config={
                "exclusive_classes": True,
                "architecture": "simple_cnn",
            }
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
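
Examples #42 and #44 both call a load_data() helper that is not shown. In the upstream spaCy demo it reads the IMDB sentiment corpus via thinc; the sketch below follows that pattern, but treat the dataset call and the 80/20 split as assumptions:

import random
import thinc.extra.datasets

def load_data(limit=0, split=0.8):
    """Return (train_texts, train_cats), (dev_texts, dev_cats) from IMDB."""
    train_data, _ = thinc.extra.datasets.imdb()
    random.shuffle(train_data)
    if limit:
        train_data = train_data[:limit]
    texts, labels = zip(*train_data)
    cats = [{"POSITIVE": bool(y), "NEGATIVE": not bool(y)} for y in labels]
    cut = int(len(train_data) * split)
    return (texts[:cut], cats[:cut]), (texts[cut:], cats[cut:])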
Example #43
0
def main(model=None, output_dir=None, n_iter=100):

    if model is not None:
        nlp = spacy.load(model) 
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")

    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        if model is None:
            # initialise weights; nlp.update() below passes no sgd, so spaCy
            # falls back to its default optimizer
            nlp.begin_training()
        for itn in range(n_iter):
            ts = time.time()
            st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            print(st)
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}

            batches = minibatch(TRAIN_DATA, size=compounding(1000., 8000., 1.25))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts, 
                    annotations, 
                    drop=0.5, 
                    losses=losses,
                )
            print("Losses", losses)
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
    print(st)
    print("CONCLUIDO")  # Portuguese: "finished"

    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
Example #44
0
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    textcat.add_label('POSITIVE')

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
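
Finally, the evaluate() helper shared by the two text-classification examples is also not shown. A compatible sketch (the 0.5 threshold and micro-averaged scores follow the upstream demo, but the details are assumptions):

def evaluate(tokenizer, textcat, texts, cats):
    """Micro-averaged precision/recall/F-score over the gold category labels."""
    docs = (tokenizer(text) for text in texts)
    tp = fp = fn = tn = 0.0
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.0
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.0
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1.0
            else:
                fn += 1.0
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f_score = 2 * (precision * recall) / (precision + recall + 1e-8)
    return {"textcat_p": precision, "textcat_r": recall, "textcat_f": f_score}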