Example #1
def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    test_text = 'Do you like horses?'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
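# Note: the snippet above assumes LABEL, TRAIN_DATA and the usual imports are defined
# elsewhere in the script. A minimal sketch of the shapes it expects (spaCy v2 NER
# format: text plus character-offset entity spans); the label and sentences below are
# placeholders, not the original data:
import random
from pathlib import Path

import spacy

LABEL = 'ANIMAL'
TRAIN_DATA = [
    ("Horses are too tall and they pretend to care about your feelings",
     {'entities': [(0, 6, LABEL)]}),
    ("Do they bite?", {'entities': []}),
]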
Example #2
def test_issue2626():
    '''Check that this sentence doesn't cause an infinite loop in the tokenizer.'''
    nlp = spacy.blank('en')
    text = """
    ABLEItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume TABLE ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume ItemColumn IAcceptance Limits of ErrorIn-Service Limits of ErrorColumn IIColumn IIIColumn IVColumn VComputed VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeUnder Registration of\xa0VolumeOver Registration of\xa0VolumeCubic FeetCubic FeetCubic FeetCubic FeetCubic Feet1Up to 10.0100.0050.0100.005220.0200.0100.0200.010350.0360.0180.0360.0184100.0500.0250.0500.0255Over 100.5% of computed volume0.25% of computed volume0.5% of computed volume0.25% of computed volume
    """
    doc = nlp.make_doc(text)
Example #3
def main(n_iter=10):
    nlp = spacy.blank('en')
    ner = nlp.create_pipe('ner')
    ner.add_multitask_objective(get_position_label)
    nlp.add_pipe(ner)

    print("Create data", len(TRAIN_DATA))
    optimizer = nlp.begin_training(get_gold_tuples=lambda: TRAIN_DATA)
    for itn in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annot_brackets in TRAIN_DATA:
            annotations, _ = annot_brackets
            doc = nlp.make_doc(text)
            gold = GoldParse.from_annot_tuples(doc, annotations[0])
            nlp.update(
                [doc],  # batch of texts
                [gold],  # batch of annotations
                drop=0.2,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses.get('nn_labeller', 0.0), losses['ner'])

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print('Entities', [(ent.text, ent.label_) for ent in doc.ents])
        print('Tokens', [(t.text, t.ent_type_, t.ent_iob) for t in doc])
Example #4
def main(patterns_loc, text_loc, n=10000, lang="en"):
    nlp = spacy.blank(lang)
    nlp.vocab.lex_attr_getters = {}
    phrases = read_gazetteer(nlp.tokenizer, patterns_loc)
    count = 0
    t1 = time.time()
    for ent_id, text in get_matches(nlp.tokenizer, phrases, read_text(text_loc, n=n)):
        count += 1
    t2 = time.time()
    print("%d docs in %.3f s. %d matches" % (n, (t2 - t1), count))
Example #5
def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the parser to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "parser" not in nlp.pipe_names:
        parser = nlp.create_pipe("parser")
        nlp.add_pipe(parser, first=True)
    # otherwise, get it, so we can add labels to it
    else:
        parser = nlp.get_pipe("parser")

    # add labels to the parser
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])
Example #6
def load_nlp(corpus, config, vectors=None):
    lang = corpus.split("_")[0]
    nlp = spacy.blank(lang)
    if config.vectors:
        if not vectors:
            raise ValueError(
                "config asks for vectors, but no vectors "
                "directory set on command line (use -v)"
            )
        if (Path(vectors) / corpus).exists():
            nlp.vocab.from_disk(Path(vectors) / corpus / "vocab")
    nlp.meta["treebank"] = corpus
    return nlp
Example #7
def FeatureExtracter(lang, attrs=[LOWER, SHAPE, PREFIX, SUFFIX], tokenized=True):
    nlp = spacy.blank(lang)
    nlp.vocab.lex_attr_getters[PREFIX] = lambda string: string[:3]
    nlp.vocab.lex_attr_getters[SUFFIX] = lambda string: string[-3:]
    def forward(texts, drop=0.):
        if tokenized:
            docs = [Doc(nlp.vocab, words) for words in texts]
        else:
            docs = [nlp(text) for text in texts]
        features = [doc.to_array(attrs) for doc in docs]
        def backward(d_features, sgd=None):
            return d_features
        return features, backward
    return layerize(forward)
Example #8
def main(
    lang,
    in_dir,
    out_loc,
    negative=5,
    n_workers=4,
    window=5,
    size=128,
    min_count=10,
    nr_iter=2,
):
    logging.basicConfig(
        format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO
    )
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative,
    )
    nlp = spacy.blank(lang)
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with text_loc.open("r", encoding="utf-8") as file_:
            text = file_.read()
        total_sents += text.count("\n")
        doc = nlp(text)
        total_words += corpus.count_doc(doc)
        logger.info(
            "PROGRESS: at batch #%i, processed %i words, keeping %i word types",
            text_no,
            total_words,
            len(corpus.strings),
        )
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for orth, freq in corpus.counts:
        if freq >= min_count:
            model.raw_vocab[nlp.vocab.strings[orth]] = freq
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)
Example #9
def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    for text, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        test_model(nlp2)
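# Note: test_model() is defined elsewhere in the source script. A minimal sketch of
# what it presumably does (print the predicted dependencies for a few sentences):
def test_model(nlp):
    texts = ["I like securities.", "find a hotel with good wifi"]
    for text in texts:
        doc = nlp(text)
        print(text)
        print([(t.text, t.dep_, t.head.text) for t in doc])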
Example #10
def main(model=None, output_dir=None, n_iter=5):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if 'parser' in nlp.pipe_names:
        nlp.remove_pipe('parser')
    parser = nlp.create_pipe('parser')
    nlp.add_pipe(parser, first=True)

    for text, annotations in TRAIN_DATA:
        for dep in annotations.get('deps', []):
            parser.add_label(dep)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'parser']
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], sgd=optimizer, losses=losses)
            print(losses)

    # test the trained model
    test_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        test_model(nlp2)
Example #11
def main(lang='en', output_dir=None, n_iter=25):
    """Create a new model, set up the pipeline and train the tagger. In order to
    train the tagger with a custom tag map, we're creating a new Language
    instance with a custom vocab.
    """
    nlp = spacy.blank(lang)
    # add the tagger to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    tagger = nlp.create_pipe('tagger')
    # Add the tags. This needs to be done before you start training.
    for tag, values in TAG_MAP.items():
        tagger.add_label(tag, values)
    nlp.add_pipe(tagger)

    optimizer = nlp.begin_training()
    for i in range(n_iter):
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update([text], [annotations], sgd=optimizer, losses=losses)
        print(losses)

    # test the trained model
    test_text = "I like blue eggs"
    doc = nlp(test_text)
    print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print('Tags', [(t.text, t.tag_, t.pos_) for t in doc])
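# Note: TAG_MAP and TRAIN_DATA come from the surrounding script. Roughly, TAG_MAP maps
# each custom tag to its attributes and TRAIN_DATA provides per-token tags; a small
# illustration (values assumed, not the original data):
TAG_MAP = {"N": {"pos": "NOUN"}, "V": {"pos": "VERB"}, "J": {"pos": "ADJ"}}
TRAIN_DATA = [
    ("I like green eggs", {"tags": ["N", "V", "J", "N"]}),
    ("Eat blue ham", {"tags": ["V", "J", "N"]}),
]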
Example #12
def main(vectors_loc, lang=None):
    if lang is None:
        nlp = Language()
    else:
        # create empty language class – this is required if you're planning to
        # save the model to disk and load it back later (models always need a
        # "lang" setting). Use 'xx' for blank multi-language class.
        nlp = spacy.blank(lang)
    with open(vectors_loc, 'rb') as file_:
        header = file_.readline()
        nr_row, nr_dim = header.split()
        nlp.vocab.reset_vectors(width=int(nr_dim))
        for line in file_:
            line = line.rstrip().decode('utf8')
            pieces = line.rsplit(' ', int(nr_dim))
            word = pieces[0]
            vector = numpy.asarray([float(v) for v in pieces[1:]], dtype='f')
            nlp.vocab.set_vector(word, vector)  # add the vectors to the vocab
    # test the vectors and similarity
    text = 'class colspan'
    doc = nlp(text)
    print(text, doc[0].similarity(doc[1]))
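# Note: the loop above expects the plain word2vec text format: a "rows dims" header
# line, then one word followed by its vector per line. A tiny illustrative file
# (values made up) that main() could load:
sample = "2 5\nclass 0.1 -0.2 0.3 0.4 -0.5\ncolspan 0.2 0.1 -0.3 0.0 0.5\n"
with open("sample_vectors.txt", "w", encoding="utf8") as f:
    f.write(sample)
# main("sample_vectors.txt", lang="en") would then set both vectors on the vocab.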
Example #13
def make_vocab(cmd, lang, output_dir, lexemes_loc,
               vectors_loc=None, prune_vectors=-1):
    """Compile a vocabulary from a lexicon jsonl file and word vectors."""
    if not lexemes_loc.exists():
        prints(lexemes_loc, title="Can't find lexical data", exits=1)
    vectors_loc = ensure_path(vectors_loc)
    nlp = spacy.blank(lang)
    for word in nlp.vocab:
        word.rank = 0
    lex_added = 0
    with lexemes_loc.open() as file_:
        for line in file_:
            if line.strip():
                attrs = json.loads(line)
                if 'settings' in attrs:
                    nlp.vocab.cfg.update(attrs['settings'])
                else:
                    lex = nlp.vocab[attrs['orth']]
                    lex.set_attrs(**attrs)
                    assert lex.rank == attrs['id']
                lex_added += 1
    if vectors_loc is not None:
        vector_data = numpy.load(vectors_loc.open('rb'))
        nlp.vocab.vectors = Vectors(data=vector_data)
        for word in nlp.vocab:
            if word.rank:
                nlp.vocab.vectors.add(word.orth, row=word.rank)

    if prune_vectors >= 1:
        remap = nlp.vocab.prune_vectors(prune_vectors)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    vec_added = len(nlp.vocab.vectors)
    prints("{} entries, {} vectors".format(lex_added, vec_added), output_dir,
           title="Sucessfully compiled vocab and vectors, and saved model")
    return nlp
Example #14
 def __iter__(self):
     nlp = spacy.blank(self.lang)
     return self.iter(nlp)
Example #15
import json
import spacy
from spacy.matcher import Matcher
from spacy.tokens import Span, DocBin

with open("exercises/es/adidas.json", encoding="utf8") as f:
    TEXTS = json.loads(f.read())

nlp = spacy.blank("es")
matcher = Matcher(nlp.vocab)
# Add patterns to the matcher
pattern1 = [{"LOWER": "adidas"}, {"LOWER": "zx"}]
pattern2 = [{"LOWER": "adidas"}, {"IS_DIGIT": True}]
matcher.add("ROPA", [pattern1, pattern2])
docs = []
for doc in nlp.pipe(TEXTS):
    matches = matcher(doc)
    spans = [
        Span(doc, start, end, label=match_id)
        for match_id, start, end in matches
    ]
    doc.ents = spans
    docs.append(doc)

doc_bin = ____(____=____)
doc_bin.____(____)
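# Note: one possible completion of the blanks above, assuming the goal is to serialize
# the annotated docs with a DocBin; the output path "train.spacy" is only an example:
doc_bin = DocBin(docs=docs)
doc_bin.to_disk("./train.spacy")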
Example #16
 def __init__(self, lang:str):
     self.tok = spacy.blank(lang, disable=["parser","tagger","ner"])
Example #17
def load_nlp(corpus, config):
    lang = corpus.split("_")[0]
    nlp = spacy.blank(lang)
    if config.vectors:
        nlp.vocab.from_disk(config.vectors / "vocab")
    return nlp
Example #18
 def __init__(self, lang='en', special_toks=None, buf_sz=5000):
     special_toks = ifnone(special_toks, defaults.text_spec_tok)
     nlp = spacy.blank(lang, disable=["parser", "tagger", "ner"])
     for w in special_toks: nlp.tokenizer.add_special_case(w, [{ORTH: w}])
     self.pipe,self.buf_sz = nlp.pipe,buf_sz
Example #19
import scattertext as st
import spacy

nlp = spacy.blank('en')  # spacy.blank() expects a language code, not a model name
nlp.tokenizer.rules = {key: value for key, value in nlp.tokenizer.rules.items()
                       if "'" not in key and "’" not in key and "‘" not in key}
nlp.add_pipe(nlp.create_pipe('sentencizer'))

df = st.SampleCorpora.ConventionData2012.get_data().assign(
    parse=lambda df: df.text.apply(nlp)
)

corpus = st.CorpusFromParsedDocuments(
    df, category_col='party', parsed_col='parse'
).build().compact(st.ClassPercentageCompactor(term_count=10))

html = st.produce_scattertext_explorer(
    corpus,
    category='democrat',
    category_name='Democratic',
    not_category_name='Republican',
    minimum_term_frequency=0, pmi_threshold_coefficient=0,
    width_in_pixels=1000, metadata=corpus.get_df()['speaker'],
    transform=st.Scalers.dense_rank,
    show_diagonal=False,
    max_overlapping=3
)
open('./demo_with_apostrophes.html', 'w').write(html)
print('open ./demo_with_apostrophes.html in Chrome')
Example #20
def train_new_ner(model=None, output_dir=model_dir, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")
    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
Example #21
def load_nlp(corpus, config):
    lang = corpus.split('_')[0]
    nlp = spacy.blank(lang)
    if config.vectors:
        nlp.vocab.from_disk(config.vectors / 'vocab')
    return nlp
Example #22
def train(train_data, iterations, modelPath):
    """Load the model, set up the pipeline and train the entity recognizer."""
    # Test if the model exists
    modelExists = os.path.isdir(modelPath)

    # 1.) load model or create blank Language class
    if modelExists:
        nlp = spacy.load(modelPath)  # load existing spaCy model
        print("Loaded model '%s'" % modelPath)
    else:
        # 1.) create blank Language class
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # Initialize a textcat pipe in the spaCy pipeline object (nlp) and add the labels to it.
    if 'textcat' not in nlp.pipe_names:
        print("create pipe 'textcat'...")
        # create a new pipe with exclusive_classes=true
        # textcat = nlp.create_pipe("textcat")
        textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True})
        nlp.add_pipe(textcat, last=True)
        # nlp.add_pipe(textcat)
    else:
        textcat = nlp.get_pipe("textcat")

    for i in LABEL:
        print("...adding new label '" + i + "'...")
        textcat.add_label(i)  # Add new labels to the categorizer

    # Iterate the training examples to optimize the model
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']

    # Only train the textcat pipe
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()

        # optimizer = nlp.begin_training(component_cfg={'textcat': {"exclusive_classes": False}})

        print("Training model...")
        for i in range(iterations):
            random.shuffle(train_data)
            losses = {}
            batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, 
                           annotations, 
                           sgd=optimizer,
                           drop=0.2, 
                           losses=losses)

            print("Losses", losses)
    print("Training completed")
    return nlp
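# Note: LABEL and train_data come from the calling code. For the exclusive-classes
# textcat configured above, each example carries category scores under "cats"; an
# assumed illustration, not the original data:
LABEL = ["POSITIVE", "NEGATIVE"]
train_data = [
    ("This product is great", {"cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}}),
    ("Terrible, would not buy again", {"cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}}),
]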
Example #23
def test_blank(lang):
    spacy.blank(lang)
Example #24
def main(model=None,
         new_model_name="entity",
         output_dir="entityModel",
         n_iter=30):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label(LABEL1)  # add new entity label to entity recognizer
    # Adding extraneous labels shouldn't mess anything up
    ner.add_label(LABEL2)
    ner.add_label(LABEL3)
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "Should I dump my lights?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
Example #25
def main(model='/home/sangeetha/Desktop/spa/combine',
         new_model_name='fund',
         output_dir='',
         n_iter=0):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
        f = 1
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL1)  # add new entity label to entity recognizer
    ner.add_label(LABEL2)
    ner.add_label(LABEL3)
    ner.add_label(LABEL4)
    ner.add_label(LABEL5)
    ner.add_label(LABEL6)
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations],
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)
            print(losses)

    # with open('test_data.txt', 'r') as myfile:
    #     data = myfile.read().replace('\n', '')

    # test the trained model
    data = 'hey im from fil how can i help you let me place a buy deal of ten thousand pounds eleven cents in fund demon okay let me reconfirm your deal okay i confirm to place a buy deal amounting ten thousand pounds eleven cents in fund demon with the associated account number one two three nine okay okay thank you'

    doc = nlp(data)
    n = 10
    print("entities in '%s'" % data)
    for ent in (doc.ents):
        print(ent.text, ent.label_)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(data)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
Example #26
def train_CNN(model=None, output_dir=None, n_iter=20, n_texts=2000):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    textcat.add_label('POSITIVE')

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)".format(
        n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts, [{
        'cats': cats
    } for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            #            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            batches = get_batches(train_data, 'textcat')
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=next(dropout),
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
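# Note: `dropout` and `get_batches` are defined outside this snippet. A sketch of what
# they might look like, following the decaying-dropout / compounding-batch-size recipe
# from spaCy's v2 training tips (treat this as an assumption, not the original code):
from spacy.util import compounding, decaying, minibatch

dropout = decaying(0.6, 0.2, 1e-4)  # generator of gradually decreasing dropout rates


def get_batches(train_data, model_type):
    max_batch_sizes = {"tagger": 32, "parser": 16, "ner": 16, "textcat": 64}
    max_batch_size = max_batch_sizes[model_type]
    if len(train_data) < 1000:
        max_batch_size /= 2
    if len(train_data) < 500:
        max_batch_size /= 2
    batch_size = compounding(1, max_batch_size, 1.001)
    return minibatch(train_data, size=batch_size)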
Example #27
def DIDA_NER(model=None,
             output_dir="/home/wangdi498/SpaCy/models",
             number_iterations=50):
    if model is not None:
        nlp = spacy.load(model)
        print("In NER the language model {} already exists, so it will be loaded.".format(model))
    else:
        nlp = spacy.blank('zh')
        print("In NER no language model was given, so a blank 'zh' model will be created.")
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')  # create_pipe() only works for components registered with spaCy.
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):  # get() may return None.
            ner.add_label(ent[2])

    # The other pipeline components must be disabled while the NER is being trained!
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.begin_training()
        for iteration in range(number_iterations):
            random.shuffle(TRAIN_DATA)
            # Shuffle the training data every round so the model doesn't generalize from the example order. A dropout rate can also be set so the model randomly drops some features and representations, to keep it from memorizing the training data too tightly.
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations],
                           drop=0.5,
                           sgd=optimizer,
                           losses=losses)  # Update the model with the current example.
            print("In round {} of {} NER the loss is {}.".format(
                iteration, number_iterations, losses))

    # For now, test directly on the training data.
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("\n{}".format([(ent.text, ent.label_) for ent in doc.ents]))
        # print ("\nHow can I trust the entities without test?!\n{}".format([(ent.text, ent.label_) for ent in doc.ents]))
        # print ("\nHow can I trust the tokens without test?!\n{}".format([(token.text, token.ent_type_, token.ent_iob_) for token in doc]))

    # Save the trained model.
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("\nThe NER model is saved to {}. This is the end of training.".
              format(output_dir))

        # Load the trained model back.
        print(
            "\nThe NER model is loaded from {}. This is the beginning of testing."
            .format(output_dir))
        nlu = spacy.load(output_dir)
        for text, _ in TEST_DATA:
            doc = nlu(text)
            print("\n{}".format([(ent.text, ent.label_) for ent in doc.ents]))
Example #28
def main(model=None, output_dir=None, n_iter=100):

    if model is not None:
        nlp = spacy.load(model) 
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")

    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes): 
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            ts = time.time()
            st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
            print(st)
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}

            batches = minibatch(TRAIN_DATA, size=compounding(1000., 8000., 1.25))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts, 
                    annotations, 
                    drop=0.5, 
                    losses=losses,
                )
            print("Losses", losses)
    ts = time.time()
    st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
    print(st)
    print("FINISHED")

    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
Example #29
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "data_dir",
        type=parse_path,
        help='Directory to the TDSA dataset e.g. ~/.Bella/Datasets')
    parser.add_argument("dataset_name",
                        type=str,
                        choices=['Laptop', 'Restaurant', 'Election'])
    parser.add_argument("--force_space",
                        action='store_true',
                        help=force_space_help)
    args = parser.parse_args()

    data_dir = args.data_dir
    dataset_name = args.dataset_name
    tokeniser = spacy.blank('en')
    split_names = ['Train', 'Val', 'Test']

    for split_name in split_names:
        dataset_fp = Path(data_dir, f'{dataset_name} {split_name}')
        dataset = TargetCollection.load_from_json(dataset_fp)

        retrieve_target_from_tokens = []
        cannot_retrieve_target_from_tokens = []

        for target in dataset.data_dict():
            text = target['text']
            target_word = target['target'].strip()
            target_start_offset = target['spans'][0][0]
            target_end_offset = target['spans'][0][1]
            if args.force_space:
Example #30
def main(kb_path, vocab_path=None, output_dir=None, n_iter=50):
    """Create a blank model with the specified vocab, set up the pipeline and train the entity linker.
    The `vocab` should be the one used during creation of the KB."""
    vocab = Vocab().from_disk(vocab_path)
    # create blank Language class with correct vocab
    nlp = spacy.blank("en", vocab=vocab)
    nlp.vocab.vectors.name = "spacy_pretrained_vectors"
    print("Created blank 'en' model with vocab from '%s'" % vocab_path)

    # Add a sentencizer component. Alternatively, add a dependency parser for higher accuracy.
    nlp.add_pipe(nlp.create_pipe('sentencizer'))

    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data.
    # Note that in a realistic application, an actual NER algorithm should be used instead.
    ruler = EntityRuler(nlp)
    patterns = [{
        "label": "PERSON",
        "pattern": [{
            "LOWER": "russ"
        }, {
            "LOWER": "cochran"
        }]
    }]
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)

    # Create the Entity Linker component and add it to the pipeline.
    if "entity_linker" not in nlp.pipe_names:
        # use only the predicted EL score and not the prior probability (for demo purposes)
        cfg = {"incl_prior": False}
        entity_linker = nlp.create_pipe("entity_linker", cfg)
        kb = KnowledgeBase(vocab=nlp.vocab)
        kb.load_bulk(kb_path)
        print("Loaded Knowledge Base from '%s'" % kb_path)
        entity_linker.set_kb(kb)
        nlp.add_pipe(entity_linker, last=True)

    # Convert the texts to docs to make sure we have doc.ents set for the training examples.
    # Also ensure that the annotated examples correspond to known identifiers in the knowledge base.
    kb_ids = nlp.get_pipe("entity_linker").kb.get_entity_strings()
    TRAIN_DOCS = []
    for text, annotation in TRAIN_DATA:
        with nlp.disable_pipes("entity_linker"):
            doc = nlp(text)
        annotation_clean = annotation
        for offset, kb_id_dict in annotation["links"].items():
            new_dict = {}
            for kb_id, value in kb_id_dict.items():
                if kb_id in kb_ids:
                    new_dict[kb_id] = value
                else:
                    print("Removed", kb_id,
                          "from training because it is not in the KB.")
            annotation_clean["links"][offset] = new_dict
        TRAIN_DOCS.append((doc, annotation_clean))

    # get names of other pipes to disable them during training
    pipe_exceptions = ["entity_linker", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    with nlp.disable_pipes(*other_pipes):  # only train entity linker
        # reset and initialize the weights randomly
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DOCS)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DOCS, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    losses=losses,
                    sgd=optimizer,
                )
            print(itn, "Losses", losses)

    # test the trained model
    _apply_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print()
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        _apply_model(nlp2)
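# Note: the entity-linker TRAIN_DATA pairs each text with character-offset "links":
# a dict mapping (start, end) spans to KB identifiers and their gold probabilities.
# An illustrative entry (the QIDs here are placeholders):
TRAIN_DATA = [
    (
        "Russ Cochran his reprints include EC Comics.",
        {"links": {(0, 12): {"Q2146908": 1.0, "Q7381115": 0.0}}},
    ),
]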
Example #31
def main(
    model=None,
    new_model_name='named_entity_recognizer',
    output_dir='./model',
    n_iter=30,
    reset_weights=False,  # assumed parameter: force re-initializing weights even when loading a model
):
    """Set up the pipeline and entity recognizer, and train the new entity."""

    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy

    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for (_, annotations) in TRAIN_DATA:
        for ent in annotations.get('entities'):
            if ent[2] == "Links":
                continue
            ner.add_label(ent[2])

    if model is None or reset_weights:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()

    move_names = list(ner.move_names)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    with nlp.disable_pipes(*other_pipes), warnings.catch_warnings():
        # show warnings for misaligned entity spans once
        warnings.filterwarnings("once", category=UserWarning, module='spacy')

        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations],
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)

            print('Losses', losses)

    # test the trained model

    test_text = 'Amity University'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print('Saved model to', output_dir)

        # test the saved model
        print('Loading from', output_dir)
        nlp2 = spacy.load(output_dir)

        # Check the classes have loaded back consistently
        assert nlp2.get_pipe('ner').move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
Example #32
def main(model=None,
         new_model_name="Music Data",
         output_dir=os.path.dirname(os.path.realpath(__file__)) + '\\ner',
         n_iter=10):
    LABEL_1 = 'Artist'
    stat = DatasetAnalysis()
    data = stat.format_traindata()
    TEST_DATA = data[0:1000]
    TRAIN_DATA = data[1000:]
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label(LABEL_1)  # add new entity label to entity recognizer

    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]

    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)
            print("Losses", losses)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

    # Test the trained model
    # Calculate Precision-Recall scores
    results = stat.evaluate(TEST_DATA)
    print(f'Precision metric score is {results[0]} ')
    print(f'Recall accuracy metric score is {results[1]} ')
    print(f'F1 accuracy metric score is {results[2]} ')
Example #33
import spacy
from spacy.tokens import Span

nlp = spacy.blank("zh")


# Define the method
def to_html(span, tag):
    # Wrap the span text in an HTML tag and return it
    return f"<{tag}>{span.text}</{tag}>"


# Register the Span method extension "to_html" with the method to_html
Span.set_extension("to_html", method=to_html)

# Process the text and call the to_html method on the span with the tag name "strong"
doc = nlp("大家好,这是一个句子。")
span = doc[0:3]
print(span._.to_html("strong"))
Example #34
def batch_train(model=None,
                blank_model='de',
                output_dir=None,
                dropout=0.2,
                n_iter=12,
                eval_split=0.3,
                train_data=None,
                n_samples=None,
                new_label=None):
    """Load the model, set up the pipeline and train the entity recognizer."""

    abs_start_time = datetime.datetime.now()

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '{}'".format(model))
    elif blank_model is not None:
        nlp = spacy.blank(blank_model)  # create blank Language class
        print("Created blank model from '{}'".format(blank_model))
    else:
        print("You have to add either a model or a blank model")
        print("No training possible")
        return "nothing"

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    if new_label is not None:
        if model is None:
            optimizer = nlp.begin_training()
        else:
            optimizer = nlp.entity.create_optimizer()
        ner.add_label(new_label)

    TRAIN_DATA = train_data
    if n_samples is not None:
        TRAIN_DATA = TRAIN_DATA[:n_samples]
    train, test = train_test_split(TRAIN_DATA, test_size=eval_split)
    print("{} train vs {} test samples".format(len(train), len(test)))
    for _, annotations in train:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    prev_acc = 0.0
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            print("Iteration Number: {}".format(itn))
            start_time = datetime.datetime.now()
            random.shuffle(train)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train, size=compounding(4.0, 16.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                if new_label:
                    try:
                        nlp.update(texts,
                                   annotations,
                                   sgd=optimizer,
                                   drop=0.35,
                                   losses=losses)
                    except Exception as e:
                        print(annotations)
                else:
                    try:
                        nlp.update(
                            texts,  # batch of texts
                            annotations,  # batch of annotations
                            drop=0.5,  # dropout - make it harder to memorise data
                            losses=losses,
                        )
                    except Exception as e:
                        print(annotations)
            print("Losses", losses)
            end_time = datetime.datetime.now()
            print("Duration: {}".format(end_time - start_time))
            results = evaluate(nlp, test)
            print("p: {}; f: {}; r: {}".format(results['ents_p'],
                                               results['ents_f'],
                                               results['ents_r']))
            acc = float(results['ents_f'])
            if acc > prev_acc:
                if output_dir is not None:
                    output_dir = Path(output_dir)
                    if not output_dir.exists():
                        output_dir.mkdir()
                    nlp.to_disk(output_dir)
                    print("Saved model to: {}".format(output_dir))
                prev_acc = acc
            print("######################")

    abs_end_time = datetime.datetime.now()
    print("######################")
    print("######################")
    print("######################")
    overall_duration = str(abs_end_time - abs_start_time)
    print("Overal duration: {}".format(overall_duration))
    if output_dir is not None:
        print("model with f1 score: {} saved to location: {}".format(
            prev_acc, output_dir))

    return nlp
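# Note: evaluate() is not shown above. A common spaCy v2 way to score NER predictions,
# returning a dict with the 'ents_p', 'ents_r' and 'ents_f' keys used above (a sketch
# under that assumption, not the original helper):
from spacy.gold import GoldParse
from spacy.scorer import Scorer


def evaluate(nlp, examples):
    scorer = Scorer()
    for text, annotations in examples:
        gold = GoldParse(nlp.make_doc(text), entities=annotations["entities"])
        pred = nlp(text)
        scorer.score(pred, gold)
    return scorer.scores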
Example #35
    save(args.char_emb_file, char_emb_mat, message="char embedding")
    save(args.train_eval_file, train_eval, message="train eval")
    save(args.dev_eval_file, dev_eval, message="dev eval")
    save(args.word2idx_file, word2idx_dict, message="word dictionary")
    save(args.char2idx_file, char2idx_dict, message="char dictionary")
    save(args.dev_meta_file, dev_meta, message="dev meta")
    #save(args.knowledge_edges_file,edges_dict,message="knowledge graph edges")


if __name__ == '__main__':
    # Get command-line args
    args_ = get_setup_args()

    # Download resources
    download(args_)

    # Import spacy language model
    nlp = spacy.blank("en")

    # Preprocess dataset
    args_.train_file = url_to_data_path(args_.train_url)
    args_.dev_file = url_to_data_path(args_.dev_url)
    if args_.include_test_examples:
        args_.test_file = url_to_data_path(args_.test_url)
    glove_dir = url_to_data_path(args_.glove_url.replace('.zip', ''))
    glove_ext = f'.txt' if glove_dir.endswith(
        'd') else f'.{args_.glove_dim}d.txt'
    args_.glove_file = os.path.join(glove_dir,
                                    os.path.basename(glove_dir) + glove_ext)
    pre_process(args_)
Example #36
def train(model, train_data, dev_data, test_data, output_dir, n_iter,
          meta_overrides):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    if meta_overrides is not None:
        metadata = json.load(open(meta_overrides))
        nlp.meta.update(metadata)

    original_tokenizer = nlp.tokenizer

    nlp.tokenizer = WhitespaceTokenizer(nlp.vocab)

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names and "parser" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="parser")
    elif 'ner' not in nlp.pipe_names and "tagger" in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, after="tagger")
    elif 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in train_data:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    dropout_rates = util.decaying(util.env_opt('dropout_from', 0.2),
                                  util.env_opt('dropout_to', 0.2),
                                  util.env_opt('dropout_decay', 0.005))
    batch_sizes = util.compounding(util.env_opt('batch_from', 1),
                                   util.env_opt('batch_to', 32),
                                   util.env_opt('batch_compound', 1.001))

    optimizer = nlp.begin_training()
    best_epoch = 0
    best_f1 = 0
    for i in range(n_iter):

        random.shuffle(train_data)
        count = 0
        losses = {}
        total = len(train_data)

        with nlp.disable_pipes(*other_pipes):  # only train NER
            with tqdm.tqdm(total=total, leave=True) as pbar:
                for batch in minibatch(train_data, size=batch_sizes):
                    docs, golds = zip(*batch)
                    nlp.update(docs,
                               golds,
                               sgd=optimizer,
                               losses=losses,
                               drop=next(dropout_rates))
                    pbar.update(len(batch))
                    if count % 100 == 0 and count > 0:
                        print('sum loss: %s' % losses['ner'])
                    count += 1

        # save model to output directory
        output_dir_path = Path(output_dir + "/" + str(i))
        if not output_dir_path.exists():
            output_dir_path.mkdir()

        with nlp.use_params(optimizer.averages):
            nlp.tokenizer = original_tokenizer
            nlp.to_disk(output_dir_path)
            print("Saved model to", output_dir_path)

        # test the saved model
        print("Loading from", output_dir_path)
        nlp2 = util.load_model_from_path(output_dir_path)
        nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

        metrics = evaluate_ner(nlp2, dev_data)
        if metrics["f1-measure-overall"] > best_f1:
            best_f1 = metrics["f1-measure-overall"]
            best_epoch = i
    # save model to output directory
    best_model_path = Path(output_dir + "/" + "best")
    print(f"Best Epoch: {best_epoch} of {n_iter}")
    if os.path.exists(best_model_path):
        shutil.rmtree(best_model_path)
    shutil.copytree(os.path.join(output_dir, str(best_epoch)), best_model_path)

    # test the saved model
    print("Loading from", best_model_path)
    nlp2 = util.load_model_from_path(best_model_path)
    nlp2.tokenizer = WhitespaceTokenizer(nlp.vocab)

    evaluate_ner(nlp2,
                 dev_data,
                 dump_path=os.path.join(output_dir, "dev_metrics.json"))
    evaluate_ner(nlp2,
                 test_data,
                 dump_path=os.path.join(output_dir, "test_metrics.json"))
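Example #36 assigns nlp.tokenizer = WhitespaceTokenizer(nlp.vocab), but the class itself is defined elsewhere in the original project. A minimal sketch consistent with that usage and with the spaCy v2 custom-tokenizer API (the real implementation may differ) looks like this:

from spacy.tokens import Doc

class WhitespaceTokenizer(object):
    """Tokenizer that splits the text on single spaces only."""

    def __init__(self, vocab):
        self.vocab = vocab

    def __call__(self, text):
        words = text.split(" ")
        # Assume every token is followed by a single space.
        spaces = [True] * len(words)
        return Doc(self.vocab, words=words, spaces=spaces)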
Example #37
0
    def _retrain_model(self):
        """
        Update an existing spaCy model with labelled training data.
        """

        # Load the model, set up the pipeline and train the entity recognizer:
        
        # Load existing spaCy model
        if not self.blank:
            # If this is a custom model, set the path to the directory
            if self.custom:
                self.base_model = self.path + self.base_model + "/"
            
            nlp = spacy.load(self.base_model)  
        # If the parameter blank=true is passed, we start with a blank Language class, e.g. 'en'
        else:
            nlp = spacy.blank(self.base_model)  

        # Debug information is printed to the terminal and logs if the parameter debug = true
        if self.debug:
            self._print_log(7)

        # create the built-in pipeline components and add them to the pipeline

        # nlp.create_pipe works for built-ins that are registered with spaCy
        if "ner" not in nlp.pipe_names:
            ner = nlp.create_pipe("ner")
            nlp.add_pipe(ner, last=True)
        # otherwise, get it so we can add labels
        else:
            ner = nlp.get_pipe("ner")

        # add labels
        for _, annotations in self.train:
            for ent in annotations.get("entities"):
                ner.add_label(ent[2])

        # Retrain the model:
        
        # get names of other pipes to disable them during training
        other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
        with nlp.disable_pipes(*other_pipes):  # only train NER
           
            # Setup lists to store the loss for each epoch
            self.losses_train = []
            self.losses_test = []
            
            # reset and initialize the weights randomly – but only if we're
            # training a new model
            if self.blank:
                nlp.begin_training()
            for epoch in range(self.epochs): 
                random.shuffle(self.train)
                losses = {}
                # batch up the examples using spaCy's minibatch
                batches = minibatch(self.train, size=self.batch_size)
                for batch in batches:
                    texts, annotations = zip(*batch)
                    nlp.update(
                        texts,  # batch of texts
                        annotations,  # batch of annotations
                        drop=self.drop,  # dropout - make it harder to memorise data
                        losses=losses,
                    )
                # Store loss for the epoch to a list
                self.losses_train.append(('Epoch {}'.format(epoch+1), losses['ner']))

                # Debug information is printed to the terminal and logs if the parameter debug = true
                if self.debug:
                    self._print_log(8)
                
                # If a test dataset is available, calculate losses for it as well
                if self.validation is not None:
                    losses = {}

                    # batch up the examples using spaCy's minibatch
                    batches = minibatch(self.validation, size=self.batch_size)
                    for batch in batches:
                        texts, annotations = zip(*batch)
                        # Get losses for the test data without updating the model 
                        nlp.update(
                            texts,  # batch of texts
                            annotations,  # batch of annotations
                            sgd = None,  # do not update model weights
                            losses=losses,
                        )
                    # Store loss for the epoch to a list
                    self.losses_test.append(('Epoch {}'.format(epoch+1), losses['ner']))

                    # Debug information is printed to the terminal and logs if the parameter debug = true
                    if self.debug:
                        self._print_log(9)

        # Save model to output directory:
        
        output_dir = pathlib.Path(self.path + self.model + '/')
        if not output_dir.exists():
            output_dir.mkdir(parents=True, exist_ok=False)
        nlp.to_disk(output_dir)

        # Debug information is printed to the terminal and logs if the parameter debug = true
        if self.debug:
            self._print_log(10)

        # Evaluate the model:

        # Prepare spaCy docs and golds for getting evaluation metrics
        docs_golds = []
        for sample in self.train:
            doc = nlp.make_doc(sample[0])
            gold = GoldParse(doc, entities=sample[1]["entities"])
            docs_golds.append((doc, gold))
        
        # Get scores for training data
        scorer_train = nlp.evaluate(docs_golds)
        # Add the training scores to evaluation metrics
        self.metrics = self._prep_scores(scorer_train)

        # Get scores for testing data and add to the evaluation metrics
        if self.validation is not None:
            docs_golds = []
            for sample in self.validation:
                doc = nlp.make_doc(sample[0])
                gold = GoldParse(doc, entities=sample[1]["entities"])
                docs_golds.append((doc, gold))
            
            scorer_test = nlp.evaluate(docs_golds)
            self.metrics = pd.concat([self.metrics, self._prep_scores(scorer_test, subset='test')], ignore_index=True)

        # Add loss metrics
        self.metrics = pd.concat([self.metrics, self._prep_losses(self.losses_train)], ignore_index=True)
        if self.validation is not None:
            self.metrics = pd.concat([self.metrics, self._prep_losses(self.losses_test, subset='test')], ignore_index=True)
Example #38
0
def main(json_data,
         model=None,
         output_dir=None,
         n_iter=10,
         n_texts=2000,
         init_tok2vec=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe("textcat",
                                  config={
                                      "exclusive_classes": True,
                                      "architecture": "simple_cnn"
                                  })
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # load the dataset
    print("Loading data...")
    Intents, (train_texts, train_cats), (dev_texts,
                                         dev_cats) = load_data(json_data)
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]
    print("Using {} examples ({} training, {} evaluation)".format(
        n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts, [{
        "cats": cats
    } for cats in train_cats]))

    # add label to text classifier
    for intent in Intents:
        textcat.add_label(intent)

    # get names of other pipes to disable them during training
    pipe_exceptions = ["textcat", "trf_wordpiecer", "trf_tok2vec"]
    other_pipes = [
        pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions
    ]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for _ in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(Intents, nlp.tokenizer, textcat, dev_texts,
                                  dev_cats)
            print("{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".
                  format(  # print a simple table
                      losses["textcat"],
                      scores["textcat_p"],
                      scores["textcat_r"],
                      scores["textcat_f"],
                  ))

    # test the trained model
    test_text = "add an item to todo list"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        # print("Loading from", output_dir)
        # nlp2 = spacy.load(output_dir)
        # doc2 = nlp2(test_text)
        # print(test_text, doc2.cats)
    return nlp
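load_data(json_data) is not shown in this snippet; judging from how its return values are used above, it is assumed to yield the list of intent labels plus (texts, cats) splits in which each cats dict scores every intent with 0.0 or 1.0. An illustrative (hypothetical) shape:

# Hypothetical illustration of the data shapes main() expects from load_data():
Intents = ["add_todo", "check_weather"]
train_texts = ["add an item to todo list", "what is the weather today"]
train_cats = [
    {"add_todo": 1.0, "check_weather": 0.0},
    {"add_todo": 0.0, "check_weather": 1.0},
]
dev_texts, dev_cats = train_texts, train_cats  # dev split shaped the same way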
Example #39
0
def nlp():
    return spacy.blank("en")
Example #40
0
def cargar_modelo(dim_modelo, lenguaje, maxima_longitud=None):
    """
    Carga y retorna un modelo de lenguaje de spaCy del tamaño y lenguaje \
    fijado por el usuario. Para mayor información sobre estos modelos se \
    puede consultar la página de spaCy (https://spacy.io/models/).

    :param dim_modelo: (str). Tamaño del modelo de spaCy que se desea \
        utilizar. Puede ser una de las siguientes opciones, sin hacer \
        distinción entre mayúsculas y minúsculas:
        |ul|  |li| pequeño: {'pequeño', 'pequeno', 'small', 's', 'sm'} |/li|
        |li| mediano: {'mediano', 'medio', 'md', 'medium', 'm'} |/li|
        |li| grande: {'grande', 'large', 'lg', 'gr'} |/li|  |/ul|

        Entre más grande sea el modelo es posible que tenga un soporte de \
        sus características para un vocabulario más amplio. También \
        aumentará el tamaño del archivo de cada modelo. Si un \
        modelo de determinado lenguaje y tamaño no se encuentra en el \
        computador del usuario, la función lo descargará. Una vez descargado \
        el modelo correspondiente, el usuario debe correr la función de nuevo.

    :param lenguaje: (str). Lenguaje para el que se desea cargar el modelo \
        de spaCy. spaCy tiene modelos disponibles para varios lenguajes. \
        Para mayor información, visitar https://spacy.io/models/
    :param maxima_longitud: (int), valor por defecto: None. Parámetro \
        opcional que permite establecer la máxima longitud (número de \
        caracteres) que acepta el vectorizador en un texto de entrada. \
        Si este valor se deja en None, se utilizará la máxima longitud que \
        trae Spacy por defecto (1 millón de caracteres).
    :return: Modelo de spaCy, del tamaño y lenguaje especificados. Si el \
        modelo requerido no está disponible en el computador del usuario, \
        la función descargará el modelo correspondiente, lo cual puede tardar \
        algunos minutos, dependiendo del tamaño de los modelos y la \
        velocidad de conexión a internet del usuario. Si este es el caso, \
        la función retornará un modelo en blanco, del lenguaje especificado \
        por el usuario. A partir de la siguiente vez que se corra la \
        función, esta retornará el modelo ya descargado.
    """
    dim_modelo = dim_modelo.lower()
    # Standardize the model size
    if dim_modelo in ["grande", "large", "lg", "gr"]:
        dim_modelo = "lg"
    elif dim_modelo in ["mediano", "medio", "md", "medium", "m"]:
        dim_modelo = "md"
    elif dim_modelo in ["pequeño", "pequeno", "small", "s", "sm"]:
        dim_modelo = "sm"
    # If the model size cannot be determined, return a blank model
    else:
        return spacy.blank(lenguaje)
    # Model to load, according to its size and language
    if lenguaje == "en":
        language_model = f"{lenguaje}_core_web_{dim_modelo}"
    else:
        language_model = f"{lenguaje}_core_news_{dim_modelo}"
    # Try to load the model
    try:
        modelo = spacy.load(language_model)
    # If that fails, try to download the model, or fall back to an empty one
    except BaseException:
        try:
            print(("[INFO] Descargando modelo. Este proceso puede "
                   "tardar varios minutos.\n"))
            os.system(f"python -m spacy download {language_model}")
            print("\n[INFO] El modelo ha sido descargado.")
            print(("[INFO] Por favor correr de nuevo el script, "
                   "o iniciar una nueva sesión de Python para cargarlo."))
            print("[INFO] Hasta entonces, se utilizará un modelo en blanco.\n")
        except BaseException:
            print(("\n[INFO] El modelo no pudo ser descargado, "
                   "se cargará un modelo vacío.\n"))
        modelo = spacy.blank(lenguaje)
    # Set the maximum length, if one was specified
    if maxima_longitud is not None:
        modelo.max_length = maxima_longitud
    # Return the model
    return modelo
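A short usage sketch for the function above; the model name it resolves for Spanish, es_core_news_md, is a real spaCy model, and the first call may trigger a download:

# Load the medium Spanish model, capping input length at 2 million characters.
nlp_es = cargar_modelo("mediano", "es", maxima_longitud=2_000_000)
doc = nlp_es("Este es un texto de prueba.")
print([token.text for token in doc])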
Example #41
0
def main(model=None, output_dir=None, n_iter=20, n_texts=2000, init_tok2vec=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()

    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "textcat" not in nlp.pipe_names:
        textcat = nlp.create_pipe(
            "textcat",
            config={
                "exclusive_classes": True,
                "architecture": "simple_cnn",
            }
        )
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe("textcat")

    # add label to text classifier
    textcat.add_label("POSITIVE")
    textcat.add_label("NEGATIVE")

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data()
    train_texts = train_texts[:n_texts]
    train_cats = train_cats[:n_texts]
    print(
        "Using {} examples ({} training, {} evaluation)".format(
            n_texts, len(train_texts), len(dev_texts)
        )
    )
    train_data = list(zip(train_texts, [{"cats": cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        if init_tok2vec is not None:
            with init_tok2vec.open("rb") as file_:
                textcat.model.tok2vec.from_bytes(file_.read())
        print("Training the model...")
        print("{:^5}\t{:^5}\t{:^5}\t{:^5}".format("LOSS", "P", "R", "F"))
        batch_sizes = compounding(4.0, 32.0, 1.001)
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            random.shuffle(train_data)
            batches = minibatch(train_data, size=batch_sizes)
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print(
                "{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}".format(  # print a simple table
                    losses["textcat"],
                    scores["textcat_p"],
                    scores["textcat_r"],
                    scores["textcat_f"],
                )
            )

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        with nlp.use_params(optimizer.averages):
            nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
Example #42
0
def main(model=None, new_model_name='animal', output_dir=None, n_iter=10):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)  # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts,
                           annotations,
                           sgd=optimizer,
                           drop=0.35,
                           losses=losses)
            print('Losses', losses)

    # test the trained model
    test_text = 'Do you like horses?'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
Example #43
0
def main(model=None, output_dir=None, n_iter=20, n_texts=2000):
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")

    # add the text classifier to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'textcat' not in nlp.pipe_names:
        textcat = nlp.create_pipe('textcat')
        nlp.add_pipe(textcat, last=True)
    # otherwise, get it, so we can add labels to it
    else:
        textcat = nlp.get_pipe('textcat')

    # add label to text classifier
    textcat.add_label('POSITIVE')

    # load the IMDB dataset
    print("Loading IMDB data...")
    (train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=n_texts)
    print("Using {} examples ({} training, {} evaluation)"
          .format(n_texts, len(train_texts), len(dev_texts)))
    train_data = list(zip(train_texts,
                          [{'cats': cats} for cats in train_cats]))

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
    with nlp.disable_pipes(*other_pipes):  # only train textcat
        optimizer = nlp.begin_training()
        print("Training the model...")
        print('{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F'))
        for i in range(n_iter):
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_data, size=compounding(4., 32., 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.2,
                           losses=losses)
            with textcat.model.use_params(optimizer.averages):
                # evaluate on the dev data split off in load_data()
                scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
            print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}'  # print a simple table
                  .format(losses['textcat'], scores['textcat_p'],
                          scores['textcat_r'], scores['textcat_f']))

    # test the trained model
    test_text = "This movie sucked"
    doc = nlp(test_text)
    print(test_text, doc.cats)

    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        print(test_text, doc2.cats)
Example #44
0
import blingfire
import nltk
import pysbd
import spacy
import stanza

import syntok
from syntok.tokenizer import Tokenizer
import syntok.segmenter as syntok_segmenter

from english_golden_rules import GOLDEN_EN_RULES

pysbd_segmenter = pysbd.Segmenter(language="en", clean=False, char_span=False)

nlp = spacy.blank('en')
nlp.add_pipe(nlp.create_pipe("sentencizer"))
nlp_dep = spacy.load('en_core_web_sm', disable=["ner"])
#stanza.download('en')
stanza_nlp = stanza.Pipeline(lang='en', processors='tokenize')

syntok_tokenizer = Tokenizer()

def blingfire_tokenize(text):
    return blingfire.text_to_sentences(text).split('\n')

def nltk_tokenize(text):
    return nltk.sent_tokenize(text)

def pysbd_tokenize(text):
    return pysbd_segmenter.segment(text)
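The benchmark above imports syntok.segmenter but this excerpt cuts off before the corresponding wrapper. A plausible sketch, assuming syntok.segmenter.process(text) yields paragraphs, each a list of sentences made of tokens with .spacing and .value attributes, is:

def syntok_tokenize(text):
    # Reassemble each syntok sentence into a plain string.
    sentences = []
    for paragraph in syntok_segmenter.process(text):
        for sentence in paragraph:
            sentences.append(
                "".join(token.spacing + token.value for token in sentence).strip())
    return sentences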
Example #45
0
import spacy
from ordered_set import OrderedSet
from spacy.tokenizer import Tokenizer
from spacy.lang.en import English
import torch
import ujson as json
import bisect
from typing import Dict
import random
import numpy as np
import linecache

# import nltk.data

import time

nlp = spacy.blank("en")  # the tokenizer
# nlp = nltk.data.load('tokenizers/punkt/english.pickle')
# nlp = English()
# tokenizer = Tokenizer(nlp.vocab)
# tokenizer = English().Defaults.create_tokenizer(nlp)
# justatest = []
wordlist = OrderedSet(
)  # an Ordered Set which will contain all words of the data once


def token(string):
    words = nlp(string)  # tokenizing of the sentence
    index = [
    ]  # an array for the index numbers, which will replace that sentence

    for i in words:  # for all words:
Example #46
0
import spacy

nlp = spacy.blank("ja")

# Import the Doc class
from spacy.tokens import Doc

# Text we want to create: 「spaCyは素晴らしい!」 ("spaCy is great!")
words = ["spaCy", "は", "素晴らしい", "!"]
spaces = [False, False, False, False]

# Create a Doc from the words and spaces
doc = Doc(nlp.vocab, words=words, spaces=spaces)
print(doc.text)
Example #47
0
import tensorflow as tf
import random
from tqdm import tqdm
import spacy
import ujson as json
from collections import Counter
import numpy as np

nlp = spacy.blank("en")


def word_tokenize(sent):
    doc = nlp(sent)
    return [token.text for token in doc]


def convert_idx(text, tokens):
    current = 0
    spans = []
    for token in tokens:
        current = text.find(token, current)
        if current < 0:
            print("Token {} cannot be found".format(token))
            raise Exception()
        spans.append((current, current + len(token)))
        current += len(token)
    return spans
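
# For illustration (hypothetical input): convert_idx("Hello world", ["Hello", "world"])
# would return [(0, 5), (6, 11)] -- the character span of each token in the text.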


def process_file(filename, data_type, word_counter, char_counter):
    print("Generating {} examples...".format(data_type))
Example #48
0
import spacy
import random
import json

with open("exercises/de/gadgets.json") as f:
    TRAINING_DATA = json.loads(f.read())

nlp = spacy.blank("de")
ner = nlp.create_pipe("ner")
nlp.add_pipe(ner)
ner.add_label("GADGET")

# Start the training
nlp.begin_training()

# Run the loop 10 times
for itn in range(10):
    # Shuffle the training data
    random.shuffle(TRAINING_DATA)
    losses = {}

    # Split the examples into batches and iterate over the batches
    for batch in spacy.util.minibatch(TRAINING_DATA, size=2):
        texts = [text for text, entities in batch]
        annotations = [entities for text, entities in batch]

        # Update the model
        nlp.update(texts, annotations, losses=losses)
    print(losses)
Example #49
0
import spacy
spacy.require_gpu()
from pathlib import Path
import random
from spacy.util import minibatch, compounding
#import xx_ent_wiki_sm
from spacy.lang.pt import Portuguese
from ast import literal_eval
import datetime
import time 

output_dir = "./sky_ner"

modelDir = Path(output_dir)

nlp = spacy.blank('pt')    
ts = time.time()
st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
print(st)
if modelDir.exists():

    # training data
    TRAIN_DATA = open('dataset_new.txt', 'r').read()
    print('Dados carregados')
    try:
        TRAIN_DATA = literal_eval(TRAIN_DATA)
        print('literal eval aplicado')
    except: 
        print('Falha ao aplicar eval')     
    
    if 'ner' not in nlp.pipe_names:
Example #50
0
# Import spaCy
import spacy

# Create the French nlp object
nlp = spacy.blank("fr")

# Process a text
doc = nlp("Ceci est une phrase.")

# Print the document's text
print(doc.text)
Example #51
0
def main(model=None, output_dir=None, n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly – but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        for text, _ in TRAIN_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
Example #52
0
def main(model=None, new_model_name="animal", output_dir=None, n_iter=30):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    random.seed(0)
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe("ner")

    ner.add_label(LABEL)  # add new entity label to entity recognizer
    # Adding extraneous labels shouldn't mess anything up
    ner.add_label("VEGETABLE")
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.resume_training()
    move_names = list(ner.move_names)
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        sizes = compounding(1.0, 4.0, 1.001)
        # batch up the examples using spaCy's minibatch
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            batches = minibatch(TRAIN_DATA, size=sizes)
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "Do you like horses?"
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta["name"] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        # Check the classes have loaded back consistently
        assert nlp2.get_pipe("ner").move_names == move_names
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
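LABEL and TRAIN_DATA are likewise defined outside this function in the original script. They are assumed to follow the same character-offset format, with every example annotated for the new label, for instance:

# Illustrative values (not necessarily the original data):
LABEL = "ANIMAL"
TRAIN_DATA = [
    ("Horses are too tall and they pretend to care about your feelings",
     {"entities": [(0, 6, LABEL)]}),
    ("Do they bite?", {"entities": []}),
]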