Example #1
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """

    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        try:
            spacy_model = spacy.load(spacy_model_name, disable=disable)
        except OSError:
            logger.warning(f"Spacy models '{spacy_model_name}' not found.  Downloading and installing.")
            spacy_download(spacy_model_name)
            spacy_model = spacy.load(spacy_model_name, disable=disable)

        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
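A minimal usage sketch for the cache above. It assumes `LOADED_SPACY_MODELS` is a module-level dict and `SpacyModelType` is simply an alias for spaCy's `Language` class; the snippet also relies on a `logger` and a `spacy_download` helper that are not shown here.

from typing import Dict, Tuple

from spacy.language import Language

# Assumed module-level setup (defined before get_spacy_model in the original module).
SpacyModelType = Language
LOADED_SPACY_MODELS: Dict[Tuple[str, bool, bool, bool], SpacyModelType] = {}

# Repeated calls with the same options reuse the cached pipeline.
nlp_a = get_spacy_model("en_core_web_sm", pos_tags=True, parse=False, ner=False)
nlp_b = get_spacy_model("en_core_web_sm", pos_tags=True, parse=False, ner=False)
assert nlp_a is nlp_b  # only one load per configuration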
Example #2
def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
    """Set up the pipeline and entity recognizer, and train the new entity."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank('en')  # create blank Language class
        print("Created blank 'en' model")
    # Add entity recognizer to model if it's not in the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    # otherwise, get it, so we can add labels to it
    else:
        ner = nlp.get_pipe('ner')

    ner.add_label(LABEL)   # add new entity label to entity recognizer
    if model is None:
        optimizer = nlp.begin_training()
    else:
        # Note that 'begin_training' initializes the models, so it'll zero out
        # existing entity types.
        optimizer = nlp.entity.create_optimizer()



    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # test the trained model
    test_text = 'Do you like horses?'
    doc = nlp(test_text)
    print("Entities in '%s'" % test_text)
    for ent in doc.ents:
        print(ent.label_, ent.text)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.meta['name'] = new_model_name  # rename model
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc2 = nlp2(test_text)
        for ent in doc2.ents:
            print(ent.label_, ent.text)
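`LABEL` and `TRAIN_DATA` are module-level names the snippet assumes (along with `random`, `Path`, and `spacy` imports); a minimal sketch of what they might look like in the spaCy v2 training format, where each entity is a (start_char, end_char, label) triple:

LABEL = 'ANIMAL'

TRAIN_DATA = [
    ("Horses are too tall and they pretend to care about your feelings",
     {'entities': [(0, 6, LABEL)]}),
    ("Do they bite?", {'entities': []}),
    ("horses pretend to care about your feelings",
     {'entities': [(0, 6, LABEL)]}),
]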
Example #3
def get_nlp(lang="en"):
    """Load spaCy model for a given language, determined by `models' dict or by MODEL_ENV_VAR"""
    instance = nlp.get(lang)
    if instance is None:
        import spacy
        model = models.get(lang)
        if not model:
            models[lang] = model = os.environ.get("_".join((MODEL_ENV_VAR, lang.upper()))) or \
                                   os.environ.get(MODEL_ENV_VAR) or DEFAULT_MODEL.get(lang, "xx")
        started = time.time()
        with external_write_mode():
            print("Loading spaCy model '%s'... " % model, end="", flush=True)
            try:
                nlp[lang] = instance = spacy.load(model)
            except OSError:
                spacy.cli.download(model)
                try:
                    nlp[lang] = instance = spacy.load(model)
                except OSError as e:
                    raise OSError("Failed to get spaCy model. Download it manually using "
                                  "`python -m spacy download %s`." % model) from e
            tokenizer[lang] = instance.tokenizer
            instance.tokenizer = lambda words: spacy.tokens.Doc(instance.vocab, words=words)
            print("Done (%.3fs)." % (time.time() - started))
    return instance
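Because the function swaps in a tokenizer that builds a `Doc` from pre-split words, callers pass token lists rather than raw strings. A minimal usage sketch, assuming spaCy 2.x and the module-level names the snippet references (`nlp`, `tokenizer`, `models`, `MODEL_ENV_VAR`, `DEFAULT_MODEL`, `external_write_mode`):

parser = get_nlp("en")
doc = parser(["This", "is", "a", "pre-tokenized", "sentence", "."])
print([(token.text, token.pos_) for token in doc])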
Example #4
    def _load_spacy(self, evaluation, options):
        language_code = None
        language_name = self.get_option(options, 'Language', evaluation)
        if language_name is None:
            language_name = String('Undefined')
        if isinstance(language_name, String):
            language_code = _SpacyBuiltin._language_codes.get(language_name.get_string_value())
        if not language_code:
            evaluation.message(self.get_name(), 'lang', language_name, strip_context(self.get_name()))
            return None

        instance = _SpacyBuiltin._spacy_instances.get(language_code)
        if instance:
            return instance

        try:
            if 'SPACY_DATA' in os.environ:
                instance = spacy.load(language_code, via=os.environ['SPACY_DATA'])
            else:
                instance = spacy.load(language_code)

            _SpacyBuiltin._spacy_instances[language_code] = instance
            return instance
        except RuntimeError as e:
            evaluation.message(self.get_name(), 'runtime', str(e))
            return None
Example #5
def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # add the parser to the pipeline if it doesn't exist
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "parser" not in nlp.pipe_names:
        parser = nlp.create_pipe("parser")
        nlp.add_pipe(parser, first=True)
    # otherwise, get it, so we can add labels to it
    else:
        parser = nlp.get_pipe("parser")

    # add labels to the parser
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_text = "I like securities."
    doc = nlp(test_text)
    print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        doc = nlp2(test_text)
        print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])
Example #6
 def __init__(self, model='en', disable=None):
     if disable is None:
         disable = []
     try:
         self._parser = spacy.load(model, disable=disable)
     except OSError:
         url = 'https://spacy.io/models'
         if license_prompt('Spacy {} model'.format(model), url) is False:
             sys.exit(0)
         spacy_download(model)
         self._parser = spacy.load(model, disable=disable)
Example #7
def get_ents():
    data = flask.request.args.get('fragment')
    is_custom = flask.request.args.get('custom')
    if is_custom is not None:
        nlp = spacy.load(Path('./gina_haspel'))
    else:
        nlp = spacy.load('en')
    doc = nlp(data)
    print(doc)
    tuples = [(str(x), x.label_)
              for x
              in doc.ents]
    return  flask.jsonify(dict(tuples))
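The snippet assumes a Flask app and a route decorator that are not shown; a minimal hypothetical wiring, assuming the spaCy `en` shortcut model is installed:

import flask

app = flask.Flask(__name__)

@app.route('/ents')
def ents_view():
    # Delegates to get_ents, which reads ?fragment= and ?custom= from the query string.
    return get_ents()

if __name__ == '__main__':
    app.run(debug=True)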
Example #8
def get_nlp():
    if nlp.instance is None:
        import spacy
        model_name = os.environ.get("SPACY_MODEL", "en")
        nlp.instance = spacy.load(model_name)
        if nlp.instance.tagger is None:  # Model not really loaded
            spacy.cli.download(model_name)
            nlp.instance = spacy.load(model_name)
            assert nlp.instance.tagger, "Failed to get spaCy model. " \
                                        "Download it manually using `python -m spacy download %s`." % model_name
        nlp.tokenizer = nlp.instance.tokenizer
        nlp.instance.tokenizer = nlp.tokenizer.tokens_from_list
    return nlp.instance
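Example #9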
def _spacy_en():
    yield None
    try:
        spacyen = spacy.load('en_default', via=data_path)
    except RuntimeError as e:
        if e.message == "Model not installed. Please run 'python -m spacy.en.download' to install latest compatible model.":
            print("Need to download Spacy data. Starting download now")
            sputnik.install('spacy', spacy.about.__version__,
                            'en_default', data_path=data_path)
            spacyen = spacy.load('en_default', via=data_path)
        else:
            raise
    while True:
        yield spacyen
Example #10
def main(model=None, output_dir=None, n_iter=15):
    """Load the model, set up the pipeline and train the parser."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # We'll use the built-in dependency parser class, but we want to create a
    # fresh instance – just in case.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    for text, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "parser"]
    with nlp.disable_pipes(*other_pipes):  # only train parser
        optimizer = nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # test the trained model
    test_model(nlp)

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)

        # test the saved model
        print("Loading from", output_dir)
        nlp2 = spacy.load(output_dir)
        test_model(nlp2)
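`test_model` is not shown in the snippet; a minimal hypothetical version, reusing the test sentence from Example #5:

def test_model(nlp):
    # Hypothetical helper: parse a sample sentence and print its dependency arcs.
    doc = nlp("I like securities.")
    print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

Example #11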
 def _load_nlp(self):
     if self.nlp is None:
         print("Loading SpacySentenceWordCache")
         nlp = spacy.load('en_core_web_lg')
         nlp.add_pipe(nlp.create_pipe('sentencizer'))
         self.nlp = nlp
     return self.nlp
Example #12
def main(lang, in_dir, out_loc, negative=5, n_workers=4, window=5, size=128, min_count=10, nr_iter=2):
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative
    )
    nlp = spacy.load(lang, parser=False, tagger=False, entity=False)
    corpus = Corpus(in_dir)
    total_words = 0
    total_sents = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with io.open(text_loc, 'r', encoding='utf8') as file_:
            text = file_.read()
        total_sents += text.count('\n')
        doc = nlp(text)
        total_words += corpus.count_doc(doc)
        logger.info("PROGRESS: at batch #%i, processed %i words, keeping %i word types",
                    text_no, total_words, len(corpus.strings))
    model.corpus_count = total_sents
    model.raw_vocab = defaultdict(int)
    for orth, freq in corpus.counts:
        if freq >= min_count:
            model.raw_vocab[nlp.vocab.strings[orth]] = freq
    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)
Example #13
def train(train_loc, dev_loc, shape, settings):
    train_texts1, train_texts2, train_labels = read_snli(train_loc)
    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)

    print("Loading spaCy")
    nlp = spacy.load('en')
    assert nlp.path is not None
    print("Compiling network")
    model = build_model(get_embeddings(nlp.vocab), shape, settings)
    print("Processing texts...")
    Xs = []
    for texts in (train_texts1, train_texts2, dev_texts1, dev_texts2):
        Xs.append(get_word_ids(list(nlp.pipe(texts, n_threads=20, batch_size=20000)),
                         max_length=shape[0],
                         rnn_encode=settings['gru_encode'],
                         tree_truncate=settings['tree_truncate']))
    train_X1, train_X2, dev_X1, dev_X2 = Xs
    print(settings)
    model.fit(
        [train_X1, train_X2],
        train_labels,
        validation_data=([dev_X1, dev_X2], dev_labels),
        nb_epoch=settings['nr_epoch'],
        batch_size=settings['batch_size'])
    if not (nlp.path / 'similarity').exists():
        (nlp.path / 'similarity').mkdir()
    print("Saving to", nlp.path / 'similarity')
    weights = model.get_weights()
    with (nlp.path / 'similarity' / 'model').open('wb') as file_:
        pickle.dump(weights[1:], file_)
    with (nlp.path / 'similarity' / 'config.json').open('w') as file_:
        file_.write(model.to_json())
Example #14
def train(train_texts, train_labels, dev_texts, dev_labels,
        lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5,
        by_sentence=True):
    nlp = spacy.load('en', entity=False)
    if 'nr_vector' not in lstm_shape:
        lstm_shape['nr_vector'] = max(lex.rank+1 for lex in vocab if lex.has_vector)
    print("Make model")
    model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings))
    print("Parsing texts...")
    if by_sentence:
        train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length'])
        dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length'])
    else:
        train_data = DocDataset(nlp, train_texts, train_labels)
        dev_data = DocDataset(nlp, dev_texts, dev_labels)
    train_iter = SerialIterator(train_data, batch_size=batch_size,
                                shuffle=True, repeat=True)
    dev_iter = SerialIterator(dev_data, batch_size=batch_size,
                              shuffle=False, repeat=False)
    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0)
    trainer = chainer.training.Trainer(updater, (20, 'epoch'), out='result')

    trainer.extend(extensions.Evaluator(dev_iter, model, device=0))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport([
        'epoch', 'main/accuracy', 'validation/main/accuracy']))
    trainer.extend(extensions.ProgressBar())
    
    trainer.run()
Example #15
def test_access_parse_for_merged():
    nlp = spacy.load('en_core_web_sm')
    t_t = nlp.tokenizer("Highly rated - I'll definitely")
    nlp.tagger(t_t)
    nlp.parser(t_t)
    nlp.parser(t_t)
    ss(t_t)
Example #16
 def __init__(self, model=None, output_dir=None, n_iter=50, threshold_success=None, successive_successes=3, input_data = "TRAINING_DATA.json", param='ner'):
     """Load the model, set up the pipeline and train the entity recognizer."""
     if model is not None:
         self.nlp = nlp = spacy.load(model)  # load existing spaCy model
         print("Loaded model '%s'" % model)
     else:
         self.nlp = nlp = spacy.load('de', parser=False)  # load default `de` model
         print("Loaded existing 'de' model")
     self.output_dir = output_dir
     self.n_iter = n_iter
     self.threshold_success = threshold_success
     self.successive_successes = successive_successes
     self.TRAIN_DATA = json.load(open(input_data))
 
     self.train(param)
     self.test()
Example #17
 def __init__(self,lang='en'):
     try:
         import spacy
     except:
         raise Exception("spacy not installed. Use `pip install spacy`.")
     super(SpaCy, self).__init__(name="spaCy")
     self.model = spacy.load('en')
Example #18
def main(argv):
    if not argv or len(argv) < 2:
        raise TypeError("not enough arguments. two are required")
    
    # Load Spacy's English tokenizer model
    print "Loading Spacy's English model"
    nlp = spacy.load('en')
   
    # Download two news articles from internet, supplied as urls in command line arguments
    utext1 = getOnlyText(argv[0])
    utext2 = getOnlyText(argv[1])

    # Use Spacy to tokenize documents, then remove stop words and non-alpha  
    print "Parsing files" 
    doc1 = filterText(nlp(utext1), nlp)   
    doc2 = filterText(nlp(utext2), nlp)

    # Similarity is estimated using the cosine metric, between Span.vector and other.vector. 
    # By default, Span.vector is computed by averaging the vectors of its tokens. [spacy.io]
    print "Document Vectors Similarity Score:", doc1.similarity(doc2)

    # Build Bag of Words with Spacy
    wordBag1 = doc1.count_by(LOWER)
    wordBag2 = doc2.count_by(LOWER)

    # Combine Bag of Words dicts in vector format, calculate cosine similarity of resulting vectors  
    vect = DictVectorizer(sparse=False)
    wordbagVectors = vect.fit_transform([wordBag2, wordBag1])
    score = 1 - spatial.distance.cosine(wordbagVectors[0], wordbagVectors[1])
    print "Bag of Words Cosine Similarity Score:", score
    return score
Example #19
File: nlp.py Project: csvance/FTBot
def create_nlp_instance():
    import spacy
    from spacymoji import Emoji

    nlp = spacy.load('en')
    emoji_pipe = Emoji(nlp)
    nlp.add_pipe(emoji_pipe, first=True)

    # Merge hashtag tokens which were split by spacy
    def hashtag_pipe(doc):
        merged_hashtag = False
        while True:
            for token_index, token in enumerate(doc):
                if token.text == '#':
                    if token.head is not None:
                        start_index = token.idx
                        end_index = start_index + len(token.head.text) + 1
                        if doc.merge(start_index, end_index) is not None:
                            merged_hashtag = True
                            break
            if not merged_hashtag:
                break
            merged_hashtag = False
        return doc

    nlp.add_pipe(hashtag_pipe)
    return nlp
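A brief usage sketch for the factory above; `doc.merge` and passing component objects to `nlp.add_pipe` are spaCy 1.x/2.x APIs, so this assumes a spaCy 2.x install with the `en` shortcut model and the `spacymoji` package available:

nlp = create_nlp_instance()
doc = nlp("I love #machinelearning and #nlp")
# Hashtags should come out as single merged tokens such as '#machinelearning'.
print([token.text for token in doc])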
Example #20
def main():
    train, dev = datasets.imdb()
    train_X, train_y = zip(*train)
    dev_X, dev_y = zip(*dev)
    model = LinearModel(2)
    train_y = to_categorical(train_y, nb_classes=2)
    dev_y = to_categorical(dev_y, nb_classes=2)

    nlp = spacy.load("en")
    train_X = [
        model.ops.asarray([tok.orth for tok in doc], dtype="uint64")
        for doc in nlp.pipe(train_X)
    ]
    dev_X = [
        model.ops.asarray([tok.orth for tok in doc], dtype="uint64")
        for doc in nlp.pipe(dev_X)
    ]
    dev_X = preprocess(model.ops, dev_X)
    with model.begin_training(train_X, train_y, L2=1e-6) as (trainer, optimizer):
        trainer.dropout = 0.0
        trainer.batch_size = 512
        trainer.nb_epoch = 3
        trainer.each_epoch.append(lambda: print(model.evaluate(dev_X, dev_y)))
        for X, y in trainer.iterate(train_X, train_y):
            keys_vals_lens = preprocess(model.ops, X)
            scores, backprop = model.begin_update(keys_vals_lens, drop=trainer.dropout)
            backprop(scores - y, optimizer)
    with model.use_params(optimizer.averages):
        print(model.evaluate(dev_X, dev_y))
Example #21
    def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None, use_no_coref_list=True, debug=False):
        self.greedyness = greedyness
        self.max_dist = max_dist
        self.max_dist_match = max_dist_match
        self.debug = debug
        
        if nlp is None:
            print("Loading spacy model")
            try:
                spacy.info('en_core_web_sm')
                model = 'en_core_web_sm'
            except IOError:
                print("No spacy 2 model detected, using spacy1 'en' model")
                model = 'en'
            nlp = spacy.load(model)
        
        model_path = os.path.join(PACKAGE_DIRECTORY, "weights/conll/" if conll is not None else "weights/")
        embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
        print("loading model from", model_path)
        self.data = Data(nlp, model_path=embed_model_path, conll=conll, use_no_coref_list=use_no_coref_list, consider_speakers=conll)
        self.coref_model = Model(model_path)

        self.clusters = {}
        self.mention_to_cluster = []
        self.mentions_single_scores = {}
        self.mentions_single_features = {}
        self.mentions_pairs_scores = {}
        self.mentions_pairs_features = {}
Example #22
def train(train_loc, dev_loc, shape, settings):
    train_texts1, train_texts2, train_labels = read_snli(train_loc)
    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)

    print("Loading spaCy")
    nlp = spacy.load("en_vectors_web_lg")
    assert nlp.path is not None
    print("Processing texts...")
    train_X = create_dataset(nlp, train_texts1, train_texts2, 100, shape[0])
    dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, shape[0])

    print("Compiling network")
    model = build_model(get_embeddings(nlp.vocab), shape, settings)

    print(settings)
    model.fit(
        train_X,
        train_labels,
        validation_data=(dev_X, dev_labels),
        epochs=settings["nr_epoch"],
        batch_size=settings["batch_size"],
    )
    if not (nlp.path / "similarity").exists():
        (nlp.path / "similarity").mkdir()
    print("Saving to", nlp.path / "similarity")
    weights = model.get_weights()
    # remove the embedding matrix.  We can reconstruct it.
    del weights[1]
    with (nlp.path / "similarity" / "model").open("wb") as file_:
        pickle.dump(weights, file_)
    with (nlp.path / "similarity" / "config.json").open("w") as file_:
        file_.write(model.to_json())
Example #23
    def __init__(self, vdict_path, adict_path, \
        batchsize=128, max_length=15, n_ans_vocabulary=1000, mode='train', data_shape=(2048)):

        self.batchsize = batchsize
        self.d_vocabulary = None
        self.batch_index = None
        self.batch_len = None
        self.rev_adict = None
        self.max_length = max_length
        self.n_ans_vocabulary = n_ans_vocabulary
        self.mode = mode
        self.data_shape = data_shape

        assert self.mode == 'test'

        # load vocabulary
        with open(vdict_path,'r') as f:
            vdict = json.load(f)
        with open(adict_path,'r') as f:
            adict = json.load(f)
        self.n_vocabulary, self.vdict = len(vdict), vdict
        self.n_ans_vocabulary, self.adict = len(adict), adict

        self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
        self.glove_dict = {} # word -> glove vector
Example #24
def tokenizeText(sample,parser=spacy.load('en')):

    # get the tokens using spaCy
    tokens = parser(sample)

    # lemmatize
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip()
                      if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # stoplist the tokens
    tokens = [tok for tok in tokens if tok not in STOPLIST]

    # stoplist symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    return tokens
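`STOPLIST` and `SYMBOLS` are assumed module-level names; one plausible way to define them, plus a call (the default `parser` argument loads the `en` shortcut model at import time, so a spaCy 2.x install is assumed):

import string

from spacy.lang.en.stop_words import STOP_WORDS

STOPLIST = set(STOP_WORDS)
SYMBOLS = set(string.punctuation)

print(tokenizeText("The striped bats were hanging on their feet."))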
Example #25
def main(model_dir=None):
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print('please run: `python -m spacy.en.download --force all` for better performance')
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    train_data = [
        (
            'Who is Shaka Khan?',
            [(len('Who is '), len('Who is Shaka Khan'), 'PERSON')]
        ),
        (
            'I like London and Berlin.',
            [(len('I like '), len('I like London'), 'LOC'),
            (len('I like London and '), len('I like London and Berlin'), 'LOC')]
        )
    ]
    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])

    doc = nlp.make_doc('Who is Shaka Khan?')
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)

    if model_dir is not None:
        save_model(ner, model_dir)
Example #26
def test_not_lemmatize_base_forms():
    nlp = spacy.load('en', parser=False)
    doc = nlp(u"Don't feed the dog")
    feed = doc[2]
    feed.tag_ = u'VB'
    assert feed.text == u'feed'
    assert feed.lemma_ == u'feed'
Example #27
def load_spacy(name, **kwargs):
    """
    Load a language-specific spaCy pipeline (collection of data, models, and
    resources) for tokenizing, tagging, parsing, etc. text; the most recent
    package loaded is cached.

    Args:
        name (str): standard 2-letter language abbreviation for a language;
            currently, spaCy supports English ('en') and German ('de')
        **kwargs: keyword arguments passed to :func:`spacy.load`; see the
            `spaCy docs <https://spacy.io/docs#english>`_ for details

            * via (str): non-default directory from which to load package data
            * vocab
            * tokenizer
            * parser
            * tagger
            * entity
            * matcher
            * serializer
            * vectors

    Returns:
        :class:`spacy.<lang>.<Language>`

    Raises:
        RuntimeError: if package can't be loaded
    """
    logger.info('Loading "%s" language spaCy pipeline', name)
    return spacy.load(name, **kwargs)
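A usage sketch for the wrapper above, passing one of the documented keyword arguments through to the spaCy 1.x-era `spacy.load`:

# Load the English pipeline without the parser, e.g. for tagging-only work.
nlp = load_spacy('en', parser=False)
doc = nlp('London is a big city in the United Kingdom.')
print([(tok.text, tok.tag_) for tok in doc])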
Example #28
 def __init__(self, lemmatize_it=True, stem_it=True, normalize_it=True):
     self.lemmatize_it = lemmatize_it
     self.stem_it = stem_it
     self.normalize_it = normalize_it
     self.parser = spacy.load('en')
     self.stemmer = gensim.parsing.PorterStemmer()
     self.stops = set(nltk.corpus.stopwords.words('english'))
Example #29
def load_model(model_dir):
    model_dir = pathlib.Path(model_dir)
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    with (model_dir / 'vocab' / 'strings.json').open('r', encoding='utf8') as file_:
        nlp.vocab.strings.load(file_)
    nlp.vocab.load_lexemes(model_dir / 'vocab' / 'lexemes.bin')
    ner = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
    return (nlp, ner)
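Example #30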
 def __init__(self, encoder_path, bpe_path):
     self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
     self.encoder = json.load(open(encoder_path))
     self.decoder = {v:k for k,v in self.encoder.items()}
     merges = open(bpe_path).read().split('\n')[1:-1]
     merges = [tuple(merge.split()) for merge in merges]
     self.bpe_ranks = dict(zip(merges, range(len(merges))))
     self.cache = {}
Example #31
print("Top 10 keywords and their Score")

# In[17]:

for i, j, n in zip(scored_key, scored_value, range(10)):
    print(i, " \t:\t", j)

# ### Topic Mining from The Keywords and Emotions/Sentiments Associated with it

# Using <b>spaCy</b> for the same

# In[18]:
print("IMPORTING DICTIONARY AND OTHEL NLP TOOLS ")
import spacy
nlp = spacy.load('en_core_web_md')

# <b>Retrieving the generic Domain Specific Buckets</b>
#
#     Basic domain knowledge goes into our model here, identifying key categories to look into

# In[16]:

# topic_list={'OVERALL':['RECOMMEND', 'EXPERIANCE','VALUE']}
# topic_list.update({'METRO':['METRO', 'METRO RAIL','TRAIN','RAIL','METRO STATION']})
# topic_list.update({'SERVICES':['CUSTOMER CARE', 'COMPLINTS','SECURITY','SMART CARD','PAYMENT','METRO TICKET']})
# topic_list.update({'BUS':['BUS TICKET', 'CONDUCTOR','BUS','PAYMENT','TICKET']})
# topic_list.update({'OTHERS':['OTHERS']})
# topic_list.update({'OPERATIONAL':['SCHEDULE', 'TIMINGS','RELIABILITY']})
# topic_list.update({'LOGISTICS':['BUS STOPS', 'BUS STATIONS','METRO STATIONS','BUS SEATS','SEATS','TICKETING MACHIENE']})
Example #32
from NER import *
import time
import telebot
import spacy
import os
import json
 
output_dir = os.getcwd()
MODEL = spacy.load(output_dir)

TOKEN = "881147208:AAGDY-ZvgqonfxS12Dn3GDPubCl4jiJtPJA"
bot = telebot.TeleBot(token=TOKEN)

FLAG = False
FLAG2 = True

diseasess = []
potential_diseas = []
things_to_ask = []

greeting_words = ["hi", "hello", "hey", "helloo", "hellooo",
                  "g morining", "gmorning", "good morning", "morning",
                  "good day", "good afternoon", "good evening", "greetings",
                  "greeting", "good to see you", "its good seeing you",
                  "how are you", "how're you", "how are you doing",
                  "how ya doin'", "how ya doin", "how is everything",
                  "how is everything going", "how's everything going",
                  "how is you", "how's you", "how are things",
                  "how're things",
                  "how is it going", "how's it going", "how's it goin'",
                  "how's it goin", "how is life been treating you",
Example #33
import json
import spacy
import numpy as np

nlp = spacy.load("en_vectors_web_lg")


def get_vectors(wordlist):
    tokens = nlp(" ".join(str(s) for s in wordlist))
    pageVec = np.array([])
    for token in tokens:
        wordVec = token.vector
        wordVec = wordVec.reshape(1, 300)
        if pageVec.size == 0:
            pageVec = wordVec
        else:
            pageVec = np.concatenate([pageVec, wordVec], axis=0)
    return pageVec

def get_weighted_vectors(wordlist, scores):
    tokens = nlp(" ".join(str(s) for s in wordlist))
    pageVec = np.array([])
    for token, score in zip(tokens, scores):
        wordVec = np.multiply(token.vector,np.asarray(score))
        wordVec = wordVec.reshape(1,300)
        if pageVec.size == 0:
            pageVec = wordVec
        else:
            pageVec = np.concatenate([pageVec,wordVec],axis = 0)
        
    return pageVec
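A small usage sketch; both helpers return an array of shape (n_tokens, 300) that can then be pooled into a single page vector:

words = ["data", "science", "pipeline"]

vectors = get_vectors(words)
print(vectors.shape)                    # roughly (3, 300), depending on tokenization
page_embedding = vectors.mean(axis=0)   # simple average pooling

weighted = get_weighted_vectors(words, scores=[0.5, 1.0, 0.25])
print(weighted.shape)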
Example #34
def train_spacy():

    TRAIN_DATA = trim_entity_spans(
        convert_dataturks_to_spacy("traindata.json"))

    # nlp = spacy.blank('en')  # create blank Language class at start

    nlp = spacy.load("./model")  #load the model to add up to it
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
    # test the model and evaluate it
    examples = convert_dataturks_to_spacy("testdata.json")
    c = 0
    for text, annot in examples:

        # f = open("resumes"+str(c)+".txt", "w")
        doc_to_test = nlp(text)

        # d = {}
        # for ent in doc_to_test.ents:
        #     d[ent.label_] = []
        # for ent in doc_to_test.ents:
        #     d[ent.label_].append(ent.text)
        #
        # for i in set(d.keys()):
        #
        #     f.write("\n\n")
        #     f.write(i + ":" + "\n")
        #     for j in set(d[i]):
        #         f.write(j.replace('\n', '')+"\n")
        d = {}
        for ent in doc_to_test.ents:
            d[ent.label_] = [0, 0, 0, 0, 0, 0]
        for ent in doc_to_test.ents:
            doc_gold_text = nlp.make_doc(text)
            gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
            y_true = [
                ent.label_ if ent.label_ in x else 'Not ' + ent.label_
                for x in gold.ner
            ]
            y_pred = [
                x.ent_type_ if x.ent_type_ == ent.label_ else 'Not ' +
                ent.label_ for x in doc_to_test
            ]
            if (d[ent.label_][0] == 0):
                print("For Entity " + ent.label_ + "\n")
                print(classification_report(y_true, y_pred) + "\n")
                (p, r, f,
                 s) = precision_recall_fscore_support(y_true,
                                                      y_pred,
                                                      average='weighted')
                a = accuracy_score(y_true, y_pred)
                d[ent.label_][0] = 1
                d[ent.label_][1] += p
                d[ent.label_][2] += r
                d[ent.label_][3] += f
                d[ent.label_][4] += a
                d[ent.label_][5] += 1
        c += 1

        for i in d:
            print("\n For Entity " + i + "\n")
            print("Accuracy : " + str((d[i][4] / d[i][5]) * 100) + "%")
            print("Precision : " + str(d[i][1] / d[i][5]))
            print("Recall : " + str(d[i][2] / d[i][5]))
            print("F-score : " + str(d[i][3] / d[i][5]))

    nlp.to_disk("model")
Example #35
import numpy as np
from sklearn.datasets import fetch_20newsgroups # import the dataset
import spacy # import spacy for nlp preprocessing
from gensim import corpora, models # import classes for creating bag of words & tf-idf
import re # import regex module
import pandas as pd 
from pprint import pprint
import pyLDAvis # import this package for plotting topic model
import pyLDAvis.gensim # import this class to plot based on gensim
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning) # turn off warnings

dataset = fetch_20newsgroups(subset='train', shuffle=True) # load the train data including the labels

nlp = spacy.load('en_core_web_sm') # load the spacy 'en_core_web_sm' model
data = dataset.data # extract only the data
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data] # eliminate e-mail strings from the dataset using the sub method of regex module

texts = [] # create an empty list to store the preprocessed data
allowed_pos = ['ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB'] # only the words having the parts of speech will be included in the dataset
for document in data: # looping over the documents one by one
    text = []
    doc = nlp(document) # return a spacy document object for processing
    for w in doc: # looping over single tokens of the document object
        if  (not w.is_stop) and (not w.is_punct) and (not w.like_num) and (w.pos_ in allowed_pos): # eliminating stop words, punctuations, numbers and the words whose POS is not included on the list described above using features provided by spacy
                text.append(w.lemma_) # take only the lemma of the word
    texts.append(text) # append the documents one by one to the 'texts' list created earlier

dictionary = corpora.Dictionary(texts) # create a bag of words model class using the Dictionary class of gensim
corpus = [ dictionary.doc2bow(text) for text in texts] # create the bag of words corpus off the Dictionary class
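The snippet stops after building `dictionary` and `corpus`, but the imports (gensim's `models`, pyLDAvis) point at topic modelling as the next step; a minimal hypothetical continuation:

# Hypothetical continuation: fit an LDA topic model on the bag-of-words corpus.
lda_model = models.LdaModel(corpus=corpus, id2word=dictionary,
                            num_topics=10, passes=5, random_state=42)
pprint(lda_model.print_topics(num_words=8))
# vis = pyLDAvis.gensim.prepare(lda_model, corpus, dictionary)  # optional interactive plot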
Example #36

#turk-eng:
#sentence = "Allaha ve Peygamberine kim inanmamışsa bilsin ki şüphesiz Biz inkarcılar için çılgın alevli cehennemi hazırlamışızdır"
#sentence_eng = "And whoever does not accept faith in Allah and His Noble Messenger – We have indeed kept prepared a blazing fire for disbelievers"


#azer-turkish
#sentence = "O yer üzünü sizin üçün beşik etmiş orada sizin üçün yollar salmış və göydən su endirmişdir"
#sentence_turk = "O ki yeri size beşik yaptı ve onda sizin için yollar açtı gökten bir su indirdi"



#sentence = sentence.split()
#python -m spacy download en
spacy_eng = spacy.load("en")


tokenize_custom = lambda x: x.split()

def tokenize_eng(text):
    return [tok.text for tok in spacy_eng.tokenizer(text)]


src_tokenizer = None
trg_tokenizer = None

if set_source_to == "azerbaijani" or set_source_to == "turkish":
    src_tokenizer = tokenize_custom
elif set_source_to == "english":
    src_tokenizer = tokenize_eng
Example #37
    # TODO: improve the ementa extraction
    ementa = re.search('provido', tempo)
    if fim == None or ementa == None:
        return None
    return sent[ementa.start()+7:fim.start()+14]

def relatorios(n, cases):
    final = []
    i = 0
    while i < n:
        if getRelatorio(cases.julgado[i]) != None:
            final.append(getRelatorio(cases.julgado[i]))
        i += 1
    return final

npl = spacy.load("pt_core_news_sm")
relat = relatorios(1000, train_df)
relat[6]

train_df.resultado[range(50)]

opcoes = ['provido', 'improvido', 'parcial', 'não conhecido', 'desconhecido']

i=0
relat_npl = []
while i < 4000:
    x = npl(relat[i])
    relat_npl.append(x)
    i = i+1
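Example #38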

import os
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# use spacy small model
# nlp = en_core_web_lg.load()
import spacy
nlp = spacy.load('en_core_web_lg')
from lists_patterns import load_lists,fpath

# dependency markers for subjects
SUBJECTS = {"nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"}
# dependency markers for objects
OBJECTS = {"dobj", "dative", "attr", "oprd","pobj"}
# POS tags that will break adjoining items
BREAKER_POS = {"CCONJ", "VERB"}
# words that are negations
NEGATIONS = {"no", "not", "n't", "never", "none"}


# does dependency set contain any coordinating conjunctions?
def contains_conj(depSet):
    return "and" in depSet or "or" in depSet or "nor" in depSet or \
Example #39
def getNouns(raw_parsed_result):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(raw_parsed_result)
    nouns = [chunk.text for chunk in doc.noun_chunks]
    return nouns
Example #40
            day = int(tokens[0])
            year = 2020
        else:
            day = int(tokens[0])
            month = int(tokens[1])
            year = 2020
    elif len(tokens) == 1:
        day = int(tokens[0])
        year = 2020
    return (day, month, year)


# !python3 -m spacy download en_core_web_lg
import spacy

sp_lg = spacy.load('en_core_web_lg')
import nltk
import re

query_date = {}
for idx, row in enumerate(rows):
    date_occurences = [(ent.text.strip(), ent.label_)
                       for ent in sp_lg(row).ents if ent.label_ == 'DATE']
    query_date[row] = []

    for date in date_occurences:
        try:
            date_token = re.split('\s+|/|-|:', date[0])
            day, month, year = decode_date(date_token)
            query_date[row].append((f'{year}-{month}-{day}'))
            row = row.replace(date[0], "", 1)
Example #41
def load_spacy(language):

    model = "en_core_web_lg" if language == "en" else "xx_ent_wiki_sm"

    return spacy.load(model)
Example #42
# stdlib
import re

# third party
import toml
import spacy as sp
import toolz as fp
import pandas as pd

# load spacy model
print(
    "[WARN] Loading `en_core_web_md` model from spacy. Might take a few seconds."
)
nlp = sp.load("en_core_web_md")

# load in default_regex strings
with open("optimus/etc/regexes") as handle:
    replace = toml.load(handle)


# helper for regex
def default_cleaner(string, regex_dict=replace):
    """
    default_cleaner(string, regex_dict={"regex":"replacement"})

    A default cleaner for text. The goal for this is to remove
    the unnecessary words and other things such as numbers from the
    text. The default dictionary for this is in the ``optimus/etc/regexes``
    file.

    A version of this function that maps across a list exists
Example #43
def main():

    import numpy as np
    import pickle
    import pandas as pd
    import streamlit as st
    import tweepy
    import pandas as pd
    import re
    import emoji
    import nltk
    import datetime
    import spacy
    from nltk.corpus import stopwords
    from nltk.stem.wordnet import WordNetLemmatizer
    import string

    st.title("Streamlit (Topic_Modelling App)")
    html_temp = """    <div style="background-color:tomato;padding:10px">
    <h1 style="color:white;text-align:center;">Topic_Modelling</h1>
    </div>"""
    st.markdown(html_temp, unsafe_allow_html=True)

    # Creating sidebar date inputs for the search window

    Date1 = st.sidebar.date_input(
        'start date',
        datetime.date.today() - datetime.timedelta(days=7))
    Date2 = st.sidebar.date_input('end date', datetime.date.today())

    # set variables for keys and tokens to access the Twitter API
    mykeys = open('API Twitter.txt', 'r').read().splitlines()
    api_key = mykeys[0]
    api_key_secret = mykeys[1]
    access_token = mykeys[2]
    access_token_secret = mykeys[3]

    auth = tweepy.OAuthHandler(consumer_key=api_key,
                               consumer_secret=api_key_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True)

    # Fetching the data from Twitter
    search_words = "news"
    date_since = Date1
    data_until = Date2
    tweets = tweepy.Cursor(api.search,
                           q=search_words,
                           lang="en",
                           tweet_mode='extended',
                           since=date_since,
                           until=data_until,
                           result_type="recent").items(300)

    # Collect tweets
    tweets = tweepy.Cursor(api.search,
                           q=search_words,
                           lang="en",
                           since=date_since).items(300)

    # Iterate and print tweets
    s = []
    for tweet in tweets:
        s.append(tweet.text)
    print(s)
    df = pd.DataFrame({'tweet': s})
    import nltk
    words = set(nltk.corpus.words.words())

    tweet = np.array(df.tweet)
    cleaned_tweet = []
    for i in df.tweet:
        no_punc_text = i.translate(str.maketrans('', '', string.punctuation))
        no_punc_text = re.sub("(RT)?(ht)?", "",
                              no_punc_text)  # to remove RT and ht word
        no_punc_text1 = re.sub(
            "[\W\d]", " ",
            no_punc_text)  #to remove not word character and numbers
        no_punc_text2 = re.sub(
            "[^a-zA-Z]", " ",
            no_punc_text1)  # to remove foreign-language word characters
        no_punc_text2=" ".join(w for w in nltk.wordpunct_tokenize(no_punc_text2) \
          if w.lower() in words or not w.isalpha())
        cleaned_tweet.append(no_punc_text2)
    df['cleaned_tweet'] = cleaned_tweet

    df1 = df.copy()
    corpus = df1.cleaned_tweet.unique()

    # import vectorizers
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

    # import numpy for matrix operation
    import numpy as np

    # import LDA from sklearn
    from sklearn.decomposition import LatentDirichletAllocation

    #nltk.download('wordnet')
    # Lemmatize with POS Tag
    from nltk.corpus import wordnet
    import nltk

    #nltk.download('averaged_perceptron_tagger')

    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV
        }

        return tag_dict.get(tag, wordnet.NOUN)

    # Apply Preprocessing on the Corpus

    # stop words
    stop = set(stopwords.words('english'))
    stop.update([
        "new", "news", 'via', 'take', 'first', 'one', 'say', 'time', 'big',
        'see', 'come', 'good', 'another', 'today', 'make', 'get', 'great',
        'could', 'like', 'make', 'set', 'end', 'dont'
    ])
    # punctuation
    exclude = set(string.punctuation)

    # lemmatization
    lemma = WordNetLemmatizer()

    # One function for all the steps:
    def clean(doc):

        # convert text into lower case + split into words
        stop_free = " ".join([i for i in doc.lower().split() if i not in stop])

        # remove any stop words present
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)

        # remove punctuations + normalize the text
        normalized = " ".join(
            lemma.lemmatize(word, get_wordnet_pos(word))
            for word in punc_free.split())
        return normalized


# clean data stored in a new list

    clean_corpus = [clean(doc).split() for doc in corpus]

    corpus1 = []
    for i in clean_corpus:
        doc = []
        #j=i.split()
        for z in i:
            #print(len(z))
            if len(z) > 2:
                doc.append(z)
        #print(doc)
        doc = " ".join(doc)
        doc1 = doc.split()
        #print(doc1)
        corpus1.append(doc1)
    clean_corpus = corpus1

    abc = []  #to create single list
    for i in clean_corpus:
        abc.append(' '.join(i))

    abc2 = " ".join(abc)

    nlp = spacy.load('en_core_web_sm')
    one_block = abc2
    doc_block = nlp(one_block)

    #collecting 'PROPN','X','NOUN','ADJ' words
    final_corpus = [
        token.text for token in doc_block
        if token.pos_ in ('PROPN', 'X', 'NOUN', 'ADJ')
    ]
    imp_words = set(final_corpus)

    # to remove the meaningless words
    #doc=[]
    corpus1 = []
    for i in clean_corpus:
        doc = []
        #j=i.split()
        for z in i:
            #print(len(z))
            if z in imp_words:
                doc.append(z)
        #print(doc)
        doc = " ".join(doc)
        doc1 = doc.split()
        #print(doc1)
        corpus1.append(doc1)
    new_clean_corpus = corpus1

    # Converting text into numerical representation
    tf_idf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc,
                                        lowercase=False)

    # Converting text into numerical representation
    cv_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)

    # Array from TF-IDF Vectorizer
    tf_idf_arr = tf_idf_vectorizer.fit_transform(clean_corpus)

    # Array from Count Vectorizer
    cv_arr = cv_vectorizer.fit_transform(clean_corpus)
    # Materialize the sparse data
    data_dense = cv_arr.todense()

    # Compute sparsity = percentage of non-zero cells
    print("Sparsity: ", ((data_dense > 0).sum() / data_dense.size) * 100,
          "%")

    # Creating vocabulary array which will represent all the corpus
    vocab_tf_idf = tf_idf_vectorizer.get_feature_names()

    # Creating vocabulary array which will represent all the corpus
    vocab_cv = cv_vectorizer.get_feature_names()

    result = ""
    if st.button("Search"):
        result = Predict_Topics(cv_arr, vocab_tf_idf)
    st.success(st.write(result))
Example #44
@author: nirmalenduprakash
Identifies important entities in a document and discovers sentiment towards these 
entities
"""

import spacy
import numpy as np
import pandas as pd

#!pip install git+https://github.com/huggingface/neuralcoref.git
import nltk
#nltk.download('punkt')
import neuralcoref
from wordcloud import WordCloud, STOPWORDS

nlp = spacy.load("en_core_web_sm")
stopwords = set(STOPWORDS)
pronouns = ['i', 'he', 'she', 'you', 'we', 'they', 'them', 'it', 'his', 'who']


def read_file(filepath):
    df = pd.read_table(filepath, sep=' ')
    if ('POS_TAGS' in df.columns):
        df.drop(['POS_TAGS'], inplace=True, axis=1)
    return df


df_ug = read_file(
    '/Users/nirmalenduprakash/Documents/Project/NLP/Sentiment Mining/IBM_Debater_(R)_SC_COLING_2018/LEXICON_UG.txt'
)
df_bg = read_file(
Example #45
    def __init__(self):
        path = os.path.dirname(os.path.realpath(__file__))
        self.df = pd.read_csv(os.path.join(path, "../data/countries.csv"))
        self.utils = nlpUtils()
        self.nlp = spacy.load("en_core_web_sm")
        self.nationality_matcher = Matcher(self.nlp.vocab)
        nat_pattern = list()
        nat_pattern.append([{
            'LEMMA': 'be'
        }, {
            'POS': 'DET'
        }, {
            'ENT_TYPE': {
                "IN": ["GPE", "NORP", "LANGUAGE"]
            },
            'OP': "*"
        }, {
            'POS': {
                "IN": ["NOUN", "PROPN", "PUNCT", "ADJ", "SYM"]
            },
            "OP": "*"
        }, {
            'POS': {
                "IN": ["NOUN", "PROPN", "ADJ"]
            },
            "OP": "+"
        }])
        nat_pattern.append([{
            'LEMMA': 'be'
        }, {
            'POS': 'DET'
        }, {
            'ENT_TYPE': {
                "IN": ["GPE", "NORP", "LANGUAGE"]
            },
            'OP': "*"
        }, {
            "DEP": {
                "IN": ["punct", "compound", "amod", "nmod"]
            },
            "OP": "*"
        }, {
            'POS': 'NOUN'
        }, {
            "POS": {
                "IN": ["PUNCT", "NOUN", "ADJ", "PROPN"]
            },
            "OP": "*"
        }, {
            'ORTH': 'and'
        }, {
            'POS': {
                "IN": ["NOUN", "PROPN", "PUNCT", "ADJ"]
            },
            "OP": "*"
        }, {
            'POS': {
                "IN": ["NOUN", "PROPN", "ADJ"]
            },
            "OP": "+"
        }])

        self.nationality_matcher.add("nationality", nat_pattern)

        self.influence_matcher = Matcher(self.nlp.vocab)

        influence1 = list()
        influence1.append([{
            'LEMMA': {
                "IN": ["inspire", "influence"]
            },
            "POS": 'VERB'
        }, {
            'ORTH': 'by'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence1", influence1)

        influence2 = list()
        influence2.append([{
            'LEMMA': {
                "IN": ["cite", "refer", "list", "mention", "credit", "claim"]
            },
            "POS": 'VERB'
        }, {
            "OP": "*"
        }, {
            'LEMMA': {
                "IN": ["as", "among"]
            }
        }, {
            "OP": "*"
        }, {
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            "OP": "*"
        }])
        influence2.append([{
            'LEMMA': {
                "IN": ["cite", "refer", "list", "mention", "credit", "claim"]
            },
            "POS": 'VERB'
        }, {
            "OP": "*"
        }, {
            'LEMMA': 'be'
        }, {
            "OP": "*"
        }, {
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }])
        self.influence_matcher.add("influence2", influence2)

        influence3 = list()
        influence3.append([{
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            'ORTH': 'include',
            "POS": 'VERB'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence3", influence3)

        influence4 = list()
        influence4.append([{
            'ORTH': 'influences',
            "POS": 'NOUN'
        }, {
            'ORTH': 'cited'
        }, {
            'ORTH': 'by'
        }, {
            "OP": "*"
        }, {
            'ORTH': 'include',
            "POS": 'VERB'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence4", influence4)

        influence5 = list()
        influence5.append([{
            'LEMMA': 'cite',
            "POS": 'VERB'
        }, {
            'ORTH': ','
        }, {
            "ORTH": "as"
        }, {
            "OP": "*"
        }, {
            'ORTH': 'influences',
            "POS": 'NOUN'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence5", influence5)

        influence6 = list()
        influence6.append([{
            'LEMMA': 'state',
            "POS": 'VERB'
        }, {
            "OP": "*"
        }, {
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            'LEMMA': 'be'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence6", influence6)

        influence7 = list()
        influence7.append([{
            'ORTH': 'influences',
            "POS": 'NOUN'
        }, {
            "ORTH": "?"
        }, {
            "ORTH": "such"
        }, {
            "ORTH": "as"
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence7", influence7)

        influence8 = list()
        influence8.append([{
            'LEMMA': {
                "IN": ["cite", "name"]
            },
            "POS": "VERB"
        }, {
            "OP": "*"
        }, {
            "ORTH": "as"
        }, {
            "ORTH": "one"
        }, {
            "ORTH": "of"
        }, {
            "OP": "*"
        }, {
            "ORTH": "'s"
        }, {
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }])
        self.influence_matcher.add("influence8", influence8)

        influence9 = list()
        influence9.append([{
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            "ORTH": "including"
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence9", influence9)

        influence10 = list()
        influence10.append([{
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            "OP": "*"
        }, {
            "ORTH": "from"
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence10", influence10)

        influence11 = list()
        influence11.append([{
            'ORTH': 'citing',
            "POS": 'VERB'
        }, {
            "ORTH": "as"
        }, {
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence11", influence11)

        influence12 = list()
        influence12.append([{
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            'LEMMA': 'be'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence12", influence12)

        influence13 = list()
        influence13.append([{
            'LEMMA': 'influence',
            "POS": 'NOUN'
        }, {
            'ORTH': 'of'
        }, {
            "OP": "*"
        }])
        self.influence_matcher.add("influence13", influence13)

        influence14 = list()
        influence14.append([{
            'LEMMA': 'inspiration',
            "POS": 'NOUN'
        }, {
            'ORTH': {
                "IN": ["from", "include"]
            }
        }, {
            "OP": "*"
        }])
        influence14.append([{
            'LEMMA': 'cite',
            "POS": 'VERB'
        }, {
            "OP": "*"
        }, {
            "ORTH": "as"
        }, {
            'LEMMA': 'inspiration',
            "POS": 'NOUN'
        }])
        self.influence_matcher.add("influence14", influence14)

        # map match-id hashes back to the pattern names registered above
        self.mappa = {
            self.nlp.vocab.strings[f"influence{i}"]: f"influence{i}"
            for i in range(1, 15)
        }
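
# Usage sketch (not from the original class): how a matcher like the one built above is
# typically run on a Doc, with the numeric match id mapped back to its string key via the
# vocab's StringStore -- the same role self.mappa plays. Assumes en_core_web_sm is installed;
# the sentence and the single pattern chosen here are illustrative only.
import spacy
from spacy.matcher import Matcher

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)
matcher.add("influence9", [[{"LEMMA": "influence", "POS": "NOUN"},
                            {"ORTH": "including"},
                            {"OP": "*"}]])

doc = nlp("His influences including Bach and Coltrane are easy to hear.")
for match_id, start, end in matcher(doc):
    # nlp.vocab.strings[match_id] recovers the registered key ("influence9")
    print(nlp.vocab.strings[match_id], "->", doc[start:end].text)
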
def extract(lang, tweet, params):
        nlp = spacy.load(lang)
        nlp.max_length = SPACY_MAX_LENGTH

        tweet_dict = OrderedDict()
        if lang == 'en':
            pos_dict = {
                'SPACE': 0,
                'NOUN': 0,
                'PART': 0,
                'PRON': 0,
                'INTJ': 0,
                'SYM': 0,
                'ADJ': 0,
                'CCONJ': 0,
                'PUNCT': 0,
                'X': 0,
                'VERB': 0,
                'ADP': 0,
                'ADV': 0,
                'PROPN': 0,
                'NUM': 0,
                'DET': 0,
                'AUX': 0,
                'SCONJ' : 0
            }

            tense_dict = {
                'Past': 0,
                'Pres': 0
            }
            n_stopwords = 0
            
            for word in nlp(tweet['text']):
                # n_pos features
                pos = word.pos_
                pos_dict[pos] += 1
                # n_stopwords
                if word.is_stop:
                    n_stopwords += 1
                # n_tense features
                tag = word.tag_
                if tag in params['tense']['present']:
                    tense_dict['Pres'] += 1
                elif tag in params['tense']['past']:
                    tense_dict['Past'] += 1

        # note: the counters below are only populated when lang == 'en'
        for feature in params:
            if feature == 'tense':
                tweet_dict['n_tense_past'] = tense_dict['Past']
                tweet_dict['n_tense_pres'] = tense_dict['Pres']
            elif feature == 'pos_counts':
                tweet_dict['n_pos_space'] = pos_dict['SPACE']
                tweet_dict['n_pos_noun'] = pos_dict['NOUN']
                tweet_dict['n_pos_par'] = pos_dict['PART']
                tweet_dict['n_pos_pron'] = pos_dict['PRON']
                tweet_dict['n_pos_intj'] = pos_dict['INTJ']
                tweet_dict['n_pos_sym'] = pos_dict['SYM']
                tweet_dict['n_pos_adj'] = pos_dict['ADJ']
                tweet_dict['n_pos_conj'] = pos_dict['CCONJ']
                tweet_dict['n_pos_punct'] = pos_dict['PUNCT']
                tweet_dict['n_pos_x'] = pos_dict['X']
                tweet_dict['n_pos_verb'] = pos_dict['VERB']
                tweet_dict['n_pos_adp'] = pos_dict['ADP']
                tweet_dict['n_pos_adv'] = pos_dict['ADV']
                tweet_dict['n_pos_propn'] = pos_dict['PROPN']
                tweet_dict['n_pos_num'] = pos_dict['NUM']
                tweet_dict['n_pos_det'] = pos_dict['DET']
            elif feature == 'word_counts':
                tweet_dict['n_words'] = len(tweet['text'].split())
            elif feature == 'stopword_counts':
                tweet_dict['n_stopwords'] = n_stopwords

        return tweet_dict
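
# Usage sketch (input shapes are assumed from how extract() reads `tweet` and `params`, not
# taken from the original project; SPACY_MAX_LENGTH is defined elsewhere in that project, so
# a placeholder value is set here, and the spaCy 2.x 'en' shortcut model must be installed).
SPACY_MAX_LENGTH = 2_000_000  # placeholder; the real constant lives elsewhere in the project
params = {
    'tense': {'present': ['VBP', 'VBZ', 'VBG'], 'past': ['VBD', 'VBN']},
    'pos_counts': {},
    'word_counts': {},
    'stopword_counts': {},
}
tweet = {'text': "I loved the old design, but I like this one even more."}
print(extract('en', tweet, params))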
Ejemplo n.º 47
0
import pytest
import spacy
import string

import numpy as np

from alibi.explainers import AnchorText
from alibi.explainers.anchor_text import Neighbors
from alibi.explainers.tests.utils import get_dataset
from alibi.explainers.tests.utils import predict_fcn
from alibi.utils.download import spacy_model

# load spaCy model
model = 'en_core_web_md'
spacy_model(model=model)
nlp = spacy.load(model)


def find_punctuation(text: str) -> int:
    """
    Returns the number of distinct punctuation marks that appear as
    standalone (whitespace-separated) tokens in `text`.
    """

    punctuation = set(string.punctuation)
    tokens = set(text.split())

    return len(tokens & punctuation)
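
# Quick illustration (not in the original test file): only punctuation that stands alone as a
# whitespace-separated token is counted, and repeats of the same mark count once.
assert find_punctuation("Hello , world ! !") == 2
assert find_punctuation("Hello, world!") == 0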


@pytest.mark.parametrize('lr_classifier', ((get_dataset('movie_sentiment')), ),
                         indirect=True)
Ejemplo n.º 48
0
 def __init__(self, model="en_core_web_sm"):
     if model is None:
         model = "en_core_web_sm"
     self._nlp = spacy.load(model)
 def __init__(self, path='data/input/'):
     self.nlp = spacy.load('en', disable=['ner', 'textcat'])
     self.pi = ProcessInput()
     self.pub_df = self.pi.load_publication_input(path=path)
Ejemplo n.º 50
0
 def __init__(self):
     self.nlp = spacy.load("en_core_web_sm")
Ejemplo n.º 51
0
 def __init__(self):
     self.spacynlp = spacy.load('en_core_web_lg')
Ejemplo n.º 52
0
def default_nlp():
    return spacy.load('en_core_web_sm')
Ejemplo n.º 53
0
Archivo: vis.py Proyecto: NPCai/Squadie
def howParse(sentence, answer):
    ''' Parser for questions that begin with the adverbial modifier how'''
    arg1 = []
    arg2 = []
    rel = []
    argument = False
    relTrue = False
    Object = False
    poss = False
    objTrue = False

    if sentence[0].lower_ != "how":
        return None
    if sentence[1].lower_ == "much" or sentence[1].lower_ == "many":
        arg1 = answer
        argument = False
        for child in sentence:
            if "subj" in child.dep_:
                _, arg2 = descendants(sentence, child, True)
            if "obj" in child.dep_:
                Object = True
        _, rel = descendants(sentence, sentence.root, True)
        rel = [token for token in rel if token not in arg2]
        stopwords = ['much', 'many']
        arg2 = [word for word in arg2 if word.lower_ not in stopwords]
        rel = [token for token in rel if token.lower_ not in stopwords]
        rel = [
            child for child in rel
            if "aux" not in child.dep_ or child.lower_ == "to"
        ]

        if len(arg2) != 0:
            if sentence[1].lower_ == "much" and arg2[0].dep_ != "prep":
                arg2.insert(0, "in")

            if sentence[1].lower_ == "many" and Object == False:
                rel.insert(0, "number")

    elif sentence[1].lower_ == "did" or sentence[1].lower_ == "is":

        for child in sentence:
            if "obj" in child.dep_:
                _, rel = descendants(sentence, child, True, sentence.root)
            if "subj" in child.dep_:
                _, arg1 = descendants(sentence, child, True)

        arg2.append(answer)
        nlp = spacy.load('en')
        answerDep = nlp(answer)
        if sentence[1].lower_ != "is":
            for token in answerDep:
                if token.dep_ == "poss":
                    arg2.insert(0, "with")
                    poss = True
            if not poss:
                arg2.insert(0, "by")
        else:
            arg2.insert(0, "as")
        argument = True

    elif "comp" in sentence[1].dep_ or sentence[1].dep_ == "advmod":
        for child in sentence:
            if "obj" in child.dep_:
                objTrue = True

        if objTrue and sentence[1].dep_ == "advmod":
            for children in sentence:
                if "obj" in children.dep_:
                    _, arg1 = descendants(sentence, children, True)
            rel = sentence.root.lower_
            arg2.append(answer)
            for advmod in sentence:
                if advmod.dep_ == "advmod":
                    arg2.append(advmod.lower_)

        if objTrue == True and "comp" in sentence[1].dep_:
            for token in sentence:
                if "subj" in token.dep_:
                    _, arg1 = descendants(sentence, token, True)
                    break
            rel = sentence.root.lower_
            arg2.append(answer)
            for comp in sentence:
                if "comp" in comp.dep_:
                    arg2.append(comp.lower_)

    else:
        return None

    print("How parse")
    arg1, rel, arg2 = extractHelper(arg1, rel, arg2)
    return Extract(arg1=arg1, rel=rel, arg2=arg2)
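
# Self-contained illustration of the spaCy pieces howParse() relies on: dependency labels,
# the sentence root, and subtrees. descendants(), extractHelper() and Extract come from the
# same Squadie project and are not reproduced here; this sketch assumes en_core_web_sm is
# installed and is not part of the original file.
import spacy

nlp = spacy.load("en_core_web_sm")
sentence = next(nlp("How many states are in the United States?").sents)

print("root:", sentence.root.text)
for token in sentence:
    if "subj" in token.dep_ or "obj" in token.dep_:
        # the subtree is roughly the argument span that descendants() collects
        print(token.dep_, "->", [t.text for t in token.subtree])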
Ejemplo n.º 54
0
import random
import math
import time

import collections
from collections import Counter
import numpy as np
import subprocess

import spacy
import torch

# from google.colab import drive
# drive.mount('/content/drive')
#
# !python -m spacy download en
# !python -m spacy download fr
spacy_en = spacy.load('en')
spacy_de = spacy.load('fr')  # note: despite the 'de' name, this loads the French ('fr') pipeline

# preparing data

SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# spacy_de = spacy.load('de')
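
# These pipelines are typically wrapped in tokenizer helpers when building dataset fields; a
# minimal sketch (function names are illustrative, not from the original notebook). Note that
# spacy_de above actually holds the French ('fr') pipeline.
def tokenize_fr(text):
    """Tokenize French text into a list of token strings."""
    return [tok.text for tok in spacy_de.tokenizer(text)]


def tokenize_en(text):
    """Tokenize English text into a list of token strings."""
    return [tok.text for tok in spacy_en.tokenizer(text)]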

# Imports this fragment relies on (assumed from the calls below; not shown in the original):
import pandas as pd
import numpy as np
import spacy
import swifter  # registers the .swifter accessor used on the DataFrame columns
from collections import Counter
from datetime import datetime
from spacy.tokens import Token
from spacy.lang.en.stop_words import STOP_WORDS
from pywsd.lesk import simple_lesk
from nltk.corpus import sentiwordnet as swn


def generate_features(df):
    """Generate the following features:

   'days_yelping',
   'word_count',
   'unique_words_perc',
   'no_adjectives',
   'perc_unique_verbs',
   'perc_unique_adjectives',
   'norm_of_wordvecs',
   'adj_wordvecs',
   'verb_wordvecs',
   'adj_maj_sent__negative',
   'adj_maj_sent__objective',
   'adj_maj_sent__positive',
   'adj_maj_sent__no_sentiment_assigned',
   'verb_maj_sent_negative',
   'verb_maj_sent_objective',
   'verb_maj_sent_positive',
   'verb_maj_sent_no_sentiment_assigned',
   'useful_review_compliments_received_for_other_reviews'
    """

    print('----- Feature Generation -----')

    feature_data = df[[
        'useful_01', 'text', 'stars', 'average_stars', 'compliment_more',
        'compliment_hot', 'compliment_photos', 'compliment_writer',
        'compliment_plain', 'fans', 'review_count', 'yelping_since', 'user_id',
        'useful_review'
    ]]

    feature_data = feature_data.merge(pd.DataFrame(feature_data.groupby(feature_data['user_id']).useful_review.sum()), \
                                      how='left', on='user_id',
                                      suffixes=('', '_compliments_received_for_other_reviews'))

    feature_data['useful_review_compliments_received_for_other_reviews'] = \
        feature_data['useful_review_compliments_received_for_other_reviews'] - feature_data['useful_review']

    nlp = spacy.load('en_core_web_lg', disable=['ner', "parser"])

    # define for each token the attribute _.ignore:
    ignore_getter = lambda token: (
        token.is_stop or  # remove stop words
        token.lower_ in STOP_WORDS
        or  # ignore stop words independent of their case
        # token.like_num or # ignore stuff that looks like a number
        # token.is_digit or #ignore tokens consisting of digits
        token.is_punct or  # ignore punctuation tokens
        token.is_left_punct or  # ignore left punctuation
        token.is_right_punct or  # ignore right punctuation
        token.is_space or  # ignore tokens consisting of spaces
        token.is_bracket or  # ignore brackets
        token.is_quote or  # ignore quotation marks
        not token.is_alpha)  # ignore everything that is not only letters
    # (this might be too strict, but without it special characters
    # like +$% etc will stay). With it, however, most of the previous
    # stuff is not needed.
    Token.set_extension('ignore', getter=ignore_getter,
                        force=True)  # add the _.ignore attribute

    def get_days_yelping(input_timestamp):
        days_yelp = datetime.now().date() - datetime.strptime(
            input_timestamp.split()[0], '%Y-%m-%d').date()
        return days_yelp.days

    def get_tokens(nlp_text):
        return [
            token for token in nlp_text
            if not (token._.ignore or not (len(token) > 1))
        ]

    def get_unique__perc(token_list):
        if not len(token_list) == 0:
            return len(set([str(tok) for tok in token_list])) / len(token_list)
        else:
            return 0

    def get_word_vectors(token_list):
        if not len(token_list) == 0:
            word_vectors = [token.vector for token in token_list]
            return np.linalg.norm(sum(word_vectors) / len(token_list))
        else:
            return 0

    def get_sentiment(token_list, nlp_review):
        """Get sentiment using SentiWordNet"""

        # WordNet (and thus SentiWordNet) use different POS tags than spacy output
        # here we transform from spacy to WordNet
        wordNet_pos_dict = {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}

        def posTag_to_wordNetTag(tag):
            if tag in wordNet_pos_dict:
                return wordNet_pos_dict[tag]
            else:
                return None

        # to get the semantic score of a word from SentiWordNet we first need to find this word in WordNet
        # This process is called Word Sense Disambiguation and we use the simple lesk algorithm for that
        def get_semantic_score_with_context(token, nlp_review):

            word = token.lower_  # get lowercased token text
            position = token.idx  # get position of word in document
            pos = posTag_to_wordNetTag(
                token.pos_
            )  # get POS of token, for better word sense disambiguation

            # define how many tokens around the token of interest we look at
            num_surrounding_words = 10
            # careful if there are less then num_surrounding_words before our token or after our token
            leftmost_word_idx = max(0, position - num_surrounding_words)
            rightmostword_idx = min(len(nlp_review),
                                    position + num_surrounding_words)
            surrounding_text = nlp_review[
                leftmost_word_idx:rightmostword_idx].text

            # determine word with the closest sense in WordNet
            #     print(word,"....",surrounding_text,pos)
            try:
                word_with_closest_sense = simple_lesk(surrounding_text,
                                                      word,
                                                      pos=pos)
            except Exception:
                word_with_closest_sense = simple_lesk(surrounding_text, word)
            #     print(word,pos,word_with_closest_sense)
            # find the sentiment score to the word we found in wordnet
            if word_with_closest_sense:
                sentiword = swn.senti_synset(word_with_closest_sense.name())

                sent_scores = {
                    "objective": sentiword.obj_score(),
                    "positive": sentiword.pos_score(),
                    "negative": sentiword.neg_score()
                }

                sentiment = max(sent_scores, key=sent_scores.get)

                return sentiment
            else:
                return 'no_sentiment_assigned'

        if not len(token_list) == 0:
            sentiments = []
            for token in token_list:
                sentiments.append(
                    get_semantic_score_with_context(token, nlp_review))

            counts = Counter(sentiments)

            return max(counts, key=counts.get)
        else:
            return 'no_sentiment_assigned'

    feature_data['days_yelping'] = feature_data.yelping_since.swifter.apply(
        get_days_yelping)
    del feature_data['yelping_since']

    feature_data['text_processed'] = feature_data.text.swifter.apply(
        lambda x: nlp(x))
    feature_data['tokens'] = feature_data.text_processed.swifter.apply(
        lambda x: get_tokens(x))

    feature_data['word_count'] = feature_data.tokens.swifter.apply(
        lambda x: len(x))
    feature_data['unique_words_perc'] = feature_data.tokens.swifter.apply(
        get_unique__perc)
    feature_data['adjectives'] = feature_data.tokens.swifter.apply(
        lambda x: [token for token in x if token.pos_ == 'ADJ'])
    feature_data['no_adjectives'] = feature_data.adjectives.swifter.apply(
        lambda x: len(x))
    feature_data['verbs'] = feature_data.tokens.swifter.apply(
        lambda x: [token for token in x if token.pos_ == 'VERB'])
    feature_data['perc_unique_verbs'] = feature_data.verbs.swifter.apply(
        get_unique__perc)
    feature_data[
        'perc_unique_adjectives'] = feature_data.adjectives.swifter.apply(
            get_unique__perc)
    feature_data['norm_of_wordvecs'] = feature_data.tokens.swifter.apply(
        get_word_vectors)
    feature_data['adj_wordvecs'] = feature_data.adjectives.swifter.apply(
        get_word_vectors)
    feature_data['verb_wordvecs'] = feature_data.verbs.swifter.apply(
        get_word_vectors)

    feature_data['adj_maj_sent'] = feature_data.swifter.apply(
        lambda x: get_sentiment(x.adjectives, x.text_processed), axis=1)
    feature_data['verb_maj_sent'] = feature_data.swifter.apply(
        lambda x: get_sentiment(x.verbs, x.text_processed), axis=1)

    feature_data = feature_data.merge(pd.get_dummies(feature_data[['adj_maj_sent', 'verb_maj_sent']], prefix = ['adj_maj_sent_', 'verb_maj_sent']), \
                       left_index = True, right_index = True)

    # feather.write_dataframe(feature_data, os.getcwd() + 'feature_data.feather')

    return feature_data
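
# Standalone sketch (not part of generate_features) of the token-filtering idea used above:
# a custom `_.ignore` extension plus a comprehension that keeps only content words. Assumes
# en_core_web_sm is installed; the getter here is a simplified version of ignore_getter.
import spacy
from spacy.tokens import Token

nlp_small = spacy.load("en_core_web_sm")
Token.set_extension(
    "ignore",
    getter=lambda t: t.is_stop or t.is_punct or t.is_space or not t.is_alpha,
    force=True,
)

sample = nlp_small("The food was great, but the service was painfully slow!!")
kept = [t.text for t in sample if not t._.ignore and len(t) > 1]
print(kept)  # content words only, e.g. ['food', 'great', 'service', 'painfully', 'slow']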
           in American history, including the Declaration of Independence, the Emancipation Proclamation, 
           and the United States Constitution. Early in his speech, King alludes to Abraham Lincoln's 
           Gettysburg Address by saying "Five score years ago ..." In reference to the abolition of 
           slavery articulated in the Emancipation Proclamation, King says: "It came as a joyous 
           daybreak to end the long night of their captivity." Anaphora (i.e., the repetition of a 
           phrase at the beginning of sentences) is employed throughout the speech. Early in his speech, 
           King urges his audience to seize the moment; "Now is the time" is repeated three times in the sixth 
           paragraph. The most widely cited example of anaphora is found in the often quoted phrase 
           "I have a dream", which is repeated eight times as King paints a picture of an integrated and 
           unified America for his audience. Other occasions include "One hundred years later", "We can 
           never be satisfied", "With this faith", "Let freedom ring", and "free at last". King was the 
           sixteenth out of eighteen people to speak that day, according to the official program.'''

stopwords = list(STOP_WORDS)

nlp = spacy.load('en_core_web_sm')

doc = nlp(text)

tokens = [token.text for token in doc]

punctuation = punctuation + '\n' + ' ' + '  ' + '...' + '\n           '

word_frequency = {}
for word in doc:
    if word.text.lower() not in stopwords:
        if word.text.lower() not in punctuation:
            if word.text not in word_frequency:
                word_frequency[word.text] = 1
            else:
                word_frequency[word.text] += 1
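
# A small follow-up (not in the original snippet): with the counts collected in
# word_frequency, the most frequent content words can be listed directly.
top_words = sorted(word_frequency.items(), key=lambda kv: kv[1], reverse=True)[:5]
for word, count in top_words:
    print(word, count)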
Ejemplo n.º 57
0
import spacy

nlp = spacy.load("en_core_web_md")
# doc = nlp("Calderon-Zygmund operators are objects that are largely responsible for our understanding of a number of physical phenomena, from heat transfer to turbulence")
doc = nlp(
    "With this award, the Chemical Structure, Dynamics and Mechanisms (CSDM-A) Program of the Division of Chemistry is funding Professor Istvan Z. Kiss and his research group at Saint Louis University to study pattern formation of electrochemical reactions"
)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)
Ejemplo n.º 58
0
	def __init__(self):
		self.query = ""
		self.nlp = spacy.load('en_core_web_sm')
Ejemplo n.º 59
0
# Make it work for Python 2+3 and with Unicode
try:
    to_unicode = unicode
except NameError:
    to_unicode = str

# Import necessary packages
import spacy
nlp = spacy.load('en')
import subprocess
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
from nltk.corpus import wordnet as wn
import re
pattern = re.compile(r'(<IN>)*(<DT>)*(<JJ>)*(<NN>|<NNS>|<NNP>)+')
w_words = ['when', 'who', 'what', 'why', 'how', 'where']
import json
from collections import defaultdict
import pandas as pd
from flask import Flask, render_template, request, redirect
import io
from fuzzywuzzy import process, fuzz

## Some import word lists
aux_verb = [
    'be', 'can', 'could', 'dare', 'do', 'have', 'may', 'might', 'must', 'need',
    'ought', 'shall', 'should', 'will', 'would'
]

wh_words = ['when', 'why', 'how', 'what', 'where', 'who']
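
# Illustration of assumed usage (not from the original file): the regex `pattern` above is
# meant to run over a string of Penn Treebank tags such as "<DT><JJ><NN>", so noun-phrase-
# shaped spans can be located by their tag pattern.
doc = nlp("the quick brown fox jumped over the lazy dog")
tag_string = "".join("<{}>".format(tok.tag_) for tok in doc)
for m in pattern.finditer(tag_string):
    print(m.group())  # e.g. "<DT><JJ><JJ><NN>" for "the quick brown fox"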
Ejemplo n.º 60
0
        else:
            # Uppercase followed by lowercase
            chars = [c.lower() if i % 2 else c.upper()
                             for i, c in enumerate(text)]
        # Create augmented training example
        example_dict = example.to_dict()
        doc = nlp.make_doc("".join(chars))
        example_dict["token_annotation"]["ORTH"] = [t.text for t in doc]
        # Original example followed by augmented example
        # yield example
        yield example.from_dict(doc, example_dict)

    return augment

nlp = spacy.load("da_core_news_sm")
doc = nlp("Mit navn er Kenneth og Malte og Jakob og Kenneth.")
ent_dict = {"PER": get_names()}
example = Example(doc, doc)
lc_augmenter = create_augmenter_sponge(randomize=False)
res = next(lc_augmenter(nlp, example))
print(res.text)

augment = create_augmenter(ent_dict, prob=1)
res = next(augment(nlp, res))
print(res.text)