def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    Return a spaCy pipeline for the given configuration, loading it at most once.

    Pipelines are memoised in ``LOADED_SPACY_MODELS`` under the key
    ``(spacy_model_name, pos_tags, parse, ner)``.  ``vectors`` and ``textcat``
    are always disabled; ``tagger``, ``parser`` and ``ner`` are disabled unless
    the matching flag is true.  If ``spacy.load`` raises ``OSError`` the model
    is downloaded via ``spacy_download`` and loaded again.
    """
    cache_key = (spacy_model_name, pos_tags, parse, ner)
    if cache_key in LOADED_SPACY_MODELS:
        return LOADED_SPACY_MODELS[cache_key]

    # (component name, requested?) in the order they are appended to `disable`.
    optional_components = (('tagger', pos_tags), ('parser', parse), ('ner', ner))
    disabled = ['vectors', 'textcat']
    disabled.extend(name for name, requested in optional_components if not requested)

    try:
        loaded = spacy.load(spacy_model_name, disable=disabled)
    except OSError:
        logger.warning(f"Spacy models '{spacy_model_name}' not found. Downloading and installing.")
        spacy_download(spacy_model_name)
        loaded = spacy.load(spacy_model_name, disable=disabled)

    LOADED_SPACY_MODELS[cache_key] = loaded
    return loaded
def main(model=None, new_model_name='animal', output_dir=None, n_iter=20):
    """Teach the entity recognizer the label ``LABEL`` using ``TRAIN_DATA``.

    model: name/path passed to ``spacy.load``; ``None`` starts from a blank
        English pipeline.
    new_model_name: stored in ``nlp.meta['name']`` before saving.
    output_dir: when given, the pipeline is written there, reloaded, and run
        on the test sentence again.
    n_iter: number of passes over ``TRAIN_DATA``.
    """
    if model is None:
        nlp = spacy.blank('en')
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)

    # Reuse an existing 'ner' component, otherwise create and register one.
    if 'ner' in nlp.pipe_names:
        ner = nlp.get_pipe('ner')
    else:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner)
    ner.add_label(LABEL)

    # begin_training() initializes the models, which would zero out entity
    # types an existing model already knows, so it is only used when blank.
    if model is None:
        optimizer = nlp.begin_training()
    else:
        optimizer = nlp.entity.create_optimizer()

    # Every component except 'ner' is disabled while updating.
    frozen = [name for name in nlp.pipe_names if name != 'ner']
    with nlp.disable_pipes(*frozen):
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update([text], [annotations], sgd=optimizer, drop=0.35,
                           losses=losses)
            print(losses)

    # Try the trained pipeline on one sentence.
    test_text = 'Do you like horses?'
    print("Entities in '%s'" % test_text)
    for ent in nlp(test_text).ents:
        print(ent.label_, ent.text)

    if output_dir is None:
        return

    # Persist, then reload from disk to check the saved copy works too.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.meta['name'] = new_model_name
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    print("Loading from", output_dir)
    reloaded = spacy.load(output_dir)
    for ent in reloaded(test_text).ents:
        print(ent.label_, ent.text)
def get_nlp(lang="en"):
    """Return the spaCy pipeline for `lang`, loading and caching it on first use.

    The model name comes from the `models` dict; when absent it is resolved
    from the environment variable MODEL_ENV_VAR + "_" + LANG, then
    MODEL_ENV_VAR, then DEFAULT_MODEL (falling back to "xx"), and stored back
    into `models`.  If loading fails with OSError the model is downloaded and
    loaded once more; a second failure raises OSError with manual instructions.
    The pipeline's own tokenizer is kept in `tokenizer[lang]` and replaced by
    one that builds a Doc directly from a list of words.
    """
    cached = nlp.get(lang)
    if cached is not None:
        return cached

    import spacy

    model = models.get(lang)
    if not model:
        lang_specific_var = "_".join((MODEL_ENV_VAR, lang.upper()))
        model = (os.environ.get(lang_specific_var)
                 or os.environ.get(MODEL_ENV_VAR)
                 or DEFAULT_MODEL.get(lang, "xx"))
        models[lang] = model

    started = time.time()
    with external_write_mode():
        print("Loading spaCy model '%s'... " % model, end="", flush=True)
        try:
            pipeline = spacy.load(model)
        except OSError:
            spacy.cli.download(model)
            try:
                pipeline = spacy.load(model)
            except OSError as e:
                raise OSError("Failed to get spaCy model. Download it manually using "
                              "`python -m spacy download %s`." % model) from e
        nlp[lang] = pipeline
        # Keep the real tokenizer reachable; the pipeline itself now expects
        # pre-tokenized input (a list of words).
        tokenizer[lang] = pipeline.tokenizer
        pipeline.tokenizer = lambda words: spacy.tokens.Doc(pipeline.vocab, words=words)
        print("Done (%.3fs)." % (time.time() - started))
    return pipeline
def _load_spacy(self, evaluation, options):
    """Return the spaCy instance for the 'Language' option, or None on failure.

    The option value is mapped to a language code through
    ``_SpacyBuiltin._language_codes``; an unknown or non-String language emits
    the 'lang' message.  Loaded instances are cached in
    ``_SpacyBuiltin._spacy_instances``.  When the ``SPACY_DATA`` environment
    variable is set it is passed to ``spacy.load`` as ``via``.  A
    ``RuntimeError`` during loading emits the 'runtime' message.
    """
    language_name = self.get_option(options, 'Language', evaluation)
    if language_name is None:
        language_name = String('Undefined')

    language_code = None
    if isinstance(language_name, String):
        language_code = _SpacyBuiltin._language_codes.get(
            language_name.get_string_value())

    if not language_code:
        evaluation.message(self.get_name(), 'lang', language_name,
                           strip_context(self.get_name()))
        return None

    cached = _SpacyBuiltin._spacy_instances.get(language_code)
    if cached:
        return cached

    try:
        if 'SPACY_DATA' in os.environ:
            loaded = spacy.load(language_code, via=os.environ['SPACY_DATA'])
        else:
            loaded = spacy.load(language_code)
    except RuntimeError as e:
        evaluation.message(self.get_name(), 'runtime', str(e))
        return None

    _SpacyBuiltin._spacy_instances[language_code] = loaded
    return loaded
def main(model=None, output_dir=None, n_iter=15):
    """Train the dependency parser on TRAIN_DATA, test it, and optionally save it.

    model: name/path for ``spacy.load``; ``None`` starts from a blank English
        pipeline.
    output_dir: when given, the pipeline is saved there, reloaded, and run on
        the test sentence again.
    n_iter: number of passes over ``TRAIN_DATA``.
    """
    if model is None:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)

    # Reuse the existing parser, or create one and put it first in the pipeline.
    if "parser" in nlp.pipe_names:
        parser = nlp.get_pipe("parser")
    else:
        parser = nlp.create_pipe("parser")
        nlp.add_pipe(parser, first=True)

    # Register every dependency label that occurs in the training data.
    for _, annotations in TRAIN_DATA:
        for dep in annotations.get("deps", []):
            parser.add_label(dep)

    # Only the parser is updated; all other components are disabled.
    frozen = [name for name in nlp.pipe_names if name != "parser"]
    with nlp.disable_pipes(*frozen):
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # Batch sizes are produced by compounding(4.0, 32.0, 1.001).
            for batch in minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    # Try the trained pipeline on one sentence.
    test_text = "I like securities."
    doc = nlp(test_text)
    print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])

    if output_dir is None:
        return

    # Persist, then reload from disk to check the saved copy.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    print("Loading from", output_dir)
    reloaded = spacy.load(output_dir)
    doc = reloaded(test_text)
    print("Dependencies", [(t.text, t.dep_, t.head.text) for t in doc])
def __init__(self, model='en', disable=None):
    """Load the spaCy model into ``self._parser``, downloading it if missing.

    model: model name passed to ``spacy.load``.
    disable: pipeline components to disable; ``None`` means none.

    When loading raises ``OSError`` the user is asked to accept the model
    license via ``license_prompt``; a ``False`` answer exits the process with
    status 0, otherwise the model is downloaded and loaded again.
    """
    disabled = [] if disable is None else disable
    try:
        self._parser = spacy.load(model, disable=disabled)
        return
    except OSError:
        pass

    url = 'https://spacy.io/models'
    accepted = license_prompt('Spacy {} model'.format(model), url)
    if accepted is False:
        sys.exit(0)
    spacy_download(model)
    self._parser = spacy.load(model, disable=disabled)
# Pipelines already loaded by get_ents(), keyed by "custom model requested?".
_GET_ENTS_MODELS = {}


def _get_ents_model(use_custom):
    """Return the pipeline for get_ents(), loading it only on first use."""
    if use_custom not in _GET_ENTS_MODELS:
        source = Path('./gina_haspel') if use_custom else 'en'
        _GET_ENTS_MODELS[use_custom] = spacy.load(source)
    return _GET_ENTS_MODELS[use_custom]


def get_ents():
    """Return the named entities found in the ``fragment`` query parameter.

    Query parameters:
        fragment: text to analyse (required).
        custom: when present (any value), the pipeline stored in
            ``./gina_haspel`` is used instead of the stock ``'en'`` model.

    Returns a JSON object mapping entity text to entity label; when the same
    text occurs more than once the last label wins.  A request without
    ``fragment`` gets a 400 response with an ``error`` field instead of
    failing inside spaCy.
    """
    data = flask.request.args.get('fragment')
    if data is None:
        response = flask.jsonify({'error': "missing 'fragment' query parameter"})
        response.status_code = 400
        return response

    use_custom = flask.request.args.get('custom') is not None
    # Loading a model is expensive, so each one is loaded once and reused
    # across requests rather than on every call.
    nlp = _get_ents_model(use_custom)
    doc = nlp(data)
    print(doc)
    return flask.jsonify({str(ent): ent.label_ for ent in doc.ents})
def get_nlp():
    """Return the shared spaCy pipeline stored on ``nlp.instance``.

    On first use the model named by the ``SPACY_MODEL`` environment variable
    (default ``"en"``) is loaded.  If the loaded pipeline has no tagger the
    model is downloaded and loaded again, and an assertion fails if the tagger
    is still missing.  The pipeline's tokenizer is saved as ``nlp.tokenizer``
    and replaced by its ``tokens_from_list`` method.
    """
    if nlp.instance is not None:
        return nlp.instance

    import spacy

    model_name = os.environ.get("SPACY_MODEL", "en")
    nlp.instance = spacy.load(model_name)
    if nlp.instance.tagger is None:
        # No tagger means the model was not really loaded: fetch it and retry.
        spacy.cli.download(model_name)
        nlp.instance = spacy.load(model_name)
        assert nlp.instance.tagger, "Failed to get spaCy model. " \
            "Download it manually using `python -m spacy download %s`." % model_name

    original_tokenizer = nlp.instance.tokenizer
    nlp.tokenizer = original_tokenizer
    nlp.instance.tokenizer = original_tokenizer.tokens_from_list
    return nlp.instance
def _spacy_en():
    """Generator that lazily loads the 'en_default' spaCy model.

    The first value yielded is ``None``; the model is only loaded when the
    generator is advanced past it, after which the same model object is
    yielded forever.  If loading raises the "Model not installed" RuntimeError
    the data is installed through sputnik into ``data_path`` and loading is
    retried; any other RuntimeError propagates.
    """
    yield None

    try:
        model = spacy.load('en_default', via=data_path)
    except RuntimeError as e:
        not_installed = "Model not installed. Please run 'python -m spacy.en.download' to install latest compatible model."
        if e.message != not_installed:
            raise
        print("Need to download Spacy data. Starting download now")
        sputnik.install('spacy', spacy.about.__version__, 'en_default',
                        data_path=data_path)
        model = spacy.load('en_default', via=data_path)

    while True:
        yield model
def main(model=None, output_dir=None, n_iter=15):
    """Train a fresh dependency parser on TRAIN_DATA and check it with test_model.

    model: name/path for ``spacy.load``; ``None`` starts from a blank English
        pipeline.
    output_dir: when given, the pipeline is saved there, reloaded, and passed
        to ``test_model`` again.
    n_iter: number of passes over ``TRAIN_DATA``.
    """
    if model is None:
        nlp = spacy.blank("en")
        print("Created blank 'en' model")
    else:
        nlp = spacy.load(model)
        print("Loaded model '%s'" % model)

    # Always train a brand-new parser instance: drop any existing one first.
    if "parser" in nlp.pipe_names:
        nlp.remove_pipe("parser")
    parser = nlp.create_pipe("parser")
    nlp.add_pipe(parser, first=True)

    # Register every dependency label that occurs in the training data.
    all_deps = (
        dep
        for _text, annotations in TRAIN_DATA
        for dep in annotations.get("deps", [])
    )
    for dep in all_deps:
        parser.add_label(dep)

    # Only the parser is updated; all other components are disabled.
    frozen = [name for name in nlp.pipe_names if name != "parser"]
    with nlp.disable_pipes(*frozen):
        optimizer = nlp.begin_training()
        for _ in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            batch_sizes = compounding(4.0, 32.0, 1.001)
            for batch in minibatch(TRAIN_DATA, size=batch_sizes):
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, losses=losses)
            print("Losses", losses)

    test_model(nlp)

    if output_dir is None:
        return

    # Persist, then reload from disk and test the saved copy.
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

    print("Loading from", output_dir)
    test_model(spacy.load(output_dir))
def _load_nlp(self): if self.nlp is None: print("Loading SpacySentenceWordCache") nlp = spacy.load('en_core_web_lg') nlp.add_pipe(nlp.create_pipe('sentencizer')) self.nlp = nlp return self.nlp
def main(lang, in_dir, out_loc, negative=5, n_workers=4, window=5, size=128, min_count=10, nr_iter=2):
    """Count tokens over a directory of texts with spaCy and train Word2Vec.

    Every file yielded by ``iter_dir(corpus.directory)`` is read as UTF-8,
    tokenized with the ``lang`` pipeline (parser, tagger and entity recognizer
    switched off) and counted by the ``Corpus`` object.  Word types seen at
    least ``min_count`` times seed the Word2Vec vocabulary; the model is then
    trained on the corpus and saved to ``out_loc``.
    """
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        level=logging.INFO)
    model = Word2Vec(
        size=size,
        window=window,
        min_count=min_count,
        workers=n_workers,
        sample=1e-5,
        negative=negative
    )
    nlp = spacy.load(lang, parser=False, tagger=False, entity=False)
    corpus = Corpus(in_dir)

    word_total = 0
    sentence_total = 0
    for text_no, text_loc in enumerate(iter_dir(corpus.directory)):
        with io.open(text_loc, 'r', encoding='utf8') as file_:
            text = file_.read()
        # Newlines are used as the sentence count.
        sentence_total += text.count('\n')
        word_total += corpus.count_doc(nlp(text))
        logger.info("PROGRESS: at batch #%i, processed %i words, keeping %i word types",
                    text_no, word_total, len(corpus.strings))

    model.corpus_count = sentence_total
    model.raw_vocab = defaultdict(int)
    for orth, freq in corpus.counts:
        if freq < min_count:
            continue
        model.raw_vocab[nlp.vocab.strings[orth]] = freq

    model.scale_vocab()
    model.finalize_vocab()
    model.iter = nr_iter
    model.train(corpus)
    model.save(out_loc)
def train(train_loc, dev_loc, shape, settings):
    """Train the similarity model on SNLI data and store it beside the spaCy model.

    train_loc, dev_loc: locations handed to ``read_snli``.
    shape: model shape; ``shape[0]`` is used as the maximum sequence length.
    settings: dict read for 'gru_encode', 'tree_truncate', 'nr_epoch' and
        'batch_size' (and passed on to ``build_model``).

    The weights (all but the first entry of ``model.get_weights()``) are
    pickled to ``<nlp.path>/similarity/model`` and the architecture is written
    as JSON to ``<nlp.path>/similarity/config.json``.
    """
    train_texts1, train_texts2, train_labels = read_snli(train_loc)
    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)

    print("Loading spaCy")
    nlp = spacy.load('en')
    assert nlp.path is not None

    print("Compiling network")
    model = build_model(get_embeddings(nlp.vocab), shape, settings)

    print("Processing texts...")
    Xs = []
    for texts in (train_texts1, train_texts2, dev_texts1, dev_texts2):
        Xs.append(get_word_ids(list(nlp.pipe(texts, n_threads=20, batch_size=20000)),
                               max_length=shape[0],
                               rnn_encode=settings['gru_encode'],
                               tree_truncate=settings['tree_truncate']))
    train_X1, train_X2, dev_X1, dev_X2 = Xs

    print(settings)
    model.fit(
        [train_X1, train_X2],
        train_labels,
        validation_data=([dev_X1, dev_X2], dev_labels),
        nb_epoch=settings['nr_epoch'],
        batch_size=settings['batch_size'])

    out_dir = nlp.path / 'similarity'
    if not out_dir.exists():
        out_dir.mkdir()
    print("Saving to", out_dir)

    weights = model.get_weights()
    with (out_dir / 'model').open('wb') as file_:
        pickle.dump(weights[1:], file_)
    # model.to_json() returns text, so the config file must be opened in text
    # mode; writing a str to a file opened with 'wb' raises TypeError on
    # Python 3.
    with (out_dir / 'config.json').open('w') as file_:
        file_.write(model.to_json())
def train(train_texts, train_labels, dev_texts, dev_labels, lstm_shape, lstm_settings, lstm_optimizer, batch_size=100, nb_epoch=5, by_sentence=True):
    """Train the chainer sentiment classifier on spaCy-processed texts.

    train_texts/train_labels, dev_texts/dev_labels: training and evaluation
        data, wrapped in ``SentenceDataset`` (``by_sentence=True``) or
        ``DocDataset``.
    lstm_shape: dict of shape parameters; ``'nr_vector'`` is filled in from
        the spaCy vocabulary when missing, ``'max_length'`` is read for
        sentence datasets.
    lstm_settings: keyword arguments forwarded to ``SentimentModel``.
    batch_size: batch size of both iterators.

    NOTE(review): ``lstm_optimizer`` and ``nb_epoch`` are accepted but never
    used -- the optimizer is always Adam and the trainer always stops at 20
    epochs.  Left unchanged so existing callers see the same training run.
    """
    nlp = spacy.load('en', entity=False)
    if 'nr_vector' not in lstm_shape:
        # The vocabulary lives on the pipeline; the bare name `vocab` used
        # here before was undefined and raised NameError.
        lstm_shape['nr_vector'] = max(lex.rank+1 for lex in nlp.vocab if lex.has_vector)

    print("Make model")
    model = Classifier(SentimentModel(nlp, lstm_shape, **lstm_settings))

    print("Parsing texts...")
    if by_sentence:
        train_data = SentenceDataset(nlp, train_texts, train_labels, lstm_shape['max_length'])
        dev_data = SentenceDataset(nlp, dev_texts, dev_labels, lstm_shape['max_length'])
    else:
        train_data = DocDataset(nlp, train_texts, train_labels)
        dev_data = DocDataset(nlp, dev_texts, dev_labels)

    train_iter = SerialIterator(train_data, batch_size=batch_size,
                                shuffle=True, repeat=True)
    dev_iter = SerialIterator(dev_data, batch_size=batch_size,
                              shuffle=False, repeat=False)

    optimizer = chainer.optimizers.Adam()
    optimizer.setup(model)
    updater = chainer.training.StandardUpdater(train_iter, optimizer, device=0)
    trainer = chainer.training.Trainer(updater, (20, 'epoch'), out='result')

    trainer.extend(extensions.Evaluator(dev_iter, model, device=0))
    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport([
        'epoch', 'main/accuracy', 'validation/main/accuracy']))
    trainer.extend(extensions.ProgressBar())
    trainer.run()
def test_access_parse_for_merged():
    """Tokenize a sentence, tag it, parse it twice, then hand the doc to ``ss``."""
    nlp = spacy.load('en_core_web_sm')
    doc = nlp.tokenizer("Highly rated - I'll definitely")
    # The parser is applied twice on purpose, mirroring the original sequence.
    for component in (nlp.tagger, nlp.parser, nlp.parser):
        component(doc)
    ss(doc)
def __init__(self, model=None, output_dir=None, n_iter=50, threshold_success=None, successive_successes=3, input_data = "TRAINING_DATA.json", param='ner'):
    """Load the model and the training data, then train and test immediately.

    model: name/path for ``spacy.load``; ``None`` loads the default ``'de'``
        model with the parser switched off.
    output_dir, n_iter, threshold_success, successive_successes: stored on
        the instance unchanged.
    input_data: path of the JSON file loaded into ``self.TRAIN_DATA``.
    param: forwarded to ``self.train``.
    """
    if model is not None:
        self.nlp = nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        self.nlp = nlp = spacy.load('de', parser=False)  # load default `de` model
        print("Loaded existing 'de' model")

    self.output_dir = output_dir
    self.n_iter = n_iter
    self.threshold_success = threshold_success
    self.successive_successes = successive_successes

    # Close the file deterministically instead of leaking the handle.
    with open(input_data) as data_file:
        self.TRAIN_DATA = json.load(data_file)

    self.train(param)
    self.test()
def __init__(self,lang='en'):
    """Create the spaCy wrapper and load the model for ``lang``.

    lang: model name passed to ``spacy.load`` (default ``'en'``).

    Raises:
        Exception: if the ``spacy`` package cannot be imported.
    """
    try:
        import spacy
    except ImportError as err:
        # Only a failed import means "not installed"; the bare `except:` used
        # before also swallowed KeyboardInterrupt/SystemExit.
        raise Exception("spacy not installed. Use `pip install spacy`.") from err
    super(SpaCy, self).__init__(name="spaCy")
    # Honour the requested language; 'en' used to be hard-coded here, which
    # silently ignored the `lang` argument.
    self.model = spacy.load(lang)
def main(argv):
    """Compare two web articles and return their bag-of-words cosine similarity.

    argv: sequence whose first two items are passed to ``getOnlyText`` (the
        original comments describe them as article URLs).

    Two scores are printed: spaCy's document-vector similarity and the cosine
    similarity of the lower-cased bag-of-words counts.  The latter is returned.

    Raises:
        TypeError: if fewer than two arguments are supplied.
    """
    if not argv or len(argv) < 2:
        raise TypeError("not enough arguments. two are required")

    # print() with a single pre-formatted string produces the same output on
    # Python 2 and Python 3; the former print statements were Python 2 only.
    print("Loading Spacy's English model")
    nlp = spacy.load('en')

    # Fetch the text of both articles.
    utext1 = getOnlyText(argv[0])
    utext2 = getOnlyText(argv[1])

    # Tokenize with spaCy, then let filterText clean the documents.
    print("Parsing files")
    doc1 = filterText(nlp(utext1), nlp)
    doc2 = filterText(nlp(utext2), nlp)

    # Similarity between the two document vectors, as computed by spaCy.
    print("Document Vectors Similarity Score: %s" % (doc1.similarity(doc2),))

    # Bag of words keyed by the LOWER attribute.
    wordBag1 = doc1.count_by(LOWER)
    wordBag2 = doc2.count_by(LOWER)

    # Put both bags into one vector space and compare them.
    vect = DictVectorizer(sparse=False)
    wordbagVectors = vect.fit_transform([wordBag2, wordBag1])
    score = 1 - spatial.distance.cosine(wordbagVectors[0], wordbagVectors[1])
    print("Bag of Words Cosine Similarity Score: %s" % (score,))
    return score
def create_nlp_instance():
    """Build the English pipeline with emoji detection and hashtag merging.

    The spacymoji ``Emoji`` component is inserted first; a custom component
    that re-joins '#' tokens with the following text is appended last.
    """
    import spacy
    from spacymoji import Emoji

    nlp = spacy.load('en')
    nlp.add_pipe(Emoji(nlp), first=True)

    def _merge_one_hashtag(doc):
        """Merge the first mergeable '#' token; report whether a merge happened."""
        for token in doc:
            if token.text != '#' or token.head is None:
                continue
            start_index = token.idx
            # Character span covering '#' plus the text of the token's head.
            end_index = start_index + len(token.head.text) + 1
            if doc.merge(start_index, end_index) is not None:
                return True
        return False

    # Merge hashtag tokens which were split by spacy
    def hashtag_pipe(doc):
        # A merge changes the doc, so scanning restarts after each one and
        # stops once a full pass merges nothing.
        while _merge_one_hashtag(doc):
            pass
        return doc

    nlp.add_pipe(hashtag_pipe)
    return nlp
def main():
    """Train a two-class ``LinearModel`` on the IMDB data and print dev scores.

    Texts are turned into arrays of token ``orth`` IDs via the spaCy "en"
    pipeline.  The dev score is printed after every epoch and once more with
    the optimizer's averaged parameters.
    """
    train, dev = datasets.imdb()
    train_X, train_y = zip(*train)
    dev_X, dev_y = zip(*dev)
    model = LinearModel(2)
    train_y = to_categorical(train_y, nb_classes=2)
    dev_y = to_categorical(dev_y, nb_classes=2)

    nlp = spacy.load("en")

    def to_orth_arrays(texts):
        # One uint64 array of token IDs per document.
        return [
            model.ops.asarray([tok.orth for tok in doc], dtype="uint64")
            for doc in nlp.pipe(texts)
        ]

    train_X = to_orth_arrays(train_X)
    dev_X = preprocess(model.ops, to_orth_arrays(dev_X))

    def report_dev_score():
        print(model.evaluate(dev_X, dev_y))

    with model.begin_training(train_X, train_y, L2=1e-6) as (trainer, optimizer):
        trainer.dropout = 0.0
        trainer.batch_size = 512
        trainer.nb_epoch = 3
        trainer.each_epoch.append(lambda: report_dev_score())
        for X, y in trainer.iterate(train_X, train_y):
            keys_vals_lens = preprocess(model.ops, X)
            scores, backprop = model.begin_update(keys_vals_lens,
                                                  drop=trainer.dropout)
            backprop(scores - y, optimizer)

    with model.use_params(optimizer.averages):
        report_dev_score()
def __init__(self, nlp=None, greedyness=0.5, max_dist=50, max_dist_match=500, conll=None, use_no_coref_list=True, debug=False):
    """Set up the coreference resolver: parameters, spaCy pipeline and weights.

    nlp: an already loaded spaCy pipeline; when ``None`` 'en_core_web_sm' is
        used if ``spacy.info`` finds it, otherwise the 'en' model.
    greedyness, max_dist, max_dist_match, debug: stored on the instance.
    conll: selects the "weights/conll/" model directory when not ``None``; it
        is also forwarded to ``Data`` as ``conll`` and ``consider_speakers``.
    use_no_coref_list: forwarded to ``Data``.
    """
    self.greedyness = greedyness
    self.max_dist = max_dist
    self.max_dist_match = max_dist_match
    self.debug = debug

    if nlp is None:
        print("Loading spacy model")
        model = 'en_core_web_sm'
        try:
            spacy.info(model)
        except IOError:
            print("No spacy 2 model detected, using spacy1 'en' model")
            model = 'en'
        nlp = spacy.load(model)

    weights_subdir = "weights/conll/" if conll is not None else "weights/"
    model_path = os.path.join(PACKAGE_DIRECTORY, weights_subdir)
    embed_model_path = os.path.join(PACKAGE_DIRECTORY, "weights/")
    print("loading model from", model_path)
    self.data = Data(nlp, model_path=embed_model_path, conll=conll,
                     use_no_coref_list=use_no_coref_list,
                     consider_speakers=conll)
    self.coref_model = Model(model_path)

    # Per-document state, all empty at construction time.
    self.clusters = {}
    self.mention_to_cluster = []
    self.mentions_single_scores = {}
    self.mentions_single_features = {}
    self.mentions_pairs_scores = {}
    self.mentions_pairs_features = {}
def train(train_loc, dev_loc, shape, settings):
    """Train the similarity model on SNLI data and store it beside the spaCy model.

    train_loc, dev_loc: locations handed to ``read_snli``.
    shape: model shape; ``shape[0]`` is passed to ``create_dataset``.
    settings: dict read for "nr_epoch" and "batch_size" (and passed on to
        ``build_model``).

    The weights, minus entry 1 (the embedding matrix, which can be
    reconstructed), are pickled to ``<nlp.path>/similarity/model``; the
    architecture is written as JSON to ``<nlp.path>/similarity/config.json``.
    """
    train_texts1, train_texts2, train_labels = read_snli(train_loc)
    dev_texts1, dev_texts2, dev_labels = read_snli(dev_loc)

    print("Loading spaCy")
    nlp = spacy.load("en_vectors_web_lg")
    assert nlp.path is not None

    print("Processing texts...")
    train_X = create_dataset(nlp, train_texts1, train_texts2, 100, shape[0])
    dev_X = create_dataset(nlp, dev_texts1, dev_texts2, 100, shape[0])

    print("Compiling network")
    embeddings = get_embeddings(nlp.vocab)
    model = build_model(embeddings, shape, settings)

    print(settings)
    model.fit(
        train_X,
        train_labels,
        validation_data=(dev_X, dev_labels),
        epochs=settings["nr_epoch"],
        batch_size=settings["batch_size"],
    )

    out_dir = nlp.path / "similarity"
    if not out_dir.exists():
        out_dir.mkdir()
    print("Saving to", out_dir)

    weights = model.get_weights()
    # remove the embedding matrix. We can reconstruct it.
    del weights[1]
    with (out_dir / "model").open("wb") as file_:
        pickle.dump(weights, file_)
    with (out_dir / "config.json").open("w") as file_:
        file_.write(model.to_json())
def __init__(self, vdict_path, adict_path,
             batchsize=128, max_length=15, n_ans_vocabulary=1000, mode='train', data_shape=(2048)):
    """Store the batching parameters and load both vocabularies.

    vdict_path, adict_path: JSON files with the question and answer
        vocabularies; their sizes become ``n_vocabulary`` and
        ``n_ans_vocabulary`` (the latter replaces the constructor argument).
    batchsize, max_length, mode, data_shape: stored on the instance.

    NOTE(review): the assertion below only accepts ``mode == 'test'`` although
    the default is 'train', so constructing with the default mode fails.
    """
    self.batchsize = batchsize
    self.max_length = max_length
    self.n_ans_vocabulary = n_ans_vocabulary
    self.mode = mode
    self.data_shape = data_shape

    # Filled in later; nothing in this constructor sets them.
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None

    assert self.mode == 'test'

    # load vocabulary
    with open(vdict_path,'r') as vocab_file:
        vdict = json.load(vocab_file)
    with open(adict_path,'r') as answer_file:
        adict = json.load(answer_file)

    self.vdict = vdict
    self.n_vocabulary = len(vdict)
    self.adict = adict
    self.n_ans_vocabulary = len(adict)

    self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    self.glove_dict = {}  # word -> glove vector
def tokenizeText(sample,parser=spacy.load('en')):
    """Tokenize ``sample`` into cleaned, lower-cased lemmas.

    sample: text to tokenize.
    parser: callable turning text into spaCy tokens; the default pipeline is
        loaded once, when this function is defined.

    Each token is replaced by its stripped, lower-cased lemma (pronouns, whose
    lemma is "-PRON-", keep their lower-cased surface form).  Entries found in
    ``STOPLIST`` or ``SYMBOLS`` and pure-whitespace leftovers ("", " ", "\\n",
    "\\n\\n") are dropped.  Returns the remaining strings in document order.
    """
    # get the tokens using spaCy
    tokens = parser(sample)

    # lemmatize
    lemmas = [
        tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_
        for tok in tokens
    ]

    # One pass replaces three list rebuilds plus the repeated
    # `while x in tokens: tokens.remove(x)` loops, which were quadratic.
    whitespace_only = ("", " ", "\n", "\n\n")
    return [
        tok for tok in lemmas
        if tok not in STOPLIST
        and tok not in SYMBOLS
        and tok not in whitespace_only
    ]
def main(model_dir=None):
    """Train an entity recognizer on two example sentences and print the result.

    model_dir: when given, the trained recognizer is stored there through
        ``save_model``.
    """
    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)

    # v1.1.2 onwards
    if nlp.tagger is None:
        warning_lines = (
            '---- WARNING ----',
            'Data directory not found',
            'please run: `python -m spacy.en.download --force all` for better performance',
            'Using feature templates for tagging',
            '-----------------',
        )
        for line in warning_lines:
            print(line)
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    def span(prefix, entity, label):
        # (start, end, label) character offsets of `entity` right after `prefix`.
        return (len(prefix), len(prefix + entity), label)

    train_data = [
        (
            'Who is Shaka Khan?',
            [span('Who is ', 'Shaka Khan', 'PERSON')]
        ),
        (
            'I like London and Berlin.',
            [span('I like ', 'London', 'LOC'),
             span('I like London and ', 'Berlin', 'LOC')]
        )
    ]
    ner = train_ner(nlp, train_data, ['PERSON', 'LOC'])

    # Run tagger and recognizer by hand on the first training sentence.
    doc = nlp.make_doc('Who is Shaka Khan?')
    nlp.tagger(doc)
    ner(doc)
    for word in doc:
        print(word.text, word.orth, word.lower, word.tag_, word.ent_type_, word.ent_iob)

    if model_dir is not None:
        save_model(ner, model_dir)
def test_not_lemmatize_base_forms():
    """A token explicitly tagged 'VB' keeps its surface form as lemma."""
    nlp = spacy.load('en', parser=False)
    tokens = nlp(u"Don't feed the dog")
    verb = tokens[2]
    verb.tag_ = u'VB'
    assert verb.text == u'feed'
    assert verb.lemma_ == u'feed'
def load_spacy(name, **kwargs):
    """
    Log and load the spaCy pipeline (data, models and resources used for
    tokenizing, tagging, parsing, etc.) for one language.

    Args:
        name (str): standard 2-letter language abbreviation, e.g. 'en'
            (English) or 'de' (German)
        **kwargs: forwarded unchanged to :func:`spacy.load`; see the
            `spaCy docs <https://spacy.io/docs#english>`_ for details.
            Keywords listed by the original documentation:

            * via (str): non-default directory from which to load package data
            * vocab
            * tokenizer
            * parser
            * tagger
            * entity
            * matcher
            * serializer
            * vectors

    Returns:
        :class:`spacy.<lang>.<Language>`

    Raises:
        RuntimeError: if package can't be loaded
    """
    logger.info('Loading "%s" language spaCy pipeline', name)
    pipeline = spacy.load(name, **kwargs)
    return pipeline
def __init__(self, lemmatize_it=True, stem_it=True, normalize_it=True):
    """Store the processing switches and load the language resources.

    lemmatize_it, stem_it, normalize_it: flags stored on the instance.

    Also loads the spaCy 'en' pipeline, a gensim Porter stemmer and the NLTK
    English stop-word set.
    """
    # Processing switches.
    self.lemmatize_it, self.stem_it, self.normalize_it = (
        lemmatize_it, stem_it, normalize_it)

    # Language resources.
    self.parser = spacy.load('en')
    self.stemmer = gensim.parsing.PorterStemmer()
    english_stopwords = nltk.corpus.stopwords.words('english')
    self.stops = set(english_stopwords)
def load_model(model_dir):
    """Load a saved entity recognizer together with a matching pipeline.

    model_dir: directory containing ``vocab/strings.json``,
        ``vocab/lexemes.bin`` and the recognizer data.

    Returns the tuple ``(nlp, ner)``: the 'en' pipeline (without parser,
    entity recognizer and vectors) whose vocabulary was refreshed from
    ``model_dir``, and the ``EntityRecognizer`` loaded against it.
    """
    model_dir = pathlib.Path(model_dir)
    vocab_dir = model_dir / 'vocab'

    nlp = spacy.load('en', parser=False, entity=False, add_vectors=False)
    with (vocab_dir / 'strings.json').open('r', encoding='utf8') as strings_file:
        nlp.vocab.strings.load(strings_file)
    nlp.vocab.load_lexemes(vocab_dir / 'lexemes.bin')

    recognizer = EntityRecognizer.load(model_dir, nlp.vocab, require=True)
    return (nlp, recognizer)
def __init__(self, encoder_path, bpe_path):
    """Load the tokenizer pipeline, the encoder table and the BPE merge ranks.

    encoder_path: JSON file holding the encoder mapping; ``self.decoder`` is
        its inverse.
    bpe_path: text file with one merge per line; the first and last line
        are skipped.  ``self.bpe_ranks`` maps each merge (a tuple of its
        whitespace-separated parts) to its position in the file.
    """
    self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])

    # Both files are now closed deterministically; the bare open() calls used
    # before left the handles to the garbage collector.
    with open(encoder_path) as encoder_file:
        self.encoder = json.load(encoder_file)
    self.decoder = {v: k for k, v in self.encoder.items()}

    with open(bpe_path) as bpe_file:
        merge_lines = bpe_file.read().split('\n')[1:-1]
    merges = [tuple(merge.split()) for merge in merge_lines]
    self.bpe_ranks = dict(zip(merges, range(len(merges))))
    self.cache = {}
print("Top 10 keywords and their Score") # In[17]: for i, j, n in zip(scored_key, scored_value, range(10)): print(i, " \t:\t", j) # ### Topic Mining from The Keywords and Emotions/Sentiments Associated with it # Using <b>spaCy</b> for the same # In[18]: print("IMPORTING DICTIONARY AND OTHEL NLP TOOLS ") import spacy nlp = spacy.load('en_core_web_md') # <b>Retrieving the generic Domain Specific Buckets</b> # # Basic Domain knowledge goes into our model here , identifiying key categories to look into # In[16]: # topic_list={'OVERALL':['RECOMMEND', 'EXPERIANCE','VALUE']} # topic_list.update({'METRO':['METRO', 'METRO RAIL','TRAIN','RAIL','METRO STATION']}) # topic_list.update({'SERVICES':['CUSTOMER CARE', 'COMPLINTS','SECURITY','SMART CARD','PAYMENT','METRO TICKET']}) # topic_list.update({'BUS':['BUS TICKET', 'CONDUCTOR','BUS','PAYMENT','TICKET']}) # topic_list.update({'OTHERS':['OTHERS']}) # topic_list.update({'OPERATIONAL':['SCHEDULE', 'TIMINGS','RELIABILITY']}) # topic_list.update({'LOGISTICS':['BUS STOPS', 'BUS STATIONS','METRO STATIONS','BUS SEATS','SEATS','TICKETING MACHIENE']})
from NER import * import time import telebot import spacy import os import json output_dir = os.getcwd() MODEL = spacy.load(output_dir) TOKEN = "881147208:AAGDY-ZvgqonfxS12Dn3GDPubCl4jiJtPJA" bot = telebot.TeleBot(token=TOKEN) FLAG = False FLAG2 = True diseasess = [] potential_diseas = [] things_to_ask = [] greeting_words = ["hi", "hello", "hey", "helloo", "hellooo", "g morining", "gmorning", "good morning", "morning", "good day", "good afternoon", "good evening", "greetings", "greeting", "good to see you", "its good seeing you", "how are you", "how're you", "how are you doing", "how ya doin'", "how ya doin", "how is everything", "how is everything going", "how's everything going", "how is you", "how's you", "how are things", "how're things", "how is it going", "how's it going", "how's it goin'", "how's it goin", "how is life been treating you",
import json
import spacy
import numpy as np

nlp = spacy.load("en_vectors_web_lg")


def _stack_rows(rows):
    """Stack 1-D vectors into a 2-D array with one row per vector.

    With no rows the result is the same empty 1-D array the previous
    implementation returned for empty input.
    """
    if not rows:
        return np.array([])
    # A single vstack replaces the repeated np.concatenate calls, which copied
    # the growing matrix once per token.
    return np.vstack(rows)


def get_vectors(wordlist):
    """Return one word-vector row per spaCy token of the space-joined ``wordlist``.

    wordlist: iterable of items; each is converted with ``str`` and the
        results are joined with single spaces before tokenization.

    Returns a 2-D array (tokens x vector width), or an empty 1-D array when
    there are no tokens.  The row width follows the loaded model rather than
    being fixed at 300.
    """
    tokens = nlp(" ".join(str(s) for s in wordlist))
    return _stack_rows([token.vector.reshape(1, -1) for token in tokens])


def get_weighted_vectors(wordlist, scores):
    """Like ``get_vectors``, but multiply each token vector by its score.

    wordlist: iterable of items, joined as in ``get_vectors``.
    scores: iterable of weights paired with the tokens by position; pairing
        stops at the shorter of the two sequences.
    """
    tokens = nlp(" ".join(str(s) for s in wordlist))
    return _stack_rows([
        np.multiply(token.vector, np.asarray(score)).reshape(1, -1)
        for token, score in zip(tokens, scores)
    ])
def train_spacy():
    """Continue training the NER model stored in ./model and evaluate it.

    Training examples come from "traindata.json" (converted by
    convert_dataturks_to_spacy and cleaned by trim_entity_spans); the model is
    updated for 10 iterations with every pipe except 'ner' disabled.  It is
    then run over the examples from "testdata.json", per-entity metrics are
    printed, and the pipeline is written back to "model".
    """
    TRAIN_DATA = trim_entity_spans(
        convert_dataturks_to_spacy("traindata.json"))
    # nlp = spacy.blank('en')  # create blank Language class at start
    nlp = spacy.load("./model")  # load the existing model so training adds to it
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe('ner')

    # add labels: item [2] of every entity annotation is its label
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # NOTE(review): begin_training() is called on a model loaded from
        # disk; this may re-initialise the weights that were just loaded --
        # confirm against the spaCy version in use.
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    # test the model and evaluate it
    examples = convert_dataturks_to_spacy("testdata.json")
    c = 0  # number of test documents processed so far
    for text, annot in examples:
        # f = open("resumes"+str(c)+".txt", "w")
        doc_to_test = nlp(text)
        # d = {}
        # for ent in doc_to_test.ents:
        #     d[ent.label_] = []
        # for ent in doc_to_test.ents:
        #     d[ent.label_].append(ent.text)
        #
        # for i in set(d.keys()):
        #
        #     f.write("\n\n")
        #     f.write(i + ":" + "\n")
        #     for j in set(d[i]):
        #         f.write(j.replace('\n', '')+"\n")

        # Per label: [seen flag, precision, recall, f-score, accuracy, count].
        # NOTE(review): d is rebuilt for every test document, so the summary
        # printed after the loop only reflects the last document -- confirm
        # whether accumulation across documents was intended.
        d = {}
        for ent in doc_to_test.ents:
            d[ent.label_] = [0, 0, 0, 0, 0, 0]
        for ent in doc_to_test.ents:
            doc_gold_text = nlp.make_doc(text)
            gold = GoldParse(doc_gold_text, entities=annot.get("entities"))
            # One-vs-rest token labels for the current entity label.
            y_true = [
                ent.label_ if ent.label_ in x else 'Not ' + ent.label_
                for x in gold.ner
            ]
            y_pred = [
                x.ent_type_ if x.ent_type_ == ent.label_ else 'Not ' +
                ent.label_ for x in doc_to_test
            ]
            # Score each label only once per document (the seen flag is d[..][0]).
            if (d[ent.label_][0] == 0):
                print("For Entity " + ent.label_ + "\n")
                print(classification_report(y_true, y_pred) + "\n")
                (p, r, f, s) = precision_recall_fscore_support(y_true,
                                                               y_pred,
                                                               average='weighted')
                a = accuracy_score(y_true, y_pred)
                d[ent.label_][0] = 1
                d[ent.label_][1] += p
                d[ent.label_][2] += r
                d[ent.label_][3] += f
                d[ent.label_][4] += a
                d[ent.label_][5] += 1
        c += 1

    # NOTE(review): d is unbound here when there are no test examples.
    for i in d:
        print("\n For Entity " + i + "\n")
        print("Accuracy : " + str((d[i][4] / d[i][5]) * 100) + "%")
        print("Precision : " + str(d[i][1] / d[i][5]))
        print("Recall : " + str(d[i][2] / d[i][5]))
        print("F-score : " + str(d[i][3] / d[i][5]))

    nlp.to_disk("model")
import numpy as np
from sklearn.datasets import fetch_20newsgroups  # the dataset
import spacy  # NLP preprocessing
from gensim import corpora, models  # bag of words & tf-idf
import re
import pandas as pd
from pprint import pprint
import pyLDAvis  # plotting of the topic model
import pyLDAvis.gensim  # gensim adapter for pyLDAvis
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore", category=DeprecationWarning)  # turn off warnings

# Training split of 20 Newsgroups, including the labels.
dataset = fetch_20newsgroups(subset='train', shuffle=True)
nlp = spacy.load('en_core_web_sm')

# Keep only the raw documents and strip e-mail addresses from them.
# The pattern is a raw string: '\S' and '\s' in a normal string literal are
# invalid escape sequences and trigger a warning on current Python versions.
email_pattern = re.compile(r'\S*@\S*\s?')
data = dataset.data
data = [email_pattern.sub('', sent) for sent in data]

texts = []  # one list of lemmas per document
# Only words with one of these parts of speech are kept.
allowed_pos = ['ADJ', 'ADV', 'NOUN', 'PROPN', 'VERB']
for document in data:
    text = []
    doc = nlp(document)
    for w in doc:
        # Drop stop words, punctuation, number-like tokens and words whose
        # part of speech is not in allowed_pos; keep the lemma of the rest.
        if (not w.is_stop) and (not w.is_punct) and (not w.like_num) and (w.pos_ in allowed_pos):
            text.append(w.lemma_)
    texts.append(text)

# gensim dictionary over the lemmas, and the bag-of-words corpus built from it.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
# Example sentence pairs (kept for reference):
#turk-eng:
#sentence = "Allaha ve Peygamberine kim inanmamışsa bilsin ki şüphesiz Biz inkarcılar için çılgın alevli cehennemi hazırlamışızdır"
#sentence_eng = "And whoever does not accept faith in Allah and His Noble Messenger – We have indeed kept prepared a blazing fire for disbelievers"
#azer-turkish
#sentence = "O yer üzünü sizin üçün beşik etmiş orada sizin üçün yollar salmış və göydən su endirmişdir"
#sentence_turk = "O ki yeri size beşik yaptı ve onda sizin için yollar açtı gökten bir su indirdi"
#sentence = sentence.split()

# The English model must be installed first:
#python -m spacy download en
spacy_eng = spacy.load("en")


def tokenize_custom(x):
    """Tokenize by splitting on whitespace."""
    return x.split()


def tokenize_eng(text):
    """Tokenize English text with the spaCy tokenizer; returns token strings."""
    return [tok.text for tok in spacy_eng.tokenizer(text)]


# Choose the source-side tokenizer from the configured source language.
src_tokenizer = None
trg_tokenizer = None
if set_source_to in ("azerbaijani", "turkish"):
    src_tokenizer = tokenize_custom
elif set_source_to == "english":
    src_tokenizer = tokenize_eng
# TODO aprimorar ementa ementa = re.search('provido', tempo) if fim == None or ementa == None: return None return sent[ementa.start()+7:fim.start()+14] def relatorios(n, cases): final = [] i = 0 while i < n: if getRelatorio(cases.julgado[i]) != None: final.append(getRelatorio(cases.julgado[i])) i += 1 return final npl = spacy.load("pt_core_news_sm") relat = relatorios(1000, train_df) relat[6] train_df.resultado[range(50)] opcoes = ['provido', 'improvido', 'parcial', 'não conhecido', 'desconhecido'] i=0 relat_npl = [] while i < 4000: x = npl(relat[i]) relat_npl.append(x) i = i+1 import os
# Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # use spacy small model # nlp = en_core_web_lg.load() import spacy nlp = spacy.load('en_core_web_lg') from lists_patterns import load_lists,fpath # dependency markers for subjects SUBJECTS = {"nsubj", "nsubjpass", "csubj", "csubjpass", "agent", "expl"} # dependency markers for objects OBJECTS = {"dobj", "dative", "attr", "oprd","pobj"} # POS tags that will break adjoining items BREAKER_POS = {"CCONJ", "VERB"} # words that are negations NEGATIONS = {"no", "not", "n't", "never", "none"} # does dependency set contain any coordinating conjunctions? def contains_conj(depSet): return "and" in depSet or "or" in depSet or "nor" in depSet or \
# Lazily-initialised pipeline shared by every getNouns() call.
_NOUN_CHUNK_NLP = None


def getNouns(raw_parsed_result):
    """Return the text of every noun chunk found in ``raw_parsed_result``.

    The ``en_core_web_sm`` pipeline is loaded on the first call and reused
    afterwards, instead of being re-read from disk on every invocation.

    Args:
        raw_parsed_result: The text to analyse.

    Returns:
        A list with the text of each noun chunk, in document order.
    """
    global _NOUN_CHUNK_NLP
    if _NOUN_CHUNK_NLP is None:
        _NOUN_CHUNK_NLP = spacy.load("en_core_web_sm")
    doc = _NOUN_CHUNK_NLP(raw_parsed_result)
    return [chunk.text for chunk in doc.noun_chunks]
day = int(tokens[0]) year = 2020 else: day = int(tokens[0]) month = int(tokens[1]) year = 2020 elif len(tokens) == 1: day = int(tokens[0]) year = 2020 return (day, month, year) # !python3 -m spacy download en_core_web_lg import spacy sp_lg = spacy.load('en_core_web_lg') import nltk import re query_date = {} for idx, row in enumerate(rows): date_occurences = [(ent.text.strip(), ent.label_) for ent in sp_lg(row).ents if ent.label_ == 'DATE'] query_date[row] = [] for date in date_occurences: try: date_token = re.split('\s+|/|-|:', date[0]) day, month, year = decode_date(date_token) query_date[row].append((f'{year}-{month}-{day}')) row = row.replace(date[0], "", 1)
def load_spacy(language):
    """Load the spaCy pipeline that corresponds to ``language``.

    ``"en"`` selects ``en_core_web_lg``; every other value selects
    ``xx_ent_wiki_sm``.
    """
    if language == "en":
        return spacy.load("en_core_web_lg")
    return spacy.load("xx_ent_wiki_sm")
# stdlib import re # third party import toml import spacy as sp import toolz as fp import pandas as pd # load spacy model print( "[WARN] Loading `en_core_web_md` model from spacy. Might take a few seconds." ) nlp = sp.load("en_core_web_md") # load in default_regex strings with open("optimus/etc/regexes") as handle: replace = toml.load(handle) # helper for regex def default_cleaner(string, regex_dict=replace): """ default_cleaner(string, regex_dict={"regex":"replacement"}) A default cleaner for text. The goal for this is to remove the unnecessary words and other things such as numbers from the text. The default dictionary for this is in the ``optimus/etc/regexes`` file. A version of this function that maps across a list exists
def main():
    """Run the Streamlit topic-modelling page over recent "news" tweets.

    Steps: render the page header and date pickers, authenticate against the
    Twitter API with the four keys stored in ``API Twitter.txt``, fetch up to
    300 tweets, clean and lemmatise them, build TF-IDF / count matrices and,
    when the "Search" button is pressed, call ``Predict_Topics`` (defined
    outside this function) and display its result.
    """
    import datetime
    import pickle
    import re
    import string

    import emoji
    import nltk
    import numpy as np
    import pandas as pd
    import spacy
    import streamlit as st
    import tweepy
    from nltk.corpus import stopwords
    from nltk.corpus import wordnet
    from nltk.stem.wordnet import WordNetLemmatizer
    # import LDA from sklearn
    from sklearn.decomposition import LatentDirichletAllocation
    # import vectorizers
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

    st.title("Streamlit (Topic_Modelling App)")
    html_temp = """
    <div style="background-color:tomato;padding:10px">
    <h1 style="color:white;text-align:center;">Topic_Modelling</h1>
    </div>"""
    st.markdown(html_temp, unsafe_allow_html=True)

    # Date pickers for the search window.
    Date1 = st.sidebar.date_input(
        'start date',
        datetime.date.today() - datetime.timedelta(days=7))
    # NOTE(review): the end date is collected but the effective query below
    # never used it (the query that passed `until=` was overwritten before
    # being iterated) -- confirm whether it should be applied.
    Date2 = st.sidebar.date_input('end date', datetime.date.today())

    # Keys and tokens for the Twitter API, one per line. The file is read
    # inside a `with` block so the handle is always closed.
    with open('API Twitter.txt', 'r') as key_file:
        mykeys = key_file.read().splitlines()
    api_key = mykeys[0]
    api_key_secret = mykeys[1]
    access_token = mykeys[2]
    access_token_secret = mykeys[3]

    auth = tweepy.OAuthHandler(consumer_key=api_key,
                               consumer_secret=api_key_secret)
    auth.set_access_token(access_token, access_token_secret)
    api = tweepy.API(auth, wait_on_rate_limit=True)

    # Fetch the tweets. Only this query was ever iterated; the earlier,
    # immediately-overwritten cursor was dead code and has been dropped.
    search_words = "news"
    date_since = Date1
    tweets = tweepy.Cursor(api.search,
                           q=search_words,
                           lang="en",
                           since=date_since).items(300)

    s = [tweet.text for tweet in tweets]
    print(s)
    df = pd.DataFrame({'tweet': s})

    words = set(nltk.corpus.words.words())

    # Hoisted out of the loop: these do not depend on the tweet.
    punctuation_table = str.maketrans('', '', string.punctuation)
    # NOTE(review): this pattern also strips "RT"/"ht" inside longer words.
    rt_pattern = re.compile(r"(RT)?(ht)?")
    non_word_pattern = re.compile(r"[\W\d]")  # non-word characters and digits
    non_ascii_letter_pattern = re.compile(r"[^a-zA-Z]")  # foreign characters

    cleaned_tweet = []
    for raw_tweet in df.tweet:
        text = raw_tweet.translate(punctuation_table)
        text = rt_pattern.sub("", text)
        text = non_word_pattern.sub(" ", text)
        text = non_ascii_letter_pattern.sub(" ", text)
        # Keep dictionary words (and any non-alphabetic leftovers).
        text = " ".join(w for w in nltk.wordpunct_tokenize(text)
                        if w.lower() in words or not w.isalpha())
        cleaned_tweet.append(text)

    df['cleaned_tweet'] = cleaned_tweet
    df1 = df.copy()
    corpus = df1.cleaned_tweet.unique()

    # nltk.download('wordnet')
    # nltk.download('averaged_perceptron_tagger')
    def get_wordnet_pos(word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV
        }
        return tag_dict.get(tag, wordnet.NOUN)

    # Stop words: the NLTK English list plus corpus-specific filler words.
    stop = set(stopwords.words('english'))
    stop.update([
        "new", "news", 'via', 'take', 'first', 'one', 'say', 'time', 'big',
        'see', 'come', 'good', 'another', 'today', 'make', 'get', 'great',
        'could', 'like', 'make', 'set', 'end', 'dont'
    ])
    # punctuation
    exclude = set(string.punctuation)
    # lemmatization
    lemma = WordNetLemmatizer()

    def clean(doc):
        """Lower-case, drop stop words and punctuation, then lemmatise."""
        stop_free = " ".join(
            [i for i in doc.lower().split() if i not in stop])
        punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
        normalized = " ".join(
            lemma.lemmatize(word, get_wordnet_pos(word))
            for word in punc_free.split())
        return normalized

    # Tokenised, cleaned documents with words of one or two letters removed.
    clean_corpus = [clean(doc).split() for doc in corpus]
    clean_corpus = [[word for word in doc if len(word) > 2]
                    for doc in clean_corpus]

    # Tag the whole corpus as one text block and keep only the words spaCy
    # labels as PROPN, X, NOUN or ADJ.
    one_block = " ".join(' '.join(doc) for doc in clean_corpus)
    nlp = spacy.load('en_core_web_sm')
    doc_block = nlp(one_block)
    final_corpus = [
        token.text for token in doc_block
        if token.pos_ in ('PROPN', 'X', 'NOUN', 'ADJ')
    ]
    imp_words = set(final_corpus)

    # NOTE(review): `new_clean_corpus` is never used afterwards -- the
    # vectorizers below are fitted on `clean_corpus`. Confirm which corpus
    # was intended before changing it.
    new_clean_corpus = [[word for word in doc if word in imp_words]
                        for doc in clean_corpus]

    # Converting text into numerical representation. The documents are
    # already token lists, so the tokenizer is the identity function.
    tf_idf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc,
                                        lowercase=False)
    cv_vectorizer = CountVectorizer(tokenizer=lambda doc: doc,
                                    lowercase=False)
    # Array from TF-IDF Vectorizer
    tf_idf_arr = tf_idf_vectorizer.fit_transform(clean_corpus)
    # Array from Count Vectorizer
    cv_arr = cv_vectorizer.fit_transform(clean_corpus)

    # Materialize the sparse data
    data_dense = cv_arr.todense()
    # Compute Sparsicity = Percentage of Non-Zero cells
    print("Sparsicity: ",
          ((data_dense > 0).sum() / data_dense.size) * 100, "%")

    # Vocabulary arrays describing the fitted corpus.
    vocab_tf_idf = tf_idf_vectorizer.get_feature_names()
    vocab_cv = cv_vectorizer.get_feature_names()

    result = ""
    if st.button("Search"):
        result = Predict_Topics(cv_arr, vocab_tf_idf)
        # NOTE(review): the argument handed to st.success is whatever
        # st.write returns, not `result` itself -- confirm the intended
        # display.
        st.success(st.write(result))
@author: nirmalenduprakash Identifies important entities in a document and discovers sentiment towards these entities """ import spacy import numpy as np import pandas as pd #!pip install git+htt!ps://github.com/huggingface/neuralcoref.git import nltk #nltk.download('punkt') import neuralcoref from wordcloud import WordCloud, STOPWORDS nlp = spacy.load("en_core_web_sm") stopwords = set(STOPWORDS) pronouns = ['i', 'he', 'she', 'you', 'we', 'they', 'them', 'it', 'his', 'who'] def read_file(filepath): df = pd.read_table(filepath, sep=' ') if ('POS_TAGS' in df.columns): df.drop(['POS_TAGS'], inplace=True, axis=1) return df df_ug = read_file( '/Users/nirmalenduprakash/Documents/Project/NLP/Sentiment Mining/IBM_Debater_(R)_SC_COLING_2018/LEXICON_UG.txt' ) df_bg = read_file(
def __init__(self):
    """Load the countries table and the spaCy pipeline, then register the
    token patterns of the nationality matcher and the influence matcher.

    ``self.mappa`` maps the string-store id of every influence rule name
    ("influence1" .. "influence14") back to that name.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    self.df = pd.read_csv(os.path.join(here, "../data/countries.csv"))
    self.utils = nlpUtils()
    self.nlp = spacy.load("en_core_web_sm")

    # --- nationality patterns -------------------------------------------
    self.nationality_matcher = Matcher(self.nlp.vocab)
    nationality_rules = [
        [
            {'LEMMA': 'be'},
            {'POS': 'DET'},
            {'ENT_TYPE': {"IN": ["GPE", "NORP", "LANGUAGE"]}, 'OP': "*"},
            {'POS': {"IN": ["NOUN", "PROPN", "PUNCT", "ADJ", "SYM"]}, "OP": "*"},
            {'POS': {"IN": ["NOUN", "PROPN", "ADJ"]}, "OP": "+"},
        ],
        [
            {'LEMMA': 'be'},
            {'POS': 'DET'},
            {'ENT_TYPE': {"IN": ["GPE", "NORP", "LANGUAGE"]}, 'OP': "*"},
            {"DEP": {"IN": ["punct", "compound", "amod", "nmod"]}, "OP": "*"},
            {'POS': 'NOUN'},
            {"POS": {"IN": ["PUNCT", "NOUN", "ADJ", "PROPN"]}, "OP": "*"},
            {'ORTH': 'and'},
            {'POS': {"IN": ["NOUN", "PROPN", "PUNCT", "ADJ"]}, "OP": "*"},
            {'POS': {"IN": ["NOUN", "PROPN", "ADJ"]}, "OP": "+"},
        ],
    ]
    self.nationality_matcher.add("nationality", nationality_rules)

    # --- influence patterns ---------------------------------------------
    # Rule name -> list of token patterns, kept in registration order.
    influence_rules = {
        "influence1": [
            [
                {'LEMMA': {"IN": ["inspire", "influence"]}, "POS": 'VERB'},
                {'ORTH': 'by'},
                {"OP": "*"},
            ],
        ],
        "influence2": [
            [
                {'LEMMA': {"IN": ["cite", "refer", "list", "mention", "credit", "claim"]}, "POS": 'VERB'},
                {"OP": "*"},
                {'LEMMA': {"IN": ["as", "among"]}},
                {"OP": "*"},
                {'LEMMA': 'influence', "POS": 'NOUN'},
                {"OP": "*"},
            ],
            [
                {'LEMMA': {"IN": ["cite", "refer", "list", "mention", "credit", "claim"]}, "POS": 'VERB'},
                {"OP": "*"},
                {'LEMMA': 'be'},
                {"OP": "*"},
                {'LEMMA': 'influence', "POS": 'NOUN'},
            ],
        ],
        "influence3": [
            [
                {'LEMMA': 'influence', "POS": 'NOUN'},
                {'ORTH': 'include', "POS": 'VERB'},
                {"OP": "*"},
            ],
        ],
        "influence4": [
            [
                {'ORTH': 'influences', "POS": 'NOUN'},
                {'ORTH': 'cited'},
                {'ORTH': 'by'},
                {"OP": "*"},
                {'ORTH': 'include', "POS": 'VERB'},
                {"OP": "*"},
            ],
        ],
        "influence5": [
            [
                {'LEMMA': 'cite', "POS": 'VERB'},
                {'ORTH': ','},
                {"ORTH": "as"},
                {"OP": "*"},
                {'ORTH': 'influences', "POS": 'NOUN'},
                {"OP": "*"},
            ],
        ],
        "influence6": [
            [
                {'LEMMA': 'state', "POS": 'VERB'},
                {"OP": "*"},
                {'LEMMA': 'influence', "POS": 'NOUN'},
                {'LEMMA': 'be'},
                {"OP": "*"},
            ],
        ],
        "influence7": [
            [
                {'ORTH': 'influences', "POS": 'NOUN'},
                {"ORTH": "?"},
                {"ORTH": "such"},
                {"ORTH": "as"},
                {"OP": "*"},
            ],
        ],
        "influence8": [
            [
                {'LEMMA': {"IN": ["cite", "name"]}, "POS": "VERB"},
                {"OP": "*"},
                {"ORTH": "as"},
                {"ORTH": "one"},
                {"ORTH": "of"},
                {"OP": "*"},
                {"ORTH": "'s"},
                {'LEMMA': 'influence', "POS": 'NOUN'},
            ],
        ],
        "influence9": [
            [
                {'LEMMA': 'influence', "POS": 'NOUN'},
                {"ORTH": "including"},
                {"OP": "*"},
            ],
        ],
        "influence10": [
            [
                {'LEMMA': 'influence', "POS": 'NOUN'},
                {"OP": "*"},
                {"ORTH": "from"},
                {"OP": "*"},
            ],
        ],
        "influence11": [
            [
                {'ORTH': 'citing', "POS": 'VERB'},
                {"ORTH": "as"},
                {'LEMMA': 'influence', "POS": 'NOUN'},
                {"OP": "*"},
            ],
        ],
        "influence12": [
            [
                {'LEMMA': 'influence', "POS": 'NOUN'},
                {'LEMMA': 'be'},
                {"OP": "*"},
            ],
        ],
        "influence13": [
            [
                {'LEMMA': 'influence', "POS": 'NOUN'},
                {'ORTH': 'of'},
                {"OP": "*"},
            ],
        ],
        "influence14": [
            [
                {'LEMMA': 'inspiration', "POS": 'NOUN'},
                {'ORTH': {"IN": ["from", "include"]}},
                {"OP": "*"},
            ],
            [
                {'LEMMA': 'cite', "POS": 'VERB'},
                {"OP": "*"},
                {"ORTH": "as"},
                {'LEMMA': 'inspiration', "POS": 'NOUN'},
            ],
        ],
    }

    self.influence_matcher = Matcher(self.nlp.vocab)
    for rule_name, rule_patterns in influence_rules.items():
        self.influence_matcher.add(rule_name, rule_patterns)

    # Reverse lookup: string-store id of the rule name -> rule name.
    self.mappa = dict()
    for rule_name in influence_rules:
        self.mappa[self.nlp.vocab.strings[rule_name]] = rule_name
def extract(lang, tweet, params):
    """Compute spaCy-based count features for one tweet.

    Args:
        lang: Name passed to ``spacy.load`` to select the pipeline.
        tweet: Mapping whose ``'text'`` entry holds the tweet text.
        params: Mapping of feature name to its settings. Recognised feature
            names are ``'tense'``, ``'pos_counts'``, ``'word_counts'`` and
            ``'stopword_counts'``; other names are ignored. The ``'tense'``
            settings provide the ``'present'`` and ``'past'`` collections of
            fine-grained tags to count.

    Returns:
        An ``OrderedDict`` of feature name to count, filled in the order the
        features appear in ``params``.
    """
    # NOTE(review): the pipeline is loaded on every call; callers that
    # process many tweets may want to load it once and reuse it.
    nlp = spacy.load(lang)
    nlp.max_length = SPACY_MAX_LENGTH
    tweet_dict = OrderedDict()

    # Coarse POS counters. Initialised for every language so the function no
    # longer fails with NameError when ``lang`` is not 'en'.
    pos_dict = {
        'SPACE': 0, 'NOUN': 0, 'PART': 0, 'PRON': 0, 'INTJ': 0, 'SYM': 0,
        'ADJ': 0, 'CCONJ': 0, 'PUNCT': 0, 'X': 0, 'VERB': 0, 'ADP': 0,
        'ADV': 0, 'PROPN': 0, 'NUM': 0, 'DET': 0, 'AUX': 0, 'SCONJ': 0
    }
    tense_dict = {'Past': 0, 'Pres': 0}
    n_stopwords = 0

    # Tense tag sets are optional: without a 'tense' entry nothing is
    # counted instead of raising KeyError.
    tense_settings = params.get('tense') or {}
    present_tags = tense_settings.get('present', ())
    past_tags = tense_settings.get('past', ())

    for word in nlp(tweet['text']):
        # n_pos features; .get() tolerates tags missing from the table above
        pos = word.pos_
        pos_dict[pos] = pos_dict.get(pos, 0) + 1
        # n_stopwords
        if word.is_stop:
            n_stopwords += 1
        # n_tense features
        tag = word.tag_
        if tag in present_tags:
            tense_dict['Pres'] += 1
        elif tag in past_tags:
            tense_dict['Past'] += 1

    # Iterate over the feature names only, so the ``params`` argument is not
    # rebound by the loop.
    for feature in params:
        if feature == 'tense':
            tweet_dict['n_tense_past'] = tense_dict['Past']
            tweet_dict['n_tense_pres'] = tense_dict['Pres']
        elif feature == 'pos_counts':
            tweet_dict['n_pos_space'] = pos_dict['SPACE']
            tweet_dict['n_pos_noun'] = pos_dict['NOUN']
            tweet_dict['n_pos_par'] = pos_dict['PART']
            tweet_dict['n_pos_pron'] = pos_dict['PRON']
            tweet_dict['n_pos_intj'] = pos_dict['INTJ']
            tweet_dict['n_pos_sym'] = pos_dict['SYM']
            tweet_dict['n_pos_adj'] = pos_dict['ADJ']
            tweet_dict['n_pos_conj'] = pos_dict['CCONJ']
            tweet_dict['n_pos_punct'] = pos_dict['PUNCT']
            tweet_dict['n_pos_x'] = pos_dict['X']
            tweet_dict['n_pos_verb'] = pos_dict['VERB']
            tweet_dict['n_pos_adp'] = pos_dict['ADP']
            tweet_dict['n_pos_adv'] = pos_dict['ADV']
            tweet_dict['n_pos_propn'] = pos_dict['PROPN']
            tweet_dict['n_pos_num'] = pos_dict['NUM']
            tweet_dict['n_pos_det'] = pos_dict['DET']
        elif feature == 'word_counts':
            tweet_dict['n_words'] = len(tweet['text'].split())
        elif feature == 'stopword_counts':
            tweet_dict['n_stopwords'] = n_stopwords
    return tweet_dict
import pytest import spacy import string import numpy as np from alibi.explainers import AnchorText from alibi.explainers.anchor_text import Neighbors from alibi.explainers.tests.utils import get_dataset from alibi.explainers.tests.utils import predict_fcn from alibi.utils.download import spacy_model # load spaCy model model = 'en_core_web_md' spacy_model(model=model) nlp = spacy.load(model) def find_punctuation(text: str) -> int: """ Returns nb of punctuation marks in a string. """ punctuation = set([s for s in string.punctuation]) tokens = set(text.split()) return len(tokens & punctuation) @pytest.mark.parametrize('lr_classifier', ((get_dataset('movie_sentiment')), ), indirect=True)
def __init__(self, model="en_core_web_sm"):
    """Load the spaCy pipeline named ``model`` into ``self._nlp``.

    Passing ``None`` selects the default ``en_core_web_sm`` pipeline.
    """
    pipeline_name = "en_core_web_sm" if model is None else model
    self._nlp = spacy.load(pipeline_name)
def __init__(self, path='data/input/'):
    """Load the spaCy pipeline and the publication input found under ``path``.

    The pipeline is loaded with the ``ner`` and ``textcat`` components
    disabled.
    """
    disabled_components = ['ner', 'textcat']
    self.nlp = spacy.load('en', disable=disabled_components)
    input_processor = ProcessInput()
    self.pi = input_processor
    self.pub_df = input_processor.load_publication_input(path=path)
def __init__(self):
    """Load the ``en_core_web_sm`` spaCy pipeline and keep it on ``self.nlp``."""
    pipeline = spacy.load("en_core_web_sm")
    self.nlp = pipeline
def __init__(self):
    """Load the ``en_core_web_lg`` spaCy pipeline and keep it on ``self.spacynlp``."""
    pipeline = spacy.load('en_core_web_lg')
    self.spacynlp = pipeline
def default_nlp():
    """Return a freshly loaded ``en_core_web_sm`` spaCy pipeline."""
    pipeline = spacy.load('en_core_web_sm')
    return pipeline
def howParse(sentence, answer):
    ''' Parser for questions that begin with the adverbial modifier how

    Builds an (arg1, rel, arg2) extraction from a parsed "how ..." question
    and its answer. Returns None when the first token is not "how" or when
    the second token matches none of the handled cases; otherwise prints
    "How parse" and returns ``Extract(arg1=..., rel=..., arg2=...)`` built
    from the output of ``extractHelper``.

    NOTE(review): the nesting below was reconstructed from a
    whitespace-collapsed copy of this function -- confirm it against the
    original file.
    '''
    arg1 = []
    arg2 = []
    rel = []
    # `argument` and `relTrue` are assigned but never read in this function.
    argument = False
    relTrue = False
    Object = False
    poss = False
    objTrue = False
    if sentence[0].lower_ != "how":
        return None
    # Case 1: "how much ..." / "how many ..." -- the answer becomes arg1.
    if sentence[1].lower_ == "much" or sentence[1].lower_ == "many":
        arg1 = answer
        argument = False
        for child in sentence:
            if "subj" in child.dep_:
                _, arg2 = descendants(sentence, child, True)
            if "obj" in child.dep_:
                Object = True
        _, rel = descendants(sentence, sentence.root, True)
        # The relation keeps only tokens that are not already part of arg2.
        rel = [token for token in rel if not token in arg2]
        stopwords = ['much', 'many']
        arg2 = [word for word in arg2 if word.lower_ not in stopwords]
        rel = [token for token in rel if token.lower_ not in stopwords]
        # Drop auxiliaries from the relation, except the token "to".
        rel = [
            child for child in rel
            if "aux" not in child.dep_ or child.lower_ == "to"
        ]
        if len(arg2) != 0:
            if sentence[1].lower_ == "much" and arg2[0].dep_ != "prep":
                arg2.insert(0, "in")
        if sentence[1].lower_ == "many" and Object == False:
            rel.insert(0, "number")
    # Case 2: "how did ..." / "how is ..." -- the answer is appended to arg2.
    elif sentence[1].lower_ == "did" or sentence[1].lower_ == "is":
        for child in sentence:
            if "obj" in child.dep_:
                _, rel = descendants(sentence, child, True, sentence.root)
            if "subj" in child.dep_:
                _, arg1 = descendants(sentence, child, True)
        arg2.append(answer)
        # The answer is parsed separately to look for a possessive token.
        # NOTE(review): this loads the pipeline on every call that reaches
        # this branch.
        nlp = spacy.load('en')
        answerDep = nlp(answer)
        if sentence[1].lower_ != "is":
            for token in answerDep:
                if token.dep_ == "poss":
                    arg2.insert(0, "with")
                    poss = True
            if poss == False:
                arg2.insert(0, "by")
        else:
            arg2.insert(0, "as")
        argument = True
    # Case 3: the second token is a complement or an adverbial modifier.
    elif "comp" in sentence[1].dep_ or sentence[1].dep_ == "advmod":
        for child in sentence:
            if "obj" in child.dep_:
                objTrue = True
        if objTrue == True and sentence[1].dep_ == "advmod":
            for children in sentence:
                if "obj" in children.dep_:
                    _, arg1 = descendants(sentence, children, True)
            # Here `rel` becomes a string rather than a list of tokens.
            rel = sentence.root.lower_
            arg2.append(answer)
            for advmod in sentence:
                if advmod.dep_ == "advmod":
                    arg2.append(advmod.lower_)
        if objTrue == True and "comp" in sentence[1].dep_:
            for token in sentence:
                if "subj" in token.dep_:
                    _, arg1 = descendants(sentence, token, True)
                    break
            rel = sentence.root.lower_
            arg2.append(answer)
            for comp in sentence:
                if "comp" in comp.dep_:
                    arg2.append(comp.lower_)
    else:
        return None
    print("How parse")
    arg1, rel, arg2 = extractHelper(arg1, rel, arg2)
    return Extract(arg1=arg1, rel=rel, arg2=arg2)
import random
import math
import time
import collections
from collections import Counter
import math
import numpy as np
import subprocess
# from google.colab import drive
# drive.mount('/content/drive')
#
# The two pipelines below have to be installed first:
# !python -m spacy download en
# !python -m spacy download fr
spacy_en = spacy.load('en')
# NOTE(review): despite its name, `spacy_de` holds the French ('fr')
# pipeline; the German load is the commented-out line at the bottom.
spacy_de = spacy.load('fr')
# preparing data
# Seed every random number generator used here so runs are repeatable.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True
# spacy_de = spacy.load('de')
def generate_features(df):
    """Generate the review features used for modelling.

    The following features are generated:
    'days_yelping', 'word_count', 'unique_words_perc', 'no_adjectives',
    'perc_unique_verbs', 'perc_unique_adjectives', 'norm_of_wordvecs',
    'adj_wordvecs', 'verb_wordvecs',
    'adj_maj_sent__negative', 'adj_maj_sent__objective',
    'adj_maj_sent__positive', 'adj_maj_sent__no_sentiment_assigned',
    'verb_maj_sent_negative', 'verb_maj_sent_objective',
    'verb_maj_sent_positive', 'verb_maj_sent_no_sentiment_assigned',
    'useful_review_compliments_received_for_other_reviews'

    Args:
        df: DataFrame that contains at least the columns selected at the top
            of the function body.

    Returns:
        A DataFrame with the selected input columns (minus 'yelping_since'),
        the intermediate columns created along the way and the features
        listed above.
    """
    print('----- Feature Generation -----')
    feature_data = df[[
        'useful_01', 'text', 'stars', 'average_stars', 'compliment_more',
        'compliment_hot', 'compliment_photos', 'compliment_writer',
        'compliment_plain', 'fans', 'review_count', 'yelping_since',
        'user_id', 'useful_review'
    ]]

    # Per-user sum of 'useful_review', minus the current review's own value.
    feature_data = feature_data.merge(
        pd.DataFrame(
            feature_data.groupby(feature_data['user_id']).useful_review.sum()),
        how='left',
        on='user_id',
        suffixes=('', '_compliments_received_for_other_reviews'))
    feature_data['useful_review_compliments_received_for_other_reviews'] = (
        feature_data['useful_review_compliments_received_for_other_reviews']
        - feature_data['useful_review'])

    nlp = spacy.load('en_core_web_lg', disable=['ner', "parser"])

    # define for each token the attribute _.ignore:
    def ignore_getter(token):
        return (
            token.is_stop or  # remove stop words
            token.lower_ in STOP_WORDS or  # ignore stop words independent of their case
            # token.like_num or  # ignore stuff that looks like a number
            # token.is_digit or  # ignore tokens consisting of digits
            token.is_punct or  # ignore punctuation tokens
            token.is_left_punct or  # ignore left punctuation
            token.is_right_punct or  # ignore right punctuation
            token.is_space or  # ignore tokens consisting of spaces
            token.is_bracket or  # ignore brackets
            token.is_quote or  # ignore quotation marks
            not token.is_alpha)  # ignore everything that is not only letters
        # (this might be too strict, but without it special characters
        # like +$% etc will stay). With it, however, most of the previous
        # stuff is not needed..

    Token.set_extension('ignore', getter=ignore_getter,
                        force=True)  # add the _.ignore attribute

    def get_days_yelping(input_timestamp):
        """Days between the date part of ``input_timestamp`` and today."""
        days_yelp = datetime.now().date() - datetime.strptime(
            input_timestamp.split()[0], '%Y-%m-%d').date()
        return days_yelp.days

    def get_tokens(nlp_text):
        """Tokens that are not ignored and are longer than one character."""
        return [
            token for token in nlp_text
            if not token._.ignore and len(token) > 1
        ]

    def get_unique__perc(token_list):
        """Share of distinct token strings in ``token_list`` (0 if empty)."""
        if len(token_list) == 0:
            return 0
        return len({str(tok) for tok in token_list}) / len(token_list)

    def get_word_vectors(token_list):
        """Norm of the mean word vector of ``token_list`` (0 if empty)."""
        if len(token_list) == 0:
            return 0
        word_vectors = [token.vector for token in token_list]
        return np.linalg.norm(sum(word_vectors) / len(token_list))

    def get_sentiment(token_list, nlp_review):
        """Get the majority sentiment of ``token_list`` using SentiWordNet"""
        # WordNet (and thus SentiWordNet) use different POS tags than spacy output
        # here we transform from spacy to WordNet
        wordNet_pos_dict = {"NOUN": "n", "VERB": "v", "ADJ": "a", "ADV": "r"}

        def posTag_to_wordNetTag(tag):
            return wordNet_pos_dict.get(tag)

        # to get the semantic score of a word from SentiWordNet we first need to find this word in WordNet
        # This process is called Word Sense Disambiguation and we use the simple lesk algorithm for that
        def get_semantic_score_with_context(token, nlp_review):
            word = token.lower_  # get lowercased token text
            # Position of the token in the document, counted in tokens.
            # ``token.idx`` (a character offset) was used here before, which
            # does not match the token-based slicing of ``nlp_review`` below.
            position = token.i
            # get POS of token, for better word sense disambiguation
            pos = posTag_to_wordNetTag(token.pos_)
            # define how many tokens around the token of interest we look at
            num_surrounding_words = 10
            # careful if there are less then num_surrounding_words before our token or after our token
            leftmost_word_idx = max(0, position - num_surrounding_words)
            rightmostword_idx = min(len(nlp_review),
                                    position + num_surrounding_words)
            surrounding_text = nlp_review[
                leftmost_word_idx:rightmostword_idx].text
            # determine word with the closest sense in WordNet
            try:
                word_with_closest_sense = simple_lesk(surrounding_text,
                                                      word,
                                                      pos=pos)
            except Exception:
                # Best-effort fallback: retry without the POS restriction.
                word_with_closest_sense = simple_lesk(surrounding_text, word)
            # find the sentiment score to the word we found in wordnet
            if word_with_closest_sense:
                sentiword = swn.senti_synset(word_with_closest_sense.name())
                sent_scores = {
                    "objective": sentiword.obj_score(),
                    "positive": sentiword.pos_score(),
                    "negative": sentiword.neg_score()
                }
                return max(sent_scores, key=sent_scores.get)
            return 'no_sentiment_assigned'

        if len(token_list) == 0:
            return 'no_sentiment_assigned'
        sentiments = [
            get_semantic_score_with_context(token, nlp_review)
            for token in token_list
        ]
        counts = Counter(sentiments)
        return max(counts, key=counts.get)

    feature_data['days_yelping'] = feature_data.yelping_since.swifter.apply(
        get_days_yelping)
    del feature_data['yelping_since']
    feature_data['text_processed'] = feature_data.text.swifter.apply(
        lambda x: nlp(x))
    feature_data['tokens'] = feature_data.text_processed.swifter.apply(
        lambda x: get_tokens(x))
    feature_data['word_count'] = feature_data.tokens.swifter.apply(
        lambda x: len(x))
    feature_data['unique_words_perc'] = feature_data.tokens.swifter.apply(
        get_unique__perc)
    feature_data['adjectives'] = feature_data.tokens.swifter.apply(
        lambda x: [token for token in x if token.pos_ == 'ADJ'])
    feature_data['no_adjectives'] = feature_data.adjectives.swifter.apply(
        lambda x: len(x))
    feature_data['verbs'] = feature_data.tokens.swifter.apply(
        lambda x: [token for token in x if token.pos_ == 'VERB'])
    feature_data['perc_unique_verbs'] = feature_data.verbs.swifter.apply(
        get_unique__perc)
    feature_data[
        'perc_unique_adjectives'] = feature_data.adjectives.swifter.apply(
            get_unique__perc)
    feature_data['norm_of_wordvecs'] = feature_data.tokens.swifter.apply(
        get_word_vectors)
    feature_data['adj_wordvecs'] = feature_data.adjectives.swifter.apply(
        get_word_vectors)
    feature_data['verb_wordvecs'] = feature_data.verbs.swifter.apply(
        get_word_vectors)
    feature_data['adj_maj_sent'] = feature_data.swifter.apply(
        lambda x: get_sentiment(x.adjectives, x.text_processed), axis=1)
    feature_data['verb_maj_sent'] = feature_data.swifter.apply(
        lambda x: get_sentiment(x.verbs, x.text_processed), axis=1)

    # One-hot encode the two majority-sentiment columns and attach them.
    feature_data = feature_data.merge(
        pd.get_dummies(feature_data[['adj_maj_sent', 'verb_maj_sent']],
                       prefix=['adj_maj_sent_', 'verb_maj_sent']),
        left_index=True,
        right_index=True)
    # feather.write_dataframe(feature_data, os.getcwd() + 'feature_data.feather')
    return feature_data
in American history, including the Declaration of Independence, the Emancipation Proclamation, and the United States Constitution. Early in his speech, King alludes to Abraham Lincoln's Gettysburg Address by saying "Five score years ago ..." In reference to the abolition of slavery articulated in the Emancipation Proclamation, King says: "It came as a joyous daybreak to end the long night of their captivity." Anaphora (i.e., the repetition of a phrase at the beginning of sentences) is employed throughout the speech. Early in his speech, King urges his audience to seize the moment; "Now is the time" is repeated three times in the sixth paragraph. The most widely cited example of anaphora is found in the often quoted phrase "I have a dream", which is repeated eight times as King paints a picture of an integrated and unified America for his audience. Other occasions include "One hundred years later", "We can never be satisfied", "With this faith", "Let freedom ring", and "free at last". King was the sixteenth out of eighteen people to speak that day, according to the official program.''' stopwords = list(STOP_WORDS) nlp = spacy.load('en_core_web_sm') doc = nlp(text) tokens = [token.text for token in doc ] punctuation = punctuation + '\n' + ' ' + ' ' + '...' + '\n ' word_frequency = {} for word in doc: if word.text.lower() not in stopwords: if word.text.lower() not in punctuation: if word.text not in word_frequency.keys(): word_frequency[word.text] = 1 else: word_frequency[word.text] += 1
import spacy

# Pipeline used for the entity-recognition demo below.
nlp = spacy.load("en_core_web_md")

# Alternative sample text:
# doc = nlp("Calderon-Zygmund operators are objects that are largely responsible for our understanding of a number of physical phenomena, from heat transfer to turbulence")
doc = nlp(
    "With this award, the Chemical Structure, Dynamics and Mechanisms (CSDM-A) Program of the Division of Chemistry is funding Professor Istvan Z. Kiss and his research group at Saint Louis University to study pattern formation of electrochemical reactions"
)

# For every recognised entity print its text, character span and label.
for ent in doc.ents:
    entity_fields = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*entity_fields)
def __init__(self):
    """Start with an empty query and load the ``en_core_web_sm`` pipeline."""
    self.query = ""
    pipeline = spacy.load('en_core_web_sm')
    self.nlp = pipeline
# Make it work for Python 2+3 and with Unicode try: to_unicode = unicode except NameError: to_unicode = str # Import necessary packages import spacy nlp = spacy.load('en') import subprocess from nltk.stem.wordnet import WordNetLemmatizer lemmatizer = WordNetLemmatizer() from nltk.corpus import wordnet as wn import re pattern = re.compile(r'(<IN>)*(<DT>)*(<JJ>)*(<NN>|<NNS>|<NNP>)+') w_words = ['when', 'who', 'what', 'why', 'how', 'where'] import json from collections import defaultdict import pandas as pd from flask import Flask, render_template, request, redirect import re import io from fuzzywuzzy import process, fuzz ## Some import word lists aux_verb = [ 'be', 'can', 'could', 'dare', 'do', 'have', 'may', 'might', 'must', 'need', 'ought', 'shall', 'should', 'will', 'would' ] wh_words = ['when', 'why', 'how', 'what', 'where', 'who']
else: # Uppercase followed by lowercase chars = [c.lower() if i % 2 else c.upper() for i, c in enumerate(text)] # Create augmented training example example_dict = example.to_dict() doc = nlp.make_doc("".join(chars)) example_dict["token_annotation"]["ORTH"] = [t.text for t in doc] # Original example followed by augmented example # yield example yield example.from_dict(doc, example_dict) return augment dir(example.) nlp = spacy.load("da_core_news_sm") doc = nlp("Mit navn er Kenneth og Malte og Jakob og Kenneth.") ent_dict = {"PER": get_names()} example = Example(doc, doc) lc_augmenter = create_augmenter_sponge(randomize=False) res = next(lc_augmenter(nlp, example)) res.text res augment = create_augmenter(ent_dict, prob=1) res res = next(augment(nlp, res)) res.text res dir(example)