def __init__(self):
    self.nlp = spacy.load('en')
    self.word_vectors = sense2vec.load()
    self.handlers = {
        'similar': Sense(self.nlp, self.word_vectors),
        'parse': Parse(self.nlp),
        'vector': Sense(self.nlp, self.word_vectors),
        # 'intent': Intent(self.nlp, self.word_vectors)
        # 'converse':
        # 'person':
        # 'address':
        # 'date':
        # 'email':
    }
def sense_2_vec(allowed_str):
    # Sense2Vec:
    # Originally from reddit, then through sense2vec. I modify sense2vec
    # by taking a frequency-weighted average of all the parts of speech of
    # each word I seek, since they are often close in the space.
    # NOT normalized.
    # 128 dimensions.
    a = an.load(fnames[4], verbosity=1)
    if a is not None:
        a.add_evaluators(get_e())
        a.analysis(print_report=False)
        a.save()
    else:
        import sense2vec
        s2v = sense2vec.load('/mnt/pccfs/not_backed_up/nate/'
                             'analyst_embeddings/reddit_vectors-1.1.0/')
        strings = []
        vectors = []
        endings = [
            '|ADJ', '|ADP', '|ADV', '|AUX', '|CONJ', '|DET', '|INTJ',
            '|NOUN', '|NUM', '|PART', '|PRON', '|PROPN', '|PUNCT',
            '|SCONJ', '|SYM', '|VERB', '|X'
        ]
        for s in allowed_str:
            senses = []
            freq_sum = 0
            for e in endings:
                try:
                    t = s2v[s + e]  # t is a (frequency, vector) pair
                    senses.append(t[1] * t[0])  # vector scaled by frequency
                    freq_sum += t[0]
                except KeyError:  # this word/POS combination is not in the vocabulary
                    pass
            if len(senses) > 0:
                strings.append(s)
                vectors.append(np.sum(senses, axis=0) / freq_sum)
        a = an.Analyst(embeddings=np.array(vectors), strings=strings,
                       metric=metric, auto_print=printing, desc="Sense2Vec",
                       parallel_count=cpus, evaluators=get_e(), auto_save=2,
                       file_name=fnames[4], over_write=True)
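# A toy sketch (not from the original code) of the frequency-weighted
# averaging that sense_2_vec performs above, using made-up 2-d vectors in
# place of real 128-d sense2vec entries.
import numpy as np

senses = [(120, np.array([1.0, 0.0])),   # e.g. word|NOUN: (freq, vector)
          (30,  np.array([0.0, 1.0])),   # e.g. word|VERB
          (50,  np.array([0.5, 0.5]))]   # e.g. word|ADJ

weighted_sum = np.sum([freq * vec for freq, vec in senses], axis=0)
freq_sum = sum(freq for freq, _ in senses)
collapsed = weighted_sum / freq_sum
print(collapsed)  # -> [0.725 0.275]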
import re
#import spacy
#nlp = spacy.load('en')
#nlp = spacy.load('en_vectors_web_lg')
from textblob import Word
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
# Stopwords in English
from nltk.corpus import stopwords as sw
en_sw = set(sw.words('english'))
from nltk.corpus import wordnet_ic
import sense2vec
s2v = sense2vec.load('/usr/local/lib/python3.5/dist-packages/sense2vec/reddit_vectors-1.1.0/')

def readFile(ruta, diccPos, diccNeg):
    """
    Read a file, extract the information (word 1, word 2, feature, value)
    line by line, and insert it into the dictionaries.
    Input:
        ruta     path of the file to be read (string)
        diccPos  dictionary of candidate positive examples (dict)
        diccNeg  dictionary of candidate negative examples (dict)
    Return:
        the two dictionaries diccPos and diccNeg (dict)
    """
    print("\n>>> File (train): ", ruta, " extracting data ...")
    # Two passes: one to fill the dictionaries, another to delete duplicate keys
    for i in range(0, 2):
        try:
            file = open(ruta, "r")
        except IOError:
import sense2vec
import fileinput

model = sense2vec.load()
print("Enter topic: ")
for line in fileinput.input():
    freq, query_vector = model[u"{}|NOUN".format(line.strip())]
    print(model.most_similar(query_vector, n=10))
    print("Enter topic: ")
import sense2vec

sense_vec_model = sense2vec.load()

def get_stop_words_list(path='code/words.txt'):
    """Read the stopwords file and return its entries as a list."""
    with open(path) as f:
        return [line.strip() for line in f]

stop_words = get_stop_words_list()
def get_corefnlp():
    global _coref_nlp
    if _coref_nlp is None:
        _coref_nlp = sense2vec.load(COREF_MODEL)
    return _coref_nlp
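# A minimal usage sketch for the lazy loader above. It assumes the
# module-level cache variable and model path are defined elsewhere in the
# file; the path below is hypothetical.
_coref_nlp = None
COREF_MODEL = "reddit_vectors-1.1.0"

model_a = get_corefnlp()  # first call loads the vectors
model_b = get_corefnlp()  # later calls return the cached object
assert model_a is model_b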
from __future__ import unicode_literals
import hug
from hug_middleware_cors import CORSMiddleware
from spacy.lang.en import English
import sense2vec

SENSES = [
    'auto', 'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN',
    'NUM', 'PART', 'PERSON', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM',
    'VERB', 'NORP', 'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
    'WORK_OF_ART', 'LANGUAGE'
]

LEMMATIZER = English().vocab.morphology.lemmatizer
S2V = sense2vec.load('reddit_vectors-1.1.0')

@hug.get('/senses')
def senses():
    """Get all available 'senses', i.e. tags and labels."""
    return SENSES

@hug.post('/find')
def find(word: str, sense: str = 'auto', n_results: int = 200):
    """Find similar terms for a given term and optional sense."""
    best_word, best_sense = get_best(word, sense)
    if not word or not best_word:
        return {'text': word, 'sense': sense, 'results': [], 'count': 0}
    results = []
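# A hedged client-side sketch for the /find endpoint above, assuming the hug
# app is served locally (the host and port are hypothetical; hug's dev server
# defaults to 8000).
import requests

resp = requests.post(
    "http://localhost:8000/find",
    json={"word": "beekeepers", "sense": "NOUN", "n_results": 10},
)
print(resp.json())  # {'text': ..., 'sense': ..., 'results': [...], 'count': ...}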
def __init__(self):
    self.handler = Similarity(
        spacy.load('en', parser=False, entity=False),
        sense2vec.load()
    )
DO_TEST = 0
DO_PRINT = 0

def Log(message):
    if DO_PRINT:
        print(message)

# if 'sense2vec' not in sys.modules:
import sense2vec
import time

start = time.time()
print('loading sense2vec')
s2v_model = sense2vec.load("resources/reddit_vectors-1.1.0")
print('done loading in {0} s'.format(time.time() - start))

def loadSpacy():
    import spacy
    print('loading SPACY english')
    nlp = spacy.load('en')
    return nlp

def sense2vec_sim(token1, token2):
    try:
        return s2v_model.data.similarity(s2v_model[token1][1],
                                         s2v_model[token2][1])
    except ValueError:
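# A hypothetical usage sketch for sense2vec_sim above: both tokens must carry
# a "|SENSE" suffix matching keys in the loaded vector table.
score = sense2vec_sim(u"dog|NOUN", u"cat|NOUN")
Log("dog/cat similarity: {0}".format(score))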
def teach(dataset, vectors_path, seeds, threshold=0.85, top_n=200,
          batch_size=5, resume=False):
    """
    Bootstrap a terminology list with sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec.
    """
    SENSES = [
        "auto", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PERSON", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM",
        "VERB", "NORP", "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT",
        "WORK_OF_ART", "LANGUAGE"
    ]
    log("RECIPE: Starting recipe phrases.to-patterns", locals())
    LEMMATIZER = English().vocab.morphology.lemmatizer
    S2V = sense2vec.load(vectors_path)
    log("RECIPE: Finished loading sense2vec", locals())
    # Seems to be a bug in sense2vec which gets < n similar senses, not <= n
    batch_size = min(batch_size, top_n * len(seeds))
    top_n = top_n + 1
    DB = connect()
    seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seeds]
    DB.add_examples(seed_tasks, datasets=[dataset])
    accept_phrases = seeds
    reject_phrases = []
    seen = set(accept_phrases)
    sensed = set()
    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        prev_reject = [eg["text"] for eg in prev if eg["answer"] == "reject"]
        accept_phrases += prev_accept
        reject_phrases += prev_reject
        seen.update(set(accept_phrases))
        seen.update(set(reject_phrases))
        log("RECIPE: Resuming from {} previous examples in dataset {}".format(
            len(prev), dataset))

    def format_for_s2v(word, sense):
        return word.replace(" ", "_") + "|" + sense

    def get_best(word, sense):
        if sense != "auto":
            # If a sense is specified, find the respective entry
            if format_for_s2v(word, sense) in S2V:
                return (word, sense)
            return (None, None)
        freqs = []
        casings = ([word, word.upper(), word.title()]
                   if word.islower() else [word])
        for text in casings:  # try casing options
            for tag in SENSES:
                query = format_for_s2v(text, tag)
                if query in S2V:
                    freqs.append((S2V[query][0], (text, tag)))
        return max(freqs)[1] if freqs else (None, None)

    def get_similar(word, sense, n=100):
        query = format_for_s2v(word, sense)
        if query not in S2V:
            return []
        freq, query_vector = S2V[query]
        words, scores = S2V.most_similar(query_vector, n)
        words = [word.rsplit("|", 1) for word in words]
        # Don't know why we'd be getting unsensed entries, but fix.
        words = [entry for entry in words if len(entry) == 2]
        words = [(word.replace("_", " "), sense) for word, sense in words]
        return zip(words, scores)

    def find_similar(word: str, sense: str = "auto", n_results: int = top_n):
        """Find similar terms for a given term and optional sense."""
        best_word, best_sense = get_best(word, sense)
        results = []
        if not word or not best_word:
            return results
        seen = set([best_word, min(LEMMATIZER(best_word, best_sense))])
        similar = get_similar(best_word, best_sense, n_results)
        for (word_entry, sense_entry), score in similar:
            head = min(LEMMATIZER(word_entry, sense_entry))
            if head not in seen and score > threshold:
                freq, _ = S2V[format_for_s2v(word_entry, sense_entry)]
                results.append((score, word_entry))
                seen.add(head)
            if len(results) >= n_results:
                break
        return results

    def update(answers):
        """Update accept_phrases so that the stream can find new phrases."""
        for answer in answers:
            if answer['answer'] == 'accept':
                accept_phrases.append(answer['text'])
            elif answer['answer'] == 'reject':
                reject_phrases.append(answer['text'])

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase, and
        present examples to the user with a similarity above the threshold
        parameter."""
        while True:
            seen.update(set([rp.lower() for rp in reject_phrases]))
            for p in accept_phrases:
                if p.lower() not in sensed:
                    sensed.add(p.lower())
                    for score, phrase in find_similar(p):
                        if phrase.lower() not in seen:
                            seen.add(phrase.lower())
                            yield {"text": phrase, 'meta': {'score': score}}

    stream = get_stream()
    return {
        'view_id': 'text',
        'dataset': dataset,
        'stream': stream,
        'update': update,
        'config': {'batch_size': batch_size}
    }
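# A hypothetical direct invocation of the teach() recipe above, useful for
# smoke-testing outside the Prodigy CLI. It assumes Prodigy is installed
# (for connect/set_hashes/log) and that the vectors path exists; the dataset
# name and seed terms are made up.
components = teach(
    dataset="tech_terms",
    vectors_path="reddit_vectors-1.1.0",
    seeds=["machine learning", "neural network"],
)
stream = components["stream"]
print(next(stream))  # first suggestion, e.g. {"text": ..., "meta": {"score": ...}}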
import hug
from hug_middleware_cors import CORSMiddleware
from spacy.lang.en import English
import sense2vec

# fmt: off
SENSES = [
    "auto", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN",
    "NUM", "PART", "PERSON", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM",
    "VERB", "NORP", "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT",
    "WORK_OF_ART", "LANGUAGE"
]
# fmt: on

print("Loading")
LEMMATIZER = English().vocab.morphology.lemmatizer
S2V = sense2vec.load("reddit_vectors-1.1.0")
print("Loaded!")

@hug.get("/senses")
def senses():
    """Get all available 'senses', i.e. tags and labels."""
    return SENSES

@hug.post("/find")
def find(word: str, sense: str = "auto", n_results: int = 200):
    """Find similar terms for a given term and optional sense."""
    best_word, best_sense = get_best(word, sense)
    if not word or not best_word:
        return {"text": word, "sense": sense, "results": [], "count": 0}
import sense2vec

def test_sample():
    s2v = sense2vec.load('reddit_vectors')
    freq, query_vector = s2v[u"beekeepers|NOUN"]
    assert freq is not None
    assert s2v.most_similar(query_vector, 3)[0] == \
        [u'beekeepers|NOUN', u'honey_bees|NOUN', u'Beekeepers|NOUN']