Example no. 1
def __init__(self):
    self.nlp = spacy.load('en')
    self.word_vectors = sense2vec.load()
    self.handlers = {
        'similar': Sense(self.nlp, self.word_vectors),
        'parse': Parse(self.nlp),
        'vector': Sense(self.nlp, self.word_vectors),
        # 'intent': Intent(self.nlp, self.word_vectors)
        # 'converse':
        # 'person':
        # 'address':
        # 'date':
        # 'email':
    }
Example no. 2
    def sense_2_vec(allowed_str):
        # Sense2Vec:
        #   Vectors trained on Reddit and loaded through sense2vec. For each
        #   requested word, the lookup below takes a frequency-weighted average
        #   over all of that word's part-of-speech senses, since those senses
        #   tend to lie close together in the space.
        #   NOT normalized.
        #   128 dimensions.

        a = an.load(fnames[4], verbosity=1)
        if a is not None:
            a.add_evaluators(get_e())
            a.analysis(print_report=False)
            a.save()
        else:
            import sense2vec

            s2v = sense2vec.load('/mnt/pccfs/not_backed_up/nate/'
                                 'analyst_embeddings/reddit_vectors-1.1.0/')
            strings = []
            vectors = []
            endings = [
                '|ADJ', '|ADP', '|ADV', '|AUX', '|CONJ', '|DET', '|INTJ',
                '|NOUN', '|NUM', '|PART', '|PRON', '|PROPN', '|PUNCT',
                '|SCONJ', '|SYM', '|VERB', '|X'
            ]
            for s in allowed_str:
                senses = []
                freq_sum = 0
                for e in endings:
                    try:
                        t = s2v[s + e]
                        senses.append(t[1] * t[0])
                        freq_sum += t[0]
                    except Exception:
                        # This sense is not in the vector table; skip it.
                        pass
                if len(senses) > 0:
                    strings.append(s)
                    vectors.append(np.sum(senses, axis=0) / freq_sum)
            a = an.Analyst(embeddings=np.array(vectors),
                           strings=strings,
                           metric=metric,
                           auto_print=printing,
                           desc="Sense2Vec",
                           parallel_count=cpus,
                           evaluators=get_e(),
                           auto_save=2,
                           file_name=fnames[4],
                           over_write=True)
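
The loop above relies on the (frequency, vector) tuple that a sense2vec lookup returns, the same convention used in the other examples below. A minimal sketch of that lookup, assuming a local copy of the reddit_vectors-1.1.0 model (the path is an assumption, not from the original snippet):

import sense2vec

s2v = sense2vec.load('reddit_vectors-1.1.0')             # path is an assumption
freq, vector = s2v['natural_language_processing|NOUN']   # key format: "phrase|POS"
print(freq)           # corpus frequency, used above as the averaging weight (t[0])
print(vector.shape)   # the 128-dimensional embedding being weighted (t[1])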
Example no. 3
import re

#import spacy
#nlp = spacy.load('en')
#nlp = spacy.load('en_vectors_web_lg')
from textblob import Word

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
#Stopwords in English
from nltk.corpus import stopwords as sw
en_sw = set(sw.words('english'))
from nltk.corpus import wordnet_ic

import sense2vec
s2v = sense2vec.load('/usr/local/lib/python3.5/dist-packages/sense2vec/reddit_vectors-1.1.0/')

def readFile(ruta, diccPos, diccNeg):
    """
    Read the file line by line, extract the information (word 1, word 2,
    feature, value) from each line, and insert it into the dictionaries.
    Input:  ruta     path of the file to be read (str)
            diccPos  dictionary of candidate positive examples (dict)
            diccNeg  dictionary of candidate negative examples (dict)
    Return: the two dictionaries diccPos and diccNeg (dict)
    """
    print("\n>>> File (train): ", ruta, " extracting data ...")
    # Two passes: one to fill the dictionaries, one to remove duplicate keys
    for i in range(0, 2):
        try:
            file = open(ruta,"r")
        except IOError:
Example no. 4
import sense2vec
import fileinput

model = sense2vec.load()
print "Enter topic: "
for line in fileinput.input():
    freq, query_vector = model[u"{}|NOUN".format(line.strip())]
    print model.most_similar(query_vector, n=10)
    print "Enter topic: "
Example no. 5
import sense2vec

sense_vec_model = sense2vec.load()


def get_stop_words_list(path='code/words.txt'):
    """Read the stopwords file and return its entries as a list."""
    # Read as text and strip whitespace so the entries are plain strings,
    # and make sure a real list (not a lazy map object) is returned.
    with open(path, 'r') as f:
        stopwords = [line.strip() for line in f]
    return stopwords


stop_words = get_stop_words_list()
Example no. 6
def get_corefnlp():
    global _coref_nlp
    if _coref_nlp is None:
        _coref_nlp = sense2vec.load(COREF_MODEL)
    return _coref_nlp
Example no. 7
from __future__ import unicode_literals

import hug
from hug_middleware_cors import CORSMiddleware
from spacy.lang.en import English
import sense2vec

SENSES = [
    'auto', 'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
    'PART', 'PERSON', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'NORP',
    'FACILITY', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT', 'WORK_OF_ART',
    'LANGUAGE'
]

LEMMATIZER = English().vocab.morphology.lemmatizer
S2V = sense2vec.load('reddit_vectors-1.1.0')


@hug.get('/senses')
def senses():
    """Get all available 'senses', i.e. tags and labels."""
    return SENSES


@hug.post('/find')
def find(word: str, sense: str = 'auto', n_results: int = 200):
    """Find similar terms for a given term and optional sense."""
    best_word, best_sense = get_best(word, sense)
    if not word or not best_word:
        return {'text': word, 'sense': sense, 'results': [], 'count': 0}
    results = []
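
The snippet above exposes sense2vec as a small hug REST service. A hedged usage sketch, not part of the original: assuming the module is saved as app.py and served locally with "hug -f app.py" on hug's default port 8000 (file name, host, and port are assumptions), the /find endpoint could be queried like this:

import requests

# Ask the running service for terms similar to a phrase; the parameters
# mirror the find() signature above (word, sense, n_results).
resp = requests.post(
    'http://localhost:8000/find',
    params={'word': 'natural language processing', 'sense': 'auto', 'n_results': 10},
)
print(resp.json())  # expected shape: {'text': ..., 'sense': ..., 'results': [...], 'count': ...}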
Example no. 8
def __init__(self):
    self.handler = Similarity(spacy.load('en', parser=False, entity=False),
                              sense2vec.load())
Example no. 9
DO_TEST = 0
DO_PRINT = 0


def Log(message):
    if DO_PRINT:
        print(message)


# if 'sense2vec' not in sys.modules:
import sense2vec
import time

start = time.time()
print('loading sense2vec')

s2v_model = sense2vec.load("resources/reddit_vectors-1.1.0")
print('done loading in {0} s'.format(time.time() - start))


def loadSpacy():
    import spacy
    print('loading SPACY english')
    nlp = spacy.load('en')
    return nlp


def sense2vec_sim(token1, token2):
    try:
        return s2v_model.data.similarity(s2v_model[token1][1],
                                         s2v_model[token2][1])
    except ValueError:
Example no. 10
def teach(dataset,
          vectors_path,
          seeds,
          threshold=0.85,
          top_n=200,
          batch_size=5,
          resume=False):
    """
    Bootstrap a terminology list sense2vec. Prodigy
    will suggest similar terms based on the the most similar
    phrases from sense2vec
    """
    SENSES = [
        "auto", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN",
        "NUM", "PART", "PERSON", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM",
        "VERB", "NORP", "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT",
        "WORK_OF_ART", "LANGUAGE"
    ]

    log("RECIPE: Starting recipe phrases.to-patterns", locals())
    LEMMATIZER = English().vocab.morphology.lemmatizer
    S2V = sense2vec.load(vectors_path)
    log("RECIPE: Finished loading sense2vec", locals())

    # There seems to be a bug in sense2vec that returns < n similar senses, not <= n
    batch_size = min(batch_size, top_n * len(seeds))
    top_n = top_n + 1

    DB = connect()
    seed_tasks = [set_hashes({"text": s, "answer": "accept"}) for s in seeds]
    DB.add_examples(seed_tasks, datasets=[dataset])

    accept_phrases = seeds
    reject_phrases = []

    seen = set(accept_phrases)
    sensed = set()

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        prev_reject = [eg["text"] for eg in prev if eg["answer"] == "reject"]
        accept_phrases += prev_accept
        reject_phrases += prev_reject

        seen.update(set(accept_phrases))
        seen.update(set(reject_phrases))
        log("RECIPE: Resuming from {} previous examples in dataset {}".format(
            len(prev), dataset))

    def format_for_s2v(word, sense):
        return word.replace(" ", "_") + "|" + sense

    def get_best(word, sense):
        if sense != "auto":  # if sense is specified, find respective entry
            if format_for_s2v(word, sense) in S2V:
                return (word, sense)
            return (None, None)
        freqs = []
        casings = ([word, word.upper(), word.title()]
                   if word.islower() else [word])
        for text in casings:  # try options
            for tag in SENSES:
                query = format_for_s2v(text, tag)
                if query in S2V:
                    freqs.append((S2V[query][0], (text, tag)))
        return max(freqs)[1] if freqs else (None, None)

    def get_similar(word, sense, n=100):
        query = format_for_s2v(word, sense)
        if query not in S2V:
            return []
        freq, query_vector = S2V[query]
        words, scores = S2V.most_similar(query_vector, n)
        words = [word.rsplit("|", 1) for word in words]
        # Don't know why we'd be getting unsensed entries, but fix.
        words = [entry for entry in words if len(entry) == 2]
        words = [(word.replace("_", " "), sense) for word, sense in words]
        return zip(words, scores)

    def find_similar(word: str, sense: str = "auto", n_results: int = top_n):
        """Find similar terms for a given term and optional sense."""
        best_word, best_sense = get_best(word, sense)
        results = []
        if not word or not best_word:
            return results
        seen = set([best_word, min(LEMMATIZER(best_word, best_sense))])
        similar = get_similar(best_word, best_sense, n_results)
        for (word_entry, sense_entry), score in similar:
            head = min(LEMMATIZER(word_entry, sense_entry))
            if head not in seen and score > threshold:
                freq, _ = S2V[format_for_s2v(word_entry, sense_entry)]
                results.append((score, word_entry))
                seen.add(head)
            if len(results) >= n_results:
                break
        return results

    def update(answers):
        """Updates accept_phrases so that the stream can find new phrases"""
        for answer in answers:
            if answer['answer'] == 'accept':
                accept_phrases.append(answer['text'])
            elif answer['answer'] == 'reject':
                reject_phrases.append(answer['text'])

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and presenting
        examples to the user with a similarity above the threshold parameter"""
        while True:
            seen.update(set([rp.lower() for rp in reject_phrases]))
            for p in accept_phrases:
                if p.lower() not in sensed:
                    sensed.add(p.lower())
                    for score, phrase in find_similar(p):
                        if phrase.lower() not in seen:
                            seen.add(phrase.lower())
                            yield {"text": phrase, 'meta': {'score': score}}

    stream = get_stream()

    return {
        'view_id': 'text',
        'dataset': dataset,
        'stream': stream,
        'update': update,
        'config': {
            'batch_size': batch_size
        }
    }
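
The recipe above is shown without its registration code. A minimal sketch, purely illustrative, of how such a function is typically wired up as a Prodigy recipe; the recipe name, argument annotations, wrapper function, and CLI invocation below are assumptions and not part of the original:

import prodigy

@prodigy.recipe(
    "terms.s2v-teach",  # hypothetical recipe name
    dataset=("Dataset to save annotations to", "positional", None, str),
    vectors_path=("Path to the sense2vec vectors", "positional", None, str),
    seeds=("Comma-separated seed phrases", "option", "s", lambda s: s.split(",")),
)
def s2v_teach(dataset, vectors_path, seeds):
    # Delegate to the teach() function defined above.
    return teach(dataset, vectors_path, seeds)

It would then be started from the command line with something like
prodigy terms.s2v-teach my_terms ./reddit_vectors-1.1.0 -s "machine learning,data science" -F recipe.py
(dataset name, vectors path, and seed phrases are placeholders).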
Example no. 11
import hug
from hug_middleware_cors import CORSMiddleware
from spacy.lang.en import English
import sense2vec

# fmt: off
SENSES = [
    "auto", "ADJ", "ADP", "ADV", "AUX", "CONJ", "DET", "INTJ", "NOUN", "NUM",
    "PART", "PERSON", "PRON", "PROPN", "PUNCT", "SCONJ", "SYM", "VERB", "NORP",
    "FACILITY", "ORG", "GPE", "LOC", "PRODUCT", "EVENT", "WORK_OF_ART",
    "LANGUAGE"
]
# fmt: on

print("Loading")
LEMMATIZER = English().vocab.morphology.lemmatizer
S2V = sense2vec.load("reddit_vectors-1.1.0")
print("Loaded!")


@hug.get("/senses")
def senses():
    """Get all available 'senses', i.e. tags and labels."""
    return SENSES


@hug.post("/find")
def find(word: str, sense: str = "auto", n_results: int = 200):
    """Find similar terms for a given term and optional sense."""
    best_word, best_sense = get_best(word, sense)
    if not word or not best_word:
        return {"text": word, "sense": sense, "results": [], "count": 0}
Example no. 12
import sense2vec


def test_sample():
    s2v = sense2vec.load('reddit_vectors')
    freq, query_vector = s2v[u"beekeepers|NOUN"]
    assert freq is not None
    assert s2v.most_similar(query_vector, 3)[0] == \
        [u'beekeepers|NOUN', u'honey_bees|NOUN', u'Beekeepers|NOUN']