Example #1
def test_sense2vec_other_senses():
    s2v = Sense2Vec(shape=(6, 4))
    s2v.cfg["senses"] = ["A", "B", "C", "D"]
    for key in ["a|A", "a|B", "a|C", "b|A", "b|C", "c|A"]:
        s2v.add(key, numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    others = s2v.get_other_senses("a|A")
    assert sorted(others) == ["a|B", "a|C"]
    others = s2v.get_other_senses("b|C")
    assert others == ["b|A"]
    others = s2v.get_other_senses("B|C")
    assert others == ["b|A"]
    others = s2v.get_other_senses("c|A")
    assert others == []
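The same lookup works against the pretrained vector packages; a minimal sketch, assuming a local copy of one of the released vector packages (the path is a placeholder and the senses returned depend entirely on the vectors):

from sense2vec import Sense2Vec

s2v = Sense2Vec().from_disk("/path/to/s2v_reddit_2019_lg")
if "duck|NOUN" in s2v:
    # e.g. other senses of "duck" present in the vectors, such as "duck|VERB"
    print(s2v.get_other_senses("duck|NOUN"))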
Example #2
def getVectorsDistance(word1, word2):
    '''
    If both given keys are in the vocabulary, returns the sense2vec similarity
    (cosine-based) of their corresponding vectors; otherwise returns 0.
    '''
    global s2v
    if s2v is None:
        # Vector files are not present in the repository
        s2v = Sense2Vec().from_disk(r"C:\SKOLA\machine_learning\Project\pretrainedVectors\s2v_reddit_2019_lg")
        # s2v = Sense2Vec().from_disk(r"C:\SKOLA\machine_learning\Project\wiki corpus/backup/exported")

    if word1 in s2v and word2 in s2v:
        return s2v.similarity(word1, word2)
    return 0
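The function name mentions distance, but s2v.similarity is a cosine-style similarity; if a normalized Euclidean distance is actually wanted, a minimal sketch reusing the module-level s2v from above (the helper name and the normalization choice are illustrative):

import numpy

def getVectorsEuclideanDistance(word1, word2):
    # Illustrative sketch: Euclidean distance between the two sense2vec
    # vectors, squashed into [0, 1) so values are comparable across pairs.
    if word1 in s2v and word2 in s2v:
        dist = float(numpy.linalg.norm(s2v[word1] - s2v[word2]))
        return dist / (1.0 + dist)
    return 0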
Example #3
def main(in_file, vocab_file, out_dir, min_freq_ratio=0.0, min_distance=0.0):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab = read_vocab(f)
    vectors = {}
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
    discarded = set()
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    n_vectors = len(vectors) - len(discarded)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
    for key, vector in vectors.items():
        if key not in discarded:
            s2v.add(key, vector)
            s2v.set_freq(key, vocab[key])
    msg.good("Created the sense2vec model")
    msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
Example #4
def test_sense2vec_to_from_bytes():
    s2v = Sense2Vec(shape=(2, 4))
    test_vector1 = numpy.asarray([1, 2, 3, 4], dtype=numpy.float32)
    test_vector2 = numpy.asarray([5, 6, 7, 8], dtype=numpy.float32)
    s2v.add("test1", test_vector1, 123)
    s2v.add("test2", test_vector2, 456)
    s2v_bytes = s2v.to_bytes()
    new_s2v = Sense2Vec().from_bytes(s2v_bytes)
    assert len(new_s2v) == 2
    assert new_s2v.vectors.shape == (2, 4)
    assert "test1" in new_s2v
    assert "test2" in new_s2v
    assert new_s2v.get_freq("test1") == 123
    assert new_s2v.get_freq("test2") == 456
    assert numpy.array_equal(new_s2v["test1"], test_vector1)
    assert numpy.array_equal(new_s2v["test2"], test_vector2)
    assert s2v_bytes == new_s2v.to_bytes()
    s2v_bytes2 = s2v.to_bytes(exclude=["strings"])
    new_s2v2 = Sense2Vec().from_bytes(s2v_bytes2)
    assert len(new_s2v2.strings) == 0
    assert "test1" in new_s2v2
    assert s2v.strings["test1"] in new_s2v2
    with pytest.raises(KeyError):  # can't resolve hash
        new_s2v2.strings[s2v.strings["test2"]]
Example #5
def test_sense2vec_best_sense():
    s2v = Sense2Vec(shape=(5, 4))
    s2v.cfg["senses"] = ["A", "B", "C"]
    for key, freq in [("a|A", 100), ("a|B", 50), ("a|C", 10), ("b|A", 1),
                      ("B|C", 2)]:
        s2v.add(key, numpy.asarray([4, 2, 2, 2], dtype=numpy.float32), freq)
    assert s2v.get_best_sense("a") == "a|A"
    assert s2v.get_best_sense("A") == "a|A"
    assert s2v.get_best_sense("b") == "B|C"
    assert s2v.get_best_sense("b", ignore_case=False) == "b|A"
    assert s2v.get_best_sense("c") is None
    s2v.cfg["senses"] = []
    assert s2v.get_best_sense("a") is None
    assert s2v.get_best_sense("b", ["A"]) == "b|A"
    assert s2v.get_best_sense("b", ["A", "C"]) == "B|C"
Example #6
    def __init__(self):

        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        model = T5ForConditionalGeneration.from_pretrained('Parth/result')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # model.eval()
        self.device = device
        self.model = model
        self.nlp = spacy.load('en_core_web_sm')

        self.s2v = Sense2Vec().from_disk('s2v_old')

        self.fdist = FreqDist(brown.words())
        self.normalized_levenshtein = levenshtein
        self.set_seed(42)
Example #7
def sense2vec_get_words(word):
    vAR_s2v = Sense2Vec().from_disk('s2v_old')
    vAR_output = []
    vAR_word = word.lower()
    vAR_word = vAR_word.replace(" ", "_")

    vAR_sense = vAR_s2v.get_best_sense(vAR_word)
    if vAR_sense:
        vAR_most_similar = vAR_s2v.most_similar(vAR_sense, n=20)
        for each_word in vAR_most_similar:
            vAR_append_word = each_word[0].split("|")[0].replace("_",
                                                                 " ").lower()
            if vAR_append_word.lower() != word.lower():
                vAR_output.append(vAR_append_word.title())

    vAR_out = list(OrderedDict.fromkeys(vAR_output))
    return vAR_out
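A hypothetical call to the helper above (the output depends entirely on the loaded vectors; the query phrase is a placeholder):

distractors = sense2vec_get_words("machine learning")
print(distractors[:5])  # related phrases, title-cased, with duplicates removed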
Example #8
def get_candidates_closest_to_seed_terms(terms, num_of_candidates, num_of_top_frequency_terms_to_consider):
    s2v = Sense2Vec().from_disk("s2v_reddit_2019_lg")
    query = get_query_from_terms(terms, s2v)
    most_similar = s2v.most_similar(query, n=num_of_candidates * 50)  # have some extra because of non top frequency cands
    candidates = [i[0] for i in most_similar]
    clean_candidates = [t for t in terms]
    most_frequent = s2v.frequencies[:num_of_top_frequency_terms_to_consider]
    most_frequent = [i[0] for i in most_frequent]
    for cand in candidates:
        if cand in most_frequent:
            without_pos = cand.split("|")[0]
            clean = without_pos.replace("_", " ").lower()
            to_add = clean.replace(".", "")
            if to_add not in clean_candidates:
                clean_candidates.append(to_add)
        if len(clean_candidates) == num_of_candidates:
            break
    return set(clean_candidates)
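get_query_from_terms is not shown here; a plausible sketch of such a helper (hypothetical, assuming it simply resolves each seed term to its best sense key, which is the list form most_similar accepts):

def get_query_from_terms(terms, s2v):
    # Hypothetical helper: map each seed term to its best sense2vec key,
    # dropping terms that are not covered by the vectors.
    keys = []
    for term in terms:
        key = s2v.get_best_sense(term.lower().replace(" ", "_"))
        if key is not None:
            keys.append(key)
    return keys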
Example #9
    def __init__(self, lang_code='en', max_questions=20):

        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        model = T5ForConditionalGeneration.from_pretrained('Parth/result')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # model.eval()
        self.device = device
        self.model = model
        self.nlp = self.try_load_spacy_model(lang_code)
        self.max_questions = int(max_questions)

        self.s2v = Sense2Vec().from_disk(
            '/Users/dev/Develop/text-to-anki/backend/src/Questgen.ai/Questgen.ai/Questgen/s2v_old'
        )

        self.fdist = FreqDist(brown.words())
        self.normalized_levenshtein = NormalizedLevenshtein()
        self.set_seed(42)
Example #10
def test_sense2vec_object():
    s2v = Sense2Vec(shape=(10, 4))
    assert s2v.vectors.shape == (10, 4)
    assert len(s2v) == 10
    test_vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    s2v.add("test", test_vector)
    assert "test" in s2v
    assert isinstance(s2v.strings["test"], int)
    assert s2v.strings["test"] in s2v
    assert "foo" not in s2v
    assert numpy.array_equal(s2v["test"], test_vector)
    assert numpy.array_equal(s2v[s2v.strings["test"]], test_vector)
    assert list(s2v.keys()) == ["test"]
    s2v.add("test2", test_vector)
    assert "test2" in s2v
    assert sorted(list(s2v.keys())) == ["test", "test2"]
    with pytest.raises(ValueError):
        s2v["test3"] = test_vector
    s2v["test2"] = test_vector
Example #11
def s2v_mock():
    from sense2vec import Sense2Vec
    import numpy as np
    s2v = Sense2Vec(shape=(16, 4))
    s2v.add('New_York|GPE', np.asarray([1, 1, 1, 1], dtype=np.float32))
    s2v.add('New_York|NOUN', np.asarray([1, 2, 1, 1], dtype=np.float32))
    s2v.add('big|ADJ', np.asarray([2, 5, 4, 2], dtype=np.float32))
    s2v.add('BIG|ADJ', np.asarray([2, 5, 4, 1], dtype=np.float32))
    s2v.add('apple|NOUN', np.asarray([1, 3, 9, 3], dtype=np.float32))
    s2v.add('big_apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('Big_Apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('Big_Apple|LOC', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('Big_apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('BIG_apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('BIG_Apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('BIG_APPLE|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('black|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('black|ADJ', np.asarray([5, 5, 5, 5], dtype=np.float32))
    s2v.add('blue|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('blue_big_apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    return s2v
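A hypothetical test using the mock above, calling the helper directly rather than as a registered pytest fixture; the assertions only check structural properties, since the exact neighbours depend on the toy vectors:

def test_s2v_mock_most_similar():
    s2v = s2v_mock()
    assert "New_York|GPE" in s2v
    assert "big_apple|NOUN" in s2v
    results = s2v.most_similar(["big|ADJ"], n=3)
    assert len(results) == 3
    assert "big|ADJ" not in [key for key, _ in results]  # the query key is excluded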
Example #12
def test_sense2vec_most_similar():
    s2v = Sense2Vec(shape=(6, 4))
    s2v.add("a", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    s2v.add("b", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32))
    s2v.add("c", numpy.asarray([4, 4, 4, 2], dtype=numpy.float32))
    s2v.add("d", numpy.asarray([4, 4, 4, 4], dtype=numpy.float32))
    s2v.add("x", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    s2v.add("y", numpy.asarray([0.1, 1, 1, 1], dtype=numpy.float32))
    result1 = s2v.most_similar(["x"], n=2)
    assert len(result1) == 2
    assert result1[0][0] == "a"
    assert result1[0][1] == pytest.approx(1.0)
    assert result1[1][0] == "b"
    result2 = s2v.most_similar(["a", "x"], n=2)
    assert len(result2) == 2
    assert sorted([key for key, _ in result2]) == ["b", "d"]
    result3 = s2v.most_similar(["a", "b"], n=3)
    assert len(result3) == 3
    assert "y" not in [key for key, _ in result3]
    assert len(s2v.most_similar(["a", "b"], n=10)) == 4
    with pytest.raises(ValueError):
        s2v.most_similar(["z"], n=1)  # key not in table
Example #13
def test_registry():
    """Test that custom functions are used internally if they're registered."""

    @registry.make_key.register("custom_make_key")
    def custom_make_key(word, sense):
        return f"{word}###{sense}"

    @registry.split_key.register("custom_split_key")
    def custom_split_key(key):
        return tuple(key.split("###"))

    overrides = {"make_key": "custom_make_key", "split_key": "custom_split_key"}
    test_vector = numpy.asarray([1, 2, 3, 4], dtype=numpy.float32)
    data = [("clear", "NOUN", 100), ("clear", "VERB", 200), ("clear", "ADJ", 300)]
    s2v = Sense2Vec(shape=(len(data), 4), overrides=overrides)
    for word, sense, freq in data:
        s2v.add(custom_make_key(word, sense), test_vector, freq)
        s2v.cfg["senses"].append(sense)
    assert "clear###NOUN" in s2v
    other_senses = s2v.get_other_senses("clear###NOUN")
    assert len(other_senses) == 2
    assert "clear###VERB" in other_senses
    assert "clear###ADJ" in other_senses
    assert s2v.get_best_sense("clear") == "clear###ADJ"
Example #14
    #         value_word_lemma, value_sense
    #     ] not in input_list_reduced_to_lemma:
    #       seen.add(value_word_lemma_sense_joined)
    #       result.append(item)
    #   return result


if __name__ == '__main__':
    from sense2vec import Sense2Vec
    from s2v_util import S2vUtil
    from s2v_senses import S2vSenses
    from s2v_key_case_and_sense_variations import S2vKeyCaseAndSenseVariations
    from s2v_key_commonizer import S2vKeyCommonizer
    S2V_MODEL_PATH = os.getenv('S2V_MODEL_PATH')
    print("loading model from disk..", S2V_MODEL_PATH)
    s2v = Sense2Vec().from_disk(S2V_MODEL_PATH)
    print("model loaded.")
    s2v_util = S2vUtil(s2v)
    s2v_senses = S2vSenses(s2v_util)
    s2v_key_variations = S2vKeyCaseAndSenseVariations(s2v_util, s2v_senses)
    s2v_key_commonizer = S2vKeyCommonizer()
    syn_service = S2vSynonyms(s2v_util, s2v_key_variations, s2v_key_commonizer)
    req_args = {
        'attempt-phrase-join-for-compound-phrases': 1,
        'min-score': 0.5,
        'n': 10,
        'match-input-sense': 1,
        'reduce-multicase': 1,
        'reduce-compound-nouns': 1,
        'min-word-len': 2,
    }
Example #15
        np.around((input_array - np.min(input_array)) / np.ptp(input_array),
                  decimals=3))

    if inverse:
        return list(map(lambda x: round(vals_range - x - min_val, 3),
                        new_vals))
    return new_vals


# vals = [0, 21, 2288, 52300, 35004]
# print(normalize_distribution(vals))
# print(normalize_distribution(vals, inverse=True))
# exit()

print("loading model from disk..")
s2v = Sense2Vec().from_disk(os.environ['S2V_MODEL_PATH'])
print("model loaded.")

# 2015 model: s2v keys len:  1195261
print("s2v keys len: ", len(s2v))

freq_by_word_count = {}
freq_distribution_by_word_count = {}

# for key in s2v.keys():
#   word_count = len(key.split('_'))
#   if word_count <= 9:
#     # if word_count > 6:
#     #   print('big word', word_count, key, s2v.get_freq(key))
#     if word_count in freq_by_word_count:
#       freq_by_word_count[word_count] += 1
Example #16
    eval_whole=False,
    eval_only=False,
    show_scores=False,
):
    """
    Evaluate a sense2vec model by asking about phrase triples: is word A more
    similar to word B, or to word C? If the human mostly agrees with the model,
    the vectors model is good.
    """
    random.seed(0)
    log("RECIPE: Starting recipe sense2vec.eval", locals())
    strategies = eval_strategies.get_all()
    if strategy not in strategies.keys():
        err = f"Invalid strategy '{strategy}'. Expected: {list(strategies.keys())}"
        msg.fail(err, exits=1)
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)

    def get_html(key, score=None, large=False):
        word, sense = s2v.split_key(key)
        html_word = f"<span style='font-size: {30 if large else 20}px'>{word}</span>"
        html_sense = f"<strong style='opacity: 0.75; font-size: 14px; padding-left: 10px'>{sense}</strong>"
        html = f"{html_word} {html_sense}"
        if show_scores and score is not None:
            html += f" <span style='opacity: 0.75; font-size: 12px; padding-left: 10px'>{score:.4}</span>"
        return html

    def get_stream():
        strategy_func = eval_strategies.get(strategy)
        log(f"RECIPE: Using strategy {strategy}")
        # Limit to most frequent entries
Example #17
def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=20,
    batch_size=5,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases. For
    each seed term, the best matching sense according to the sense2vec vectors
    will be used.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = "<span style='font-size: {{theme.largeText}}px'>{{word}}</span><strong style='opacity: 0.75'>{{sense}}</strong>"
    accept_keys = []
    seen = set(accept_keys)
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            raise ValueError(f"Can't find seed term '{seed}' in vectors")
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {
                "score": 1.0
            },
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        accept_keys += prev_accept
        seen.update(set(accept_keys))
        log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}"
            )

    def update(answers):
        """Updates accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        presenting examples to the user with a similarity above the threshold
        parameter."""
        while True:
            log(f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys")
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            for key, score in most_similar:
                if key not in seen and score > threshold:
                    seen.add(key)
                    word, sense = s2v.split_key(key)
                    # Make sure the score is a regular float, otherwise server
                    # may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score)}
                    yield {
                        "text": key,
                        "word": word,
                        "sense": sense,
                        "meta": meta
                    }

    stream = get_stream()

    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {
            "batch_size": batch_size,
            "html_template": html_template
        },
    }
Example #18
from flask import Flask, request, Response
import os
import json
import datetime
from sense2vec import Sense2Vec
from s2v_util import S2vUtil
from s2v_senses import S2vSenses
from s2v_key_case_and_sense_variations import S2vKeyCaseAndSenseVariations
from s2v_key_commonizer import S2vKeyCommonizer
from s2v_similarity import S2vSimilarity
from s2v_synonyms import S2vSynonyms

app = Flask(__name__)
port = 80 if os.getuid() == 0 else 8000

print("loading model from disk..")
s2v = Sense2Vec().from_disk("/sense2vec-model")
print("model loaded.")
s2v_util = S2vUtil(s2v)
s2v_senses = S2vSenses(s2v_util)
s2v_key_variations = S2vKeyCaseAndSenseVariations(s2v_util, s2v_senses)
s2v_key_commonizer = S2vKeyCommonizer()
similarity_service = S2vSimilarity(s2v_util, s2v_key_variations,
                                   s2v_key_commonizer)
synonyms_service = S2vSynonyms(s2v_util, s2v_key_variations,
                               s2v_key_commonizer)


@app.route('/', methods=['POST', 'GET'])
def index():
    start = datetime.datetime.utcnow()
    data = request.data.decode('utf-8')
Example #19
text_file.close()

word_counter = Counter(corpus.split())
most_common = word_counter.most_common()
i = count = 0
most_common_list = []
while count < top_n:
    word = most_common[i][0]
    if word in pos_aspect or word in pos_opinion:
        most_common_list.append(word)
        count += 1
    i += 1
print(most_common_list)
print(len(most_common_list))

s2v = Sense2Vec().from_disk(folder)

if selection_mode == 'SVM':
    model_file = Path('{}/model.pkl'.format(folder))
    if model_file.is_file():
        most_similar = []
        for word in most_common_list:
            most_similar.extend(s2v.most_similar(word, n=most_similar_n))
        most_similar_words = [word[0] for word in most_similar]
        most_similar_words.extend(most_common_list)
        most_similar_vectors = [s2v[word] for word in most_similar_words]
        labels = []
        for word in most_similar_words:
            label = 1 if word in pos_aspect else 0 if word in neg_example else -1
            labels.append(label)
        test_df = pd.DataFrame({
Example #20
from sense2vec import Sense2Vec, Sense2VecComponent
import spacy, pandas, pickle

nlp = spacy.load("en_core_web_sm")
s2v = Sense2Vec().from_disk("./models/s2v_reddit_2015_md/s2v_old/")

df = pandas.read_csv("./twitter_data/exploration_dataset.csv")

vectors_df = pandas.DataFrame(
    columns=['id', 'vectors', 'label', "size", "text"])

corpus = []

for idx, row in df.head(100).iterrows():
    print("Parsing sentences")
    try:
        doc = nlp(row['text'])
        vectors = []
        for token in doc:
            key = "{0}|{1}".format(token.lemma_, token.pos_)
            if key in s2v:
                vector = s2v[key]
                vectors.append(vector)
        vectors_df = vectors_df.append(
            {
                "id": idx,
                "vectors": vectors,
                'label': row['label'],
                "size": len(vectors),
                "text": row['text']
            },
Example #21
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from unidecode import unidecode
from collections import Counter
import numpy as np
import string
import itertools
import csv
import json
from spellchecker import SpellChecker
import spacy
from sense2vec import Sense2Vec

# Load the essential models
nlp = spacy.load("en_core_web_sm")  # needs to be replaced with large model
# s2vOrg = nlp.add_pipe("sense2vec")
# s2vOrg.from_disk("./data/s2v_reddit_2015_md")
s2vOrg = Sense2Vec().from_disk("./data/s2v_reddit_2015_md")


def loadFile(path):
    corpuses = []  # used to store final results
    file = open(path, "rt", encoding="utf-8")
    # data = csv.reader(file, delimiter=",")
    data = csv.DictReader(file)
    removeEntries = ['isFinal', 'category', 'hit', 'mergeParent']
    for row in data:
        for entry in removeEntries:  # drop metadata columns we don't need
            row.pop(entry)
        corpuses.append(row)
    # print(corpuses)
    return corpuses

Example #22
    def __init__(self, sense2vec_path):
        self.s2v = Sense2Vec().from_disk(sense2vec_path)
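A wrapper like this typically exposes a lookup on top of the loaded vectors; a hypothetical companion method (name and behaviour are illustrative, not from the original class), mirroring the pattern in Example #7:

    def get_related_terms(self, phrase, n=10):
        # Hypothetical sketch: resolve the best sense for the phrase and
        # return the surface forms of its nearest neighbours.
        key = self.s2v.get_best_sense(phrase.replace(" ", "_"))
        if key is None:
            return []
        neighbours = self.s2v.most_similar(key, n=n)
        return [k.split("|")[0].replace("_", " ") for k, _ in neighbours]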
Example #23
def main(model_path,
         out_dir,
         min_freq_ratio=0.0,
         min_distance=0.0,
         check_keys=''):
    check_keys_list = []
    if len(check_keys) > 0:
        check_keys_list = list(map(lambda x: x.strip(), check_keys.split(',')))

    s2v = Sense2Vec().from_disk(model_path)
    output_path = Path(out_dir)
    vocab = {}
    for key, score in s2v.frequencies:
        vocab[key] = score
    vectors = {}
    for key, val in s2v:
        vectors[key] = val
    msg.info("loading vectors")
    for key, val in s2v:
        vector_size = len(val)
        break
    all_senses = s2v.senses
    msg.info("loaded vectors")

    if len(check_keys_list) > 0:
        blacklist = {}
        whitelist = []
        blacklisted_sense_keys = get_blacklisted_sense_keys(vocab)
        markdown_and_url_keys = get_markdown_and_url_keys(vocab)
        minority_keys = get_minority_keys(vocab, min_freq_ratio)
        redundant_keys = get_redundant_keys(vocab, vectors, min_distance)
        for k in check_keys_list:
            if k in blacklisted_sense_keys:
                blacklist[k] = 'sense'
            elif k in markdown_and_url_keys:
                blacklist[k] = 'markdown / url'
            elif k in minority_keys:
                blacklist[k] = 'minority'
            elif k in redundant_keys:
                blacklist[k] = 'redundant'
            else:
                whitelist.append(k)
        msg.warn('blacklist')
        for k in blacklist.keys():
            msg.warn("{k}: {v}".format(k=k, v=blacklist[k]))
        msg.good('whitelist')
        for k in whitelist:
            msg.good(k)
    else:
        discarded = set()
        discarded.update(get_blacklisted_sense_keys(vocab))
        discarded.update(get_markdown_and_url_keys(vocab))
        discarded.update(get_minority_keys(vocab, min_freq_ratio))
        discarded.update(get_redundant_keys(vocab, vectors, min_distance))
        n_vectors = len(vectors) - len(discarded)
        s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
        for key, vector in vectors.items():
            if key not in discarded:
                s2v.add(key, vector)
                if key in vocab:
                    s2v.set_freq(key, vocab[key])
        msg.good("Created the sense2vec model")
        msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
        s2v.to_disk(output_path)
        msg.good("Saved model to directory", out_dir)
Example #24
def load_vectors(path):
    return Sense2Vec().from_disk(path)
Example #25
def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=100,
    batch_size=5,
    case_sensitive=False,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases. For
    each seed term, the best matching sense according to the sense2vec vectors
    will be used.

    If no similar terms are found above the given threshold, the threshold is
    lowered by 0.1 and similar terms are requested again.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = "<span style='font-size: {{theme.largeText}}px'>{{word}}</span>"
    accept_keys = []
    seen = set()
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            msg.fail(f"Can't find seed term '{seed}' in vectors", exits=1)
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        seen.add(best_word if case_sensitive else best_word.lower())
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {
                "score": 1.0,
                "sense": best_sense
            },
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept_keys = [
            eg["text"] for eg in prev if eg["answer"] == "accept"
        ]
        prev_words = [
            eg["word"] if case_sensitive else eg["word"].lower() for eg in prev
        ]
        accept_keys += prev_accept_keys
        seen.update(set(prev_words))
        log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}"
            )

    def update(answers):
        """Updates accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        presenting examples to the user with a similarity above the threshold
        parameter."""
        nonlocal threshold
        while True:
            log(f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys")
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            n_skipped = 0
            n_duplicate = 0
            for key, score in most_similar:
                if score > threshold:
                    word, sense = s2v.split_key(key)
                    if (case_sensitive
                            and word in seen) or (not case_sensitive
                                                  and word.lower() in seen):
                        n_duplicate += 1
                        continue
                    seen.add(word if case_sensitive else word.lower())
                    # Make sure the score is a regular float, otherwise server
                    # may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score), "sense": sense}
                    yield {
                        "text": key,
                        "word": word,
                        "sense": sense,
                        "meta": meta
                    }
                else:
                    n_skipped += 1
            if n_skipped:
                log(f"RECIPE: Skipped {n_skipped} phrases below threshold {threshold}"
                    )
            if n_skipped == len(most_similar) - n_duplicate:
                # No most similar phrases were found that are above the
                # threshold, so lower the threshold if it's not already 0 or
                # return empty list so Prodigy shows "no tasks available"
                new_threshold = threshold - 0.1
                if new_threshold <= 0.0:
                    log(f"RECIPE: No suggestions for threshold {threshold:.2}")
                    return []
                log(f"RECIPE: Lowering threshold from {threshold:.2} to {new_threshold:.2}"
                    )
                threshold = new_threshold

    stream = get_stream()

    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {
            "batch_size": batch_size,
            "html_template": html_template
        },
    }
Example #26
# Weights to be used in each of the following functions
#################################################################
start_word = "Not_Considered"
#set previous word dis-similarity weight
pwsw = 0.7
#set the max value of the weight associated with a perfectly matched rhyming word
pw = 0.9
#set the value associated with a matching first letter
flw = 0.1
#set the value associated with a matching second letter
slw = 0.05

#################################################################

nlp = spacy.load("en_core_web_lg")
s2v = Sense2Vec().from_disk("C:/fyp/s2v_reddit_2019_lg")
#read in and process the input list the user wants to remember
#with open("api/v3/input_list/input_list.txt","r", encoding="utf-8") as f:
#    TEXT = f.read()
#doc = nlp(TEXT)

def create_output_list_v3(doc, in_start_word, pw, slw, flw, pwsw):
    #this takes in a one-word start_word from the user and returns, for every word in the input list, the most similar unique word that starts with the same letter
    #there is currently an unnecessary for loop, and it throws warnings about the .similarity call
    doc = nlp(doc)
    highest_scoring_list = []
    second_best_list = []
    third_best_list = []
    result = list()
    docu = nlp(in_start_word)
    previous_word = docu[0]
Example #27
def s2v():
    data_path = Path(__file__).parent / "data"
    return Sense2Vec().from_disk(data_path)
Example #28
from sense2vec import Sense2Vec
import sqlite3
import csv
import argparse
import os

parser = argparse.ArgumentParser(description="create initial ontology graph")
parser.add_argument("--keywordDB", type=str, default="../data/keywords.db", required=False, help="path to sqlite keywords db file")
parser.add_argument("--s2vModel", type=str, default="../data/sense2vec_train/05", required=False, help="path to trained sense2vec model")
parser.add_argument("--outputDir", type=str, default="../data/", required=False, help="output directory")
parser.add_argument("--threshold", type=float, default=0.6, required=False, help="cosine similarity threshold used to create edges between keywords")
parser.add_argument("--keywordLimit", type=int, default=100000, required=False, help="max number of keywords to create the graph from")
args = parser.parse_args()

# load s2v model
s2v = Sense2Vec().from_disk(args.s2vModel)
words = list(s2v.keys())
limit = str(args.keywordLimit)

# load keywords
conn = sqlite3.connect(args.keywordDB)
c = conn.cursor()
c.execute(f"SELECT words, COUNT(paperID) AS word_count FROM keywords GROUP BY words ORDER BY word_count DESC limit {limit};")

vectoredKeywords = []
keyword = c.fetchone()
senses = s2v.senses
senses.remove("PUNCT")  # remove punctuation
senses.remove("X")  # remove uncategorized words
while keyword is not None:
    keyword = keyword[0]