Example #1
def test_registry():
    """Test that custom functions are used internally if they're registered."""
    @registry.make_key.register("custom_make_key")
    def custom_make_key(word, sense):
        return f"{word}###{sense}"

    @registry.split_key.register("custom_split_key")
    def custom_split_key(key):
        return tuple(key.split("###"))

    overrides = {
        "make_key": "custom_make_key",
        "split_key": "custom_split_key"
    }
    test_vector = numpy.asarray([1, 2, 3, 4], dtype=numpy.float32)
    data = [("clear", "NOUN", 100), ("clear", "VERB", 200),
            ("clear", "ADJ", 300)]
    s2v = Sense2Vec(shape=(len(data), 4), overrides=overrides)
    for word, sense, freq in data:
        s2v.add(custom_make_key(word, sense), test_vector, freq)
        s2v.cfg["senses"].append(sense)
    assert "clear###NOUN" in s2v
    other_senses = s2v.get_other_senses("clear###NOUN")
    assert len(other_senses) == 2
    assert "clear###VERB" in other_senses
    assert "clear###ADJ" in other_senses
    assert s2v.get_best_sense("clear") == "clear###ADJ"
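    # Hedged follow-up (not part of the original test): assuming the registered
    # "make_key"/"split_key" overrides are dispatched by the instance's own key
    # helpers, the custom "###" separator should round-trip through them.
    assert s2v.make_key("clear", "NOUN") == "clear###NOUN"
    assert s2v.split_key("clear###NOUN") == ("clear", "NOUN")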
Example #2
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Vectors file (text-based)"),
    vocab_file: str = typer.Argument(..., help="Vocabulary file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    min_freq_ratio: float = typer.Option(0.0, "--min-freq-ratio", "-r", help="Frequency ratio threshold for discarding minority senses or casings"),
    min_distance: float = typer.Option(0.0, "--min-distance", "-s", help="Similarity threshold for discarding redundant keys"),
    # fmt: on
):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2Vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab = read_vocab(f)
    vectors = {}
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
    discarded = set()
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    n_vectors = len(vectors) - len(discarded)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
    for key, vector in vectors.items():
        if key not in discarded:
            s2v.add(key, vector)
            s2v.set_freq(key, vocab[key])
    msg.good("Created the sense2vec model")
    msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
Example #3
    def handle(self, *args, **options):

        s2v = Sense2Vec().from_disk("/Users/jasonbenn/data/s2v_old")
        query = "natural_language_processing|NOUN"
        assert query in s2v
        vector = s2v[query]
        freq = s2v.get_freq(query)
        most_similar = s2v.most_similar(query, n=3)
        from IPython import embed
        embed()
Example #4
def main(in_file, vocab_file, out_dir):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2Vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab_data = f.readlines()
    data = []
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})",
                     exits=1)
        all_senses.add(sense)
        data.append((key, numpy.asarray(vec, dtype=numpy.float32)))
    s2v = Sense2Vec(shape=(len(data), vector_size), senses=all_senses)
    for key, vector in data:
        s2v.add(key, vector)
    for item in vocab_data:
        item = item.rstrip()
        if item.endswith(" word"):  # for fastText vocabs
            item = item[:-5]
        try:
            key, freq = item.rsplit(" ", 1)
        except ValueError:
            continue
        s2v.set_freq(key, int(freq))
    msg.good("Created the sense2vec model")
    msg.info(f"{len(data)} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
def s2v_mock():
    from sense2vec import Sense2Vec
    import numpy as np
    s2v = Sense2Vec(shape=(10, 4))
    s2v.add('New_York|GPE', np.asarray([1, 1, 1, 1], dtype=np.float32))
    s2v.add('New_York|NOUN', np.asarray([1, 2, 1, 1], dtype=np.float32))
    s2v.add('big|ADJ', np.asarray([2, 5, 4, 2], dtype=np.float32))
    s2v.add('BIG|ADJ', np.asarray([2, 5, 4, 1], dtype=np.float32))
    s2v.add('apple|NOUN', np.asarray([1, 3, 9, 3], dtype=np.float32))
    s2v.add('big_apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    return s2v
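A small, hedged usage sketch for the s2v_mock fixture above (illustrative only; the expected values follow from the vectors added in s2v_mock):

s2v = s2v_mock()
assert 'New_York|GPE' in s2v and 'big_apple|NOUN' in s2v
vec = s2v['apple|NOUN']                        # numpy array of length 4
score = s2v.similarity('big|ADJ', 'BIG|ADJ')   # cosine similarity of the two casings
assert 0.9 < score <= 1.0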
Example #6
def generateOtherChoice(answer, count):
    # lowercase the answer
    answer = answer.lower()
    closestWords = []

    try:

        doc1 = nlp(answer)

        # if the answer is a single token, use the GloVe word2vec model
        if len(doc1) == 1:
            closestWords = model_glove.most_similar(positive=[doc1.text],
                                                    topn=count)

        # if the answer is a multi-word phrase, use sense2vec
        else:
            temp = doc1.text.replace(' ', '_') + '|NOUN'

            s2v = Sense2Vec().from_disk("./Out_Source/s2v_old")
            most_similar = s2v.most_similar(temp, n=count)

            for each_choice in most_similar:
                # strip the "|SENSE" suffix and restore spaces
                choice = each_choice[0].replace('_', ' ').partition('|')[0]
                closestWords.append((choice, each_choice[1]))

    except Exception:
        return []

    other_choice = list(map(lambda x: x[0], closestWords))[0:count]

    # Remove words that are the same as the answer
    temp = other_choice[:]
    for i in temp:
        choice = nlp(i)
        for token in choice:
            if (token.lemma_ in answer) or (answer in token.lemma_):
                other_choice.remove(i)
                break

    # Lemmatize the options so inflected variants collapse to a single form
    other_choice_ = []
    for j in other_choice:
        choice = nlp(j)
        if (len(choice) == 1):
            for token in choice:
                other_choice_.append(token.lemma_)
        else:
            tempword = ''
            for token in choice:
                tempword = tempword + token.lemma_ + ' '
            other_choice_.append(tempword.rstrip(' '))

    return list(set(other_choice_))
def test_sense2vec_other_senses():
    s2v = Sense2Vec(shape=(6, 4))
    s2v.cfg["senses"] = ["A", "B", "C", "D"]
    for key in ["a|A", "a|B", "a|C", "b|A", "b|C", "c|A"]:
        s2v.add(key, numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    others = s2v.get_other_senses("a|A")
    assert sorted(others) == ["a|B", "a|C"]
    others = s2v.get_other_senses("b|C")
    assert others == ["b|A"]
    others = s2v.get_other_senses("c|A")
    assert others == []
def test_sense2vec_best_sense():
    s2v = Sense2Vec(shape=(5, 4))
    s2v.cfg["senses"] = ["A", "B", "C"]
    for key, freq in [("a|A", 100), ("a|B", 50), ("a|C", 10), ("b|A", 1), ("B|C", 2)]:
        s2v.add(key, numpy.asarray([4, 2, 2, 2], dtype=numpy.float32), freq)
    assert s2v.get_best_sense("a") == "a|A"
    assert s2v.get_best_sense("b") == "B|C"
    assert s2v.get_best_sense("b", ignore_case=False) == "b|A"
    assert s2v.get_best_sense("c") is None
    s2v.cfg["senses"] = []
    assert s2v.get_best_sense("a") is None
def test_sense2vec_freqs():
    s2v = Sense2Vec(shape=(10, 4))
    vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    s2v.add("test1", vector, 123)
    s2v.add("test2", vector, 456)
    assert len(s2v.freqs) == 2
    assert s2v.get_freq("test1") == 123
    assert s2v.get_freq("test2") == 456
    assert s2v.get_freq("test3") is None
    assert s2v.get_freq("test3", 100) == 100
    s2v.set_freq("test3", 200)
    assert s2v.get_freq("test3") == 200
Example #10
def getVectorsDistance(word1, word2):
    '''
    If both given words are in the vocabulary, returns the similarity of their corresponding vectors; otherwise returns 0.
    '''
    global s2v
    if s2v is None:
        # Vector files are not present in the repository
        s2v = Sense2Vec().from_disk(r"C:\SKOLA\machine_learning\Project\pretrainedVectors\s2v_reddit_2019_lg")
        # s2v = Sense2Vec().from_disk(r"C:\SKOLA\machine_learning\Project\wiki corpus/backup/exported")

    if word1 in s2v and word2 in s2v:
        return s2v.similarity(word1, word2)
    return 0
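A hedged usage sketch for the helper above (the keys are placeholders in the "phrase|SENSE" format used elsewhere on this page; 0 is returned when either key is missing from the vectors):

print(getVectorsDistance("natural_language_processing|NOUN", "machine_learning|NOUN"))
print(getVectorsDistance("natural_language_processing|NOUN", "no_such_key|NOUN"))  # -> 0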
def test_sense2vec_similarity():
    s2v = Sense2Vec(shape=(5, 4))
    s2v.add("a", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32))
    s2v.add("b", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32))
    s2v.add("c", numpy.asarray([4, 4, 4, 2], dtype=numpy.float32))
    s2v.add("d", numpy.asarray([0.1, 0.2, 0.3, 0.4], dtype=numpy.float32))
    s2v.add("e", numpy.asarray([0, 0, 0, 0], dtype=numpy.float32))
    assert s2v.similarity("a", "b") == 1.0
    assert 1.0 > s2v.similarity("b", "c") > 0.9
    assert 1.0 > s2v.similarity(["a", "b"], "c") > 0.9
    assert s2v.similarity("b", "c") == s2v.similarity(["a", "b"], "c")
    assert s2v.similarity("a", "d") < 0.8
    assert s2v.similarity("a", "e") == 0.0
def test_sense2vec_to_from_bytes():
    s2v = Sense2Vec(shape=(2, 4))
    test_vector1 = numpy.asarray([1, 2, 3, 4], dtype=numpy.float32)
    test_vector2 = numpy.asarray([5, 6, 7, 8], dtype=numpy.float32)
    s2v.add("test1", test_vector1, 123)
    s2v.add("test2", test_vector2, 456)
    s2v_bytes = s2v.to_bytes()
    new_s2v = Sense2Vec().from_bytes(s2v_bytes)
    assert len(new_s2v) == 2
    assert new_s2v.vectors.shape == (2, 4)
    assert "test1" in new_s2v
    assert "test2" in new_s2v
    assert new_s2v.get_freq("test1") == 123
    assert new_s2v.get_freq("test2") == 456
    assert numpy.array_equal(new_s2v["test1"], test_vector1)
    assert numpy.array_equal(new_s2v["test2"], test_vector2)
    assert s2v_bytes == new_s2v.to_bytes()
    s2v_bytes2 = s2v.to_bytes(exclude=["strings"])
    new_s2v2 = Sense2Vec().from_bytes(s2v_bytes2)
    assert len(new_s2v2.strings) == 0
    assert "test1" in new_s2v2
    assert s2v.strings["test1"] in new_s2v2
    with pytest.raises(KeyError):  # can't resolve hash
        new_s2v2.strings[s2v.strings["test2"]]
Example #13
    def __init__(self):

        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        model = T5ForConditionalGeneration.from_pretrained('Parth/result')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # model.eval()
        self.device = device
        self.model = model
        self.nlp = spacy.load('en_core_web_sm')

        self.s2v = Sense2Vec().from_disk('s2v_old')

        self.fdist = FreqDist(brown.words())
        self.normalized_levenshtein = levenshtein
        self.set_seed(42)
Example #14
def sense2vec_get_words(word):
    vAR_s2v = Sense2Vec().from_disk('s2v_old')
    vAR_output = []
    vAR_word = word.lower()
    vAR_word = vAR_word.replace(" ", "_")

    vAR_sense = vAR_s2v.get_best_sense(vAR_word)
    if vAR_sense:
        vAR_most_similar = vAR_s2v.most_similar(vAR_sense, n=20)
        for each_word in vAR_most_similar:
            vAR_append_word = each_word[0].split("|")[0].replace("_",
                                                                 " ").lower()
            if vAR_append_word.lower() != word.lower():
                vAR_output.append(vAR_append_word.title())

    vAR_out = list(OrderedDict.fromkeys(vAR_output))
    return vAR_out
def get_candidates_closest_to_seed_terms(terms, num_of_candidates, num_of_top_frequency_terms_to_consider):
    s2v = Sense2Vec().from_disk("s2v_reddit_2019_lg")
    query = get_query_from_terms(terms, s2v)
    most_similar = s2v.most_similar(query, n=num_of_candidates * 50)  # have some extra because of non top frequency cands
    candidates = [i[0] for i in most_similar]
    clean_candidates = [t for t in terms]
    most_frequent = s2v.frequencies[:num_of_top_frequency_terms_to_consider]
    most_frequent = [i[0] for i in most_frequent]
    for cand in candidates:
        if cand in most_frequent:
            without_pos = cand.split("|")[0]
            clean = without_pos.replace("_", " ").lower()
            to_add = clean.replace(".", "")
            if to_add not in clean_candidates:
                clean_candidates.append(to_add)
        if len(clean_candidates) == num_of_candidates:
            break
    return set(clean_candidates)
def test_sense2vec_object():
    s2v = Sense2Vec(shape=(10, 4))
    assert s2v.vectors.shape == (10, 4)
    assert len(s2v) == 10
    test_vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    s2v.add("test", test_vector)
    assert "test" in s2v
    assert isinstance(s2v.strings["test"], int)
    assert s2v.strings["test"] in s2v
    assert "foo" not in s2v
    assert numpy.array_equal(s2v["test"], test_vector)
    assert numpy.array_equal(s2v[s2v.strings["test"]], test_vector)
    assert list(s2v.keys()) == ["test"]
    s2v.add("test2", test_vector)
    assert "test2" in s2v
    assert sorted(list(s2v.keys())) == ["test", "test2"]
    with pytest.raises(ValueError):
        s2v["test3"] = test_vector
    s2v["test2"] = test_vector
Example #17
    def __init__(self, lang_code='en', max_questions=20):

        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        model = T5ForConditionalGeneration.from_pretrained('Parth/result')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # model.eval()
        self.device = device
        self.model = model
        self.nlp = self.try_load_spacy_model(lang_code)
        self.max_questions = int(max_questions)

        self.s2v = Sense2Vec().from_disk(
            '/Users/dev/Develop/text-to-anki/backend/src/Questgen.ai/Questgen.ai/Questgen/s2v_old'
        )

        self.fdist = FreqDist(brown.words())
        self.normalized_levenshtein = NormalizedLevenshtein()
        self.set_seed(42)
Example #18
def test_sense2vec_most_similar():
    s2v = Sense2Vec(shape=(6, 4))
    s2v.add("a", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    s2v.add("b", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32))
    s2v.add("c", numpy.asarray([4, 4, 4, 2], dtype=numpy.float32))
    s2v.add("d", numpy.asarray([4, 4, 4, 4], dtype=numpy.float32))
    s2v.add("x", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    s2v.add("y", numpy.asarray([0.1, 1, 1, 1], dtype=numpy.float32))
    result1 = s2v.most_similar(["x"], n=2)
    assert len(result1) == 2
    assert result1[0][0] == "a"
    assert result1[0][1] == pytest.approx(1.0)
    assert result1[1][0] == "b"
    result2 = s2v.most_similar(["a", "x"], n=2)
    assert len(result2) == 2
    assert sorted([key for key, _ in result2]) == ["b", "d"]
    result3 = s2v.most_similar(["a", "b"], n=3)
    assert len(result3) == 3
    assert "y" not in [key for key, _ in result3]
    assert len(s2v.most_similar(["a", "b"], n=10)) == 4
    with pytest.raises(ValueError):
        s2v.most_similar(["z"], n=1)  # key not in table
def s2v():
    data_path = Path(__file__).parent / "data"
    return Sense2Vec().from_disk(data_path)
Example #20
    def __init__(self, sense2vec_path):
        self.s2v = Sense2Vec().from_disk(sense2vec_path)
Example #21
from flask import Flask, request, Response
import json
import datetime
import os
from sense2vec import Sense2Vec
from s2v_util import S2vUtil
from s2v_senses import S2vSenses
from s2v_key_case_and_sense_variations import S2vKeyCaseAndSenseVariations
from s2v_key_commonizer import S2vKeyCommonizer
from s2v_similarity import S2vSimilarity
from s2v_synonyms import S2vSynonyms

app = Flask(__name__)
port = 80 if os.getuid() == 0 else 8000

print("loading model from disk..")
s2v = Sense2Vec().from_disk("/sense2vec-model")
print("model loaded.")
s2v_util = S2vUtil(s2v)
s2v_senses = S2vSenses(s2v_util)
s2v_key_variations = S2vKeyCaseAndSenseVariations(s2v_util, s2v_senses)
s2v_key_commonizer = S2vKeyCommonizer()
similarity_service = S2vSimilarity(s2v_util, s2v_key_variations,
                                   s2v_key_commonizer)
synonyms_service = S2vSynonyms(s2v_util, s2v_key_variations,
                               s2v_key_commonizer)


@app.route('/', methods=['POST', 'GET'])
def index():
    start = datetime.datetime.utcnow()
    data = request.data.decode('utf-8')
Example #22
    eval_whole=False,
    eval_only=False,
    show_scores=False,
):
    """
    Evaluate a sense2vec model by asking about phrase triples: is word A more
    similar to word B, or to word C? If the human mostly agrees with the model,
    the vectors model is good.
    """
    random.seed(0)
    log("RECIPE: Starting recipe sense2vec.eval", locals())
    strategies = eval_strategies.get_all()
    if strategy not in strategies.keys():
        err = f"Invalid strategy '{strategy}'. Expected: {list(strategies.keys())}"
        msg.fail(err, exits=1)
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)

    def get_html(key, score=None, large=False):
        word, sense = s2v.split_key(key)
        html_word = f"<span style='font-size: {30 if large else 20}px'>{word}</span>"
        html_sense = f"<strong style='opacity: 0.75; font-size: 14px; padding-left: 10px'>{sense}</strong>"
        html = f"{html_word} {html_sense}"
        if show_scores and score is not None:
            html += f" <span style='opacity: 0.75; font-size: 12px; padding-left: 10px'>{score:.4}</span>"
        return html

    def get_stream():
        strategy_func = eval_strategies.get(strategy)
        log(f"RECIPE: Using strategy {strategy}")
        # Limit to most frequent entries
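The docstring above frames evaluation as triple judgements ("is A more similar to B, or to C?"). A hedged, illustrative helper showing how a model's preference within one triple could be computed with s2v.similarity; this is not the recipe's actual strategy code, which lives in eval_strategies:

def model_prefers_b(s2v, key_a, key_b, key_c):
    """Return True if the model judges A closer to B than to C."""
    # s2v.similarity returns the cosine similarity between the keyed vectors
    return s2v.similarity(key_a, key_b) >= s2v.similarity(key_a, key_c)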
Example #23
def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=100,
    batch_size=5,
    case_sensitive=False,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases. For
    each seed term, the best matching sense according to the sense2vec vectors
    will be used.

    If no similar terms are found above the given threshold, the threshold is
    lowered by 0.1 and similar terms are requested again.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = "<span style='font-size: {{theme.largeText}}px'>{{word}}</span>"
    accept_keys = []
    seen = set()
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            msg.fail(f"Can't find seed term '{seed}' in vectors", exits=1)
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        seen.add(best_word if case_sensitive else best_word.lower())
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {
                "score": 1.0,
                "sense": best_sense
            },
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept_keys = [
            eg["text"] for eg in prev if eg["answer"] == "accept"
        ]
        prev_words = [
            eg["word"] if case_sensitive else eg["word"].lower() for eg in prev
        ]
        accept_keys += prev_accept_keys
        seen.update(set(prev_words))
        log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}"
            )

    def update(answers):
        """Updates accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        presenting examples to the user with a similarity above the threshold
        parameter."""
        nonlocal threshold
        while True:
            log(f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys")
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            n_skipped = 0
            n_duplicate = 0
            for key, score in most_similar:
                if score > threshold:
                    word, sense = s2v.split_key(key)
                    if (case_sensitive
                            and word in seen) or (not case_sensitive
                                                  and word.lower() in seen):
                        n_duplicate += 1
                        continue
                    seen.add(word if case_sensitive else word.lower())
                    # Make sure the score is a regular float, otherwise server
                    # may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score), "sense": sense}
                    yield {
                        "text": key,
                        "word": word,
                        "sense": sense,
                        "meta": meta
                    }
                else:
                    n_skipped += 1
            if n_skipped:
                log(f"RECIPE: Skipped {n_skipped} phrases below threshold {threshold}"
                    )
            if n_skipped == len(most_similar) - n_duplicate:
                # No most similar phrases were found that are above the
                # threshold, so lower the threshold if it's not already 0 or
                # return empty list so Prodigy shows "no tasks available"
                new_threshold = threshold - 0.1
                if new_threshold <= 0.0:
                    log(f"RECIPE: No suggestions for threshold {threshold:.2}")
                    return []
                log(f"RECIPE: Lowering threshold from {threshold:.2} to {new_threshold:.2}"
                    )
                threshold = new_threshold

    stream = get_stream()

    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {
            "batch_size": batch_size,
            "html_template": html_template
        },
    }
Example #24
def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=20,
    batch_size=5,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases. For
    each seed term, the best matching sense according to the sense2vec vectors
    will be used.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = "<span style='font-size: {{theme.largeText}}px'>{{word}}</span><strong style='opacity: 0.75'>{{sense}}</strong>"
    accept_keys = []
    seen = set(accept_keys)
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            raise ValueError(f"Can't find seed term '{seed}' in vectors")
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {
                "score": 1.0
            },
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        accept_keys += prev_accept
        seen.update(set(accept_keys))
        log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}"
            )

    def update(answers):
        """Updates accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        presenting examples to the user with a similarity above the threshold
        parameter."""
        while True:
            log(f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys")
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            for key, score in most_similar:
                if key not in seen and score > threshold:
                    seen.add(key)
                    word, sense = s2v.split_key(key)
                    # Make sure the score is a regular float, otherwise server
                    # may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score)}
                    yield {
                        "text": key,
                        "word": word,
                        "sense": sense,
                        "meta": meta
                    }

    stream = get_stream()

    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {
            "batch_size": batch_size,
            "html_template": html_template
        },
    }
Example #25
def main(model_path,
         out_dir,
         min_freq_ratio=0.0,
         min_distance=0.0,
         check_keys=''):
    check_keys_list = []
    if len(check_keys) > 0:
        check_keys_list = list(map(lambda x: x.strip(), check_keys.split(',')))

    s2v = Sense2Vec().from_disk(model_path)
    output_path = Path(out_dir)
    vocab = {}
    for key, score in s2v.frequencies:
        vocab[key] = score
    vectors = {}
    for key, val in s2v:
        vectors[key] = val
    msg.info("loading vectors")
    for key, val in s2v:
        vector_size = len(val)
        break
    all_senses = s2v.senses
    msg.info("loaded vectors")

    if len(check_keys_list) > 0:
        blacklist = {}
        whitelist = []
        blacklisted_sense_keys = get_blacklisted_sense_keys(vocab)
        markdown_and_url_keys = get_markdown_and_url_keys(vocab)
        minority_keys = get_minority_keys(vocab, min_freq_ratio)
        redundant_keys = get_redundant_keys(vocab, vectors, min_distance)
        for k in check_keys_list:
            if k in blacklisted_sense_keys:
                blacklist[k] = 'sense'
            elif k in markdown_and_url_keys:
                blacklist[k] = 'markdown / url'
            elif k in minority_keys:
                blacklist[k] = 'minority'
            elif k in redundant_keys:
                blacklist[k] = 'redundant'
            else:
                whitelist.append(k)
        msg.warn('blacklist')
        for k, v in blacklist.items():
            msg.warn(f"{k}: {v}")
        msg.good('whitelist')
        for k in whitelist:
            msg.good(k)
    else:
        discarded = set()
        discarded.update(get_blacklisted_sense_keys(vocab))
        discarded.update(get_markdown_and_url_keys(vocab))
        discarded.update(get_minority_keys(vocab, min_freq_ratio))
        discarded.update(get_redundant_keys(vocab, vectors, min_distance))
        n_vectors = len(vectors) - len(discarded)
        s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
        for key, vector in vectors.items():
            if key not in discarded:
                s2v.add(key, vector)
                if key in vocab:
                    s2v.set_freq(key, vocab[key])
        msg.good("Created the sense2vec model")
        msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
        s2v.to_disk(output_path)
        msg.good("Saved model to directory", out_dir)
Example #26
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from unidecode import unidecode
from collections import Counter
import numpy as np
import string
import itertools
import csv
import json
from spellchecker import SpellChecker
import spacy
from sense2vec import Sense2Vec

# Load the essential models
nlp = spacy.load("en_core_web_sm")  # needs to be replaced with large model
# s2vOrg = nlp.add_pipe("sense2vec")
# s2vOrg.from_disk("./data/s2v_reddit_2015_md")
s2vOrg = Sense2Vec().from_disk("./data/s2v_reddit_2015_md")


def loadFile(path):
    corpuses = []  # used to store final results
    file = open(path, "rt", encoding="utf-8")
    # data = csv.reader(file, delimiter=",")
    data = csv.DictReader(file)
    removeEntries = ['isFinal', 'category', 'hit', 'mergeParent']
    for row in data:
        # drop bookkeeping columns before storing the row
        for entry in removeEntries:
            row.pop(entry)
        corpuses.append(row)
    # print(corpuses)
    return corpuses

Example #27
def load_vectors(path):
    return Sense2Vec().from_disk(path)
Example #28
# Weights to be used in each of the following functions
#################################################################
start_word = "Not_Considered"
# set the previous-word dissimilarity weight
pwsw = 0.7
# set the maximum weight for a perfectly matched rhyming word
pw = 0.9
# set the weight for a matching first letter
flw = 0.1
# set the weight for a matching second letter
slw = 0.05

#################################################################

nlp = spacy.load("en_core_web_lg")
s2v = Sense2Vec().from_disk("C:/fyp/s2v_reddit_2019_lg")
#read in and process the input list the user wants to remember
#with open("api/v3/input_list/input_list.txt","r", encoding="utf-8") as f:
#    TEXT = f.read()
#doc = nlp(TEXT)

def create_output_list_v3(doc, in_start_word,pw,slw,flw, pwsw):
    # takes a one-word start_word from the user and, for every word in the input list,
    # returns the most similar unique word that starts with the same letter
    # there is currently an unneeded for loop, and it throws warnings about the .similarity call
    doc = nlp(doc)
    highest_scoring_list = []
    second_best_list=[]
    third_best_list=[]
    result = list()
    docu = nlp(in_start_word)
    previous_word = docu[0]
    #         value_word_lemma, value_sense
    #     ] not in input_list_reduced_to_lemma:
    #       seen.add(value_word_lemma_sense_joined)
    #       result.append(item)
    #   return result


if __name__ == '__main__':
    import os
    from sense2vec import Sense2Vec
    from s2v_util import S2vUtil
    from s2v_senses import S2vSenses
    from s2v_key_case_and_sense_variations import S2vKeyCaseAndSenseVariations
    from s2v_key_commonizer import S2vKeyCommonizer
    from s2v_synonyms import S2vSynonyms
    S2V_MODEL_PATH = os.getenv('S2V_MODEL_PATH')
    print("loading model from disk..", S2V_MODEL_PATH)
    s2v = Sense2Vec().from_disk(S2V_MODEL_PATH)
    print("model loaded.")
    s2v_util = S2vUtil(s2v)
    s2v_senses = S2vSenses(s2v_util)
    s2v_key_variations = S2vKeyCaseAndSenseVariations(s2v_util, s2v_senses)
    s2v_key_commonizer = S2vKeyCommonizer()
    syn_service = S2vSynonyms(s2v_util, s2v_key_variations, s2v_key_commonizer)
    req_args = {
        'attempt-phrase-join-for-compound-phrases': 1,
        'min-score': 0.5,
        'n': 10,
        'match-input-sense': 1,
        'reduce-multicase': 1,
        'reduce-compound-nouns': 1,
        'min-word-len': 2,
    }
Example #30
text_file.close()

word_counter = Counter(corpus.split())
most_common = word_counter.most_common()
i = count = 0
most_common_list = []
while count < top_n:
    word = most_common[i][0]
    if word in pos_aspect or word in pos_opinion:
        most_common_list.append(word)
        count += 1
    i += 1
print(most_common_list)
print(len(most_common_list))

s2v = Sense2Vec().from_disk(folder)

if selection_mode == 'SVM':
    model_file = Path('{}/model.pkl'.format(folder))
    if model_file.is_file():
        most_similar = []
        for word in most_common_list:
            most_similar.extend(s2v.most_similar(word, n=most_similar_n))
        most_similar_words = [word[0] for word in most_similar]
        most_similar_words.extend(most_common_list)
        most_similar_vectors = [s2v[word] for word in most_similar_words]
        labels = []
        for word in most_similar_words:
            label = 1 if word in pos_aspect else 0 if word in neg_example else -1
            labels.append(label)
        test_df = pd.DataFrame({