Code example #1
def test_registry():
    """Test that custom functions are used internally if they're registered."""
    @registry.make_key.register("custom_make_key")
    def custom_make_key(word, sense):
        return f"{word}###{sense}"

    @registry.split_key.register("custom_split_key")
    def custom_split_key(key):
        return tuple(key.split("###"))

    overrides = {
        "make_key": "custom_make_key",
        "split_key": "custom_split_key"
    }
    test_vector = numpy.asarray([1, 2, 3, 4], dtype=numpy.float32)
    data = [("clear", "NOUN", 100), ("clear", "VERB", 200),
            ("clear", "ADJ", 300)]
    s2v = Sense2Vec(shape=(len(data), 4), overrides=overrides)
    for word, sense, freq in data:
        s2v.add(custom_make_key(word, sense), test_vector, freq)
        s2v.cfg["senses"].append(sense)
    assert "clear###NOUN" in s2v
    other_senses = s2v.get_other_senses("clear###NOUN")
    assert len(other_senses) == 2
    assert "clear###VERB" in other_senses
    assert "clear###ADJ" in other_senses
    assert s2v.get_best_sense("clear") == "clear###ADJ"
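A minimal follow-on sketch (not part of the original test): assuming Sense2Vec.make_key and Sense2Vec.split_key dispatch to the registered functions, the same overrides make a fresh instance use the custom "###" separator.

s2v_custom = Sense2Vec(
    shape=(1, 4),
    overrides={"make_key": "custom_make_key", "split_key": "custom_split_key"},
)
# the instance's own key helpers should now use the custom functions registered above
assert s2v_custom.make_key("duck", "NOUN") == "duck###NOUN"
assert s2v_custom.split_key("duck###NOUN") == ("duck", "NOUN")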
Code example #2
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Vectors file (text-based)"),
    vocab_file: str = typer.Argument(..., help="Vocabulary file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    min_freq_ratio: float = typer.Option(0.0, "--min-freq-ratio", "-r", help="Frequency ratio threshold for discarding minority senses or casings"),
    min_distance: float = typer.Option(0.0, "--min-distance", "-s", help="Similarity threshold for discarding redundant keys"),
    # fmt: on
):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2Vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab = read_vocab(f)
    vectors = {}
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
    discarded = set()
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    n_vectors = len(vectors) - len(discarded)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
    for key, vector in vectors.items():
        if key not in discarded:
            s2v.add(key, vector)
            s2v.set_freq(key, vocab[key])
    msg.good("Created the sense2vec model")
    msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
Code example #3
File: sense2vec.py, Project: JasonBenn/worldview
    def handle(self, *args, **options):

        s2v = Sense2Vec().from_disk("/Users/jasonbenn/data/s2v_old")
        query = "natural_language_processing|NOUN"
        assert query in s2v
        vector = s2v[query]
        freq = s2v.get_freq(query)
        most_similar = s2v.most_similar(query, n=3)
        from IPython import embed
        embed()
Code example #4
File: 05_export.py, Project: trizna3/S2V_ML
def main(in_file, vocab_file, out_dir):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2Vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab_data = f.readlines()
    data = []
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})",
                     exits=1)
        all_senses.add(sense)
        data.append((key, numpy.asarray(vec, dtype=numpy.float32)))
    s2v = Sense2Vec(shape=(len(data), vector_size), senses=all_senses)
    for key, vector in data:
        s2v.add(key, vector)
    for item in vocab_data:
        item = item.rstrip()
        if item.endswith(" word"):  # for fastText vocabs
            item = item[:-5]
        try:
            key, freq = item.rsplit(" ", 1)
        except ValueError:
            continue
        s2v.set_freq(key, int(freq))
    msg.good("Created the sense2vec model")
    msg.info(f"{len(data)} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
Code example #5
def s2v_mock():
    from sense2vec import Sense2Vec
    import numpy as np
    s2v = Sense2Vec(shape=(10, 4))
    s2v.add('New_York|GPE', np.asarray([1, 1, 1, 1], dtype=np.float32))
    s2v.add('New_York|NOUN', np.asarray([1, 2, 1, 1], dtype=np.float32))
    s2v.add('big|ADJ', np.asarray([2, 5, 4, 2], dtype=np.float32))
    s2v.add('BIG|ADJ', np.asarray([2, 5, 4, 1], dtype=np.float32))
    s2v.add('apple|NOUN', np.asarray([1, 3, 9, 3], dtype=np.float32))
    s2v.add('big_apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    return s2v
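A small sketch (not in the original) of how this mock might be exercised in a test; the comparison follows from the toy vectors defined above.

def test_with_mock():
    s2v = s2v_mock()
    assert "New_York|GPE" in s2v
    # the two New_York senses are closer to each other than to apple|NOUN
    assert s2v.similarity("New_York|GPE", "New_York|NOUN") > s2v.similarity("New_York|GPE", "apple|NOUN")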
Code example #6
File: Generate_Choice.py, Project: crond-jaist/CyATP
def generateOtherChoice(answer, count):
    # make the answer lowercase
    answer = answer.lower()
    closestWords = []

    try:

        doc1 = nlp(answer)

        # if answer count ==1, use glove2word2vec
        if (len(doc1) == 1):
            closestWords = model_glove.most_similar(positive=[doc1.text],
                                                    topn=count)

        # if answer count >1, use Sense2Vec
        else:
            temp = doc1.text.replace(' ', '_') + '|NOUN'

            s2v = Sense2Vec().from_disk("./Out_Source/s2v_old")
            most_similar = s2v.most_similar(temp, n=count)

            for each_choice in most_similar:
                del_ = each_choice[0].replace('_', ' ')
                choice, sep, suffix = del_.partition('|')
                closestWords.append((choice, each_choice[1]))

    except Exception:
        # if anything above fails (missing model, unknown key, ...), fall back to no extra choices
        return []

    other_choice = list(map(lambda x: x[0], closestWords))[0:count]

    # Remove words that are the same as the answer
    temp = other_choice[:]
    for i in temp:
        choice = nlp(i)
        for token in choice:
            if (token.lemma_ in answer) or (answer in token.lemma_):
                other_choice.remove(i)
                break

    # Remove the cognate words in the options
    other_choice_ = []
    for j in other_choice:
        choice = nlp(j)
        if (len(choice) == 1):
            for token in choice:
                other_choice_.append(token.lemma_)
        else:
            tempword = ''
            for token in choice:
                tempword = tempword + token.lemma_ + ' '
            other_choice_.append(tempword.rstrip(' '))

    return list(set(other_choice_))
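A hedged call sketch (assumes the nlp, model_glove, and ./Out_Source/s2v_old resources referenced above are available):

choices = generateOtherChoice("machine learning", 4)
print(choices)  # a de-duplicated list of distractor phrases related to the answer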
Code example #7
def test_sense2vec_other_senses():
    s2v = Sense2Vec(shape=(6, 4))
    s2v.cfg["senses"] = ["A", "B", "C", "D"]
    for key in ["a|A", "a|B", "a|C", "b|A", "b|C", "c|A"]:
        s2v.add(key, numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    others = s2v.get_other_senses("a|A")
    assert sorted(others) == ["a|B", "a|C"]
    others = s2v.get_other_senses("b|C")
    assert others == ["b|A"]
    others = s2v.get_other_senses("c|A")
    assert others == []
Code example #8
def test_sense2vec_best_sense():
    s2v = Sense2Vec(shape=(5, 4))
    s2v.cfg["senses"] = ["A", "B", "C"]
    for key, freq in [("a|A", 100), ("a|B", 50), ("a|C", 10), ("b|A", 1), ("B|C", 2)]:
        s2v.add(key, numpy.asarray([4, 2, 2, 2], dtype=numpy.float32), freq)
    assert s2v.get_best_sense("a") == "a|A"
    assert s2v.get_best_sense("b") == "B|C"
    assert s2v.get_best_sense("b", ignore_case=False) == "b|A"
    assert s2v.get_best_sense("c") is None
    s2v.cfg["senses"] = []
    assert s2v.get_best_sense("a") is None
Code example #9
def test_sense2vec_freqs():
    s2v = Sense2Vec(shape=(10, 4))
    vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    s2v.add("test1", vector, 123)
    s2v.add("test2", vector, 456)
    assert len(s2v.freqs) == 2
    assert s2v.get_freq("test1") == 123
    assert s2v.get_freq("test2") == 456
    assert s2v.get_freq("test3") is None
    assert s2v.get_freq("test3", 100) == 100
    s2v.set_freq("test3", 200)
    assert s2v.get_freq("test3") == 200
Code example #10
File: SimilarityDict.py, Project: trizna3/S2V_ML
def getVectorsDistance(word1, word2):
    '''
    If both given words are in the vocabulary, returns the similarity of their corresponding vectors as computed by Sense2Vec.similarity; otherwise returns 0.
    '''
    global s2v
    if s2v is None:
        # Vector files are not present in the repository
        s2v = Sense2Vec().from_disk("C:\SKOLA\machine_learning\Project\pretrainedVectors\s2v_reddit_2019_lg")
        # s2v = Sense2Vec().from_disk("C:\SKOLA\machine_learning\Project\wiki corpus/backup/exported")

    if word1 in s2v and word2 in s2v:
        return s2v.similarity(word1, word2)
    return 0
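A hedged usage sketch (keys follow the "word|SENSE" format and must be present in the loaded vectors for a nonzero result):

print(getVectorsDistance("cat|NOUN", "dog|NOUN"))  # similarity score, or 0 if either key is missing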
Code example #11
def test_sense2vec_similarity():
    s2v = Sense2Vec(shape=(5, 4))
    s2v.add("a", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32))
    s2v.add("b", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32))
    s2v.add("c", numpy.asarray([4, 4, 4, 2], dtype=numpy.float32))
    s2v.add("d", numpy.asarray([0.1, 0.2, 0.3, 0.4], dtype=numpy.float32))
    s2v.add("e", numpy.asarray([0, 0, 0, 0], dtype=numpy.float32))
    assert s2v.similarity("a", "b") == 1.0
    assert 1.0 > s2v.similarity("b", "c") > 0.9
    assert 1.0 > s2v.similarity(["a", "b"], "c") > 0.9
    assert s2v.similarity("b", "c") == s2v.similarity(["a", "b"], "c")
    assert s2v.similarity("a", "d") < 0.8
    assert s2v.similarity("a", "e") == 0.0
Code example #12
def test_sense2vec_to_from_bytes():
    s2v = Sense2Vec(shape=(2, 4))
    test_vector1 = numpy.asarray([1, 2, 3, 4], dtype=numpy.float32)
    test_vector2 = numpy.asarray([5, 6, 7, 8], dtype=numpy.float32)
    s2v.add("test1", test_vector1, 123)
    s2v.add("test2", test_vector2, 456)
    s2v_bytes = s2v.to_bytes()
    new_s2v = Sense2Vec().from_bytes(s2v_bytes)
    assert len(new_s2v) == 2
    assert new_s2v.vectors.shape == (2, 4)
    assert "test1" in new_s2v
    assert "test2" in new_s2v
    assert new_s2v.get_freq("test1") == 123
    assert new_s2v.get_freq("test2") == 456
    assert numpy.array_equal(new_s2v["test1"], test_vector1)
    assert numpy.array_equal(new_s2v["test2"], test_vector2)
    assert s2v_bytes == new_s2v.to_bytes()
    s2v_bytes2 = s2v.to_bytes(exclude=["strings"])
    new_s2v2 = Sense2Vec().from_bytes(s2v_bytes2)
    assert len(new_s2v2.strings) == 0
    assert "test1" in new_s2v2
    assert s2v.strings["test1"] in new_s2v2
    with pytest.raises(KeyError):  # can't resolve hash
        new_s2v2.strings[s2v.strings["test2"]]
Code example #13
File: main.py, Project: sreerajm104/HerokuQuestGen
    def __init__(self):

        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        model = T5ForConditionalGeneration.from_pretrained('Parth/result')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # model.eval()
        self.device = device
        self.model = model
        self.nlp = spacy.load('en_core_web_sm')

        self.s2v = Sense2Vec().from_disk('s2v_old')

        self.fdist = FreqDist(brown.words())
        self.normalized_levenshtein = levenshtein
        self.set_seed(42)
Code example #14
def sense2vec_get_words(word):
    vAR_s2v = Sense2Vec().from_disk('s2v_old')
    vAR_output = []
    vAR_word = word.lower()
    vAR_word = vAR_word.replace(" ", "_")

    vAR_sense = vAR_s2v.get_best_sense(vAR_word)
    if vAR_sense:
        vAR_most_similar = vAR_s2v.most_similar(vAR_sense, n=20)
        for each_word in vAR_most_similar:
            vAR_append_word = each_word[0].split("|")[0].replace("_",
                                                                 " ").lower()
            if vAR_append_word.lower() != word.lower():
                vAR_output.append(vAR_append_word.title())

    vAR_out = list(OrderedDict.fromkeys(vAR_output))
    return vAR_out
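A short call sketch (the input word is arbitrary; the s2v_old directory loaded above must exist on disk):

distractors = sense2vec_get_words("natural language processing")
print(distractors)  # related phrases, title-cased and de-duplicated, excluding the input word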
Code example #15
def get_candidates_closest_to_seed_terms(terms, num_of_candidates, num_of_top_frequency_terms_to_consider):
    s2v = Sense2Vec().from_disk("s2v_reddit_2019_lg")
    query = get_query_from_terms(terms, s2v)
    most_similar = s2v.most_similar(query, n=num_of_candidates * 50)  # have some extra because of non top frequency cands
    candidates = [i[0] for i in most_similar]
    clean_candidates = [t for t in terms]
    most_frequent = s2v.frequencies[:num_of_top_frequency_terms_to_consider]
    most_frequent = [i[0] for i in most_frequent]
    for cand in candidates:
        if cand in most_frequent:
            without_pos = cand.split("|")[0]
            clean = without_pos.replace("_", " ").lower()
            to_add = clean.replace(".", "")
            if to_add not in clean_candidates:
                clean_candidates.append(to_add)
        if len(clean_candidates) == num_of_candidates:
            break
    return set(clean_candidates)
Code example #16
def test_sense2vec_object():
    s2v = Sense2Vec(shape=(10, 4))
    assert s2v.vectors.shape == (10, 4)
    assert len(s2v) == 10
    test_vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    s2v.add("test", test_vector)
    assert "test" in s2v
    assert isinstance(s2v.strings["test"], int)
    assert s2v.strings["test"] in s2v
    assert "foo" not in s2v
    assert numpy.array_equal(s2v["test"], test_vector)
    assert numpy.array_equal(s2v[s2v.strings["test"]], test_vector)
    assert list(s2v.keys()) == ["test"]
    s2v.add("test2", test_vector)
    assert "test2" in s2v
    assert sorted(list(s2v.keys())) == ["test", "test2"]
    with pytest.raises(ValueError):
        s2v["test3"] = test_vector
    s2v["test2"] = test_vector
Code example #17
    def __init__(self, lang_code='en', max_questions=20):

        self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
        model = T5ForConditionalGeneration.from_pretrained('Parth/result')
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        model.to(device)
        # model.eval()
        self.device = device
        self.model = model
        self.nlp = self.try_load_spacy_model(lang_code)
        self.max_questions = int(max_questions)

        self.s2v = Sense2Vec().from_disk(
            '/Users/dev/Develop/text-to-anki/backend/src/Questgen.ai/Questgen.ai/Questgen/s2v_old'
        )

        self.fdist = FreqDist(brown.words())
        self.normalized_levenshtein = NormalizedLevenshtein()
        self.set_seed(42)
Code example #18
def test_sense2vec_most_similar():
    s2v = Sense2Vec(shape=(6, 4))
    s2v.add("a", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    s2v.add("b", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32))
    s2v.add("c", numpy.asarray([4, 4, 4, 2], dtype=numpy.float32))
    s2v.add("d", numpy.asarray([4, 4, 4, 4], dtype=numpy.float32))
    s2v.add("x", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    s2v.add("y", numpy.asarray([0.1, 1, 1, 1], dtype=numpy.float32))
    result1 = s2v.most_similar(["x"], n=2)
    assert len(result1) == 2
    assert result1[0][0] == "a"
    assert result1[0][1] == 1.0
    assert result1[0][1] == pytest.approx(1.0)
    assert result1[1][0] == "b"
    result2 = s2v.most_similar(["a", "x"], n=2)
    assert len(result2) == 2
    assert sorted([key for key, _ in result2]) == ["b", "d"]
    result3 = s2v.most_similar(["a", "b"], n=3)
    assert len(result3) == 3
    assert "y" not in [key for key, _ in result3]
    assert len(s2v.most_similar(["a", "b"], n=10)) == 4
    with pytest.raises(ValueError):
        s2v.most_similar(["z"], n=1)  # key not in table
Code example #19
def s2v():
    data_path = Path(__file__).parent / "data"
    return Sense2Vec().from_disk(data_path)
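This helper looks like a pytest fixture (the @pytest.fixture decorator presumably sits just above the excerpt); a test would then receive the loaded model by naming the fixture as a parameter, roughly:

def test_model_loads(s2v):
    # pytest injects the return value of the s2v() fixture defined above
    assert len(s2v) > 0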
Code example #20
    def __init__(self, sense2vec_path):
        self.s2v = Sense2Vec().from_disk(sense2vec_path)
Code example #21
File: server.py, Project: joshweir/sense2vec-rest
from flask import Flask, request, Response
import os
import json
import datetime
from sense2vec import Sense2Vec
from s2v_util import S2vUtil
from s2v_senses import S2vSenses
from s2v_key_case_and_sense_variations import S2vKeyCaseAndSenseVariations
from s2v_key_commonizer import S2vKeyCommonizer
from s2v_similarity import S2vSimilarity
from s2v_synonyms import S2vSynonyms

app = Flask(__name__)
port = 80 if os.getuid() == 0 else 8000

print("loading model from disk..")
s2v = Sense2Vec().from_disk("/sense2vec-model")
print("model loaded.")
s2v_util = S2vUtil(s2v)
s2v_senses = S2vSenses(s2v_util)
s2v_key_variations = S2vKeyCaseAndSenseVariations(s2v_util, s2v_senses)
s2v_key_commonizer = S2vKeyCommonizer()
similarity_service = S2vSimilarity(s2v_util, s2v_key_variations,
                                   s2v_key_commonizer)
synonyms_service = S2vSynonyms(s2v_util, s2v_key_variations,
                               s2v_key_commonizer)


@app.route('/', methods=['POST', 'GET'])
def index():
    start = datetime.datetime.utcnow()
    data = request.data.decode('utf-8')
Code example #22
    eval_whole=False,
    eval_only=False,
    show_scores=False,
):
    """
    Evaluate a sense2vec model by asking about phrase triples: is word A more
    similar to word B, or to word C? If the human mostly agrees with the model,
    the vectors model is good.
    """
    random.seed(0)
    log("RECIPE: Starting recipe sense2vec.eval", locals())
    strategies = eval_strategies.get_all()
    if strategy not in strategies.keys():
        err = f"Invalid strategy '{strategy}'. Expected: {list(strategies.keys())}"
        msg.fail(err, exits=1)
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)

    def get_html(key, score=None, large=False):
        word, sense = s2v.split_key(key)
        html_word = f"<span style='font-size: {30 if large else 20}px'>{word}</span>"
        html_sense = f"<strong style='opacity: 0.75; font-size: 14px; padding-left: 10px'>{sense}</strong>"
        html = f"{html_word} {html_sense}"
        if show_scores and score is not None:
            html += f" <span style='opacity: 0.75; font-size: 12px; padding-left: 10px'>{score:.4}</span>"
        return html

    def get_stream():
        strategy_func = eval_strategies.get(strategy)
        log(f"RECIPE: Using strategy {strategy}")
        # Limit to most frequent entries
Code example #23
def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=100,
    batch_size=5,
    case_sensitive=False,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases. For
    each seed term, the best matching sense according to the sense2vec vectors
    will be used.

    If no similar terms are found above the given threshold, the threshold is
    lowered by 0.1 and similar terms are requested again.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = "<span style='font-size: {{theme.largeText}}px'>{{word}}</span>"
    accept_keys = []
    seen = set()
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            msg.fail(f"Can't find seed term '{seed}' in vectors", exits=1)
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        seen.add(best_word if case_sensitive else best_word.lower())
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {
                "score": 1.0,
                "sense": best_sense
            },
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept_keys = [
            eg["text"] for eg in prev if eg["answer"] == "accept"
        ]
        prev_words = [
            eg["word"] if case_sensitive else eg["word"].lower() for eg in prev
        ]
        accept_keys += prev_accept_keys
        seen.update(set(prev_words))
        log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}"
            )

    def update(answers):
        """Updates accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        presenting examples to the user with a similarity above the threshold
        parameter."""
        nonlocal threshold
        while True:
            log(f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys")
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            n_skipped = 0
            n_duplicate = 0
            for key, score in most_similar:
                if score > threshold:
                    word, sense = s2v.split_key(key)
                    if (case_sensitive
                            and word in seen) or (not case_sensitive
                                                  and word.lower() in seen):
                        n_duplicate += 1
                        continue
                    seen.add(word if case_sensitive else word.lower())
                    # Make sure the score is a regular float, otherwise server
                    # may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score), "sense": sense}
                    yield {
                        "text": key,
                        "word": word,
                        "sense": sense,
                        "meta": meta
                    }
                else:
                    n_skipped += 1
            if n_skipped:
                log(f"RECIPE: Skipped {n_skipped} phrases below threshold {threshold}"
                    )
            if n_skipped == len(most_similar) - n_duplicate:
                # No most similar phrases were found that are above the
                # threshold, so lower the threshold if it's not already 0 or
                # return empty list so Prodigy shows "no tasks available"
                new_threshold = threshold - 0.1
                if new_threshold <= 0.0:
                    log(f"RECIPE: No suggestions for threshold {threshold:.2}")
                    return []
                log(f"RECIPE: Lowering threshold from {threshold:.2} to {new_threshold:.2}"
                    )
                threshold = new_threshold

    stream = get_stream()

    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {
            "batch_size": batch_size,
            "html_template": html_template
        },
    }
Code example #24
def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=20,
    batch_size=5,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases. For
    each seed term, the best matching sense according to the sense2vec vectors
    will be used.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = "<span style='font-size: {{theme.largeText}}px'>{{word}}</span><strong style='opacity: 0.75'>{{sense}}</strong>"
    accept_keys = []
    seen = set(accept_keys)
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            raise ValueError(f"Can't find seed term '{seed}' in vectors")
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {
                "score": 1.0
            },
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        accept_keys += prev_accept
        seen.update(set(accept_keys))
        log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}"
            )

    def update(answers):
        """Updates accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        presenting examples to the user with a similarity above the threshold
        parameter."""
        while True:
            log(f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys")
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            for key, score in most_similar:
                if key not in seen and score > threshold:
                    seen.add(key)
                    word, sense = s2v.split_key(key)
                    # Make sure the score is a regular float, otherwise server
                    # may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score)}
                    yield {
                        "text": key,
                        "word": word,
                        "sense": sense,
                        "meta": meta
                    }

    stream = get_stream()

    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {
            "batch_size": batch_size,
            "html_template": html_template
        },
    }
Code example #25
File: prune_s2v.py, Project: joshweir/sense2vec-rest
def main(model_path,
         out_dir,
         min_freq_ratio=0.0,
         min_distance=0.0,
         check_keys=''):
    check_keys_list = []
    if len(check_keys) > 0:
        check_keys_list = list(map(lambda x: x.strip(), check_keys.split(',')))

    s2v = Sense2Vec().from_disk(model_path)
    output_path = Path(out_dir)
    vocab = {}
    for key, score in s2v.frequencies:
        vocab[key] = score
    vectors = {}
    for key, val in s2v:
        vectors[key] = val
    msg.info("loading vectors")
    for key, val in s2v:
        vector_size = len(val)
        break
    all_senses = s2v.senses
    msg.info("loaded vectors")

    if len(check_keys_list) > 0:
        blacklist = {}
        whitelist = []
        blacklisted_sense_keys = get_blacklisted_sense_keys(vocab)
        markdown_and_url_keys = get_markdown_and_url_keys(vocab)
        minority_keys = get_minority_keys(vocab, min_freq_ratio)
        redundant_keys = get_redundant_keys(vocab, vectors, min_distance)
        for k in check_keys_list:
            if k in blacklisted_sense_keys:
                blacklist[k] = 'sense'
            elif k in markdown_and_url_keys:
                blacklist[k] = 'markdown / url'
            elif k in minority_keys:
                blacklist[k] = 'minority'
            elif k in redundant_keys:
                blacklist[k] = 'redundant'
            else:
                whitelist.append(k)
        msg.warn('blacklist')
        for k in blacklist.keys():
            msg.warn("{k}: {v}".format(k=k, v=blacklist[k]))
        msg.good('whitelist')
        for k in whitelist:
            msg.good(k)
    else:
        discarded = set()
        discarded.update(get_blacklisted_sense_keys(vocab))
        discarded.update(get_markdown_and_url_keys(vocab))
        discarded.update(get_minority_keys(vocab, min_freq_ratio))
        discarded.update(get_redundant_keys(vocab, vectors, min_distance))
        n_vectors = len(vectors) - len(discarded)
        s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
        for key, vector in vectors.items():
            if key not in discarded:
                s2v.add(key, vector)
                if key in vocab:
                    s2v.set_freq(key, vocab[key])
        msg.good("Created the sense2vec model")
        msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
        s2v.to_disk(output_path)
        msg.good("Saved model to directory", out_dir)
Code example #26
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from unidecode import unidecode
from collections import Counter
import numpy as np
import string
import itertools
import csv
import json
from spellchecker import SpellChecker
import spacy
from sense2vec import Sense2Vec

# Load the essential models
nlp = spacy.load("en_core_web_sm")  # needs to be replaced with large model
# s2vOrg = nlp.add_pipe("sense2vec")
# s2vOrg.from_disk("./data/s2v_reddit_2015_md")
s2vOrg = Sense2Vec().from_disk("./data/s2v_reddit_2015_md")


def loadFile(path):
    corpuses = []  # used to store final results
    file = open(path, "rt", encoding="utf-8")
    # data = csv.reader(file, delimiter=",")
    data = csv.DictReader(file)
    removeEntries = ['isFinal', 'category', 'hit', 'mergeParent']
    for row in data:
        removed = list(map(row.pop, removeEntries))  # drop the unused columns from each row
        corpuses.append(row)
    # print(corpuses)
    return corpuses

Code example #27
def load_vectors(path):
    return Sense2Vec().from_disk(path)
Code example #28
import spacy
from sense2vec import Sense2Vec

# Weights to be used in each of the following functions
#################################################################
start_word = "Not_Considered"
#set previous word dis-similarity weight
pwsw = 0.7
#set the max value of weighting associated to a perfectly matched rhyming word
pw  = 0.9
#set the value associated with a matching first letter
flw = 0.1
#set the value associated with a matching second letter
slw = 0.05

#################################################################

nlp = spacy.load("en_core_web_lg")
s2v = Sense2Vec().from_disk("C:/fyp/s2v_reddit_2019_lg")
#read in and process the input list the user wants to remember
#with open("api/v3/input_list/input_list.txt","r", encoding="utf-8") as f:
#    TEXT = f.read()
#doc = nlp(TEXT)

def create_output_list_v3(doc, in_start_word, pw, slw, flw, pwsw):
    #this takes in a one-word start_word from the user and, for every word in the input list, returns the most similar unique word that starts with the same letter
    #there is currently an unneeded for loop, and it throws warnings about the .similarity call
    doc = nlp(doc)
    highest_scoring_list = []
    second_best_list=[]
    third_best_list=[]
    result = list()
    docu = nlp(in_start_word)
    previous_word = docu[0]
Code example #29
    #         value_word_lemma, value_sense
    #     ] not in input_list_reduced_to_lemma:
    #       seen.add(value_word_lemma_sense_joined)
    #       result.append(item)
    #   return result


if __name__ == '__main__':
    from sense2vec import Sense2Vec
    from s2v_util import S2vUtil
    from s2v_senses import S2vSenses
    from s2v_key_case_and_sense_variations import S2vKeyCaseAndSenseVariations
    from s2v_key_commonizer import S2vKeyCommonizer
    import os  # os.getenv is used below; may already be imported at module level
    S2V_MODAL_PATH = os.getenv('S2V_MODEL_PATH')
    print("loading model from disk..", S2V_MODAL_PATH)
    s2v = Sense2Vec().from_disk(S2V_MODAL_PATH)
    print("model loaded.")
    s2v_util = S2vUtil(s2v)
    s2v_senses = S2vSenses(s2v_util)
    s2v_key_variations = S2vKeyCaseAndSenseVariations(s2v_util, s2v_senses)
    s2v_key_commonizer = S2vKeyCommonizer()
    syn_service = S2vSynonyms(s2v_util, s2v_key_variations, s2v_key_commonizer)
    req_args = {
        'attempt-phrase-join-for-compound-phrases': 1,
        'min-score': 0.5,
        'n': 10,
        'match-input-sense': 1,
        'reduce-multicase': 1,
        'reduce-compound-nouns': 1,
        'min-word-len': 2,
    }
Code example #30
text_file.close()

word_counter = Counter(corpus.split())
most_common = word_counter.most_common()
i = count = 0
most_common_list = []
while count < top_n:
    word = most_common[i][0]
    if word in pos_aspect or word in pos_opinion:
        most_common_list.append(word)
        count += 1
    i += 1
print(most_common_list)
print(len(most_common_list))

s2v = Sense2Vec().from_disk(folder)

if selection_mode == 'SVM':
    model_file = Path('{}/model.pkl'.format(folder))
    if model_file.is_file():
        most_similar = []
        for word in most_common_list:
            most_similar.extend(s2v.most_similar(word, n=most_similar_n))
        most_similar_words = [word[0] for word in most_similar]
        most_similar_words.extend(most_common_list)
        most_similar_vectors = [s2v[word] for word in most_similar_words]
        labels = []
        for word in most_similar_words:
            label = 1 if word in pos_aspect else 0 if word in neg_example else -1
            labels.append(label)
        test_df = pd.DataFrame({