def test_sense2vec_other_senses():
    s2v = Sense2Vec(shape=(6, 4))
    s2v.cfg["senses"] = ["A", "B", "C", "D"]
    for key in ["a|A", "a|B", "a|C", "b|A", "b|C", "c|A"]:
        s2v.add(key, numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    others = s2v.get_other_senses("a|A")
    assert sorted(others) == ["a|B", "a|C"]
    others = s2v.get_other_senses("b|C")
    assert others == ["b|A"]
    others = s2v.get_other_senses("B|C")
    assert others == ["b|A"]
    others = s2v.get_other_senses("c|A")
    assert others == []
def getVectorsDistance(word1, word2):
    """
    If both given keys are in the vocabulary, returns the cosine similarity
    of their corresponding vectors (via Sense2Vec.similarity), otherwise 0.
    """
    global s2v
    if s2v is None:
        # Vector files are not present in the repository
        s2v = Sense2Vec().from_disk(r"C:\SKOLA\machine_learning\Project\pretrainedVectors\s2v_reddit_2019_lg")
        # s2v = Sense2Vec().from_disk(r"C:\SKOLA\machine_learning\Project\wiki corpus/backup/exported")
    if word1 in s2v and word2 in s2v:
        return s2v.similarity(word1, word2)
    return 0
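# Usage sketch for the helper above. The keys are illustrative only: sense2vec
# entries are keyed as "phrase|SENSE", so callers pass fully qualified keys,
# and anything missing from the loaded vector package simply yields 0.
if __name__ == "__main__":
    print(getVectorsDistance("machine_learning|NOUN", "deep_learning|NOUN"))
    print(getVectorsDistance("machine_learning|NOUN", "no_such_key|NOUN"))  # -> 0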
def main(in_file, vocab_file, out_dir, min_freq_ratio=0.0, min_distance=0.0):
    """
    Step 5: Export a sense2vec component

    Expects a vectors.txt and a vocab file trained with GloVe and exports
    a component that can be loaded with Sense2Vec.from_disk.
    """
    input_path = Path(in_file)
    vocab_path = Path(vocab_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if input_path.suffix == ".bin":
        msg.fail("Need text-based vectors file, not binary", in_file, exits=1)
    if not vocab_path.exists():
        msg.fail("Can't find vocab file", vocab_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    with input_path.open("r", encoding="utf8") as f:
        (n_vectors, vector_size), f = _get_shape(f)
        vectors_data = f.readlines()
    with vocab_path.open("r", encoding="utf8") as f:
        vocab = read_vocab(f)
    vectors = {}
    all_senses = set()
    for item in vectors_data:
        item = item.rstrip().rsplit(" ", vector_size)
        key = item[0]
        try:
            _, sense = split_key(key)
        except ValueError:
            continue
        vec = item[1:]
        if len(vec) != vector_size:
            msg.fail(f"Wrong vector size: {len(vec)} (expected {vector_size})", exits=1)
        all_senses.add(sense)
        vectors[key] = numpy.asarray(vec, dtype=numpy.float32)
    discarded = set()
    discarded.update(get_minority_keys(vocab, min_freq_ratio))
    discarded.update(get_redundant_keys(vocab, vectors, min_distance))
    n_vectors = len(vectors) - len(discarded)
    s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
    for key, vector in vectors.items():
        if key not in discarded:
            s2v.add(key, vector)
            s2v.set_freq(key, vocab[key])
    msg.good("Created the sense2vec model")
    msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
    s2v.to_disk(output_path)
    msg.good("Saved model to directory", out_dir)
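# A minimal sketch of driving the export step directly from Python; the file
# names and output directory below are placeholders, and the real script may
# expose main() through a CLI wrapper that is not shown in this excerpt.
if __name__ == "__main__":
    main(
        in_file="vectors.txt",      # text-based GloVe vectors (not .bin)
        vocab_file="vocab.txt",     # GloVe vocab file with frequencies
        out_dir="s2v_component",    # created if it does not exist
        min_freq_ratio=0.0,
        min_distance=0.0,
    )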
def test_sense2vec_to_from_bytes():
    s2v = Sense2Vec(shape=(2, 4))
    test_vector1 = numpy.asarray([1, 2, 3, 4], dtype=numpy.float32)
    test_vector2 = numpy.asarray([5, 6, 7, 8], dtype=numpy.float32)
    s2v.add("test1", test_vector1, 123)
    s2v.add("test2", test_vector2, 456)
    s2v_bytes = s2v.to_bytes()
    new_s2v = Sense2Vec().from_bytes(s2v_bytes)
    assert len(new_s2v) == 2
    assert new_s2v.vectors.shape == (2, 4)
    assert "test1" in new_s2v
    assert "test2" in new_s2v
    assert new_s2v.get_freq("test1") == 123
    assert new_s2v.get_freq("test2") == 456
    assert numpy.array_equal(new_s2v["test1"], test_vector1)
    assert numpy.array_equal(new_s2v["test2"], test_vector2)
    assert s2v_bytes == new_s2v.to_bytes()
    s2v_bytes2 = s2v.to_bytes(exclude=["strings"])
    new_s2v2 = Sense2Vec().from_bytes(s2v_bytes2)
    assert len(new_s2v2.strings) == 0
    assert "test1" in new_s2v2
    assert s2v.strings["test1"] in new_s2v2
    with pytest.raises(KeyError):  # can't resolve hash
        new_s2v2.strings[s2v.strings["test2"]]
def test_sense2vec_best_sense():
    s2v = Sense2Vec(shape=(5, 4))
    s2v.cfg["senses"] = ["A", "B", "C"]
    for key, freq in [("a|A", 100), ("a|B", 50), ("a|C", 10), ("b|A", 1), ("B|C", 2)]:
        s2v.add(key, numpy.asarray([4, 2, 2, 2], dtype=numpy.float32), freq)
    assert s2v.get_best_sense("a") == "a|A"
    assert s2v.get_best_sense("A") == "a|A"
    assert s2v.get_best_sense("b") == "B|C"
    assert s2v.get_best_sense("b", ignore_case=False) == "b|A"
    assert s2v.get_best_sense("c") is None
    s2v.cfg["senses"] = []
    assert s2v.get_best_sense("a") is None
    assert s2v.get_best_sense("b", ["A"]) == "b|A"
    assert s2v.get_best_sense("b", ["A", "C"]) == "B|C"
def __init__(self):
    self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
    model = T5ForConditionalGeneration.from_pretrained('Parth/result')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # model.eval()
    self.device = device
    self.model = model
    self.nlp = spacy.load('en_core_web_sm')
    self.s2v = Sense2Vec().from_disk('s2v_old')
    self.fdist = FreqDist(brown.words())
    self.normalized_levenshtein = levenshtein
    self.set_seed(42)
def sense2vec_get_words(word):
    vAR_s2v = Sense2Vec().from_disk('s2v_old')
    vAR_output = []
    vAR_word = word.lower()
    vAR_word = vAR_word.replace(" ", "_")
    vAR_sense = vAR_s2v.get_best_sense(vAR_word)
    if vAR_sense:
        vAR_most_similar = vAR_s2v.most_similar(vAR_sense, n=20)
        for each_word in vAR_most_similar:
            vAR_append_word = each_word[0].split("|")[0].replace("_", " ").lower()
            if vAR_append_word.lower() != word.lower():
                vAR_output.append(vAR_append_word.title())
    vAR_out = list(OrderedDict.fromkeys(vAR_output))
    return vAR_out
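# Illustrative call (results depend on the vector package loaded from
# 's2v_old'): multi-word phrases work because spaces are replaced with
# underscores before get_best_sense() is queried.
if __name__ == "__main__":
    print(sense2vec_get_words("natural language processing"))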
def get_candidates_closest_to_seed_terms(terms, num_of_candidates, num_of_top_frequency_terms_to_consider):
    s2v = Sense2Vec().from_disk("s2v_reddit_2019_lg")
    query = get_query_from_terms(terms, s2v)
    # have some extra because of non top frequency cands
    most_similar = s2v.most_similar(query, n=num_of_candidates * 50)
    candidates = [i[0] for i in most_similar]
    clean_candidates = [t for t in terms]
    most_frequent = s2v.frequencies[:num_of_top_frequency_terms_to_consider]
    most_frequent = [i[0] for i in most_frequent]
    for cand in candidates:
        if cand in most_frequent:
            without_pos = cand.split("|")[0]
            clean = without_pos.replace("_", " ").lower()
            to_add = clean.replace(".", "")
            if to_add not in clean_candidates:
                clean_candidates.append(to_add)
                if len(clean_candidates) == num_of_candidates:
                    break
    return set(clean_candidates)
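# Illustrative call; the seed terms are placeholders, and the returned set
# always includes them alongside the high-frequency sense2vec neighbours.
if __name__ == "__main__":
    candidates = get_candidates_closest_to_seed_terms(
        ["climate", "weather"],
        num_of_candidates=20,
        num_of_top_frequency_terms_to_consider=100000,
    )
    print(candidates)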
def __init__(self, lang_code='en', max_questions=20):
    self.tokenizer = T5Tokenizer.from_pretrained('t5-base')
    model = T5ForConditionalGeneration.from_pretrained('Parth/result')
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    # model.eval()
    self.device = device
    self.model = model
    self.nlp = self.try_load_spacy_model(lang_code)
    self.max_questions = int(max_questions)
    self.s2v = Sense2Vec().from_disk(
        '/Users/dev/Develop/text-to-anki/backend/src/Questgen.ai/Questgen.ai/Questgen/s2v_old'
    )
    self.fdist = FreqDist(brown.words())
    self.normalized_levenshtein = NormalizedLevenshtein()
    self.set_seed(42)
def test_sense2vec_object():
    s2v = Sense2Vec(shape=(10, 4))
    assert s2v.vectors.shape == (10, 4)
    assert len(s2v) == 10
    test_vector = numpy.asarray([4, 2, 2, 2], dtype=numpy.float32)
    s2v.add("test", test_vector)
    assert "test" in s2v
    assert isinstance(s2v.strings["test"], int)
    assert s2v.strings["test"] in s2v
    assert "foo" not in s2v
    assert numpy.array_equal(s2v["test"], test_vector)
    assert numpy.array_equal(s2v[s2v.strings["test"]], test_vector)
    assert list(s2v.keys()) == ["test"]
    s2v.add("test2", test_vector)
    assert "test2" in s2v
    assert sorted(list(s2v.keys())) == ["test", "test2"]
    with pytest.raises(ValueError):
        s2v["test3"] = test_vector
    s2v["test2"] = test_vector
def s2v_mock():
    from sense2vec import Sense2Vec
    import numpy as np

    s2v = Sense2Vec(shape=(16, 4))
    s2v.add('New_York|GPE', np.asarray([1, 1, 1, 1], dtype=np.float32))
    s2v.add('New_York|NOUN', np.asarray([1, 2, 1, 1], dtype=np.float32))
    s2v.add('big|ADJ', np.asarray([2, 5, 4, 2], dtype=np.float32))
    s2v.add('BIG|ADJ', np.asarray([2, 5, 4, 1], dtype=np.float32))
    s2v.add('apple|NOUN', np.asarray([1, 3, 9, 3], dtype=np.float32))
    s2v.add('big_apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('Big_Apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('Big_Apple|LOC', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('Big_apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('BIG_apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('BIG_Apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('BIG_APPLE|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('black|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('black|ADJ', np.asarray([5, 5, 5, 5], dtype=np.float32))
    s2v.add('blue|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    s2v.add('blue_big_apple|NOUN', np.asarray([6, 6, 6, 6], dtype=np.float32))
    return s2v
def test_sense2vec_most_similar():
    s2v = Sense2Vec(shape=(6, 4))
    s2v.add("a", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    s2v.add("b", numpy.asarray([4, 4, 2, 2], dtype=numpy.float32))
    s2v.add("c", numpy.asarray([4, 4, 4, 2], dtype=numpy.float32))
    s2v.add("d", numpy.asarray([4, 4, 4, 4], dtype=numpy.float32))
    s2v.add("x", numpy.asarray([4, 2, 2, 2], dtype=numpy.float32))
    s2v.add("y", numpy.asarray([0.1, 1, 1, 1], dtype=numpy.float32))
    result1 = s2v.most_similar(["x"], n=2)
    assert len(result1) == 2
    assert result1[0][0] == "a"
    # "a" has the same vector as "x", so its score should be (approximately) 1.0
    assert result1[0][1] == pytest.approx(1.0)
    assert result1[1][0] == "b"
    result2 = s2v.most_similar(["a", "x"], n=2)
    assert len(result2) == 2
    assert sorted([key for key, _ in result2]) == ["b", "d"]
    result3 = s2v.most_similar(["a", "b"], n=3)
    assert len(result3) == 3
    assert "y" not in [key for key, _ in result3]
    assert len(s2v.most_similar(["a", "b"], n=10)) == 4
    with pytest.raises(ValueError):
        s2v.most_similar(["z"], n=1)  # key not in table
def test_registry():
    """Test that custom functions are used internally if they're registered."""

    @registry.make_key.register("custom_make_key")
    def custom_make_key(word, sense):
        return f"{word}###{sense}"

    @registry.split_key.register("custom_split_key")
    def custom_split_key(key):
        return tuple(key.split("###"))

    overrides = {"make_key": "custom_make_key", "split_key": "custom_split_key"}
    test_vector = numpy.asarray([1, 2, 3, 4], dtype=numpy.float32)
    data = [("clear", "NOUN", 100), ("clear", "VERB", 200), ("clear", "ADJ", 300)]
    s2v = Sense2Vec(shape=(len(data), 4), overrides=overrides)
    for word, sense, freq in data:
        s2v.add(custom_make_key(word, sense), test_vector, freq)
        s2v.cfg["senses"].append(sense)
    assert "clear###NOUN" in s2v
    other_senses = s2v.get_other_senses("clear###NOUN")
    assert len(other_senses) == 2
    assert "clear###VERB" in other_senses
    assert "clear###ADJ" in other_senses
    assert s2v.get_best_sense("clear") == "clear###ADJ"
#         value_word_lemma, value_sense
#     ] not in input_list_reduced_to_lemma:
#         seen.add(value_word_lemma_sense_joined)
#         result.append(item)
#     return result


if __name__ == '__main__':
    from sense2vec import Sense2Vec
    from s2v_util import S2vUtil
    from s2v_senses import S2vSenses
    from s2v_key_case_and_sense_variations import S2vKeyCaseAndSenseVariations
    from s2v_key_commonizer import S2vKeyCommonizer

    S2V_MODEL_PATH = os.getenv('S2V_MODEL_PATH')
    print("loading model from disk..", S2V_MODEL_PATH)
    s2v = Sense2Vec().from_disk(S2V_MODEL_PATH)
    print("model loaded.")
    s2v_util = S2vUtil(s2v)
    s2v_senses = S2vSenses(s2v_util)
    s2v_key_variations = S2vKeyCaseAndSenseVariations(s2v_util, s2v_senses)
    s2v_key_commonizer = S2vKeyCommonizer()
    syn_service = S2vSynonyms(s2v_util, s2v_key_variations, s2v_key_commonizer)
    req_args = {
        'attempt-phrase-join-for-compound-phrases': 1,
        'min-score': 0.5,
        'n': 10,
        'match-input-sense': 1,
        'reduce-multicase': 1,
        'reduce-compound-nouns': 1,
        'min-word-len': 2,
    }
        np.around((input_array - np.min(input_array)) / np.ptp(input_array), decimals=3))
    if inverse:
        return list(map(lambda x: round(vals_range - x - min_val, 3), new_vals))
    return new_vals


# vals = [0, 21, 2288, 52300, 35004]
# print(normalize_distribution(vals))
# print(normalize_distribution(vals, inverse=True))
# exit()

print("loading model from disk..")
s2v = Sense2Vec().from_disk(os.environ['S2V_MODEL_PATH'])
print("model loaded.")

# 2015 model: s2v keys len: 1195261
print("s2v keys len: ", len(s2v))

freq_by_word_count = {}
freq_distribution_by_word_count = {}
# for key in s2v.keys():
#     word_count = len(key.split('_'))
#     if word_count <= 9:
#         # if word_count > 6:
#         #     print('big word', word_count, key, s2v.get_freq(key))
#         if word_count in freq_by_word_count:
#             freq_by_word_count[word_count] += 1
    eval_whole=False,
    eval_only=False,
    show_scores=False,
):
    """
    Evaluate a sense2vec model by asking about phrase triples: is word A more
    similar to word B, or to word C? If the human mostly agrees with the
    model, the vectors model is good.
    """
    random.seed(0)
    log("RECIPE: Starting recipe sense2vec.eval", locals())
    strategies = eval_strategies.get_all()
    if strategy not in strategies.keys():
        err = f"Invalid strategy '{strategy}'. Expected: {list(strategies.keys())}"
        msg.fail(err, exits=1)
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)

    def get_html(key, score=None, large=False):
        word, sense = s2v.split_key(key)
        html_word = f"<span style='font-size: {30 if large else 20}px'>{word}</span>"
        html_sense = f"<strong style='opacity: 0.75; font-size: 14px; padding-left: 10px'>{sense}</strong>"
        html = f"{html_word} {html_sense}"
        if show_scores and score is not None:
            html += f" <span style='opacity: 0.75; font-size: 12px; padding-left: 10px'>{score:.4}</span>"
        return html

    def get_stream():
        strategy_func = eval_strategies.get(strategy)
        log(f"RECIPE: Using strategy {strategy}")
        # Limit to most frequent entries
def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=20,
    batch_size=5,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases.
    For each seed term, the best matching sense according to the sense2vec
    vectors will be used.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = "<span style='font-size: {{theme.largeText}}px'>{{word}}</span><strong style='opacity: 0.75'>{{sense}}</strong>"
    accept_keys = []
    seen = set(accept_keys)
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            raise ValueError(f"Can't find seed term '{seed}' in vectors")
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {"score": 1.0},
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        accept_keys += prev_accept
        seen.update(set(accept_keys))
        log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}")

    def update(answers):
        """Updates accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        presenting examples to the user with a similarity above the threshold
        parameter."""
        while True:
            log(
                f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys"
            )
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            for key, score in most_similar:
                if key not in seen and score > threshold:
                    seen.add(key)
                    word, sense = s2v.split_key(key)
                    # Make sure the score is a regular float, otherwise server
                    # may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score)}
                    yield {"text": key, "word": word, "sense": sense, "meta": meta}

    stream = get_stream()
    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {"batch_size": batch_size, "html_template": html_template},
    }
import os  # needed for os.getuid() below
from flask import Flask, request, Response
import json
import datetime

from sense2vec import Sense2Vec
from s2v_util import S2vUtil
from s2v_senses import S2vSenses
from s2v_key_case_and_sense_variations import S2vKeyCaseAndSenseVariations
from s2v_key_commonizer import S2vKeyCommonizer
from s2v_similarity import S2vSimilarity
from s2v_synonyms import S2vSynonyms

app = Flask(__name__)
port = 80 if os.getuid() == 0 else 8000

print("loading model from disk..")
s2v = Sense2Vec().from_disk("/sense2vec-model")
print("model loaded.")

s2v_util = S2vUtil(s2v)
s2v_senses = S2vSenses(s2v_util)
s2v_key_variations = S2vKeyCaseAndSenseVariations(s2v_util, s2v_senses)
s2v_key_commonizer = S2vKeyCommonizer()
similarity_service = S2vSimilarity(s2v_util, s2v_key_variations, s2v_key_commonizer)
synonyms_service = S2vSynonyms(s2v_util, s2v_key_variations, s2v_key_commonizer)


@app.route('/', methods=['POST', 'GET'])
def index():
    start = datetime.datetime.utcnow()
    data = request.data.decode('utf-8')
text_file.close()

word_counter = Counter(corpus.split())
most_common = word_counter.most_common()
i = count = 0
most_common_list = []
while count < top_n:
    word = most_common[i][0]
    if word in pos_aspect or word in pos_opinion:
        most_common_list.append(word)
        count += 1
    i += 1
print(most_common_list)
print(len(most_common_list))

s2v = Sense2Vec().from_disk(folder)

if selection_mode == 'SVM':
    model_file = Path('{}/model.pkl'.format(folder))
    if model_file.is_file():
        most_similar = []
        for word in most_common_list:
            most_similar.extend(s2v.most_similar(word, n=most_similar_n))
        most_similar_words = [word[0] for word in most_similar]
        most_similar_words.extend(most_common_list)
        most_similar_vectors = [s2v[word] for word in most_similar_words]
        labels = []
        for word in most_similar_words:
            label = 1 if word in pos_aspect else 0 if word in neg_example else -1
            labels.append(label)
        test_df = pd.DataFrame({
from sense2vec import Sense2Vec, Sense2VecComponent
import spacy, pandas, pickle

nlp = spacy.load("en_core_web_sm")
s2v = Sense2Vec().from_disk("./models/s2v_reddit_2015_md/s2v_old/")

df = pandas.read_csv("./twitter_data/exploration_dataset.csv")
vectors_df = pandas.DataFrame(columns=['id', 'vectors', 'label', "size", "text"])
corpus = []

for idx, row in df.head(100).iterrows():
    print("Parsing sentences")
    try:
        doc = nlp(row['text'])
        vectors = []
        for token in doc:
            key = "{0}|{1}".format(token.lemma_, token.pos_)
            if key in s2v:
                vector = s2v[key]
                vectors.append(vector)
        vectors_df = vectors_df.append(
            {
                "id": idx,
                "vectors": vectors,
                'label': row['label'],
                "size": len(vectors),
                "text": row['text']
            },
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from unidecode import unidecode
from collections import Counter
import numpy as np
import string
import itertools
import csv
import json
from spellchecker import SpellChecker
import spacy  # needed for spacy.load below
from sense2vec import Sense2Vec  # needed for Sense2Vec().from_disk below

# Load the essential models
nlp = spacy.load("en_core_web_sm")  # needs to be replaced with large model
# s2vOrg = nlp.add_pipe("sense2vec")
# s2vOrg.from_disk("./data/s2v_reddit_2015_md")
s2vOrg = Sense2Vec().from_disk("./data/s2v_reddit_2015_md")


def loadFile(path):
    corpuses = []  # used to store final results
    file = open(path, "rt", encoding="utf-8")
    # data = csv.reader(file, delimiter=",")
    data = csv.DictReader(file)
    removeEntries = ['isFinal', 'category', 'hit', 'mergeParent']
    for row in data:
        dict = list(map(row.pop, removeEntries))
        corpuses.append(row)
    # print(corpuses)
    return corpuses
def __init__(self, sense2vec_path):
    self.s2v = Sense2Vec().from_disk(sense2vec_path)
def main(model_path, out_dir, min_freq_ratio=0.0, min_distance=0.0, check_keys=''):
    check_keys_list = []
    if len(check_keys) > 0:
        check_keys_list = list(map(lambda x: x.strip(), check_keys.split(',')))

    s2v = Sense2Vec().from_disk(model_path)
    output_path = Path(out_dir)

    vocab = {}
    for key, score in s2v.frequencies:
        vocab[key] = score

    vectors = {}
    for key, val in s2v:
        vectors[key] = val
    msg.info("loading vectors")
    for key, val in s2v:
        vector_size = len(val)
        break
    all_senses = s2v.senses
    msg.info("loaded vectors")

    if len(check_keys_list) > 0:
        blacklist = {}
        whitelist = []
        blacklisted_sense_keys = get_blacklisted_sense_keys(vocab)
        markdown_and_url_keys = get_markdown_and_url_keys(vocab)
        minority_keys = get_minority_keys(vocab, min_freq_ratio)
        redundant_keys = get_redundant_keys(vocab, vectors, min_distance)
        for k in check_keys_list:
            if k in blacklisted_sense_keys:
                blacklist[k] = 'sense'
            elif k in markdown_and_url_keys:
                blacklist[k] = 'markdown / url'
            elif k in minority_keys:
                blacklist[k] = 'minority'
            elif k in redundant_keys:
                blacklist[k] = 'redundant'
            else:
                whitelist.append(k)
        msg.warn('blacklist')
        for k in blacklist.keys():
            msg.warn("{k}: {v}".format(k=k, v=blacklist[k]))
        msg.good('whitelist')
        for k in whitelist:
            msg.good(k)
    else:
        discarded = set()
        discarded.update(get_blacklisted_sense_keys(vocab))
        discarded.update(get_markdown_and_url_keys(vocab))
        discarded.update(get_minority_keys(vocab, min_freq_ratio))
        discarded.update(get_redundant_keys(vocab, vectors, min_distance))
        n_vectors = len(vectors) - len(discarded)
        s2v = Sense2Vec(shape=(n_vectors, vector_size), senses=all_senses)
        for key, vector in vectors.items():
            if key not in discarded:
                s2v.add(key, vector)
                if key in vocab:
                    s2v.set_freq(key, vocab[key])
        msg.good("Created the sense2vec model")
        msg.info(f"{n_vectors} vectors, {len(all_senses)} total senses")
        s2v.to_disk(output_path)
        msg.good("Saved model to directory", out_dir)
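# Sketch of invoking the pruning/export step directly; the paths and keys are
# placeholders. Passing check_keys switches to inspection mode, which only
# reports why each listed key would be kept or discarded instead of writing a
# new model to out_dir.
if __name__ == "__main__":
    main(
        model_path="/path/to/s2v_reddit_2019_lg",
        out_dir="s2v_pruned",
        min_freq_ratio=0.0,
        min_distance=0.0,
        check_keys="natural_language_processing|NOUN, machine_learning|NOUN",
    )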
def load_vectors(path):
    return Sense2Vec().from_disk(path)
def teach(
    dataset,
    vectors_path,
    seeds,
    threshold=0.85,
    n_similar=100,
    batch_size=5,
    case_sensitive=False,
    resume=False,
):
    """
    Bootstrap a terminology list using sense2vec. Prodigy will suggest similar
    terms based on the most similar phrases from sense2vec, and the
    suggestions will be adjusted as you annotate and accept similar phrases.
    For each seed term, the best matching sense according to the sense2vec
    vectors will be used.

    If no similar terms are found above the given threshold, the threshold is
    lowered by 0.1 and similar terms are requested again.
    """
    log("RECIPE: Starting recipe sense2vec.teach", locals())
    s2v = Sense2Vec().from_disk(vectors_path)
    log("RECIPE: Loaded sense2vec vectors", vectors_path)
    html_template = "<span style='font-size: {{theme.largeText}}px'>{{word}}</span>"
    accept_keys = []
    seen = set()
    seed_tasks = []
    for seed in seeds:
        key = s2v.get_best_sense(seed)
        if key is None:
            msg.fail(f"Can't find seed term '{seed}' in vectors", exits=1)
        accept_keys.append(key)
        best_word, best_sense = s2v.split_key(key)
        seen.add(best_word if case_sensitive else best_word.lower())
        task = {
            "text": key,
            "word": best_word,
            "sense": best_sense,
            "meta": {"score": 1.0, "sense": best_sense},
            "answer": "accept",
        }
        seed_tasks.append(set_hashes(task))
    print(f"Starting with seed keys: {accept_keys}")
    DB = connect()
    if dataset not in DB:
        DB.add_dataset(dataset)
    dataset_hashes = DB.get_task_hashes(dataset)
    DB.add_examples(
        [st for st in seed_tasks if st[TASK_HASH_ATTR] not in dataset_hashes],
        datasets=[dataset],
    )

    if resume:
        prev = DB.get_dataset(dataset)
        prev_accept_keys = [eg["text"] for eg in prev if eg["answer"] == "accept"]
        prev_words = [
            eg["word"] if case_sensitive else eg["word"].lower() for eg in prev
        ]
        accept_keys += prev_accept_keys
        seen.update(set(prev_words))
        log(f"RECIPE: Resuming from {len(prev)} previous examples in dataset {dataset}")

    def update(answers):
        """Updates accept_keys so that the stream can find new phrases."""
        log(f"RECIPE: Updating with {len(answers)} answers")
        for answer in answers:
            phrase = answer["text"]
            if answer["answer"] == "accept":
                accept_keys.append(phrase)

    def get_stream():
        """Continue querying sense2vec whenever we get a new phrase and
        presenting examples to the user with a similarity above the threshold
        parameter."""
        nonlocal threshold
        while True:
            log(
                f"RECIPE: Looking for {n_similar} phrases most similar to "
                f"{len(accept_keys)} accepted keys"
            )
            most_similar = s2v.most_similar(accept_keys, n=n_similar)
            log(f"RECIPE: Found {len(most_similar)} most similar phrases")
            n_skipped = 0
            n_duplicate = 0
            for key, score in most_similar:
                if score > threshold:
                    word, sense = s2v.split_key(key)
                    if (case_sensitive and word in seen) or (
                        not case_sensitive and word.lower() in seen
                    ):
                        n_duplicate += 1
                        continue
                    seen.add(word if case_sensitive else word.lower())
                    # Make sure the score is a regular float, otherwise server
                    # may fail when trying to serialize it to/from JSON
                    meta = {"score": float(score), "sense": sense}
                    yield {"text": key, "word": word, "sense": sense, "meta": meta}
                else:
                    n_skipped += 1
            if n_skipped:
                log(f"RECIPE: Skipped {n_skipped} phrases below threshold {threshold}")
            if n_skipped == len(most_similar) - n_duplicate:
                # No most similar phrases were found that are above the
                # threshold, so lower the threshold if it's not already 0 or
                # return empty list so Prodigy shows "no tasks available"
                new_threshold = threshold - 0.1
                if new_threshold <= 0.0:
                    log(f"RECIPE: No suggestions for threshold {threshold:.2}")
                    return []
                log(f"RECIPE: Lowering threshold from {threshold:.2} to {new_threshold:.2}")
                threshold = new_threshold

    stream = get_stream()
    return {
        "view_id": "html",
        "dataset": dataset,
        "stream": stream,
        "update": update,
        "config": {"batch_size": batch_size, "html_template": html_template},
    }
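# Typical invocation, assuming this function is registered as the Prodigy
# recipe "sense2vec.teach" (the decorator is not shown in this excerpt); the
# dataset name, vectors path and seed terms are placeholders:
#
#   prodigy sense2vec.teach my_terms /path/to/s2v_reddit_2019_lg \
#       --seeds "natural language processing, machine learning"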
# Weights to be used in each of the following functions
#################################################################
start_word = "Not_Considered"
# set previous word dis-similarity weight
pwsw = 0.7
# set the max value of weighting associated to a perfectly matched rhyming word
pw = 0.9
# set the value associated with a matching first letter
flw = 0.1
# set the value associated with a matching second letter
slw = 0.05
#################################################################

nlp = spacy.load("en_core_web_lg")
s2v = Sense2Vec().from_disk("C:/fyp/s2v_reddit_2019_lg")

# read in and process the input list the user wants to remember
# with open("api/v3/input_list/input_list.txt", "r", encoding="utf-8") as f:
#     TEXT = f.read()
# doc = nlp(TEXT)


def create_output_list_v3(doc, in_start_word, pw, slw, flw, pwsw):
    # This takes in a one-word start_word from the user and, for every word in
    # the input list, returns the most similar unique word to it that starts
    # with the same letter.
    # There is currently an unneeded for loop, and it throws warnings about
    # the .similarity call.
    doc = nlp(doc)
    highest_scoring_list = []
    second_best_list = []
    third_best_list = []
    result = list()
    docu = nlp(in_start_word)
    previous_word = docu[0]
def s2v():
    data_path = Path(__file__).parent / "data"
    return Sense2Vec().from_disk(data_path)
from sense2vec import Sense2Vec
import sqlite3
import csv
import argparse
import os

parser = argparse.ArgumentParser(description="create initial ontology graph")
parser.add_argument("--keywordDB", type=str, default="../data/keywords.db", required=False, help="path to sqlite keywords db file")
parser.add_argument("--s2vModel", type=str, default="../data/sense2vec_train/05", required=False, help="path to trained sense2vec model")
parser.add_argument("--outputDir", type=str, default="../data/", required=False, help="output directory")
parser.add_argument("--threshold", type=float, default=0.6, required=False, help="cosine similarity threshold used to create edges between keywords")
parser.add_argument("--keywordLimit", type=int, default=100000, required=False, help="max number of keywords to create the graph from")
args = parser.parse_args()

# load s2v model
s2v = Sense2Vec().from_disk(args.s2vModel)
words = list(s2v.keys())
limit = str(args.keywordLimit)

# load keywords
conn = sqlite3.connect(args.keywordDB)
c = conn.cursor()
c.execute(f"SELECT words, COUNT(paperID) AS word_count FROM keywords GROUP BY words ORDER BY word_count DESC limit {limit};")
vectoredKeywords = []
keyword = c.fetchone()

senses = s2v.senses
senses.remove("PUNCT")  # remove punctuation
senses.remove("X")      # remove uncategorized words

while keyword is not None:
    keyword = keyword[0]