def conduct_experiment(self, devcorpus, candidates_list):
    """
    Streamlines experiments with the various ranking modules

    :param devcorpus: devcorpus generated with make_devcorpus.py
    :param candidates_list: list of candidate list per misspelling
    :return: correction accuracy, list of corrections
    """
    corrected_list = devcorpus[0]
    detection_list = devcorpus[1]
    detection_contexts = devcorpus[2]

    self.corrected_list = corrected_list
    self.detection_list = detection_list
    self.detection_contexts = detection_contexts
    self.candidates_list = candidates_list

    if self.ranking_method == 'context':
        print("Loading embeddings")
        r = Reach.load(self.pathtovectors, header=True)
        print("Done")
        correction_list = self.ranking_experiment(detection_list,
                                                  detection_contexts,
                                                  candidates_list, r)
    elif self.ranking_method == 'noisy_channel':
        correction_list = self.noisychannel_ranking(detection_list,
                                                    candidates_list)
    elif self.ranking_method == 'frequency':
        correction_list = self.frequency_baseline(detection_list,
                                                  candidates_list)
    elif self.ranking_method == 'ensemble':
        print("Loading embeddings")
        r = Reach.load(self.pathtovectors, header=True)
        print("Done")
        correction_list = self.ranking_experiment(detection_list,
                                                  detection_contexts,
                                                  candidates_list, r)
        correction_list_2 = self.noisychannel_ranking(detection_list,
                                                      candidates_list)
        # fall back to the noisy channel correction whenever its
        # confidence margin is high enough
        for i, confidence in enumerate(self.confidences):
            if confidence > 1.3:
                correction_list[i] = correction_list_2[i]
    else:
        raise ValueError('No valid ranking method given')

    score = self.sub_sampling(correction_list, corrected_list)
    self.correction_list = correction_list
    self.score = score

    return score, correction_list
def __init__(self, detection_list, language, model, k, backoff,
             pathtofrequencies, pathtomodel, pathtovectors):
    """
    :param detection_list: list with tuples containing (misspelling,
        list of 10 left context tokens, list of 10 right context tokens)
    :param language: 1 if English, 0 if Dutch
    :param model: 1 if context-sensitive, 0 if noisy channel
    :param k: number of ranked corrections returned
    """
    # prepare model
    print('Initializing spelling correction model...')

    assert len(detection_list[0]) == 3, 'Wrong input format'
    self.misspellings, self.left_contexts, self.right_contexts = zip(
        *detection_list)
    assert len(self.misspellings) == len(self.left_contexts) == len(
        self.right_contexts), 'Input data not properly synchronized'
    print(len(self.misspellings), 'misspellings to correct')

    self.ranking_model = model
    assert self.ranking_model in range(2), \
        'No valid correction model specified'
    assert k >= 1, 'No valid k specified'
    self.k = k
    self.backoff = backoff

    if language == 1:
        self.language = 'en'
    elif language == 0:
        self.language = 'nl'
    else:
        raise ValueError('No valid language input specified')

    # load embedding model and corpus frequencies
    with open(pathtofrequencies, 'r') as f:
        self.frequency_dict = json.load(f)
    self.model = fasttext.load_model(pathtomodel)
    self.r = Reach.load(pathtovectors, header=True)

    # set language-specific parameters for correction
    if self.language == "en":
        self.window_size = 9
        self.oov_penalty = 1.7
    elif self.language == "nl":
        self.window_size = 10
        self.oov_penalty = 2.4

    print('Model initialized')
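# Hypothetical usage sketch, not part of the original module: the class name
# `SpellingCorrection` and every path below are illustrative assumptions.
detections = [
    # (misspelling, 10 left context tokens, 10 right context tokens)
    ("paracetemol", ["the"] * 10, ["was"] * 10),
]
corrector = SpellingCorrection(detections,
                               language=1,    # 1 = English, 0 = Dutch
                               model=1,       # 1 = context-sensitive
                               k=3,           # return 3-best corrections
                               backoff='frequency',
                               pathtofrequencies='data/frequencies.json',
                               pathtomodel='models/fasttext.bin',
                               pathtovectors='vectors/embeddings.vec')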
def frequency_baseline(self, detection_list, candidates_list):
    """
    Majority frequency baseline

    :param detection_list: list of misspellings
    :param candidates_list: list of candidate list per misspelling
    :return: list with corrections or k-best corrections
    """
    correction_list = []

    for misspelling, candidates in zip(detection_list, candidates_list):
        # only candidates with a known corpus frequency can be ranked
        candidates = [candidate for candidate in candidates
                      if candidate in self.frequency_dict]
        frequencies = [self.frequency_dict[candidate]
                       for candidate in candidates]
        if self.k == 1:
            try:
                correction_list.append(candidates[np.argmax(frequencies)])
            except ValueError:
                # no candidates survived the frequency filter
                correction_list.append('')
        elif self.k > 1:
            correction_list.append(
                [candidates[i]
                 for i in np.argsort(frequencies)[::-1][:self.k]])
        else:
            raise ValueError('k must be a positive natural number')

    return correction_list
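# Toy illustration of the ranking above (made-up words and frequencies):
# the most frequent known candidate wins.
import numpy as np

toy_frequencies = {"spelling": 1200, "spieling": 3}
toy_candidates = ["spieling", "spelling"]
ranked = [toy_candidates[i]
          for i in np.argsort([toy_frequencies[c]
                               for c in toy_candidates])[::-1]]
print(ranked)  # ['spelling', 'spieling']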
def tune_oov(devcorpus, candidates_list, best_parameters, language):
    """
    Conduct a search for the best oov penalty for a corpus

    :param devcorpus: devcorpus generated with make_devcorpus.py
    :param candidates_list: list of candidate list per misspelling
    :param best_parameters: best parameters for the devcorpus
    :param language: language from ["en", "nl"]
    :return: dictionary with oov penalties as keys and their correction
        accuracy as values
    """
    dev = Development(best_parameters, language)

    print("Loading embeddings")
    r = Reach.load(dev.pathtovectors, header=True)
    print("Done")

    corrected_list = devcorpus[0]
    detection_list = devcorpus[1]
    detection_contexts = devcorpus[2]

    scores_dict = {}
    # sweep penalties 0.0, 0.1, ..., 2.9
    values = [value / 10 for value in range(30)]
    for value in values:
        dev.oov_penalty = value
        correction_list = dev.ranking_experiment(detection_list,
                                                 detection_contexts,
                                                 candidates_list, r)
        accuracy = len([c for i, c in enumerate(correction_list)
                        if c == corrected_list[i]]) / len(correction_list)
        scores_dict[value] = accuracy

    return scores_dict
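# Follow-up sketch (not in the original module), assuming devcorpus,
# candidates_list and best_parameters are already in scope: pick the
# penalty with the highest accuracy from the returned dictionary.
scores_dict = tune_oov(devcorpus, candidates_list, best_parameters, "en")
best_penalty = max(scores_dict, key=scores_dict.get)
print(best_penalty, scores_dict[best_penalty])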
import json

import numpy as np
from reach import Reach

# `experiment` and `reciprocal` are assumed to be defined in this
# project's own modules (their imports are not shown in this excerpt).

parsed_train = json.load(open("data/partners_uima.json"))
parsed_train = list(zip(*sorted(parsed_train.items())))[1]
gold_train = json.load(open("data/partners_gold.json"))
gold_train = list(zip(*sorted(gold_train.items())))[1]

parsed_test = json.load(open("data/beth_uima.json"))
parsed_test = list(zip(*sorted(parsed_test.items())))[1]
gold_test = json.load(open("data/beth_gold.json"))
gold_test = list(zip(*sorted(gold_test.items())))[1]

txt, gold_chunks_train = zip(*gold_train)
_, gold_chunks_test = zip(*gold_test)

embeddings = Reach.load("")

# Sanity check: parsed and gold documents must align token-for-token.
for a, b in zip(parsed_train, gold_train):
    assert len(a[0]) == len(b[0])
for a, b in zip(parsed_test, gold_test):
    assert len(a[0]) == len(b[0])

knn_focus = experiment(parsed_train,
                       gold_chunks_train,
                       parsed_test,
                       gold_chunks_test,
                       np.mean,
                       np.mean,
                       embeddings,
                       reciprocal,
import json
from collections import defaultdict, Counter

from cat.simple import get_scores, rbf_attention
from cat.dataset import restaurants_train
from reach import Reach
from sklearn.metrics import precision_recall_fscore_support

GAMMA = .03
BEST_ATT = {"n_noun": 980}
BEST_RBF = {"n_noun": 200}

if __name__ == "__main__":

    scores = defaultdict(dict)
    r = Reach.load("embeddings/restaurant_vecs_w2v.vec", unk_word="<UNK>")
    att = rbf_attention
    datums = list(restaurants_train())
    d = json.load(open("data/nouns_restaurant.json"))

    nouns = Counter()
    for k, v in d.items():
        if k.lower() in r.items:
            nouns[k.lower()] += v

    if att == rbf_attention:
        # set the <UNK> row to the maximum vector value
        r.vectors[r.items["<UNK>"]] = r.vectors.max()
        candidates, _ = zip(*nouns.most_common(BEST_RBF["n_noun"]))
"""Test with word embeddings.""" from reach import Reach from plate.plate import circular_convolution, decode if __name__ == "__main__": r = Reach.load("PATH_TO_EMBEDDINGS") # Encode "dog chase cat" a = circular_convolution(r["subject"], r["dog"]) b = circular_convolution(r["verb"], r["chase"]) c = circular_convolution(r["object"], r["cat"]) sentence = a + b + c vec = decode(r["subject"], sentence) result = r.nearest_neighbor(vec) # The top result should be dog
import json
from collections import defaultdict

from cat.simple import get_scores, rbf_attention
from reach import Reach

GAMMA = .03
N_ASPECT_WORDS = 200

if __name__ == "__main__":

    scores = defaultdict(dict)
    r = Reach.load("embeddings/my_word_vectors.vec", unk_word="<UNK>")

    aspects = [[x] for x in json.load(open("data/aspect_words.json"))]
    aspects = aspects[:N_ASPECT_WORDS]

    instances = ["text_1".split(), "text_2".split()]
    label_set = {"label1", "label2", "label3"}

    s = get_scores(instances,
                   aspects,
                   r,
                   label_set,
                   gamma=GAMMA,
                   remove_oov=False,
                   attention_func=rbf_attention)

    pred = s.argmax(1)
import json

import numpy as np
from reach import Reach

# `experiment` and `reciprocal` are assumed to be defined in this
# project's own modules (their imports are not shown in this excerpt).

gold = json.load(open("data/beth_gold.json"))
gold = list(zip(*sorted(gold.items())))[1]
txt, gold_chunks = zip(*gold)

data = json.load(open("data/beth_uima.json"))
data = list(zip(*sorted(data.items())))[1]

# Sanity check
for a, b in zip(data, gold):
    assert len(a[0]) == len(b[0])

embeddings = Reach.load("../../corpora/mimic_vecs_200_cbow.vec",
                        unk_word="UNK")

scores = {}

focus = experiment(data,
                   gold_chunks,
                   np.mean,
                   np.mean,
                   embeddings,
                   reciprocal,
                   0,
                   k=100,
                   use_focus=True)
full = experiment(data,
                  gold_chunks,
def noisychannel_ranking(self, detection_list, candidates_list):
    """
    An approximate implementation of the ranking method described in
    Lai et al. (2015)

    :param detection_list: list of misspellings
    :param candidates_list: list of candidate list per misspelling
    :return: list with corrections, or k-best corrections when self.k > 1
    """
    correction_list = []
    confidences = []

    print("Loading vector representations")
    r = Reach.load(self.pathtovectors, header=True)
    print("Done")

    for misspelling, candidates in zip(detection_list, candidates_list):
        # candidates = [candidate for candidate in candidates
        #               if candidate in r.words.keys()]
        score_list = []
        for candidate in candidates:
            # P(m|c): weighted orthographic and phonetic edit distance
            orthographic_edit_distance = damerau_levenshtein_distance(
                misspelling, candidate)
            phonetic_edit_distance = damerau_levenshtein_distance(
                dm(misspelling)[0], dm(candidate)[0])
            spell_score = (2 * orthographic_edit_distance +
                           phonetic_edit_distance) ** 2
            # P(c): corpus frequency, smoothed for unseen candidates
            try:
                frequency = self.frequency_dict[candidate]
            except KeyError:
                frequency = 1
            frequency_score = 1 / (1 + log(frequency))
            # P(c|m) = P(m|c) * P(c); lower scores are better
            score = spell_score * frequency_score
            score_list.append(score)
        score_list = np.array(score_list)

        # confidence: relative margin between the two best candidates
        if len(score_list) > 1:
            sorted_distances = [score_list[i]
                                for i in np.argsort(score_list)]
            top1 = sorted_distances[0]
            top2 = sorted_distances[1]
            confidence = abs(top1 - top2) / top1
            confidences.append(confidence)
        else:
            confidences.append(0)

        if self.k == 1:
            try:
                correction_list.append(candidates[np.argmin(score_list)])
            except ValueError:
                correction_list.append('')
        elif self.k > 1:
            correction_list.append(
                [candidates[i] for i in np.argsort(score_list)[:self.k]])
        else:
            raise ValueError('k must be a positive natural number')

    self.confidences = confidences

    return correction_list
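# A self-contained worked example of the candidate score used above. The
# imports are stand-ins (jellyfish and the Metaphone package); the original
# module's own import choices are not shown here, so treat them as
# assumptions.
from math import log

from jellyfish import damerau_levenshtein_distance
from metaphone import doublemetaphone as dm

def candidate_score(misspelling, candidate, frequency):
    ortho = damerau_levenshtein_distance(misspelling, candidate)
    phono = damerau_levenshtein_distance(dm(misspelling)[0],
                                         dm(candidate)[0])
    spell_score = (2 * ortho + phono) ** 2       # P(m|c): lower = likelier typo
    frequency_score = 1 / (1 + log(frequency))   # P(c): lower = more frequent
    return spell_score * frequency_score         # lower = better correction

# "spelling" (frequent, close) should comfortably beat "spieling" (rare).
print(candidate_score("speling", "spelling", 1200))
print(candidate_score("speling", "spieling", 3))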
import re

import tensorflow as tf
from reach import Reach

r = Reach.load('./tulkens-embeddings/160/sonar-160.txt', header=True)

objsize = 160
holisticsize = 10

#%%
# import list of verbs
verb = 'overleef'
print('Verb:', verb)

# create tensor for verb
verbtens = tf.Variable(tf.random_uniform([objsize, holisticsize], 0.0, 1.0))
inp = tf.placeholder(tf.float32, [objsize])
# project the object vector through the verb tensor; tf.tensordot replaces
# the original tf.matmul, which cannot multiply a matrix by a rank-1 vector
outp = tf.tensordot(verbtens, inp, axes=[[0], [0]])
sess = tf.Session()

# get VO-combinations list
combos = []
rowsfile = open('./cooccurrence/rows1.rows', 'r')
done = False
found = False
while done == False:
    line = rowsfile.readline()
    if line.startswith(verb):
        found = True
        combos.append(line)
    else:
        # assumed continuation: the rows file groups a verb's entries
        # together, so stop at the first non-matching line after the block
        if found or not line:
            done = True
if __name__ == "__main__": # Set this flag to true to replicate the perfect chunking setting # in experiment 3. perfect = True gold = json.load(open("data/test_gold.json")) gold = list(zip(*sorted(gold.items())))[1] if perfect: data = json.load(open("data/test_gold.json")) data = list(zip(*sorted(data.items())))[1] txt, gold_bio = zip(*gold) r = Reach.load("../../corpora/mimiciii-min5-neg3-w5-100.vec", unk_word="<UNK>") r_concept = Reach.load_fast_format(f"data/concept_vectors") concept_labels = json.load(open("data/names2label.json")) grouped = defaultdict(list) for k, v in concept_labels.items(): grouped[v].append(r_concept[k]) grouped.pop("np") memory = {} for k, v in tqdm(grouped.items()): km = KMeans(10) km.fit(v)
if __name__ == "__main__": import logging import time import json # Setup # logging.basicConfig(level=logging.INFO) umls = "sample_data/umls_sample.json" msh = "sample_data/abstracts_example.json" path_to_embeddings = "" use_subset = False # Be sure to set add_unk to True, or to mark the UNK index. embeddings = Reach.load(path_to_embeddings, unk_word="UNK") logging.info("loaded embeddings.") start = time.time() y = Yarn(embeddings) umls = json.load(open(umls)) msh = json.load(open(msh)) if use_subset: subset = [u'di', u'tat', u'erp',
            vec = embeddings.vectorize(desc, remove_oov=True)
            if not np.any(vec):
                continue
            concept.append(np.mean(vec, axis=0))
        except ValueError:
            pass
        if not concept:
            continue
        concept_names.append(name)
        vectors.append(np.array(concept).mean(axis=0))

    r = Reach(np.array(vectors), concept_names)
    return r


if __name__ == "__main__":

    path_to_embeddings = ""
    r_1 = Reach.load(path_to_embeddings, unk_word="UNK")
    concepts = json.load(open("data/all_concepts.json"))
    sty = json.load(open("data/concept_label.json"))
    r = create_concepts(concepts, r_1, include_np=True, labels=sty)
    r.save_fast_format("data/concept_vectors")
    # r.items is a dict of item -> index, so iterate it directly
    name2label = {k: sty[k.split("-")[0]] for k in r.items}
    json.dump(name2label, open("data/names2label.json", 'w'))
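# Quick follow-up check (a sketch, not in the original script): the saved
# concept space can be reloaded the same way the downstream scripts load it.
r_check = Reach.load_fast_format("data/concept_vectors")
print(len(name2label), "concept labels saved")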
def grid_search(devcorpus, candidates_list, language):
    """
    Conduct a grid search to find the best parameters for a corpus
    containing only in-vector-vocabulary corrections

    :param devcorpus: devcorpus generated with make_devcorpus.py
    :param candidates_list: list of candidate list per misspelling
    :param language: language from ["en", "nl"]
    :return: dictionary with parameter settings as keys and their correction
        accuracy as values
    """
    # default parameters
    parameters = {
        'comp_function': 'sum',
        'include_misspelling': False,
        'include_oov_candidates': False,
        'window_size': 6,
        'reciprocal': False,
        'remove_stopwords': False,
        'edit_distance': 1,
        'oov_penalty': 1.5,
        'ranking_method': 'context',
        'k-best': 1
    }
    dev = Development(parameters, language)

    print("Loading embeddings")
    r = Reach.load(dev.pathtovectors, header=True)
    print("Done")

    corrected_list = devcorpus[0]
    detection_list = devcorpus[1]
    detection_contexts = devcorpus[2]

    scores_dict = {}
    start_time = 0
    end_time = 0
    for comp_function in ["sum", "mult", "max"]:
        print("New run")
        run_time = end_time - start_time
        print("Last run took " + str(run_time) + " seconds")
        start_time = time.time()
        dev.comp_function = comp_function
        for include_misspelling in [True, False]:
            dev.include_misspelling = include_misspelling
            for window_size in range(11):
                dev.window_size = window_size
                for reciprocal in [True, False]:
                    dev.reciprocal = reciprocal
                    for remove_stopwords in [True, False]:
                        dev.remove_stopwords = remove_stopwords
                        for edit_distance in range(1, 5):
                            dev.edit_distance = edit_distance
                            correction_list = dev.ranking_experiment(
                                detection_list, detection_contexts,
                                candidates_list, r)
                            accuracy = len([
                                c for i, c in enumerate(correction_list)
                                if c == corrected_list[i]
                            ]) / len(correction_list)
                            parameters = (comp_function, include_misspelling,
                                          window_size, reciprocal,
                                          remove_stopwords, edit_distance)
                            scores_dict[parameters] = accuracy
        end_time = time.time()

    return scores_dict
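# Inspecting the outcome (a sketch, assuming devcorpus and candidates_list
# are already in scope): rank the parameter tuples by accuracy and print
# the top settings.
scores_dict = grid_search(devcorpus, candidates_list, "en")
top_settings = sorted(scores_dict.items(), key=lambda kv: kv[1], reverse=True)
for setting, accuracy in top_settings[:3]:
    print(setting, accuracy)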
# Set this flag to true to replicate the perfect chunking setting
# in experiment 3.
perfect = False

gold = json.load(open("data/test_gold.json"))
gold = list(zip(*sorted(gold.items())))[1]

if perfect:
    data = json.load(open("data/test_gold.json"))
else:
    data = json.load(open("data/test_uima.json"))
data = list(zip(*sorted(data.items())))[1]

txt, gold_bio = zip(*gold)
_, data_bio = zip(*data)

embeddings = Reach.load("", unk_word="UNK")
concept_reach = Reach.load_fast_format("data/concept_vectors")
concept_labels = json.load(open("data/concept_names2label.json"))

gold_bio = list(chain.from_iterable(gold_bio))

results_bio = {}

r_phrases = compose(data,
                    f1=np.mean,
                    f2=np.mean,
                    window=0,
                    embeddings=embeddings,
                    context_function=reciprocal)

pred_bio_focus = eval_extrinsic(list(chain.from_iterable(data_bio)),
if __name__ == "__main__": import logging import time import json # Setup # logging.basicConfig(level=logging.INFO) umls = "sample_data/umls_sample.json" msh = "sample_data/abstracts_example.json" path_to_embeddings = "" use_subset = False # Be sure to set add_unk to True, or to mark the UNK index. embeddings = Reach.load(path_to_embeddings, header=True, unk_word="UNK") logging.info("loaded embeddings.") start = time.time() y = Yarn(embeddings) umls = json.load(open(umls)) msh = json.load(open(msh)) if use_subset: subset = [ u'di', u'tat', u'erp', u'ori', u'crna', u'pep', u'de', u'hip', u'glycoside', u'sterilization', u'ra', u'don', u'ecg', u'cell',