Example No. 1
    def conduct_experiment(self, devcorpus, candidates_list):
        """
        Streamlines experiments with the various ranking modules
        :param devcorpus: devcorpus generated with make_devcorpus.py
        :param candidates_list: list of candidate list per misspelling
        :return: correction accuracy, list of corrections
        """

        corrected_list = devcorpus[0]
        detection_list = devcorpus[1]
        detection_contexts = devcorpus[2]

        self.corrected_list = corrected_list
        self.detection_list = detection_list
        self.detection_contexts = detection_contexts
        self.candidates_list = candidates_list

        if self.ranking_method == 'context':
            print("Loading embeddings")
            r = Reach.load(self.pathtovectors, header=True)
            print("Done")
            correction_list = self.ranking_experiment(detection_list,
                                                      detection_contexts,
                                                      candidates_list, r)
        elif self.ranking_method == 'noisy_channel':
            correction_list = self.noisychannel_ranking(
                detection_list, candidates_list)
        elif self.ranking_method == 'frequency':
            correction_list = self.frequency_baseline(detection_list,
                                                      candidates_list)
        elif self.ranking_method == 'ensemble':
            print("Loading embeddings")
            r = Reach.load(self.pathtovectors, header=True)
            print("Done")
            correction_list = self.ranking_experiment(detection_list,
                                                      detection_contexts,
                                                      candidates_list, r)
            correction_list_2 = self.noisychannel_ranking(
                detection_list, candidates_list)
            for i, confidence in enumerate(self.confidences):
                if confidence > 1.3:
                    correction_list[i] = correction_list_2[i]
        else:
            raise ValueError('No valid ranking method given')

        score = self.sub_sampling(correction_list, corrected_list)

        self.correction_list = correction_list
        self.score = score

        return score, correction_list
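
A minimal usage sketch for conduct_experiment, assuming the method lives on the Development class configured as in Examples No. 4 and 15; the devcorpus literals, candidate lists and context format below are illustrative only.

parameters = {
    'comp_function': 'sum', 'include_misspelling': False, 'include_oov_candidates': False,
    'window_size': 6, 'reciprocal': False, 'remove_stopwords': False,
    'edit_distance': 1, 'oov_penalty': 1.5, 'ranking_method': 'noisy_channel', 'k-best': 1,
}

devcorpus = [
    ["spelling"],                          # gold corrections
    ["speling"],                           # detected misspellings
    [(["a"] * 10, ["mistake"] * 10)],      # assumed context format: (left, right) token lists
]
candidates_list = [["spelling", "spieling", "spewing"]]

dev = Development(parameters, "en")        # defaults taken from Example No. 15
score, corrections = dev.conduct_experiment(devcorpus, candidates_list)
print(score, corrections)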
Example No. 2
    def __init__(self, detection_list, language, model, k, backoff,
                 pathtofrequencies, pathtomodel, pathtovectors):
        """
        :param detection_list: list with tuples containing (misspelling, list of 10 left context tokens, list of 10 right context tokens)
        :param language: 1 if English, 0 if Dutch
        :param model: 1 if context-sensitive, 0 if noisy channel
        :param k: number of ranked corrections returned
        :param backoff: backoff strategy to apply during correction
        :param pathtofrequencies: path to the JSON file with corpus frequencies
        :param pathtomodel: path to the trained fastText model
        :param pathtovectors: path to the word embeddings loaded with Reach
        """
        # prepare model
        print('Initializing spelling correction model...')
        assert len(detection_list[0]) == 3, 'Wrong input format'
        self.misspellings, self.left_contexts, self.right_contexts = zip(
            *detection_list)
        assert len(self.misspellings) == len(self.left_contexts) == len(
            self.right_contexts), 'Input data not properly synchronized'
        print(len(self.misspellings), 'misspellings to correct')
        self.ranking_model = model
        assert self.ranking_model in range(
            2), 'No valid correction model specified'
        assert k >= 1, 'No valid k specified'
        self.k = k
        self.backoff = backoff
        if language == 1:
            self.language = 'en'
        elif language == 0:
            self.language = 'nl'
        else:
            raise ValueError('No valid language input specified')

        # load embedding model and corpus frequencies
        with open(pathtofrequencies, 'r') as f:
            self.frequency_dict = json.load(f)
        self.model = fasttext.load_model(pathtomodel)
        self.r = Reach.load(pathtovectors, header=True)

        # set parameters for correction
        if self.language == "en":
            self.window_size = 9
            self.oov_penalty = 1.7
        elif self.language == "nl":
            self.window_size = 10
            self.oov_penalty = 2.4
        print('Model initialized')
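
A hedged construction sketch for the model above; the class name SpellingCorrector and all file paths are hypothetical placeholders, and only the argument shapes follow the __init__ signature.

detection_list = [
    ("speling", ["the"] * 10, ["mistake"] * 10),   # (misspelling, 10 left tokens, 10 right tokens)
]
corrector = SpellingCorrector(detection_list,
                              language=1,                   # English
                              model=1,                       # context-sensitive ranking
                              k=3,                           # return 3-best corrections
                              backoff="frequency",           # illustrative value
                              pathtofrequencies="frequencies_en.json",
                              pathtomodel="model_en.bin",
                              pathtovectors="vectors_en.vec")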
Example No. 3
    def frequency_baseline(self, detection_list, candidates_list):
        """
        Majority frequency baseline
        :param detection_list: list of misspellings
        :param candidates_list: list of candidate list per misspelling
        :return: list with corrections or k-best corrections
        """

        correction_list = []

        print("Loading vector representations")
        r = Reach.load(self.pathtovectors, header=True)
        print("Done")

        for misspelling, candidates in zip(detection_list, candidates_list):

            candidates = [
                candidate for candidate in candidates
                if candidate in self.frequency_dict
            ]

            frequencies = [
                self.frequency_dict[candidate] for candidate in candidates
            ]

            if self.k == 1:
                try:
                    correction_list.append(candidates[np.argmax(frequencies)])
                except ValueError:
                    correction_list.append('')
            elif self.k > 1:
                correction_list.append([
                    candidates[i]
                    for i in np.argsort(frequencies)[::-1][:self.k]
                ])
            else:
                raise ValueError('k must be positive natural number')

        return correction_list
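
The k-best branch above selects candidates through a reversed argsort; a small self-contained sketch with made-up frequencies:

import numpy as np

candidates = ["there", "their", "they're"]
frequencies = [120, 340, 55]       # illustrative corpus counts

k = 2
k_best = [candidates[i] for i in np.argsort(frequencies)[::-1][:k]]
print(k_best)                      # ['their', 'there']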
Example No. 4
    def tune_oov(devcorpus, candidates_list, best_parameters, language):
        """
        Conduct search for best oov penalty for corpus
        :param devcorpus: devcorpus generated with make_devcorpus.py
        :param candidates_list: list of candidate list per misspelling
        :param best_parameters: best parameters for the devcorpus
        :param language: language from ["en", "nl"]
        :return: dictionary with oov penalties as keys and their correction accuracy as values
        """

        dev = Development(best_parameters, language)

        print("Loading embeddings")
        r = Reach.load(dev.pathtovectors, header=True)
        print("Done")

        corrected_list = devcorpus[0]
        detection_list = devcorpus[1]
        detection_contexts = devcorpus[2]

        scores_dict = {}

        values = [value / 10 for value in range(30)]

        for value in values:
            dev.oov_penalty = value
            correction_list = dev.ranking_experiment(detection_list,
                                                     detection_contexts,
                                                     candidates_list, r)
            accuracy = len([
                c for i, c in enumerate(correction_list)
                if c == corrected_list[i]
            ]) / len(correction_list)
            scores_dict[value] = accuracy

        return scores_dict
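
A short sketch of consuming the returned dictionary to select the best oov penalty, assuming tune_oov is callable directly (e.g. as a static method) and that devcorpus, candidates_list and best_parameters are already available:

scores_dict = tune_oov(devcorpus, candidates_list, best_parameters, "en")
best_penalty, best_accuracy = max(scores_dict.items(), key=lambda kv: kv[1])
print(best_penalty, best_accuracy)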
Example No. 5
    parsed_train = json.load(open("data/partners_uima.json"))
    parsed_train = list(zip(*sorted(parsed_train.items())))[1]

    gold_train = json.load(open("data/partners_gold.json"))
    gold_train = list(zip(*sorted(gold_train.items())))[1]

    parsed_test = json.load(open("data/beth_uima.json"))
    parsed_test = list(zip(*sorted(parsed_test.items())))[1]

    gold_test = json.load(open("data/beth_gold.json"))
    gold_test = list(zip(*sorted(gold_test.items())))[1]

    txt, gold_chunks_train = zip(*gold_train)
    _, gold_chunks_test = zip(*gold_test)

    embeddings = Reach.load("")

    for a, b in zip(parsed_train, gold_train):
        assert len(a[0]) == len(b[0])

    for a, b in zip(parsed_test, gold_test):
        assert len(a[0]) == len(b[0])

    knn_focus = experiment(parsed_train,
                           gold_chunks_train,
                           parsed_test,
                           gold_chunks_test,
                           np.mean,
                           np.mean,
                           embeddings,
                           reciprocal,
Example No. 6
import json

from cat.simple import get_scores, rbf_attention
from cat.dataset import restaurants_train
from reach import Reach
from sklearn.metrics import precision_recall_fscore_support
from collections import defaultdict, Counter


GAMMA = .03
BEST_ATT = {"n_noun": 980}
BEST_RBF = {"n_noun": 200}

if __name__ == "__main__":

    scores = defaultdict(dict)
    r = Reach.load("embeddings/restaurant_vecs_w2v.vec",
                   unk_word="<UNK>")

    att = rbf_attention
    datums = list(restaurants_train())

    d = json.load(open("data/nouns_restaurant.json"))
    nouns = Counter()
    for k, v in d.items():
        if k.lower() in r.items:
            nouns[k.lower()] += v

    if att == rbf_attention:
        r.vectors[r.items["<UNK>"]] = r.vectors.max()

    if att == rbf_attention:
        candidates, _ = zip(*nouns.most_common(BEST_RBF["n_noun"]))
Example No. 7
"""Test with word embeddings."""
from reach import Reach
from plate.plate import circular_convolution, decode

if __name__ == "__main__":

    r = Reach.load("PATH_TO_EMBEDDINGS")

    # Encode "dog chase cat"
    a = circular_convolution(r["subject"], r["dog"])
    b = circular_convolution(r["verb"], r["chase"])
    c = circular_convolution(r["object"], r["cat"])

    sentence = a + b + c
    vec = decode(r["subject"], sentence)
    result = r.nearest_neighbor(vec)

    # The top result should be dog
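
The plate.plate helpers above presumably implement Plate-style holographic reduced representations; under that assumption, this standalone numpy sketch shows the binding (circular convolution) and approximate unbinding (circular correlation) they stand for.

import numpy as np

def cconv(a, b):
    # circular convolution (binding) via the FFT
    return np.real(np.fft.ifft(np.fft.fft(a) * np.fft.fft(b)))

def ccorr(a, b):
    # circular correlation, the approximate inverse used for decoding
    return np.real(np.fft.ifft(np.conj(np.fft.fft(a)) * np.fft.fft(b)))

rng = np.random.default_rng(0)
role, filler = rng.normal(size=512), rng.normal(size=512)
bound = cconv(role, filler)
recovered = ccorr(role, bound)
# recovered is a noisy copy of filler; a nearest-neighbor search over the
# vocabulary (as r.nearest_neighbor does above) cleans it up
print(np.corrcoef(recovered, filler)[0, 1])   # well above chance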
Example No. 8
import json

from cat.simple import get_scores, rbf_attention
from reach import Reach
from collections import defaultdict

GAMMA = .03
N_ASPECT_WORDS = 200

if __name__ == "__main__":

    scores = defaultdict(dict)
    r = Reach.load("embeddings/my_word_vectors.vec", unk_word="<UNK>")

    aspects = [[x] for x in json.load(open("data/aspect_words.json"))]
    aspects = aspects[:N_ASPECT_WORDS]

    instances = ["text_1".split(), "text_2".split()]
    label_set = {"label1", "label2", "label3"}

    s = get_scores(instances,
                   aspects,
                   r,
                   label_set,
                   gamma=GAMMA,
                   remove_oov=False,
                   attention_func=rbf_attention)

    pred = s.argmax(1)
Example No. 9
    scores = {}

    gold = json.load(open("data/beth_gold.json"))
    gold = list(zip(*sorted(gold.items())))[1]

    txt, gold_chunks = zip(*gold)

    data = json.load(open("data/beth_uima.json"))
    data = list(zip(*sorted(data.items())))[1]

    # Sanity check
    for a, b in zip(data, gold):
        assert len(a[0]) == len(b[0])

    embeddings = Reach.load("../../corpora/mimic_vecs_200_cbow.vec",
                            unk_word="UNK")

    scores = {}

    focus = experiment(data,
                       gold_chunks,
                       np.mean,
                       np.mean,
                       embeddings,
                       reciprocal,
                       0,
                       k=100,
                       use_focus=True)

    full = experiment(data,
                      gold_chunks,
Example No. 10
    def noisychannel_ranking(self, detection_list, candidates_list):
        """
        An approximate implementation of the ranking method described in (Lai et al. 2015)
        :param detection_list: list of misspellings
        :param candidates_list: list of candidate list per misspelling
        :return: list with corrections (k == 1) or k-best corrections (k > 1);
            corpus frequencies are taken from self.frequency_dict
        """

        correction_list = []
        confidences = []

        print("Loading vector representations")
        r = Reach.load(self.pathtovectors, header=True)
        print("Done")

        for misspelling, candidates in zip(detection_list, candidates_list):

            # candidates = [candidate for candidate in candidates if candidate in r.words.keys()]

            score_list = []

            for candidate in candidates:
                orthographic_edit_distance = damerau_levenshtein_distance(
                    misspelling, candidate)
                phonetic_edit_distance = damerau_levenshtein_distance(
                    dm(misspelling)[0],
                    dm(candidate)[0])

                spell_score = (2 * orthographic_edit_distance +
                               phonetic_edit_distance)**2  # P(m|c)

                try:
                    frequency = self.frequency_dict[candidate]
                except KeyError:
                    frequency = 1

                frequency_score = 1 / (1 + log(frequency))  # P(c)

                score = spell_score * frequency_score  # P(c|m) = P(m|c)*P(c)
                score_list.append(score)

            score_list = np.array(score_list)

            if len(score_list) > 1:
                sorted_distances = [
                    score_list[i] for i in np.argsort(score_list)
                ]
                top1 = sorted_distances[0]
                top2 = sorted_distances[1]
                # guard against a zero best score (e.g. a candidate identical to the misspelling)
                confidence = abs(top1 - top2) / top1 if top1 else 0.0
                confidences.append(confidence)
            else:
                confidences.append(0)

            if self.k == 1:
                try:
                    correction_list.append(candidates[np.argmin(score_list)])
                except ValueError:
                    correction_list.append('')
            elif self.k > 1:
                correction_list.append(
                    [candidates[i] for i in np.argsort(score_list)[:self.k]])
            else:
                raise ValueError('k must be positive natural number')

        self.confidences = confidences

        return correction_list
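
A worked instance of the score combination above, with hand-picked stand-ins for the edit distances and corpus frequency; lower scores rank higher.

from math import log

orthographic_edit_distance = 1     # e.g. damerau_levenshtein_distance("speling", "spelling")
phonetic_edit_distance = 0         # identical Double Metaphone codes
frequency = 1500                   # illustrative corpus frequency of "spelling"

spell_score = (2 * orthographic_edit_distance + phonetic_edit_distance) ** 2   # ~ P(m|c)
frequency_score = 1 / (1 + log(frequency))                                     # ~ P(c)
score = spell_score * frequency_score                                          # ~ P(c|m)
print(round(score, 2))             # 0.48: frequent, orthographically close candidates score lowest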
Example No. 11
import tensorflow as tf
import re
from reach import Reach
r = Reach.load('./tulkens-embeddings/160/sonar-160.txt', header=True)

objsize = 160
holisticsize = 10

#%%

# import list of verbs
verb = 'overleef'
print('Verb:', verb)

# create tensor for verb
verbtens = tf.Variable(tf.random_uniform([objsize, holisticsize], 0.0, 1.0))
inp = tf.placeholder(tf.float32, [objsize])
# project the 160-dim object vector through the verb tensor to a 10-dim holistic vector
outp = tf.matmul(tf.reshape(inp, [1, objsize]), verbtens)
sess = tf.Session()

# get VO-combinations list
combos = []
rowsfile = open('./cooccurrence/rows1.rows', 'r')
done = False
found = False
while not done:
    line = rowsfile.readline()
    if line.startswith(verb):
        found = True
        combos.append(line)
    else:
Example No. 12
if __name__ == "__main__":

    # Set this flag to true to replicate the perfect chunking setting
    # in experiment 3.
    perfect = True

    gold = json.load(open("data/test_gold.json"))
    gold = list(zip(*sorted(gold.items())))[1]

    if perfect:
        data = json.load(open("data/test_gold.json"))
    else:
        data = json.load(open("data/test_uima.json"))
    data = list(zip(*sorted(data.items())))[1]

    txt, gold_bio = zip(*gold)
    r = Reach.load("../../corpora/mimiciii-min5-neg3-w5-100.vec",
                   unk_word="<UNK>")

    r_concept = Reach.load_fast_format("data/concept_vectors")
    concept_labels = json.load(open("data/names2label.json"))

    grouped = defaultdict(list)
    for k, v in concept_labels.items():
        grouped[v].append(r_concept[k])

    grouped.pop("np")

    memory = {}
    for k, v in tqdm(grouped.items()):

        km = KMeans(10)
        km.fit(v)
Example No. 13
if __name__ == "__main__":

    import logging
    import time
    import json

    # Setup
    # logging.basicConfig(level=logging.INFO)

    umls = "sample_data/umls_sample.json"
    msh = "sample_data/abstracts_example.json"
    path_to_embeddings = ""
    use_subset = False

    # Be sure to set add_unk to True, or to mark the UNK index.
    embeddings = Reach.load(path_to_embeddings, unk_word="UNK")

    logging.info("loaded embeddings.")

    start = time.time()

    y = Yarn(embeddings)

    umls = json.load(open(umls))
    msh = json.load(open(msh))

    if use_subset:

        subset = [u'di',
                  u'tat',
                  u'erp',
Example No. 14
                vec = embeddings.vectorize(desc, remove_oov=True)
                if not np.any(vec):
                    continue
                concept.append(np.mean(vec, axis=0))
            except ValueError:
                pass

        if not concept:
            continue

        concept_names.append(name)
        vectors.append(np.array(concept).mean(axis=0))

    r = Reach(np.array(vectors), concept_names)

    return r


if __name__ == "__main__":

    path_to_embeddings = ""
    r_1 = Reach.load(path_to_embeddings, unk_word="UNK")

    concepts = json.load(open("data/all_concepts.json"))
    sty = json.load(open("data/concept_label.json"))
    r = create_concepts(concepts, r_1, include_np=True, labels=sty)
    r.save_fast_format("data/concept_vectors")

    name2label = {k: sty[k.split("-")[0]] for k in r.items}
    json.dump(name2label, open("data/names2label.json", 'w'))
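
The artifacts written above are read back elsewhere (see Example No. 12); a minimal reload sketch:

import json
from reach import Reach

r_concept = Reach.load_fast_format("data/concept_vectors")
name2label = json.load(open("data/names2label.json"))
print(len(name2label), "labelled concept names")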
Example No. 15
    def grid_search(devcorpus, candidates_list, language):
        """
        Conduct grid search to find best parameters for a corpus containing only in-vector-vocabulary corrections
        :param devcorpus: devcorpus generated with make_devcorpus.py
        :param candidates_list: list of candidate list per misspelling
        :param language: language from ["en", "nl"]
        :return: dictionary with parameter settings as keys and their correction accuracy as values
        """

        # default parameters
        parameters = {
            'comp_function': 'sum',
            'include_misspelling': False,
            'include_oov_candidates': False,
            'window_size': 6,
            'reciprocal': False,
            'remove_stopwords': False,
            'edit_distance': 1,
            'oov_penalty': 1.5,
            'ranking_method': 'context',
            'k-best': 1
        }

        dev = Development(parameters, language)

        print("Loading embeddings")
        r = Reach.load(dev.pathtovectors, header=True)
        print("Done")

        corrected_list = devcorpus[0]
        detection_list = devcorpus[1]
        detection_contexts = devcorpus[2]

        scores_dict = {}

        start_time = 0
        end_time = 0
        for comp_function in ["sum", "mult", "max"]:
            print("New run")
            run_time = end_time - start_time
            print("Last run took " + str(run_time) + " seconds")
            start_time = time.time()
            dev.comp_function = comp_function
            for include_misspelling in [True, False]:
                dev.include_misspelling = include_misspelling
                for window_size in range(11):
                    dev.window_size = window_size
                    for reciprocal in [True, False]:
                        dev.reciprocal = reciprocal
                        for remove_stopwords in [True, False]:
                            dev.remove_stopwords = remove_stopwords
                            for edit_distance in range(1, 5):
                                dev.edit_distance = edit_distance
                                correction_list = dev.ranking_experiment(
                                    detection_list, detection_contexts,
                                    candidates_list, r)
                                accuracy = len([
                                    c for i, c in enumerate(correction_list)
                                    if c == corrected_list[i]
                                ]) / len(correction_list)
                                parameters = (comp_function,
                                              include_misspelling, window_size,
                                              reciprocal, remove_stopwords,
                                              edit_distance)
                                scores_dict[parameters] = accuracy

            end_time = time.time()

        return scores_dict
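
A short sketch of consuming the grid search output, assuming grid_search is callable directly and that devcorpus and candidates_list were produced with make_devcorpus.py:

scores_dict = grid_search(devcorpus, candidates_list, "en")
best_setting, best_accuracy = max(scores_dict.items(), key=lambda kv: kv[1])
(comp_function, include_misspelling, window_size,
 reciprocal, remove_stopwords, edit_distance) = best_setting
print(best_setting, best_accuracy)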
Example No. 16
    # Set this flag to true to replicate the perfect chunking setting
    # in experiment 3.
    perfect = False

    gold = json.load(open("data/test_gold.json"))
    gold = list(zip(*sorted(gold.items())))[1]

    if perfect:
        data = json.load(open("data/test_gold.json"))
    else:
        data = json.load(open("data/test_uima.json"))
    data = list(zip(*sorted(data.items())))[1]

    txt, gold_bio = zip(*gold)
    _, data_bio = zip(*data)

    embeddings = Reach.load("", unk_word="UNK")
    concept_reach = Reach.load_fast_format("data/concept_vectors")
    concept_labels = json.load(open("data/concept_names2label.json"))

    gold_bio = list(chain.from_iterable(gold_bio))

    results_bio = {}

    r_phrases = compose(data,
                        f1=np.mean,
                        f2=np.mean,
                        window=0,
                        embeddings=embeddings,
                        context_function=reciprocal)

    pred_bio_focus = eval_extrinsic(list(chain.from_iterable(data_bio)),
Example No. 17
if __name__ == "__main__":

    import logging
    import time
    import json

    # Setup
    # logging.basicConfig(level=logging.INFO)

    umls = "sample_data/umls_sample.json"
    msh = "sample_data/abstracts_example.json"
    path_to_embeddings = ""
    use_subset = False

    # Be sure to set add_unk to True, or to mark the UNK index.
    embeddings = Reach.load(path_to_embeddings, header=True, unk_word="UNK")

    logging.info("loaded embeddings.")

    start = time.time()

    y = Yarn(embeddings)

    umls = json.load(open(umls))
    msh = json.load(open(msh))

    if use_subset:

        subset = [
            u'di', u'tat', u'erp', u'ori', u'crna', u'pep', u'de', u'hip',
            u'glycoside', u'sterilization', u'ra', u'don', u'ecg', u'cell',