Example #1
File: yarn.py Project: clips/yarn
def compute_nearest_neighbours(definitions, abstracts):
    """
    Compute nearest neighbours from abstracts to definitions.

    Parameters
    ----------
    definitions : dictionary of dictionaries
        A dictionary of dictionaries containing vectors.
        The top key is the ambiguous term, the bottom key is the CUI.

            Example: {AMBIGTERM: {CUI1: VECTOR, CUI2: VECTOR}}
    abstracts : dictionary of dictionaries
        Like definitions.

    Returns
    -------
    result : dict
        A dictionary whose keys are the ambiguous terms and whose values
        are lists of tuples. The first item of each tuple is the true class,
        the second item is the predicted class.

        Example: {AMBIGTERM1: [(y1, y_pred1), (y2, y_pred2), ...]}

    """
    output = {}

    for k, v in abstracts.items():

        results = []

        labels, vectors = dict_to_tuple(v)

        try:
            targets, matrix = dict_to_tuple(definitions[k])
        except KeyError:
            continue
        matrix = Reach.normalize(np.asarray(matrix))
        vectors = Reach.normalize(np.asarray(vectors))

        for vec in vectors:

            # Negated dot product: the lowest value is the most similar definition.
            result = -vec.dot(matrix.T)
            results.append(targets[np.argsort(result)[0]])

        output[k] = list(zip(labels, results))

    return output
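
A minimal usage sketch for compute_nearest_neighbours, mirroring the dictionary format described in the docstring; the term, CUIs, and random vectors below are placeholders, not data from the project.

import numpy as np

dim = 50
definitions = {"AMBIGTERM": {"CUI1": np.random.rand(dim), "CUI2": np.random.rand(dim)}}
abstracts = {"AMBIGTERM": {"CUI1": np.random.rand(dim), "CUI2": np.random.rand(dim)}}

predictions = compute_nearest_neighbours(definitions, abstracts)
# predictions["AMBIGTERM"] is a list of (true CUI, predicted CUI) tuples.
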
Example #2
    def extract_online_lstm_embeddings(self,
                                       prune=False,
                                       normalize=True,
                                       verbose=False,
                                       provided_names=(),
                                       preprocess=False):

        self.model.eval()

        if provided_names:
            input_items = provided_names
            if preprocess:
                input_items = [self.preprocess(name) for name in input_items]
        else:
            embeddings = deepcopy(self.sampling.pretrained_name_embeddings)
            if prune:
                names_to_prune = set(
                    self.sampling.exemplar_to_concept.keys()).union(
                        self.sampling.validation_references.keys())
                embeddings.prune(names_to_prune)
            input_items = [x for _, x in sorted(embeddings.indices.items())]

        # batch input items to save memory
        all_embeddings = []
        batch_size = 500 if self.hidden_size >= 9600 else 1000
        for i in tqdm(range(0, len(input_items), batch_size),
                      disable=not verbose):
            input_batch = input_items[i:i + batch_size]
            input_vectors = []
            for item in input_batch:
                vector = self.sampling.vectorize_string(item, norm=normalize)
                input_vectors.append(torch.FloatTensor(vector).to(self.device))

            # pass through LSTM network
            lstm_embeddings = self.forward_lstm(input_vectors)
            online_batch = lstm_embeddings.detach().cpu().numpy()

            # add batch
            all_embeddings.append(online_batch)

        # stack all batches into a single embedding matrix
        all_embeddings = np.concatenate(all_embeddings)

        online_embeddings = Reach(all_embeddings, input_items)

        return online_embeddings
    def __init__(self, detection_list, language, model, k, backoff,
                 pathtofrequencies, pathtomodel, pathtovectors):
        """
        :param detection_list: list with tuples containing (misspelling, list of 10 left context tokens, list of 10 right context tokens)
        :param language: 1 if English, 0 if Dutch
        :param model: 1 if context-sensitive, 0 if noisy channel
        :param k: number of ranked corrections returned
        """
        # prepare model
        print('Initializing spelling correction model...')
        assert len(detection_list[0]) == 3, 'Wrong input format'
        self.misspellings, self.left_contexts, self.right_contexts = zip(
            *detection_list)
        assert len(self.misspellings) == len(self.left_contexts) == len(
            self.right_contexts), 'Input data not properly synchronized'
        print(len(self.misspellings), 'misspellings to correct')
        self.ranking_model = model
        assert self.ranking_model in range(
            2), 'No valid correction model specified'
        assert k >= 1, 'No valid k specified'
        self.k = k
        self.backoff = backoff
        if language == 1:
            self.language = 'en'
        elif language == 0:
            self.language = 'nl'
        else:
            raise ValueError('No valid language input specified')

        # load embedding model and corpus frequencies
        with open(pathtofrequencies, 'r') as f:
            self.frequency_dict = json.load(f)
        self.model = fasttext.load_model(pathtomodel)
        self.r = Reach.load(pathtovectors, header=True)

        # set parameters for correction
        if self.language == "en":
            self.window_size = 9
            self.oov_penalty = 1.7
        elif self.language == "nl":
            self.window_size = 10
            self.oov_penalty = 2.4
        print('Model initialized')
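
A hypothetical instantiation sketch for the constructor above; the class name, paths, and context tokens are illustrative assumptions, not taken from the repository.

detection_list = [
    # (misspelling, left context tokens, right context tokens)
    ("hopsital", ["was", "taken", "to", "the"], ["for", "emergency", "surgery"]),
]
corrector = SpellingCorrector(      # assumed class name
    detection_list,
    language=1,                     # 1 = English, 0 = Dutch
    model=1,                        # 1 = context-sensitive, 0 = noisy channel
    k=3,                            # return the 3 highest-ranked corrections
    backoff=True,
    pathtofrequencies="frequencies.json",
    pathtomodel="fasttext_model.bin",
    pathtovectors="vectors.vec",
)
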
Example #4
    def get_grouped_questions(self, trainSet, simThreshold):

        # {id: [list of similar questions, where each item is a list of covered tokens in the question]}
        grouped_questions = defaultdict(list)
        questions_type = defaultdict(lambda: defaultdict(int))
        grouped_questions_cat = defaultdict(set)

        for d in trainSet:
            cur_segment = self.segmenter.segment(d.getTextObject())
            for qap in cur_segment:
                qid = len(grouped_questions.keys())
                cur_q_tokens = d.getTextObject().get_covered_tokens(
                    qap.begQue, qap.endQue)

                if any(cur_q_tokens in val
                       for val in grouped_questions.values()):
                    continue
                qVec = Resources.getWordVectors().vectorize(cur_q_tokens,
                                                            remove_oov=True)
                if not qVec:
                    continue
                norm_q_vec = Reach.normalize(np.mean(qVec, axis=0))

                k = self.get_grouped_qid(norm_q_vec, grouped_questions,
                                         simThreshold)
                if k is not None:
                    qid = k

                grouped_questions[qid].append(cur_q_tokens)
                ansType, cat = self.get_ans_type(qap.answers)

                if not ansType:
                    continue

                questions_type[qid][ansType] += 1

                if cat:
                    grouped_questions_cat[qid].add(cat)

        return (grouped_questions, questions_type, grouped_questions_cat)
Example #5
def create_concepts(concepts, embeddings, include_np=True, labels=None):
    """Create concepts by summing over descriptions in embedding spaces."""
    # Gold standard labels for concepts:
    concept_names = []
    vectors = []

    for name, descriptions in tqdm(list(concepts.items())):

        label = None
        if labels is not None:
            try:
                label = labels[name]
            except KeyError:
                continue

        if not include_np and label == "np":
            continue

        concept = []

        for idx, desc in enumerate(descriptions):

            try:
                desc = desc.lower().split()
                # desc = [x for x in desc if x not in STOP_WORDS]
                vec = embeddings.vectorize(desc, remove_oov=True)
                if not np.any(vec):
                    continue
                concept.append(np.mean(vec, axis=0))
            except ValueError:
                pass

        if not concept:
            continue

        concept_names.append(name)
        vectors.append(np.array(concept).mean(axis=0))

    r = Reach(np.array(vectors), concept_names)

    return r
Example #6
    def extract_online_dan_embeddings(self,
                                      prune=False,
                                      normalize=True,
                                      verbose=False,
                                      provided_names=(),
                                      preprocess=False):

        self.model.eval()

        if provided_names:
            input_items = provided_names
            if preprocess:
                input_items = [self.preprocess(name) for name in input_items]
            embeddings = self.sampling.create_reach_object(input_items)
        else:
            embeddings = deepcopy(self.sampling.pretrained_name_embeddings)
        if prune:
            names_to_prune = set(
                self.sampling.exemplar_to_concept.keys()).union(
                    self.sampling.validation_references.keys())
            embeddings.prune(names_to_prune)

        input_vectors = embeddings.norm_vectors if normalize else embeddings.vectors
        input_items = [x for _, x in sorted(embeddings.indices.items())]

        # batch input items to save memory
        all_embeddings = []
        batch_size = 1000
        for i in tqdm(range(0, len(input_items), batch_size),
                      disable=not verbose):
            input_batch = input_vectors[i:i + batch_size]
            input_tensor = torch.FloatTensor(input_batch).to(self.device)
            online_batch = self.model(input_tensor).detach().cpu().numpy()
            all_embeddings.append(online_batch)
        all_embeddings = np.concatenate(all_embeddings)

        online_embeddings = Reach(all_embeddings, input_items)

        return online_embeddings
Example #7
    def frequency_baseline(self, detection_list, candidates_list):
        """
        Majority frequency baseline: pick the most frequent candidate for each misspelling.
        :param detection_list: list of misspellings
        :param candidates_list: list of candidate lists, one per misspelling
        :return: list with corrections or k-best corrections
        """

        correction_list = []

        print("Loading vector representations")
        r = Reach.load(self.pathtovectors, header=True)
        print("Done")

        for misspelling, candidates in zip(detection_list, candidates_list):

            candidates = [
                candidate for candidate in candidates
                if candidate in self.frequency_dict.keys()
            ]

            frequencies = [
                self.frequency_dict[candidate] for candidate in candidates
            ]

            if self.k == 1:
                try:
                    correction_list.append(candidates[np.argmax(frequencies)])
                except ValueError:
                    correction_list.append('')
            elif self.k > 1:
                correction_list.append([
                    candidates[i]
                    for i in np.argsort(frequencies)[::-1][:self.k]
                ])
            else:
                raise ValueError('k must be a positive integer')

        return correction_list
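
The core of the frequency baseline for a single misspelling, with hypothetical candidates and counts; this is a sketch of the selection logic only, not a call into the class above.

import numpy as np

frequency_dict = {"the": 120000, "ten": 9000, "tech": 4500}
candidates = ["the", "ten", "tech", "teh"]

candidates = [c for c in candidates if c in frequency_dict]
frequencies = [frequency_dict[c] for c in candidates]

best = candidates[np.argmax(frequencies)]                            # k == 1: "the"
top_2 = [candidates[i] for i in np.argsort(frequencies)[::-1][:2]]   # k == 2: ["the", "ten"]
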
    def create_cluster_prototypes(self,
                                  provided_embeddings=None,
                                  total=False,
                                  pretrained=True):

        if provided_embeddings is not None:
            embeddings = provided_embeddings
        else:
            if pretrained:
                embeddings = self.pretrained_name_embeddings
            else:
                embeddings = self.extract_online_dan_embeddings(prune=False)

        clusters = self.clusters if total else self.training_clusters

        print('Creating cluster prototypes...')
        cluster_prototypes = {}
        for label, strings in clusters.items():
            strings = set(strings).intersection(self.training_names)
            cluster_prototypes[label] = self.create_prototype(
                strings, embeddings)
        items, vectors = zip(*cluster_prototypes.items())
        self.cluster_prototypes = Reach(vectors, items)
Example #9
    def tune_oov(devcorpus, candidates_list, best_parameters, language):
        """
        Conduct a search for the best OOV penalty for the corpus
        :param devcorpus: devcorpus generated with make_devcorpus.py
        :param candidates_list: list of candidate lists, one per misspelling
        :param best_parameters: best parameters for the devcorpus
        :param language: language from ["en", "nl"]
        :return: dictionary with OOV penalties as keys and their correction accuracy as values
        """

        dev = Development(best_parameters, language)

        print("Loading embeddings")
        r = Reach.load(dev.pathtovectors, header=True)
        print("Done")

        corrected_list = devcorpus[0]
        detection_list = devcorpus[1]
        detection_contexts = devcorpus[2]

        scores_dict = {}

        values = list(range(30))
        values = [value / 10 for value in values]

        for value in values:
            dev.oov_penalty = value
            correction_list = dev.ranking_experiment(detection_list,
                                                     detection_contexts,
                                                     candidates_list, r)
            accuracy = len([
                c for i, c in enumerate(correction_list)
                if c == corrected_list[i]
            ]) / len(correction_list)
            scores_dict[value] = accuracy

        return scores_dict
Example #10
if __name__ == "__main__":

    import logging
    import time
    import json

    # Setup
    # logging.basicConfig(level=logging.INFO)

    umls = "sample_data/umls_sample.json"
    msh = "sample_data/abstracts_example.json"
    path_to_embeddings = ""
    use_subset = False

    # Be sure to pass unk_word (or otherwise mark the UNK index).
    embeddings = Reach.load(path_to_embeddings, header=True, unk_word="UNK")

    logging.info("loaded embeddings.")

    start = time.time()

    y = Yarn(embeddings)

    umls = json.load(open(umls))
    msh = json.load(open(msh))

    if use_subset:

        subset = [
            u'di', u'tat', u'erp', u'ori', u'crna', u'pep', u'de', u'hip',
            u'glycoside', u'sterilization', u'ra', u'don', u'ecg', u'cell',
# stores a list of row numbers and argument strings per verb.

import re
import numpy as np
from reach import Reach
import transformargs

#pathnames
rowspath = './cooccurrence/weighted_sm.rows'
embeddingspath = './tulkens-embeddings/160/sonar-160.txt'
logpath = './failedwords.txt'
exportpath = './verbtrainingindex2'

#import data
rowsfile = open(rowspath, 'r', encoding='utf-8')
r = Reach(embeddingspath, header=True)
#holmatrix = np.load(holmatrixpath)

#load output file
log = open(logpath, 'w', encoding='utf-8')

control = np.zeros(160)
failedcount = 0
rowcount = 590408
t = transformargs.Transformer()
verbarray = np.array(
    ['', np.array([np.array([0, ''], object)], object)], object
)  #will contain line indexes and corresponding argument strings for each verb
#dummy first row added to show structure
verbindex = 0
oldkey = ''
Example #12
"""Test with word embeddings."""
from reach import Reach
from plate.plate import circular_convolution, decode

if __name__ == "__main__":

    r = Reach.load("PATH_TO_EMBEDDINGS")

    # Encode "dog chase cat"
    a = circular_convolution(r["subject"], r["dog"])
    b = circular_convolution(r["verb"], r["chase"])
    c = circular_convolution(r["object"], r["cat"])

    sentence = a + b + c
    vec = decode(r["subject"], sentence)
    result = r.nearest_neighbor(vec)

    # The top result should be dog
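
For reference, a minimal NumPy sketch of how circular-convolution binding and decoding are commonly implemented in holographic reduced representations; the actual plate.plate functions may differ in detail.

import numpy as np

def circular_convolution(a, b):
    # Bind two vectors via circular convolution, computed with the FFT.
    return np.real(np.fft.ifft(np.fft.fft(a) * np.fft.fft(b)))

def decode(role, sentence):
    # Unbind by convolving the sentence with the approximate inverse of the role
    # (the role vector with all elements after the first reversed).
    inverse = np.concatenate(([role[0]], role[1:][::-1]))
    return circular_convolution(inverse, sentence)
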
Example #13
    scores = {}

    gold = json.load(open("data/beth_gold.json"))
    gold = list(zip(*sorted(gold.items())))[1]

    txt, gold_chunks = zip(*gold)

    data = json.load(open("data/beth_uima.json"))
    data = list(zip(*sorted(data.items())))[1]

    # Sanity check
    for a, b in zip(data, gold):
        assert len(a[0]) == len(b[0])

    embeddings = Reach.load("../../corpora/mimic_vecs_200_cbow.vec",
                            unk_word="UNK")

    scores = {}

    focus = experiment(data,
                       gold_chunks,
                       np.mean,
                       np.mean,
                       embeddings,
                       reciprocal,
                       0,
                       k=100,
                       use_focus=True)

    full = experiment(data,
                      gold_chunks,
Example #14
if __name__ == "__main__":

    # Set this flag to true to replicate the perfect chunking setting
    # in experiment 3.
    perfect = True

    gold = json.load(open("data/test_gold.json"))
    gold = list(zip(*sorted(gold.items())))[1]

    if perfect:
        data = json.load(open("data/test_gold.json"))
    data = list(zip(*sorted(data.items())))[1]

    txt, gold_bio = zip(*gold)
    r = Reach.load("../../corpora/mimiciii-min5-neg3-w5-100.vec",
                   unk_word="<UNK>")

    r_concept = Reach.load_fast_format("data/concept_vectors")
    concept_labels = json.load(open("data/names2label.json"))

    grouped = defaultdict(list)
    for k, v in concept_labels.items():
        grouped[v].append(r_concept[k])

    grouped.pop("np")

    memory = {}
    for k, v in tqdm(grouped.items()):

        km = KMeans(10)
        km.fit(v)
Example #15
import json

from cat.simple import get_scores, rbf_attention
from reach import Reach
from collections import defaultdict

GAMMA = .03
N_ASPECT_WORDS = 200

if __name__ == "__main__":

    scores = defaultdict(dict)
    r = Reach.load("embeddings/my_word_vectors.vec", unk_word="<UNK>")

    aspects = [[x] for x in json.load(open("data/aspect_words.json"))]
    aspects = aspects[:N_ASPECT_WORDS]

    instances = ["text_1".split(), "text_2".split()]
    label_set = {"label1", "label2", "label3"}

    s = get_scores(instances,
                   aspects,
                   r,
                   label_set,
                   gamma=GAMMA,
                   remove_oov=False,
                   attention_func=rbf_attention)

    pred = s.argmax(1)
    def load_test_vectors(self, embeddings_infile):
        # load vectors
        print('Loading vectors...')
        self.test_vectors = Reach.load_fast_format(embeddings_infile)
Example #17
    def __init__(self, river, len_dang_arcs, fast):
        # Initialize the variables
        self.array_skeleton = []
        self.row = 0
        self.col = 0
        self.length_dangling_arcs = len_dang_arcs
        self.array_Junction = []
        self.array_done = []
        self.list_Junction = []
        self.l_Reach = []
        self.flag_fast = fast

        # Find the skeleton and extract the river boundary
        # By the end of this constructor function we have a skeleton of the river with no dangling arcs, with junction points and reaches identified

        # 1. Get the river image in the form of numpy array -------------------
        array_Image = river.getRiver()

        # 2. Find skeleton for the river --------------------------------------
        print("Finding the skeleton")
        self.array_skeleton = morphology.skeletonize(array_Image > 0)
        self.array_skeleton = npy.array(self.array_skeleton, dtype=npy.uint8)
        (self.row, self.col) = self.array_skeleton.shape
        # ---------------------------------------------------------------------
        # image after taking the skeleton of river
        plt.imshow(self.array_skeleton)
        plt.savefig('temp/10_Skeleton.png', format='png', dpi=1200)
        # ---------------------------------------------------------------------

        # 3. Remove dangling arcs ---------------------------------------------
        if not fast:
            print("Removing dangling arcs")
            self.RemoveDanglingArc()
            # -----------------------------------------------------------------
            # image after removing dangling arcs form the skeleton
            plt.imshow(self.array_skeleton)
            plt.savefig('temp/11_RemoveDanglingArc.png',
                        format='png',
                        dpi=1200)
            # -----------------------------------------------------------------

        # 4. Find the junction points -----------------------------------------
        print("Identifying all the junctions")
        self.array_done = npy.zeros((self.row, self.col), dtype=int)
        self.array_Junction = npy.zeros((self.row, self.col), dtype=int)
        self.MarkJunctions()
        # ---------------------------------------------------------------------
        # image after finding the junctions of the river
        plt.imshow(self.array_Junction)
        plt.savefig('temp/12_Junctions.png', format='png', dpi=1200)
        # ---------------------------------------------------------------------

        # 5. Identify all the reaches -----------------------------------------
        print("Identifying all the reaches")
        reach = Reach(0)  # create a new Reach type variable with ReachID = 0
        self.l_Reach.append(reach)  # append it to the list - l_Reach
        self.IdentifyReach()

        # self.MarkJunctionsAndNeighbourhood()
        # self.MarryReachJunction()
        return
Example #18
if __name__ == "__main__":

    import logging
    import time
    import json

    # Setup
    # logging.basicConfig(level=logging.INFO)

    umls = "sample_data/umls_sample.json"
    msh = "sample_data/abstracts_example.json"
    path_to_embeddings = ""
    use_subset = False

    # Be sure to pass unk_word (or otherwise mark the UNK index).
    embeddings = Reach.load(path_to_embeddings, unk_word="UNK")

    logging.info("loaded embeddings.")

    start = time.time()

    y = Yarn(embeddings)

    umls = json.load(open(umls))
    msh = json.load(open(msh))

    if use_subset:

        subset = [u'di',
                  u'tat',
                  u'erp',
import sklearn.preprocessing
import math
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
import pandas as pd

#pathnames
indexpath = './verbtrainingindex_withweights.npy'
holmatrixpath = './cooccurrence/svd/newmatrix.npy'
embeddingspath = './tulkens-embeddings/160/sonar-160.txt'
outputpath = './verbmatrices/version3'

#import data
index = np.load(indexpath)
holmatrix = np.load(holmatrixpath)
arg_data = Reach(embeddingspath, header=True)

#parameters
n_dim = 160
s_dim = 200
alpha_value = 50
min_sample_size = 400
# Note: in testing, a sample size of N = 500 was deemed acceptable. Here we do
# not split into train and test data, so the minimum sample size can be 80% of
# the one used when testing.
variance_control = True
mean_std = 0.08

matrices = dict()

#loop through verbs
Example #20
    def noisychannel_ranking(self, detection_list, candidates_list):
        """
        An approximate implementation of the ranking method described in Lai et al. (2015).
        Uses self.frequency_dict (corpus frequencies from the training data) and self.k
        (if k > 1, the k highest-ranked candidates are returned instead of a single one).
        :param detection_list: list of misspellings
        :param candidates_list: list of candidate lists, one per misspelling
        :return: list with corrections or k-best corrections
        """

        correction_list = []
        confidences = []

        print("Loading vector representations")
        r = Reach.load(self.pathtovectors, header=True)
        print("Done")

        for misspelling, candidates in zip(detection_list, candidates_list):

            # candidates = [candidate for candidate in candidates if candidate in r.words.keys()]

            score_list = []

            for candidate in candidates:
                orthographic_edit_distance = damerau_levenshtein_distance(
                    misspelling, candidate)
                phonetic_edit_distance = damerau_levenshtein_distance(
                    dm(misspelling)[0],
                    dm(candidate)[0])

                spell_score = (2 * orthographic_edit_distance +
                               phonetic_edit_distance)**2  # P(m|c)

                try:
                    frequency = self.frequency_dict[candidate]
                except KeyError:
                    frequency = 1

                frequency_score = 1 / (1 + log(frequency))  # P(c)

                score = spell_score * frequency_score  # P(c|m) = P(m|c)*P(c)
                score_list.append(score)

            score_list = np.array(score_list)

            if len(score_list) > 1:
                sorted_distances = [
                    score_list[i] for i in np.argsort(score_list)
                ]
                top1 = sorted_distances[0]
                top2 = sorted_distances[1]
                confidence = abs(top1 - top2) / top1
                confidences.append(confidence)
            else:
                confidences.append(0)

            if self.k == 1:
                try:
                    correction_list.append(candidates[np.argmin(score_list)])
                except ValueError:
                    correction_list.append('')
            elif self.k > 1:
                correction_list.append(
                    [candidates[i] for i in np.argsort(score_list)[:self.k]])
            else:
                raise ValueError('k must be a positive integer')

        self.confidences = confidences

        return correction_list
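
A worked example of the scoring above for a single (misspelling, candidate) pair, using hypothetical edit distances and a hypothetical corpus frequency; lower scores rank higher.

from math import log

orthographic_edit_distance = 1    # e.g. damerau_levenshtein_distance("teh", "the")
phonetic_edit_distance = 0        # e.g. distance between the double metaphone codes
frequency = 1200                  # hypothetical corpus frequency of "the"

spell_score = (2 * orthographic_edit_distance + phonetic_edit_distance) ** 2  # P(m|c) proxy: 4
frequency_score = 1 / (1 + log(frequency))                                    # P(c) proxy: ~0.12
score = spell_score * frequency_score                                         # ~0.49; lower is better
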
Example #21
import tensorflow as tf
import re
from reach import Reach
r = Reach.load('./tulkens-embeddings/160/sonar-160.txt', header=True)

objsize = 160
holisticsize = 10

#%%

#import list of verbs
verb = 'overleef'
print('Verb:', verb)

#create tensor for verb
verbtens = tf.Variable(tf.random_uniform([objsize, holisticsize], 0.0, 1.0))
inp = tf.placeholder(tf.float32, [objsize])
outp = tf.matmul(verbtens, inp)
sess = tf.Session()

#get VO-combinations list
combos = []
rowsfile = open('./cooccurrence/rows1.rows', 'r')
done = False
found = False
while not done:
    line = rowsfile.readline()
    if line.startswith(verb):
        found = True
        combos.append(line)
    else:
Example #22
if __name__ == "__main__":

    import logging
    import time
    import json

    # Setup
    logging.basicConfig(level=logging.INFO)

    umls = "sample_data/umls_sample.json"
    msh = "sample_data/abstracts_example.json"
    path_to_embeddings = ""
    use_subset = False

    embeddings = Reach(path_to_embeddings, header=True, verbose=False)

    logging.info("loaded embeddings.")

    start = time.time()

    y = Yarn(embeddings)

    umls = json.load(open(umls))
    msh = json.load(open(msh))

    if use_subset:

        subset = [
            u'di', u'tat', u'erp', u'ori', u'crna', u'pep', u'de', u'hip',
            u'glycoside', u'sterilization', u'ra', u'don', u'ecg', u'cell',
Example #23
def compose(documents,
            f1,
            f2,
            embeddings,
            window,
            context_function,
            use_focus=True,
            norm=False):
    """
    Map phrases from sentences to vectors.

    Parameters
    ==========
    documents : list of lists
        A list of lists, where each sublist contains two lists of the same
        length: the first contains the tokens of a text, and the second
        contains the BIO tags of the NP chunks for that text.
    f1 : function
        A function which is used to compose the vectorized lists of words
        into a single vector. Must take an axis parameter.
    f2 : function
        A function which is used to compose the vectors vectorized with f1
        into a second-order vector. Must also take an axis parameter.
    embeddings : Reach
        A reach instance which contains the embeddings you want to use to
        vectorize.
    window : int
        The window size to use.
    context_function : function
        The function which is used to weigh the contexts. Must take a 2D
        matrix and return a 2D matrix of the same shape.
    use_focus : bool, optional, default True
        Whether to vectorize the focus word.
    norm : bool, optional, default False
        Whether to use the unit vectors to compose.

    Returns
    =======
    phrases : Reach
        A reach instance containing the phrases and their vectors.

    """
    bio_regex = re.compile(r"BI*")

    phrases, vectors = [], []

    for idx, (txt, bio) in enumerate(documents):

        txt = " ".join(txt).lower().split()
        bio = "".join([x.split("-")[0] for x in bio])
        for t in bio_regex.finditer(bio):
            b, e = t.span()
            phrase_string, vector = create_phrase_vector(
                txt, b, e, window, embeddings, f1, f2, context_function,
                use_focus, norm)

            # Append the running index to the phrase string so that identical
            # phrases do not overwrite each other in the mapping.
            phrase_string = "{}-{}".format(phrase_string, len(phrases))
            phrases.append(phrase_string)
            vectors.append(vector)

    return Reach(vectors, phrases)
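
A small illustration of the BIO handling in compose: chunk tags are reduced to their B/I/O prefixes, and the regex r"BI*" yields the token span of each NP chunk. The tags below are made up for the example.

import re

bio_regex = re.compile(r"BI*")

tags = ["B-NP", "I-NP", "O", "B-NP", "I-NP", "I-NP", "O"]
bio = "".join(t.split("-")[0] for t in tags)   # "BIOBIIO"
for match in bio_regex.finditer(bio):
    begin, end = match.span()
    print(begin, end)                          # (0, 2) and (3, 6)
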
Example #24
    # in experiment 3.
    perfect = True

    gold = json.load(open("data/test_gold.json"))
    gold = list(zip(*sorted(gold.items())))[1]

    if perfect:
        data = json.load(open("data/test_gold.json"))
    else:
        data = json.load(open("data/test_uima.json"))
    data = list(zip(*sorted(data.items())))[1]

    txt, gold_bio = zip(*gold)
    _, data_bio = zip(*data)

    embeddings = Reach.load("../../corpora/mimiciii-min5-neg3-w5-100.vec",
                            unk_word="<UNK>")

    concept_reach = Reach.load_fast_format("data/concept_vectors")
    concept_labels = json.load(open("data/names2label.json"))

    gold_bio = list(chain.from_iterable(gold_bio))

    results_bio = {}

    r_phrases = compose(data,
                        f1=np.mean,
                        f2=np.mean,
                        window=0,
                        embeddings=embeddings,
                        context_function=reciprocal)
Example #25
from cat.simple import get_scores, rbf_attention
from cat.dataset import restaurants_train
from reach import Reach
from sklearn.metrics import precision_recall_fscore_support
from collections import defaultdict, Counter


GAMMA = .03
BEST_ATT = {"n_noun": 980}
BEST_RBF = {"n_noun": 200}

if __name__ == "__main__":

    scores = defaultdict(dict)
    r = Reach.load("embeddings/restaurant_vecs_w2v.vec",
                   unk_word="<UNK>")

    att = rbf_attention
    datums = list(restaurants_train())

    d = json.load(open("data/nouns_restaurant.json"))
    nouns = Counter()
    for k, v in d.items():
        if k.lower() in r.items:
            nouns[k.lower()] += v

    if att == rbf_attention:
        r.vectors[r.items["<UNK>"]] = r.vectors.max()

    if att == rbf_attention:
        candidates, _ = zip(*nouns.most_common(BEST_RBF["n_noun"]))
Example #26
    scores = {}

    gold = json.load(open("data/beth_gold.json"))
    gold = list(zip(*sorted(gold.items())))[1]

    txt, gold_chunks = zip(*gold)

    data = json.load(open("data/beth_uima.json"))
    data = list(zip(*sorted(data.items())))[1]

    # Sanity check
    for a, b in zip(data, gold):
        assert len(a[0]) == len(b[0])

    embeddings = Reach.load("", unk_word="UNK")

    scores = {}

    focus = experiment(data,
                       gold_chunks,
                       np.mean,
                       np.mean,
                       embeddings,
                       reciprocal,
                       0,
                       k=100,
                       use_focus=True)

    full = experiment(data,
                      gold_chunks,
Example #27
    parsed_train = json.load(open("data/partners_uima.json"))
    parsed_train = list(zip(*sorted(parsed_train.items())))[1]

    gold_train = json.load(open("data/partners_gold.json"))
    gold_train = list(zip(*sorted(gold_train.items())))[1]

    parsed_test = json.load(open("data/beth_uima.json"))
    parsed_test = list(zip(*sorted(parsed_test.items())))[1]

    gold_test = json.load(open("data/beth_gold.json"))
    gold_test = list(zip(*sorted(gold_test.items())))[1]

    txt, gold_chunks_train = zip(*gold_train)
    _, gold_chunks_test = zip(*gold_test)

    embeddings = Reach.load("")

    for a, b in zip(parsed_train, gold_train):
        assert len(a[0]) == len(b[0])

    for a, b in zip(parsed_test, gold_test):
        assert len(a[0]) == len(b[0])

    knn_focus = experiment(parsed_train,
                           gold_chunks_train,
                           parsed_test,
                           gold_chunks_test,
                           np.mean,
                           np.mean,
                           embeddings,
                           reciprocal,
Example #28
def run(options):
    logger = get_logger()

    validation_dataset = get_validation_dataset(options)
    validation_iterator = get_validation_iterator(options, validation_dataset)
    word2idx = validation_dataset['word2idx']
    embeddings = validation_dataset['embeddings']

    idx2word = {v: k for k, v in word2idx.items()}

    logger.info('Initializing model.')
    trainer = build_net(options, embeddings, validation_iterator)
    diora = trainer.net.diora

    # 1. Get all relevant phrase vectors.
    dtype = {
        'example_ids': 'list',
        'labels': 'list',
        'positions': 'list',
        'sizes': 'list',
        'phrases': 'list',
        'inside': 'torch',
        'outside': 'torch',
    }
    batch_recorder = BatchRecorder(dtype=dtype)
    # Eval mode.
    trainer.net.eval()

    batches = validation_iterator.get_iterator(random_seed=options.seed)

    logger.info('Beginning to embed phrases.')

    strings = []
    with torch.no_grad():
        for i, batch_map in enumerate(batches):
            sentences = batch_map['sentences']
            length = sentences.shape[1]

            # Skips very short examples.
            if length <= 2:
                continue
            strings.extend([
                "".join([idx2word[idx] for idx in x])
                for x in sentences.numpy()
            ])
            trainer.step(batch_map, train=False, compute_loss=False)

            batch_result = {}
            batch_result['inside'] = diora.inside_h[:, -1]
            batch_result['outside'] = diora.outside_h[:, -1]
            batch_recorder.record(**batch_result)

    result = batch_recorder.get_flattened_result()

    # 2. Build an index of nearest neighbors.
    vectors = np.concatenate([result['inside'], result['outside']], axis=1)
    print(len(strings), vectors.shape)
    r = Reach(vectors, strings)

    for s in strings:
        print(s)
        print(r.most_similar(s))
Example #29
                vec = embeddings.vectorize(desc, remove_oov=True)
                if not np.any(vec):
                    continue
                concept.append(np.mean(vec, axis=0))
            except ValueError:
                pass

        if not concept:
            continue

        concept_names.append(name)
        vectors.append(np.array(concept).mean(axis=0))

    r = Reach(np.array(vectors), concept_names)

    return r


if __name__ == "__main__":

    path_to_embeddings = ""
    r_1 = Reach.load(path_to_embeddings, unk_word="UNK")

    concepts = json.load(open("data/all_concepts.json"))
    sty = json.load(open("data/concept_label.json"))
    r = create_concepts(concepts, r_1, include_np=True, labels=sty)
    r.save_fast_format("data/concept_vectors")

    name2label = {k: sty[k.split("-")[0]] for k in r.items}
    json.dump(name2label, open("data/names2label.json", 'w'))
Example #30
    def grid_search(devcorpus, candidates_list, language):
        """
        Conduct a grid search to find the best parameters for a corpus containing only in-vector-vocabulary corrections
        :param devcorpus: devcorpus generated with make_devcorpus.py
        :param candidates_list: list of candidate lists, one per misspelling
        :param language: language from ["en", "nl"]
        :return: dictionary with parameter settings as keys and their correction accuracy as values
        """

        # default parameters
        parameters = {
            'comp_function': 'sum',
            'include_misspelling': False,
            'include_oov_candidates': False,
            'window_size': 6,
            'reciprocal': False,
            'remove_stopwords': False,
            'edit_distance': 1,
            'oov_penalty': 1.5,
            'ranking_method': 'context',
            'k-best': 1
        }

        dev = Development(parameters, language)

        print("Loading embeddings")
        r = Reach.load(dev.pathtovectors, header=True)
        print("Done")

        corrected_list = devcorpus[0]
        detection_list = devcorpus[1]
        detection_contexts = devcorpus[2]

        scores_dict = {}

        start_time = 0
        end_time = 0
        for comp_function in ["sum", "mult", "max"]:
            print("New run")
            run_time = end_time - start_time
            print("Last run took " + str(run_time) + " seconds")
            start_time = time.time()
            dev.comp_function = comp_function
            for include_misspelling in [True, False]:
                dev.include_misspelling = include_misspelling
                for window_size in range(11):
                    dev.window_size = window_size
                    for reciprocal in [True, False]:
                        dev.reciprocal = reciprocal
                        for remove_stopwords in [True, False]:
                            dev.remove_stopwords = remove_stopwords
                            for edit_distance in range(1, 5):
                                dev.edit_distance = edit_distance
                                correction_list = dev.ranking_experiment(
                                    detection_list, detection_contexts,
                                    candidates_list, r)
                                accuracy = len([
                                    c for i, c in enumerate(correction_list)
                                    if c == corrected_list[i]
                                ]) / len(correction_list)
                                parameters = (comp_function,
                                              include_misspelling, window_size,
                                              reciprocal, remove_stopwords,
                                              edit_distance)
                                scores_dict[parameters] = accuracy

            end_time = time.time()

        return scores_dict
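
For reference, the nested loops above sweep a grid of 3 * 2 * 11 * 2 * 2 * 4 = 1,056 parameter combinations; the sketch below only enumerates the settings.

from itertools import product

grid = list(product(["sum", "mult", "max"],   # comp_function
                    [True, False],            # include_misspelling
                    range(11),                # window_size
                    [True, False],            # reciprocal
                    [True, False],            # remove_stopwords
                    range(1, 5)))             # edit_distance
print(len(grid))  # 1056
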
Example #31
    # in experiment 3.
    perfect = False

    gold = json.load(open("data/test_gold.json"))
    gold = list(zip(*sorted(gold.items())))[1]

    if perfect:
        data = json.load(open("data/test_gold.json"))
    else:
        data = json.load(open("data/test_uima.json"))
    data = list(zip(*sorted(data.items())))[1]

    txt, gold_bio = zip(*gold)
    _, data_bio = zip(*data)

    embeddings = Reach.load("", unk_word="UNK")
    concept_reach = Reach.load_fast_format("data/concept_vectors")
    concept_labels = json.load(open("data/concept_names2label.json"))

    gold_bio = list(chain.from_iterable(gold_bio))

    results_bio = {}

    r_phrases = compose(data,
                        f1=np.mean,
                        f2=np.mean,
                        window=0,
                        embeddings=embeddings,
                        context_function=reciprocal)

    pred_bio_focus = eval_extrinsic(list(chain.from_iterable(data_bio)),
    def synonym_retrieval_zeroshot(self,
                                   zeroshot_pairs,
                                   isolated=False,
                                   verbose=False,
                                   outfile=''):

        assert self.train_vectors is not None, 'No train vectors are loaded yet!'
        assert self.test_vectors is not None, 'No test vectors are loaded yet!'

        # new setting: add ALL zero-shot data to the train data to make the ranking task harder (more confounders)
        train_items = [
            x for _, x in sorted(self.train_vectors.indices.items())
        ]
        train_vectors = self.train_vectors.vectors

        zeroshot_items = set()
        for concept, reference, synonyms in zeroshot_pairs:
            zeroshot_items.add(reference)
            zeroshot_items.update(synonyms)
        zeroshot_items = sorted(zeroshot_items)
        zeroshot_vectors = []
        for zeroshot_item in zeroshot_items:
            zeroshot_vectors.append(self.test_vectors[zeroshot_item])
        if isolated:
            fused_vectors = Reach(zeroshot_vectors, zeroshot_items)
        else:
            all_items = train_items + zeroshot_items
            zeroshot_vectors = np.array(zeroshot_vectors)
            all_vectors = np.concatenate((train_vectors, zeroshot_vectors),
                                         axis=0)
            fused_vectors = Reach(all_vectors, all_items)

        # now rank
        complete_ranking = []
        for instance in tqdm(zeroshot_pairs, disable=False):

            concept, reference, synonyms = instance

            synonym_idxs = [fused_vectors.items[syn] for syn in synonyms]

            reference_idx = fused_vectors.items[reference]

            # calculate distances
            reference_vector = fused_vectors.norm_vectors[reference_idx]
            scores = fused_vectors.norm_vectors.dot(reference_vector.T)

            # extract ranking
            mask = [
                1 if x == reference_idx else 0
                for x in range(len(fused_vectors.items))
            ]
            scores = np.ma.array(scores, mask=mask)
            ranking = np.argsort(-scores)
            ranks = [
                np.where(ranking == synonym_idx)[0][0]
                for synonym_idx in synonym_idxs
            ]
            assert ranks
            ranks, synonyms = zip(*sorted(zip(ranks, synonyms)))
            instance = (concept, reference, synonyms)
            complete_ranking.append((instance, ranks))

        if outfile:
            print('Saving...')
            with open(outfile, 'w') as f:
                json.dump(complete_ranking, f)

        if verbose:
            instances, rankings = zip(*complete_ranking)
            print(round(self.mean_average_precision(rankings), 2), '&',
                  round(self.ranking_accuracy(rankings), 2), '&',
                  round(self.mean_reciprocal_rank(rankings), 2), '&')

        return complete_ranking
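
A toy illustration of the ranking step above: candidates are ranked by cosine score against the reference, with the reference itself masked out, and each synonym's rank is its position in that ordering. The vectors here are made up for the example.

import numpy as np

norm_vectors = np.array([[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]])
norm_vectors /= np.linalg.norm(norm_vectors, axis=1, keepdims=True)
reference_idx, synonym_idxs = 0, [1]

scores = norm_vectors.dot(norm_vectors[reference_idx])
mask = [1 if x == reference_idx else 0 for x in range(len(scores))]
scores = np.ma.array(scores, mask=mask)
ranking = np.argsort(-scores)                # the masked reference sorts last
ranks = [np.where(ranking == idx)[0][0] for idx in synonym_idxs]
print(ranks)                                 # [0]: the synonym is the nearest non-reference item
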