Example #1
    def get_group_id(self, questions, qTokens, simThreshold=0.9):
        """Return the id of the most similar question group, or None if no
        group reaches simThreshold."""
        vec = Resources.getWordVectors().vectorize(qTokens, remove_oov=True)
        if not vec:
            return None

        qVec = Reach.normalize(np.mean(vec, axis=0))
        mostSimQ = None
        maxSim = 0.0

        for groupId, groupQTokens in questions.items():
            for cur_q_tokens in groupQTokens:
                cur_vec = self.expSet.getWordVectors().vectorize(
                    cur_q_tokens, remove_oov=True)
                if not cur_vec:
                    continue
                curSim = np.dot(qVec, Reach.normalize(np.mean(cur_vec,
                                                              axis=0)))
                if curSim > maxSim:
                    maxSim = curSim
                    mostSimQ = groupId

        if maxSim >= simThreshold:
            return mostSimQ
        else:
            return None
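
A minimal runnable sketch of the matching technique above, with Resources and Reach replaced by plain NumPy (the data layout and the normalize helper are assumptions, not the project's API): average a question's word vectors, L2-normalize the mean, and take the dot product against each stored group vector, which on unit vectors equals cosine similarity.

import numpy as np

def l2_normalize(vec):
    # Same effect as Reach.normalize on a single 1-D vector.
    return vec / np.linalg.norm(vec)

def best_group(q_token_vecs, groups, threshold=0.9):
    # groups: {group_id: [unit-normalized mean question vectors]}
    q = l2_normalize(np.mean(q_token_vecs, axis=0))
    best_id, best_sim = None, 0.0
    for group_id, members in groups.items():
        for m in members:
            sim = float(np.dot(q, m))
            if sim > best_sim:
                best_id, best_sim = group_id, sim
    return best_id if best_sim >= threshold else None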
Example #2
    def correlation_benchmarks(self, baseline=False, normalize=True):

        self.model.eval()
        self.vectorize.allow_construct_oov()

        corrs = []
        benchmarks = self.load_benchmarks()
        for benchmark, data in benchmarks.items():
            print(benchmark)
            source_names = data['source']
            target_names = data['target']
            sims = data['sims']

            # calculate cosines
            source_vectors = []
            target_vectors = []
            for source, target in zip(source_names, target_names):
                source_vector = np.average(self.vectorize.vectorize_string(
                    source, norm=False),
                                           axis=0)
                target_vector = np.average(self.vectorize.vectorize_string(
                    target, norm=False),
                                           axis=0)
                if normalize:
                    source_vector = Reach.normalize(source_vector)
                    target_vector = Reach.normalize(target_vector)
                source_vectors.append(source_vector)
                target_vectors.append(target_vector)
            source_vectors = np.array(source_vectors)
            target_vectors = np.array(target_vectors)

            if baseline:
                source_vectors = Reach.normalize(source_vectors)
                target_vectors = Reach.normalize(target_vectors)
                cosines = [
                    x.dot(y) for x, y in zip(source_vectors, target_vectors)
                ]
            else:
                source_vectors = torch.FloatTensor(source_vectors).to(
                    self.device).reshape(-1, self.input_size)
                target_vectors = torch.FloatTensor(target_vectors).to(
                    self.device).reshape(-1, self.input_size)
                source_out = self.model(source_vectors)
                target_out = self.model(target_vectors)
                # unit-normalize the model outputs, then take the row-wise
                # dot product of each reference/synonym pair
                ref = source_out / source_out.norm(dim=1).reshape(-1, 1)
                syn = target_out / target_out.norm(dim=1).reshape(-1, 1)
                cosines = (ref * syn).sum(dim=1).detach().cpu().numpy()

            corr = spearmanr(cosines, sims)
            print(corr)
            corrs.append(corr)

        return corrs
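
The baseline branch reduces to row-wise cosine similarity followed by Spearman's rho. A self-contained sketch with toy data (the arrays and ratings below are made up for illustration):

import numpy as np
from scipy.stats import spearmanr

rng = np.random.default_rng(0)
source = rng.normal(size=(5, 8))        # toy source embeddings
target = rng.normal(size=(5, 8))        # toy target embeddings
gold_sims = [0.1, 0.4, 0.5, 0.7, 0.9]   # toy human similarity ratings

source /= np.linalg.norm(source, axis=1, keepdims=True)
target /= np.linalg.norm(target, axis=1, keepdims=True)
cosines = (source * target).sum(axis=1)  # row-wise dot of unit vectors

print(spearmanr(cosines, gold_sims))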
Example #3
    def fit_cca(self, outfile=''):
        """Fit a linear CCA mapping and replace the pretrained name
        embeddings with their CCA-transformed counterparts."""

        self.load_embeddings()
        self.extract_pretrained_prototype_embeddings()

        items, vectors = zip(
            *[(k, v) for k, v in self.pretrained_prototype_embeddings.items()
              if k in self.exemplar_to_concept])
        concept_embs = Reach(vectors, items)

        train_vectors = []
        for x in items:
            train_vectors.append(self.train_embeddings[x])
        train_vectors = Reach.normalize(train_vectors)

        cca = CCA(n_components=self.train_embeddings.size, max_iter=10000)
        cca.fit(train_vectors, concept_embs.norm_vectors)

        # transform all name embeddings using the CCA mapping
        all_name_embeddings = deepcopy(self.pretrained_name_embeddings)
        items = [x for _, x in sorted(all_name_embeddings.indices.items())]
        projected_name_embeddings = cca.transform(
            all_name_embeddings.norm_vectors)
        new_name_embeddings = Reach(projected_name_embeddings, items)

        self.pretrained_name_embeddings = new_name_embeddings
        self.load_embeddings()

        if outfile:
            with open('{}_cca.p'.format(outfile), 'wb') as f:
                pickle.dump(cca, f)
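
For reference, a toy sketch of the CCA step in isolation (shapes and component count are illustrative, not the values used above): sklearn's CCA learns paired projections from fit(X, Y), and transform() then maps new vectors into the shared space.

import numpy as np
from sklearn.cross_decomposition import CCA

rng = np.random.default_rng(0)
train = rng.normal(size=(50, 20))     # stand-in for trained exemplar embeddings
concepts = rng.normal(size=(50, 20))  # stand-in for pretrained prototypes

cca = CCA(n_components=10, max_iter=10000)
cca.fit(train, concepts)

projected = cca.transform(rng.normal(size=(5, 20)))  # map unseen vectors
print(projected.shape)  # (5, 10)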
Example #4
File: yarn.py Project: clips/yarn
def compute_nearest_neighbours(definitions, abstracts):
    """
    Compute nearest neighbours from abstracts to definitions.

    Parameters
    ----------
    definitions : dictionary of dictionaries
        A dictionary of dictionaries containing vectors.
        The top key is the ambiguous term, the bottom key is the CUI.

            Example: {AMBIGTERM: {CUI1: VECTOR, CUI2: VECTOR}}
    abstracts : dictionary of dictionaries
        Like definitions.

    Returns
    -------
    result : dict
        A dictionary, the keys of which are the ambiguous terms, and the values
        are lists of tuples. The first item of each tuple is the true class,
        the second item of each tuple is the predicted class.

        example: {AMBIGTERM1: [(y1, y_pred1), (y2, y_pred2), ...]}

    """
    output = {}

    for k, v in abstracts.items():

        results = []

        labels, vectors = dict_to_tuple(v)

        try:
            targets, matrix = dict_to_tuple(definitions[k])
        except KeyError:
            continue
        matrix = Reach.normalize(np.asarray(matrix))
        vectors = Reach.normalize(np.asarray(vectors))

        for vec in vectors:

            # Negate the similarities so that argsort puts the most
            # similar definition first.
            result = -vec.dot(matrix.T)
            results.append(targets[np.argsort(result)[0]])

        output[k] = list(zip(labels, results))

    return output
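
A hypothetical call, following the docstring (the vectors are toy 3-d arrays, and dict_to_tuple, which is not shown here, is assumed to split a dict into its keys and values):

import numpy as np

definitions = {"cold": {"C0009264": np.array([1.0, 0.0, 0.0]),
                        "C0024117": np.array([0.0, 1.0, 0.0])}}
abstracts = {"cold": {"C0009264": np.array([0.9, 0.1, 0.0])}}

result = compute_nearest_neighbours(definitions, abstracts)
# -> {"cold": [("C0009264", "C0009264")]}  (true label, predicted label)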
Example #5
    def vectorize_string(self, string, norm):
        """Embed each whitespace-separated token with fastText, optionally
        L2-normalizing each token vector."""
        tokens = string.split()
        token_embeddings = []
        for token in tokens:
            vector = self.fasttext_model.get_word_vector(token)
            if norm:
                vector = Reach.normalize(vector)
            token_embeddings.append(vector)
        token_embeddings = np.array(token_embeddings)

        return token_embeddings
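
Downstream, the token matrix is typically mean-pooled into a single sentence vector, as in the other examples here. A hedged usage sketch (the vectorizer instance is assumed to exist):

import numpy as np

token_embeddings = vectorizer.vectorize_string("chest pain", norm=False)
sentence_vector = Reach.normalize(np.mean(token_embeddings, axis=0))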
Example #6
    def get_grouped_qid(self, norm_q_vec, grouped_questions, simThreshold):
        for k, q_tokens_list in grouped_questions.items():
            for t_list in q_tokens_list:
                # Embed the stored question once; skip it if all tokens
                # are out of vocabulary.
                vec = Resources.getWordVectors().vectorize(t_list,
                                                           remove_oov=True)
                if not vec:
                    continue
                sim = np.dot(norm_q_vec,
                             Reach.normalize(np.mean(vec, axis=0)))
                if sim >= simThreshold:
                    return k

        return None
Example #7
    def fit(self, X):
        """
        Fit the transformer to some data.

        Fitting, in this case, means unpacking the file and loading the
        feature matrix. Only words on which you fit are kept.
        """
        super().fit(X)
        X = self._unpack(X)
        mtr, words = Reach._load(self.path, X)
        if self.normalize:
            mtr = Reach.normalize(mtr)
        self.features = dict(zip(words, mtr))
        self.vec_len = mtr.shape[1]
        self.feature_names = set(self.features.keys())
        return self
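
A hypothetical usage sketch (EmbeddingFeaturizer is an assumed name for the class above, and the path is made up): after fit, only words present in X remain available as features.

featurizer = EmbeddingFeaturizer(path="embeddings.vec", normalize=True)
featurizer.fit([["chest", "pain"], ["fever"]])
print(featurizer.vec_len)                  # embedding dimensionality
print("pain" in featurizer.feature_names)  # True if "pain" was in the file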
Example #8
    def get_grouped_questions(self, trainSet, simThreshold):

        # {id: [similar questions, each stored as its list of covered tokens]}
        grouped_questions = defaultdict(list)
        questions_type = defaultdict(lambda: defaultdict(int))
        grouped_questions_cat = defaultdict(set)

        for d in trainSet:
            cur_segment = self.segmenter.segment(d.getTextObject())
            for qap in cur_segment:
                qid = len(grouped_questions)
                cur_q_tokens = d.getTextObject().get_covered_tokens(
                    qap.begQue, qap.endQue)

                if any(cur_q_tokens in val
                       for val in grouped_questions.values()):
                    continue
                qVec = Resources.getWordVectors().vectorize(cur_q_tokens,
                                                            remove_oov=True)
                if not qVec:
                    continue
                norm_q_vec = Reach.normalize(np.mean(qVec, axis=0))

                k = self.get_grouped_qid(norm_q_vec, grouped_questions,
                                         simThreshold)
                if k is not None:
                    qid = k

                grouped_questions[qid].append(cur_q_tokens)
                ansType, cat = self.get_ans_type(qap.answers)

                if not ansType:
                    continue

                questions_type[qid][ansType] += 1

                if cat:
                    grouped_questions_cat[qid].add(cat)

        return (grouped_questions, questions_type, grouped_questions_cat)
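
Stripped of the segmenter and answer typing, the grouping loop reduces to: assign each question vector to the first group containing a sufficiently similar member, otherwise open a new group. A self-contained sketch (input vectors are assumed to be averaged and unit-normalized already):

import numpy as np
from collections import defaultdict

def group_question_vectors(question_vecs, threshold=0.9):
    groups = defaultdict(list)
    for vec in question_vecs:
        qid = len(groups)  # default: open a new group
        for k, members in groups.items():
            if any(float(np.dot(vec, m)) >= threshold for m in members):
                qid = k
                break
        groups[qid].append(vec)
    return groups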