def get_group_id(self, questions, qTokens, simThreshold=0.9):
    """Return the id of the most similar question group, or None if no
    group reaches simThreshold."""
    vec = Resources.getWordVectors().vectorize(qTokens, remove_oov=True)
    if not vec:
        return None
    # mean-pool the token vectors and L2-normalize the result
    qVec = Reach.normalize(np.mean(vec, axis=0))
    mostSimQ = None
    maxSim = 0.0
    for groupId, groupQTokens in questions.items():
        for cur_q_tokens in groupQTokens:
            cur_vec = self.expSet.getWordVectors().vectorize(
                cur_q_tokens, remove_oov=True)
            if not cur_vec:
                continue
            # dot product of unit vectors == cosine similarity
            curSim = np.dot(qVec, Reach.normalize(np.mean(cur_vec, axis=0)))
            if curSim > maxSim:
                maxSim = curSim
                mostSimQ = groupId
    if maxSim >= simThreshold:
        return mostSimQ
    return None
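
# The sketch below re-implements the matching logic above with plain
# numpy so it can be run standalone; toy_vectors stands in for the
# Resources/Reach word-vector wrappers and is entirely hypothetical.
import numpy as np

toy_vectors = {"what": np.array([0.1, 0.9]),
               "age": np.array([0.8, 0.2]),
               "your": np.array([0.5, 0.5])}

def embed(tokens):
    # mean-pool the in-vocabulary token vectors, then L2-normalize
    vecs = [toy_vectors[t] for t in tokens if t in toy_vectors]
    if not vecs:
        return None
    v = np.mean(vecs, axis=0)
    return v / np.linalg.norm(v)

q_vec = embed(["what", "your", "age"])
groups = {0: [["what", "age"]], 1: [["your"]]}
best_id, best_sim = None, 0.0
for gid, token_lists in groups.items():
    for tokens in token_lists:
        g_vec = embed(tokens)
        if g_vec is None:
            continue
        sim = float(np.dot(q_vec, g_vec))  # cosine of unit vectors
        if sim > best_sim:
            best_id, best_sim = gid, sim
print(best_id if best_sim >= 0.9 else None)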
def correlation_benchmarks(self, baseline=False, normalize=True):
    self.model.eval()
    self.vectorize.allow_construct_oov()
    corrs = []
    benchmarks = self.load_benchmarks()
    for benchmark, data in benchmarks.items():
        print(benchmark)
        source_names = data['source']
        target_names = data['target']
        sims = data['sims']
        # calculate cosines
        source_vectors = []
        target_vectors = []
        for source, target in zip(source_names, target_names):
            source_vector = np.average(
                self.vectorize.vectorize_string(source, norm=False), axis=0)
            target_vector = np.average(
                self.vectorize.vectorize_string(target, norm=False), axis=0)
            if normalize:
                source_vector = Reach.normalize(source_vector)
                target_vector = Reach.normalize(target_vector)
            source_vectors.append(source_vector)
            target_vectors.append(target_vector)
        source_vectors = np.array(source_vectors)
        target_vectors = np.array(target_vectors)
        if baseline:
            source_vectors = Reach.normalize(source_vectors)
            target_vectors = Reach.normalize(target_vectors)
            cosines = [
                x.dot(y.T) for x, y in zip(source_vectors, target_vectors)
            ]
        else:
            source_vectors = torch.FloatTensor(source_vectors).to(
                self.device).reshape(-1, self.input_size)
            target_vectors = torch.FloatTensor(target_vectors).to(
                self.device).reshape(-1, self.input_size)
            source_out = self.model(source_vectors)
            target_out = self.model(target_vectors)
            # take the dot product of the output reference and synonym embeddings
            ref = source_out / source_out.norm(dim=1).reshape(-1, 1)
            syn = target_out / target_out.norm(dim=1).reshape(-1, 1)
            dot_products = torch.stack([
                torch.mm(x.reshape(1, -1), y.reshape(1, -1).t())
                for x, y in zip(ref, syn)
            ], dim=0)
            cosines = dot_products.reshape(-1).detach().cpu().numpy()
        corr = spearmanr(cosines, sims)
        print(corr)
        corrs.append(corr)
    return corrs
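
# Minimal sketch of the evaluation step above: Spearman correlation
# between cosine similarities and gold similarity ratings. The vectors
# and ratings here are random/hypothetical; only the mechanics are real.
import numpy as np
from scipy.stats import spearmanr

rng = np.random.default_rng(0)
src = rng.normal(size=(4, 8))
tgt = rng.normal(size=(4, 8))
src /= np.linalg.norm(src, axis=1, keepdims=True)
tgt /= np.linalg.norm(tgt, axis=1, keepdims=True)
cosines = np.sum(src * tgt, axis=1)   # row-wise dot of unit vectors
gold_sims = [3.2, 7.5, 1.0, 5.8]      # hypothetical human ratings
print(spearmanr(cosines, gold_sims))  # correlation and p-value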
def fit_cca(self, outfile=''):
    # fits a linear CCA mapping and replaces the pretrained name
    # embeddings with their CCA-transformed counterparts
    self.load_embeddings()
    self.extract_pretrained_prototype_embeddings()
    items, vectors = zip(
        *[(k, v) for k, v in self.pretrained_prototype_embeddings.items()
          if k in self.exemplar_to_concept])
    concept_embs = Reach(vectors, items)
    train_vectors = [self.train_embeddings[x] for x in items]
    train_vectors = Reach.normalize(np.array(train_vectors))
    cca = CCA(n_components=self.train_embeddings.size, max_iter=10000)
    cca.fit(train_vectors, concept_embs.norm_vectors)
    # transform all name embeddings using the CCA mapping
    all_name_embeddings = deepcopy(self.pretrained_name_embeddings)
    items = [x for _, x in sorted(all_name_embeddings.indices.items())]
    projected_name_embeddings = cca.transform(
        all_name_embeddings.norm_vectors)
    new_name_embeddings = Reach(projected_name_embeddings, items)
    self.pretrained_name_embeddings = new_name_embeddings
    self.load_embeddings()
    if outfile:
        with open('{}_cca.p'.format(outfile), 'wb') as f:
            pickle.dump(cca, f)
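
# Toy illustration of the CCA step, assuming scikit-learn's CCA as used
# above; the 20x6 "name" and 20x4 "concept" matrices are random stand-ins.
import numpy as np
from sklearn.cross_decomposition import CCA

rng = np.random.default_rng(0)
X = rng.normal(size=(20, 6))  # stand-in for normalized train embeddings
Y = rng.normal(size=(20, 4))  # stand-in for concept prototype embeddings
cca = CCA(n_components=2, max_iter=10000)
cca.fit(X, Y)
projected = cca.transform(rng.normal(size=(5, 6)))  # project unseen names
print(projected.shape)  # (5, 2)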
def compute_nearest_neighbours(definitions, abstracts):
    """
    Compute nearest neighbours from abstracts to definitions.

    Parameters
    ----------
    definitions : dict of dicts
        A dictionary of dictionaries containing vectors. The top key is
        the ambiguous term, the bottom key is the CUI.
        Example: {AMBIGTERM: {CUI1: VECTOR, CUI2: VECTOR}}
    abstracts : dict of dicts
        Like definitions.

    Returns
    -------
    result : dict
        A dictionary whose keys are the ambiguous terms and whose values
        are lists of tuples. The first item of each tuple is the true
        class, the second is the predicted class.
        Example: {AMBIGTERM1: [(y1, y_pred1), (y2, y_pred2), ...]}
    """
    output = {}
    for k, v in abstracts.items():
        results = []
        labels, vectors = dict_to_tuple(v)
        try:
            targets, matrix = dict_to_tuple(definitions[k])
        except KeyError:
            continue
        matrix = Reach.normalize(np.asarray(matrix))
        vectors = Reach.normalize(np.asarray(vectors))
        for vec in vectors:
            # negated dot product: ascending argsort puts the most
            # similar definition first
            result = -vec.dot(matrix.T)
            results.append(targets[np.argsort(result)[0]])
        output[k] = list(zip(labels, results))
    return output
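
# Standalone sketch of the 1-NN assignment above: each abstract vector
# is assigned to the CUI whose definition vector has the highest cosine
# similarity. All data below is made up.
import numpy as np

cuis = ["C001", "C002"]
defs = np.array([[1.0, 0.0], [0.0, 1.0]])           # one row per CUI
abstract_vecs = np.array([[0.9, 0.1], [0.2, 0.8]])
defs /= np.linalg.norm(defs, axis=1, keepdims=True)
abstract_vecs /= np.linalg.norm(abstract_vecs, axis=1, keepdims=True)
# argmax of the dot product == index 0 of the argsorted negated dot
pred = [cuis[int(np.argmax(v.dot(defs.T)))] for v in abstract_vecs]
print(pred)  # ['C001', 'C002']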
def vectorize_string(self, string, norm):
    tokens = string.split()
    token_embeddings = []
    for token in tokens:
        vector = self.fasttext_model.get_word_vector(token)
        if norm:
            vector = Reach.normalize(vector)
        token_embeddings.append(vector)
    token_embeddings = np.array(token_embeddings)
    return token_embeddings
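
# Hedged usage sketch: assumes a trained fastText model is available at
# the (hypothetical) path below; get_word_vector is fastText's standard
# per-token lookup, mirroring self.fasttext_model above.
import numpy as np
import fasttext

model = fasttext.load_model("cc.en.300.bin")  # hypothetical model file
emb = np.array([model.get_word_vector(t) for t in "chest pain".split()])
emb /= np.linalg.norm(emb, axis=1, keepdims=True)  # the norm=True branch
print(emb.shape)  # (2, 300)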
def get_grouped_qid(self, norm_q_vec, grouped_questions, simThreshold):
    """Return the id of the first group whose question clears
    simThreshold, or None."""
    for k, q_tokens_list in grouped_questions.items():
        for t_list in q_tokens_list:
            # vectorize once and reuse, instead of vectorizing twice
            vec = Resources.getWordVectors().vectorize(t_list,
                                                       remove_oov=True)
            if not vec:
                continue
            if np.dot(norm_q_vec,
                      Reach.normalize(np.mean(vec, axis=0))) >= simThreshold:
                return k
    return None
def fit(self, X):
    """
    Fit the transformer to some data.

    Fitting, in this case, means unpacking the file and loading the
    feature matrix. Only words on which you fit are kept.
    """
    super().fit(X)
    X = self._unpack(X)
    mtr, words = Reach._load(self.path, X)
    if self.normalize:
        mtr = Reach.normalize(mtr)
    self.features = dict(zip(words, mtr))
    self.vec_len = mtr.shape[1]
    self.feature_names = set(self.features.keys())
    return self
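
# Sketch of the state fit() builds, with a small in-memory matrix
# standing in for the vector file at self.path (words/mtr are made up).
import numpy as np

words = ["cat", "dog"]
mtr = np.array([[3.0, 4.0], [1.0, 0.0]])
mtr /= np.linalg.norm(mtr, axis=1, keepdims=True)  # the normalize branch
features = dict(zip(words, mtr))       # word -> unit row vector
vec_len = mtr.shape[1]
feature_names = set(features)
print(sorted(feature_names), vec_len)  # ['cat', 'dog'] 2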
def get_grouped_questions(self, trainSet, simThreshold):
    # {id: [list of similar questions, where each item is a list of
    #       covered tokens in the question]}
    grouped_questions = defaultdict(list)
    questions_type = defaultdict(lambda: defaultdict(int))
    grouped_questions_cat = defaultdict(set)
    for d in trainSet:
        cur_segment = self.segmenter.segment(d.getTextObject())
        for qap in cur_segment:
            qid = len(grouped_questions.keys())
            cur_q_tokens = d.getTextObject().get_covered_tokens(
                qap.begQue, qap.endQue)
            if any(cur_q_tokens in val
                   for val in grouped_questions.values()):
                continue
            qVec = Resources.getWordVectors().vectorize(cur_q_tokens,
                                                        remove_oov=True)
            if not qVec:
                continue
            norm_q_vec = Reach.normalize(np.mean(qVec, axis=0))
            k = self.get_grouped_qid(norm_q_vec, grouped_questions,
                                     simThreshold)
            if k is not None:
                qid = k
            grouped_questions[qid].append(cur_q_tokens)
            ansType, cat = self.get_ans_type(qap.answers)
            if not ansType:
                continue
            questions_type[qid][ansType] += 1
            if cat:
                grouped_questions_cat[qid].add(cat)
    return (grouped_questions, questions_type, grouped_questions_cat)
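
# Hypothetical example of consuming the returned triple: read off the
# majority answer type per question group from questions_type.
from collections import defaultdict

questions_type = defaultdict(lambda: defaultdict(int))
questions_type[0]["date"] += 3
questions_type[0]["number"] += 1
questions_type[1]["text"] += 2
majority = {qid: max(counts, key=counts.get)
            for qid, counts in questions_type.items()}
print(majority)  # {0: 'date', 1: 'text'}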