# Module-level imports assumed by the methods below; `Resources` is a
# project-specific helper and is expected to be importable elsewhere in the package.
import math
from collections import defaultdict

import numpy as np
from scipy import spatial
from reach import Reach


def get_grouped_qid(self, norm_q_vec, grouped_questions, simThreshold):
    '''Return the id of the first group containing a question whose averaged,
    normalised vector has cosine similarity >= simThreshold with norm_q_vec,
    or None if no group qualifies.'''
    for k, q_tokens_list in grouped_questions.items():
        for t_list in q_tokens_list:
            t_vec = Resources.getWordVectors().vectorize(t_list, remove_oov=True)
            # Skip questions whose tokens are all out-of-vocabulary.
            if not t_vec:
                continue
            if np.dot(norm_q_vec, Reach.normalize(np.mean(t_vec, axis=0))) >= simThreshold:
                return k
    return None
def get_group_id(self, questions, qTokens, simThreshold=0.9):
    '''Return the id of the most similar question group for qTokens, provided the
    best cosine similarity reaches simThreshold; otherwise return None.'''
    vec = Resources.getWordVectors().vectorize(qTokens, remove_oov=True)
    if not vec:
        return None
    qVec = Reach.normalize(np.mean(vec, axis=0))
    mostSimQ = None
    maxSim = 0.0
    for groupId, groupQTokens in questions.items():
        for cur_q_tokens in groupQTokens:
            cur_vec = self.expSet.getWordVectors().vectorize(cur_q_tokens, remove_oov=True)
            if not cur_vec:
                continue
            curSim = np.dot(qVec, Reach.normalize(np.mean(cur_vec, axis=0)))
            if curSim > maxSim:
                maxSim = curSim
                mostSimQ = groupId
    if maxSim >= simThreshold:
        return mostSimQ
    return None
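# Illustrative sketch (not part of the original module): the two grouping methods
# above rely on averaging token vectors, unit-normalising the mean, and comparing
# questions with a dot product (i.e. cosine similarity). The helper below reproduces
# that core step with plain numpy and a hypothetical {token: vector} lookup, so it
# can be run without the project-specific Resources / expSet wrappers.
def _example_token_set_similarity(tokens_a, tokens_b, word_vectors):
    '''word_vectors is a hypothetical dict mapping tokens to 1-D numpy arrays.'''
    vecs_a = [word_vectors[t] for t in tokens_a if t in word_vectors]
    vecs_b = [word_vectors[t] for t in tokens_b if t in word_vectors]
    if not vecs_a or not vecs_b:
        return None  # all tokens out-of-vocabulary, mirroring the early returns above
    mean_a = np.mean(vecs_a, axis=0)
    mean_b = np.mean(vecs_b, axis=0)
    mean_a = mean_a / np.linalg.norm(mean_a)
    mean_b = mean_b / np.linalg.norm(mean_b)
    return float(np.dot(mean_a, mean_b))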
def calculateAverageWordVectorDistance(self, concepts, filterName='DSM'):
    '''
    This function calculates the average word vector distance, based on the
    words that a concept is encompassed by.
    '''
    feats = dict()
    if filterName == 'ALL':
        feats.update(self.calculateAverageWordVectorDistance(concepts, 'DSM'))
        feats.update(self.calculateAverageWordVectorDistance(concepts, 'DSM+1'))
        feats.update(self.calculateAverageWordVectorDistance(concepts, 'MED'))
    else:
        subset = list(set(self.getSubsetOfConcepts(concepts, filterName)))
        maxDist = 0
        dists = []
        for cui in subset:
            cumulDist = 0
            cnt = 0
            for cui2 in subset:
                try:
                    # Average word vector of each concept's surface words (the part after ';').
                    words = cui.split(';')[1].lower().split(' ')
                    wVector = np.mean(Resources.getWordVectors().vectorize(words, remove_oov=True), axis=0)
                    words = cui2.split(';')[1].lower().split(' ')
                    wVector2 = np.mean(Resources.getWordVectors().vectorize(words, remove_oov=True), axis=0)
                    cosDist = spatial.distance.cosine(wVector, wVector2)
                    if not math.isnan(cosDist):
                        cumulDist += cosDist
                        cnt += 1
                except Exception:
                    continue
            if cnt != 0:
                dists.append(cumulDist / cnt)
                if (cumulDist / cnt) > maxDist:
                    maxDist = cumulDist / cnt
        feats[filterName + 'maxWordVectorDist'] = round(maxDist, 3)
        if dists:
            feats[filterName + 'avgWordVectorDist'] = round(np.mean(dists), 3)
    return feats
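# Illustrative sketch (assumption, not part of the original module): the feature
# computation above boils down to taking, for every concept, the mean cosine distance
# to the concepts in the subset, and then keeping the mean and the maximum of those
# per-concept averages. The standalone helper below reproduces that with numpy/scipy
# on a hypothetical {concept_name: vector} dict.
def _example_word_vector_distance_feats(concept_vectors, prefix='DSM'):
    dists = []
    max_dist = 0.0
    for vec in concept_vectors.values():
        # Like the method above, the self-pair (distance 0) is included in the average.
        pair_dists = [spatial.distance.cosine(vec, other_vec)
                      for other_vec in concept_vectors.values()]
        avg = sum(pair_dists) / len(pair_dists)
        dists.append(avg)
        max_dist = max(max_dist, avg)
    feats = {prefix + 'maxWordVectorDist': round(max_dist, 3)}
    if dists:
        feats[prefix + 'avgWordVectorDist'] = round(float(np.mean(dists)), 3)
    return feats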
def get_grouped_questions(self, trainSet, simThreshold):
    # {id: [list of similar questions, where each item is a list of covered tokens in the question]}
    grouped_questions = defaultdict(list)
    questions_type = defaultdict(lambda: defaultdict(int))
    grouped_questions_cat = defaultdict(set)
    for d in trainSet:
        cur_segment = self.segmenter.segment(d.getTextObject())
        for qap in cur_segment:
            qid = len(grouped_questions)
            cur_q_tokens = d.getTextObject().get_covered_tokens(qap.begQue, qap.endQue)
            # Skip questions already stored verbatim in one of the groups.
            if any(cur_q_tokens in val for val in grouped_questions.values()):
                continue
            qVec = Resources.getWordVectors().vectorize(cur_q_tokens, remove_oov=True)
            if not qVec:
                continue
            norm_q_vec = Reach.normalize(np.mean(qVec, axis=0))
            # Reuse an existing group if a sufficiently similar question is found.
            k = self.get_grouped_qid(norm_q_vec, grouped_questions, simThreshold)
            if k is not None:
                qid = k
            grouped_questions[qid].append(cur_q_tokens)
            ansType, cat = self.get_ans_type(qap.answers)
            if not ansType:
                continue
            questions_type[qid][ansType] += 1
            if cat:
                grouped_questions_cat[qid].add(cat)
    return (grouped_questions, questions_type, grouped_questions_cat)
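# Illustrative sketch (assumption, not from the original module): the three containers
# returned above have the shapes shown below for two toy questions that land in the
# same group. The answer type 'yes/no' and the category 'symptoms' are hypothetical
# placeholders for whatever get_ans_type returns.
def _example_grouped_question_containers():
    grouped_questions = defaultdict(list)
    questions_type = defaultdict(lambda: defaultdict(int))
    grouped_questions_cat = defaultdict(set)
    grouped_questions[0].append(['any', 'chest', 'pain', '?'])
    grouped_questions[0].append(['do', 'you', 'have', 'chest', 'pain', '?'])
    questions_type[0]['yes/no'] += 2
    grouped_questions_cat[0].add('symptoms')
    return grouped_questions, questions_type, grouped_questions_cat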
def getVectorizedWords(self, words):
    '''Return the averaged word vector of `words` as a {dimension-index: value} dict.'''
    return {str(idx): k
            for idx, k in enumerate(np.mean(Resources.getWordVectors().vectorize(words, remove_oov=True), axis=0))}
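# Illustrative usage sketch (assumption, not part of the original module): given a toy
# {token: vector} lookup, the averaged vector is exposed as a feature dict keyed by the
# stringified dimension index, mirroring getVectorizedWords above.
def _example_vectorized_words_feature():
    toy_vectors = {'chest': np.array([0.25, 0.5]), 'pain': np.array([0.75, 0.0])}
    mean_vec = np.mean([toy_vectors[w] for w in ('chest', 'pain')], axis=0)
    return {str(idx): float(val) for idx, val in enumerate(mean_vec)}  # {'0': 0.5, '1': 0.25}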