コード例 #1
0
    def get_grouped_qid(self, norm_q_vec, grouped_questions, simThreshold):
        """Return the id of the first group holding a question similar to the query.

        Every stored token list is vectorized with ``remove_oov=True``, averaged
        over axis 0 and passed through ``Reach.normalize``; the dot product of
        that vector with ``norm_q_vec`` is then compared to ``simThreshold``.

        Args:
            norm_q_vec: Vector of the query question. Presumably already
                normalized by the caller, so that the dot product behaves as
                a cosine similarity (confirm against the caller).
            grouped_questions: Mapping of group id to a list of token lists.
            simThreshold: Minimum dot product required for a match.

        Returns:
            The key of the first group (in iteration order) containing a
            question whose similarity reaches ``simThreshold``, or ``None``
            when no stored question qualifies.
        """
        for group_id, token_lists in grouped_questions.items():
            for tokens in token_lists:
                # Vectorize once and reuse the result for both the emptiness
                # check and the similarity computation.
                vectors = Resources.getWordVectors().vectorize(
                    tokens, remove_oov=True)
                if not vectors:
                    # Nothing was vectorized for this question, so there is
                    # no vector to compare against.
                    continue
                similarity = np.dot(
                    norm_q_vec, Reach.normalize(np.mean(vectors, axis=0)))
                if similarity >= simThreshold:
                    return group_id

        return None
コード例 #2
0
    def get_group_id(self, questions, qTokens, simThreshold=0.9):
        """Find the group whose stored question is most similar to ``qTokens``.

        The query tokens and every stored token list are vectorized with
        ``remove_oov=True``, averaged over axis 0 and passed through
        ``Reach.normalize``; candidates are scored by the dot product with
        the query vector.

        Args:
            questions: Mapping of group id to a list of token lists.
            qTokens: Tokens of the question to look up.
            simThreshold: Minimum score the best candidate must reach.

        Returns:
            The id of the group holding the best-scoring question when that
            score is at least ``simThreshold``; ``None`` otherwise, including
            when the query tokens yield no vectors.
        """
        query_vectors = Resources.getWordVectors().vectorize(qTokens,
                                                             remove_oov=True)
        if not query_vectors:
            return None
        query_vec = Reach.normalize(np.mean(query_vectors, axis=0))

        best_group = None
        best_score = 0.0
        # NOTE(review): the query is embedded through ``Resources`` while the
        # stored questions go through ``self.expSet`` -- confirm that both
        # expose the same word vectors.
        for group_id, token_lists in questions.items():
            for tokens in token_lists:
                candidate_vectors = self.expSet.getWordVectors().vectorize(
                    tokens, remove_oov=True)
                if candidate_vectors:
                    candidate_vec = Reach.normalize(
                        np.mean(candidate_vectors, axis=0))
                    score = np.dot(query_vec, candidate_vec)
                    # Strict comparison: on ties the earliest candidate wins.
                    if score > best_score:
                        best_score = score
                        best_group = group_id

        return best_group if best_score >= simThreshold else None
コード例 #3
0
ファイル: UMLSFeatures.py プロジェクト: aascode/rdocChallenge
 def calculateAverageWordVectorDistance(self, concepts, filterName='DSM'):
     '''
     Compute word-vector distance features for a filtered subset of concepts.

     Each concept in the subset is a string whose second ';'-separated field
     is lower-cased, split on spaces and turned into one vector (the mean of
     its word vectors, out-of-vocabulary words removed). For every concept
     the cosine distance to every concept of the subset (itself included) is
     averaged, skipping pairs that fail or yield NaN.

     concepts   -- concepts handed to self.getSubsetOfConcepts
     filterName -- subset filter; 'ALL' merges the features obtained for
                   'DSM', 'DSM+1' and 'MED'

     Returns a dict with '<filterName>maxWordVectorDist' (largest per-concept
     average, rounded to 3 decimals, 0 when none could be computed) and, when
     at least one average exists, '<filterName>avgWordVectorDist' (mean of the
     per-concept averages, rounded to 3 decimals).
     '''
     feats = dict()
     if filterName == 'ALL':
         for name in ('DSM', 'DSM+1', 'MED'):
             feats.update(self.calculateAverageWordVectorDistance(concepts, name))
         return feats

     subset = list(set(self.getSubsetOfConcepts(concepts, filterName)))

     # Embed every concept exactly once instead of once per pair. A concept
     # that cannot be embedded (missing ';' field, vectorizer error, ...) is
     # kept as None so that all pairs involving it are skipped (best effort).
     vectors = []
     for cui in subset:
         try:
             words = cui.split(';')[1].lower().split(' ')
             vectors.append(np.mean(
                 Resources.getWordVectors().vectorize(words, remove_oov=True),
                 axis=0))
         except Exception:
             vectors.append(None)

     maxDist = 0
     dists = []
     for wVector in vectors:
         if wVector is None:
             continue
         cumulDist = 0
         cnt = 0
         for wVector2 in vectors:
             if wVector2 is None:
                 continue
             try:
                 cosDist = spatial.distance.cosine(wVector, wVector2)
                 if math.isnan(cosDist):
                     continue
             except Exception:
                 # Unusable pair; ignore it and keep going.
                 continue
             cumulDist += cosDist
             cnt += 1

         # No usable pair for this concept: it contributes nothing.
         if cnt == 0:
             continue
         avgDist = cumulDist / cnt
         dists.append(avgDist)
         if avgDist > maxDist:
             maxDist = avgDist

     feats[filterName + 'maxWordVectorDist'] = round(maxDist, 3)
     if dists:
         feats[filterName + 'avgWordVectorDist'] = round(np.mean(dists), 3)
     return feats
コード例 #4
0
    def get_grouped_questions(self, trainSet, simThreshold):
        """Group the questions found in ``trainSet`` by embedding similarity.

        Every document is segmented with ``self.segmenter``; for each segment
        item the question tokens are vectorized (``remove_oov=True``),
        averaged and normalized. The question joins the group returned by
        ``self.get_grouped_qid`` or, when that returns ``None``, opens a new
        group whose id is the current number of groups. Questions whose exact
        token list is already stored, or that yield no vectors, are skipped.

        Args:
            trainSet: Iterable of documents exposing ``getTextObject()``.
            simThreshold: Similarity threshold forwarded to
                ``self.get_grouped_qid``.

        Returns:
            A tuple ``(grouped_questions, questions_type,
            grouped_questions_cat)``: group id -> list of token lists,
            group id -> answer type -> count, and group id -> set of
            categories.
        """
        # group id -> token lists of the questions gathered in that group
        groups = defaultdict(list)
        type_counts = defaultdict(lambda: defaultdict(int))
        group_categories = defaultdict(set)

        for doc in trainSet:
            for qap in self.segmenter.segment(doc.getTextObject()):
                # Id used if the question ends up opening a new group.
                fresh_id = len(groups)
                tokens = doc.getTextObject().get_covered_tokens(
                    qap.begQue, qap.endQue)

                # Skip questions whose token list is already stored.
                is_duplicate = False
                for members in groups.values():
                    if tokens in members:
                        is_duplicate = True
                        break
                if is_duplicate:
                    continue

                vectors = Resources.getWordVectors().vectorize(
                    tokens, remove_oov=True)
                if not vectors:
                    continue
                query_vec = Reach.normalize(np.mean(vectors, axis=0))

                matched = self.get_grouped_qid(query_vec, groups,
                                               simThreshold)
                group_id = fresh_id if matched is None else matched

                groups[group_id].append(tokens)
                ans_type, category = self.get_ans_type(qap.answers)

                # The category is only recorded when an answer type exists.
                if ans_type:
                    type_counts[group_id][ans_type] += 1
                    if category:
                        group_categories[group_id].add(category)

        return (groups, type_counts, group_categories)
コード例 #5
0
ファイル: UMLSFeatures.py プロジェクト: aascode/rdocChallenge
    def getVectorizedWords(self, words):
        """Return the mean word vector of ``words`` as a dict of components.

        The words are vectorized with ``remove_oov=True`` and averaged over
        axis 0; each component of the resulting vector is stored under its
        position converted to a string ('0', '1', ...).
        """
        word_vectors = Resources.getWordVectors().vectorize(words, remove_oov=True)
        mean_vector = np.mean(word_vectors, axis=0)

        components = {}
        for position, value in enumerate(mean_vector):
            components[str(position)] = value
        return components