Esempio n. 1
0
    def add_context_vec(self,
                        cui,
                        context_vec,
                        negative=False,
                        cntx_type='LONG',
                        inc_cui_count=True,
                        anneal=True,
                        lr=0.5):
        """ Add the vector representation of a context for this CUI

        The vector stored for `cui` is interpolated towards `context_vec`
        (positive example) or away from it (negative example). The step size
        depends on `lr` and on the cosine similarity of the two vectors.

        cui:  The concept in question
        context_vec:  Vector representation of the context
        negative:  Is this a negative context or a positive one
        cntx_type:  One of 'MED', 'SHORT' or 'LONG', selects which of the
                     context vector stores on this object is updated
        inc_cui_count:  should this example be counted for the CUI
        anneal:  if True the learning rate is divided by the current count
                  of the CUI (never going below 0.0005)
        lr:  base learning rate

        Returns the similarity between `context_vec` and the vector that was
        stored before the update, or 0 if nothing was stored for the CUI.

        Raises ValueError if `cntx_type` is not one of the supported values.
        """
        # Resolve the target store first, so that an unsupported cntx_type
        # fails before any counter or vector has been modified.
        if cntx_type == 'MED':
            cui2context_vec = self.cui2context_vec
        elif cntx_type == 'SHORT':
            cui2context_vec = self.cui2context_vec_short
        elif cntx_type == 'LONG':
            cui2context_vec = self.cui2context_vec_long
        else:
            raise ValueError(
                "Unsupported cntx_type: {!r}, expected one of 'MED', 'SHORT', 'LONG'".format(cntx_type))

        if cui not in self.cui_count:
            self.increase_cui_count(cui, True)

        # Positive contexts at least this similar to the stored one are ignored
        prob = 0.95

        cv = context_vec
        if cui not in cui2context_vec:
            # First context seen for this CUI in this store, keep it as is
            # (mirrored when it is a negative example)
            cui2context_vec[cui] = -1 * cv if negative else cv
            self.increase_cui_count(cui, inc_cui_count)
            return 0

        sim = np.dot(unitvec(cv), unitvec(cui2context_vec[cui]))
        if anneal:
            # The more often a CUI was seen the smaller the update
            lr = max(lr / self.cui_count[cui], 0.0005)

        if negative:
            # Push away, proportionally to how similar the vectors are
            b = max(0, sim) * lr
            cui2context_vec[cui] = cui2context_vec[cui] * (1 - b) - cv * b
        elif sim < prob:
            # Pull closer, proportionally to how different the vectors are
            b = (1 - max(0, sim)) * lr
            cui2context_vec[cui] = cui2context_vec[cui] * (1 - b) + cv * b

            # Only positive examples that caused an update are counted
            self.increase_cui_count(cui, inc_cui_count)

        return sim
    def _similarity(self, cui, vectors):
        r''' Calculate similarity once we have vectors and a cui.

        Args:
            cui
            vectors
        '''

        cui_vectors = self.cdb.cui2context_vectors.get(cui, {})

        if cui_vectors and self.cdb.cui2count_train[cui] >= self.config.linking['train_count_threshold']:
            similarity = 0
            for context_type in self.config.linking['context_vector_weights']:
                # Can be that a certain context_type does not exist for a cui/context
                if context_type in vectors and context_type in cui_vectors:
                    weight = self.config.linking['context_vector_weights'][context_type]
                    s = np.dot(unitvec(vectors[context_type]), unitvec(cui_vectors[context_type]))
                    similarity += weight * s

                    # DEBUG
                    self.log.debug("Similarity for CUI: {}, Count: {}, Context Type: {:10}, Weight: {:.2f}, Similarity: {:.3f}, S*W: {:.3f}".format(
                        cui, self.cdb.cui2count_train[cui], context_type, weight, s, s*weight))
            return similarity
        else:
            return -1
Esempio n. 3
0
    def update_context_vector(self, cui, vectors, negative=False, lr=None, cui_count=0):
        r''' Add the vector representation of a context for this CUI.

        For every context type in `vectors` the stored vector is interpolated
        towards the new one (positive example) or away from it (negative
        example). A context type seen for the first time is stored as is
        (mirrored for a negative example).

        cui (`str`):
            The concept in question.
        vectors (`Dict[str, np.array]`):
            Vector representation of the context, must have the format: {'context_type': np.array(<vector>), ...}
            context_type - is usually one of: ['long', 'medium', 'short']
        negative (`bool`, defaults to `False`):
            Is this a negative context or a positive one.
        lr (`float`, optional):
            If set it will override the base value from the config file. If not set, the value
            calculated for the first already known context type is reused for the remaining ones.
        cui_count (`int`, defaults to 0):
            The learning rate will be calculated based on the count for the provided CUI + cui_count.
        '''
        if cui not in self.cui2context_vectors:
            # Create the entries for this CUI only, every other concept in the
            # two maps has to stay untouched
            self.cui2context_vectors[cui] = {}
            self.cui2count_train[cui] = 0

        cui_vectors = self.cui2context_vectors[cui]
        for context_type, vector in vectors.items():
            if context_type not in cui_vectors:
                # Nothing to interpolate with yet, store the vector directly
                cui_vectors[context_type] = -1 * vector if negative else vector

                # DEBUG
                self.log.debug("Added new context type with vectors.\n"
                        "CUI: {}, Context Type: {}, Is Negative: {}".format(cui, context_type, negative))
                continue

            cv = cui_vectors[context_type]
            similarity = np.dot(unitvec(cv), unitvec(vector))

            # Get the learning rate if None
            if lr is None:
                lr = get_lr_linking(self.config, self.cui2count_train[cui] + cui_count, self._optim_params, similarity)

            if negative:
                # Push away, proportionally to how similar the vectors are
                b = max(0, similarity) * lr
                cui_vectors[context_type] = cv*(1-b) - vector*b
            else:
                # Pull closer, proportionally to how different the vectors are
                b = (1 - max(0, similarity)) * lr
                cui_vectors[context_type] = cv*(1-b) + vector*b

            # DEBUG
            self.log.debug("Updated vector embedding.\n"
                    "CUI: {}, Context Type: {}, Similarity: {:.2f}, Is Negative: {}, LR: {:.5f}, b: {:.3f}".format(cui, context_type,
                        similarity, negative, lr, b))
            similarity_after = np.dot(unitvec(cui_vectors[context_type]), unitvec(vector))
            self.log.debug("Similarity before vs after: {:.5f} vs {:.5f}".format(similarity, similarity_after))

        if not negative:
            # Increase counter only for positive examples
            self.cui2count_train[cui] += 1
Esempio n. 4
0
    def add_ncontext_vec(self, cui, ncontext_vec):
        """ Update the vector kept in `cui2ncontext_vec` for this CUI

        Nothing happens for a CUI that is not present in `cui_count`. The
        first vector seen for a CUI is stored as is, later ones are blended
        into the stored vector.

        cui:  The concept in question
        ncontext_vec:  Vector representation of the context
        """
        store = self.cui2ncontext_vec

        if cui not in self.cui_count:
            return

        if cui not in store:
            store[cui] = ncontext_vec
            return

        current = store[cui]
        sim = np.dot(unitvec(ncontext_vec), unitvec(current))
        # The step shrinks with the CUI count (floor of 0.001) and with
        # the similarity of the new vector to the stored one
        floor = 0.001
        step = max((0.1 / self.cui_count[cui]), floor) * (1 - max(0, sim))
        store[cui] = current * (1 - step) + ncontext_vec * step
Esempio n. 5
0
    def _add_cntx_vec(self, cui, doc, tkns, negative=False, lr=None, anneal=None):
        """ Add context vectors for this CUI

        cui:  concept id
        doc:  spacy document where the cui was found
        tkns:  tokens that were found for this cui
        """
        if lr is None:
            lr = self.LR
        if anneal is None:
            anneal = self.ANNEAL

        if negative:
            self.cdb.cui_disamb_always[cui] = True

        # Get words around this concept
        words = self._get_doc_words(doc, tkns, span=self.CNTX_SPAN, skip_words=True, skip_current=False)
        words_short = self._get_doc_words(doc, tkns, span=self.CNTX_SPAN_SHORT, skip_current=True)

        cntx_vecs = []
        for word in words:
            if word in self.vocab and self.vocab.vec(word) is not None:
                cntx_vecs.append(self.vocab.vec(word))

        cntx_vecs_short = []
        for word in words_short:
            if word in self.vocab and self.vocab.vec(word) is not None:
                cntx_vecs_short.append(self.vocab.vec(word))

        if len(cntx_vecs) > 0:
            cntx = np.average(cntx_vecs, axis=0)
            # Add context vectors only if we have some
            self.cdb.add_context_vec(cui, cntx, cntx_type='MED', negative=negative, lr=lr,
                                     anneal=anneal)

        if len(cntx_vecs_short) > 0:
            cntx_short = np.average(cntx_vecs_short, axis=0)
            # Add context vectors only if we have some
            self.cdb.add_context_vec(cui, cntx_short, cntx_type='SHORT', inc_cui_count=False,
                    negative=negative, lr=lr, anneal=anneal)

        if np.random.rand() < self.NEG_PROB and not negative:
            # Add only if probability and 'not' negative input
            negs = self.vocab.get_negative_samples(n=self.CNTX_SPAN * 2)
            neg_cntx_vecs = [self.vocab.vec(self.vocab.index2word[x]) for x in negs]
            neg_cntx = np.average(neg_cntx_vecs, axis=0)
            self.cdb.add_context_vec(cui, neg_cntx, negative=True, cntx_type='MED',
                                      inc_cui_count=False)

        #### DEBUG ONLY ####
        if self.DEBUG:
            if cui in self.cdb.cui2context_vec and len(cntx_vecs) > 0:
                if np.dot(unitvec(cntx), unitvec(self.cdb.cui2context_vec[cui])) < 0.01:
                    log.debug("SIMILARITY MED::::::::::::::::::::")
                    log.debug(words)
                    log.debug(cui)
                    log.debug(tkns)
                    log.debug(np.dot(unitvec(cntx),
                              unitvec(self.cdb.cui2context_vec[cui])))
                    log.debug(":::::::::::::::::::::::::::::::::::\n")

            if cui in self.cdb.cui2context_vec_short and len(cntx_vecs_short) > 0:
                if np.dot(unitvec(cntx_short), unitvec(self.cdb.cui2context_vec_short[cui])) < 0.01:
                    log.debug("SIMILARITY SHORT::::::::::::::::::::")
                    log.debug(words_short)
                    log.debug(cui)
                    log.debug(tkns)
                    log.debug(np.dot(unitvec(cntx_short),
                              unitvec(self.cdb.cui2context_vec[cui])))
                    log.debug(":::::::::::::::::::::::::::::::::::\n")
Esempio n. 6
0
    def _calc_acc(self, cui, doc, tkns, name=None):
        """ Calculate the accuracy for an annotation

        cui:  concept id
        doc:  spacy document
        tkns:  tokens for the concept that was found
        name:  concept name
        """
        cntx = None
        cntx_short = None
        words = self._get_doc_words(doc, tkns, span=self.CNTX_SPAN, skip_words=True, skip_current=False)
        words_short = self._get_doc_words(doc, tkns, span=self.CNTX_SPAN_SHORT, skip_current=False)

        cntx_vecs = []
        for word in words:
            if word in self.vocab and self.vocab.vec(word) is not None:
                cntx_vecs.append(self.vocab.vec(word))

        cntx_vecs_short = []
        for word in words_short:
            if word in self.vocab and self.vocab.vec(word) is not None:
                cntx_vecs_short.append(self.vocab.vec(word))

        if len(cntx_vecs_short) > 0:
            cntx_short = np.average(cntx_vecs_short, axis=0)

        if len(cntx_vecs) > 0:
            cntx = np.average(cntx_vecs, axis=0)

        #### DEBUG ONLY ####
        if self.DEBUG:
            if cui in self.cdb.cui2context_vec and len(cntx_vecs) > 0:
                log.debug("SIMILARITY MED::::::::::::::::::::")
                log.debug(words)
                log.debug(cui)
                log.debug(tkns)
                log.debug(np.dot(unitvec(cntx),
                          unitvec(self.cdb.cui2context_vec[cui])))
                log.debug(":::::::::::::::::::::::::::::::::::\n")

            if cui in self.cdb.cui2context_vec_short and len(cntx_vecs_short) > 0:
                log.debug("SIMILARITY SHORT::::::::::::::::::::")
                log.debug(words_short)
                log.debug(cui)
                log.debug(tkns)
                log.debug(np.dot(unitvec(cntx_short),
                          unitvec(self.cdb.cui2context_vec_short[cui])))
                log.debug(":::::::::::::::::::::::::::::::::::\n")
        #### END OF DEBUG ####

        if cui in self.cdb.cui2context_vec and len(cntx_vecs) > 0:
            sim = np.dot(unitvec(cntx), unitvec(self.cdb.cui2context_vec[cui]))

            if cui in self.cdb.cui2context_vec_short and len(cntx_vecs_short) > 0:
                sim2 = np.dot(unitvec(cntx_short), unitvec(self.cdb.cui2context_vec_short[cui]))
                if sim2 > 0 and abs(sim - sim2) > 0.1:
                    sim = (sim + sim2) / 2
            if name is not None:
                if cui in self.cdb.cui2pref_name and sim > self.MIN_ACC:
                    if name == self.cdb.cui2pref_name[cui]:
                        sim = min(1, sim + 0.3)
            return sim
        else:
            return -1
Esempio n. 7
0
    def most_similar(self, cui, tui_filter=None, min_cnt=0, topn=50):
        r'''
        Given a concept it will calculate what other concepts in this CDB have the most similar
        embedding.

        The matrix of unit vectors is cached on the object and only rebuilt when it is missing
        or has fewer rows than `cui2context_vec` has entries. Changes to already existing
        vectors are therefore not reflected until the cache is rebuilt.

        Args:
            cui (str):
                The concept ID for the base concept for which you want to get the most similar concepts.
            tui_filter (list):
                A list of TUIs that will be used to filterout the returned results. Using this it is possible
                to limit the similarity calculation to only disorders/symptoms/drugs/...
                None or an empty list means no filtering.
            min_cnt (int):
                Minimum training examples (unsupervised+supervised) that a concept must have to be considered
                for the similarity calculation.
            topn (int):
                How many results to return

        Return:
            results (dict):
                A dictionary with topn results like: {<cui>: {'name': <name>, 'sim': <similarity>, 'tui_name': <tui_name>,
                                                              'tui': <tui>, 'cnt': <number of training examples the concept has seen>}, ...}

        Raises:
            KeyError:
                If `cui` has no entry in `cui2context_vec`.
        '''
        if tui_filter is None:
            tui_filter = []

        # Create the matrix if necessary
        if getattr(self, 'sim_vectors', None) is None or len(self.sim_vectors) < len(self.cui2context_vec):
            log.info("Building similarity matrix")

            sim_vectors = []
            sim_vectors_counts = []
            sim_vectors_tuis = []
            sim_vectors_cuis = []
            for _cui in self.cui2context_vec:
                sim_vectors.append(unitvec(self.cui2context_vec[_cui]))
                sim_vectors_counts.append(self.cui_count[_cui])
                sim_vectors_tuis.append(self.cui2tui.get(_cui, 'unk'))
                sim_vectors_cuis.append(_cui)

            self.sim_vectors = np.array(sim_vectors)
            self.sim_vectors_counts = np.array(sim_vectors_counts)
            self.sim_vectors_tuis = np.array(sim_vectors_tuis)
            self.sim_vectors_cuis = np.array(sim_vectors_cuis)

        # Select appropriate concepts
        tui_inds = np.arange(0, len(self.sim_vectors_tuis))
        if len(tui_filter) > 0:
            tui_inds = np.array([], dtype=np.int32)
            for tui in tui_filter:
                tui_inds = np.union1d(np.where(self.sim_vectors_tuis == tui)[0], tui_inds)
        cnt_inds = np.arange(0, len(self.sim_vectors_counts))
        if min_cnt > 0:
            cnt_inds = np.where(self.sim_vectors_counts >= min_cnt)[0]

        # Intersect cnt and tui
        inds = np.intersect1d(tui_inds, cnt_inds)

        mtrx = self.sim_vectors[inds]
        cuis = self.sim_vectors_cuis[inds]

        # Rows are unit vectors, so the dot product is the cosine similarity
        sims = np.dot(mtrx, unitvec(self.cui2context_vec[cui]))

        # Positions of the topn most similar concepts, best first. Selected
        # once here instead of re-indexing the whole array for every result.
        top = np.argsort(-1*sims)[0:topn]

        # Create the return dict
        res = {}
        for _cui, sim in zip(cuis[top], sims[top]):
            tui = self.cui2tui.get(_cui, 'unk')
            res[_cui] = {'name': self.cui2pretty_name[_cui], 'sim': sim,
                         'tui_name': self.tui2name.get(tui, 'unk'),
                         'tui': tui,
                         'cnt': self.cui_count[_cui]}

        return res
Esempio n. 8
0
    def most_similar(self, cui, context_type, type_id_filter=None, min_cnt=0, topn=50, force_build=False):
        r''' Given a concept it will calculate what other concepts in this CDB have the most similar
        embedding.

        The matrix of unit vectors is cached per context type in `addl_info['similarity']`
        and reused until `force_build` is set.

        Args:
            cui (`str`):
                The concept ID for the base concept for which you want to get the most similar concepts.
            context_type (`str`):
                On what vector type from the cui2context_vectors map will the similarity be calculated.
            type_id_filter (`List[str]`):
                A list of type_ids that will be used to filterout the returned results. Using this it is possible
                to limit the similarity calculation to only disorders/symptoms/drugs/...
                None or an empty list means no filtering.
            min_cnt (`int`):
                Minimum training examples (unsupervised+supervised) that a concept must have to be considered
                for the similarity calculation.
            topn (`int`):
                How many results to return
            force_build (`bool`, defaults to `False`):
                Do not use cached sim matrix

        Return:
            results (Dict):
                A dictionary with topn results like: {<cui>: {'name': <name>, 'sim': <similarity>, 'type_names': <type_names>,
                                                              'type_ids': <type_ids>, 'cnt': <number of training examples the concept has seen>}, ...}

        Raises:
            KeyError:
                If `cui` has no vector of the requested `context_type`.
        '''
        if type_id_filter is None:
            type_id_filter = []

        sim_data = self.addl_info.setdefault('similarity', {}).setdefault(context_type, {})

        # Create the matrix if necessary
        if 'sim_vectors' not in sim_data or force_build:
            self.log.info("Building similarity matrix")

            sim_vectors = []
            sim_vectors_counts = []
            sim_vectors_type_ids = []
            sim_vectors_cuis = []
            for _cui in self.cui2context_vectors:
                if context_type in self.cui2context_vectors[_cui]:
                    sim_vectors.append(unitvec(self.cui2context_vectors[_cui][context_type]))
                    sim_vectors_counts.append(self.cui2count_train.get(_cui, 0))
                    sim_vectors_type_ids.append(self.cui2type_ids.get(_cui, {'unk'}))
                    sim_vectors_cuis.append(_cui)

            sim_data['sim_vectors'] = np.array(sim_vectors)
            sim_data['sim_vectors_counts'] = np.array(sim_vectors_counts)
            sim_data['sim_vectors_type_ids'] = np.array(sim_vectors_type_ids)
            sim_data['sim_vectors_cuis'] = np.array(sim_vectors_cuis)

        # Select appropriate concepts
        type_id_inds = np.arange(0, len(sim_data['sim_vectors_type_ids']))
        if len(type_id_filter) > 0:
            # A concept is kept if it has at least one of the requested type ids.
            # The dtype is set explicitly, an empty selection would otherwise be
            # a float array that cannot be used as an index further down.
            type_id_inds = np.array(
                [ind for ind, type_ids in enumerate(sim_data['sim_vectors_type_ids'])
                 if any(type_id in type_ids for type_id in type_id_filter)],
                dtype=np.int64)
        cnt_inds = np.arange(0, len(sim_data['sim_vectors_counts']))
        if min_cnt > 0:
            cnt_inds = np.where(sim_data['sim_vectors_counts'] >= min_cnt)[0]
        # Intersect cnt and type_id
        inds = np.intersect1d(type_id_inds, cnt_inds)

        mtrx = sim_data['sim_vectors'][inds]
        cuis = sim_data['sim_vectors_cuis'][inds]

        # Rows are unit vectors, so the dot product is the cosine similarity
        sims = np.dot(mtrx, unitvec(self.cui2context_vectors[cui][context_type]))

        # Positions of the topn most similar concepts, best first. Selected
        # once here instead of re-indexing the whole array for every result.
        top = np.argsort(-1*sims)[0:topn]

        # Create the return dict
        res = {}
        for _cui, sim in zip(cuis[top], sims[top]):
            res[_cui] = {'name': self.cui2preferred_name.get(_cui, list(self.cui2names[_cui])[0]), 'sim': sim,
                         'type_names': [self.addl_info['type_id2name'].get(type_id, 'unk')
                                        for type_id in self.cui2type_ids.get(_cui, ['unk'])],
                         'type_ids': self.cui2type_ids.get(_cui, 'unk'),
                         'cnt': self.cui2count_train.get(_cui, 0)}

        return res
Esempio n. 9
0
    def add_context_vec(self, cui, context_vec, negative=False, cntx_type='LONG', inc_cui_count=True, manual=False):
        """ Add the vector representation of a context for this CUI

        The vector stored for `cui` is interpolated towards `context_vec`
        (positive example) or away from it (negative example). The step size
        depends on the count of the CUI, or is fixed for manual annotations,
        and on the cosine similarity of the two vectors.

        cui:  The concept in question
        context_vec:  Vector representation of the context
        negative:  Is this a negative context or a positive one
        cntx_type:  One of 'MED', 'SHORT' or 'LONG', selects which of the
                     context vector stores on this object is updated
        inc_cui_count:  should this example be counted for the CUI
        manual:  the example was manually annotated, a fixed and higher
                  learning rate is used instead of the count based one

        Returns the similarity between `context_vec` and the vector that was
        stored before the update, or 0 if nothing was stored for the CUI.

        Raises ValueError if `cntx_type` is not one of the supported values.
        """
        # Resolve the target store first, so that an unsupported cntx_type
        # fails before any counter or vector has been modified.
        if cntx_type == 'MED':
            cui2context_vec = self.cui2context_vec
        elif cntx_type == 'SHORT':
            cui2context_vec = self.cui2context_vec_short
        elif cntx_type == 'LONG':
            cui2context_vec = self.cui2context_vec_long
        else:
            raise ValueError(
                "Unsupported cntx_type: {!r}, expected one of 'MED', 'SHORT', 'LONG'".format(cntx_type))

        if cui not in self.cui_count:
            self.increase_cui_count(cui, True, manual=manual)

        # Positive contexts at least this similar to the stored one are ignored
        prob = 0.95

        cv = context_vec
        if cui not in cui2context_vec:
            # First context seen for this CUI in this store, keep it as is
            # (mirrored when it is a negative example)
            cui2context_vec[cui] = -cv if negative else cv
            self.increase_cui_count(cui, inc_cui_count, manual)
            return 0

        sim = np.dot(unitvec(cv), unitvec(cui2context_vec[cui]))

        if negative:
            if not manual:
                b = max((0.2 / self.cui_count[cui]), 0.0001) * max(0, sim)
            else:
                # Means someone manually annotated the example, use high learning rate
                b = 0.1 * max(0, sim)
            cui2context_vec[cui] = cui2context_vec[cui]*(1-b) - cv*b
        elif sim < prob:
            if not manual:
                # Annotation is from Unsupervised learning
                c = 0.001
                b = max((0.5 / self.cui_count[cui]), c) * (1 - max(0, sim))
            else:
                # Means someone manually annotated the example, use high learning rate
                b = 0.1 * (1 - max(0, sim))

            cui2context_vec[cui] = cui2context_vec[cui]*(1-b) + cv*b

            # Increase cui count
            # NOTE(review): `manual` is not forwarded here although the other
            # two calls in this method pass it - confirm this is intended
            self.increase_cui_count(cui, inc_cui_count)

        return sim