def add_context_vec(self, cui, context_vec, negative=False, cntx_type='LONG', inc_cui_count=True, anneal=True, lr=0.5):
    """ Add the vector representation of a context for this CUI

    cui:  The concept in question
    context_vec:  Vector representation of the context
    negative:  Is this a negative context or a positive one
    cntx_type:  One of 'MED', 'SHORT' or 'LONG', selects which per-CUI
                context vector map is updated
    inc_cui_count:  Passed on to `increase_cui_count` when a new vector is
                stored or a positive update is applied
    anneal:  If True the learning rate is divided by the current count of
                this CUI, but is never allowed below 0.0005
    lr:  Base learning rate used when an existing vector is updated

    return:  np.dot(unitvec(context_vec), unitvec(<vector stored before the
                update>)), or 0 if no vector was stored for this CUI
    raises:  ValueError if `cntx_type` is not one of the supported values
    """
    # Pick the target map before touching any state, so an unsupported type
    # fails cleanly instead of with an UnboundLocalError further down.
    if cntx_type == 'MED':
        cui2context_vec = self.cui2context_vec
    elif cntx_type == 'SHORT':
        cui2context_vec = self.cui2context_vec_short
    elif cntx_type == 'LONG':
        cui2context_vec = self.cui2context_vec_long
    else:
        raise ValueError(
            "Unsupported cntx_type: {!r}, expected one of 'MED', 'SHORT', 'LONG'".format(cntx_type))

    if cui not in self.cui_count:
        self.increase_cui_count(cui, True)

    # Ignore very similar context
    prob = 0.95

    sim = 0
    cv = context_vec
    if cui in cui2context_vec:
        sim = np.dot(unitvec(cv), unitvec(cui2context_vec[cui]))

        if anneal:
            lr = max(lr / self.cui_count[cui], 0.0005)

        if negative:
            # Move the stored vector away from the negative context; the
            # step is proportional to how similar the two currently are.
            b = max(0, sim) * lr
            cui2context_vec[cui] = cui2context_vec[cui] * (1 - b) - cv * b
        elif sim < prob:
            # Move the stored vector towards the context; the step shrinks
            # as the similarity grows.
            b = (1 - max(0, sim)) * lr
            cui2context_vec[cui] = cui2context_vec[cui] * (1 - b) + cv * b

            # Increase cui count
            self.increase_cui_count(cui, inc_cui_count)
    else:
        # First context of this type for the CUI, store it as is
        cui2context_vec[cui] = -1 * cv if negative else cv
        self.increase_cui_count(cui, inc_cui_count)

    return sim
def _similarity(self, cui, vectors): r''' Calculate similarity once we have vectors and a cui. Args: cui vectors ''' cui_vectors = self.cdb.cui2context_vectors.get(cui, {}) if cui_vectors and self.cdb.cui2count_train[cui] >= self.config.linking['train_count_threshold']: similarity = 0 for context_type in self.config.linking['context_vector_weights']: # Can be that a certain context_type does not exist for a cui/context if context_type in vectors and context_type in cui_vectors: weight = self.config.linking['context_vector_weights'][context_type] s = np.dot(unitvec(vectors[context_type]), unitvec(cui_vectors[context_type])) similarity += weight * s # DEBUG self.log.debug("Similarity for CUI: {}, Count: {}, Context Type: {:10}, Weight: {:.2f}, Similarity: {:.3f}, S*W: {:.3f}".format( cui, self.cdb.cui2count_train[cui], context_type, weight, s, s*weight)) return similarity else: return -1
def update_context_vector(self, cui, vectors, negative=False, lr=None, cui_count=0):
    r''' Add the vector representation of a context for this CUI.

    cui (`str`):
        The concept in question.
    vectors (`Dict[str, np.array]`):
        Vector representation of the context, must have the format:
            {'context_type': np.array(<vector>), ...}
            context_type - is usually one of: ['long', 'medium', 'short']
    negative (`bool`, defaults to `False`):
        Is this a negative context or a positive one.
    lr (`float`, optional):
        If set it will override the base value from the config file.
    cui_count (`int`, defaults to 0):
        The learning rate will be calculated based on the count for the provided CUI + cui_count.
    '''
    if cui not in self.cui2context_vectors:
        # Register only this CUI. Both maps are shared by all concepts, so
        # they must be extended, never replaced.
        self.cui2context_vectors[cui] = {}
        self.cui2count_train[cui] = 0

    similarity = None
    for context_type, vector in vectors.items():
        # Get the right context
        if context_type in self.cui2context_vectors[cui]:
            cv = self.cui2context_vectors[cui][context_type]
            similarity = np.dot(unitvec(cv), unitvec(vector))

            # Get the learning rate if None
            # NOTE(review): once computed here, the same lr is reused for the
            # remaining context types of this call - confirm this is intended.
            if lr is None:
                lr = get_lr_linking(self.config, self.cui2count_train[cui] + cui_count, self._optim_params, similarity)

            if negative:
                # Add negative context
                b = max(0, similarity) * lr
                self.cui2context_vectors[cui][context_type] = cv*(1-b) - vector*b
            else:
                b = (1 - max(0, similarity)) * lr
                self.cui2context_vectors[cui][context_type] = cv*(1-b) + vector*b

            # DEBUG
            self.log.debug("Updated vector embedding.\n" +
                    "CUI: {}, Context Type: {}, Similarity: {:.2f}, Is Negative: {}, LR: {:.5f}, b: {:.3f}".format(cui, context_type,
                        similarity, negative, lr, b))
            cv = self.cui2context_vectors[cui][context_type]
            similarity_after = np.dot(unitvec(cv), unitvec(vector))
            self.log.debug("Similarity before vs after: {:.5f} vs {:.5f}".format(similarity, similarity_after))
        else:
            # First vector of this context type for the CUI, store it as is
            if negative:
                self.cui2context_vectors[cui][context_type] = -1 * vector
            else:
                self.cui2context_vectors[cui][context_type] = vector

            # DEBUG
            self.log.debug("Added new context type with vectors.\n" +
                    "CUI: {}, Context Type: {}, Is Negative: {}".format(cui, context_type, negative))

    if not negative:
        # Increase counter only for positive examples
        self.cui2count_train[cui] += 1
def add_ncontext_vec(self, cui, ncontext_vec):
    """ Store or update the vector kept in `cui2ncontext_vec` for this CUI

    cui:  The concept in question
    ncontext_vec:  Vector representation of the context

    Nothing happens for a CUI that is not in `cui_count`. The first vector
    for a CUI is stored unchanged; later ones are blended into the stored
    vector with a step that shrinks with the CUI count (floor of 0.001) and
    with the similarity to the stored vector.
    """
    store = self.cui2ncontext_vec

    # Concepts that were never counted are ignored
    if cui not in self.cui_count:
        return

    if cui not in store:
        store[cui] = ncontext_vec
        return

    current = store[cui]
    sim = np.dot(unitvec(ncontext_vec), unitvec(current))
    floor = 0.001
    step = max((0.1 / self.cui_count[cui]), floor) * (1 - max(0, sim))
    store[cui] = current*(1-step) + ncontext_vec*step
def _add_cntx_vec(self, cui, doc, tkns, negative=False, lr=None, anneal=None): """ Add context vectors for this CUI cui: concept id doc: spacy document where the cui was found tkns: tokens that were found for this cui """ if lr is None: lr = self.LR if anneal is None: anneal = self.ANNEAL if negative: self.cdb.cui_disamb_always[cui] = True # Get words around this concept words = self._get_doc_words(doc, tkns, span=self.CNTX_SPAN, skip_words=True, skip_current=False) words_short = self._get_doc_words(doc, tkns, span=self.CNTX_SPAN_SHORT, skip_current=True) cntx_vecs = [] for word in words: if word in self.vocab and self.vocab.vec(word) is not None: cntx_vecs.append(self.vocab.vec(word)) cntx_vecs_short = [] for word in words_short: if word in self.vocab and self.vocab.vec(word) is not None: cntx_vecs_short.append(self.vocab.vec(word)) if len(cntx_vecs) > 0: cntx = np.average(cntx_vecs, axis=0) # Add context vectors only if we have some self.cdb.add_context_vec(cui, cntx, cntx_type='MED', negative=negative, lr=lr, anneal=anneal) if len(cntx_vecs_short) > 0: cntx_short = np.average(cntx_vecs_short, axis=0) # Add context vectors only if we have some self.cdb.add_context_vec(cui, cntx_short, cntx_type='SHORT', inc_cui_count=False, negative=negative, lr=lr, anneal=anneal) if np.random.rand() < self.NEG_PROB and not negative: # Add only if probability and 'not' negative input negs = self.vocab.get_negative_samples(n=self.CNTX_SPAN * 2) neg_cntx_vecs = [self.vocab.vec(self.vocab.index2word[x]) for x in negs] neg_cntx = np.average(neg_cntx_vecs, axis=0) self.cdb.add_context_vec(cui, neg_cntx, negative=True, cntx_type='MED', inc_cui_count=False) #### DEBUG ONLY #### if self.DEBUG: if cui in self.cdb.cui2context_vec and len(cntx_vecs) > 0: if np.dot(unitvec(cntx), unitvec(self.cdb.cui2context_vec[cui])) < 0.01: log.debug("SIMILARITY MED::::::::::::::::::::") log.debug(words) log.debug(cui) log.debug(tkns) log.debug(np.dot(unitvec(cntx), 
unitvec(self.cdb.cui2context_vec[cui]))) log.debug(":::::::::::::::::::::::::::::::::::\n") if cui in self.cdb.cui2context_vec_short and len(cntx_vecs_short) > 0: if np.dot(unitvec(cntx_short), unitvec(self.cdb.cui2context_vec_short[cui])) < 0.01: log.debug("SIMILARITY SHORT::::::::::::::::::::") log.debug(words_short) log.debug(cui) log.debug(tkns) log.debug(np.dot(unitvec(cntx_short), unitvec(self.cdb.cui2context_vec[cui]))) log.debug(":::::::::::::::::::::::::::::::::::\n")
def _calc_acc(self, cui, doc, tkns, name=None): """ Calculate the accuracy for an annotation cui: concept id doc: spacy document tkns: tokens for the concept that was found name: concept name """ cntx = None cntx_short = None words = self._get_doc_words(doc, tkns, span=self.CNTX_SPAN, skip_words=True, skip_current=False) words_short = self._get_doc_words(doc, tkns, span=self.CNTX_SPAN_SHORT, skip_current=False) cntx_vecs = [] for word in words: if word in self.vocab and self.vocab.vec(word) is not None: cntx_vecs.append(self.vocab.vec(word)) cntx_vecs_short = [] for word in words_short: if word in self.vocab and self.vocab.vec(word) is not None: cntx_vecs_short.append(self.vocab.vec(word)) if len(cntx_vecs_short) > 0: cntx_short = np.average(cntx_vecs_short, axis=0) if len(cntx_vecs) > 0: cntx = np.average(cntx_vecs, axis=0) #### DEBUG ONLY #### if self.DEBUG: if cui in self.cdb.cui2context_vec and len(cntx_vecs) > 0: log.debug("SIMILARITY MED::::::::::::::::::::") log.debug(words) log.debug(cui) log.debug(tkns) log.debug(np.dot(unitvec(cntx), unitvec(self.cdb.cui2context_vec[cui]))) log.debug(":::::::::::::::::::::::::::::::::::\n") if cui in self.cdb.cui2context_vec_short and len(cntx_vecs_short) > 0: log.debug("SIMILARITY SHORT::::::::::::::::::::") log.debug(words_short) log.debug(cui) log.debug(tkns) log.debug(np.dot(unitvec(cntx_short), unitvec(self.cdb.cui2context_vec_short[cui]))) log.debug(":::::::::::::::::::::::::::::::::::\n") #### END OF DEBUG #### if cui in self.cdb.cui2context_vec and len(cntx_vecs) > 0: sim = np.dot(unitvec(cntx), unitvec(self.cdb.cui2context_vec[cui])) if cui in self.cdb.cui2context_vec_short and len(cntx_vecs_short) > 0: sim2 = np.dot(unitvec(cntx_short), unitvec(self.cdb.cui2context_vec_short[cui])) if sim2 > 0 and abs(sim - sim2) > 0.1: sim = (sim + sim2) / 2 if name is not None: if cui in self.cdb.cui2pref_name and sim > self.MIN_ACC: if name == self.cdb.cui2pref_name[cui]: sim = min(1, sim + 0.3) return sim else: return -1
def most_similar(self, cui, tui_filter=(), min_cnt=0, topn=50):
    r''' Given a concept it will calculate what other concepts in this CDB have the most similar
    embedding.

    Args:
        cui (str):
            The concept ID for the base concept for which you want to get the most similar concepts.
        tui_filter (list):
            A list of TUIs that will be used to filter out the returned results. Using this it is possible
            to limit the similarity calculation to only disorders/symptoms/drugs/...
        min_cnt (int):
            Minimum training examples (unsupervised+supervised) that a concept must have to be considered
            for the similarity calculation.
        topn (int):
            How many results to return

    Return:
        results (dict):
            A dictionary with topn results like: {<cui>: {'name': <name>, 'sim': <similarity>,
            'tui_name': <tui_name>, 'tui': <tui>, 'cnt': <number of training examples the concept has seen>}, ...}
    '''
    # Create the matrix if necessary. It is cached on the instance and only
    # rebuilt when it has fewer rows than there are concepts with a vector.
    if not hasattr(self, 'sim_vectors') or self.sim_vectors is None or len(self.sim_vectors) < len(self.cui2context_vec):
        log.info("Building similarity matrix")

        sim_vectors = []
        sim_vectors_counts = []
        sim_vectors_tuis = []
        sim_vectors_cuis = []
        for _cui, vec in self.cui2context_vec.items():
            sim_vectors.append(unitvec(vec))
            sim_vectors_counts.append(self.cui_count[_cui])
            sim_vectors_tuis.append(self.cui2tui.get(_cui, 'unk'))
            sim_vectors_cuis.append(_cui)

        self.sim_vectors = np.array(sim_vectors)
        self.sim_vectors_counts = np.array(sim_vectors_counts)
        self.sim_vectors_tuis = np.array(sim_vectors_tuis)
        self.sim_vectors_cuis = np.array(sim_vectors_cuis)

    # Select appropriate concepts
    tui_inds = np.arange(0, len(self.sim_vectors_tuis))
    if len(tui_filter) > 0:
        tui_inds = np.array([], dtype=np.int32)
        for tui in tui_filter:
            tui_inds = np.union1d(np.where(self.sim_vectors_tuis == tui)[0], tui_inds)
    cnt_inds = np.arange(0, len(self.sim_vectors_counts))
    if min_cnt > 0:
        cnt_inds = np.where(self.sim_vectors_counts >= min_cnt)[0]

    # Intersect cnt and tui
    inds = np.intersect1d(tui_inds, cnt_inds)

    mtrx = self.sim_vectors[inds]
    cuis = self.sim_vectors_cuis[inds]

    sims = np.dot(mtrx, unitvec(self.cui2context_vec[cui]))

    # Positions of the topn highest similarities, best first. Selected once
    # here instead of re-indexing the full array for every result.
    top_inds = np.argsort(-1*sims)[0:topn]

    # Create the return dict
    res = {}
    for _cui, sim in zip(cuis[top_inds], sims[top_inds]):
        res[_cui] = {'name': self.cui2pretty_name[_cui], 'sim': sim,
                     'tui_name': self.tui2name.get(self.cui2tui.get(_cui, 'unk'), 'unk'),
                     'tui': self.cui2tui.get(_cui, 'unk'),
                     'cnt': self.cui_count[_cui]}

    return res
def most_similar(self, cui, context_type, type_id_filter=(), min_cnt=0, topn=50, force_build=False):
    r''' Given a concept it will calculate what other concepts in this CDB have the most similar
    embedding.

    Args:
        cui (`str`):
            The concept ID for the base concept for which you want to get the most similar concepts.
        context_type (`str`):
            On what vector type from the cui2context_vectors map will the similarity be calculated.
        type_id_filter (`List[str]`):
            A list of type_ids that will be used to filter out the returned results. Using this it is possible
            to limit the similarity calculation to only disorders/symptoms/drugs/...
        min_cnt (`int`):
            Minimum training examples (unsupervised+supervised) that a concept must have to be considered
            for the similarity calculation.
        topn (`int`):
            How many results to return
        force_build (`bool`, defaults to `False`):
            Do not use cached sim matrix

    Return:
        results (Dict):
            A dictionary with topn results like: {<cui>: {'name': <name>, 'sim': <similarity>,
            'type_names': <type_names>, 'type_ids': <type_ids>,
            'cnt': <number of training examples the concept has seen>}, ...}
            Empty if no concept passes the filters.
    '''
    # The cached matrices are kept per context type under addl_info['similarity']
    sim_data = self.addl_info.setdefault('similarity', {}).setdefault(context_type, {})

    # Create the matrix if necessary
    if 'sim_vectors' not in sim_data or force_build:
        self.log.info("Building similarity matrix")

        sim_vectors = []
        sim_vectors_counts = []
        sim_vectors_type_ids = []
        sim_vectors_cuis = []
        for _cui, vectors in self.cui2context_vectors.items():
            if context_type in vectors:
                sim_vectors.append(unitvec(vectors[context_type]))
                sim_vectors_counts.append(self.cui2count_train.get(_cui, 0))
                sim_vectors_type_ids.append(self.cui2type_ids.get(_cui, {'unk'}))
                sim_vectors_cuis.append(_cui)

        sim_data['sim_vectors'] = np.array(sim_vectors)
        sim_data['sim_vectors_counts'] = np.array(sim_vectors_counts)
        sim_data['sim_vectors_type_ids'] = np.array(sim_vectors_type_ids)
        sim_data['sim_vectors_cuis'] = np.array(sim_vectors_cuis)

    # Select appropriate concepts
    if len(type_id_filter) > 0:
        # Rows that carry at least one of the requested type_ids. The dtype
        # is forced to an integer type: an empty selection would otherwise
        # become a float array, which cannot be used as an index below.
        type_id_inds = np.array(
            [ind for ind, type_ids in enumerate(sim_data['sim_vectors_type_ids'])
             if any(type_id in type_ids for type_id in type_id_filter)],
            dtype=np.intp)
    else:
        type_id_inds = np.arange(0, len(sim_data['sim_vectors_type_ids']))
    cnt_inds = np.arange(0, len(sim_data['sim_vectors_counts']))
    if min_cnt > 0:
        cnt_inds = np.where(sim_data['sim_vectors_counts'] >= min_cnt)[0]
    # Intersect cnt and type_id
    inds = np.intersect1d(type_id_inds, cnt_inds)

    mtrx = sim_data['sim_vectors'][inds]
    cuis = sim_data['sim_vectors_cuis'][inds]

    sims = np.dot(mtrx, unitvec(self.cui2context_vectors[cui][context_type]))

    # Positions of the topn highest similarities, best first. Selected once
    # here instead of re-indexing the full array for every result.
    top_inds = np.argsort(-1*sims)[0:topn]

    # Create the return dict
    res = {}
    for _cui, sim in zip(cuis[top_inds], sims[top_inds]):
        # Fall back to one of the names only when there is no preferred name
        if _cui in self.cui2preferred_name:
            name = self.cui2preferred_name[_cui]
        else:
            name = list(self.cui2names[_cui])[0]

        res[_cui] = {'name': name, 'sim': sim,
                     'type_names': [self.addl_info['type_id2name'].get(type_id, 'unk')
                                    for type_id in self.cui2type_ids.get(_cui, ['unk'])],
                     'type_ids': self.cui2type_ids.get(_cui, 'unk'),
                     'cnt': self.cui2count_train.get(_cui, 0)}

    return res
def add_context_vec(self, cui, context_vec, negative=False, cntx_type='LONG', inc_cui_count=True, manual=False):
    """ Add the vector representation of a context for this CUI

    cui:  The concept in question
    context_vec:  Vector representation of the context
    negative:  Is this a negative context or a positive one
    cntx_type:  One of 'MED', 'SHORT' or 'LONG', selects which per-CUI
                context vector map is updated
    inc_cui_count:  Passed on to `increase_cui_count` when a new vector is
                stored or a positive update is applied
    manual:  True if the example was annotated manually; a fixed step of 0.1
                is then used instead of one that shrinks with the CUI count

    return:  np.dot(unitvec(context_vec), unitvec(<vector stored before the
                update>)), or 0 if no vector was stored for this CUI
    raises:  ValueError if `cntx_type` is not one of the supported values
    """
    # Pick the target map before touching any state, so an unsupported type
    # fails cleanly instead of with an UnboundLocalError further down.
    if cntx_type == 'MED':
        cui2context_vec = self.cui2context_vec
    elif cntx_type == 'SHORT':
        cui2context_vec = self.cui2context_vec_short
    elif cntx_type == 'LONG':
        cui2context_vec = self.cui2context_vec_long
    else:
        raise ValueError(
            "Unsupported cntx_type: {!r}, expected one of 'MED', 'SHORT', 'LONG'".format(cntx_type))

    if cui not in self.cui_count:
        self.increase_cui_count(cui, True, manual=manual)

    # Positive contexts at or above this similarity are ignored
    prob = 0.95

    sim = 0
    cv = context_vec
    if cui in cui2context_vec:
        sim = np.dot(unitvec(cv), unitvec(cui2context_vec[cui]))

        if negative:
            if not manual:
                b = max((0.2 / self.cui_count[cui]), 0.0001) * max(0, sim)
            else:
                # Means someone manually annotated the example, use high learning rate
                b = 0.1 * max(0, sim)
            cui2context_vec[cui] = cui2context_vec[cui]*(1-b) - cv*b
        elif sim < prob:
            if not manual:
                # Annotation is from Unsupervised learning
                c = 0.001
                b = max((0.5 / self.cui_count[cui]), c) * (1 - max(0, sim))
            else:
                # Means someone manually annotated the example, use high learning rate
                b = 0.1 * (1 - max(0, sim))
            cui2context_vec[cui] = cui2context_vec[cui]*(1-b) + cv*b

            # Increase cui count
            self.increase_cui_count(cui, inc_cui_count)
    else:
        # First context of this type for the CUI, store it as is
        cui2context_vec[cui] = -cv if negative else cv
        self.increase_cui_count(cui, inc_cui_count, manual)

    return sim