def list_conditions_with_qumls(path_to_directory_condition, path_to_qumls_files):
    """
    :param path_to_directory_condition: path to the directory where the list
        of conditions is stored.
    :param path_to_qumls_files: path to the QuickUMLS installation files.
    :returns: for each condition string, which part of the string is
        recognized as a biomedical concept, which biomedical concept it is
        mapped to, and whether it is fully, partially, or not recognized.
    """
    term_dict = dict()
    matcher = QuickUMLS(path_to_qumls_files)
    for string in itterating_sentences(path_to_directory_condition):
        x = matcher.match(string, best_match=True, ignore_syntax=False)
        term_string = string
        if len(x) > 0:
            for y in x:
                for z in y:
                    ngram = z["ngram"]
                    term2 = z["term"]
                    term3 = z["similarity"]
                    if term_string.lower() == term2.lower():
                        term_dict[term_string] = [ngram, term2, "full recognition"]
                    else:
                        term_dict[term_string] = [ngram, term2, "partial recognition", term3]
        else:
            term_dict[term_string] = ["none", "none", "not recognized"]
    return term_dict
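# A minimal sketch (not from the original source) of the structure that
# matcher.match() returns, which the nested loops above iterate over: one
# inner list of candidate dicts per matched ngram. The keys follow the
# QuickUMLS documentation; the concrete values here are illustrative only.
example_matches = [
    [
        {'start': 0, 'end': 14, 'ngram': 'dilated pupils',
         'term': 'dilated pupils', 'cui': 'C0026961',
         'similarity': 1.0, 'semtypes': {'T033'}, 'preferred': 1},
    ],
]
for candidates in example_matches:  # one entry per matched ngram
    for candidate in candidates:    # one entry per UMLS candidate
        print(candidate['ngram'], candidate['term'], candidate['similarity'])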
def __init__(self, args):
    from quickumls import QuickUMLS
    assert args.quickumls_path is not None, "Please provide the path where QuickUMLS is installed"
    assert args.num_worker == 1, "QuickUMLS doesn't support num_workers > 1"
    self.matcher = QuickUMLS(args.quickumls_path, 'score', threshold=0.6)
def _start(self):
    """
    Instantiate the QuickUMLS matcher.
    """
    self._linker = QuickUMLS(self.quickumls_install,
                             overlapping_criteria=self.criterion,
                             threshold=self.min_score,
                             accepted_semtypes=self.keep_semtypes)
    self._log("Started")
def extract(self, file_item):
    print('quickumls_fp: ' + self.quickumls_fp)
    print('overlapping_criteria: ' + self.overlapping_criteria)
    print('threshold: ' + str(self.threshold))
    print('similarity_name: ' + self.similarity_name)
    print('minMatchedLength: ' + str(self.minMatchedLength))
    print('window: ' + str(self.window))
    matcher = QuickUMLS(self.quickumls_fp, self.overlapping_criteria,
                        self.threshold, self.window, self.similarity_name,
                        self.minMatchedLength, constants.ACCEPTED_SEMTYPES,
                        True)
    extraction_result = matcher.match(self.text, best_match=True,
                                      ignore_syntax=False)
    self.buildXML(extraction_result, file_item)
def run_quickumls_server(opts):
    matcher = QuickUMLS(quickumls_fp=opts.quickumls_fp,
                        threshold=opts.threshold,
                        overlapping_criteria=opts.overlapping_criteria,
                        similarity_name=opts.similarity_name,
                        window=opts.window,
                        min_match_length=opts.min_match_length,
                        verbose=opts.verbose)
    run_server(matcher, host=opts.host, port=opts.port, buffersize=4096)
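# Hedged usage sketch: once run_quickumls_server() is listening, the matcher
# can be queried over the network with quickumls.client.get_quickumls_client
# (part of the quickumls package). The host and port below are assumptions
# matching the package defaults.
from quickumls.client import get_quickumls_client

client = get_quickumls_client(host='localhost', port=4645)
print(client.match('heart attack', best_match=True, ignore_syntax=False))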
def load(
    cls,
    path_to_quickumls: str,
    accepted_semtypes: Optional[Set[str]] = None,
    threshold: float = 0.9,
    similarity_name: str = "jaccard",
    spacy_string: str = "en_core_sci_sm",
    best_match: bool = False,
    n_workers: int = 1,
) -> "QuickUMLSClassifier":
    if accepted_semtypes is None:
        accepted_semtypes = ALL_SEMTYPES
    q = QuickUMLS(path_to_quickumls,
                  accepted_semtypes=accepted_semtypes,
                  threshold=threshold,
                  similarity_name=similarity_name)
    # Load the spaCy model; disable the NER and parser.
    q.nlp = spacy.load(spacy_string, disable=("ner", "parser"))
    return cls(q, n_workers)
def load(
    cls,
    path_to_quickumls: str,
    accepted_semtypes: Optional[Set[str]] = None,
    threshold: float = 0.9,
    similarity_name: str = "jaccard",
    pooling: str = "mean",
    spacy_string: str = "en_core_sci_sm",
    priors: Optional[Dict[str, float]] = None,
    n_workers: int = 1,
) -> "QuickUMLSClassifier":
    """
    Load a QuickUMLSClassifier instance.

    :param path_to_quickumls: The path to a valid QuickUMLS installation.
    :param accepted_semtypes: A set of accepted semantic types. If this is
        None, we revert to all semantic types.
    :param threshold: The similarity threshold to accept.
    :param similarity_name: The name of the similarity function. Accepted
        values are 'jaccard', 'overlap', 'cosine' and 'dice'.
    :param pooling: The name of the pooling function to use. Should be
        'mean', 'max' or 'sum'.
    :param spacy_string: The name of the spacy model to use.
    :param priors: None, or a dictionary mapping semantic types to class
        probabilities.
    :param n_workers: The number of workers to use during prediction.
    :return: An initialized QuickUMLSClassifier.
    """
    # Fail early if the pooling function is unknown.
    if pooling not in cls.FUNCS:
        raise ValueError(
            f"pooling should be in {cls.FUNCS}, is now {pooling}")
    if accepted_semtypes is None:
        accepted_semtypes = ALL_SEMTYPES
    q = QuickUMLS(path_to_quickumls,
                  accepted_semtypes=accepted_semtypes,
                  threshold=threshold,
                  similarity_name=similarity_name)
    # Load the spaCy model; disable the NER and parser.
    q.nlp = spacy.load(spacy_string, disable=("ner", "parser"))
    return cls(q, pooling, priors, n_workers)
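# Hypothetical usage of the loader above; the installation path is a
# placeholder, and the keyword values simply exercise the documented options.
clf = QuickUMLSClassifier.load(
    "/path/to/quickumls_install",  # placeholder path, not a real install
    threshold=0.8,
    similarity_name="cosine",
    pooling="max",
    n_workers=1,
)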
def process_data(pid, doc_list):
    data = []
    matcher = QuickUMLS(args.quickumls_path, 'score', threshold=0.6)
    for i, doc in enumerate(doc_list):
        qumls_res = matcher.match(doc['text'])
        res_list = ddict(list)
        for men in qumls_res:
            for cand in men:
                start, end = cand['start'], cand['end']
                umls_cui = cand['cui']
                score = cand['similarity']
                res_list[(start, end)].append((umls_cui, score))
        doc['result'] = dict(res_list)
        data.append(doc)
        if i % 10 == 0:
            print('Completed [{}] {}, {}'.format(
                pid, i,
                time.strftime("%d_%m_%Y") + '_' + time.strftime("%H:%M:%S")))
    return data
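# Illustrative sketch (not from the original source) of the shape that
# process_data() stores in doc['result']: character-offset spans mapping to
# candidate (CUI, score) pairs. The CUIs and scores below are made up.
example_result = {
    (0, 12): [('C0027051', 0.83), ('C4552959', 0.67)],
}
for (start, end), candidates in example_result.items():
    best_cui, best_score = max(candidates, key=lambda c: c[1])
    print(start, end, best_cui, best_score)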
class QuickUMLSProcessor(MERToolProcessor):
    def __init__(self, config):
        self.__quickumls = QuickUMLS('/home/daniel/QuickUMLS')
        self.__matches = None
        super().__init__(config)

    def process_input(self):
        """Extracts information from input"""
        input_file = self._input_filepath.open(encoding='utf8')
        text = input_file.read()
        print('--- QuickUMLS: Processing input ---')
        start_time = time.time()
        self.__matches = self.__quickumls.match(text, best_match=True,
                                                ignore_syntax=False)
        end_time = time.time() - start_time
        print('--- {} seconds ---'.format(end_time))

    def format_output(self):
        """Formats the original output to eHealth-KD subtask A output"""
        # Only the first term (preferred term) of each match list is kept.
        umls_concepts = map(lambda match_list: match_list[0], self.__matches)
        # Order the concepts by their start offset.
        ordered_concepts = sorted(
            umls_concepts, key=lambda umls_concept: umls_concept['start'])
        # Convert each UMLS concept to an eHealth-KD keyphrase.
        for concept in ordered_concepts:
            keyphrase = {'label': 'Concept', 'term': concept['ngram']}
            multiword_term = concept['ngram'].split()
            if not multiword_term:
                keyphrase['span'] = '{0} {1}'.format(concept['start'],
                                                     concept['end'])
            else:
                span = []
                for token in multiword_term:
                    if not span:
                        span.append(
                            (concept['start'], concept['start'] + len(token)))
                    else:
                        span.append(
                            (span[-1][1] + 1, span[-1][1] + 1 + len(token)))
                span = map(lambda tup: '{0} {1}'.format(tup[0], tup[1]), span)
                keyphrase['span'] = ';'.join(span)
            self._key_phrases.append(keyphrase)
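# Worked example (not from the original source) of the span encoding that
# format_output() builds above: a two-token ngram starting at offset 10
# yields one "start end" pair per token, joined by ';'.
concept = {'ngram': 'heart attack', 'start': 10, 'end': 22}
span = []
for token in concept['ngram'].split():
    if not span:
        span.append((concept['start'], concept['start'] + len(token)))
    else:
        span.append((span[-1][1] + 1, span[-1][1] + 1 + len(token)))
print(';'.join('{0} {1}'.format(a, b) for a, b in span))  # -> 10 15;16 22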
class QUMLS(BaseLinker):
    def __init__(self, args):
        from quickumls import QuickUMLS
        assert args.quickumls_path is not None, "Please provide the path where QuickUMLS is installed"
        assert args.num_worker == 1, "QuickUMLS doesn't support num_workers > 1"
        self.matcher = QuickUMLS(args.quickumls_path, 'score', threshold=0.6)

    def __call__(self, text):
        qumls_res = self.matcher.match(text)
        men_list = ddict(list)
        for men in qumls_res:
            for cand in men:
                start, end = cand['start'], cand['end']
                umls_cui = cand['cui']
                score = cand['similarity']
                men_list[(start, end)].append([umls_cui, round(score, 3)])
        return self.reformat(men_list, text)
start = time.time()

best_match = (b == 'true')

# directory of notes to process
directory_to_parse = '/data/data_in/'
# QuickUMLS data directory
quickumls_fp = '/data/UMLS/'
os.chdir(directory_to_parse)

matcher = QuickUMLS(quickumls_fp, o, 0.7, 5, s)
test = pd.DataFrame()
fn = pd.concat
gn = matcher.match
df = pd.DataFrame

for fname in glob.glob(directory_to_parse + '*.txt'):
    t = os.path.basename(fname)
    u = t.split('.')[0]
    with open(directory_to_parse + u + '.txt') as f:
        f1 = f.read()
    print(u)
    out = gn(f1, best_match=best_match, ignore_syntax=False)
    for i in out:
def main(args):
    print('=============')
    if args.granularity not in ['N', 'S', 'W']:
        raise TypeError(
            'Invalid value for the granularity - should be N, S, or W')

    print('Reading MIMIC-III data...')
    if args.skiplims is None:
        notes_df = read_csv(args.noteevents_fp)
    else:
        to_skip = []
        for i in range(0, len(args.skiplims), 2):
            to_skip += [
                j for j in range(args.skiplims[i], args.skiplims[i + 1])
            ]
        notes_df = read_csv(args.noteevents_fp, skiprows=to_skip)

    print('Preprocessing notes ...')
    parsed_list = []
    for note in tqdm(notes_df['TEXT']):
        note = note.lower()
        note = re.sub('[^a-zA-Z.]', ' ', note)
        note = re.sub(r'\s+', ' ', note)
        # For finer granularity than entire notes, they are tokenized so that
        # we can iterate over sentences or words
        if args.granularity != 'N':
            note = nltk.sent_tokenize(note)
            if args.granularity == 'W':
                for i in range(len(note)):
                    note[i] = re.sub('[.]', '', note[i])
                note = [nltk.word_tokenize(sentence) for sentence in note]
                for i in range(len(note)):
                    note[i] = [
                        word for word in note[i]
                        if word not in stopwords.words('english')
                    ]
        parsed_list.append(note)

    print('Matching with UMLS corpus...')
    # initialise QuickUMLS string matching object
    matcher = QuickUMLS(args.qumls_fp,
                        threshold=args.thresh,
                        similarity_name=args.sim)
    # useful to define these two here so the mapping loop isn't too verbose
    qumls_getter = lambda n: matcher.match(
        n, best_match=False, ignore_syntax=False)
    # this gets the maximum similarity score and its index in the list for that ngram
    simscore_getter = lambda l: max(enumerate([d['similarity'] for d in l]),
                                    key=itemgetter(1))

    ALL = args.attr == 'all'
    if ALL:
        # make a dictionary which will hold the columns to be added to the dataframe
        names = ['term', 'cui', 'semtypes']
        attrs = {}
        for name in names:
            attrs[name] = []
    else:
        mapped_corpus = []
    if args.keep_similarity:
        similarity_scores = []

    for note in tqdm(parsed_list):
        if ALL:
            # note-level mini-version of the dictionary "attrs" to collect
            # the attributes for each note
            sub_attr = {}
            for name in names:
                sub_attr[name] = []
        else:
            single_attr_list = []
        if args.keep_similarity:
            sim_list = []
        if args.granularity == 'N':
            res = qumls_getter(note)
            for l in res:
                ss = simscore_getter(l)
                if ALL:
                    for name in names:
                        sub_attr[name].append(l[ss[0]][name])
                else:
                    single_attr_list.append(l[ss[0]][args.attr])
                if args.keep_similarity:
                    sim_list.append(ss[1])
        else:
            for s in note:
                if args.granularity != 'W':
                    res = qumls_getter(s)
                    for l in res:
                        ss = simscore_getter(l)
                        if ALL:
                            for name in names:
                                sub_attr[name].append(l[ss[0]][name])
                        else:
                            single_attr_list.append(l[ss[0]][args.attr])
                        if args.keep_similarity:
                            sim_list.append(ss[1])
                else:
                    for w in s:
                        res = qumls_getter(w)[0]
                        ss = simscore_getter(res)
                        if ALL:
                            for name in names:
                                sub_attr[name].append(res[ss[0]][name])
                        else:
                            single_attr_list.append(res[ss[0]][args.attr])
                        if args.keep_similarity:
                            sim_list.append(ss[1])
        if ALL:
            if args.filter_semtypes_file is not None:
                irrelevant_type_ids = [
                    i[:-1] for i in open(args.filter_semtypes_file, 'r')
                ]
                indices_to_remove = []
                for st_set in sub_attr['semtypes']:
                    if all(st in irrelevant_type_ids for st in st_set):
                        indices_to_remove.append(
                            sub_attr['semtypes'].index(st_set))
                for name in names:
                    sub_attr[name] = [
                        st for st in sub_attr[name]
                        if sub_attr[name].index(st) not in indices_to_remove
                    ]
            for name in names:
                mapped_note = ''
                for a in sub_attr[name]:
                    if name == 'semtypes':
                        for a_ in a:
                            mapped_note += a_ + ' '
                    else:
                        mapped_note += a + ' '
                attrs[name].append(mapped_note)
        else:
            mapped_note = ''
            for word in single_attr_list:
                mapped_note += word
                mapped_note += ' '
            mapped_corpus.append(mapped_note)
        if args.keep_similarity:
            # collect this note's scores; sim_list is rebuilt for every note
            similarity_scores.append(sim_list)

    print('Matching finished!')
    print('Writing .csv file...')
    if ALL:
        for name, mapped_corpus in attrs.items():
            notes_df[name.upper()] = mapped_corpus
        if args.keep_similarity:
            notes_df['SIM_SCORE'] = similarity_scores
    else:
        notes_df[args.attr.upper()] = mapped_corpus
    if args.outfilepath[-4:] != '.csv':
        args.outfilepath += '.csv'
    notes_df.to_csv(args.outfilepath, index=False)
    print('Done!')
    print('=============')
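# Hypothetical invocation of the script above. The flag names are assumptions
# inferred from the attributes read off `args` in main() and may not match
# the real argument parser, which is not shown in this snippet.
#
#   python map_notes.py --noteevents_fp NOTEEVENTS.csv --granularity S \
#       --qumls_fp /path/to/quickumls --thresh 0.8 --sim jaccard \
#       --attr cui --outfilepath mapped_notes.csv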
def run(snippets, nlp):
    resource_path = configure.RESOURCE_PATH
    sem_file = os.path.join(configure.RESOURCE_PATH, 'SemGroups.txt')
    quickUMLS_file = configure.QUICKUMLS_FILE
    # retrieve the predefined treatment semantic types
    drug_types, procedure_types, activity_types, device_types = configure.quickUMLS_config()
    # get the exclude_terms
    exclude_terms = get_exclude_terms(
        os.path.join(resource_path, 'attribute_patterns.txt'),
        os.path.join(resource_path, 'relation_patterns.txt'),
        os.path.join(resource_path, 'exclude_terms.txt'))
    # get sem_map, the mapping from semantic types to semantic groups
    sem_map = get_semtype_map(sem_file)

    # initial extraction
    print('*' * 25 + 'initial extraction' + '*' * 25)
    matcher = QuickUMLS(quickUMLS_file,
                        overlapping_criteria='score',
                        threshold=0.8,
                        accepted_semtypes=','.join([
                            drug_types, procedure_types, activity_types,
                            device_types
                        ]))
    for snippet in snippets:
        snippet['entities'] = extract_entities(snippet['processed'], matcher,
                                               exclude_terms, sem_map)
        convert_snippet(snippet)

    # remapping: expand the boundary of initially extracted treatment entities
    print('*' * 25 + 'remapping' + '*' * 25)
    file = configure.QUICKUMLS_FILE
    # the overlapping criteria is changed to prioritize 'length'
    matcher = QuickUMLS(file,
                        overlapping_criteria='length',
                        threshold=0.8,
                        accepted_semtypes=','.join([
                            drug_types, procedure_types, activity_types,
                            device_types
                        ]))
    remapping_exclude_terms = get_exclude_terms(
        os.path.join(resource_path, 'attribute_patterns.txt'),
        os.path.join(resource_path, 'relation_patterns.txt'),
        os.path.join(resource_path, 'remapping_exclude_terms.txt'))
    for snippet in snippets:
        print('processing:\t' + snippet['processed'])
        if len(snippet['entities']) == 0:
            continue
        print('before expanding:')
        for entity in snippet['entities']:
            print(entity['ngram'])
        new_entities = remapping(
            snippet['entities'],
            expand_boundary(snippet['representation'], nlp,
                            remapping_exclude_terms),
            snippet['representation'], snippet['processed'], matcher,
            sem_map, exclude_terms)
        new_entities = sorted(new_entities, key=lambda x: x['start'])
        snippet['entities'] = new_entities
        print('after expanding:')
        for entity in snippet['entities']:
            print(entity['ngram'])

    # convert semtype sets to lists (for JSON serialization)
    for snippet in snippets:
        if 'entities' in snippet.keys():
            for entity in snippet['entities']:
                entity['semtypes'] = list(entity['semtypes'])

    # convert to representation
    for snippet in snippets:
        convert_snippet(snippet)
if os.environ.get("deployment", False): app.config.from_pyfile('/etc/cs4300-volume-cfg/cs4300app.cfg') else: app.config.from_pyfile(os.path.join( os.path.join(os.getcwd(), "secrets"), "cs4300app.cfg")) gunicorn_logger = logging.getLogger('gunicorn.error') app.logger.handlers = gunicorn_logger.handlers app.logger.setLevel(gunicorn_logger.level) os.system("cp -r concept_matching/quickUCSLS concept_matching/quickUCSLS_{}".format(os.getpid())) app.logger.debug("PID: {}".format(os.getpid())) concept_matcher = QuickUCSLS("./concept_matching/quickUCSLS_{}".format(os.getpid()), accepted_semtypes={"T{:03d}".format(i) for i in range(1,35)}, threshold=0.5, min_match_length=0) app.logger.debug("Matcher res: {}".format(concept_matcher.match("cos sim"))) app.logger.debug("Matcher Ready") def get_preferred_terms(): preferred_term = dict() with codecs.open("./concept_matching/definition_files/MRCONSO.RRF") as f: for i, ln in enumerate(f): if i < 1: continue cui, s, _, pref = ln.strip().split("|") if pref == "Y": preferred_term[cui] = s return preferred_term preferred_term = get_preferred_terms()
import os
import re

from pymetamap import MetaMap
from quickumls import QuickUMLS
"""This is a script that inputs local text files and outputs patient IDs and
UMLS terms.

This script splits the text files in the working directory into lines,
removes any lines containing commonly negated terms, and then extracts UMLS
information from the remaining positive lines. Given that each file name is
a patient ID, the script outputs patient IDs and UMLS information for the
non-negated text lines. By default it also outputs each line that has
positive terms; this can easily be commented out if desired. This script is
meant for use with UCSF clinical notes. Please see the README file for
information on the UMLS Metathesaurus and QuickUMLS installations.
"""

# If running this more than once in the same session, comment this line out:
# defining 'matcher' again will result in an error. The path should be the
# destination_path created during QuickUMLS installation; change accordingly.
matcher = QuickUMLS(
    '/Users/madisonmyers/Desktop/QuickUMLS-master/destination_path')

# Uses the directory you are working in. Make sure the notes/text files are
# available in this folder.
location = os.getcwd()

for file in os.listdir(location):
    if file.endswith(".txt"):
        # many of the UCSF clinical notes need utf-8 encoding, else reading
        # them will result in an error
        open_file = open(file, 'r', encoding='utf-8', errors='ignore')
        doclist = [line for line in open_file]
        docstr = ''.join(doclist)
        bn_sents = re.split(r'[.!?]', docstr)
        # matcher.match() takes one string at a time, so match each sentence
        out = [matcher.match(sent) for sent in bn_sents]
        filename = file.split(".")[0].split("/")[-1]

# most common negated terms in clinical text
f = [
# Environment variables arrive as strings, so numeric settings are cast
# explicitly before being passed to QuickUMLS.
threshold = float(os.environ.get('THRESHOLD', 0.7))
similarity_name = os.environ.get('SIMILARITY_NAME', "jaccard")
window = int(os.environ.get('WINDOW', 5))
min_match_length = int(os.environ.get('MIN_MATCH_LENGTH', 3))
verbose = os.environ.get('VERBOSE', False)
accepted_semtypes = os.environ.get('ACCEPTED_SEMTYPES',
                                   constants.ACCEPTED_SEMTYPES)

print(
    "quickumls_fp={}, overlapping_criteria={}, threshold={}, similarity_name={}, window={}, accepted_semtypes={}"
    .format(quickumls_fp, overlapping_criteria, threshold, similarity_name,
            window, accepted_semtypes))

matcher = QuickUMLS(quickumls_fp, overlapping_criteria, threshold, window,
                    similarity_name, min_match_length, accepted_semtypes,
                    verbose)

class SetEncoder(json.JSONEncoder):
    def default(self, obj):
        if isinstance(obj, set):
            return list(obj)
        return json.JSONEncoder.default(self, obj)

def process(data):
    dto = json.loads(str(data))
    text = dto['text']
    matches = matcher.match(text, best_match=True, ignore_syntax=True)
    return json.dumps(matches, cls=SetEncoder)
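# Illustrative call to process() above, assuming the module's json import:
# the DTO is a JSON string with a 'text' field, and the return value is the
# JSON-serialized match list (semtype sets become lists via SetEncoder).
payload = json.dumps({'text': 'Patient denies chest pain.'})
print(process(payload))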
        type=float,
        default=0.9,
        help='Select a threshold (between 0 and 1) - default: 0.9')
    ARGStemp = parser.parse_args()
    return ARGStemp

if __name__ == "__main__":
    # Parse arguments
    global ARGS
    ARGS = parse_arguments()
    # Start the process
    global matcher
    matcher = QuickUMLS(quickumls_fp='./QuickUMLS',
                        overlapping_criteria='score',
                        threshold=ARGS.t,
                        similarity_name='cosine',
                        window=5)
    print("QuickUMLS Threshold: ", ARGS.t)
    global TUIs
    if ARGS.TUI == "Alpha" or ARGS.TUI == "alpha":
        TUIs = TUI_alpha
        print("TUI list Alpha selected")
    else:
        TUIs = TUI_beta
        print("TUI list Beta selected")
    pool = Pool(os.cpu_count() - 4)
    pool.map(main_funct, os.listdir(dirchunks))
Return pkl files of the dataframes with two added columns containing the CUIs
identified by QuickUMLS (the directory is specified by the '--path_out'
parameter).
"""

############## PARAMETERS ##############
argparser = argparse.ArgumentParser()
argparser.add_argument('--path_in', default='../data/bern_df/')
argparser.add_argument('--path_out', default='../data/cuis/')
args = argparser.parse_args()

############## INSTANTIATE QuickUMLS ##############
sem_diseases = ['T020', 'T190', 'T049', 'T019', 'T047', 'T050',
                'T033', 'T037', 'T048', 'T191', 'T046', 'T184']
sem_drugs = ['T116', 'T195', 'T123', 'T122', 'T103', 'T120', 'T104',
             'T200', 'T196', 'T126', 'T131', 'T125', 'T129', 'T130',
             'T197', 'T114', 'T109', 'T121', 'T192', 'T127']
sem_dis_drug = sem_diseases + sem_drugs
data_dir = '../data/quickUMLS_eng'
matcher = QuickUMLS(quickumls_fp=data_dir, accepted_semtypes=sem_dis_drug)

############## PROCESS ##############
path_in = Path(args.path_in)
path_out = Path(args.path_out)
path_out.mkdir(exist_ok=True, parents=True)

for file in path_in.glob('*.pkl'):
    batch = process_batch(file, 'idx')
    batch_name = file.stem
    batch['disease_cuis'] = batch['ent_text_disease'].apply(apply_QuickUMLS,
                                                            args=(matcher,))
    batch['drug_cuis'] = batch['ent_text_drug'].apply(apply_QuickUMLS,
                                                      args=(matcher,))
    batch.to_pickle(f"{path_out}/{batch_name}.pkl")
    print(f"{batch_name} is processed and saved.")
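# apply_QuickUMLS is not defined in this snippet; the following is a minimal
# sketch (an assumption, not the project's actual helper) of what it might
# look like, given that it is applied to a column of entity strings and must
# return the matched CUIs:
def apply_QuickUMLS_sketch(ent_text, matcher):
    # Rows without an entity string (e.g. NaN) yield no CUIs.
    if not isinstance(ent_text, str) or not ent_text:
        return []
    matches = matcher.match(ent_text, best_match=True, ignore_syntax=False)
    # keep the top-ranked candidate CUI for each matched ngram
    return [candidates[0]['cui'] for candidates in matches if candidates]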
def __init__(self, config):
    self.__quickumls = QuickUMLS('/home/daniel/QuickUMLS')
    self.__matches = None
    super().__init__(config)
    return ann_by_text_id

def format_annotation(ann):
    outann = {
        "start": ann["start"],
        "end": ann["end"],
        "matched_text": ann["ngram"],
        "semtypes": list(ann["semtypes"]),
        "cui": ann["cui"]
    }
    return outann

if __name__ == "__main__":
    args = parse_args()
    if os.path.exists(args.outfile):
        print("Output file already exists. Please delete it and rerun.")
        print("Aborting.")
        sys.exit(1)
    docs = json.load(open(args.documents, 'r'))
    conf = json.load(open(args.quickumls_conf))
    print("========================")
    print("QuickUMLS configuration:")
    print(f"QuickUMLS installation: {args.quickumls_install_dir}")
    print(json.dumps(conf, indent=2))
    print("========================")
    matcher = QuickUMLS(args.quickumls_install_dir, **conf)
    anns = run_quickumls(docs, matcher)
    json.dump(anns, open(args.outfile, 'w'))
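# Illustrative shape of one formatted annotation produced by
# format_annotation() above; the values are made up for illustration.
example_ann = {
    "start": 0,
    "end": 14,
    "matched_text": "dilated pupils",
    "semtypes": ["T033"],
    "cui": "C0026961",
}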
    return list(set(similarWords))

pattern = "(?:^(?:never|no|nothing|nowhere|noone|none|not|havent|hasnt|hadnt|cant|couldnt|shouldnt|wont|wouldnt|dont|doesnt|didnt|isnt|arent|aint)$)|n't"
endSymbolsTillNegation = [',', '.', ':', ';', '!', '?']
stopWords = stopwords.words('english')
print(stopWords)
removeSymbolsList = [
    '∆', '(', ')', ',', '.', 'β', 'α', "'s'", '$', '``', "''", "'s", ':',
    ';', '/', '\\', '+'
]
matcher = QuickUMLS(
    '/home/roysoumya/Documents/ClinicalTrials_Coding/QuickUMLS/QuickUMLS_data/')
lemmatizer = WordNetLemmatizer()
ps = PorterStemmer()
input_query_path = "/mnt/c/Users/roysoumya/Documents/ClinicalTrials_Coding/COCTR_multidimensional_ranking-master/datasetPreparation/src/ExtendedRetrievalCodes/data/extended_retr_pagerank/"
output_path = "/mnt/c/Users/roysoumya/Documents/ClinicalTrials_Coding/COCTR_multidimensional_ranking-master/datasetPreparation/src/ExtendedRetrievalCodes/data/appendSynsetMatch_10thSept/"
files = sorted(listdir(input_query_path))

def getListOfWordsForWhichUMLSConceptdidntGen(query):
    final_query = query
print('Creating QuickUMLS object...')

quickumls_path = r'C:\quickumls\SNOMED_RXNORM_CPT_lowercase'
total_iterations = 1
ignore_syntax = False
threshold = 0.7
accepted_semtypes = None
# accepted_semtypes = constants.ACCEPTED_SEMTYPES

print('Setting up for semtypes (None means all types) : {}'.format(
    accepted_semtypes))
matcher = QuickUMLS(quickumls_path,
                    accepted_semtypes=accepted_semtypes,
                    threshold=threshold)
print('QuickUMLS object created...')

text_file_path = r'data/colonoscopy-1.txt'
file = open(text_file_path, 'r')
text = file.read()
file.close()

print('Length of Text : {0} characters'.format(len(text)))
print('About to reprocess this text [{0}] times'.format(total_iterations))

results_list = []
result_count = 0
class QuickUMLSDriver(EntityLinker):
    def __init__(self, name="quickumls", quickumls_install="",
                 criterion="score", min_score=0.7, keep_semtypes=None):
        """
        Interface to QuickUMLS.

        :param str quickumls_install: The path to the QuickUMLS installation.
        :param float min_score: Minimum score to consider, between 0 and 1.0.
        :param list keep_semtypes: List of semantic types to consider.
        """
        super().__init__(name)
        self.quickumls_install = quickumls_install
        self.criterion = criterion
        self.min_score = min_score
        self.keep_semtypes = keep_semtypes
        self._log_parameters()
        self._start()

    def _log_parameters(self):
        self._log(f"Starting annotator '{self.name}'")
        self._log(f"{self.name} parameters:")
        self._log(f"  quickumls_install : {self.quickumls_install}")
        self._log(f"  criterion : {self.criterion}")
        self._log(f"  min_score : {self.min_score}")
        self._log(f"  keep_semtypes : {self.keep_semtypes}")

    def _start(self):
        """
        Instantiate the QuickUMLS matcher.
        """
        self._linker = QuickUMLS(self.quickumls_install,
                                 overlapping_criteria=self.criterion,
                                 threshold=self.min_score,
                                 accepted_semtypes=self.keep_semtypes)
        self._log("Started")

    def _convert_output_to_candidate_links(self, outputs):
        """
        Convert the raw QuickUMLS output into CandidateLink instances.
        Output is of the format:

            {matched_string: [CandidateLink, [...]]}

        :param list outputs: List of outputs from QuickUMLS.match().
        """
        links = defaultdict(list)
        for phrase in outputs:
            seen_cuis = set()
            for match in phrase:
                try:
                    candidate_term = match["preferred_term"]
                    if candidate_term == "":
                        # No preferred_term found.
                        candidate_term = match["term"]
                except KeyError:
                    candidate_term = match["term"]
                # QuickUMLS sometimes returns the same CUI more than once.
                if match["cui"] in seen_cuis:
                    continue
                else:
                    seen_cuis.add(match["cui"])
                candidate = CandidateLink(
                    input_string=match["ngram"],
                    candidate_term=candidate_term,
                    candidate_source="UMLS",
                    candidate_id=match["cui"],
                    linking_score=match["similarity"],
                    # attrs
                    umls_semantic_type=match["semtypes"])
                links[match["ngram"]].append(candidate)
        return links

    def link(self, queries):
        """
        Link a query or list of queries to entities in the corresponding
        database. Input should be a sequence of (ID, text) pairs. Outputs a
        nested dictionary of the format:

            {input_id: {matched_input: [CandidateLink, [...]]}}

        :param list queries: List of (ID, string) pairs to link.
        :returns: Dictionary of input strings to CandidateLink instances.
        :rtype: dict
        """
        queries = self._prepare_queries(queries, ascii_only=False)
        all_links = {}
        for (qid, query) in queries:
            output = self._linker.match(query)
            links = self._convert_output_to_candidate_links(output)
            all_links[qid] = links
        return all_links

    def get_best_links(self, candidate_links, keep_top_n):
        """
        Given a set of candidate links for a set of input strings returned by
        EntityLinker.link(), choose the N "best" linkings for each input
        string from among the candidate links:

            {input_id: {matched_input: [CandidateLink, [...]]}}

        :param dict candidate_links: Dictionary of input strings to candidate
            linkings.
        :returns: candidate_links filtered to include only the N "best" links.
        :rtype: dict
        """
        for qid in candidate_links.keys():
            for (matched_str, candidates) in candidate_links[qid].items():
                for c in candidates:
                    # An exact string match is treated as a perfect link.
                    if matched_str.lower() == c.candidate_term.lower():
                        c.linking_score = 1.0
                candidates_sorted = sorted(candidates,
                                           key=lambda x: x.linking_score,
                                           reverse=True)
                candidates_top_n = candidates_sorted[:keep_top_n]
                candidate_links[qid][matched_str] = candidates_top_n
        return candidate_links
try:
    from quickumls import QuickUMLS
except ImportError:
    from .quickumls import QuickUMLS

print('Creating QuickUMLS object...')
quickumls_path = r'C:\quickumls'
matcher = QuickUMLS(quickumls_path)
print('QuickUMLS object created...')

text = "The ulna has dislocated posteriorly from the trochlea of the humerus."
print('*************************')
print('Text:')
print(text)
print('*************************')
res = matcher.match(text, best_match=True, ignore_syntax=False)
print('Matching results:')
print(res)
import pandas as pd
from collections import defaultdict
from quickumls import QuickUMLS

to_annot_data = pd.read_csv('toAnnotateWithText_9thSept.csv')
matcher = QuickUMLS(
    '/home/roysoumya/Documents/ClinicalTrials_Coding/QuickUMLS/QuickUMLS_data/')

brief_title_concepts_list = list()
brief_summ_concepts_list = list()

for row_id in range(to_annot_data.shape[0]):
    brief_title = to_annot_data.iloc[row_id, 2]
    brief_summ = to_annot_data.iloc[row_id, 3]
    brief_title_umls = matcher.match(brief_title, best_match=True,
                                     ignore_syntax=False)
    # keep the top candidate CUI for each matched ngram
    brief_title_concepts_list.append(';'.join(
        [elem[0]['cui'] for elem in brief_title_umls]))
    brief_summ_umls = matcher.match(brief_summ, best_match=True,
                                    ignore_syntax=False)
    brief_summ_concepts_list.append(';'.join(
        [elem[0]['cui'] for elem in brief_summ_umls]))
    if row_id % 50 == 0:
        print(row_id)
import os
import re

import numpy as np
from quickumls import QuickUMLS

QUICKUMLS_FP = os.path.expanduser('~/quickumls_data/')

matcher = QuickUMLS(QUICKUMLS_FP,
                    threshold=0.7,
                    similarity_name='jaccard',
                    window=5)

def parse_chunker(original_text, phrase_matches):
    # process matches in order of their start offset
    order = np.argsort([match[0]['start'] for match in phrase_matches])
    offset = 0
    chunked_string = original_text
    prev_end = 0
    for num_match, match_idx in enumerate(order):
        match = phrase_matches[match_idx]
        ngram = match[0]['ngram']
        term = match[0]['term']
        start = match[0]['start']
        end = match[0]['end']
        assert start >= prev_end
        prev_end = end
        # Only change multi-word phrases
        if len(ngram.split()) == 1:
            continue
        term = '_'.join(term.split())
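# Illustrative before/after (not from the original source) for parse_chunker:
# multi-word matched terms are rewritten with underscores so that downstream
# whitespace tokenizers treat them as single tokens, e.g.
#   "patient had a heart attack"  ->  "patient had a heart_attack"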
from quickumls import QuickUMLS
import csv, os, sys, time, re

if __name__ == "__main__":
    # start_time = time.time()
    THRESHOLD = 0.7
    matcher = QuickUMLS(quickumls_fp='./QuickUMLS',
                        overlapping_criteria='score',
                        threshold=THRESHOLD,
                        similarity_name='cosine',
                        window=5)
    # myDict = {}
    dirchunks = "./data/chunkssmall/"
    diroutputchunks = "./data/outputchunkssmall/"
    # list_cui = []
    # list_terms = []
    for file in os.listdir(dirchunks):
        filename = dirchunks + file
        # liste_concepts = []
        lineNb = 1
        list_cui = []
        list_terms = []
        with open(filename, 'r') as fd:
            print("File", filename, "opened! \nNow treating line: ", flush=True)
            # Preparing the output file
            outputFile = diroutputchunks + file + ".output"
            fw = open(outputFile, 'w')