def list_conditions_with_qumls(path_to_directory_condition,
                               path_to_qumls_files):
    """
    param: path to directory where list of conditions is stored
    returns for each variable what part of the string is recognized as a biomedical concept, to which biomedical concept it is mapped, and if its fully/partly or not recognized,
    """

    term_dict = dict()
    matcher = QuickUMLS(path_to_qumls_files)
    for string in itterating_sentences(path_to_directory_condition):
        matches = matcher.match(string, best_match=True, ignore_syntax=False)
        term_string = string
        if len(matches) > 0:
            for candidates in matches:
                for candidate in candidates:
                    ngram = candidate["ngram"]
                    matched_term = candidate["term"]
                    similarity = candidate["similarity"]
                    if term_string.lower() == matched_term.lower():
                        term_dict[term_string] = [
                            ngram, matched_term, "full recognition"
                        ]
                    else:
                        term_dict[term_string] = [
                            ngram, matched_term, "partial recognition",
                            similarity
                        ]
        else:
            term_dict[term_string] = ["none", "none", "not recognized"]
    return term_dict
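
A minimal driver for the function above, as a sketch: it assumes the QuickUMLS data files live at a local path and that the itterating_sentences helper (defined elsewhere in the source repository) yields one condition string at a time. Both paths are placeholders.

if __name__ == '__main__':
    # Hypothetical paths; point these at your own data and installation.
    conditions_dir = './conditions'
    qumls_files = './QuickUMLS_data'
    results = list_conditions_with_qumls(conditions_dir, qumls_files)
    for condition, info in results.items():
        # info is [ngram, matched_term, status], plus the similarity score
        # when recognition was only partial
        print(condition, '->', info)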
Example #2
    def extract(self, file_item):
        print('quickumls_fp: ' + self.quickumls_fp)
        print('overlapping_criteria: ' + self.overlapping_criteria)
        print('threshold: ' + str(self.threshold))
        print('similarity_name: ' + self.similarity_name)
        print('minMatchedLength: ' + str(self.minMatchedLength))
        print('window: ' + str(self.window))

        matcher = QuickUMLS(self.quickumls_fp, self.overlapping_criteria,
                            self.threshold, self.window, self.similarity_name,
                            self.minMatchedLength, constants.ACCEPTED_SEMTYPES,
                            True)

        extraction_result = matcher.match(self.text,
                                          best_match=True,
                                          ignore_syntax=False)
        self.buildXML(extraction_result, file_item)
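
The constructor call above passes every argument positionally. For readability, a keyword-argument version would look like this; the keyword names follow QuickUMLS's constructor (verify them against your installed version):

        matcher = QuickUMLS(self.quickumls_fp,
                            overlapping_criteria=self.overlapping_criteria,
                            threshold=self.threshold,
                            window=self.window,
                            similarity_name=self.similarity_name,
                            min_match_length=self.minMatchedLength,
                            accepted_semtypes=constants.ACCEPTED_SEMTYPES,
                            verbose=True)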
Example #3
class QuickUMLSProcessor(MERToolProcessor):
    def __init__(self, config):
        self.__quickumls = QuickUMLS('/home/daniel/QuickUMLS')
        self.__matches = None
        super().__init__(config)

    def process_input(self):
        """Extracts information from input"""
        input_file = self._input_filepath.open(encoding='utf8')
        text = input_file.read()
        print('--- QuickUMLS: Processing input ---')
        start_time = time.time()
        self.__matches = self.__quickumls.match(text,
                                                best_match=True,
                                                ignore_syntax=False)
        end_time = time.time() - start_time
        print('--- {} seconds ---'.format(end_time))

    def format_output(self):
        """Formats the original output to eHealth-KD subtask A output"""
        umls_concepts = map(lambda match_list: match_list[0],
                            self.__matches)  # Keep only the best-ranked candidate per match
        ordered_concepts = sorted(
            umls_concepts,
            key=lambda umls_concept: umls_concept['start'])  # Order by start
        # Convert a UMLS concept to an eHealth-KD keyphrase
        for concept in ordered_concepts:
            keyphrase = {'label': 'Concept', 'term': concept['ngram']}
            multiword_term = concept['ngram'].split()
            if len(multiword_term) == 1:  # single-token term: one span suffices
                keyphrase['span'] = '{0} {1}'.format(concept['start'],
                                                     concept['end'])
            else:
                span = []
                for token in multiword_term:
                    if not span:
                        span.append(
                            (concept['start'], concept['start'] + len(token)))
                    else:
                        span.append(
                            (span[-1][1] + 1, span[-1][1] + 1 + len(token)))
                span = map(lambda tup: '{0} {1}'.format(tup[0], tup[1]), span)
                keyphrase['span'] = ';'.join(span)
            self._key_phrases.append(keyphrase)
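
For reference, a keyphrase produced by format_output for a two-token ngram such as 'chest pain' starting at offset 10 would look like this (values illustrative):

    # {'label': 'Concept', 'term': 'chest pain', 'span': '10 15;16 20'}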
Example #4
class QUMLS(BaseLinker):
    def __init__(self, args):
        from quickumls import QuickUMLS

        assert args.quickumls_path is not None, "Please provide path where QuickUMLS is installed"
        assert args.num_worker == 1, "QuickUMLS doesn't support num_worker > 1"

        self.matcher = QuickUMLS(args.quickumls_path, 'score', threshold=0.6)

    def __call__(self, text):
        qumls_res = self.matcher.match(text)
        men_list = ddict(list)  # ddict is collections.defaultdict, imported at module level in the source
        for men in qumls_res:
            for cand in men:
                start, end = cand['start'], cand['end']
                umls_cui = cand['cui']
                score = cand['similarity']
                men_list[(start, end)].append([umls_cui, round(score, 3)])

        return self.reformat(men_list, text)
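
For orientation, matcher.match() returns one list per matched text span, each holding candidate dicts like the sketch below (field values are illustrative; the keys shown are the ones this example reads, plus 'ngram' and 'term'):

# [
#     [  # candidates for one matched span
#         {'start': 0, 'end': 10, 'ngram': 'chest pain', 'term': 'chest pain',
#          'cui': 'C0008031', 'similarity': 1.0, 'semtypes': {'T184'}},
#     ],
# ]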
Example #5
    def process_data(pid, doc_list):
        data = []
        matcher = QuickUMLS(args.quickumls_path, 'score', threshold=0.6)
        for i, doc in enumerate(doc_list):
            qumls_res = matcher.match(doc['text'])

            res_list = ddict(list)
            for men in qumls_res:
                for cand in men:
                    start, end = cand['start'], cand['end']
                    umls_cui = cand['cui']
                    score = cand['similarity']
                    res_list[(start, end)].append((umls_cui, score))

            doc['result'] = dict(res_list)
            data.append(doc)

            if i % 10 == 0:
                print('Completed [{}] {}, {}'.format(
                    pid, i,
                    time.strftime("%d_%m_%Y") + '_' +
                    time.strftime("%H:%M:%S")))

        return data
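
process_data is written to be called once per worker, each with its own matcher. A driver along these lines could fan documents out; this sketch assumes process_data is defined at module level (so it can be pickled) and that your QuickUMLS backend tolerates one matcher per process:

from multiprocessing import Pool

def run_parallel(docs, num_workers):
    # One chunk per worker; each worker builds its own QuickUMLS matcher.
    chunks = [docs[i::num_workers] for i in range(num_workers)]
    with Pool(num_workers) as pool:
        results = pool.starmap(process_data, enumerate(chunks))
    # Flatten the per-worker lists back into a single document list.
    return [doc for chunk in results for doc in chunk]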
Example #6
try:
    from quickumls import QuickUMLS
except ImportError:
    from .quickumls import QuickUMLS

print('Creating QuickUMLS object...')

quickumls_path = r'C:\quickumls'

matcher = QuickUMLS(quickumls_path)

print('QuickUMLS object created...')

text = "The ulna has dislocated posteriorly from the trochlea of the humerus."

print('*************************')
print('Text:')
print(text)
print('*************************')

res = matcher.match(text, best_match=True, ignore_syntax=False)

print('Matching results:')
print(res)
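
To make the raw result easier to scan, each candidate can be printed field by field, using the same keys as in the output sketch above:

for span in res:
    for candidate in span:
        print('{0} -> {1} ({2}, sim={3:.2f})'.format(
            candidate['ngram'], candidate['term'],
            candidate['cui'], candidate['similarity']))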
Example #7
## If running more than once, comment this line out; defining 'matcher' a
## second time against the same index will raise an error.
## The path should be your destination_path created during QuickUMLS
## installation. Change accordingly.
matcher = QuickUMLS(
    '/Users/madisonmyers/Desktop/QuickUMLS-master/destination_path')

## Uses the directory you are working in. Make sure notes/text files are
## available in this folder.
location = os.getcwd()

for file in os.listdir(location):
    if file.endswith(".txt"):
        ## many of the UCSF clinical notes need utf-8 encoding else it will result in an error
        open_file = open(file, 'r', encoding='utf-8', errors='ignore')
        doclist = [line for line in open_file]
        docstr = ''.join(doclist)
        bn_sents = re.split(r'[.!?]', docstr)
        out = matcher.match(docstr, best_match=True, ignore_syntax=False)
        filename = file.split(".")[0].split("/")[-1]
        ## common negation cues in clinical text
        f = [
            "not", "no", "denies", "without", "no evidence", "with no",
            "negative for"
        ]
        ## add filename for any text file you don't want to input. Common examples are below.
        #         if filename == "requirements":
        #             continue
        #         if filename == "LICENSE":
        #             continue
        for line in bn_sents:
            if any(i.lower() in line.lower() for i in f):
                continue
        else:
Example #8
results_list = []
result_count = 0

start_time = time.time()

output_dir = 'output/performance_test'

for i in range(total_iterations):
    if i % 100 == 0:
        print('Progress : [{0}/{1}]'.format(i, total_iterations))

    filename = '{0}.csv'.format(i)
    f = open(os.path.join(output_dir, filename), 'w')

    match_results = matcher.match(text,
                                  best_match=True,
                                  ignore_syntax=ignore_syntax)
    results_list.append(match_results)
    result_count += len(match_results)

    header = 'text,start,end,CUI,term,similarity\n'
    f.write(header)

    # this is a list of lists
    for match_result in match_results:
        # each match may contain multiple ngram entries
        for ngram_match_dict in match_result:
            #print(ngram_match_dict)
            line = '"{0}",{1},{2},{3},"{4}",{5:.2f}\n'.format(
                ngram_match_dict['ngram'], ngram_match_dict['start'],
                ngram_match_dict['end'], ngram_match_dict['cui'],
Example #9
import pandas as pd

from quickumls import QuickUMLS

to_annot_data = pd.read_csv('toAnnotateWithText_9thSept.csv')
matcher = QuickUMLS(
    '/home/roysoumya/Documents/ClinicalTrials_Coding/QuickUMLS/QuickUMLS_data/'
)

brief_title_concepts_list = list()
brief_summ_concepts_list = list()

for row_id in range(to_annot_data.shape[0]):
    brief_title = to_annot_data.iloc[row_id, 2]
    brief_summ = to_annot_data.iloc[row_id, 3]

    brief_title_umls = matcher.match(brief_title,
                                     best_match=True,
                                     ignore_syntax=False)
    brief_title_concepts_list.append(';'.join(
        [elem[0][u'cui'] for elem in brief_title_umls]))

    brief_summ_umls = matcher.match(brief_summ,
                                    best_match=True,
                                    ignore_syntax=False)
    brief_summ_concepts_list.append(';'.join(
        [elem[0][u'cui'] for elem in brief_summ_umls]))

    if row_id % 50 == 0:
        print(row_id)

print('Number of brief title elements: ', len(brief_title_concepts_list))
print('Number of brief summary elements: ', len(brief_summ_concepts_list))
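
The two lists line up with the dataframe rows, so they can be written back and saved; the column names and output filename below are placeholders:

to_annot_data['BRIEF_TITLE_CUIS'] = brief_title_concepts_list
to_annot_data['BRIEF_SUMM_CUIS'] = brief_summ_concepts_list
to_annot_data.to_csv('toAnnotate_withCUIs.csv', index=False)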
Example #10
class QuickUMLSDriver(EntityLinker):
    def __init__(self,
                 name="quickumls",
                 quickumls_install="",
                 criterion="score",
                 min_score=0.7,
                 keep_semtypes=None):
        """
        Interface to QuickUMLS.

        :param str quickumls_install: The path to the QuickUMLS installation.
        :param float min_score: Minimum score to consider, between 0 and 1.0.
        :param list keep_semtypes: List of semantic types to consider.
        """
        super().__init__(name)
        self.quickumls_install = quickumls_install
        self.criterion = criterion
        self.min_score = min_score
        self.keep_semtypes = keep_semtypes
        self._log_parameters()
        self._start()

    def _log_parameters(self):
        self._log(f"Staring annotator '{self.name}'")
        self._log(f"{self.name} parameters:")
        self._log(f"  quickumls_install : {self.quickumls_install}")
        self._log(f"  criterion : {self.criterion}")
        self._log(f"  min_score : {self.min_score}")
        self._log(f"  keep_semtypes : {self.keep_semtypes}")

    def _start(self):
        """
        Instantiate the QuickUMLS matcher.
        """
        self._linker = QuickUMLS(self.quickumls_install,
                                 overlapping_criteria=self.criterion,
                                 threshold=self.min_score,
                                 accepted_semtypes=self.keep_semtypes)
        self._log("Started")

    def _convert_output_to_candidate_links(self, outputs):
        """
        Convert the raw QuickUMLS output into CandidateLink
        instances. Output is of the format:

        {matched_string: [CandidateLink, [...]]}

        :param list outputs: List of outputs from QuickUMLS.match().
        :returns: Dictionary mapping matched strings to CandidateLink lists.
        :rtype: dict
        """
        links = defaultdict(list)
        for phrase in outputs:
            seen_cuis = set()
            for match in phrase:
                try:
                    candidate_term = match["preferred_term"]
                    if candidate_term == "":  # No preferred_term found.
                        candidate_term = match["term"]
                except KeyError:
                    candidate_term = match["term"]
                # QuickUMLS sometimes returns the same CUI more than once.
                if match["cui"] in seen_cuis:
                    continue
                else:
                    seen_cuis.add(match["cui"])
                candidate = CandidateLink(
                    input_string=match["ngram"],
                    candidate_term=candidate_term,
                    candidate_source="UMLS",
                    candidate_id=match["cui"],
                    linking_score=match["similarity"],
                    # attrs
                    umls_semantic_type=match["semtypes"])
                links[match["ngram"]].append(candidate)
        return links

    def link(self, queries):
        """
        Link query or list of queries to entities in the
        corresponding database. Input should be a
        sequence of (ID, text pairs). Outputs a nested
        dictionary of the format

            {input_id: {matched_input: [CandidateLink, [...]]}}.

        :param list input_strings: List of (ID, string) pairs to link.
        :returns: Dictionary of input strings to CandidateLink instances.
        :rtype: dict
        """
        queries = self._prepare_queries(queries, ascii_only=False)
        all_links = {}
        for (qid, query) in queries:
            output = self._linker.match(query)
            links = self._convert_output_to_candidate_links(output)
            all_links[qid] = links
        return all_links

    def get_best_links(self, candidate_links, keep_top_n):
        """
        Given a set of candidate links for a set of input
        strings returned by EntityLinker.link(), choose
        the N "best" linkings for each input string from among
        the candidate links.

            {input_id: {matched_input: [CandidateLink, [...]]}}

        :param dict candidate_links: Dictionary of input strings
                                     to candidate linkings.
        :param int keep_top_n: How many of the best links to keep
                               per matched string.
        :returns: candidate_links filtered to include only the N "best" links.
        :rtype: dict
        """
        for qid in candidate_links.keys():
            for (matched_str, candidates) in candidate_links[qid].items():
                for c in candidates:
                    if matched_str.lower() == c.candidate_term.lower():
                        c.linking_score = 1.0
                candidates_sorted = sorted(candidates,
                                           key=lambda x: x.linking_score,
                                           reverse=True)
                candidates_top_n = candidates_sorted[:keep_top_n]
                candidate_links[qid][matched_str] = candidates_top_n
        return candidate_links
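
A usage sketch for the driver; the installation path is a placeholder and the query format follows the link() docstring above:

driver = QuickUMLSDriver(quickumls_install='/path/to/QuickUMLS',
                         min_score=0.7)
links = driver.link([('doc1', 'Patient reports chest pain.')])
best = driver.get_best_links(links, keep_top_n=1)
for matched_str, candidates in best['doc1'].items():
    print(matched_str, candidates)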
Example #11
def main(args):
    print('=============')
    if args.granularity not in ['N', 'S', 'W']:
        raise ValueError(
            'Invalid value for the granularity - should be N, S, or W')

    print('Reading MIMIC-III data...')
    if args.skiplims is None:
        notes_df = read_csv(args.noteevents_fp)
    else:
        to_skip = []
        for i in range(0, len(args.skiplims), 2):
            to_skip += [
                j for j in range(args.skiplims[i], args.skiplims[i + 1])
            ]
        notes_df = read_csv(args.noteevents_fp, skiprows=to_skip)

    print('Preprocessing notes ...')
    parsed_list = []
    for note in tqdm(notes_df['TEXT']):
        note = note.lower()
        note = re.sub('[^a-zA-Z.]', ' ', note)
        note = re.sub(r'\s+', ' ', note)

        # For finer granularity than entire notes, they are tokenized so that we
        # can iterate over sentences or words
        if args.granularity != 'N':
            note = nltk.sent_tokenize(note)
            if args.granularity == 'W':
                for i in range(len(note)):
                    note[i] = re.sub('[.]', '', note[i])
                note = [nltk.word_tokenize(sentence) for sentence in note]
                for i in range(len(note)):
                    note[i] = [
                        word for word in note[i]
                        if word not in stopwords.words('english')
                    ]

        parsed_list.append(note)

    print('Matching with UMLS corpus...')
    # initialise QuickUMLS string matching object
    matcher = QuickUMLS(args.qumls_fp,
                        threshold=args.thresh,
                        similarity_name=args.sim)

    # useful to define these two here so the mapping loop isn't too verbose
    qumls_getter = lambda n: matcher.match(
        n, best_match=False, ignore_syntax=False)
    # this gets the maximum similarity score and its index in the list for that ngram
    simscore_getter = lambda l: max(enumerate([d['similarity'] for d in l]),
                                    key=itemgetter(1))
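    # For one matched span, simscore_getter returns a tuple of
    # (index_of_best_candidate, best_similarity), e.g. (1, 0.9); the loops
    # below read these as ss[0] and ss[1].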

    ALL = args.attr == 'all'

    if ALL:
        # make a dictionary which will have the columns to be added to the dataframe
        names = ['term', 'cui', 'semtypes']
        attrs = {}
        for name in names:
            attrs[name] = []
    else:
        mapped_corpus = []
    if args.keep_similarity: similarity_scores = []

    for note in tqdm(parsed_list):
        if ALL:
            # note-level mini-version of the dictionary "attrs" to collect the attributes for each note
            sub_attr = {}
            for name in names:
                sub_attr[name] = []
        else:
            single_attr_list = []
        if args.keep_similarity: sim_list = []
        if args.granularity == 'N':
            res = qumls_getter(note)
            for l in res:
                ss = simscore_getter(l)
                if ALL:
                    for name in names:
                        sub_attr[name].append(l[ss[0]][name])
                else:
                    single_attr_list.append(l[ss[0]][args.attr])
                if args.keep_similarity: sim_list.append(ss[1])
        else:
            for s in note:
                if args.granularity != 'W':
                    res = qumls_getter(s)
                    for l in res:
                        ss = simscore_getter(l)
                        if ALL:
                            for name in names:
                                sub_attr[name].append(l[ss[0]][name])
                        else:
                            single_attr_list.append(l[ss[0]][args.attr])
                        if args.keep_similarity: sim_list.append(ss[1])
                else:
                    for w in s:
                        word_matches = qumls_getter(w)
                        if not word_matches:  # no UMLS candidate for this word
                            continue
                        res = word_matches[0]
                        ss = simscore_getter(res)
                        if ALL:
                            for name in names:
                                sub_attr[name].append(res[ss[0]][name])
                        else:
                            single_attr_list.append(res[ss[0]][args.attr])
                        if args.keep_similarity: sim_list.append(ss[1])
        if ALL:
            if args.filter_semtypes_file is not None:
                irrelevant_type_ids = [
                    ln.strip() for ln in open(args.filter_semtypes_file, 'r')
                ]
                # enumerate avoids list.index(), which returns only the first
                # occurrence and misfires when a note repeats a semtype set
                indices_to_remove = [
                    idx for idx, st_set in enumerate(sub_attr['semtypes'])
                    if all(st in irrelevant_type_ids for st in st_set)
                ]
                for name in names:
                    sub_attr[name] = [
                        val for idx, val in enumerate(sub_attr[name])
                        if idx not in indices_to_remove
                    ]
            for name in names:
                mapped_note = ''
                for a in sub_attr[name]:
                    if name == 'semtypes':
                        for a_ in a:
                            mapped_note += a_ + ' '
                    else:
                        mapped_note += a + ' '
                attrs[name].append(mapped_note)
        else:
            mapped_note = ''
            for word in single_attr_list:
                mapped_note += word
                mapped_note += ' '
            mapped_corpus.append(mapped_note)
        # sim_list holds one note's scores; collect it so that all notes'
        # scores survive the loop
        if args.keep_similarity: similarity_scores.append(sim_list)

    print('Matching finished!')

    print('Writing .csv file...')
    if ALL:
        for name, mapped_corpus in attrs.items():
            notes_df[name.upper()] = mapped_corpus
    else:
        notes_df[args.attr.upper()] = mapped_corpus
    if args.keep_similarity: notes_df['SIM_SCORE'] = similarity_scores

    if args.outfilepath[-4:] != '.csv': args.outfilepath += '.csv'
    notes_df.to_csv(args.outfilepath, index=False)

    print('Done!')
    print('=============')
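
A hypothetical invocation of this script; the script name and exact flag spellings are assumptions inferred from the args attributes used above:

python map_mimic_notes.py --noteevents_fp NOTEEVENTS.csv \
    --qumls_fp /path/to/QuickUMLS_data --granularity S \
    --thresh 0.7 --sim jaccard --attr cui --outfilepath mapped_notes.csv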
Example #12
if os.environ.get("deployment", False):
    app.config.from_pyfile('/etc/cs4300-volume-cfg/cs4300app.cfg')
else:
    app.config.from_pyfile(os.path.join(
        os.path.join(os.getcwd(), "secrets"), "cs4300app.cfg"))

gunicorn_logger = logging.getLogger('gunicorn.error')
app.logger.handlers = gunicorn_logger.handlers
app.logger.setLevel(gunicorn_logger.level)


os.system("cp -r concept_matching/quickUCSLS concept_matching/quickUCSLS_{}".format(os.getpid()))
app.logger.debug("PID: {}".format(os.getpid()))

concept_matcher = QuickUCSLS(
    "./concept_matching/quickUCSLS_{}".format(os.getpid()),
    accepted_semtypes={"T{:03d}".format(i) for i in range(1, 35)},
    threshold=0.5,
    min_match_length=0)
app.logger.debug("Matcher res: {}".format(concept_matcher.match("cos sim")))
app.logger.debug("Matcher Ready")

def get_preferred_terms():
    preferred_term = dict()
    with codecs.open("./concept_matching/definition_files/MRCONSO.RRF") as f:
        for i, ln in enumerate(f):
            if i < 1:
                continue
            # assumes a slimmed-down, four-field MRCONSO file; the full UMLS
            # RRF release has many more pipe-separated columns
            cui, s, _, pref = ln.strip().split("|")
            if pref == "Y":
                preferred_term[cui] = s
    return preferred_term

preferred_term = get_preferred_terms()
Example #13
 count_quote = line.count('"')
 if count_comma >= 1:
     # New clinical note
     list_cui = []
     list_terms = []
     fw.write(line)
     continue
 print(lineNb, flush=True)
 # if line not in myDict.keys():
 # matches  = matcher.match(line, best_match=True, ignore_syntax=False)
 # print(matches)
 # myDict[line] = matches
 # else:
 # matches = myDict[line]
 matches = matcher.match(line,
                         best_match=True,
                         ignore_syntax=False)
 concepts_output = []
 for phrase_candidate in matches:
     # Find the highest similarity among the candidates
     max_sim = 0
     # print("PC :", phrase_candidate)
     for candidate in phrase_candidate:
         if candidate['similarity'] > max_sim:
             max_sim = candidate['similarity']
     # Get preferred terms for that max
     list_to_write = []
     for candidate in phrase_candidate:
         if candidate['similarity'] == max_sim:
             # print("C : ", candidate)
             if candidate['term'] not in list_terms: