Ejemplo n.º 1
0
def metamap(sent):
    """
    Label a sentence with the (1-based) table index of its top MetaMap concept.

    Parameters
    ----------
    sent : object
        Datapoint exposing the sentence text as ``sent.sent``.

    Returns
    -------
    int
        ``0`` when MetaMap returns no concept; ``-1`` (abstain) when the first
        concept scores below 1 or its CUI is not in the CUI table; otherwise
        the table index of the CUI plus one.

    Notes
    -----
    - This is too slow because we instantiate mm and get_cui_indices for every datapoint - find a way around this
    """
    metamap_dir = "/Users/nicolasahar/Projects/repos/public_mm/bin/metamap18"
    # Original note: should point to the public_mm_lite folder which contains metamaplite.sh
    mm = MetaMap.get_instance(metamap_dir)

    cui_indices = get_cui_indices("./data/raw/cui/all_cuis_conso.csv")
    # can only pass in one sentence at a time to snorkel!
    concepts, _error = mm.extract_concepts([sent.sent], [1])

    if not concepts:
        return 0

    top_concept = concepts[0]
    if float(top_concept.score) < 1:  # abstain if score < 1
        return -1

    # The table maps index -> CUI, so the CUI has to be searched among the
    # values. The previous `cui in table` test looked at the keys (indices)
    # and therefore never matched a CUI string.
    cui_table = cui_indices["CUI"]
    for cui_index, known_cui in zip(cui_table.keys(), cui_table.values()):
        if known_cui == top_concept.cui:
            return cui_index + 1  # To avoid assigning index 0 a label of 0

    return -1
Ejemplo n.º 2
0
def GetMetaMapSemanticTypes(metamap_path, annotations):
    """
    Collect, for every annotation, the unique semantic types MetaMap reports.

    @param metamap_path: Path to MetaMap installation
    @param annotations: List of concepts parsed from annotation file
    @return: List of lists of semantic types, one inner list per annotation
    """
    tagger = MetaMap.get_instance(metamap_path)
    # Every annotation is identified by its position in the input list.
    concepts, error = tagger.extract_concepts(annotations,
                                              range(len(annotations)))

    types_per_annotation = [[] for _ in range(len(annotations))]

    for concept in concepts:
        position = int(concept.index)
        if not isinstance(concept, ConceptMMI):
            continue
        # semtypes arrives as a bracketed, comma-separated string
        for semtype in concept.semtypes.strip('[]').split(','):
            seen = types_per_annotation[position]
            if semtype not in seen:
                seen.append(semtype)

    return types_per_annotation
Ejemplo n.º 3
0
 def __init__(self):
     """Set up the MetaMap instance and the start/stop commands from the 'MetaMap' config section."""
     section = 'MetaMap'
     # All three values come from the configuration instead of hard-coded paths.
     self.mm = MetaMap.get_instance(config.get(section, 'instance'))
     self.start_command = config.get(section, 'start')
     self.stop_command = config.get(section, 'stop')
    def run_metamap(self):
        """
        Run MetaMap on ``self.term`` and tabulate the MMI concepts it returns.

        Returns:
            pandas.DataFrame with the columns ``name``, ``cui``, ``score``
            (converted to float) and ``semtypes``; one row per ``ConceptMMI``
            result, sorted by score in descending order. The frame is empty
            when MetaMap returns no MMI concept.
        """
        mm = MetaMap.get_instance(self.metamap_bin)

        concepts, error = mm.extract_concepts(
            sentences=self.term, compute_all_mappings=False,
            prefer_multiple_concepts=False,
            mm_data_version="USAbase", term_processing=True,
            word_sense_disambiguation=True, silent=True)

        # Compare the class name by value: `is` against a string literal only
        # works when the interpreter happens to intern both strings.
        rows = [
            {"name": concept.preferred_name, "cui": concept.cui,
             "score": float(concept.score), "semtypes": concept.semtypes}
            for concept in concepts
            if type(concept).__name__ == "ConceptMMI"
        ]

        # Build the frame in one step; DataFrame.append copied the frame on
        # every call and no longer exists in pandas 2.0.
        concept_term = pd.DataFrame(
            rows, columns=["name", "cui", "score", "semtypes"])
        concept_term.sort_values(by="score", ascending=False, inplace=True)  # sort by score.

        return concept_term
Ejemplo n.º 5
0
 def ConceptExtract(self, sent_text):
     '''
     Query MetaMap twice for the given sentences and keep both results on self.

     sent_text: a list of sent text
     '''
     tagger = MetaMap.get_instance('./public_mm/bin/metamap16', version=2016)
     shared_options = {
         'word_sense_disambiguation': True,
         'ignore_stop_phrases': True,
     }
     # First call: default output mode -> self.concepts
     self.concepts, _ = tagger.extract_concepts(sent_text, **shared_options)
     # Second call: MMI output switched off -> self.scores
     self.scores, _ = tagger.extract_concepts(
         sent_text, mmi_output=False, **shared_options)
Ejemplo n.º 6
0
 def ConceptExtract(self, sent_text):
     '''
     Run MetaMap over the sentences, once with and once without MMI output.

     sent_text: a list of sent text
     mm: Path should be changed to MetaMap location.
     '''
     metamap_binary = '/Users/sileshu/Downloads/public_mm/bin/metamap16'
     mm = MetaMap.get_instance(metamap_binary, version=2016)

     concept_result = mm.extract_concepts(
         sent_text,
         word_sense_disambiguation=True,
         ignore_stop_phrases=True,
     )
     self.concepts, _ = concept_result

     score_result = mm.extract_concepts(
         sent_text,
         mmi_output=False,
         word_sense_disambiguation=True,
         ignore_stop_phrases=True,
     )
     self.scores, _ = score_result
    def __init__(self):
        """Read ``config.ini`` and set up the MetaMap instance and the field names to keep."""
        settings = configparser.ConfigParser()
        settings.read('config.ini')

        def general(option):
            # Every option used here lives in the [general] section.
            return settings.get('general', option)

        # Path to the MetaMap binary release comes from the configuration
        self.meta_map = MetaMap.get_instance(general('meta_map_path'))

        # Comma-separated field names, i.e. score, cui, preferred_name...
        self.relevant_field_names = general('relevant_field_names').split(',')
Ejemplo n.º 8
0
 def ConceptExtract(self, sent_text):
     '''
     Query MetaMap for the given sentences and keep the results on self.

     sent_text: a list of sent text
     mm: Path should be changed to MetaMap location.
     '''
     metamap_binary = '/home/tay/Documents/DSA-NIST/NIST_Utils-master/public_mm/bin/metamap16'
     mm = MetaMap.get_instance(metamap_binary, version=2016)

     def extract():
         # Both attributes are filled by a call with exactly these options.
         return mm.extract_concepts(
             sent_text,
             word_sense_disambiguation=True,
             ignore_stop_phrases=True,
         )

     self.concepts, _ = extract()
     # An earlier variant passed mmi_output=False here; it is disabled, so
     # this is a second, identical query.
     self.scores, _ = extract()
Ejemplo n.º 9
0
    def __init__(self):
        """Create the MetaMap instance from a fixed binary path and set the concept fields to keep."""
        # Path and field names are hard-coded here; the config.ini lookup is disabled.
        self.meta_map = MetaMap.get_instance(
            '/home/galiasn/DATA/MetaMap/public_mm/bin/metamap16')

        # Relevant field names, i.e. score, cui, preferred_name...
        self.relevant_field_names = 'score,preferred_name,cui,semtypes'.split(',')
    def findEntity(self):
        """
        Extract drug-name mentions from ``self.Text`` with MetaMap.

        A concept is kept when its semantic types include 'phsu' or 'orch'.
        Every position reported for such a concept yields one element, unless
        the same [start, end] offset has already been recorded.

        Returns:
            list of DrugnameElement, one per distinct offset, in the order the
            offsets were first seen.
        """
        # the server installed on your machine
        mm = MetaMap.get_instance('/work/tkakar/public_mm/bin/metamap14')

        concepts, error = mm.extract_concepts([self.Text],
                                              word_sense_disambiguation=True)

        drug_semtypes = ('phsu', 'orch')
        offset_list = []
        drugs_list = []
        for concept in concepts:
            # Result kinds without semantic types (anything that is not an MMI
            # concept) cannot be drugs and would otherwise raise here.
            if not hasattr(concept, 'semtypes'):
                continue

            semTypes = concept.semtypes.replace("[", "").replace("]", "")
            semTypes = semTypes.strip().split(",")
            # One pass per concept is enough: a second matching semantic type
            # would only re-parse offsets that are already recorded.
            if not any(semType in drug_semtypes for semType in semTypes):
                continue

            # Trigger text up to the first "-", without the bracket and quotes
            token = concept.trigger.strip().split("-")[0]
            token = token.replace("[", "").replace('"', "")

            for item in self.preprocess.offsetParse(concept.pos_info, ';'):
                # Second element becomes first + second (presumably
                # start/length -> start/end; confirm against offsetParse).
                item[1] = item[0] + item[1]
                if item not in offset_list:
                    offset_list.append(item)
                    drugs_list.append(token)

        return [
            DrugnameElement(drug, [offset], "DrugnameMetamapExtractor",
                            "DRUGNAME")
            for drug, offset in zip(drugs_list, offset_list)
        ]
Ejemplo n.º 11
0
def metamap_wrapper(text):
    """
    Function-wrapper for metamap binary. Extracts concepts
    found in text.

    !!!! REMEMBER TO START THE METAMAP TAGGER AND
        WordSense DISAMBIGUATION SERVER !!!!

    Input:
        - text: str,
        a piece of text or sentence
    Output:
       - a dictionary with key 'sents' holding one record per sentence
       (1-based 'sent_id', 'entities' with the concepts found, and an empty
       'relations' list) and key 'sent_text' holding the original text
    """

    # Tokenize into sentences
    sents = sent_tokenize(text)
    # Load Metamap Instance
    mm = MetaMap.get_instance(settings['load']['path']['metamap'])
    concepts, errors = mm.extract_concepts(sents, range(len(sents)))

    # `range` works on Python 2 and 3 alike (the former `xrange` does not).
    sentences = [{'sent_id': i + 1, 'entities': [], 'relations': []}
                 for i in range(len(sents))]

    # Single pass: the first field of every concept is the id of the sentence
    # it was extracted from.
    for concept in concepts:
        sent_index = int(concept[0])
        if not 0 <= sent_index < len(sentences):
            continue
        # Only concepts that carry a CUI are reported as entities.
        if hasattr(concept, 'cui'):
            sentences[sent_index]['entities'].append({
                'label': concept.preferred_name,
                'cui': concept.cui,
                'sem_types': concept.semtypes,
                'score': concept.score
            })

    if errors:
        time_log('Errors with extracting concepts!')
        time_log(errors)
    return {'sents': sentences, 'sent_text': text}
Ejemplo n.º 12
0
def searchMetaMap(search):
    """
    Look up a single topic in UMLS through MetaMap.

    :param search: the topic text to look up
    :return: The preferred names of the UMLS concepts identified in the text, and the number of names returned
    """
    tagger = MetaMap.get_instance("/Users/yiqingluo/IF5400/public_mm/bin/metamap12")

    # One-element request with id 1: only one topic is searched at a time.
    found, error = tagger.extract_concepts([search], [1])

    preferred_names = [concept.preferred_name for concept in found]
    return preferred_names, len(preferred_names)
Ejemplo n.º 13
0
class MetaMapWrapper(object):
    """Groups the MetaMap concepts of a text into symptoms, diseases and diagnostics."""

    # Created when the class body is executed and reached through self.mm.
    mm = MetaMap.get_instance(
        '/Users/hemant/GithubWorkspace/MetaMap/public_mm/bin/metamap18')

    def __init__(self):
        pass

    def annotate(self, text):
        """
        Return the preferred names found in `text`, grouped by category.

        The result only contains the keys 'symptoms', 'diseases' and
        'diagnostics' for which at least one concept was found.
        """
        concepts, error = self.mm.extract_concepts([text], [1, 2])

        # (semtypes string, output key); the order fixes the key order below.
        categories = (
            ('[sosy]', 'symptoms'),      # Sign or Symptom
            ('[dsyn]', 'diseases'),      # Disease or Syndrome
            ('[diap]', 'diagnostics'),   # Diagnostic Procedure
        )
        key_for_semtypes = dict(categories)
        found = dict((key, []) for _, key in categories)

        for concept in concepts:
            if not hasattr(concept, 'semtypes'):
                continue
            key = key_for_semtypes.get(concept.semtypes)
            if key is None:
                continue
            name = concept.preferred_name
            # sometimes it returns symptoms as a symptom
            if key == 'symptoms' and name in ('Symptoms', 'symptoms'):
                continue
            found[key].append(name)

        extracted_data = {}
        for _, key in categories:
            if found[key]:
                extracted_data[key] = found[key]
        return extracted_data
Ejemplo n.º 14
0
def process_sentences(sentences):
    """
    Extract the MMI concepts of every sentence with MetaMap.

    Returns a list parallel to `sentences`; each entry is the list of concept
    dicts ('preferred_name', 'cui', 'pos_info', 'semtypes', 'score') found in
    that sentence.
    """
    tagger = MetaMap.get_instance(METAMAP_BINARY_PATH)

    # Sentences are identified by their position in the input list.
    concepts, error = tagger.extract_concepts(
        sentences, list(range(len(sentences))))

    per_sentence = [[] for _ in range(len(sentences))]
    mmi_concepts = (c for c in concepts if isinstance(c, ConceptMMI))
    for mmi in mmi_concepts:
        slot = int(mmi.index)
        record = {
            'preferred_name': mmi.preferred_name,
            'cui': mmi.cui,
            'pos_info': extract_positional_information(mmi.pos_info),
            'semtypes': extract_semantic_types(mmi.semtypes),
            'score': float(mmi.score),
        }
        per_sentence[slot].append(record)

    return per_sentence
Ejemplo n.º 15
0
    def ConceptExtract(self, sent_text):
        '''
        Extract concepts and scores for the sentences, log the input, and
        apply the get_bp / get_sp / check_pulse post-processing helpers.

        The body is indented with spaces only; it previously mixed
        space-indented and tab-indented lines, which Python 3 rejects.

        sent_text: a list of sent text
        '''
        mm = MetaMap.get_instance('./public_mm/bin/metamap16', version=2016)
        self.concepts, _ = mm.extract_concepts(sent_text, word_sense_disambiguation=True,
                                               ignore_stop_phrases=True)

        self.scores, _ = mm.extract_concepts(sent_text, mmi_output=False, word_sense_disambiguation=True,
                                             ignore_stop_phrases=True)

        # Append the input chunk as one row to the running CSV log.
        with open("chunk.csv", "a") as f:
            writer = csv.writer(f, delimiter=',')
            writer.writerow(sent_text)

        # When a helper reports a hit, its object is added as a concept and
        # the given CUI receives the fixed score '1000'.
        check1, ob1 = get_bp(self.scores, sent_text)
        if check1:
            self.concepts.append(ob1)
            self.scores["C1271104"] = '1000'

        check2, ob2 = get_sp(self.scores, sent_text)
        if check2:
            self.concepts.append(ob2)
            self.scores['C0428179'] = '1000'

        deletes, pulses = check_pulse(self.concepts, self.scores, sent_text)
        if deletes != -1:
            # NOTE(review): popping by index inside the loop shifts later
            # indices, and popping CUI_pulse more than once would raise
            # KeyError -- confirm what check_pulse guarantees about `deletes`.
            for i in deletes:
                self.concepts.pop(i)
                self.scores.pop(CUI_pulse)
            # Only the first replacement pulse concept is used.
            for j in pulses:
                self.concepts.append(j)
                self.scores[CUI_pulse] = '1000'
                break
Ejemplo n.º 16
0
def readData(NoExpansionFlag, UmlsExpansionFlag):
    """
    Build a summary for one randomly selected question of the input JSON.

    The JSON file named by ``sys.argv[1]`` is loaded and its ``questions`` are
    scanned in order while counting those of type ``summary``. The first
    question reached while that count equals a random number drawn from
    0..100 is processed: its snippets are split into sentences, every
    sentence is scored against the question with ``similarity`` and the
    scores are handed to ``cluster``.

    The tagger server is started before the work and is always stopped
    afterwards, including when no question is selected or an error occurs.

    Args:
        NoExpansionFlag: when ``== True`` plain bags of words are compared;
            takes precedence over ``UmlsExpansionFlag``.
        UmlsExpansionFlag: when ``== True`` (and expansion is not disabled)
            the output of ``expandConcepts`` is added to question and
            sentences.

    Returns:
        ``(quest, snippetsText, summaryFinal)`` for the selected question, or
        ``None`` when no question is selected.
    """
    # instantiating metamap
    # NOTE(review): `mm` is not referenced again in this function; the call is
    # kept in case the instance creation is relied on -- confirm before removing.
    mm = MetaMap.get_instance('/home/khyathi/installations/public_mm/bin/metamap')
    start_command = "/home/khyathi/installations/public_mm/bin/skrmedpostctl start"
    stop_command = "/home/khyathi/installations/public_mm/bin/skrmedpostctl stop"
    os.system(start_command)
    try:
        randomNumber = randint(0, 100)
        csumm = 0
        with open(sys.argv[1], 'r') as infile:
            data = json.load(infile)

        # Expansion is used only when it is requested and not disabled.
        use_umls = (not (NoExpansionFlag == True)) and UmlsExpansionFlag == True

        for question in data['questions']:
            if question['type'] == 'summary':
                csumm += 1
            if csumm != randomNumber:
                continue

            quest = unicode(question['body']).encode("ascii", "ignore")
            questionBow = quest.split()
            expandedQuestion = [questionBow,
                                expandConcepts(quest) if use_umls else []]

            # Tokenised but not used further down; the code that wrote these
            # sentences to disk is disabled.
            ideal_summaries = question["ideal_answer"]
            if isinstance(ideal_summaries, types.StringTypes):
                ideal_answer_sents = sent_tokenize(ideal_summaries)
            else:
                ideal_answer_sents = sent_tokenize(ideal_summaries[0])

            sentences = []
            sentenceScoreDict = {}
            snippetsText = []
            for snippet in question['snippets']:
                text = unicode(snippet["text"]).encode("ascii", "ignore")
                snippetsText.append(text)
                if text == "":
                    continue
                try:
                    sentences += sent_tokenize(text)
                except Exception:
                    # Fall back to a naive split when the tokenizer fails.
                    sentences += text.split(". ")

            for sentence in sentences:
                sentenceBow = sentence.split()
                expandedSentence = [sentenceBow,
                                    expandConcepts(sentence) if use_umls else []]
                sentenceScoreDict[sentence] = similarity(expandedQuestion,
                                                         expandedSentence)

            summaryFinal = cluster(sentenceScoreDict, csumm)

            # Persist the module-level caches; the files are closed right away.
            with open("cached_umls_json.pkl", "wb") as cache_file:
                pickle.dump(cache, cache_file)
            with open("cui.pkl", "wb") as cui_file:
                pickle.dump(cuilist, cui_file)

            return (quest, snippetsText, summaryFinal)
    finally:
        os.system(stop_command)
Ejemplo n.º 17
0
 def __init__(self):
     """Create the MetaMap instance and the commands that start/stop the tagger server."""
     # Binary and control script live in the same bin directory. The commands
     # used to point at /Users/khyathi/... while the binary used
     # /home/khyathi/..., so one of the two could not exist on a given machine.
     bin_dir = '/home/khyathi/installations/public_mm/bin'
     self.mm = MetaMap.get_instance(bin_dir + '/metamap')
     self.start_command = bin_dir + "/skrmedpostctl start"
     self.stop_command = bin_dir + "/skrmedpostctl stop"
Ejemplo n.º 18
0
'''
Read a txt file, then call MetaMap to map its contents.
Returns semantic type, CUI, position, and negation.
Files in either the sldi or the sldiID format can be used.
'''
from typing import Tuple
from pymetamap import MetaMap
from mytool import mmi_parser as mmip
# Module-level MetaMap instance, created at import time from a machine-specific binary path.
mm = MetaMap.get_instance('/home/feng/public_mm/bin/metamap20')
import csv

# txt_file = "clinical_txt/1.txt"


# read a file line by line to a List
def read_line(txt_file):
    """
    Read a text file line by line.

    Args:
        txt_file: path of the file to read.

    Returns:
        Tuple ``(sentences, lines)``: the list of lines with their line
        endings kept, and the number of lines. An empty file gives
        ``([], 0)``.
    """
    with open(txt_file, 'r') as f:
        sentences = list(f)
    # Counting the collected lines also covers the empty file, for which the
    # former loop index was never bound.
    return sentences, len(sentences)


def extract_sldi(txt_file, output=True):
    '''
    Parts of this function have not been updated; use with caution.
    Maps the terms found in a txt file to CUIs.
    The input file is expected to be in sldi format.
    '''
from pymetamap import MetaMap
import sys
from bs4 import BeautifulSoup # process xml with topics
import gzip
import codecs
import re

# Module-level MetaMap instance, created from a machine-specific binary path.
mm = MetaMap.get_instance("/data/palotti/public_mm/bin/metamap13")

# Command-line arguments: input file with the topics and the output file.
filename = sys.argv[1]
outputfile = sys.argv[2]
# BeautifulSoup reads the whole handle while it is constructed, so the file
# can be closed right after parsing instead of staying open.
with codecs.open(filename, "r") as _topics_file:
    soup = BeautifulSoup(_topics_file)
# Flags and weights; the code that consumes them is outside this excerpt.
text_to_use = 1
append = True
onlyOne = True

expand_metamap = True
pweight = 1
tweight = 1
defaultweight = 1
diaweight = 3

def filterConcepts(concepts):
    query = set()
    for c in concepts:
        types = c.semtypes.strip("[]").split(",")
        #print c
        #print "Preferred:", c.preferred_name, "<> Trigger:", c.trigger.strip("[]").split("-tx-1-")[0].strip('"')
        for t in types:
            if t == "clna":
                #print "Remedy: ", c.preferred_name, "<> Trigger:", c.trigger.strip("[]").split("-tx-1-")[0].strip('"')
Ejemplo n.º 20
0
import pandas as pd
import ast
from pymetamap import MetaMap

# https://metamap.nlm.nih.gov/Installation.shtml
# Location of the MetaMap binary on this machine.
PATH_TO_METAMAP = '/UMLS/MetaMap/public_mm/bin/metamap18'

# MetaMap instance created once at import time.
mm = MetaMap.get_instance(PATH_TO_METAMAP)

# https://metamap.nlm.nih.gov/SemanticTypesAndGroups.shtml
# Pipe-delimited table read without a header row; the column names are assigned on the next line.
semantic_types = pd.read_csv("SemanticTypes_2018AB.csv", header=None, sep='|')
semantic_types.columns = ['Abbreviation', 'UniqueIdentifier', 'SemanticType']


def get_concept(sentence):
    '''
        extracts UMLS concepts for the given input text

        param sentence: user provided text
        type sentence: str

        returns: UMLS concept and UMLS semantic type abbreviation
        rtype: str, list
    '''
    max = 0
    index = 0

    # extract UMLS concept = [index,mm,score,preferred_name,cui,semtypes,trigger,location,pos_info,tree_codes]
    concepts, error = mm.extract_concepts([sentence])

    if concepts:
Ejemplo n.º 21
0
"""
Week 8, in-class task
CS 584: Applied BioNLP
@author Abeed Sarker
email: [email protected]

Metamap example based on pymetamap
Created: 10/1/2020
***DO NOT REDISTRIBUTE***

"""
from pymetamap import MetaMap
import pandas as pd

# The binary lives in public_mm/bin/; the path used to read "binmetamap18",
# i.e. the "/" between "bin" and "metamap18" was missing.
mm = MetaMap.get_instance(
    '/Users/thiago/Documents/METAMAP/public_mm/bin/metamap18')

# Demo: extract and print the concepts of a single hand-written sentence.
sents = ['john had a heart attack and he has high blood pressure']
concepts, errors = mm.extract_concepts(sents)
for c in concepts:
    print(c.index, c.score, c.preferred_name, c.cui, c.semtypes)

print('***---***')
# Load the training CSV; note that `sents` is rebound to its 'Sentence' column.
f_path = './piboso-train.csv'
df = pd.read_csv(f_path)
texts = df['Text']
ids = df['Document']
sents = df['Sentence']
from collections import defaultdict

# Filled by the loop that follows this block.
concepts_per_sent = defaultdict(list)
for t, s, i in zip(texts, sents, ids):
Ejemplo n.º 22
0
# NOTE: each of these names is rebound from the callable to the object it
# returns, so PICO() / MED() cannot be called again after this point.
PICO = PICO()
PICO_processor = PICO.get_processor()
PICO_estimator = PICO.get_estimator(PICO_processor)
MED = MED()
MED_processor = MED.get_processor()
MED_estimator = MED.get_estimator(MED_processor)
#from src import Evidence_Proposition_clustering

# load attribute tagger
from general_utils import negex
# Read the negation rules inside a context manager so the file is closed.
with open(config.negation_rules) as rfile:
    irules = negex.sortRules(rfile.readlines())
# MetaMap is optional: mm stays None unless UMLS support is switched on.
mm = None
if config.use_UMLS:
    from pymetamap import MetaMap
    mm = MetaMap.get_instance(config.metamap_dir)
from src.postprocessing import attribute_processor
# Rebinds the imported name to the constructed processor object.
attribute_processor = attribute_processor(mm, negex.negTagger, irules)

#bert tokenizer
tokenizer = tokenization.FullTokenizer(vocab_file=config.vocab_file,
                                       do_lower_case=False)


def txt2ntokens(text):
    # Tokenizes every word with the module-level `tokenizer` and collects the
    # resulting tokens in one flat list.
    # NOTE(review): `textlist` is not defined in this function and `text` is
    # never used -- presumably `textlist` should be derived from `text`; confirm.
    # NOTE(review): no return statement is visible; the function may continue
    # beyond this excerpt.
    tokens = []
    for i, word in enumerate(textlist):
        token = tokenizer.tokenize(word)
        tokens.extend(token)
Ejemplo n.º 23
0
import sys
from multiprocessing.pool import Pool
from nltk import word_tokenize
from nltk.corpus import words
from pymetamap import MetaMap
import numpy as np

# BAR is the punctuation share referred to by remove_over_punctuated below.
# NOTE(review): MIN_LENGTH / MAX_LENGTH look like sentence-length bounds;
# their use is not visible here -- confirm.
BAR = 0.03
MIN_LENGTH = 6
MAX_LENGTH = 40

# pylint: disable=invalid-name, len-as-condition, too-many-locals

# Word list of the NLTK `words` corpus, stored as a set.
vocab = set(words.words())
# Machine-specific path of the MetaMap binary and the instance built from it.
mm_home = '/home/nikhil.pattisapu/tools/metamap2016/public_mm/bin/metamap16'
mm = MetaMap.get_instance(mm_home)
# Semantic-type abbreviations; how they are applied is not visible in this excerpt.
sem_types = [
    'antb', 'bhvr', 'bmod', 'blor', 'bdsu', 'bdsy', 'chem', 'clna', 'cnce',
    'clnd', 'dsyn', 'enty', 'evnt', 'fndg', 'food', 'ftcn', 'hlca', 'hlco',
    'idcn', 'inch', 'ocdi', 'ocac', 'bpoc', 'orch', 'podg', 'phsu', 'phpr',
    'lbpr', 'resa', 'resd', 'sbst', 'sosy', 'tmco'
]
# NOTE(review): threshold whose use is outside this excerpt -- confirm its meaning.
mm_threshold = 2


def remove_over_punctuated(sents):
    """Unused: Remove sentences which contain greater than BAR % of punctuation
    characters"""
    res = []
    for sent in sents:
        if len(sent) == 0:
Ejemplo n.º 24
0
from flask import Flask, request
from flask_restful import Resource, Api
from flask import jsonify
from pymetamap import MetaMap

# Flask application, its flask_restful API wrapper, and the MetaMap instance
# used by the resource below; all three are created at import time.
app = Flask(__name__)
api = Api(app)
mm = MetaMap.get_instance('/Users/subigyanepal/Downloads/public_mm/bin/metamap16')


class MetaMapConcepts(Resource):
    """REST resource that returns the MetaMap concepts found in a piece of text."""

    def get(self, text):
        """Extract the concepts of `text` and return them as JSON under the key 'concepts'."""
        concepts, error = mm.extract_concepts([text])
        payload = [
            {
                'score': concept.score,
                'preferred_name': concept.preferred_name,
                'semtypes': concept.semtypes,
                'trigger': concept.trigger,
                'pos_info': concept.pos_info,
                # second-to-last character of the trigger string
                'negation': concept.trigger[-2],
            }
            for concept in concepts
        ]
        return jsonify(concepts=payload)
        
        
# Route: the text to annotate is taken from the URL path.
api.add_resource(MetaMapConcepts, '/text/<string:text>')

# Start the Flask server (threaded) only when the file is run as a script.
if __name__ == '__main__':
    app.run(threaded=True)
def load_metaMap():
    """Create the MetaMap instance and store it in the module-level `global_metamap`."""
    global global_metamap
    metamap_binary = '../../../../../opt/public_mm/bin/metamap12'
    global_metamap = MetaMap.get_instance(metamap_binary)
Ejemplo n.º 26
0
# Schema name used in the search_path statement built below.
schema_name = 'mimiciii'

# NOTE(review): the description text does not describe this script.
parser = argparse.ArgumentParser(description='Process some integers.')
parser.add_argument('-N', type=int, help='number of data points to use')
args = parser.parse_args()

datadir = '/Users/irenechen/Documents/mimic-data'

# Connect to postgres with a copy of the MIMIC-III database
# NOTE(review): `dbname` and `sqluser` are not defined in the visible part of
# the file -- confirm they are set earlier.
con = psycopg2.connect(dbname=dbname, user=sqluser)

# the below statement is prepended to queries to ensure they select from the right schema
query_schema = 'set search_path to ' + schema_name + ';'

mm = MetaMap.get_instance(
    '/afs/csail.mit.edu/group/clinicalml/shared/mimic_folder/timeline/public_mm/bin/metamap18'
)

df = pd.read_csv('../pset2/discharge_summaries.csv')

N = args.N

# Output CSV named after N; the handle stays open for the writer below.
# NOTE(review): -N is optional, so args.N can be None and '%d' would then
# raise; mode 'wb' matches the Python 2 csv module -- confirm the target version.
csvfile = open('dc_%d.csv' % args.N, 'wb')
fieldnames = [
    'index', 'pos_info', 'mm', 'trigger', 'semtypes', 'patientid',
    'preferred_name', 'score', 'location', 'tree_codes', 'cui'
]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()

def metamap_worker(metamap_path, tweets_path, output_dir, batch_size, batch_start, batch_end=None):
    """
    Runs MetaMap on a subset of tweets in the given dataset.

    Tweets are sent to MetaMap in chunks of METAMAP_BATCH_SIZE. When a chunk
    fails with a TypeError or IndexError, its tweets are retried one by one
    and the concepts of every tweet that succeeds are kept. Extracted concepts
    are written to ``concepts_<n>.csv`` files in `output_dir`.

    metamap_path: Path to the metamap executable. For example,
        '/path/to/MetaMap/public_mm/bin/metamap18'
    tweets_path: Path to a CSV file containing tweets in the standard format.
    output_dir: Path to a directory in which output batch files should be written.
    batch_size: Number of tweets worth of concepts to write out in each batch file.
    batch_start: Index of the tweet in the file to start with. Batches will be
        numbered automatically.
    batch_end: Index of the tweet in the file to end with (exclusive).
    """

    # Setup MetaMap instance
    mm = MetaMap.get_instance(metamap_path)

    # Read and preprocess tweets
    tweets = utils.read_tweet_csv(tweets_path)
    data = tweets.full_text.values.tolist()
    data = [utils.preprocess_for_metamap(tweet) for tweet in data]
    tweet_ids = tweets["id"].values.tolist()

    # Remove spurious and duplicated tweets
    data, tweet_ids = filter_tweets(data, tweet_ids)
    assert len(data) == len(tweet_ids)

    all_concepts = []
    batch_idx = batch_start // batch_size
    print("Starting batch index:", batch_idx)

    failed_batches = []

    stop = batch_end if batch_end else len(data)
    for offset in range(batch_start, stop, METAMAP_BATCH_SIZE):
        batch_tweets = data[offset:offset + METAMAP_BATCH_SIZE]
        batch_ids = list(range(offset, offset + len(batch_tweets)))

        try:
            try:
                # Extract concepts as a batch
                concepts, error = mm.extract_concepts(batch_tweets, batch_ids)
            except (TypeError, IndexError):
                # Try extracting concepts individually and keep the results of
                # every tweet; previously each iteration overwrote the
                # previous one, so only the last tweet's concepts survived.
                concepts, error = [], None
                for i, tweet in enumerate(batch_tweets):
                    tweet_concepts, tweet_error = mm.extract_concepts(
                        [tweet], [i + offset])
                    if tweet_error is not None:
                        print(tweet_error)
                    else:
                        concepts.extend(tweet_concepts)

            if error is not None:
                print(error)
            else:
                all_concepts.extend([concept_to_dict(concept, tweet_ids) for concept in concepts])

        except Exception:
            # KeyboardInterrupt does not derive from Exception, so it still
            # propagates without an explicit re-raise.
            print("Failed batch", offset)
            failed_batches.append(offset)

        if offset % 1000 == 0:
            print("Batch {}, {} concepts extracted".format(offset, len(all_concepts)))

        # Flush to disk whenever a full output batch has been processed.
        if (offset + METAMAP_BATCH_SIZE) % batch_size == 0:
            df = pd.DataFrame(all_concepts)
            df.to_csv(os.path.join(output_dir, "concepts_{}.csv".format(batch_idx)))
            batch_idx += 1
            all_concepts = []

    # Write out the last (partial) batch
    if len(all_concepts) > 0:
        df = pd.DataFrame(all_concepts)
        df.to_csv(os.path.join(output_dir, "concepts_{}.csv".format(batch_idx)))
Ejemplo n.º 28
0
def metamap(doc_list):
    """
    Annotate documents with MetaMap in parallel and pickle the results.

    Every document (a dict with the keys 'text' and '_id') receives a 'result'
    entry that maps ``(start, end)`` spans to a list of ``(cui, score)``
    pairs. Documents that raise during processing are reported and left out.
    The merged list is written to
    ``./results/<args.data>/<args.model>_<args.split>.pkl``.
    """
    from pymetamap import MetaMap
    mm = MetaMap.get_instance(args.metamap_path)

    def process_data(pid, doc_list):
        """Annotate one chunk of documents; `pid` only labels the progress output."""

        data = []
        for i, doc in enumerate(doc_list):

            try:
                text = clean_text(doc['text'])
                # Checked with an explicit raise (asserts vanish under -O) and
                # before the costly MetaMap call. The same message reaches the
                # handler below.
                if len(text) != len(doc['text']):
                    raise ValueError(
                        'Text length does not match after pre-processing')

                concepts, error = mm.extract_concepts([text], [doc['_id']])

                res_list = ddict(list)
                for concept in concepts:
                    if concept[1] != 'MMI': continue

                    # "start/length" entries, separated by ';' or ','
                    pos_info = [
                        list(map(int, x.split('/')))
                        for x in concept.pos_info.replace(',', ';').replace(
                            '[', '').replace(']', '').split(';')
                    ]
                    # NOTE(review): the three values below are not used for
                    # the result, but a failure while computing them still
                    # drops the document -- confirm before removing them.
                    men_cnt = [
                        len(x.split(',')) for x in concept.pos_info.split(';')
                    ]
                    men_sing = replace(concept.trigger,
                                       '"').split('"')[1::2][1::2]
                    mentions = mergeList([[men] * men_cnt[j]
                                          for j, men in enumerate(men_sing)])

                    for start, offset in pos_info:
                        end = start + offset
                        res_list[(start, end)].append(
                            (concept.cui, concept.score))

                doc['result'] = dict(res_list)
                data.append(doc)
                if i % 10 == 0:
                    print('Completed [{}] {}, {}'.format(
                        pid, i,
                        time.strftime("%d_%m_%Y") + '_' +
                        time.strftime("%H:%M:%S")))
            except Exception as e:
                # Exceptions raised without arguments have an empty `args`;
                # fall back to the exception itself instead of failing here.
                cause = e.args[0] if e.args else e
                print('\nException Cause: {}'.format(cause))

        print('All work done {}!!'.format(pid))
        return data

    num_procs = args.workers
    chunks = partition(doc_list, num_procs)
    data_list = mergeList(
        Parallel(n_jobs=num_procs)(delayed(process_data)(i, chunk)
                                   for i, chunk in enumerate(chunks)))

    base_dir = './results/{}'.format(args.data)
    make_dir(base_dir)
    dump_pickle(data_list, '{}/{}_{}.pkl'.format(base_dir, args.model,
                                                 args.split))
Ejemplo n.º 29
0
from negbio.pipeline2.dner_mm import MetaMapExtractor
from negbio.pipeline2.pipeline import NegBioPipeline
from pymetamap import MetaMap


def read_cuis(pathname):
    """Return the set of non-blank lines of the file, stripped of surrounding whitespace."""
    with open(pathname) as fp:
        stripped_lines = (raw.strip() for raw in fp)
        return {entry for entry in stripped_lines if entry}


if __name__ == '__main__':
    # Command-line options are described by the module docstring.
    argv = parse_args(__doc__)
    mm = MetaMap.get_instance(argv['--metamap'])

    # Optional CUI list: stays None when --cuis is not given.
    cuis = None
    if argv['--cuis'] is not None:
        cuis = read_cuis(argv['--cuis'])

    extractor = MetaMapExtractor(mm, cuis)
    # Single-step pipeline that runs the extractor over the input files.
    pipeline = NegBioPipeline(pipeline=[('MetaMapExtractor', extractor)])
    pipeline.scan(
        source=argv['<file>'],
        directory=argv['--output'],
        suffix=argv['--suffix'],
        overwrite=argv['--overwrite'],
    )
from pymetamap import MetaMap
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
from pyDatalog import pyDatalog

# initialize
# MetaMap wrapper bound to a hard-coded, machine-specific install path.
# NOTE(review): presumably this should come from configuration - confirm.
mm = MetaMap.get_instance('/Users/sileshu/Downloads/public_mm/bin/metamap16')
# Declare the pyDatalog terms used by the clauses below. `Y` is declared here
# but does not appear in any of the clauses visible in this part of the file.
pyDatalog.create_terms('RH, PH, C, BR, P, BL, op, Y')

# clauses
# Each clause gives op[RH,PH,C,BR,P,BL] an action string for one combination
# of the boolean inputs; multi-step actions are one comma-separated string.
# NOTE(review): the meaning of the six flags is not defined in this file. The
# action names suggest a first-aid decision table (hazards, consciousness,
# breathing, pulse, bleeding) - confirm against the source protocol.
(op[RH,PH,C,BR,P,BL] == 'Eliminate_Hazard') <= (True == RH)
(op[RH,PH,C,BR,P,BL] == 'Eliminate_Hazard') <= (False == RH) & (True == PH)
(op[RH,PH,C,BR,P,BL] == 'Secondary_Survey') <= (False == RH) & (False == PH) & \
                                            (True == C) & (False == BL)
(op[RH,PH,C,BR,P,BL] == 'Control_Bleeding') <= (False == RH) & (False == PH) & \
                                            (True == C) & (True == BL)
(op[RH,PH,C,BR,P,BL] == 'Open_the_Airway, Secondary_Survey') <= (False == RH) & \
                                            (False == PH) & (False == C) & (True == BR) & \
                                            (True == P) & (False == BL)
(op[RH,PH,C,BR,P,BL] == 'Open_the_Airway, Control_Bleeding') <= (False == RH) & \
                                            (False == PH) & (False == C) & (True == BR) & \
                                            (True == P) & (True == BL)
(op[RH,PH,C,BR,P,BL] == 'Open_the_Airway, Start_Artificial_Ventilation, Secondary_Survey') <= (False == RH) & \
                                            (False == PH) & (False == C) & (False == BR) & \
                                            (True == P) & (False == BL)
(op[RH,PH,C,BR,P,BL] == 'Open_the_Airway, Start_Artificial_Ventilation, Control_Bleeding') <= (False == RH) & \
                                            (False == PH) & (False == C) & (False == BR) & \
                                            (True == P) & (True == BL)
(op[RH,PH,C,BR,P,BL] == 'Open_the_Airway, Start_External_Cardiac_Compressions, Secondary_Survey') \
Ejemplo n.º 31
0
    def getMetaMapConcepts(self, altText=None):
        """
        Tag the narrative with MetaMap and return the concepts of interest.

        The first call parses the XML, runs MetaMap (through the 'pymetamap'
        wrapper) on self.rawText(), keeps the concepts whose semantic types
        match orch/phsu/sosy/dsyn, appends a METAMAP element to every XML
        node those concepts were located in, writes the XML back out and
        caches the kept concepts in Preprocessor.textList. Later calls return
        the cached list without repeating any of that work.

        Args:
            altText (str) Currently ignored: the text that gets tagged is always self.rawText(). The parameter is kept so existing callers keep working.

        Returns:
            the retained MetaMap concepts, as described in the pymetamap documentation (list)
        """
        if Preprocessor.textList.get('getMetaMapConcepts') is not None:
            return Preprocessor.textList.get('getMetaMapConcepts')

        self.parseXML()
        mm = MetaMap.get_instance('/work/tkakar/public_mm/bin/metamap14')
        rawText = self.rawText()

        concepts, _error = mm.extract_concepts([rawText])
        # Raw string: the pattern itself is unchanged, but '\[' is no longer
        # an invalid string escape.
        pattern = re.compile(
            r'(\[(?:(orch|phsu|sosy|dsyn),?(orch|phsu|sosy|dsyn)?)\])')
        globalIDByConcept = {}
        returnedList = []
        for concept in concepts:
            # Records exposing an 'aa' attribute are skipped (presumably
            # pymetamap's acronym/abbreviation records - confirm).
            if hasattr(concept, 'aa'):
                continue
            #TODO, see if there is any information that we are missing due to some combination not described by the Regex
            if not pattern.search(concept.semtypes):
                continue

            returnedList.append(concept)
            posInfo = concept.pos_info
            # The concept name is the 4th '-'-separated field of the trigger;
            # its double quotes are removed.
            conceptName = concept.trigger.split('-')[3].replace('"', '')

            # The original test was `';' or '^' in posInfo`, which is always
            # true because the non-empty string ';' is truthy, so the
            # single-argument call below could never run.
            if ';' in posInfo or '^' in posInfo:
                posInfoList = self.offsetParse(posInfo, ';')
            else:
                posInfoList = self.offsetParse(posInfo)
            #We need to change the format of the posInfos from (offset,span) to (offsetStartIndex, offsetEndIndex) here:
            posInfoList = [(offset, span + offset)
                           for (offset, span) in posInfoList]

            for listIndex, (startIndex, endIndex) in enumerate(posInfoList):
                lfNum = rawText.count('\n', 0, startIndex)
                lastIdx = rawText.rfind(conceptName, 0,
                                        startIndex + len(conceptName))
                # Number of line feeds between the last instance of the
                # concept name and where MetaMap thinks the word is.
                lfNumSpecific = rawText.count('\n', lastIdx, startIndex)
                #For some reason, we need to subract one at the end, TODO: Figure out why
                shift = lfNumSpecific - (lfNum + 1) - 1
                posInfoList[listIndex] = (startIndex + shift,
                                          endIndex + shift)

            # We have the fixed offsets for each mention of the semantic
            # type; now find their location in the xml file.
            globalIDByConcept[concept] = [
                self.placeOffsetInXML(conceptName,
                                      word_tokenize(conceptName),
                                      newStartIdx, newEndIdx - newStartIdx)
                for newStartIdx, newEndIdx in posInfoList
            ]

        # .items() instead of the Python-2-only .iteritems().
        for concept, globalIDList in globalIDByConcept.items():
            for gIDList in globalIDList:
                for gID in gIDList:
                    conceptXMLTag = self.root.find(".//*[@globalID='" +
                                                   str(gID) + "']")
                    tempMetaMapElem = ET.Element("METAMAP")
                    tempMetaMapElem.text = concept.semtypes.replace("'", '')
                    conceptXMLTag.append(tempMetaMapElem)

        Preprocessor.textList['getMetaMapConcepts'] = returnedList
        self.writeToXML()

        return Preprocessor.textList.get('getMetaMapConcepts')
Ejemplo n.º 32
0
 def __init__(self, args):
     """Create the MetaMap instance for ``args.metamap_path`` and keep it as ``self.model``."""
     # Imported inside the method rather than at module level
     # (presumably so pymetamap is only needed when this class is built - confirm).
     from pymetamap import MetaMap
     self.model = MetaMap.get_instance(args.metamap_path)
Ejemplo n.º 33
0
def message(payload):
	"""Handle one incoming chat message and advance the symptom-checker dialogue.

	Conversation state is kept in module-level globals: ``chatstep`` (current
	step), ``symptoms`` (free-text symptoms collected so far), ``cuis`` (the
	``cui`` of every accepted symptom concept) and ``morecui`` (related
	symptoms offered to the user). The steps are:

	0. any non-empty message starts the onboarding;
	1. every message is stored as a symptom until the user sends "done"; the
	   symptoms are then run through MetaMap and the "[sosy]" concepts are
	   echoed back for confirmation;
	2. "yes" confirms them and either offers related symptoms or, when there
	   are none, reports the predicted specialists; "no" returns to step 1;
	3. the reply is "no" or a comma-separated list of the numbers of the
	   related symptoms to add; the predicted specialists are then sent.

	Once ``chatstep`` reaches 4 no branch matches and messages are ignored.

	Args:
		payload: Event payload; ``payload["event"]`` supplies the ``channel``,
			``user`` and ``text`` values read here.

	Returns:
		The result of ``start_onboarding`` when the onboarding is started,
		otherwise ``None``.
	"""
	global chatstep
	global concepts
	global cuis
	global morecui

	event = payload.get("event", {})

	channel_id = event.get("channel")
	user_id = event.get("user")
	text = event.get("text")

	# Events without a "text" field used to crash on text.find(); ignore them.
	if text is None:
		return
	# Skip placeholder messages. Any text containing "displayed." stops here,
	# so the later "This content can't be displayed." checks were always true
	# and have been dropped.
	if "displayed." in text:
		return
	print(text)
	print(chatstep)

	#---Onboarding
	if chatstep == 0:
		if text:
			chatstep = chatstep + 1
			return start_onboarding(user_id, channel_id)

	elif chatstep == 1: #getting the symptoms
		if text and text.lower() != "done":
			symptoms.append(text)
		elif text and text.lower() == "done":
			chatstep += 1
			print(chatstep)
			print(text)
			#IMPROTANT: metamap services should be started before running the following lines
			# "wsdserverctl start" and "skrmedpostctl start"
			mm = MetaMap.get_instance(METAMAP)
			concepts, _error = mm.extract_concepts(symptoms)
			#keep only semantic type of "sign or symptom"
			sos = [con for con in concepts if con.semtypes == "[sosy]"]
			cuis.extend(con.cui for con in sos)

			send_message(user_id, channel_id,"Thanks! Let's check if I understand you correctly!")
			send_message(user_id, channel_id,"Do you have the following symptoms?\n")
			#confirm symptoms
			txt = "".join("*{}*\n".format(con.preferred_name) for con in sos)
			send_message(user_id, channel_id,txt)
			send_divider(user_id, channel_id)
			send_message(user_id, channel_id,"Please enter yes/no")
			symptoms.clear()
			return

	elif chatstep == 2: # confirmation of symptoms
		if text and text.lower() == "yes":
			chatstep = chatstep + 1
			morecui = list(findMoreSymptoms(cuis))
			if len(morecui) != 0:
				msg = "I found some related sypmtoms. If you have any of the following symptoms please enter the numbers separated by comma. Otherwise please enter \'no\'"
				send_message(user_id,channel_id,msg)
				send_divider(user_id,channel_id)
				lines = []
				for number, con in enumerate(morecui, start=1):
					name = con2word[con][0]
					# Prefer the nci description, then csp, then the bare name.
					try:
						lines.append("{}- {} : {} \n".format(number, name, nci[con]))
					except Exception:
						try:
							lines.append("{}- {} : {} \n".format(number, name, csp[con]))
						except Exception:
							lines.append("{}- {} \n".format(number, name))

				send_message(user_id,channel_id,"".join(lines))
				send_divider(user_id,channel_id)
				return
			else:#no more symptoms found
				spec,prob = predictSpecialist(cuis)
				# len() check matches step 3 and also works for array-like results.
				if spec is not None and len(spec) > 0:
					send_message(user_id,channel_id,"Based on my analysis you should visit the following specialist:")
					# Fixed: the loop iterated over len(spec) (a TypeError) and
					# the format string had no placeholder for the probability.
					# NOTE(review): step 3 maps spec through con2word; the raw
					# value is kept here as in the original - confirm.
					txt = "".join(
						"{} (probability = {:.3f})\n-------------------------------\n".format(name, p)
						for name, p in zip(spec, prob))
					send_message(user_id,channel_id,txt)
				else:
					send_message(user_id,channel_id,"I was not able to find any specialist based on your symptoms so please visit a general practitioner for more guidance.")
				chatstep = 4
				return
		elif text and text.lower() == "no":
			send_message(user_id,channel_id,"*Please describe your signs and symptoms. When finished please type \'done\'.*")
			chatstep = chatstep - 1
		return

	elif chatstep == 3:
		chatstep = chatstep + 1
		print(text)
		if text and text.lower() != "no" and "displayed" not in text:
			try:
				morenum = [int(x) for x in text.split(",")]
				print(morenum)
				print(morecui)
				# Numbers are 1-based. 0 or a negative number used to pick an
				# entry from the end of the list, so reject them explicitly.
				if not all(1 <= num <= len(morecui) for num in morenum):
					raise ValueError("selection out of range")
				chosen = [morecui[num - 1] for num in morenum]
			except Exception:
				# Any malformed reply re-prompts instead of failing the handler.
				chatstep = chatstep - 1
				send_message(user_id,channel_id,"Sorry! I did not understand! If you have any of the above symptoms please enter the numbers separated by comma. Otherwise, please enter No")
				return
			# Added only after the whole reply validated, so a rejected reply
			# no longer leaves part of its selection in cuis.
			cuis.extend(chosen)
		spec,prob = predictSpecialist(cuis)
		if (len(spec)>0):
			send_message(user_id,channel_id,"Based on my analysis you should visit the following specialist:")
			for name, p in zip(spec, prob):
				txt = "*{} (probability = {:.3f} )*".format(con2word[name][0], p)
				send_message(user_id,channel_id,txt)
				send_divider(user_id,channel_id)
		else:
			send_message(user_id,channel_id,"Sorry! I was not able to find any specialist based on your symptoms so please visit a general practitioner for more guidence.")
			return
		send_message(user_id,channel_id,"Thanks for using Docmatch :wave:")
		return
	return
Ejemplo n.º 34
0
from pymetamap import MetaMap

# MetaMap binary installed on this machine (hard-coded path).
mm = MetaMap.get_instance('/work/tkakar/public_mm/bin/metamap14')


### if using manual input
#sents = ['she fell down and felt dizzy after taking neproxin.', 'heart attack']
#concepts,error = mm.extract_concepts(sents,[1,2],word_sense_disambiguation=True)


## if using file as input
sample_text = 'textSample.txt'
concepts, error = mm.extract_concepts(filename=sample_text, word_sense_disambiguation=True)

## specify output filename
# `with` closes the file even if writing a concept raises.
with open('Vimig/Results.txt', 'w') as f:
    for concept in concepts:
        ## to output only specific semtypes, guard the write with:
        #if concept.semtypes in ['[sosy]', '[phsu]', '[dsyn]']:
        # f.write replaces the Python-2-only `print >> f, concept` and emits
        # the same text (str(concept) plus a newline) on Python 2 and 3.
        f.write('{}\n'.format(concept))
        ## to write specific information instead:
        #f.write("Trigger= " + concept.trigger + "\t SemType= " + concept.semtypes + "\t Pos= " + concept.pos_info + "\t\n")