def update_mimic_doc_dates(doc_dates):
    es = EntityCentricES.get_instance('./index_settings/es_mimic_setting.json')
    container = []
    utils.multi_thread_tasking(doc_dates, 20, do_doc_update_date,
                               args=[es, container])
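# Usage sketch: do_doc_update_date (defined elsewhere in this module) is
# assumed to receive one element of doc_dates per call, following the
# multi_thread_tasking pattern used throughout this file, so doc_dates would
# be a list of per-document date records; the ids and dates below are
# hypothetical:
#
#   update_mimic_doc_dates([
#       {'id': '100001', 'date': '2132-01-21'},
#       {'id': '100002', 'date': '2135-06-03'},
#   ])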
def index_mimic_notes():
    es = EntityCentricES.get_instance('./index_settings/es_mimic_setting.json')
    ann_files = [f for f in listdir(_f_yodie_anns)
                 if isfile(join(_f_yodie_anns, f))]
    patients = []
    for ann in ann_files:
        print 'indexing %s ...' % ann
        utils.multi_thread_large_file_tasking(join(_f_yodie_anns, ann), 20,
                                              do_index_mimic,
                                              args=[es, patients])
    print 'full text and annotations done.'
    patients = list(set(patients))
    index_patients(patients, es)
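# Usage sketch: _f_yodie_anns is a module-level path to the Bio-YODIE output
# folder; pointing it at an annotation dump and calling the function is all a
# one-off MIMIC indexing run needs (the path below is hypothetical):
#
#   _f_yodie_anns = '/data/mimic/yodie_output'
#   index_mimic_notes()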
def do_semehr_index(settings, patients, doc_to_patient):
    """
    do SemEHR index: concept/document level first, then patient level
    :param settings: a ProcessSetting instance wrapping the job configuration
    :param patients: list of patient ids to index
    :param doc_to_patient: dict mapping doc id to patient id
    :return:
    """
    es = EntityCentricES(settings.get_attr(['semehr', 'es_host']))
    es.index_name = settings.get_attr(['semehr', 'index'])
    es.concept_doc_type = settings.get_attr(['semehr', 'concept_doc_type'])
    es.entity_doc_type = settings.get_attr(['semehr', 'entity_doc_type'])
    es.doc_level_index = settings.get_attr(['semehr', 'doc_level_index'])
    f_yodie_anns = settings.get_attr(['yodie', 'output_file_path'])
    ann_files = [f for f in listdir(f_yodie_anns)
                 if isfile(join(f_yodie_anns, f))]
    if settings.get_attr(['job', 'semehr-concept']) == 'yes':
        logging.info('[SemEHR-step] starting semehr-concept process')
        logging.debug('working on files : %s' % ann_files)
        # index concepts
        concept_index = settings.get_attr(['semehr', 'concept_index'])
        for ann in ann_files:
            utils.multi_thread_large_file_tasking(
                join(f_yodie_anns, ann), 10, do_index_100k_anns,
                args=[es, doc_to_patient, concept_index])
        logging.info('[SemEHR-step-end] concept/document level indexing done')
    if settings.get_attr(['job', 'semehr-patients']) == 'yes':
        logging.info('[SemEHR-step] indexing annotations at patient level')
        # index patients
        es_doc_url = settings.get_attr(['semehr', 'es_doc_url'])
        es_full_text = Elasticsearch([es_doc_url],
                                     serializer=JSONSerializerPython2(),
                                     verify_certs=False)
        ft_index_name = settings.get_attr(['semehr', 'full_text_index'])
        ft_doc_type = settings.get_attr(['semehr', 'full_text_doc_type'])
        ft_entity_field = settings.get_attr(['semehr', 'full_text_patient_field'])
        ft_fulltext_field = settings.get_attr(['semehr', 'full_text_text_field'])
        utils.multi_thread_tasking(
            patients, 10, do_index_100k_patients,
            args=[es, es_full_text, ft_index_name, ft_doc_type,
                  ft_entity_field, ft_fulltext_field])
        logging.info('[SemEHR-step-end] patient level indexing done')
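# For reference, a configuration fragment covering the keys read above; the
# structure follows the get_attr() paths in this function, but every value is
# a hypothetical placeholder rather than a shipped default:
#
#   {
#     "job": {"semehr-concept": "yes", "semehr-patients": "yes"},
#     "yodie": {"output_file_path": "/data/yodie_output"},
#     "semehr": {
#       "es_host": "localhost:9200",
#       "index": "semehr_index",
#       "concept_doc_type": "ctx_concept",
#       "entity_doc_type": "patient",
#       "doc_level_index": "semehr_docs",
#       "concept_index": "semehr_concepts",
#       "es_doc_url": "http://localhost:9200",
#       "full_text_index": "epr_documents",
#       "full_text_doc_type": "doc",
#       "full_text_patient_field": "patient_id",
#       "full_text_text_field": "fulltext"
#     }
#   }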
def process_semehr(config_file):
    """
    a pipeline to run all SemEHR related processes:
    0. ES doc copy from one index to another;
    1. bio-yodie NLP pipeline annotation on docs;
    2. entity centric SemEHR ES indexing;
    3. actionable transparency;
    4. document annotation analysis (post processing) and patient level indexing;
    5. cohort result population for a research study;
    6. cohort doc based result collection
    :param config_file:
    :return:
    """
    # read the configuration
    ps = ProcessSetting(config_file)

    # setting log configuration
    log_level = 'INFO' if ps.get_attr(['logging', 'level']) is None \
        else ps.get_attr(['logging', 'level'])
    log_format = '%(name)s %(asctime)s %(levelname)s %(message)s' \
        if ps.get_attr(['logging', 'format']) is None \
        else ps.get_attr(['logging', 'format'])
    log_file = None if ps.get_attr(['logging', 'file']) is None \
        else ps.get_attr(['logging', 'file'])
    logging.basicConfig(level=log_level, format=log_format)
    if log_file is not None:
        formatter = logging.Formatter(log_format)
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(log_level)
        file_handler.setFormatter(formatter)
        logging.getLogger().addHandler(file_handler)
        logging.info('logging to %s' % log_file)

    # initialise the jobstatus class instance
    job_file = join(ps.get_attr(['job', 'job_status_file_path']),
                    'semehr_job_status_%s.json' % ps.get_attr(['job', 'job_id']))
    logging.info('[SemEHR-step] using job status file %s' % job_file)
    job_status = JobStatus(job_file)
    job_status.job_start()

    # preload: load documents to es
    if ps.get_attr(['job', 'epr_index']) == 'yes':
        logging.info('[SemEHR-step] load documents to elasticsearch...')
        load_document_to_es(settings=ps)
        logging.info('[SemEHR-step-end] epr_index step done')

    data_rows = []
    doc2pid = {}
    pids = []
    if ps.get_attr(['job', 'load_docs']) == 'yes':
        sql_template = ps.get_attr(['new_docs', 'sql_query'])
        logging.info('[SemEHR-step] retrieving docs by using the template [%s]'
                     % sql_template)
        data_rows = get_docs_for_processing(
            job_status, sql_template,
            ps.get_attr(['new_docs', 'dbconn_setting_file']))
        logging.info('total docs num is %s' % len(data_rows))
    elif ps.get_attr(['job', 'cohort_docs']) == 'yes':
        logging.info('[SemEHR-step] retrieving docs by cohort [%s]'
                     % ps.get_attr(['cohort_docs', 'es_cohort_file']))
        data_rows, doc2pid, pids = es_get_cohort_docs(ps)
        logging.info('total docs num is %s' % len(data_rows))
    try:
        # 0. copy docs
        if ps.get_attr(['job', 'copy_docs']) == 'yes':
            logging.info('[SemEHR-step] copy docs')
            docs = [str(r['docid']) for r in data_rows]
            utils.multi_thread_tasking(
                docs, ps.get_attr(['doc_copy', 'thread_num']), do_copy_doc,
                args=[EntityCentricES(ps.get_attr(['doc_copy', 'es_host'])),
                      ps.get_attr(['doc_copy', 'src_index']),
                      ps.get_attr(['doc_copy', 'src_doc_type']),
                      ps.get_attr(['doc_copy', 'dest_index']),
                      ps.get_attr(['doc_copy', 'dest_doc_type'])])
            logging.info('[SemEHR-step-end] copying docs done')

        if ps.get_attr(['job', 'yodie']) == 'yes':
            docid_path = '%s/%s_docids.txt' % (
                ps.get_attr(['yodie', 'input_doc_file_path']),
                ps.get_attr(['job', 'job_id']))
            logging.info('[SemEHR-step] doing yodie')
            # 1. do bio-yodie pipeline
            # 1.1 prepare the configuration file
            num_docs = produce_yodie_config(ps, data_rows, docid_path)
            if num_docs == 0:
                logging.info('[SemEHR-step-end] nothing to process, NLP step done')
            else:
                logging.info('total number of docs %s' % num_docs)
                # 1.2 set the env variables
                set_sys_env(ps)
                # 1.3 clear ann output folder
                logging.info('clearing %s ...'
                             % ps.get_attr(['yodie', 'output_file_path']))
                clear_folder(ps.get_attr(['yodie', 'output_file_path']))
                # 1.4 run bio-yodie
                os.chdir(ps.get_attr(['yodie', 'gcp_run_path']))
                if ps.get_attr(['yodie', 'os']) == 'win':
                    cmd = ' '.join([
                        'java',
                        "-Dgate.home=%s" % ps.get_attr(['env', 'gate_home']),
                        "-Dgcp.home=%s" % ps.get_attr(['env', 'gcp_home']),
                        "-Djava.protocol.handler.pkgs=gate.cloud.util.protocols",
                        "-cp .;{SCRIPTDIR}/conf;{SCRIPTDIR}/gcp.jar;{SCRIPTDIR}/lib/*;"
                        "{GATE_HOME}/bin/gate.jar;{GATE_HOME}/lib/*".format(
                            **{"SCRIPTDIR": ps.get_attr(['env', 'gcp_home']),
                               "GATE_HOME": ps.get_attr(['env', 'gate_home'])}),
                        '-Dat.ofai.gate.modularpipelines.configFile="%s/bio-yodie-1-2-1/main-bio/main-bio.config.yaml"'
                        % ps.get_attr(['env', 'yodie_path']),
                        "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
                        "gate.cloud.batch.BatchRunner",
                        "-t %s" % ps.get_attr(['yodie', 'thread_num']),
                        "-b %s" % ps.get_attr(['yodie', 'config_xml_path'])])
                else:
                    cmd = ' '.join([
                        'gcp-direct.sh',
                        "-t %s" % ps.get_attr(['yodie', 'thread_num']),
                        "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
                        "-b %s" % ps.get_attr(['yodie', 'config_xml_path']),
                        '-Dat.ofai.gate.modularpipelines.configFile="%s/bio-yodie-1-2-1/main-bio/main-bio.config.yaml"'
                        % ps.get_attr(['env', 'yodie_path'])])
                logging.debug('executing the following command to start NLP...')
                logging.info(cmd)
                p = Popen(cmd, shell=True, stderr=STDOUT)
                p.wait()
                if 0 != p.returncode:
                    job_status.set_status(False)
                    job_status.save()
                    logging.error('ERROR doing the NLP, stopped with a code [%s]'
                                  % p.returncode)
                    exit(p.returncode)
                else:
                    logging.info('[SemEHR-step-end] NLP step done')
            if 'semehr_path' in os.environ:
                logging.info('changing back to semehr_path: %s'
                             % os.environ['semehr_path'])
                os.chdir(os.environ['semehr_path'])

        # 2. do SemEHR concept/entity indexing
        if ps.get_attr(['job', 'semehr-concept']) == 'yes' or \
                ps.get_attr(['job', 'semehr-patients']) == 'yes':
            patients = []
            doc_to_patient = {}
            for r in data_rows:
                patients.append(str(r['patientid']))
                doc_to_patient[str(r['docid'])] = str(r['patientid'])
            patients = list(set(patients))
            do_semehr_index(ps, patients, doc_to_patient)

        # 3. do SemEHR actionable transparency
        if ps.get_attr(['job', 'action_trans']) == 'yes':
            logging.info('[SemEHR-step] doing transparency...')
            actionable_transparise(settings=ps)

        # 4. do SemEHR document annotation analysis (post processing)
        if ps.get_attr(['job', 'doc_analysis']) == 'yes':
            logging.info('[SemEHR-step] doing SemEHR annotation analysis...')
            do_semehr_doc_anns_analysis(settings=ps)
            logging.info('[SemEHR-step-end] doc_analysis step done')

        # 4.5 do SemEHR patient level index
        if ps.get_attr(['job', 'patient_index']) == 'yes':
            logging.info('[SemEHR-step] doing patient level indexing...')
            patient_level_indexing(settings=ps, pids=pids)
            logging.info('[SemEHR-step-end] patient level indexing done')

        # 5. do populate results for a research study
        if ps.get_attr(['job', 'populate_cohort_result']) == 'yes':
            logging.info('[SemEHR-step] doing SemEHR cohort result extraction...')
            populate_cohort_results(settings=ps)
            logging.info('[SemEHR-step-end] populate_cohort_result step done')

        # 6. do collect cohort doc based results for a research study
        if ps.get_attr(['job', 'cohort_doc_collection']) == 'yes':
            logging.info('[SemEHR-step] doing SemEHR cohort doc based collection...')
            collect_cohort_doc_results(settings=ps, doc2pid=doc2pid)
            logging.info('[SemEHR-step-end] collect_cohort_doc_results step done')

        job_status.set_status(True)
        job_status.save()
        logging.info('[SemEHR-process-end] all done')
    except Exception as e:
        logging.error('[SemEHR-process-ERROR] Failed to do SemEHR process %s'
                      % str(e))
        job_status.set_status(False)
        job_status.save()
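# Usage sketch: process_semehr is driven entirely by the configuration file
# handed to ProcessSetting, so a minimal command-line entry point could look
# like this (a sketch, assuming the config file path is the first argument):

if __name__ == '__main__':
    import sys

    if len(sys.argv) != 2:
        print 'usage: python semehr_processor.py <config_file>'
        sys.exit(1)
    process_semehr(sys.argv[1])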