def load_document_to_es(settings):
    """
    Index the text documents of a folder into Elasticsearch.

    Everything is read from the ``epr_index`` section of ``settings``:
    ``doc_folder``, ``doc2patient_tsv``, ``es_host``, ``es_index_name``,
    ``doc_type``, ``text_field`` and ``patient_id_field``.

    The TSV file maps a document file name (first column) to a patient id
    (second column); lines with fewer than two tab-separated columns are
    skipped. Only entries directly inside ``doc_folder`` that are files and
    whose name appears in that mapping are indexed. The file name is used
    both as the ``id`` field and as the Elasticsearch document id.

    :param settings: configuration object exposing ``get_attr(key_path)``
    :return: None
    """
    doc_folder = settings.get_attr(['epr_index', 'doc_folder'])
    d2p_tsv = settings.get_attr(['epr_index', 'doc2patient_tsv'])

    # These values are constant for the whole run, so read them once here
    # instead of once per indexed document.
    index_name = settings.get_attr(['epr_index', 'es_index_name'])
    doc_type = settings.get_attr(['epr_index', 'doc_type'])
    text_field = settings.get_attr(['epr_index', 'text_field'])
    patient_id_field = settings.get_attr(['epr_index', 'patient_id_field'])

    es = SemEHRES.get_instance_by_setting(
        settings.get_attr(['epr_index', 'es_host']),
        index_name,
        doc_type,
        '', '')

    # Build the document -> patient mapping; a later line for the same
    # document overrides an earlier one.
    rows = (line.split('\t') for line in utils.read_text_file(d2p_tsv))
    d2p = {row[0]: row[1] for row in rows if len(row) > 1}

    for file_name in listdir(doc_folder):
        # Cheap dictionary test first: unmapped entries never hit the disk.
        if file_name not in d2p:
            continue
        file_path = join(doc_folder, file_name)
        if not isfile(file_path):
            continue
        text = utils.read_text_file_as_string(file_path)
        es.index_new_doc(
            index=index_name,
            doc_type=doc_type,
            data={
                text_field: text,
                patient_id_field: d2p[file_name],
                "id": file_name
            },
            doc_id=file_name)
def patient_level_indexing(settings, pids):
    """
    Fan patient-level indexing out over several worker processes.

    All configuration comes from the ``patient_index`` section of
    ``settings``. An Elasticsearch wrapper is created from it, and every
    patient id in ``pids`` is handed to ``do_patient_indexing`` through
    ``utils.multi_process_tasking``.

    ``num_procs`` falls back to 10 and ``ignore_exist`` falls back to True
    when the corresponding setting is None.

    :param settings: configuration object exposing ``get_attr(key_path)``
    :param pids: patient ids, passed as ``lst`` to the multi-process runner
    :return: None
    """
    def cfg(key):
        # Shorthand for a lookup inside the 'patient_index' section.
        return settings.get_attr(['patient_index', key])

    es_instance = SemEHRES.get_instance_by_setting(
        cfg('es_host'),
        cfg('patient_index'),
        cfg('patient_doct_type'),
        cfg('es_concept_type'),
        cfg('es_patient_type'))

    looked_up = {
        name: cfg(name)
        for name in (
            'doc_level_index',
            'doc_ann_type',
            'doc_index',
            'doc_pid_field_name',
            'doc_text_field_name',
            'patient_index',
            'patient_doct_type',
            'doc_type',
            'ann_field_name',
        )
    }

    if cfg('num_procs') is None:
        worker_count = 10
    else:
        worker_count = cfg('num_procs')

    if cfg('ignore_exist') is None:
        skip_existing = True
    else:
        skip_existing = cfg('ignore_exist')

    # Positional order expected by do_patient_indexing after the item itself.
    worker_args = [es_instance]
    worker_args.extend(
        looked_up[name]
        for name in (
            'doc_level_index',
            'doc_ann_type',
            'doc_index',
            'doc_type',
            'doc_pid_field_name',
            'doc_text_field_name',
            'patient_index',
            'patient_doct_type',
            'ann_field_name',
        )
    )
    worker_args.append(skip_existing)

    utils.multi_process_tasking(
        lst=pids,
        num_procs=worker_count,
        process_func=do_patient_indexing,
        args=worker_args)
def es_get_cohort_docs(settings):
    """
    Collect the documents of every patient listed in the cohort file.

    Configuration is read from the ``cohort_docs`` section of ``settings``.
    Patient ids come from the file named by ``es_cohort_file``; for each id
    ``cohort_analysis_helper.query_collect_patient_docs`` fills a container,
    and the ``docs`` entry of its first element supplies the document ids.

    :param settings: configuration object exposing ``get_attr(key_path)``
    :return: tuple ``(docs, docs2p, pids)`` where ``docs`` is a list of
        ``{'docid': <doc id>}`` dicts, ``docs2p`` maps each doc id to its
        patient id, and ``pids`` is the list read from the cohort file
    """
    def cfg(key):
        # Shorthand for a lookup inside the 'cohort_docs' section.
        return settings.get_attr(['cohort_docs', key])

    pids = utils.read_text_file(cfg('es_cohort_file'))
    es = SemEHRES.get_instance_by_setting(
        cfg('es_host'),
        cfg('es_index'),
        cfg('es_doc_type'),
        cfg('es_concept_type'),
        cfg('es_patient_type'))
    # The key is spelled this way in the configuration this code reads.
    pid_field = cfg('patiet_id_field')

    docs = []
    docs2p = {}
    for pid in pids:
        collected = []
        cohort_analysis_helper.query_collect_patient_docs(
            {'_id': pid}, es, '*', pid_field, collected)
        if not collected:
            # Nothing was collected for this patient.
            continue
        doc_ids = collected[0]['docs']
        docs.extend([{'docid': doc_id} for doc_id in doc_ids])
        docs2p.update({doc_id: pid for doc_id in doc_ids})
    return docs, docs2p, pids
def do_semehr_doc_anns_analysis(settings):
    """
    Run document annotation analysis in the mode selected by the settings.

    Configuration is read from the ``doc_ann_analysis`` section of
    ``settings``. ``thread_num`` falls back to 10 when it is None.

    Dispatch on ``process_mode``:

    * None or ``'sql'``: the SQL statements, templates and ``dbconn_file``
      are read and passed to ``docanalysis.analyse_db_doc_anns``.
    * any other value with ``es_host`` set: an Elasticsearch wrapper is
      created and passed to ``docanalysis.process_doc_anns`` together with
      the ES-related options.
    * any other value without ``es_host``: ``docanalysis.process_doc_anns``
      is called with the folder and pattern options only.

    :param settings: configuration object exposing ``get_attr(key_path)``
    :return: None
    """
    def cfg(key):
        # Shorthand for a lookup inside the 'doc_ann_analysis' section.
        return settings.get_attr(['doc_ann_analysis', key])

    anns_dir = cfg('ann_docs_path')
    full_text_dir = cfg('full_text_folder')
    full_text_ptn = cfg('full_text_fn_ptn')
    rules_path = cfg('rule_config_path')
    out_dir = cfg('output_folder')
    study_dir = cfg('study_folder')
    merged_anns = cfg('combined_anns')
    out_index = cfg('es_output_index')
    out_doc = cfg('es_output_doc')
    out_fn_ptn = cfg('output_fn_pattern')
    workers = cfg('thread_num')
    workers = 10 if workers is None else workers
    mode = cfg('process_mode')

    if mode is None or mode == 'sql':
        # Database-backed analysis; this is also the branch taken when no
        # process mode is configured.
        ann_list_sql = cfg('ann_list_sql')
        primary_keys = cfg('primary_keys')
        ann_inst_sql = cfg('ann_inst_sql')
        full_text_sql = cfg('full_text_sql')
        update_query_template = cfg('update_query_template')
        update_status_template = cfg('update_status_template')
        dbconn_file = cfg('dbconn_file')
        docanalysis.analyse_db_doc_anns(
            ann_list_sql,
            ann_inst_sql,
            primary_keys,
            update_query_template,
            full_text_sql,
            dbconn_file,
            thread_num=workers,
            study_folder=study_dir,
            rule_config_file=rules_path,
            update_status_template=update_status_template)
        return

    # Keyword arguments shared by both process_doc_anns call variants.
    shared_kwargs = {
        'anns_folder': anns_dir,
        'full_text_folder': full_text_dir,
        'rule_config_file': rules_path,
        'output_folder': out_dir,
        'study_folder': study_dir,
        'full_text_fn_ptn': full_text_ptn,
        'fn_pattern': out_fn_ptn,
        'thread_num': workers,
    }

    if cfg('es_host') is None:
        docanalysis.process_doc_anns(**shared_kwargs)
        return

    es = SemEHRES.get_instance_by_setting(
        cfg('es_host'),
        cfg('es_index'),
        cfg('es_doc_type'),
        cfg('es_concept_type'),
        cfg('es_patient_type'))
    docanalysis.process_doc_anns(
        es_inst=es,
        es_text_field=cfg('full_text_field'),
        # NOTE(review): 'patielt_id_field' looks like a typo, but it is the
        # key this code reads, so it is kept verbatim.
        patient_id_field=cfg('patielt_id_field'),
        combined_anns=merged_anns,
        es_output_index=out_index,
        es_output_doc=out_doc,
        **shared_kwargs)