def load_document_to_es(settings):
    """
    Load documents from a folder into Elasticsearch, attaching the patient id
    mapped in the doc-to-patient TSV file.
    :param settings: configuration object exposing get_attr() over the
                     'epr_index' settings
    :return: None
    """
    doc_folder = settings.get_attr(['epr_index', 'doc_folder'])
    d2p_tsv = settings.get_attr(['epr_index', 'doc2patient_tsv'])
    es = SemEHRES.get_instance_by_setting(
        settings.get_attr(['epr_index', 'es_host']),
        settings.get_attr(['epr_index', 'es_index_name']),
        settings.get_attr(['epr_index', 'doc_type']),
        '', '')
    # build the document-id -> patient-id mapping from the TSV file
    tsv_lines = utils.read_text_file(d2p_tsv)
    d2p = {}
    for l in tsv_lines:
        arr = l.split('\t')
        if len(arr) > 1:
            d2p[arr[0]] = arr[1]
    # index every file in the folder that has a patient mapping
    for f in [f for f in listdir(doc_folder) if isfile(join(doc_folder, f))]:
        if f in d2p:
            p = d2p[f]
            t = utils.read_text_file_as_string(join(doc_folder, f))
            es.index_new_doc(
                index=settings.get_attr(['epr_index', 'es_index_name']),
                doc_type=settings.get_attr(['epr_index', 'doc_type']),
                data={
                    settings.get_attr(['epr_index', 'text_field']): t,
                    settings.get_attr(['epr_index', 'patient_id_field']): p,
                    "id": f
                },
                doc_id=f)
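
# --- usage sketch (not part of the original pipeline) ---
# A minimal, hedged example of driving load_document_to_es. The _ExampleSettings
# class below is a hypothetical stand-in: the real settings object only needs to
# expose get_attr(path_list) over the 'epr_index' keys read above. All paths,
# hosts and index names are placeholders.
class _ExampleSettings(object):
    """Hypothetical settings wrapper used only for illustration."""

    def __init__(self, conf):
        self._conf = conf

    def get_attr(self, path):
        # walk the nested dict following the list of keys; return None if absent
        node = self._conf
        for key in path:
            if not isinstance(node, dict) or key not in node:
                return None
            node = node[key]
        return node


def _example_load_documents():
    epr_index_conf = {
        'epr_index': {
            'doc_folder': './docs',
            'doc2patient_tsv': './doc2patient.tsv',
            'es_host': 'http://localhost:9200',
            'es_index_name': 'epr_documents',
            'doc_type': 'doc',
            'text_field': 'text',
            'patient_id_field': 'patient_id'
        }
    }
    load_document_to_es(_ExampleSettings(epr_index_conf))
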
def patient_level_indexing(settings, pids):
    """
    Index patient-level data for the given patient ids using multiple
    processes, driven by the 'patient_index' settings.
    """
    es = SemEHRES.get_instance_by_setting(
        settings.get_attr(['patient_index', 'es_host']),
        settings.get_attr(['patient_index', 'patient_index']),
        settings.get_attr(['patient_index', 'patient_doct_type']),
        settings.get_attr(['patient_index', 'es_concept_type']),
        settings.get_attr(['patient_index', 'es_patient_type']))
    doc_level_index = settings.get_attr(['patient_index', 'doc_level_index'])
    doc_ann_type = settings.get_attr(['patient_index', 'doc_ann_type'])
    doc_index = settings.get_attr(['patient_index', 'doc_index'])
    doc_pid_field_name = settings.get_attr(
        ['patient_index', 'doc_pid_field_name'])
    doc_text_field_name = settings.get_attr(
        ['patient_index', 'doc_text_field_name'])
    patient_index = settings.get_attr(['patient_index', 'patient_index'])
    patient_doct_type = settings.get_attr(
        ['patient_index', 'patient_doct_type'])
    doc_type = settings.get_attr(['patient_index', 'doc_type'])
    ann_field_name = settings.get_attr(['patient_index', 'ann_field_name'])
    # default to 10 worker processes and to skipping already-indexed patients
    num_procs = 10 if settings.get_attr(['patient_index', 'num_procs']) is None else \
        settings.get_attr(['patient_index', 'num_procs'])
    ignore_exist = True if settings.get_attr(['patient_index', 'ignore_exist']) is None else \
        settings.get_attr(['patient_index', 'ignore_exist'])
    utils.multi_process_tasking(lst=pids, num_procs=num_procs,
                                process_func=do_patient_indexing,
                                args=[
                                    es, doc_level_index, doc_ann_type,
                                    doc_index, doc_type, doc_pid_field_name,
                                    doc_text_field_name, patient_index,
                                    patient_doct_type, ann_field_name,
                                    ignore_exist
                                ])
def index_ctx_concept(ann, concept_index, ctx_doc_type, es_inst):
    """
    Upsert a contextualised concept (CUI plus its contextual properties)
    into the concept index.
    """
    data = {
        "doc": {
            "cui": ann['cui'],
            "negation": ann['negation'],
            "experiencer": ann['experiencer'],
            "temporality": ann['temporality'],
            "prefLabel": ann['pref'],
            "STY": ann['sty']
        },
        "doc_as_upsert": True
    }
    ctx_id = SemEHRES.get_ctx_concept_id(ann)
    es_inst.update_doc(index=concept_index, doc_type=ctx_doc_type,
                       data=data, doc_id=ctx_id)
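
# --- usage sketch (not part of the original pipeline) ---
# Illustrates the annotation dictionary shape index_ctx_concept expects; the
# field values below are made up and the index/type names are placeholders,
# not the pipeline's real configuration.
def _example_index_ctx_concept(es_inst):
    ann = {
        'cui': 'C0011849',           # concept identifier
        'negation': 'Affirmed',      # contextual properties produced upstream
        'experiencer': 'Patient',
        'temporality': 'Recent',
        'pref': 'Diabetes Mellitus',
        'sty': 'Disease or Syndrome'
    }
    index_ctx_concept(ann, 'ctx_concepts', 'ctx_concept', es_inst)
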
def es_get_cohort_docs(settings):
    """
    Collect the documents of a cohort of patients from Elasticsearch using the
    'cohort_docs' settings; returns the document list, a doc-id -> patient-id
    mapping and the patient id list.
    """
    pids = utils.read_text_file(settings.get_attr(['cohort_docs', 'es_cohort_file']))
    es = SemEHRES.get_instance_by_setting(settings.get_attr(['cohort_docs', 'es_host']),
                                          settings.get_attr(['cohort_docs', 'es_index']),
                                          settings.get_attr(['cohort_docs', 'es_doc_type']),
                                          settings.get_attr(['cohort_docs', 'es_concept_type']),
                                          settings.get_attr(['cohort_docs', 'es_patient_type']))
    patiet_id_field = settings.get_attr(['cohort_docs', 'patiet_id_field'])
    docs = []
    docs2p = {}
    for pid in pids:
        container = []
        cohort_analysis_helper.query_collect_patient_docs({'_id': pid}, es, '*',
                                                          patiet_id_field, container)
        if len(container) > 0:
            docs += [{'docid': d} for d in container[0]['docs']]
            for d in container[0]['docs']:
                docs2p[d] = pid
    return docs, docs2p, pids
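
# --- usage sketch (not part of the original pipeline) ---
# A hedged example of pulling a cohort's documents; it reuses the hypothetical
# _ExampleSettings helper defined earlier. The file path, host and index names
# are placeholders; only the config keys mirror those read by es_get_cohort_docs
# (including the 'patiet_id_field' key spelled as in the code above).
def _example_get_cohort_docs():
    cohort_conf = {
        'cohort_docs': {
            'es_cohort_file': './cohort_patient_ids.txt',
            'es_host': 'http://localhost:9200',
            'es_index': 'epr_documents',
            'es_doc_type': 'doc',
            'es_concept_type': 'ctx_concept',
            'es_patient_type': 'patient',
            'patiet_id_field': 'patient_id'
        }
    }
    docs, docs2p, pids = es_get_cohort_docs(_ExampleSettings(cohort_conf))
    print '%s documents retrieved for %s patients' % (len(docs), len(pids))
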
def es_populate_patient_study_table_post_ruled(study_analyzer, out_file, rule_executor, sample_size,
                                               sample_out_file, ruled_ann_out_file, es_conn_file,
                                               text_preprocessing=False,
                                               retained_patients_filter=None,
                                               filter_obj=None):
    """
    Populate the patient-level study result table, applying post-processing
    rules to remove unwanted mentions.
    :param study_analyzer: study analyzer holding the study concepts
    :param out_file: patient-by-concept frequency table output file
    :param rule_executor: rule engine used to rule out mentions
    :param sample_size: number of positive documents to sample per concept
    :param sample_out_file: sampled document output file
    :param ruled_ann_out_file: output file for annotations removed by rules
    :param es_conn_file: Elasticsearch connection setting file
    :param text_preprocessing: whether to preprocess text before rule execution
    :param retained_patients_filter: optional list of patient ids to retain
    :param filter_obj: optional document filter settings
    :return: None
    """
    es = SemEHRES.get_instance_by_setting_file(es_conn_file)
    if filter_obj is not None:
        fes = SemEHRES.get_instance_by_setting_file(filter_obj['doc_es_setting'])
        patient_id_field = filter_obj['patient_id_field']
        filter_obj['es'] = fes
    if retained_patients_filter is None:
        pids = es.search_by_scroll("*", es.patient_type)
    else:
        pids = retained_patients_filter
    patients = [{'brcid': p} for p in pids]
    id2p = {}
    for p in patients:
        id2p[p['brcid']] = p
    print 'total number of patients: %s' % len(patients)
    non_empty_concepts = []
    study_concepts = study_analyzer.study_concepts
    term_to_docs = {}
    ruled_anns = []
    for sc in study_concepts:
        positive_doc_anns = []
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        print 'working on %s' % sc_key
        if sc.name.startswith('ess_'):
            non_empty_concepts.append(sc_key)
            # Elasticsearch concepts: count matching documents per patient
            p2docs = chelper.query_doc_by_search(es, fes, sc.concept_closure, patient_id_field,
                                                 retained_patients_filter=retained_patients_filter,
                                                 filter_obj=filter_obj,
                                                 doc_filter_function=patient_timewindow_filter)
            for pd in p2docs:
                id2p[pd['pid']][sc_key] = str(len(pd['docs']))
            # skip the annotation-based processing below
            continue
        doc_anns = []
        if len(sc.concept_closure) > 0:
            doc_anns = chelper.query_doc_anns(es, sc.concept_closure, study_analyzer.skip_terms,
                                              retained_patients_filter=retained_patients_filter,
                                              filter_obj=filter_obj,
                                              doc_filter_function=patient_timewindow_filter)
        if len(doc_anns) > 0:
            p_to_dfreq = {}
            counted_docs = set()
            for d in doc_anns:
                doc = doc_anns[d]
                p = doc['pid']
                if d in counted_docs:
                    continue
                for ann in doc['anns']:
                    ruled, rule = rule_executor.execute(
                        doc['text'] if not text_preprocessing
                        else preprocessing_text_befor_rule_execution(doc['text']),
                        int(ann['s']), int(ann['e']))
                    if not ruled:
                        counted_docs.add(d)
                        p_to_dfreq[p] = 1 if p not in p_to_dfreq else 1 + p_to_dfreq[p]
                        positive_doc_anns.append({'id': d,
                                                  'content': doc['text'],
                                                  'annotations': [{'start': ann['s'],
                                                                   'end': ann['e'],
                                                                   'concept': ann['inst']}]})
                    else:
                        ruled_anns.append({'p': p, 'd': d, 'ruled': rule})
            if len(counted_docs) > 0:
                non_empty_concepts.append(sc_key)
                for p in p_to_dfreq:
                    id2p[p][sc_key] = str(p_to_dfreq[p])
                # save sampled positive documents
                if sample_size >= len(positive_doc_anns):
                    term_to_docs[sc_key] = positive_doc_anns
                else:
                    sampled = []
                    for i in xrange(sample_size):
                        index = random.randrange(len(positive_doc_anns))
                        sampled.append(positive_doc_anns[index])
                        positive_doc_anns.pop(index)
                    term_to_docs[sc_key] = sampled
    concept_labels = sorted(non_empty_concepts)
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    for p in patients:
        s += '\t'.join([p['brcid']] + [p[k] if k in p else '0' for k in concept_labels]) + '\n'
    utils.save_string(s, out_file)
    utils.save_json_array(convert_encoding(term_to_docs, 'cp1252', 'utf-8'), sample_out_file)
    utils.save_json_array(convert_encoding(ruled_anns, 'cp1252', 'utf-8'), ruled_ann_out_file)
    print 'done'
def do_semehr_doc_anns_analysis(settings):
    """
    Analyse SemEHR document annotations using the 'doc_ann_analysis' settings,
    either from annotation files (optionally writing results to Elasticsearch)
    or from a database when process_mode is 'sql'.
    """
    anns_folder = settings.get_attr(['doc_ann_analysis', 'ann_docs_path'])
    text_folder = settings.get_attr(['doc_ann_analysis', 'full_text_folder'])
    full_text_file_pattern = settings.get_attr(
        ['doc_ann_analysis', 'full_text_fn_ptn'])
    rule_config = settings.get_attr(['doc_ann_analysis', 'rule_config_path'])
    output_folder = settings.get_attr(['doc_ann_analysis', 'output_folder'])
    study_folder = settings.get_attr(['doc_ann_analysis', 'study_folder'])
    combined_anns = settings.get_attr(['doc_ann_analysis', 'combined_anns'])
    es_output_index = settings.get_attr(
        ['doc_ann_analysis', 'es_output_index'])
    es_output_doc = settings.get_attr(['doc_ann_analysis', 'es_output_doc'])
    output_file_pattern = settings.get_attr(
        ['doc_ann_analysis', 'output_fn_pattern'])
    thread_num = settings.get_attr(['doc_ann_analysis', 'thread_num'])
    if thread_num is None:
        thread_num = 10
    process_mode = settings.get_attr(['doc_ann_analysis', 'process_mode'])
    if process_mode is not None and process_mode != 'sql':
        # file-based mode: read annotation documents from disk
        if settings.get_attr(['doc_ann_analysis', 'es_host']) is not None:
            es = SemEHRES.get_instance_by_setting(
                settings.get_attr(['doc_ann_analysis', 'es_host']),
                settings.get_attr(['doc_ann_analysis', 'es_index']),
                settings.get_attr(['doc_ann_analysis', 'es_doc_type']),
                settings.get_attr(['doc_ann_analysis', 'es_concept_type']),
                settings.get_attr(['doc_ann_analysis', 'es_patient_type']))
            docanalysis.process_doc_anns(
                anns_folder=anns_folder,
                full_text_folder=text_folder,
                rule_config_file=rule_config,
                output_folder=output_folder,
                study_folder=study_folder,
                full_text_fn_ptn=full_text_file_pattern,
                fn_pattern=output_file_pattern,
                thread_num=thread_num,
                es_inst=es,
                es_text_field=settings.get_attr(
                    ['doc_ann_analysis', 'full_text_field']),
                patient_id_field=settings.get_attr(
                    ['doc_ann_analysis', 'patielt_id_field']),
                combined_anns=combined_anns,
                es_output_index=es_output_index,
                es_output_doc=es_output_doc)
        else:
            docanalysis.process_doc_anns(
                anns_folder=anns_folder,
                full_text_folder=text_folder,
                rule_config_file=rule_config,
                output_folder=output_folder,
                study_folder=study_folder,
                full_text_fn_ptn=full_text_file_pattern,
                fn_pattern=output_file_pattern,
                thread_num=thread_num)
    else:
        # database-driven mode: read annotations and full texts via SQL
        ann_list_sql = settings.get_attr(['doc_ann_analysis', 'ann_list_sql'])
        primary_keys = settings.get_attr(['doc_ann_analysis', 'primary_keys'])
        ann_inst_sql = settings.get_attr(['doc_ann_analysis', 'ann_inst_sql'])
        full_text_sql = settings.get_attr(
            ['doc_ann_analysis', 'full_text_sql'])
        update_query_template = settings.get_attr(
            ['doc_ann_analysis', 'update_query_template'])
        update_status_template = settings.get_attr(
            ['doc_ann_analysis', 'update_status_template'])
        dbconn_file = settings.get_attr(['doc_ann_analysis', 'dbconn_file'])
        docanalysis.analyse_db_doc_anns(
            ann_list_sql, ann_inst_sql, primary_keys, update_query_template,
            full_text_sql, dbconn_file, thread_num=thread_num,
            study_folder=study_folder, rule_config_file=rule_config,
            update_status_template=update_status_template)
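
# --- configuration sketch (not part of the original pipeline) ---
# A hedged example of the 'doc_ann_analysis' settings consumed above, reusing
# the hypothetical _ExampleSettings helper defined earlier in this file. All
# paths and patterns are placeholders; only the keys mirror those read by
# do_semehr_doc_anns_analysis. With no 'es_host' set and a non-'sql'
# process_mode, this would take the plain file-based branch.
def _example_doc_anns_analysis():
    doc_ann_conf = {
        'doc_ann_analysis': {
            'ann_docs_path': './anns',
            'full_text_folder': './texts',
            'full_text_fn_ptn': '%s.txt',
            'rule_config_path': './conf/rules.json',
            'output_folder': './output',
            'study_folder': './study',
            'combined_anns': None,
            'es_output_index': None,
            'es_output_doc': None,
            'output_fn_pattern': '%s.json',
            'thread_num': 4,
            'process_mode': 'file'
        }
    }
    do_semehr_doc_anns_analysis(_ExampleSettings(doc_ann_conf))
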