def load_document_to_es(settings):
    """
    Index plain-text documents from a folder into Elasticsearch.

    Everything is read from the ``epr_index`` section of ``settings``.
    ``doc_folder`` is listed (non-recursively) for regular files and
    ``doc2patient_tsv`` supplies a tab-separated mapping whose first column
    is a file name and whose second column is the patient id.  Only files
    that appear in the mapping are indexed; each one is passed to
    ``es.index_new_doc`` with its file name used as the document id.

    :param settings: configuration object exposing ``get_attr(path)``, where
        ``path`` is a list of nested keys
    :return: None
    """
    doc_folder = settings.get_attr(['epr_index', 'doc_folder'])
    d2p_tsv = settings.get_attr(['epr_index', 'doc2patient_tsv'])
    # Loop-invariant settings are read once here instead of once per document.
    index_name = settings.get_attr(['epr_index', 'es_index_name'])
    doc_type = settings.get_attr(['epr_index', 'doc_type'])
    text_field = settings.get_attr(['epr_index', 'text_field'])
    patient_id_field = settings.get_attr(['epr_index', 'patient_id_field'])
    es = SemEHRES.get_instance_by_setting(
        settings.get_attr(['epr_index', 'es_host']),
        index_name, doc_type, '', '')

    # file name -> patient id. Rows without a second column are ignored and a
    # later row for the same file name overrides an earlier one.
    doc_to_patient = {}
    for line in utils.read_text_file(d2p_tsv):
        columns = line.split('\t')
        if len(columns) > 1:
            doc_to_patient[columns[0]] = columns[1]

    for file_name in listdir(doc_folder):
        # Cheap dictionary lookup first, so unmapped entries cost no stat call.
        if file_name not in doc_to_patient:
            continue
        doc_path = join(doc_folder, file_name)
        if not isfile(doc_path):
            continue
        es.index_new_doc(
            index=index_name,
            doc_type=doc_type,
            data={
                text_field: utils.read_text_file_as_string(doc_path),
                patient_id_field: doc_to_patient[file_name],
                "id": file_name
            },
            doc_id=file_name)
def patient_level_indexing(settings, pids):
    """
    Dispatch ``do_patient_indexing`` over ``pids`` via
    ``utils.multi_process_tasking``.

    All configuration comes from the ``patient_index`` section of
    ``settings``.  ``num_procs`` falls back to 10 and ``ignore_exist`` falls
    back to True when the corresponding setting is None.

    :param settings: configuration object exposing ``get_attr(path)``
    :param pids: the items handed to ``utils.multi_process_tasking`` as
        ``lst``
    :return: None
    """
    def read(key):
        # Every option of this function lives under the same section.
        return settings.get_attr(['patient_index', key])

    es = SemEHRES.get_instance_by_setting(
        read('es_host'),
        read('patient_index'),
        read('patient_doct_type'),
        read('es_concept_type'),
        read('es_patient_type'))

    # Positional arguments forwarded to do_patient_indexing; the order of the
    # keys below is the order the worker receives them in, right after `es`.
    worker_args = [es]
    worker_args += [read(key) for key in (
        'doc_level_index', 'doc_ann_type', 'doc_index', 'doc_type',
        'doc_pid_field_name', 'doc_text_field_name', 'patient_index',
        'patient_doct_type', 'ann_field_name')]

    if read('num_procs') is None:
        worker_count = 10
    else:
        worker_count = read('num_procs')

    if read('ignore_exist') is None:
        skip_existing = True
    else:
        skip_existing = read('ignore_exist')
    worker_args.append(skip_existing)

    utils.multi_process_tasking(lst=pids,
                                num_procs=worker_count,
                                process_func=do_patient_indexing,
                                args=worker_args)
def es_get_cohort_docs(settings):
    """
    Collect the documents of every patient listed in the cohort file.

    Settings are read from the ``cohort_docs`` section.  For each patient id
    read from ``es_cohort_file`` the helper
    ``cohort_analysis_helper.query_collect_patient_docs`` is called with an
    empty list that it fills in place; only the first collected entry's
    ``'docs'`` value is used.

    :param settings: configuration object exposing ``get_attr(path)``
    :return: tuple ``(docs, docs2p, pids)`` -- ``docs`` is a list of
        ``{'docid': <doc id>}`` dicts, ``docs2p`` maps each doc id to its
        patient id, and ``pids`` is what was read from the cohort file
    """
    def read(key):
        return settings.get_attr(['cohort_docs', key])

    pids = utils.read_text_file(read('es_cohort_file'))
    es = SemEHRES.get_instance_by_setting(
        read('es_host'),
        read('es_index'),
        read('es_doc_type'),
        read('es_concept_type'),
        read('es_patient_type'))
    # NOTE(review): the settings key is spelled 'patiet_id_field'; it is kept
    # verbatim because existing configuration files must match it.
    pid_field = read('patiet_id_field')

    docs = []
    docs2p = {}
    for pid in pids:
        found = []
        cohort_analysis_helper.query_collect_patient_docs(
            {'_id': pid}, es, '*', pid_field, found)
        if not found:
            # Nothing collected for this patient.
            continue
        for doc_id in found[0]['docs']:
            docs.append({'docid': doc_id})
            docs2p[doc_id] = pid
    return docs, docs2p, pids
def do_semehr_doc_anns_analysis(settings):
    """
    Run the document-annotation analysis configured under
    ``doc_ann_analysis``.

    The ``process_mode`` setting selects the code path:

    * None or ``'sql'`` -- ``docanalysis.analyse_db_doc_anns`` is called with
      the SQL templates and the database connection file from the settings.
    * anything else -- ``docanalysis.process_doc_anns`` is called on the
      annotation / full-text folders.  When ``es_host`` is set, an
      Elasticsearch client and the ES-related options are passed as well.

    ``thread_num`` defaults to 10 when the setting is None.

    :param settings: configuration object exposing ``get_attr(path)``
    :return: None
    """
    def read(key):
        return settings.get_attr(['doc_ann_analysis', key])

    anns_folder = read('ann_docs_path')
    text_folder = read('full_text_folder')
    full_text_file_pattern = read('full_text_fn_ptn')
    rule_config = read('rule_config_path')
    output_folder = read('output_folder')
    study_folder = read('study_folder')
    combined_anns = read('combined_anns')
    es_output_index = read('es_output_index')
    es_output_doc = read('es_output_doc')
    output_file_pattern = read('output_fn_pattern')
    thread_num = read('thread_num')
    if thread_num is None:
        thread_num = 10
    process_mode = read('process_mode')

    # Database-backed analysis is the path taken when no mode is configured.
    if process_mode is None or process_mode == 'sql':
        docanalysis.analyse_db_doc_anns(
            read('ann_list_sql'),
            read('ann_inst_sql'),
            read('primary_keys'),
            read('update_query_template'),
            read('full_text_sql'),
            read('dbconn_file'),
            thread_num=thread_num,
            study_folder=study_folder,
            rule_config_file=rule_config,
            update_status_template=read('update_status_template'))
        return

    # Keyword arguments shared by the ES-backed and the folder-only calls.
    call_kwargs = {
        'anns_folder': anns_folder,
        'full_text_folder': text_folder,
        'rule_config_file': rule_config,
        'output_folder': output_folder,
        'study_folder': study_folder,
        'full_text_fn_ptn': full_text_file_pattern,
        'fn_pattern': output_file_pattern,
        'thread_num': thread_num,
    }
    if read('es_host') is not None:
        es = SemEHRES.get_instance_by_setting(
            read('es_host'),
            read('es_index'),
            read('es_doc_type'),
            read('es_concept_type'),
            read('es_patient_type'))
        call_kwargs['es_inst'] = es
        call_kwargs['es_text_field'] = read('full_text_field')
        # NOTE(review): the key is spelled 'patielt_id_field' -- it looks like
        # a typo of 'patient_id_field', but it is what configuration files
        # must use today; confirm before renaming.
        call_kwargs['patient_id_field'] = read('patielt_id_field')
        call_kwargs['combined_anns'] = combined_anns
        call_kwargs['es_output_index'] = es_output_index
        call_kwargs['es_output_doc'] = es_output_doc
    docanalysis.process_doc_anns(**call_kwargs)