def load_document_to_es(settings):
    """
    Load plain-text documents from a folder into an Elasticsearch index.

    The doc-to-patient TSV is read first (column 0 is the document file name,
    column 1 the patient id; lines with fewer than two tab-separated columns
    are skipped). Every regular file in the document folder whose name appears
    in that mapping is then indexed with its full text, its patient id and its
    file name as both the ``id`` field and the document id.

    :param settings: configuration object exposing ``get_attr(path)``; the
        keys read here all live under ``epr_index``: ``doc_folder``,
        ``doc2patient_tsv``, ``es_host``, ``es_index_name``, ``doc_type``,
        ``text_field`` and ``patient_id_field``
    :return: None
    """
    doc_folder = settings.get_attr(['epr_index', 'doc_folder'])
    d2p_tsv = settings.get_attr(['epr_index', 'doc2patient_tsv'])
    es_host = settings.get_attr(['epr_index', 'es_host'])
    index_name = settings.get_attr(['epr_index', 'es_index_name'])
    doc_type = settings.get_attr(['epr_index', 'doc_type'])
    # These do not change between documents, so read them once instead of
    # once per indexed file.
    text_field = settings.get_attr(['epr_index', 'text_field'])
    patient_id_field = settings.get_attr(['epr_index', 'patient_id_field'])

    es = SemEHRES.get_instance_by_setting(es_host, index_name, doc_type, '', '')

    # document file name -> patient id
    d2p = {}
    for line in utils.read_text_file(d2p_tsv):
        arr = line.split('\t')
        if len(arr) > 1:
            d2p[arr[0]] = arr[1]

    for f in listdir(doc_folder):
        # Cheap dictionary check first: no need to stat entries that would
        # not be indexed anyway.
        if f not in d2p:
            continue
        doc_path = join(doc_folder, f)
        if not isfile(doc_path):
            continue
        es.index_new_doc(
            index=index_name,
            doc_type=doc_type,
            data={
                text_field: utils.read_text_file_as_string(doc_path),
                patient_id_field: d2p[f],
                "id": f
            },
            doc_id=f)
def patient_level_indexing(settings, pids):
    """
    Dispatch patient-level indexing for the given patient ids.

    All configuration is read from the ``patient_index`` section of the
    settings. The ids are handed to ``utils.multi_process_tasking`` with
    ``do_patient_indexing`` as the per-item function.

    :param settings: configuration object exposing ``get_attr(path)``
    :param pids: the patient ids to process
    :return: None
    """
    def cfg(key):
        # every value used here lives under the 'patient_index' section
        return settings.get_attr(['patient_index', key])

    es = SemEHRES.get_instance_by_setting(
        cfg('es_host'),
        cfg('patient_index'),
        cfg('patient_doct_type'),
        cfg('es_concept_type'),
        cfg('es_patient_type'))

    # Positional arguments forwarded to do_patient_indexing; the order is
    # part of that function's contract.
    worker_args = [
        es,
        cfg('doc_level_index'),
        cfg('doc_ann_type'),
        cfg('doc_index'),
        cfg('doc_type'),
        cfg('doc_pid_field_name'),
        cfg('doc_text_field_name'),
        cfg('patient_index'),
        cfg('patient_doct_type'),
        cfg('ann_field_name'),
    ]

    # Defaults used when the settings leave these unset: 10 processes, and
    # ignore_exist switched on.
    num_procs = cfg('num_procs')
    if num_procs is None:
        num_procs = 10
    ignore_exist = cfg('ignore_exist')
    if ignore_exist is None:
        ignore_exist = True
    worker_args.append(ignore_exist)

    utils.multi_process_tasking(lst=pids,
                                num_procs=num_procs,
                                process_func=do_patient_indexing,
                                args=worker_args)
# Example no. 3
# 0
def index_ctx_concept(ann, concept_index, ctx_doc_type, es_inst):
    """
    Upsert the contextualised-concept document derived from an annotation.

    :param ann: annotation mapping; the keys ``cui``, ``negation``,
        ``experiencer``, ``temporality``, ``pref`` and ``sty`` are read
    :param concept_index: name of the index to update
    :param ctx_doc_type: document type passed to the update call
    :param es_inst: object exposing ``update_doc(index, doc_type, data, doc_id)``
    :return: None
    """
    # (field name in the stored document, key in the annotation)
    field_pairs = (
        ("cui", 'cui'),
        ("negation", 'negation'),
        ("experiencer", 'experiencer'),
        ("temporality", 'temporality'),
        ("prefLabel", 'pref'),
        ("STY", 'sty'),
    )
    concept_doc = {}
    for target_field, ann_key in field_pairs:
        concept_doc[target_field] = ann[ann_key]

    # doc_as_upsert: the document is created when the id does not exist yet
    payload = {"doc": concept_doc, "doc_as_upsert": True}
    concept_id = SemEHRES.get_ctx_concept_id(ann)
    es_inst.update_doc(index=concept_index,
                       doc_type=ctx_doc_type,
                       data=payload,
                       doc_id=concept_id)
def es_get_cohort_docs(settings):
    """
    Collect the documents of every patient listed in the cohort file.

    All configuration is read from the ``cohort_docs`` section of the
    settings.

    :param settings: configuration object exposing ``get_attr(path)``
    :return: a tuple ``(docs, docs2p, pids)`` where ``docs`` is a list of
        ``{'docid': <doc id>}`` dicts, ``docs2p`` maps each doc id to its
        patient id and ``pids`` is the list read from the cohort file
    """
    def cfg(key):
        return settings.get_attr(['cohort_docs', key])

    pids = utils.read_text_file(cfg('es_cohort_file'))
    es = SemEHRES.get_instance_by_setting(cfg('es_host'),
                                          cfg('es_index'),
                                          cfg('es_doc_type'),
                                          cfg('es_concept_type'),
                                          cfg('es_patient_type'))
    # NOTE(review): the settings key really is spelled 'patiet_id_field';
    # it must match the configuration files, so it is kept as is.
    pid_field = cfg('patiet_id_field')

    docs = []
    docs2p = {}
    for pid in pids:
        # the helper reports its results by appending to this list
        collected = []
        cohort_analysis_helper.query_collect_patient_docs(
            {'_id': pid}, es, '*', pid_field, collected)
        if not collected:
            continue
        patient_docs = collected[0]['docs']
        docs.extend({'docid': doc_id} for doc_id in patient_docs)
        docs2p.update((doc_id, pid) for doc_id in patient_docs)
    return docs, docs2p, pids
def es_populate_patient_study_table_post_ruled(study_analyzer, out_file, rule_executor,
                                               sample_size, sample_out_file, ruled_ann_out_file,
                                               es_conn_file, text_preprocessing=False,
                                               retained_patients_filter=None,
                                               filter_obj=None):
    """
    Populate the patient study table from Elasticsearch, with rule-based
    post-processing to remove unwanted mentions.

    Three files are written:

    * ``out_file``: a tab-separated table with one row per patient and one
      column per study concept that produced at least one result ('0' where
      a patient has no value for a concept);
    * ``sample_out_file``: JSON mapping each concept label to up to
      ``sample_size`` randomly chosen positive documents;
    * ``ruled_ann_out_file``: JSON list of the annotations removed by rules.

    :param study_analyzer: object exposing ``study_concepts`` and
        ``skip_terms``
    :param out_file: path of the patient/concept table
    :param rule_executor: object whose ``execute(text, start, end)`` returns
        a ``(ruled, rule)`` pair; ``ruled`` true means the mention is dropped
    :param sample_size: maximum number of positive documents kept per concept
    :param sample_out_file: path of the sampled-documents JSON
    :param ruled_ann_out_file: path of the ruled-out annotations JSON
    :param es_conn_file: setting file used to build the Elasticsearch helper
    :param text_preprocessing: when true, document text is passed through
        ``preprocessing_text_befor_rule_execution`` before rule execution
    :param retained_patients_filter: optional collection of patient ids to
        restrict to; when None all patients are fetched from Elasticsearch
    :param filter_obj: optional dict with ``doc_es_setting`` and
        ``patient_id_field`` entries; it is modified in place (an ``es``
        entry holding the created helper is added)
    :return: None
    """
    es = SemEHRES.get_instance_by_setting_file(es_conn_file)
    # Bind these unconditionally: they are used for 'ess_' concepts even when
    # no filter_obj is supplied, which previously raised UnboundLocalError.
    # NOTE(review): whether query_doc_by_search accepts None here is not
    # visible from this function - confirm against chelper.
    fes = None
    patient_id_field = None
    if filter_obj is not None:
        fes = SemEHRES.get_instance_by_setting_file(filter_obj['doc_es_setting'])
        patient_id_field = filter_obj['patient_id_field']
        filter_obj['es'] = fes
    if retained_patients_filter is None:
        pids = es.search_by_scroll("*", es.patient_type)
    else:
        pids = retained_patients_filter
    patients = [{'brcid': p} for p in pids]
    id2p = dict((p['brcid'], p) for p in patients)
    print('total patients is %s' % len(patients))

    non_empty_concepts = []
    term_to_docs = {}
    ruled_anns = []
    for sc in study_analyzer.study_concepts:
        positive_doc_anns = []
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        print('working on %s' % sc_key)
        if sc.name.startswith('ess_'):
            non_empty_concepts.append(sc_key)
            # elasticsearch concepts: resolved by a search query, the value
            # stored per patient is the number of matching documents
            p2docs = chelper.query_doc_by_search(es, fes, sc.concept_closure, patient_id_field,
                                                 retained_patients_filter=retained_patients_filter,
                                                 filter_obj=filter_obj,
                                                 doc_filter_function=patient_timewindow_filter)
            for pd in p2docs:
                id2p[pd['pid']][sc_key] = str(len(pd['docs']))
            # nothing else applies to this kind of concept
            continue

        doc_anns = []
        if len(sc.concept_closure) > 0:
            doc_anns = chelper.query_doc_anns(es, sc.concept_closure, study_analyzer.skip_terms,
                                              retained_patients_filter=retained_patients_filter,
                                              filter_obj=filter_obj,
                                              doc_filter_function=patient_timewindow_filter)
        if len(doc_anns) == 0:
            continue

        p_to_dfreq = {}
        counted_docs = set()
        for d in doc_anns:
            doc = doc_anns[d]
            p = doc['pid']
            if d in counted_docs:
                continue
            anns = doc['anns']
            if not anns:
                continue
            # The text handed to the rules is the same for every annotation
            # of a document, so prepare it once per document.
            rule_text = doc['text']
            if text_preprocessing:
                rule_text = preprocessing_text_befor_rule_execution(rule_text)
            for ann in anns:
                ruled, rule = rule_executor.execute(rule_text, int(ann['s']), int(ann['e']))
                if ruled:
                    ruled_anns.append({'p': p, 'd': d, 'ruled': rule})
                    continue
                counted_docs.add(d)
                # NOTE(review): incremented once per surviving annotation,
                # so a document with several mentions counts several times.
                p_to_dfreq[p] = p_to_dfreq.get(p, 0) + 1
                positive_doc_anns.append({'id': d,
                                          'content': doc['text'],
                                          'annotations': [{'start': ann['s'],
                                                           'end': ann['e'],
                                                           'concept': ann['inst']}]})
        if len(counted_docs) == 0:
            continue

        non_empty_concepts.append(sc_key)
        for p in p_to_dfreq:
            id2p[p][sc_key] = str(p_to_dfreq[p])

        # save sample docs
        if sample_size >= len(positive_doc_anns):
            term_to_docs[sc_key] = positive_doc_anns
        else:
            # draw without replacement by removing each pick from the pool
            sampled = []
            for _ in range(sample_size):
                index = random.randrange(len(positive_doc_anns))
                sampled.append(positive_doc_anns.pop(index))
            term_to_docs[sc_key] = sampled

    concept_labels = sorted(non_empty_concepts)
    rows = ['\t'.join(['brcid'] + concept_labels)]
    for p in patients:
        rows.append('\t'.join([p['brcid']] + [p.get(k, '0') for k in concept_labels]))
    # every row, including the last one, is newline-terminated
    utils.save_string('\n'.join(rows) + '\n', out_file)
    utils.save_json_array(convert_encoding(term_to_docs, 'cp1252', 'utf-8'), sample_out_file)
    utils.save_json_array(convert_encoding(ruled_anns, 'cp1252', 'utf-8'), ruled_ann_out_file)
    print('done')
def do_semehr_doc_anns_analysis(settings):
    """
    Run the document annotation analysis described by the
    ``doc_ann_analysis`` section of the settings.

    The ``process_mode`` setting selects the path:

    * unset or ``'sql'``: ``docanalysis.analyse_db_doc_anns`` is called with
      the SQL templates and database connection file from the settings;
    * anything else: ``docanalysis.process_doc_anns`` is called, with the
      extra Elasticsearch arguments only when ``es_host`` is configured.

    ``thread_num`` defaults to 10 when not configured.

    :param settings: configuration object exposing ``get_attr(path)``
    :return: None
    """
    def cfg(key):
        return settings.get_attr(['doc_ann_analysis', key])

    anns_folder = cfg('ann_docs_path')
    text_folder = cfg('full_text_folder')
    full_text_file_pattern = cfg('full_text_fn_ptn')
    rule_config = cfg('rule_config_path')
    output_folder = cfg('output_folder')
    study_folder = cfg('study_folder')
    combined_anns = cfg('combined_anns')
    es_output_index = cfg('es_output_index')
    es_output_doc = cfg('es_output_doc')
    output_file_pattern = cfg('output_fn_pattern')
    thread_num = cfg('thread_num')
    if thread_num is None:
        thread_num = 10
    process_mode = cfg('process_mode')

    if process_mode is None or process_mode == 'sql':
        # database-backed analysis
        ann_list_sql = cfg('ann_list_sql')
        primary_keys = cfg('primary_keys')
        ann_inst_sql = cfg('ann_inst_sql')
        full_text_sql = cfg('full_text_sql')
        update_query_template = cfg('update_query_template')
        update_status_template = cfg('update_status_template')
        dbconn_file = cfg('dbconn_file')
        docanalysis.analyse_db_doc_anns(
            ann_list_sql,
            ann_inst_sql,
            primary_keys,
            update_query_template,
            full_text_sql,
            dbconn_file,
            thread_num=thread_num,
            study_folder=study_folder,
            rule_config_file=rule_config,
            update_status_template=update_status_template)
        return

    # arguments shared by both flavours of the process_doc_anns call
    call_kwargs = dict(
        anns_folder=anns_folder,
        full_text_folder=text_folder,
        rule_config_file=rule_config,
        output_folder=output_folder,
        study_folder=study_folder,
        full_text_fn_ptn=full_text_file_pattern,
        fn_pattern=output_file_pattern,
        thread_num=thread_num)

    es_host = cfg('es_host')
    if es_host is not None:
        es = SemEHRES.get_instance_by_setting(
            es_host,
            cfg('es_index'),
            cfg('es_doc_type'),
            cfg('es_concept_type'),
            cfg('es_patient_type'))
        # NOTE(review): the key below is spelled 'patielt_id_field' - it
        # looks like a typo for 'patient_id_field', but it has to match the
        # configuration files, so confirm before renaming.
        call_kwargs.update(
            es_inst=es,
            es_text_field=cfg('full_text_field'),
            patient_id_field=cfg('patielt_id_field'),
            combined_anns=combined_anns,
            es_output_index=es_output_index,
            es_output_doc=es_output_doc)

    docanalysis.process_doc_anns(**call_kwargs)