def do_semehr_index(settings, patients, doc_to_patient):
    """
    do SemEHR index
    :param settings:
    :param patients:
    :param doc_to_patient:
    :return:
    """
    es = EntityCentricES(settings.get_attr(['semehr', 'es_host']))
    es.index_name = settings.get_attr(['semehr', 'index'])
    es.concept_doc_type = settings.get_attr(['semehr', 'concept_doc_type'])
    es.entity_doc_type = settings.get_attr(['semehr', 'entity_doc_type'])
    es.doc_level_index = settings.get_attr(['semehr', 'doc_level_index'])

    f_yodie_anns = settings.get_attr(['yodie', 'output_file_path'])
    ann_files = [
        f for f in listdir(f_yodie_anns) if isfile(join(f_yodie_anns, f))
    ]

    if settings.get_attr(['job', 'semehr-concept']) == 'yes':
        logging.info('[SemEHR-step] starting semehr-concept process')
        logging.debug('working on files : %s' % ann_files)
        # index concepts
        concept_index = settings.get_attr(['semehr', 'concept_index'])
        for ann in ann_files:
            utils.multi_thread_large_file_tasking(
                join(f_yodie_anns, ann),
                10,
                do_index_100k_anns,
                args=[es, doc_to_patient, concept_index])
        logging.info('[SemEHR-step-end]concept/document level indexing done')

    if settings.get_attr(['job', 'semehr-patients']) == 'yes':
        logging.info('[SemEHR-step] indexing annotations at patient level')
        # index patients
        es_doc_url = settings.get_attr(['semehr', 'es_doc_url'])
        es_full_text = Elasticsearch([es_doc_url],
                                     serializer=JSONSerializerPython2(),
                                     verify_certs=False)
        ft_index_name = settings.get_attr(['semehr', 'full_text_index'])
        ft_doc_type = settings.get_attr(['semehr', 'full_text_doc_type'])
        ft_entity_field = settings.get_attr(
            ['semehr', 'full_text_patient_field'])
        ft_fulltext_field = settings.get_attr(
            ['semehr', 'full_text_text_field'])
        utils.multi_thread_tasking(patients,
                                   10,
                                   do_index_100k_patients,
                                   args=[
                                       es, es_full_text, ft_index_name,
                                       ft_doc_type, ft_entity_field,
                                       ft_fulltext_field
                                   ])
        logging.info('[SemEHR-step-end]patient level indexing done')
Esempio n. 2
0
def do_semehr_index(settings, patients, doc_to_patient):
    """
    do SemEHR index
    :param settings:
    :param patients:
    :param doc_to_patient:
    :return:
    """
    es = EntityCentricES(settings.get_attr(['semehr', 'es_host']))
    es.index_name = settings.get_attr(['semehr', 'index'])
    es.concept_doc_type = settings.get_attr(['semehr', 'concept_doc_type'])
    es.entity_doc_type = settings.get_attr(['semehr', 'entity_doc_type'])

    f_yodie_anns = settings.get_attr(['yodie', 'output_file_path'])
    ann_files = [
        f for f in listdir(f_yodie_anns) if isfile(join(f_yodie_anns, f))
    ]

    if settings.get_attr(['job', 'semehr-concept']) == 'yes':
        print 'working on files : %s' % ann_files
        # index concepts
        for ann in ann_files:
            utils.multi_thread_large_file_tasking(join(f_yodie_anns, ann),
                                                  10,
                                                  do_index_100k_anns,
                                                  args=[es, doc_to_patient])
    if settings.get_attr(['job', 'semehr-patients']) == 'yes':
        # index patients
        es_doc_url = settings.get_attr(['semehr', 'es_doc_url'])
        es_full_text = Elasticsearch([es_doc_url],
                                     serializer=JSONSerializerPython2())
        ft_index_name = settings.get_attr(['semehr', 'full_text_index'])
        ft_doc_type = settings.get_attr(['semehr', 'full_text_doc_type'])
        ft_entity_field = settings.get_attr(
            ['semehr', 'full_text_patient_field'])
        ft_fulltext_field = settings.get_attr(
            ['semehr', 'full_text_text_field'])
        utils.multi_thread_tasking(patients,
                                   10,
                                   do_index_100k_patients,
                                   args=[
                                       es, es_full_text, ft_index_name,
                                       ft_doc_type, ft_entity_field,
                                       ft_fulltext_field
                                   ])