def patient_level_indexing(settings, pids):
    """Index patient-level data into ElasticSearch for the given patients.

    All configuration is read from the ``patient_index`` section of
    *settings*; the per-patient work is fanned out across multiple
    processes via ``do_patient_indexing``.

    :param settings: configuration object exposing ``get_attr(path_list)``;
        ``get_attr`` returns None for missing keys (defaults applied below)
    :param pids: list of patient ids to index
    """
    def conf(key):
        # single point of access into the patient_index config section
        return settings.get_attr(['patient_index', key])

    es = SemEHRES.get_instance_by_setting(
        conf('es_host'),
        conf('patient_index'),
        conf('patient_doct_type'),
        conf('es_concept_type'),
        conf('es_patient_type'))
    doc_level_index = conf('doc_level_index')
    doc_ann_type = conf('doc_ann_type')
    doc_index = conf('doc_index')
    doc_pid_field_name = conf('doc_pid_field_name')
    doc_text_field_name = conf('doc_text_field_name')
    patient_index = conf('patient_index')
    patient_doct_type = conf('patient_doct_type')
    doc_type = conf('doc_type')
    ann_field_name = conf('ann_field_name')

    # apply defaults without evaluating the config lookup twice
    num_procs = conf('num_procs')
    if num_procs is None:
        num_procs = 10  # default degree of parallelism
    ignore_exist = conf('ignore_exist')
    if ignore_exist is None:
        ignore_exist = True  # presumably skips already-indexed patients — confirm in do_patient_indexing

    utils.multi_process_tasking(lst=pids,
                                num_procs=num_procs,
                                process_func=do_patient_indexing,
                                args=[
                                    es, doc_level_index, doc_ann_type,
                                    doc_index, doc_type, doc_pid_field_name,
                                    doc_text_field_name, patient_index,
                                    patient_doct_type, ann_field_name,
                                    ignore_exist
                                ])
Esempio n. 2
0
def analyse_db_doc_anns(sql, ann_sql, pks, update_template, full_text_sql, dbcnn_file, rule_config_file,
                        study_folder, thread_num=10, study_config='study.json', update_status_template=None):
    """
    do database based annotation post processing
    :param sql: get a list of annotation primary keys
    :param ann_sql: a query template to query ann and its doc full text
    :param pks: an array of primary key columns
    :param update_template: an update query template to update post-processed ann
    :param full_text_sql: a query template for reading a document's full text
    :param dbcnn_file: database connection file
    :param rule_config_file: rule configuration file used to load the study ruler
    :param study_folder: study folder
    :param thread_num: number of worker processes (one DB connection each)
    :param study_config: study configuration file name
    :param update_status_template: optional query template for updating processing status
    :return:
    """
    ret = load_study_ruler(study_folder, rule_config_file, study_config)
    sa = ret['sa']
    ruler = ret['ruler']
    rows = []
    db.query_data(sql, rows, db.get_db_connection_by_setting(dbcnn_file))
    reader = DBTextReader(full_text_sql, dbcnn_file)
    # one dedicated connection per worker, handed out via thread_wise_objs
    cnns = [db.get_db_connection_by_setting(dbcnn_file) for _ in xrange(thread_num)]
    try:
        utils.multi_process_tasking(rows, db_doc_process, num_procs=thread_num,
                                    args=[ann_sql, pks, update_template, dbcnn_file, reader, sa, ruler,
                                          update_status_template],
                                    thread_wise_objs=cnns)
    finally:
        # release connections even when a worker raises — the original
        # leaked all thread_num connections on any failure
        for cnn in cnns:
            db.release_db_connection(cnn)
Esempio n. 3
0
 def search_docs(self, query):
     """Search every document for *query* as a whole word.

     Wraps the query in regex word boundaries, fans the per-document
     search out across processes, and returns the matches as a list.
     """
     word_query = '\\b%s\\b' % query
     # process-shared list so workers can append matches
     hits = Manager().list()
     utils.multi_process_tasking(self.get_doc_list(),
                                 DocAnn.do_search_doc,
                                 args=[self, word_query, hits])
     return list(hits)
Esempio n. 4
0
def db_populate_study_results(cohort_sql,
                              doc_ann_sql_temp,
                              doc_ann_pks,
                              dbcnn_file,
                              study_folder,
                              output_folder,
                              sample_sql_temp,
                              thread_num=10,
                              study_config='study.json',
                              sampling=True,
                              sample_size=20):
    """
    populate results for a research study
    :param cohort_sql: cohort selection query
    :param doc_ann_sql_temp: query template for getting a doc_anns item
    :param doc_ann_pks: primary key columns of doc ann table
    :param dbcnn_file: database connection config file
    :param study_folder: study folder
    :param output_folder: where to save the results
    :param sample_sql_temp: query template for getting a sample item (including full text and doc_anns)
    :param thread_num: number of worker processes/threads
    :param study_config: study configuration file name
    :param sampling: whether sampling is needed
    :param sample_size: how many samples per study concept
    :return:
    """
    ret = load_study_ruler(study_folder, None, study_config)
    sa = ret['sa']
    concept_list = sorted([sc.name for sc in sa.study_concepts])
    # map every cui in each concept's closure back to the study concept name
    cui2concept = {}
    for sc in sa.study_concepts:
        for c in sc.concept_closure:
            cui2concept[c] = sc.name
    results = []
    rows = []
    db.query_data(cohort_sql, rows,
                  db.get_db_connection_by_setting(dbcnn_file))
    logging.info('querying results (cohort size:%s)...' % len(rows))
    utils.multi_process_tasking([r['pid'] for r in rows],
                                db_populate_patient_result,
                                num_procs=thread_num,
                                args=[
                                    doc_ann_sql_temp, doc_ann_pks, dbcnn_file,
                                    concept_list, cui2concept,
                                    positive_patient_filter
                                ],
                                thread_init_func=proc_init_container,
                                thread_end_func=proc_final_collect,
                                thread_end_args=[results])
    # build the result table: one row per patient, one column per concept
    c2pks = {c: [] for c in concept_list}
    # BUG FIX: the header previously had no trailing newline, so the first
    # patient row was appended onto the header line of result.tsv
    s = '\t'.join(['pid'] + concept_list) + '\n'
    for r in results:
        pr = [r['p']]
        for c in concept_list:
            if r['c2f'][c]['f'] > 0:
                # remember the first matching doc key for later sampling
                c2pks[c].append(r['c2f'][c]['docs'][0])
            pr.append(str(r['c2f'][c]['f']))
        s += '\t'.join(pr) + '\n'
    f = join(output_folder, 'result.tsv')
    utils.save_string(s, f)
    logging.info('result table saved to [%s]' % f)
    if sampling:
        logging.info('doing sampling...')
        sampled_result = {}
        for c in c2pks:
            pks = c2pks[c]
            logging.info('doc cache size: %s' % len(pks))
            # sample without replacement, capped at the number of docs available
            if len(pks) <= sample_size:
                sample_pks = pks
            else:
                sample_pks = random.sample(pks, sample_size)
            samples = []
            utils.multi_thread_tasking(
                sample_pks,
                thread_num,
                extract_sample,
                args=[c, cui2concept, sample_sql_temp, dbcnn_file, samples])
            sampled_result[c] = samples
            logging.info('%s sampled (%s) results' % (c, len(samples)))

        f = join(output_folder, 'sampled_docs.js')
        utils.save_string('var sample_docs= %s;' % json.dumps(sampled_result),
                          f)
        logging.info('samples saved to %s' % f)
    logging.info('all results populated')
Esempio n. 5
0
 def search_anns(self, query, map_name=None):
     """Search annotations across all documents.

     Fans the per-document annotation search out over multiple
     processes, collecting matches in a process-shared list.
     """
     shared_hits = Manager().list()
     doc_list = self.get_doc_list()
     utils.multi_process_tasking(doc_list,
                                 DocAnn.do_search_anns,
                                 args=[self, query, map_name, shared_hits])
     return list(shared_hits)