def patient_level_indexing(settings, pids):
    """Index patient-level data for a list of patients in parallel.

    All configuration is read from the 'patient_index' section of settings.
    A SemEHRES instance is created from the ES connection settings and the
    patient ids are fanned out to do_patient_indexing worker processes.

    :param settings: configuration object exposing get_attr([section, key])
    :param pids: list of patient ids to index
    """
    def _cfg(key, default=None):
        # Read one 'patient_index' setting; evaluate get_attr only once
        # and fall back to the supplied default when the key is absent.
        v = settings.get_attr(['patient_index', key])
        return default if v is None else v

    es = SemEHRES.get_instance_by_setting(
        _cfg('es_host'),
        _cfg('patient_index'),
        _cfg('patient_doct_type'),
        _cfg('es_concept_type'),
        _cfg('es_patient_type'))
    num_procs = _cfg('num_procs', 10)
    ignore_exist = _cfg('ignore_exist', True)
    utils.multi_process_tasking(
        lst=pids, num_procs=num_procs, process_func=do_patient_indexing,
        args=[es,
              _cfg('doc_level_index'),
              _cfg('doc_ann_type'),
              _cfg('doc_index'),
              _cfg('doc_type'),
              _cfg('doc_pid_field_name'),
              _cfg('doc_text_field_name'),
              _cfg('patient_index'),
              _cfg('patient_doct_type'),
              _cfg('ann_field_name'),
              ignore_exist])
def analyse_db_doc_anns(sql, ann_sql, pks, update_template, full_text_sql,
                        dbcnn_file, rule_config_file, study_folder,
                        thread_num=10, study_config='study.json',
                        update_status_template=None):
    """
    do database based annotation post processing
    :param sql: get a list of annotation primary keys
    :param ann_sql: a query template to query ann and its doc full text
    :param pks: an array of primary key columns
    :param update_template: an update query template to update post-processed ann
    :param full_text_sql: query template used by DBTextReader for full texts
    :param dbcnn_file: database connection file
    :param rule_config_file:
    :param study_folder:
    :param thread_num:
    :param study_config:
    :param update_status_template: optional template to update processing status
    :return:
    """
    ret = load_study_ruler(study_folder, rule_config_file, study_config)
    sa = ret['sa']
    ruler = ret['ruler']
    rows = []
    db.query_data(sql, rows, db.get_db_connection_by_setting(dbcnn_file))
    reader = DBTextReader(full_text_sql, dbcnn_file)
    # one dedicated connection per worker, handed over via thread_wise_objs
    cnns = [db.get_db_connection_by_setting(dbcnn_file)
            for _ in xrange(thread_num)]
    try:
        utils.multi_process_tasking(rows, db_doc_process, num_procs=thread_num,
                                    args=[ann_sql, pks, update_template,
                                          dbcnn_file, reader, sa, ruler,
                                          update_status_template],
                                    thread_wise_objs=cnns)
    finally:
        # BUG FIX: previously the connections were only released on the happy
        # path; an exception in processing would leak all of them.
        for cnn in cnns:
            db.release_db_connection(cnn)
def search_docs(self, query):
    """Search all documents for *query* as a whole word, in parallel.

    The query is wrapped in regex word boundaries and passed to
    DocAnn.do_search_doc for each document; matches are collected in a
    process-shared list.

    :param query: term to look for (interpreted inside a regex)
    :return: list of matched documents
    """
    word_pattern = '\\b%s\\b' % query
    hits = Manager().list()
    utils.multi_process_tasking(self.get_doc_list(), DocAnn.do_search_doc,
                                args=[self, word_pattern, hits])
    return list(hits)
def db_populate_study_results(cohort_sql, doc_ann_sql_temp, doc_ann_pks,
                              dbcnn_file, study_folder, output_folder,
                              sample_sql_temp, thread_num=10,
                              study_config='study.json',
                              sampling=True, sample_size=20):
    """
    populate results for a research study
    :param cohort_sql: cohort selection query
    :param doc_ann_sql_temp: query template for getting a doc_anns item
    :param doc_ann_pks: primary key columns of doc ann table
    :param dbcnn_file: database connection config file
    :param study_folder: study folder
    :param output_folder: where to save the results
    :param sample_sql_temp: query template for getting a sample item
     (including full text and doc_anns)
    :param thread_num:
    :param study_config:
    :param sampling: whether sampling is needed
    :param sample_size: how many samples per study concept
    :return:
    """
    ret = load_study_ruler(study_folder, None, study_config)
    sa = ret['sa']
    concept_list = sorted([sc.name for sc in sa.study_concepts])
    # map each CUI in a concept's closure back to the study concept name
    cui2concept = {}
    for sc in sa.study_concepts:
        for c in sc.concept_closure:
            cui2concept[c] = sc.name
    results = []
    rows = []
    db.query_data(cohort_sql, rows,
                  db.get_db_connection_by_setting(dbcnn_file))
    logging.info('querying results (cohort size:%s)...' % len(rows))
    utils.multi_process_tasking([r['pid'] for r in rows],
                                db_populate_patient_result,
                                num_procs=thread_num,
                                args=[doc_ann_sql_temp, doc_ann_pks,
                                      dbcnn_file, concept_list, cui2concept,
                                      positive_patient_filter],
                                thread_init_func=proc_init_container,
                                thread_end_func=proc_final_collect,
                                thread_end_args=[results])
    # populate result table: one row per patient, one column per concept
    c2pks = {}
    for c in concept_list:
        c2pks[c] = []
    # BUG FIX: the header row previously lacked a trailing newline, so the
    # first patient row was glued onto the header line in result.tsv.
    s = '\t'.join(['pid'] + concept_list) + '\n'
    for r in results:
        pr = [r['p']]
        for c in concept_list:
            if r['c2f'][c]['f'] > 0:
                # remember one positive doc per concept for later sampling
                c2pks[c].append(r['c2f'][c]['docs'][0])
            # always emit the frequency so columns stay aligned with header
            pr.append(str(r['c2f'][c]['f']))
        s += '\t'.join(pr) + '\n'
    f = join(output_folder, 'result.tsv')
    utils.save_string(s, f)
    logging.info('result table saved to [%s]' % f)
    if sampling:
        logging.info('doing sampling...')
        sampled_result = {}
        for c in c2pks:
            pks = c2pks[c]
            sample_pks = []
            logging.info('doc cache size: %s' % len(pks))
            if len(pks) <= sample_size:
                sample_pks = pks
            else:
                # sample without replacement by removing each picked doc
                for i in xrange(sample_size):
                    index = random.randrange(len(pks))
                    sample_pks.append(pks[index])
                    del pks[index]
            samples = []
            utils.multi_thread_tasking(
                sample_pks, thread_num, extract_sample,
                args=[c, cui2concept, sample_sql_temp, dbcnn_file, samples])
            sampled_result[c] = samples
            logging.info('%s sampled (%s) results' % (c, len(samples)))
        f = join(output_folder, 'sampled_docs.js')
        utils.save_string('var sample_docs= %s;' % json.dumps(sampled_result),
                          f)
        logging.info('samples saved to %s' % f)
    logging.info('all results populated')
def search_anns(self, query, map_name=None):
    """Search annotations across all documents in parallel.

    Delegates the per-document work to DocAnn.do_search_anns and gathers
    matches in a process-shared list.

    :param query: annotation search term
    :param map_name: optional concept-map name forwarded to the worker
    :return: list of matched documents
    """
    hits = Manager().list()
    utils.multi_process_tasking(self.get_doc_list(), DocAnn.do_search_anns,
                                args=[self, query, map_name, hits])
    return list(hits)