def copy_docs(index_setting_file, src_index, src_doc_type, entity_id_field_name,
              dest_index, dest_doc_type, patient_list_file, thread_num=30):
    """
    copy a list of docs (ids read from patient_list_file) from one index to another
    :param index_setting_file:
    :param src_index:
    :param src_doc_type:
    :param entity_id_field_name:
    :param dest_index:
    :param dest_doc_type:
    :param patient_list_file:
    :param thread_num:
    :return:
    """
    es = EntityCentricES.get_instance(index_setting_file)
    patients = utils.read_text_file(patient_list_file)
    utils.multi_thread_tasking(patients, thread_num, do_copy_doc,
                               args=[es, src_index, src_doc_type, entity_id_field_name,
                                     dest_index, dest_doc_type])
    print 'all done'

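# Usage sketch (illustrative only): one way copy_docs might be invoked. The settings file path,
# index/type names, entity id field and patient list file below are placeholder assumptions,
# not values taken from the repository.
def example_copy_docs():
    copy_docs(index_setting_file='./index_settings/es_setting.json',  # placeholder settings file
              src_index='src_index', src_doc_type='docs',             # placeholder source index/type
              entity_id_field_name='patient_id',                      # placeholder entity id field
              dest_index='dest_index', dest_doc_type='docs',          # placeholder destination
              patient_list_file='./patient_ids.txt',                  # assumed: one id per line
              thread_num=20)
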
def run_parallel_prediction(settings):
    cm_obj = Concept2Mapping(settings['concept_mapping_file'])
    mp_inst = mp.MentionPattern(settings['pattern_folder'], cm_obj.cui2label, in_action=True)
    # NOTE: the mention-pattern instance is overridden with None here, so mention patterns
    # are not passed on to do_one_doc
    mp_inst = None
    db_pool = du.get_mysql_pooling(settings['dbconf'], num=30)
    doc_ids = []
    model_factory = ModelFactory(settings['phenotypes'], settings['model_dir'])
    du.query_data(settings['sql_docs4process'], pool=db_pool, container=doc_ids)
    # sequential alternative (kept for debugging):
    # for d in doc_ids:
    #     do_one_doc(d, model_factory, cm_obj, mp_inst, db_pool,
    #                settings['sql_text_ptn'],
    #                settings['sql_ann_ptn'],
    #                settings['save_result_sql_ptn'],
    #                settings['update_doc_sql_ptn'])
    utils.multi_thread_tasking(doc_ids, num_threads=settings['num_threads'],
                               process_func=do_one_doc,
                               args=[model_factory, cm_obj, mp_inst, db_pool,
                                     settings['sql_text_ptn'],
                                     settings['sql_ann_ptn'],
                                     settings['save_result_sql_ptn'],
                                     settings['update_doc_sql_ptn']])
    logging.info('#docs: %s all done' % len(doc_ids))

def update_mimic_doc_dates(doc_dates):
    es = EntityCentricES.get_instance('./index_settings/es_mimic_setting.json')
    container = []
    utils.multi_thread_tasking(doc_dates, 20, do_doc_update_date, args=[es, container])

def collect_result(self, output_file, graph_file_path):
    files = [f for f in listdir(self._doc_pth) if isfile(join(self._doc_pth, f))]
    f_did = []
    for f in files:
        sr = re.search(self._did_pattern, f, re.IGNORECASE)
        if sr:
            f_did.append((f, sr.group(1)))
    results = []
    logging.info('collecting results ...')
    utils.multi_thread_tasking(lst=f_did, num_threads=10,
                               process_func=DocCohort.collect_doc_anns_by_types,
                               args=[self._doc_pth, self.collect_semantic_types, results])
    logging.info('total anns collected %s' % len(results))
    ret = {'concepts': {}, 'p2c': {}}
    for r in results:
        if r['d'] in self._d2p:
            p = self._d2p[r['d']]
            if p not in ret['p2c']:
                ret['p2c'][p] = {}
            pd = ret['p2c'][p]
            if r['cui'] not in ret['concepts']:
                ret['concepts'][r['cui']] = r['pref']
            if r['cui'] not in pd:
                pd[r['cui']] = 1
            else:
                pd[r['cui']] += 1
        else:
            logging.error('doc %s not in cohort map' % r['d'])
    utils.save_json_array(ret, output_file)
    utils.save_json_array(DocCohort.result_to_graph(ret), graph_file_path)
    logging.info('result collected')

def query_doc_anns(es, concepts, skip_terms, retained_patients_filter=None,
                   filter_obj=None, doc_filter_function=None):
    patients = es.search_by_scroll(" ".join(concepts), es.patient_type,
                                   collection_func=lambda d, c: c.append(d))
    print '%s patients matched' % len(patients)
    if retained_patients_filter is not None:
        retained = []
        for po in patients:
            if po['_id'] in retained_patients_filter:
                retained.append(po)
        patients = retained
        print 'patients filtered to size %s' % len(patients)
    doc_anns = {}
    container = []
    utils.multi_thread_tasking(patients, 40, collect_patient_docs,
                               args=[es, concepts, skip_terms, container,
                                     filter_obj, doc_filter_function])
    print 'data collected, merging...'
    for d in container:
        doc_anns.update(d)
    print 'merged dic size %s' % len(doc_anns)
    return doc_anns

def predict_exp(corpus_trans_file, ann_file, cache_file, output_file):
    # initialise pattern instances from documents
    if not isfile(cache_file):
        # load labelled data
        ann_lines = utils.read_text_file(ann_file)
        prev_doc = None
        anns = []
        doc_anns = []
        ptn_insts = []
        doc_to_pt = {}
        for ls in ann_lines:
            l = ls.split('\t')
            doc_id = l[1]
            doc_to_pt[doc_id] = l[0]
            if prev_doc != doc_id:
                if prev_doc is not None:
                    if exists(join(working_folder, 'docs', '%s.txt' % prev_doc)):
                        doc_anns.append((prev_doc, anns))
                anns = []
                prev_doc = doc_id
            anns.append({'s': int(l[2]), 'e': int(l[3]),
                         'signed_label': l[4], 'gt_label': l[5]})
        if prev_doc is not None:
            if exists(join(working_folder, 'docs', '%s.txt' % prev_doc)):
                doc_anns.append((prev_doc, anns))
        # multithreading: process labelled docs
        print 'processing docs...'
        utils.multi_thread_tasking(doc_anns, 30, do_process_labelled_doc, args=[ptn_insts])
        jl.dump({'insts': ptn_insts, 'doc_to_pt': doc_to_pt}, cache_file)
    else:
        cached = jl.load(cache_file)
        ptn_insts = cached['insts']
        doc_to_pt = cached['doc_to_pt']
    cp = sp.CorpusPredictor.load_corpus_model(corpus_trans_file)
    ret = []
    for inst in ptn_insts:
        print 'predicting [%s]...' % inst.sentence
        acc = cp.predcit(inst)
        print 'accuracy: %s' % acc
        ann = inst.annotations[0]
        ret.append((doc_to_pt[inst.doc_id], inst.doc_id, str(ann['s']), str(ann['e']),
                    ann['signed_label'], ann['gt_label'], str(acc)))
    s = []
    for r in ret:
        s.append(u'\t'.join(r))
    print u'\n'.join(s)
    utils.save_json_array(ret, output_file)
    return ret

def update_doc_dates(ser_file):
    data = jl.load(ser_file)
    step = 100
    batch = []
    for i in xrange(0, len(data), step):
        batch.append(data[i:min(len(data), i + step)])
    utils.multi_thread_tasking(batch, 20, update_doc_date,
                               thread_init_func=get_mysql_conn,
                               thread_end_func=dutil.release_db_connection)

def compute_all_subconcepts(concepts, file_path):
    c_to_subs = {}
    umls = UMLSAPI(_umls_api_key)
    container = []
    utils.multi_thread_tasking(concepts, 10, do_compute_subconcept, args=[umls, container])
    for p in container:
        c_to_subs[p[0]] = p[1]
    utils.save_json_array(c_to_subs, file_path)

def do_semehr_index(settings, patients, doc_to_patient):
    """
    do SemEHR index
    :param settings:
    :param patients:
    :param doc_to_patient:
    :return:
    """
    es = EntityCentricES(settings.get_attr(['semehr', 'es_host']))
    es.index_name = settings.get_attr(['semehr', 'index'])
    es.concept_doc_type = settings.get_attr(['semehr', 'concept_doc_type'])
    es.entity_doc_type = settings.get_attr(['semehr', 'entity_doc_type'])
    es.doc_level_index = settings.get_attr(['semehr', 'doc_level_index'])
    f_yodie_anns = settings.get_attr(['yodie', 'output_file_path'])
    ann_files = [f for f in listdir(f_yodie_anns) if isfile(join(f_yodie_anns, f))]
    if settings.get_attr(['job', 'semehr-concept']) == 'yes':
        logging.info('[SemEHR-step] starting semehr-concept process')
        logging.debug('working on files : %s' % ann_files)
        # index concepts
        concept_index = settings.get_attr(['semehr', 'concept_index'])
        for ann in ann_files:
            utils.multi_thread_large_file_tasking(join(f_yodie_anns, ann), 10, do_index_100k_anns,
                                                  args=[es, doc_to_patient, concept_index])
        logging.info('[SemEHR-step-end]concept/document level indexing done')
    if settings.get_attr(['job', 'semehr-patients']) == 'yes':
        logging.info('[SemEHR-step] indexing annotations at patient level')
        # index patients
        es_doc_url = settings.get_attr(['semehr', 'es_doc_url'])
        es_full_text = Elasticsearch([es_doc_url], serializer=JSONSerializerPython2(),
                                     verify_certs=False)
        ft_index_name = settings.get_attr(['semehr', 'full_text_index'])
        ft_doc_type = settings.get_attr(['semehr', 'full_text_doc_type'])
        ft_entity_field = settings.get_attr(['semehr', 'full_text_patient_field'])
        ft_fulltext_field = settings.get_attr(['semehr', 'full_text_text_field'])
        utils.multi_thread_tasking(patients, 10, do_index_100k_patients,
                                   args=[es, es_full_text, ft_index_name, ft_doc_type,
                                         ft_entity_field, ft_fulltext_field])
        logging.info('[SemEHR-step-end]patient level indexing done')

def index_cris_patients():
    f_patient_doc = './hepc_pos_doc_brcid.txt'
    lines = utils.read_text_file(f_patient_doc, encoding='utf-8-sig')
    patients = []
    for l in lines:
        arr = l.split('\t')
        if arr[0] not in patients:
            patients.append(arr[0])
    print 'total patients %s %s' % (len(patients), patients[0])
    es = EntityCentricES.get_instance('./index_settings/es_cris_setting.json')
    utils.multi_thread_tasking(patients, 10, do_index_patient, args=[es])
    print 'done'

def compute_all_concept_closure(all_concepts, umls_instance):
    concept_to_closure = {}
    print 'all concepts number %s' % len(all_concepts)
    computed = []
    results = []
    utils.multi_thread_tasking(all_concepts, 40, StudyConcept.do_compute_concept_closure,
                               args=[umls_instance, computed, results])
    for r in results:
        concept_to_closure[r['concept']] = r['closure']
    return concept_to_closure

def working_on_docs(index_setting_file, job_file, src_index, src_doc_type,
                    dest_index, dest_doc_type, num_threads=20):
    job_status = JobStatus(job_file)
    docs = get_docs_for_processing(job_status)
    print 'copy docs: [%s]' % docs
    es = ees.EntityCentricES.get_instance(index_setting_file)
    try:
        utils.multi_thread_tasking(docs, num_threads, do_copy_doc,
                                   args=[es, src_index, src_doc_type, dest_index, dest_doc_type])
        job_status.set_status(True)
    except:
        job_status.set_status(JobStatus.STATUS_FAILURE)
    job_status.save()

def process_labelled_docs(labelled_file, corpus_model_file, mini_comp_file):
    corpus_analyzer = None
    if not isfile(corpus_model_file):
        # load labelled data
        ann_lines = utils.read_text_file(labelled_file)
        prev_doc = None
        anns = []
        doc_anns = []
        ptn_insts = []
        for ls in ann_lines:
            l = ls.split('\t')
            doc_id = l[0]
            if prev_doc != doc_id:
                if prev_doc is not None:
                    if exists(join(working_folder, 'docs', '%s.txt' % prev_doc)):
                        doc_anns.append((prev_doc, anns))
                anns = []
                prev_doc = doc_id
            anns.append({'s': int(l[1]), 'e': int(l[2]),
                         'signed_label': l[3], 'gt_label': l[4]})
        if prev_doc is not None:
            if exists(join(working_folder, 'docs', '%s.txt' % prev_doc)):
                doc_anns.append((prev_doc, anns))
        # multithreading: process labelled docs
        print 'processing docs...'
        utils.multi_thread_tasking(doc_anns, 30, do_process_labelled_doc, args=[ptn_insts])
        print 'merging patterns..'
        corpus_analyzer = sp.CorpusAnalyzer()
        for pi in ptn_insts:
            corpus_analyzer.add_pattern(pi)
        corpus_analyzer.serialise(corpus_model_file)
    else:
        corpus_analyzer = sp.CorpusAnalyzer.load_seralisation(corpus_model_file)
    # corpus_analyzer.show()
    # pt_insts = corpus_analyzer.pattern_to_insts
    if isfile(mini_comp_file):
        corpus_analyzer.load_mini_comp_dict(mini_comp_file)
    else:
        corpus_analyzer.produce_save_comp_dict(mini_comp_file)
    corpus_analyzer.show_mini_comp_patterns()
    # generate_corpus_model(corpus_analyzer)
    return corpus_analyzer

def index_pubmed():
    es = EntityCentricES.get_instance('./index_settings/es_setting.json')
    doc_details = utils.load_json_data('./index_settings/pmc_docs.json')
    pmcid_to_journal = {}
    for d in doc_details:
        if 'pmcid' in d and 'journalTitle' in d:
            pmcid_to_journal[d['pmcid']] = d['journalTitle']
    # load anns
    # utils.multi_thread_large_file_tasking('./index_settings/test_anns.json', 10, do_index_pubmed,
    #                                       args=[es, pmcid_to_journal, './index_settings/fulltext'])
    utils.multi_thread_tasking(doc_details, 10, do_index_pubmed_docs,
                               args=[es, './index_settings/fulltext'])
    print 'done'

def get_concepts_names(umls, concepts):
    batch_size = 200
    batches = []
    for k in range(0, int(math.ceil(len(concepts) * 1.0 / batch_size))):
        batches.append(concepts[batch_size * k: batch_size * (k + 1)])
        print 'batch %s, len %s' % (k, len(batches[-1]))
    container = []
    utils.multi_thread_tasking(batches, 20, do_get_concepts_names, args=[umls, container])
    print len(container)
    c2label = {}
    for r in container:
        c2label[r[0]] = r[1]
    return c2label

def do_semehr_index(settings, patients, doc_to_patient):
    """
    do SemEHR index
    :param settings:
    :param patients:
    :param doc_to_patient:
    :return:
    """
    es = EntityCentricES(settings.get_attr(['semehr', 'es_host']))
    es.index_name = settings.get_attr(['semehr', 'index'])
    es.concept_doc_type = settings.get_attr(['semehr', 'concept_doc_type'])
    es.entity_doc_type = settings.get_attr(['semehr', 'entity_doc_type'])
    f_yodie_anns = settings.get_attr(['yodie', 'output_file_path'])
    ann_files = [f for f in listdir(f_yodie_anns) if isfile(join(f_yodie_anns, f))]
    if settings.get_attr(['job', 'semehr-concept']) == 'yes':
        print 'working on files : %s' % ann_files
        # index concepts
        for ann in ann_files:
            utils.multi_thread_large_file_tasking(join(f_yodie_anns, ann), 10, do_index_100k_anns,
                                                  args=[es, doc_to_patient])
    if settings.get_attr(['job', 'semehr-patients']) == 'yes':
        # index patients
        es_doc_url = settings.get_attr(['semehr', 'es_doc_url'])
        es_full_text = Elasticsearch([es_doc_url], serializer=JSONSerializerPython2())
        ft_index_name = settings.get_attr(['semehr', 'full_text_index'])
        ft_doc_type = settings.get_attr(['semehr', 'full_text_doc_type'])
        ft_entity_field = settings.get_attr(['semehr', 'full_text_patient_field'])
        ft_fulltext_field = settings.get_attr(['semehr', 'full_text_text_field'])
        utils.multi_thread_tasking(patients, 10, do_index_100k_patients,
                                   args=[es, es_full_text, ft_index_name, ft_doc_type,
                                         ft_entity_field, ft_fulltext_field])

def complete_samples(sample_file, complete_sql, db_conn_file, out_file):
    ann_prefix = 'var sample_docs='
    anns_str = utils.read_text_file_as_string(sample_file)
    if anns_str.startswith(ann_prefix):
        anns_str = anns_str[len(ann_prefix):]
    anns = json.loads(anns_str)
    # anns = utils.load_json_data(sample_file)
    key_anns = []
    for k in anns:
        key_anns.append((k, anns[k]))
    container = []
    utils.multi_thread_tasking(key_anns, 40, complete_sample_ann_data,
                               args=[complete_sql, db_conn_file, container])
    results = {}
    for r in container:
        results[r[0]] = r[1]
    utils.save_string(ann_prefix + json.dumps(results), out_file)
    print 'done'

def index_100k(index_setting_file, patient_index_only=None):
    es = EntityCentricES.get_instance(index_setting_file)
    f_patient_doc = es.customise_settings['patient_doc_mapping_file']
    f_yodie_anns = es.customise_settings['yodie_output_folder']
    es_epr_full_text = es.customise_settings['es_ft']
    ft_index_name = es.customise_settings['ft_index_name']
    ft_doc_type = es.customise_settings['ft_doc_type']
    ft_entity_field = es.customise_settings['ft_entity_field']
    ft_fulltext_field = es.customise_settings['ft_fulltext_field']
    lines = utils.read_text_file(f_patient_doc)
    doc_to_patient = {}
    patients = set()
    for l in lines:
        arr = l.split('\t')
        doc_to_patient[arr[1]] = arr[0]
        patients.add(arr[0])
    patients = list(patients)
    # epr full text index api
    es_full_text = Elasticsearch([es_epr_full_text], serializer=JSONSerializerPython2())
    # es_full_text.get()
    if patient_index_only is None:
        ann_files = [f for f in listdir(f_yodie_anns) if isfile(join(f_yodie_anns, f))]
        for ann in ann_files:
            utils.multi_thread_large_file_tasking(join(f_yodie_anns, ann), 10, do_index_100k_anns,
                                                  args=[es, doc_to_patient])
        print 'anns done, indexing patients...'
    else:
        print 'skipping concept indexing'
    utils.multi_thread_tasking(patients, 10, do_index_100k_patients,
                               args=[es, es_full_text, ft_index_name, ft_doc_type,
                                     ft_entity_field, ft_fulltext_field])
    print 'all done'

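# Usage sketch (illustrative only) for index_100k. The function reads everything else from the
# customise_settings of the index setting file (patient_doc_mapping_file, yodie_output_folder,
# es_ft, ft_index_name, ft_doc_type, ft_entity_field, ft_fulltext_field); the file path below is
# a placeholder assumption.
def example_index_100k():
    # full run: index annotation files first, then patients
    index_100k('./index_settings/es_100k_setting.json')
    # patient-level indexing only: any non-None value skips the concept-indexing branch
    index_100k('./index_settings/es_100k_setting.json', patient_index_only='yes')
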
def query_doc_by_search(es, doc_es, es_search, patiet_id_field, retained_patients_filter=None,
                        filter_obj=None, doc_filter_function=None):
    """
    get number of mentions by elasticsearch queries instead of NLP results
    :param es:
    :param doc_es:
    :param es_search:
    :param patiet_id_field:
    :param retained_patients_filter:
    :param filter_obj:
    :param doc_filter_function:
    :return:
    """
    patients = es.search_by_scroll(" ".join(es_search), es.patient_type,
                                   collection_func=lambda d, c: c.append(d))
    print '%s patients matched' % len(patients)
    if retained_patients_filter is not None:
        retained = []
        for po in patients:
            if po['_id'] in retained_patients_filter:
                retained.append(po)
        patients = retained
        print 'patients filtered to size %s' % len(patients)
    container = []
    utils.multi_thread_tasking(patients, 40, query_collect_patient_docs,
                               args=[doc_es, es_search, patiet_id_field, container,
                                     filter_obj, doc_filter_function])
    return container

def mimic_anonymisation(single_file, rule_file):
    doc = utils.read_text_file_as_string(single_file)
    arr = re.split(r'START\_OF\_RECORD=\d+\|\|\|\|\d+\|\|\|\|\r{0,1}\n', doc)
    i = 0
    texts = []
    for t in arr:
        texts.append(t.replace('||||END_OF_RECORD\n', ''))
    anonymis_inst = AnonymiseRule(rule_file)
    failed_docs = []
    sent_data = []
    utils.multi_thread_tasking(texts, 1, wrap_anonymise_doc,
                               args=[failed_docs, anonymis_inst, sent_data])
    t2sent = {}
    for s in sent_data:
        if s['type'] not in t2sent:
            t2sent[s['type']] = []
        t2sent[s['type']].append(s['sent'])
    for t in t2sent:
        t2sent[t] = list(set(t2sent[t]))
        print('%s\n======\n%s\n\n' % (t, '\n'.join(t2sent[t])))

def generate_result_in_one_iteration(cohort_name, study_analyzer, out_file, sample_size, sample_out_file,
                                     doc_to_brc_sql, brc_sql, anns_iter_sql, skip_term_sql,
                                     doc_content_sql, db_conn_file):
    """
    generate result in one iteration over all annotations. this is supposed to be much faster
    when working on large study concepts, but post-processing using rules is not supported yet.
    :param cohort_name:
    :param study_analyzer:
    :param out_file:
    :param sample_size:
    :param sample_out_file:
    :param doc_to_brc_sql:
    :param brc_sql:
    :param anns_iter_sql:
    :param skip_term_sql:
    :param doc_content_sql:
    :param db_conn_file:
    :return:
    """
    # populate concept to anns maps
    sc2anns = {}
    for sc in study_analyzer.study_concepts:
        sc2anns[sc.name] = []

    # populate patient list
    print 'populating patient list...'
    patients = {}
    rows_container = []
    dutil.query_data(brc_sql.format(cohort_name), rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    for r in rows_container:
        patients[r['brcid']] = {'brcid': r['brcid']}

    # populate document id to patient id dictionary
    print 'populating doc to patient map...'
    rows_container = []
    dutil.query_data(doc_to_brc_sql.format(cohort_name), rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    doc2brc = {}
    for dp in rows_container:
        doc2brc[dp['doc_id']] = dp['brcid']

    # query annotations
    print 'iterating annotations...'
    rows_container = []
    # the skip-term constraint is always applied; study-specific extra constraints are appended
    # when they are configured
    extra_constrains = [generate_skip_term_constrain(study_analyzer, skip_term_sql)] + \
        ([] if (study_analyzer.study_options is None or
                study_analyzer.study_options['extra_constrains'] is None)
         else study_analyzer.study_options['extra_constrains'])
    dutil.query_data(anns_iter_sql.format(**{'cohort_id': cohort_name,
                                             'extra_constrains': ' \n '.join(extra_constrains)}),
                     rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    for r in rows_container:
        concept_id = r['inst_uri']
        brcid = doc2brc[r['doc_id']] if r['doc_id'] in doc2brc else None
        if brcid is None:
            print 'doc %s not matched to a patient!!!' % r['doc_id']
            continue
        patient = patients[brcid] if brcid in patients else None
        if patient is None:
            print 'brc id %s not matched a patient!!!' % brcid
            continue
        # get matched study concepts
        for sc in study_analyzer.study_concepts:
            if concept_id in sc.concept_closure:
                patient[sc.name] = (patient[sc.name] + 1) if sc.name in patient else 1
                sc2anns[sc.name].append({'ann_id': r['ann_id'], 'doc_id': r['doc_id'],
                                         'concept_id': concept_id,
                                         'start': r['start_offset'], 'end': r['end_offset']})

    # generate result table
    print 'generate result table...'
    concept_labels = sorted([k for k in sc2anns])
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    lines = []
    utils.multi_thread_tasking([patients[pid] for pid in patients], 40, do_put_line,
                               args=[concept_labels, lines])
    s += '\n'.join(lines)
    utils.save_string(s, out_file)

    # generate sample annotations
    term_to_docs = {}
    for concept in sc2anns:
        ann_ids = sc2anns[concept]
        sample_ids = []
        if len(ann_ids) <= sample_size:
            sample_ids = ann_ids
        else:
            for i in xrange(sample_size):
                index = random.randrange(len(ann_ids))
                sample_ids.append(ann_ids[index])
                del ann_ids[index]
        term_to_docs[concept] = sample_ids

    # query doc contents
    print 'populating term to sampled anns...'
    term_to_sampled = {}
    for term in term_to_docs:
        sample_ids = term_to_docs[term]
        if len(sample_ids) <= 0:
            continue
        sample_doc_ids = ['\'' + s['doc_id'] + '\'' for s in sample_ids]
        rows_container = []
        dutil.query_data(doc_content_sql.format(','.join(sample_doc_ids)), rows_container,
                         dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        doc_to_content = {}
        for r in rows_container:
            doc_to_content[r['doc_id']] = r['TextContent']
        term_sampled = []
        for s in sample_ids:
            term_sampled.append({'id': s['doc_id'],
                                 'content': doc_to_content[s['doc_id']],
                                 'annotations': [{'start': s['start'], 'end': s['end'],
                                                  'concept': s['concept_id']}]})
        term_to_sampled[term] = term_sampled
    utils.save_json_array(convert_encoding(term_to_sampled, 'cp1252', 'utf-8'), sample_out_file)

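# Usage sketch (illustrative only) for generate_result_in_one_iteration: hypothetical SQL templates
# showing the placeholders and result columns the function relies on. The placeholders ({},
# {cohort_id}, {extra_constrains}) come from the .format calls above, and the column names (brcid,
# doc_id, inst_uri, ann_id, start_offset, end_offset, TextContent) from the row accesses; the table
# names are invented for illustration. The shape of skip_term_sql depends on
# generate_skip_term_constrain and is not sketched here.
_example_brc_sql = "select brcid from cohort_patients where cohort_id = '{}'"
_example_doc_to_brc_sql = "select doc_id, brcid from cohort_documents where cohort_id = '{}'"
_example_anns_iter_sql = ("select ann_id, doc_id, inst_uri, start_offset, end_offset "
                          "from semehr_annotations where cohort_id = '{cohort_id}' {extra_constrains}")
_example_doc_content_sql = "select doc_id, TextContent from documents where doc_id in ({})"
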
def dump_doc_as_files(folder):
    utils.multi_thread_tasking(load_all_docs(), 10, do_save_file, args=[folder])

def process_semehr(config_file):
    """
    a pipeline to process all SemEHR related processes:
    0. ES doc copy from one index to another;
    1. bio-yodie NLP pipeline annotation on docs;
    2. entity centric SemEHR ES indexing
    :param config_file:
    :return:
    """
    # read the configuration
    ps = ProcessSetting(config_file)

    # setting log configuration
    log_level = 'INFO' if ps.get_attr(['logging', 'level']) is None \
        else ps.get_attr(['logging', 'level'])
    log_format = '%(name)s %(asctime)s %(levelname)s %(message)s' if ps.get_attr(['logging', 'format']) is None \
        else ps.get_attr(['logging', 'format'])
    log_file = None if ps.get_attr(['logging', 'file']) is None else ps.get_attr(['logging', 'file'])
    logging.basicConfig(level=log_level, format=log_format)
    if log_file is not None:
        formatter = logging.Formatter(log_format)
        file_handler = logging.FileHandler(log_file)
        file_handler.setLevel(log_level)
        file_handler.setFormatter(formatter)
        logging.getLogger().addHandler(file_handler)
        logging.info('logging to %s' % log_file)

    # initialise the jobstatus class instance
    job_file = join(ps.get_attr(['job', 'job_status_file_path']),
                    'semehr_job_status_%s.json' % ps.get_attr(['job', 'job_id']))
    logging.info('[SemEHR-step] using job status file %s' % job_file)
    job_status = JobStatus(job_file)
    job_status.job_start()

    # preload: load documents to es
    if ps.get_attr(['job', 'epr_index']) == 'yes':
        logging.info('[SemEHR-step]load documents to elasticsearch...')
        load_document_to_es(settings=ps)
        logging.info('[SemEHR-step-end] epr_index step done')

    data_rows = []
    doc2pid = {}
    pids = []
    if ps.get_attr(['job', 'load_docs']) == 'yes':
        sql_template = ps.get_attr(['new_docs', 'sql_query'])
        logging.info('[SemEHR-step] retrieving docs by using the template [%s]' % sql_template)
        data_rows = get_docs_for_processing(job_status, sql_template,
                                            ps.get_attr(['new_docs', 'dbconn_setting_file']))
        logging.info('total docs num is %s' % len(data_rows))
    elif ps.get_attr(['job', 'cohort_docs']) == 'yes':
        logging.info('[SemEHR-step] retrieving docs by cohort [%s]'
                     % ps.get_attr(['cohort_docs', 'es_cohort_file']))
        data_rows, doc2pid, pids = es_get_cohort_docs(ps)
        logging.info('total docs num is %s' % len(data_rows))

    try:
        # if True:
        # 0. copy docs
        if ps.get_attr(['job', 'copy_docs']) == 'yes':
            logging.info('[SemEHR-step] copy docs')
            docs = [str(r['docid']) for r in data_rows]
            utils.multi_thread_tasking(
                docs, ps.get_attr(['doc_copy', 'thread_num']), do_copy_doc,
                args=[EntityCentricES(ps.get_attr(['doc_copy', 'es_host'])),
                      ps.get_attr(['doc_copy', 'src_index']),
                      ps.get_attr(['doc_copy', 'src_doc_type']),
                      ps.get_attr(['doc_copy', 'dest_index']),
                      ps.get_attr(['doc_copy', 'dest_doc_type'])])
            logging.info('[SemEHR-step-end]copying docs done')

        if ps.get_attr(['job', 'yodie']) == 'yes':
            docid_path = '%s/%s_docids.txt' % (ps.get_attr(['yodie', 'input_doc_file_path']),
                                               ps.get_attr(['job', 'job_id']))
            logging.info('[SemEHR-step] doing yodie')
            # 1. do bio-yodie pipeline
            # 1.1 prepare the configuration file
            num_docs = produce_yodie_config(ps, data_rows, docid_path)
            if num_docs == 0:
                logging.info('[SemEHR-step-end] nothing to process, NLP step done')
            else:
                logging.info('total number of docs %s' % num_docs)
                # 1.2 set the env variables
                set_sys_env(ps)
                # 1.3 clear ann output folder
                logging.info('clearing %s ...' % ps.get_attr(['yodie', 'output_file_path']))
                clear_folder(ps.get_attr(['yodie', 'output_file_path']))
                # 1.3 run bio-yodie
                os.chdir(ps.get_attr(['yodie', 'gcp_run_path']))
                if ps.get_attr(['yodie', 'os']) == 'win':
                    cmd = ' '.join([
                        'java',
                        "-Dgate.home=%s" % ps.get_attr(['env', 'gate_home']),
                        "-Dgcp.home=%s" % ps.get_attr(['env', 'gcp_home']),
                        "-Djava.protocol.handler.pkgs=gate.cloud.util.protocols",
                        "-cp .;{SCRIPTDIR}/conf;{SCRIPTDIR}/gcp.jar;{SCRIPTDIR}/lib/*;"
                        "{GATE_HOME}/bin/gate.jar;{GATE_HOME}/lib/*".format(
                            **{"SCRIPTDIR": ps.get_attr(['env', 'gcp_home']),
                               "GATE_HOME": ps.get_attr(['env', 'gate_home'])}),
                        '-Dat.ofai.gate.modularpipelines.configFile="%s/bio-yodie-1-2-1/main-bio/main-bio.config.yaml" '
                        % ps.get_attr(['env', 'yodie_path']),
                        "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
                        "gate.cloud.batch.BatchRunner",
                        "-t %s" % ps.get_attr(['yodie', 'thread_num']),
                        "-b %s" % ps.get_attr(['yodie', 'config_xml_path'])
                    ])
                else:
                    cmd = ' '.join([
                        'gcp-direct.sh',
                        "-t %s" % ps.get_attr(['yodie', 'thread_num']),
                        "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
                        "-b %s" % ps.get_attr(['yodie', 'config_xml_path']),
                        '-Dat.ofai.gate.modularpipelines.configFile="%s/bio-yodie-1-2-1/main-bio/main-bio.config.yaml" '
                        % ps.get_attr(['env', 'yodie_path']),
                    ])
                logging.debug('executing the following command to start NLP...')
                logging.info(cmd)
                p = Popen(cmd, shell=True, stderr=STDOUT)
                p.wait()
                if 0 != p.returncode:
                    job_status.set_status(False)
                    job_status.save()
                    logging.error('ERROR doing the NLP, stopped with a code [%s]' % p.returncode)
                    exit(p.returncode)
                else:
                    logging.info('[SemEHR-step-end] NLP step done')
            if 'semehr_path' in os.environ:
                logging.info('changing back to semehr_path: %s' % os.environ['semehr_path'])
                os.chdir(os.environ['semehr_path'])

        # 2. do SemEHR concept/entity indexing
        if ps.get_attr(['job', 'semehr-concept']) == 'yes' or ps.get_attr(['job', 'semehr-patients']) == 'yes':
            patients = []
            doc_to_patient = {}
            for r in data_rows:
                patients.append(str(r['patientid']))
                doc_to_patient[str(r['docid'])] = str(r['patientid'])
            patients = list(set(patients))
            do_semehr_index(ps, patients, doc_to_patient)

        # 3. do SemEHR actionable transparency
        if ps.get_attr(['job', 'action_trans']) == 'yes':
            logging.info('[SemEHR-step]doing transparency...')
            actionable_transparise(settings=ps)

        # 4. do SemEHR document annotation analysis (post processing)
        if ps.get_attr(['job', 'doc_analysis']) == 'yes':
            logging.info('[SemEHR-step]doing SemEHR annotation analysis...')
            do_semehr_doc_anns_analysis(settings=ps)
            logging.info('[SemEHR-step-end] doc_analysis step done')

        # 4.5 do SemEHR patient level index
        if ps.get_attr(['job', 'patient_index']) == 'yes':
            logging.info('[SemEHR-step]doing patient level indexing...')
            patient_level_indexing(settings=ps, pids=pids)
            logging.info('[SemEHR-step-end] patient level indexing done')

        # 5. do populate results for a research study
        if ps.get_attr(['job', 'populate_cohort_result']) == 'yes':
            logging.info('[SemEHR-step]doing SemEHR cohort result extraction...')
            populate_cohort_results(settings=ps)
            logging.info('[SemEHR-step-end] populate_cohort_result step done')

        # 6. do collect cohort doc based results for a research study
        if ps.get_attr(['job', 'cohort_doc_collection']) == 'yes':
            logging.info('[SemEHR-step]doing SemEHR cohort doc based collection...')
            collect_cohort_doc_results(settings=ps, doc2pid=doc2pid)
            logging.info('[SemEHR-step-end] collect_cohort_doc_results step done')

        job_status.set_status(True)
        job_status.save()
        logging.info('[SemEHR-process-end] all done')
    except Exception as e:
        logging.error('[SemEHR-process-ERROR] Failed to do SemEHR process %s' % str(e))
        job_status.set_status(False)
        job_status.save()

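# Configuration sketch (illustrative only) for process_semehr. The nested keys mirror the
# ps.get_attr calls above; the assumption that ProcessSetting parses a JSON file of nested objects,
# and every value shown, is for illustration only. Keys that are absent are read as None, so the
# corresponding steps are simply skipped.
_example_semehr_config = {
    'job': {
        'job_id': 'job_001',                    # used in the job status file name and doc id list
        'job_status_file_path': '/tmp/semehr',  # where semehr_job_status_<job_id>.json is kept
        'load_docs': 'yes',                     # pull docs via new_docs.sql_query
        'copy_docs': 'no',
        'yodie': 'no',
        'semehr-concept': 'no',
        'semehr-patients': 'no'
    },
    'logging': {'level': 'INFO', 'file': '/tmp/semehr/semehr.log'},
    'new_docs': {
        # the query is assumed to return docid and patientid columns (see data_rows usage above)
        'sql_query': 'select docid, patientid from new_docs',
        'dbconn_setting_file': './dbconn.json'
    }
}
# a file with this structure would then be passed in, e.g. process_semehr('./semehr_config.json')
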
def save_docs_to_db(self, docs):
    utils.multi_thread_tasking(docs, process_func=CohortHelper.do_save_doc_to_db,
                               num_threads=10 if 'threads' not in self._conf else self._conf['threads'],
                               args=[self._conf['sql_template'], self._conf['db_conf_file']])

def index_patients(patients, es):
    print 'indexing %s patients...' % len(patients)
    utils.multi_thread_tasking(patients, 10, do_index_patient, args=[es])
    print 'patient indexing done'

def process_semehr(config_file):
    """
    a pipeline to process all SemEHR related processes:
    0. ES doc copy from one index to another;
    1. bio-yodie NLP pipeline annotation on docs;
    2. entity centric SemEHR ES indexing
    :param config_file:
    :return:
    """
    # read the configuration
    ps = ProcessSetting(config_file)

    # initialise the jobstatus class instance
    job_file = join(ps.get_attr(['job', 'job_status_file_path']),
                    'semehr_job_status_%s.json' % ps.get_attr(['job', 'job_id']))
    print 'using job status file %s' % job_file
    job_status = JobStatus(job_file)
    job_status.job_start()

    data_rows = []
    if ps.get_attr(['job', 'load_docs']) == 'yes':
        sql_template = ps.get_attr(['new_docs', 'sql_query'])
        print 'retrieving docs by using the template [%s]' % sql_template
        data_rows = get_docs_for_processing(job_status, sql_template,
                                            ps.get_attr(['new_docs', 'dbconn_setting_file']))
        print 'total docs num is %s' % len(data_rows)

    # try:
    if True:
        # 0. copy docs
        if ps.get_attr(['job', 'copy_docs']) == 'yes':
            docs = [str(r['docid']) for r in data_rows]
            utils.multi_thread_tasking(
                docs, ps.get_attr(['doc_copy', 'thread_num']), do_copy_doc,
                args=[EntityCentricES(ps.get_attr(['doc_copy', 'es_host'])),
                      ps.get_attr(['doc_copy', 'src_index']),
                      ps.get_attr(['doc_copy', 'src_doc_type']),
                      ps.get_attr(['doc_copy', 'dest_index']),
                      ps.get_attr(['doc_copy', 'dest_doc_type'])])

        if ps.get_attr(['job', 'yodie']) == 'yes':
            docid_path = '%s/%s_docids.txt' % (ps.get_attr(['yodie', 'input_doc_file_path']),
                                               ps.get_attr(['job', 'job_id']))
            print 'working on yodie with %s documents, saved to %s...' % (str(len(data_rows)), docid_path)
            # save doc ids to text file for input to bioyodie
            print 'saving doc ids to [%s]' % docid_path
            utils.save_string('\n'.join([str(r['docid']) for r in data_rows]), docid_path)
            # 1. do bio-yodie pipeline
            # 1.1 prepare the configuration file
            produce_yodie_config(ps)
            # 1.2 set the env variables
            set_sys_env(ps)
            # 1.3 clear ann output folder
            print 'clearing %s ...' % ps.get_attr(['yodie', 'output_file_path'])
            clear_folder(ps.get_attr(['yodie', 'output_file_path']))
            # 1.3 run bio-yodie
            os.chdir(ps.get_attr(['yodie', 'gcp_run_path']))
            if ps.get_attr(['yodie', 'os']) == 'win':
                cmd = ' '.join([
                    'java',
                    "-Dgate.home=%s" % ps.get_attr(['env', 'gate_home']),
                    "-Dgcp.home=%s" % ps.get_attr(['env', 'gcp_home']),
                    "-Djava.protocol.handler.pkgs=gate.cloud.util.protocols",
                    "-cp .;{SCRIPTDIR}/conf;{SCRIPTDIR}/gcp.jar;{SCRIPTDIR}/lib/*;"
                    "{GATE_HOME}/bin/gate.jar;{GATE_HOME}/lib/*".format(
                        **{"SCRIPTDIR": ps.get_attr(['env', 'gcp_home']),
                           "GATE_HOME": ps.get_attr(['env', 'gate_home'])}),
                    '-Dat.ofai.gate.modularpipelines.configFile="%s/bio-yodie-1-2-1/main-bio/main-bio.config.yaml" '
                    % ps.get_attr(['env', 'yodie_path']),
                    "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
                    "gate.cloud.batch.BatchRunner",
                    "-t %s" % ps.get_attr(['yodie', 'thread_num']),
                    "-b %s" % ps.get_attr(['yodie', 'config_xml_path'])
                ])
            else:
                cmd = ' '.join([
                    'gcp-direct.sh',
                    "-t %s" % ps.get_attr(['yodie', 'thread_num']),
                    "-Xmx%s" % ps.get_attr(['yodie', 'memory']),
                    "-b %s" % ps.get_attr(['yodie', 'config_xml_path']),
                    '-Dat.ofai.gate.modularpipelines.configFile="%s/bio-yodie-1-2-1/main-bio/main-bio.config.yaml" '
                    % ps.get_attr(['env', 'yodie_path']),
                ])
            print cmd
            p = Popen(cmd, shell=True, stderr=STDOUT)
            p.wait()
            if 0 != p.returncode:
                job_status.set_status(False)
                job_status.save()
                exit(p.returncode)

        # 2. do SemEHR concept/entity indexing
        if ps.get_attr(['job', 'semehr-concept']) == 'yes' or ps.get_attr(['job', 'semehr-patients']) == 'yes':
            patients = []
            doc_to_patient = {}
            for r in data_rows:
                patients.append(str(r['patientid']))
                doc_to_patient[str(r['docid'])] = str(r['patientid'])
            patients = list(set(patients))
            do_semehr_index(ps, patients, doc_to_patient)

        # 3. do SemEHR actionable transparency
        if ps.get_attr(['job', 'action_trans']) == 'yes':
            print 'doing transparency...'
            actionable_transparise(settings=ps)

        job_status.set_status(True)
        job_status.save()

def index_mimic_af_cohort_smp():
    med_profile_type = 'medprofile'
    pids = utils.read_text_file('../resources/af_pids.txt')
    print pids
    utils.multi_thread_tasking(pids, 5, smp_index, args=[es, med_profile_type])

def dump_pmc_data(term, page_size, data_path):
    docs = search_pmc(term, page_size)
    utils.save_json_array(docs, join(data_path, 'pmc_docs.json'))
    utils.multi_thread_tasking([d['pmcid'] for d in docs if 'pmcid' in d], 10,
                               do_download_pmc_full_text,
                               args=[join(data_path, 'fulltext')])
    print 'done'

def db_populate_study_results(cohort_sql, doc_ann_sql_temp, doc_ann_pks, dbcnn_file,
                              study_folder, output_folder, sample_sql_temp,
                              thread_num=10, study_config='study.json',
                              sampling=True, sample_size=20):
    """
    populate results for a research study
    :param cohort_sql: cohort selection query
    :param doc_ann_sql_temp: query template for getting a doc_anns item
    :param doc_ann_pks: primary key columns of doc ann table
    :param dbcnn_file: database connection config file
    :param study_folder: study folder
    :param output_folder: where to save the results
    :param sample_sql_temp: query template for getting a sample item (including full text and doc_anns)
    :param thread_num:
    :param study_config:
    :param sampling: whether sampling is needed
    :param sample_size: how many samples per study concept
    :return:
    """
    ret = load_study_ruler(study_folder, None, study_config)
    sa = ret['sa']
    concept_list = sorted([sc.name for sc in sa.study_concepts])
    cui2concept = {}
    for sc in sa.study_concepts:
        for c in sc.concept_closure:
            cui2concept[c] = sc.name
    results = []
    rows = []
    db.query_data(cohort_sql, rows, db.get_db_connection_by_setting(dbcnn_file))
    logging.info('querying results (cohort size:%s)...' % len(rows))
    utils.multi_process_tasking([r['pid'] for r in rows], db_populate_patient_result,
                                num_procs=thread_num,
                                args=[doc_ann_sql_temp, doc_ann_pks, dbcnn_file,
                                      concept_list, cui2concept, positive_patient_filter],
                                thread_init_func=proc_init_container,
                                thread_end_func=proc_final_collect,
                                thread_end_args=[results])

    # populate result table
    c2pks = {}
    for c in concept_list:
        c2pks[c] = []
    s = '\t'.join(['pid'] + concept_list) + '\n'  # header row on its own line
    for r in results:
        pr = [r['p']]
        for c in concept_list:
            if r['c2f'][c]['f'] > 0:
                c2pks[c].append(r['c2f'][c]['docs'][0])
            pr.append(str(r['c2f'][c]['f']))
        s += '\t'.join(pr) + '\n'
    f = join(output_folder, 'result.tsv')
    utils.save_string(s, f)
    logging.info('result table saved to [%s]' % f)

    if sampling:
        logging.info('doing sampling...')
        sampled_result = {}
        for c in c2pks:
            pks = c2pks[c]
            sample_pks = []
            logging.info('doc cache size: %s' % len(pks))
            if len(pks) <= sample_size:
                sample_pks = pks
            else:
                for i in xrange(sample_size):
                    index = random.randrange(len(pks))
                    sample_pks.append(pks[index])
                    del pks[index]
            samples = []
            utils.multi_thread_tasking(sample_pks, thread_num, extract_sample,
                                       args=[c, cui2concept, sample_sql_temp, dbcnn_file, samples])
            sampled_result[c] = samples
            logging.info('%s sampled (%s) results' % (c, len(samples)))
        f = join(output_folder, 'sampled_docs.js')
        utils.save_string('var sample_docs= %s;' % json.dumps(sampled_result), f)
        logging.info('samples saved to %s' % f)
    logging.info('all results populated')

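# Usage sketch (illustrative only) for db_populate_study_results. cohort_sql must return a 'pid'
# column (see r['pid'] above); the shapes of doc_ann_sql_temp and sample_sql_temp depend on
# db_populate_patient_result and extract_sample, which are not shown here, so those values and the
# table/file names below are placeholders only.
def example_db_populate_study_results():
    db_populate_study_results(
        cohort_sql="select pid from study_cohort where cohort_name = 'my_cohort'",
        doc_ann_sql_temp='...',             # placeholder: per-patient doc_anns query template
        doc_ann_pks=['doc_id', 'ann_id'],   # placeholder: primary key columns of the doc_anns table
        dbcnn_file='./dbcnn.json',          # database connection settings file
        study_folder='./studies/my_study',  # folder containing study.json and concept definitions
        output_folder='./output',
        sample_sql_temp='...',              # placeholder: sample (full text + anns) query template
        thread_num=10,
        sampling=True,
        sample_size=20)
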