def concept_analyse(concept_id, condition_label_sql, wrong_label_sql, db_cnf):
    # get condition mention labels
    concept_result = {'concept': concept_id, 'labels': {}}
    mc = MConcept(concept_id)
    results_condition_labels = []
    dutil.query_data(condition_label_sql.format(**{'concept': concept_id}), results_condition_labels,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    for r in results_condition_labels:
        if r['label'] not in mc.name2labels:
            mc.add_label(ConceptLabel(r['label']))
        mc.name2labels[r['label']].condition_mention = r['num']

    results_wrong_labels = []
    dutil.query_data(wrong_label_sql.format(**{'concept': concept_id}), results_wrong_labels,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    for r in results_wrong_labels:
        if r['label'] not in mc.name2labels:
            mc.add_label(ConceptLabel(r['label']))
        mc.name2labels[r['label']].wrong_mention = r['num']

    output = mc.output()
    print output
    return concept_result, output
def analyse_db_doc_anns(sql, ann_sql, pks, update_template, full_text_sql, dbcnn_file, rule_config_file,
                        study_folder, thread_num=10, study_config='study.json', update_status_template=None):
    """
    do database based annotation post processing
    :param sql: get a list of annotation primary keys
    :param ann_sql: a query template to query an ann and its doc full text
    :param pks: an array of primary key columns
    :param update_template: an update query template to update the post-processed ann
    :param full_text_sql: a query template to get a document's full text by its primary keys
    :param dbcnn_file: database connection file
    :param rule_config_file:
    :param study_folder:
    :param thread_num:
    :param study_config:
    :param update_status_template: an optional update query template to flag docs without concepts
    :return:
    """
    ret = load_study_ruler(study_folder, rule_config_file, study_config)
    sa = ret['sa']
    ruler = ret['ruler']
    rows = []
    db.query_data(sql, rows, db.get_db_connection_by_setting(dbcnn_file))
    reader = DBTextReader(full_text_sql, dbcnn_file)
    cnns = []
    for i in xrange(thread_num):
        cnns.append(db.get_db_connection_by_setting(dbcnn_file))
    utils.multi_process_tasking(rows, db_doc_process, num_procs=thread_num,
                                args=[ann_sql, pks, update_template, dbcnn_file, reader, sa, ruler,
                                      update_status_template],
                                thread_wise_objs=cnns)
    for i in xrange(thread_num):
        db.release_db_connection(cnns[i])
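
# A minimal usage sketch for analyse_db_doc_anns (hedged: the table and column names
# below are hypothetical placeholders, not the project's actual schema). Only the
# placeholder positions follow what db_doc_process expects: ann_sql, full_text_sql and
# update_status_template are formatted positionally with the primary-key values, and
# update_template receives the escaped JSON annotations first, then the key values.
def example_analyse_db_doc_anns():
    analyse_db_doc_anns(
        sql='select doc_id from doc_anns where status is null',
        ann_sql="select anns from doc_anns where doc_id = '{0}'",
        pks=['doc_id'],
        update_template="update doc_anns set anns = '{0}' where doc_id = '{1}'",
        full_text_sql="select text from full_text where doc_id = '{0}'",
        dbcnn_file='./dbcnn.json',
        rule_config_file='./rule_config.json',
        study_folder='./studies/my_study',
        update_status_template="update doc_anns set status = 'no_concept' where doc_id = '{0}'")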
def load_phenotype_def_into_db():
    db_cnf = '../../studies/COMOB_SD/dbcnn_input.json'
    p_def_file = './data/phenotype_defs.json'
    pd = utils.load_json_data(p_def_file)

    w_sql = """
    insert into tp_phenotype_concepts (phenotype_id, concept_id) values ('{pid}', '{cid}');
    """
    r_sql = """
    select * from tp_phenotypes
    """
    p_data = []
    dutil.query_data(r_sql, p_data, dutil.get_db_connection_by_setting(db_cnf))
    p2pid = {}
    for r in p_data:
        p2pid[r['phenotype_name']] = r['id']
    for p in pd:
        if p not in p2pid:
            print '%s not found in definition table' % p
            continue
        for c in pd[p]['concepts']:
            sql = w_sql.format(**{'pid': p2pid[p], 'cid': c})
            print 'executing [%s]' % sql
            dutil.query_data(sql, None, dbconn=dutil.get_db_connection_by_setting(db_cnf))
    print 'done'
def db_doc_process(row, sql_template, pks, update_template, dbcnn_file, text_reader, sa, ruler,
                   update_status_template):
    sql = sql_template.format(*[row[k] for k in pks])
    rets = []
    db.query_data(sql, rets, db.get_db_connection_by_setting(dbcnn_file))
    if len(rets) > 0:
        anns = json.loads(fix_escaped_issue(rets[0]['anns']))
        ann_doc = SemEHRAnnDoc()
        ann_doc.load(anns)
        no_concepts = False
        if len(ann_doc.annotations) > 0:
            num_concepts = process_doc_rule(ann_doc, ruler, text_reader, [row[k] for k in pks], sa)
            if num_concepts > 0:
                update_query = update_template.format(*(
                    [db.escape_string(json.dumps(ann_doc.serialise_json()))] + [row[k] for k in pks]))
                # logging.debug('update ann: %s' % update_query)
                db.query_data(update_query, None, db.get_db_connection_by_setting(dbcnn_file))
                logging.info('ann %s updated' % row)
            else:
                no_concepts = True
        else:
            no_concepts = True
        if no_concepts and update_status_template is not None:
            q = update_status_template.format(*[row[k] for k in pks])
            db.query_data(q, None, db.get_db_connection_by_setting(dbcnn_file))
            logging.debug('no concepts found/update %s' % q)
def do_action_trans_docs(docs, nlp, doc_ann_sql_template, doc_content_sql_template,
                         action_trans_update_sql_template, db_conn_file, corpus_predictor):
    """
    do actionable transparency prediction on a batch of docs.
    this function is supposed to be called in a single thread
    :param docs:
    :param nlp:
    :param doc_ann_sql_template:
    :param doc_content_sql_template:
    :param action_trans_update_sql_template:
    :param db_conn_file:
    :param corpus_predictor:
    :return:
    """
    # self_nlp = tstg.load_mode('en')
    for doc_id in docs:
        doc_anns = []
        dutil.query_data(doc_ann_sql_template.format(doc_id['docid']), doc_anns,
                         dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        doc_anns = [{'s': int(ann['s']), 'e': int(ann['e']), 'AnnId': str(ann['AnnId']),
                     'signed_label': '', 'gt_label': '', 'action_trans': ann['action_trans']}
                    for ann in doc_anns]
        if len(doc_anns) == 0:
            continue
        if doc_anns[0]['action_trans'] is not None:
            print 'found trans %s of first ann, skipping doc' % doc_anns[0]['action_trans']
            continue
        doc_container = []
        dutil.query_data(doc_content_sql_template.format(doc_id['docid']), doc_container,
                         dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        ptns = tstg.doc_processing(nlp, unicode(doc_container[0]['content']), doc_anns, doc_id['docid'])
        # print 'doc %s read/model created, predicting...'
        for inst in ptns:
            acc = corpus_predictor.predcit(inst)
            anns = inst.annotations
            sql = action_trans_update_sql_template.format(**{'acc': acc, 'AnnId': anns[0]['AnnId']})
            # print 'executing %s' % sql
            dutil.query_data(sql, container=None,
                             dbconn=dutil.get_db_connection_by_setting(db_conn_file))
def extract_sample(pk_vals, concept, cui2concept, sample_sql_temp, dbcnn_file, container,
                   ontext_filter_fun=positive_patient_filter):
    """
    extract a sample
    :param pk_vals:
    :param concept:
    :param sample_sql_temp:
    :param dbcnn_file:
    :param container:
    :return:
    """
    r = {}
    if type(sample_sql_temp) is dict:
        # two separate sqls to avoid join
        rows = []
        db.query_data(sample_sql_temp['text_sql'].format(*[v for v in pk_vals]), rows,
                      db.get_db_connection_by_setting(dbcnn_file))
        if len(rows) > 0:
            r['text'] = rows[0]['text']
        else:
            r = None
        if r is not None:
            rows = []
            db.query_data(sample_sql_temp['ann_sql'].format(*[v for v in pk_vals]), rows,
                          db.get_db_connection_by_setting(dbcnn_file))
            if len(rows) > 0:
                r['src_table'] = rows[0]['src_table']
                r['src_col'] = rows[0]['src_col']
                r['anns'] = rows[0]['anns']
            else:
                r = None
    else:
        rows = []
        db.query_data(sample_sql_temp.format(*[v for v in pk_vals]), rows,
                      db.get_db_connection_by_setting(dbcnn_file))
        if len(rows) > 0:
            r = rows[0]
        else:
            r = None
    if r is not None:
        anns = json.loads(r['anns'])
        ann_doc = SemEHRAnnDoc()
        ann_doc.load(anns)
        for a in ann_doc.annotations:
            if a.cui in cui2concept and concept == cui2concept[a.cui]:
                correct = len(a.ruled_by) == 0
                if correct and ontext_filter_fun is not None:
                    correct = ontext_filter_fun(a)
                if correct:
                    container.append({'content': r['text'], 'doc_table': r['src_table'],
                                      'doc_col': r['src_col'], 'id': '_'.join(pk_vals),
                                      'annotations': [{'start': a.start, 'end': a.end,
                                                       'concept': a.cui, 'string_orig': a.str}]})
                    break
def query_data(self, query_template, q_obj):
    rows_container = []
    dutil.query_data(query_template.format(**q_obj), rows_container,
                     dbconn=dutil.get_db_connection_by_setting(self.db_conn_file))
    return rows_container
def extract_cohort_docs(self, use_combo_fn_name=True):
    db_conf_file = self._cohort_conf
    db_conf = None
    if 'linux_dsn_setting' in self._conf and self._conf['linux_dsn_setting']:
        # need dsn settings
        db_conf = self.populate_linux_odbc_setting()
        db_conf_file = None
        logging.info('using dsn %s' % db_conf['dsn'])
    query_size = self._conf['query_size'] if 'query_size' in self._conf else 50
    file_pattern = self._conf['file_pattern'] if 'file_pattern' in self._conf else '%s.txt'
    out_put_folder = self._conf['out_put_folder']
    if len(self._patient_ids) == 0:
        logging.info('cohort is empty, has it been loaded?')
        return
    q_temp = self._conf['doc_query_temp']
    logging.info('working on extraction, cohort size:%s' % len(self._patient_ids))
    for idx in range(0, len(self._patient_ids), query_size):
        q = q_temp.format(**{'patient_ids':
                             ",".join(["'%s'" % p for p in self._patient_ids[idx:idx + query_size]])})
        logging.info('querying batch %s' % (idx + 1))
        logging.debug(q)
        docs = []
        db.query_data(q, docs, db.get_db_connection_by_setting(db_conf_file, db_conf))
        if self._dest == 'sql':
            # save docs to database
            self.save_docs_to_db(docs)
        else:
            # save docs to files
            for d in docs:
                if d['doc_content'] is None:
                    continue
                fn = ('%s_%s' % (d['doc_id'], d['patient_id'])) if use_combo_fn_name else ('%s' % d['doc_id'])
                utils.save_string(d['doc_content'], join(out_put_folder, file_pattern % fn))
        logging.info('%s docs saved to destination [%s]' % (len(docs), self._dest))
    logging.info('query finished, docs saved to %s' % out_put_folder)
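
# Hedged sketch of the configuration keys that extract_cohort_docs above reads from
# self._conf. The concrete values are illustrative assumptions; only the key names and
# the {patient_ids} placeholder come from the method itself, and the doc query is
# expected to return doc_id, patient_id and doc_content columns.
example_cohort_extraction_conf = {
    'linux_dsn_setting': False,          # set True to use populate_linux_odbc_setting()
    'query_size': 50,                    # how many patient ids per query batch
    'file_pattern': '%s.txt',            # file name pattern when saving to files
    'out_put_folder': './cohort_docs',
    'doc_query_temp': 'select doc_id, patient_id, doc_content from docs '
                      'where patient_id in ({patient_ids})'
}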
def db_populate_patient_result(container, pid, doc_ann_sql_temp, doc_ann_pks, dbcnn_file, concept_list,
                               cui2concept, ontext_filter_fun=None):
    """
    populate a row (per patient) in the result table
    :param pid:
    :param doc_ann_sql_temp:
    :param doc_ann_pks:
    :param dbcnn_file:
    :param concept_list:
    :param cui2concept:
    :param container:
    :return:
    """
    rows = []
    db.query_data(doc_ann_sql_temp.format(pid), rows, db.get_db_connection_by_setting(dbcnn_file))
    c2f = {}
    for c in concept_list:
        c2f[c] = {'f': 0, 'rf': 0, 'docs': []}
    logging.debug('pid: %s has %s docs' % (pid, len(rows)))
    i = 0
    g2_c2f = {}
    grp = False
    for r in rows:
        try:
            i += 1
            if 'grp' in r:
                grp = True
                if r['grp'] in g2_c2f:
                    c2f = g2_c2f[r['grp']]
                else:
                    c2f = {}
                    for c in concept_list:
                        c2f[c] = {'f': 0, 'rf': 0, 'docs': []}
                    g2_c2f[r['grp']] = c2f
            anns = json.loads(fix_escaped_issue(r['anns']))
            ann_doc = SemEHRAnnDoc()
            ann_doc.load(anns)
            for a in ann_doc.annotations:
                # for c in a.study_concepts:
                if a.cui in cui2concept:
                    c = cui2concept[a.cui]
                    logging.debug('%s found in %s, ruled_by=%s, concepts:%s'
                                  % (c, '-'.join([r[k] for k in doc_ann_pks]), a.ruled_by, a.study_concepts))
                    if c in c2f:
                        correct = len(a.ruled_by) == 0
                        if correct and ontext_filter_fun is not None:
                            correct = ontext_filter_fun(a)
                        if not correct:
                            c2f[c]['rf'] += 1
                        else:
                            c2f[c]['f'] += 1
                            c2f[c]['docs'].append([r[k] for k in doc_ann_pks])
        except Exception as e:
            logging.error('parsing anns %s because of %s' % (fix_escaped_issue(r['anns']), str(e)))
    logging.info('pid %s done' % pid)
    if not grp:
        g2_c2f = c2f
    container.append({'p': pid, 'c2f': g2_c2f, 'grp': grp})
    logging.debug('pid %s with %s, %s' % (pid, len(c2f), len(container)))
def convert_text_ann_from_db(sql_temp, pks, db_conn, full_text_folder, ann_folder,
                             full_text_file_pattern='%s.txt', ann_file_pattern='%s.txt.knowtator.xml'):
    sql = sql_temp.format(**pks)
    results = []
    logging.info('doing [%s]...' % sql)
    file_key = '_'.join([pks[k] for k in pks])
    dbutils.query_data(sql, results, dbutils.get_db_connection_by_setting(db_conn))
    if len(results) > 0:
        text = results[0]['text'].replace('\r', '\n')
        anns = json.loads(results[0]['anns'])
        xml = AnnConverter.to_eHOST(AnnConverter.load_ann(anns, file_key), full_text=text)
        utils.save_string(xml, join(ann_folder, ann_file_pattern % file_key))
        utils.save_string(text, join(full_text_folder, full_text_file_pattern % file_key))
        logging.info('doc [%s] done' % file_key)
    else:
        logging.info('doc/anns [%s] not found' % file_key)
def label_analyse(sql_template_file, db_cnf, output_file=None):
    sql_temps = utils.load_json_data(sql_template_file)
    concepts = []
    dutil.query_data(sql_temps['get_validated_concepts'], concepts,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    s = ''
    for c in concepts:
        data, output = concept_analyse(c['concept_id'], sql_temps['condition_label_sql'],
                                       sql_temps['wrong_label_sql'], db_cnf)
        s += output
    if output_file is not None:
        print 'saving output to %s...' % output_file
        utils.save_string(s, output_file)
def get_docs_for_processing(job_status, job_sql_template, cnn_conf_file):
    """
    retrieve docs to process from a database table/view
    :param job_status:
    :return:
    """
    job_data = job_status.job_start()
    print 'working on %s' % job_data
    container = []
    sqldbutils.query_data(job_sql_template.format(**job_data), container,
                          dbconn=sqldbutils.get_db_connection_by_setting(cnn_conf_file))
    return container
def action_transparentise(cohort_name, db_conn_file, cohort_doc_sql_template, doc_ann_sql_template,
                          doc_content_sql_template, action_trans_update_sql_template, corpus_trans_file):
    """
    use the actionable transparency model to create a confidence value for each annotation;
    this method splits all cohort documents into batches that are then processed in multiple threads
    :param cohort_name:
    :param db_conn_file:
    :param cohort_doc_sql_template:
    :param doc_ann_sql_template:
    :param doc_content_sql_template:
    :param action_trans_update_sql_template:
    :param corpus_trans_file:
    :return:
    """
    docs = []
    dutil.query_data(cohort_doc_sql_template.format(cohort_name), docs,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    batch_size = 500
    batches = []
    for i in range(0, len(docs), batch_size):
        batches.append(docs[i:i + batch_size])
    nlp = tstg.load_mode('en')
    corpus_predictor = tssp.CorpusPredictor.load_corpus_model(corpus_trans_file)
    i = 0  # batch counter for progress reporting
    for batch in batches:
        print 'working on %s/%s batch' % (i, len(batches))
        try:
            do_action_trans_docs(batch, nlp, doc_ann_sql_template, doc_content_sql_template,
                                 action_trans_update_sql_template, db_conn_file, corpus_predictor)
        except Exception as e:
            print 'error processing [%s]' % e
        i += 1
    # utils.multi_thread_tasking(batches, 1, do_action_trans_docs,
    #                            args=[nlp,
    #                                  doc_ann_sql_template,
    #                                  doc_content_sql_template,
    #                                  action_trans_update_sql_template,
    #                                  db_conn_file,
    #                                  corpus_predictor])
    print 'all anns transparentised'
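
# A hedged usage sketch for action_transparentise above. All SQL templates and file
# paths are illustrative assumptions; only the placeholder positions mirror what
# do_action_trans_docs expects (docid rows from the cohort query, s/e/AnnId/action_trans
# columns for annotations, a content column for documents and acc/AnnId for the update).
def example_action_transparentise():
    action_transparentise(
        cohort_name='my_cohort',
        db_conn_file='./dbcnn.json',
        cohort_doc_sql_template="select docid from cohort_docs where cohort = '{0}'",
        doc_ann_sql_template="select s, e, AnnId, action_trans from doc_anns where docid = '{0}'",
        doc_content_sql_template="select content from docs where docid = '{0}'",
        action_trans_update_sql_template="update doc_anns set action_trans = {acc} where AnnId = '{AnnId}'",
        corpus_trans_file='./models/corpus_trans.pickle')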
def complete_sample_ann_data(key_anns, complete_sql, db_conn_file, container):
    k = key_anns[0]
    anns = key_anns[1]
    for ann in anns:
        rows_container = []
        dutil.query_data(complete_sql.format(**{'doc_id': ann['id'],
                                                'start': ann['annotations'][0]['start'],
                                                'end': ann['annotations'][0]['end'],
                                                'concept': ann['annotations'][0]['concept']}),
                         rows_container,
                         dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        if len(rows_container) > 0:
            ann['annotations'][0]['string_orig'] = rows_container[0]['string_orig']
            if 'action_trans' in rows_container[0]:
                ann['annotations'][0]['confidence'] = rows_container[0]['action_trans']
    container.append([k, anns])
def get_db_docs_for_converting(settings):
    sql = settings['sql']
    db_conn = settings['db_conn']
    doc_ann_sql_temp = settings['sql_temp']
    full_text_folder = settings['full_text_folder']
    ann_folder = settings['ann_folder']
    results = []
    dbutils.query_data(sql, results, dbutils.get_db_connection_by_setting(db_conn))
    ds = []
    for r in results:
        ds.append(r)
    logging.info('total docs %s' % len(ds))
    for d in ds:
        AnnConverter.convert_text_ann_from_db(sql_temp=doc_ann_sql_temp,
                                              pks=d,
                                              db_conn=db_conn,
                                              full_text_folder=full_text_folder,
                                              ann_folder=ann_folder)
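
# Hedged sketch of the settings dict consumed by get_db_docs_for_converting above. The
# SQL text and folder values are illustrative assumptions; the key names match the
# function, the listing query should return the primary-key columns used by the per-doc
# template, and that template (keyword-formatted with those columns) should return the
# text and anns columns expected by convert_text_ann_from_db.
example_converting_settings = {
    'sql': 'select doc_id from doc_anns',
    'sql_temp': "select text, anns from doc_anns where doc_id = '{doc_id}'",
    'db_conn': './dbcnn.json',
    'full_text_folder': './ehost/corpus',
    'ann_folder': './ehost/saved'
}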
def download_docs(doc_ids, query, db_conn_setting, out_put_folder):
    """
    download clinical notes from EHR
    :param doc_ids:
    :param query:
    :param db_conn_setting:
    :param out_put_folder:
    :return:
    """
    db_cnn = dutil.get_db_connection_by_setting(db_conn_setting)
    results = []
    q = query.format(**{'ids': ','.join(['\'%s\'' % did for did in doc_ids])})
    print 'querying [%s]' % q
    dutil.query_data(q, results, db_cnn)
    for r in results:
        if r['textcontent'] is not None:
            utils.save_string(r['textcontent'].decode('cp1252').replace(chr(13), ' '),
                              join(out_put_folder, r['cn_doc_id'] + '.txt'),
                              encoding='utf-8')
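
# A hedged usage sketch for download_docs above. The query text and connection file are
# illustrative assumptions; the only requirements taken from the function are the {ids}
# placeholder and the textcontent / cn_doc_id columns in the result rows.
def example_download_docs():
    download_docs(doc_ids=['doc-001', 'doc-002'],
                  query='select cn_doc_id, textcontent from clinical_notes where cn_doc_id in ({ids})',
                  db_conn_setting='./dbcnn.json',
                  out_put_folder='./downloaded_docs')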
def extract_cohort_docs(self):
    db_conf_file = self._cohort_conf
    db_conf = None
    if 'linux_dsn_setting' in self._conf and self._conf['linux_dsn_setting']:
        db_conf = self.populate_linux_odbc_setting()
        db_conf_file = None
        logging.info('using dsn %s' % db_conf['dsn'])
    query_size = self._conf['query_size'] if 'query_size' in self._conf else 50
    file_pattern = self._conf['file_pattern'] if 'file_pattern' in self._conf else '%s.txt'
    out_put_folder = self._conf['out_put_folder']
    if len(self._patient_ids) == 0:
        logging.info('cohort is empty, has it been loaded?')
        return
    q_temp = self._conf['doc_query_temp']
    logging.info('working on extraction, cohort size:%s' % len(self._patient_ids))
    for idx in range(0, len(self._patient_ids), query_size):
        q = q_temp.format(**{'patient_ids':
                             ",".join(["'%s'" % p for p in self._patient_ids[idx:idx + query_size]])})
        logging.info('querying batch %s' % (idx + 1))
        logging.debug(q)
        docs = []
        db.query_data(q, docs, db.get_db_connection_by_setting(db_conf_file, db_conf))
        for d in docs:
            utils.save_string(d['doc_content'], join(out_put_folder, file_pattern % d['doc_id']))
    logging.info('query finished, docs saved to %s' % out_put_folder)
def smp_export(patient_id, es, corpus_mapping, sql_temp, db_cnn):
    """
    structured medical profile extraction
    :param es: elasticsearch index
    :param patient_id:
    :param sql_temp:
    :param db_cnn:
    :return:
    """
    print 'indexing %s' % patient_id
    ds_ids = mimicdao.get_summary_doc_by_patient(patient_id)
    for r in ds_ids:
        doc = es.get_doc_detail(r['row_id'])
        profile = parse_discharge_summary(doc['fulltext'], doc['anns'], corpus_mapping)
        mp = {}
        for sec in profile:
            t = sec['section'] if sec['section'] != '' else 'basic'
            t = t.replace(' ', '_')
            mp[t] = sec
        file_name = '%s_%s.json' % (patient_id, r['row_id'])
        db.query_data(sql_temp.format(**{'patient_id': patient_id,
                                         'doc_id': r['row_id'],
                                         'smp': db.escape_string(json.dumps(mp))}),
                      None, dbconn=db.get_db_connection_by_setting(db_cnn))
        print '%s indexed' % file_name
def generate_result_in_one_iteration(cohort_name, study_analyzer, out_file, sample_size, sample_out_file,
                                     doc_to_brc_sql, brc_sql, anns_iter_sql, skip_term_sql,
                                     doc_content_sql, db_conn_file):
    """
    generate results in a single iteration over all annotations. this is supposed to be much faster
    when working with large sets of study concepts, but rule-based post-processing is not supported
    at the moment.
    :param cohort_name:
    :param study_analyzer:
    :param out_file:
    :param sample_size:
    :param sample_out_file:
    :param doc_to_brc_sql:
    :param brc_sql:
    :param anns_iter_sql:
    :param skip_term_sql:
    :param doc_content_sql:
    :param db_conn_file:
    :return:
    """
    # populate concept to anns maps
    sc2anns = {}
    for sc in study_analyzer.study_concepts:
        sc2anns[sc.name] = []

    # populate patient list
    print 'populating patient list...'
    patients = {}
    rows_container = []
    dutil.query_data(brc_sql.format(cohort_name), rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    for r in rows_container:
        patients[r['brcid']] = {'brcid': r['brcid']}

    # populate document id to patient id dictionary
    print 'populating doc to patient map...'
    rows_container = []
    dutil.query_data(doc_to_brc_sql.format(cohort_name), rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    doc2brc = {}
    for dp in rows_container:
        doc2brc[dp['doc_id']] = dp['brcid']

    # query annotations
    print 'iterating annotations...'
    rows_container = []
    dutil.query_data(
        anns_iter_sql.format(**{
            'cohort_id': cohort_name,
            'extra_constrains': ' \n '.join(
                [generate_skip_term_constrain(study_analyzer, skip_term_sql)] +
                # always keep the skip-term constraint and append any configured extra constraints
                ([] if (study_analyzer.study_options is None or
                        study_analyzer.study_options['extra_constrains'] is None)
                 else study_analyzer.study_options['extra_constrains']))}),
        rows_container,
        dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    for r in rows_container:
        concept_id = r['inst_uri']
        brcid = doc2brc[r['doc_id']] if r['doc_id'] in doc2brc else None
        if brcid is None:
            print 'doc %s not matched to a patient!!!' % r['doc_id']
            continue
        patient = patients[brcid] if brcid in patients else None
        if patient is None:
            print 'brc id %s not matched a patient!!!' % brcid
            continue
        # get matched study concepts
        for sc in study_analyzer.study_concepts:
            if concept_id in sc.concept_closure:
                patient[sc.name] = (patient[sc.name] + 1) if sc.name in patient else 1
                sc2anns[sc.name].append({'ann_id': r['ann_id'], 'doc_id': r['doc_id'],
                                         'concept_id': concept_id,
                                         'start': r['start_offset'], 'end': r['end_offset']})

    # generate result table
    print 'generate result table...'
    concept_labels = sorted([k for k in sc2anns])
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    lines = []
    utils.multi_thread_tasking([patients[pid] for pid in patients], 40, do_put_line,
                               args=[concept_labels, lines])
    s += '\n'.join(lines)
    utils.save_string(s, out_file)

    # generate sample annotations
    term_to_docs = {}
    for concept in sc2anns:
        ann_ids = sc2anns[concept]
        sample_ids = []
        if len(ann_ids) <= sample_size:
            sample_ids = ann_ids
        else:
            for i in xrange(sample_size):
                index = random.randrange(len(ann_ids))
                sample_ids.append(ann_ids[index])
                del ann_ids[index]
        term_to_docs[concept] = sample_ids

    # query doc contents
    print 'populating term to sampled anns...'
    term_to_sampled = {}
    for term in term_to_docs:
        sample_ids = term_to_docs[term]
        if len(sample_ids) <= 0:
            continue
        sample_doc_ids = ['\'' + s['doc_id'] + '\'' for s in sample_ids]
        rows_container = []
        dutil.query_data(doc_content_sql.format(','.join(sample_doc_ids)), rows_container,
                         dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        doc_to_content = {}
        for r in rows_container:
            doc_to_content[r['doc_id']] = r['TextContent']
        term_sampled = []
        for s in sample_ids:
            term_sampled.append({'id': s['doc_id'],
                                 'content': doc_to_content[s['doc_id']],
                                 'annotations': [{'start': s['start'],
                                                  'end': s['end'],
                                                  'concept': s['concept_id']}]})
        term_to_sampled[term] = term_sampled
    utils.save_json_array(convert_encoding(term_to_sampled, 'cp1252', 'utf-8'), sample_out_file)
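
# A hedged usage sketch for generate_result_in_one_iteration above. Every SQL template
# and path here is an illustrative assumption: brc_sql and doc_to_brc_sql are formatted
# positionally with the cohort name, anns_iter_sql is keyword-formatted with cohort_id
# and extra_constrains, doc_content_sql takes a comma-separated doc id list, and the
# exact shape of skip_term_sql depends on generate_skip_term_constrain (not shown here).
def example_generate_result_in_one_iteration():
    sa = load_study_ruler('./studies/my_study', None, 'study.json')['sa']
    generate_result_in_one_iteration(
        cohort_name='my_cohort',
        study_analyzer=sa,
        out_file='./results/result.tsv',
        sample_size=20,
        sample_out_file='./results/sampled_docs.json',
        doc_to_brc_sql="select doc_id, brcid from cohort_docs where cohort = '{0}'",
        brc_sql="select brcid from cohorts where cohort = '{0}'",
        anns_iter_sql="select ann_id, doc_id, inst_uri, start_offset, end_offset "
                      "from anns where cohort_id = '{cohort_id}' {extra_constrains}",
        skip_term_sql="and string_orig not in ({0})",
        doc_content_sql="select doc_id, TextContent from docs where doc_id in ({0})",
        db_conn_file='./dbcnn.json')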
def populate_patient_study_table_post_ruled(cohort_name, study_analyzer, out_file, rule_executor, sample_size,
                                            sample_out_file, ruled_ann_out_file, patients_sql,
                                            term_doc_anns_sql, skip_term_sql, db_conn_file,
                                            text_preprocessing=False):
    """
    populate patient study results with post processing to remove unwanted mentions
    :param cohort_name:
    :param study_analyzer:
    :param out_file:
    :param rule_executor:
    :param sample_size:
    :param sample_out_file:
    :return:
    """
    patients = []
    dutil.query_data(patients_sql.format(cohort_name), patients,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    id2p = {}
    for p in patients:
        id2p[p['brcid']] = p

    non_empty_concepts = []
    study_concepts = study_analyzer.study_concepts
    term_to_docs = {}
    ruled_anns = []
    positive_dumps = []
    skip_terms_list = [t.lower() for t in rule_executor.skip_terms]

    for sc in study_concepts:
        positive_doc_anns = []
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        concept_list = ', '.join(['\'%s\'' % c for c in sc.concept_closure])
        doc_anns = []
        if len(sc.concept_closure) > 0:
            sql_temp = term_doc_anns_sql
            data_sql = sql_temp.format(**{
                'concepts': concept_list,
                'cohort_id': cohort_name,
                'extra_constrains': ' \n '.join(
                    [generate_skip_term_constrain(study_analyzer, skip_term_sql)] +
                    # always keep the skip-term constraint and append any configured extra constraints
                    ([] if (study_analyzer.study_options is None or
                            study_analyzer.study_options['extra_constrains'] is None)
                     else study_analyzer.study_options['extra_constrains']))})
            print data_sql
            dutil.query_data(data_sql, doc_anns,
                             dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        if len(doc_anns) > 0:
            p_to_dfreq = {}
            counted_docs = set()
            for ann in doc_anns:
                p = ann['brcid']
                d = ann['CN_Doc_ID']
                if d in counted_docs:
                    continue
                ruled = False
                case_instance = ''
                if not ruled:
                    # skip term rules
                    if 'string_orig' in ann and ann['string_orig'].lower() in skip_terms_list:
                        ruled = True
                        rule = 'skip-term'
                        case_instance = ann['string_orig']
                if not ruled:
                    # string orig rules - not used now
                    ruled, case_instance = rule_executor.execute_original_string_rules(
                        ann['string_orig'] if 'string_orig' in ann
                        else ann['TextContent'][int(ann['start_offset']):int(ann['end_offset'])])
                    rule = 'original-string-rule'
                if not ruled:
                    # post processing rules
                    ruled, case_instance, rule = \
                        rule_executor.execute(ann['TextContent'] if not text_preprocessing
                                              else preprocessing_text_befor_rule_execution(ann['TextContent']),
                                              int(ann['start_offset']), int(ann['end_offset']),
                                              string_orig=ann['string_orig'] if 'string_orig' in ann else None)
                    rule = 'semehr ' + rule
                if not ruled:
                    # bio-yodie labels
                    if 'experiencer' in ann:
                        if ann['experiencer'].lower() != 'patient' or \
                                ann['temporality'].lower() != 'recent' or \
                                ann['negation'].lower() != 'affirmed':
                            ruled = True
                            case_instance = '\t'.join([ann['experiencer'], ann['temporality'], ann['negation']])
                            rule = 'yodie'
                if ruled:
                    ruled_anns.append({'p': p, 'd': d, 'ruled': rule,
                                       's': ann['start_offset'], 'e': ann['end_offset'],
                                       'c': ann['inst_uri'],
                                       'case-instance': case_instance,
                                       'string_orig': ann['string_orig']})
                else:
                    counted_docs.add(d)
                    p_to_dfreq[p] = 1 if p not in p_to_dfreq else 1 + p_to_dfreq[p]
                    positive_doc_anns.append({'id': ann['CN_Doc_ID'],
                                              'content': ann['TextContent'],
                                              'annotations': [{'start': ann['start_offset'],
                                                               'end': ann['end_offset'],
                                                               'concept': ann['inst_uri'],
                                                               'string_orig': ann['string_orig']
                                                               if 'string_orig' in ann else ''}],
                                              'doc_table': ann['src_table'],
                                              'doc_col': ann['src_col']})
                    positive_dumps.append({'p': p, 'd': d,
                                           's': ann['start_offset'], 'e': ann['end_offset'],
                                           'c': ann['inst_uri'], 'string_orig': ann['string_orig']})
            if len(counted_docs) > 0:
                non_empty_concepts.append(sc_key)
                for p in p_to_dfreq:
                    id2p[p][sc_key] = str(p_to_dfreq[p])

                # save sample docs
                if sample_size >= len(positive_doc_anns):
                    term_to_docs[sc_key] = positive_doc_anns
                else:
                    sampled = []
                    for i in xrange(sample_size):
                        index = random.randrange(len(positive_doc_anns))
                        sampled.append(positive_doc_anns[index])
                        positive_doc_anns.pop(index)
                    term_to_docs[sc_key] = sampled

    concept_labels = sorted(non_empty_concepts)
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    for p in patients:
        s += '\t'.join([p['brcid']] + [p[k] if k in p else '0' for k in concept_labels]) + '\n'
    utils.save_string(s, out_file)
    utils.save_string('var sample_docs=' + json.dumps(convert_encoding(term_to_docs, 'cp1252', 'utf-8')),
                      sample_out_file)
    utils.save_json_array(convert_encoding(ruled_anns, 'cp1252', 'utf-8'), ruled_ann_out_file)
    utils.save_json_array(positive_dumps, out_file + "_json")
    print 'done'
def do_save_doc_to_db(d, sql_temp, db_conf_file):
    if d['doc_content'] is None:
        return
    d['doc_content'] = db.escape_string(d['doc_content'])
    sql = sql_temp.format(**d)
    db.query_data(sql, None, dbconn=db.get_db_connection_by_setting(db_conf_file))
def read_full_text(self, text_key):
    sql = self._qt.format(*[k for k in text_key])
    rets = []
    db.query_data(sql, rets, db.get_db_connection_by_setting(self._cnn_file))
    return rets[0]['text']
def db_populate_study_results(cohort_sql, doc_ann_sql_temp, doc_ann_pks, dbcnn_file,
                              study_folder, output_folder, sample_sql_temp,
                              thread_num=10, study_config='study.json', sampling=True, sample_size=20):
    """
    populate results for a research study
    :param cohort_sql: cohort selection query
    :param doc_ann_sql_temp: query template for getting a doc_anns item
    :param doc_ann_pks: primary key columns of doc ann table
    :param dbcnn_file: database connection config file
    :param study_folder: study folder
    :param output_folder: where to save the results
    :param sample_sql_temp: query template for getting a sample item (including full text and doc_anns)
    :param thread_num:
    :param study_config:
    :param sampling: whether sampling is needed
    :param sample_size: how many samples per study concept
    :return:
    """
    ret = load_study_ruler(study_folder, None, study_config)
    sa = ret['sa']
    concept_list = sorted([sc.name for sc in sa.study_concepts])
    cui2concept = {}
    for sc in sa.study_concepts:
        for c in sc.concept_closure:
            cui2concept[c] = sc.name
    results = []
    rows = []
    db.query_data(cohort_sql, rows, db.get_db_connection_by_setting(dbcnn_file))
    logging.info('querying results (cohort size:%s)...' % len(rows))
    utils.multi_process_tasking([r['pid'] for r in rows], db_populate_patient_result, num_procs=thread_num,
                                args=[doc_ann_sql_temp, doc_ann_pks, dbcnn_file,
                                      concept_list, cui2concept, positive_patient_filter],
                                thread_init_func=proc_init_container,
                                thread_end_func=proc_final_collect,
                                thread_end_args=[results])

    # populate result table
    c2pks = {}
    for c in concept_list:
        c2pks[c] = []
    s = '\t'.join(['pid'] + concept_list) + '\n'
    for r in results:
        pr = [r['p']]
        for c in concept_list:
            if r['c2f'][c]['f'] > 0:
                c2pks[c].append(r['c2f'][c]['docs'][0])
            pr.append(str(r['c2f'][c]['f']))
        s += '\t'.join(pr) + '\n'
    f = join(output_folder, 'result.tsv')
    utils.save_string(s, f)
    logging.info('result table saved to [%s]' % f)

    if sampling:
        logging.info('doing sampling...')
        sampled_result = {}
        for c in c2pks:
            pks = c2pks[c]
            sample_pks = []
            logging.info('doc cache size: %s' % len(pks))
            if len(pks) <= sample_size:
                sample_pks = pks
            else:
                for i in xrange(sample_size):
                    index = random.randrange(len(pks))
                    sample_pks.append(pks[index])
                    del pks[index]
            samples = []
            utils.multi_thread_tasking(sample_pks, thread_num, extract_sample,
                                       args=[c, cui2concept, sample_sql_temp, dbcnn_file, samples])
            sampled_result[c] = samples
            logging.info('%s sampled (%s) results' % (c, len(samples)))
        f = join(output_folder, 'sampled_docs.js')
        utils.save_string('var sample_docs= %s;' % json.dumps(sampled_result), f)
        logging.info('samples saved to %s' % f)
    logging.info('all results populated')
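
# A hedged usage sketch for db_populate_study_results above. The queries and paths are
# illustrative assumptions; the cohort query must return a pid column, the doc_anns
# template is formatted positionally with the patient id and must return the anns column
# plus the primary-key columns, and the sample templates follow what extract_sample
# expects (text and src_table/src_col/anns columns).
def example_db_populate_study_results():
    db_populate_study_results(
        cohort_sql='select pid from cohort_patients',
        doc_ann_sql_temp="select doc_id, anns from doc_anns where patient_id = '{0}'",
        doc_ann_pks=['doc_id'],
        dbcnn_file='./dbcnn.json',
        study_folder='./studies/my_study',
        output_folder='./results',
        sample_sql_temp={
            'text_sql': "select text from docs where doc_id = '{0}'",
            'ann_sql': "select src_table, src_col, anns from doc_anns where doc_id = '{0}'"
        },
        sample_size=20)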