def concept_analyse(concept_id, condition_label_sql, wrong_label_sql, db_cnf):
    # get condition mention labels
    concept_result = {'concept': concept_id, 'labels': {}}
    mc = MConcept(concept_id)
    results_condition_labels = []
    dutil.query_data(condition_label_sql.format(**{'concept': concept_id}),
                     results_condition_labels,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    for r in results_condition_labels:
        if r['label'] not in mc.name2labels:
            mc.add_label(ConceptLabel(r['label']))
        mc.name2labels[r['label']].condition_mention = r['num']

    results_wrong_labels = []
    dutil.query_data(wrong_label_sql.format(**{'concept': concept_id}),
                     results_wrong_labels,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    for r in results_wrong_labels:
        if r['label'] not in mc.name2labels:
            mc.add_label(ConceptLabel(r['label']))
        mc.name2labels[r['label']].wrong_mention = r['num']

    output = mc.output()
    print output
    return concept_result, output
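
# A minimal sketch of the two SQL templates concept_analyse expects. Both take a
# {concept} placeholder and must return rows with 'label' and 'num' columns; the
# table and column names below are hypothetical, not from the original study.
example_condition_label_sql = """
    select label, count(*) num from tp_concept_validation
    where concept_id = '{concept}' and label_type = 'condition'
    group by label
"""
example_wrong_label_sql = """
    select label, count(*) num from tp_concept_validation
    where concept_id = '{concept}' and label_type = 'wrong'
    group by label
"""
# usage (db_cnf points to a db connection config file):
# concept_analyse('C0011849', example_condition_label_sql, example_wrong_label_sql, './dbcnn.json')
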
def analyse_db_doc_anns(sql, ann_sql, pks, update_template, full_text_sql, dbcnn_file, rule_config_file,
                        study_folder, thread_num=10, study_config='study.json', update_status_template=None):
    """
    do database based annotation post processing
    :param sql: get a list of annotation primary keys
    :param ann_sql: a query template to query ann and its doc full text
    :param pks: an array of primary key columns
    :param update_template: an update query template to update post-processed ann
    :param dbcnn_file: database connection file
    :param thread_num:
    :param study_folder:
    :param rule_config_file:
    :param study_config:
    :return:
    """
    ret = load_study_ruler(study_folder, rule_config_file, study_config)
    sa = ret['sa']
    ruler = ret['ruler']
    rows = []
    db.query_data(sql, rows, db.get_db_connection_by_setting(dbcnn_file))
    reader = DBTextReader(full_text_sql, dbcnn_file)
    cnns = []
    for i in xrange(thread_num):
        cnns.append(db.get_db_connection_by_setting(dbcnn_file))
    utils.multi_process_tasking(rows, db_doc_process, num_procs=thread_num,
                                args=[ann_sql, pks, update_template, dbcnn_file, reader, sa, ruler,
                                      update_status_template],
                                thread_wise_objs=cnns)
    for i in xrange(thread_num):
        db.release_db_connection(cnns[i])
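
# A hedged sketch of the query templates that drive analyse_db_doc_anns. The table
# names are made up; what matters is that `sql` returns the primary key columns listed
# in `pks`, that `ann_sql` and `full_text_sql` are positionally formatted with those key
# values (returning 'anns' and 'text' columns respectively), and that `update_template`
# takes the re-serialised annotations followed by the key values.
example_pks = ['doc_id']
example_sql = "select doc_id from semehr_doc_anns where process_status = 0"
example_ann_sql = "select anns from semehr_doc_anns where doc_id = '{0}'"
example_full_text_sql = "select text from docs where doc_id = '{0}'"
example_update_template = "update semehr_doc_anns set anns = '{0}' where doc_id = '{1}'"
example_update_status_template = "update semehr_doc_anns set process_status = 2 where doc_id = '{0}'"
# analyse_db_doc_anns(example_sql, example_ann_sql, example_pks, example_update_template,
#                     example_full_text_sql, './dbcnn.json', 'rules.json', './my_study')
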
Example #3
def load_phenotype_def_into_db():
    db_cnf = '../../studies/COMOB_SD/dbcnn_input.json'
    p_def_file = './data/phenotype_defs.json'
    pd = utils.load_json_data(p_def_file)
    w_sql = """
    insert into tp_phenotype_concepts (phenotype_id, concept_id) values 
    ('{pid}', '{cid}');
    """
    r_sql = """
    select * from tp_phenotypes
    """
    p_data = []
    dutil.query_data(r_sql, p_data, dutil.get_db_connection_by_setting(db_cnf))
    p2pid = {}
    for r in p_data:
        p2pid[r['phenotype_name']] = r['id']
    for p in pd:
        if p not in p2pid:
            print '%s not found in definition table' % p
            continue
        for c in pd[p]['concepts']:
            sql = w_sql.format(**{'pid': p2pid[p], 'cid': c})
            print 'executing [%s]' % sql
            dutil.query_data(sql, None, dbconn=dutil.get_db_connection_by_setting(db_cnf))
    print 'done'
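
# An assumed sketch (not the actual study file) of what ./data/phenotype_defs.json is
# expected to look like: a map from phenotype name to an object with a 'concepts' list
# of CUIs. The phenotype names must match phenotype_name values in tp_phenotypes.
example_phenotype_defs = {
    "diabetes": {"concepts": ["C0011849", "C0011860"]},
    "hypertension": {"concepts": ["C0020538"]}
}
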
def db_doc_process(row, sql_template, pks, update_template, dbcnn_file,
                   text_reader, sa, ruler, update_status_template):
    sql = sql_template.format(*[row[k] for k in pks])
    rets = []
    db.query_data(sql, rets, db.get_db_connection_by_setting(dbcnn_file))
    if len(rets) > 0:
        anns = json.loads(fix_escaped_issue(rets[0]['anns']))
        ann_doc = SemEHRAnnDoc()
        ann_doc.load(anns)
        no_concepts = False
        if len(ann_doc.annotations) > 0:
            num_concepts = process_doc_rule(ann_doc, ruler, text_reader,
                                            [row[k] for k in pks], sa)
            if num_concepts > 0:
                update_query = update_template.format(*(
                    [db.escape_string(json.dumps(ann_doc.serialise_json()))] +
                    [row[k] for k in pks]))
                # logging.debug('update ann: %s' % update_query)
                db.query_data(update_query, None,
                              db.get_db_connection_by_setting(dbcnn_file))
                logging.info('ann %s updated' % row)
            else:
                no_concepts = True
        else:
            no_concepts = True
        if no_concepts and update_status_template is not None:
            q = update_status_template.format(*[row[k] for k in pks])
            db.query_data(q, None, db.get_db_connection_by_setting(dbcnn_file))
            logging.debug('no concepts found/update %s' % q)
Example #5
def do_action_trans_docs(docs, nlp, doc_ann_sql_template,
                         doc_content_sql_template,
                         action_trans_update_sql_template, db_conn_file,
                         corpus_predictor):
    """
    do actionable transparency prediction on a batch of docs.
    this function is supposed to be called in a single thread
    :param docs:
    :param nlp:
    :param doc_ann_sql_template:
    :param doc_content_sql_template:
    :param action_trans_update_sql_template:
    :param db_conn_file:
    :param corpus_predictor:
    :return:
    """
    # self_nlp = tstg.load_mode('en')
    for doc_id in docs:
        doc_anns = []
        dutil.query_data(
            doc_ann_sql_template.format(doc_id['docid']),
            doc_anns,
            dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        doc_anns = [{
            's': int(ann['s']),
            'e': int(ann['e']),
            'AnnId': str(ann['AnnId']),
            'signed_label': '',
            'gt_label': '',
            'action_trans': ann['action_trans']
        } for ann in doc_anns]
        if len(doc_anns) == 0:
            continue
        if doc_anns[0]['action_trans'] is not None:
            print 'found trans %s of first ann, skipping doc' % doc_anns[0][
                'action_trans']
            continue
        doc_container = []
        dutil.query_data(
            doc_content_sql_template.format(doc_id['docid']),
            doc_container,
            dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        ptns = tstg.doc_processing(nlp, unicode(doc_container[0]['content']),
                                   doc_anns, doc_id['docid'])
        # print 'doc %s read/model created, predicting...'
        for inst in ptns:
            acc = corpus_predictor.predcit(inst)
            anns = inst.annotations
            sql = action_trans_update_sql_template.format(
                **{
                    'acc': acc,
                    'AnnId': anns[0]['AnnId']
                })
            # print 'executing %s' % sql
            dutil.query_data(
                sql,
                container=None,
                dbconn=dutil.get_db_connection_by_setting(db_conn_file))
def extract_sample(pk_vals, concept, cui2concept, sample_sql_temp, dbcnn_file, container, ontext_filter_fun=positive_patient_filter):
    """
    extract a sample (full text plus annotations) for a given study concept
    :param pk_vals:
    :param concept:
    :param cui2concept:
    :param sample_sql_temp:
    :param dbcnn_file:
    :param container:
    :param ontext_filter_fun:
    :return:
    """
    r = {}
    if type(sample_sql_temp) is dict:
        # two separate sqls to avoid join
        rows = []
        db.query_data(sample_sql_temp['text_sql'].format(*[v for v in pk_vals]), rows,
                      db.get_db_connection_by_setting(dbcnn_file))
        if len(rows) > 0:
            r['text'] = rows[0]['text']
        else:
            r = None
        if r is not None:
            rows = []
            db.query_data(sample_sql_temp['ann_sql'].format(*[v for v in pk_vals]), rows,
                          db.get_db_connection_by_setting(dbcnn_file))
            if len(rows) > 0:
                r['src_table'] = rows[0]['src_table']
                r['src_col'] = rows[0]['src_col']
                r['anns'] = rows[0]['anns']
            else:
                r = None
    else:
        rows = []
        db.query_data(sample_sql_temp.format(*[v for v in pk_vals]), rows,
                      db.get_db_connection_by_setting(dbcnn_file))
        if len(rows) > 0:
            r = rows[0]
        else:
            r = None

    if r is not None:
        anns = json.loads(r['anns'])
        ann_doc = SemEHRAnnDoc()
        ann_doc.load(anns)
        for a in ann_doc.annotations:
            if a.cui in cui2concept and concept == cui2concept[a.cui]:
                correct = len(a.ruled_by) == 0
                if correct and ontext_filter_fun is not None:
                    correct = ontext_filter_fun(a)
                if correct:
                    container.append({'content': r['text'], 'doc_table': r['src_table'], 'doc_col': r['src_col'],
                                      'id': '_'.join(pk_vals),
                                      'annotations': [{'start': a.start,
                                                       'end': a.end,
                                                       'concept': a.cui,
                                                       'string_orig': a.str}]})
                    break
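
# A hedged sketch of the two-query form of sample_sql_temp. Both queries are formatted
# positionally with the primary key values; 'text_sql' must return a 'text' column and
# 'ann_sql' must return 'src_table', 'src_col' and 'anns'. Table names are illustrative only.
example_sample_sql_temp = {
    'text_sql': "select text from docs where doc_id = '{0}'",
    'ann_sql': "select src_table, src_col, anns from semehr_doc_anns where doc_id = '{0}'"
}
# the single-query form is just one template returning all four columns in one row.
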
 def query_data(self, query_template, q_obj):
     rows_container = []
     dutil.query_data(query_template.format(**q_obj),
                      rows_container,
                      dbconn=dutil.get_db_connection_by_setting(
                          self.db_conn_file))
     return rows_container
Example #8
 def extract_cohort_docs(self, use_combo_fn_name=True):
     db_conf_file = self._cohort_conf
     db_conf = None
     if 'linux_dsn_setting' in self._conf and self._conf['linux_dsn_setting']:
         # need dsn settings
         db_conf = self.populate_linux_odbc_setting()
         db_conf_file = None
         logging.info('using dsn %s' % db_conf['dsn'])
     query_size = self._conf['query_size'] if 'query_size' in self._conf else 50
     file_pattern = self._conf['file_pattern'] if 'file_pattern' in self._conf else '%s.txt'
     out_put_folder = self._conf['out_put_folder']
     if len(self._patient_ids) == 0:
         logging.info('cohort is empty, has it been loaded?')
         return
     q_temp = self._conf['doc_query_temp']
     logging.info('working on extraction, cohort size:%s' % len(self._patient_ids))
     for idx in range(0, len(self._patient_ids), query_size):
         q = q_temp.format(**{'patient_ids': ",".join(["'%s'" % p for p in self._patient_ids[idx:idx+query_size]])})
         logging.info('querying batch %s' % (idx + 1))
         logging.debug(q)
         docs = []
         db.query_data(q, docs, db.get_db_connection_by_setting(db_conf_file, db_conf))
         if self._dest == 'sql':
             # save docs to database
             self.save_docs_to_db(docs)
         else:
             # save docs to files
             for d in docs:
                 if d['doc_content'] is None:
                     continue
                 fn = ('%s_%s' % (d['doc_id'], d['patient_id'])) if use_combo_fn_name else ('%s' % d['doc_id'])
                 utils.save_string(d['doc_content'], join(out_put_folder, file_pattern % fn))
         logging.info('%s docs saved to destination [%s]' % (len(docs), self._dest))
     logging.info('query finished, docs saved to %s' % out_put_folder)
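
# An assumed sketch of the relevant part of the extraction config (self._conf). The
# doc_query_temp must accept a {patient_ids} placeholder (filled with a quoted,
# comma-separated id list) and return 'doc_id', 'patient_id' and 'doc_content' columns.
example_extraction_conf = {
    "linux_dsn_setting": False,
    "query_size": 50,
    "file_pattern": "%s.txt",
    "out_put_folder": "./cohort_docs",
    "doc_query_temp": "select doc_id, patient_id, doc_content from docs where patient_id in ({patient_ids})"
}
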
def db_populate_patient_result(container, pid, doc_ann_sql_temp, doc_ann_pks, dbcnn_file, concept_list,
                               cui2concept,
                               ontext_filter_fun=None):
    """
    populate one row (per patient) in the result table
    :param container:
    :param pid:
    :param doc_ann_sql_temp:
    :param doc_ann_pks:
    :param dbcnn_file:
    :param concept_list:
    :param cui2concept:
    :param ontext_filter_fun:
    :return:
    """
    rows = []
    db.query_data(doc_ann_sql_temp.format(pid), rows, db.get_db_connection_by_setting(dbcnn_file))
    c2f = {}
    for c in concept_list:
        c2f[c] = {'f': 0, 'rf': 0, 'docs': []}
    logging.debug('pid: %s has %s docs' % (pid, len(rows)))
    i = 0
    g2_c2f = {}
    grp = False
    for r in rows:
        try:
            i += 1
            if 'grp' in r:
                grp = True
                if r['grp'] in g2_c2f:
                    c2f = g2_c2f[r['grp']]
                else:
                    c2f = {}
                    for c in concept_list:
                        c2f[c] = {'f': 0, 'rf': 0, 'docs': []}
                    g2_c2f[r['grp']] = c2f
            anns = json.loads(fix_escaped_issue(r['anns']))
            ann_doc = SemEHRAnnDoc()
            ann_doc.load(anns)
            for a in ann_doc.annotations:
                # for c in a.study_concepts:
                if a.cui in cui2concept:
                    c = cui2concept[a.cui]
                    logging.debug('%s found in %s, ruled_by=%s, concepts:%s' % (c, '-'.join([r[k] for k in doc_ann_pks]),
                                                                   a.ruled_by, a.study_concepts))
                    if c in c2f:
                        correct = len(a.ruled_by) == 0
                        if correct and ontext_filter_fun is not None:
                            correct = ontext_filter_fun(a)
                        if not correct:
                            c2f[c]['rf'] += 1
                        else:
                            c2f[c]['f'] += 1
                            c2f[c]['docs'].append([r[k] for k in doc_ann_pks])
        except Exception as e:
            logging.error('parsing anns %s because of %s' % (fix_escaped_issue(r['anns']), str(e)))
    logging.info('pid %s done' % pid)
    if not grp:
        g2_c2f = c2f
    container.append({'p': pid, 'c2f': g2_c2f, 'grp': grp})
    logging.debug('pid %s with %s, %s' % (pid, len(c2f), len(container)))
 def convert_text_ann_from_db(sql_temp,
                              pks,
                              db_conn,
                              full_text_folder,
                              ann_folder,
                              full_text_file_pattern='%s.txt',
                              ann_file_pattern='%s.txt.knowtator.xml'):
     sql = sql_temp.format(**pks)
     results = []
     logging.info('doing [%s]...' % sql)
     file_key = '_'.join([pks[k] for k in pks])
     dbutils.query_data(sql, results,
                        dbutils.get_db_connection_by_setting(db_conn))
     if len(results) > 0:
         text = results[0]['text'].replace('\r', '\n')
         anns = json.loads(results[0]['anns'])
         xml = AnnConverter.to_eHOST(AnnConverter.load_ann(anns, file_key),
                                     full_text=text)
         utils.save_string(xml, join(ann_folder,
                                     ann_file_pattern % file_key))
         utils.save_string(
             text, join(full_text_folder,
                        full_text_file_pattern % file_key))
         logging.info('doc [%s] done' % file_key)
     else:
         logging.info('doc/anns [%s] not found' % file_key)
Example #11
def label_analyse(sql_template_file, db_cnf, output_file=None):
    sql_temps = utils.load_json_data(sql_template_file)
    concepts = []
    dutil.query_data(sql_temps['get_validated_concepts'], concepts,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    s = ''
    for c in concepts:
        data, output = concept_analyse(c['concept_id'], sql_temps['condition_label_sql'], sql_temps['wrong_label_sql'], db_cnf)
        s += output
    if output_file is not None:
        print 'saving output to %s...' % output_file
        utils.save_string(s, output_file)
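
# A hedged sketch of the JSON file passed to label_analyse as sql_template_file. It needs
# three templates: 'get_validated_concepts' (returning a concept_id column) plus the two
# per-concept label queries used by concept_analyse above; table names are hypothetical.
example_label_sql_templates = {
    "get_validated_concepts": "select distinct concept_id from tp_concept_validation",
    "condition_label_sql": "select label, count(*) num from tp_concept_validation "
                           "where concept_id = '{concept}' and label_type = 'condition' group by label",
    "wrong_label_sql": "select label, count(*) num from tp_concept_validation "
                       "where concept_id = '{concept}' and label_type = 'wrong' group by label"
}
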
def get_docs_for_processing(job_status, job_sql_template, cnn_conf_file):
    """
    retrieve docs to process from a database table/view
    :param job_status:
    :param job_sql_template:
    :param cnn_conf_file:
    :return:
    """
    job_data = job_status.job_start()
    print 'working on %s' % job_data
    container = []
    sqldbutils.query_data(job_sql_template.format(**job_data), container,
                          dbconn=sqldbutils.get_db_connection_by_setting(cnn_conf_file))
    return container
def action_transparentise(cohort_name, db_conn_file,
                          cohort_doc_sql_template,
                          doc_ann_sql_template,
                          doc_content_sql_template,
                          action_trans_update_sql_template,
                          corpus_trans_file):
    """
    use the actionable transparency model to compute a confidence value for each annotation;
    this method splits all cohort documents into batches that are processed in multiple threads
    :param cohort_name:
    :param db_conn_file:
    :param cohort_doc_sql_template:
    :param doc_ann_sql_template:
    :param doc_content_sql_template:
    :param action_trans_update_sql_template:
    :param corpus_trans_file:
    :return:
    """
    docs = []
    dutil.query_data(cohort_doc_sql_template.format(cohort_name), docs,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    batch_size = 500
    batches = []
    for i in range(0, len(docs), batch_size):
        batches.append(docs[i:i+batch_size])
    nlp = tstg.load_mode('en')
    corpus_predictor = tssp.CorpusPredictor.load_corpus_model(corpus_trans_file)
    i = 0
    for batch in batches:
        print 'working on batch %s/%s' % (i + 1, len(batches))
        try:
            do_action_trans_docs(batch, 
                                 nlp,
                                 doc_ann_sql_template,
                                 doc_content_sql_template,
                                 action_trans_update_sql_template,
                                 db_conn_file,
                                 corpus_predictor)
        except Exception as e:
            print 'error processing [%s]' % e
        i += 1
    #utils.multi_thread_tasking(batches, 1, do_action_trans_docs,
    #                           args=[nlp,
    #                                 doc_ann_sql_template,
    #                                 doc_content_sql_template,
    #                                 action_trans_update_sql_template,
    #                                 db_conn_file,
    #                                 corpus_predictor
    #                                 ])
    print 'all anns transparentised'
def complete_sample_ann_data(key_anns, complete_sql, db_conn_file, container):
    k = key_anns[0]
    anns = key_anns[1]
    for ann in anns:
        rows_container = []
        dutil.query_data(complete_sql.format(**{'doc_id': ann['id'],
                                                'start': ann['annotations'][0]['start'],
                                                'end': ann['annotations'][0]['end'],
                                                'concept': ann['annotations'][0]['concept']}),
                         rows_container,
                         dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        if len(rows_container) > 0:
            ann['annotations'][0]['string_orig'] = rows_container[0]['string_orig']
            if 'action_trans' in rows_container[0]:
                ann['annotations'][0]['confidence'] = rows_container[0]['action_trans']
    container.append([k, anns])
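
# A minimal sketch of the complete_sql template: it is formatted with named placeholders
# {doc_id}, {start}, {end} and {concept}, and should return at least a 'string_orig' column
# (optionally 'action_trans' for a confidence value). Table and column names are assumptions.
example_complete_sql = """
    select string_orig, action_trans from semehr_annotations
    where doc_id = '{doc_id}' and start_offset = {start} and end_offset = {end}
      and inst_uri = '{concept}'
"""
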
 def get_db_docs_for_converting(settings):
     sql = settings['sql']
     db_conn = settings['db_conn']
     doc_ann_sql_temp = settings['sql_temp']
     full_text_folder = settings['full_text_folder']
     ann_folder = settings['ann_folder']
     results = []
     dbutils.query_data(sql, results,
                        dbutils.get_db_connection_by_setting(db_conn))
     ds = []
     for r in results:
         ds.append(r)
     logging.info('total docs %s' % len(ds))
     for d in ds:
         AnnConverter.convert_text_ann_from_db(
             sql_temp=doc_ann_sql_temp,
             pks=d,
             db_conn=db_conn,
             full_text_folder=full_text_folder,
             ann_folder=ann_folder)
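
# An assumed sketch of the settings dict for get_db_docs_for_converting. 'sql' selects the
# primary key columns of the docs to convert; each returned row is then passed as the pks
# of 'sql_temp', which must return 'text' and 'anns' columns (see convert_text_ann_from_db).
example_convert_settings = {
    'sql': "select doc_id from semehr_doc_anns where validated = 1",
    'sql_temp': "select d.text, a.anns from docs d, semehr_doc_anns a "
                "where d.doc_id = a.doc_id and d.doc_id = '{doc_id}'",
    'db_conn': './dbcnn.json',
    'full_text_folder': './ehost/corpus',
    'ann_folder': './ehost/saved'
}
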
def download_docs(doc_ids, query, db_conn_setting, out_put_folder):
    """
    download clinical notes from EHR
    :param doc_ids:
    :param query:
    :param db_conn_setting:
    :param out_put_folder:
    :return:
    """
    db_cnn = dutil.get_db_connection_by_setting(db_conn_setting)
    results = []
    q = query.format(**{'ids': ','.join(['\'%s\'' % did for did in doc_ids])})
    print 'querying [%s]' % q
    print q
    dutil.query_data(q, results, db_cnn)
    for r in results:
        if r['textcontent'] is not None:
            utils.save_string(r['textcontent'].decode('cp1252').replace(
                chr(13), ' '),
                              join(out_put_folder, r['cn_doc_id'] + '.txt'),
                              encoding='utf-8')
Example #17
 def extract_cohort_docs(self):
     db_conf_file = self._cohort_conf
     db_conf = None
     if 'linux_dsn_setting' in self._conf and self._conf[
             'linux_dsn_setting']:
         db_conf = self.populate_linux_odbc_setting()
         db_conf_file = None
         logging.info('using dsn %s' % db_conf['dsn'])
     query_size = self._conf[
         'query_size'] if 'query_size' in self._conf else 50
     file_pattern = self._conf[
         'file_pattern'] if 'file_pattern' in self._conf else '%s.txt'
     out_put_folder = self._conf['out_put_folder']
     if len(self._patient_ids) == 0:
         logging.info('cohort is empty, has it been loaded?')
         return
     q_temp = self._conf['doc_query_temp']
     logging.info('working on extraction, cohort size:%s' %
                  len(self._patient_ids))
     for idx in range(0, len(self._patient_ids), query_size):
         q = q_temp.format(
             **{
                 'patient_ids':
                 ",".join([
                     "'%s'" % p
                     for p in self._patient_ids[idx:idx + query_size]
                 ])
             })
         logging.info('querying batch %s' % (idx + 1))
         logging.debug(q)
         docs = []
         db.query_data(
             q, docs,
             db.get_db_connection_by_setting(db_conf_file, db_conf))
         for d in docs:
             utils.save_string(
                 d['doc_content'],
                 join(out_put_folder, file_pattern % d['doc_id']))
     logging.info('query finished, docs saved to %s' % out_put_folder)
def smp_export(patient_id, es, corpus_mapping, sql_temp, db_cnn):
    """
    structured medical profile extraction
    :param patient_id:
    :param es: elasticsearch index
    :param corpus_mapping:
    :param sql_temp:
    :param db_cnn:
    :return:
    """
    print 'indexing %s' % patient_id
    ds_ids = mimicdao.get_summary_doc_by_patient(patient_id)
    for r in ds_ids:
        doc = es.get_doc_detail(r['row_id'])
        profile = parse_discharge_summary(doc['fulltext'], doc['anns'], corpus_mapping)
        mp = {}
        for sec in profile:
            t = sec['section'] if sec['section'] != '' else 'basic'
            t = t.replace(' ', '_')
            mp[t] = sec
        file_name = '%s_%s.json' % (patient_id, r['row_id'])
        db.query_data(sql_temp.format(**{'patient_id': patient_id, 'doc_id': r['row_id'],
                                         'smp': db.escape_string(json.dumps(mp))}),
                      None, dbconn=db.get_db_connection_by_setting(db_cnn))
        print '%s indexed' % file_name
def generate_result_in_one_iteration(cohort_name, study_analyzer, out_file,
                                     sample_size, sample_out_file,
                                     doc_to_brc_sql, brc_sql, anns_iter_sql, skip_term_sql, doc_content_sql,
                                     db_conn_file):
    """
    generate results in one iteration over all annotations. this is supposed to be much faster when working on
    large sets of study concepts, but rule-based post-processing is not supported yet
    :param cohort_name:
    :param study_analyzer:
    :param out_file:
    :param sample_size:
    :param sample_out_file:
    :param doc_to_brc_sql:
    :param brc_sql:
    :param anns_iter_sql:
    :param skip_term_sql:
    :param doc_content_sql:
    :param db_conn_file:
    :return:
    """
    # populate concept to anns maps
    sc2anns = {}
    for sc in study_analyzer.study_concepts:
        sc2anns[sc.name] = []

    # populate patient list
    print 'populating patient list...'
    patients = {}
    rows_container = []
    dutil.query_data(brc_sql.format(cohort_name), rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    for r in rows_container:
        patients[r['brcid']] = {'brcid': r['brcid']}

    # populate document id to patient id dictionary
    print 'populating doc to patient map...'
    rows_container = []
    dutil.query_data(doc_to_brc_sql.format(cohort_name), rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    doc2brc = {}
    for dp in rows_container:
        doc2brc[dp['doc_id']] = dp['brcid']

    # query annotations
    print 'iterating annotations...'
    rows_container = []
    dutil.query_data(anns_iter_sql.format(**{'cohort_id': cohort_name,
                                             'extra_constrains':
                                                 ' \n '.join(
                                                     [generate_skip_term_constrain(study_analyzer, skip_term_sql)]
                                                     + ([] if (study_analyzer.study_options is None or
                                                               study_analyzer.study_options['extra_constrains'] is None)
                                                        else study_analyzer.study_options['extra_constrains']))}),
                     rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    for r in rows_container:
        concept_id = r['inst_uri']
        brcid = doc2brc[r['doc_id']] if r['doc_id'] in doc2brc else None
        if brcid is None:
            print 'doc %s not matched to a patient!!!' % r['doc_id']
            continue
        patient = patients[brcid] if brcid in patients else None
        if patient is None:
            print 'brc id %s not matched a patient!!!' % brcid
            continue
        # get matched study concepts
        for sc in study_analyzer.study_concepts:
            if concept_id in sc.concept_closure:
                patient[sc.name] = (patient[sc.name] + 1) if sc.name in patient else 1
                sc2anns[sc.name].append({'ann_id': r['ann_id'], 'doc_id': r['doc_id'], 'concept_id': concept_id,
                                         'start': r['start_offset'], 'end': r['end_offset']})

    # generate result table
    print 'generate result table...'
    concept_labels = sorted([k for k in sc2anns])
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    lines = []
    utils.multi_thread_tasking([patients[pid] for pid in patients], 40, do_put_line,
                               args=[concept_labels, lines])
    s += '\n'.join(lines)
    utils.save_string(s, out_file)

    # generate sample annotations
    term_to_docs = {}
    for concept in sc2anns:
        ann_ids = sc2anns[concept]
        sample_ids = []
        if len(ann_ids) <= sample_size:
            sample_ids = ann_ids
        else:
            for i in xrange(sample_size):
                index = random.randrange(len(ann_ids))
                sample_ids.append(ann_ids[index])
                del ann_ids[index]
        term_to_docs[concept] = sample_ids

    # query doc contents
    print 'populating term to sampled anns...'
    term_to_sampled = {}
    for term in term_to_docs:
        sample_ids = term_to_docs[term]
        if len(sample_ids) <= 0:
            continue
        sample_doc_ids = ['\'' + s['doc_id'] + '\'' for s in sample_ids]
        rows_container = []
        dutil.query_data(doc_content_sql.format(','.join(sample_doc_ids)), rows_container,
                         dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        doc_to_content = {}
        for r in rows_container:
            doc_to_content[r['doc_id']] = r['TextContent']
        term_sampled = []
        for s in sample_ids:
            term_sampled.append({'id': s['doc_id'],
                                 'content': doc_to_content[s['doc_id']],
                                 'annotations': [{'start': s['start'],
                                                  'end': s['end'],
                                                  'concept': s['concept_id']}]})
        term_to_sampled[term] = term_sampled
    utils.save_json_array(convert_encoding(term_to_sampled, 'cp1252', 'utf-8'), sample_out_file)
def populate_patient_study_table_post_ruled(cohort_name, study_analyzer, out_file, rule_executor,
                                            sample_size, sample_out_file, ruled_ann_out_file,
                                            patients_sql, term_doc_anns_sql, skip_term_sql,
                                            db_conn_file, text_preprocessing=False):
    """
    populate the patient study result table with post-processing to remove unwanted mentions
    :param cohort_name:
    :param study_analyzer:
    :param out_file:
    :param rule_executor:
    :param sample_size:
    :param sample_out_file:
    :param ruled_ann_out_file:
    :param patients_sql:
    :param term_doc_anns_sql:
    :param skip_term_sql:
    :param db_conn_file:
    :param text_preprocessing:
    :return:
    """
    patients = []
    dutil.query_data(patients_sql.format(cohort_name), patients, dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    id2p = {}
    for p in patients:
        id2p[p['brcid']] = p

    non_empty_concepts = []
    study_concepts = study_analyzer.study_concepts
    term_to_docs = {}
    ruled_anns = []
    positive_dumps = []
    skip_terms_list = [t.lower() for t in rule_executor.skip_terms]
    for sc in study_concepts:
        positive_doc_anns = []
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        concept_list = ', '.join(['\'%s\'' % c for c in sc.concept_closure])
        doc_anns = []
        if len(sc.concept_closure) > 0:
            sql_temp = term_doc_anns_sql
            data_sql = sql_temp.format(**{'concepts': concept_list,
                                          'cohort_id': cohort_name,
                                          'extra_constrains':
                                              ' \n '.join(
                                                  [generate_skip_term_constrain(study_analyzer, skip_term_sql)]
                                                  + ([] if (study_analyzer.study_options is None or
                                                            study_analyzer.study_options['extra_constrains'] is None)
                                                     else study_analyzer.study_options['extra_constrains']))})
            print data_sql
            dutil.query_data(data_sql, doc_anns,
                             dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        if len(doc_anns) > 0:
            p_to_dfreq = {}
            counted_docs = set()
            for ann in doc_anns:
                p = ann['brcid']
                d = ann['CN_Doc_ID']
                if d in counted_docs:
                    continue
                ruled = False
                case_instance = ''
                if not ruled:
                    # skip term rules
                    if 'string_orig' in ann and ann['string_orig'].lower() in skip_terms_list:
                        ruled = True
                        rule = 'skip-term'
                        case_instance = ann['string_orig']
                if not ruled:
                    # original-string rules - not used now
                    ruled, case_instance = rule_executor.execute_original_string_rules(
                        ann['string_orig'] if 'string_orig' in ann
                        else ann['TextContent'][int(ann['start_offset']):int(ann['end_offset'])])
                    rule = 'original-string-rule'
                if not ruled:
                    # post processing rules
                    ruled, case_instance, rule = \
                        rule_executor.execute(ann['TextContent'] if not text_preprocessing else
                                              preprocessing_text_befor_rule_execution(ann['TextContent']),
                                              int(ann['start_offset']),
                                              int(ann['end_offset']),
                                              string_orig=ann['string_orig'] if 'string_orig' in ann else None)
                    rule = 'semehr ' + rule
                if not ruled:
                    # bio-yodie labels
                    if 'experiencer' in ann:
                        if ann['experiencer'].lower() != 'patient' or \
                                ann['temporality'].lower() != 'recent' or \
                                ann['negation'].lower() != 'affirmed':
                            ruled = True
                            case_instance = '\t'.join([ann['experiencer'], ann['temporality'], ann['negation']])
                            rule = 'yodie'
                if ruled:
                    ruled_anns.append({'p': p, 'd': d, 'ruled': rule, 's': ann['start_offset'],
                                       'e': ann['end_offset'],
                                       'c': ann['inst_uri'],
                                       'case-instance': case_instance,
                                       'string_orig': ann['string_orig']
                                       })
                else:
                    counted_docs.add(d)
                    p_to_dfreq[p] = 1 if p not in p_to_dfreq else 1 + p_to_dfreq[p]
                    positive_doc_anns.append({'id': ann['CN_Doc_ID'],
                                              'content': ann['TextContent'],
                                              'annotations': [{'start': ann['start_offset'],
                                                               'end': ann['end_offset'],
                                                               'concept': ann['inst_uri'],
                                                               'string_orig': ann[
                                                                   'string_orig'] if 'string_orig' in ann else ''}],
                                              'doc_table': ann['src_table'],
                                              'doc_col': ann['src_col']})
                    positive_dumps.append({'p': p, 'd': d, 's': ann['start_offset'],
                                           'e': ann['end_offset'],
                                           'c': ann['inst_uri'],
                                           'string_orig': ann['string_orig']})
            if len(counted_docs) > 0:
                non_empty_concepts.append(sc_key)
                for p in p_to_dfreq:
                    id2p[p][sc_key] = str(p_to_dfreq[p])

                # save sample docs
                if sample_size >= len(positive_doc_anns):
                    term_to_docs[sc_key] = positive_doc_anns
                else:
                    sampled = []
                    for i in xrange(sample_size):
                        index = random.randrange(len(positive_doc_anns))
                        sampled.append(positive_doc_anns[index])
                        positive_doc_anns.pop(index)
                    term_to_docs[sc_key] = sampled

    concept_labels = sorted(non_empty_concepts)
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    for p in patients:
        s += '\t'.join([p['brcid']] + [p[k] if k in p else '0' for k in concept_labels]) + '\n'
    utils.save_string(s, out_file)
    utils.save_string('var sample_docs=' + json.dumps(convert_encoding(term_to_docs, 'cp1252', 'utf-8')), sample_out_file)
    utils.save_json_array(convert_encoding(ruled_anns, 'cp1252', 'utf-8'), ruled_ann_out_file)
    utils.save_json_array(positive_dumps, out_file + "_json")
    print 'done'
Example #21
 def do_save_doc_to_db(d, sql_temp, db_conf_file):
     if d['doc_content'] is None:
         return
     d['doc_content'] = db.escape_string(d['doc_content'])
     sql = sql_temp.format(**d)
     db.query_data(sql, None, dbconn=db.get_db_connection_by_setting(db_conf_file))
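
# A hedged sketch of sql_temp for do_save_doc_to_db: it is formatted with the doc dict
# itself, so the named placeholders must match the columns returned by the cohort doc
# query (e.g. doc_id, patient_id, doc_content); the target table name is illustrative.
example_save_doc_sql_temp = ("insert into cohort_docs (doc_id, patient_id, doc_content) "
                             "values ('{doc_id}', '{patient_id}', '{doc_content}')")
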
Example #22
 def read_full_text(self, text_key):
     sql = self._qt.format(*[k for k in text_key])
     rets = []
     db.query_data(sql, rets,
                   db.get_db_connection_by_setting(self._cnn_file))
     return rets[0]['text']
Example #23
def db_populate_study_results(cohort_sql,
                              doc_ann_sql_temp,
                              doc_ann_pks,
                              dbcnn_file,
                              study_folder,
                              output_folder,
                              sample_sql_temp,
                              thread_num=10,
                              study_config='study.json',
                              sampling=True,
                              sample_size=20):
    """
    populate results for a research study
    :param cohort_sql: cohort selection query
    :param doc_ann_sql_temp: query template for getting a doc_anns item
    :param doc_ann_pks: primary key columns of doc ann table
    :param dbcnn_file: database connection config file
    :param study_folder: study folder
    :param output_folder: where to save the results
    :param sample_sql_temp: query template for getting a sample item (including full text and doc_anns)
    :param thread_num:
    :param study_config:
    :param sampling: whether sampling is needed
    :param sample_size: how many samples per study concept
    :return:
    """
    ret = load_study_ruler(study_folder, None, study_config)
    sa = ret['sa']
    concept_list = sorted([sc.name for sc in sa.study_concepts])
    cui2concept = {}
    for sc in sa.study_concepts:
        for c in sc.concept_closure:
            cui2concept[c] = sc.name
    results = []
    rows = []
    db.query_data(cohort_sql, rows,
                  db.get_db_connection_by_setting(dbcnn_file))
    logging.info('querying results (cohort size:%s)...' % len(rows))
    utils.multi_process_tasking([r['pid'] for r in rows],
                                db_populate_patient_result,
                                num_procs=thread_num,
                                args=[
                                    doc_ann_sql_temp, doc_ann_pks, dbcnn_file,
                                    concept_list, cui2concept,
                                    positive_patient_filter
                                ],
                                thread_init_func=proc_init_container,
                                thread_end_func=proc_final_collect,
                                thread_end_args=[results])
    # populate result table
    c2pks = {}
    for c in concept_list:
        c2pks[c] = []
    s = '\t'.join(['pid'] + concept_list) + '\n'
    for r in results:
        pr = [r['p']]
        for c in concept_list:
            if r['c2f'][c]['f'] > 0:
                c2pks[c].append(r['c2f'][c]['docs'][0])
            pr.append(str(r['c2f'][c]['f']))
        s += '\t'.join(pr) + '\n'
    f = join(output_folder, 'result.tsv')
    utils.save_string(s, f)
    logging.info('result table saved to [%s]' % f)
    if sampling:
        logging.info('doing sampling...')
        sampled_result = {}
        for c in c2pks:
            pks = c2pks[c]
            sample_pks = []
            logging.info('doc cache size: %s' % len(pks))
            if len(pks) <= sample_size:
                sample_pks = pks
            else:
                for i in xrange(sample_size):
                    index = random.randrange(len(pks))
                    sample_pks.append(pks[index])
                    del pks[index]
            samples = []
            utils.multi_thread_tasking(
                sample_pks,
                thread_num,
                extract_sample,
                args=[c, cui2concept, sample_sql_temp, dbcnn_file, samples])
            sampled_result[c] = samples
            logging.info('%s sampled (%s) results' % (c, len(samples)))

        f = join(output_folder, 'sampled_docs.js')
        utils.save_string('var sample_docs= %s;' % json.dumps(sampled_result),
                          f)
        logging.info('samples saved to %s' % f)
    logging.info('all results populated')
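
# A hedged end-to-end sketch of driving db_populate_study_results; all table names, columns
# and file paths are assumptions. doc_ann_sql_temp is formatted with the patient id and must
# return the doc_ann_pks columns plus 'anns'; sample_sql_temp follows extract_sample above.
example_cohort_sql = "select patient_id pid from cohort_patients where cohort = 'my_cohort'"
example_doc_ann_sql_temp = "select doc_id, anns from semehr_doc_anns where patient_id = '{0}'"
# db_populate_study_results(example_cohort_sql, example_doc_ann_sql_temp, ['doc_id'],
#                           './dbcnn.json', './my_study', './results', example_sample_sql_temp,
#                           thread_num=10, sampling=True, sample_size=20)
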