def concept_analyse(concept_id, condition_label_sql, wrong_label_sql, db_cnf):
    # get condition mention labels
    concept_result = {'concept': concept_id, 'labels': {}}
    mc = MConcept(concept_id)
    results_condition_labels = []
    dutil.query_data(condition_label_sql.format(**{'concept': concept_id}),
                     results_condition_labels,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    for r in results_condition_labels:
        if r['label'] not in mc.name2labels:
            mc.add_label(ConceptLabel(r['label']))
        mc.name2labels[r['label']].condition_mention = r['num']

    results_wrong_labels = []
    dutil.query_data(wrong_label_sql.format(**{'concept': concept_id}),
                     results_wrong_labels,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    for r in results_wrong_labels:
        if r['label'] not in mc.name2labels:
            mc.add_label(ConceptLabel(r['label']))
        mc.name2labels[r['label']].wrong_mention = r['num']

    output = mc.output()
    print output
    return concept_result, output
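A minimal sketch of what the two SQL templates consumed by concept_analyse might look like; only the {concept} placeholder and the label/num result columns are implied by the code above, the table and column names are illustrative assumptions.

# hypothetical templates for concept_analyse; table/column names are assumptions
condition_label_sql = ("select label, count(*) num from concept_validation "
                       "where concept_id = '{concept}' and label_type = 'condition' group by label")
wrong_label_sql = ("select label, count(*) num from concept_validation "
                   "where concept_id = '{concept}' and label_type = 'wrong' group by label")
print(condition_label_sql.format(**{'concept': 'C0011849'}))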
Example No. 2
def run_parallel_prediction(settings):
    cm_obj = Concept2Mapping(settings['concept_mapping_file'])
    mp_inst = mp.MentionPattern(settings['pattern_folder'],
                                cm_obj.cui2label,
                                in_action=True)
    # NOTE: the mention-pattern instance is discarded here, which effectively
    # disables mention-pattern features for this prediction run
    mp_inst = None
    db_pool = du.get_mysql_pooling(settings['dbconf'], num=30)
    doc_ids = []
    model_factory = ModelFactory(settings['phenotypes'], settings['model_dir'])
    du.query_data(settings['sql_docs4process'],
                  pool=db_pool,
                  container=doc_ids)
    # for d in doc_ids:
    #     do_one_doc(d, model_factory, cm_obj, mp_inst, db_pool,
    #                                  settings['sql_text_ptn'],
    #                                  settings['sql_ann_ptn'],
    #                                  settings['save_result_sql_ptn'],
    #                                  settings['update_doc_sql_ptn'])
    utils.multi_thread_tasking(doc_ids,
                               num_threads=settings['num_threads'],
                               process_func=do_one_doc,
                               args=[
                                   model_factory, cm_obj, mp_inst, db_pool,
                                   settings['sql_text_ptn'],
                                   settings['sql_ann_ptn'],
                                   settings['save_result_sql_ptn'],
                                   settings['update_doc_sql_ptn']
                               ])
    logging.info('#docs: %s all done' % len(doc_ids))
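A hedged sketch of the settings dict that run_parallel_prediction reads; the keys mirror those accessed in the function above and in do_one_doc further down, while every value here is an illustrative assumption.

# illustrative settings only; keys are those read by run_parallel_prediction/do_one_doc
settings = {
    'concept_mapping_file': './conf/concept_mapping.json',
    'pattern_folder': './patterns',
    'dbconf': './conf/db_connection.json',
    'phenotypes': ['diabetes', 'copd'],
    'model_dir': './models',
    'num_threads': 10,
    'sql_docs4process': 'select doc_id from doc_queue where processed = 0',
    'sql_text_ptn': "select doc_content from documents where doc_id = '{doc_id}'",
    'sql_ann_ptn': "select anns, patient_id from doc_anns where doc_id = '{doc_id}'",
    'save_result_sql_ptn': ("insert into phenotype_results (doc_id, patient_id, result) "
                            "values ('{doc_id}', '{patient_id}', '{result}')"),
    'update_doc_sql_ptn': "update doc_queue set processed = 1 where doc_id = '{doc_id}'"
}
# run_parallel_prediction(settings)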
Example No. 3
 def query_data(self, query_template, q_obj):
     rows_container = []
     dutil.query_data(query_template.format(**q_obj),
                      rows_container,
                      dbconn=dutil.get_db_connection_by_setting(
                          self.db_conn_file))
     return rows_container
Example No. 4
def db_populate_patient_result(container, pid, doc_ann_sql_temp, doc_ann_pks, dbcnn_file, concept_list,
                               cui2concept,
                               ontext_filter_fun=None):
    """
    populate a row (per patient) in the result table
    :param pid:
    :param doc_ann_sql_temp:
    :param doc_ann_pks:
    :param dbcnn_file:
    :param concept_list:
    :param cui2concept:
    :param container:
    :return:
    """
    rows = []
    db.query_data(doc_ann_sql_temp.format(pid), rows, db.get_db_connection_by_setting(dbcnn_file))
    c2f = {}
    for c in concept_list:
        c2f[c] = {'f': 0, 'rf': 0, 'docs': []}
    logging.debug('pid: %s has %s docs' % (pid, len(rows)))
    i = 0
    g2_c2f = {}
    grp = False
    for r in rows:
        try:
            i += 1
            if 'grp' in r:
                grp = True
                if r['grp'] in g2_c2f:
                    c2f = g2_c2f[r['grp']]
                else:
                    c2f = {}
                    for c in concept_list:
                        c2f[c] = {'f': 0, 'rf': 0, 'docs': []}
                    g2_c2f[r['grp']] = c2f
            anns = json.loads(fix_escaped_issue(r['anns']))
            ann_doc = SemEHRAnnDoc()
            ann_doc.load(anns)
            for a in ann_doc.annotations:
                # for c in a.study_concepts:
                if a.cui in cui2concept:
                    c = cui2concept[a.cui]
                    logging.debug('%s found in %s, ruled_by=%s, concepts:%s' % (c, '-'.join([r[k] for k in doc_ann_pks]),
                                                                   a.ruled_by, a.study_concepts))
                    if c in c2f:
                        correct = len(a.ruled_by) == 0
                        if correct and ontext_filter_fun is not None:
                            correct = ontext_filter_fun(a)
                        if not correct:
                            c2f[c]['rf'] += 1
                        else:
                            c2f[c]['f'] += 1
                            c2f[c]['docs'].append([r[k] for k in doc_ann_pks])
        except Exception as e:
            logging.error('failed to parse anns %s because of %s' % (fix_escaped_issue(r['anns']), str(e)))
    logging.info('pid %s done' % pid)
    if not grp:
        g2_c2f = c2f
    container.append({'p': pid, 'c2f': g2_c2f, 'grp': grp})
    logging.debug('pid %s with %s, %s' % (pid, len(c2f), len(container)))
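The ontext_filter_fun hook above receives a single SemEHR annotation and returns a boolean; a hedged sketch of such a filter follows, assuming the annotation object exposes the Bio-YODIE contextual labels (negation/experiencer/temporality) that other functions in this listing read from query rows.

# hypothetical filter; the attribute names are an assumption about SemEHRAnnDoc annotations
def positive_mention_filter(a):
    return getattr(a, 'negation', '').lower() == 'affirmed' \
        and getattr(a, 'experiencer', '').lower() == 'patient' \
        and getattr(a, 'temporality', '').lower() == 'recent'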
Example No. 5
 def extract_cohort_docs(self, use_combo_fn_name=True):
     db_conf_file = self._cohort_conf
     db_conf = None
     if 'linux_dsn_setting' in self._conf and self._conf['linux_dsn_setting']:
         # need dsn settings
         db_conf = self.populate_linux_odbc_setting()
         db_conf_file = None
         logging.info('using dsn %s' % db_conf['dsn'])
     query_size = self._conf['query_size'] if 'query_size' in self._conf else 50
     file_pattern = self._conf['file_pattern'] if 'file_pattern' in self._conf else '%s.txt'
     out_put_folder = self._conf['out_put_folder']
     if len(self._patient_ids) == 0:
         logging.info('cohort is empty, has it been loaded?')
         return
     q_temp = self._conf['doc_query_temp']
     logging.info('working on extraction, cohort size:%s' % len(self._patient_ids))
     for idx in range(0, len(self._patient_ids), query_size):
         q = q_temp.format(**{'patient_ids': ",".join(["'%s'" % p for p in self._patient_ids[idx:idx+query_size]])})
         logging.info('querying batch %s' % (idx + 1))
         logging.debug(q)
         docs = []
         db.query_data(q, docs, db.get_db_connection_by_setting(db_conf_file, db_conf))
         if self._dest == 'sql':
             # save docs to database
             self.save_docs_to_db(docs)
         else:
             # save docs to files
             for d in docs:
                 if d['doc_content'] is None:
                     continue
                 fn = ('%s_%s' % (d['doc_id'], d['patient_id'])) if use_combo_fn_name else ('%s' % d['doc_id'])
                 utils.save_string(d['doc_content'], join(out_put_folder, file_pattern % fn))
         logging.info('%s docs saved to destination [%s]' % (len(docs), self._dest))
     logging.info('query finished, docs saved to %s' % out_put_folder)
Example No. 6
def analyse_db_doc_anns(sql, ann_sql, pks, update_template, full_text_sql, dbcnn_file, rule_config_file,
                        study_folder, thread_num=10, study_config='study.json', update_status_template=None):
    """
    do database based annotation post processing
    :param sql: get a list of annotation primary keys
    :param ann_sql: a query template to query ann and its doc full text
    :param pks: an array of primary key columns
    :param update_template: an update query template to update post-processed ann
    :param dbcnn_file: database connection file
    :param thread_num:
    :param study_folder:
    :param rule_config_file:
    :param study_config:
    :return:
    """
    ret = load_study_ruler(study_folder, rule_config_file, study_config)
    sa = ret['sa']
    ruler = ret['ruler']
    rows = []
    db.query_data(sql, rows, db.get_db_connection_by_setting(dbcnn_file))
    reader = DBTextReader(full_text_sql, dbcnn_file)
    cnns = []
    for i in xrange(thread_num):
        cnns.append(db.get_db_connection_by_setting(dbcnn_file))
    utils.multi_process_tasking(rows, db_doc_process, num_procs=thread_num,
                                args=[ann_sql, pks, update_template, dbcnn_file, reader, sa, ruler,
                                      update_status_template],
                                thread_wise_objs=cnns)
    for i in xrange(thread_num):
        db.release_db_connection(cnns[i])
Example No. 7
def load_phenotype_def_into_db():
    db_cnf = '../../studies/COMOB_SD/dbcnn_input.json'
    p_def_file = './data/phenotype_defs.json'
    pd = utils.load_json_data(p_def_file)
    w_sql = """
    insert into tp_phenotype_concepts (phenotype_id, concept_id) values 
    ('{pid}', '{cid}');
    """
    r_sql = """
    select * from tp_phenotypes
    """
    p_data = []
    dutil.query_data(r_sql, p_data, dutil.get_db_connection_by_setting(db_cnf))
    p2pid = {}
    for r in p_data:
        p2pid[r['phenotype_name']] = r['id']
    for p in pd:
        if p not in p2pid:
            print '%s not found in definition table' % p
            continue
        for c in pd[p]['concepts']:
            sql = w_sql.format(**{'pid': p2pid[p], 'cid': c})
            print 'executing [%s]' % sql
            dutil.query_data(sql, None, dbconn=dutil.get_db_connection_by_setting(db_cnf))
    print 'done'
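A hedged sketch of the phenotype_defs.json structure this loader expects; only the phenotype-name keys and the nested 'concepts' lists are implied by the code, the concept ids are illustrative.

# illustrative content of ./data/phenotype_defs.json
phenotype_defs_example = {
    'diabetes': {'concepts': ['C0011849', 'C0011860']},
    'hypertension': {'concepts': ['C0020538']}
}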
def populate_patient_concept_table(cohort_name, concepts, out_file,
                                   patients_sql, concept_doc_freq_sql):
    patients = []
    dutil.query_data(patients_sql.format(cohort_name), patients)
    id2p = {}
    for p in patients:
        id2p[p['brcid']] = p

    non_empty_concepts = []
    for c in concepts:
        patient_concept_freq = []
        print 'querying %s...' % c
        dutil.query_data(concept_doc_freq_sql.format(c, cohort_name), patient_concept_freq)
        if len(patient_concept_freq) > 0:
            non_empty_concepts.append(c)
            for pc in patient_concept_freq:
                id2p[pc['brcid']][c] = str(pc['num'])

    label2cid = {}
    concept_labels = []
    for c in non_empty_concepts:
        label = oi.get_concept_label(c)
        label2cid[label] = c
        concept_labels.append(label)
    concept_labels = sorted(concept_labels)
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    for p in patients:
        s += '\t'.join([p['brcid']] + [p[label2cid[k]] if label2cid[k] in p else '0' for k in concept_labels]) + '\n'
    utils.save_string(s, out_file)
    print 'done'
Example No. 9
 def convert_text_ann_from_db(sql_temp,
                              pks,
                              db_conn,
                              full_text_folder,
                              ann_folder,
                              full_text_file_pattern='%s.txt',
                              ann_file_pattern='%s.txt.knowtator.xml'):
     sql = sql_temp.format(**pks)
     results = []
     logging.info('doing [%s]...' % sql)
     file_key = '_'.join([pks[k] for k in pks])
     dbutils.query_data(sql, results,
                        dbutils.get_db_connection_by_setting(db_conn))
     if len(results) > 0:
         text = results[0]['text'].replace('\r', '\n')
         anns = json.loads(results[0]['anns'])
         xml = AnnConverter.to_eHOST(AnnConverter.load_ann(anns, file_key),
                                     full_text=text)
         utils.save_string(xml, join(ann_folder,
                                     ann_file_pattern % file_key))
         utils.save_string(
             text, join(full_text_folder,
                        full_text_file_pattern % file_key))
         logging.info('doc [%s] done' % file_key)
     else:
         logging.info('doc/anns [%s] not found' % file_key)
Example No. 10
def label_analyse(sql_template_file, db_cnf, output_file=None):
    sql_temps = utils.load_json_data(sql_template_file)
    concepts = []
    dutil.query_data(sql_temps['get_validated_concepts'], concepts,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    s = ''
    for c in concepts:
        data, output = concept_analyse(c['concept_id'], sql_temps['condition_label_sql'], sql_temps['wrong_label_sql'], db_cnf)
        s += output
    if output_file is not None:
        print 'saving output to %s...' % output_file
        utils.save_string(s, output_file)
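A hedged sketch of the sql_template_file loaded by label_analyse; the three keys are the ones read above (the first query is expected to return concept_id rows), everything else is an illustrative assumption.

# illustrative template file content (normally a JSON file loaded with utils.load_json_data)
sql_temps_example = {
    'get_validated_concepts': 'select distinct concept_id from concept_validation',
    'condition_label_sql': ("select label, count(*) num from concept_validation "
                            "where concept_id = '{concept}' and label_type = 'condition' group by label"),
    'wrong_label_sql': ("select label, count(*) num from concept_validation "
                        "where concept_id = '{concept}' and label_type = 'wrong' group by label")
}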
def get_docs_for_processing(job_status, job_sql_template, cnn_conf_file):
    """
    retrieve docs to process from a database table/view
    :param job_status:
    :return:
    """
    job_data = job_status.job_start()
    print 'working on %s' % job_data
    container = []
    sqldbutils.query_data(job_sql_template.format(**job_data), container,
                          dbconn=sqldbutils.get_db_connection_by_setting(cnn_conf_file))
    return container
Example No. 12
def populate_patient_study_table(cohort_name, study_analyzer, out_file,
                                 patients_sql, term_doc_freq_sql):
    """
    populate result table for a given study analyzer instance
    :param cohort_name:
    :param study_analyzer:
    :param out_file:
    :return:
    """
    patients = []
    dutil.query_data(patients_sql.format(cohort_name), patients)
    id2p = {}
    for p in patients:
        id2p[p['brcid']] = p

    non_empty_concepts = []
    study_concepts = study_analyzer.study_concepts
    for sc in study_concepts:
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        concept_list = ', '.join(['\'%s\'' % c for c in sc.concept_closure])
        patient_term_freq = []
        if len(sc.concept_closure) > 0:
            data_sql = term_doc_freq_sql.format(
                **{
                    'concepts': concept_list,
                    'cohort_id': cohort_name,
                    # parenthesised so the skip-term constraint is kept even when
                    # study-level extra constraints are supplied
                    'extra_constrains': ' \n '.join(
                        [generate_skip_term_constrain(study_analyzer)] +
                        ([] if (study_analyzer.study_options is None or
                                study_analyzer.study_options['extra_constrains'] is None)
                         else study_analyzer.study_options['extra_constrains']))
                })
            print data_sql
            dutil.query_data(data_sql, patient_term_freq)
        if len(patient_term_freq) > 0:
            non_empty_concepts.append(sc_key)
            for pc in patient_term_freq:
                id2p[pc['brcid']][sc_key] = str(pc['num'])

    concept_labels = sorted(non_empty_concepts)
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    for p in patients:
        s += '\t'.join([p['brcid']] +
                       [p[k] if k in p else '0'
                        for k in concept_labels]) + '\n'
    utils.save_string(s, out_file)
    print 'done'
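Hedged examples of the two query templates this function formats; only the placeholders ({concepts}, {cohort_id}, {extra_constrains} and the positional cohort name) plus the brcid/num result columns are implied by the code, the table names are assumptions.

# illustrative templates for populate_patient_study_table
patients_sql_example = "select brcid from cohort_patients where cohort_name = '{}'"
term_doc_freq_sql_example = ("select brcid, count(distinct doc_id) num from doc_annotations "
                             "where inst_uri in ({concepts}) and cohort_id = '{cohort_id}' {extra_constrains} "
                             "group by brcid")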
def action_transparentise(cohort_name, db_conn_file,
                          cohort_doc_sql_template,
                          doc_ann_sql_template,
                          doc_content_sql_template,
                          action_trans_update_sql_template,
                          corpus_trans_file):
    """
    use the actionable transparency model to create a confidence value for each annotation;
    this method splits all cohort documents into batches that are to be processed in multiple threads
    :param cohort_name:
    :param db_conn_file:
    :param cohort_doc_sql_template:
    :param doc_ann_sql_template:
    :param doc_content_sql_template:
    :param action_trans_update_sql_template:
    :param corpus_trans_file:
    :return:
    """
    docs = []
    dutil.query_data(cohort_doc_sql_template.format(cohort_name), docs,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    batch_size = 500
    batches = []
    for i in range(0, len(docs), batch_size):
        batches.append(docs[i:i+batch_size])
    nlp = tstg.load_mode('en')
    corpus_predictor = tssp.CorpusPredictor.load_corpus_model(corpus_trans_file)
    i = 0
    for batch in batches:
        print 'working on batch %s/%s' % (i + 1, len(batches))
        try:
            do_action_trans_docs(batch, 
                                 nlp,
                                 doc_ann_sql_template,
                                 doc_content_sql_template,
                                 action_trans_update_sql_template,
                                 db_conn_file,
                                 corpus_predictor)
        except Exception as e:
            print 'error processing [%s]' % e
        i += 1
    #utils.multi_thread_tasking(batches, 1, do_action_trans_docs,
    #                           args=[nlp,
    #                                 doc_ann_sql_template,
    #                                 doc_content_sql_template,
    #                                 action_trans_update_sql_template,
    #                                 db_conn_file,
    #                                 corpus_predictor
    #                                 ])
    print 'all anns transparentised'
def complete_sample_ann_data(key_anns, complete_sql, db_conn_file, container):
    k = key_anns[0]
    anns = key_anns[1]
    for ann in anns:
        rows_container = []
        dutil.query_data(complete_sql.format(**{'doc_id': ann['id'],
                                                'start': ann['annotations'][0]['start'],
                                                'end': ann['annotations'][0]['end'],
                                                'concept': ann['annotations'][0]['concept']}),
                         rows_container,
                         dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        if len(rows_container) > 0:
            ann['annotations'][0]['string_orig'] = rows_container[0]['string_orig']
            if 'action_trans' in rows_container[0]:
                ann['annotations'][0]['confidence'] = rows_container[0]['action_trans']
    container.append([k, anns])
Example No. 15
def extract_sample(pk_vals,
                   concept,
                   cui2concept,
                   sample_sql_temp,
                   dbcnn_file,
                   container,
                   ontext_filter_fun=positive_patient_filter):
    """
    extract an sample
    :param pk_vals:
    :param concept:
    :param sample_sql_temp:
    :param dbcnn_file:
    :param container:
    :return:
    """
    rows = []
    db.query_data(sample_sql_temp.format(*[v for v in pk_vals]), rows,
                  db.get_db_connection_by_setting(dbcnn_file))
    if len(rows) > 0:
        r = rows[0]
        anns = json.loads(r['anns'])
        ann_doc = SemEHRAnnDoc()
        ann_doc.load(anns)
        for a in ann_doc.annotations:
            if a.cui in cui2concept and concept == cui2concept[a.cui]:
                correct = len(a.ruled_by) == 0
                if correct and ontext_filter_fun is not None:
                    correct = ontext_filter_fun(a)
                if correct:
                    container.append({
                        'content': r['text'],
                        'doc_table': r['src_table'],
                        'doc_col': r['src_col'],
                        'id': '_'.join(pk_vals),
                        'annotations': [{
                            'start': a.start,
                            'end': a.end,
                            'concept': a.cui,
                            'string_orig': a.str
                        }]
                    })
                    break
Example No. 16
def do_one_doc(doc_id, model_factory, concept_mapping, mention_pattern,
               db_pool, sql_text_ptn, sql_ann_ptn, save_result_sql_ptn,
               update_doc_sql_ptn):
    container = []
    du.query_data(sql_ann_ptn.format(**doc_id),
                  pool=db_pool,
                  container=container)
    if len(container) == 0:
        logging.info('%s anns not found' % doc_id)
        return
    doc_anns = json.loads(container[0]['anns'])
    patient_id = container[0]['patient_id']

    container = []
    du.query_data(sql_text_ptn.format(**doc_id),
                  pool=db_pool,
                  container=container)
    if len(container) == 0:
        logging.info('%s text not found' % doc_id)
        return
    text = container[0]['doc_content']
    container = []

    p2count = predict_doc_phenotypes(str(doc_id),
                                     doc_anns,
                                     text,
                                     model_factory,
                                     concept_mapping,
                                     mention_pattern=mention_pattern)

    if p2count is not None:
        save_dict = doc_id.copy()
        save_dict['result'] = json.dumps(p2count)
        save_dict['patient_id'] = patient_id
        du.query_data(save_result_sql_ptn.format(**save_dict),
                      container=None,
                      pool=db_pool)
        du.query_data(update_doc_sql_ptn.format(**doc_id),
                      container=None,
                      pool=db_pool)
    else:
        du.query_data(update_doc_sql_ptn.format(**doc_id),
                      container=None,
                      pool=db_pool)
        logging.info('%s empty phenotypes' % doc_id)
    logging.info('%s done' % doc_id)
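Since doc_id is a dict of key columns (hence the .format(**doc_id) calls), the result row above is built by copying it and adding the serialised phenotype counts plus the patient id; a small hedged illustration, with hypothetical column names:

# illustrative only: how save_result_sql_ptn gets its named values in do_one_doc
import json
doc_id_example = {'doc_id': 'DOC_0001'}
save_dict_example = doc_id_example.copy()
save_dict_example['result'] = json.dumps({'copd': {'freq': 2}})
save_dict_example['patient_id'] = 'P123'
save_result_sql_ptn_example = ("insert into phenotype_results (doc_id, patient_id, result) "
                               "values ('{doc_id}', '{patient_id}', '{result}')")
print(save_result_sql_ptn_example.format(**save_dict_example))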
Example No. 17
def populate_episode_study_table(study_analyzer, episode_data, out_path, cohort):
    study_concepts = study_analyzer.study_concepts
    for sc in study_concepts:
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        print 'working on %s' % sc_key
        concept_list = ', '.join(['\'%s\'' % c for c in sc.concept_closure])
        patient_date_tuples = []
        if len(sc.concept_closure) > 0:
            data_sql = doc_concept_sql_cohort.format(**{'concepts': concept_list,
                                                     'cohort': cohort,
                                                     'extra_constrains': ''})
            print data_sql
            dutil.query_data(data_sql, patient_date_tuples,
                             dbconn=dutil.get_mysqldb_connection(my_host, my_user, my_pwd, my_db, my_sock)
                             if db_connection == 'mysql' else None)
            # filter patient_date tuples using episode constraints
            for eps in episode_data:
                for row in patient_date_tuples:
                    if eps['brcid'] == str(row['brcid']):
                        eps[sc.name] = {'win1': 0, 'win2': 0, 'win3': 0} if sc.name not in eps else eps[sc.name]
                        count_eps_win(eps, sc.name, row, 'win1')
                        count_eps_win(eps, sc.name, row, 'win2')
                        count_eps_win(eps, sc.name, row, 'win3')
    rows = []
    headers = ['brcid'] + [sc.name for sc in study_concepts]
    for eps in episode_data:
        r = {'win1':[eps['brcid']], 'win2':[eps['brcid']], 'win3':[eps['brcid']]}
        for sc in study_concepts:
            if sc.name in eps:
                r['win1'].append(str(eps[sc.name]['win1']))
                r['win2'].append(str(eps[sc.name]['win2']))
                r['win3'].append(str(eps[sc.name]['win3']))
            else:
                r['win1'].append('0')
                r['win2'].append('0')
                r['win3'].append('0')
        rows.append(r)

    for w in ['win1', 'win2', 'win3']:
        s = '\t'.join(headers) + '\n'
        for r in rows:
            s += '\t'.join(r[w]) + '\n'
        utils.save_string(s, out_path + '/weeks_eps' + w + '_control.tsv')
Example No. 18
def download_docs(doc_ids, query, db_conn_setting, out_put_folder):
    """
    download clinical notes from EHR
    :param doc_ids:
    :param query:
    :param db_conn_setting:
    :return:
    """
    db_cnn = dutil.get_db_connection_by_setting(db_conn_setting)
    results = []
    q = query.format(**{'ids': ','.join(['\'%s\'' % did for did in doc_ids])})
    print 'querying [%s]' % q
    print q
    dutil.query_data(q, results, db_cnn)
    for r in results:
        if r['textcontent'] is not None:
            utils.save_string(r['textcontent'].decode('cp1252').replace(
                chr(13), ' '),
                              join(out_put_folder, r['cn_doc_id'] + '.txt'),
                              encoding='utf-8')
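A hedged sketch of the query template download_docs formats; only the {ids} placeholder and the cn_doc_id/textcontent result columns are implied above, the table name is an assumption.

# illustrative template plus the id-list expansion used above
query_example = "select cn_doc_id, textcontent from clinical_docs where cn_doc_id in ({ids})"
doc_ids_example = ['doc_1', 'doc_2']
print(query_example.format(**{'ids': ','.join(["'%s'" % d for d in doc_ids_example])}))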
Example No. 19
 def get_db_docs_for_converting(settings):
     sql = settings['sql']
     db_conn = settings['db_conn']
     doc_ann_sql_temp = settings['sql_temp']
     full_text_folder = settings['full_text_folder']
     ann_folder = settings['ann_folder']
     results = []
     dbutils.query_data(sql, results,
                        dbutils.get_db_connection_by_setting(db_conn))
     ds = []
     for r in results:
         ds.append(r)
     logging.info('total docs %s' % len(ds))
     for d in ds:
         AnnConverter.convert_text_ann_from_db(
             sql_temp=doc_ann_sql_temp,
             pks=d,
             db_conn=db_conn,
             full_text_folder=full_text_folder,
             ann_folder=ann_folder)
Example No. 20
def db_doc_process(cnn, row, sql_template, pks, update_template, dbcnn_file,
                   text_reader, sa, ruler, update_status_template):
    sql = sql_template.format(*[row[k] for k in pks])
    rets = []
    db.query_data(sql, rets, db.get_db_connection_by_setting(dbcnn_file))
    if len(rets) > 0:
        anns = json.loads(fix_escaped_issue(rets[0]['anns']))
        ann_doc = SemEHRAnnDoc()
        ann_doc.load(anns)
        no_concepts = False
        if len(ann_doc.annotations) > 0:
            num_concepts = process_doc_rule(ann_doc, ruler, text_reader,
                                            [row[k] for k in pks], sa)
            if num_concepts > 0:
                update_query = update_template.format(*(
                    [db.escape_string(json.dumps(ann_doc.serialise_json()))] +
                    [row[k] for k in pks]))
                # logging.debug('update ann: %s' % update_query)
                db.query_data(update_query, None, dbconn=cnn)
                logging.info('ann %s updated' % row)
            else:
                no_concepts = True
        else:
            no_concepts = True
        if no_concepts and update_status_template is not None:
            q = update_status_template.format(*[row[k] for k in pks])
            db.query_data(q, None, dbconn=cnn)
            logging.debug('no concepts found/update %s' % q)
Example No. 21
 def extract_cohort_docs(self):
     db_conf_file = self._cohort_conf
     db_conf = None
     if 'linux_dsn_setting' in self._conf and self._conf['linux_dsn_setting']:
         db_conf = self.populate_linux_odbc_setting()
         db_conf_file = None
         logging.info('using dsn %s' % db_conf['dsn'])
     query_size = self._conf['query_size'] if 'query_size' in self._conf else 50
     file_pattern = self._conf['file_pattern'] if 'file_pattern' in self._conf else '%s.txt'
     out_put_folder = self._conf['out_put_folder']
     if len(self._patient_ids) == 0:
         logging.info('cohort is empty, has it been loaded?')
         return
     q_temp = self._conf['doc_query_temp']
     logging.info('working on extraction, cohort size:%s' %
                  len(self._patient_ids))
     for idx in range(0, len(self._patient_ids), query_size):
         q = q_temp.format(**{
             'patient_ids': ",".join(["'%s'" % p for p in self._patient_ids[idx:idx + query_size]])
         })
         logging.info('querying batch %s' % (idx + 1))
         logging.debug(q)
         docs = []
         db.query_data(q, docs, db.get_db_connection_by_setting(db_conf_file, db_conf))
         for d in docs:
             utils.save_string(d['doc_content'], join(out_put_folder, file_pattern % d['doc_id']))
     logging.info('query finished, docs saved to %s' % out_put_folder)
Example No. 22
def collect_patient_morbidity_result(phenotypes, sql_result, db_conf,
                                     output_folder):
    columns = ['patient_id'] + phenotypes
    # 1. query all results, assuming the result size is not too BIG (<1m) and ordered by patient_id
    db_pool = du.get_mysql_pooling(db_conf, num=10)
    container = []
    du.query_data(sql_result, pool=db_pool, container=container)
    data = {}
    phenotype2subtypes = {}
    phenotype2data = {}
    for phe in phenotypes:
        phenotype2data[phe] = []

    cur_p = None
    cur_row = initial_morbidity_row(phenotypes)
    cur_phenotype_detail = None
    for d in container:
        if d['patient_id'] != cur_p:
            add_data_row(data, cur_p, cur_row, phenotypes, phenotype2data,
                         cur_phenotype_detail)
            cur_p = d['patient_id']
            cur_row = initial_morbidity_row(phenotypes)
            cur_phenotype_detail = initial_phenotype_details(phenotypes)
            logging.info('working on [%s]...' % cur_p)
        doc_result = json.loads(d['result'])
        for p in doc_result:
            cur_row[p] += doc_result[p]['freq']
            add_phenotype_detail(cur_phenotype_detail, p,
                                 doc_result[p]['cui2freq'], phenotype2subtypes)
    add_data_row(data, cur_p, cur_row, phenotypes, phenotype2data,
                 cur_phenotype_detail)
    df = pd.DataFrame(data)
    output_file = join(output_folder, 'icd_chapter_results.csv')
    df.to_csv(output_file, index=False)
    logging.info('data saved to %s' % output_file)

    for p in phenotypes:
        populate_subtype_output(phenotype2subtypes[p], phenotype2data[p],
                                join(output_folder, '%s.csv' % p))
Example No. 23
def do_action_trans_docs(docs, nlp, doc_ann_sql_template,
                         doc_content_sql_template,
                         action_trans_update_sql_template, db_conn_file,
                         corpus_predictor):
    """
    do actionable transparency prediction on a batch of docs.
    this function is supposed to be called in a single thread
    :param docs:
    :param nlp:
    :param doc_ann_sql_template:
    :param doc_content_sql_template:
    :param action_trans_update_sql_template:
    :param db_conn_file:
    :param corpus_predictor:
    :return:
    """
    # self_nlp = tstg.load_mode('en')
    for doc_id in docs:
        doc_anns = []
        dutil.query_data(
            doc_ann_sql_template.format(doc_id['docid']),
            doc_anns,
            dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        doc_anns = [{
            's': int(ann['s']),
            'e': int(ann['e']),
            'AnnId': str(ann['AnnId']),
            'signed_label': '',
            'gt_label': '',
            'action_trans': ann['action_trans']
        } for ann in doc_anns]
        if len(doc_anns) == 0:
            continue
        if doc_anns[0]['action_trans'] is not None:
            print 'found trans %s of first ann, skipping doc' % doc_anns[0]['action_trans']
            continue
        doc_container = []
        dutil.query_data(
            doc_content_sql_template.format(doc_id['docid']),
            doc_container,
            dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        ptns = tstg.doc_processing(nlp, unicode(doc_container[0]['content']),
                                   doc_anns, doc_id['docid'])
        # print 'doc %s read/model created, predicting...'
        for inst in ptns:
            acc = corpus_predictor.predcit(inst)
            anns = inst.annotations
            sql = action_trans_update_sql_template.format(
                **{
                    'acc': acc,
                    'AnnId': anns[0]['AnnId']
                })
            # print 'executing %s' % sql
            dutil.query_data(
                sql,
                container=None,
                dbconn=dutil.get_db_connection_by_setting(db_conn_file))
def smp_export(patient_id, es, corpus_mapping, sql_temp, db_cnn):
    """
    structured medical profile extraction
    :param es: elasticsearch index
    :param patient_id:
    :param sql_temp:
    :param db_cnn
    :return:
    """
    print 'indexing %s' % patient_id
    ds_ids = mimicdao.get_summary_doc_by_patient(patient_id)
    for r in ds_ids:
        doc = es.get_doc_detail(r['row_id'])
        profile = parse_discharge_summary(doc['fulltext'], doc['anns'], corpus_mapping)
        mp = {}
        for sec in profile:
            t = sec['section'] if sec['section'] != '' else 'basic'
            t = t.replace(' ', '_')
            mp[t] = sec
        file_name = '%s_%s.json' % (patient_id, r['row_id'])
        db.query_data(sql_temp.format(**{'patient_id': patient_id, 'doc_id': r['row_id'],
                                         'smp': db.escape_string(json.dumps(mp))}),
                      None, dbconn=db.get_db_connection_by_setting(db_cnn))
        print '%s indexed' % file_name
Example No. 25
def get_concepts(output_file):
    curated_mappings = utils.load_json_data(
        './resources/curated_mappings.json')
    autoimmune_concepts = []
    patients = []
    dutil.query_data(autoimmune_concepts_sql, autoimmune_concepts)
    print '{} concepts read'.format(len(autoimmune_concepts))
    dutil.query_data(patients_sql, patients)
    print patients[0]
    # patient dic
    patient_dic = {}
    for p in patients:
        patient_dic[p['brcid']] = p

    non_empty_curated_concepts = []
    non_empty_not_curated_concepts = []
    empty_concepts = []
    for co in autoimmune_concepts:
        c = co['concept_name']
        sympton_freq_result = []
        print autoimmune_sympton_freq_sql.format(c)
        dutil.query_data(
            autoimmune_sympton_freq_sql.format(c.replace("'", "''")),
            sympton_freq_result)
        if len(sympton_freq_result) > 0:
            if c in curated_mappings and curated_mappings[c] is not None \
                    and curated_mappings[c] == 'correct':
                non_empty_curated_concepts.append(c)
            else:
                non_empty_not_curated_concepts.append(c)
        else:
            empty_concepts.append(c)
        for sf in sympton_freq_result:
            patient_dic[sf['brcid']][c] = sf['num']
            patient_dic[sf['brcid']]['any'] = sf['num'] + \
                                              (patient_dic[sf['brcid']]['any']
                                               if 'any' in patient_dic[sf['brcid']] else 0)
    p_attrs = [
        'brcid', 'primary_diag', 'diagnosis_date', 'dob', 'gender_id',
        'ethnicitycleaned'
    ]
    d_attrs = sorted(non_empty_curated_concepts)  # sorted([co['concept_name'] for co in autoimmune_concepts])
    d_attrs = ['any'] + d_attrs + ['=sep='] + sorted(non_empty_not_curated_concepts)
    s = '\t'.join(p_attrs) + '\t' + '\t'.join(d_attrs) + '\n'
    for p in patients:
        s += '\t'.join([str(p[k]) for k in p_attrs]) + '\t' + '\t'.join(
            ['0' if c not in p else str(p[c]) for c in d_attrs]) + '\n'
    utils.save_string(s, output_file)
    print json.dumps(empty_concepts)
def generate_result_in_one_iteration(cohort_name, study_analyzer, out_file,
                                     sample_size, sample_out_file,
                                     doc_to_brc_sql, brc_sql, anns_iter_sql, skip_term_sql, doc_content_sql,
                                     db_conn_file):
    """
    generate the result in one iteration over all annotations. this is supposed to be much faster when working on
    large study concepts, but post-processing using rules is not supported yet
    :param cohort_name:
    :param study_analyzer:
    :param out_file:
    :param sample_size:
    :param sample_out_file:
    :param doc_to_brc_sql:
    :param brc_sql:
    :param anns_iter_sql:
    :param skip_term_sql:
    :param doc_content_sql:
    :param db_conn_file:
    :return:
    """
    # populate concept to anns maps
    sc2anns = {}
    for sc in study_analyzer.study_concepts:
        sc2anns[sc.name] = []

    # populate patient list
    print 'populating patient list...'
    patients = {}
    rows_container = []
    dutil.query_data(brc_sql.format(cohort_name), rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    for r in rows_container:
        patients[r['brcid']] = {'brcid': r['brcid']}

    # populate document id to patient id dictionary
    print 'populating doc to patient map...'
    rows_container = []
    dutil.query_data(doc_to_brc_sql.format(cohort_name), rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    doc2brc = {}
    for dp in rows_container:
        doc2brc[dp['doc_id']] = dp['brcid']

    # query annotations
    print 'iterating annotations...'
    rows_container = []
    dutil.query_data(anns_iter_sql.format(**{'cohort_id': cohort_name,
                                             'extra_constrains':
                                                 ' \n '.join(
                                                     [generate_skip_term_constrain(study_analyzer, skip_term_sql)] +
                                                     # parenthesised so the skip-term constraint is always kept
                                                     ([] if (study_analyzer.study_options is None or
                                                             study_analyzer.study_options['extra_constrains'] is None)
                                                      else study_analyzer.study_options['extra_constrains']))}),
                     rows_container,
                     dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    for r in rows_container:
        concept_id = r['inst_uri']
        brcid = doc2brc[r['doc_id']] if r['doc_id'] in doc2brc else None
        if brcid is None:
            print 'doc %s not matched to a patient!!!' % r['doc_id']
            continue
        patient = patients[brcid] if brcid in patients else None
        if patient is None:
            print 'brc id %s not matched a patient!!!' % brcid
            continue
        # get matched study concepts
        for sc in study_analyzer.study_concepts:
            if concept_id in sc.concept_closure:
                patient[sc.name] = (patient[sc.name] + 1) if sc.name in patient else 1
                sc2anns[sc.name].append({'ann_id': r['ann_id'], 'doc_id': r['doc_id'], 'concept_id': concept_id,
                                         'start': r['start_offset'], 'end': r['end_offset']})

    # generate result table
    print 'generate result table...'
    concept_labels = sorted([k for k in sc2anns])
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    lines = []
    utils.multi_thread_tasking([patients[pid] for pid in patients], 40, do_put_line,
                               args=[concept_labels, lines])
    s += '\n'.join(lines)
    utils.save_string(s, out_file)

    # generate sample annotations
    term_to_docs = {}
    for concept in sc2anns:
        ann_ids = sc2anns[concept]
        sample_ids = []
        if len(ann_ids) <= sample_size:
            sample_ids = ann_ids
        else:
            for i in xrange(sample_size):
                index = random.randrange(len(ann_ids))
                sample_ids.append(ann_ids[index])
                del ann_ids[index]
        term_to_docs[concept] = sample_ids

    # query doc contents
    print 'populating term to sampled anns...'
    term_to_sampled = {}
    for term in term_to_docs:
        sample_ids = term_to_docs[term]
        if len(sample_ids) <= 0:
            continue
        sample_doc_ids = ['\'' + s['doc_id'] + '\'' for s in sample_ids]
        rows_container = []
        dutil.query_data(doc_content_sql.format(','.join(sample_doc_ids)), rows_container,
                         dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        doc_to_content = {}
        for r in rows_container:
            doc_to_content[r['doc_id']] = r['TextContent']
        term_sampled = []
        for s in sample_ids:
            term_sampled.append({'id': s['doc_id'],
                                 'content': doc_to_content[s['doc_id']],
                                 'annotations': [{'start': s['start'],
                                                  'end': s['end'],
                                                  'concept': s['concept_id']}]})
        term_to_sampled[term] = term_sampled
    utils.save_json_array(convert_encoding(term_to_sampled, 'cp1252', 'utf-8'), sample_out_file)
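A hedged sketch of the doc_content_sql template used in the sampling step above; only the positional id-list placeholder and the doc_id/TextContent result columns are implied by the code, the table name is an assumption.

# illustrative template for doc_content_sql
doc_content_sql_example = "select doc_id, TextContent from clinical_docs where doc_id in ({0})"
sample_doc_ids_example = ["'doc_1'", "'doc_2'"]
print(doc_content_sql_example.format(','.join(sample_doc_ids_example)))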
def random_extract_annotated_docs(cohort_name, study_analyzer, out_file,
                                  docs_by_term_sql, docs_by_cohort_sql, docs_by_ids_sql,
                                  sample_size=5):

    term_to_docs = {}
    study_concepts = study_analyzer.study_concepts
    is_NOT_cohort_based = study_analyzer.study_options is None \
                          or study_analyzer.study_options['sample_non_hits'] is None \
                          or (not study_analyzer.study_options['sample_non_hits'])
    for sc in study_concepts:
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        concept_list = ', '.join(['\'%s\'' % c for c in sc.concept_closure])
        doc_ids = []
        if len(sc.concept_closure) > 0:
            if is_NOT_cohort_based:
                dutil.query_data(
                    docs_by_term_sql.format(
                        **{'concepts': concept_list,
                           'cohort_id': cohort_name,
                           'extra_constrains':
                               ' \n '.join(
                                   [generate_skip_term_constrain(study_analyzer)] +
                                   # parenthesised so the skip-term constraint is always kept
                                   ([] if (study_analyzer.study_options is None or
                                           study_analyzer.study_options['extra_constrains'] is None)
                                    else study_analyzer.study_options['extra_constrains']))}),
                    doc_ids)
            else:
                doc_sql = docs_by_cohort_sql.format(
                    **{'cohort_id': cohort_name,
                       'extra_constrains':
                           ' and '.join(
                               [generate_skip_term_constrain(study_analyzer)] +
                               # parenthesised so the skip-term constraint is always kept
                               ([] if (study_analyzer.study_options is None or
                                       study_analyzer.study_options['extra_constrains'] is None)
                                else study_analyzer.study_options['extra_constrains']))})
                print doc_sql
                dutil.query_data(doc_sql, doc_ids)
        if len(doc_ids) > 0:
            sample_ids = []
            if len(doc_ids) <= sample_size:
                sample_ids = [r['CN_Doc_ID'] for r in doc_ids]
            else:
                for i in xrange(sample_size):
                    index = random.randrange(len(doc_ids))
                    sample_ids.append(doc_ids[index]['CN_Doc_ID'])
                    del doc_ids[index]
            doc_list = ', '.join(['\'%s\'' % d for d in sample_ids])
            docs = []
            doc_sample_sql = docs_by_ids_sql.format(**{'docs': doc_list,
                                                       'concepts': concept_list,
                                                       'extra_constrains': generate_skip_term_constrain(study_analyzer)})
            print doc_sample_sql
            dutil.query_data(doc_sample_sql,
                             docs)
            doc_objs = []
            prev_doc_id = ''
            doc_obj = None
            for d in docs:
                if prev_doc_id != d['CN_Doc_ID']:
                    doc_obj = {'id': d['CN_Doc_ID'], 'content': d['TextContent'], 'annotations': [],
                               'doc_table': d['src_table'], 'doc_col': d['src_col']}
                    doc_objs.append(doc_obj)
                    prev_doc_id = d['CN_Doc_ID']
                doc_obj['annotations'].append({'start': d['start_offset'],
                                               'end': d['end_offset'],
                                               'concept': d['inst_uri']})
            term_to_docs[sc.name] = doc_objs
        if not is_NOT_cohort_based:
            break
    utils.save_json_array(term_to_docs, out_file)
    print 'done'
def load_all_docs():
    sql = "select TextContent, Date, src_table, src_col, BrcId, CN_Doc_ID from sqlcris_user.KConnect.vw_hepcpos_docs"
    docs = []
    dutil.query_data(sql, docs)
    return docs
def get_doc_detail_by_id(doc_id, fulltext_date_by_doc_id):
    sql = fulltext_date_by_doc_id.format(**{'doc_id': doc_id})
    docs = []
    dutil.query_data(sql, docs)
    return docs
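A hedged sketch of the fulltext_date_by_doc_id template consumed above; only the {doc_id} placeholder is implied by the code, while the table and column names are borrowed from load_all_docs above and may differ.

# illustrative template for fulltext_date_by_doc_id
fulltext_date_by_doc_id_example = ("select TextContent, Date from sqlcris_user.KConnect.vw_hepcpos_docs "
                                   "where CN_Doc_ID = '{doc_id}'")
print(fulltext_date_by_doc_id_example.format(**{'doc_id': 'doc_1'}))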
def populate_patient_study_table_post_ruled(cohort_name, study_analyzer, out_file, rule_executor,
                                            sample_size, sample_out_file, ruled_ann_out_file,
                                            patients_sql, term_doc_anns_sql, skip_term_sql,
                                            db_conn_file, text_preprocessing=False):
    """
    populate patient study result with post processing to remove unwanted mentions
    :param cohort_name:
    :param study_analyzer:
    :param out_file:
    :param rule_executor:
    :param sample_size:
    :param sample_out_file:
    :return:
    """
    patients = []
    dutil.query_data(patients_sql.format(cohort_name), patients, dbconn=dutil.get_db_connection_by_setting(db_conn_file))
    id2p = {}
    for p in patients:
        id2p[p['brcid']] = p

    non_empty_concepts = []
    study_concepts = study_analyzer.study_concepts
    term_to_docs = {}
    ruled_anns = []
    positive_dumps = []
    skip_terms_list = [t.lower() for t in rule_executor.skip_terms]
    for sc in study_concepts:
        positive_doc_anns = []
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        concept_list = ', '.join(['\'%s\'' % c for c in sc.concept_closure])
        doc_anns = []
        if len(sc.concept_closure) > 0:
            sql_temp = term_doc_anns_sql
            data_sql = sql_temp.format(**{'concepts': concept_list,
                                          'cohort_id': cohort_name,
                                          'extra_constrains':
                                              ' \n '.join(
                                                  [generate_skip_term_constrain(study_analyzer, skip_term_sql)] +
                                                  # parenthesised so the skip-term constraint is always kept
                                                  ([] if (study_analyzer.study_options is None or
                                                          study_analyzer.study_options['extra_constrains'] is None)
                                                   else study_analyzer.study_options['extra_constrains']))})
            print data_sql
            dutil.query_data(data_sql, doc_anns,
                             dbconn=dutil.get_db_connection_by_setting(db_conn_file))
        if len(doc_anns) > 0:
            p_to_dfreq = {}
            counted_docs = set()
            for ann in doc_anns:
                p = ann['brcid']
                d = ann['CN_Doc_ID']
                if d in counted_docs:
                    continue
                ruled = False
                case_instance = ''
                if not ruled:
                    # skip term rules
                    if 'string_orig' in ann and ann['string_orig'].lower() in skip_terms_list:
                        ruled = True
                        rule = 'skip-term'
                        case_instance = ann['string_orig']
                if not ruled:
                    # string origin rules - not used now
                    ruled, case_instance = rule_executor.execute_original_string_rules(
                        ann['string_orig'] if 'string_orig' in ann
                        else ann['TextContent'][int(ann['start_offset']):int(ann['end_offset'])])
                    rule = 'original-string-rule'
                if not ruled:
                    # post processing rules
                    ruled, case_instance, rule = \
                        rule_executor.execute(ann['TextContent'] if not text_preprocessing else
                                              preprocessing_text_befor_rule_execution(ann['TextContent']),
                                              int(ann['start_offset']),
                                              int(ann['end_offset']),
                                              string_orig=ann['string_orig'] if 'string_orig' in ann else None)
                    rule = 'semehr ' + rule
                if not ruled:
                    # bio-yodie labels
                    if 'experiencer' in ann:
                        if ann['experiencer'].lower() != 'patient' or \
                                ann['temporality'].lower() != 'recent' or \
                                ann['negation'].lower() != 'affirmed':
                            ruled = True
                            case_instance = '\t'.join([ann['experiencer'], ann['temporality'], ann['negation']])
                            rule = 'yodie'
                if ruled:
                    ruled_anns.append({'p': p, 'd': d, 'ruled': rule, 's': ann['start_offset'],
                                       'e': ann['end_offset'],
                                       'c': ann['inst_uri'],
                                       'case-instance': case_instance,
                                       'string_orig': ann['string_orig']
                                       })
                else:
                    counted_docs.add(d)
                    p_to_dfreq[p] = 1 if p not in p_to_dfreq else 1 + p_to_dfreq[p]
                    positive_doc_anns.append({'id': ann['CN_Doc_ID'],
                                              'content': ann['TextContent'],
                                              'annotations': [{'start': ann['start_offset'],
                                                               'end': ann['end_offset'],
                                                               'concept': ann['inst_uri'],
                                                               'string_orig': ann[
                                                                   'string_orig'] if 'string_orig' in ann else ''}],
                                              'doc_table': ann['src_table'],
                                              'doc_col': ann['src_col']})
                    positive_dumps.append({'p': p, 'd': d, 's': ann['start_offset'],
                                           'e': ann['end_offset'],
                                           'c': ann['inst_uri'],
                                           'string_orig': ann['string_orig']})
            if len(counted_docs) > 0:
                non_empty_concepts.append(sc_key)
                for p in p_to_dfreq:
                    id2p[p][sc_key] = str(p_to_dfreq[p])

                # save sample docs
                if sample_size >= len(positive_doc_anns):
                    term_to_docs[sc_key] = positive_doc_anns
                else:
                    sampled = []
                    for i in xrange(sample_size):
                        index = random.randrange(len(positive_doc_anns))
                        sampled.append(positive_doc_anns[index])
                        positive_doc_anns.pop(index)
                    term_to_docs[sc_key] = sampled

    concept_labels = sorted(non_empty_concepts)
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    for p in patients:
        s += '\t'.join([p['brcid']] + [p[k] if k in p else '0' for k in concept_labels]) + '\n'
    utils.save_string(s, out_file)
    utils.save_string('var sample_docs=' + json.dumps(convert_encoding(term_to_docs, 'cp1252', 'utf-8')), sample_out_file)
    utils.save_json_array(convert_encoding(ruled_anns, 'cp1252', 'utf-8'), ruled_ann_out_file)
    utils.save_json_array(positive_dumps, out_file + "_json")
    print 'done'