Example #1
 def test_Data(self, sess, logits, x, y, test_x, test_y, fold_idx):
     n_correct = 0
     n_total = 0
     n_pos_correct = 0
     n_pos_total = 0
     output = ''
     for idx in xrange(len(test_x)):
         p, c = sess.run([tf.nn.softmax(logits), y], {x: test_x[idx:idx+1], y: test_y[idx:idx+1]})
         predicted = np.argmax(p)
         correct_label = np.argmax(c)
         if predicted == correct_label:
             n_correct += 1
         line = '%s\t%s\t%s\t%s\t%s' % (n_total, predicted, correct_label,
                                       '%.2f' % p.tolist()[0][predicted],
                                       '\t'.join(['%.2f' % prob for prob in p.tolist()[0]]))
         print(line)
         output += line + '\n'
         n_total += 1
         if correct_label == 0:
             n_pos_total += 1
             if correct_label == predicted:
                 n_pos_correct += 1
     utils.save_string(output, './output/weighted_fold_%s.txt' % fold_idx)
     print 'accuracy: %s' % (1.0 * n_correct / n_total)
     print 'pos recall: %s' % (1.0 * n_pos_correct / n_pos_total)
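A note on the helper used throughout this page: every example persists its output through a small utils.save_string call. The helper itself is not shown in any snippet; judging only from how it is called here (string first, file path second, with an optional encoding argument as in Example #29), it presumably behaves like the minimal sketch below. This is an assumption for illustration, not the actual utils implementation; a few examples (#14, #20, #32) appear to use a different utils module that takes the path first and the text second.

# A minimal sketch of a save_string-style helper, inferred from the calls on this page.
# Assumption only -- the real utils module is not included in these snippets.
import codecs


def save_string(str_content, file_path, encoding='utf-8'):
    """Write a string to a file, overwriting any existing content."""
    with codecs.open(file_path, 'w', encoding=encoding) as f:
        f.write(str_content)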
Example #2
def learn_concept_mappings(output_lst_folder):
    type2insts = {}
    type2insts_2 = {}
    label_dir = _gold_dir
    ann_dir = _ann_dir
    file_keys = [f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
    t2missed = {}
    for fk in file_keys:
        get_doc_level_inference(label_dir,
                                ann_dir,
                                fk,
                                type2insts,
                                type2insts_2,
                                t2missed)
    for t in type2insts:
        type2insts[t] = list(type2insts[t])
    logging.info(json.dumps(type2insts))

    s = '\n' * 2
    for t in type2insts_2:
        type2insts_2[t] = list(type2insts_2[t])
    s += json.dumps(type2insts_2)

    s += '\n' * 2
    labels = []
    defs = []
    for t in t2missed:
        t2missed[t] = list(set(t2missed[t]))
        utils.save_string('\n'.join(t2missed[t]) + '\n', join(output_lst_folder, t + '.lst'))
        labels += [l.lower() for l in t2missed[t]]
        defs.append(t + '.lst' + ':StrokeStudy:' + t)
    s += '\n' * 2
    s += '\n'.join(defs)
    s += json.dumps(t2missed)
    logging.info(s)
Example #3
def doc_infer(settings):
    rules = PhenotypeRule.load_rules(settings['rule_file'])
    d2predicted = utils.load_json_data(settings['doc_nlp_results'])
    doc_labels_output = settings['doc_inference_output']
    s = ''
    doc_type2id = {}
    pids = []
    for d in d2predicted:
        m = re.match(r'Stroke\_id\_(\d+)(\.\d+){0,1}', d)
        pid = d
        if m is not None:
            pid = m.group(1)
            pids.append(pid)
        label_provs = PhenotypeRuleExecutor.apply_rules(d2predicted[d], rules)
        print(pid, d, label_provs)
        for lp in label_provs:
            if lp['label'] != '':
                s += '%s\t%s\n' % (pid, lp['label'])
                if lp['label'] not in doc_type2id:
                    doc_type2id[lp['label']] = []
                doc_type2id[lp['label']].append(pid)

    pids = list(set(pids))
    print(json.dumps(pids))
    utils.save_string(s, doc_labels_output)
    if 'patient_level_truth_tsv' in settings:
        doc_infer_with_ground_truth(settings['patient_level_truth_tsv'], pids, doc_type2id)
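Example #3 and many later ones also read their inputs with utils.load_json_data before writing results back with utils.save_string. As with save_string, the reader helper is not shown; the sketch below is an assumption of what a load_json_data-style function presumably does (parse a JSON file into a Python object), based only on how it is used in these snippets.

# A minimal sketch of a load_json_data-style helper, inferred from its usage above.
# Assumption only -- the real utils module is not included in these snippets.
import codecs
import json


def load_json_data(file_path, encoding='utf-8'):
    """Read a JSON file and return the parsed Python object."""
    with codecs.open(file_path, encoding=encoding) as f:
        return json.load(f)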
Example #4
def merge_mappings_dictionary(map_files, dict_dirs, new_map_file,
                              new_dict_folder):
    maps = [utils.load_json_data(mf) for mf in map_files]
    new_m = {}
    for m in maps:
        new_m.update(m)
    t2list = {}
    for dd in dict_dirs:
        lst_files = [
            f for f in listdir(dd)
            if isfile(join(dd, f)) and f.endswith('.lst')
        ]
        for f in lst_files:
            t = f[:f.index('.')]
            labels = utils.read_text_file(join(dd, f))
            if t not in t2list:
                t2list[t] = set()
            for l in labels:
                if len(l) > 0:
                    t2list[t].add(l)
    utils.save_json_array(new_m, new_map_file)
    logging.info('mapping saved to %s' % new_map_file)
    for t in t2list:
        utils.save_string('\n'.join(list(t2list[t])) + '\n',
                          join(new_dict_folder, t + '.lst'))
        logging.info('%s.lst saved' % t)
    logging.info('all done')
Example #5
 def convert_text_ann_from_db(sql_temp,
                              pks,
                              db_conn,
                              full_text_folder,
                              ann_folder,
                              full_text_file_pattern='%s.txt',
                              ann_file_pattern='%s.txt.knowtator.xml'):
     sql = sql_temp.format(**pks)
     results = []
     logging.info('doing [%s]...' % sql)
     file_key = '_'.join([pks[k] for k in pks])
     dbutils.query_data(sql, results,
                        dbutils.get_db_connection_by_setting(db_conn))
     if len(results) > 0:
         text = results[0]['text'].replace('\r', '\n')
         anns = json.loads(results[0]['anns'])
         xml = AnnConverter.to_eHOST(AnnConverter.load_ann(anns, file_key),
                                     full_text=text)
         utils.save_string(xml, join(ann_folder,
                                     ann_file_pattern % file_key))
         utils.save_string(
             text, join(full_text_folder,
                        full_text_file_pattern % file_key))
         logging.info('doc [%s] done' % file_key)
     else:
         logging.info('doc/anns [%s] not found' % file_key)
Example #6
 def convert_text_ann_from_files(full_text_folder,
                                 ann_folder,
                                 output_folder,
                                 full_text_file_pattern='(%s).txt',
                                 ann_file_pattern='se_ann_%s.json',
                                 output_file_pattern='%s.txt.knowtator.xml',
                                 ann_to_convert=None):
     text_files = [
         f for f in listdir(full_text_folder)
         if isfile(join(full_text_folder, f))
     ]
     p = re.compile(full_text_file_pattern)
     for f in text_files:
         logging.info('working on [%s]' % f)
         m = p.match(f)
         if m is not None:
             fk = m.group(1)
             text = utils.read_text_file_as_string(join(
                 full_text_folder, f))
             anns = utils.load_json_data(
                 join(ann_folder, ann_file_pattern % fk))
             xml = AnnConverter.to_eHOST(AnnConverter.load_ann(anns, fk),
                                         full_text=text,
                                         ann_to_convert=ann_to_convert)
             utils.save_string(
                 xml, join(output_folder, output_file_pattern % fk))
             utils.save_string(text.replace('\r', ' '),
                               join(full_text_folder, f))
             logging.info('doc [%s] done' % fk)
Example #7
def run_kfold_learning(k, corpus_folder, semehr_folder, gold_folder, working_folder):
    kf = KFold(n_splits=k)
    files = [f for f in listdir(corpus_folder) if isfile(join(corpus_folder, f))]
    fold = 0  # fold counter, renamed so it no longer shadows the n_splits parameter k
    for train_idx, test_idx in kf.split(files):
        reset_folder(working_folder)
        # copy files
        train_ann_dir = join(working_folder, 'ann')
        train_gold_dir = join(working_folder, 'gold')    
        train_text_dir = join(working_folder, 'train_corpus')
        test_ann_dir = join(working_folder, 'test_ann')
        test_gold_dir = join(working_folder, 'test_gold')
        test_text_dir = join(working_folder, 'test_corpus')
        for idx in train_idx:
            shutil.copy(join(corpus_folder, files[idx]), join(train_text_dir, files[idx]))
            ann_file = 'se_ann_%s.json' % files[idx].replace('.txt', '')            
            gold_file = '%s.knowtator.xml' % files[idx]
            shutil.copy(join(semehr_folder, ann_file), join(train_ann_dir, ann_file))
            shutil.copy(join(gold_folder, gold_file), join(train_gold_dir, gold_file))
        for idx in test_idx:
            shutil.copy(join(corpus_folder, files[idx]), join(test_text_dir, files[idx]))
            ann_file = 'se_ann_%s.json' % files[idx].replace('.txt', '')
            gold_file = '%s.knowtator.xml' % files[idx]
            shutil.copy(join(semehr_folder, ann_file), join(test_ann_dir, ann_file))
            shutil.copy(join(gold_folder, gold_file), join(test_gold_dir, gold_file))
        performance = run_learning()
        utils.save_string(performance, join(working_folder, 'folder_%s_perf.tsv' % fold))
        fold += 1
        logging.info('round %s done' % fold)
Example #8
 def populate_inter_annotator_results(ann_folder_1,
                                      ann_folder_2,
                                      output_file,
                                      missing_file,
                                      correct_labels=["VERIFIED_CORRECT"]):
     ann_files = [
         f for f in listdir(ann_folder_1) if isfile(join(ann_folder_1, f))
     ]
     all_mentions = 0
     missed = []
     mismatched = []
     for f in ann_files:
         ann1 = AnnConverter.load_ann_file(join(ann_folder_1, f))
         ann2 = AnnConverter.load_ann_file(join(ann_folder_2, f))
         all_mentions += len(ann1)
         for ann in ann1:
             if ann not in ann2:
                 missed.append('%s\t%s\t%s' %
                               (ann, ann1[ann]['text'], ann1[ann]['class']))
             elif ann2[ann]['class'] != ann1[ann]['class'] and ann1[ann][
                     'class'] not in correct_labels:
                 mismatched.append('%s\t%s\t%s\t%s\t%s' %
                                   (f, ann, ann1[ann]['text'],
                                    ann1[ann]['class'], ann2[ann]['class']))
     print('\n'.join(mismatched))
     print(len(missed), all_mentions)
     utils.save_string('\n'.join(mismatched), output_file)
     utils.save_string('\n'.join(missed), missing_file)
Example #9
def merge_and_output(dir_path, cohort, default_results='hepc_results.json'):
    headers = ['all', 'positive', 'Negated', 'hypothetical', 'historical', 'Other', 'first_pos_time']

    results = {}
    for pid in cohort:
        results[pid] = {}
    c_results = utils.load_json_data(join(dir_path, default_results))
    for p in c_results:
        results[p['id']] = p
    for f in [f for f in listdir(dir_path) if isfile(join(dir_path, f))]:
        if f != default_results:
            c_results = utils.load_json_data(join(dir_path, f))
            d = f.replace('_results.json', '')
            print f
            if d not in headers:
                headers.append(d)
            for p in c_results:
                results[p['id']][d] = p['all']

    s = '\t'.join(['id'] + headers) + '\n'
    for pid in results:
        p = results[pid]
        row = [pid] + ['-' if h not in p else str(p[h]) for h in headers]
        s += '\t'.join(row) + '\n'
    utils.save_string(s, './valid_doc_files/merged_output_liverdiseases.tsv')
    print 'output generated'
Example #10
 def extract_cohort_docs(self, use_combo_fn_name=True):
     db_conf_file = self._cohort_conf
     db_conf = None
     if 'linux_dsn_setting' in self._conf and self._conf['linux_dsn_setting']:
         # need dsn settings
         db_conf = self.populate_linux_odbc_setting()
         db_conf_file = None
         logging.info('using dsn %s' % db_conf['dsn'])
     query_size = self._conf['query_size'] if 'query_size' in self._conf else 50
     file_pattern = self._conf['file_pattern'] if 'file_pattern' in self._conf else '%s.txt'
     out_put_folder = self._conf['out_put_folder']
     if len(self._patient_ids) == 0:
         logging.info('cohort is empty, has it been loaded?')
         return
     q_temp = self._conf['doc_query_temp']
     logging.info('working on extraction, cohort size:%s' % len(self._patient_ids))
     for idx in range(0, len(self._patient_ids), query_size):
         q = q_temp.format(**{'patient_ids': ",".join(["'%s'" % p for p in self._patient_ids[idx:idx+query_size]])})
         logging.info('querying batch %s' % (idx + 1))
         logging.debug(q)
         docs = []
         db.query_data(q, docs, db.get_db_connection_by_setting(db_conf_file, db_conf))
         if self._dest == 'sql':
             # save docs to database
             self.save_docs_to_db(docs)
         else:
             # save docs to files
             for d in docs:
                 if d['doc_content'] is None:
                     continue
                 fn = ('%s_%s' % (d['doc_id'], d['patient_id'])) if use_combo_fn_name else ('%s' % d['doc_id'])
                 utils.save_string(d['doc_content'], join(out_put_folder, file_pattern % fn))
         logging.info('%s docs saved to destination [%s]' % (len(docs), self._dest))
     logging.info('query finished, docs saved to %s' % out_put_folder)
Example #11
def phenotype_counting(phenotype_def, concept_level_results, output_file):
    pd = utils.load_json_data(phenotype_def)
    npd = {}
    cd = utils.read_text_file(concept_level_results)
    c_headers = cd[0].split('\t')
    headers = [h for h in c_headers[2:len(c_headers) - 1]]
    for r in cd[1:]:
        arr = r.split('\t')
        c = arr[0]
        num_mention = arr[12]
        for p in pd:
            if c in pd[p]['concepts']:
                po = npd[p] if p in npd else {'freq':0, 'p': p,
                                              'num_concepts': len(pd[p]['concepts'])}
                npd[p] = po
                po['freq'] += int(num_mention)
                for idx in xrange(2, len(arr) - 1):
                    h = headers[idx-2]
                    po[h] = int(arr[idx]) if h not in po else (int(arr[idx]) + int(po[h]))

    rows = ['\t'.join(['phenotype', 'num concepts'] + headers + ['prevalence'])]
    for p in npd:
        po = npd[p]
        rows.append('\t'.join([p, str(po['num_concepts'])] + [str(po[h]) for h in headers] + [str(po['freq'])]))
    utils.save_string('\n'.join(rows), output_file)
Example #12
def populate_patient_concept_table(cohort_name, concepts, out_file,
                                   patients_sql, concept_doc_freq_sql):
    patients = []
    dutil.query_data(patients_sql.format(cohort_name), patients)
    id2p = {}
    for p in patients:
        id2p[p['brcid']] = p

    non_empty_concepts = []
    for c in concepts:
        patient_concept_freq = []
        print 'querying %s...' % c
        dutil.query_data(concept_doc_freq_sql.format(c, cohort_name), patient_concept_freq)
        if len(patient_concept_freq) > 0:
            non_empty_concepts.append(c)
            for pc in patient_concept_freq:
                id2p[pc['brcid']][c] = str(pc['num'])

    label2cid = {}
    concept_labels = []
    for c in non_empty_concepts:
        label = oi.get_concept_label(c)
        label2cid[label] = c
        concept_labels.append(label)
    concept_labels = sorted(concept_labels)
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    for p in patients:
        s += '\t'.join([p['brcid']] + [p[label2cid[k]] if label2cid[k] in p else '0' for k in concept_labels]) + '\n'
    utils.save_string(s, out_file)
    print 'done'
Example #13
def generate_gold_stand_from_validation(generated_ann_folder,
                                        validated_ann_folder,
                                        gold_standard_folder):

    files = [
        f for f in listdir(generated_ann_folder)
        if isfile(join(generated_ann_folder, f))
    ]
    for f in files:
        logging.debug('processing: %s / %s' % (generated_ann_folder, f))
        # ignore added annotations for now
        gd_anns = []
        gen_doc = eHostGenedDoc(join(generated_ann_folder, f))
        logging.debug('ann number: %s' % len(gen_doc.get_ess_entities()))
        val_doc = eHostAnnDoc(join(validated_ann_folder, f))
        for g in gen_doc.get_ess_entities():
            logging.debug('validation label: %s' % g.type)
            for v in val_doc.get_ess_entities():
                if g.start == v.start and g.end == v.end:
                    logging.debug('validation label: %s' % v.type)
                    if v.type == 'CORRECT':
                        gd_anns.append(g)

        elem_annotations = ET.Element("annotations")
        elem_annotations.set('textSource', f)
        idx = 0
        for ann in gd_anns:
            if ann.str.lower() == 'haematoma':
                continue
            idx += 1
            mention_id = '%s-%s' % (f, idx)
            elem_ann = ET.SubElement(elem_annotations, "annotation")
            elem_mention = ET.SubElement(elem_ann, "mention")
            elem_mention.set('id', mention_id)
            elem_annotator = ET.SubElement(elem_ann, "annotator")
            elem_annotator.set('id', 'semehr')
            elem_annotator.text = 'semehr'
            elem_span = ET.SubElement(elem_ann, "span")
            elem_span.set('start', '%s' % ann.start)
            elem_span.set('end', '%s' % ann.end)
            elem_spanText = ET.SubElement(elem_ann, "spannedText")
            elem_spanText.text = ann.str
            elem_date = ET.SubElement(elem_ann, "creationDate")
            elem_date.text = datetime.datetime.now().strftime(
                "%a %B %d %X %Z %Y")
            #
            elem_class = ET.SubElement(elem_annotations, "classMention")
            elem_class.set('id', mention_id)
            elem_mention_class = ET.SubElement(elem_class, "mentionClass")
            if ann.str.lower() in ('haemorrhage', 'blood', 'bleed') \
                    or ann.str.lower().startswith('collection'):
                ann.type = 'bleeding'
            elem_mention_class.set('id', ann.type)
            elem_mention_class.text = ann.str
        tree = ET.ElementTree(elem_annotations)
        logging.info('gd file saved to %s - %s' % (gold_standard_folder, f))
        utils.save_string(
            ET.tostring(elem_annotations, encoding='utf8', method='xml'),
            join(gold_standard_folder, f))
Example #14
def as_text(action_i, out_path):
    def line_helper(frame):
        line = [str(cord_i) for cord_i in list(frame)]
        return ",".join(line)

    lines = [line_helper(frame_i) for frame_i in action_i.img_seq]
    text = "\n".join(lines)
    utils.save_string(out_path, text)
Example #15
def phenotype_prevalence(phenotype_with_prev, output_file):
    pd = utils.load_json_data(phenotype_with_prev)
    utils.save_string(
        '\n'.join([
            '\t'.join(
                [p, str(pd[p]['prevalence']),
                 str(len(pd[p]['concepts']))]) for p in pd
        ]), output_file)
Example #16
def mapping_headings(heading_stats_file, output_file, freq_threshold=1000):
    heading_freq = utils.load_json_data(heading_stats_file)
    sorted_top_k_headings = sorted([(h, heading_freq[h])
                                    for h in heading_freq],
                                   key=lambda x: -x[1])[:freq_threshold]
    s = ''
    for r in sorted_top_k_headings[:500:]:
        s += '%s\t%s\n' % (r[0], r[1])
    utils.save_string(s, './top500heading_discharge_summary.txt')
    utils.save_json_array(sorted_top_k_headings, output_file)
Example #17
def dump_mention_detail(studies_folder, include_study_pattern, dump_tsv_file,
                        dump_concept_file):
    reg_p = re.compile(include_study_pattern)
    rows = ['\t'.join(['concept', 'pt', 'doc', 's', 'e', 'label', 'ruled'])]
    c_group = {}
    for f in listdir(studies_folder):
        m = reg_p.match(f)
        if m is not None:
            ruled_file = join(studies_folder, f, 'ruled_anns.json')
            if isfile(ruled_file):
                # {"p": "pid", "s": 3356, "e": 3365, "d": "did", "case-instance": [xxx"],
                # "c": "C0000833", "string_orig": "abscesses",
                # "ruled": "semehr hypothetical_filters.json"}
                ruleds = utils.load_json_data(ruled_file)
                for r in ruleds:
                    rows.append('\t'.join([
                        r['c'], r['p'], r['d'],
                        str(r['s']),
                        str(r['e']), r['string_orig'], r['ruled']
                    ]))
                    increase_freq_on_dict(
                        c_group, r['c'], r['ruled'],
                        '-'.join([r['d'], str(r['s']),
                                  str(r['e'])]))
            pos_file = join(studies_folder, f, 'result.csv_json')
            if isfile(pos_file):
                # {"c": "C0000833", "e": 467, "d": "52773120", "string_orig": "abscess", "p": "10110421", "s": 460}
                poses = utils.load_json_data(pos_file)
                for r in poses:
                    rows.append('\t'.join([
                        r['c'], r['p'], r['d'],
                        str(r['s']),
                        str(r['e']), r['string_orig'], ''
                    ]))
                    increase_freq_on_dict(
                        c_group, r['c'], 'pos',
                        '-'.join([r['d'], str(r['s']),
                                  str(r['e'])]))

    rule_headers = [
        'semehr negation_filters.json', 'semehr hypothetical_filters.json',
        'semehr not_mention_filters.json',
        'semehr other_experiencer_filters.json',
        'semehr cris_document_filters.json', 'skip-term', 'semehr s_skin.json',
        'semehr s_karen.json', 'yodie', 'pos'
    ]
    c_rows = ['\t'.join(['concept'] + rule_headers)]
    for c in c_group:
        co = c_group[c]
        c_rows.append(
            '\t'.join([c] +
                      [str(co[h]) if h in co else '0' for h in rule_headers]))
    utils.save_string('\n'.join(rows), dump_tsv_file)
    utils.save_string('\n'.join(c_rows), dump_concept_file)
    print 'dumped to  %s' % dump_tsv_file
Example #18
def output_single_phenotype_detail(pprevalence_file, phenotype, output_file):
    pp = utils.load_json_data(pprevalence_file)
    p = pp[phenotype]
    rows = []
    rows.append('\t'.join(['total', str(p['prevalence'])]))
    for sp in p['subtypes']:
        rows.append('\t'.join([sp['phenotype'], str(p['concepts'][sp['concept']])]))
    for c in p['concepts']:
        rows.append('\t'.join([c, str(p['concepts'][c])]))
    utils.save_string('\n'.join(rows), output_file)
    print '%s result saved to %s' % (phenotype, output_file)
Example #19
def produce_weka_output(predict_output_file,
                        orig_features_file,
                        merged_output_file,
                        arrf_file,
                        threshold=.70,
                        mode='threshold'):
    orig_data_lines = utils.read_text_file(orig_features_file)
    ret = utils.load_json_data(predict_output_file)
    ptn2anns = {}
    for r in ret:
        ptn = r[0]
        if ptn not in ptn2anns:
            ptn2anns[ptn] = {'posM': 0, 'negM': 0, 'hisM': 0, 'otherM': 0}
        if mode == 'threshold':
            if float(r[6]) >= threshold:
                ptn2anns[ptn][r[4]] += 1
        elif mode == 'weighted_sum':
            ptn2anns[ptn][r[4]] += float(r[6])

    rows = []
    arrf_header = """@RELATION	hepc

@ATTRIBUTE	Total_Mentions	NUMERIC
@ATTRIBUTE	Positive_Mentions	NUMERIC
@ATTRIBUTE	History_hypothetical_Mentions	NUMERIC
@ATTRIBUTE	Negative_Mentions	NUMERIC
@ATTRIBUTE	Other_Experiencers	NUMERIC
@ATTRIBUTE	AT_Total_Mentions	NUMERIC
@ATTRIBUTE	AT_Positive_Mentions	NUMERIC
@ATTRIBUTE	AT_History_hypothetical_Mentions	NUMERIC
@ATTRIBUTE	AT_Negative_Mentions	NUMERIC
@ATTRIBUTE	AT_Other_Experiencers	NUMERIC
@ATTRIBUTE	class	{positive,negative,unknown}


@DATA
"""
    arrf_rows = []
    for l in orig_data_lines:
        arr = l.split('\t')
        ptn = arr[0]
        new_line = arr[:6] + \
                   ([str(ptn2anns[ptn]['posM'] + ptn2anns[ptn]['negM'] + ptn2anns[ptn]['hisM'] + ptn2anns[ptn]['otherM']),
                                str(ptn2anns[ptn]['posM']),
                                str(ptn2anns[ptn]['hisM']),
                                str(ptn2anns[ptn]['negM']),
                                str(ptn2anns[ptn]['otherM'])] if ptn in ptn2anns else ['0','0','0','0','0']) + \
                   [arr[6]]
        rows.append(new_line)
        arrf_rows.append(','.join(new_line[1:]))

    utils.save_string(arrf_header + '\n'.join(arrf_rows), arrf_file)
    utils.save_string('\n'.join(['\t'.join(r) for r in rows]),
                      merged_output_file)
Example #20
def make_sequences(in_path,out_path,dim=0):
    action_t_series=utils.read_dir_objects(in_path)
    sequences=[]
    for action in action_t_series:
        arr=action.to_array()[dim]
        cat_series=category_series(arr)
        seq=cats_to_seq(cat_series)
        seq+="#"+str(action.cat)
        seq+="#"+action.name +"\n"
        sequences.append(seq)
    str_seq=utils.array_to_txt(sequences)
    utils.save_string(out_path,str_seq)
Example #21
def extract_doc_level_ann(ann_dump, output_folder):
    """

    extract doc level annotations and save to separate files
    :param ann_dump:
    :param output_folder:
    :return:
    """
    lines = utils.read_text_file(ann_dump)
    for l in lines:
        doc_ann = json.loads(l)
        utils.save_string(l, join(output_folder, doc_ann['docId'].split('.')[0] + '.json'))
Example #22
def do_phenotype_analysis(phenotype_result_file, c_map_file, output_folder):
    c_map = utils.load_json_data(c_map_file)
    p_map = utils.load_json_data(phenotype_result_file)
    # extract performances of phenotypes
    headers = ["posM", "hisM", "negM", "otherM", "wrongM"]
    rows = ['\t'.join(["phenotype"] + headers)]
    for p in p_map:
        v = p_map[p]['validation']
        if v is None or len(v) == 0:
            continue
        rows.append('\t'.join([p] + [str(v[h]) if h in v else '0' for h in headers]))
    utils.save_string('\n'.join(rows), join(output_folder, 'phenotype_performance.tsv'))
Example #23
def label_analyse(sql_template_file, db_cnf, output_file=None):
    sql_temps = utils.load_json_data(sql_template_file)
    concepts = []
    dutil.query_data(sql_temps['get_validated_concepts'], concepts,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    s = ''
    for c in concepts:
        data, output = concept_analyse(c['concept_id'], sql_temps['condition_label_sql'], sql_temps['wrong_label_sql'], db_cnf)
        s += output
    if output_file is not None:
        print 'saving output to %s...' % output_file
        utils.save_string(s, output_file)
Example #24
def get_concepts(output_file):
    curated_mappings = utils.load_json_data(
        './resources/curated_mappings.json')
    autoimmune_concepts = []
    patients = []
    dutil.query_data(autoimmune_concepts_sql, autoimmune_concepts)
    print '{} concepts read'.format(len(autoimmune_concepts))
    dutil.query_data(patients_sql, patients)
    print patients[0]
    # patient dic
    patient_dic = {}
    for p in patients:
        patient_dic[p['brcid']] = p

    non_empty_curated_concepts = []
    non_empty_not_curated_concepts = []
    empty_concepts = []
    for co in autoimmune_concepts:
        c = co['concept_name']
        sympton_freq_result = []
        print autoimmune_sympton_freq_sql.format(c)
        dutil.query_data(
            autoimmune_sympton_freq_sql.format(c.replace("'", "''")),
            sympton_freq_result)
        if len(sympton_freq_result) > 0:
            if c in curated_mappings and curated_mappings[c] is not None \
                    and curated_mappings[c] == 'correct':
                non_empty_curated_concepts.append(c)
            else:
                non_empty_not_curated_concepts.append(c)
        else:
            empty_concepts.append(c)
        for sf in sympton_freq_result:
            patient_dic[sf['brcid']][c] = sf['num']
            patient_dic[sf['brcid']]['any'] = sf['num'] + \
                                              (patient_dic[sf['brcid']]['any']
                                               if 'any' in patient_dic[sf['brcid']] else 0)
    p_attrs = [
        'brcid', 'primary_diag', 'diagnosis_date', 'dob', 'gender_id',
        'ethnicitycleaned'
    ]
    d_attrs = sorted(non_empty_curated_concepts)  # sorted([co['concept_name'] for co in autoimmune_concepts])
    d_attrs = ['any'] + d_attrs + ['=sep='] + sorted(non_empty_not_curated_concepts)
    s = '\t'.join(p_attrs) + '\t' + '\t'.join(d_attrs) + '\n'
    for p in patients:
        s += '\t'.join([str(p[k]) for k in p_attrs]) + '\t' + '\t'.join(
            ['0' if c not in p else str(p[c]) for c in d_attrs]) + '\n'
    utils.save_string(s, output_file)
    print json.dumps(empty_concepts)
Example #25
def populate_patient_study_table(cohort_name, study_analyzer, out_file,
                                 patients_sql, term_doc_freq_sql):
    """
    populate result table for a given study analyzer instance
    :param cohort_name:
    :param study_analyzer:
    :param out_file:
    :return:
    """
    patients = []
    dutil.query_data(patients_sql.format(cohort_name), patients)
    id2p = {}
    for p in patients:
        id2p[p['brcid']] = p

    non_empty_concepts = []
    study_concepts = study_analyzer.study_concepts
    for sc in study_concepts:
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        concept_list = ', '.join(['\'%s\'' % c for c in sc.concept_closure])
        patient_term_freq = []
        if len(sc.concept_closure) > 0:
            data_sql = term_doc_freq_sql.format(
                **{
                    'concepts':
                    concept_list,
                    'cohort_id':
                    cohort_name,
                    'extra_constrains':
                    ' \n '.join(
                        [generate_skip_term_constrain(study_analyzer)] + [] if
                        (study_analyzer.study_options is None or study_analyzer
                         .study_options['extra_constrains'] is None) else
                        study_analyzer.study_options['extra_constrains'])
                })
            print data_sql
            dutil.query_data(data_sql, patient_term_freq)
        if len(patient_term_freq) > 0:
            non_empty_concepts.append(sc_key)
            for pc in patient_term_freq:
                id2p[pc['brcid']][sc_key] = str(pc['num'])

    concept_labels = sorted(non_empty_concepts)
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    for p in patients:
        s += '\t'.join([p['brcid']] +
                       [p[k] if k in p else '0'
                        for k in concept_labels]) + '\n'
    utils.save_string(s, out_file)
    print 'done'
Example #26
def save_full_text(xml_file, output_dir):
    """
    recover full text from Informatics' xml format
    :param xml_file:
    :param output_dir:
    :return:
    """
    if not isfile(xml_file):
        return
    ed = EDIRDoc(xml_file)
    fn = basename(xml_file)
    name = fn.replace(r'-ann.xml', '.txt')
    logging.info('%s processed to be %s' % (fn, name))
    utils.save_string(ed.get_full_text, join(output_dir, name))
Example #27
def get_all_instances(save_file):
    concepts = utils.load_json_data('./resources/exact_concpts_mappings.json')
    concpet2subconcepts_csv = ''
    for c in concepts:
        if concepts[c] == '':
            continue
        insts = query_instances(concepts[c])
        insts = [concepts[c]] + insts
        print u'{}\t{}\t{}\t{}'.format(c, concepts[c], len(insts),
                                       json.dumps(insts))
        for cid in insts:
            concpet2subconcepts_csv += u'{}, {}\n'.format(
                c, cid[cid.rfind('/') + 1:])
    if save_file is not None:
        utils.save_string(concpet2subconcepts_csv, save_file)
Example #28
def select_section_headers(sec_freq_file):
    """
    do simple syntactic merging of section titles and sort them by
    frequencies
    :param sec_freq_file:
    :return:
    """
    sec_freq = utils.load_json_data(sec_freq_file)
    merged_sec_freq = {}
    for s in sec_freq:
        k = normalise_sec_title(s)
        merged_sec_freq[k] = sec_freq[s] if k not in merged_sec_freq else sec_freq[s] + merged_sec_freq[k]
    sec_freq = merged_sec_freq
    sf = [(s, sec_freq[s]) for s in sec_freq]
    sf = sorted(sf, key=lambda sec: -sec[1])
    utils.save_string('\n'.join('%s\t%s' % t for t in sf), '../resources/wrappers/mimic_section_freqs.txt')
Example #29
def download_docs(doc_ids, query, db_conn_setting, out_put_folder):
    """
    download clinical notes from EHR
    :param doc_ids:
    :param query:
    :param db_conn_setting:
    :return:
    """
    db_cnn = dutil.get_db_connection_by_setting(db_conn_setting)
    results = []
    q = query.format(**{'ids': ','.join(['\'%s\'' % did for did in doc_ids])} )
    print 'querying [%s]' % q
    print q
    dutil.query_data(q, results, db_cnn)
    for r in results:
        if r['textcontent'] is not None:
            utils.save_string(r['textcontent'].decode('cp1252').replace(chr(13), ' '), join(out_put_folder, r['cn_doc_id'] + '.txt'), encoding='utf-8')
Example #30
def complete_samples(sample_file, complete_sql, db_conn_file, out_file):
    ann_prefix = 'var sample_docs='
    anns_str = utils.read_text_file_as_string(sample_file)
    if anns_str.startswith(ann_prefix):
        anns_str = anns_str[len(ann_prefix):]
    anns = json.loads(anns_str)
    # anns = utils.load_json_data(sample_file)
    key_anns = []
    for k in anns:
        key_anns.append((k, anns[k]))
    container = []
    utils.multi_thread_tasking(key_anns, 40, complete_sample_ann_data,
                               args=[complete_sql, db_conn_file, container])
    results = {}
    for r in container:
        results[r[0]] = r[1]
    utils.save_string(ann_prefix + json.dumps(results), out_file)
    print 'done'
Example #31
def process_batched_docs(folder_path, out_folder):
    if isdir(folder_path):
        for f in listdir(folder_path):
            if isfile(join(folder_path, f)):
                t = utils.read_text_file_as_string(join(folder_path, f))
                print 'processing %s' % join(folder_path, f)
                print t
                mit = re.finditer(r'^(\d+)\,\"', t, re.MULTILINE)
                prev_pos = 0
                prev_id = None
                for m in mit:
                    if prev_pos > 0:
                        utils.save_string(t[prev_pos:m.start()-2], join(out_folder, prev_id))
                    prev_pos = m.end()
                    prev_id = m.string[m.start(1):m.end(1)]
                if prev_id is not None:
                    utils.save_string(t[prev_pos:len(t) - 1], join(out_folder, prev_id))
                else:
                    print 'ERROR!! pattern not found in %s' % join(folder_path, f)
Example #32
def create_seq_dataset(action_dir,conf):
    actions=get_actions(action_dir,conf.nn,conf.cls)
    seq_data=utils.array_to_txt(actions)
    utils.save_string(conf.seq,seq_data)