def test_Data(self, sess, logits, x, y, test_x, test_y, fold_idx):
    n_correct = 0
    n_total = 0
    n_pos_correct = 0
    n_pos_total = 0
    output = ''
    for idx in xrange(len(test_x)):
        p, c = sess.run([tf.nn.softmax(logits), y],
                        {x: test_x[idx:idx + 1], y: test_y[idx:idx + 1]})
        predicted = np.argmax(p)
        correct_label = np.argmax(c)
        if predicted == correct_label:
            n_correct += 1
        line = '%s\t%s\t%s\t%s\t%s' % (n_total, predicted, correct_label,
                                       '%.2f' % p.tolist()[0][predicted],
                                       '\t'.join(['%.2f' % prob for prob in p.tolist()[0]]))
        print(line)
        output += line + '\n'
        n_total += 1
        if correct_label == 0:
            n_pos_total += 1
            if correct_label == predicted:
                n_pos_correct += 1
    utils.save_string(output, './output/weighted_fold_%s.txt' % fold_idx)
    print 'accuracy: %s' % (1.0 * n_correct / n_total)
    print 'pos recall: %s' % (1.0 * n_pos_correct / n_pos_total)
def learn_concept_mappings(output_lst_folder):
    type2insts = {}
    type2insts_2 = {}
    label_dir = _gold_dir
    ann_dir = _ann_dir
    file_keys = [f.split('.')[0] for f in listdir(ann_dir) if isfile(join(ann_dir, f))]
    t2missed = {}
    for fk in file_keys:
        get_doc_level_inference(label_dir, ann_dir, fk,
                                type2insts, type2insts_2, t2missed)
    for t in type2insts:
        type2insts[t] = list(type2insts[t])
    logging.info(json.dumps(type2insts))

    s = '\n' * 2
    for t in type2insts_2:
        type2insts_2[t] = list(type2insts_2[t])
    s += json.dumps(type2insts_2)

    s += '\n' * 2
    labels = []
    defs = []
    for t in t2missed:
        t2missed[t] = list(set(t2missed[t]))
        utils.save_string('\n'.join(t2missed[t]) + '\n', join(output_lst_folder, t + '.lst'))
        labels += [l.lower() for l in t2missed[t]]
        defs.append(t + '.lst' + ':StrokeStudy:' + t)
    s += '\n' * 2
    s += '\n'.join(defs)
    s += json.dumps(t2missed)
    logging.info(s)
def doc_infer(settings):
    rules = PhenotypeRule.load_rules(settings['rule_file'])
    d2predicted = utils.load_json_data(settings['doc_nlp_results'])
    doc_labels_output = settings['doc_inference_output']
    s = ''
    doc_type2id = {}
    pids = []
    for d in d2predicted:
        m = re.match(r'Stroke\_id\_(\d+)(\.\d+){0,1}', d)
        pid = d
        if m is not None:
            pid = m.group(1)
        pids.append(pid)
        label_provs = PhenotypeRuleExecutor.apply_rules(d2predicted[d], rules)
        print(pid, d, label_provs)
        for lp in label_provs:
            if lp['label'] != '':
                s += '%s\t%s\n' % (pid, lp['label'])
                if lp['label'] not in doc_type2id:
                    doc_type2id[lp['label']] = []
                doc_type2id[lp['label']].append(pid)
    pids = list(set(pids))
    print(json.dumps(pids))
    utils.save_string(s, doc_labels_output)
    if 'patient_level_truth_tsv' in settings:
        doc_infer_with_ground_truth(settings['patient_level_truth_tsv'], pids, doc_type2id)
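# A minimal sketch of the settings dict that doc_infer above reads; the keys are
# taken from the function body, but the file paths are hypothetical placeholders.
example_doc_infer_settings = {
    'rule_file': './conf/phenotype_rules.json',          # parsed by PhenotypeRule.load_rules
    'doc_nlp_results': './output/doc_nlp_results.json',  # document id -> NLP predictions
    'doc_inference_output': './output/doc_labels.tsv',   # where inferred labels are written
    # optional: when present, doc_infer_with_ground_truth is called for evaluation
    'patient_level_truth_tsv': './gold/patient_level_truth.tsv',
}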
def merge_mappings_dictionary(map_files, dict_dirs, new_map_file, new_dict_folder):
    maps = [utils.load_json_data(mf) for mf in map_files]
    new_m = {}
    for m in maps:
        new_m.update(m)
    t2list = {}
    for dd in dict_dirs:
        lst_files = [f for f in listdir(dd) if isfile(join(dd, f)) and f.endswith('.lst')]
        for f in lst_files:
            t = f[:f.index('.')]
            labels = utils.read_text_file(join(dd, f))
            if t not in t2list:
                t2list[t] = set()
            for l in labels:
                if len(l) > 0:
                    t2list[t].add(l)
    utils.save_json_array(new_m, new_map_file)
    logging.info('mapping saved to %s' % new_map_file)
    for t in t2list:
        utils.save_string('\n'.join(list(t2list[t])) + '\n', join(new_dict_folder, t + '.lst'))
        logging.info('%s.lst saved' % t)
    logging.info('all done')
def convert_text_ann_from_db(sql_temp, pks, db_conn, full_text_folder, ann_folder,
                             full_text_file_pattern='%s.txt',
                             ann_file_pattern='%s.txt.knowtator.xml'):
    sql = sql_temp.format(**pks)
    results = []
    logging.info('doing [%s]...' % sql)
    file_key = '_'.join([pks[k] for k in pks])
    dbutils.query_data(sql, results, dbutils.get_db_connection_by_setting(db_conn))
    if len(results) > 0:
        text = results[0]['text'].replace('\r', '\n')
        anns = json.loads(results[0]['anns'])
        xml = AnnConverter.to_eHOST(AnnConverter.load_ann(anns, file_key), full_text=text)
        utils.save_string(xml, join(ann_folder, ann_file_pattern % file_key))
        utils.save_string(text, join(full_text_folder, full_text_file_pattern % file_key))
        logging.info('doc [%s] done' % file_key)
    else:
        logging.info('doc/anns [%s] not found' % file_key)
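# Hedged illustration of how convert_text_ann_from_db is parameterised: sql_temp
# uses named placeholders that match the keys of pks, and the file key is the pks
# values joined with '_'. The table and column names below are assumptions; only
# the 'text' and 'anns' result columns are implied by the function body.
example_sql_temp = ("SELECT text, anns FROM annotated_docs "
                    "WHERE study_id = '{study_id}' AND doc_id = '{doc_id}'")
example_pks = {'study_id': 's01', 'doc_id': 'd100'}  # -> file_key 's01_d100'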
def convert_text_ann_from_files(full_text_folder, ann_folder, output_folder,
                                full_text_file_pattern='(%s).txt',
                                ann_file_pattern='se_ann_%s.json',
                                output_file_pattern='%s.txt.knowtator.xml',
                                ann_to_convert=None):
    text_files = [f for f in listdir(full_text_folder) if isfile(join(full_text_folder, f))]
    p = re.compile(full_text_file_pattern)
    for f in text_files:
        logging.info('working on [%s]' % f)
        m = p.match(f)
        if m is not None:
            fk = m.group(1)
            text = utils.read_text_file_as_string(join(full_text_folder, f))
            anns = utils.load_json_data(join(ann_folder, ann_file_pattern % fk))
            xml = AnnConverter.to_eHOST(AnnConverter.load_ann(anns, fk), full_text=text,
                                        ann_to_convert=ann_to_convert)
            utils.save_string(xml, join(output_folder, output_file_pattern % fk))
            utils.save_string(text.replace('\r', ' '), join(full_text_folder, f))
            logging.info('doc [%s] done' % fk)
def run_kfold_learning(k, corpus_folder, semehr_folder, gold_folder, working_folder):
    kf = KFold(n_splits=k)
    files = [f for f in listdir(corpus_folder) if isfile(join(corpus_folder, f))]
    fold = 0
    for train_idx, test_idx in kf.split(files):
        reset_folder(working_folder)
        # copy files
        train_ann_dir = join(working_folder, 'ann')
        train_gold_dir = join(working_folder, 'gold')
        train_text_dir = join(working_folder, 'train_corpus')
        test_ann_dir = join(working_folder, 'test_ann')
        test_gold_dir = join(working_folder, 'test_gold')
        test_text_dir = join(working_folder, 'test_corpus')
        for idx in train_idx:
            shutil.copy(join(corpus_folder, files[idx]), join(train_text_dir, files[idx]))
            ann_file = 'se_ann_%s.json' % files[idx].replace('.txt', '')
            gold_file = '%s.knowtator.xml' % files[idx]
            shutil.copy(join(semehr_folder, ann_file), join(train_ann_dir, ann_file))
            shutil.copy(join(gold_folder, gold_file), join(train_gold_dir, gold_file))
        for idx in test_idx:
            shutil.copy(join(corpus_folder, files[idx]), join(test_text_dir, files[idx]))
            ann_file = 'se_ann_%s.json' % files[idx].replace('.txt', '')
            gold_file = '%s.knowtator.xml' % files[idx]
            shutil.copy(join(semehr_folder, ann_file), join(test_ann_dir, ann_file))
            shutil.copy(join(gold_folder, gold_file), join(test_gold_dir, gold_file))
        performance = run_learning()
        utils.save_string(performance, join(working_folder, 'folder_%s_perf.tsv' % fold))
        fold += 1
        logging.info('round %s done' % fold)
def populate_inter_annotator_results(ann_folder_1, ann_folder_2, output_file, missing_file,
                                     correct_labels=["VERIFIED_CORRECT"]):
    ann_files = [f for f in listdir(ann_folder_1) if isfile(join(ann_folder_1, f))]
    all_mentions = 0
    missed = []
    mismatched = []
    for f in ann_files:
        ann1 = AnnConverter.load_ann_file(join(ann_folder_1, f))
        ann2 = AnnConverter.load_ann_file(join(ann_folder_2, f))
        all_mentions += len(ann1)
        for ann in ann1:
            if ann not in ann2:
                missed.append('%s\t%s\t%s' % (ann, ann1[ann]['text'], ann1[ann]['class']))
            elif ann2[ann]['class'] != ann1[ann]['class'] and ann1[ann]['class'] not in correct_labels:
                mismatched.append('%s\t%s\t%s\t%s\t%s' % (f, ann, ann1[ann]['text'],
                                                          ann1[ann]['class'], ann2[ann]['class']))
    print('\n'.join(mismatched))
    print(len(missed), all_mentions)
    utils.save_string('\n'.join(mismatched), output_file)
    utils.save_string('\n'.join(missed), missing_file)
def merge_and_output(dir_path, cohort, default_results='hepc_results.json'):
    headers = ['all', 'positive', 'Negated', 'hypothetical', 'historical', 'Other', 'first_pos_time']
    results = {}
    for pid in cohort:
        results[pid] = {}
    c_results = utils.load_json_data(join(dir_path, default_results))
    for p in c_results:
        results[p['id']] = p
    for f in [f for f in listdir(dir_path) if isfile(join(dir_path, f))]:
        if f != default_results:
            c_results = utils.load_json_data(join(dir_path, f))
            d = f.replace('_results.json', '')
            print f
            if d not in headers:
                headers.append(d)
            for p in c_results:
                results[p['id']][d] = p['all']
    s = '\t'.join(['id'] + headers) + '\n'
    for pid in results:
        p = results[pid]
        row = [pid] + ['-' if h not in p else str(p[h]) for h in headers]
        s += '\t'.join(row) + '\n'
    utils.save_string(s, './valid_doc_files/merged_output_liverdiseases.tsv')
    print 'output generated'
def extract_cohort_docs(self, use_combo_fn_name=True):
    db_conf_file = self._cohort_conf
    db_conf = None
    if 'linux_dsn_setting' in self._conf and self._conf['linux_dsn_setting']:
        # need dsn settings
        db_conf = self.populate_linux_odbc_setting()
        db_conf_file = None
        logging.info('using dsn %s' % db_conf['dsn'])
    query_size = self._conf['query_size'] if 'query_size' in self._conf else 50
    file_pattern = self._conf['file_pattern'] if 'file_pattern' in self._conf else '%s.txt'
    out_put_folder = self._conf['out_put_folder']
    if len(self._patient_ids) == 0:
        logging.info('cohort is empty, has it been loaded?')
        return
    q_temp = self._conf['doc_query_temp']
    logging.info('working on extraction, cohort size:%s' % len(self._patient_ids))
    for idx in range(0, len(self._patient_ids), query_size):
        q = q_temp.format(**{'patient_ids':
                             ",".join(["'%s'" % p for p in self._patient_ids[idx:idx + query_size]])})
        logging.info('querying batch %s' % (idx + 1))
        logging.debug(q)
        docs = []
        db.query_data(q, docs, db.get_db_connection_by_setting(db_conf_file, db_conf))
        if self._dest == 'sql':
            # save docs to database
            self.save_docs_to_db(docs)
        else:
            # save docs to files
            for d in docs:
                if d['doc_content'] is None:
                    continue
                fn = ('%s_%s' % (d['doc_id'], d['patient_id'])) if use_combo_fn_name else ('%s' % d['doc_id'])
                utils.save_string(d['doc_content'], join(out_put_folder, file_pattern % fn))
        logging.info('%s docs saved to destination [%s]' % (len(docs), self._dest))
    logging.info('query finished, docs saved to %s' % out_put_folder)
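# A minimal sketch of the self._conf keys that extract_cohort_docs consults; the
# values are hypothetical placeholders, and the SQL template is an assumption
# except for the {patient_ids} slot and the doc_id/patient_id/doc_content columns
# that the function itself relies on.
example_extraction_conf = {
    'linux_dsn_setting': False,   # when truthy, ODBC DSN settings are used instead of a conf file
    'query_size': 50,             # patients per query batch (defaults to 50)
    'file_pattern': '%s.txt',     # output file name pattern (defaults to '%s.txt')
    'out_put_folder': './cohort_docs',
    'doc_query_temp': "SELECT doc_id, patient_id, doc_content FROM docs "
                      "WHERE patient_id IN ({patient_ids})",
}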
def phenotype_counting(phenotype_def, concept_level_results, output_file):
    pd = utils.load_json_data(phenotype_def)
    npd = {}
    cd = utils.read_text_file(concept_level_results)
    c_headers = cd[0].split('\t')
    headers = [h for h in c_headers[2:len(c_headers) - 1]]
    for r in cd[1:]:
        arr = r.split('\t')
        c = arr[0]
        num_mention = arr[12]
        for p in pd:
            if c in pd[p]['concepts']:
                po = npd[p] if p in npd else {'freq': 0, 'p': p, 'num_concepts': len(pd[p]['concepts'])}
                npd[p] = po
                po['freq'] += int(num_mention)
                for idx in xrange(2, len(arr) - 1):
                    h = headers[idx - 2]
                    po[h] = int(arr[idx]) if h not in po else (int(arr[idx]) + int(po[h]))
    rows = ['\t'.join(['phenotype', 'num concepts'] + headers + ['prevalence'])]
    for p in npd:
        po = npd[p]
        rows.append('\t'.join([p, str(po['num_concepts'])] +
                              [str(po[h]) for h in headers] + [str(po['freq'])]))
    utils.save_string('\n'.join(rows), output_file)
def populate_patient_concept_table(cohort_name, concepts, out_file, patients_sql, concept_doc_freq_sql):
    patients = []
    dutil.query_data(patients_sql.format(cohort_name), patients)
    id2p = {}
    for p in patients:
        id2p[p['brcid']] = p
    non_empty_concepts = []
    for c in concepts:
        patient_concept_freq = []
        print 'querying %s...' % c
        dutil.query_data(concept_doc_freq_sql.format(c, cohort_name), patient_concept_freq)
        if len(patient_concept_freq) > 0:
            non_empty_concepts.append(c)
            for pc in patient_concept_freq:
                id2p[pc['brcid']][c] = str(pc['num'])
    label2cid = {}
    concept_labels = []
    for c in non_empty_concepts:
        label = oi.get_concept_label(c)
        label2cid[label] = c
        concept_labels.append(label)
    concept_labels = sorted(concept_labels)
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    for p in patients:
        s += '\t'.join([p['brcid']] +
                       [p[label2cid[k]] if label2cid[k] in p else '0' for k in concept_labels]) + '\n'
    utils.save_string(s, out_file)
    print 'done'
def generate_gold_stand_from_validation(generated_ann_folder, validated_ann_folder, gold_standard_folder):
    files = [f for f in listdir(generated_ann_folder) if isfile(join(generated_ann_folder, f))]
    for f in files:
        logging.debug('processing: %s / %s' % (generated_ann_folder, f))
        # ignore added annotations for now
        gd_anns = []
        gen_doc = eHostGenedDoc(join(generated_ann_folder, f))
        logging.debug('ann number: %s' % len(gen_doc.get_ess_entities()))
        val_doc = eHostAnnDoc(join(validated_ann_folder, f))
        for g in gen_doc.get_ess_entities():
            logging.debug('validation label: %s' % g.type)
            for v in val_doc.get_ess_entities():
                if g.start == v.start and g.end == v.end:
                    logging.debug('validation label: %s' % v.type)
                    if v.type == 'CORRECT':
                        gd_anns.append(g)
        elem_annotations = ET.Element("annotations")
        elem_annotations.set('textSource', f)
        idx = 0
        for ann in gd_anns:
            if ann.str.lower() == 'haematoma':
                continue
            idx += 1
            mention_id = '%s-%s' % (f, idx)
            elem_ann = ET.SubElement(elem_annotations, "annotation")
            elem_mention = ET.SubElement(elem_ann, "mention")
            elem_mention.set('id', mention_id)
            elem_annotator = ET.SubElement(elem_ann, "annotator")
            elem_annotator.set('id', 'semehr')
            elem_annotator.text = 'semehr'
            elem_span = ET.SubElement(elem_ann, "span")
            elem_span.set('start', '%s' % ann.start)
            elem_span.set('end', '%s' % ann.end)
            elem_spanText = ET.SubElement(elem_ann, "spannedText")
            elem_spanText.text = ann.str
            elem_date = ET.SubElement(elem_ann, "creationDate")
            elem_date.text = datetime.datetime.now().strftime("%a %B %d %X %Z %Y")

            elem_class = ET.SubElement(elem_annotations, "classMention")
            elem_class.set('id', mention_id)
            elem_mention_class = ET.SubElement(elem_class, "mentionClass")
            if ann.str.lower() == 'haemorrhage' or ann.str.lower() == 'blood' \
                    or ann.str.lower() == 'bleed' or ann.str.lower().startswith('collection'):
                ann.type = 'bleeding'
            elem_mention_class.set('id', ann.type)
            elem_mention_class.text = ann.str
        tree = ET.ElementTree(elem_annotations)
        logging.info('gd file saved to %s - %s' % (gold_standard_folder, f))
        utils.save_string(ET.tostring(elem_annotations, encoding='utf8', method='xml'),
                          join(gold_standard_folder, f))
def as_text(action_i, out_path):
    def line_helper(frame):
        line = [str(cord_i) for cord_i in list(frame)]
        return ",".join(line)
    lines = [line_helper(frame_i) for frame_i in action_i.img_seq]
    text = "\n".join(lines)
    utils.save_string(out_path, text)
def phenotype_prevalence(phenotype_with_prev, output_file):
    pd = utils.load_json_data(phenotype_with_prev)
    utils.save_string('\n'.join(['\t'.join([p, str(pd[p]['prevalence']), str(len(pd[p]['concepts']))])
                                 for p in pd]),
                      output_file)
def mapping_headings(heading_stats_file, output_file, freq_threshold=1000):
    heading_freq = utils.load_json_data(heading_stats_file)
    sorted_top_k_headings = sorted([(h, heading_freq[h]) for h in heading_freq],
                                   key=lambda x: -x[1])[:freq_threshold]
    s = ''
    for r in sorted_top_k_headings[:500]:
        s += '%s\t%s\n' % (r[0], r[1])
    utils.save_string(s, './top500heading_discharge_summary.txt')
    utils.save_json_array(sorted_top_k_headings, output_file)
def dump_mention_detail(studies_folder, include_study_pattern, dump_tsv_file, dump_concept_file):
    reg_p = re.compile(include_study_pattern)
    rows = ['\t'.join(['concept', 'pt', 'doc', 's', 'e', 'label', 'ruled'])]
    c_group = {}
    for f in listdir(studies_folder):
        m = reg_p.match(f)
        if m is not None:
            ruled_file = join(studies_folder, f, 'ruled_anns.json')
            if isfile(ruled_file):
                # {"p": "pid", "s": 3356, "e": 3365, "d": "did", "case-instance": ["xxx"],
                #  "c": "C0000833", "string_orig": "abscesses",
                #  "ruled": "semehr hypothetical_filters.json"}
                ruleds = utils.load_json_data(ruled_file)
                for r in ruleds:
                    rows.append('\t'.join([r['c'], r['p'], r['d'], str(r['s']), str(r['e']),
                                           r['string_orig'], r['ruled']]))
                    increase_freq_on_dict(c_group, r['c'], r['ruled'],
                                          '-'.join([r['d'], str(r['s']), str(r['e'])]))
            pos_file = join(studies_folder, f, 'result.csv_json')
            if isfile(pos_file):
                # {"c": "C0000833", "e": 467, "d": "52773120", "string_orig": "abscess",
                #  "p": "10110421", "s": 460}
                poses = utils.load_json_data(pos_file)
                for r in poses:
                    rows.append('\t'.join([r['c'], r['p'], r['d'], str(r['s']), str(r['e']),
                                           r['string_orig'], '']))
                    increase_freq_on_dict(c_group, r['c'], 'pos',
                                          '-'.join([r['d'], str(r['s']), str(r['e'])]))
    rule_headers = ['semehr negation_filters.json', 'semehr hypothetical_filters.json',
                    'semehr not_mention_filters.json', 'semehr other_experiencer_filters.json',
                    'semehr cris_document_filters.json', 'skip-term', 'semehr s_skin.json',
                    'semehr s_karen.json', 'yodie', 'pos']
    c_rows = ['\t'.join(['concept'] + rule_headers)]
    for c in c_group:
        co = c_group[c]
        c_rows.append('\t'.join([c] + [str(co[h]) if h in co else '0' for h in rule_headers]))
    utils.save_string('\n'.join(rows), dump_tsv_file)
    utils.save_string('\n'.join(c_rows), dump_concept_file)
    print 'dumped to %s' % dump_tsv_file
def output_single_phenotype_detail(pprevalence_file, phenotype, output_file):
    pp = utils.load_json_data(pprevalence_file)
    p = pp[phenotype]
    rows = []
    rows.append('\t'.join(['total', str(p['prevalence'])]))
    for sp in p['subtypes']:
        rows.append('\t'.join([sp['phenotype'], str(p['concepts'][sp['concept']])]))
    for c in p['concepts']:
        rows.append('\t'.join([c, str(p['concepts'][c])]))
    utils.save_string('\n'.join(rows), output_file)
    print '%s result saved to %s' % (phenotype, output_file)
def produce_weka_output(predict_output_file, orig_features_file, merged_output_file, arrf_file,
                        threshold=.70, mode='threshold'):
    orig_data_lines = utils.read_text_file(orig_features_file)
    ret = utils.load_json_data(predict_output_file)
    ptn2anns = {}
    for r in ret:
        ptn = r[0]
        if ptn not in ptn2anns:
            ptn2anns[ptn] = {'posM': 0, 'negM': 0, 'hisM': 0, 'otherM': 0}
        if mode == 'threshold':
            if float(r[6]) >= threshold:
                ptn2anns[ptn][r[4]] += 1
        elif mode == 'weighted_sum':
            ptn2anns[ptn][r[4]] += float(r[6])
    rows = []
    arrf_header = """@RELATION hepc
@ATTRIBUTE Total_Mentions NUMERIC
@ATTRIBUTE Positive_Mentions NUMERIC
@ATTRIBUTE History_hypothetical_Mentions NUMERIC
@ATTRIBUTE Negative_Mentions NUMERIC
@ATTRIBUTE Other_Experiencers NUMERIC
@ATTRIBUTE AT_Total_Mentions NUMERIC
@ATTRIBUTE AT_Positive_Mentions NUMERIC
@ATTRIBUTE AT_History_hypothetical_Mentions NUMERIC
@ATTRIBUTE AT_Negative_Mentions NUMERIC
@ATTRIBUTE AT_Other_Experiencers NUMERIC
@ATTRIBUTE class {positive,negative,unknown}
@DATA
"""
    arrf_rows = []
    for l in orig_data_lines:
        arr = l.split('\t')
        ptn = arr[0]
        new_line = arr[:6] + \
            ([str(ptn2anns[ptn]['posM'] + ptn2anns[ptn]['negM'] +
                  ptn2anns[ptn]['hisM'] + ptn2anns[ptn]['otherM']),
              str(ptn2anns[ptn]['posM']),
              str(ptn2anns[ptn]['hisM']),
              str(ptn2anns[ptn]['negM']),
              str(ptn2anns[ptn]['otherM'])] if ptn in ptn2anns else ['0', '0', '0', '0', '0']) + \
            [arr[6]]
        rows.append(new_line)
        arrf_rows.append(','.join(new_line[1:]))
    utils.save_string(arrf_header + '\n'.join(arrf_rows), arrf_file)
    utils.save_string('\n'.join(['\t'.join(r) for r in rows]), merged_output_file)
def make_sequences(in_path, out_path, dim=0):
    action_t_series = utils.read_dir_objects(in_path)
    sequences = []
    for action in action_t_series:
        arr = action.to_array()[dim]
        cat_series = category_series(arr)
        seq = cats_to_seq(cat_series)
        seq += "#" + str(action.cat)
        seq += "#" + action.name + "\n"
        sequences.append(seq)
    str_seq = utils.array_to_txt(sequences)
    utils.save_string(out_path, str_seq)
def extract_doc_level_ann(ann_dump, output_folder):
    """
    extract doc level annotations and save to separate files
    :param ann_dump:
    :param output_folder:
    :return:
    """
    lines = utils.read_text_file(ann_dump)
    for l in lines:
        doc_ann = json.loads(l)
        utils.save_string(l, join(output_folder, doc_ann['docId'].split('.')[0] + '.json'))
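# Hedged example of one line in the ann_dump file that extract_doc_level_ann above
# expects: a JSON object per line whose 'docId' determines the output file name.
# The extra field shown here is a placeholder, not from the source.
example_ann_dump_line = '{"docId": "doc_001.txt", "labels": ["positive"]}'
# -> the line is saved verbatim to <output_folder>/doc_001.json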
def do_phenotype_analysis(phenotype_result_file, c_map_file, output_folder):
    c_map = utils.load_json_data(c_map_file)
    p_map = utils.load_json_data(phenotype_result_file)
    # extract performances of phenotypes
    headers = ["posM", "hisM", "negM", "otherM", "wrongM"]
    rows = ['\t'.join(["phenotype"] + headers)]
    for p in p_map:
        v = p_map[p]['validation']
        if v is None or len(v) == 0:
            continue
        rows.append('\t'.join([p] + [str(v[h]) if h in v else '0' for h in headers]))
    utils.save_string('\n'.join(rows), join(output_folder, 'phenotype_performance.tsv'))
def label_analyse(sql_template_file, db_cnf, output_file=None):
    sql_temps = utils.load_json_data(sql_template_file)
    concepts = []
    dutil.query_data(sql_temps['get_validated_concepts'], concepts,
                     dbconn=dutil.get_db_connection_by_setting(db_cnf))
    s = ''
    for c in concepts:
        data, output = concept_analyse(c['concept_id'],
                                       sql_temps['condition_label_sql'],
                                       sql_temps['wrong_label_sql'],
                                       db_cnf)
        s += output
    if output_file is not None:
        print 'saving output to %s...' % output_file
        utils.save_string(s, output_file)
def get_concepts(output_file):
    curated_mappings = utils.load_json_data('./resources/curated_mappings.json')
    autoimmune_concepts = []
    patients = []
    dutil.query_data(autoimmune_concepts_sql, autoimmune_concepts)
    print '{} concepts read'.format(len(autoimmune_concepts))
    dutil.query_data(patients_sql, patients)
    print patients[0]

    # patient dic
    patient_dic = {}
    for p in patients:
        patient_dic[p['brcid']] = p

    non_empty_curated_concepts = []
    non_empty_not_curated_concepts = []
    empty_concepts = []
    for co in autoimmune_concepts:
        c = co['concept_name']
        sympton_freq_result = []
        print autoimmune_sympton_freq_sql.format(c)
        dutil.query_data(autoimmune_sympton_freq_sql.format(c.replace("'", "''")),
                         sympton_freq_result)
        if len(sympton_freq_result) > 0:
            if c in curated_mappings and curated_mappings[c] is not None \
                    and curated_mappings[c] == 'correct':
                non_empty_curated_concepts.append(c)
            else:
                non_empty_not_curated_concepts.append(c)
        else:
            empty_concepts.append(c)
        for sf in sympton_freq_result:
            patient_dic[sf['brcid']][c] = sf['num']
            patient_dic[sf['brcid']]['any'] = sf['num'] + \
                (patient_dic[sf['brcid']]['any'] if 'any' in patient_dic[sf['brcid']] else 0)

    p_attrs = ['brcid', 'primary_diag', 'diagnosis_date', 'dob', 'gender_id', 'ethnicitycleaned']
    d_attrs = sorted(non_empty_curated_concepts)  # sorted([co['concept_name'] for co in autoimmune_concepts])
    d_attrs = ['any'] + d_attrs + ['=sep='] + sorted(non_empty_not_curated_concepts)
    s = '\t'.join(p_attrs) + '\t' + '\t'.join(d_attrs) + '\n'
    for p in patients:
        s += '\t'.join([str(p[k]) for k in p_attrs]) + '\t' + \
             '\t'.join(['0' if c not in p else str(p[c]) for c in d_attrs]) + '\n'
    utils.save_string(s, output_file)
    print json.dumps(empty_concepts)
def populate_patient_study_table(cohort_name, study_analyzer, out_file, patients_sql, term_doc_freq_sql):
    """
    populate result table for a given study analyzer instance
    :param cohort_name:
    :param study_analyzer:
    :param out_file:
    :return:
    """
    patients = []
    dutil.query_data(patients_sql.format(cohort_name), patients)
    id2p = {}
    for p in patients:
        id2p[p['brcid']] = p

    non_empty_concepts = []
    study_concepts = study_analyzer.study_concepts
    for sc in study_concepts:
        sc_key = '%s(%s)' % (sc.name, len(sc.concept_closure))
        concept_list = ', '.join(['\'%s\'' % c for c in sc.concept_closure])
        patient_term_freq = []
        if len(sc.concept_closure) > 0:
            data_sql = term_doc_freq_sql.format(**{
                'concepts': concept_list,
                'cohort_id': cohort_name,
                'extra_constrains':
                    ' \n '.join([generate_skip_term_constrain(study_analyzer)] + []
                                if (study_analyzer.study_options is None or
                                    study_analyzer.study_options['extra_constrains'] is None)
                                else study_analyzer.study_options['extra_constrains'])})
            print data_sql
            dutil.query_data(data_sql, patient_term_freq)
        if len(patient_term_freq) > 0:
            non_empty_concepts.append(sc_key)
            for pc in patient_term_freq:
                id2p[pc['brcid']][sc_key] = str(pc['num'])

    concept_labels = sorted(non_empty_concepts)
    s = '\t'.join(['brcid'] + concept_labels) + '\n'
    for p in patients:
        s += '\t'.join([p['brcid']] + [p[k] if k in p else '0' for k in concept_labels]) + '\n'
    utils.save_string(s, out_file)
    print 'done'
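# Hypothetical shape of the term_doc_freq_sql template used above: it must expose
# {concepts}, {cohort_id} and {extra_constrains} slots and return brcid/num rows.
# Table and column names other than brcid and num are assumptions.
example_term_doc_freq_sql = (
    "SELECT brcid, count(*) num FROM doc_annotations "
    "WHERE concept_id IN ({concepts}) AND cohort = '{cohort_id}' {extra_constrains} "
    "GROUP BY brcid")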
def save_full_text(xml_file, output_dir):
    """
    recover full text from Informatics' xml format
    :param xml_file:
    :param output_dir:
    :return:
    """
    if not isfile(xml_file):
        return
    ed = EDIRDoc(xml_file)
    fn = basename(xml_file)
    name = fn.replace(r'-ann.xml', '.txt')
    logging.info('%s processed to be %s' % (fn, name))
    utils.save_string(ed.get_full_text, join(output_dir, name))
def get_all_instances(save_file):
    concepts = utils.load_json_data('./resources/exact_concpts_mappings.json')
    concept2subconcepts_csv = ''
    for c in concepts:
        if concepts[c] == '':
            continue
        insts = query_instances(concepts[c])
        insts = [concepts[c]] + insts
        print u'{}\t{}\t{}\t{}'.format(c, concepts[c], len(insts), json.dumps(insts))
        for cid in insts:
            concept2subconcepts_csv += u'{}, {}\n'.format(c, cid[cid.rfind('/') + 1:])
    if save_file is not None:
        utils.save_string(concept2subconcepts_csv, save_file)
def select_section_headers(sec_freq_file):
    """
    do simple syntactic merging of section titles and sort them by frequencies
    :param sec_freq_file:
    :return:
    """
    sec_freq = utils.load_json_data(sec_freq_file)
    merged_sec_freq = {}
    for s in sec_freq:
        k = normalise_sec_title(s)
        merged_sec_freq[k] = sec_freq[s] if k not in merged_sec_freq else sec_freq[s] + merged_sec_freq[k]
    sec_freq = merged_sec_freq
    sf = [(s, sec_freq[s]) for s in sec_freq]
    sf = sorted(sf, key=lambda sec: -sec[1])
    utils.save_string('\n'.join('%s\t%s' % t for t in sf),
                      '../resources/wrappers/mimic_section_freqs.txt')
def download_docs(doc_ids, query, db_conn_setting, out_put_folder):
    """
    download clinical notes from EHR
    :param doc_ids:
    :param query:
    :param db_conn_setting:
    :return:
    """
    db_cnn = dutil.get_db_connection_by_setting(db_conn_setting)
    results = []
    q = query.format(**{'ids': ','.join(['\'%s\'' % did for did in doc_ids])})
    print 'querying [%s]' % q
    print q
    dutil.query_data(q, results, db_cnn)
    for r in results:
        if r['textcontent'] is not None:
            utils.save_string(r['textcontent'].decode('cp1252').replace(chr(13), ' '),
                              join(out_put_folder, r['cn_doc_id'] + '.txt'),
                              encoding='utf-8')
def complete_samples(sample_file, complete_sql, db_conn_file, out_file):
    ann_prefix = 'var sample_docs='
    anns_str = utils.read_text_file_as_string(sample_file)
    if anns_str.startswith(ann_prefix):
        anns_str = anns_str[len(ann_prefix):]
    anns = json.loads(anns_str)
    # anns = utils.load_json_data(sample_file)
    key_anns = []
    for k in anns:
        key_anns.append((k, anns[k]))
    container = []
    utils.multi_thread_tasking(key_anns, 40, complete_sample_ann_data,
                               args=[complete_sql, db_conn_file, container])
    results = {}
    for r in container:
        results[r[0]] = r[1]
    utils.save_string(ann_prefix + json.dumps(results), out_file)
    print 'done'
def process_batched_docs(folder_path, out_folder):
    if isdir(folder_path):
        for f in listdir(folder_path):
            if isfile(join(folder_path, f)):
                t = utils.read_text_file_as_string(join(folder_path, f))
                print 'processing %s' % join(folder_path, f)
                print t
                mit = re.finditer(r'^(\d+)\,\"', t, re.MULTILINE)
                prev_pos = 0
                prev_id = None
                for m in mit:
                    if prev_pos > 0:
                        utils.save_string(t[prev_pos:m.start() - 2], join(out_folder, prev_id))
                    prev_pos = m.end()
                    prev_id = m.string[m.start(1):m.end(1)]
                if prev_id is not None:
                    utils.save_string(t[prev_pos:len(t) - 1], join(out_folder, prev_id))
                else:
                    print 'ERROR!! pattern not found in %s' % join(folder_path, f)
def create_seq_dataset(action_dir, conf):
    actions = get_actions(action_dir, conf.nn, conf.cls)
    seq_data = utils.array_to_txt(actions)
    utils.save_string(conf.seq, seq_data)