def analyse_discharge_summaries(es, q, doc_type='eprdoc', full_text_field='fulltext',
                                reg_exp=r'^([^\n\:]+)\:$',
                                output_file='../resources/wrappers/section_freqs.json'):
    """
    iterate all discharge summaries and create the section dictionary for the corpus (EHR system)
    :param es:
    :param q:
    :param doc_type:
    :param full_text_field:
    :param reg_exp:
    :param output_file:
    :return:
    """
    scroll_obj = es.scroll(q, doc_type, include_fields=[full_text_field], size=500)
    container = []
    utils.multi_thread_tasking_it(scroll_obj, 10, do_query_analysis,
                                  args=[container, full_text_field, reg_exp])
    print 'search finished. merging sections...'
    sec_freq = {}
    for ss in container:
        for s in ss:
            sec_freq[s] = 1 if s not in sec_freq else 1 + sec_freq[s]
    utils.save_json_array(sec_freq, output_file)
    print json.dumps(sec_freq)
    print 'done'
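# A minimal usage sketch (assumptions: `es` is the SemEHRES/Elasticsearch wrapper exposing the
# `scroll` call used above, and the query string below is illustrative only):
#
#   es = SemEHRES.get_instance()
#   analyse_discharge_summaries(es, 'doc_type:"discharge summary"',
#                               output_file='../resources/wrappers/section_freqs.json')
#
# The default reg_exp treats any line of the form "Heading:" as a section heading, so the
# resulting JSON maps each detected heading to its frequency across the corpus.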
def generate_all_queries():
    concepts = utils.load_json_data('./resources/autoimmune-concepts.json')
    concept2queries = {}
    for c in concepts:
        concept2queries[c] = generate_prospector_query(concepts[c])
        print '%s done' % c
    utils.save_json_array(concept2queries, './resources/mimir_queries.json')
def analyse_doc_anns(json_doc, file_key, rule_executor, text_reader, output_folder,
                     fn_pattern='se_ann_%s.json',
                     es_inst=None, es_output_index=None, es_output_doc='doc',
                     study_analyzer=None,
                     contextualised_concept_index='semehr_ctx_concepts',
                     ctx_doc_type='ctx_concept'):
    ann_doc = SemEHRAnnDoc()
    ann_doc.load(json_doc, file_key=file_key)
    read_obj = text_reader.read_full_text(ann_doc.file_key)
    patient_id = None
    if isinstance(read_obj, dict):
        text = read_obj['text']
        patient_id = read_obj['pid']
    else:
        text = read_obj
    if text is None:
        logging.error('file [%s] full text not found' % ann_doc.file_key)
        return
    reader = WrapperTextReader(text)
    process_doc_rule(ann_doc, rule_executor, reader, None, study_analyzer)
    if es_inst is None:
        utils.save_json_array(ann_doc.serialise_json(),
                              join(output_folder, fn_pattern % ann_doc.file_key))
    else:
        data = ann_doc.serialise_json()
        data['doc_id'] = file_key
        data['patient_id'] = patient_id
        es_inst.index_new_doc(index=es_output_index, doc_type=es_output_doc,
                              data=data, doc_id=file_key)
        # index contextualised concepts
        if contextualised_concept_index is not None:
            for ann in data['annotations']:
                index_ctx_concept(ann, contextualised_concept_index, ctx_doc_type, es_inst)
    return ann_doc.serialise_json()
def patient_level_analysis(complete_anns_file, output_file):
    lines = utils.read_text_file(complete_anns_file)
    pos_condition2patients = {}
    patient2conditions = {}
    positive_labels = ['posM', 'hisM']
    indexable_labels = ['posM', 'hisM', 'negM']
    for l in lines:
        arr = l.split('\t')
        label = arr[2]
        condition = arr[3]
        pid = arr[8]
        if label in positive_labels:
            pos_condition2patients[condition] = [pid] if condition not in pos_condition2patients else \
                pos_condition2patients[condition] + [pid]
        if label in indexable_labels:
            pd = patient2conditions[pid] if pid in patient2conditions else {}
            patient2conditions[pid] = pd
            if label in pd:
                pd[label].append(condition)
                pd[label] = list(set(pd[label]))
            else:
                pd[label] = [condition]
    utils.save_json_array(
        {
            'p2c': patient2conditions,
            'c2p': pos_condition2patients
        }, output_file)
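# A minimal usage sketch (assumption: complete_anns_file is a tab-separated annotation dump where,
# using 0-based indices, column 2 holds the contextual label, column 3 the condition and
# column 8 the patient id, as read above; the paths are illustrative):
#
#   patient_level_analysis('./results/complete_anns.tsv',
#                          './results/patient_level_summary.json')
#
# The output JSON holds two views: 'p2c' (patient -> labelled conditions) and
# 'c2p' (condition -> patients with a positive or history mention).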
def collect_result(self, output_file, graph_file_path):
    files = [f for f in listdir(self._doc_pth) if isfile(join(self._doc_pth, f))]
    f_did = []
    for f in files:
        sr = re.search(self._did_pattern, f, re.IGNORECASE)
        if sr:
            f_did.append((f, sr.group(1)))
    results = []
    logging.info('collecting results ...')
    utils.multi_thread_tasking(
        lst=f_did, num_threads=10,
        process_func=DocCohort.collect_doc_anns_by_types,
        args=[self._doc_pth, self.collect_semantic_types, results])
    logging.info('total anns collected %s' % len(results))
    ret = {'concepts': {}, 'p2c': {}}
    for r in results:
        if r['d'] in self._d2p:
            p = self._d2p[r['d']]
            if p not in ret['p2c']:
                ret['p2c'][p] = {}
            pd = ret['p2c'][p]
            if r['cui'] not in ret['concepts']:
                ret['concepts'][r['cui']] = r['pref']
            if r['cui'] not in pd:
                pd[r['cui']] = 1
            else:
                pd[r['cui']] += 1
        else:
            logging.error('doc %s not in cohort map' % r['d'])
    utils.save_json_array(ret, output_file)
    utils.save_json_array(DocCohort.result_to_graph(ret), graph_file_path)
    logging.info('result collected')
def parse_es_docs(es, q, writing_es_host, writing_index_name, writing_doc_type,
                  doc_type='eprdoc', full_text_field='fulltext',
                  output_file='../resources/wrappers/sen_data_extracted.json',
                  failed_docs_file='../resources/wrappers/sen_failed_docs.json'):
    writing_es = Elasticsearch([writing_es_host], verify_certs=False)
    # scroll_obj = es.scroll(q, doc_type, include_fields=[full_text_field], size=500)
    ret_count, docs = es.search(doc_type, q, offset=0, size=30)
    container = []
    failed_docs = []
    print 'anonymising... %s, %s' % (len(docs), ','.join([d['_id'] for d in docs]))
    utils.multi_thread_tasking_it(docs, 1, do_doc_anonymisation,
                                  args=[writing_es, writing_index_name, writing_doc_type,
                                        full_text_field, container, failed_docs])
    print 'anonymisation finished. saving results...'
    utils.save_json_array(container, output_file)
    utils.save_json_array(failed_docs, failed_docs_file)
    print 'done'
def merge_mappings_dictionary(map_files, dict_dirs, new_map_file, new_dict_folder):
    maps = [utils.load_json_data(mf) for mf in map_files]
    new_m = {}
    for m in maps:
        new_m.update(m)
    t2list = {}
    for dd in dict_dirs:
        lst_files = [f for f in listdir(dd) if isfile(join(dd, f)) and f.endswith('.lst')]
        for f in lst_files:
            t = f[:f.index('.')]
            labels = utils.read_text_file(join(dd, f))
            if t not in t2list:
                t2list[t] = set()
            for l in labels:
                if len(l) > 0:
                    t2list[t].add(l)
    utils.save_json_array(new_m, new_map_file)
    logging.info('mapping saved to %s' % new_map_file)
    for t in t2list:
        utils.save_string('\n'.join(list(t2list[t])) + '\n', join(new_dict_folder, t + '.lst'))
        logging.info('%s.lst saved' % t)
    logging.info('all done')
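# A minimal usage sketch (all paths are illustrative): merge two concept-mapping JSON files and
# the .lst gazetteer dictionaries from two folders into a single mapping/dictionary set.
#
#   merge_mappings_dictionary(['./settings/concept_mapping_a.json',
#                              './settings/concept_mapping_b.json'],
#                             ['./dicts/site_a', './dicts/site_b'],
#                             './settings/concept_mapping_merged.json',
#                             './dicts/merged')
#
# Later map files win on key collisions because dict.update is applied in list order.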
def predict_exp(corpus_trans_file, ann_file, cache_file, output_file):
    # initialise pattern instances from documents
    if not isfile(cache_file):
        # load labelled data
        ann_lines = utils.read_text_file(ann_file)
        prev_doc = None
        anns = []
        doc_anns = []
        ptn_insts = []
        doc_to_pt = {}
        for ls in ann_lines:
            l = ls.split('\t')
            doc_id = l[1]
            doc_to_pt[doc_id] = l[0]
            if prev_doc != doc_id:
                if prev_doc is not None:
                    if exists(join(working_folder, 'docs', '%s.txt' % prev_doc)):
                        doc_anns.append((prev_doc, anns))
                anns = []
                prev_doc = doc_id
            anns.append({'s': int(l[2]), 'e': int(l[3]),
                         'signed_label': l[4], 'gt_label': l[5]})
        if prev_doc is not None:
            if exists(join(working_folder, 'docs', '%s.txt' % prev_doc)):
                doc_anns.append((prev_doc, anns))
        # multithreading: process labelled docs
        print 'processing docs...'
        utils.multi_thread_tasking(doc_anns, 30, do_process_labelled_doc, args=[ptn_insts])
        jl.dump({'insts': ptn_insts, 'doc_to_pt': doc_to_pt}, cache_file)
    else:
        cached = jl.load(cache_file)
        ptn_insts = cached['insts']
        doc_to_pt = cached['doc_to_pt']

    cp = sp.CorpusPredictor.load_corpus_model(corpus_trans_file)
    ret = []
    for inst in ptn_insts:
        print 'predicting [%s]...' % inst.sentence
        acc = cp.predcit(inst)
        print 'accuracy: %s' % acc
        ann = inst.annotations[0]
        ret.append((doc_to_pt[inst.doc_id], inst.doc_id, str(ann['s']), str(ann['e']),
                    ann['signed_label'], ann['gt_label'], str(acc)))
    s = []
    for r in ret:
        s.append(u'\t'.join(r))
    print u'\n'.join(s)
    utils.save_json_array(ret, output_file)
    return ret
def query_liver_diseases(concepts, prefix, patient_filter, d2time):
    es = SemEHRES.get_instance()
    results, docs = es.summary_patients_by_concepts(concepts, filter_func=None, args=[d2time],
                                                    patient_filters=patient_filter,
                                                    data_collection_func=first_time_collector)
    utils.save_json_array(results, './addiction_res/%s_results.json' % prefix)
    utils.save_json_array(docs, './valid_doc_files/%s_valid_docs.json' % prefix)
def mapping_headings(heading_stats_file, output_file, freq_threshold=1000):
    heading_freq = utils.load_json_data(heading_stats_file)
    sorted_top_k_headings = sorted([(h, heading_freq[h]) for h in heading_freq],
                                   key=lambda x: -x[1])[:freq_threshold]
    s = ''
    for r in sorted_top_k_headings[:500]:
        s += '%s\t%s\n' % (r[0], r[1])
    utils.save_string(s, './top500heading_discharge_summary.txt')
    utils.save_json_array(sorted_top_k_headings, output_file)
def compute_all_subconcepts(concepts, file_path):
    c_to_subs = {}
    umls = UMLSAPI(_umls_api_key)
    container = []
    utils.multi_thread_tasking(concepts, 10, do_compute_subconcept, args=[umls, container])
    for p in container:
        c_to_subs[p[0]] = p[1]
    utils.save_json_array(c_to_subs, file_path)
def test_models_and_ensemble(model_files, x, weights=None, outcome='death', threshold=0.5,
                             result_csv=None, severity_conf=None, generate_figs=False,
                             auc_fig_file=None, calibration_fig_file=None, event_rate=None,
                             nri_json=None):
    """
    do tests on individual models and also ensemble methods
    :param model_files:
    :param x:
    :param weights:
    :param outcome:
    :param threshold:
    :param result_csv:
    :param severity_conf: severity configuration for setting weights on the alignments between
     model outcomes and what to predict
    :param generate_figs: generate figs or not
    :param auc_fig_file: roc curve figure output file
    :param calibration_fig_file: calibration figure output file
    :param event_rate:
    :param nri_json:
    :return:
    """
    data = {}
    ve = me.BasicEnsembler()
    y_list = []
    predicted_list = []
    models = []
    for idx in range(len(model_files)):
        mf = model_files[idx]
        m = load_model(mf)
        models.append(m)
        y, pred = test_single_model(m, x, outcome=outcome, threshold=threshold)
        y_list.append(y)
        predicted_list.append(pred)
        ve.add_model(m, 1 if weights is None else weights[idx])
        # results['{0}\n({1})'.format(m.id, m.model_type)] = result
    ve.mode = me.VoteMode.competence_fusion
    y, pred = test_ensemble(ve, x, threshold=threshold, outcome=outcome,
                            severity_conf=severity_conf, generate_figs=generate_figs)
    y_list.append(y)
    predicted_list.append(pred)
    results, nri_result = eval.evaluate_pipeline(y_list, predicted_list,
                                                 model_names=[m.id for m in models] + ['ensemble model'],
                                                 threshold=threshold, figs=generate_figs,
                                                 outcome=outcome, auc_fig_file=auc_fig_file,
                                                 calibration_fig_file=calibration_fig_file,
                                                 event_rate=event_rate)
    model_labels = ['{0}\n({1})'.format(m.id, m.model_type) for m in models] + ['ensemble model']
    for idx in range(len(model_labels)):
        data[model_labels[idx]] = {}
        for k in results:
            data[model_labels[idx]][k] = results[k][idx]
    result_df = eval.format_result(data)
    if result_csv is not None:
        result_df.to_csv(result_csv, sep='\t', index=False)
    if nri_json is not None:
        utils.save_json_array(nri_result, nri_json)
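# A minimal usage sketch (the model file names, the test-set frame `x_test` and all output paths
# are illustrative assumptions, not names defined in this module):
#
#   test_models_and_ensemble(['./models/model_a.joblib', './models/model_b.joblib'],
#                            x_test,
#                            weights=[0.6, 0.4],
#                            outcome='death',
#                            threshold=0.5,
#                            result_csv='./results/model_comparison.tsv',
#                            generate_figs=True,
#                            auc_fig_file='./results/roc.png',
#                            calibration_fig_file='./results/calibration.png',
#                            nri_json='./results/nri.json')
#
# Each model is evaluated on its own, then combined by competence-fusion voting, and the
# per-model plus ensemble metrics are written out as a tab-separated table.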
def extend_manual_mappings(mapping_file, new_mapping_file):
    umls = get_umls_client_inst('./resources/HW_UMLS_KEY.txt')
    m = utils.load_json_data(mapping_file)
    for k in m:
        logging.info('working on %s' % k)
        new_concepts = [] + m[k]['concepts']
        for c in m[k]['concepts']:
            new_concepts += umls.transitive_narrower(c)
        m[k]['concepts'] = list(set(new_concepts))
    logging.info('saving new results to %s' % new_mapping_file)
    utils.save_json_array(m, new_mapping_file)
def encode_doc_anns(d2anns, ann_ctx_file=None):
    ann_context_list = []
    for d in d2anns:
        print('getting %s' % d)
        doc = get_es_instance().get(d2anns[d][0]['index'], d, doc_type=_es_doc_type)
        ann_context_list += extract_text(doc['_source']['fulltext'], d2anns[d])
    if ann_ctx_file is not None:
        utils.save_json_array(ann_context_list, ann_ctx_file)
        print('annotation context results saved to %s' % ann_ctx_file)
    return ann_context_list
def extract_study_phenotypes(study_folder, output_file, exclude_filter=None):
    reg_p = re.compile(exclude_filter) if exclude_filter is not None else None
    all_phenotype_concepts = {}
    for f in listdir(study_folder):
        if reg_p is not None:
            m = reg_p.match(f)
            if m is not None:
                print '%s matched [%s], skipped' % (f, m)
                continue
        folder = join(study_folder, f)
        if isdir(folder):
            print 'inspecting %s ...' % folder
            if isfile(join(folder, 'study_analyzer.pickle')):
                sa = StudyAnalyzer.deserialise(join(folder, 'study_analyzer.pickle'))
                for c in sa.study_concepts:
                    if c.name in all_phenotype_concepts:
                        all_phenotype_concepts[c.name]['freq'] += 1
                    else:
                        all_phenotype_concepts[c.name] = {
                            "phenotype": c.name,
                            "concepts": list(c.concept_closure),
                            "subtypes": [{"phenotype": t,
                                          "concept": c.term_to_concept[t]['mapped']}
                                         for t in c.term_to_concept],
                            "freq": 1
                        }
                    # for t in c.term_to_concept:
                    #     if t in all_phenotype_concepts:
                    #         all_phenotype_concepts[t]['freq'] = all_phenotype_concepts[t]['freq'] + 1
                    #     else:
                    #         all_phenotype_concepts[t] = {"phenotype": t,
                    #                                      "concepts": [c.term_to_concept[t]['mapped']]
                    #                                      if c.term_to_concept[t]['closure'] == 0 else
                    #                                      list(set(list(c.concept_closure) +
                    #                                               [c.term_to_concept[t]['mapped']])),
                    #                                      "freq": 1}
    print 'total phenotypes %s' % len(all_phenotype_concepts)
    if len(all_phenotype_concepts) > 0:
        utils.save_json_array(all_phenotype_concepts, output_file)
        print 'saved to %s' % output_file
    else:
        print 'no data found'
def icd10_mapping_convert(json_file, output_json):
    c2concepts = utils.load_json_data(json_file)
    result = {}
    for c in c2concepts:
        r = {
            "tc": {
                "closure": len(c2concepts[c]),
                "mapped": c2concepts[c][0]
            },
            "concepts": c2concepts[c]
        }
        result[c] = r
    utils.save_json_array(result, output_json)
    logging.info('all done')
def break_down_study_concepts(scs, umls, new_mapping_file):
    mmc = {}
    for sc in scs:
        cui = sc.term_to_concept[sc.terms[0]]['mapped']
        m = {"tc": {"closure": 1, "mapped": cui}, "concepts": [cui]}
        mmc[sc.name] = m
        c2n = get_concepts_names(umls, list(sc.concept_closure))
        for c in sc.concept_closure:
            if c != cui:
                # for each single concept create a study concept
                mc = {"tc": {"closure": 1, "mapped": c}, "concepts": [c]}
                mmc[c2n[c]] = mc
    utils.save_json_array(mmc, new_mapping_file)
def export_pickled_study_concept_2_flat_json(pickle_file, output_file):
    if isfile(pickle_file):
        obj = {}
        sa = StudyAnalyzer.deserialise(pickle_file)
        for sc in sa.study_concepts:
            for t in sc.term_to_concepts:
                for c in sc.term_to_concepts[t]['closure']:
                    obj[c] = {
                        "tc": {"closure": 1, "mapped": c},
                        "concepts": [c]
                    }
        utils.save_json_array(obj, output_file)
        print 'flat json saved to %s' % output_file
def predict_to_eHOST_results(predict_setting):
    ss = StrokeSettings(predict_setting)
    if 'predict_mode' in ss.settings and ss.settings['predict_mode'] == 'direct_nlp':
        logging.info('predicting with direct nlp...')
        predicted_results = direct_nlp_prediction(ss.settings)
    elif 'predict_mode' in ss.settings and ss.settings['predict_mode'] == 'hybrid':
        predicted_results = hybrid_prediciton(ss.settings)
    else:
        logging.info('predicting...')
        predicted_results = predict(ss.settings)
    output_eHOST_format(predicted_results, ss.settings['output_folder'])
    logging.info('results saved to %s' % ss.settings['output_folder'])
    if 'output_file' in ss.settings:
        d2ann = {}
        for d in predicted_results:
            d2ann[d] = [{'label': t['label'], 'ann': t['ann'].to_dict()}
                        for t in predicted_results[d]]
        utils.save_json_array(d2ann, ss.settings['output_file'])
def output_phenotypes(phenotype_file, phenotype_performance, c_map_file, output_file):
    p = utils.load_json_data(phenotype_file)
    c_map = utils.load_json_data(c_map_file)
    new_p = {}
    p_lines = utils.read_text_file(phenotype_performance)
    for l in p_lines[1:]:
        arr = l.split('\t')
        new_p[arr[0]] = p[arr[0]]
        pt = new_p[arr[0]]
        concepts = pt['concepts']
        pt['concepts'] = {}
        pt['prevalence'] = 0
        for c in concepts:
            pt['concepts'][c] = 0 if c not in c_map else c_map[c]['freq']
            pt['prevalence'] += pt['concepts'][c]
    utils.save_json_array(new_p, output_file)
    print 'new data saved to %s' % output_file
def add_concept_level_freqs(data_folder, c_map_file):
    reg_p = re.compile(".*annotations\\.csv")
    c_map = utils.load_json_data(c_map_file)
    for f in listdir(data_folder):
        if reg_p is not None:
            m = reg_p.match(f)
            if m is not None:
                print '%s matched, reading...' % f
                lines = utils.read_text_file(join(data_folder, f))
                for l in lines:
                    arr = l.split('\t')
                    if arr[0] not in c_map:
                        continue
                    if 'freq' not in c_map[arr[0]]:
                        c_map[arr[0]]['freq'] = 0
                    c_map[arr[0]]['freq'] += int(arr[1])
    utils.save_json_array(c_map, c_map_file)
def populate_phenotype_validation_results(phenotype_def_file, complete_validation_file,
                                          c_map_file, output_file):
    c_map = populate_concept_level_performance(complete_validation_file, c_map_file)
    phenotypes = utils.load_json_data(phenotype_def_file)
    for p_name in phenotypes:
        p = phenotypes[p_name]
        p['validation'] = {}
        for c in p['concepts']:
            if c not in c_map:
                continue
            for label in c_map[c]:
                if label in p['validation']:
                    p['validation'][label] += c_map[c][label]
                else:
                    p['validation'][label] = c_map[c][label]
    utils.save_json_array(phenotypes, output_file)
    print 'done'
def get_what_is_changing(ann_folder, text_folder, output_file, eHostAnnFile=True):
    """
    get what is getting better/worse
    :param ann_folder:
    :param text_folder:
    :param output_file:
    :param eHostAnnFile:
    :return:
    """
    nlp = rr.get_nlp_instance()
    files = [f for f in listdir(ann_folder) if isfile(join(ann_folder, f))]
    type2abstractions = {}
    for f in files:
        anns = []
        text_file = join(text_folder, f[0:-14])
        if eHostAnnFile:
            d = eHostAnnDoc(join(ann_folder, f))
            anns = d.get_ess_entities(no_context=True)
        else:
            d = eHostGenedDoc(join(ann_folder, f))
            anns = d.get_ess_entities()
        if len(anns) == 0:
            logging.info('anns is empty for [{:s}]'.format(f))
        text = utils.read_text_file_as_string(join(text_folder, f[0:-14]), encoding='cp1252')
        sents = rr.get_sentences_as_anns(nlp, text)
        for ann in anns:
            for s in sents:
                if ann.overlap(s):
                    abss = rr.AbstractedSentence(1)
                    abss.text = s.str
                    result = abss.get_abstaction_by_pos(abss.locate_pos(ann.str), nlp)
                    if result is None:
                        logging.info('%s not found in %s' % (ann.str, f))
                        continue
                    type = ann.label
                    if type not in type2abstractions:
                        type2abstractions[type] = []
                    type2abstractions[type].append(result.to_dict())
    logging.debug(type2abstractions)
    utils.save_json_array(type2abstractions, output_file)
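# A minimal usage sketch (folder names are illustrative; eHOST annotation files are assumed to be
# named '<doc>.txt.knowtator.xml', which is why the code strips the last 14 characters to recover
# the document file name):
#
#   get_what_is_changing('./corpus/saved',      # eHOST .knowtator.xml annotation files
#                        './corpus/corpus',     # the matching plain-text documents
#                        './results/type_to_abstractions.json',
#                        eHostAnnFile=True)
#
# The output groups sentence-level abstractions by annotation label, so downstream analysis can
# inspect what is described as getting better or worse for each annotated type.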
def encode_text(ann_ctxs, word_to_index_file=None):
    # Tokenize the sentences into words
    tokenized_sentences = [nltk.word_tokenize(' '.join(ctx['prev'] + ctx['next']).lower())
                           for ctx in ann_ctxs]
    # Count the word frequencies
    word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    print "Found %d unique word tokens." % len(word_freq.items())
    # Get the most common words and build index_to_word and word_to_index vectors
    vocab = word_freq.most_common(vocabulary_size - len(ann_type_tokens))
    index_to_word = [x[0] for x in vocab]
    index_to_word += ann_type_tokens
    word_to_index = dict([(w, i) for i, w in enumerate(index_to_word)])
    if word_to_index_file is not None:
        utils.save_json_array(word_to_index, word_to_index_file)
    return word_to_index
def analyse_doc_anns(ann_doc_path, rule_executor, text_reader, output_folder,
                     fn_pattern='se_ann_%s.json', study_analyzer=None):
    p, fn = split(ann_doc_path)
    file_key = fn[:fn.index('.')]
    json_doc = utils.load_json_data(ann_doc_path)
    ann_doc = SemEHRAnnDoc()
    ann_doc.load(json_doc, file_key=file_key)
    text = text_reader.read_full_text(ann_doc.file_key)
    if text is None:
        logging.error('file [%s] full text not found' % ann_doc.file_key)
        return
    reader = WrapperTextReader(text)
    process_doc_rule(ann_doc, rule_executor, reader, None, study_analyzer)
    utils.save_json_array(ann_doc.serialise_json(),
                          join(output_folder, fn_pattern % ann_doc.file_key))
    return ann_doc.serialise_json()
def parse_disease_phenotypes(disease_phenotype_csv, disease_model_json):
    lines = utils.read_text_file(disease_phenotype_csv)
    dis_to_data = {}
    for l in lines[1:]:
        arr = l.split(',')
        lv4_id = arr[4]
        lv4_disease = arr[5]
        hpo_label = arr[7]
        hpo_id = arr[8]
        test = arr[9]
        test_id = arr[10]
        dis_data = []
        if lv4_disease in dis_to_data:
            dis_data = dis_to_data[lv4_disease]
        else:
            dis_to_data[lv4_disease] = dis_data
        if len(test.strip()) > 0:
            dis_data.append({'test': test, 'test_id': test_id})
        else:
            dis_data.append({'hpo_label': hpo_label, 'hpo_id': hpo_id})
    utils.save_json_array(dis_to_data, disease_model_json)
def populate_concept_level_performance(complete_validation_file, c_map_file):
    if isfile(c_map_file):
        return utils.load_json_data(c_map_file)
    lines = utils.read_text_file(complete_validation_file)
    concept2label = {}
    for l in lines[1:]:
        arr = l.split('\t')
        label = arr[2]
        concept = arr[8]
        c_map = None
        if concept not in concept2label:
            c_map = {}
            concept2label[concept] = c_map
        else:
            c_map = concept2label[concept]
        if label not in c_map:
            c_map[label] = 1
        else:
            c_map[label] += 1
    utils.save_json_array(concept2label, c_map_file)
    return concept2label
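# A minimal usage sketch (paths are illustrative): the function is memoised on disk, so the
# concept -> label counts are only rebuilt when the cached JSON file does not yet exist.
#
#   c_map = populate_concept_level_performance('./validation/complete_validation.tsv',
#                                              './validation/concept_label_counts.json')
#   # e.g. c_map['C0011849'] might look like {'posM': 120, 'negM': 15, 'hisM': 30}
#
# Delete the cache file to force recomputation from the validation TSV (0-based column 2 = label,
# column 8 = concept id, as read above).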
def convert_csv_annotations(csv_file, text_folder, ann_folder, mapping_file, annotated_anns_file,
                            id_pattern='%s-%s', ann_file_pattern='%s.txt.knowtator.xml'):
    with open(csv_file, newline='') as cf:
        reader = csv.DictReader(cf)
        label2concepts = {}
        d2annotated_anns = {}
        for r in reader:
            d2annotated_anns[r['doc_id'] + ".txt"] = [{'s': r['start'], 'e': r['end']}]
            if r['Skip Document'] != 'Yes':
                utils.save_string(r['text'], join(text_folder, r['doc_id'] + ".txt"))
                elem_annotations = ET.Element("annotations")
                elem_annotations.set('textSource', r['doc_id'])
                mention_id = id_pattern % (r['doc_id'], 0)
                if r['Correct'] == 'Yes' and r['Negation'] == 'NOT Negated':
                    AnnConverter.create_elem_ann(elem_annotations, mention_id,
                                                 r['start'], r['end'],
                                                 r['string_orig'], r['icd10-ch'])
                xml = ET.tostring(elem_annotations, encoding='unicode', method='xml')
                utils.save_string(xml, join(ann_folder, ann_file_pattern % r['doc_id']))
                if r['icd10-ch'] not in label2concepts:
                    label2concepts[r['icd10-ch']] = []
                if r['cui'] not in label2concepts[r['icd10-ch']]:
                    label2concepts[r['icd10-ch']].append(r['cui'])
        utils.save_json_array(label2concepts, mapping_file)
        utils.save_json_array(d2annotated_anns, annotated_anns_file)
def query_all_concepts():
    total = 0
    docs = []
    concept2query = utils.load_json_data('./resources/mimir_queries.json')
    for c in concept2query:
        print 'querying %s' % c
        r = query_mimir('postQuery', {'queryString': concept2query[c]})
        qid = get_xml_data(r, 'm:data/m:queryId', mimir_ns)
        print 'query id: %s' % qid
        r = query_mimir('documentsCount', {'queryId': qid})
        document_count = get_xml_data(r, 'm:data/m:value', mimir_ns)
        print 'documentCount: %s' % document_count
        if document_count != '':
            document_count = int(document_count)
            if document_count > 0:
                total += document_count
                docs.append(random_pick_results(c, qid, document_count, min(5, document_count)))
                print 'random picked %s' % c
    utils.save_json_array(docs, './samples/samples.json')
    print 'total docs: %s' % total
def encode_ann_ctx(dic_file, ann_ctx_file, output_file=None):
    word_to_index = utils.load_json_data(dic_file)
    ann_ctxs = utils.load_json_data(ann_ctx_file)
    encoded = []
    for ann in ann_ctxs:
        encoded.append({
            'prev': [word_to_index[w.lower()]
                     for w in nltk.word_tokenize(' '.join(ann['prev']).lower())],
            'next': [word_to_index[w.lower()]
                     for w in nltk.word_tokenize(' '.join(ann['next']).lower())],
            'label': ann['label'],
            'annId': ann['annId'],
            'label_encoded': word_to_index[ann['label']]
        })
    if output_file is not None:
        utils.save_json_array(encoded, output_file)
    return encoded
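# A minimal pipeline sketch (file paths are illustrative; `d2anns` is assumed to be the
# doc-id -> annotations mapping expected by encode_doc_anns above):
#
#   ann_ctxs = encode_doc_anns(d2anns, ann_ctx_file='./cache/ann_contexts.json')
#   encode_text(ann_ctxs, word_to_index_file='./cache/word_to_index.json')
#   encoded = encode_ann_ctx('./cache/word_to_index.json',
#                            './cache/ann_contexts.json',
#                            output_file='./cache/encoded_contexts.json')
#
# Note that encode_ann_ctx will raise a KeyError for tokens outside the vocabulary built by
# encode_text, since the dictionary is capped at vocabulary_size entries.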