def visualise_lp_ranged_stats(stat_file, cat, title, skips=None, score_output_file=None):
    """Visualise (or export) language-pattern scores per range for one category.

    Loads per-range stats from stat_file and scores category `cat` for each
    range.  When score_output_file is None the per-range scores are plotted
    as grouped plotly bars titled `title`; otherwise the scores are saved to
    that file as JSON keyed by range then pattern key.  `skips` is accepted
    for signature compatibility and not used here.
    """
    range_stats = utils.load_json_data(stat_file)
    traces = []
    exported = {}
    for rng in range_stats:
        s = range_stats[rng]
        nm_total = s['s_nm']
        ht_total = s['s_ht']
        keys, scores = score_language_patterns(s['nm'][cat], s['ht'][cat],
                                               nm_total, ht_total)
        if score_output_file is None:
            traces.append(go.Bar(x=keys, y=scores, name=rng))
        else:
            exported[rng] = {}
            for i, k in enumerate(keys):
                exported[rng][k] = scores[i]
    if score_output_file is None:
        fig = go.Figure(data=traces,
                        layout=go.Layout(barmode='group', title=title))
        py.plot(fig, filename='language pattern ranged stats - ' + cat)
    else:
        utils.save_json_array(exported, score_output_file)
def finish_multithread_summ(src, sids, score_dict, scores_list, score_file, sum_file): threshold = 1 summary = {} for s in scores_list: scores = s['scores'] sent = s['sent'] cat = s['cat'] if scores['total'] < threshold: continue summary[cat] = [ (sent, scores) ] if cat not in summary else summary[cat] + [(sent, scores)] if 'goal' in summary and 'method' in summary and 'general' in summary: summary.pop('general', None) num_sents_per_cat = 2 for cat in summary: summary[cat] = HighLighter.pick_top_k(summary[cat], 100) if cat == 'findings' \ else HighLighter.pick_top_k(summary[cat], num_sents_per_cat) print json.dumps(summary) utils.save_json_array([so['scores'] for so in scores_list], score_file) utils.save_json_array(summary, sum_file) return summary, scores_list
def test_merge_highlight():
    """Ad-hoc check: merge highlights into one sample paper's annotations.

    Clears any existing 'marked' lists in the annotation file, merges the
    highlights from the companion _ht.json, and writes the result back.
    """
    ht_file = './30-test-papers/11274654_ht.json'
    ann_file = './30-test-papers/11274654_annotated_ann.json'
    anns = util.load_json_data(ann_file)
    for entry in anns:
        if 'marked' in entry:
            entry['marked'] = []
    highlights = read_highlights_json(ht_file)
    merge_highlights(anns, highlights)
    util.save_json_array(anns, ann_file)
def append_abstract_label(xml_file):
    """Flag abstract/title sentences in the companion _ann.json of xml_file.

    Reads the paper XML, finds the highest sentence id inside the first
    <abstract> element, then marks every annotation with sid at or below
    that id with 'abstract-title': True and saves the annotation file.
    """
    folder, fname = os.path.split(xml_file)
    ann_file = os.path.join(folder, fname[:fname.rfind('.')] + '_ann.json')
    root = ET.parse(xml_file).getroot()
    abstracts = root.findall(".//abstract")
    if len(abstracts) > 0:
        ab_sents = abstracts[0].findall("s")
        last_abstract_sid = int(ab_sents[-1].attrib['sid'])
        if last_abstract_sid >= 0:
            anns = util.load_json_data(ann_file)
            for ann in anns:
                if int(ann['sid']) <= last_abstract_sid:
                    ann['abstract-title'] = True
            util.save_json_array(anns, ann_file)
def summ(highlighter, ann_file, out_path):
    """Summarise one annotation file, reusing cached per-sentence scores.

    Loads sentence annotations from ann_file; if a previously saved
    <name>_scores.json exists under out_path, its scores are passed to the
    highlighter (keyed by sid) so they are not recomputed.  The (possibly
    refreshed) scores and the summary are written under out_path.
    """
    anns = utils.load_json_data(ann_file)
    p, fn = split(ann_file)
    base = fn[:fn.rfind('.')]
    score_file = join(out_path, base + '_scores.json')
    # Map sid -> stored score object when a score cache exists.
    # (Fixed: removed a dead loop counter that was incremented but never read,
    # and dropped stale commented-out code.)
    sid_to_score = {}
    if isfile(score_file):
        for score in utils.load_json_data(score_file):
            sid_to_score[score['sid']] = score
    summary, scores = highlighter.summarise(
        [s['text'] for s in anns],
        src=ann_file,
        sids=[s['sid'] for s in anns],
        score_dict=sid_to_score)
    utils.save_json_array(scores, score_file)
    utils.save_json_array(summary, join(out_path, base + '.sum'))
def process_pmc_paper(pmcid, job_path, job_id): ann_file = join(job_path, pmcid + '_ann.json') if exists(ann_file): print '%s exists, skipping download' % ann_file update_paper_fulltext(pmcid, utils.load_json_data(ann_file)) return t = get_pmc_paper_fulltext(pmcid) if t is None or len(t) == 0: utils.append_text_file(pmcid, join(job_path, 'not_available.txt')) else: sent_detector = nltk.data.load('tokenizers/punkt/english.pickle') sents = sent_detector.tokenize(t.strip()) if job_path is not None: fulltext = [{ 'text': sents[i], 'sid': str(i + 1) } for i in range(len(sents))] utils.save_json_array(fulltext, ann_file) update_paper_fulltext(pmcid, fulltext)
def get_general_highlights():
    """Collect all sentences categorised as 'general' across the geo features.

    For every entry in geo_features.json, gathers the sids labelled
    'general', pulls the matching sentences (with any markings) from the
    entry's annotation file, and saves them to general_highlights.json.
    """
    geos = utils.load_json_data('./training/geo_features.json')
    collected = []
    for geo in geos:
        ann_file = geo['id']
        general_sids = [sid for sid in geo['sid_cat']
                        if geo['sid_cat'][sid] == 'general']
        if len(general_sids) == 0:
            continue
        for ann in utils.load_json_data(ann_file):
            if ann['sid'] in general_sids:
                collected.append({
                    'text': ann['text'],
                    'marked': ann['marked'] if 'marked' in ann else ''
                })
    utils.save_json_array(collected, './training/general_highlights.json')
def pp_pattern_stats(container, out_file, hter):
    """Merge per-worker, per-range language-pattern stats and save them.

    container holds one stats dict per worker; each maps a range key to
    frequency tables.  All tables sharing a range key are merged (frequency
    counts summed via merge_key_freq, sentence totals added) and the result
    is written to out_file.  hter is accepted for callback-signature
    compatibility but unused.
    """
    grouped = {}
    for stats in container:
        for key in stats:
            # Prepend to mirror the original accumulation order.
            grouped[key] = [stats[key]] + grouped.get(key, [])
    merged_by_range = {}
    for rng in grouped:
        merged = {
            'ht': {'sp': {}, 'ne': {}, 'cd': {}, 'sp_breakdown': {}},
            'nm': {'sp': {}, 'ne': {}, 'cd': {}, 'sp_breakdown': {}},
            's_ht': 0,
            's_nm': 0
        }
        for stats in grouped[rng]:
            for side in ('ht', 'nm'):
                for table in ('sp', 'ne', 'cd', 'sp_breakdown'):
                    merge_key_freq(merged, stats, side, table)
            merged['s_ht'] += stats['s_ht']
            merged['s_nm'] += stats['s_nm']
        merged_by_range[rng] = merged
    utils.save_json_array(merged_by_range, out_file)
def visualise_lp_stats(stat_file, cat, title, skips=None, score_output_file=None):
    """Plot language-pattern scores for one category, or dump them to JSON.

    Scores category `cat` from the stats in stat_file.  When
    score_output_file is None the scores are plotted as a plotly bar chart
    titled `title`; otherwise a key->score mapping is saved to that file.
    `skips` is accepted for signature compatibility and not used here.
    """
    stats = utils.load_json_data(stat_file)
    keys, scores = score_language_patterns(
        stats['nm'][cat], stats['ht'][cat], stats['s_nm'], stats['s_ht'])
    if score_output_file is not None:
        key2score = {}
        for i, k in enumerate(keys):
            key2score[k] = scores[i]
        utils.save_json_array(key2score, score_output_file)
        return
    bar = go.Bar(x=keys, y=scores,
                 name='Highlighted Sentences / Other Sentences')
    fig = go.Figure(data=[bar],
                    layout=go.Layout(barmode='group', title=title))
    py.plot(fig, filename='language pattern stats - ' + cat)
def do_highlight(test_file):
    """Score every sentence of test_file in parallel with 5 highlighters.

    One HighLighter instance is created per worker thread; results are
    appended to a shared list and persisted to <test_file>_scores.json by
    the completion callback.
    """
    thread_nums = 5
    hters = []
    for _ in range(thread_nums):
        print('initialising highlighter instance...')
        hters.append(HighLighter.get_instance())
        print('highlighter instance initialised')
    with codecs.open(test_file, encoding='utf-8') as rf:
        data = json.load(rf)
    scores = []
    out_file = test_file[:test_file.rfind('.')] + "_scores.json"
    print('multithreading...')
    utils.multi_thread_tasking(
        data, thread_nums, score_sentence,
        args=[scores, out_file],
        thread_wise_objs=hters,
        callback_func=lambda hl, s, of: utils.save_json_array(s, of))
    print('multithreading started')
def pp_highlight(container, out_file, hter, threshold, manual_ann):
    """Persist the collected highlight results to out_file.

    Only container and out_file are used; the remaining parameters exist to
    satisfy the post-processing callback signature.
    """
    utils.save_json_array(container, out_file)
def post_process_geometric_analysis(container, output_file, hter): print json.dumps(container) utils.save_json_array(container, output_file) print 'geometric features of all annotations extracted and saved'
def remove_ann_sentences(ann_file):
    """Blank out the sentence text of every annotation in ann_file, in place.

    Keeps the annotation structure (sids and other fields) but clears each
    'text' value, then overwrites the original file.
    """
    anns = utils.load_json_data(ann_file)
    for entry in anns:
        entry['text'] = ''
    utils.save_json_array(anns, ann_file)