Code Example #1
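# Load per-range language-pattern statistics from a JSON file and score the
# patterns of one category (cat) for each range; either plot the scores as
# grouped bars (go and py are presumably plotly's graph_objs and plotting
# modules) or, when score_output_file is given, dump the raw scores instead.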
def visualise_lp_ranged_stats(stat_file,
                              cat,
                              title,
                              skips=None,
                              score_output_file=None):
    r2stats = utils.load_json_data(stat_file)
    data = []
    data2save = {}
    for r in r2stats:
        stats = r2stats[r]
        total_normal = stats['s_nm']
        total_highlights = stats['s_ht']
        keys, scores = score_language_patterns(stats['nm'][cat],
                                               stats['ht'][cat], total_normal,
                                               total_highlights)
        if score_output_file is None:
            trace1 = go.Bar(x=keys, y=scores, name=r)
            data.append(trace1)
        else:
            data2save[r] = dict(zip(keys, scores))
    if score_output_file is None:
        layout = go.Layout(barmode='group', title=title)

        fig = go.Figure(data=data, layout=layout)
        py.plot(fig, filename='language pattern ranged stats - ' + cat)
    else:
        utils.save_json_array(data2save, score_output_file)
Code Example #2
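    # Callback run once all scoring threads have finished: keep sentences
    # whose total score reaches the threshold, group them by category, drop
    # the 'general' category when 'goal' and 'method' are both present, trim
    # each category to its top-k sentences, then persist scores and summary.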
    def finish_multithread_summ(src, sids, score_dict, scores_list, score_file,
                                sum_file):
        threshold = 1
        summary = {}
        for s in scores_list:
            scores = s['scores']
            sent = s['sent']
            cat = s['cat']
            if scores['total'] < threshold:
                continue
            summary.setdefault(cat, []).append((sent, scores))
        if 'goal' in summary and 'method' in summary and 'general' in summary:
            summary.pop('general', None)

        num_sents_per_cat = 2
        for cat in summary:
            top_k = 100 if cat == 'findings' else num_sents_per_cat
            summary[cat] = HighLighter.pick_top_k(summary[cat], top_k)
        print(json.dumps(summary))

        utils.save_json_array([so['scores'] for so in scores_list], score_file)
        utils.save_json_array(summary, sum_file)
        return summary, scores_list
Code Example #3
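# Reset the 'marked' spans of an annotation file, merge in the highlights
# read from the matching *_ht.json file, and write the result back in place.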
def test_merge_highlight():
    ht_file = './30-test-papers/11274654_ht.json'
    ann_file = './30-test-papers/11274654_annotated_ann.json'
    ann = util.load_json_data(ann_file)
    for a in ann:
        if 'marked' in a:
            a['marked'] = []
    ht = read_highlights_json(ht_file)
    merge_highlights(ann, ht)
    util.save_json_array(ann, ann_file)
Code Example #4
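# Flag abstract sentences in the companion *_ann.json file: read the largest
# sentence id inside the XML <abstract> element and mark every annotation
# whose sid falls within that range.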
def append_abstract_label(xml_file):
    p, f = os.path.split(xml_file)
    ann_file = os.path.join(p, f[:f.rfind('.')] + '_ann.json')
    tree = ET.parse(xml_file)
    root = tree.getroot()
    abstracts = root.findall(".//abstract")
    if len(abstracts) > 0:
        ab_sents = abstracts[0].findall("s")
        max_ab_sid = int(ab_sents[-1].attrib['sid'])
        if max_ab_sid >= 0:
            anns = util.load_json_data(ann_file)
            for ann in anns:
                if int(ann['sid']) <= max_ab_sid:
                    ann['abstract-title'] = True
            util.save_json_array(anns, ann_file)
Code Example #5
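# Summarise one annotated paper: reuse per-sentence scores cached in an
# existing *_scores.json file where available, run the highlighter over all
# sentences, then write the scores and the '.sum' summary into out_path.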
def summ(highlighter, ann_file, out_path):
    anns = utils.load_json_data(ann_file)
    p, fn = split(ann_file)
    score_file = join(out_path, fn[:fn.rfind('.')] + '_scores.json')
    sid_to_score = {}
    if isfile(score_file):
        # Reuse per-sentence scores cached by a previous run.
        stored_scores = utils.load_json_data(score_file)
        for score in stored_scores:
            sid_to_score[score['sid']] = score

    summary, scores = highlighter.summarise([s['text'] for s in anns],
                                            src=ann_file,
                                            sids=[s['sid'] for s in anns],
                                            score_dict=sid_to_score)
    utils.save_json_array(scores, score_file)
    utils.save_json_array(summary, join(out_path, fn[:fn.rfind('.')] + '.sum'))
Code Example #6
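# Fetch the full text of a PMC paper (skipping the download when its
# *_ann.json file already exists), split it into sentences with NLTK's punkt
# tokenizer, save the numbered sentence list, and pass it on to
# update_paper_fulltext; papers with no retrievable text are recorded in
# not_available.txt.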
def process_pmc_paper(pmcid, job_path, job_id):
    ann_file = join(job_path, pmcid + '_ann.json')
    if exists(ann_file):
        print('%s exists, skipping download' % ann_file)
        update_paper_fulltext(pmcid, utils.load_json_data(ann_file))
        return
    t = get_pmc_paper_fulltext(pmcid)
    if t is None or len(t) == 0:
        utils.append_text_file(pmcid, join(job_path, 'not_available.txt'))
    else:
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_detector.tokenize(t.strip())
        if job_path is not None:
            fulltext = [{
                'text': sents[i],
                'sid': str(i + 1)
            } for i in range(len(sents))]
            utils.save_json_array(fulltext, ann_file)
            update_paper_fulltext(pmcid, fulltext)
Code Example #7
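# Collect every sentence labelled 'general' across the annotated papers
# listed in geo_features.json and save their text (plus any 'marked' spans)
# as the general-highlights training set.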
def get_general_highlights():
    geos = utils.load_json_data('./training/geo_features.json')
    sents = []
    for g in geos:
        f_ann = g['id']
        sids = []
        for sid in g['sid_cat']:
            if g['sid_cat'][sid] == 'general':
                sids.append(sid)
        if len(sids) > 0:
            anns = utils.load_json_data(f_ann)
            for ann in anns:
                if ann['sid'] in sids:
                    sents.append({
                        'text': ann['text'],
                        'marked': ann['marked'] if 'marked' in ann else ''
                    })
    utils.save_json_array(sents, './training/general_highlights.json')
Code Example #8
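# Post-processing callback: group per-document pattern statistics by range
# key, then merge the frequency tables and sentence counts of the
# highlighted ('ht') and normal ('nm') groups within each range.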
def pp_pattern_stats(container, out_file, hter):
    range2stats = {}
    for stats in container:
        for k in stats:
            range2stats[k] = [stats[k]] + range2stats.get(k, [])

    range2merged = {}
    for r in range2stats:
        merged = {
            'ht': {
                'sp': {},
                'ne': {},
                'cd': {},
                'sp_breakdown': {}
            },
            'nm': {
                'sp': {},
                'ne': {},
                'cd': {},
                'sp_breakdown': {}
            },
            's_ht': 0,
            's_nm': 0
        }
        for stats in range2stats[r]:
            for grp in ('ht', 'nm'):
                for key in ('sp', 'ne', 'cd', 'sp_breakdown'):
                    merge_key_freq(merged, stats, grp, key)
            merged['s_ht'] += stats['s_ht']
            merged['s_nm'] += stats['s_nm']
        range2merged[r] = merged
    utils.save_json_array(range2merged, out_file)
Code Example #9
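# Single-file variant of the ranged visualisation above: score the language
# patterns of one category and either plot a plotly bar chart or dump the
# key-to-score mapping to a JSON file.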
def visualise_lp_stats(stat_file,
                       cat,
                       title,
                       skips=None,
                       score_output_file=None):
    stats = utils.load_json_data(stat_file)
    total_normal = stats['s_nm']
    total_highlights = stats['s_ht']
    keys, scores = score_language_patterns(stats['nm'][cat], stats['ht'][cat],
                                           total_normal, total_highlights)
    if score_output_file is None:
        trace1 = go.Bar(x=keys,
                        y=scores,
                        name='Highlighted Sentences / Other Sentences')
        data = [trace1]
        layout = go.Layout(barmode='group', title=title)

        fig = go.Figure(data=data, layout=layout)
        py.plot(fig, filename='language pattern stats - ' + cat)
    else:
        data = dict(zip(keys, scores))
        utils.save_json_array(data, score_output_file)
Code Example #10
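# Score every sentence of a test file concurrently, using one HighLighter
# instance per thread and a callback that saves the accumulated scores once
# all threads complete.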
def do_highlight(test_file):
    thread_nums = 5
    hters = []
    for i in range(thread_nums):
        print('initialising highlighter instance...')
        hters.append(HighLighter.get_instance())
        print('highlighter instance initialised')
    data = None
    with codecs.open(test_file, encoding='utf-8') as rf:
        data = json.load(rf)
    scores = []
    out_file = test_file[:test_file.rfind('.')] + "_scores.json"
    print('multithreading...')
    utils.multi_thread_tasking(
        data,
        thread_nums,
        score_sentence,
        args=[scores, out_file],
        thread_wise_objs=hters,
        callback_func=lambda hl, s, of: utils.save_json_array(s, of))
    print('multithreading started')
Code Example #11
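# Post-processing callback that simply persists the collected highlight
# results; threshold and manual_ann are unused in this excerpt.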
def pp_highlight(container, out_file, hter, threshold, manual_ann):
    utils.save_json_array(container, out_file)
Code Example #12
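# Post-processing callback: print and save the extracted geometric features.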
def post_process_geometric_analysis(container, output_file, hter):
    print(json.dumps(container))
    utils.save_json_array(container, output_file)
    print('geometric features of all annotations extracted and saved')
Code Example #13
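# Blank out the sentence text of an annotation file in place, keeping the
# rest of each annotation record intact.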
def remove_ann_sentences(ann_file):
    anns = utils.load_json_data(ann_file)
    for ann in anns:
        ann['text'] = ''
    utils.save_json_array(anns, ann_file)
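
Every example above persists its result through the project's utils.save_json_array helper (util.save_json_array in some files) and reads input back with utils.load_json_data. The project source for these helpers is not shown here, so the following is only a minimal sketch of what such a pair typically looks like, assuming thin wrappers around the standard json module:

import codecs
import json


def save_json_array(data, file_path):
    # Serialise a JSON-compatible object (list or dict) to a UTF-8 file.
    # Sketch only: the real helper may differ in encoding or formatting.
    with codecs.open(file_path, 'w', encoding='utf-8') as wf:
        json.dump(data, wf)


def load_json_data(file_path):
    # Parse a JSON document back from a UTF-8 file.
    with codecs.open(file_path, encoding='utf-8') as rf:
        return json.load(rf)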