Code example #1
def geometric_analysis(ann_file, container, out_file, highlighter):
    p, fn = os.path.split(ann_file)

    score_file = os.path.join('./summaries/',
                              fn[0:fn.rfind('.')] + '_scores.json')
    scores = utils.load_json_data(score_file)
    sent_scores = {}
    for s in scores:
        sent_scores[s['sid']] = s
    anns = utils.load_json_data(ann_file)
    ht_obj = {
        'total': len(anns),
        'ht_sids': [],
        'sect_dict': {},
        'sects': {},
        'page_dict': {},
        'total_page': 0,
        'id': ann_file,
        'sid_cat': {}
    }
    sect = ''
    last_sid = ''
    for ann in anns:
        if 'marked' in ann and len(ann['marked']) > 0:
            ht_obj['ht_sids'].append(ann['sid'])
            if 'struct' in ann:
                ht_obj['sect_dict'][ann['struct']] = [ann['sid']] if ann['struct'] not in ht_obj['sect_dict'] else \
                    ht_obj['sect_dict'][ann['struct']] + [ann['sid']]
            if 'page' in ann:
                ht_obj['page_dict'][ann['page']] = [ann['sid']] if ann['page'] not in ht_obj['page_dict'] else \
                    ht_obj['page_dict'][ann['page']] + [ann['sid']]
            ht_obj['sid_cat'][ann['sid']] = highlighter.get_sentence_cat_bd(
                sent_scores[ann['sid']])
        if 'page' in ann:
            ht_obj['total_page'] = ann['page']
        if 'struct' in ann and ann['struct'] != sect:
            if sect.strip() != '':
                ht_obj['sects'][sect]['end'] = last_sid
            sect = ann['struct']
            # 'start' records the section's first sentence id
            ht_obj['sects'][ann['struct']] = {'start': ann['sid']}
        last_sid = ann['sid']
        if int(ann['sid']) > ht_obj['total']:
            ht_obj['total'] = int(ann['sid'])

    if sect.strip() != '':
        ht_obj['sects'][sect]['end'] = last_sid
    sum_file = os.path.join('./summaries/', fn[0:fn.rfind('.')] + '.sum')
    # avoid shadowing the built-in sum()
    sum_data = utils.load_json_data(sum_file)
    ht_obj['journal'] = sum_data['journal'] if 'journal' in sum_data else 'J.'
    container.append(ht_obj)
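
All of these snippets come from the same project and rely on a shared utils module (plus os, json, and plotly's go/py) that the excerpts do not show. A minimal sketch of the two JSON helpers, inferred from their call sites here (an assumption, not the project's actual code):

import codecs
import json

def load_json_data(file_path):
    # assumed behaviour: read a UTF-8 JSON file and return the parsed object
    with codecs.open(file_path, encoding='utf-8') as f:
        return json.load(f)

def save_json_array(obj, file_path):
    # assumed behaviour: write any JSON-serialisable object to file
    with codecs.open(file_path, 'w', encoding='utf-8') as f:
        json.dump(obj, f)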
Code example #2
def get_ncbo_stats(ann_file, container):
    anns = utils.load_json_data(ann_file)
    onto2freq = {'ht': {}, 'nm': {}}
    total_nm = 0
    total_ht = 0
    for ann in anns:
        if 'marked' in ann:
            total_ht += 1
        else:
            total_nm += 1
        if 'ncbo' in ann:
            matched_ontos = []
            for ncbo in ann['ncbo']:
                for name in pann.onto_name:
                    if name not in matched_ontos and ncbo['uri'].startswith(
                            pann.onto_name[name]):
                        matched_ontos.append(name)
                    if name in matched_ontos:
                        break
            # for name in matched_ontos:
            #     ctn = onto2freq['ht'] if 'marked' in ann else onto2freq['nm']
            #     ctn[name] = 1 if name not in ctn else 1 + ctn[name]
            if len(matched_ontos) > 0:
                comb = '-'.join(sorted(matched_ontos))
                ctn = onto2freq['ht'] if 'marked' in ann else onto2freq['nm']
                ctn[comb] = 1 if comb not in ctn else 1 + ctn[comb]
    container.append({
        'total_nm': total_nm,
        'total_ht': total_ht,
        'freqs': onto2freq
    })
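
The manual counting idiom above (ctn[comb] = 1 if comb not in ctn else 1 + ctn[comb]) recurs throughout these examples; collections.Counter expresses the same tally more compactly. A small self-contained equivalent:

from collections import Counter

# equivalent tally over some (is_highlighted, ontology-combination) pairs
pairs = [(True, 'CHEBI-GO'), (False, 'GO'), (True, 'CHEBI-GO')]
onto2freq = {'ht': Counter(), 'nm': Counter()}
for marked, comb in pairs:
    onto2freq['ht' if marked else 'nm'][comb] += 1  # missing keys default to 0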
Code example #3
def visualise_lp_ranged_stats(stat_file,
                              cat,
                              title,
                              skips=None,
                              score_output_file=None):
    r2stats = utils.load_json_data(stat_file)
    data = []
    data2save = {}
    for r in r2stats:
        stats = r2stats[r]
        total_normal = stats['s_nm']
        total_highlights = stats['s_ht']
        keys, scores = score_language_patterns(stats['nm'][cat],
                                               stats['ht'][cat], total_normal,
                                               total_highlights)
        if score_output_file is None:
            trace1 = go.Bar(x=keys, y=scores, name=r)
            data.append(trace1)
        else:
            data2save[r] = {}
            for i in range(len(keys)):
                data2save[r][keys[i]] = scores[i]
    if score_output_file is None:
        layout = go.Layout(barmode='group', title=title)

        fig = go.Figure(data=data, layout=layout)
        py.plot(fig, filename='language pattern ranged stats - ' + cat)
    else:
        utils.save_json_array(data2save, score_output_file)
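
score_language_patterns is called here and in visualise_lp_stats below but is not among these excerpts. Judging only from its call sites, a plausible reconstruction (hypothetical, not the project's code) scores each pattern by its relative frequency in highlighted versus normal sentences:

def score_language_patterns(nm_freq, ht_freq, total_nm, total_ht):
    # hypothetical: compare each pattern's rate among highlighted sentences
    # with its rate among normal sentences
    keys = sorted(set(nm_freq) | set(ht_freq))
    scores = []
    for k in keys:
        ht_rate = 1.0 * ht_freq.get(k, 0) / total_ht
        nm_rate = 1.0 * nm_freq.get(k, 0) / total_nm
        scores.append(ht_rate / nm_rate if nm_rate > 0 else ht_rate)
    return keys, scores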
Code example #4
def visualise_categorised_geometric(geo_feature_file, fn):
    gms = utils.load_json_data(geo_feature_file)
    journal2cat = {}
    journal2papers = {}
    # cat_trace = {}

    for paper in gms:
        # j = paper['journal']
        j = 'all'
        journal2cat[j] = {} if j not in journal2cat else journal2cat[j]
        cat_trace = journal2cat[j]
        journal2papers[j] = [j, 1] if j not in journal2papers else [
            j, 1 + journal2papers[j][1]
        ]
        sects = paper['sect_dict']
        sid_cat = paper['sid_cat']
        for y in sects:
            for x in sects[y]:
                cat = sid_cat[x]
                if cat in ['cardinal nouns', 'named entities', 'general']:
                    continue
                if cat not in cat_trace:
                    cat_trace[cat] = {'x': [], 'y': []}
                trace = cat_trace[cat]
                trace['x'].append(1.0 * int(x) / int(paper['total']))
                label_y = y.replace('deo:', '').replace('DoCO:', '').replace(
                    'BodyMatter', 'Others').replace('FrontMatter', 'Others')
                trace['y'].append(label_y)

    sorted_journals = sorted([journal2papers[j] for j in journal2papers],
                             cmp=lambda jp1, jp2: jp2[1] - jp1[1])
    print sorted_journals
    print len(sorted_journals)

    # selected_j = sorted_journals[1][0]
    selected_j = 'all'
    cat_trace = journal2cat[selected_j]  # skip the no-journal paper group
    traces = []
    for cat in cat_trace:
        traces.append(
            go.Scatter(x=cat_trace[cat]['x'],
                       y=cat_trace[cat]['y'],
                       mode='markers',
                       name=cat))
    # print traces
    layout = go.Layout(
        title=
        'highlights over spatial dimensions',  # selected_j + ' - language pattern breakdown',
        yaxis=dict(categoryorder='array',
                   categoryarray=[
                       'Introduction', 'Methods', 'Results', 'Discussion',
                       'Others'
                   ]))
    fig = go.Figure(data=traces, layout=layout)
    # py.plot(fig, filename=fn) # + ' - ' + selected_j)
    py.image.save_as({
        'data': traces,
        'layout': layout
    }, './results/spatial.pdf')
Code example #5
def paper_stat(ann_file, container):
    path, fn = utils.split(ann_file)
    sums = utils.load_json_data(
        utils.join('./20-test-papers/summaries/', fn[:fn.rfind('.')] + '.sum'))
    anns = utils.load_json_data(ann_file)
    total_ht = 0
    for ann in anns:
        if 'marked' in ann:
            total_ht += 1
    container.append({
        'f': ann_file,
        'ht': total_ht,
        'nm': len(anns) - total_ht,
        'total': len(anns),
        'PMID': sums['PMID'] if 'PMID' in sums else '',
        'Journal': sums['journal'] if 'journal' in sums else ''
    })
Code example #6
def get_sp_ne_associations(score_file, container):
    scores = utils.load_json_data(score_file)
    sp2ne = {}
    for s in scores:
        p = s['pattern']
        if 'sp_index' in p and p['sp_index'] > -1 and s['ne'] > 0:
            sp2ne[p['sp_index']] = 1 if p['sp_index'] not in sp2ne \
                else 1 + sp2ne[p['sp_index']]
    container.append(sp2ne)
Code example #7
def test_merge_highlight():
    ht_file = './30-test-papers/11274654_ht.json'
    ann_file = './30-test-papers/11274654_annotated_ann.json'
    ann = util.load_json_data(ann_file)
    for a in ann:
        if 'marked' in a:
            a['marked'] = []
    ht = read_highlights_json(ht_file)
    merge_highlights(ann, ht)
    util.save_json_array(ann, ann_file)
Code example #8
def summ(highlighter, ann_file, out_path):
    anns = utils.load_json_data(ann_file)
    p, fn = split(ann_file)
    score_file = join(out_path, fn[:fn.rfind('.')] + '_scores.json')
    sid_to_score = {}
    if isfile(score_file):
        stored_scores = utils.load_json_data(score_file)
        for score in stored_scores:
            sid_to_score[score['sid']] = score

    summary, scores = highlighter.summarise([s['text'] for s in anns],
                                            src=ann_file,
                                            sids=[s['sid'] for s in anns],
                                            score_dict=sid_to_score)
    # if not isfile(score_file):
    utils.save_json_array(scores, score_file)
    utils.save_json_array(summary, join(out_path, fn[:fn.rfind('.')] + '.sum'))
Code example #9
def get_general_highlights():
    geos = utils.load_json_data('./training/geo_features.json')
    sents = []
    for g in geos:
        f_ann = g['id']
        sids = []
        for sid in g['sid_cat']:
            if g['sid_cat'][sid] == 'general':
                sids.append(sid)
        if len(sids) > 0:
            anns = utils.load_json_data(f_ann)
            for ann in anns:
                if ann['sid'] in sids:
                    sents.append({
                        'text': ann['text'],
                        'marked': ann['marked'] if 'marked' in ann else ''
                    })
    utils.save_json_array(sents, './training/general_highlights.json')
Code example #10
def update_score_path_summ(score_path):
    # regenerate sum because of new score file after semantic fixing
    hter = ah.HighLighter.get_instance()
    sum_files = utils.filter_path_file(score_path, 'sum')
    for s in sum_files:
        pmcid = s[:s.rfind('_')]
        print join(score_path[:score_path.rfind('/summ')], pmcid + '_ann.json')
        ah.summ(
            hter,
            join(score_path[:score_path.rfind('/summ')], pmcid + '_ann.json'),
            score_path)
        update_paper_summ(pmcid, utils.load_json_data(join(score_path, s)))
        print 'paper %s summary uploaded' % pmcid
Code example #11
def summ_mt(ann_file, out_path):
    # test run nltk
    aa.extract_cd_nouns_nes('good 12 cn', {}, {})

    anns = utils.load_json_data(ann_file)
    p, fn = split(ann_file)
    score_file = join(out_path, fn[:fn.rfind('.')] + '_scores.json')
    sid_to_score = {}
    if isfile(score_file):
        stored_scores = utils.load_json_data(score_file)
        for score in stored_scores:
            sid_to_score[score['sid']] = score
    sum_file = join(out_path, fn[:fn.rfind('.')] + '.sum')
    HighLighter.multithread_summ([s['text'] for s in anns],
                                 6,
                                 score_file,
                                 sum_file,
                                 src=ann_file,
                                 sids=[s['sid'] for s in anns],
                                 score_dict=sid_to_score)
Code example #12
def load_ht_data(ann_file_path):
    score_files = [
        join(ann_file_path, f) for f in listdir(ann_file_path)
        if isfile(join(ann_file_path, f)) and f.endswith('_annotated_ann.json')
    ]
    sents = []
    for sf in score_files:
        sents += [{
            'text': so['text'],
            'class': 'ht' if 'marked' in so else 'nht'
        } for so in utils.load_json_data(sf)]
    print 'total #sents %s \n top 1 is %s' % (len(sents), sents[0])
    return sents
Code example #13
def get3DCords(score_file, container, out_file, hter):
    scores = utils.load_json_data(score_file)
    anns = utils.load_json_data(scores[0]['doc_id'])
    sids = []
    for ann in anns:
        if 'marked' in ann:
            sids.append(ann['sid'])
    for s in scores:
        if s['sid'] not in sids:
            continue
        cat = hter.get_sp_type(s)
        p = s['pattern']
        nes = sorted(set(p['nes']))
        cds = sorted(set(p['cds']))
        container.append({
            'x': cat,
            #                   'N/A' if 'sp_index' not in p or p['sp_index'] == -1 else \
            # '-'.join(p['sub'] if p['sub'] is not None else []) + ' ' + \
            # '-'.join(p['pred'] if p['pred'] is not None else []),
            'y': len(nes),
            'z': len(cds)
            # 'y': 'N/A' if len(p['nes']) == 0 else ' '.join(nes),
            # 'z': 'N/A' if len(p['cds']) == 0 else ' '.join(cds),
        })
Code example #14
def paper_language_pattern_dist(score_file, container, hter, out_file):
    scores = utils.load_json_data(score_file)
    anns = utils.load_json_data(scores[0]['doc_id'])

    b_marked = False
    hts = []
    for ann in anns:
        if 'marked' in ann:
            b_marked = True
            hts.append(ann['sid'])

    # keep only papers whose highlight count falls in the 10-15 band
    if not b_marked or not 10 <= len(hts) <= 15:
        return

    max_sid = int(scores[-1]['sid'])
    stat = {'ht': {}, 'all': {}, 'max_sid': max_sid}
    for s in scores:
        all_sp_types = []
        cat = hter.get_sp_type(s, all_types=all_sp_types)
        for t in all_sp_types:
            stat['all'][t] = 1 if t not in stat['all'] else 1 + stat['all'][t]
            if s['sid'] in hts:
                stat['ht'][t] = 1 if t not in stat['ht'] else 1 + stat['ht'][t]
        p = s['pattern']
        if len(p['nes']) > 0:
            t = 'NE'
            stat['all'][t] = 1 if t not in stat['all'] else 1 + stat['all'][t]
            if s['sid'] in hts:
                stat['ht'][t] = 1 if t not in stat['ht'] else 1 + stat['ht'][t]
        if len(p['cds']) > 0:
            t = 'CDS'
            stat['all'][t] = 1 if t not in stat['all'] else 1 + stat['all'][t]
            if s['sid'] in hts:
                stat['ht'][t] = 1 if t not in stat['ht'] else 1 + stat['ht'][t]
    container.append(stat)
Code example #15
def append_abstract_label(xml_file):
    p, f = os.path.split(xml_file)
    ann_file = os.path.join(p, f[:f.rfind('.')] + '_ann.json')
    tree = ET.parse(xml_file)
    root = tree.getroot()
    abstracts = root.findall(".//abstract")
    if len(abstracts) > 0:
        ab_sents = abstracts[0].findall("s")
        if len(ab_sents) > 0:  # guard against abstracts without <s> children
            max_ab_sid = int(ab_sents[-1].attrib['sid'])
            anns = util.load_json_data(ann_file)
            for ann in anns:
                if int(ann['sid']) <= max_ab_sid:
                    ann['abstract-title'] = True
            util.save_json_array(anns, ann_file)
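
A quick way to sanity-check the XPath logic is to run it against a tiny in-memory document; the structure below is only an illustration inferred from the findall calls above:

import xml.etree.ElementTree as ET

# minimal document with the expected <abstract><s sid="..."> shape (hypothetical)
root = ET.fromstring('<article><abstract>'
                     '<s sid="1">Background.</s><s sid="2">Aim.</s>'
                     '</abstract></article>')
abstracts = root.findall(".//abstract")
ab_sents = abstracts[0].findall("s")
print(int(ab_sents[-1].attrib['sid']))  # -> 2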
Code example #16
def compute_sp_type_statics():
    sp2ratio = {}
    stats = utils.load_json_data(
        './training/language_pattern_stats_ranged.json')
    total = 0
    for r in stats:
        total += stats[r]['s_ht'] + stats[r]['s_nm']
        for p in stats[r]['ht']['sp']:
            print p, stats[r]['ht']['sp'][p]
            sp2ratio[p] = stats[r]['ht']['sp'][p] if p not in sp2ratio \
                else stats[r]['ht']['sp'][p] + sp2ratio[p]

    print json.dumps(sp2ratio)
    for p in sp2ratio:
        sp2ratio[p] = sp2ratio[p] * 1.0 / total
    print json.dumps(sp2ratio)
Code example #17
def process_pmc_paper(pmcid, job_path, job_id):
    ann_file = join(job_path, pmcid + '_ann.json')
    if exists(ann_file):
        print '%s exists, skipping download' % ann_file
        update_paper_fulltext(pmcid, utils.load_json_data(ann_file))
        return
    t = get_pmc_paper_fulltext(pmcid)
    if t is None or len(t) == 0:
        utils.append_text_file(pmcid, join(job_path, 'not_available.txt'))
    else:
        sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        sents = sent_detector.tokenize(t.strip())
        if job_path is not None:
            fulltext = [{
                'text': sents[i],
                'sid': str(i + 1)
            } for i in range(len(sents))]
            utils.save_json_array(fulltext, ann_file)
            update_paper_fulltext(pmcid, fulltext)
Code example #18
def compute_sp_type_regioned_weights():
    sp2ratio = {}
    stats = utils.load_json_data(
        './training/language_pattern_stats_ranged.json')
    total = 0
    for r in stats:
        total += stats[r]['s_ht'] + stats[r]['s_nm']
        for p in stats[r]['ht']['sp']:
            sp2ratio[p] = {} if p not in sp2ratio else sp2ratio[p]
            sp2ratio[p][r] = stats[r]['ht']['sp'][p]
            sp2ratio[p]['max'] = stats[r]['ht']['sp'][p] \
                if 'max' not in sp2ratio[p] or sp2ratio[p]['max'] < stats[r]['ht']['sp'][p] \
                else sp2ratio[p]['max']

    for p in sp2ratio:
        m = sp2ratio[p]['max']
        for k in sp2ratio[p]:
            if k != 'max':
                sp2ratio[p][k] = 1.0 * sp2ratio[p][k] / m

    print json.dumps(sp2ratio)
Code example #19
def visualise_highlights_geometric(geo_feature_file, fn, cat):
    gms = utils.load_json_data(geo_feature_file)
    subplots = {}
    for paper in gms:
        j = paper['journal']
        if j not in subplots:
            subplots[j] = []
        traces = subplots[j]
        y_vals = []
        x_vals = []
        sects = paper['sect_dict']
        sid_cat = paper['sid_cat']
        for y in sects:
            for x in sects[y]:
                if sid_cat[x] == cat:
                    x_vals.append(1.0 * int(x) / int(paper['total']))
                    y_vals.append(y)
        traces.append({'x': x_vals, 'y': y_vals})
    plots = []
    for j in subplots:
        if len(subplots[j]) >= 6 and j is not None:
            m_x = []
            m_y = []
            for d in subplots[j]:
                m_x += d['x']
                m_y += d['y']
            plots.append(
                go.Scatter(x=m_x,
                           y=m_y,
                           mode='markers',
                           name=j if j is not None else 'unknown'))

    fig = tools.make_subplots(rows=len(plots), cols=1, shared_xaxes=True)
    for i in range(len(plots)):
        fig.append_trace(plots[i], i + 1, 1)
    fig['layout'].update(height=600, width=600)
    py.plot(fig, filename=fn)
Code example #20
def visualise_lp_stats(stat_file,
                       cat,
                       title,
                       skips=None,
                       score_output_file=None):
    stats = utils.load_json_data(stat_file)
    total_normal = stats['s_nm']
    total_highlights = stats['s_ht']
    keys, scores = score_language_patterns(stats['nm'][cat], stats['ht'][cat],
                                           total_normal, total_highlights)
    if score_output_file is None:
        trace1 = go.Bar(x=keys,
                        y=scores,
                        name='Highlighted Sentences / Other Sentences')
        data = [trace1]
        layout = go.Layout(barmode='group', title=title)

        fig = go.Figure(data=data, layout=layout)
        py.plot(fig, filename='language pattern stats - ' + cat)
    else:
        data = {}
        for i in range(len(keys)):
            data[keys[i]] = scores[i]
        utils.save_json_array(data, score_output_file)
Code example #21
def load_resources(ne_file,
                   cd_file,
                   sp_file,
                   sf_nes,
                   sf_cds,
                   sf_sp,
                   sf_ranged_nes,
                   sf_ranged_cds,
                   sf_ranged_sp,
                   sp_cat_file=None):
    ne = read_text_res(ne_file)
    cd = read_text_res(cd_file)
    sp = read_sub_pred_file(sp_file)
    sp_cats = None if sp_cat_file is None else utils.load_json_data(
        sp_cat_file)
    scores_nes = utils.load_json_data(sf_nes)
    scores_cds = utils.load_json_data(sf_cds)
    scores_sp = utils.load_json_data(sf_sp)
    scores_ranged_nes = utils.load_json_data(sf_ranged_nes)
    scores_ranged_cds = utils.load_json_data(sf_ranged_cds)
    scores_ranged_sp = utils.load_json_data(sf_ranged_sp)
    return ne, cd, sp, sp_cats, \
           scores_nes, scores_cds, scores_sp, \
           scores_ranged_nes, scores_ranged_cds, scores_ranged_sp
Code example #22
File: ncboann.py, Project: KHP-Informatics/NapEasy
def file_match_concepts(ann_file, concepts):
    anns = utils.load_json_data(ann_file)
    for ann in anns:
        ret = match_concepts(ann['text'], concepts)
        if len(ret) > 0:
            print ret, ann['sid']
Code example #23
def score_paper_threshold(score_file,
                          container,
                          out_file,
                          hter,
                          threshold,
                          manual_ann=None):

    ma = None
    if manual_ann is not None:
        fpath, fn = split(score_file)
        m = re.match(r'(\d+)_annotated_ann_scores\.json', fn)
        if m is not None:
            paperid = m.group(1)
            if paperid in manual_ann:
                ma = manual_ann[paperid]
    units = 5
    scores = utils.load_json_data(score_file)
    max_sid = int(scores[-1]['sid'])
    offset = int(math.ceil(1.0 * len(scores) / units))

    anns = utils.load_json_data(scores[0]['doc_id'])
    hts = []
    sid2ann = {}
    sid2onto = {}
    abstract_sents = []
    for ann in anns:
        if ma is not None and 'max_abstract_sid' in ma and int(
                ann['sid']) <= ma['max_abstract_sid']:
            abstract_sents.append(ann['sid'])
            continue  # skip the abstract sentences
        # if 'abstract-title' in ann or ('struct' in ann and (ann['struct'] == 'DoCO:Abstract' or ann['struct'] == 'DoCO:Title')):
        #     abstract_sents.append(ann['sid'])
        #     continue  # skip the abstract sentences
        if 'marked' in ann:
            hts.append(ann['sid'])
        sid2ann[ann['sid']] = ann

    # skip papers with no highlights
    # if len(hts) == 0:
    #     return

    if ma is not None:
        hts += [str(sid) for sid in ma['also_correct']]

    prediction = []
    num_correct = 0

    sentence_level_details = []
    for i in range(len(scores)):
        r = (i + 1) / offset
        score = scores[i]
        if score['sid'] in abstract_sents:
            continue  # skip the abstract sentences
        score_ret = hter.score(score, region='r' + str(r))
        sent_type = '-'.join(sorted(score_ret['all_sps']))

        onto2scores = HighLighter.get_onto_name_scores()
        # note: sid2onto is never populated in this function, so onto_score
        # remains 0 unless that mapping is filled in elsewhere
        onto_score = 0 if score['sid'] not in sid2onto else \
            0 if sid2onto[score['sid']] not in onto2scores \
                else onto2scores[sid2onto[score['sid']]]
        confidence = score['pattern']['confidence'] \
            if 'confidence' in score['pattern'] else 1
        # if confidence < 1:
        #     sent_type = ''

        if (len(score_ret['sp']) > 0) \
                or (score_ret['cds'] + score_ret['nes'] > 0) \
                or onto_score > .2:
            s_sp = 0.0
            if len(score_ret['sp']) > 0:
                if len(score_ret['sp']) == 1:
                    for t in score_ret['sp']:
                        s_sp = score_ret['sp'][t]
                else:
                    type_score = []
                    for t in score_ret['sp']:
                        type_score.append([t, score_ret['sp'][t]])
                    type_score = sorted(type_score,
                                        cmp=lambda p1, p2: 1
                                        if p2[1] > p1[1] else 0
                                        if p2[1] == p1[1] else -1)
                    s_sp = type_score[0][1]
            # average combination
            # s = (s_sp + score_ret['cds'] + score_ret['nes'])/3
            # empirical setting
            s = 0.35 * s_sp + .2 * score_ret['cds'] + .45 * score_ret['nes']

            # F2: voting enhancement
            voted = 0
            if score_ret['nes'] > 0:
                voted += 1
            if score_ret['cds'] > 0:
                voted += 1
            if s_sp > 0:
                voted += 0.18
            s *= voted / 2.18

            # F3: type regional boosting (spatial features)
            type_boost = .3 if r in [0, 1] else .07 if r in [2, 3] else 0.005
            region = 'r%s' % r
            sent_boost = HighLighter.get_sent_type_boost()
            if sent_type in sent_boost:
                type_boost = sent_boost[sent_type][
                    region] if region in sent_boost[sent_type] else 0.001
            type_boost = math.pow(type_boost, 1.2)
            s *= type_boost * 10
            prediction.append([score['sid'], s, sent_type])

            if score['sid'] in hts or s > threshold:
                sentence_level_details.append(
                    u'[{}]\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                        score['sid'], 'H' if score['sid'] in hts else '-',
                        'P' if s > threshold else '-', sent_type,
                        '{}/{}'.format(s, type_boost),
                        '{}/{}'.format(s_sp, confidence), '{}/{}'.format(
                            score_ret['cds'], score['pattern']['cds'] if 'cds'
                            in score['pattern'] else ''), '{}/{}'.format(
                                score_ret['nes'], score['pattern']['nes']
                                if 'nes' in score['pattern'] else ''),
                        anns[i]['text'].replace('\n', '').replace('\t', '')))
        else:
            if score['sid'] in hts:
                sentence_level_details.append(
                    u'[{}]\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}'.format(
                        score['sid'], 'H' if score['sid'] in hts else '-', '-',
                        '-', '-', '-', '-', '-',
                        anns[i]['text'].replace('\n', '').replace('\t', '')))
    prediction = sort_by_threshold(prediction,
                                   threshold,
                                   cmp=lambda p1, p2: 1 if p2[1] > p1[1] else 0
                                   if p2[1] == p1[1] else -1)

    for s in prediction:
        if s[0] in hts:
            num_correct += 1

    container.append({
        'paper': scores[0]['doc_id'],
        'predicted': len(prediction),
        'correct': num_correct,
        'hts': len(hts),
        'max_sid': max_sid,
        'highlights': prediction
    })
    return sentence_level_details
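
To make the empirical weighting in score_paper_threshold concrete, here is a hand-worked pass through the three scoring stages with illustrative numbers (not taken from any real paper):

# illustrative component scores for one sentence
s_sp, cds, nes = 0.6, 0.4, 0.5

# empirical weighted combination
s = 0.35 * s_sp + 0.2 * cds + 0.45 * nes   # = 0.515

# F2 voting: nes and cds each add 1, a non-zero sp score adds 0.18
voted = 1 + 1 + 0.18
s *= voted / 2.18                          # all three fired, so s is unchanged

# F3 regional boost for an early-region sentence (r in [0, 1])
type_boost = 0.3 ** 1.2                    # ~0.236
s *= type_boost * 10                       # ~1.21, then compared with threshold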
Code example #24
def remove_ann_sentences(ann_file):
    anns = utils.load_json_data(ann_file)
    for ann in anns:
        ann['text'] = ''
    utils.save_json_array(anns, ann_file)
Code example #25
def get_sub_pred():
    if HighLighter.sub_pred is None:
        HighLighter.sub_pred = utils.load_json_data(
            './resources/sub_pred.txt')
    return HighLighter.sub_pred
Code example #26
def get_sub_pred_ne_stat():
    if HighLighter.sub_pred_ne_stat is None:
        HighLighter.sub_pred_ne_stat = utils.load_json_data(
            './resources/sub_pred_ne_stat.json')
    return HighLighter.sub_pred_ne_stat
Code example #27
def get_onto_name_scores():
    if HighLighter.onto_name_scores is None:
        HighLighter.onto_name_scores = utils.load_json_data(
            './resources/score_ncbo_ontos.json')
    return HighLighter.onto_name_scores
Code example #28
def get_sent_type_boost():
    if HighLighter.sent_type_boost is None:
        HighLighter.sent_type_boost = utils.load_json_data(
            './resources/sent_type_region_boost.json')
    return HighLighter.sent_type_boost
Code example #29
def get_manual_checked_result():
    return utils.load_json_data(manual_file)
Code example #30
def get_language_pattern_stats(score_file, container, out_file, hter):
    scores = utils.load_json_data(score_file)
    max_sid = int(scores[-1]['sid'])
    units = 5
    offset = int(1.0 * max_sid / units)
    anns = utils.load_json_data(scores[0]['doc_id'])

    b_marked = False
    ranges = []
    r = {'sids': [], 's': 0, 'seq': 0}
    ranges.append(r)
    for i in range(len(anns)):
        if (i + 1) % offset == 0:
            r['e'] = i - 1
            r = {'sids': [], 's': i, 'seq': (i + 1) / offset}
            ranges.append(r)
        ann = anns[i]
        if 'marked' in ann:
            b_marked = True
            r['sids'].append(ann['sid'])
    r['e'] = len(anns) - 1

    if not b_marked:
        return

    for r in ranges:
        sids = r['sids']
        stats = get_stats_obj()
        stats['s_nm'] = r['e'] - r['s'] - len(sids)
        stats['s_ht'] = len(sids)
        for i in range(r['s'], r['e']):
            s = scores[i]
            sent_type = 'ht' if s['sid'] in sids else 'nm'
            stat = stats[sent_type]['sp']

            all_sp_types = []
            cat = hter.get_sp_type(s, all_types=all_sp_types)
            if len(all_sp_types) > 0:
                t = '-'.join(sorted(all_sp_types))
                stat[t] = 1 if t not in stat else 1 + stat[t]
            else:
                # count not typed as well
                stat[cat] = 1 if cat not in stat else 1 + stat[cat]
            p = s['pattern']
            nes = sorted(set(p['nes']))
            cds = sorted(set(p['cds']))

            if len(all_sp_types) > 0:
                sp = '-'.join(p['sub'] if p['sub'] is not None else []) + \
                    ' ' + '-'.join(p['pred'] if p['pred'] is not None else [])
                stat = stats[sent_type]['sp_breakdown']
                stat[sp] = 1 if sp not in stat else 1 + stat[sp]

            stat = stats[sent_type]['ne']
            for ptn in nes:
                if ptn in hter.get_named_entities():
                    stat[ptn] = 1 if ptn not in stat else 1 + stat[ptn]
            stat = stats[sent_type]['cd']
            for ptn in cds:
                if ptn in hter.get_cardinal_nouns():
                    stat[ptn] = 1 if ptn not in stat else 1 + stat[ptn]
        container.append({'r%s' % r['seq']: stats})
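
get_stats_obj is not among these excerpts; from how its result is indexed above, its shape is presumably something like the following (an inference, not the project's actual code):

def get_stats_obj():
    # per-region counters, split into highlighted ('ht') and normal ('nm') sentences
    return {
        's_ht': 0,
        's_nm': 0,
        'ht': {'sp': {}, 'sp_breakdown': {}, 'ne': {}, 'cd': {}},
        'nm': {'sp': {}, 'sp_breakdown': {}, 'ne': {}, 'cd': {}}
    }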