Esempio n. 1
0
def _load_dataset(data_pre):
    """Un-pickle and return the dataset stored at ``<project_base>/<data_pre>_dmp``.

    On IOError a hint is printed and the error is re-raised, so the caller
    never continues with a missing (or stale, previously loaded) dataset.
    """
    dump_path = os.path.join(project_base, data_pre + "_dmp")
    try:
        # NOTE(review): text mode "r" is kept as-is; the code that writes this
        # dump is not visible here, so the mode is deliberately left untouched.
        # SECURITY: unpickling can execute arbitrary code -- only load dumps
        # that come from a trusted source.
        with open(dump_path, "r") as fp:
            print("un-pickling...")
            return cPickle.load(fp)
    except IOError:
        print("something wrong with the pickle file path")
        raise


def _run_between_experiment():
    """Train on the ``data_pre_1`` dump and score each ``data_pre_2`` document.

    The classifier is trained with ``train_on_data`` on every document's
    ``tlinks_between_gold``.  For each test document the candidate links are
    built, classified with ``apply_to_data`` and compared against the gold
    links with ``verify``; the first two components of each result are summed
    element-wise into the ``pair`` and ``label`` totals printed at the end.
    """
    train_ds = _load_dataset(data_pre_1)
    between_tls = []
    for doc in train_ds.docs.values():
        between_tls.extend(doc.tlinks_between_gold)
    cfg = Config(mallet_bin, project_temp, data_pre_1 + "_between")
    train_cfg = train_on_data(
        between_tls, lambda x: x.ds_id, lambda x: x.type, between_tlink_feats, "MaxEnt", cfg
    )

    test_ds = _load_dataset(data_pre_2)
    pair = [0, 0, 0, 0]
    label = [0, 0, 0, 0]
    for doc in test_ds.docs.values():
        gold = doc.tlinks_between_gold
        candid = create_candid_event2sectime(doc)
        for c in create_candid_coref(doc):
            tl = search_tlink_between_enty(c.from_enty, c.to_enty, candid)
            if tl:
                # NOTE(review): if tl is an element of candid, this value is
                # overwritten by the classifier prediction assigned below --
                # confirm whether "OVERLAP" was meant to survive.
                tl.pred = "OVERLAP"
            else:
                candid.append(c)
        update_candid_id(candid, doc.ds_id)
        cfg = Config(mallet_bin, project_temp, data_pre_2 + "_between_" + doc.ds_id)
        ti = apply_to_data(candid, lambda x: x.ds_id, between_tlink_feats, train_cfg.model_path, cfg)
        for c in candid:
            c.pred = ti.ID2pred[c.ds_id]

        # Single-argument print(...) emits exactly the text the equivalent
        # multi-item Python 2 print statement would.
        print("document: %s" % (doc.ds_id,))
        rslt = verify(candid, gold)
        print(rslt)
        pair = [x + y for (x, y) in zip(pair, rslt[0])]
        label = [x + y for (x, y) in zip(label, rslt[1])]
    print("pair %s" % (pair,))
    print("label %s" % (label,))


def main():
    """Run the 'between' TLINK experiment with all output sent to a log file.

    Everything printed during the run goes to
    ``<project_base>/<data_pre_1>_<data_pre_2>_between_log``.  The log file is
    closed and ``sys.stdout`` is restored when the run ends, whether it
    succeeds or raises.
    """
    log_path = os.path.join(project_base, data_pre_1 + "_" + data_pre_2 + "_between_log")
    with open(log_path, "w") as log_file:
        saved_stdout = sys.stdout
        sys.stdout = log_file
        try:
            _run_between_experiment()
        finally:
            # Undo the redirection so later output is not written to a closed file.
            sys.stdout = saved_stdout
Esempio n. 2
0
def _load_dataset(data_pre):
    """Un-pickle and return the dataset stored at ``<project_base>/<data_pre>_dmp``.

    On IOError a hint is printed and the error is re-raised, so the caller
    never continues with a missing (or stale, previously loaded) dataset.
    """
    dump_path = os.path.join(project_base, data_pre + '_dmp')
    try:
        # NOTE(review): text mode 'r' is kept as-is; the code that writes this
        # dump is not visible here, so the mode is deliberately left untouched.
        # SECURITY: unpickling can execute arbitrary code -- only load dumps
        # that come from a trusted source.
        with open(dump_path, 'r') as fp:
            print('un-pickling...')
            return cPickle.load(fp)
    except IOError:
        print('something wrong with the pickle file path')
        raise


def _write_pred_xml(gold_xml, pred_xml, candids):
    """Write a copy of ``gold_xml`` whose TLINK elements are replaced by ``candids``.

    The gold file is parsed with a recovering parser, every ``TLINK`` child of
    its ``TAGS`` element is removed, and one new ``TLINK`` element is appended
    per candidate (ids ``TL0``, ``TL1``, ...).  The result is written to
    ``pred_xml``.  Parsing errors are reported and re-raised.
    """
    parser = etree.XMLParser(recover=True)
    try:
        tree = etree.parse(gold_xml, parser)
    except Exception:
        print('Something wrong with opening/parsing specified input xml file')
        raise
    root = tree.getroot()
    tags = root.find('TAGS')
    # for evaluation purposes, remove the original TLINKs
    for old_tl in tags.findall('TLINK'):
        tags.remove(old_tl)

    for counter, tl in enumerate(candids):
        tl_elmt = etree.Element('TLINK')
        tl_elmt.set('id', 'TL' + str(counter))
        tl_elmt.set('fromID', tl.from_enty.origin_id)
        tl_elmt.set('fromText', str(tl.from_enty))
        tl_elmt.set('toID', tl.to_enty.origin_id)
        tl_elmt.set('toText', str(tl.to_enty))
        # assumes every candidate already carries a pred value -- TODO confirm
        # for the 'between' candidates, which are not classified in main()
        tl_elmt.set('type', tl.pred)
        # the 'tail' is set so that each TLINK will start from a new line
        tl_elmt.tail = '\n'
        tags.append(tl_elmt)

    with open(pred_xml, 'w') as fp:
        fp.write(etree.tostring(root, xml_declaration=True).replace('/>', ' />'))


def main():
    """Train the 'within' model on ``data_pre_1`` and write predicted XML for ``data_pre_2``.

    For every test document the 'within' candidates are classified with
    ``apply_to_data``, the 'between' candidates are assembled, and a copy of
    the document's gold XML with the predicted TLINKs is written to the
    ``<data_pre_2>_pred_all`` directory.  Documents without a gold XML file
    are skipped with a message.
    """
    train_ds = _load_dataset(data_pre_1)
    within_tls = []
    for doc in train_ds.docs.values():
        within_tls.extend(doc.tlinks_within_gold)
        within_tls.extend(doc.tlinks_within_closure)
    # NOTE(review): model_path is later read from this Config object rather
    # than from train_on_data's return value -- confirm Config exposes it.
    train_cfg = Config(mallet_bin, project_temp, data_pre_1 + '_within')
    train_on_data(within_tls, lambda x: x.ds_id, lambda x: x.type, within_tlink_feats, 'MaxEnt', train_cfg)

    test_ds = _load_dataset(data_pre_2)
    data_pre = data_pre_2
    # Collected across all documents; not consumed in the visible part of
    # this function, which continues past this section.
    within_candids = []
    between_candids = []
    for doc in test_ds.docs.values():
        # within
        w_candid = create_candid_within3(doc)
        update_candid_id(w_candid, doc.ds_id)
        within_candids.extend(w_candid)
        test_cfg = Config(mallet_bin, project_temp, data_pre + '_within_' + doc.ds_id)
        ti = apply_to_data(w_candid, lambda x: x.ds_id, within_tlink_feats, train_cfg.model_path, test_cfg)
        for c in w_candid:
            c.pred = ti.ID2pred[c.ds_id]
            c.probs = ti.ID2probs[c.ds_id]

        # between
        b_candid = create_candid_event2sectime(doc)
        for c in create_candid_coref(doc):
            tl = search_tlink_between_enty(c.from_enty, c.to_enty, b_candid)
            if tl:
                tl.pred = 'OVERLAP'
            else:
                b_candid.append(c)
        between_candids.extend(b_candid)

        xml_name = doc.ds_id + '.xml'
        gold_xml = os.path.join(project_base, data_pre_2 + '_i_sub', xml_name)
        if not os.path.exists(gold_xml):
            print('skip xml %s' % (doc.ds_id,))
            continue
        pred_xml = os.path.join(project_base, data_pre_2 + '_pred_all', xml_name)
        _write_pred_xml(gold_xml, pred_xml, w_candid + b_candid)
    """
Esempio n. 3
0
def _load_dataset(data_pre):
    """Un-pickle and return the dataset stored at ``<project_base>/<data_pre>_dmp``.

    On IOError a hint is printed and the error is re-raised, so the caller
    never continues with a missing (or stale, previously loaded) dataset.
    """
    dump_path = os.path.join(project_base, data_pre + "_dmp")
    try:
        # NOTE(review): text mode "r" is kept as-is; the code that writes this
        # dump is not visible here, so the mode is deliberately left untouched.
        # SECURITY: unpickling can execute arbitrary code -- only load dumps
        # that come from a trusted source.
        with open(dump_path, "r") as fp:
            print("un-pickling...")
            return cPickle.load(fp)
    except IOError:
        print("something wrong with the pickle file path")
        raise


def main():
    """Train the 'within' model on ``data_pre_1`` and dump analysis results for ``data_pre_2``.

    For every test document the 'within' candidates are classified, then for
    each sentence the outputs of ``get_conflict_info2`` and ``verify`` are
    accumulated.  All accumulated lists, plus the gold/closure links and the
    candidates, are pickled to ``<project_base>/<data_pre_2>_within_result_dmp``.
    A failure while dumping is reported but does not raise.
    """
    train_ds = _load_dataset(data_pre_1)
    within_tls = []
    for doc in train_ds.docs.values():
        within_tls.extend(doc.tlinks_within_gold)
        within_tls.extend(doc.tlinks_within_closure)
    cfg = Config(mallet_bin, project_temp, data_pre_1 + "_within")
    train_cfg = train_on_data(within_tls, lambda x: x.ds_id, lambda x: x.type, within_tlink_feats, "MaxEnt", cfg)

    test_ds = _load_dataset(data_pre_2)

    # Accumulators for the four values returned by get_conflict_info2 ...
    all_a, all_c, all_u, all_i = [], [], [], []
    # ... and for the four values returned by verify.
    all_m, all_um, all_uw, all_mi = [], [], [], []
    all_within = []
    all_candid = []

    for doc in test_ds.docs.values():
        within = doc.tlinks_within_gold + doc.tlinks_within_closure
        for tl in within:
            # Register each link on its sentence; verify() below reads it back.
            tl.sent.tlinks_within.append(tl)
        all_within.extend(within)

        candid = create_candid_within3(doc)
        update_candid_id(candid, doc.ds_id)
        all_candid.extend(candid)
        cfg = Config(mallet_bin, project_temp, data_pre_2 + "_within_" + doc.ds_id)
        ti = apply_to_data(candid, lambda x: x.ds_id, within_tlink_feats, train_cfg.model_path, cfg)
        for c in candid:
            c.pred = ti.ID2pred[c.ds_id]
            c.probs = ti.ID2probs[c.ds_id]

        # Disabled variants previously tried here: calling expand(s) on every
        # sentence, and forcing s.freq_tl links to 'OVERLAP' while leaving
        # s.freq_tx out of the entities passed to get_conflict_info2.
        for s in doc.sents:
            a, c, u, i = get_conflict_info2(
                s.events + s.timex3s, s.candids_within, lambda x: x.span[0].begin, lambda x: x.pred
            )
            all_a.extend(a)
            all_c.extend(c)
            all_u.extend(u)
            all_i.extend(i)

        # Single-argument print(...) emits exactly the text of the Python 2
        # statement `print "document:", doc.ds_id, "\n"`.
        print("document: %s \n" % (doc.ds_id,))
        # Disabled variants previously tried here: verifying against
        # s.candids_within + s.freq_tl, and running resolve_conflict_within2(s)
        # followed by a second verify pass.
        for s in doc.sents:
            m, um, uw, mi = verify(s.candids_within, s.tlinks_within)
            all_m.extend(m)
            all_um.extend(um)
            all_uw.extend(uw)
            all_mi.extend(mi)

    all_rslt = [all_a, all_c, all_u, all_i, all_m, all_um, all_uw, all_mi, all_within, all_candid]

    dump_path = os.path.join(project_base, data_pre_2 + "_within_result_dmp")
    try:
        # NOTE(review): text mode "w" is kept as-is; the reader of this dump is
        # not visible here, so the mode is deliberately left untouched.
        with open(dump_path, "w") as fp:
            cPickle.dump(all_rslt, fp)
    except Exception as err:
        # Still best-effort, but report why the results were not saved and let
        # KeyboardInterrupt / SystemExit propagate.
        print("something wrong when dumping: %s" % (err,))