def extract_relation_pairs(infile,
                           relation_set='exact',
                           pairtype='ee',
                           train=False,
                           nodename='narr_timeml_simple'):
    """Extract relation pairs and their labels from each record of an XML file.

    Args:
        infile: Path to the XML input file (one <record_id> per child record).
        relation_set: Relation set name, passed through to extract_pairs.
        pairtype: Pair type (e.g. 'ee'), passed through to extract_pairs.
        train: Whether pairs are being extracted for training.
        nodename: Tag name of the annotated narrative node in each record.

    Returns:
        (ids, all_pairs, all_labels): record ids, per-record pair lists, and
        per-record label lists, aligned by index.
    """
    if debug:
        print("extracting relation pairs: ", relation_set, pairtype, "train: ",
              str(train))
    starttime = time.time()
    # Get the xml from file
    tree = etree.parse(infile)
    root = tree.getroot()
    all_labels = []
    all_pairs = []
    ids = []
    records = 0
    global undersample
    undersample = 1

    for child in root:
        id_node = child.find("record_id")
        rec_id = id_node.text
        records += 1
        node = child.find(nodename)
        try:
            # Re-parse the node's inner text so inline annotation tags become
            # real XML elements.
            node = etree.fromstring('<' + nodename + '>' +
                                    data_util3.stringify_children(node).encode(
                                        'utf8').decode('utf8') + '</' +
                                    nodename + '>')
        except etree.XMLSyntaxError as e:
            position = e.position[1]
            print(
                "XMLSyntaxError at ", e.position, str(e),
                data_util3.stringify_children(node)[position - 5:position + 5])
            # BUG FIX: skip records whose narrative failed to re-parse instead
            # of passing the raw, un-reparsed node on to extract_pairs.
            node = None
        if node is not None:
            ids.append(rec_id)
            pairs, pair_labels = extract_pairs(node, relation_set, pairtype,
                                               train)
            all_pairs.append(pairs)
            all_labels.append(pair_labels)

    return ids, all_pairs, all_labels
# --- Example #2 (snippet separator; original scraped marker: "Beispiel #2 / 0") ---
def get_graphs(filename, relation_set='exact'):
    """Build temporal-relation graphs for each record in an XML file.

    For every record, the annotated narrative is re-parsed, converted into an
    undirected graph and a digraph, and both are rendered as PNG images next
    to the input file (named "<filename>.<rec_id>.graph.png" / ".digraph.png").

    Args:
        filename: Path to the XML input file.
        relation_set: Relation set name passed to the graph builders.

    Returns:
        List of digraphs, one per successfully parsed record.
    """
    if debug: print("creating graph: ", relation_set)
    starttime = time.time()
    # Get the xml from file
    tree = etree.parse(filename)
    root = tree.getroot()
    graphs = []
    ids = []
    records = 0
    dropped = 0

    for child in root:
        id_node = child.find("record_id")
        rec_id = id_node.text
        print("rec_id:", rec_id)
        records += 1
        node = child.find("narr_timeml_simple")
        try:
            # Re-parse the inner text so inline annotation tags become real
            # XML elements.
            node = etree.fromstring('<narr_timeml_simple>' + data_util3.stringify_children(node).encode('utf8').decode('utf8') + '</narr_timeml_simple>')
        except etree.XMLSyntaxError as e:
            dropped += 1
            position = e.position[1]
            print("XMLSyntaxError at ", e.position, str(e), data_util3.stringify_children(node)[position-5:position+5])
            # BUG FIX: don't process a record whose narrative failed to parse.
            node = None
        if node is not None:
            graph = create_graph(node, relation_set, find_more_relations=False)
            digraph = create_digraph(node, relation_set)
            ids.append(rec_id)
            graphs.append(digraph)
            # BUG FIX: removed a second "records += 1" that double-counted
            # every successfully processed record in the summary below.

            # Save an image of the graph
            pdot_graph = nx.drawing.nx_pydot.to_pydot(graph)
            pdot_graph.write_png(filename + "." + rec_id + ".graph.png")
            pdot_digraph = nx.drawing.nx_pydot.to_pydot(digraph)
            pdot_digraph.write_png(filename + "." + rec_id + ".digraph.png")

    # Print summary stats as a sanity check
    print("records:", str(records))
    print("dropped:", str(dropped))
    print("time:", str(time.time()-starttime), "s")
    return graphs
# --- Example #3 (snippet separator; original scraped marker: "Beispiel #3 / 0") ---
def get_lists(filename, outfile, relation_set='exact'):
    """Create a timeline list for each record and append it to the XML tree.

    Records that already contain a list node (tag given by the module-level
    `list_name`) are skipped. The tree is rewritten to `outfile` after every
    record so partial progress survives a failure.

    Args:
        filename: Path to the XML input file.
        outfile: Path where the augmented XML tree is written.
        relation_set: Relation set name passed to listify().

    Returns:
        List of timelines, one per newly processed record.
    """
    starttime = time.time()
    # Get the xml from file
    tree = etree.parse(filename)
    root = tree.getroot()
    timelines = []
    ids = []
    records = 0
    dropped = 0

    for child in root:
        id_node = child.find("record_id")
        rec_id = id_node.text
        records += 1
        node = child.find("narr_timeml_simple")
        list_node = child.find(list_name)
        try:
            # Re-parse the inner text so inline annotation tags become real
            # XML elements.
            node = etree.fromstring('<narr_timeml_simple>' + data_util3.stringify_children(node).encode('utf8').decode('utf8') + '</narr_timeml_simple>')
        except etree.XMLSyntaxError as e:
            dropped += 1
            position = e.position[1]
            print("XMLSyntaxError at ", e.position, str(e), data_util3.stringify_children(node)[position-5:position+5])
            # BUG FIX: don't listify a record whose narrative failed to parse.
            node = None
        if node is not None and list_node is None: # Skip records that already have a list
            timeline = listify(node, relation_set)
            ids.append(rec_id)
            timelines.append(timeline)
            timeline_node = etree.SubElement(child, list_name)
            timeline_to_xml(timeline, timeline_node)
            # Write the file after every record in case one of them fails
            tree.write(outfile)

    print("records:", str(records))
    print("dropped:", str(dropped))
    tutil.print_time(time.time()-starttime)
    return timelines
# --- Example #4 (snippet separator; original scraped marker: "Beispiel #4 / 0") ---
def run(infile, outfile):
    """Convert each record's annotated narrative to a sequence and dump the
    id -> sequence mapping (stringified dict) to outfile.

    Args:
        infile: Path to the XML input file (records keyed by <MG_ID>).
        outfile: Path of the output text file.
    """
    seqs = {} # id -> seq

    # Get the xml from file
    tree = etree.parse(infile)
    root = tree.getroot()

    for child in root:
        id_node = child.find("MG_ID")
        rec_id = id_node.text
        node = child.find("narr_timeml_simple")
        narr = ""
        if node is not None:
            # NOTE: narr becomes bytes here; len() below works for both cases
            narr = data_util.stringify_children(node).encode('utf-8')
        if len(narr) > 0:
            seq_narr = xml_to_seq(narr)
            seqs[rec_id] = seq_narr

    # BUG FIX: use a context manager so the file handle is closed even if
    # write() raises, instead of manual open()/close().
    with open(outfile, 'w') as output:
        output.write(str(seqs))
# --- Example #5 (snippet separator; original scraped marker: "Beispiel #5 / 0") ---
def get_seqs(filename, split_sents=False, inline=True, add_spaces=False):
    """Extract token sequences from the narratives in an XML file.

    Args:
        filename: Path to the XML input file; record ids are read from the
            child tag named by the module-level `id_name`.
        split_sents: If True, split each narrative on '.' and build one
            sequence per sentence.
        inline: If True, read the (possibly annotated) narrative text and
            convert it with xmltoseq.xml_to_seq; if False, pair the raw
            <narrative> text with the annotation node via xmltoseq.ann_to_seq.
        add_spaces: If True (inline only), pad XML brackets and punctuation
            with spaces before sequence conversion.

    Returns:
        (seq_ids, seqs): parallel lists of record ids and sequences.
    """
    print("get_seqs ", filename)
    ids = []
    narrs = []
    anns = []
    seqs = []
    seq_ids = []

    # Get the xml from file
    tree = etree.parse(filename)
    root = tree.getroot()

    for child in root:
        narr = ""
        rec_id = child.find(id_name).text
        ids.append(rec_id)
        # Get the narrative text
        node = child.find("narr_timeml_simple")
        if inline:
            if node is None:
                # No annotated narrative: fall back to the plain <narrative>
                narr_node = child.find("narrative")
                if narr_node is None:
                    # NOTE(review): record is skipped entirely here, so ids
                    # and narrs can fall out of alignment — confirm intended.
                    print("no narrative: ", data_util.stringify_children(child))
                else:
                    narr = narr_node.text
                    #print "narr: " + narr
                    narrs.append(narr)
            else:
                # NOTE(review): rec_id was already read above; this re-read
                # is redundant but harmless.
                rec_id = child.find(id_name).text
                #print "rec_id: " + rec_id
                #narr = etree.tostring(node, encoding='utf-8').decode('utf-8')
                narr = tools.stringify_children(node)
                if add_spaces:
                    # Pad tags and punctuation with spaces so they tokenize
                    # as separate symbols, then collapse doubled spaces.
                    narr = narr.replace('<', ' <')
                    narr = narr.replace('>', '> ')
                    narr = narr.replace('.', ' .')
                    narr = narr.replace(',', ' ,')
                    narr = narr.replace(':', ' :')
                    narr = narr.replace('  ', ' ')
                #print("narr: ", narr)
                #ids.append(rec_id)
                narrs.append(narr)
        else: # NOT inline
            # Keep annotations (as utf8 bytes) and plain narratives separately
            # NOTE(review): node may be None here if the record lacks
            # narr_timeml_simple — stringify_children(None) would fail; the
            # non-inline path presumably assumes annotated input. Confirm.
            anns.append(data_util.stringify_children(node).encode('utf8'))
            narr_node = child.find("narrative")
            narrs.append(narr_node.text)

    if inline:
        #split_sents = False
        for x in range(len(narrs)):
            narr = narrs[x]
            rec_id = ids[x]
            if split_sents:
                # One sequence per '.'-delimited sentence
                sents = narr.split('.')
                for sent in sents:
                    sent_seq = xmltoseq.xml_to_seq(sent.strip())
                    seqs.append(sent_seq)
                    seq_ids.append(rec_id)
            else:
                # xml_to_seq returns an iterable of sequences here
                narr_seq = xmltoseq.xml_to_seq(narr)
                for seq in narr_seq:
                    seqs.append(seq)
                    seq_ids.append(rec_id)
    else:
        # TEMP
        use_ncrf = False
        split_sents = True
        print("split_sents: ", str(split_sents))
        for x in range(len(narrs)):
            narr = narrs[x]
            ann = anns[x]
            rec_id = ids[x]
            ann_seqs = xmltoseq.ann_to_seq(narr, ann, split_sents, use_ncrf)
            print("seqs: ", str(len(ann_seqs)))
            for s in ann_seqs:
                seqs.append(s)
                seq_ids.append(rec_id)

    if debug: print("seqs[0]", str(seqs[0]))
    return seq_ids, seqs
def extract_features(filename,
                     relation_set='exact',
                     pairtype='ee',
                     vecfile=None,
                     train=False,
                     rank_features=False,
                     limit=None):
    """Extract, encode, and scale feature vectors from an XML file.

    Two modes:
      * rank_features=True: per-event features read from the "event_list"
        node; labels are event ranks.
      * rank_features=False (default): pairwise relation features read from
        "narr_timeml_simple"; labels are relation classes, label-encoded.

    Mutates the module-level globals `labelenc_map`, `scaler`, and
    `relationencoder` when train=True, and reads them when train=False —
    so a training call must precede any non-training call.

    Args:
        filename: Path to the XML input file.
        relation_set: Relation set name passed to extract_pairs.
        pairtype: Pair type (e.g. 'ee'); 'ee' also records event id pairs.
        vecfile: Optional path to a binary word2vec file for embedding feats.
        train: Whether to fit (True) or reuse (False) encoders/scaler.
        rank_features: Selects event-rank mode vs. pairwise mode (see above).
        limit: If set, stop processing after this many records.

    Returns:
        (ids, all_events, features_final, labels) in rank mode, or
        (ids, all_pairs, features_final, labels) in pairwise mode.
    """
    if debug:
        print("extracting features: ", relation_set, pairtype, "train: ",
              str(train))
    starttime = time.time()
    # Get the xml from file
    tree = etree.parse(filename)
    root = tree.getroot()
    features = []
    vec_features = []
    labels = []
    all_events = []
    ids = []
    all_pairs = []
    vec_model = None
    dropped = 0
    records = 0

    # Choose which node to read per record based on the feature mode
    if rank_features:
        nodename = "event_list"
    else:
        nodename = "narr_timeml_simple"

    if vecfile is not None:
        print("Loading vectors: ", vecfile)
        vec_model = KeyedVectors.load_word2vec_format(vecfile, binary=True)

    for child in root:
        # NOTE(review): records past the limit are counted as "dropped"
        if limit is not None and (records >= limit):
            dropped += 1
            continue
        id_node = child.find("record_id")
        rec_id = id_node.text
        #if debug: print("rec_id:", rec_id)
        records += 1
        node = child.find(nodename)
        narrative = child.find('narrative').text
        #print("node is None:", str(node is None))
        if node is not None:
            try:
                if rank_features:
                    # event_list is already well-formed XML; round-trip it
                    node = etree.fromstring(
                        etree.tostring(node).decode('utf8'))
                else:
                    # Re-parse inner text so inline annotation tags become
                    # real XML elements
                    node = etree.fromstring('<' + nodename + '>' +
                                            data_util3.stringify_children(node)
                                            .encode('utf8').decode('utf8') +
                                            '</' + nodename + '>')
            except etree.XMLSyntaxError as e:
                # NOTE(review): node is not reset to None here, so a record
                # whose re-parse failed is still processed below with the
                # original (un-reparsed) node — confirm intended.
                dropped += 1
                position = e.position[1]
                print(
                    "XMLSyntaxError at ", e.position, str(e),
                    data_util3.stringify_children(node)[position - 5:position +
                                                        5])

            if rank_features:
                # One feature row per event in this record
                events, feats, vec_feats, ranks = extract_event_feats(
                    node, vec_model, narrative)
                for x in range(len(feats)):
                    ids.append(rec_id)
                    features.append(feats[x])
                    labels.append(ranks[x])
                    vec_features.append(vec_feats[x])
                    all_events.append(events[x])
            else:
                #if train:
                #    us = .999
                #else:
                #    us = 0
                us = 1  # undersampling factor passed to extract_pairs
                pairs, pair_labels = extract_pairs(node,
                                                   relation_set,
                                                   pairtype,
                                                   train,
                                                   under=us)
                # One feature row per extracted pair
                for x in range(len(pair_labels)):
                    labels.append(pair_labels[x])
                    ids.append(rec_id)
                    feats, vec_feats = pair_features(pairs[x], pairtype,
                                                     vec_model, narrative)
                    features.append(feats)
                    if pairtype == 'ee':
                        all_pairs.append((pairs[x][0].attrib['eid'],
                                          pairs[x][1].attrib['eid']))
                    if vec_feats is not None:
                        vec_features.append(vec_feats)
        else:
            #print("node is None!")
            dropped += 1

    # Print the first few feature vectors as a sanity check
    print("records:", str(records))
    print("examples:", str(len(features)))
    print("dropped:", str(dropped))
    for x in range(3):
        print("features[", str(x), "]: ", str(features[x]))
        #print("vec_features[", str(x), "]: len ", str(len(vec_features[x])))
        print("labels[", str(x), "]: ", str(labels[x]))

    # Normalize features: label-encode every string-valued column
    # (column 0 is skipped — presumably an id column; confirm)
    num_feats = len(features[0])
    global labelenc_map
    for y in range(1, num_feats):
        if type(features[0][y]) is str:  # Only encode string features
            column = []
            for feat in features:
                column.append(feat[y])
            if train:
                if debug: print('training labelencoder')
                # NOTE(review): duplicate "global labelenc_map" declaration
                # (already declared above); harmless but redundant.
                global labelenc_map
                labelencoder = LabelEncoder()
                labelencoder.fit(column)
                labelenc_map[y] = labelencoder
            else:
                # Reuse the encoder fitted during training for this column
                labelencoder = labelenc_map[y]
            norm_column = labelencoder.transform(column)
            for x in range(len(norm_column)):
                features[x][y] = norm_column[x]
    for x in range(1):
        print("encoded features[", str(x), "]: ", str(features[x]))

    # Scale the features to [0, 1]; the scaler is fit only when training
    global scaler
    if train:
        scaler = MinMaxScaler()
        scaler.fit(features)
    features_scaled = scaler.transform(features)
    print("features_scaled[0]: ", str(features_scaled[0]))

    # Merge the two feature sets (scaled hand-crafted + embedding features)
    features_final = []
    if len(vec_features) > 0:
        for z in range(len(vec_features)):
            #feats_sc = []
            #for item in features_scaled[z].tolist():
            #    feats_sc.append(numpy.asscalar(item))
            features_final.append(features_scaled[z].tolist() +
                                  vec_features[z])
    else:
        features_final = features_scaled

    # Encode the pairwise labels (but not rank labels)
    if not rank_features:
        if train:
            global relationencoder
            relationencoder = LabelEncoder()
            relationencoder.fit(labels)
            if debug: print("labels: ", str(relationencoder.classes_))
        encoded_labels = relationencoder.transform(labels)
        num_classes = len(relationencoder.classes_)

        # One-hot encoding
        #onehot_labels = []
        #for lab in encoded_labels:
        #    onehot_lab = numpy.zeros(num_classes).tolist()
        #    onehot_lab[lab] = 1
        #    onehot_labels.append(onehot_lab)
        labels = encoded_labels

    # Free the word embedding model
    del vec_model

    print("feature extraction took ",
          tutil.print_time(time.time() - starttime))
    if rank_features:
        return ids, all_events, features_final, labels
    else:
        return ids, all_pairs, features_final, labels