Example #1
from lxml import etree

import data_util  # project-local helper for serializing XML nodes


def run(infile, outfile, vecfile):
    # Get the XML from the file
    tree = etree.parse(infile)
    root = tree.getroot()

    timelines = []
    for child in root:
        node = child.find("narr_timeml_simple")
        narr = ""
        if node is not None:
            narr = data_util.stringify_children(node)
            print "narr: " + narr
            # create_timeline is defined elsewhere in this module
            timeline, events = create_timeline(narr, vecfile)
            timelines.append(timeline)
            print "Timeline text:"
            for event in events:
                print str(event)

    # Write all timelines to the output file
    out = open(outfile, 'w')
    out.write(str(timelines))
    out.close()
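
All of these examples lean on data_util.stringify_children, a project-local helper whose source is not shown. A minimal sketch of the usual lxml recipe such a helper follows, assuming it returns the node's text with inline child markup preserved:

from lxml import etree


def stringify_children(node):
    # Hypothetical stand-in for data_util.stringify_children: join the node's
    # leading text, each child's serialized markup, and each child's tail
    # text, so inline tags like <EVENT> survive in the returned string.
    parts = [node.text]
    for child in node:
        parts.append(etree.tostring(child, with_tail=False))
        parts.append(child.tail)
    parts.append(node.tail)
    return ''.join(p for p in parts if p)


node = etree.fromstring('<narr>He was <EVENT eid="e1">coughing</EVENT> badly.</narr>')
print stringify_children(node)  # He was <EVENT eid="e1">coughing</EVENT> badly.
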
Example #2
from lxml import etree

import data_util  # project-local helper
from xmltoseq import xml_to_seq  # project-local (see Example #6)


def run(infile, outfile):
    seqs = {}  # id -> seq

    # Get the xml from file
    tree = etree.parse(infile)
    root = tree.getroot()

    for child in root:
        id_node = child.find("MG_ID")
        rec_id = id_node.text
        node = child.find("narr_timeml_simple")
        narr = ""
        if node is not None:
            narr = data_util.stringify_children(node).encode('utf-8')
        if len(narr) > 0:
            seq_narr = xml_to_seq(narr)
            seqs[rec_id] = seq_narr

    # Write the sequences to file
    output = open(outfile, 'w')
    output.write(str(seqs))
    output.close()
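
A minimal round trip with a hypothetical file name, showing the record layout the function expects (an MG_ID and an annotated narr_timeml_simple per child):

sample = """<root>
  <record>
    <MG_ID>A001</MG_ID>
    <narr_timeml_simple>He had <EVENT eid="e1">fever</EVENT> for two days.</narr_timeml_simple>
  </record>
</root>"""

with open('sample.xml', 'w') as f:
    f.write(sample)

run('sample.xml', 'sample.seqs')  # writes {'A001': <sequence>} to sample.seqs
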
Example #3
from lxml import etree

import data_util  # project-local helper

# symp_narr_tag and symp_tagger_tag are module-level tag names defined elsewhere


def filter_narr(tree, tagger_name):
    print "filter_narr"
    root = tree.getroot()
    for child in root:
        symp_narr = ""  # reset per record so narratives don't accumulate
        node = child.find(symp_narr_tag)
        if node is None:
            print "no " + symp_narr_tag + ": " + data_util.stringify_children(
                child)
            continue  # skip the record rather than iterating a missing node
        # Keep only the text of EVENT and TIMEX3 descendants
        for item in node.iterdescendants("EVENT", "TIMEX3"):
            if item.text is not None:
                symp_narr = symp_narr + " " + item.text
            else:
                for it in item.iterdescendants():
                    if it.text is not None:
                        symp_narr = symp_narr + " " + it.text.strip()
        newnode = etree.SubElement(child, symp_narr_tag)
        newnode.text = symp_narr.strip()
        #print "symp_narr: " + symp_narr
        tagger_node = etree.SubElement(child, symp_tagger_tag)
        tagger_node.text = tagger_name
    return tree
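
A hypothetical round trip, assuming symp_narr_tag is "narr_symp" (as the extract() example below suggests) and an illustrative symp_tagger_tag:

symp_narr_tag = "narr_symp"
symp_tagger_tag = "symp_tagger"

xml = """<root>
  <record>
    <narr_symp>He had a <EVENT eid="e1">cough</EVENT> <TIMEX3 tid="t1">yesterday</TIMEX3>.</narr_symp>
  </record>
</root>"""

tree = etree.ElementTree(etree.fromstring(xml))
filtered = filter_narr(tree, "demo_tagger")
# A second narr_symp node is appended to the record, holding only the
# EVENT/TIMEX3 text: "cough yesterday"
print etree.tostring(filtered.getroot())
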
Example #4
from lxml import etree

import data_util  # project-local helper


def convert_spans_to_xml(text):
    lines = text.splitlines()
    lines = lines[1:]  # ignore the xml header
    #root  = etree.fromstring("<root>" + ' '.join(lines) + "</root>")
    root = etree.fromstring("<root>" + '\n'.join(lines) + "</root>")
    timeml = root.find("TimeML")
    tags = []

    # Get DCT
    dct_node = timeml.find("DCT")
    dct_text = data_util.stringify_children(dct_node)

    # Get document text
    text_node = timeml.find("TEXT")
    text_text = dct_text + data_util.stringify_children(text_node)
    print "text: " + text_text

    build_list = etree.XPath("//EVENT")
    events = build_list(text_node)
    event_dict = {}
    for event in events:
        #print etree.tostring(event)
        eventid = event.attrib['eid']
        event_dict[eventid] = event

    # Copy attributes from makeinstance to the actual event tag
    instance_to_event = {}
    mis = root.xpath("//MAKEINSTANCE")
    for mi in mis:
        #print etree.tostring(mi)
        eventid = mi.attrib['eventID']
        instanceid = mi.attrib['eiid']
        instance_to_event[instanceid] = eventid
        event = event_dict[eventid]
        for att in mi.attrib:
            if att not in ('eventID', 'eiid'):
                event.attrib[att] = mi.attrib[att]

    tlinks = root.xpath("//TLINK")
    slinks = root.xpath("//SLINK")
    tags = tlinks + slinks

    # Convert eiids to eids in links
    for tl in tags:
        if 'eventInstanceID' in tl.attrib:
            eiid = tl.attrib['eventInstanceID']
            tl.attrib['eventID'] = instance_to_event[eiid]
        if 'relatedToEventInstance' in tl.attrib:
            tl.attrib['relatedToEventID'] = instance_to_event[
                tl.attrib['relatedToEventInstance']]
        if 'subordinatedEventInstance' in tl.attrib:
            tl.attrib['subordinatedEventID'] = instance_to_event[
                tl.attrib['subordinatedEventInstance']]

    #print "Updated node: " + etree.tostring(text_node)

    text_root = etree.fromstring("<root>" + text_text + "</root>")
    narr_text = ''.join(text_root.xpath("//text()"))
    xml_text = dct_text + data_util.stringify_children(text_node)

    for tag in tags:
        xml_text = xml_text + etree.tostring(tag)
    print "narr_text: " + narr_text
    #print "xml_text: " + xml_text

    return narr_text, xml_text
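
A hypothetical TimeML fragment in the shape this function parses, showing how instance ids (eiid) on links get mapped back to event ids (eid):

timeml = """<?xml version="1.0"?>
<TimeML>
<DCT><TIMEX3 tid="t0" value="2024-01-01">Jan 1</TIMEX3></DCT>
<TEXT>She <EVENT eid="e1">fell</EVENT> and later <EVENT eid="e2">died</EVENT>.</TEXT>
<MAKEINSTANCE eventID="e1" eiid="ei1" tense="PAST"/>
<MAKEINSTANCE eventID="e2" eiid="ei2" tense="PAST"/>
<TLINK lid="l1" eventInstanceID="ei1" relatedToEventInstance="ei2" relType="BEFORE"/>
</TimeML>"""

narr_text, xml_text = convert_spans_to_xml(timeml)
print narr_text  # plain text of the DCT and narrative, markup stripped
print xml_text   # DCT + TEXT markup followed by the TLINK, which now
                 # carries eventID="e1" and relatedToEventID="e2"
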
Example #5
import string

import sklearn.feature_extraction.text
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import hashing_trick
from lxml import etree
from sklearn.decomposition import LatentDirichletAllocation

import data_util      # project-local helpers
import preprocessing  # project-local stemming/lemmatization

# featurenames, labelname, stopwords, min_ngram, max_ngram, num_topics,
# kw_features, narr_features, and the feature-name constants (narr_count,
# narr_tfidf, kw_count, kw_tfidf, lda, narr_vec, narr_seq, event_vec,
# event_seq, symp_train, rec_type, checklist, dem) are module-level
# globals defined elsewhere.


def extract(infile,
            outfile,
            dict_keys,
            stem=False,
            lemma=False,
            element="narrative",
            arg_rebalance="",
            arg_vecfile=""):
    train = False
    narratives = []
    keywords = []

    if event_vec in featurenames or event_seq in featurenames:
        element = "narr_symp"

    # Get the xml from file
    root = etree.parse(infile).getroot()

    if dict_keys is None:
        train = True

        # Set up the keys for the feature vector
        dict_keys = ["MG_ID"]
        if "codex" in labelname:
            dict_keys.append("WB10_codex")
            dict_keys.append("WB10_codex2")
            dict_keys.append("WB10_codex4")
        else:
            dict_keys.append(labelname)
        if rec_type in featurenames:
            dict_keys.append("CL_" + rec_type)
        if checklist in featurenames:
            dict_keys = dict_keys + [
                "CL_DeathAge", "CL_ageunit", "CL_DeceasedSex", "CL_Occupation",
                "CL_Marital", "CL_Hypertension", "CL_Heart", "CL_Stroke",
                "CL_Diabetes", "CL_TB", "CL_HIV", "CL_Cancer", "CL_Asthma",
                "CL_InjuryHistory", "CL_SmokeD", "CL_AlcoholD",
                "CL_ApplytobaccoD"
            ]
        elif dem in featurenames:
            dict_keys = dict_keys + ["CL_DeathAge", "CL_DeceasedSex"]
        print "dict_keys: " + str(dict_keys)
        #keywords = set([])
        #narrwords = set([])

    print "train: " + str(train)
    print "stem: " + str(stem)
    print "lemma: " + str(lemma)
    # Extract features
    matrix = []
    for child in root:
        features = {}

        if rec_type in featurenames:
            features["CL_" + rec_type] = child.tag

        # CHECKLIST features
        for key in dict_keys:
            if key[0:3] == "CL_":
                key = key[3:]
            item = child.find(key)
            value = "0"
            if item is not None:
                value = item.text
            if key == "AlcoholD" or key == "ApplytobaccoD":
                if value == 'N':
                    value = 9
            features[key] = value
            #print "-- value: " + value
            if key == "MG_ID":
                print "extracting features from: " + value

        # KEYWORD features
        if kw_features:
            keyword_string = get_keywords(child)
            # Remove punctuation and trailing spaces from keywords
            words = [
                s.strip().translate(string.maketrans("", ""),
                                    string.punctuation)
                for s in keyword_string.split(',')
            ]
            # Split keyword phrases into individual words (build a new list
            # instead of mutating the one being iterated)
            split_words = []
            for word in words:
                for wx in word.split(' '):
                    split_words.append(wx.strip().strip('–'))
            keywords.append(" ".join(split_words))

        # NARRATIVE features
        if narr_features or ((not train) and (symp_train in featurenames)):
            narr_string = ""
            item = child.find(element)
            if item is not None:
                narr_string = data_util.stringify_children(item).encode(
                    'utf-8')

                if narr_string == "":
                    print "warning: empty narrative"
                narr_words = [
                    w.strip() for w in narr_string.lower().translate(
                        string.maketrans("", ""), string.punctuation).split(
                            ' ')
                ]
                text = " ".join(narr_words)

                if stem:
                    narr_string = preprocessing.stem(text)
                elif lemma:
                    narr_string = preprocessing.lemmatize(text)
            narratives.append(narr_string.strip().lower())
            #print "Adding narr: " + narr_string.lower()

        # SYMPTOM features
        elif train and (symp_train in featurenames):
            narr_string = ""
            item = child.find("narrative_symptoms")
            if item is not None:
                item_text = item.text
                if item_text is not None and len(item_text) > 0:
                    narr_string = item_text.encode("utf-8")
            narratives.append(narr_string.lower())
            print "Adding symp_narr: " + narr_string.lower()

        # Save features
        matrix.append(features)

    # Construct the feature matrix

    # COUNT or TFIDF features
    if (narr_count in featurenames or kw_count in featurenames
            or narr_tfidf in featurenames or kw_tfidf in featurenames
            or lda in featurenames or symp_train in featurenames):
        documents = []
        if (narr_count in featurenames or narr_tfidf in featurenames
                or lda in featurenames or symp_train in featurenames):
            documents = narratives
            print "narratives: " + str(len(narratives))
        elif kw_count in featurenames or kw_tfidf in featurenames:
            documents = keywords
            print "keywords: " + str(len(keywords))

        # Create count matrix
        global count_vectorizer
        if train:
            print "training count_vectorizer"
            count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
                ngram_range=(min_ngram, max_ngram), stop_words=stopwords)
            count_vectorizer.fit(documents)
            dict_keys = dict_keys + count_vectorizer.get_feature_names()
        print "transforming data with count_vectorizer"
        count_matrix = count_vectorizer.transform(documents)
        matrix_keys = count_vectorizer.get_feature_names()

        print "writing count matrix to file"
        out_matrix = open(infile + ".countmatrix", "w")
        out_matrix.write(str(count_matrix))
        out_matrix.close()

        # Add count features to the dictionary
        for x in range(len(matrix)):
            feat = matrix[x]
            for i in range(len(matrix_keys)):
                key = matrix_keys[i]
                val = count_matrix[x, i]
                feat[key] = val

        # Convert counts to TFIDF
        if (narr_tfidf in featurenames) or (kw_tfidf in featurenames):
            print "converting to tfidf..."
            print "matrix_keys: " + str(len(matrix_keys))

            # Use the training count matrix for fitting
            if train:
                global tfidfTransformer
                tfidfTransformer = sklearn.feature_extraction.text.TfidfTransformer()
                tfidfTransformer.fit(count_matrix)

            # Convert matrix to tfidf
            tfidf_matrix = tfidfTransformer.transform(count_matrix)
            print "count_matrix: " + str(count_matrix.shape)
            print "tfidf_matrix: " + str(tfidf_matrix.shape)

            # Replace features in matrix with tfidf
            for x in range(len(matrix)):
                feat = matrix[x]
                #values = tfidf_matrix[x,0:]
                #print "values: " + str(values.shape[0])
                for i in range(len(matrix_keys)):
                    key = matrix_keys[i]
                    val = tfidf_matrix[x, i]
                    feat[key] = val

        # LDA topic modeling features
        if lda in featurenames:
            global ldaModel
            if train:
                # n_topics was renamed n_components in scikit-learn >= 0.19
                ldaModel = LatentDirichletAllocation(n_topics=num_topics)
                ldaModel.fit(count_matrix)
            lda_matrix = ldaModel.transform(count_matrix)
            for t in range(0, num_topics):
                dict_keys.append("lda_topic_" + str(t))
            for x in range(len(matrix)):
                for y in range(len(lda_matrix[x])):
                    val = lda_matrix[x][y]
                    matrix[x]["lda_topic_" + str(y)] = val

            # TODO: Print LDA topics

    # WORD2VEC features
    elif (narr_vec in featurenames or event_vec in featurenames
            or event_seq in featurenames):
        feat_name = narr_vec
        if event_vec in featurenames:
            feat_name = event_vec
        elif event_seq in featurenames:
            feat_name = event_seq

        # vector_features is defined elsewhere; pass the vecfile argument through
        matrix, dict_keys = vector_features(feat_name, narratives, matrix,
                                            dict_keys, arg_vecfile)

    # narr_seq for RNN
    elif narr_seq in featurenames:
        global vocab_size, max_seq_len
        if train:
            dict_keys.append(narr_seq)
            dict_keys.append('vocab_size')
            dict_keys.append('max_seq_len')
            vocab = set()
            for narr in narratives:
                words = narr.split(' ')
                for word in words:
                    vocab.add(word)
            vocab_size = len(vocab)
            max_seq_len = 0

        sequences = []

        # Convert text into integer sequences
        for x in range(len(matrix)):
            narr = narratives[x]
            seq = hashing_trick(narr,
                                vocab_size,
                                hash_function='md5',
                                filters='\t\n',
                                lower=True,
                                split=' ')
            if len(seq) > max_seq_len:
                max_seq_len = len(seq)
            sequences.append(seq)

        # Pad the sequences
        sequences = pad_sequences(sequences,
                                  maxlen=max_seq_len,
                                  dtype='int32',
                                  padding='pre')
        for x in range(len(matrix)):
            matrix[x]['narr_seq'] = sequences[x]
            matrix[x]['vocab_size'] = vocab_size
            matrix[x]['max_seq_len'] = max_seq_len

    #if arg_rebalance != "":
    #    matrix_re = rebalance_data(matrix, dict_keys, arg_rebalance)
    #    write_to_file(matrix_re, dict_keys, outfile)
    #else:
    data_util.write_to_file(matrix, dict_keys, outfile)
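
The count/tfidf branch above reduces to a fit-on-train, transform-everywhere pattern. A standalone sketch with toy documents, using the same era of scikit-learn API as the function:

import sklearn.feature_extraction.text

docs = ["fever and cough", "cough for two days", "sudden chest pain"]

count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    ngram_range=(1, 2))
count_matrix = count_vectorizer.fit_transform(docs)

tfidfTransformer = sklearn.feature_extraction.text.TfidfTransformer()
tfidf_matrix = tfidfTransformer.fit_transform(count_matrix)

print count_matrix.shape  # (3, vocabulary size)
print tfidf_matrix.shape  # same shape, counts reweighted by inverse document frequency
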
Example #6
from lxml import etree

import data_util  # project-local helper
import xmltoseq   # project-local XML-to-sequence conversion

# id_name is a module-level global naming the record-id tag (e.g. "MG_ID")


def get_seqs(filename, split_sents=False, inline=True):
    print "get_seqs " + filename
    ids = []
    narrs = []
    anns = []
    seqs = []
    seq_ids = []

    # Get the xml from file
    tree = etree.parse(filename)
    root = tree.getroot()

    for child in root:
        narr = ""
        rec_id = child.find(id_name).text
        # Get the narrative text
        node = child.find("narr_timeml_simple")
        if inline:
            if node is None:
                narr_node = child.find("narrative")
                if narr_node is None:
                    print "no narrative: " + data_util.stringify_children(
                        child)
                else:
                    narr = narr_node.text
                    ids.append(rec_id)
                    narrs.append(narr)
            else:
                narr = data_util.stringify_children(node).encode('utf-8')
                # Append the id only alongside its narrative so ids and
                # narrs stay aligned
                ids.append(rec_id)
                narrs.append(narr)
        else:  # NOT inline
            anns.append(data_util.stringify_children(node).encode('utf8'))
            narr_node = child.find("narrative")
            ids.append(rec_id)
            narrs.append(narr_node.text)

    if inline:
        for x in range(len(narrs)):
            narr = narrs[x]
            rec_id = ids[x]
            if split_sents:
                sents = narr.split('.')
                for sent in sents:
                    sent_seq = xmltoseq.xml_to_seq(sent.strip())
                    seqs.append(sent_seq)
                    seq_ids.append(rec_id)
            else:
                narr_seq = xmltoseq.xml_to_seq(narr)
                seqs.append(narr_seq)
                seq_ids.append(rec_id)
    else:
        for x in range(len(narrs)):
            narr = narrs[x]
            ann = anns[x]
            rec_id = ids[x]
            print "split_sents: " + str(split_sents)
            ann_seqs = xmltoseq.ann_to_seq(narr, ann, split_sents)
            print "seqs: " + str(len(ann_seqs))
            for s in ann_seqs:
                seqs.append(s)
                seq_ids.append(rec_id)

    return seq_ids, seqs
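
A hypothetical call, collecting one sequence per sentence of each record:

seq_ids, seqs = get_seqs('train.xml', split_sents=True)
for rec_id, seq in zip(seq_ids, seqs)[:3]:
    print rec_id
    print seq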