def run(infile, outfile, vecfile): # Get the xml from file tree = etree.parse(infile) root = tree.getroot() timelines = [] for child in root: node = child.find("narr_timeml_simple") narr = "" if node != None: narr = data_util.stringify_children(node) #.encode('utf-8') print "narr: " + narr timeline, events = create_timeline(narr, vecfile) timelines.append(timeline) print "Timeline text:" for event in events: print str(event) out = open(outfile, 'w') out.write(str(timelines)) out.close()
def run(infile, outfile):
    """Convert each annotated narrative in an XML file to a tag sequence.

    infile:  XML file; each record child must have an MG_ID node and may
             have a 'narr_timeml_simple' node.
    outfile: path the {record_id: sequence} dict is written to (str repr).

    NOTE(review): this redefines run() and shadows the earlier
    run(infile, outfile, vecfile) -- confirm the clash is intentional.
    """
    seqs = {}  # id -> seq
    # Get the xml from file
    tree = etree.parse(infile)
    root = tree.getroot()
    for child in root:
        id_node = child.find("MG_ID")
        rec_id = id_node.text
        node = child.find("narr_timeml_simple")
        narr = ""
        if node is not None:  # was `!= None`
            narr = data_util.stringify_children(node).encode('utf-8')
        # only keep records that actually have an annotated narrative
        if len(narr) > 0:
            seq_narr = xml_to_seq(narr)
            seqs[rec_id] = seq_narr
    # write the stats to file; `with` closes the handle on error too
    with open(outfile, 'w') as output:
        output.write(str(seqs))
def filter_narr(tree, tagger_name): print "filter_narr" root = tree.getroot() symp_narr = "" for child in root: #print "child: " + data_util.stringify_children(child) node = child.find(symp_narr_tag) if node is None: print "no " + symp_narr_tag + ": " + data_util.stringify_children( child) for item in node.iterdescendants("EVENT", "TIMEX3"): if item.text is not None: symp_narr = symp_narr + " " + item.text else: for it in item.iterdescendants(): if it.text is not None: symp_narr = symp_narr + " " + it.text.strip() newnode = etree.SubElement(child, symp_narr_tag) newnode.text = symp_narr.strip() #print "symp_narr: " + symp_narr tagger_node = etree.SubElement(child, symp_tagger_tag) tagger_node.text = tagger_name return tree
def convert_spans_to_xml(text):
    """Convert a TimeML document (with stand-off MAKEINSTANCE/TLINK/SLINK
    tags) into plain narrative text plus inline-style XML.

    text: full TimeML document as a string; the first line is assumed to
          be an XML header and is dropped.
    Returns (narr_text, xml_text): the tag-free narrative text, and the
    DCT + TEXT content followed by the serialized link tags.
    """
    lines = text.splitlines()
    lines = lines[1:]  # ignore the xml header
    #root = etree.fromstring("<root>" + ' '.join(lines) + "</root>")
    # wrap in a synthetic root so the fragment parses as one document
    root = etree.fromstring("<root>" + '\n'.join(lines) + "</root>")
    timeml = root.find("TimeML")
    tags = []
    # Get DCT (document creation time) text
    dct_node = timeml.find("DCT")
    dct_text = data_util.stringify_children(dct_node)
    # Get document text
    text_node = timeml.find("TEXT")
    text_text = dct_text + data_util.stringify_children(text_node)
    print "text: " + text_text
    # index every EVENT in the TEXT node by its eid
    build_list = etree.XPath("//EVENT")
    events = build_list(text_node)
    event_dict = {}
    for event in events:
        #print etree.tostring(event)
        eventid = event.attrib['eid']
        event_dict[eventid] = event
    # Copy attributes from makeinstance to the actual event tag
    instance_to_event = {}
    mis = root.xpath("//MAKEINSTANCE")
    for mi in mis:
        #print etree.tostring(mi)
        eventid = mi.attrib['eventID']
        instanceid = mi.attrib['eiid']
        instance_to_event[instanceid] = eventid
        event = event_dict[eventid]
        # carry over every MAKEINSTANCE attribute except the two IDs
        for att in mi.attrib.keys():
            if not att == 'eventID' and not att == 'eiid':
                event.attrib[att] = mi.attrib[att]
    tlinks = root.xpath("//TLINK")
    slinks = root.xpath("//SLINK")
    tags = tlinks + slinks
    # Convert eiids to eids in links, using the mapping built above
    for tl in tags:
        if 'eventInstanceID' in tl.attrib:
            eiid = tl.attrib['eventInstanceID']
            tl.attrib['eventID'] = instance_to_event[eiid]
        if 'relatedToEventInstance' in tl.attrib:
            tl.attrib['relatedToEventID'] = instance_to_event[
                tl.attrib['relatedToEventInstance']]
        if 'subordinatedEventInstance' in tl.attrib:
            tl.attrib['subordinatedEventID'] = instance_to_event[
                tl.attrib['subordinatedEventInstance']]
    #print "Updated node: " + etree.tostring(text_node)
    # re-parse the combined DCT+TEXT string and strip all markup
    text_root = etree.fromstring("<root>" + text_text + "</root>")
    narr_text = ''.join(text_root.xpath("//text()"))
    # text_node now carries the copied attributes, so re-stringify it
    xml_text = dct_text + data_util.stringify_children(text_node)
    for tag in tags:
        xml_text = xml_text + etree.tostring(tag)
    print "narr_text: " + narr_text
    #print "xml_text: " + xml_text
    return narr_text, xml_text
def extract(infile, outfile, dict_keys, stem=False, lemma=False,
            element="narrative", arg_rebalance="", arg_vecfile=""):
    """Extract a feature matrix from verbal-autopsy XML records.

    infile:    XML file of records to featurize.
    outfile:   destination for the feature dicts (data_util.write_to_file).
    dict_keys: feature-key list from a previous (training) run, or None.
               None means this IS the training run: keys are built here
               and vectorizers/models are fit and stored in globals.
    stem/lemma: apply stemming / lemmatization to narrative text.
    element:   XML tag holding the narrative text.
    arg_rebalance, arg_vecfile: currently unused here (rebalance code is
               commented out; vector_features receives the module-level
               `vecfile` -- NOTE(review): possibly meant arg_vecfile).

    Relies on many module-level globals (featurenames, labelname,
    count_vectorizer, tfidfTransformer, ldaModel, min_ngram, ...).
    """
    train = False
    narratives = []
    keywords = []
    # event features read from the symptom-narrative element instead
    if event_vec in featurenames or event_seq in featurenames:
        element = "narr_symp"
    # Get the xml from file
    root = etree.parse(infile).getroot()
    if dict_keys == None:
        train = True
        # Set up the keys for the feature vector
        dict_keys = ["MG_ID"]
        if "codex" in labelname:
            dict_keys.append("WB10_codex")
            dict_keys.append("WB10_codex2")
            dict_keys.append("WB10_codex4")
        else:
            dict_keys.append(labelname)
        if rec_type in featurenames:
            dict_keys.append("CL_" + rec_type)
        if checklist in featurenames:
            dict_keys = dict_keys + [
                "CL_DeathAge", "CL_ageunit", "CL_DeceasedSex",
                "CL_Occupation", "CL_Marital", "CL_Hypertension",
                "CL_Heart", "CL_Stroke", "CL_Diabetes", "CL_TB",
                "CL_HIV", "CL_Cancer", "CL_Asthma", "CL_InjuryHistory",
                "CL_SmokeD", "CL_AlcoholD", "CL_ApplytobaccoD"
            ]
        elif dem in featurenames:
            # demographics only: age and sex
            dict_keys = dict_keys + ["CL_DeathAge", "CL_DeceasedSex"]
        print "dict_keys: " + str(dict_keys)
    #keywords = set([])
    #narrwords = set([])
    print "train: " + str(train)
    print "stem: " + str(stem)
    print "lemma: " + str(lemma)
    # Extract features
    matrix = []
    for child in root:
        features = {}
        if rec_type in featurenames:
            features["CL_" + rec_type] = child.tag
        # CHECKLIST features
        for key in dict_keys:
            if key[0:3] == "CL_":
                # strip the CL_ prefix to get the actual element name
                key = key[3:]
                item = child.find(key)
                value = "0"  # default when the element is absent
                if item != None:
                    value = item.text
                # NOTE(review): 'N' is mapped to int 9 only for these two
                # fields -- presumably a missing-value code; confirm
                if key == "AlcoholD" or key == "ApplytobaccoD":
                    if value == 'N':
                        value = 9
                features[key] = value
                #print "-- value: " + value
            if key == "MG_ID":
                print "extracting features from: " + value
        # KEYWORD features
        if kw_features:
            keyword_string = get_keywords(child)
            # Remove punctuation and trailing spaces from keywords
            words = [
                s.strip().translate(string.maketrans("", ""),
                                    string.punctuation)
                for s in keyword_string.split(',')
            ]
            # Split keyword phrases into individual words
            # NOTE(review): removes from `words` while iterating it --
            # classic skip-an-element hazard; confirm intended
            for word in words:
                w = word.split(' ')
                words.remove(word)
                for wx in w:
                    words.append(wx.strip().strip('–'))
            keywords.append(" ".join(words))
        # NARRATIVE features
        if narr_features or ((not train) and (symp_train in featurenames)):
            narr_string = ""
            item = child.find(element)
            if item != None:
                narr_string = data_util.stringify_children(item).encode(
                    'utf-8')
            if narr_string == "":
                print "warning: empty narrative"
            # lowercase and strip punctuation before optional stem/lemma
            narr_words = [
                w.strip() for w in narr_string.lower().translate(
                    string.maketrans("", ""), string.punctuation).split(' ')
            ]
            text = " ".join(narr_words)
            if stem:
                narr_string = preprocessing.stem(text)
            elif lemma:
                narr_string = preprocessing.lemmatize(text)
            narratives.append(narr_string.strip().lower())
            #print "Adding narr: " + narr_string.lower()
        # SYMPTOM features (training side reads pre-extracted symptoms)
        elif train and (symp_train in featurenames):
            narr_string = ""
            item = child.find("narrative_symptoms")
            if item != None:
                item_text = item.text
                if item_text != None and len(item_text) > 0:
                    narr_string = item.text.encode("utf-8")
            #narr_words = [w.strip() for w in narr_string.lower().translate(string.maketrans("",""), string.punctuation).split(' ')]
            narratives.append(narr_string.lower())
            print "Adding symp_narr: " + narr_string.lower()
        # Save features
        matrix.append(features)
    # Construct the feature matrix
    # COUNT or TFIDF features
    if narr_count in featurenames or kw_count in featurenames or narr_tfidf in featurenames or kw_tfidf in featurenames or lda in featurenames or symp_train in featurenames:
        documents = []
        if narr_count in featurenames or narr_tfidf in featurenames or lda in featurenames or symp_train in featurenames:
            documents = narratives
            print "narratives: " + str(len(narratives))
        elif kw_count in featurenames or kw_tfidf in featurenames:
            documents = keywords
            print "keywords: " + str(len(keywords))
        # Create count matrix; the fitted vectorizer persists in a global
        # so the test run reuses the training vocabulary
        global count_vectorizer
        if train:
            print "training count_vectorizer"
            count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(
                ngram_range=(min_ngram, max_ngram), stop_words=stopwords)
            count_vectorizer.fit(documents)
            dict_keys = dict_keys + count_vectorizer.get_feature_names()
        print "transforming data with count_vectorizer"
        count_matrix = count_vectorizer.transform(documents)
        matrix_keys = count_vectorizer.get_feature_names()
        print "writing count matrix to file"
        out_matrix = open(infile + ".countmatrix", "w")
        out_matrix.write(str(count_matrix))
        out_matrix.close()
        # Add count features to the dictionary
        for x in range(len(matrix)):
            feat = matrix[x]
            for i in range(len(matrix_keys)):
                key = matrix_keys[i]
                val = count_matrix[x, i]
                feat[key] = val
        # Convert counts to TFIDF
        if (narr_tfidf in featurenames) or (kw_tfidf in featurenames):
            print "converting to tfidf..."
            print "matrix_keys: " + str(len(matrix_keys))
            # Use the training count matrix for fitting
            if train:
                global tfidfTransformer
                tfidfTransformer = sklearn.feature_extraction.text.TfidfTransformer(
                )
                tfidfTransformer.fit(count_matrix)
            # Convert matrix to tfidf
            tfidf_matrix = tfidfTransformer.transform(count_matrix)
            print "count_matrix: " + str(count_matrix.shape)
            print "tfidf_matrix: " + str(tfidf_matrix.shape)
            # Replace features in matrix with tfidf
            for x in range(len(matrix)):
                feat = matrix[x]
                #values = tfidf_matrix[x,0:]
                #print "values: " + str(values.shape[0])
                for i in range(len(matrix_keys)):
                    key = matrix_keys[i]
                    val = tfidf_matrix[x, i]
                    feat[key] = val
        # LDA topic modeling features (fit on train, reuse global model)
        if lda in featurenames:
            global ldaModel
            if train:
                ldaModel = LatentDirichletAllocation(n_topics=num_topics)
                ldaModel.fit(count_matrix)
            lda_matrix = ldaModel.transform(count_matrix)
            for t in range(0, num_topics):
                dict_keys.append("lda_topic_" + str(t))
            for x in range(len(matrix)):
                for y in range(len(lda_matrix[x])):
                    val = lda_matrix[x][y]
                    matrix[x]["lda_topic_" + str(y)] = val
            # TODO: Print LDA topics
    # WORD2VEC features
    elif narr_vec in featurenames or event_vec in featurenames or event_seq in featurenames:
        feat_name = narr_vec
        if event_vec in featurenames:
            feat_name = event_vec
        elif event_seq in featurenames:
            feat_name = event_seq
        # NOTE(review): `vecfile` here is module-level, not arg_vecfile
        matrix, dict_keys = vector_features(feat_name, narratives, matrix,
                                            dict_keys, vecfile)
    # narr_seq for RNN
    elif narr_seq in featurenames:
        global vocab_size, max_seq_len
        if train:
            dict_keys.append(narr_seq)
            dict_keys.append('vocab_size')
            dict_keys.append('max_seq_len')
            # vocabulary size drives the hashing trick below
            vocab = set()
            for narr in narratives:
                words = narr.split(' ')
                for word in words:
                    vocab.add(word)
            vocab_size = len(vocab)
        max_seq_len = 0
        sequences = []
        # Convert text into integer sequences
        for x in range(len(matrix)):
            narr = narratives[x]
            seq = hashing_trick(narr,
                                vocab_size,
                                hash_function='md5',
                                filters='\t\n',
                                lower=True,
                                split=' ')
            if len(seq) > max_seq_len:
                max_seq_len = len(seq)
            sequences.append(seq)
        # Pad the sequences to a uniform length for the RNN
        sequences = pad_sequences(sequences,
                                  maxlen=max_seq_len,
                                  dtype='int32',
                                  padding='pre')
        for x in range(len(matrix)):
            matrix[x]['narr_seq'] = sequences[x]
            matrix[x]['vocab_size'] = vocab_size
            matrix[x]['max_seq_len'] = max_seq_len
    #if arg_rebalance != "":
    #    matrix_re = rebalance_data(matrix, dict_keys, arg_rebalance)
    #    write_to_file(matrix_re, dict_keys, outfile)
    #else:
    data_util.write_to_file(matrix, dict_keys, outfile)
def get_seqs(filename, split_sents=False, inline=True): print "get_seqs " + filename ids = [] narrs = [] anns = [] seqs = [] seq_ids = [] # Get the xml from file tree = etree.parse(filename) root = tree.getroot() for child in root: narr = "" rec_id = child.find(id_name).text ids.append(rec_id) # Get the narrative text node = child.find("narr_timeml_simple") if inline: if node == None: narr_node = child.find("narrative") if narr_node == None: print "no narrative: " + data_util.stringify_children( child) else: narr = narr_node.text #print "narr: " + narr narrs.append(narr) else: rec_id = child.find(id_name).text #print "rec_id: " + rec_id narr = data_util.stringify_children(node).encode('utf-8') #print "narr: " + narr ids.append(rec_id) narrs.append(narr) else: # NOT inline anns.append(data_util.stringify_children(node).encode('utf8')) narr_node = child.find("narrative") narrs.append(narr_node.text) if inline: for x in range(len(narrs)): narr = narrs[x] rec_id = ids[x] if split_sents: sents = narr.split('.') for sent in sents: sent_seq = xmltoseq.xml_to_seq(sent.strip()) seqs.append(sent_seq) seq_ids.append(rec_id) else: narr_seq = xmltoseq.xml_to_seq(narr) seqs.append(narr_seq) seq_ids.append(rec_id) else: for x in range(len(narrs)): narr = narrs[x] ann = anns[x] rec_id = ids[x] print "split_sents: " + str(split_sents) ann_seqs = xmltoseq.ann_to_seq(narr, ann, split_sents) print "seqs: " + str(len(ann_seqs)) for s in ann_seqs: seqs.append(s) seq_ids.append(rec_id) return seq_ids, seqs