# Shared imports for the snippets below. Project-specific helpers such as
# sanitize(), get_snippet(), get_tokenized_text(), and to_tokens() are
# defined elsewhere in this codebase.
import codecs
import os
from tempfile import NamedTemporaryFile

import serifxml  # BBN SERIF Python API


def read(serifxml_dir):
    serifxml_files = []
    for filename in os.listdir(serifxml_dir):
        if filename.endswith(".serifxml"):
            docid = filename[0:-len(".serifxml")]
        elif filename.endswith(".xml"):
            docid = filename[0:-len(".xml")]
        else:
            docid = None
        if docid is None:
            print("Skipping file: " + filename)
            continue
        serifxml_files.append((
            docid,
            os.path.join(serifxml_dir, filename),
        ))
    for docid, serifxml_file in serifxml_files:
        print("==== " + serifxml_file)
        serif_doc = serifxml.Document(serifxml_file)
        for s in serif_doc.sentences:
            st = s.sentence_theories[0]
            for serif_em in st.event_mention_set:
                event_type = serif_em.event_type
                anchor_text = serif_em.anchor_node.text
                confidence = serif_em.score  # not printed in this variant
                print(event_type)
                print(sanitize(anchor_text))
                print(sanitize(get_snippet(serif_doc, st)))
def read(serifxml_dir):
    print("Reading from: " + serifxml_dir)
    serifxml_files = []
    for filename in os.listdir(serifxml_dir):
        if filename.endswith(".serifxml"):
            docid = filename[0:-len(".serifxml")]
        elif filename.endswith(".xml"):
            docid = filename[0:-len(".xml")]
        else:
            docid = None
        if docid is None:
            print("Skipping file: " + filename)
            continue
        serifxml_files.append((docid, os.path.join(serifxml_dir, filename)))
    for docid, serifxml_file in serifxml_files:
        print("Reading " + serifxml_file)
        serif_doc = serifxml.Document(serifxml_file)
        for s in serif_doc.sentences:
            st = s.sentence_theories[0]
            for serif_em in st.event_mention_set:
                event_type = serif_em.event_type
                anchor_text = serif_em.anchor_node.text
                confidence = serif_em.score
                # if "." not in event_type:
                print("em\t" + event_type + "\t" + anchor_text + "\t" + str(confidence))
def to_lingo_doc(filepath):
    """Takes a filepath to a SerifXML file and uses its sentences, tokens,
    entity mentions, and value mentions to construct a
    nlplingo.text.text_theory.Document.

    Returns:
        nlplingo.text.text_theory.Document
    """
    serif_doc = serifxml.Document(filepath)
    """:type: serifxml.Document"""
    docid = serif_doc.docid
    lingo_doc = lingoDoc(docid)
    for st_index, sentence in enumerate(serif_doc.sentences):
        st = sentence.sentence_theories[0]
        """:type: serifxml.SentenceTheory"""
        if len(st.token_sequence) == 0:
            continue
        st_text, st_start, st_end = get_snippet(serif_doc, st)

        tokens = to_tokens(st)
        # get_snippet() apparently returns an inclusive end offset, while
        # token end offsets are exclusive, hence the +1 below.
        assert st_start == tokens[0].start_char_offset()
        assert (st_end + 1) == tokens[-1].end_char_offset()

        s = Sentence(docid, IntPair(st_start, st_end + 1), st_text, tokens, st_index)
        add_entity_mentions(st, s, lingo_doc)
        add_value_mentions(st, s, lingo_doc)
        add_names(st, lingo_doc)
        lingo_doc.add_sentence(s)
    return lingo_doc
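# A minimal usage sketch for to_lingo_doc. The path below is a hypothetical
# placeholder, and printing via repr() avoids assuming any particular
# attribute on the returned nlplingo Document.
example_lingo_doc = to_lingo_doc("/path/to/example.serifxml")
print(repr(example_lingo_doc))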
def extractSentences(serifXmlPath):
    """Return parallel per-sentence lists of tokens and their character
    offsets for a SerifXML document."""
    serif_doc = serifxml.Document(serifXmlPath)
    doc_tokens = []
    doc_offsets = []
    for s in serif_doc.sentences:
        st = s.sentence_theories[0]
        (tokens, offsets) = get_tokenized_text(st)
        doc_tokens.append(tokens)
        doc_offsets.append(offsets)
    return (doc_tokens, doc_offsets)
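# Sketch of consuming the parallel lists returned by extractSentences. The
# input path is a hypothetical placeholder, and the shape of each offset
# entry is whatever get_tokenized_text() produces.
doc_tokens, doc_offsets = extractSentences("/path/to/example.serifxml")
for sent_tokens, sent_offsets in zip(doc_tokens, doc_offsets):
    for token, offset in zip(sent_tokens, sent_offsets):
        print(token, offset)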
def record_doc_vocab(filepath, vocab_locations):
    """
    :type vocab_locations: dict[str, defaultdict(list)]
    """
    serif_doc = serifxml.Document(filepath)
    """:type: serifxml.Document"""
    docid = serif_doc.docid
    for st_index, sentence in enumerate(serif_doc.sentences):
        # st = sentence.sentence_theories[0]
        # """:type: serifxml.SentenceTheory"""
        # st_text, st_start, st_end = get_snippet(serif_doc, st)
        tokens = to_tokens(sentence)
        record_unigram_info(docid, tokens, vocab_locations, sentence)
        record_bigram_info(docid, tokens, vocab_locations, sentence)
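# Sketch of driving record_doc_vocab over a directory. The "unigram" and
# "bigram" keys are hypothetical; the real keys are whatever
# record_unigram_info() and record_bigram_info() expect.
from collections import defaultdict

vocab_locations = {"unigram": defaultdict(list), "bigram": defaultdict(list)}
for fn in os.listdir("/path/to/serifxml_dir"):
    record_doc_vocab(os.path.join("/path/to/serifxml_dir", fn), vocab_locations)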
def extractSentences(self, serifXmlPath):
    # Write one whitespace-joined sentence per line to a temp file, and
    # keep the tokens/offsets for the caller.
    self.sentenceFile = NamedTemporaryFile('w', delete=False)
    self.sentenceFile.close()
    o = codecs.open(self.sentenceFile.name, 'w', encoding='utf8')
    serif_doc = serifxml.Document(serifXmlPath)
    doc_tokens = []
    doc_offsets = []
    for s in serif_doc.sentences:
        st = s.sentence_theories[0]
        (tokens, offsets) = self.get_tokenized_text(st)
        doc_tokens.append(tokens)
        doc_offsets.append(offsets)
        o.write(' '.join(tokens) + "\n")
    o.close()
    # return self.sentenceFile.name
    return (self.sentenceFile.name, doc_tokens, doc_offsets)
files_length = len(serifxml_files)
count = 0
actor_id_to_entity_group = dict()
for docid, serifxml_file in serifxml_files:
    mention_map = dict()  # maps serif mention to KBMention
    # maps serif event mention (or icews event mention) to
    # (KBEvent, KBEventMention, SentenceTheory)
    event_map = dict()
    count += 1
    print("SerifXMLReader producing KB objects in: " + docid +
          " (" + str(count) + "/" + str(files_length) + ")")
    serif_doc = serifxml.Document(serifxml_file)
    for serif_event in serif_doc.event_set:
        event_type = serif_event.event_type
        print("=== Event: " + event_type)
        # + "\ttense: " + serif_event.tense.value + " genericity: " + serif_event.genericity.value
        # + " modality: " + serif_event.modality.value + " polarity: " + serif_event.polarity.value
        for argument in serif_event.arguments:
            if argument.entity is not None:
                canonical_name = "NA"
                if argument.entity.canonical_name is not None:
                    canonical_name = argument.entity.canonical_name
                # The original snippet is truncated mid-expression here;
                # canonical_name is assumed as the natural continuation.
                print(" -" + str(argument.role) + ": " +
                      argument.entity.entity_type + "\t" + canonical_name)
# Fragment of a loop over filenames in input_dir; the opening
# endswith(".serifxml") test is reconstructed from the elif that follows.
if filename.endswith(".serifxml"):
    docid = filename[0:-len(".serifxml")]
elif filename.endswith(".sgm.xml"):
    docid = filename[0:-len(".sgm.xml")]
else:
    docid = None
if docid is None:
    continue
doc_num += 1
print("[cross-doc] Doc #" + str(doc_num) + ": " + docid)
# outfile = os.path.join(output_dir, docid + ".json")
# o = codecs.open(outfile, 'w', encoding='utf8')
serif_doc = serifxml.Document(os.path.join(input_dir, filename))
processor.produce_entities(docid, serif_doc)
if include_entities:
    for eid in processor.eid_to_entity:
        entity = processor.eid_to_entity[eid]
        result["all"]["entities"].append(entity)
        # If we have 3 doc types, we'll split the entity into 3, each
        # containing the mentions from a particular doc type.
        # doc_type_to_entity is a dict that maps doc_type to a single entity.
        doc_type_to_entity = entity.split_into_doc_types(docid_to_doc_type_map)
        for doc_type in doc_type_to_entity:
            entity = doc_type_to_entity[doc_type]
            result[doc_type]["entities"].append(entity)
# CREATE INPUT TEXT FILE
# Text file containing sentences that we will run on
text_filename = basename
if not text_filename.endswith(".txt"):
    text_filename = text_filename + ".txt"
outfile = os.path.join(output_dir, text_filename)
o = codecs.open(outfile, 'w', encoding='utf8')

# Metadata file containing char offset mapping from text file to
# serifxml file
metadata_filename = basename + ".meta"
metadata_file = os.path.join(metadata_dir, metadata_filename)
m = open(metadata_file, 'w')

document = serifxml.Document(input_file)
current_output_offset = 0
for sentence in document.sentences:
    original_sentence_text = sentence.text
    sentence_text = original_sentence_text.replace("\r", " ")
    sentence_text = sentence_text.replace("\n", " ")
    o.write(sentence_text + "\n")
    m.write(str(current_output_offset) + " " + str(sentence.start_char) + "\n")
    current_output_offset += len(sentence.text) + 1  # +1 for newline character
o.close()
m.close()

# RUN DISCOURSE PARSER
time_limit = 1800
os.chdir(config_dir)
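# Sketch of reading the .meta file written above: each line pairs an offset
# into the text file with the corresponding sentence.start_char in the
# SerifXML, so text-file offsets can be mapped back to source-document
# offsets. load_offset_map is a hypothetical helper, not part of the
# pipeline above.
def load_offset_map(metadata_file):
    offset_map = []
    with open(metadata_file) as f:
        for line in f:
            text_offset, serif_offset = line.split()
            offset_map.append((int(text_offset), int(serif_offset)))
    return offset_map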