def parseCorpus(self, docToVerifiedSentences):
    """Parse the ECB XML files under ``self.args.corpusPath`` into a Corpus.

    For every ``*.xml`` file found (skipping training directories that are
    not within the first ``self.args.devDir`` entries of
    ``self.helper.trainingDirs``) this method:

    1. reads the ``<token>`` elements and builds ``Token`` objects, making
       sure every sentence ends in punctuation (a trailing atomic ":" is
       rewritten to "." and a synthetic "." token is appended otherwise);
    2. reads the markables and builds ``Mention`` objects, optionally
       dropping single-token pronoun mentions according to
       ``self.helper.event_pronouns`` / ``self.helper.entity_pronouns``;
    3. reads the ``CROSS_DOC_COREF`` relations (and ``INTRA_DOC_COREF``
       ones when ``self.args.addIntraDocs`` is set) and registers the
       mentions with the corpus under their REF;
    4. optionally prints the annotated tokens of each document when
       ``self.printCorpusTokens`` is set.

    Args:
        docToVerifiedSentences: mapping indexed by doc_id whose values
            support ``in`` tests on sentence numbers. It is only consulted
            when ``self.args.onlyValidSentences`` is truthy; mentions whose
            first token lies in a sentence that is not listed are ignored.

    Returns:
        The populated ``Corpus`` instance.

    Raises:
        SystemExit: if a relation refers to a mention id that was neither
            constructed nor deliberately removed as a pronoun.
    """
    # maps the long, original REF names to a small, more readable REF ID
    REFToUREF = {}
    UREF = 0

    print("* parsing ECB corpus:", self.args.corpusPath)
    numMentionsIgnored = 0
    corpus = Corpus()
    files = []

    filteredTrainingDirs = self.helper.trainingDirs[0:self.args.devDir]
    print("filteredTrainingDirs:", filteredTrainingDirs)
    for root, _, filenames in os.walk(self.args.corpusPath):
        for filename in fnmatch.filter(filenames, '*.xml'):
            # file names are expected to start with "<dir number>_"
            dir_num = int(filename.split("_")[0])
            if dir_num in self.helper.trainingDirs and dir_num not in filteredTrainingDirs:
                continue
            files.append(os.path.join(root, filename))

    # the patterns are identical for every document, so compile them once
    token_re = re.compile(r"<token t\_id=\"(\d+)\" sentence=\"(\d+)\" number=\"(\d+)\".*?>(.*?)</(.*?)>")
    markable_re = re.compile(r"<([\w]+) m_id=\"(\d+)?\".*?>(.*?)?</.*?>")
    anchor_re = re.compile(r"<token_anchor t_id=\"(\d+)\".*?/>")
    cross_doc_re = re.compile(r"<CROSS_DOC_COREF.*?note=\"(.+?)\".*?>(.*?)?</.*?>")
    intra_doc_re = re.compile(r"<INTRA_DOC_COREF.*?>(.*?)?</.*?>")
    source_re = re.compile(r"<source m_id=\"(\d+)\".*?/>")

    globalSentenceNum = 0
    lastToken_id = -1
    intraCount = 0

    # used for keeping track of how many mentions were pronouns
    had_pronoun = 0
    not_had_pronoun = 0
    num_events_with_pronouns = 0

    for f in sorted(files):
        lm_idToMention = {}  # only used to tmp store the mentions
        # keeps track of the mentions that had pronouns and we removed
        # (if we care to remove them)
        removed_m_ids = set()

        doc_id = os.path.basename(f)
        dir_num = int(doc_id.split("_")[0])
        extension = doc_id[doc_id.find("ecb"):]
        dirHalf = str(dir_num) + extension

        curDoc = Doc(doc_id)
        corpus.ECBDirs[dir_num].docs[doc_id] = curDoc
        corpus.dirHalves[dirHalf].docs[doc_id] = curDoc
        tmpDocTokens = []
        tmpDocTokenIDsToTokens = {}

        # opens the xml file and makes needed replacements; the context
        # manager guarantees the handle is released for every document
        with open(f, 'r', encoding="utf-8") as input_file:
            fileContents = input_file.read().replace('\n', ' ')
        for badToken in self.replacementsList:
            fileContents = fileContents.replace(badToken, self.replacements[badToken])

        # ---- reads <tokens> ----
        lastSentenceNum = -1
        if self.write_stanford_input:
            tmp_line_to_stanford_input = defaultdict(list)

        # numbers every token within its sentence, starting at 0
        tokenNum = 0
        firstToken = True
        lastTokenText = ""
        for match in token_re.finditer(fileContents):
            t_id = match.group(1)
            sentenceNum = int(match.group(2))
            hTokenNum = int(match.group(3))  # only used for matching w/ HDDCRP's files
            tokenText = match.group(4).lower().rstrip()

            # removes the ":" from tokens that end in it (e.g., newspaper:)
            # but leaves the atomic ":" alone
            if len(tokenText) > 1 and tokenText[-1] == ":":
                tokenText = tokenText[:-1]

            if tokenText == "''":
                tokenText = "\""
            elif tokenText == "''bagman\"":
                tokenText = "\"bagman\""
                print("* replaced bagman1")
            elif tokenText == "''bagman":
                tokenText = "\"bagman"
                print("* replaced bagman2")

            if sentenceNum > curDoc.highestSentenceNum:
                curDoc.highestSentenceNum = sentenceNum

            # sentence 0 of the "plus" documents is skipped entirely
            if sentenceNum > 0 or "plus" not in doc_id:

                # writes Stanford_input (original casing, not lower-cased)
                if self.write_stanford_input:
                    tmp_line_to_stanford_input[int(sentenceNum)].append(match.group(4).rstrip())

                hSentenceNum = sentenceNum
                if "plus" in doc_id:
                    hSentenceNum = sentenceNum - 1

                # we are starting a new sentence
                if sentenceNum != lastSentenceNum:
                    # we are possibly ending the prev sentence
                    if not firstToken:
                        if lastTokenText == ":":
                            # sentence ended with an atomic ":"; turn it into "."
                            tmpDocTokenIDsToTokens[lastToken_id].text = "."
                        elif lastTokenText not in self.endPunctuation:
                            endToken = Token("-1", lastSentenceNum, globalSentenceNum, tokenNum, doc_id, hSentenceNum, hTokenNum, ".")
                            tmpDocTokens.append(endToken)
                        globalSentenceNum += 1
                    tokenNum = 0

                # adds token
                curToken = Token(t_id, sentenceNum, globalSentenceNum, tokenNum, doc_id, hSentenceNum, hTokenNum, tokenText)
                tmpDocTokenIDsToTokens[t_id] = curToken
                firstToken = False
                tmpDocTokens.append(curToken)
                tokenNum += 1
                curDoc.globalSentenceNums.add(globalSentenceNum)

            lastSentenceNum = sentenceNum
            lastTokenText = tokenText
            lastToken_id = t_id

        if self.write_stanford_input:
            with open("../data/stanford_in/"+doc_id, "w") as tmpFOUT:
                for sent_num in sorted(tmp_line_to_stanford_input.keys()):
                    tmpFOUT.write(" ".join(tmp_line_to_stanford_input[sent_num]) + "\n")

        # closes the final sentence of the document the same way
        if lastTokenText == ":":
            tmpDocTokenIDsToTokens[lastToken_id].text = "."
        elif lastTokenText not in self.endPunctuation:
            endToken = Token("-1", lastSentenceNum, globalSentenceNum, tokenNum, doc_id, -1, -1, ".")
            tmpDocTokens.append(endToken)
        globalSentenceNum += 1

        for t in tmpDocTokens:
            corpus.addToken(t)
            curDoc.tokens.append(t)
            corpus.UIDToToken[t.UID] = t

        # ---- reads <markables> ----
        markables = fileContents[fileContents.find("<Markables>")+11:fileContents.find("</Markables>")]
        for match in markable_re.finditer(markables):
            mentionType = match.group(1)
            isPred = "ACTION" in mentionType
            m_id = int(match.group(2))

            # gets the tokens the mention is anchored to
            tmpTokens = []
            text = []
            hasAllTokens = True
            for match2 in anchor_re.finditer(match.group(3)):
                tokenID = match2.group(1)
                if tokenID in tmpDocTokenIDsToTokens:
                    cur_token = tmpDocTokenIDsToTokens[tokenID]
                    tmpTokens.append(cur_token)
                    text.append(cur_token.text)
                else:
                    hasAllTokens = False

            # only single-token mentions can count as pronouns
            has_pronoun = len(text) == 1 and text[0] in self.helper.pronouns
            if has_pronoun:
                had_pronoun += 1
                if isPred:
                    num_events_with_pronouns += 1
            else:
                not_had_pronoun += 1

            # only process Mentions if they adhere to our preferences of
            # using pronouns or not
            if isPred:
                use_pronoun = self.helper.event_pronouns
            else:
                use_pronoun = self.helper.entity_pronouns

            use_mention = True
            if not use_pronoun and has_pronoun:
                use_mention = False
                removed_m_ids.add(m_id)

            # we should only have incomplete Mentions for our hand-curated,
            # sample corpus, for we do not want to have all mentions, so we
            # curtail the sentences of tokens
            if hasAllTokens and use_mention:
                curMention = Mention(dirHalf, dir_num, doc_id, tmpTokens, text, isPred, mentionType)
                lm_idToMention[m_id] = curMention

        # ---- reads <relations> ----
        relations = fileContents[fileContents.find("<Relations>"):fileContents.find("</Relations>")]
        for match in cross_doc_re.finditer(relations):
            REF = match.group(1)

            # only keep track of REFs for which we have found Mentions
            for match2 in source_re.finditer(match.group(2)):
                m_id = int(match2.group(1))
                if m_id not in lm_idToMention:
                    # mentions we removed ourselves (pronouns) are fine to miss
                    if m_id not in removed_m_ids:
                        print("*** MISSING MENTION! EXITING 1")
                        raise SystemExit(1)
                    continue

                foundMention = lm_idToMention[m_id]
                if self.onlyEvents and not foundMention.isPred:
                    continue
                token0 = foundMention.tokens[0]
                if self.args.onlyValidSentences and token0.sentenceNum not in docToVerifiedSentences[doc_id]:
                    numMentionsIgnored += 1
                    continue
                corpus.addMention(foundMention, REF)

        if self.args.addIntraDocs:
            for match in intra_doc_re.finditer(relations):

                # only keep track of REFs for which we have found Mentions
                for match2 in source_re.finditer(match.group(1)):
                    m_id = int(match2.group(1))
                    if m_id not in lm_idToMention:
                        # same tolerance as the cross-doc branch: a mention
                        # dropped as a pronoun must not abort the parse
                        if m_id not in removed_m_ids:
                            print("*** MISSING MENTION! EXITING 2")
                            raise SystemExit(1)
                        continue

                    foundMention = lm_idToMention[m_id]
                    if self.onlyEvents and not foundMention.isPred:
                        continue
                    token0 = foundMention.tokens[0]
                    if self.args.onlyValidSentences and token0.sentenceNum not in docToVerifiedSentences[doc_id]:
                        numMentionsIgnored += 1
                        continue
                    corpus.addMention(foundMention, "INTRA"+str(intraCount))
                # one synthetic REF per intra-doc relation
                intraCount += 1

        corpus.addDocPointer(doc_id, curDoc)

        # optionally displays annotations (Mentions clearly designated w/ unique REF IDs#)
        if self.printCorpusTokens:
            print("\n------------------\ndoc:",doc_id,"\n------------------")
            sent_num = -1
            oline = ""
            lastMentions = set()
            for t in curDoc.tokens:
                # flush the line whenever a new sentence begins; sent_num
                # must be updated on the very first token as well, otherwise
                # the break could never trigger
                if t.sentenceNum != sent_num:
                    if sent_num != -1:
                        print(oline)
                        oline = ""
                    sent_num = t.sentenceNum

                added = False
                removed = False
                urefToAdd = -1
                entOrEventToAdd = ""
                for m in t.mentions:
                    if m not in lastMentions:
                        if m.REF in REFToUREF:
                            urefToAdd = REFToUREF[m.REF]
                        else:
                            urefToAdd = UREF
                            REFToUREF[m.REF] = UREF
                            UREF += 1
                        entOrEventToAdd = "v" if m.isPred else "ent"
                        added = True

                # a mention open on the previous token but not on this one
                # has just ended
                for m in lastMentions:
                    if m not in t.mentions:
                        removed = True

                if removed:
                    oline += "] "
                if added:
                    if len(oline) > 0 and oline[-1] != " ":
                        oline += " "
                    oline += str(entOrEventToAdd) + str(urefToAdd) + "["
                if len(oline) > 0 and oline[-1] != " " and oline[-1] != "[":
                    oline += " "
                oline += str(t.text)
                lastMentions = t.mentions
            print(oline)

    corpus.assignGlobalSentenceNums()
    print("numMentionsIgnored:", numMentionsIgnored)
    print("# ECB mentions created:", len(corpus.ecb_mentions))
    num_events = 0
    for m in corpus.ecb_mentions:
        if m.isPred:
            num_events += 1
    print("\t# events:", num_events)
    print("\t\t# of event which had pronouns:", num_events_with_pronouns)
    print("\t# entities:", len(corpus.ecb_mentions) - num_events)
    print("# ECB+ tokens:", len(corpus.corpusTokens))
    print("# mentions that had_pronoun:", had_pronoun)
    print("# mentions that did not had_pronoun:", not_had_pronoun)
    return corpus