def loadCorpus(corpus, parse, tokenization=None, removeNameInfo=False, removeIntersentenceInteractionsFromCorpusElements=True):
    """
    Load an entire corpus through CorpusElements and add SentenceGraph-objects
    to its SentenceElements-objects.
    """
    import types
    import sys
    import Utils.ElementTreeUtils as ETUtils
    from Utils.ProgressCounter import ProgressCounter
    from Utils.InteractionXML.CorpusElements import CorpusElements

    # The corpus may be a file path or an already-parsed ElementTree object
    if type(corpus) == types.StringType:
        print >> sys.stderr, "Loading corpus file", corpus
    corpusTree = ETUtils.ETFromObj(corpus)
    corpusRoot = corpusTree.getroot()
    # Use the CorpusElements class to access the XML tree
    corpusElements = CorpusElements(corpusRoot, parse, tokenization, tree=corpusTree, removeNameInfo=removeNameInfo, removeIntersentenceInteractions=removeIntersentenceInteractionsFromCorpusElements)
    print >> sys.stderr, str(len(corpusElements.documentsById)) + " documents, " + str(len(corpusElements.sentencesById)) + " sentences"

    # Make sentence graphs
    duplicateInteractionEdgesRemoved = 0
    counter = ProgressCounter(len(corpusElements.sentences), "Make sentence graphs")
    counter.showMilliseconds = True
    for sentence in corpusElements.sentences[:]:
        counter.update(1, "Making sentence graphs (" + sentence.sentence.get("id") + "): ")
        # No tokens, no sentence. Likewise, no dependencies, no sentence.
        # Such sentences are not removed, though, so that no sentences are
        # lost from the input.
        if len(sentence.tokens) == 0 or len(sentence.dependencies) == 0:
            sentence.sentenceGraph = None
            continue
        for pair in sentence.pairs:
            # The interaction XML format defines two closely related element
            # types, interactions and pairs. Pairs are like interactions, but
            # they can also be negative (if interaction-attribute == False).
            # Sometimes pair-elements have been (incorrectly) used without
            # this attribute. To work around these issues, all pair-elements
            # that define an interaction are added to the interaction-element
            # list.
            isInteraction = pair.get("interaction")
            if isInteraction == "True" or isInteraction == None:
                sentence.interactions.append(pair) # add to interaction-elements
                if pair.get("type") == None: # type-attribute must be explicitly defined
                    pair.set("type", "undefined")
        # Construct the basic SentenceGraph (only syntactic information)
        graph = SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
        # Add semantic information, i.e. the interactions
        graph.mapInteractions(sentence.entities, sentence.interactions)
        graph.interSentenceInteractions = sentence.interSentenceInteractions
        duplicateInteractionEdgesRemoved += graph.duplicateInteractionEdgesRemoved
        sentence.sentenceGraph = graph
        graph.parseElement = sentence.parseElement
    print >> sys.stderr, "Skipped", duplicateInteractionEdgesRemoved, "duplicate interaction edges in SentenceGraphs"
    return corpusElements
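# Usage sketch (illustrative addition, not part of the original module).
# Shows how loadCorpus() ties the pieces together: the file name "corpus.xml"
# and the parse name "McCC" are hypothetical example values; the input must be
# an interaction XML file containing that parse.
def _demoLoadCorpus(corpusPath="corpus.xml", parseName="McCC"):
    import sys
    # Load the corpus and build a SentenceGraph for every parsed sentence
    corpusElements = loadCorpus(corpusPath, parseName)
    for sentence in corpusElements.sentences:
        graph = sentence.sentenceGraph
        if graph == None: # the sentence had no tokens or no dependencies
            continue
        # Each graph combines the syntactic layer (tokens, dependencies)
        # with the semantic layer (entities, interactions)
        print >> sys.stderr, sentence.sentence.get("id"), len(sentence.tokens), "tokens"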
def findHeads(input, parse, tokenization=None, output=None, removeExisting=True, iterate=False):
    # ETUtils and SentenceGraph are assumed to be imported at module level
    import sys
    if iterate:
        from Utils.ProgressCounter import ProgressCounter
        import InteractionXML.SentenceElements as SentenceElements
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        print >> sys.stderr, "Removing existing head offsets"
        removeCount = 0
        counter = ProgressCounter(None, "Find heads")
        counter.showMilliseconds = True
        for sentences in SentenceElements.getCorpusIterator(input, output, parse, tokenization):
            for sentence in sentences:
                if removeExisting:
                    for e in sentence.sentence.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
                # Building the graph determines the head offsets
                graph = SentenceGraph.SentenceGraph(sentence.sentence, sentence.tokens, sentence.dependencies)
                graph.mapInteractions(sentence.entities, sentence.interactions)
                # Make sure every parse gets head scores
                #if graph.tokenHeadScores == None:
                #    graph.getTokenHeadScores()
            counter.update(len(sentences), "Finding heads (" + sentences[-1].sentence.get("id") + "): ")
        print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
    else:
        xml = ETUtils.ETFromObj(input)
        if removeExisting:
            print >> sys.stderr, "Removing existing head offsets"
            removeCount = 0
            for d in xml.getroot().findall("document"):
                for s in d.findall("sentence"):
                    for e in s.findall("entity"):
                        if e.get("headOffset") != None:
                            removeCount += 1
                            del e.attrib["headOffset"]
            print >> sys.stderr, "Removed head offsets from", removeCount, "entities"
        # SentenceGraph automatically calculates head offsets and adds them to entities if they are missing
        print >> sys.stderr, "Determining head offsets using parse", parse, "and tokenization", tokenization
        corpusElements = SentenceGraph.loadCorpus(xml, parse, tokenization)
        # Make sure every parse gets head scores
        for sentence in corpusElements.sentences:
            if sentence.sentenceGraph == None:
                continue
            if sentence.sentenceGraph.tokenHeadScores == None:
                sentence.sentenceGraph.getTokenHeadScores()
        if output != None:
            print >> sys.stderr, "Writing output to", output
            ETUtils.write(corpusElements.rootElement, output)
        return xml
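# Usage sketch (illustrative addition, not part of the original module).
# SentenceGraph.loadCorpus() computes missing "headOffset" attributes as a
# side effect, so findHeads() only has to clear stale values and re-load.
# The file names below are hypothetical examples.
def _demoFindHeads():
    return findHeads("corpus.xml", "McCC", output="corpus-with-heads.xml",
                     removeExisting=True)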
def waitForProcess(process, numCorpusSentences, measureByGap, outputFile, counterName, updateMessage, timeout=None):
    """
    Wait for a process to finish, tracking the number of sentences it writes
    to its output file. If no new sentence appears within the timeout, the
    process is considered stalled and is killed.
    """
    maxStartupTime = 600 # Give extra time for the process to start up (even if it immediately creates an empty output file)
    counter = ProgressCounter(numCorpusSentences, counterName)
    counter.showMilliseconds = True
    prevNumSentences = 0 # Number of output sentences on the previous check
    numSentences = 0 # Initialized here in case the output file never appears
    finalCheckLeft = True # Make one final check to update counters
    processStatus = None # When None, process not finished
    prevTime = time.time()
    startTime = time.time()
    # Wait until the process is finished, periodically checking its progress.
    while processStatus == None or finalCheckLeft:
        if processStatus != None: # Extra loop to let counters finish
            finalCheckLeft = False # Done only once
        if os.path.exists(outputFile[0]): # Output file has appeared on disk
            # Measure the number of sentences in the output file
            numSentences = 0
            f = codecs.open(outputFile[0], "rt", **outputFile[1])
            for line in f:
                if measureByGap:
                    if line.strip() == "":
                        numSentences += 1
                else:
                    numSentences += 1
            f.close()
            # Update status
            if numSentences - prevNumSentences != 0: # Process has progressed
                counter.update(numSentences - prevNumSentences, updateMessage + ": ")
            if finalCheckLeft: # This is a normal loop, not the final check
                # Startup time hasn't yet passed or the process has made progress
                if time.time() - startTime < maxStartupTime or numSentences - prevNumSentences != 0:
                    prevTime = time.time() # reset timeout
                else: # Nothing happened on this update, check whether the process hung
                    elapsedTime = time.time() - prevTime
                    if timeout != None and elapsedTime > timeout:
                        print >> sys.stderr, "Process timed out (" + str(elapsedTime) + " vs. " + str(timeout) + ")"
                        print >> sys.stderr, "Killing process"
                        process.kill()
                prevNumSentences = numSentences
                time.sleep(1)
        else: # Output file doesn't exist yet
            prevTime = time.time() # reset the timeout if the output file hasn't been created
        processStatus = process.poll() # Get process status, None == still running
    counter.markFinished() # If we get this far, don't show the error message even if the process didn't finish
    return (numSentences, numCorpusSentences)
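# Usage sketch (illustrative addition, not part of the original module).
# waitForProcess() expects outputFile as a (path, codecs-kwargs) tuple, since
# the file is reopened with codecs.open(path, "rt", **kwargs) on every check.
# With measureByGap=True, progress is counted by blank lines (one gap per
# sentence, as in CoNLL-style output); otherwise every line counts as one
# sentence. The tool command and file name below are hypothetical examples.
def _demoWaitForProcess(numCorpusSentences):
    import subprocess
    outPath = "parser-output.txt" # hypothetical output file written by the tool
    process = subprocess.Popen(["./run-tool.sh", outPath]) # hypothetical command
    return waitForProcess(process, numCorpusSentences, measureByGap=True,
                          outputFile=(outPath, {"encoding": "utf-8"}),
                          counterName="Parse", updateMessage="Parsing",
                          timeout=600)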
def makeSentences(input, output=None, removeText=False, postProcess=True, debug=False):
    """
    Run the GENIA Sentence Splitter

    Divide the text in the "text" attributes of document and section elements
    into sentence elements. The sentence elements are inserted into their
    respective parent elements.
    """
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    print >> sys.stderr, "Running GENIA Sentence Splitter", Settings.GENIA_SENTENCE_SPLITTER_DIR,
    if postProcess:
        print >> sys.stderr, "(Using post-processing)"
    else:
        print >> sys.stderr, "(No post-processing)"
    docCount = 0
    sentencesCreated = 0
    redivideCount = 0
    emptySentenceCount = 0
    sourceElements = [x for x in corpusRoot.getiterator("document")] + [x for x in corpusRoot.getiterator("section")]
    counter = ProgressCounter(len(sourceElements), "GeniaSentenceSplitter")
    counter.showMilliseconds = True
    # Create the working directory
    workdir = tempfile.mkdtemp()
    for document in sourceElements:
        counter.update(1, "Splitting Documents (" + document.get("id") + "): ")
        docId = document.get("id")
        if docId == None:
            docId = "CORPUS.d" + str(docCount)
        docTag = "-" + str(docCount)
        assert document.find("sentence") == None
        text = document.get("text")
        if text == None or text.strip() == "":
            continue
        # Write the text to a workfile. Note that whatever is passed to a
        # codecs file object must be a unicode object, or it will be
        # automatically decoded as ASCII (see
        # http://themoritzfamily.com/python-encodings-and-unicode.html).
        # Decoding with the correct codec is taken care of earlier in the
        # pipeline, so text can be written as-is here.
        workfile = codecs.open(os.path.join(workdir, "sentence-splitter-input.txt" + docTag), "wt", "utf-8")
        workfile.write(text)
        workfile.close()
        # Run the sentence splitter
        assert os.path.exists(Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh"), Settings.GENIA_SENTENCE_SPLITTER_DIR
        args = [Settings.GENIA_SENTENCE_SPLITTER_DIR + "/run_geniass.sh",
                os.path.join(workdir, "sentence-splitter-input.txt" + docTag),
                os.path.join(workdir, "sentence-splitter-output.txt" + docTag),
                Settings.RUBY_PATH]
        p = subprocess.Popen(args, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = p.communicate()
        if stdout != "":
            print >> sys.stderr, stdout
        if stderr != 'Extracting events.roading model file.\nstart classification.\n':
            print >> sys.stderr, stderr
        if postProcess:
            postProcessorPath = os.path.join(os.path.dirname(os.path.abspath(__file__)), "geniass-postproc.pl")
            assert os.path.exists(postProcessorPath), postProcessorPath
            ppIn = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt" + docTag), "rt", "utf-8")
            ppOut = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt" + docTag), "wt", "utf-8")
            perlReturnValue = subprocess.call(["perl", postProcessorPath], stdin=ppIn, stdout=ppOut)
            assert perlReturnValue == 0, perlReturnValue
            ppIn.close()
            ppOut.close()
            # Read the split sentences
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output-postprocessed.txt" + docTag), "rt", "utf-8")
        else:
            workfile = codecs.open(os.path.join(workdir, "sentence-splitter-output.txt" + docTag), "rt", "utf-8")
        # Sentences are aligned consecutively to the document text to
        # determine their character offsets
        sentenceCount = 0
        docIndex = 0
        sentenceBeginIndex = -1
        prevSentence = None
        prevEndIndex = None
        prevText = None
        for sText in workfile.readlines():
            sText = sText.strip() # The text of the sentence
            if sText == "":
                emptySentenceCount += 1
                continue
            for i in range(len(sText)):
                if sText[i].isspace():
                    assert sText[i] not in ["\n", "\r"]
                    continue
                while text[docIndex].isspace():
                    if text[docIndex] in ["\n", "\r"] and sentenceBeginIndex != -1:
                        # A line break inside a sentence: redivide the sentence at the break
                        redivideCount += 1
                        prevSentence = makeSentence(text, sentenceBeginIndex, docIndex, prevSentence, prevEndIndex)
                        prevSentence.set("id", docId + ".s" + str(sentenceCount))
                        prevSentence.set("redevided", "True")
                        sentencesCreated += 1
                        sentenceCount += 1
                        prevEndIndex = docIndex - 1
                        sentenceBeginIndex = -1
                        document.append(prevSentence)
                    docIndex += 1
                assert sText[i] == text[docIndex], (text, sText, prevText, sText[i:i+10], text[docIndex:docIndex+10], (i, docIndex), sentenceBeginIndex)
                if sentenceBeginIndex == -1:
                    sentenceBeginIndex = docIndex
                docIndex += 1
            prevText = sText
            if sentenceBeginIndex != -1:
                prevSentence = makeSentence(text, sentenceBeginIndex, docIndex, prevSentence, prevEndIndex)
                prevSentence.set("id", docId + ".s" + str(sentenceCount))
                prevEndIndex = docIndex - 1
                sentenceBeginIndex = -1
                sentencesCreated += 1
                sentenceCount += 1
                document.append(prevSentence)
        # Add a possible tail for the last sentence
        if prevEndIndex < len(text) - 1 and prevSentence != None:
            assert prevSentence.get("tail") == None, prevSentence.get("tail")
            prevSentence.set("tail", text[prevEndIndex + 1:])
        if emptySentenceCount > 0:
            print >> sys.stderr, "Warning,", emptySentenceCount, "empty sentences in", document.get("id")
        # Remove the original text attribute
        if removeText:
            del document.attrib["text"]
        # Move elements from the document element into the new sentences
        moveElements(document)
        docCount += 1
    print >> sys.stderr, "Sentence splitting created", sentencesCreated, "sentences"
    print >> sys.stderr, "Redivided", redivideCount, "sentences"
    if debug:
        print >> sys.stderr, "Work directory preserved for debugging at", workdir
    else:
        # Remove the working directory
        shutil.rmtree(workdir)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
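# Alignment sketch (illustrative addition, not part of the original module).
# This shows, in miniature, how the loop above recovers character offsets:
# each splitter output line is matched against the document text character by
# character, skipping whitespace on both sides, so every sentence gets its
# original character span even though the splitter normalizes whitespace.
def _demoAlign(text, sentenceLines):
    offsets = []
    docIndex = 0
    for sText in sentenceLines:
        begin = None
        for char in sText:
            if char.isspace():
                continue # whitespace inside the splitter output is ignored
            while text[docIndex].isspace():
                docIndex += 1 # skip whitespace in the document text
            assert char == text[docIndex] # non-whitespace characters must match
            if begin == None:
                begin = docIndex
            docIndex += 1
        offsets.append((begin, docIndex)) # end offset is exclusive
    return offsets

# e.g. _demoAlign("One two.  Three.", ["One two.", "Three."]) == [(0, 8), (10, 16)]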
def mainFunc(input, output=None, parseName="McCC", tokenizationName=None, newParseName=None, newTokenizationName=None, logFileName=None, removeOld=True):
    print >> sys.stderr, "Protein Name Splitter"
    if logFileName != None:
        print >> sys.stderr, "Writing log to", logFileName
        logFile = open(logFileName, "wt")
    else:
        logFile = None
    tree = ETUtils.ETFromObj(input)
    if tokenizationName == None:
        tokenizationName = parseName
    root = tree.getroot()
    sentences = [x for x in root.getiterator("sentence")]
    counter = ProgressCounter(len(sentences), "Split Protein Names")
    counter.showMilliseconds = True
    missingTokCount = 0
    for sentence in sentences:
        sId = sentence.get("id")
        counter.update(1, "Splitting names (" + sId + "): ")
        tok = getTokenization(tokenizationName, sentence, sId, remove=removeOld)
        if tok == None:
            missingTokCount += 1
            continue
        parse = getParse(parseName, tokenizationName, sentence, sId, remove=removeOld)
        assert parse is not None, "Missing parse '%s' in sentence %s!" % (parseName, sId)
        split = splitTokens(tok, sentence, logFile)
        # Default names
        if removeOld:
            if newTokenizationName == None:
                newTokenizationName = tok.get("tokenizer")
            if newParseName == None:
                newParseName = parse.get("parser")
        else:
            if newTokenizationName == None:
                newTokenizationName = "split-" + tok.get("tokenizer")
            if newParseName == None:
                newParseName = "split-" + parse.get("parser")
        # Add a new tokenization with the split tokens.
        splittok = addTokenization(newTokenizationName, sentence, sId)
        addTokensToTree(split, splittok)
        for a in tok.attrib:
            if splittok.get(a) == None:
                splittok.set(a, tok.get(a))
        # Make a mapping from original to split token ids. Store the
        # head token when given.
        tokenIdMap = {}
        for t in split:
            if t.head:
                # Traverse to the topmost head
                head = t.head
                while head.head is not None:
                    assert head.head != t, "Cyclic heads"
                    head = head.head
                tokenIdMap[t.origId] = head.id
            else:
                # Only allow overwriting an existing entry if the current
                # token is not punctuation.
                if t.origId not in tokenIdMap or not t.isPunct():
                    tokenIdMap[t.origId] = t.id
        # Make a copy of the specified parse that refers to the split tokens
        # instead of the originals.
        newparse = addParse(newParseName, newTokenizationName, sentence, sId)
        for a in parse.attrib:
            if newparse.get(a) == None:
                newparse.set(a, parse.get(a))
        newparse.set("ProteinNameSplitter", "True")
        splittok.set("ProteinNameSplitter", "True")
        depSeqId = 0
        for d in parse.getiterator("dependency"):
            t1, t2, dType = d.get("t1"), d.get("t2"), d.get("type")
            assert t1 in tokenIdMap and t2 in tokenIdMap, "INTERNAL ERROR"
            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", tokenIdMap[t1])
            dep.set("t2", tokenIdMap[t2])
            dep.set("type", dType)
            dep.set("id", "sd_%d" % depSeqId)
            depSeqId += 1
        # Add new dependencies between the split parts.
        for t in [x for x in split if x.head is not None]:
            dep = ElementTree.SubElement(newparse, "dependency")
            dep.set("t1", t.head.id)
            dep.set("t2", t.id)
            dep.set("type", t.depType)
            dep.set("split", "PNS")
            dep.set("id", "spd_%d" % depSeqId)
            depSeqId += 1
        for phrase in parse.getiterator("phrase"):
            newparse.append(phrase)
    print >> sys.stderr, "Tokenization missing from", missingTokCount, "sentences"
    if logFile != None:
        logFile.close()
    if output != None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(tree, output)
    return tree
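# Usage sketch (illustrative addition, not part of the original module).
# Splits multi-token protein names in the corpus tree and writes a new parse
# and tokenization that refer to the split tokens. The file names are
# hypothetical examples; the parse "McCC" must exist in the input so that
# mainFunc() can find the corresponding tokenization.
def _demoSplitProteinNames():
    return mainFunc("corpus-parsed.xml", output="corpus-split.xml",
                    parseName="McCC", logFileName="name-splitter.log",
                    removeOld=True)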