def makeInputFile(self, workdir, corpusRoot, requireEntities, skipIds, skipParsed, tokenizationName, debug):
    if requireEntities:
        print >> sys.stderr, "Parsing only sentences with entities"
    # Write text to input file
    if debug:
        print >> sys.stderr, "BLLIP parser workdir", workdir
    infileName = os.path.join(workdir, "parser-input.txt")
    infile = codecs.open(infileName, "wt", "utf-8")
    numCorpusSentences = 0
    if tokenizationName == None or tokenizationName == "PARSED_TEXT": # Parser does tokenization
        if tokenizationName == None:
            print >> sys.stderr, "Parser does the tokenization"
        else:
            print >> sys.stderr, "Parsing tokenized text"
        #for sentence in corpusRoot.getiterator("sentence"):
        for sentence in self.getSentences(corpusRoot, requireEntities, skipIds, skipParsed):
            infile.write("<s> " + sentence.get("text").replace("\n", " ").replace("\r", " ").strip() + " </s>\n")
            numCorpusSentences += 1
    else: # Use existing tokenization
        print >> sys.stderr, "Using existing tokenization", tokenizationName
        for sentence in self.getSentences(corpusRoot, requireEntities, skipIds, skipParsed):
            tokenization = IXMLUtils.getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer":tokenizationName})
            assert tokenization.get("tokenizer") == tokenizationName
            s = ""
            for token in tokenization.findall("token"):
                s += token.get("text") + " "
            infile.write("<s> " + s + "</s>\n")
            numCorpusSentences += 1
    infile.close()
    return infileName, numCorpusSentences

def loadSet(path, setName=None, level="a2", sitesAreArguments=False, a2Tags=["a2", "rel"], readScores=False, debug=False, subPath=None, origIdType=None):
    assert level in ["txt", "a1", "a2"]
    if path.endswith(".tar.gz") or path.endswith(".tgz") or path.endswith(".zip"):
        import tempfile
        import zipfile
        import shutil
        dir = tempfile.mkdtemp()
        if path.endswith(".zip"):
            with zipfile.ZipFile(path, "r") as f:
                f.extractall(dir)
        else:
            import tarfile
            f = tarfile.open(path, "r")
            f.extractall(dir)
        # Check if compressed directory is included in the package, like in the ST'11 corpus files
        compressedFilePath = os.path.join(dir, os.path.basename(path)[:-len(".tar.gz")])
        if not os.path.exists(compressedFilePath):
            compressedFilePath = os.path.join(dir, os.path.basename(path)[:-len(".tgz")])
        if not os.path.exists(compressedFilePath): # at least CO training set has a different dirname inside the tarfile
            compressedFilePath = compressedFilePath.rsplit("_", 1)[0]
            print >> sys.stderr, "Package name directory does not exist, trying", compressedFilePath
        if os.path.exists(compressedFilePath):
            print >> sys.stderr, "Reading document set from compressed filename directory", compressedFilePath
            dir = compressedFilePath
        if subPath != None:
            dir = os.path.join(compressedFilePath, subPath)
        f.close()
    elif path.endswith(".txt"):
        import tempfile
        import shutil
        dir = tempfile.mkdtemp()
        shutil.copy2(path, os.path.join(dir, os.path.basename(path)))
    else:
        dir = path
    ids = set()
    documents = []
    license = None
    if os.path.exists(os.path.join(dir, "LICENSE")):
        licenseFile = open(os.path.join(dir, "LICENSE"), "rt")
        license = "".join(licenseFile.readlines())
        licenseFile.close()
    origIds = {}
    for filename in os.listdir(dir):
        if filename.endswith(".txt"):
            if filename.startswith("._"): # a hack to skip the broken files in the GRO13 data packages
                continue
            id = filename.rsplit(".", 1)[0]
            ids.add(id)
            origIds[id] = IXMLUtils.getOrigId(os.path.join(dir, filename), origIdType)
    for id in sorted(list(ids)):
        #print "Loading", id
        doc = Document(id, dir, a2Tags, readScores, debug, origId=origIds[id])
        doc.dataSet = setName
        doc.license = license
        documents.append(doc)
    if dir != path:
        shutil.rmtree(dir)
    return documents

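# A minimal usage sketch for loadSet() above (not part of the original sources). The
# import path Utils.STFormat.STTools matches the call site in the convert() method
# below; the archive path and set name are hypothetical placeholders, and the snippet
# assumes the usual module-level imports (sys etc.) of the surrounding code.
from Utils.STFormat.STTools import loadSet

documents = loadSet("GE11-train.tar.gz", setName="train", a2Tags=["a2", "rel"])
for doc in documents[:3]:
    # each Document carries the file stem as its id plus the data set name assigned above
    print doc.id, doc.dataSet, doc.license is not None
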
def convert(self, input, dataSetNames=None, corpusName=None, output=None, extensions=None, origIdType=None):
    assert isinstance(input, basestring) and (os.path.isdir(input) or input.endswith(".tar.gz") or input.endswith(".txt") or "," in input)
    print >> sys.stderr, "Converting ST-format to Interaction XML"
    sourceDirs = self.getSourceDirs(input, dataSetNames)
    print >> sys.stderr, "Checking source directories:", sourceDirs
    if corpusName == None:
        corpusName = "TEES"
    # Convert all ST format input files into one corpus
    stExtensions = set(["txt", "a1", "a2", "rel"])
    if extensions != None:
        if isinstance(extensions, basestring):
            extensions = extensions.split(",")
        stExtensions = set([x for x in stExtensions if x in extensions])
    documents = []
    xml = None
    for sourceDir in sourceDirs:
        sp = sourceDir["path"]
        if len(stExtensions.intersection(sourceDir["extensions"])) > 0 or sp.endswith(".tar.gz") or sp.endswith(".tgz") or sp.endswith(".zip"):
            print >> sys.stderr, "Reading", sourceDir["path"]
            docs = Utils.STFormat.STTools.loadSet(sourceDir["path"], sourceDir["dataset"], origIdType=origIdType)
            print >> sys.stderr, len(docs), "documents"
            documents.extend(docs)
    if len(documents) > 0:
        print >> sys.stderr, "Resolving equivalences"
        Utils.STFormat.Equiv.process(documents)
        xml = Utils.STFormat.ConvertXML.toInteractionXML(documents, corpusName, output)
    # Add parse files into the corpus
    parseExtensions = set(["sentences", "tok", "ptb", "sd", "conll", "conllx", "conllu", "epe"])
    if extensions != None:
        parseExtensions = set([x for x in parseExtensions if x in extensions])
    for sourceDir in sourceDirs:
        if len(parseExtensions.intersection(sourceDir["extensions"])) > 0:
            print >> sys.stderr, "Importing parses from", sourceDir["path"], "file types", sorted(sourceDir["extensions"])
            if xml == None:
                xml = IXMLUtils.makeEmptyCorpus(corpusName)
            xml = ParseConverter().insertParses(sourceDir["path"], xml, output, "McCC", sourceDir["extensions"], origIdType=origIdType)
    return xml

def getParseFiles(self, parseDir, extensions, subDirs, counts, extMap=None, origIdType=None):
    files = {}
    if subDirs == None:
        subDirs = ["ptb", "conll", "sd_ccproc"]
    elif isinstance(subDirs, basestring):
        subDirs = subDirs.split(",")
    directories = [parseDir] + [os.path.join(parseDir, x) for x in subDirs]
    for directory in directories:
        fileCounts = defaultdict(int)
        if not os.path.exists(directory):
            continue
        print >> sys.stderr, "Collecting parses from", directory,
        for filename in os.listdir(directory):
            filePath = os.path.join(directory, filename)
            if "." not in filename or os.path.isdir(filePath):
                continue
            ext = filename.rsplit(".", 1)[-1]
            docName = IXMLUtils.getOrigId(filePath, origIdType)
            if extMap and ext in extMap:
                ext = extMap[ext]
            if ext not in extensions:
                fileCounts["skipped:" + ext] += 1
                continue
            if docName not in files:
                files[docName] = {}
            if ext in files[docName]:
                raise Exception("Error, multiple files for extension: " + str((ext, [files[docName][ext], filePath])))
            files[docName][ext] = filePath
            fileCounts[ext] += 1
            counts[ext + "-read"] += 1
        print >> sys.stderr, dict(fileCounts)
    return files

def toSTFormat(input, output=None, outputTag="a2", useOrigIds=False, debug=False, skipArgs=[], validate=True, writeExtra=False, allAsRelations=False, files=None, exportIds=None, clear=True, skipModifiers=False):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    nonEntitySiteCount = 0
    documents = []
    for document in corpusRoot.findall("document"):
        stDoc = Document()
        stDoc.id = IXMLUtils.getExportId(document, exportIds)
        #stDoc.id = document.get("pmid")
        #if stDoc.id == None:
        #    stDoc.id = document.get("origId")
        addTextToSTDoc(stDoc, document)
        documents.append(stDoc)
        eMap = {}
        tMap = {}
        entityElementMap = {} # for task 3
        addEntitiesToSTDoc(stDoc, document, tMap, eMap, entityElementMap, useOrigIds, skipModifiers=skipModifiers)
        addInteractionsToSTDoc(stDoc, document, tMap, eMap, entityElementMap, skipArgs, allAsRelations, skipModifiers=skipModifiers)
    if output != None:
        print >> sys.stderr, "Writing output to", output
        writeSet(documents, output, resultFileTag=outputTag, debug=debug, writeExtra=writeExtra, files=files, clear=clear)
    return documents

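# A hedged usage sketch for toSTFormat() above (not from the original sources). It is
# assumed here that the function lives in the same Utils.STFormat.ConvertXML module as
# the toInteractionXML() call seen in convert(); the corpus and output paths are made up.
from Utils.STFormat.ConvertXML import toSTFormat

# Read an Interaction XML corpus and write one ST-format file set per document into st-output/
stDocuments = toSTFormat("corpus.xml", output="st-output/", outputTag="a2", debug=False)
print >> sys.stderr, len(stDocuments), "documents exported"
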
def processCorpus(input, output, wordVectorPath, tokenizerName="McCC", max_rank_mem=100000, max_rank=10000000):
    print >> sys.stderr, "Making vocabulary"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    vocabulary = {"indices":{}, "vectors":[]}
    print >> sys.stderr, "Loading word vectors from", wordVectorPath
    print >> sys.stderr, "max_rank_mem", max_rank_mem
    print >> sys.stderr, "max_rank", max_rank
    max_rank_mem = int(max_rank_mem)
    max_rank = int(max_rank)
    wv = WV.load(wordVectorPath, max_rank_mem, max_rank)
    dimVector = wv.vectors.shape[1]
    print >> sys.stderr, "WordVector length", dimVector
    #addVector("[out]", wv.w_to_normv("and").tolist(), vocabulary)
    #addVector("[out]", dimVector * [0.0] + [0.0, 1.0], vocabulary) # Outside sentence range
    #addVector("[OoV]", wv.w_to_normv("and").tolist(), vocabulary)
    #addVector("[OoV]", dimVector * [0.0] + [1.0, 0.0], vocabulary) # Out of vocabulary
    addVector("[out]", dimVector * [0.0] + [0.0, 1.0], vocabulary) # Outside sentence range
    addVector("[OoV]", dimVector * [0.0] + [1.0, 0.0], vocabulary) # Out of vocabulary
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    counts = defaultdict(int)
    for document in documents:
        counter.update()
        counts["document"] += 1
        for sentence in document.findall("sentence"):
            counts["sentence"] += 1
            tokenization = IXMLUtils.getTokenizationElement(sentence, tokenizerName)
            if tokenization != None:
                counts["tokenization"] += 1
                for token in tokenization.findall("token"):
                    counts["token"] += 1
                    text = token.get("text")
                    if text not in vocabulary["indices"]:
                        counts["token-unique"] += 1
                        vector = wv.w_to_normv(token.get("text").lower())
                        if vector is not None:
                            counts["vector"] += 1
                            vector = vector.tolist() + [0.0, 0.0]
                            addVector(text, vector, vocabulary)
                        else:
                            counts["no-vector"] += 1
    print >> sys.stderr, "Counts:", dict(counts)
    if output != None:
        print >> sys.stderr, "Writing vectors to", output + "-vectors.json.gz"
        with gzip.open(output + "-vectors.json.gz", "wt") as f:
            json.dump(vocabulary, f)
        print >> sys.stderr, "Writing indices to", output + "-indices.json.gz"
        with gzip.open(output + "-indices.json.gz", "wt") as f:
            json.dump({"indices":vocabulary["indices"], "vectors":None}, f)
    return vocabulary

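# A hedged usage sketch for processCorpus() above (not from the original sources). The
# corpus path, word vector file and output prefix are hypothetical placeholders, and the
# call assumes processCorpus() is defined or imported in the current module.
vocabulary = processCorpus("corpus.xml", "corpus-vocab", "wordvectors.bin",
                           tokenizerName="McCC", max_rank_mem=100000, max_rank=10000000)
print >> sys.stderr, "Vocabulary size:", len(vocabulary["indices"])
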
def visualize(inPath, outPath, sentId, parseName):
    setSVGOptions()
    xml = ETUtils.ETFromObj(inPath)
    sentences = {x.get("id"):x for x in xml.iter("sentence")}
    if sentId not in sentences:
        print >> sys.stderr, "Sentence id '" + sentId + "' not found"
        return
    sentence = sentences[sentId]
    parse = IXMLUtils.getParseElement(sentence, parseName)
    if not parse:
        print >> sys.stderr, "Sentence has no parse with name '" + parseName + "'"
        return
    tokenization = IXMLUtils.getTokenizationElement(sentence, parse.get("tokenizer"))
    graph = SentenceGraph(sentence, [x for x in tokenization.findall("token")], [x for x in parse.findall("dependency")])
    graph.mapInteractions([x for x in sentence.findall("entity")], [x for x in sentence.findall("interaction")])
    svgTokens = tokensToSVG(tokenization.findall("token"))
    svgEdges = edgesToSVG(svgTokens, graph)
    #writeSVG({x.id:x for x in svgTokens}, svgEdges, outPath)
    writeSVG(svgTokens, svgEdges, outPath)

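# A hedged usage sketch for visualize() above (not from the original sources). The corpus
# path, sentence id and output SVG path are hypothetical placeholders; "McCC" is the parse
# name used elsewhere in this code.
visualize("corpus.xml", "sentence.svg", "GE11.d1.s0", "McCC")
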
def makeInputFile(self, corpusRoot, workdir, parserName, reparse=False, action="convert", debug=False):
    if debug:
        print >> sys.stderr, "Stanford parser workdir", workdir
    stanfordInput = os.path.join(workdir, "input")
    stanfordInputFile = codecs.open(stanfordInput, "wt", "utf-8")
    existingCount = 0
    for sentence in corpusRoot.getiterator("sentence"):
        if action in ("convert", "dep"):
            parse = IXMLUtils.getParseElement(sentence, parserName, addIfNotExist=(action == "dep"))
            pennTree = ""
            # Sentences with no parse (from the constituency step) are skipped in converter mode
            if parse != None:
                # Both the 'convert' and 'dep' actions rely on tokens generated from the penn tree
                pennTree = parse.get("pennstring", "")
                # Check for existing dependencies
                if len(parse.findall("dependency")) > 0:
                    if reparse: # remove existing stanford conversion
                        for dep in parse.findall("dependency"):
                            parse.remove(dep)
                        del parse.attrib["stanford"]
                    else: # don't reparse
                        existingCount += 1
                        pennTree = ""
            # Generate the input
            if action == "convert": # Put penn tree lines in input file
                inputString = pennTree
                if inputString == "":
                    inputString = "(S1 (S (NN DUMMYINPUTTOKEN)))"
            else: # action == "dep"
                #tokenization = IXMLUtils.getTokenizationElement(sentence, parserName, addIfNotExist=False)
                #if tokenization != None:
                #    tokenized = " ".join([x.get("text") for x in tokenization.findall("token")])
                #    stanfordInputFile.write(tokenized.replace("\n", " ").replace("\r", " ").strip() + "\n")
                #else:
                inputString = sentence.get("text").replace("\n", " ").replace("\r", " ").strip()
            inputString = inputString.strip()
            if inputString == "":
                inputString = "DUMMYINPUTTOKEN" # The parser skips empty lines
            stanfordInputFile.write(inputString + "\n")
        else: # action == "penn"
            stanfordInputFile.write(sentence.get("text").replace("\n", " ").replace("\r", " ").strip() + "\n")
    stanfordInputFile.close()
    if existingCount != 0:
        print >> sys.stderr, "Skipping", existingCount, "already converted sentences."
    return stanfordInput

def export(self, input, output, parseName, tokenizerName=None, toExport=["tok", "ptb", "sd"], inputSuffixes=None, clear=False, tokenIdOffset=0, exportIds=None, useSetDirs=False):
    print >> sys.stderr, "##### Export Parse #####"
    if toExport == None:
        toExport = ["txt", "sentences", "tok", "ptb", "sd"]
    print >> sys.stderr, "Exporting parse formats", toExport
    if os.path.exists(output) and clear:
        shutil.rmtree(output)
    if not os.path.exists(output):
        os.makedirs(output)
    if inputSuffixes != None:
        inputFileNames = []
        for suffix in inputSuffixes:
            inputFileNames.append(input + suffix)
    else:
        inputFileNames = [input]
    for inputFileName in inputFileNames:
        print >> sys.stderr, "Processing input file", inputFileName
        corpusRoot = ETUtils.ETFromObj(inputFileName).getroot()
        documents = corpusRoot.findall("document")
        counter = ProgressCounter(len(documents), "Documents")
        counts = {"corpus":defaultdict(int)}
        for fileExt in toExport:
            counts[fileExt] = defaultdict(int)
        for document in documents:
            counter.update()
            counts["corpus"]["documents"] += 1
            exportId = IXMLUtils.getExportId(document, exportIds)
            # Open document output files
            outfiles = {}
            for fileExt in toExport:
                #print output, exportId , fileExt
                if useSetDirs:
                    outfilePath = os.path.join(output, document.get("set"), exportId + "." + fileExt)
                else:
                    outfilePath = os.path.join(output, exportId + "." + fileExt)
                if os.path.exists(outfilePath): # check for overlapping files
                    raise Exception("Export file '" + str(outfilePath) + "' already exists")
                if not os.path.exists(os.path.dirname(outfilePath)):
                    os.makedirs(os.path.dirname(outfilePath))
                outfiles[fileExt] = codecs.open(outfilePath, "wt", "utf-8")
            # Export document text
            if "txt" in outfiles and document.get("text") != None:
                outfiles["txt"].write(document.get("text"))
                if "txt" not in counts:
                    counts["txt"] = defaultdict(int)
                counts["txt"]["documents"] += 1
            # Process all the sentences in the document
            sentenceCount = 0
            for sentence in document.findall("sentence"):
                counts["corpus"]["sentences"] += 1
                parse = IXMLUtils.getParseElement(sentence, parseName)
                tokenization = IXMLUtils.getTokenizationElement(sentence, tokenizerName)
                if "sentences" in outfiles:
                    outfiles["sentences"].write(sentence.get("text").strip().replace("\n", " ").replace("\r", " ") + "\n")
                    counts["sentences"]["sentences"] += 1
                if "ptb" in outfiles:
                    if self.exportPennTreeBank(parse, outfiles["ptb"]):
                        counts["ptb"]["sentences"] += 1
                if tokenization != None:
                    if "tok" in outfiles:
                        if self.exportTokenization(tokenization, parse, sentence, outfiles["tok"]):
                            counts["tok"]["sentences"] += 1
                    if "sd" in outfiles:
                        if self.exportStanfordDependencies(parse, tokenization, outfiles["sd"], tokenIdOffset):
                            counts["sd"]["sentences"] += 1
                    for conllFormat in ("conll", "conllx", "conllu"):
                        if conllFormat in outfiles:
                            if self.exportCoNLL(tokenization, parse, outfiles[conllFormat], conllFormat, counts[conllFormat]):
                                counts[conllFormat]["sentences"] += 1
                    if "epe" in outfiles:
                        if self.exportEPE(tokenization, parse, sentence, sentenceCount, outfiles["epe"]):
                            counts["epe"]["sentences"] += 1
                sentenceCount += 1
            # Close document output files
            for fileExt in outfiles:
                outfiles[fileExt].close()
                outfiles[fileExt] = None
    print >> sys.stderr, "Parse export counts:"
    for k in sorted(counts.keys()):
        print >> sys.stderr, " " + str(k) + ":", dict(counts[k])

def export(input, output, parse, tokenization=None, toExport=["tok", "ptb", "sd"], inputSuffixes=None, clear=False, tokenIdOffset=0, exportIds=None):
    print >> sys.stderr, "##### Export Parse #####"
    if toExport == None:
        toExport = ["txt", "sentences", "tok", "ptb", "sd"]
    print >> sys.stderr, "Exporting parse formats", toExport
    if os.path.exists(output) and clear:
        shutil.rmtree(output)
    if not os.path.exists(output):
        os.makedirs(output)
    if inputSuffixes != None:
        inputFileNames = []
        for suffix in inputSuffixes:
            inputFileNames.append(input + suffix)
    else:
        inputFileNames = [input]
    for inputFileName in inputFileNames:
        print >> sys.stderr, "Processing input file", inputFileName
        corpusRoot = ETUtils.ETFromObj(inputFileName).getroot()
        documents = corpusRoot.findall("document")
        counter = ProgressCounter(len(documents), "Documents")
        counts = defaultdict(int)
        for document in documents:
            counter.update()
            # docId = document.get("pmid")
            # if docId == None:
            #     docId = document.get("origId")
            # if docId == None:
            #     docId = document.get("id")
            exportId = IXMLUtils.getExportId(document, exportIds)
            counts["document"] += 1
            # Open document output files
            outfiles = {}
            for fileExt in toExport:
                #print output, exportId , fileExt
                outfilePath = output + "/" + exportId + "." + fileExt
                if os.path.exists(outfilePath): # check for overlapping files
                    raise Exception("Export file '" + str(outfilePath) + "' already exists")
                outfiles[fileExt] = codecs.open(outfilePath, "wt", "utf-8")
            # Export document text
            if "txt" in outfiles and document.get("text") != None:
                outfiles["txt"].write(document.get("text"))
                counts["txt"] += 1
            # Process all the sentences in the document
            for sentence in document.findall("sentence"):
                counts["sentence"] += 1
                parseElement = None
                for e in sentence.getiterator("parse"):
                    if e.get("parser") == parse:
                        parseElement = e
                        counts["parse"] += 1
                        break
                if tokenization == None:
                    tokenization = parseElement.get("tokenizer")
                tokenizationElement = None
                for e in sentence.getiterator("tokenization"):
                    if e.get("tokenizer") == tokenization:
                        tokenizationElement = e
                        counts["tokenization"] += 1
                        break
                if "sentences" in outfiles:
                    outfiles["sentences"].write(sentence.get("text").strip().replace("\n", " ").replace("\r", " ") + "\n")
                    counts["sentences"] += 1
                if "tok" in outfiles:
                    if exportTokenization(tokenizationElement, parseElement, sentence, outfiles["tok"]):
                        counts["tok"] += 1
                if "ptb" in outfiles:
                    if exportPennTreeBank(parseElement, outfiles["ptb"]):
                        counts["ptb"] += 1
                if "sd" in outfiles:
                    if exportStanfordDependencies(parseElement, tokenizationElement, outfiles["sd"], tokenIdOffset):
                        counts["sd"] += 1
            # Close document output files
            for fileExt in outfiles:
                outfiles[fileExt].close()
                outfiles[fileExt] = None
    print >> sys.stderr, "Parse export counts:"
    for k in sorted(counts.keys()):
        print >> sys.stderr, " " + str(k) + ":", counts[k]

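# A hedged usage sketch for the module-level export() function above (not from the
# original sources). The input corpus, output directory and parse name are hypothetical
# placeholders; with toExport=["tok", "ptb", "sd"] one .tok, .ptb and .sd file is written
# per document into the output directory.
export("corpus.xml", "parse-export/", "McCC", toExport=["tok", "ptb", "sd"], clear=True)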