Example #1
0
 def makeInputFile(self, workdir, corpusRoot, requireEntities, skipIds, skipParsed, tokenizationName, debug):    
     if requireEntities:
         print >> sys.stderr, "Parsing only sentences with entities"
     # Write text to input file
     if debug:
         print >> sys.stderr, "BLLIP parser workdir", workdir
     infileName = os.path.join(workdir, "parser-input.txt")
     infile = codecs.open(infileName, "wt", "utf-8")
     numCorpusSentences = 0
     if tokenizationName == None or tokenizationName == "PARSED_TEXT": # Parser does tokenization
         if tokenizationName == None:
             print >> sys.stderr, "Parser does the tokenization"
         else:
             print >> sys.stderr, "Parsing tokenized text"
         #for sentence in corpusRoot.getiterator("sentence"):
         for sentence in self.getSentences(corpusRoot, requireEntities, skipIds, skipParsed):
             infile.write("<s> " + sentence.get("text").replace("\n", " ").replace("\r", " ").strip() + " </s>\n")
             numCorpusSentences += 1
     else: # Use existing tokenization
         print >> sys.stderr, "Using existing tokenization", tokenizationName 
         for sentence in self.getSentences(corpusRoot, requireEntities, skipIds, skipParsed):
             tokenization = IXMLUtils.getElementByAttrib(sentence.find("analyses"), "tokenization", {"tokenizer":tokenizationName})
             assert tokenization.get("tokenizer") == tokenizationName
             s = ""
             for token in tokenization.findall("token"):
                 s += token.get("text") + " "
             infile.write("<s> " + s + "</s>\n")
             numCorpusSentences += 1
     infile.close()
     return infileName, numCorpusSentences
Example #2
0
def loadSet(path, setName=None, level="a2", sitesAreArguments=False, a2Tags=["a2", "rel"], readScores=False, debug=False, subPath=None, origIdType=None):
    assert level in ["txt", "a1", "a2"]
    if path.endswith(".tar.gz") or path.endswith(".tgz") or path.endswith(".zip"):
        import tempfile
        import zipfile
        import shutil
        dir = tempfile.mkdtemp()
        if path.endswith(".zip"):
            with zipfile.ZipFile(path, "r") as f:
                f.extractall(dir)
        else:
            import tarfile
            f = tarfile.open(path, "r")
            f.extractall(dir)
        # Check if compressed directory is included in the package, like in the ST'11 corpus files
        compressedFilePath = os.path.join(dir, os.path.basename(path)[:-len(".tar.gz")])
        if not os.path.exists(compressedFilePath):
            compressedFilePath = os.path.join(dir, os.path.basename(path)[:-len(".tgz")])
        if not os.path.exists(compressedFilePath): # at least CO training set has a different dirname inside the tarfile
            compressedFilePath = compressedFilePath.rsplit("_", 1)[0]
            print >> sys.stderr, "Package name directory does not exist, trying", compressedFilePath
        if os.path.exists(compressedFilePath):
            print >> sys.stderr, "Reading document set from compressed filename directory", compressedFilePath
            dir = compressedFilePath
        if subPath != None:
            dir = os.path.join(compressedFilePath, subPath)
        f.close()
    elif path.endswith(".txt"):
        import tempfile
        import shutil
        dir = tempfile.mkdtemp()
        shutil.copy2(path, os.path.join(dir, os.path.basename(path)))
    else:
        dir = path
    
    ids = set()
    documents = []
    license = None
    if os.path.exists(os.path.join(dir, "LICENSE")):
        licenseFile = open(os.path.join(dir, "LICENSE"), "rt")
        license = "".join(licenseFile.readlines())
        licenseFile.close()
    origIds = {}
    for filename in os.listdir(dir):
        if filename.endswith(".txt"):
            if filename.startswith("._"): # a hack to skip the broken files in the GRO13 data packages
                continue
            id = filename.rsplit(".", 1)[0]
            ids.add(id)
            origIds[id] = IXMLUtils.getOrigId(os.path.join(dir, filename), origIdType)
    for id in sorted(list(ids)):
        #print "Loading", id
        doc = Document(id, dir, a2Tags, readScores, debug, origId=origIds[id])
        doc.dataSet = setName
        doc.license = license
        documents.append(doc)
    
    if dir != path:
        shutil.rmtree(dir)
    return documents
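
A minimal usage sketch for loadSet above. The module path is taken from Example #3 (Utils.STFormat.STTools) and may differ in other layouts; the archive path and set name are placeholders.

import sys
from Utils.STFormat.STTools import loadSet  # module path assumed from Example #3; adjust if it differs

# Read an ST-format document set directly from a compressed package (placeholder path).
documents = loadSet("GE11-train.tar.gz", setName="train", a2Tags=["a2", "rel"])
print >> sys.stderr, "Loaded", len(documents), "documents"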
Example #3
0
 def convert(self,
             input,
             dataSetNames=None,
             corpusName=None,
             output=None,
             extensions=None,
             origIdType=None):
     assert isinstance(
         input, basestring) and (os.path.isdir(input)
                                 or input.endswith(".tar.gz")
                                 or input.endswith(".txt") or "," in input)
     print >> sys.stderr, "Converting ST-format to Interaction XML"
     sourceDirs = self.getSourceDirs(input, dataSetNames)
     print >> sys.stderr, "Checking source directories:", sourceDirs
     if corpusName == None:
         corpusName = "TEES"
     # Convert all ST format input files into one corpus
     stExtensions = set(["txt", "a1", "a2", "rel"])
     if extensions != None:
         if isinstance(extensions, basestring):
             extensions = extensions.split(",")
         stExtensions = set([x for x in stExtensions if x in extensions])
     documents = []
     xml = None
     for sourceDir in sourceDirs:
         sp = sourceDir["path"]
         if len(stExtensions.intersection(sourceDir["extensions"])
                ) > 0 or sp.endswith(".tar.gz") or sp.endswith(
                    ".tgz") or sp.endswith(".zip"):
             print >> sys.stderr, "Reading", sourceDir["path"]
             docs = Utils.STFormat.STTools.loadSet(sourceDir["path"],
                                                   sourceDir["dataset"],
                                                   origIdType=origIdType)
             print >> sys.stderr, len(docs), "documents"
             documents.extend(docs)
     if len(documents) > 0:
         print >> sys.stderr, "Resolving equivalences"
         Utils.STFormat.Equiv.process(documents)
         xml = Utils.STFormat.ConvertXML.toInteractionXML(
             documents, corpusName, output)
     # Add parse files into the corpus
     parseExtensions = set([
         "sentences", "tok", "ptb", "sd", "conll", "conllx", "conllu", "epe"
     ])
     if extensions != None:
         parseExtensions = set(
             [x for x in parseExtensions if x in extensions])
     for sourceDir in sourceDirs:
         if len(parseExtensions.intersection(sourceDir["extensions"])) > 0:
             print >> sys.stderr, "Importing parses from", sourceDir[
                 "path"], "file types", sorted(sourceDir["extensions"])
             if xml == None:
                 xml = IXMLUtils.makeEmptyCorpus(corpusName)
             xml = ParseConverter().insertParses(sourceDir["path"],
                                                 xml,
                                                 output,
                                                 "McCC",
                                                 sourceDir["extensions"],
                                                 origIdType=origIdType)
     return xml
Example #4
0
 def getParseFiles(self, parseDir, extensions, subDirs, counts, extMap=None, origIdType=None):
     files = {}
     if subDirs == None:
         subDirs = ["ptb", "conll", "sd_ccproc"]
     elif isinstance(subDirs, basestring):
         subDirs = subDirs.split(",")
     directories = [parseDir] + [os.path.join(parseDir, x) for x in subDirs]
     for directory in directories:
         fileCounts = defaultdict(int)
         if not os.path.exists(directory):
             continue
         print >> sys.stderr, "Collecting parses from", directory,
         for filename in os.listdir(directory):
             filePath = os.path.join(directory, filename)
             if "." not in filename or os.path.isdir(filePath):
                 continue
             ext = filename.rsplit(".", 1)[-1]
             docName = IXMLUtils.getOrigId(filePath, origIdType)
             if extMap and ext in extMap:
                 ext = extMap[ext]
             if ext not in extensions:
                 fileCounts["skipped:" + ext] += 1
                 continue
             if docName not in files:
                 files[docName] = {}
             if ext in files[docName]:
                 raise Exception("Error, multiple files for extension: " + str((ext, [files[docName][ext], filePath])))
             files[docName][ext] = filePath
             fileCounts[ext] += 1
             counts[ext + "-read"] += 1
         print >> sys.stderr, dict(fileCounts)
     return files
Example #5
0
def toSTFormat(input, output=None, outputTag="a2", useOrigIds=False, debug=False, skipArgs=[], validate=True, writeExtra=False, allAsRelations=False, files=None, exportIds=None, clear=True, skipModifiers=False):
    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()
    
    nonEntitySiteCount = 0
    documents = []
    for document in corpusRoot.findall("document"):
        stDoc = Document()
        stDoc.id = IXMLUtils.getExportId(document, exportIds)
        #stDoc.id = document.get("pmid")
        #if stDoc.id == None:
        #    stDoc.id = document.get("origId")
        addTextToSTDoc(stDoc, document)
        documents.append(stDoc)
        eMap = {}
        tMap = {}
        entityElementMap = {} # for task 3
        addEntitiesToSTDoc(stDoc, document, tMap, eMap, entityElementMap, useOrigIds, skipModifiers=skipModifiers)
        addInteractionsToSTDoc(stDoc, document, tMap, eMap, entityElementMap, skipArgs, allAsRelations, skipModifiers=skipModifiers)
    
    if output != None:
        print >> sys.stderr, "Writing output to", output
        writeSet(documents, output, resultFileTag=outputTag, debug=debug, writeExtra=writeExtra, files=files, clear=clear)
    return documents
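
A brief usage sketch for toSTFormat above, assuming the function is in scope (its module path is not shown in these examples); the input corpus and output directory are placeholders.

# Hypothetical call: write an Interaction XML corpus back out as ST-format .txt/.a1/.a2 files.
documents = toSTFormat("corpus.xml", output="st-output", outputTag="a2", skipModifiers=True)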
Example #6
0
def processCorpus(input, output, wordVectorPath, tokenizerName="McCC", max_rank_mem=100000, max_rank=10000000):
    print >> sys.stderr, "Making vocabulary"
    print >> sys.stderr, "Loading corpus file", input
    corpusTree = ETUtils.ETFromObj(input)
    corpusRoot = corpusTree.getroot()
    vocabulary = {"indices":{}, "vectors":[]}
    
    print >> sys.stderr, "Loading word vectors from", wordVectorPath
    print >> sys.stderr, "max_rank_mem", max_rank_mem
    print >> sys.stderr, "max_rank", max_rank
    max_rank_mem = int(max_rank_mem)
    max_rank = int(max_rank)
    wv = WV.load(wordVectorPath, max_rank_mem, max_rank)
    dimVector = wv.vectors.shape[1]
    print >> sys.stderr, "WordVector length", dimVector
    #addVector("[out]", wv.w_to_normv("and").tolist(), vocabulary) #addVector("[out]", dimVector * [0.0] + [0.0, 1.0], vocabulary) # Outside sentence range
    #addVector("[OoV]", wv.w_to_normv("and").tolist(), vocabulary) #addVector("[OoV]", dimVector * [0.0] + [1.0, 0.0], vocabulary) # Out of vocabulary
    addVector("[out]", dimVector * [0.0] + [0.0, 1.0], vocabulary) # Outside sentence range
    addVector("[OoV]", dimVector * [0.0] + [1.0, 0.0], vocabulary) # Out of vocabulary
    
    documents = corpusRoot.findall("document")
    counter = ProgressCounter(len(documents), "Documents")
    counts = defaultdict(int)
    for document in documents:
        counter.update()
        counts["document"] += 1
        for sentence in document.findall("sentence"):
            counts["sentence"] += 1
            tokenization = IXMLUtils.getTokenizationElement(sentence, tokenizerName)
            if tokenization != None:
                counts["tokenization"] += 1
                for token in tokenization.findall("token"):
                    counts["token"] += 1
                    text = token.get("text")
                    if text not in vocabulary["indices"]:
                        counts["token-unique"] += 1
                        vector = wv.w_to_normv(token.get("text").lower())
                        if vector is not None:
                            counts["vector"] += 1
                            vector = vector.tolist() + [0.0, 0.0]
                            addVector(text, vector, vocabulary)
                        else:
                            counts["no-vector"] += 1           
    
    print >> sys.stderr, "Counts:", dict(counts)
    
    if output != None:
        print >> sys.stderr, "Writing vectors to", output + "-vectors.json.gz"
        with gzip.open(output + "-vectors.json.gz", "wt") as f:
            json.dump(vocabulary, f)
        print >> sys.stderr, "Writing indices to", output + "-indices.json.gz"
        with gzip.open(output + "-indices.json.gz", "wt") as f:
            json.dump({"indices":vocabulary["indices"], "vectors":None}, f)
    return vocabulary
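
A short usage sketch for processCorpus above, assuming the function is in scope; both paths and the rank limits are placeholders chosen only to illustrate the keyword arguments. With these arguments the vectors would be written to vocab-vectors.json.gz and the indices to vocab-indices.json.gz.

import sys

# Hypothetical invocation: build a vocabulary for the corpus from a word-vector file.
# "McCC" matches the default tokenizer name used in the function above.
vocabulary = processCorpus("corpus.xml", "vocab", "wordvectors.bin",
                           tokenizerName="McCC", max_rank_mem=100000, max_rank=1000000)
print >> sys.stderr, "Vocabulary size:", len(vocabulary["indices"])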
Example #7
0
def visualize(inPath, outPath, sentId, parseName):
    setSVGOptions()
    
    xml = ETUtils.ETFromObj(inPath)
    sentences = {x.get("id"):x for x in xml.iter("sentence")}
    if sentId not in sentences:
        print >> sys.stderr, "Sentence id '" + sentId + "' not found"
        return
    sentence = sentences[sentId]
    parse = IXMLUtils.getParseElement(sentence, parseName)
    if parse == None: # test explicitly for None; an ElementTree element with no child elements is also falsy
        print >> sys.stderr, "Sentence has no parse with name '" + parseName + "'"
        return
    
    tokenization = IXMLUtils.getTokenizationElement(sentence, parse.get("tokenizer"))
    graph = SentenceGraph(sentence, [x for x in tokenization.findall("token")], [x for x in parse.findall("dependency")])
    graph.mapInteractions([x for x in sentence.findall("entity")], [x for x in sentence.findall("interaction")])
    svgTokens = tokensToSVG(tokenization.findall("token"))
    svgEdges = edgesToSVG(svgTokens, graph)
    #writeSVG({x.id:x for x in svgTokens}, svgEdges, outPath)
    writeSVG(svgTokens, svgEdges, outPath)
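
A usage sketch for visualize above, assuming it and its SVG helpers are in scope; the corpus path and sentence id are placeholders, and "McCC" follows the parse-name convention used elsewhere in these examples.

# Hypothetical call: render one sentence's dependency graph and interactions as SVG.
visualize("corpus.xml", "sentence.svg", "GE09.d1.s0", "McCC")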
Example #8
0
 def makeInputFile(self, corpusRoot, workdir, parserName, reparse=False, action="convert", debug=False):
     if debug:
         print >> sys.stderr, "Stanford parser workdir", workdir
     stanfordInput = os.path.join(workdir, "input")
     stanfordInputFile = codecs.open(stanfordInput, "wt", "utf-8")
     
     existingCount = 0
     for sentence in corpusRoot.getiterator("sentence"):
         if action in ("convert", "dep"):
             parse = IXMLUtils.getParseElement(sentence, parserName, addIfNotExist=(action == "dep"))
             pennTree = ""
             # Sentences with no parse (from the constituency step) are skipped in converter mode
             if parse != None:
                 # Both the 'convert' and 'dep' actions rely on tokens generated from the penn tree
                 pennTree = parse.get("pennstring", "")
                 # Check for existing dependencies
                 if len(parse.findall("dependency")) > 0:
                     if reparse: # remove existing stanford conversion
                         for dep in parse.findall("dependency"):
                             parse.remove(dep)
                         del parse.attrib["stanford"]
                     else: # don't reparse
                         existingCount += 1
                         pennTree = ""
             # Generate the input
             if action == "convert": # Put penn tree lines in input file
                 inputString = pennTree
                 if inputString == "":
                     inputString = "(S1 (S (NN DUMMYINPUTTOKEN)))"
             else: # action == "dep"
                 #tokenization = IXMLUtils.getTokenizationElement(sentence, parserName, addIfNotExist=False)
                 #if tokenization != None:
                 #    tokenized = " ".join([x.get("text") for x in tokenization.findall("token")])
                 #    stanfordInputFile.write(tokenized.replace("\n", " ").replace("\r", " ").strip() + "\n")
                 #else:
                 inputString = sentence.get("text").replace("\n", " ").replace("\r", " ").strip()
             inputString = inputString.strip()
             if inputString == "":
                 inputString = "DUMMYINPUTTOKEN" # The parser skips empty lines
             stanfordInputFile.write(inputString + "\n")
         else: # action == "penn"
             stanfordInputFile.write(sentence.get("text").replace("\n", " ").replace("\r", " ").strip() + "\n")
     stanfordInputFile.close()
     if existingCount != 0:
         print >> sys.stderr, "Skipping", existingCount, "already converted sentences."
     return stanfordInput
Example #9
0
 def getParseFiles(self,
                   parseDir,
                   extensions,
                   subDirs,
                   counts,
                   extMap=None,
                   origIdType=None):
     files = {}
     if subDirs == None:
         subDirs = ["ptb", "conll", "sd_ccproc"]
     elif isinstance(subDirs, basestring):
         subDirs = subDirs.split(",")
     directories = [parseDir] + [os.path.join(parseDir, x) for x in subDirs]
     for directory in directories:
         fileCounts = defaultdict(int)
         if not os.path.exists(directory):
             continue
         print >> sys.stderr, "Collecting parses from", directory,
         for filename in os.listdir(directory):
             filePath = os.path.join(directory, filename)
             if "." not in filename or os.path.isdir(filePath):
                 continue
             ext = filename.rsplit(".", 1)[-1]
             docName = IXMLUtils.getOrigId(filePath, origIdType)
             if extMap and ext in extMap:
                 ext = extMap[ext]
             if ext not in extensions:
                 fileCounts["skipped:" + ext] += 1
                 continue
             if docName not in files:
                 files[docName] = {}
             if ext in files[docName]:
                 raise Exception("Error, multiple files for extension: " +
                                 str((ext,
                                      [files[docName][ext], filePath])))
             files[docName][ext] = filePath
             fileCounts[ext] += 1
             counts[ext + "-read"] += 1
         print >> sys.stderr, dict(fileCounts)
     return files
Example #10
0
 def convert(self, input, dataSetNames=None, corpusName=None, output=None, extensions=None, origIdType=None):
     assert isinstance(input, basestring) and (os.path.isdir(input) or input.endswith(".tar.gz") or input.endswith(".txt") or "," in input)
     print >> sys.stderr, "Converting ST-format to Interaction XML"
     sourceDirs = self.getSourceDirs(input, dataSetNames)
     print >> sys.stderr, "Checking source directories:", sourceDirs
     if corpusName == None:
         corpusName = "TEES"
     # Convert all ST format input files into one corpus
     stExtensions = set(["txt", "a1", "a2", "rel"])
     if extensions != None:
         if isinstance(extensions, basestring):
             extensions = extensions.split(",")
         stExtensions = set([x for x in stExtensions if x in extensions])
     documents = []
     xml = None
     for sourceDir in sourceDirs:
         sp = sourceDir["path"]
         if len(stExtensions.intersection(sourceDir["extensions"])) > 0 or sp.endswith(".tar.gz") or sp.endswith(".tgz") or sp.endswith(".zip"):
             print >> sys.stderr, "Reading", sourceDir["path"]
             docs = Utils.STFormat.STTools.loadSet(sourceDir["path"], sourceDir["dataset"], origIdType=origIdType)
             print >> sys.stderr, len(docs), "documents"
             documents.extend(docs)
     if len(documents) > 0:
         print >> sys.stderr, "Resolving equivalences"
         Utils.STFormat.Equiv.process(documents)
         xml = Utils.STFormat.ConvertXML.toInteractionXML(documents, corpusName, output)
     # Add parse files into the corpus
     parseExtensions = set(["sentences", "tok", "ptb", "sd", "conll", "conllx", "conllu", "epe"])
     if extensions != None:
         parseExtensions = set([x for x in parseExtensions if x in extensions])
     for sourceDir in sourceDirs:
         if len(parseExtensions.intersection(sourceDir["extensions"])) > 0:
             print >> sys.stderr, "Importing parses from", sourceDir["path"], "file types", sorted(sourceDir["extensions"])
             if xml == None:
                 xml = IXMLUtils.makeEmptyCorpus(corpusName)
             xml = ParseConverter().insertParses(sourceDir["path"], xml, output, "McCC", sourceDir["extensions"], origIdType=origIdType)
     return xml
Example #11
0
 def makeInputFile(self, workdir, corpusRoot, requireEntities, skipIds,
                   skipParsed, tokenizationName, debug):
     if requireEntities:
         print >> sys.stderr, "Parsing only sentences with entities"
     # Write text to input file
     if debug:
         print >> sys.stderr, "BLLIP parser workdir", workdir
     infileName = os.path.join(workdir, "parser-input.txt")
     infile = codecs.open(infileName, "wt", "utf-8")
     numCorpusSentences = 0
     if tokenizationName == None or tokenizationName == "PARSED_TEXT":  # Parser does tokenization
         if tokenizationName == None:
             print >> sys.stderr, "Parser does the tokenization"
         else:
             print >> sys.stderr, "Parsing tokenized text"
         #for sentence in corpusRoot.getiterator("sentence"):
         for sentence in self.getSentences(corpusRoot, requireEntities,
                                           skipIds, skipParsed):
             infile.write("<s> " + sentence.get("text").replace(
                 "\n", " ").replace("\r", " ").strip() + " </s>\n")
             numCorpusSentences += 1
     else:  # Use existing tokenization
         print >> sys.stderr, "Using existing tokenization", tokenizationName
         for sentence in self.getSentences(corpusRoot, requireEntities,
                                           skipIds, skipParsed):
             tokenization = IXMLUtils.getElementByAttrib(
                 sentence.find("analyses"), "tokenization",
                 {"tokenizer": tokenizationName})
             assert tokenization.get("tokenizer") == tokenizationName
             s = ""
             for token in tokenization.findall("token"):
                 s += token.get("text") + " "
             infile.write("<s> " + s + "</s>\n")
             numCorpusSentences += 1
     infile.close()
     return infileName, numCorpusSentences
Example #12
0
 def export(self, input, output, parseName, tokenizerName=None, toExport=["tok", "ptb", "sd"], inputSuffixes=None, clear=False, tokenIdOffset=0, exportIds=None, useSetDirs=False):
     print >> sys.stderr, "##### Export Parse #####"
     if toExport == None:
         toExport = ["txt", "sentences", "tok", "ptb", "sd"]
     print >> sys.stderr, "Exporting parse formats", toExport
     
     if os.path.exists(output) and clear:
         shutil.rmtree(output)
     if not os.path.exists(output):
         os.makedirs(output)
     if inputSuffixes != None:
         inputFileNames = []
         for suffix in inputSuffixes:
             inputFileNames.append(input + suffix)
     else:
         inputFileNames = [input]
 
     for inputFileName in inputFileNames:
         print >> sys.stderr, "Processing input file", inputFileName
         corpusRoot = ETUtils.ETFromObj(inputFileName).getroot()
         documents = corpusRoot.findall("document")
         counter = ProgressCounter(len(documents), "Documents")
         counts = {"corpus":defaultdict(int)}
         for fileExt in toExport:
             counts[fileExt] = defaultdict(int)
         for document in documents:
             counter.update()
             counts["corpus"]["documents"] += 1
             exportId = IXMLUtils.getExportId(document, exportIds)
             # Open document output files
             outfiles = {}
             for fileExt in toExport:
                 #print output, exportId , fileExt
                 if useSetDirs:
                     outfilePath = os.path.join(output, document.get("set"), exportId + "." + fileExt)
                 else:
                     outfilePath = os.path.join(output, exportId + "." + fileExt)
                 if os.path.exists(outfilePath): # check for overlapping files
                     raise Exception("Export file '" + str(outfilePath) + "' already exists")
                 if not os.path.exists(os.path.dirname(outfilePath)):
                     os.makedirs(os.path.dirname(outfilePath))
                 outfiles[fileExt] = codecs.open(outfilePath, "wt", "utf-8")
             # Export document text
             if "txt" in outfiles and document.get("text") != None:
                 outfiles["txt"].write(document.get("text"))
                 if "txt" not in counts:
                     counts["txt"] = defaultdict(int)
                 counts["txt"]["documents"] += 1
             # Process all the sentences in the document
             sentenceCount = 0
             for sentence in document.findall("sentence"):
                 counts["corpus"]["sentences"] += 1
                 parse = IXMLUtils.getParseElement(sentence, parseName)
                 tokenization = IXMLUtils.getTokenizationElement(sentence, tokenizerName)
                 if "sentences" in outfiles:
                     outfiles["sentences"].write(sentence.get("text").strip().replace("\n", " ").replace("\r", " ") + "\n")
                     counts["sentences"]["sentences"] += 1
                 if "ptb" in outfiles:
                     if self.exportPennTreeBank(parse, outfiles["ptb"]):
                         counts["ptb"]["sentences"] += 1
                 if tokenization != None:
                     if "tok" in outfiles:
                         if self.exportTokenization(tokenization, parse, sentence, outfiles["tok"]):
                             counts["tok"]["sentences"] += 1
                     if "sd" in outfiles:
                         if self.exportStanfordDependencies(parse, tokenization, outfiles["sd"], tokenIdOffset):
                             counts["sd"]["sentences"] += 1
                     for conllFormat in ("conll", "conllx", "conllu"):
                         if conllFormat in outfiles:
                             if self.exportCoNLL(tokenization, parse, outfiles[conllFormat], conllFormat, counts[conllFormat]):
                                 counts[conllFormat]["sentences"] += 1
                     if "epe" in outfiles:
                         if self.exportEPE(tokenization, parse, sentence, sentenceCount, outfiles["epe"]):
                             counts["epe"]["sentences"] += 1
                 sentenceCount += 1
             # Close document output files
             for fileExt in outfiles:
                 outfiles[fileExt].close()
                 outfiles[fileExt] = None
         
     print >> sys.stderr, "Parse export counts:"
     for k in sorted(counts.keys()):
         print >> sys.stderr, "  " + str(k) + ":", dict(counts[k])
Example #13
0
def loadSet(path,
            setName=None,
            level="a2",
            sitesAreArguments=False,
            a2Tags=["a2", "rel"],
            readScores=False,
            debug=False,
            subPath=None,
            origIdType=None):
    assert level in ["txt", "a1", "a2"]
    if path.endswith(".tar.gz") or path.endswith(".tgz") or path.endswith(
            ".zip"):
        import tempfile
        import zipfile
        import shutil
        dir = tempfile.mkdtemp()
        if path.endswith(".zip"):
            with zipfile.ZipFile(path, "r") as f:
                f.extractall(dir)
        else:
            import tarfile
            f = tarfile.open(path, "r")
            f.extractall(dir)
        # Check if compressed directory is included in the package, like in the ST'11 corpus files
        compressedFilePath = os.path.join(
            dir,
            os.path.basename(path)[:-len(".tar.gz")])
        if not os.path.exists(compressedFilePath):
            compressedFilePath = os.path.join(
                dir,
                os.path.basename(path)[:-len(".tgz")])
        if not os.path.exists(
                compressedFilePath
        ):  # at least CO training set has a different dirname inside the tarfile
            compressedFilePath = compressedFilePath.rsplit("_", 1)[0]
            print >> sys.stderr, "Package name directory does not exist, trying", compressedFilePath
        if os.path.exists(compressedFilePath):
            print >> sys.stderr, "Reading document set from compressed filename directory", compressedFilePath
            dir = compressedFilePath
        if subPath != None:
            dir = os.path.join(compressedFilePath, subPath)
        f.close()
    elif path.endswith(".txt"):
        import tempfile
        import shutil
        dir = tempfile.mkdtemp()
        shutil.copy2(path, os.path.join(dir, os.path.basename(path)))
    else:
        dir = path

    ids = set()
    documents = []
    license = None
    if os.path.exists(os.path.join(dir, "LICENSE")):
        licenseFile = open(os.path.join(dir, "LICENSE"), "rt")
        license = "".join(licenseFile.readlines())
        licenseFile.close()
    origIds = {}
    for filename in os.listdir(dir):
        if filename.endswith(".txt"):
            if filename.startswith(
                    "._"
            ):  # a hack to skip the broken files in the GRO13 data packages
                continue
            id = filename.rsplit(".", 1)[0]
            ids.add(id)
            origIds[id] = IXMLUtils.getOrigId(os.path.join(dir, filename),
                                              origIdType)
    for id in sorted(list(ids)):
        #print "Loading", id
        doc = Document(id, dir, a2Tags, readScores, debug, origId=origIds[id])
        doc.dataSet = setName
        doc.license = license
        documents.append(doc)

    if dir != path:
        shutil.rmtree(dir)
    return documents
Example #14
0
def export(input, output, parse, tokenization=None, toExport=["tok", "ptb", "sd"], inputSuffixes=None, clear=False, tokenIdOffset=0, exportIds=None):
    print >> sys.stderr, "##### Export Parse #####"
    if toExport == None:
        toExport = ["txt", "sentences", "tok", "ptb", "sd"]
    print >> sys.stderr, "Exporting parse formats", toExport
    
    if os.path.exists(output) and clear:
        shutil.rmtree(output)
    if not os.path.exists(output):
        os.makedirs(output)
    if inputSuffixes != None:
        inputFileNames = []
        for suffix in inputSuffixes:
            inputFileNames.append(input + suffix)
    else:
        inputFileNames = [input]

    for inputFileName in inputFileNames:
        print >> sys.stderr, "Processing input file", inputFileName
        corpusRoot = ETUtils.ETFromObj(inputFileName).getroot()
        documents = corpusRoot.findall("document")
        counter = ProgressCounter(len(documents), "Documents")
        counts = defaultdict(int)
        for document in documents:
            counter.update()
#             docId = document.get("pmid")
#             if docId == None:
#                 docId = document.get("origId")
#             if docId == None:
#                 docId = document.get("id")
            exportId = IXMLUtils.getExportId(document, exportIds)
            counts["document"] += 1
            # Open document output files
            outfiles = {}
            for fileExt in toExport:
                #print output, exportId , fileExt
                outfilePath = output + "/" + exportId + "." + fileExt
                if os.path.exists(outfilePath): # check for overlapping files
                    raise Exception("Export file '" + str(outfilePath) + "' already exists")
                outfiles[fileExt] = codecs.open(outfilePath, "wt", "utf-8")
            # Export document text
            if "txt" in outfiles and document.get("text") != None:
                outfiles["txt"].write(document.get("text"))
                counts["txt"] += 1
            # Process all the sentences in the document
            for sentence in document.findall("sentence"):
                counts["sentence"] += 1
                parseElement = None
                for e in sentence.getiterator("parse"):
                    if e.get("parser") == parse:
                        parseElement = e
                        counts["parse"] += 1
                        break
                if tokenization == None and parseElement != None: # guard: a sentence may lack the requested parse
                    tokenization = parseElement.get("tokenizer")
                tokenizationElement = None
                for e in sentence.getiterator("tokenization"):
                    if e.get("tokenizer") == tokenization:
                        tokenizationElement = e
                        counts["tokenization"] += 1
                        break
                if "sentences" in outfiles:
                    outfiles["sentences"].write(sentence.get("text").strip().replace("\n", " ").replace("\r", " ") + "\n")
                    counts["sentences"] += 1
                if "tok" in outfiles:
                    if exportTokenization(tokenizationElement, parseElement, sentence, outfiles["tok"]):
                        counts["tok"] += 1
                if "ptb" in outfiles:
                    if exportPennTreeBank(parseElement, outfiles["ptb"]):
                        counts["ptb"] += 1
                if "sd" in outfiles:
                    if exportStanfordDependencies(parseElement, tokenizationElement, outfiles["sd"], tokenIdOffset):
                        counts["sd"] += 1
            # Close document output files
            for fileExt in outfiles:
                outfiles[fileExt].close()
                outfiles[fileExt] = None
        
    print >> sys.stderr, "Parse export counts:"
    for k in sorted(counts.keys()):
        print >> sys.stderr, "  " + str(k) + ":", counts[k]
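
A usage sketch for the module-level export function above, assuming it is in scope; the corpus path and output directory are placeholders and "McCC" is again used as the parse name.

# Hypothetical call: write per-document .tok, .ptb and .sd files for the whole corpus.
export("corpus.xml", "exported-parses", "McCC", toExport=["tok", "ptb", "sd"])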