Example #1
0
def insertParses(input, parsePath, output=None, parseName="McCC", extraAttributes=None):
    """
    Insert pre-computed parses into the sentence elements of a corpus.

    Every "document" and "section" element of the corpus is visited. For each
    one a parse file named after the element is looked up under parsePath,
    first with the ".sd" extension and then with ".dep". If a file is found,
    it is passed together with each "sentence" child of the element to
    insertParse, and falsy return values are counted as failures.

    The parse file name is taken from the element's "pmid" attribute, falling
    back to "origId" and then to the element id when "pmid" is missing.

    input -- the corpus, in any form accepted by ETUtils.ETFromObj
    parsePath -- directory containing the parse files. If the path contains
        ".tar.gz", the part up to and including ".tar.gz" is opened as a tar
        archive and the rest is used as the directory inside the archive.
    output -- if not None, the corpus is written here with ETUtils.write
    parseName -- forwarded to insertParse
    extraAttributes -- forwarded to insertParse (None means an empty dict)

    Returns the corpus tree.
    """
    import tarfile
    from SentenceSplitter import openFile

    if extraAttributes is None:
        extraAttributes = {}

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting parses from", parsePath
    tarFile = None
    if ".tar.gz" in parsePath:
        # partition splits on the first occurrence only, so a later ".tar.gz"
        # inside the inner path does not break the unpacking
        tarFilePath, _, parsePath = parsePath.partition(".tar.gz")
        tarFilePath += ".tar.gz"
        tarFile = tarfile.open(tarFilePath)
        # startswith is safe when nothing follows the archive name
        if parsePath.startswith("/"):
            parsePath = parsePath[1:]

    docCount = 0
    failCount = 0
    sentenceCount = 0
    sourceElements = list(corpusRoot.getiterator("document")) + list(corpusRoot.getiterator("section"))
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    try:
        for document in sourceElements:
            docCount += 1
            docId = document.get("id")
            if docId is None:
                docId = "CORPUS.d" + str(docCount)
            # Identifier used for the parse file name and progress messages
            origId = document.get("pmid")
            if origId is None:
                origId = document.get("origId")
            if origId is None:
                origId = docId
            origId = str(origId)

            f = openFile(os.path.join(parsePath, origId + ".sd"), tarFile)
            if f is None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
                f = openFile(os.path.join(parsePath, origId + ".dep"), tarFile)
            if f is not None:
                try:
                    # TODO: Following for-loop is the same as when used with a real parser, and should
                    # be moved to its own function.
                    for sentence in document.findall("sentence"):
                        sentenceCount += 1
                        counter.update(0, "Processing Documents (" + str(sentence.get("id")) + "/" + origId + "): ")
                        # Forward the caller's extraAttributes instead of discarding them
                        if not insertParse(sentence, f, parseName, extraAttributes=extraAttributes):
                            failCount += 1
                finally:
                    f.close()
            counter.update(1, "Processing Documents (" + docId + "/" + origId + "): ")
    finally:
        # Release the archive even if parse insertion raised
        if tarFile is not None:
            tarFile.close()

    print >> sys.stderr, "Stanford conversion was inserted to", sentenceCount, "sentences,", failCount, "failed"

    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree
Example #2
0
def insertParses(input, parsePath, output=None, parseName="McCC", tokenizationName = None, makePhraseElements=True, extraAttributes=None):
    """
    Insert pre-computed parse trees into the sentence elements of a corpus.

    Every "document" and "section" element of the corpus is visited. For each
    one a parse file named after the element is looked up under parsePath,
    first with the ".ptb" extension and then with ".pstree". Elements without
    a parse file are skipped. The file must contain exactly one line per
    "sentence" child of the element; each line is passed with its sentence to
    insertParse, and falsy return values are counted as failures.

    The parse file name is taken from the element's "pmid" attribute, falling
    back to "origId" and then to the element id when "pmid" is missing.

    input -- the corpus, in any form accepted by ETUtils.ETFromObj
    parsePath -- directory containing the parse files. If the path contains
        ".tar.gz", the part up to and including ".tar.gz" is opened as a tar
        archive and the rest is used as the directory inside the archive.
    output -- if not None, the corpus is written here with ETUtils.write
    parseName, tokenizationName -- accepted but not used by this function
    makePhraseElements -- forwarded to insertParse
    extraAttributes -- forwarded to insertParse (None means an empty dict)

    Returns the corpus tree.
    """
    import tarfile
    from SentenceSplitter import openFile

    if extraAttributes is None:
        extraAttributes = {}

    print >> sys.stderr, "Loading corpus", input
    corpusTree = ETUtils.ETFromObj(input)
    print >> sys.stderr, "Corpus file loaded"
    corpusRoot = corpusTree.getroot()

    print >> sys.stderr, "Inserting parses from", parsePath
    tarFile = None
    if ".tar.gz" in parsePath:
        # partition splits on the first occurrence only, so a later ".tar.gz"
        # inside the inner path does not break the unpacking
        tarFilePath, _, parsePath = parsePath.partition(".tar.gz")
        tarFilePath += ".tar.gz"
        # Only the archive exists on disk; the inner path lives inside it
        assert os.path.exists(tarFilePath), tarFilePath
        tarFile = tarfile.open(tarFilePath)
        # startswith is safe when nothing follows the archive name
        if parsePath.startswith("/"):
            parsePath = parsePath[1:]
    else:
        assert os.path.exists(parsePath), parsePath

    docCount = 0
    failCount = 0
    numCorpusSentences = 0
    sourceElements = list(corpusRoot.getiterator("document")) + list(corpusRoot.getiterator("section"))
    counter = ProgressCounter(len(sourceElements), "McCC Parse Insertion")
    try:
        for document in sourceElements:
            docCount += 1
            # Resolve the id first so the progress message cannot hit a None
            docId = document.get("id")
            if docId is None:
                docId = "CORPUS.d" + str(docCount)
            origId = document.get("pmid")
            if origId is None:
                origId = document.get("origId")
            if origId is None:
                origId = document.get("id")
            origId = str(origId)
            counter.update(1, "Processing Documents (" + docId + "/" + origId + "): ")

            f = openFile(os.path.join(parsePath, origId + ".ptb"), tarFile)
            if f is None: # file with BioNLP'11 extension not found, try BioNLP'09 extension
                f = openFile(os.path.join(parsePath, origId + ".pstree"), tarFile)
                if f is None: # no parse found
                    continue
            try:
                parseStrings = f.readlines()
            finally:
                f.close()
            sentences = document.findall("sentence")
            numCorpusSentences += len(sentences)
            assert len(sentences) == len(parseStrings), (origId, len(sentences), len(parseStrings))
            # TODO: Following for-loop is the same as when used with a real parser, and should
            # be moved to its own function.
            # NOTE(review): parseName and tokenizationName are not forwarded to
            # insertParse here -- confirm whether insertParse should receive them.
            for sentence, treeLine in zip(sentences, parseStrings):
                if not insertParse(sentence, treeLine, makePhraseElements=makePhraseElements, extraAttributes=extraAttributes, docId=origId):
                    failCount += 1
    finally:
        # Release the archive even if parse insertion raised
        if tarFile is not None:
            tarFile.close()

    print >> sys.stderr, "Inserted parses for", numCorpusSentences, "sentences (" + str(failCount) + " failed)"
    if failCount == 0:
        print >> sys.stderr, "All sentences have a parse"
    else:
        print >> sys.stderr, "Warning, a failed parse exists for", failCount, "out of", numCorpusSentences, "sentences"
        print >> sys.stderr, "The \"pennstring\" attribute of these sentences has an empty string."
    if output is not None:
        print >> sys.stderr, "Writing output to", output
        ETUtils.write(corpusRoot, output)
    return corpusTree