Exemple #1
0
def get_text(fileDir):
    """Return the text content of the ``content.xml`` member of a zip archive.

    The archive is presumably an OpenDocument file (the member name
    ``content.xml`` suggests so) -- confirm against callers.

    Args:
        fileDir: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    Returns:
        The result of ``get_text()`` on the soup built from ``content.xml``,
        i.e. the document's text with the markup stripped.

    Raises:
        zipfile.BadZipFile: If ``fileDir`` is not a valid zip archive.
        KeyError: If the archive has no ``content.xml`` member.
    """
    # The context manager guarantees the archive is closed even when reading
    # the member fails; the previous manual close() was skipped on errors.
    with zipfile.ZipFile(fileDir) as document:
        content = document.read('content.xml')

    # The markup is parsed once, from the bytes already read. An earlier
    # revision also built an unused DOM from a second, never-closed member
    # stream; that redundant work has been removed.
    return BeautifulStoneSoup(content).get_text()
Exemple #2
0
def get_text(fileDir):
    """Return the text content of the ``content.xml`` member of a zip archive.

    Args:
        fileDir: Path (or file-like object) accepted by ``zipfile.ZipFile``.

    Returns:
        The result of ``get_text()`` on the soup built from ``content.xml``,
        i.e. the document's text with the markup stripped.

    Raises:
        zipfile.BadZipFile: If ``fileDir`` is not a valid zip archive.
        KeyError: If the archive has no ``content.xml`` member.
    """
    # Use a context manager so the archive handle is released even if
    # reading the member raises; a trailing close() would be skipped then.
    with zipfile.ZipFile(fileDir) as document:
        content = document.read('content.xml')

    # Parsing works on the in-memory bytes, so the archive can already be
    # closed at this point.
    return BeautifulStoneSoup(content).get_text()