예제 #1
0
class ExecuteConverter:
    """
        Runs the PDF table extraction: converts a PDF to XML with the
        external ``pdftohtml`` tool, classifies the XML and creates the
        HTML output files.
    """

    # DTD written to "pdf2xml.dtd" in the target folder before the conversion.
    _DTD_TEXT = (
        "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
        "<!ELEMENT pdf2xml (page+,line*,fontspec*)>\n"
        "<!ELEMENT page (fontspec*, text*)>\n"
        "<!ATTLIST page\n"
        "number CDATA #REQUIRED\n"
        "position CDATA #REQUIRED\n"
        "top CDATA #REQUIRED\n"
        "left CDATA #REQUIRED\n"
        "height CDATA #REQUIRED\n"
        "width CDATA #REQUIRED\n"
        ">\n"
        "<!ELEMENT fontspec EMPTY>\n"
        "<!ATTLIST fontspec\n"
        "id CDATA #REQUIRED\n"
        "size CDATA #REQUIRED\n"
        "family CDATA #REQUIRED\n"
        "color CDATA #REQUIRED\n"
        ">\n"
        "<!ELEMENT text (#PCDATA | b | i)*>\n"
        "<!ATTLIST text\n"
        "top CDATA #REQUIRED\n"
        "left CDATA #REQUIRED\n"
        "width CDATA #REQUIRED\n"
        "height CDATA #REQUIRED\n"
        "font CDATA #REQUIRED\n"
        ">\n"
        "<!ELEMENT b (#PCDATA)>\n"
        "<!ELEMENT i (#PCDATA)>\n"
        "<!ELEMENT line (text+)>\n"
        "<!ATTLIST line\n"
        "typ CDATA #REQUIRED\n"
        "top CDATA #REQUIRED\n"
        "left CDATA #REQUIRED\n"
        "font CDATA #REQUIRED\n"
        ">"
    )

    def __init__(self):
        self.__resultTuple = None
        # Set by extractTables(); None means no extraction has run yet.
        self.__outputObj = None
        self.__dtdFile = None
        self.__cmdLine = None

    @staticmethod
    def _xmlFileName(path):
        """
            Return the XML file name for the PDF at ``path``: the base name
            with a trailing ".pdf" replaced by ".xml".
        """
        baseName = os.path.basename(path)
        # str.rstrip(".pdf") would strip any trailing '.', 'p', 'd' or 'f'
        # characters ("app.pdf" -> "a"), so cut the exact suffix instead.
        if baseName.endswith(".pdf"):
            baseName = baseName[:-len(".pdf")]
        return baseName + ".xml"

    def extractTables(self, path, target):
        """
            Starts the table extraction. Using only this method, nothing will be returned,
            but the HTML output Files will be created in the specified output folder.

            path   -- location of the PDF file to convert.
            target -- output folder; it is created when missing and becomes
                      the current working directory of the process.

            Raises OSError when the output folder cannot be created or
            entered, or when ``pdftohtml`` cannot be started.
        """
        # Function-scope import keeps this block self-contained.
        import subprocess

        # Resolve both locations before changing directory; afterwards a
        # relative path would be looked up inside the target folder.
        path = os.path.abspath(path)
        target = os.path.abspath(target)

        try:
            os.mkdir(target)
        except OSError:
            # An already existing folder is fine, anything else is an error.
            if not os.path.isdir(target):
                raise
        os.chdir(target)

        self.__dtdFile = open(os.path.join(target, "pdf2xml.dtd"), "w")
        self.buildDtd()

        self.__cmdLine = "pdftohtml -xml " + path
        print(self.__cmdLine)
        # Argument list without a shell: paths containing spaces or shell
        # metacharacters reach pdftohtml unchanged.
        subprocess.call(["pdftohtml", "-xml", path])

        xmlFile = self._xmlFileName(path)
        # NOTE(review): presumably moves the generated XML into target;
        # fileMover is defined outside this block.
        fileMover.moveXmlFile(path=path, target=target)

        # starting the extraction
        firstClassification = FirstClassification(target)
        self.__resultTuple = firstClassification.run(target + "/" + xmlFile)

        tableList = self.__resultTuple[0]
        fontsList = self.__resultTuple[1]
        resultPath = self.__resultTuple[2]

        self.__outputObj = Output(tableList, fontsList, resultPath)
        self.__outputObj.createOutput()

    def getTableList(self, outputTypeObj=None):
        """
            Return a list. The output type depends on the parameter.
            When no parameter is given, a new GetOutputStringIOList is used,
            whose return type is a list of stringIO's containing the content of
            the generated HTML output files.

            Raises AttributeError when extractTables() has not been run yet.
        """
        if self.__outputObj is None:
            raise AttributeError(
                "extractTables() must be called before getTableList()")
        if outputTypeObj is None:
            # Build the default per call; a default created in the signature
            # would be one instance shared by every call.
            outputTypeObj = GetOutputStringIOList()
        self.__outputObj.setOutputType(outputTypeObj)
        return self.__outputObj.getOutputList()

    def buildDtd(self):
        """
            Write the DTD to the file opened by extractTables() and close it.
        """
        dtdFile = self.__dtdFile
        try:
            dtdFile.write(self._DTD_TEXT)
        finally:
            # Close the handle even when the write fails.
            dtdFile.close()
예제 #2
0
class ExecuteConverter:
    """
        Runs the PDF table extraction: converts a PDF to XML with the
        external ``pdftohtml`` tool, classifies the XML and creates the
        HTML output files.
    """

    # DTD written to "pdf2xml.dtd" in the target folder before the conversion.
    _DTD_TEXT = (
        "<?xml version=\"1.0\" encoding=\"iso-8859-1\"?>\n"
        "<!ELEMENT pdf2xml (page+,line*,fontspec*)>\n"
        "<!ELEMENT page (fontspec*, text*)>\n"
        "<!ATTLIST page\n"
        "number CDATA #REQUIRED\n"
        "position CDATA #REQUIRED\n"
        "top CDATA #REQUIRED\n"
        "left CDATA #REQUIRED\n"
        "height CDATA #REQUIRED\n"
        "width CDATA #REQUIRED\n"
        ">\n"
        "<!ELEMENT fontspec EMPTY>\n"
        "<!ATTLIST fontspec\n"
        "id CDATA #REQUIRED\n"
        "size CDATA #REQUIRED\n"
        "family CDATA #REQUIRED\n"
        "color CDATA #REQUIRED\n"
        ">\n"
        "<!ELEMENT text (#PCDATA | b | i)*>\n"
        "<!ATTLIST text\n"
        "top CDATA #REQUIRED\n"
        "left CDATA #REQUIRED\n"
        "width CDATA #REQUIRED\n"
        "height CDATA #REQUIRED\n"
        "font CDATA #REQUIRED\n"
        ">\n"
        "<!ELEMENT b (#PCDATA)>\n"
        "<!ELEMENT i (#PCDATA)>\n"
        "<!ELEMENT line (text+)>\n"
        "<!ATTLIST line\n"
        "typ CDATA #REQUIRED\n"
        "top CDATA #REQUIRED\n"
        "left CDATA #REQUIRED\n"
        "font CDATA #REQUIRED\n"
        ">"
    )

    def __init__(self):
        self.__resultTuple = None
        # Set by extractTables(); None means no extraction has run yet.
        self.__outputObj = None
        self.__dtdFile = None
        self.__cmdLine = None

    @staticmethod
    def _xmlFileName(path):
        """
            Return the XML file name for the PDF at ``path``: the base name
            with a trailing ".pdf" replaced by ".xml".
        """
        baseName = os.path.basename(path)
        # str.rstrip(".pdf") would strip any trailing '.', 'p', 'd' or 'f'
        # characters ("app.pdf" -> "a"), so cut the exact suffix instead.
        if baseName.endswith(".pdf"):
            baseName = baseName[:-len(".pdf")]
        return baseName + ".xml"

    def extractTables(self, path, target):
        """
            Starts the table extraction. Using only this method, nothing will be returned,
            but the HTML output Files will be created in the specified output folder.

            path   -- location of the PDF file to convert.
            target -- output folder; it is created when missing and becomes
                      the current working directory of the process.

            Raises OSError when the output folder cannot be created or
            entered, or when ``pdftohtml`` cannot be started.
        """
        # Function-scope import keeps this block self-contained.
        import subprocess

        # Resolve both locations before changing directory; afterwards a
        # relative path would be looked up inside the target folder.
        path = os.path.abspath(path)
        target = os.path.abspath(target)

        try:
            os.mkdir(target)
        except OSError:
            # An already existing folder is fine, anything else is an error.
            if not os.path.isdir(target):
                raise
        os.chdir(target)

        self.__dtdFile = open(os.path.join(target, "pdf2xml.dtd"), "w")
        self.buildDtd()

        self.__cmdLine = "pdftohtml -xml " + path
        print(self.__cmdLine)
        # Argument list without a shell: paths containing spaces or shell
        # metacharacters reach pdftohtml unchanged.
        subprocess.call(["pdftohtml", "-xml", path])

        xmlFile = self._xmlFileName(path)
        # NOTE(review): presumably moves the generated XML into target;
        # fileMover is defined outside this block.
        fileMover.moveXmlFile(path=path, target=target)

        # starting the extraction
        firstClassification = FirstClassification(target)
        self.__resultTuple = firstClassification.run(target + "/" + xmlFile)

        tableList = self.__resultTuple[0]
        fontsList = self.__resultTuple[1]
        resultPath = self.__resultTuple[2]

        self.__outputObj = Output(tableList, fontsList, resultPath)
        self.__outputObj.createOutput()

    def getTableList(self, outputTypeObj=None):
        """
            Return a list. The output type depends on the parameter.
            When no parameter is given, a new GetOutputStringIOList is used,
            whose return type is a list of stringIO's containing the content of
            the generated HTML output files.

            Raises AttributeError when extractTables() has not been run yet.
        """
        if self.__outputObj is None:
            raise AttributeError(
                "extractTables() must be called before getTableList()")
        if outputTypeObj is None:
            # Build the default per call; a default created in the signature
            # would be one instance shared by every call.
            outputTypeObj = GetOutputStringIOList()
        self.__outputObj.setOutputType(outputTypeObj)
        return self.__outputObj.getOutputList()

    def buildDtd(self):
        """
            Write the DTD to the file opened by extractTables() and close it.
        """
        dtdFile = self.__dtdFile
        try:
            dtdFile.write(self._DTD_TEXT)
        finally:
            # Close the handle even when the write fails.
            dtdFile.close()