コード例 #1
0
ファイル: extract_text.py プロジェクト: the-sumit/core-engine
class TextExtractor(object):
    '''Fetch a document from a URL and extract its text.

    The methods form a small pipeline and each returns ``self`` so calls
    can be chained, for example::

        TextExtractor(url, 'docx').getResponse().openDocxResponse().getText()

    Attributes set along the way:
      url, fileFormat -- constructor arguments, stored unchanged.
      response        -- object returned by urllib2.urlopen (getResponse).
      content         -- raw body read from the response (getResponse).
      document        -- parsed docx XML root or PdfFileReader instance.
      text            -- extracted text (getText, getDocxText, getPdfText,
                         minePdf).
    '''

    def __init__(self, url, fileFormat=None):
        '''Store the source URL and the declared file format.

        getText() only recognises the exact strings 'pdf' and 'docx' for
        fileFormat; any other value (including None) produces empty text.
        '''
        self.url = url
        self.fileFormat = fileFormat

    def getResponse(self):
        '''Download self.url and keep the raw body in self.content.'''
        self.response = urllib2.urlopen(self.url)
        try:
            self.content = self.response.read()
        finally:
            # The body is fully buffered in self.content, so the connection
            # can be released even when read() fails.
            self.response.close()
        return self

    def openDocxResponse(self):
        '''Parse self.content as a .docx archive.

        Reads 'word/document.xml' out of the zip container and stores the
        parsed XML root in self.document.
        '''
        archive = zipfile.ZipFile(StringIO(self.content), 'r')
        try:
            xmlContent = archive.read('word/document.xml')
        finally:
            archive.close()
        self.document = etree.fromstring(xmlContent)
        return self

    def openPdfResponse(self):
        '''Wrap self.content in a PdfFileReader stored in self.document.

        The in-memory buffer is kept on self.fp so it stays referenced
        alongside the reader.
        '''
        self.fp = StringIO(self.content)
        self.document = PdfFileReader(self.fp)
        return self

    def minePdf(self, decode=True):
        '''Extract text from self.content with Miner into self.text.

        Runs of whitespace are collapsed to single spaces.  When decode is
        exactly True the result is additionally reduced to ASCII, silently
        dropping every character that cannot be represented.
        '''
        self.miner = Miner(self.content)
        text = self.miner.extract_text()
        self.text = ' '.join(text.split())
        # Only the literal True triggers the conversion, as before.
        if decode is True:
            normalized = self.text
            # Decode only byte strings: calling .decode() on text that is
            # already unicode makes Python 2 implicitly encode it to ASCII
            # first, which fails on any non-ASCII character.
            if isinstance(normalized, bytes):
                normalized = normalized.decode('utf8')
            self.text = normalized.encode('ascii', 'ignore')
        return self

    def getDocxText(self):
        '''Collect the text of every paragraph in self.document.

        Non-empty paragraphs are joined with single spaces and stored in
        self.text.
        '''
        # Build the namespace-qualified tag names once instead of once per
        # element visited.
        wordNamespace = '{' + nsprefixes['w'] + '}'
        paragraphTag = wordNamespace + 'p'
        textTag = wordNamespace + 't'
        tabTag = wordNamespace + 'tab'

        paratextlist = []
        for element in self.document.iter():
            if element.tag != paragraphTag:
                continue
            # A single sentence may be spread over several text (t)
            # elements, so gather every piece found below the paragraph.
            pieces = []
            for child in element.iter():
                if child.tag == textTag and child.text:
                    pieces.append(child.text)
                elif child.tag == tabTag:
                    pieces.append('\t')
            # Paragraphs without any text or tab contribute nothing.
            if pieces:
                paratextlist.append(u''.join(pieces))

        self.text = ' '.join(paratextlist)
        return self

    def getPdfText(self):
        '''Join the extracted text of every page in self.document.

        Each page's extractText() result is joined with single spaces and
        stored in self.text.
        '''
        self.text = ' '.join(
            page.extractText() for page in self.document.pages)
        return self

    def getText(self):
        '''Conditional wrapper for getDocxText and getPdfText.

        Dispatches on self.fileFormat; an unrecognised format results in
        an empty self.text.  Returns self like the other pipeline steps.
        '''
        if self.fileFormat == 'pdf':
            return self.getPdfText()
        if self.fileFormat == 'docx':
            return self.getDocxText()
        self.text = ''
        return self