class TextExtractor(object):
    '''Download a document from a URL and extract its plain text.

    The methods are meant to be chained; every method returns ``self``.
    A typical sequence is ``getResponse()`` followed by
    ``openDocxResponse()`` or ``openPdfResponse()`` and then ``getText()``.
    ``minePdf()`` is an alternative PDF path that works directly on the
    downloaded bytes.

    Attributes set along the way:
      url, fileFormat -- constructor arguments, stored unchanged.
      response, content -- set by getResponse().
      document -- set by openDocxResponse() / openPdfResponse().
      fp -- set by openPdfResponse().
      miner -- set by minePdf().
      text -- set by minePdf(), getDocxText(), getPdfText(), getText().
    '''

    def __init__(self, url, fileFormat=None):
        '''Store the target URL and the declared file format.

        fileFormat is only consulted by getText(), which recognises
        'pdf' and 'docx'.
        '''
        self.url = url
        self.fileFormat = fileFormat

    def getResponse(self):
        '''Fetch self.url and keep the body in self.content.

        The response object stays available as self.response, but the
        underlying connection is closed once the body has been read so
        the socket is not leaked.
        '''
        self.response = urllib2.urlopen(self.url)
        try:
            self.content = self.response.read()
        finally:
            self.response.close()
        return self

    def openDocxResponse(self):
        '''Parse the downloaded content as a .docx archive.

        Reads 'word/document.xml' out of the zip container and stores
        the parsed XML root element in self.document.
        '''
        archive = zipfile.ZipFile(StringIO(self.content), 'r')
        try:
            xmlContent = archive.read('word/document.xml')
        finally:
            # The XML has been copied out; release the archive handle.
            archive.close()
        self.document = etree.fromstring(xmlContent)
        return self

    def openPdfResponse(self):
        '''Wrap the downloaded content in a PdfFileReader (self.document).

        The buffer is kept on self.fp.
        NOTE(review): presumably the reader pulls pages from the buffer
        lazily, which is why it is retained -- confirm against the PDF
        library in use.
        '''
        self.fp = StringIO(self.content)
        self.document = PdfFileReader(self.fp)
        return self

    def minePdf(self, decode=True):
        '''Extract text from the downloaded content with Miner.

        Runs of whitespace in the extracted text are collapsed to single
        spaces. When decode is exactly True the text is then reduced to
        ASCII: byte strings are decoded as UTF-8 first, and characters
        that cannot be encoded as ASCII are dropped.

        Returns self so the call can be chained like the other methods.
        '''
        self.miner = Miner(self.content)
        extracted = self.miner.extract_text()
        self.text = ' '.join(extracted.split())
        # Identity check kept deliberately: only a literal True enables it.
        if decode is True:
            text = self.text
            # Only byte strings need decoding; calling .decode() on text
            # that is already unicode triggers an implicit ASCII encode
            # under Python 2 and fails on any non-ASCII character.
            if isinstance(text, bytes):
                text = text.decode('utf8')
            self.text = text.encode('ascii', 'ignore')
        return self

    def getDocxText(self):
        '''Collect the text of every paragraph in self.document.

        A paragraph's text is the concatenation of its text (t) elements,
        with each tab element rendered as a tab character. Empty
        paragraphs are skipped and the remaining ones are joined with a
        single space into self.text.
        '''
        # Qualified tag names are loop-invariant, so build them once.
        # NOTE(review): nsprefixes is defined outside this class;
        # presumably nsprefixes['w'] is the WordprocessingML namespace.
        namespace = '{' + nsprefixes['w'] + '}'
        paragraphTag = namespace + 'p'
        textTag = namespace + 't'
        tabTag = namespace + 'tab'

        paragraphs = [e for e in self.document.iter() if e.tag == paragraphTag]

        paratextlist = []
        for para in paragraphs:
            # A single sentence may be spread over several text elements,
            # so gather the pieces and join them per paragraph.
            pieces = []
            for element in para.iter():
                if element.tag == textTag and element.text:
                    pieces.append(element.text)
                elif element.tag == tabTag:
                    pieces.append('\t')
            paratext = u''.join(pieces)
            if paratext:
                paratextlist.append(paratext)

        self.text = ' '.join(paratextlist)
        return self

    def getPdfText(self):
        '''Join the extracted text of every page of self.document.

        Calls extractText() on each entry of self.document.pages and
        joins the results with single spaces into self.text.
        '''
        self.text = ' '.join(page.extractText() for page in self.document.pages)
        return self

    def getText(self):
        '''Conditional wrapper for getDocxText and getPdfText.

        Dispatches on self.fileFormat; any format other than 'pdf' or
        'docx' results in an empty self.text. Returns self so the call
        can be chained like the other methods.
        '''
        if self.fileFormat == 'pdf':
            return self.getPdfText()
        if self.fileFormat == 'docx':
            return self.getDocxText()
        self.text = ''
        return self