def do_data(ftype, data, afile):
    """Hand `data` to the processing routine that matches `ftype`.

    `ftype` is compared against the `constants.BF_*` values.  The module
    for a given type is imported only once that type has been matched.
    `constants.BF_UNKNOWN`, and any type without a branch below, is ignored.

    Args:
        ftype: file-type identifier, one of the `constants.BF_*` values.
        data: payload forwarded unchanged to the selected routine.
        afile: forwarded unchanged to the selected routine.

    Returns:
        None in every case; the selected routine's result is discarded.
    """
    if ftype == constants.BF_UNKNOWN:
        return

    # Select the handler first; the single call happens at the bottom.
    if ftype == constants.BF_BZIP2:
        from blueflower.modules.bzip2 import bzip2_do_data as handler
    elif ftype == constants.BF_DOCX:
        from blueflower.modules.docx import docx_do_data as handler
    elif ftype == constants.BF_GZ:
        from blueflower.modules.gz import gz_do_data as handler
    elif ftype == constants.BF_PDF:
        from blueflower.modules.pdf import pdf_do_data as handler
    elif ftype == constants.BF_TAR:
        from blueflower.modules.tar import tar_do_data as handler
    elif ftype == constants.BF_TEXT:
        from blueflower.modules.text import text_do_data as handler
    elif ftype == constants.BF_XLSX:
        from blueflower.modules.xlsx import xlsx_do_data as handler
    elif ftype == constants.BF_ZIP:
        from blueflower.modules.zip import zip_do_data as handler
    else:
        # No routine is registered for this type: nothing to do.
        return

    handler(data, afile)
def do_data(ftype, data, afile):
    """Route `data` to the module that handles the file type named `ftype`.

    Recognised names are "bzip2", "gz", "pdf", "tar", "text" and "zip".
    "other" and every unrecognised name are ignored.  A handler module is
    imported only when its type is actually requested.

    Args:
        ftype: file-type name (string).
        data: payload forwarded unchanged to the handler.
        afile: forwarded unchanged to the handler.

    Returns:
        None in every case; the handler's result is discarded.
    """
    if ftype == "other":
        return

    if ftype == "bzip2":
        from blueflower.modules.bzip2 import bzip2_do_data
        bzip2_do_data(data, afile)
        return

    if ftype == "gz":
        from blueflower.modules.gz import gz_do_data
        gz_do_data(data, afile)
        return

    if ftype == "pdf":
        from blueflower.modules.pdf import pdf_do_data
        pdf_do_data(data, afile)
        return

    if ftype == "tar":
        from blueflower.modules.tar import tar_do_data
        tar_do_data(data, afile)
        return

    if ftype == "text":
        from blueflower.modules.text import text_do_data
        text_do_data(data, afile)
        return

    if ftype == "zip":
        from blueflower.modules.zip import zip_do_data
        zip_do_data(data, afile)
        return
def xlsx_do_xlsx(axl, afile):
    """Flatten every sheet of a workbook to text and hand it to `text_do_data`.

    Each row becomes one line: its cell values separated by single spaces.
    Rows (across all sheets, in order) are separated by a blank line.

    Args:
        axl: workbook object exposing `nsheets` and `sheet_by_index(i)`;
            each sheet must expose `nrows` and `row_values(j)`.
            (presumably an xlrd workbook -- confirm against the caller)
        afile: forwarded unchanged to `log_error` / `text_do_data`.

    Returns:
        None.  A TypeError raised while reading the workbook is logged via
        `log_error` and the workbook is skipped.
    """
    rows = []
    try:
        for i in xrange(axl.nsheets):
            sheet = axl.sheet_by_index(i)
            for j in xrange(sheet.nrows):
                # row_values() may return non-string cells (e.g. numbers).
                # Joining those directly raises TypeError, which used to make
                # the whole workbook be skipped.  Format each cell as text
                # first; '%s' leaves existing strings untouched.
                cells = ['%s' % (value,) for value in sheet.row_values(j)]
                rows.append(' '.join(cells))
    except TypeError as e:
        log_error(str(e), afile)
        return
    text = '\n\n'.join(rows)
    text_do_data(text, afile)
def pdf_do_pdf(astream, afile):
    """Extract the text of a PDF stream and hand it to `text_do_data`.

    Every page returned by `PDFPage.get_pages` is rendered through a
    `TextConverter` (utf-8 codec, default `LAParams`) into an in-memory
    buffer; the buffer's content is then passed to `text_do_data`.

    Args:
        astream: stream passed to `PDFPage.get_pages`.
        afile: forwarded unchanged to `text_do_data`.

    Returns:
        None.  Exceptions raised during extraction propagate to the caller.
    """
    rsrcmgr = PDFResourceManager()
    retstr = StringIO()
    try:
        device = TextConverter(rsrcmgr, retstr, codec='utf-8',
                               laparams=LAParams())
        try:
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            # Empty page-number set and maxpages=0 are passed through as in
            # the original call; no page restriction is requested here.
            for page in PDFPage.get_pages(astream, set(), maxpages=0,
                                          password='', caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        finally:
            # Close the converter even when a page fails to process.
            device.close()
        text = retstr.getvalue()
    finally:
        # Release the buffer on both the success and the failure path.
        retstr.close()
    text_do_data(text, afile)
def docx_do_docx(azip, afile):
    """Extract the paragraph text of a .docx archive and pass it on.

    Reads 'word/document.xml' from `azip`, walks every WordprocessingML
    paragraph element (`p`), concatenates the non-empty text runs (`t`)
    inside it, and joins the resulting paragraphs with blank lines before
    calling `text_do_data`.

    Args:
        azip: archive object exposing `read(name)`.
            (presumably a zipfile.ZipFile -- confirm against the caller)
        afile: forwarded unchanged to `text_do_data`.

    Returns:
        None.  Errors from reading or parsing the document propagate.
    """
    namespace = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
    par = namespace + 'p'
    txt = namespace + 't'

    xml_content = azip.read('word/document.xml')
    tree = XML(xml_content)

    paragraphs = []
    # Element.iter() replaces getiterator(), which is deprecated and no
    # longer exists on ElementTree elements in Python 3.9+.
    for paragraph in tree.iter(par):
        texts = [node.text for node in paragraph.iter(txt) if node.text]
        # Paragraphs without any text run are dropped entirely.
        if texts:
            paragraphs.append(''.join(texts))

    text = '\n\n'.join(paragraphs)
    text_do_data(text, afile)
def docx_do_docx(azip, afile):
    """Pull the visible text out of a .docx archive and scan it.

    The main document part ("word/document.xml") is parsed; for each
    WordprocessingML paragraph element the non-empty text runs are
    concatenated, and the paragraphs are joined with a blank line between
    them.  The result is handed to `text_do_data`.

    Args:
        azip: archive object exposing `read(name)`.
            (presumably a zipfile.ZipFile -- confirm against the caller)
        afile: forwarded unchanged to `text_do_data`.

    Returns:
        None.  Errors from reading or parsing the document propagate.
    """
    word_namespace = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"
    par = word_namespace + "p"
    txt = word_namespace + "t"

    xml_content = azip.read("word/document.xml")
    tree = XML(xml_content)

    paragraphs = []
    # Use Element.iter(): getiterator() is deprecated and was removed from
    # ElementTree elements in Python 3.9.
    for paragraph in tree.iter(par):
        texts = [node.text for node in paragraph.iter(txt) if node.text]
        # Skip paragraphs that contain no text run at all.
        if texts:
            paragraphs.append("".join(texts))

    text = "\n\n".join(paragraphs)
    text_do_data(text, afile)
def pdf_do_pdf(astream, afile):
    """Extract the text of a PDF stream and hand it to `text_do_data`.

    Pages from `PDFPage.get_pages` are rendered through a `TextConverter`
    (utf-8 codec, default `LAParams`, no image writer) into an in-memory
    bytes buffer, whose content is then passed to `text_do_data`.

    Args:
        astream: stream passed to `PDFPage.get_pages`.
        afile: forwarded unchanged to `log_error` / `text_do_data`.

    Returns:
        None.  `PDFTextExtractionNotAllowed` is logged via `log_error` and
        the document is skipped; other exceptions propagate.
    """
    # The context manager closes the buffer on every exit path, including
    # the early return below and an exception raised by text_do_data.
    with io.BytesIO() as outstream:
        laparams = LAParams()
        rsrcmgr = PDFResourceManager(caching=True)
        device = TextConverter(rsrcmgr, outstream, codec='utf-8',
                               laparams=laparams, imagewriter=None)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        try:
            for page in PDFPage.get_pages(astream, set(), maxpages=0,
                                          password='', caching=True,
                                          check_extractable=True):
                interpreter.process_page(page)
        except PDFTextExtractionNotAllowed as e:
            log_error(str(e), afile)
            return
        text = outstream.getvalue()
        text_do_data(text, afile)
def do_data(ftype, data, afile):
    """Dispatch `data` to the handler registered for the type name `ftype`.

    Handled names: 'bzip2', 'gz', 'pdf', 'tar', 'text', 'zip'.  The name
    'other' and any name not listed are silently ignored.  The handler's
    module is imported only after its name has been matched.

    Args:
        ftype: file-type name (string).
        data: payload forwarded unchanged to the handler.
        afile: forwarded unchanged to the handler.

    Returns:
        None in every case; the handler's result is discarded.
    """
    handler = None

    if ftype == 'other':
        return
    elif ftype == 'bzip2':
        from blueflower.modules.bzip2 import bzip2_do_data as handler
    elif ftype == 'gz':
        from blueflower.modules.gz import gz_do_data as handler
    elif ftype == 'pdf':
        from blueflower.modules.pdf import pdf_do_data as handler
    elif ftype == 'tar':
        from blueflower.modules.tar import tar_do_data as handler
    elif ftype == 'text':
        from blueflower.modules.text import text_do_data as handler
    elif ftype == 'zip':
        from blueflower.modules.zip import zip_do_data as handler

    # Unmatched names leave `handler` unset and fall through as a no-op.
    if handler is not None:
        handler(data, afile)