def _basic_params(self):
    """Set up the 'extract' step from ``self.extra_args`` and load Tika if bundled.

    ``extra_args[0]`` is stored as ``self.pdftotext``; passing any additional
    argument switches ``self.force_update`` on.  ``self.tika`` is assigned
    only when ``lib/tika-app-1.2.jar`` exists under ``self.cwd``.
    """
    self.name = 'extract'
    args = self.extra_args
    self.pdftotext = args[0]
    # More than one extra argument means "re-extract even if output exists".
    self.force_update = len(args) > 1

    loader = classPathHacker()
    tika_jar = os.path.join(self.cwd, 'lib', 'tika-app-1.2.jar')
    if not os.path.exists(tika_jar):
        # No bundled jar: leave self.tika unset.
        return

    # The jar must be on the classpath before the Java package is importable.
    loader.addFile(tika_jar)
    from org.apache.tika import Tika
    self.tika = Tika()
def _basic_params(self):
    """Initialise step settings and, when the jar is available, a Tika instance.

    Reads ``self.extra_args``: the first entry becomes ``self.pdftotext`` and
    the presence of a second entry enables ``self.force_update``.  A Tika
    object is created only if ``lib/tika-app-1.2.jar`` is found below
    ``self.cwd``; otherwise ``self.tika`` is never assigned.
    """
    self.name = "extract"
    extra = self.extra_args
    self.pdftotext = extra[0]
    self.force_update = len(extra) > 1

    class_path = classPathHacker()
    jar_location = os.path.join(self.cwd, "lib", "tika-app-1.2.jar")
    jar_present = os.path.exists(jar_location)
    if jar_present:
        # Add the jar to the classpath first; the import below depends on it.
        class_path.addFile(jar_location)
        from org.apache.tika import Tika
        self.tika = Tika()
def _basic_params(self):
    """Validate the pdftotext path, read the force flag and load Tika if bundled.

    ``extra_args[0]`` must point at an existing file; if it does not, an
    error is logged and the process exits with status 1.  Any additional
    argument enables ``self.force_update``.  ``self.tika`` is assigned only
    when ``lib/tika-app-1.2.jar`` exists under ``self.cwd``.
    """
    self.name = 'extract'
    pdftotext_path = self.extra_args[0]
    self.pdftotext = pdftotext_path
    if not os.path.exists(pdftotext_path):
        logging.error('pdftotext not found!')
        sys.exit(1)

    self.force_update = len(self.extra_args) > 1

    loader = classPathHacker()
    tika_jar = os.path.join(self.cwd, 'lib', 'tika-app-1.2.jar')
    if not os.path.exists(tika_jar):
        # No bundled jar: leave self.tika unset.
        return

    # The jar must be on the classpath before the Java package is importable.
    loader.addFile(tika_jar)
    from org.apache.tika import Tika
    self.tika = Tika()
class Extract(textprocessor.TextProcessor):
    """
    Extract plain text from the files attached to each item.

    Handles .txt, .html, .doc/.docx (through Tika, when its jar was loaded)
    and .pdf (through the external pdftotext executable), and writes one
    UTF-8 text file per itemID.
    """
    # Class-level import: binds ``java`` as a class attribute so methods can
    # reach the Java class as ``self.java.io.File``.
    import java.io.File

    def _basic_params(self):
        """Read settings from ``self.extra_args`` and load Tika if bundled.

        ``extra_args[0]`` is the pdftotext executable; any further argument
        enables ``self.force_update``.  ``self.tika`` is assigned only when
        ``lib/tika-app-1.2.jar`` exists under ``self.cwd``.
        """
        self.name = "extract"
        self.pdftotext = self.extra_args[0]
        self.force_update = len(self.extra_args) > 1

        jarLoad = classPathHacker()
        tikaPath = os.path.join(self.cwd, "lib", "tika-app-1.2.jar")
        if os.path.exists(tikaPath):
            # The jar must be on the classpath before the import can succeed.
            jarLoad.addFile(tikaPath)
            from org.apache.tika import Tika
            self.tika = Tika()

    def _read_with_tika(self, filename):
        """Return the text Tika produces for *filename* as a unicode string."""
        reader = self.tika.parse(self.java.io.File(filename))
        chars = []
        try:
            # NOTE(review): presumably a java.io.Reader, whose read() yields
            # one character code per call; the original stop condition
            # (first non-positive value) is kept unchanged.
            c = reader.read()
            while c > 0:
                chars.append(unichr(c))
                c = reader.read()
        finally:
            # Close the reader even when a read fails part-way through.
            reader.close()
        return u"".join(chars)

    def _extract_text(self, filename):
        """Return the text of one source file, or u'' for unhandled types."""
        lowered = filename.lower()
        if lowered.endswith(".txt"):
            with codecs.open(filename, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        if lowered.endswith(".html"):
            return strip_tags(filename)
        if lowered.endswith((".doc", ".docx")):
            # Word documents are skipped when Tika was not loaded.
            if getattr(self, "tika", None) is not None:
                return self._read_with_tika(filename)
            return u''
        if lowered.endswith(".pdf"):
            import_args = [self.pdftotext, '-enc', 'UTF-8', '-nopgbrk', filename, '-']
            import_proc = subprocess.Popen(import_args, stdout=subprocess.PIPE)
            return import_proc.communicate()[0].decode('utf-8')
        return u''

    def process(self):
        """Extract text for every item and record which outputs were written.

        Files are grouped by their ``itemID`` metadata; the text of all files
        of an item is concatenated into the ``outfile`` named by the item's
        first file.  Items whose output already exists are skipped unless
        ``self.force_update`` is set.  A JSON list of the written outputs is
        stored in ``self.out_dir`` and the counts are passed to
        ``self.write_html``.
        """
        logging.info("starting to process")

        # Group input files by the item they belong to.
        itemIDs = {}
        for filename in self.files:
            item_id = self.metadata[filename]["itemID"]
            itemIDs.setdefault(item_id, []).append(filename)

        saved = []
        for itemID, filenames in itemIDs.iteritems():
            try:
                out_file = self.metadata[filenames[0]]["outfile"]
                out_dir = os.path.dirname(out_file)
                # An outfile without a directory part yields '', which
                # os.makedirs rejects; nothing needs creating in that case.
                if out_dir and not os.path.exists(out_dir):
                    os.makedirs(out_dir)
                if not os.path.exists(out_file) or self.force_update:
                    text = u''.join(self._extract_text(fn) for fn in filenames)
                    logging.info("processed " + out_file)
                    with codecs.open(out_file, 'w', encoding="utf-8") as f:
                        f.write(text)
                    # The collection is taken from the item's last file,
                    # stated explicitly instead of via a leaked loop variable.
                    saved.append({"itemID": itemID,
                                  "collection": self.metadata[filenames[-1]]["collection"],
                                  "filename": out_file})
                self.update_progress()
            except:
                # Deliberately broad: one bad item must not stop the batch.
                # NOTE(review): Tika may raise Java throwables; confirm they
                # would still be caught before narrowing this clause.
                logging.error(traceback.format_exc())

        if self.progress_initialized:
            self.progress_file.write('<1000>\n')

        json_out = os.path.join(self.out_dir, self.name + self.collection + ".json")
        with codecs.open(json_out, 'wb', encoding='utf-8') as f:
            json.dump(saved, f)

        params = {"SUCCEEDED": str(len(saved)), "TOTAL": str(len(itemIDs))}
        self.write_html(params)
class Extract(textprocessor.TextProcessor):
    """
    Extract plain text from the files attached to each item.

    Handles .txt, .html, .doc/.docx (through Tika, when its jar was loaded)
    and .pdf (through the external pdftotext executable), and writes one
    UTF-8 text file per itemID.
    """
    # Class-level import: binds ``java`` as a class attribute so methods can
    # reach the Java class as ``self.java.io.File``.
    import java.io.File

    def _basic_params(self):
        """Validate pdftotext, read the force flag and load Tika if bundled.

        ``extra_args[0]`` must be an existing path to pdftotext; otherwise an
        error is logged and the process exits with status 1.  Any further
        argument enables ``self.force_update``.  ``self.tika`` is assigned
        only when ``lib/tika-app-1.2.jar`` exists under ``self.cwd``.
        """
        self.name = 'extract'
        self.pdftotext = self.extra_args[0]
        if not os.path.exists(self.pdftotext):
            logging.error('pdftotext not found!')
            sys.exit(1)
        self.force_update = len(self.extra_args) > 1

        jarLoad = classPathHacker()
        tikaPath = os.path.join(self.cwd, 'lib', 'tika-app-1.2.jar')
        if os.path.exists(tikaPath):
            # The jar must be on the classpath before the import can succeed.
            jarLoad.addFile(tikaPath)
            from org.apache.tika import Tika
            self.tika = Tika()

    def _read_with_tika(self, filename):
        """Return the text Tika produces for *filename* as a unicode string."""
        reader = self.tika.parse(self.java.io.File(filename))
        chars = []
        try:
            # NOTE(review): presumably a java.io.Reader, whose read() yields
            # one character code per call; the original stop condition
            # (first non-positive value) is kept unchanged.
            c = reader.read()
            while c > 0:
                chars.append(unichr(c))
                c = reader.read()
        finally:
            # Close the reader even when a read fails part-way through.
            reader.close()
        return u''.join(chars)

    def _extract_text(self, filename):
        """Return the text of one source file, or u'' for unhandled types."""
        fname = filename.lower()
        if fname.endswith('.txt'):
            with codecs.open(filename, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        if fname.endswith('.html'):
            return strip_tags(filename)
        if fname.endswith(('.doc', '.docx')):
            # Word documents are skipped when Tika was not loaded.
            if getattr(self, 'tika', None) is not None:
                return self._read_with_tika(filename)
            return u''
        if fname.endswith('.pdf'):
            import_args = [self.pdftotext, '-enc', 'UTF-8', '-nopgbrk', filename, '-']
            import_proc = subprocess.Popen(import_args, stdout=subprocess.PIPE)
            return import_proc.communicate()[0].decode('utf-8')
        return u''

    def process(self):
        """Extract text for every item and record which outputs were written.

        Files are grouped by their ``itemID`` metadata; the text of all files
        of an item is concatenated into the ``outfile`` named by the item's
        first file.  Items whose output already exists are skipped unless
        ``self.force_update`` is set.  A JSON list of the written outputs is
        stored in ``self.out_dir`` and the counts are passed to
        ``self.write_html``.
        """
        logging.info('starting to process')

        # Group input files by the item they belong to.
        itemIDs = {}
        for filename in self.files:
            itemid = self.metadata[filename]['itemID']
            itemIDs.setdefault(itemid, []).append(filename)

        saved = []
        for (itemID, filenames) in itemIDs.iteritems():
            try:
                out_file = self.metadata[filenames[0]]['outfile']
                out_dir = os.path.dirname(out_file)
                # An outfile without a directory part yields '', which
                # os.makedirs rejects; nothing needs creating in that case.
                if out_dir and not os.path.exists(out_dir):
                    os.makedirs(out_dir)
                if not os.path.exists(out_file) or self.force_update:
                    text = u''.join(self._extract_text(fn) for fn in filenames)
                    logging.info('processed ' + out_file)
                    with codecs.open(out_file, 'w', encoding='utf-8') as f:
                        f.write(text)
                    # The collection is taken from the item's last file,
                    # stated explicitly instead of via a leaked loop variable.
                    saved.append({'itemID': itemID,
                                  'collection': self.metadata[filenames[-1]]['collection'],
                                  'filename': out_file})
                self.update_progress()
            except:
                # Deliberately broad: one bad item must not stop the batch.
                # NOTE(review): Tika may raise Java throwables; confirm they
                # would still be caught before narrowing this clause.
                logging.error(traceback.format_exc())

        if self.progress_initialized:
            self.progress_file.write('<1000>\n')

        json_out = os.path.join(self.out_dir, self.name + self.collection + '.json')
        with codecs.open(json_out, 'wb', encoding='utf-8') as f:
            json.dump(saved, f)

        params = {'SUCCEEDED': str(len(saved)), 'TOTAL': str(len(itemIDs))}
        self.write_html(params)
class Extract(textprocessor.TextProcessor):
    """
    Extract plain text from the files attached to each item.

    Handles .txt, .html, .doc/.docx (through Tika, when its jar was loaded)
    and .pdf (through the external pdftotext executable), and writes one
    UTF-8 text file per itemID.
    """
    # Class-level import: binds ``java`` as a class attribute so methods can
    # reach the Java class as ``self.java.io.File``.
    import java.io.File

    def _basic_params(self):
        """Read settings from ``self.extra_args`` and load Tika if bundled.

        ``extra_args[0]`` is the pdftotext executable; any further argument
        enables ``self.force_update``.  ``self.tika`` is assigned only when
        ``lib/tika-app-1.2.jar`` exists under ``self.cwd``.
        """
        self.name = 'extract'
        self.pdftotext = self.extra_args[0]
        self.force_update = len(self.extra_args) > 1

        jarLoad = classPathHacker()
        tikaPath = os.path.join(self.cwd, 'lib', 'tika-app-1.2.jar')
        if os.path.exists(tikaPath):
            # The jar must be on the classpath before the import can succeed.
            jarLoad.addFile(tikaPath)
            from org.apache.tika import Tika
            self.tika = Tika()

    def _read_with_tika(self, filename):
        """Return the text Tika produces for *filename* as a unicode string."""
        reader = self.tika.parse(self.java.io.File(filename))
        chars = []
        try:
            # NOTE(review): presumably a java.io.Reader, whose read() yields
            # one character code per call; the original stop condition
            # (first non-positive value) is kept unchanged.
            c = reader.read()
            while c > 0:
                chars.append(unichr(c))
                c = reader.read()
        finally:
            # Close the reader even when a read fails part-way through.
            reader.close()
        return u''.join(chars)

    def _pdf_to_text(self, filename):
        """Return pdftotext's output for *filename*, or u'' if the call fails."""
        import_args = [self.pdftotext, '-enc', 'UTF-8', '-nopgbrk', filename, '-']
        try:
            import_proc = subprocess.Popen(import_args, stdout=subprocess.PIPE)
            return import_proc.communicate()[0].decode('utf-8')
        except:
            # A failing PDF is logged and contributes no text; the item's
            # other files are still extracted and written.
            logging.error(traceback.format_exc())
            return u''

    def _extract_text(self, filename):
        """Return the text of one source file, or u'' for unhandled types."""
        fname = filename.lower()
        if fname.endswith('.txt'):
            with codecs.open(filename, 'r', encoding='utf-8', errors='ignore') as f:
                return f.read()
        if fname.endswith('.html'):
            return strip_tags(filename)
        if fname.endswith(('.doc', '.docx')):
            # Word documents are skipped when Tika was not loaded.
            if getattr(self, 'tika', None) is not None:
                return self._read_with_tika(filename)
            return u''
        if fname.endswith('.pdf'):
            return self._pdf_to_text(filename)
        return u''

    def process(self):
        """Extract text for every item and record which outputs were written.

        A missing pdftotext executable is logged but does not abort the run.
        Files are grouped by their ``itemID`` metadata; the text of all files
        of an item is concatenated into the ``outfile`` named by the item's
        first file.  Items whose output already exists are skipped unless
        ``self.force_update`` is set.  A JSON list of the written outputs is
        stored in ``self.out_dir`` and the counts are passed to
        ``self.write_html``.
        """
        if not os.path.exists(self.pdftotext):
            logging.error('pdftotext not found!')
        logging.info('starting to process')

        # Group input files by the item they belong to.
        itemIDs = {}
        for filename in self.files:
            itemid = self.metadata[filename]['itemID']
            itemIDs.setdefault(itemid, []).append(filename)

        saved = []
        for (itemID, filenames) in itemIDs.iteritems():
            try:
                out_file = self.metadata[filenames[0]]['outfile']
                out_dir = os.path.dirname(out_file)
                # An outfile without a directory part yields '', which
                # os.makedirs rejects; nothing needs creating in that case.
                if out_dir and not os.path.exists(out_dir):
                    os.makedirs(out_dir)
                if not os.path.exists(out_file) or self.force_update:
                    text = u''.join(self._extract_text(fn) for fn in filenames)
                    logging.info('processed ' + out_file)
                    with codecs.open(out_file, 'w', encoding='utf-8') as f:
                        f.write(text)
                    # The collection is taken from the item's last file,
                    # stated explicitly instead of via a leaked loop variable.
                    saved.append({'itemID': itemID,
                                  'collection': self.metadata[filenames[-1]]['collection'],
                                  'filename': out_file})
                self.update_progress()
            except:
                # Deliberately broad: one bad item must not stop the batch.
                # NOTE(review): Tika may raise Java throwables; confirm they
                # would still be caught before narrowing this clause.
                logging.error(traceback.format_exc())

        if self.progress_initialized:
            self.progress_file.write('<1000>\n')

        json_out = os.path.join(self.out_dir, self.name + self.collection + '.json')
        with codecs.open(json_out, 'wb', encoding='utf-8') as f:
            json.dump(saved, f)

        params = {'SUCCEEDED': str(len(saved)), 'TOTAL': str(len(itemIDs))}
        self.write_html(params)