def process(fullpath, config, rcontext, columns=None):
    """Parse the file at *fullpath* with Tika and return metadata plus body text.

    Args:
        fullpath: Path of the document to parse.
        config: Passed through unchanged to ``extract.tika_extract``.
        rcontext: Passed through unchanged to ``extract.tika_extract``.
        columns: Accepted for interface compatibility; not used here.

    Returns:
        A list holding the value of ``metadata.get`` for each key in
        ``metadata_keys`` (in that order), followed by the extracted body
        text as the last element.
    """
    # Order matters: callers receive the values positionally.
    metadata_keys = (
        "Creation-Date",
        "Last-Modified",
        "Last-Save-Date",
        "Revision-Number",
        "Author",
        "Last-Author",
        "Template",
        "Word-Count",
        "title",
        "subject",
        "Company",
        "Keywords",
        "Page-Count",
        "Character Count",
    )

    parser = tika.AutoDetectParser()
    handler = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # The parser consumes the stream but leaves closing it to the caller,
    # so release the file handle even when parsing fails.
    stream = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(stream, handler, metadata, context)
    finally:
        stream.close()

    processed = [metadata.get(key) for key in metadata_keys]
    processed.append(handler.toString())

    extract.tika_extract(fullpath, context, metadata, config, rcontext)
    return processed
def process(fullpath, config, rcontext, columns=None):
    """Parse the file at *fullpath* with Tika and return metadata plus body text.

    After parsing, ``extract.xpdf_extract`` is invoked on the same path.

    Args:
        fullpath: Path of the document to parse.
        config: Passed through unchanged to ``extract.xpdf_extract``.
        rcontext: Passed through unchanged to ``extract.xpdf_extract``.
        columns: Accepted for interface compatibility; not used here.

    Returns:
        A list holding the value of ``metadata.get`` for each key in
        ``metadata_keys`` (in that order), followed by the extracted body
        text as the last element.
    """
    # Order matters: callers receive the values positionally.
    metadata_keys = (
        "Creation-Date",
        "Last-Modified",
        "Last-Save-Date",
        "Author",
        "producer",
        "xmpTPg:NPages",
    )

    parser = tika.AutoDetectParser()
    handler = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # The parser consumes the stream but leaves closing it to the caller,
    # so release the file handle even when parsing fails.
    stream = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(stream, handler, metadata, context)
    finally:
        stream.close()

    processed = [metadata.get(key) for key in metadata_keys]
    processed.append(handler.toString())

    extract.xpdf_extract(fullpath, config, rcontext)
    return processed
def process(fullpath, config, rcontext, columns=None):
    """Parse the file at *fullpath* with Tika and return selected metadata plus body text.

    Metadata values are chosen by their position in ``metadata.names()``
    rather than by key.

    Args:
        fullpath: Path of the document to parse.
        config: Passed through unchanged to ``extract.tika_extract``.
        rcontext: Passed through unchanged to ``extract.tika_extract``.
        columns: Accepted for interface compatibility; not used here.

    Returns:
        A list of the metadata values found at positions 0, 3, 4, 7 and 9 of
        ``metadata.names()`` (positions beyond the available names are simply
        absent), followed by the extracted body text as the last element.
    """
    # NOTE(review): which fields these positions map to depends entirely on
    # the order in which metadata.names() yields keys -- confirm that order is
    # stable for the document types handled here.
    wanted_positions = {0, 3, 4, 7, 9}

    parser = tika.AutoDetectParser()
    handler = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # The parser consumes the stream but leaves closing it to the caller,
    # so release the file handle even when parsing fails.
    stream = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(stream, handler, metadata, context)
    finally:
        stream.close()

    results = [
        metadata.get(name)
        for position, name in enumerate(metadata.names())
        if position in wanted_positions
    ]
    results.append(handler.toString())

    extract.tika_extract(fullpath, context, metadata, config, rcontext)
    return results
def __handler(stream):
    """Parse *stream* with Tika's auto-detecting parser and return the handler output.

    The content is collected by a ``ToHTMLContentHandler``; the metadata and
    parse context created for the call are discarded afterwards.

    Args:
        stream: Input stream handed straight to ``AutoDetectParser.parse``.
            It is not closed here.

    Returns:
        The string produced by the content handler's ``toString()``.
    """
    html_sink = tika.ToHTMLContentHandler()
    scratch_metadata = tika.Metadata()
    parse_context = tika.ParseContext()

    tika.AutoDetectParser().parse(stream, html_sink, scratch_metadata, parse_context)
    return html_sink.toString()
def __parse(stream):
    """Parse *stream* with Tika and return its body text and metadata.

    The body handler is constructed with ``-1`` as its argument.
    NOTE(review): presumably this disables Tika's write limit -- confirm
    against the BodyContentHandler documentation.

    Args:
        stream: Input stream handed straight to ``AutoDetectParser.parse``.
            It is not closed here.

    Returns:
        A dict with two keys: ``"content"`` (the handler's ``toString()``
        output) and ``"metadata"`` (a dict mapping every name from
        ``metadata.names()`` to ``metadata.get(name)``).
    """
    body = tika.BodyContentHandler(-1)
    doc_metadata = tika.Metadata()
    tika.AutoDetectParser().parse(stream, body, doc_metadata, tika.ParseContext())

    return {
        "content": body.toString(),
        "metadata": {name: doc_metadata.get(name) for name in doc_metadata.names()},
    }
def __parse(stream):
    """Parse *stream* with Tika and return its body text and metadata.

    Args:
        stream: Input stream handed straight to ``AutoDetectParser.parse``.
            It is not closed here.

    Returns:
        A dict with two keys: ``"content"`` (the handler's ``toString()``
        output) and ``"metadata"`` (a dict mapping every name from
        ``metadata.names()`` to ``metadata.get(name)``).

    Raises:
        RuntimeError: If ``tika.getVMEnv()`` returns a falsy value, i.e. the
            VM has not been initialised.
    """
    vm_env = tika.getVMEnv()
    if not vm_env:
        raise RuntimeError("tika.initVM() not called")

    # NOTE(review): the handler is built with its default settings; check
    # whether Tika's default write limit is acceptable for large documents.
    body = tika.BodyContentHandler()
    doc_metadata = tika.Metadata()
    tika.AutoDetectParser().parse(stream, body, doc_metadata, tika.ParseContext())

    return {
        "content": body.toString(),
        "metadata": {name: doc_metadata.get(name) for name in doc_metadata.names()},
    }