Beispiel #1
0
def process(fullpath, config, rcontext, columns=None):
    """Parse a document with Tika and return selected metadata plus its text.

    Args:
        fullpath: Path of the file to parse; wrapped in ``tika.File``.
        config: Forwarded unchanged to ``extract.tika_extract``.
        rcontext: Forwarded unchanged to ``extract.tika_extract``.
        columns: Not used by this function.

    Returns:
        A list of 15 items: the values of the 14 metadata keys listed in
        ``metadata_keys`` (in that order, each being whatever
        ``metadata.get`` returns for that key), followed by the extracted
        body text.
    """
    metadata_keys = (
        "Creation-Date",
        "Last-Modified",
        "Last-Save-Date",
        "Revision-Number",
        "Author",
        "Last-Author",
        "Template",
        "Word-Count",
        "title",
        "subject",
        "Company",
        "Keywords",
        "Page-Count",
        "Character Count",
    )

    parser = tika.AutoDetectParser()
    handler = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # The parser consumes the stream but leaves closing it to the caller, so
    # release the file handle here even when parsing raises.
    stream = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(stream, handler, metadata, context)
    finally:
        stream.close()

    text = handler.toString()

    processed = [metadata.get(key) for key in metadata_keys]
    processed.append(text)

    # Runs after parsing so it receives the populated metadata and context.
    extract.tika_extract(fullpath, context, metadata, config, rcontext)

    return processed
Beispiel #2
0
def process(fullpath, config, rcontext, columns=None):
    """Parse a document with Tika and return selected metadata plus its text.

    Args:
        fullpath: Path of the file to parse; wrapped in ``tika.File``.
        config: Forwarded unchanged to ``extract.xpdf_extract``.
        rcontext: Forwarded unchanged to ``extract.xpdf_extract``.
        columns: Not used by this function.

    Returns:
        A list of 7 items: the values of the 6 metadata keys listed in
        ``metadata_keys`` (in that order, each being whatever
        ``metadata.get`` returns for that key), followed by the extracted
        body text.
    """
    metadata_keys = (
        "Creation-Date",
        "Last-Modified",
        "Last-Save-Date",
        "Author",
        "producer",
        "xmpTPg:NPages",
    )

    parser = tika.AutoDetectParser()
    handler = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # The parser consumes the stream but leaves closing it to the caller, so
    # release the file handle here even when parsing raises.
    stream = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(stream, handler, metadata, context)
    finally:
        stream.close()

    text = handler.toString()

    processed = [metadata.get(key) for key in metadata_keys]
    processed.append(text)

    extract.xpdf_extract(fullpath, config, rcontext)

    return processed
Beispiel #3
0
def process(fullpath, config, rcontext, columns=None):
    """Parse a document with Tika and return positional metadata plus text.

    Args:
        fullpath: Path of the file to parse; wrapped in ``tika.File``.
        config: Forwarded unchanged to ``extract.tika_extract``.
        rcontext: Forwarded unchanged to ``extract.tika_extract``.
        columns: Not used by this function.

    Returns:
        A list holding the metadata values found at positions 0, 3, 4, 7 and
        9 of ``metadata.names()`` (positions past the end are simply absent),
        followed by the extracted body text.
    """
    # Metadata entries are picked by their position in metadata.names().
    # NOTE(review): this relies on names() returning fields in a stable,
    # known order for the documents handled here -- confirm with callers.
    wanted_positions = (0, 3, 4, 7, 9)

    parser = tika.AutoDetectParser()
    handler = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # The parser consumes the stream but leaves closing it to the caller, so
    # release the file handle here even when parsing raises.
    stream = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(stream, handler, metadata, context)
    finally:
        stream.close()

    text = handler.toString()

    results = [
        metadata.get(name)
        for position, name in enumerate(metadata.names())
        if position in wanted_positions
    ]
    results.append(text)

    # Runs after parsing so it receives the populated metadata and context.
    extract.tika_extract(fullpath, context, metadata, config, rcontext)

    return results
Beispiel #4
0
def __handler(stream):
    """Parse ``stream`` with Tika's auto-detecting parser and return HTML.

    Args:
        stream: Input stream handed straight to ``parser.parse``; it is not
            closed here.

    Returns:
        The string produced by the ``ToHTMLContentHandler`` after parsing.
    """
    html_handler = tika.ToHTMLContentHandler()
    metadata = tika.Metadata()
    parse_context = tika.ParseContext()
    # Metadata and parse context are required by parse() but not returned.
    tika.AutoDetectParser().parse(stream, html_handler, metadata, parse_context)
    html = html_handler.toString()
    return html
Beispiel #5
0
def __parse(stream):
    """Parse ``stream`` with Tika and return its text and metadata.

    Args:
        stream: Input stream handed straight to ``parser.parse``; it is not
            closed here.

    Returns:
        A dict with two keys: ``"content"`` (the body handler's string
        output) and ``"metadata"`` (a dict mapping every name from
        ``metadata.names()`` to ``metadata.get(name)``).
    """
    auto_parser = tika.AutoDetectParser()
    # The handler is constructed with -1, as in the original call.
    body = tika.BodyContentHandler(-1)
    meta = tika.Metadata()
    parse_context = tika.ParseContext()

    auto_parser.parse(stream, body, meta, parse_context)

    return {
        "content": body.toString(),
        "metadata": {name: meta.get(name) for name in meta.names()},
    }
Beispiel #6
0
def __parse(stream):
    """Parse ``stream`` with Tika and return its text and metadata.

    Args:
        stream: Input stream handed straight to ``parser.parse``; it is not
            closed here.

    Returns:
        A dict with two keys: ``"content"`` (the body handler's string
        output) and ``"metadata"`` (a dict mapping every name from
        ``metadata.names()`` to ``metadata.get(name)``).

    Raises:
        RuntimeError: If ``tika.getVMEnv()`` returns a falsy value.
    """
    vm_env = tika.getVMEnv()
    if not vm_env:
        raise RuntimeError("tika.initVM() not called")

    auto_parser = tika.AutoDetectParser()
    # NOTE(review): the handler is built with no arguments; Tika's default
    # BodyContentHandler is believed to cap output size -- confirm that the
    # cap is intended here.
    body = tika.BodyContentHandler()
    meta = tika.Metadata()
    parse_context = tika.ParseContext()

    auto_parser.parse(stream, body, meta, parse_context)

    result = {}
    result["content"] = body.toString()
    fields = {}
    for field_name in meta.names():
        fields[field_name] = meta.get(field_name)
    result["metadata"] = fields
    return result