Esempio n. 1
0
def process(fullpath, config, rcontext, columns=None):
    """Extract selected metadata fields and the body text of a file via Tika.

    The file at ``fullpath`` is parsed with ``tika.AutoDetectParser``; the
    resulting parse context and metadata are then handed to
    ``extract.tika_extract``.

    Args:
        fullpath: Path of the file to parse.
        config: Forwarded unchanged to ``extract.tika_extract``.
        rcontext: Forwarded unchanged to ``extract.tika_extract``.
        columns: Not used by this function.

    Returns:
        A list holding the result of ``metadata.get`` for each key listed
        below (in that order), followed by the body text as the last element.
    """
    # Order matters: the returned list is positional.
    metadata_keys = (
        "Creation-Date",
        "Last-Modified",
        "Last-Save-Date",
        "Revision-Number",
        "Author",
        "Last-Author",
        "Template",
        "Word-Count",
        "title",
        "subject",
        "Company",
        "Keywords",
        "Page-Count",
        "Character Count",
    )

    parser = tika.AutoDetectParser()
    handler = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # The parser consumes the stream but does not close it, so release the
    # file handle here even when parsing raises.
    stream = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(stream, handler, metadata, context)
    finally:
        stream.close()

    processed = [metadata.get(key) for key in metadata_keys]
    processed.append(handler.toString())

    extract.tika_extract(fullpath, context, metadata, config, rcontext)

    return processed
Esempio n. 2
0
File: xls.py Progetto: vicgc/Uforia
def process(fullpath, config, rcontext, columns=None):
    """Extract positionally selected metadata values and body text via Tika.

    The file at ``fullpath`` is parsed with ``tika.AutoDetectParser``; the
    resulting parse context and metadata are then handed to
    ``extract.tika_extract``.

    Args:
        fullpath: Path of the file to parse.
        config: Forwarded unchanged to ``extract.tika_extract``.
        rcontext: Forwarded unchanged to ``extract.tika_extract``.
        columns: Not used by this function.

    Returns:
        A list with the metadata values whose position in
        ``metadata.names()`` is 0, 3, 4, 7 or 9 (fewer if there are not that
        many names), followed by the body text as the last element.
    """
    parser = tika.AutoDetectParser()
    handler = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # The parser consumes the stream but does not close it, so release the
    # file handle here even when parsing raises.
    stream = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(stream, handler, metadata, context)
    finally:
        stream.close()

    # NOTE(review): selection is by position in metadata.names(); this
    # presumably relies on a stable name ordering -- confirm, or switch to
    # explicit metadata keys.
    wanted_positions = (0, 3, 4, 7, 9)
    results = [
        metadata.get(name)
        for position, name in enumerate(metadata.names())
        if position in wanted_positions
    ]
    results.append(handler.toString())

    extract.tika_extract(fullpath, context, metadata, config, rcontext)

    return results
Esempio n. 3
0
def process(fullpath, config, rcontext, columns=None):
    """Collect a fixed set of Tika metadata fields plus the document text.

    Parses the file at ``fullpath`` with ``tika.AutoDetectParser`` and then
    passes the parse context and metadata on to ``extract.tika_extract``.

    Args:
        fullpath: Path of the file to parse.
        config: Forwarded unchanged to ``extract.tika_extract``.
        rcontext: Forwarded unchanged to ``extract.tika_extract``.
        columns: Not used by this function.

    Returns:
        A list containing ``metadata.get(key)`` for every key listed below,
        in that order, with the body text appended as the final element.
    """
    # Order matters: the returned list is positional.
    metadata_keys = (
        "Creation-Date",
        "Last-Modified",
        "Last-Save-Date",
        "Revision-Number",
        "Author",
        "Last-Author",
        "Template",
        "Word-Count",
        "title",
        "subject",
        "Company",
        "Keywords",
        "Page-Count",
        "Character Count",
    )

    parser = tika.AutoDetectParser()
    handler = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # Closing the stream is the caller's job; the parser only consumes it.
    # try/finally guarantees the file handle is released on parse errors too.
    stream = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(stream, handler, metadata, context)
    finally:
        stream.close()

    processed = [metadata.get(key) for key in metadata_keys]
    processed.append(handler.toString())

    extract.tika_extract(fullpath, context, metadata, config, rcontext)

    return processed
Esempio n. 4
0
File: xls.py Progetto: vicgc/Uforia
def process(fullpath, config, rcontext, columns=None):
    """Collect metadata values at fixed positions plus the document text.

    Parses the file at ``fullpath`` with ``tika.AutoDetectParser`` and then
    passes the parse context and metadata on to ``extract.tika_extract``.

    Args:
        fullpath: Path of the file to parse.
        config: Forwarded unchanged to ``extract.tika_extract``.
        rcontext: Forwarded unchanged to ``extract.tika_extract``.
        columns: Not used by this function.

    Returns:
        A list with the metadata values found at positions 0, 3, 4, 7 and 9
        of ``metadata.names()`` (fewer if there are not that many names),
        with the body text appended as the final element.
    """
    parser = tika.AutoDetectParser()
    handler = tika.BodyContentHandler()
    metadata = tika.Metadata()
    context = tika.ParseContext()

    # Closing the stream is the caller's job; the parser only consumes it.
    # try/finally guarantees the file handle is released on parse errors too.
    stream = tika.FileInputStream(tika.File(fullpath))
    try:
        parser.parse(stream, handler, metadata, context)
    finally:
        stream.close()

    # NOTE(review): values are picked by their index in metadata.names();
    # this presumably assumes a stable name ordering -- confirm, or select by
    # explicit metadata keys instead.
    wanted_positions = (0, 3, 4, 7, 9)
    results = [
        metadata.get(name)
        for position, name in enumerate(metadata.names())
        if position in wanted_positions
    ]
    results.append(handler.toString())

    extract.tika_extract(fullpath, context, metadata, config, rcontext)

    return results