Beispiel #1
0
def main():
    """Run the Tika operations selected on the command line and print them.

    The Tika JAR location is taken from ``args.jar`` when it is truthy,
    otherwise from the ``TIKA_APP_JAR`` environment variable (``None`` if
    that is unset as well).  Every selected operation receives the same
    keyword arguments: ``path``, ``payload`` and ``objectInput`` (the
    process' standard input when ``args.stdin`` is truthy, else ``None``).

    Operations run in a fixed order -- detect, text, language, all,
    metadata -- and each result is printed as soon as it is produced.
    An ``IOError`` raised anywhere in that sequence is swallowed silently
    and ends the run.
    """
    args = get_args()

    jar_location = args.jar or os.environ.get("TIKA_APP_JAR")
    tika = TikaApp(jar_location)

    parameters = {"path": args.file, "payload": args.payload}
    parameters["objectInput"] = sys.stdin if args.stdin else None

    # (attribute on ``args``, TikaApp method name, request pretty printing?)
    # Names are resolved lazily inside the loop so that nothing is looked up
    # for an operation that was not requested.
    operations = (
        ("detect", "detect_content_type", False),
        ("text", "extract_only_content", False),
        ("language", "detect_language", False),
        ("all", "extract_all_content", True),
        ("metadata", "extract_only_metadata", True),
    )

    try:
        for flag, method_name, pretty in operations:
            if not getattr(args, flag):
                continue
            if pretty:
                # Once set, the key stays in ``parameters`` for later calls;
                # only pretty-printing operations follow the first one.
                parameters["pretty_print"] = True
            print(getattr(tika, method_name)(**parameters))
    except IOError:
        pass
def main():
    """Run the Tika operations selected on the command line and print them.

    The JAR path is resolved through a layered lookup of ``TIKA_APP_JAR``:
    the command-line value (``args.jar``) wins, then the process
    environment, then the built-in default ``/opt/tika/tika-app-1.15.jar``.

    The input is ``args.file`` (passed to Tika as ``path``) when it is
    truthy, otherwise ``args.payload`` (passed as ``payload``); if neither
    is given nothing is executed.  Operations run in the order detect,
    text, language, all, and each result is printed immediately.  An
    ``IOError`` raised while running them is swallowed silently.
    """
    args = get_args()

    cli_layer = {"TIKA_APP_JAR": args.jar} if args.jar else {}
    fallback_layer = {"TIKA_APP_JAR": "/opt/tika/tika-app-1.15.jar"}
    settings = ChainMap(cli_layer, os.environ, fallback_layer)

    tika = TikaApp(settings['TIKA_APP_JAR'])

    # (attribute on ``args``, TikaApp method name, extra keyword arguments)
    operations = (
        ("detect", "detect_content_type", {}),
        ("text", "extract_only_content", {}),
        ("language", "detect_language", {}),
        ("all", "extract_all_content", {"pretty_print": True}),
    )

    try:
        # A file path takes precedence over an inline payload.
        if args.file:
            source = {"path": args.file}
        elif args.payload:
            source = {"payload": args.payload}
        else:
            return

        for flag, method_name, extra in operations:
            if getattr(args, flag):
                call_kwargs = dict(source, **extra)
                print(getattr(tika, method_name)(**call_kwargs))
    except IOError:
        pass
Beispiel #3
0
class TikaReader(object):
    """Small convenience wrapper around a ``TikaApp`` client."""

    def __init__(self, path):
        """Create the underlying ``TikaApp`` client.

        :param path: forwarded to ``TikaApp`` as its ``file_jar`` argument.
        """
        client = TikaApp(file_jar=path)
        self.tika_client = client

    def detect_type(self, doc):
        """Return the client's ``detect_content_type`` result for *doc*."""
        client = self.tika_client
        return client.detect_content_type(doc)

    def detect_language(self, doc):
        """Return the client's ``detect_language`` result for *doc*."""
        client = self.tika_client
        return client.detect_language(doc)

    def content(self, doc):
        """Return the client's ``extract_all_content`` result for *doc*."""
        client = self.tika_client
        return client.extract_all_content(doc)
class TikaReader:
    """Wrapper that binds a ``TikaApp`` client to a single document."""

    def __init__(self, file_process, file_jar="tika-app-1.20.jar"):
        """Create the Tika client and remember the document to process.

        :param file_process: the document handed to every Tika call made
            by this reader.
        :param file_jar: location of the Tika application JAR, forwarded
            to ``TikaApp``.  Defaults to ``"tika-app-1.20.jar"``, the
            value that used to be hard-coded here, so existing callers
            are unaffected.
        """
        # Tika client that loads the client JAR file.
        self.tika_client = TikaApp(file_jar=file_jar)
        self.file_process = file_process

    def detect_document_type(self):
        """Detect the MIME content type of the document."""
        return self.tika_client.detect_content_type(self.file_process)

    def detect_language(self):
        """Detect the language used in the document."""
        return self.tika_client.detect_language(self.file_process)

    def extract_complete_info(self, value=False):
        """Extract the complete content of the document.

        :param value: forwarded to ``TikaApp.extract_all_content`` as
            ``convert_to_obj``.
            NOTE(review): presumably toggles returning a parsed object
            instead of raw text -- confirm against the TikaApp docs.
        """
        return self.tika_client.extract_all_content(self.file_process,
                                                    convert_to_obj=value)

    def extract_content_info(self):
        """Extract only the content of the document."""
        return self.tika_client.extract_only_content(self.file_process)
Beispiel #5
0
from tikapp import TikaApp

# Client backed by a locally stored Tika application JAR.
tika_client = TikaApp(
    file_jar="/Users/yma2/Documents/_garage/python/cxm/tika/tika-app-1.20.jar"
)

# Document that every call below is run against.
analyzeFile = "/Users/yma2/Downloads/Azure_Developer_Guide_eBook_ja-JP.pdf"

# Print, in this order: content type, language, content only, metadata only.
# Each method is looked up right before it is called, one after the other.
for _operation in (
    "detect_content_type",
    "detect_language",
    "extract_only_content",
    "extract_only_metadata",
):
    print(getattr(tika_client, _operation)(analyzeFile))
def tika_content_type():
    """Return what ``TikaApp.detect_content_type`` reports for ``test_zip``.

    A fresh client is built from ``TIKA_APP_JAR`` on every call.
    """
    return TikaApp(file_jar=TIKA_APP_JAR).detect_content_type(path=test_zip)