Example #1
0
def write_to_temp(img_binary):
    """
    Detect binary data content type and write to tmp file using appropriate
    suffix.

    The content type is detected with ``tika_detector``; data whose type
    cannot be detected, is not in ``VALID_TYPES``, or has no known file
    extension is rejected with a logged warning.

    :param img_binary: Image binary data as a string chunk.
    :type img_binary: str

    :return: Filepath to the written temp file, or None if the data was
        rejected (see above). Errors raised while writing the file propagate
        to the caller.
    :rtype: str | None

    """
    log = logging.getLogger("compute.write_to_temp")
    ct = tika_detector.from_buffer(img_binary)
    if not ct:
        log.warning("Detected no content type (None)")
        return None
    if ct not in VALID_TYPES:
        log.warning("Invalid image type '%s'", ct)
        return None
    ext = mt.guess_extension(ct)
    if not ext:
        # Report the content type: ``ext`` is always empty/None on this path.
        log.warning("Could not guess extension for type: %s", ct)
        return None
    fd, filepath = tempfile.mkstemp(suffix=ext, dir=TEMP_DIR)
    try:
        # Write through the descriptor mkstemp already opened instead of
        # closing it and re-opening the path.
        with os.fdopen(fd, "wb") as ofile:
            ofile.write(img_binary)
    except Exception:
        # Do not leave a partially written temp file behind.
        os.remove(filepath)
        raise
    return filepath
Example #2
0
def async_write_temp(args):
    """
    Detect binary data content type and write to tmp file using appropriate
    suffix. Outputs (key, filepath) to given output Queue.

    The single positional argument is a ``(key, img_binary, out_q)`` tuple.
    It is unpacked in the body rather than in the signature so the function
    is valid on both Python 2 and Python 3; callers keep passing one tuple.

    Nothing is put on the queue when the content type cannot be detected, is
    not in ``VALID_TYPES``, or has no known file extension; a warning is
    logged instead.

    :param args: Tuple of ``(key, img_binary, out_q)``:

        * ``key`` (str) -- key of the element
        * ``img_binary`` (str) -- Image binary data as a string chunk.
        * ``out_q`` (multiprocessing.Queue) -- Output queue.
    :type args: tuple

    """
    key, img_binary, out_q = args
    log = logging.getLogger("compute.async_write_temp[key::%s]" % key)
    ct = tika_detector.from_buffer(img_binary)
    if not ct:
        log.warning("Detected no content type (None)")
        return
    if ct not in VALID_TYPES:
        log.warning("Invalid image type '%s'", ct)
        return
    ext = mt.guess_extension(ct)
    if not ext:
        # Report the content type: ``ext`` is always empty/None on this path.
        log.warning("Could not guess extension for type: %s", ct)
        return
    fd, filepath = tempfile.mkstemp(suffix=ext, dir=TEMP_DIR)
    try:
        # Write through the descriptor mkstemp already opened instead of
        # closing it and re-opening the path.
        with os.fdopen(fd, "wb") as ofile:
            ofile.write(img_binary)
    except Exception:
        # Do not leave a partially written temp file behind.
        os.remove(filepath)
        raise

    out_q.put((key, filepath))
Example #3
0
def write_to_temp(img_binary):
    """
    Detect binary data content type and write to tmp file using appropriate
    suffix.

    The content type is detected with ``tika_detector``; data whose type
    cannot be detected, is not in ``VALID_TYPES``, or has no known file
    extension is rejected with a logged warning.

    :param img_binary: Image binary data as a string chunk.
    :type img_binary: str

    :return: Filepath to the written temp file, or None if the data was
        rejected (see above). Errors raised while writing the file propagate
        to the caller.
    :rtype: str | None

    """
    log = logging.getLogger("compute.write_to_temp")
    ct = tika_detector.from_buffer(img_binary)
    if not ct:
        log.warning("Detected no content type (None)")
        return None
    if ct not in VALID_TYPES:
        log.warning("Invalid image type '%s'", ct)
        return None
    ext = mt.guess_extension(ct)
    if not ext:
        # Report the content type: ``ext`` is always empty/None on this path.
        log.warning("Could not guess extension for type: %s", ct)
        return None
    fd, filepath = tempfile.mkstemp(suffix=ext, dir=TEMP_DIR)
    try:
        # Write through the descriptor mkstemp already opened instead of
        # closing it and re-opening the path.
        with os.fdopen(fd, 'wb') as ofile:
            ofile.write(img_binary)
    except Exception:
        # Do not leave a partially written temp file behind.
        os.remove(filepath)
        raise
    return filepath
Example #4
0
def run_exist_tool(dir_list, output_name, subtype):
    """
    Parse files with Tika and dump their metadata to a JSON file.

    The files come from ``get_file_list(dir_list)``. For every file that
    yields non-empty metadata, a ``{file_name: metadata}`` entry is collected;
    the collected list is handed to ``dump_to_json(output_name, data)``.

    :param dir_list: Passed to ``get_file_list`` to obtain the file paths.
    :param output_name: Passed to ``dump_to_json`` as the output target.
    :param subtype: When not None, the caller-supplied type is trusted and no
        detection is performed, so every file is parsed. When None, each
        file's type is detected and files with no detectable type are skipped.
    :return: None
    """
    file_list = get_file_list(dir_list)
    data = []
    for idx, val in enumerate(file_list):
        print(idx)
        # Read the bytes once: the same content is needed by both the detector
        # and the parser, and a shared file object would already be exhausted
        # by the time the second call reads it.
        with open(val, 'rb') as input_file:
            content = input_file.read()

        # Decide per file; ``subtype`` itself is never modified, so a None
        # subtype triggers detection for every file, not just the first one.
        if subtype is not None:
            mime_type = subtype
        else:
            mime_type = detector.from_buffer(content)
        if mime_type is None:
            continue

        parsed = parser.from_buffer(content)
        if 'metadata' in parsed and parsed['metadata'] is not None:
            file_name = val.split('/')[-1]
            data.append({file_name: parsed['metadata']})

    dump_to_json(output_name, data)
    return
Example #5
0
def detect_content_type(filename_or_url: str) -> str:
    """
    Use tika to get the content type of a file or url.

    If ``filename_or_url`` is an existing file it is handed to tika by path;
    otherwise it is treated as a URL, downloaded, and the response body is
    handed to tika as a buffer.

    TODO there may be faster/ better ways

    :param filename_or_url: Path of a local file, or a URL to download.
    :return: The detected content type.
    :raises Exception: If reading, downloading (including an HTTP error
        status) or detection fails; the original error is attached as
        ``__cause__``.
    :raises AssertionError: If tika returns no content type.
    """
    try:
        if path.isfile(filename_or_url):
            content_type = detector.from_file(filename_or_url)
        else:
            response = requests.get(filename_or_url)
            # Fail on 4xx/5xx instead of reporting the type of an error page.
            response.raise_for_status()
            content_type = detector.from_buffer(BytesIO(response.content))

        log.info(f"Detected '{content_type}' as content type for: {filename_or_url}")

    except Exception as e:
        msg = f"Error detecting content type of '{filename_or_url}' : {e}"
        log.error(msg)
        # Chain the cause so the original traceback is not lost.
        raise Exception(msg) from e

    # Explicit check rather than ``assert`` so it still runs under ``python -O``;
    # the exception type is kept for callers that catch AssertionError.
    if not content_type:
        raise AssertionError(f"No content type detected for: {filename_or_url}")

    return content_type
Example #6
0
def async_write_temp(args):
    """
    Detect binary data content type and write to tmp file using appropriate
    suffix. Outputs (key, filepath) to given output Queue.

    The temp file name is prefixed with the SHA1 hex digest of the data
    followed by a dot.

    The single positional argument is a ``(key, img_binary, out_q)`` tuple.
    It is unpacked in the body rather than in the signature so the function
    is valid on both Python 2 and Python 3; callers keep passing one tuple.

    Nothing is put on the queue when the content type cannot be detected, is
    not in ``VALID_TYPES``, or has no known file extension; a warning is
    logged instead.

    :param args: Tuple of ``(key, img_binary, out_q)``:

        * ``key`` (str) -- key of the element
        * ``img_binary`` (str) -- Image binary data as a string chunk.
        * ``out_q`` (multiprocessing.Queue) -- Output queue.
    :type args: tuple

    """
    key, img_binary, out_q = args
    log = logging.getLogger("compute.async_write_temp[key::%s]" % key)
    ct = tika_detector.from_buffer(img_binary)
    if not ct:
        log.warning("Detected no content type (None)")
        return
    if ct not in VALID_TYPES:
        log.warning("Invalid image type '%s'", ct)
        return
    ext = mt.guess_extension(ct)
    if not ext:
        # Report the content type: ``ext`` is always empty/None on this path.
        log.warning("Could not guess extension for type: %s", ct)
        return
    sha1 = hashlib.sha1(img_binary).hexdigest()
    fd, filepath = tempfile.mkstemp(suffix=ext,
                                    prefix=sha1 + '.',
                                    dir=TEMP_DIR)
    try:
        # Write through the descriptor mkstemp already opened instead of
        # closing it and re-opening the path.
        with os.fdopen(fd, 'wb') as ofile:
            ofile.write(img_binary)
    except Exception:
        # Do not leave a partially written temp file behind.
        os.remove(filepath)
        raise

    out_q.put((key, filepath))
Example #7
0
 def content_type(self):
     """
     Return the content type of this element's bytes.

     The type is detected with ``tika_detector`` from ``self.get_bytes()``
     and stored in ``self._binary_ct_cache``; detection is only run while
     that cache attribute is None.
     """
     cached = self._binary_ct_cache
     if cached is None:
         cached = tika_detector.from_buffer(self.get_bytes())
         self._binary_ct_cache = cached
     return cached
Example #8
0
    # NOTE(review): this is the interior of a function whose ``def`` line and
    # tail are not visible here; ``url`` and ``output_dir`` are presumably its
    # parameters -- confirm against the full source. The ``except X, ex``
    # syntax below is Python 2 only.
    log = logging.getLogger(__name__)

    # Fetch the URL as a stream. Connection failures and HTTP error statuses
    # are logged and reported to the caller as a triple of Nones.
    try:
        r = requests.get(url, stream=True)
        r.raise_for_status()
    except requests.ConnectionError, ex:
        log.warn("Skipping '%s': %s: %s", url, str(type(ex)), str(ex))
        return None, None, None
    except requests.HTTPError, ex:
        log.warn("Skipping '%s': %s (code=%s)", url, r.reason, r.status_code)
        return None, None, None

    # Accumulate the response body in memory, 1024 bytes at a time.
    content = StringIO.StringIO()
    for c in r.iter_content(1024):
        content.write(c)
    # Detect the content type from the downloaded bytes and map it to a file
    # extension; content without a known extension is skipped.
    # NOTE(review): if detection returns None it is passed straight to
    # guess_extension -- confirm that this cannot happen or is handled.
    cont_type = tika_detector.from_buffer(content.getvalue())
    ext = mimetypes.guess_extension(cont_type)
    if not ext:
        log.warn("Skipping '%s': Bad content type '%s'", url, cont_type)
        return None, None, None

    # Build the output directory from the '/'-separated URL segments between
    # index 2 and the last one (for "scheme://host/a/b/name" that is
    # host/a/b), rooted at output_dir.
    segs = url.split('/')
    dirpath = os.path.join(output_dir, *segs[2:-1])
    safe_create_dir(dirpath)

    # File name: last URL segment with its extension replaced by the one
    # guessed from the detected content type.
    basename = os.path.splitext(segs[-1])[0]
    save_pth = os.path.join(dirpath, basename + ext)

    # Only when the target file does not exist yet: compute the SHA1 of the
    # content and a temporary path made of the target path, a dot, and a
    # random UUID hex string.
    if not os.path.isfile(save_pth):
        sha1_checksum = hashlib.sha1(content.getvalue()).hexdigest()
        tmp_pth = '.'.join([save_pth, uuid.uuid4().hex])
import os
# Tika client configuration via environment variables.
#
# The values are assigned through os.environ rather than os.putenv():
# os.putenv() does not update os.environ, so a value set with it is not
# visible to os.getenv()/os.environ lookups made in this process.
# NOTE(review): tika-python appears to read these variables when its modules
# are first imported -- confirm that nothing above this point has already
# imported them, otherwise the setting comes too late.
#
#os.environ['TIKA_VERSION'] = 'default'  # - set to the version string, e.g., 1.12 or default to current Tika version.
#os.environ['TIKA_SERVER_JAR'] = '/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar'  # - set to the full URL to the remote Tika server jar to download and cache.
# The original value carried a leading space (' http://...'), removed here so
# the endpoint is a well-formed URL.
os.environ['TIKA_SERVER_ENDPOINT'] = 'http://localhost:9998'  # - set to the host (local or remote) for the running Tika server jar.
#os.environ['TIKA_SERVER_ENDPOINT'] = 'http://localhost:9998/language/string'  # - set to the host (local or remote) for the running Tika server jar.
#os.environ['TIKA_CLIENT_ONLY'] = 'True'  # - if set to True, then TIKA_SERVER_JAR is ignored, and relies on the value for TIKA_SERVER_ENDPOINT and treats Tika like a REST client.
#os.environ['TIKA_TRANSLATOR'] = 'org/apache/tika/language/translate/'  # - set to the fully qualified class name (defaults to Lingo24) for the Tika translator implementation.
#os.environ['TIKA_SERVER_CLASSPATH'] = '/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar'  # - set to a string (delimited by ':' for each additional path) to prepend to the Tika server jar path.
tika.initVM()
from tika import parser

# Parse a short French phrase and show its metadata and extracted content.
parsed = parser.from_buffer("comme çi comme ça")
print(parsed["metadata"])
print(parsed["content"])

# A ``global`` statement at module level has no effect, so it was dropped;
# this binds a name in this script's own namespace only.
# NOTE(review): if the intent was to switch on tika's verbose mode, this
# assignment does not reach another module's variable -- confirm.
Verbose = True

# Exercise translation (auto-detected and explicit source language),
# MIME-type detection and language identification on the same phrase.
result = translate.auto_from_buffer("comme çi comme ça", 'en')
print(result)
result = detector.from_buffer("comme çi comme ça")
print(result)
result = translate.from_buffer("comme çi comme ça", 'fr', 'en')
print(result)
result = language.from_buffer("comme çi comme ça")
print(result)

# Translate every non-empty entry of ``lines`` (defined outside this section)
# from Russian to English.
for line in lines:
    if line:
        result = translate.from_buffer(line, 'ru', 'en')
        print(result)

print('\n########################### No Errors ####################################')
Example #10
0
 def test_detect_doc_buffer(self):
     """Detecting Newton.doc from an in-memory buffer yields 'application/msword'."""
     with open('tika/tests/arguments/Newton.doc', 'rb') as doc_file:
         payload = doc_file.read()
         detected = from_buffer(payload)
         self.assertEqual(detected, 'application/msword')
Example #11
0
# Tika client configuration via environment variables.
#
# The value is assigned through os.environ rather than os.putenv():
# os.putenv() does not update os.environ, so a value set with it is not
# visible to os.getenv()/os.environ lookups made in this process.
# NOTE(review): tika-python appears to read these variables when its modules
# are first imported -- confirm that nothing above this point has already
# imported them, otherwise the setting comes too late.
#
# The original value carried a leading space (' http://...'), removed here so
# the endpoint is a well-formed URL.
os.environ['TIKA_SERVER_ENDPOINT'] = 'http://localhost:9998'  # - set to the host (local or remote) for the running Tika server jar.
#os.environ['TIKA_SERVER_ENDPOINT'] = 'http://localhost:9998/language/string'  # - set to the host (local or remote) for the running Tika server jar.
#os.environ['TIKA_CLIENT_ONLY'] = 'True'  # - if set to True, then TIKA_SERVER_JAR is ignored, and relies on the value for TIKA_SERVER_ENDPOINT and treats Tika like a REST client.
#os.environ['TIKA_TRANSLATOR'] = 'org/apache/tika/language/translate/'  # - set to the fully qualified class name (defaults to Lingo24) for the Tika translator implementation.
#os.environ['TIKA_SERVER_CLASSPATH'] = '/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar'  # - set to a string (delimited by ':' for each additional path) to prepend to the Tika server jar path.
tika.initVM()
from tika import parser

# Parse a short French phrase and show its metadata and extracted content.
parsed = parser.from_buffer("comme çi comme ça")
print(parsed["metadata"])
print(parsed["content"])

# A ``global`` statement at module level has no effect, so it was dropped;
# this binds a name in this script's own namespace only.
# NOTE(review): if the intent was to switch on tika's verbose mode, this
# assignment does not reach another module's variable -- confirm.
Verbose = True

# Exercise translation (auto-detected and explicit source language),
# MIME-type detection and language identification on the same phrase.
result = translate.auto_from_buffer("comme çi comme ça", 'en')
print(result)
result = detector.from_buffer("comme çi comme ça")
print(result)
result = translate.from_buffer("comme çi comme ça", 'fr', 'en')
print(result)
result = language.from_buffer("comme çi comme ça")
print(result)

# Translate every non-empty entry of ``lines`` (defined outside this section)
# from Russian to English.
for line in lines:
    if line:
        result = translate.from_buffer(line, 'ru', 'en')
        print(result)

print(
    '\n########################### No Errors ####################################'
)