def write_to_temp(img_binary):
    """
    Detect binary data content type and write to tmp file using appropriate
    suffix.

    :param img_binary: Image binary data as a string chunk.
    :type img_binary: str

    :return: Filepath to the written temp file, or None if a temp file could
        not be written (no content type detected, content type not in
        ``VALID_TYPES``, or no extension known for the content type).
    :rtype: str | None

    """
    log = logging.getLogger("compute.write_to_temp")

    ct = tika_detector.from_buffer(img_binary)
    if not ct:
        log.warning("Detected no content type (None)")
        return None
    if ct not in VALID_TYPES:
        log.warning("Invalid image type '%s'", ct)
        return None

    ext = mt.guess_extension(ct)
    if not ext:
        # Report the content type that could not be mapped. ``ext`` is always
        # falsy on this branch, so logging it carries no diagnostic value.
        log.warning("Could not guess extension for type: %s", ct)
        return None

    fd, filepath = tempfile.mkstemp(suffix=ext, dir=TEMP_DIR)
    # Write through the descriptor mkstemp already opened instead of closing
    # it and reopening the file by path.
    with os.fdopen(fd, "wb") as ofile:
        ofile.write(img_binary)
    return filepath
def async_write_temp(key_binary_queue):
    """
    Detect binary data content type and write to tmp file using appropriate
    suffix.

    Outputs (key, filepath) to given output Queue. Nothing is put on the queue
    when the content type is missing, not in ``VALID_TYPES``, or has no known
    file extension.

    :param key_binary_queue: A single ``(key, img_binary, out_q)`` tuple:

        - ``key`` (str): key of the element
        - ``img_binary`` (str): Image binary data as a string chunk.
        - ``out_q`` (multiprocessing.Queue): Output queue.
    :type key_binary_queue: tuple

    """
    # Tuple parameters in the signature are Python 2-only syntax (PEP 3113).
    # Unpacking here accepts exactly the same single positional tuple.
    key, img_binary, out_q = key_binary_queue

    log = logging.getLogger("compute.async_write_temp[key::%s]" % key)

    ct = tika_detector.from_buffer(img_binary)
    if not ct:
        log.warning("Detected no content type (None)")
        return
    if ct not in VALID_TYPES:
        log.warning("Invalid image type '%s'", ct)
        return

    ext = mt.guess_extension(ct)
    if not ext:
        # Report the content type that could not be mapped. ``ext`` is always
        # falsy on this branch, so logging it carries no diagnostic value.
        log.warning("Could not guess extension for type: %s", ct)
        return

    fd, filepath = tempfile.mkstemp(suffix=ext, dir=TEMP_DIR)
    # Write through the descriptor mkstemp already opened instead of closing
    # it and reopening the file by path.
    with os.fdopen(fd, "wb") as ofile:
        ofile.write(img_binary)
    out_q.put((key, filepath))
def write_to_temp(img_binary):
    """
    Detect binary data content type and write to tmp file using appropriate
    suffix.

    :param img_binary: Image binary data as a string chunk.
    :type img_binary: str

    :return: Filepath to the written temp file, or None if a temp file could
        not be written (no content type detected, content type not in
        ``VALID_TYPES``, or no extension known for the content type).
    :rtype: str | None

    """
    log = logging.getLogger("compute.write_to_temp")

    ct = tika_detector.from_buffer(img_binary)
    if not ct:
        log.warning("Detected no content type (None)")
        return None
    if ct not in VALID_TYPES:
        log.warning("Invalid image type '%s'", ct)
        return None

    ext = mt.guess_extension(ct)
    if not ext:
        # ``ext`` is always falsy here; the useful value to log is the
        # content type that had no extension mapping.
        log.warning("Could not guess extension for type: %s", ct)
        return None

    fd, filepath = tempfile.mkstemp(suffix=ext, dir=TEMP_DIR)
    # Reuse the already-open descriptor rather than closing it and opening
    # the same path a second time.
    with os.fdopen(fd, 'wb') as ofile:
        ofile.write(img_binary)
    return filepath
def run_exist_tool(dir_list, output_name, subtype):
    """
    Parse the files listed for ``dir_list`` and collect their metadata.

    Every path returned by ``get_file_list(dir_list)`` is read once as bytes.
    The bytes are parsed, and when the parse result carries non-None
    ``'metadata'`` a ``{file_name: metadata}`` entry is appended to the
    result list. The list is finally handed to
    ``dump_to_json(output_name, data)``.

    :param dir_list: Passed unchanged to ``get_file_list``.
    :param output_name: Passed unchanged to ``dump_to_json``.
    :param subtype: When None, the content type of every file is detected and
        files with no detectable type are skipped. When not None, detection
        is skipped and every file is parsed.
        NOTE(review): the original compared the subtype against itself, so a
        given subtype never filtered anything; that behaviour is preserved
        here. Confirm whether real filtering by detected type was intended.

    """
    file_list = get_file_list(dir_list)

    # Decide once, outside the loop. The original overwrote ``subtype`` on
    # the first iteration, so detection silently stopped after the first file.
    detect_type = subtype is None

    data = []
    for idx, val in enumerate(file_list):
        print(idx)

        # Read the bytes once. Handing the same open file object to both the
        # detector and the parser leaves the second consumer at end-of-file.
        with open(val, 'rb') as input_file:
            content = input_file.read()

        if detect_type and detector.from_buffer(content) is None:
            continue

        parsed = parser.from_buffer(content)
        if 'metadata' in parsed and parsed['metadata'] is not None:
            file_name = val.split('/')[-1]
            data.append({file_name: parsed['metadata']})

    dump_to_json(output_name, data)
def detect_content_type(filename_or_url: str) -> str:
    """
    Use tika to get the content type of a file or url.

    If ``filename_or_url`` names an existing file it is detected from disk;
    otherwise it is fetched with ``requests.get`` and the downloaded bytes
    are detected from a buffer.

    TODO there may be faster/ better ways

    :param filename_or_url: Path of a local file, or a URL to download.
    :return: The detected content type.
    :raises Exception: If detection (or the download) fails; the original
        error is attached as ``__cause__``.
    :raises AssertionError: If detection succeeds but yields an empty result.
    """
    try:
        if path.isfile(filename_or_url):
            content_type = detector.from_file(filename_or_url)
        else:
            buffer = requests.get(filename_or_url).content
            content_type = detector.from_buffer(BytesIO(buffer))
        log.info(f"Detected '{content_type}' as content type for: {filename_or_url}")
    except Exception as e:
        msg = f"Error detecting content type of '{filename_or_url}' : {str(e)}"
        log.error(msg)
        # Chain the cause so the original traceback is not lost.
        raise Exception(msg) from e

    # Explicit check instead of ``assert``: assertions are stripped under
    # ``python -O``. AssertionError is kept so existing handlers still match.
    if not content_type:
        raise AssertionError(
            f"No content type detected for: {filename_or_url}"
        )
    return content_type
def async_write_temp(key_binary_queue):
    """
    Detect binary data content type and write to tmp file using appropriate
    suffix.

    The temp file name is prefixed with the SHA1 hex digest of the data.
    Outputs (key, filepath) to given output Queue. Nothing is put on the queue
    when the content type is missing, not in ``VALID_TYPES``, or has no known
    file extension.

    :param key_binary_queue: A single ``(key, img_binary, out_q)`` tuple:

        - ``key`` (str): key of the element
        - ``img_binary`` (str): Image binary data as a string chunk.
        - ``out_q`` (multiprocessing.Queue): Output queue.
    :type key_binary_queue: tuple

    """
    # Tuple parameters in the signature are Python 2-only syntax (PEP 3113).
    # Unpacking here accepts exactly the same single positional tuple.
    key, img_binary, out_q = key_binary_queue

    log = logging.getLogger("compute.async_write_temp[key::%s]" % key)

    ct = tika_detector.from_buffer(img_binary)
    if not ct:
        log.warning("Detected no content type (None)")
        return
    if ct not in VALID_TYPES:
        log.warning("Invalid image type '%s'", ct)
        return

    ext = mt.guess_extension(ct)
    if not ext:
        # Report the content type that could not be mapped. ``ext`` is always
        # falsy on this branch, so logging it carries no diagnostic value.
        log.warning("Could not guess extension for type: %s", ct)
        return

    sha1 = hashlib.sha1(img_binary).hexdigest()
    fd, filepath = tempfile.mkstemp(suffix=ext, prefix=sha1 + '.',
                                    dir=TEMP_DIR)
    # Write through the descriptor mkstemp already opened instead of closing
    # it and reopening the file by path.
    with os.fdopen(fd, 'wb') as ofile:
        ofile.write(img_binary)
    out_q.put((key, filepath))
def content_type(self):
    """
    Return the content type of this element's bytes.

    The value is detected from ``self.get_bytes()`` on first use and stored
    in ``self._binary_ct_cache``; later calls return the stored value as long
    as it is not None.
    """
    cached = self._binary_ct_cache
    if cached is not None:
        return cached
    self._binary_ct_cache = tika_detector.from_buffer(self.get_bytes())
    return self._binary_ct_cache
# NOTE(review): fragment of a larger download routine. The enclosing ``def``
# and everything after the ``tmp_pth`` assignment are outside this view, and
# the statements are collapsed onto one line, so nesting cannot be confirmed.
# Visible flow:
#   * GET ``url`` with streaming enabled; on ``requests.ConnectionError`` or
#     ``requests.HTTPError`` a warning is logged and (None, None, None) is
#     returned.
#   * The response body is accumulated in memory in 1024-byte chunks and its
#     content type is detected with ``tika_detector.from_buffer``.
#   * ``mimetypes.guess_extension`` maps the type to an extension; when none
#     is found the URL is skipped with (None, None, None).
#   * The save path is built under ``output_dir`` from ``url.split('/')``
#     segments ``[2:-1]`` plus the last segment's base name and the guessed
#     extension (presumably assumes a ``scheme://host/...`` URL -- confirm).
#   * A SHA1 of the payload and a uuid-suffixed temp path are computed around
#     the ``os.path.isfile(save_pth)`` check.
# NOTE(review): ``except X, ex`` and ``StringIO.StringIO`` are Python 2-only.
log = logging.getLogger(__name__) try: r = requests.get(url, stream=True) r.raise_for_status() except requests.ConnectionError, ex: log.warn("Skipping '%s': %s: %s", url, str(type(ex)), str(ex)) return None, None, None except requests.HTTPError, ex: log.warn("Skipping '%s': %s (code=%s)", url, r.reason, r.status_code) return None, None, None content = StringIO.StringIO() for c in r.iter_content(1024): content.write(c) cont_type = tika_detector.from_buffer(content.getvalue()) ext = mimetypes.guess_extension(cont_type) if not ext: log.warn("Skipping '%s': Bad content type '%s'", url, cont_type) return None, None, None segs = url.split('/') dirpath = os.path.join(output_dir, *segs[2:-1]) safe_create_dir(dirpath) basename = os.path.splitext(segs[-1])[0] save_pth = os.path.join(dirpath, basename + ext) if not os.path.isfile(save_pth): sha1_checksum = hashlib.sha1(content.getvalue()).hexdigest() tmp_pth = '.'.join([save_pth, uuid.uuid4().hex])
import os

# Tika client configuration via environment variables.
#
# Values are assigned through ``os.environ`` rather than ``os.putenv``:
# putenv() does not update ``os.environ``, so any later ``os.getenv`` /
# ``os.environ`` lookup in this process would not see the value.
#
# Optional settings (uncomment and adapt as needed):
# os.environ['TIKA_VERSION'] = 'default'  # - set to the version string, e.g., 1.12 or default to current Tika version.
# os.environ['TIKA_SERVER_JAR'] = '/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar'  # - set to the full URL to the remote Tika server jar to download and cache.
# os.environ['TIKA_SERVER_ENDPOINT'] = 'http://localhost:9998/language/string'  # - set to the host (local or remote) for the running Tika server jar.
# os.environ['TIKA_CLIENT_ONLY'] = 'True'  # - if set to True, then TIKA_SERVER_JAR is ignored, and relies on the value for TIKA_SERVER_ENDPOINT and treats Tika like a REST client.
# os.environ['TIKA_TRANSLATOR'] = 'org/apache/tika/language/translate/'  # - set to the fully qualified class name (defaults to Lingo24) for the Tika translator implementation.
# os.environ['TIKA_SERVER_CLASSPATH'] = '/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar'  # - set to a string (delimited by ':' for each additional path) to prepend to the Tika server jar path.

# Host (local or remote) of the running Tika server. The original literal had
# a stray leading space before the scheme, which is dropped here.
os.environ['TIKA_SERVER_ENDPOINT'] = 'http://localhost:9998'

tika.initVM()
from tika import parser

# Parse a short French sample and show what comes back.
parsed = parser.from_buffer("comme çi comme ça")
print(parsed["metadata"])
print(parsed["content"])

global Verbose
Verbose = True

# Translate to English with the source language auto-detected.
result = translate.auto_from_buffer("comme çi comme ça", 'en')
print(result)

# Content-type detection.
result = detector.from_buffer("comme çi comme ça")
print(result)

# Translate with an explicit source language (French -> English).
result = translate.from_buffer("comme çi comme ça", 'fr', 'en')
print(result)

# Language identification.
result = language.from_buffer("comme çi comme ça")
print(result)

# Translate every non-empty input line from Russian to English.
for line in lines:
    if line:
        result = translate.from_buffer(line, 'ru', 'en')
        print(result)

print('\n########################### No Errors ####################################')
def test_detect_doc_buffer(self):
    """Detecting the Newton.doc fixture from its raw bytes reports MS Word."""
    fixture_path = 'tika/tests/arguments/Newton.doc'
    with open(fixture_path, 'rb') as fixture:
        payload = fixture.read()
    detected = from_buffer(payload)
    self.assertEqual(detected, 'application/msword')
# Host (local or remote) of the running Tika server.
#
# Assigned through ``os.environ`` rather than ``os.putenv``: putenv() does not
# update ``os.environ``, so any later ``os.getenv`` / ``os.environ`` lookup in
# this process would not see the value. The original literal also had a stray
# leading space before the scheme, which is dropped here.
os.environ['TIKA_SERVER_ENDPOINT'] = 'http://localhost:9998'

# Optional settings (uncomment and adapt as needed):
# os.environ['TIKA_SERVER_ENDPOINT'] = 'http://localhost:9998/language/string'  # - set to the host (local or remote) for the running Tika server jar.
# os.environ['TIKA_CLIENT_ONLY'] = 'True'  # - if set to True, then TIKA_SERVER_JAR is ignored, and relies on the value for TIKA_SERVER_ENDPOINT and treats Tika like a REST client.
# os.environ['TIKA_TRANSLATOR'] = 'org/apache/tika/language/translate/'  # - set to the fully qualified class name (defaults to Lingo24) for the Tika translator implementation.
# os.environ['TIKA_SERVER_CLASSPATH'] = '/home/richard/.m2/repository/org/apache/tika/tika-server/1.13/tika-server-1.13.jar'  # - set to a string (delimited by ':' for each additional path) to prepend to the Tika server jar path.

tika.initVM()
from tika import parser

# Parse a short French sample and show what comes back.
parsed = parser.from_buffer("comme çi comme ça")
print(parsed["metadata"])
print(parsed["content"])

global Verbose
Verbose = True

# Translate to English with the source language auto-detected.
result = translate.auto_from_buffer("comme çi comme ça", 'en')
print(result)

# Content-type detection.
result = detector.from_buffer("comme çi comme ça")
print(result)

# Translate with an explicit source language (French -> English).
result = translate.from_buffer("comme çi comme ça", 'fr', 'en')
print(result)

# Language identification.
result = language.from_buffer("comme çi comme ça")
print(result)

# Translate every non-empty input line from Russian to English.
for line in lines:
    if line:
        result = translate.from_buffer(line, 'ru', 'en')
        print(result)

print(
    '\n########################### No Errors ####################################'
)