def test_scan_scan_xz():
    """Unit test for the scan method, lzma (xz) compressed file case."""
    expected_rows = util_test_csv(TEST_CSV)
    scanned_rows = FileIO().scan(TEST_CSV_XZ)

    # Rows read through the compressed scanner must match the plain CSV.
    for scanned, expected in zip(scanned_rows, expected_rows):
        assert scanned == expected
def test_scan_scan_bzip2():
    """Unit test for scan method, bzip2 case."""
    # Fix: the docstring previously said "gzip case", but this test
    # scans the bzip2-compressed fixture (TEST_CSV_BZ2).
    file_io = FileIO()

    for method_row, util_row in zip(
            file_io.scan(TEST_CSV_BZ2), util_test_csv(TEST_CSV)):
        assert method_row == util_row
def test_scan_tar_files():
    """Unit test for method scan, compressed tar file case."""
    file_io = FileIO()

    for archive in os.listdir(TEST_TAR_LOCATION):
        archive_path = os.path.join(TEST_TAR_LOCATION, archive)

        # Every member extracted from the archive must exist on disk.
        for extracted in file_io.scan(archive_path):
            assert os.path.exists(extracted)
def test__get_url_unsuccessful():
    """Unit test for _get_url method, unsuccessful case."""
    try:
        FileIO._get_url(TEST_UNKNOWN)
    except IOError:
        # Expected: an unknown location must raise IOError.
        assert True
    else:
        # Fix: the original test silently passed when no exception was
        # raised at all; now the missing IOError is an explicit failure.
        raise AssertionError('IOError was not raised for an unknown URL.')
def test_scan_csv():
    """Unit test for the scan method, plain txt or csv file case."""
    scanned_rows = FileIO().scan(TEST_CSV)
    expected_rows = util_test_csv(TEST_CSV)

    for scanned, expected in zip(scanned_rows, expected_rows):
        assert scanned == expected
def test_scan_unknown_file(self):
    """Unit test for the scan method, unsupported/unknown file case."""
    # Pulling the first item from the generator triggers the type check.
    with self.assertRaises(FileTypeNotSupportedYet):
        next(FileIO().scan(TEST_UNKNOWN))
def test_infer_file_type_from_uri_remote():
    """Unit test for method infer_file_type_from_uri, remote URI case."""
    inferred_type = FileIO().infer_file_type_from_uri(TEST_URL)

    assert inferred_type == 'JPG'
def test_resolve_path_end():
    """Unit test for method resolve_path_end."""
    # Fix: the docstring previously referred to "test_resolve_path_end",
    # which is the test itself, not the method under test.
    expected_return = '/just/a/test/path'

    # Paths with and without a trailing separator normalize identically.
    for candidate in ('/just/a/test/path/', '/just/a/test/path'):
        assert FileIO.resolve_path_end(candidate) == expected_return
def test_infer_file_type_from_uri_no_mimetype():
    """Unit test for method infer_file_type_from_uri, no mimetype case."""
    inferred = FileIO().infer_file_type_from_uri(TEST_LOCAL, mimetype=False)

    assert inferred == 'JPG'
def test_scan_compressed_tar_file_http():
    """Unit test for method scan_compressed_tar_file, http case."""
    test_uri = 'http://localhost:8888/images.tar.xz'
    test_file_reader = 'r|xz'

    # Every file extracted from the remote archive must exist locally.
    for extracted in FileIO().scan_compressed_tar_file(
            test_uri, test_file_reader):
        assert os.path.exists(extracted)
def test_infer_file_type_from_uri_with_mimetype():
    """Unit test for method infer_file_type_from_uri, with mimetype case."""
    file_io = FileIO()

    # The method returns a (type, mimetype) pair when mimetype=True.
    inferred = file_io.infer_file_type_from_uri(TEST_LOCAL, mimetype=True)

    assert inferred[1] == 'image/jpeg'
def test_infer_file_type_from_uri_unsupported(self):
    """Unit test for infer_file_type_from_uri, unsupported file type case."""
    with self.assertRaises(FileTypeNotSupportedYet):
        FileIO().infer_file_type_from_uri(
            TEST_UNSUPPORTED_FILE_TYPE, mimetype=True)
def test_dump():
    """Unit test for method dump."""
    file_io = FileIO()
    target_dir = tempfile.gettempdir()

    file_io.dump(TEST_SCAN_DIR, target_dir)

    # The dump must produce a '<dir-basename>.pupyl' tar archive.
    base_name = os.path.basename(file_io.resolve_path_end(TEST_SCAN_DIR))
    dumped_path = os.path.join(target_dir, '.'.join((base_name, 'pupyl')))

    assert is_tarfile(dumped_path)
def test_progress_not_precise():
    """Unit test for method progress, not precise case."""
    source_range = range(10)
    source_list = [*source_range]

    wrapped_range = FileIO.progress(source_range, precise=False)
    wrapped_list = FileIO.progress(source_list, precise=False)

    # progress() must be a transparent pass-through for generators…
    for original, wrapped in zip(source_range, wrapped_range):
        assert original == wrapped

    # …and for already-materialized sequences alike.
    for original, wrapped in zip(source_list, wrapped_list):
        assert original == wrapped
def test_scan_directory():
    """Unit test for the scan method, directory case."""
    file_io = FileIO()

    # Expected: every file name reported by walk() at the top level,
    # resolved against the scan directory.
    expected_tree = [
        abspath(f'{TEST_SCAN_DIR}{ffile}')
        for ffile in next(iter(walk(TEST_SCAN_DIR)))[-1]
    ]

    actual_tree = list(file_io.scan(abspath(TEST_SCAN_DIR)))

    assert actual_tree == expected_tree
def test_infer_file_type_tar_files():
    """Unit test for method infer_file_type_from_uri, tar file case."""
    file_io = FileIO()

    for ffile in os.listdir(TEST_TAR_LOCATION):
        file_path = os.path.join(TEST_TAR_LOCATION, ffile)

        # guess_type()[1] yields the encoding (bzip2, gzip, xz, ...).
        encoding = mimetypes.guess_type(file_path)[1]

        with_mime = file_io.infer_file_type_from_uri(file_path, mimetype=True)
        without_mime = file_io.infer_file_type_from_uri(
            file_path, mimetype=False)

        assert with_mime == TarCompressedTypes.mime(encoding)
        assert without_mime == TarCompressedTypes.name(encoding)
def test_remove():
    """Unit test for method remove."""
    index_to_remove = 8

    temp_dir = os.path.dirname(FileIO.safe_temp_file(file_name='pupyl.index'))

    # First session: populate the index and remember the pre-removal state.
    with Index(TEST_VECTOR_SIZE, data_dir=temp_dir) as index:
        for _ in range(16):
            index.append(numpy.random.normal(size=TEST_VECTOR_SIZE))

        size_before = len(index)
        removed_value = index[index_to_remove]

    # Second session: remove one item and check size and contents shifted.
    with Index(TEST_VECTOR_SIZE, data_dir=temp_dir) as index:
        index.remove(index_to_remove)

        assert len(index) == size_before - 1

        numpy.testing.assert_raises(
            AssertionError,
            numpy.testing.assert_array_equal,
            removed_value,
            index[index_to_remove]
        )
def __init__(self, data_dir=None, **kwargs):
    """
    Configure the search facade from a saved configuration or kwargs.

    Parameters
    ----------
    data_dir (optional): str
        Location of the data directory; falls back to the pupyl
        temporary data directory when omitted.

    import_images (optional, keyword): bool
        Whether original images should be imported into the database.

    characteristic (optional, keyword): Characteristics
        Feature-extraction characteristic to use.
    """
    if data_dir:
        self._data_dir = data_dir
    else:
        self._data_dir = FileIO.pupyl_temp_data_dir()

    self._index_config_path = os.path.join(self._data_dir, 'index.json')

    configurations = self._index_configuration('r')

    if configurations:
        # A previously saved configuration takes precedence over kwargs.
        self._import_images = configurations['import_images']
        self._characteristic = Characteristics.by_name(
            configurations['characteristic'])

        if configurations.get('feature_size'):
            self._feature_size = configurations['feature_size']
    else:
        # Bug fix: the original code tested truthiness, so an explicit
        # import_images=False was silently coerced to True. Only a
        # missing (or None) value now falls back to the default.
        import_images = kwargs.get('import_images')
        self._import_images = True if import_images is None \
            else import_images

        characteristic = kwargs.get('characteristic')

        if characteristic:
            self._characteristic = characteristic
        else:
            self._characteristic = Characteristics.\
                HEAVYWEIGHT_HUGE_PRECISION

    self.image_database = ImageDatabase(
        import_images=self._import_images,
        data_dir=self._data_dir)
def __init__(self, size, data_dir=None, trees=.001, volatile=False):
    """
    Indexing tensors operations and nearest neighbours search.

    Parameters
    ----------
    size: int
        Shape of unidimensional vectors which will be indexed

    data_dir: str
        Location where to load or save the index

    trees (optional): float
        Defines the number of trees to create based on the dataset size.
        Should be a number between 0 and 1.

    volatile (optional): bool
        If the index will be temporary or not.

    Raises
    ------
    OSError: when data_dir points to an existing file, or when the file
        at the index path cannot be loaded as an index.
    NoDataDirForPermanentIndex: permanent index without a data_dir.
    DataDirDefinedForVolatileIndex: volatile index with a data_dir.
    """
    # -1 so the first __next__-style advance lands on position 0.
    self._position = -1

    self._size = size
    self._data_dir = data_dir
    self._trees = trees
    self._volatile = volatile

    # Resolve the four data_dir x volatile combinations.
    if self._data_dir and not self._volatile:
        # Permanent index inside a caller-supplied directory.
        if os.path.isfile(self._data_dir):
            raise OSError('data_dir parameter is not a directory')

        os.makedirs(self._data_dir, exist_ok=True)
        self._path = os.path.join(self._data_dir, self.index_name)
    elif not self._data_dir and not self._volatile:
        # A permanent index must know where to live.
        raise NoDataDirForPermanentIndex
    elif not self._data_dir and self._volatile:
        # Volatile index: derive the data dir from a fresh temp file.
        _temp_file = FileIO.safe_temp_file()
        self._data_dir = os.path.dirname(_temp_file)
        self._path = _temp_file
    else:
        raise DataDirDefinedForVolatileIndex

    if os.path.isfile(self._path):
        # An index file already exists at the path: load it.
        try:
            self.tree = AnnoyIndex(size, metric='angular')
            self.tree.load(self._path)
            self._is_new_index = False
        except OSError as os_error:
            # Annoy raises OSError for files that are not valid indexes.
            raise FileIsNotAnIndex from os_error
    else:
        # No file yet: start a brand-new, empty index.
        self.tree = AnnoyIndex(size, metric='angular')
        self._is_new_index = True

    self._image_database = ImageDatabase(
        import_images=True,
        data_dir=self._data_dir,
    )
def test_safe_temp_file_exists():
    """Unit test for method safe_temp_file, file exists case."""
    test_temp_file_name = 'just_a_temp_file.txt'
    test_temp_file_path = join(tempfile.gettempdir(), test_temp_file_name)

    # Pre-create the file so safe_temp_file must clear it out.
    Path(test_temp_file_path).touch()

    _ = FileIO.safe_temp_file(file_name=test_temp_file_name)

    # Bug fix: the original assert checked the bare file name (a path
    # relative to the CWD, which trivially never exists), not the file
    # that was actually touched inside the temp directory.
    assert not exists(test_temp_file_path)
def test_scan_compressed_tar_file_local():
    """Unit test for method scan_compressed_tar_file, local case."""
    readers_by_type = {
        'TZ2': 'r:bz2',
        'TGZ': 'r:gz',
        'TXZ': 'r:xz'
    }

    file_io = FileIO()

    for ffile in os.listdir(TEST_TAR_LOCATION):
        file_path = os.path.join(TEST_TAR_LOCATION, ffile)

        # Map the guessed encoding to the matching tarfile reader mode.
        encoding = mimetypes.guess_type(file_path)[1]
        reader = readers_by_type[TarCompressedTypes.name(encoding)]

        for extracted in file_io.scan_compressed_tar_file(file_path, reader):
            assert os.path.exists(extracted)
def test_get_metadata_local():
    """Unit test for method get_metadata, local file case."""
    expected_metadata = {
        'original_file_name': 'test_image.jpg',
        'original_path': abspath('tests'),
        'original_file_size': '5K'
    }

    actual_metadata = FileIO.get_metadata(TEST_LOCAL)

    # Access time varies between runs, so drop it before comparing.
    del actual_metadata['original_access_time']

    assert actual_metadata == expected_metadata
def test_progress_precise():
    """Unit test for method progress, precise case."""
    # Fix: the docstring previously said "not precise case", but every
    # call below uses precise=True.
    def test_gen():
        """Closure to test functions which returns generators."""
        for value in range(10):
            yield value

    test_generator = range(10)
    test_unpacked = [*test_generator]

    test_result_generator = FileIO.progress(test_generator, precise=True)
    test_result_unpacked = FileIO.progress(test_unpacked, precise=True)
    test_result_func_gen = FileIO.progress(test_gen(), precise=True)

    for t_gen, r_gen in zip(test_generator, test_result_generator):
        assert t_gen == r_gen

    for t_unp, r_unp in zip(test_unpacked, test_result_unpacked):
        assert t_unp == r_unp

    for t_fgen, r_fgen in zip(test_generator, test_result_func_gen):
        assert t_fgen == r_fgen
def test_get_metadata_http_no_date():
    """Unit test for method get_metadata, http and no date case."""
    test_metadata = {
        'original_file_name': 'axuvb8oxm7liskynxggfczfus.jpg',
        # Plain literal replaces the former multi-line string plus
        # .replace() whitespace stripping, which silently depended on
        # the source file's exact indentation.
        'original_path': 'http://images.protopage.com/view/572714'
    }

    test_request_metadata = FileIO.get_metadata(TEST_URL_NO_DATE)

    # Access time and size are unstable for this endpoint; ignore them.
    del test_request_metadata['original_access_time']
    del test_request_metadata['original_file_size']

    assert test_metadata == test_request_metadata
def test_bind():
    """Unit test for method bind."""
    file_io = FileIO()
    temp_dir = tempfile.gettempdir()

    # First create a dump, then bind (extract) it back.
    file_io.dump(TEST_SCAN_DIR, temp_dir)

    base_name = os.path.basename(file_io.resolve_path_end(TEST_SCAN_DIR))
    dumped_path = os.path.join(temp_dir, '.'.join((base_name, 'pupyl')))
    extraction_path = os.path.join(temp_dir, TEST_SCAN_DIR)

    file_io.bind(dumped_path, extraction_path)

    assert os.path.isdir(extraction_path)
def test_get_metadata_http():
    """Unit test for method get_metadata, http case."""
    test_metadata = {
        'original_file_name': '320px-Cheshm-Nazar.JPG',
        # Plain literal replaces the former multi-line string plus
        # .replace() whitespace stripping, which silently depended on
        # the source file's exact indentation.
        'original_path':
            'https://upload.wikimedia.org/wikipedia/commons/'
            'thumb/e/e4/Cheshm-Nazar.JPG',
        'original_file_size': '9K'
    }

    test_request_metadata = FileIO.get_metadata(TEST_URL)

    # Access time changes on every request; drop it before comparing.
    del test_request_metadata['original_access_time']

    assert test_metadata == test_request_metadata
def test_pop():
    """Unit test for method pop."""
    temp_dir = os.path.dirname(FileIO.safe_temp_file(file_name='pupyl.index'))

    # First session: fill the index and remember its last element.
    with Index(TEST_VECTOR_SIZE, data_dir=temp_dir) as index:
        for _ in range(16):
            index.append(numpy.random.normal(size=TEST_VECTOR_SIZE))

        size_before = len(index)
        last_value = index[-1]

    # Second session: pop must return that element and shrink the index.
    with Index(TEST_VECTOR_SIZE, data_dir=temp_dir) as index:
        popped_value = index.pop()

        assert len(index) == size_before - 1

        numpy.testing.assert_array_equal(
            last_value,
            popped_value
        )
def export_by_group_by(self, path, top=10, **kwargs):
    """
    Saves images, creating directories, based on their groups.

    Parameters
    ----------
    path: str
        Place to create the directories and export images

    top (optional, default 10): int
        How many similar internal images should be returned

    position (optional): int
        Returns the groups based on a specified position.
    """
    for element in FileIO.progress(
            self.group_by(
                top=top,
                position=kwargs.get('position')
            )
    ):
        # group_by yields either {item: similars} dicts (all groups)
        # or a bare list of similars (single requested position).
        if isinstance(element, dict):
            item = [*element.keys()][0]
            similars = element[item]
        elif isinstance(element, list):
            item = kwargs['position']
            similars = element

        # One output directory per group, named after the group item.
        save_path = os.path.join(
            path,
            str(item)
        )

        os.makedirs(
            save_path,
            exist_ok=True
        )

        try:
            # The group's own image is saved as 'group.jpg'.
            copyfile(
                self._image_database.mount_file_name(
                    item,
                    'jpg'
                ),
                os.path.join(
                    save_path,
                    'group.jpg'
                )
            )
        except FileNotFoundError:
            # Missing source image: skip this whole group.
            continue

        for rank, similar in enumerate(similars):
            original_file_path = self._image_database.mount_file_name(
                similar,
                'jpg'
            )

            try:
                # Similar images are ranked 1.jpg, 2.jpg, ...
                copyfile(
                    original_file_path,
                    os.path.join(
                        save_path,
                        f'{rank + 1}.jpg'
                    )
                )
            except FileNotFoundError:
                # Missing similar image: skip it but keep exporting.
                continue
def serve(data_dir=None, port=8080):
    """
    Start the web server.

    Parameters
    ----------
    data_dir (optional): str
        Location of the pupyl assets. Falls back to the pupyl
        temporary data directory when omitted.

    port (optional)(default: 8080): int
        Defines the network port which the web server will start
        listening.
    """
    if not data_dir:
        data_dir = FileIO.pupyl_temp_data_dir()

    pupyl_image_search = PupylImageSearch(data_dir)

    class RequestHandler(SimpleHTTPRequestHandler):
        """A web request handler."""
        _data_dir = data_dir

        def __init__(self, request, client_address, server):
            SimpleHTTPRequestHandler.__init__(
                self, request, client_address, server)

        def do_GET(self):
            """Handler for GET request methods."""
            query_image = None

            self.send_response(200)
            self.send_header('Content-type', 'text/html')
            self.end_headers()

            # Pull the optional 'uri' query parameter from the URL.
            query_string = parse_qs(urlparse(self.path).query)
            query_list = query_string.get('uri', None)

            image_tags = self.images(query_list)

            if query_list:
                # Echo the query image above the results grid.
                query_image = '<img class="img-thumbnail" ' + \
                    f'src="{query_list[0]}">' + \
                    '<figcaption class="figure-caption">' + \
                    'Query image used in the search.</figcaption>'

            self.wfile.write(
                bytes(
                    TEMPLATE.format(
                        images=image_tags,
                        query=query_image if query_image else ''),
                    'utf-8'))

        @staticmethod
        def filter_metadata(index):
            """
            Return a filtered metadata information.

            Parameters
            ----------
            index: int
                Index number of image

            Returns
            -------
            str:
                Comma-separated pre-filtered metadata values.
            """
            metadata = pupyl_image_search.image_database.\
                load_image_metadata(
                    index,
                    filtered=(
                        'original_file_name',
                        'original_file_size'
                    )
                )

            return ', '.join(map(str, metadata.values()))

        def images(self, query_uri=None, top=None):
            """
            Return image tags from database.

            Parameters
            ----------
            query_uri (optional): str
                Location where the query image is stored.

            top (optional)(default: 24): int
                How many results should be returned from some
                search request.
            """
            image_tags = ''
            img_src = '<figure class="figure">' + \
                '<img class="img-fluid border"' + \
                'src="data:image/jpg;base64, {image_b64}" ' + \
                'alt="🧿 Pupyl"><figcaption class="figure-caption">' + \
                '{figure_caption}</figcaption></figure>'

            top = top if top else 24

            if query_uri:
                # A query was supplied: render its search results.
                query_uri = query_uri[0]

                for result in pupyl_image_search.search(
                        query_uri, top=top):
                    result = int(result)

                    image = pupyl_image_search.image_database.\
                        get_image_bytes_to_base64(
                            pupyl_image_search.image_database.
                            load_image(result)
                        ).decode('utf-8')

                    filtered_metadata = self.filter_metadata(result)

                    image_tags += img_src.format(
                        image_b64=image,
                        figure_caption=filtered_metadata)

                return image_tags

            # No query: show a small default sample from the database.
            for index, image in pupyl_image_search.image_database.\
                    list_images(return_index=True, top=9):
                image_base64 = pupyl_image_search.image_database.\
                    get_image_base64(
                        image
                    ).decode('utf-8')

                filtered_metadata = self.filter_metadata(index)

                image_tags += img_src.format(
                    image_b64=image_base64,
                    figure_caption=filtered_metadata)

            return image_tags

    if not port:
        port = 8080

    try:
        with socketserver.TCPServer(('', port), RequestHandler) as httpd:
            print(
                termcolor.colored(
                    f'Server listening on port {port}.',
                    color='green',
                    attrs=['bold']))

            webbrowser.open_new_tab(f'http://localhost:{port}')

            httpd.serve_forever()
    except OSError:
        # Port already bound: recursively retry on the next port.
        print(
            termcolor.colored(
                f'Port {port} already in use. Trying {port + 1}...',
                color='red',
                attrs=['bold']))

        serve(data_dir=data_dir, port=port + 1)
    except KeyboardInterrupt:
        print('🧿 Pupyl says bye.')
def test_safe_temp_file():
    """Unit test for method safe_temp_file."""
    generated_name = FileIO.safe_temp_file()

    # A safe temp file name must not point at an existing file.
    assert not exists(generated_name)