def upload_file(self, filename):
    """Create a new entry from a given local file. Will make a copy of the
    given file.

    Raises ValueError if the given file does not exist.

    Parameters
    ----------
    filename: string
        Path to file on disk

    Returns
    -------
    vizier.filestore.base.FileHandle
    """
    # Ensure that the given file exists
    if not os.path.isfile(filename):
        raise ValueError('invalid file path \'' + str(filename) + '\'')
    # Fix: original had a duplicated assignment (name = name = ...)
    name = os.path.basename(filename)
    # Create a new unique identifier for the file and a directory to hold it
    identifier = get_unique_identifier()
    file_dir = self.get_file_dir(identifier, create=True)
    output_file = os.path.join(file_dir, DATA_FILENAME)
    # Copy the uploaded file into the filestore
    shutil.copyfile(filename, output_file)
    # Add file to file index
    f_handle = FileHandle(
        identifier,
        filepath=output_file,
        file_name=name
    )
    # Write metadata file so the handle can be reconstructed later
    write_metadata_file(file_dir, f_handle)
    return f_handle
def upload_stream(self, file, file_name):
    """Create a new entry from a given file stream. Will copy the given
    file to a file in the base directory.

    Parameters
    ----------
    file: werkzeug.datastructures.FileStorage
        File object (e.g., uploaded via HTTP request)
    file_name: string
        Name of the file

    Returns
    -------
    vizier.filestore.base.FileHandle
    """
    # Allocate a fresh identifier and a target directory for the upload
    identifier = get_unique_identifier()
    target_dir = self.get_file_dir(identifier, create=True)
    target_path = os.path.join(target_dir, DATA_FILENAME)
    # Persist the stream contents at the target path
    file.save(target_path)
    handle = FileHandle(
        identifier,
        filepath=target_path,
        file_name=file_name
    )
    # Record metadata alongside the data file
    write_metadata_file(target_dir, handle)
    return handle
def download_file(self, url, username=None, password=None):
    """Create a local copy of the identified web resource.

    Parameters
    ----------
    url : string
        Unique resource identifier for external resource that is accessed
    username: string, optional
        Optional user name for authentication
    password: string, optional
        Optional password for authentication

    Returns
    -------
    vizier.filestore.base.FileHandle
    """
    # Get unique identifier and output file
    identifier = get_unique_identifier()
    file_dir = self.get_file_dir(identifier, create=True)
    output_file = os.path.join(file_dir, DATA_FILENAME)
    # Write web resource to output file.
    response = urllib.request.urlopen(url)
    filename = get_download_filename(url, response.info())
    # Fix: response.read() returns bytes, so the file must always be opened
    # in binary mode. The original opened in text mode ('w') unless the name
    # ended in '.gz', which raises TypeError on Python 3.
    with open(output_file, 'wb') as f:
        f.write(response.read())
    # Add file to file index
    f_handle = FileHandle(
        identifier,
        filepath=output_file,
        file_name=filename
    )
    # Write metadata file
    write_metadata_file(file_dir, f_handle)
    return f_handle
def download_dataset(self, url, username=None, password=None, filestore=None):
    """Create a new dataset from a given file. Returns the handle for the
    downloaded file only if the filestore has been provided as an argument
    in which case the file handle is a meaningful file handle.

    Raises ValueError if the given file could not be loaded as a dataset.

    Parameters
    ----------
    url : string
        Unique resource identifier for external resource that is accessed
    username: string, optional
        Optional user name for authentication
    password: string, optional
        Optional password for authentication
    filestore: vizier.filestore.base.Filestore, optional
        Optional filestore to save a local copy of the downloaded resource

    Returns
    -------
    vizier.datastore.fs.dataset.FileSystemDatasetHandle,
    vizier.filestore.base.FileHandle
    """
    if filestore is not None:
        # Upload the file to the filestore to get the file handle
        fh = filestore.download_file(
            url=url,
            username=username,
            password=password
        )
        # Since the filestore was given we return a tuple of dataset
        # descriptor and file handle
        return self.load_dataset(fh), fh
    # Manually download the file temporarily
    temp_dir = tempfile.mkdtemp()
    try:
        response = urllib.request.urlopen(url)
        filename = get_download_filename(url, response.info())
        download_file = os.path.join(temp_dir, filename)
        # Fix: response.read() returns bytes; always write in binary mode.
        # The original used text mode ('w') unless the name ended in '.gz',
        # which raises TypeError on Python 3.
        with open(download_file, 'wb') as f:
            f.write(response.read())
        fh = FileHandle(
            identifier=filename,
            filepath=download_file,
            file_name=filename
        )
        # Return only the dataset descriptor
        return self.load_dataset(fh)
    finally:
        # Clean up the temporary download on both success and failure;
        # replaces the original's duplicated rmtree/raise-ex logic and
        # preserves the traceback of any raised exception.
        shutil.rmtree(temp_dir, ignore_errors=True)
def load_dataset(
        self, f_handle: FileHandle,
        proposed_schema: List[Tuple[str, str]] = []) -> FileSystemDatasetHandle:
    """Create a new dataset from a given file.

    Raises ValueError if the given file could not be loaded as a dataset.

    Parameters
    ----------
    f_handle : vizier.filestore.base.FileHandle
        Handle for an uploaded file

    Returns
    -------
    vizier.datastore.fs.dataset.FileSystemDatasetHandle
    """
    # The file handle might be None in which case an exception is raised
    if f_handle is None:
        raise ValueError('unknown file')
    # Expects a file in a supported tabular data format.
    if not f_handle.is_tabular:
        raise ValueError('cannot create dataset from file \'' + f_handle.name + '\'')
    # Open the file as a csv file. The first row is expected to hold the
    # column names; remaining rows become dataset rows.
    columns: List[DatasetColumn] = []
    rows: List[DatasetRow] = []
    with f_handle.open() as csvfile:
        reader = csv.reader(csvfile, delimiter=f_handle.delimiter)
        header = next(reader)
        columns = [
            DatasetColumn(identifier=pos, name=label.strip())
            for pos, label in enumerate(header)
        ]
        for record in reader:
            rows.append(
                DatasetRow(
                    identifier=str(len(rows)),
                    values=[cast(v.strip()) for v in record]
                )
            )
    # Get unique identifier and create subfolder for the new dataset
    identifier = get_unique_identifier()
    dataset_dir = self.get_dataset_dir(identifier)
    os.makedirs(dataset_dir)
    # Write rows to data file
    data_file = os.path.join(dataset_dir, DATA_FILE)
    DefaultJsonDatasetReader(data_file).write(rows)
    # Create dataset handle and write its descriptor to file
    dataset = FileSystemDatasetHandle(
        identifier=identifier,
        columns=columns,
        data_file=data_file,
        row_count=len(rows),
        max_row_id=len(rows) - 1
    )
    dataset.to_file(
        descriptor_file=os.path.join(dataset_dir, DESCRIPTOR_FILE)
    )
    return dataset
def unload_dataset(self, filepath, dataset_name, format='csv', options=None, filename=""):
    """Export a dataset from a given name. One file handle is created per
    file produced by the export.

    Raises ValueError if the given dataset could not be exported.

    Parameters
    ----------
    dataset_name: string
        Name of the dataset to unload
    format: string
        Format for output (csv, json, ect.)
    options: list, optional
        Options for data unload (default empty)
    filename: string
        The output filename - may be empty if outputting to a database

    Returns
    -------
    list(vizier.filestore.base.FileHandle)
    """
    # Fix: avoid a mutable default argument; the effective default is still
    # an empty option list.
    if options is None:
        options = []
    name = os.path.basename(filepath).lower()
    basepath = filepath.replace(name, "")
    # Export the dataset via Mimir; returns the names of the created files.
    abspath = os.path.abspath((r'%s' % filepath))
    exported_files = mimir.unloadDataSource(dataset_name, abspath, format, options)
    file_handles = []
    for output_file in exported_files:
        name = os.path.basename(output_file).lower()
        # Create a new unique identifier and directory for each exported file
        identifier = get_unique_identifier()
        file_dir = os.path.join(basepath, identifier)
        if not os.path.isdir(file_dir):
            os.makedirs(file_dir)
        fs_output_file = os.path.join(file_dir, DATA_FILENAME)
        # Move the exported file into the filestore layout
        shutil.move(os.path.join(filepath, output_file), fs_output_file)
        f_handle = FileHandle(identifier, output_file, name)
        file_handles.append(f_handle)
        write_metadata_file(file_dir, f_handle)
    return file_handles
def list_files(self):
    """Get list of file handles for all uploaded files.

    Returns
    -------
    list(vizier.filestore.base.FileHandle)
    """
    handles = []
    # Each subdirectory of the base path holds one uploaded file plus its
    # metadata; the directory name is the file identifier.
    for entry in os.listdir(self.base_path):
        entry_dir = os.path.join(self.base_path, entry)
        if not os.path.isdir(entry_dir):
            continue
        file_name, mimetype, encoding = read_metadata_file(entry_dir)
        handles.append(
            FileHandle(
                entry,
                filepath=os.path.join(entry_dir, DATA_FILENAME),
                file_name=file_name,
                mimetype=mimetype,
                encoding=encoding
            )
        )
    return handles
def get_file(self, identifier):
    """Get handle for file with given identifier. Returns None if no file
    with given identifier exists.

    Parameters
    ----------
    identifier: string
        Unique file identifier

    Returns
    -------
    vizier.filestore.base.FileHandle
    """
    file_dir = self.get_file_dir(identifier)
    # Guard clause: an unknown identifier has no directory on disk
    if not os.path.isdir(file_dir):
        return None
    file_name, mimetype, encoding = read_metadata_file(file_dir)
    return FileHandle(
        identifier,
        filepath=os.path.join(file_dir, DATA_FILENAME),
        file_name=file_name,
        mimetype=mimetype,
        encoding=encoding
    )
from vizier.datastore.base import METADATA_FILE from vizier.datastore.dataset import DatasetColumn, DatasetRow from vizier.datastore.fs.base import FileSystemDatastore from vizier.datastore.fs.base import DATA_FILE, DESCRIPTOR_FILE from vizier.datastore.fs.base import validate_dataset from vizier.filestore.fs.base import FileSystemFilestore from vizier.filestore.base import FileHandle, FORMAT_TSV BASE_DIR = './.tmp' STORE_DIR = './.tmp/ds' FSSTORE_DIR = './.tmp/fs' FILE = FileHandle( identifier='0000', filepath='./tests/test_data/r.csv', file_name='r.csv' ) # Note that some tests access an external resource to test download capabilities. # The test will fail if the specified resource is not available. Set the # DOWNLOAD_URL to an available resource or to None to skip the download tests DOWNLOAD_URL = 'https://github.com/UBOdin/mimir-api/raw/master/test_data/r.csv' EXAMPLE_PROPERTIES = { 'columns': [ { 'name': 'A', 'structural_type': 'http://schema.org/Integer', 'semantic_types': [], 'unclean_values_ratio': 0.0,
from vizier.datastore.annotation.dataset import DatasetMetadata from vizier.datastore.base import METADATA_FILE from vizier.datastore.dataset import DatasetColumn, DatasetRow from vizier.datastore.fs.base import FileSystemDatastore from vizier.datastore.fs.base import DATA_FILE, DESCRIPTOR_FILE from vizier.datastore.fs.base import validate_dataset from vizier.filestore.fs.base import FileSystemFilestore from vizier.filestore.base import FileHandle, FORMAT_TSV BASE_DIR = './.tmp' STORE_DIR = './.tmp/ds' FSSTORE_DIR = './.tmp/fs' FILE = FileHandle( identifier='0000', filepath='./.files/w49k-mmkh.tsv', file_name='w49k-mmkh.tsv' ) # Note that some tests access an external resource to test download capabilities. # The test will fail if the specified resource is not available. Set the # DOWNLOAD_URL to an available resource or to None to skip the download tests DOWNLOAD_URL = 'http://cds-swg1.cims.nyu.edu:8080/opendb-api/api/v1/datasets/w49k-mmkh/rows/download' class TestFileSystemDatastore(unittest.TestCase): def setUp(self): """Create an empty datastore directory.""" # Delete datastore directory if it exists if os.path.isdir(BASE_DIR): shutil.rmtree(BASE_DIR)