def create_table_dataset_version(tmpdir, mock_s3):
    from taiga2.conv import csv_to_columnar

    tmpsrc = str(tmpdir.join("thing.csv"))
    tmpdst = str(tmpdir.join("thing.columnar"))
    with open(tmpsrc, "wt") as fd:
        fd.write("a,b\n1,2\n")
    csv_to_columnar(StubProgress(), tmpsrc, tmpdst)

    # put data into mock S3
    mock_s3.Object("bucket", "key").upload_file(tmpdst)

    # create datafile
    df = mc.add_s3_datafile(
        name="dfname",
        s3_bucket="bucket",
        s3_key="key",
        compressed_s3_key="compressed/key",
        type=models.S3DataFile.DataFileFormat.Columnar,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )

    ds = mc.add_dataset(
        name="dataset name",
        description="dataset description",
        datafiles_ids=[df.id],
    )

    return str(ds.dataset_versions[0].id)
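# A minimal sketch of how create_table_dataset_version might be exercised. The
# test name is hypothetical; it assumes the pytest fixtures `session`, `tmpdir`
# and `mock_s3` used above are available, and that DatasetVersion ids can be
# compared as strings (the helper returns str(...)), mirroring the query style
# of test_add_dataset below.
def test_create_table_dataset_version_sketch(session: SessionBase, tmpdir, mock_s3):
    version_id = create_table_dataset_version(tmpdir, mock_s3)

    # Look the version up again through the session to confirm it was persisted
    version = (
        session.query(DatasetVersion).filter(DatasetVersion.id == version_id).one()
    )
    assert len(version.datafiles) == 1
    assert version.datafiles[0].name == "dfname"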
def test_add_dataset_version(session: SessionBase):
    new_dataset_version_name = "New Dataset Version"
    new_dataset_name = "New Dataset for test_add_dataset_version"
    new_dataset_description = "New description for test_add_dataset_version"

    _new_datafile = mc.add_s3_datafile(
        name="Datafile for test_add_dataset_version",
        s3_bucket="broadtaiga2prototype",
        s3_key=mc.generate_convert_key(),
        compressed_s3_key=mc.generate_compressed_key(),
        type=mc.S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )
    _new_dataset = mc.add_dataset(
        name=new_dataset_version_name,
        description=new_dataset_description,
        datafiles_ids=[_new_datafile.id],
    )

    _new_dataset_version = _new_dataset.dataset_versions[0]
    _new_dataset_version.datafiles.append(_new_datafile)
    db.session.add(_new_dataset_version)
    db.session.commit()

    assert _new_dataset_version.name == new_dataset_version_name
    assert _new_dataset_version.creator == flask.g.current_user
    assert _new_dataset_version.dataset == _new_dataset
    assert _new_dataset_version.state == DatasetVersion.DatasetVersionState.approved
def test_add_dataset(session: SessionBase, new_datafile):
    new_dataset_name = "New Dataset"
    new_dataset_permaname = generate_permaname(new_dataset_name)

    _new_dataset = mc.add_dataset(
        name=new_dataset_name,
        permaname=new_dataset_permaname,
        description="New dataset description",
        datafiles_ids=[new_datafile.id],
    )

    added_dataset_by_id = (
        session.query(Dataset).filter(Dataset.id == _new_dataset.id).one()
    )
    added_dataset_by_permaname = (
        session.query(Dataset)
        .filter(Dataset.permanames.any(DatasetPermaname.permaname == _new_dataset.permaname))
        .one()
    )

    # Ensure the object we put in the database is the same as the one
    # we get by id
    assert added_dataset_by_id == _new_dataset
    assert added_dataset_by_id.name == new_dataset_name
    assert added_dataset_by_id.permaname == new_dataset_permaname

    # Also ensure that we have a unique permaname and it is the right one
    assert added_dataset_by_permaname == _new_dataset
def new_dataset(new_datafile):
    new_dataset_name = "New Dataset"
    new_dataset_permaname = generate_permaname(new_dataset_name)

    _new_dataset = mc.add_dataset(
        name=new_dataset_name,
        permaname=new_dataset_permaname,
        description="New dataset description",
        datafiles_ids=[new_datafile.id],
    )

    return _new_dataset
def test_basic_create_virtual_dataset(session: SessionBase):
    # create mock data of a single dataset and a virtual dataset which references
    # the files but with a different name
    _new_datafile = mc.add_s3_datafile(
        name="underlying-datafile",
        s3_bucket="broadtaiga2prototype",
        s3_key=mc.generate_convert_key(),
        compressed_s3_key=mc.generate_compressed_key(),
        type=mc.S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )
    mc.add_dataset(
        name="underlying-dataset", description="", datafiles_ids=[_new_datafile.id]
    )

    virtual_datafile = mc.add_virtual_datafile(name="alias", datafile_id=_new_datafile.id)
    virtual_dataset = mc.add_dataset(
        name="virtual-dataset", description="desc", datafiles_ids=[virtual_datafile.id]
    )

    # make sure the subsequent queries can find new objects
    session.flush()

    assert virtual_dataset.id is not None
    v = mc.get_dataset(virtual_dataset.id)
    assert v.name == "virtual-dataset"
    assert len(v.dataset_versions) == 1
    version = v.dataset_versions[0]
    assert len(version.datafiles)
    entry = version.datafiles[0]
    assert entry.name == "alias"
    assert entry.underlying_data_file.id == _new_datafile.id
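# A follow-on sketch, assuming the same underlying S3 datafile can be aliased by
# add_virtual_datafile more than once: two virtual datasets then expose the same
# bytes under different names. The helper name and the dataset/alias names below
# are illustrative only.
def _sketch_two_aliases_for_one_datafile(underlying_datafile_id):
    alias_a = mc.add_virtual_datafile(name="alias-a", datafile_id=underlying_datafile_id)
    alias_b = mc.add_virtual_datafile(name="alias-b", datafile_id=underlying_datafile_id)

    dataset_a = mc.add_dataset(name="virtual-a", description="", datafiles_ids=[alias_a.id])
    dataset_b = mc.add_dataset(name="virtual-b", description="", datafiles_ids=[alias_b.id])
    return dataset_a, dataset_b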
def new_dataset(new_datafile):
    # TODO: These tests should be using the endpoint and not the model
    new_dataset_name = "New Dataset"
    new_dataset_permaname = generate_permaname(new_dataset_name)

    _new_dataset = models_controller.add_dataset(
        name=new_dataset_name,
        permaname=new_dataset_permaname,
        description="New dataset description",
        datafiles_ids=[new_datafile.id],
    )

    return _new_dataset
def new_dataset_version(new_datafile):
    # TODO: Mention in the name that this is an empty dataset_version
    new_dataset_name = "New Dataset for new_dataset_version"
    new_dataset_description = "New description for new_dataset_version"

    _new_dataset = mc.add_dataset(
        name=new_dataset_name,
        description=new_dataset_description,
        datafiles_ids=[new_datafile.id],
    )
    _new_dataset_version = _new_dataset.dataset_versions[0]

    return _new_dataset_version
def _create_dataset_with_a_virtual_file(
    files, folder_id, name="virtual", description="description"
) -> Dataset:
    datafiles = []
    for file in files:
        datafile = DataFile.query.get(file[1])
        assert datafile is not None
        datafiles.append(models_controller.add_virtual_datafile(file[0], datafile.id))

    dataset = models_controller.add_dataset(
        name=name, description=description, datafiles_ids=[x.id for x in datafiles]
    )
    models_controller.copy_to_folder([dataset.id], folder_id)
    return dataset
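# Hedged usage sketch for _create_dataset_with_a_virtual_file: `files` is a list
# of (alias_name, datafile_id) pairs. The helper name and arguments below are
# placeholders; `source_dataset` could come from _create_dataset_with_a_file and
# `target_folder` from any folder fixture.
def _sketch_virtual_dataset_in_folder(source_dataset, target_folder):
    source_datafile = source_dataset.dataset_versions[0].datafiles[0]
    return _create_dataset_with_a_virtual_file(
        files=[("renamed-copy", source_datafile.id)],
        folder_id=target_folder.id,
        name="virtual-copy",
    )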
def new_dataset_in_new_folder_in_home(
    session: SessionBase, new_folder_in_home, new_datafile
):
    new_dataset_name = "New Dataset in a folder"
    new_dataset_permaname = generate_permaname(new_dataset_name)

    _new_dataset = models_controller.add_dataset(
        name=new_dataset_name,
        permaname=new_dataset_permaname,
        description="New dataset description",
        datafiles_ids=[new_datafile.id],
    )
    models_controller.move_to_folder([_new_dataset.id], None, new_folder_in_home.id)

    return _new_dataset
def create_simple_dataset():
    # create datafile
    df = mc.add_s3_datafile(
        name="df",
        s3_bucket="bucket",
        s3_key="converted/key",
        compressed_s3_key="compressed/key",
        type=models.S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )

    ds = mc.add_dataset(
        name="dataset name",
        description="dataset description",
        datafiles_ids=[df.id],
    )

    return ds.permaname, ds.dataset_versions[0].id, "df"
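# A minimal sketch of how the (permaname, version_id, datafile_name) triple from
# create_simple_dataset might be consumed. The test name is hypothetical, and it
# assumes `mc` exposes get_dataset_from_permaname (the same call appears on
# models_controller in the import script in this module) and that ids compare
# cleanly as strings.
def test_create_simple_dataset_sketch():
    permaname, version_id, datafile_name = create_simple_dataset()

    dataset = mc.get_dataset_from_permaname(permaname)
    latest_version = mc.get_latest_dataset_version(dataset.id)

    assert str(latest_version.id) == str(version_id)
    assert latest_version.datafiles[0].name == datafile_name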
def _create_dataset_with_a_file(name="datafile") -> Dataset:
    _new_datafile = models_controller.add_s3_datafile(
        name=name,
        s3_bucket="broadtaiga2prototype",
        s3_key=models_controller.generate_convert_key(),
        compressed_s3_key=models_controller.generate_compressed_key(),
        type=models_controller.S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )

    dataset = models_controller.add_dataset(
        name="dataset", description="", datafiles_ids=[_new_datafile.id]
    )
    return dataset
def test_add_dataset_with_datafile(session: SessionBase, new_datafile):
    new_dataset_name = "New dataset with datasetVersion"
    new_dataset_description = "New dataset with datasetVersion"
    datafiles_ids = [new_datafile.id]

    new_dataset = mc.add_dataset(
        name=new_dataset_name,
        description=new_dataset_description,
        datafiles_ids=datafiles_ids,
    )
    latest_dataset_version = mc.get_latest_dataset_version(new_dataset.id)

    assert new_dataset.name == new_dataset_name
    # The description is carried by the dataset version, not the dataset itself
    assert new_dataset.description is None
    assert latest_dataset_version.description == new_dataset_description
    assert len(new_dataset.dataset_versions) == 1
    assert len(new_dataset.dataset_versions[0].datafiles) == 1
    assert new_dataset.dataset_versions[0].datafiles[0] == new_datafile
def populate_db(dataset_csv_path, dataset_version_with_datafile_csv_path):
    # TODO: We should handle the Public folder properly, instead of adding it to Philip's account
    # Summary
    nb_user_created = 0
    nb_user_skipped = 0

    nb_dataset_created = 0
    nb_row_dataset_skipped = 0

    nb_datafile_created = 0
    nb_datafile_skipped = 0
    nb_row_datafile_skipped = 0

    nb_dataset_version_created = 0
    nb_dataset_version_skipped = 0

    # Dictionary linking each dataset permaname to its datafiles, so we can later
    # create the dataset versions
    # Dict<String, Array<int>>
    dict_permaname_datafile_ids = {}

    # We first manage the dataset creation
    with open(dataset_csv_path) as dataset_file:
        print("Creating the users and the datasets")
        reader = csv.DictReader(dataset_file)
        for row in reader:
            is_public = False

            if not row["permaname"]:
                print("Warning: We found an empty permaname entry: {}. Skipping it.".format(row))
                nb_row_dataset_skipped += 1
                continue

            dataset_name = row["name"]
            dataset_permaname = row["permaname"]
            dataset_description = row["description"]

            if row["folder"].startswith("home"):
                dataset_folder_user = row["folder"]
                # To get the user from dataset_folder_user, we extract the user from the parentheses
                dataset_user_email = dataset_folder_user[
                    dataset_folder_user.find("(") + 1 : dataset_folder_user.find(")")
                ]

                # Handle the case where user email is None
                if dataset_user_email == "None":
                    print("Warning: We found a row with folder {}. Skipping it.".format(row["folder"]))
                    nb_user_skipped += 1
                    continue

                # To get the target folder, we take the string before the parenthesis
                dataset_folder_name = dataset_folder_user.split("(")[0]
            else:
                # For now, we store all the others into [email protected]
                is_public = True
                dataset_folder_name = row["folder"]
                dataset_user_email = "*****@*****.**"

            # Setting up the user
            try:
                dataset_current_user = models_controller.get_user_by_email(dataset_user_email)
            except NoResultFound:
                # User does not exist yet, so we create it
                dataset_user_name = dataset_user_email[: dataset_user_email.find("@")]
                dataset_current_user = models_controller.add_user(
                    name=dataset_user_name, email=dataset_user_email
                )
                print("User with email: {} created".format(dataset_user_email))
                nb_user_created += 1

            flask.g.current_user = dataset_current_user

            # TODO: We should not create the dataset if it already exists
            new_dataset = models_controller.add_dataset(
                name=dataset_name,
                permaname=dataset_permaname,
                description=dataset_description,
            )

            try:
                # TODO: Check it is case insensitive
                if str.lower(dataset_folder_name) == "home":
                    dataset_folder = dataset_current_user.home_folder
                elif str.lower(dataset_folder_name) == "trash":
                    dataset_folder = dataset_current_user.trash_folder
                else:
                    dataset_folder = models_controller.get_folder_by_name(dataset_folder_name)
            except NoResultFound:
                # If no result, it means we need to create the folder in the user space or in public
                dataset_folder = models_controller.add_folder(
                    name=dataset_folder_name,
                    folder_type=models_controller.Folder.FolderType.folder,
                    description=None,
                )
                if is_public:
                    models_controller.move_to_folder(
                        entry_ids=[dataset_folder.id],
                        current_folder_id=None,
                        target_folder_id=models_controller.get_public_folder().id,
                    )
                else:
                    models_controller.move_to_folder(
                        entry_ids=[dataset_folder.id],
                        current_folder_id=None,
                        target_folder_id=dataset_current_user.home_folder_id,
                    )

            # Now we can move the dataset to the folder
            models_controller.move_to_folder([new_dataset.id], None, dataset_folder.id)

            # We add the dataset_permaname as key, with an empty array as value, so we
            # can register each matching datafile
            dict_permaname_datafile_ids[dataset_permaname] = []

            nb_dataset_created += 1

    # We then manage the attribution of the dataset_version to the freshly created datasets
    with open(dataset_version_with_datafile_csv_path) as dataset_version_with_datafile_csv:
        print("")
        print("Creating the datafiles")
        reader = csv.DictReader(dataset_version_with_datafile_csv)
        for row in reader:
            if not row["permaname"]:
                print("We found an empty permaname entry: {}. Skipping it.".format(row))
                nb_row_datafile_skipped += 1
                nb_datafile_skipped += 1
                continue

            # We first create the datafiles
            datafile_type = row["type"]
            datafile_name = row.get("name", "data")
            datafile_s3_location = urlparse(row["s3_location"])
            datafile_short_summary = row["short_desc"]
            datafile_long_summary = row.get("long_desc", "")
            datafile_id = row["id"]
            datafile_creation_date = row["created_timestamp"]
            datafile_version = row["version"]
            datafile_created_by = row["created_by"]
            dataset_permaname = row["permaname"]

            # s3://taiga2/imported/4bb2169e-5b87-4d1c-a78e-3e6006316561.hdf5
            datafile_s3_bucket = datafile_s3_location.netloc
            datafile_s3_key = datafile_s3_location.path[1:]  # We remove the first '/'

            # Set the user to the one in the row to make the manipulations under his name
            try:
                current_user = models_controller.get_user_by_email(datafile_created_by)
            except NoResultFound:
                print(
                    "Warning: The user email found in 'created_by' column ({}) was not found in the dataset side. "
                    "Creating one.".format(datafile_created_by)
                )
                datafile_created_by_name = datafile_created_by[: datafile_created_by.find("@")]
                current_user = models_controller.add_user(
                    name=datafile_created_by_name, email=datafile_created_by
                )
                nb_user_created += 1

            flask.g.current_user = current_user

            # TODO: We should not create the datafile if it already exists, i.e. s3_bucket/s3_key exists
            new_datafile = models_controller.add_s3_datafile(
                s3_bucket=datafile_s3_bucket,
                s3_key=datafile_s3_key,
                name=datafile_name,
                type=datafile_type,
                short_summary=datafile_short_summary,
                long_summary=datafile_long_summary,
            )

            # We register the datafile under its dataset's permaname to later create the
            # dataset version with all the datafiles
            if dataset_permaname in dict_permaname_datafile_ids:
                datafile_info = DataFileInfo(
                    id=datafile_id,
                    datafile=new_datafile,
                    version=datafile_version,
                    creation_date=datafile_creation_date,
                    owner_email=datafile_created_by,
                )
                dict_permaname_datafile_ids[dataset_permaname].append(datafile_info)
            else:
                print(
                    "Warning: We found a datafile ({}) without a matching dataset ({}). Skipping it.".format(
                        datafile_id, dataset_permaname
                    )
                )
                nb_datafile_skipped += 1
                continue

            nb_datafile_created += 1

    # Then we create the dataset_version with the taiga id, linking with the dataset using its permaname
    print("")
    print("Linking the datafiles with the datasets")
    for dataset_permaname, array_data_file_info in dict_permaname_datafile_ids.items():
        dataset = models_controller.get_dataset_from_permaname(dataset_permaname)

        # Get the creation date from the first dataset_version
        for datafile_info in array_data_file_info:
            flask.g.current_user = models_controller.get_user_by_email(datafile_info.owner_email)

            # TODO: We should not create the dataset_version if it already exists, i.e. the version already exists for this dataset
            dataset_version = models_controller.add_dataset_version(
                dataset_id=dataset.id,
                datafiles_ids=[datafile_info.datafile.id],
                anterior_creation_date=datafile_info.creation_date,
                forced_id=datafile_info.id,
            )

            # Then we backdate the dataset creation_date to the first version's creation date
            if int(datafile_info.version) == 1:
                models_controller.update_dataset_creation_date(
                    dataset_id=dataset.id, new_date=datafile_info.creation_date
                )

            nb_dataset_version_created += 1

    print("")
    print("Done! Here is the summary:")
    print("\tLines skipped in dataset file: {}".format(nb_row_dataset_skipped))
    print("\tLines skipped in datafile file: {}".format(nb_row_datafile_skipped))
    print("")
    print("\tDatasets created: {}".format(nb_dataset_created))
    print("\tUsers created: {}".format(nb_user_created))
    print("\tUsers skipped: {}".format(nb_user_skipped))
    print("")
    print("\tDatafiles created: {}".format(nb_datafile_created))
    print("\tDatafiles skipped: {}".format(nb_datafile_skipped))
    print("")
    print("\tDatasetVersions created: {}".format(nb_dataset_version_created))
    print("\tDatasetVersions skipped and datasets cleaned: {}".format(nb_dataset_version_skipped))
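# A hedged sketch of driving populate_db from a one-off maintenance command. It
# assumes an already-configured Flask application is passed in (populate_db
# relies on flask.g.current_user and the models_controller session); the
# function name and CSV paths below are placeholders, and the CSV columns are
# the ones read above (permaname/name/description/folder for datasets,
# permaname/type/name/s3_location/short_desc/id/created_timestamp/version/
# created_by for datafiles).
def run_csv_import(app):
    dataset_csv_path = "exports/datasets.csv"  # hypothetical path
    dataset_version_csv_path = "exports/datafiles.csv"  # hypothetical path

    with app.app_context():
        populate_db(dataset_csv_path, dataset_version_csv_path)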