Example #1
0
def create_table_dataset_version(tmpdir, mock_s3):
    """Create a dataset backed by one columnar datafile and return its version id.

    A two-line CSV is written under ``tmpdir``, converted with
    ``csv_to_columnar`` and uploaded to ``mock_s3`` as ``bucket``/``key``.
    An S3 datafile record pointing at that object is then registered and
    wrapped in a new dataset.

    Returns the id of the dataset's first version, converted to ``str``.
    """
    from taiga2.conv import csv_to_columnar

    csv_path = str(tmpdir.join("thing.csv"))
    columnar_path = str(tmpdir.join("thing.columnar"))

    with open(csv_path, "wt") as csv_file:
        csv_file.write("a,b\n1,2\n")

    csv_to_columnar(StubProgress(), csv_path, columnar_path)

    # Stage the converted file in the mocked S3 store
    s3_object = mock_s3.Object("bucket", "key")
    s3_object.upload_file(columnar_path)

    # Register the datafile record that points at the uploaded object
    datafile = mc.add_s3_datafile(
        name="dfname",
        s3_bucket="bucket",
        s3_key="key",
        compressed_s3_key="compressed/key",
        type=models.S3DataFile.DataFileFormat.Columnar,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )

    dataset = mc.add_dataset(
        name="dataset name",
        description="dataset description",
        datafiles_ids=[datafile.id],
    )
    first_version = dataset.dataset_versions[0]
    return str(first_version.id)
Example #2
0
def test_add_dataset_version(session: SessionBase):
    """Check the dataset version created implicitly by ``mc.add_dataset``.

    A raw S3 datafile is registered, a dataset is created around it, and the
    dataset's first version is fetched, given the datafile once more and
    committed. The assertions then inspect that committed version.
    """
    new_dataset_version_name = "New Dataset Version"
    new_dataset_description = "New description for test_add_dataset_version"

    _new_datafile = mc.add_s3_datafile(
        name="Datafile for test_add_dataset_version",
        s3_bucket="broadtaiga2prototype",
        s3_key=mc.generate_convert_key(),
        compressed_s3_key=mc.generate_compressed_key(),
        type=mc.S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )

    _new_dataset = mc.add_dataset(
        name=new_dataset_version_name,
        description=new_dataset_description,
        datafiles_ids=[_new_datafile.id],
    )

    _new_dataset_version = _new_dataset.dataset_versions[0]

    _new_dataset_version.datafiles.append(_new_datafile)

    db.session.add(_new_dataset_version)
    db.session.commit()

    # A test must not return a value: the early `return` that used to sit
    # here made every assertion below unreachable, and the assertions
    # referred to names that do not exist in this function.

    # The name is handed to add_dataset, so it is checked on the dataset.
    # NOTE(review): the original (dead) assertion compared the *version's*
    # name; whether a version inherits the dataset name is not visible
    # here -- confirm before asserting on it.
    assert _new_dataset.name == new_dataset_version_name
    # NOTE(review): the three checks below are the original author's
    # expectations, re-targeted at the local objects; confirm they hold
    # against the model defaults.
    assert _new_dataset_version.creator == flask.g.current_user
    assert _new_dataset_version.dataset == _new_dataset
    assert _new_dataset_version.state == DatasetVersion.DatasetVersionState.approved
Example #3
0
def test_add_dataset(session: SessionBase, new_datafile):
    """Verify that a dataset added via ``mc.add_dataset`` can be queried back.

    The dataset is looked up twice through ``session``: once by primary id
    and once through its permaname. Both lookups must yield the object that
    ``mc.add_dataset`` returned.
    """
    expected_name = "New Dataset"
    expected_permaname = generate_permaname(expected_name)

    created = mc.add_dataset(
        name=expected_name,
        permaname=expected_permaname,
        description="New dataset description",
        datafiles_ids=[new_datafile.id],
    )

    # Lookup by primary key
    id_query = session.query(Dataset).filter(Dataset.id == created.id)
    fetched_by_id = id_query.one()

    # Lookup through the permaname relationship
    permaname_query = session.query(Dataset).filter(
        Dataset.permanames.any(DatasetPermaname.permaname == created.permaname)
    )
    fetched_by_permaname = permaname_query.one()

    # The row fetched by id is the object we created, with the values we passed
    assert fetched_by_id == created
    assert fetched_by_id.name == expected_name
    assert fetched_by_id.permaname == expected_permaname

    # .one() guarantees a single match for the permaname; it must be ours
    assert fetched_by_permaname == created
Example #4
0
def new_dataset(new_datafile):
    """Create a dataset named "New Dataset" containing ``new_datafile``.

    The permaname is derived from the dataset name with
    ``generate_permaname``. Returns whatever ``mc.add_dataset`` returns.
    """
    dataset_name = "New Dataset"
    dataset_permaname = generate_permaname(dataset_name)

    return mc.add_dataset(
        name=dataset_name,
        permaname=dataset_permaname,
        description="New dataset description",
        datafiles_ids=[new_datafile.id],
    )
Example #5
0
def test_basic_create_virtual_dataset(session: SessionBase):
    """Check that a dataset can be built from a virtual (aliased) datafile.

    An ordinary dataset is created around a raw S3 datafile, then a second
    dataset is created whose only entry is a virtual datafile named "alias"
    that points at the same underlying file.
    """
    underlying_datafile = mc.add_s3_datafile(
        name="underlying-datafile",
        s3_bucket="broadtaiga2prototype",
        s3_key=mc.generate_convert_key(),
        compressed_s3_key=mc.generate_compressed_key(),
        type=mc.S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )

    # The dataset that owns the real file
    mc.add_dataset(
        name="underlying-dataset",
        description="",
        datafiles_ids=[underlying_datafile.id],
    )

    # The alias and the dataset that exposes it
    alias_datafile = mc.add_virtual_datafile(
        name="alias", datafile_id=underlying_datafile.id
    )
    created_dataset = mc.add_dataset(
        name="virtual-dataset",
        description="desc",
        datafiles_ids=[alias_datafile.id],
    )

    # Flush so that the queries below can see the new objects
    session.flush()

    assert created_dataset.id is not None

    fetched_dataset = mc.get_dataset(created_dataset.id)
    assert fetched_dataset.name == "virtual-dataset"

    versions = fetched_dataset.dataset_versions
    assert len(versions) == 1

    only_version = versions[0]
    assert len(only_version.datafiles) > 0

    first_entry = only_version.datafiles[0]
    assert first_entry.name == "alias"
    assert first_entry.underlying_data_file.id == underlying_datafile.id
Example #6
0
def new_dataset(new_datafile):
    """Create a dataset named "New Dataset" containing ``new_datafile``.

    The permaname is derived from the dataset name with
    ``generate_permaname``. Returns whatever
    ``models_controller.add_dataset`` returns.
    """
    # TODO: These tests should be using the endpoint and not the model
    dataset_name = "New Dataset"
    dataset_permaname = generate_permaname(dataset_name)

    return models_controller.add_dataset(
        name=dataset_name,
        permaname=dataset_permaname,
        description="New dataset description",
        datafiles_ids=[new_datafile.id],
    )
Example #7
0
def new_dataset_version(new_datafile):
    """Create a dataset around ``new_datafile`` and return its first version."""
    # TODO: Add in the name it is an empty dataset_version
    dataset = mc.add_dataset(
        name="New Dataset for new_dataset_version",
        description="New description for new_dataset_version",
        datafiles_ids=[new_datafile.id],
    )

    first_version = dataset.dataset_versions[0]
    return first_version
Example #8
0
def _create_dataset_with_a_virtual_file(
    files, folder_id, name="virtual", description="description"
) -> Dataset:
    """Create a dataset made of virtual datafiles and copy it into a folder.

    ``files`` is iterated as indexable entries: element 0 is used as the
    virtual datafile's name and element 1 as the id of an existing
    ``DataFile`` it points at. Each referenced datafile must exist (this is
    asserted). The resulting dataset is copied into ``folder_id`` and
    returned.
    """
    virtual_datafiles = []
    for entry in files:
        underlying = DataFile.query.get(entry[1])
        assert underlying is not None
        virtual = models_controller.add_virtual_datafile(entry[0], underlying.id)
        virtual_datafiles.append(virtual)

    virtual_ids = [virtual.id for virtual in virtual_datafiles]
    created = models_controller.add_dataset(
        name=name,
        description=description,
        datafiles_ids=virtual_ids,
    )

    models_controller.copy_to_folder([created.id], folder_id)

    return created
Example #9
0
def new_dataset_in_new_folder_in_home(
    session: SessionBase, new_folder_in_home, new_datafile
):
    """Create a dataset with ``new_datafile`` and move it into ``new_folder_in_home``.

    The move is issued with ``None`` as the current folder id. Returns the
    created dataset.
    """
    dataset_name = "New Dataset in a folder"
    dataset_permaname = generate_permaname(dataset_name)

    dataset = models_controller.add_dataset(
        name=dataset_name,
        permaname=dataset_permaname,
        description="New dataset description",
        datafiles_ids=[new_datafile.id],
    )

    target_folder_id = new_folder_in_home.id
    models_controller.move_to_folder([dataset.id], None, target_folder_id)

    return dataset
Example #10
0
def create_simple_dataset():
    """Create a dataset holding a single raw datafile named "df".

    Returns a 3-tuple: the dataset's permaname, the id of its first
    version, and the datafile name ``"df"``.
    """
    # Register the datafile record
    datafile = mc.add_s3_datafile(
        name="df",
        s3_bucket="bucket",
        s3_key="converted/key",
        compressed_s3_key="compressed/key",
        type=models.S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )

    dataset = mc.add_dataset(
        name="dataset name",
        description="dataset description",
        datafiles_ids=[datafile.id],
    )

    permaname = dataset.permaname
    first_version_id = dataset.dataset_versions[0].id
    return permaname, first_version_id, "df"
Example #11
0
def _create_dataset_with_a_file(name="datafile") -> Dataset:
    """Create a dataset called "dataset" containing one raw S3 datafile.

    ``name`` is the name given to the datafile, not to the dataset. The S3
    keys come from the controller's key generators.
    """
    datafile = models_controller.add_s3_datafile(
        name=name,
        s3_bucket="broadtaiga2prototype",
        s3_key=models_controller.generate_convert_key(),
        compressed_s3_key=models_controller.generate_compressed_key(),
        type=models_controller.S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )

    return models_controller.add_dataset(
        name="dataset",
        description="",
        datafiles_ids=[datafile.id],
    )
Example #12
0
def test_add_dataset_with_datafile(session: SessionBase, new_datafile):
    """Check where ``mc.add_dataset`` stores the name, description and datafile.

    The test expects the name on the dataset, the description on the latest
    dataset version (with the dataset's own description left as ``None``),
    and exactly one version holding exactly the datafile that was passed in.
    """
    new_dataset_name = "New dataset with datasetVersion"
    new_dataset_description = "New dataset with datasetVersion"

    datafiles_ids = [new_datafile.id]

    new_dataset = mc.add_dataset(
        name=new_dataset_name,
        description=new_dataset_description,
        datafiles_ids=datafiles_ids,
    )
    latest_dataset_version = mc.get_latest_dataset_version(new_dataset.id)

    assert new_dataset.name == new_dataset_name
    # Identity check: comparing to None with `==` can be hijacked by __eq__
    assert new_dataset.description is None
    assert latest_dataset_version.description == new_dataset_description
    assert len(new_dataset.dataset_versions) == 1

    assert len(new_dataset.dataset_versions[0].datafiles) == 1
    assert new_dataset.dataset_versions[0].datafiles[0] == new_datafile
Example #13
0
def populate_db(dataset_csv_path, dataset_version_with_datafile_csv_path):
    """Import users, datasets, datafiles and dataset versions from two CSV files.

    The import runs in three passes:

    1. ``dataset_csv_path`` is read row by row (columns ``permaname``,
       ``name``, ``description``, ``folder``). For each usable row the
       owning user is fetched or created, the dataset is created, and it is
       moved into the user's home/trash folder or a named folder (created on
       demand).
    2. ``dataset_version_with_datafile_csv_path`` is read row by row (columns
       ``permaname``, ``type``, ``s3_location``, ``short_desc``, ``id``,
       ``created_timestamp``, ``version``, ``created_by`` and optionally
       ``name`` / ``long_desc``). An S3 datafile is created for each usable
       row and remembered under its dataset's permaname.
    3. For every remembered datafile a dataset version is created on the
       matching dataset; the datafile with version ``1`` also sets the
       dataset's creation date.

    ``flask.g.current_user`` is reassigned throughout so that each object is
    created under the user named in the CSV. Progress and a final summary of
    the counters are printed to stdout. Nothing is returned.
    """
    # TODO: We should handle the Public folder properly, instead of adding it to Philip's account
    # Summary counters, printed at the end of the import
    nb_user_created = 0
    nb_user_skipped = 0
    nb_dataset_created = 0
    nb_row_dataset_skipped = 0
    nb_datafile_created = 0
    nb_datafile_skipped = 0
    nb_row_datafile_skipped = 0
    nb_dataset_version_created = 0
    # Never incremented below; kept so the summary output stays the same
    nb_dataset_version_skipped = 0

    # Maps the permaname of each dataset created in the first pass to the
    # list of DataFileInfo records collected for it in the second pass.
    # Dict<String, Array<DataFileInfo>>
    dict_permaname_datafile_ids = {}

    # We first manage the dataset creation
    with open(dataset_csv_path) as dataset_file:
        print("Creating the users and the datasets")
        reader = csv.DictReader(dataset_file)

        for row in reader:
            is_public = False

            if not row["permaname"]:
                print(
                    "Warning: We found an empty permaname entry: {}. Skipping it."
                    .format(row))
                nb_row_dataset_skipped += 1
                continue

            dataset_name = row["name"]
            dataset_permaname = row["permaname"]
            dataset_description = row["description"]

            if row["folder"].startswith("home"):
                dataset_folder_user = row["folder"]

                # The user email is the text between the parentheses
                dataset_user_email = dataset_folder_user[
                    dataset_folder_user.find("(") + 1:
                    dataset_folder_user.find(")")
                ]

                # Handle the case where user email is None
                if dataset_user_email == "None":
                    print(
                        "Warning: We found a row with folder {}. Skipping it.".
                        format(row["folder"]))
                    nb_user_skipped += 1
                    continue

                # The target folder is the text before the parenthesis
                dataset_folder_name = dataset_folder_user.split("(")[0]
            else:
                # For now, every other folder is stored under one catch-all
                # account and attached to the public folder
                is_public = True
                dataset_folder_name = row["folder"]
                dataset_user_email = "*****@*****.**"

            # Setting up the user
            try:
                dataset_current_user = models_controller.get_user_by_email(
                    dataset_user_email)
            except NoResultFound:
                # User does not exist yet, so we create it; the user name is
                # the part of the email before the "@"
                dataset_user_name = dataset_user_email[
                    :dataset_user_email.find("@")]
                dataset_current_user = models_controller.add_user(
                    name=dataset_user_name, email=dataset_user_email)
                print("User with email: {} created".format(dataset_user_email))
                nb_user_created += 1

            flask.g.current_user = dataset_current_user

            # TODO: We should not create the dataset if it already exists
            new_dataset = models_controller.add_dataset(
                name=dataset_name,
                permaname=dataset_permaname,
                description=dataset_description,
            )

            # TODO: Check it is case insensitive
            lowered_folder_name = dataset_folder_name.lower()
            try:
                if lowered_folder_name == "home":
                    dataset_folder = dataset_current_user.home_folder
                elif lowered_folder_name == "trash":
                    dataset_folder = dataset_current_user.trash_folder
                else:
                    dataset_folder = models_controller.get_folder_by_name(
                        dataset_folder_name)
            except NoResultFound:
                # If no result, it means we need to create the folder in the user space or in public
                dataset_folder = models_controller.add_folder(
                    name=dataset_folder_name,
                    folder_type=models_controller.Folder.FolderType.folder,
                    description=None,
                )

                if is_public:
                    parent_folder_id = models_controller.get_public_folder().id
                else:
                    parent_folder_id = dataset_current_user.home_folder_id

                models_controller.move_to_folder(
                    entry_ids=[dataset_folder.id],
                    current_folder_id=None,
                    target_folder_id=parent_folder_id,
                )

            # Now we can move the dataset to the folder
            models_controller.move_to_folder([new_dataset.id], None,
                                             dataset_folder.id)

            # Register the permaname with an empty list so the second pass
            # can attach each matching datafile to it
            dict_permaname_datafile_ids[dataset_permaname] = []

            nb_dataset_created += 1

    # We then manage the attribution of the dataset_version to the freshly created datasets
    with open(dataset_version_with_datafile_csv_path
              ) as dataset_version_with_datafile_csv:
        print("")
        print("Creating the datafiles")
        reader = csv.DictReader(dataset_version_with_datafile_csv)

        for row in reader:
            if not row["permaname"]:
                print("We found an empty permaname entry: {}. Skipping it.".
                      format(row))
                nb_row_datafile_skipped += 1
                nb_datafile_skipped += 1
                continue

            # We first create the datafiles
            datafile_type = row["type"]
            datafile_name = row.get("name", "data")
            datafile_s3_location = urlparse(row["s3_location"])
            datafile_short_summary = row["short_desc"]
            datafile_long_summary = row.get("long_desc", "")
            datafile_id = row["id"]
            datafile_creation_date = row["created_timestamp"]
            datafile_version = row["version"]
            datafile_created_by = row["created_by"]

            dataset_permaname = row["permaname"]

            # s3://taiga2/imported/4bb2169e-5b87-4d1c-a78e-3e6006316561.hdf5
            datafile_s3_bucket = datafile_s3_location.netloc
            # We remove the first '/'
            datafile_s3_key = datafile_s3_location.path[1:]

            # Set the user to the one in the row to make the manipulations under his name
            try:
                current_user = models_controller.get_user_by_email(
                    datafile_created_by)
            except NoResultFound:
                print(
                    "Warning: The user email found in 'created_by' column ({}) was not found in the dataset side. "
                    "Creating one.".format(datafile_created_by))
                datafile_created_by_name = datafile_created_by[
                    :datafile_created_by.find("@")]
                current_user = models_controller.add_user(
                    name=datafile_created_by_name, email=datafile_created_by)
                nb_user_created += 1

            flask.g.current_user = current_user

            # TODO: We should not create the datafile if it already exists: ie s3_bucket/s3_key exists
            new_datafile = models_controller.add_s3_datafile(
                s3_bucket=datafile_s3_bucket,
                s3_key=datafile_s3_key,
                name=datafile_name,
                type=datafile_type,
                short_summary=datafile_short_summary,
                long_summary=datafile_long_summary,
            )

            # Only datafiles whose dataset was created in the first pass are
            # kept for the dataset version creation
            if dataset_permaname not in dict_permaname_datafile_ids:
                print(
                    "Warning: We found a datafile ({}) without a matching dataset ({}). Skipping it."
                    .format(datafile_id, dataset_permaname))
                nb_datafile_skipped += 1
                continue

            datafile_info = DataFileInfo(
                id=datafile_id,
                datafile=new_datafile,
                version=datafile_version,
                creation_date=datafile_creation_date,
                owner_email=datafile_created_by,
            )
            dict_permaname_datafile_ids[dataset_permaname].append(
                datafile_info)

            nb_datafile_created += 1

    # Then we create the dataset_version with the taiga id, linking with the dataset using its permaname
    print("")
    print("Linking the datafiles with the datasets")
    for dataset_permaname, array_data_file_info in dict_permaname_datafile_ids.items():
        dataset = models_controller.get_dataset_from_permaname(
            dataset_permaname)

        for datafile_info in array_data_file_info:
            flask.g.current_user = models_controller.get_user_by_email(
                datafile_info.owner_email)
            # TODO: We should not create the dataset_version if it already exists. ie version already exists for this dataset
            models_controller.add_dataset_version(
                dataset_id=dataset.id,
                datafiles_ids=[datafile_info.datafile.id],
                anterior_creation_date=datafile_info.creation_date,
                forced_id=datafile_info.id,
            )

            # The first version's creation date becomes the dataset's
            # creation date
            if int(datafile_info.version) == 1:
                models_controller.update_dataset_creation_date(
                    dataset_id=dataset.id,
                    new_date=datafile_info.creation_date)

            # Count every version created, not every dataset processed
            nb_dataset_version_created += 1

    print("")
    print("Done! Here is the summary:")
    print("\tLines skipped in dataset file: {}".format(nb_row_dataset_skipped))
    print(
        "\tLines skipped in datafile file: {}".format(nb_row_datafile_skipped))
    print("")
    print("\tDatasets created: {}".format(nb_dataset_created))
    print("\tUsers created: {}".format(nb_user_created))
    print("\tUsers skipped: {}".format(nb_user_skipped))
    print("")
    print("\tDatafiles created: {}".format(nb_datafile_created))
    print("\tDatafiles skipped: {}".format(nb_datafile_skipped))
    print("")
    print("\tDatasetVersions created: {}".format(nb_dataset_version_created))
    print("\tDatasetVersions skipped and datasets cleaned: {}".format(
        nb_dataset_version_skipped))