Exemple #1
0
def create_table_dataset_version(tmpdir, mock_s3):
    """Create a dataset holding one columnar datafile and return its version id.

    A two-line CSV is written under ``tmpdir``, converted with
    ``csv_to_columnar`` and uploaded to ``bucket``/``key`` on ``mock_s3``.
    A Columnar S3 datafile pointing at that location is then registered and
    wrapped in a new dataset.

    Returns:
        The id of the dataset's first version, converted to ``str``.
    """
    from taiga2.conv import csv_to_columnar

    csv_path = str(tmpdir.join("thing.csv"))
    columnar_path = str(tmpdir.join("thing.columnar"))

    with open(csv_path, "wt") as csv_file:
        csv_file.write("a,b\n1,2\n")

    csv_to_columnar(StubProgress(), csv_path, columnar_path)

    # Stage the converted file in the mocked bucket.
    s3_object = mock_s3.Object("bucket", "key")
    s3_object.upload_file(columnar_path)

    # Register the datafile that points at the uploaded object.
    datafile_fields = dict(
        name="dfname",
        s3_bucket="bucket",
        s3_key="key",
        compressed_s3_key="compressed/key",
        type=models.S3DataFile.DataFileFormat.Columnar,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )
    datafile = mc.add_s3_datafile(**datafile_fields)

    dataset = mc.add_dataset(
        name="dataset name",
        description="dataset description",
        datafiles_ids=[datafile.id],
    )
    first_version = dataset.dataset_versions[0]
    return str(first_version.id)
Exemple #2
0
def test_add_dataset_version(session: SessionBase):
    """Check the dataset version that is created together with a new dataset.

    A Raw S3 datafile and a dataset containing it are created, the first
    dataset version is fetched, the datafile is appended to it and the change
    is committed. The version is then checked for its datafile, creator,
    parent dataset and state.

    The assertions used to sit after an unconditional ``return`` and referred
    to names that were never defined in this function, so nothing was
    actually verified. They now run against the objects built here.

    Returns:
        The dataset version under test. The return value is kept (after the
        assertions) so that any code calling this function directly still
        receives it.
    """
    new_dataset_version_name = "New Dataset Version"
    new_dataset_description = "New description for test_add_dataset_version"

    _new_datafile = mc.add_s3_datafile(
        name="Datafile for test_add_dataset_version",
        s3_bucket="broadtaiga2prototype",
        s3_key=mc.generate_convert_key(),
        compressed_s3_key=mc.generate_compressed_key(),
        type=mc.S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )

    # NOTE(review): the *dataset* is created with the "version" name; kept
    # as in the original call.
    _new_dataset = mc.add_dataset(
        name=new_dataset_version_name,
        description=new_dataset_description,
        datafiles_ids=[_new_datafile.id],
    )

    _new_dataset_version = _new_dataset.dataset_versions[0]

    _new_dataset_version.datafiles.append(_new_datafile)

    db.session.add(_new_dataset_version)
    db.session.commit()

    # NOTE(review): the original (unreachable) code also compared the
    # version's name with new_dataset_version_name. How add_dataset names its
    # first version is not visible here, so that check is not enabled --
    # confirm the naming rule before adding it back.
    assert _new_datafile in _new_dataset_version.datafiles
    assert _new_dataset_version.creator == flask.g.current_user
    assert _new_dataset_version.dataset == _new_dataset
    assert (
        _new_dataset_version.state
        == DatasetVersion.DatasetVersionState.approved
    )

    return _new_dataset_version
def new_datafile():
    """Create and return a Raw S3 datafile named "New Datafile".

    The S3 key and compressed S3 key come from
    ``models_controller.generate_convert_key()`` and
    ``models_controller.generate_compressed_key()``; all other fields are
    fixed placeholder values.
    """
    # TODO: These tests should be using the endpoint and not the model
    convert_key = models_controller.generate_convert_key()
    compressed_key = models_controller.generate_compressed_key()

    return models_controller.add_s3_datafile(
        name="New Datafile",
        s3_bucket="broadtaiga2prototype",
        s3_key=convert_key,
        compressed_s3_key=compressed_key,
        type=S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )
Exemple #4
0
def new_datafile():
    """Create and return a Raw S3 datafile named "New Datafile".

    The S3 key and compressed S3 key come from ``mc.generate_convert_key()``
    and ``mc.generate_compressed_key()``; all other fields are fixed
    placeholder values.
    """
    new_datafile_name = "New Datafile"

    _new_datafile = mc.add_s3_datafile(
        name=new_datafile_name,
        s3_bucket="broadtaiga2prototype",
        s3_key=mc.generate_convert_key(),
        compressed_s3_key=mc.generate_compressed_key(),
        type=S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )

    return _new_datafile
Exemple #5
0
def create_simple_dataset():
    """Create a dataset containing a single Raw datafile named "df".

    Returns:
        A 3-tuple ``(permaname, first_version_id, "df")`` where the first two
        items are read from the newly created dataset and the last one is the
        datafile's name.
    """
    datafile_fields = dict(
        name="df",
        s3_bucket="bucket",
        s3_key="converted/key",
        compressed_s3_key="compressed/key",
        type=models.S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )
    datafile = mc.add_s3_datafile(**datafile_fields)

    dataset = mc.add_dataset(
        name="dataset name",
        description="dataset description",
        datafiles_ids=[datafile.id],
    )

    first_version = dataset.dataset_versions[0]
    return dataset.permaname, first_version.id, "df"
def _create_dataset_with_a_file(name="datafile") -> Dataset:
    """Create a dataset called "dataset" that contains one Raw S3 datafile.

    Args:
        name: Name given to the datafile (not to the dataset).

    Returns:
        The dataset returned by ``models_controller.add_dataset``.
    """
    convert_key = models_controller.generate_convert_key()
    compressed_key = models_controller.generate_compressed_key()

    datafile = models_controller.add_s3_datafile(
        name=name,
        s3_bucket="broadtaiga2prototype",
        s3_key=convert_key,
        compressed_s3_key=compressed_key,
        type=models_controller.S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )

    return models_controller.add_dataset(
        name="dataset",
        description="",
        datafiles_ids=[datafile.id],
    )
Exemple #7
0
def test_basic_create_virtual_dataset(session: SessionBase):
    """Check that a virtual dataset exposes an aliased underlying datafile.

    A real datafile is placed in "underlying-dataset". A virtual datafile
    named "alias" is created for it and placed, on its own, in
    "virtual-dataset". The virtual dataset is then reloaded and its single
    version is expected to hold exactly that one alias entry, pointing back
    at the real datafile.
    """
    # create mock data of a single dataset and a virtual dataset which references the files but with a different name
    _new_datafile = mc.add_s3_datafile(
        name="underlying-datafile",
        s3_bucket="broadtaiga2prototype",
        s3_key=mc.generate_convert_key(),
        compressed_s3_key=mc.generate_compressed_key(),
        type=mc.S3DataFile.DataFileFormat.Raw,
        encoding="UTF-8",
        short_summary="short",
        long_summary="long",
    )

    mc.add_dataset(
        name="underlying-dataset",
        description="",
        datafiles_ids=[_new_datafile.id],
    )

    virtual_datafile = mc.add_virtual_datafile(
        name="alias", datafile_id=_new_datafile.id
    )

    virtual_dataset = mc.add_dataset(
        name="virtual-dataset",
        description="desc",
        datafiles_ids=[virtual_datafile.id],
    )

    # make sure the subsequent queries can find new objects
    session.flush()

    assert virtual_dataset.id is not None

    v = mc.get_dataset(virtual_dataset.id)
    assert v.name == "virtual-dataset"

    assert len(v.dataset_versions) == 1

    version = v.dataset_versions[0]
    # The dataset was created from exactly one datafile id, so require exactly
    # one entry instead of merely a non-empty list.
    assert len(version.datafiles) == 1

    entry = version.datafiles[0]
    assert entry.name == "alias"
    assert entry.underlying_data_file.id == _new_datafile.id
Exemple #8
0
def populate_db(dataset_csv_path, dataset_version_with_datafile_csv_path):
    """Import datasets, datafiles and dataset versions from two CSV files.

    The import runs in three passes:

    1. Each row of ``dataset_csv_path`` becomes a dataset that is moved into
       the folder named by its ``folder`` column. The owning user (and the
       folder, when the lookup finds none) is created on demand.
    2. Each row of ``dataset_version_with_datafile_csv_path`` becomes an S3
       datafile, remembered under the permaname of the dataset it belongs to.
    3. Each remembered datafile becomes one dataset version of its dataset.

    ``flask.g.current_user`` is reassigned throughout so that each object is
    created under the user named in the corresponding row. A summary of the
    counters is printed at the end.

    Args:
        dataset_csv_path: Path of a CSV file with ``name``, ``permaname``,
            ``description`` and ``folder`` columns.
        dataset_version_with_datafile_csv_path: Path of a CSV file with
            ``permaname``, ``type``, ``s3_location``, ``short_desc``, ``id``,
            ``created_timestamp``, ``version`` and ``created_by`` columns;
            ``name`` and ``long_desc`` are read when present.
    """
    # TODO: We should handle the Public folder properly, instead of adding it to Philip's account
    # Summary counters
    nb_user_created = 0
    nb_user_skipped = 0
    nb_dataset_created = 0
    nb_row_dataset_skipped = 0
    nb_datafile_created = 0
    nb_datafile_skipped = 0
    nb_row_datafile_skipped = 0
    nb_dataset_version_created = 0
    # NOTE(review): nothing in this function increments this counter, so the
    # summary always reports 0 for it.
    nb_dataset_version_skipped = 0

    # Maps a dataset permaname to the DataFileInfo records of its datafiles,
    # so the dataset versions can be created once every datafile is known.
    dict_permaname_datafile_ids = {}

    # Pass 1: users, datasets and folders.
    # newline="" is what the csv module asks for when it is given a file.
    with open(dataset_csv_path, newline="") as dataset_file:
        print("Creating the users and the datasets")
        reader = csv.DictReader(dataset_file)

        for row in reader:
            is_public = False

            if not row["permaname"]:
                print(
                    "Warning: We found an empty permaname entry: {}. Skipping it."
                    .format(row))
                nb_row_dataset_skipped += 1
                continue

            dataset_name = row["name"]
            dataset_permaname = row["permaname"]
            dataset_description = row["description"]

            if row["folder"].startswith("home"):
                dataset_folder_user = row["folder"]

                # The owner's email is the text between the parentheses.
                email_start = dataset_folder_user.find("(") + 1
                email_end = dataset_folder_user.find(")")
                dataset_user_email = dataset_folder_user[email_start:email_end]

                # Handle the case where user email is None
                if dataset_user_email == "None":
                    print(
                        "Warning: We found a row with folder {}. Skipping it.".
                        format(row["folder"]))
                    nb_user_skipped += 1
                    continue

                # The target folder is the text before the parenthesis.
                dataset_folder_name = dataset_folder_user.split("(")[0]
            else:
                # For now, everything else is stored under one fixed account.
                is_public = True
                dataset_folder_name = row["folder"]
                dataset_user_email = "*****@*****.**"

            # Setting up the user
            try:
                dataset_current_user = models_controller.get_user_by_email(
                    dataset_user_email)
            except NoResultFound:
                # User does not exist yet, so we create it. partition() keeps
                # the whole string when there is no "@", whereas slicing up to
                # find("@") == -1 would silently drop the last character.
                dataset_user_name = dataset_user_email.partition("@")[0]
                dataset_current_user = models_controller.add_user(
                    name=dataset_user_name, email=dataset_user_email)
                print("User with email: {} created".format(dataset_user_email))
                nb_user_created += 1

            flask.g.current_user = dataset_current_user

            # TODO: We should not create the dataset if it already exists
            new_dataset = models_controller.add_dataset(
                name=dataset_name,
                permaname=dataset_permaname,
                description=dataset_description,
            )
            try:
                # TODO: Check it is case insensitive
                lowered_folder_name = dataset_folder_name.lower()
                if lowered_folder_name == "home":
                    dataset_folder = dataset_current_user.home_folder
                elif lowered_folder_name == "trash":
                    dataset_folder = dataset_current_user.trash_folder
                else:
                    dataset_folder = models_controller.get_folder_by_name(
                        dataset_folder_name)
            except NoResultFound:
                # If no result, it means we need to create the folder in the user space or in public
                dataset_folder = models_controller.add_folder(
                    name=dataset_folder_name,
                    folder_type=models_controller.Folder.FolderType.folder,
                    description=None,
                )

                if is_public:
                    parent_folder_id = models_controller.get_public_folder().id
                else:
                    parent_folder_id = dataset_current_user.home_folder_id

                models_controller.move_to_folder(
                    entry_ids=[dataset_folder.id],
                    current_folder_id=None,
                    target_folder_id=parent_folder_id,
                )

            # Now we can move the dataset to the folder
            models_controller.move_to_folder([new_dataset.id], None,
                                             dataset_folder.id)

            # Register the permaname so pass 2 can attach datafiles to it.
            dict_permaname_datafile_ids[dataset_permaname] = []

            nb_dataset_created += 1

    # Pass 2: datafiles, grouped by the permaname of their dataset.
    with open(dataset_version_with_datafile_csv_path,
              newline="") as dataset_version_with_datafile_csv:
        print("")
        print("Creating the datafiles")
        reader = csv.DictReader(dataset_version_with_datafile_csv)

        for row in reader:
            if not row["permaname"]:
                print("We found an empty permaname entry: {}. Skipping it.".
                      format(row))
                nb_row_datafile_skipped += 1
                nb_datafile_skipped += 1
                continue

            # We first create the datafiles
            datafile_type = row["type"]
            datafile_name = row.get("name", "data")
            datafile_s3_location = urlparse(row["s3_location"])
            datafile_short_summary = row["short_desc"]
            datafile_long_summary = row.get("long_desc", "")
            datafile_id = row["id"]
            datafile_creation_date = row["created_timestamp"]
            datafile_version = row["version"]
            datafile_created_by = row["created_by"]

            dataset_permaname = row["permaname"]

            # s3://taiga2/imported/4bb2169e-5b87-4d1c-a78e-3e6006316561.hdf5
            datafile_s3_bucket = datafile_s3_location.netloc
            # Drop the leading '/' of the URL path to obtain the key.
            datafile_s3_key = datafile_s3_location.path[1:]

            # Set the user to the one in the row to make the manipulations under his name
            try:
                current_user = models_controller.get_user_by_email(
                    datafile_created_by)
            except NoResultFound:
                print(
                    "Warning: The user email found in 'created_by' column ({}) was not found in the dataset side. "
                    "Creating one.".format(datafile_created_by))
                # See pass 1 for why partition() is used instead of find().
                datafile_created_by_name = datafile_created_by.partition(
                    "@")[0]
                current_user = models_controller.add_user(
                    name=datafile_created_by_name, email=datafile_created_by)
                nb_user_created += 1

            flask.g.current_user = current_user

            # TODO: We should not create the datafile if it already exists: ie s3_bucket/s3_key exists
            # NOTE(review): the datafile is added before the permaname check
            # below, so a datafile counted as skipped has still been created.
            new_datafile = models_controller.add_s3_datafile(
                s3_bucket=datafile_s3_bucket,
                s3_key=datafile_s3_key,
                name=datafile_name,
                type=datafile_type,
                short_summary=datafile_short_summary,
                long_summary=datafile_long_summary,
            )

            # We register the datafile with its permaname dataset to later create the dataset version
            # with all the datafiles
            if dataset_permaname not in dict_permaname_datafile_ids:
                print(
                    "Warning: We found a datafile ({}) without a matching dataset ({}). Skipping it."
                    .format(datafile_id, dataset_permaname))
                nb_datafile_skipped += 1
                continue

            datafile_info = DataFileInfo(
                id=datafile_id,
                datafile=new_datafile,
                version=datafile_version,
                creation_date=datafile_creation_date,
                owner_email=datafile_created_by,
            )
            dict_permaname_datafile_ids[dataset_permaname].append(
                datafile_info)

            nb_datafile_created += 1

    # Pass 3: one dataset version per registered datafile, reusing the
    # original id and creation date.
    print("")
    print("Linking the datafiles with the datasets")
    for (dataset_permaname,
         array_data_file_info) in dict_permaname_datafile_ids.items():
        dataset = models_controller.get_dataset_from_permaname(
            dataset_permaname)

        for datafile_info in array_data_file_info:
            flask.g.current_user = models_controller.get_user_by_email(
                datafile_info.owner_email)
            # TODO: We should not create the dataset_version if it already exists. ie version already exists for this dataset
            models_controller.add_dataset_version(
                dataset_id=dataset.id,
                datafiles_ids=[datafile_info.datafile.id],
                anterior_creation_date=datafile_info.creation_date,
                forced_id=datafile_info.id,
            )

            # The dataset takes the creation date of its version 1.
            if int(datafile_info.version) == 1:
                models_controller.update_dataset_creation_date(
                    dataset_id=dataset.id,
                    new_date=datafile_info.creation_date)

            # Count every version actually created (this used to be counted
            # once per dataset, whatever the number of versions).
            nb_dataset_version_created += 1

    print("")
    print("Done! Here is the summary:")
    print("\tLines skipped in dataset file: {}".format(nb_row_dataset_skipped))
    print(
        "\tLines skipped in datafile file: {}".format(nb_row_datafile_skipped))
    print("")
    print("\tDatasets created: {}".format(nb_dataset_created))
    print("\tUsers created: {}".format(nb_user_created))
    print("\tUsers skipped: {}".format(nb_user_skipped))
    print("")
    print("\tDatafiles created: {}".format(nb_datafile_created))
    print("\tDatafiles skipped: {}".format(nb_datafile_skipped))
    print("")
    print("\tDatasetVersions created: {}".format(nb_dataset_version_created))
    print("\tDatasetVersions skipped and datasets cleaned: {}".format(
        nb_dataset_version_skipped))
def create_db_and_populate():
    """Create the database and fill it with a fixed set of sample objects.

    After ``create_db()`` this adds, in order:

    * an "admin" user, appended to the "Admin" group and installed as
      ``flask.g.current_user``;
    * an "origin" dataset in the admin's home folder, built from an upload
      session holding one Raw file;
    * "Folder A" in the admin's home folder and "Folder B" inside Folder A;
    * a "Data" dataset inside Folder B, built the same way as "origin";
    * three further versions of the "origin" dataset, each holding newly
      added datafiles derived from those of the previous round;
    * a sample dataset via ``create_sample_dataset``.
    """
    create_db()

    admin_group = models_controller.get_group_by_name("Admin")

    # Create the Admin user
    admin_user = models_controller.add_user(
        name="admin", email="*****@*****.**", token="test-token"
    )
    admin_group.users.append(admin_user)
    home_folder_admin = admin_user.home_folder

    # Setting up the flask user
    flask.g.current_user = admin_user

    # Create a session where all this is happening
    upload_session_origin = models_controller.add_new_upload_session()

    # Create the origin data
    models_controller.add_upload_session_s3_file(
        session_id=upload_session_origin.id,
        filename="origin",
        s3_bucket=bucket_name,
        initial_file_type=models.InitialFileType.Raw,
        initial_s3_key="x",
        encoding="UTF-8",
    )

    origin_dataset = models_controller.add_dataset_from_session(
        session_id=upload_session_origin.id,
        dataset_name="origin",
        dataset_description="No description",
        current_folder_id=home_folder_admin.id,
    )

    # Create the Folder A folder
    folderA = models_controller.add_folder(
        name="Folder A",
        folder_type=models.Folder.FolderType.folder,
        description="desc",
    )
    models_controller.add_folder_entry(
        folder_id=home_folder_admin.id, entry_id=folderA.id
    )

    # Create Folder B inside Folder A
    folderB = models_controller.add_folder(
        name="Folder B",
        folder_type=models.Folder.FolderType.folder,
        description="",
    )
    models_controller.add_folder_entry(
        folder_id=folderA.id, entry_id=folderB.id
    )

    # Create Data inside Folder B
    upload_session_data = models_controller.add_new_upload_session()
    models_controller.add_upload_session_s3_file(
        session_id=upload_session_data.id,
        filename="Data",
        s3_bucket=bucket_name,
        initial_file_type=models.InitialFileType.Raw,
        initial_s3_key="y",
        encoding="UTF-8",
    )

    data = models_controller.add_dataset_from_session(
        session_id=upload_session_data.id,
        dataset_name="Data",
        dataset_description="No description",
        current_folder_id=folderB.id,
    )

    data_datafiles = get_latest_version_datafiles_from_dataset(data.id)

    temp_data_datafiles = copy.copy(data_datafiles)

    # Add three more versions to the "origin" dataset. Every round creates a
    # new datafile for each datafile of the previous round, so the "v<i>"
    # suffixes accumulate in the names from one round to the next.
    # NOTE(review): the original guarded the datafile creation with
    # `if i >= 1`, which always holds for range(1, 4); if the first round was
    # meant to reuse the "Data" datafiles as they are, the guard should have
    # been `i > 1` -- confirm the intent.
    for i in range(1, 4):
        temp_data_datafiles = [
            models_controller.add_s3_datafile(
                name=datafile.name + "v" + str(i),
                s3_bucket=bucket_name,
                s3_key=models_controller.generate_convert_key(),
                compressed_s3_key=models_controller.generate_compressed_key(),
                type=datafile.format,
                encoding="UTF-8",
                short_summary="short summary",
                long_summary="long_summary",
            )
            for datafile in temp_data_datafiles
        ]
        datafiles_id = [datafile.id for datafile in temp_data_datafiles]
        models_controller.add_dataset_version(
            dataset_id=origin_dataset.id, datafiles_ids=datafiles_id
        )

    # create a sample dataset in a known location with a known permaname
    create_sample_dataset(forced_permaname="sample-1", folder_id="public")