Code Example #1
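Endpoint handler that creates a new Figshare article for a Taiga dataset version and queues a Celery upload task for each eligible file; GCS pointer files and files without a compressed S3 copy are skipped and annotated with a failure_reason. (These snippets are excerpts from route and task modules and rely on module-level imports such as flask, tempfile, and taiga2's models_controller and figshare helpers, which are not shown.)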
def upload_dataset_version_to_figshare(figshareDatasetVersionLink):
    dataset_version_id = figshareDatasetVersionLink["dataset_version_id"]
    article_name = figshareDatasetVersionLink["article_name"]
    article_description = figshareDatasetVersionLink["article_description"]
    article_license = figshareDatasetVersionLink.get("license", 0)
    article_categories = figshareDatasetVersionLink.get("categories", None)
    article_keywords = figshareDatasetVersionLink.get("keywords", None)
    article_references = figshareDatasetVersionLink.get("references", None)

    files_to_upload = figshareDatasetVersionLink["files_to_upload"]

    token = _fetch_figshare_token()
    if token is None:
        flask.abort(401)

    dataset_version = models_controller.get_dataset_version(dataset_version_id)
    figshare_dataset_version_link = figshare.create_article(
        dataset_version_id,
        article_name,
        article_description,
        article_license,
        article_categories,
        article_keywords,
        article_references,
        token,
    )

    from taiga2.tasks import upload_datafile_to_figshare

    for file_to_upload in files_to_upload:
        datafile = models_controller.get_datafile(file_to_upload["datafile_id"])
        if datafile.type == "gcs":
            file_to_upload["failure_reason"] = "Cannot upload GCS pointer files"
            continue
        elif datafile.type == "virtual":
            datafile = datafile.underlying_data_file

        if datafile.compressed_s3_key is None:
            file_to_upload["failure_reason"] = (
                "Cannot upload files without compressed S3 file"
            )
            continue

        task = upload_datafile_to_figshare.delay(
            figshare_dataset_version_link.figshare_article_id,
            figshare_dataset_version_link.id,
            file_to_upload["file_name"],
            file_to_upload["datafile_id"],
            datafile.compressed_s3_key,
            datafile.original_file_md5,
            token,
        )

        file_to_upload["task_id"] = task.id

    return flask.jsonify({
        "article_id": figshare_dataset_version_link.figshare_article_id,
        "files": files_to_upload,
    })
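The upload work itself is done by a Celery task that is not shown here. The stub below is a hypothetical sketch that only reproduces the argument order implied by the .delay() call above; it is not taiga2's actual upload_datafile_to_figshare task, and the Celery app setup and the task body are assumptions.

from celery import Celery

celery = Celery("taiga2")  # assumption: broker/backend configuration omitted


@celery.task(bind=True)
def upload_datafile_to_figshare(
    self,
    figshare_article_id,
    figshare_dataset_version_link_id,
    file_name,
    datafile_id,
    compressed_s3_key,
    original_file_md5,
    token,
):
    # Hypothetical body: stream the compressed S3 object to the Figshare
    # file-upload API and record the resulting figshare_file_id on the link.
    ...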
Code Example #2
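Downloads a datafile's object from S3 into a temporary file, compresses it and uploads the result under a new key suffixed with "-backfilled", then stores the compressed key on the DataFile record.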
def _download_and_compress_s3_backfill(datafile_id: str, s3_key: str,
                                       mime_type: str):
    # TODO: Add progress?
    s3 = aws.s3
    datafile = models_controller.get_datafile(datafile_id)
    compressed_s3_key = models_controller.generate_compressed_key() + "-backfilled"

    bucket_name = flask.current_app.config["S3_BUCKET"]

    s3_object = s3.Object(bucket_name, s3_key)
    compressed_s3_object = s3.Object(bucket_name, compressed_s3_key)

    with tempfile.NamedTemporaryFile() as download_dest:
        with tempfile.NamedTemporaryFile() as compressed_dest:
            s3_object.download_fileobj(download_dest)
            _compress_and_upload_to_s3(
                s3_object,
                download_dest,
                compressed_dest,
                compressed_s3_object,
                mime_type,
                None,
            )

    models_controller.update_datafile_compressed_key_and_column_types(
        datafile_id, compressed_s3_key, None)
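_compress_and_upload_to_s3 is not shown in these excerpts. The sketch below is a guess at what such a helper could do, assuming gzip compression and boto3's Object.upload_fileobj; it is not taiga2's actual implementation, and the unused s3_object and progress parameters are kept only to mirror the call site above.

import gzip
import shutil


def _compress_and_upload_to_s3(
    s3_object, download_dest, compressed_dest, compressed_s3_object, mime_type, progress
):
    # Hypothetical sketch, not taiga2's actual helper: gzip the downloaded
    # temp file into the second temp file, then upload it to the destination
    # S3 object with metadata marking the gzip encoding.
    download_dest.seek(0)
    with gzip.GzipFile(fileobj=compressed_dest, mode="wb") as gz:
        shutil.copyfileobj(download_dest, gz)
    compressed_dest.flush()
    compressed_dest.seek(0)
    compressed_s3_object.upload_fileobj(
        compressed_dest,
        ExtraArgs={"ContentType": mime_type, "ContentEncoding": "gzip"},
    )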
Code Example #3
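Copies a datafile's compressed S3 object into a Google Cloud Storage bucket, carrying over the object's content type and content encoding.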
def copy_datafile_to_google_bucket(self, datafile_id: str, dest_bucket: str,
                                   dest_gcs_path: str):
    s3 = aws.s3
    datafile = models_controller.get_datafile(datafile_id)
    compressed_s3_object = s3.Object(datafile.s3_bucket,
                                     datafile.compressed_s3_key)
    with tempfile.NamedTemporaryFile() as download_dest:
        compressed_s3_object.download_fileobj(download_dest)
        download_dest.seek(0)
        upload_from_file(
            download_dest,
            dest_bucket,
            dest_gcs_path,
            compressed_s3_object.content_type,
            compressed_s3_object.content_encoding,
        )
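The GCS upload is delegated to an upload_from_file helper that is not shown. A plausible implementation with the google-cloud-storage client, assuming default application credentials, could look like this sketch; it is an assumption, not taiga2's actual helper.

from google.cloud import storage


def upload_from_file(fileobj, dest_bucket, dest_gcs_path, content_type, content_encoding):
    # Hypothetical sketch: stream the file-like object into a GCS blob,
    # carrying over the S3 object's content metadata.
    client = storage.Client()
    blob = client.bucket(dest_bucket).blob(dest_gcs_path)
    if content_encoding is not None:
        blob.content_encoding = content_encoding
    blob.upload_from_file(fileobj, content_type=content_type)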
Code Example #4
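Serializes a provenance graph to JSON and, for each node, adds a "url" field holding the dataset version ID of the matching datafile; nodes whose datafile cannot be found are logged and left without a URL.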
def get_provenance_graph(gid):
    print("We received the graph Id: {}!".format(gid))
    provenance_full_graph_schema = schemas.ProvenanceGraphFullSchema()

    graph = models_controller.get_provenance_graph_by_id(gid)

    json_graph_data = provenance_full_graph_schema.dump(graph).data
    # We also need the url, so we add this to the json
    for provenance_node in json_graph_data["provenance_nodes"]:
        try:
            datafile = models_controller.get_datafile(
                provenance_node["datafile_id"])
            provenance_node["url"] = datafile.dataset_version_id
        except NoResultFound:
            log.info(
                "The node {} with datafile_id {} was ignored because no matching datafile was found"
                .format(provenance_node["node_id"],
                        provenance_node["datafile_id"]))

    return flask.jsonify(json_graph_data)
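For reference, the per-node fields touched by this handler are sketched below. The values are invented and the full node payload comes from ProvenanceGraphFullSchema, so any other fields are omitted.

# Illustrative fragment of the JSON this endpoint returns (values invented).
example_graph_json = {
    "provenance_nodes": [
        {
            "node_id": "n1",
            "datafile_id": "df-123",
            # Added by the loop above: the dataset_version_id of the datafile.
            "url": "dv-456",
        },
        # A node whose datafile lookup raises NoResultFound keeps no "url" key.
    ],
}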
Code Example #5
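Celery task that picks the S3 key to compress, either directly from the DataFile (for Raw files) or from a ConversionCache entry, and hands it to _download_and_compress_s3_backfill (Code Example #2).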
def backfill_compressed_file(self, datafile_id: str,
                             cache_entry_id: Optional[str]):
    """
    Get the S3 key from ConversionCache (if DataFile is not Raw) or s3_key field, then
    compress, upload to S3, and update DataFile compressed_s3_key and column types
    
    Args:
        datafile_id (str): ID for the (S3) DataFile to update
        cache_entry_id (Optional[str]): ID for the ConversionCache entry to use, or
            None if the DataFile is a Raw DataFile
    """
    if cache_entry_id is None:
        datafile = models_controller.get_datafile(datafile_id)
        s3_key = datafile.s3_key
        mime_type = "text/plain"
    else:
        s3_key = _get_s3_key_from_conversion_cache_url(cache_entry_id)
        mime_type = "text/csv"

    _download_and_compress_s3_backfill(datafile_id, s3_key, mime_type)
Code Example #6
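Dispatcher that backfills a compressed copy for an S3 datafile: it returns 304 when there is nothing to do, queues convert_and_backfill_compressed_file when a fresh conversion cache entry is created, queues backfill_compressed_file otherwise, and returns the task ID with a 202 status.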
def _backfill_compressed_file(datafile_id: str, delay=True):
    datafile = models_controller.get_datafile(datafile_id)
    if datafile.type != "s3" or datafile.compressed_s3_key is not None:
        return flask.make_response(flask.jsonify(None), 304)

    from taiga2.tasks import (
        convert_and_backfill_compressed_file,
        backfill_compressed_file,
    )

    if datafile.format == S3DataFile.DataFileFormat.Raw:
        task = backfill_compressed_file.delay(datafile_id, None)
    else:
        is_new, entry = models_controller.get_conversion_cache_entry(
            datafile.dataset_version.id, datafile.name, "csv")

        if is_new:
            task = convert_and_backfill_compressed_file.delay(
                datafile_id, entry.id)
        else:
            task = backfill_compressed_file.delay(datafile_id, entry.id)

    return flask.make_response(flask.jsonify(task.id), 202)
Code Example #7
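Celery task that converts an HDF5 or Columnar datafile to CSV, then compresses the converted output and backfills the DataFile's compressed S3 key via _download_and_compress_s3_backfill.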
def convert_and_backfill_compressed_file(self, datafile_id: str,
                                         cache_entry_id: str):
    """
    Convert a HDF5 or Columnar file to CSV, then compress, upload to S3, and update
    DataFile compressed_s3_key and column types
    
    Args:
        datafile_id (str): ID for the (S3) DataFile to update
        cache_entry_id (str): ID for the ConversionCache entry to update
    """
    datafile = models_controller.get_datafile(datafile_id)
    _start_conversion_task(
        self,
        Progress(self),
        datafile.s3_bucket,
        datafile.s3_key,
        str(datafile.format),
        "csv",
        cache_entry_id,
    )
    s3_key = _get_s3_key_from_conversion_cache_url(cache_entry_id)

    _download_and_compress_s3_backfill(datafile_id, s3_key, "text/csv")
Code Example #8
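Endpoint handler that updates an existing Figshare article from a dataset version: it records a new Figshare link, deletes or uploads files according to each entry's action, updates the article description, and rolls the link back if a Figshare call fails.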
def update_figshare_article_with_dataset_version(figshareDatasetVersionLink):
    dataset_version_id = figshareDatasetVersionLink["dataset_version_id"]
    description = figshareDatasetVersionLink["description"]
    article_id = figshareDatasetVersionLink["article_id"]
    current_article_version = figshareDatasetVersionLink["current_article_version"]
    files_to_update = figshareDatasetVersionLink["files_to_update"]

    token = _fetch_figshare_token()
    if token is None:
        flask.abort(401)

    dataset_version = models_controller.get_dataset_version(dataset_version_id)
    figshare_dataset_version_link = models_controller.add_figshare_dataset_version_link(
        dataset_version.id, article_id, current_article_version + 1)

    from taiga2.tasks import upload_datafile_to_figshare

    try:
        for file_to_update in files_to_update:
            action = file_to_update["action"]

            if action == "Delete":
                figshare.delete_file(article_id,
                                     file_to_update["figshare_file_id"], token)
            elif action == "Add":
                datafile_id = file_to_update["datafile_id"]
                if datafile_id is None:
                    file_to_update["failure_reason"] = (
                        "Cannot add or replace file without datafile ID"
                    )
                    continue

                datafile = models_controller.get_datafile(datafile_id)

                if datafile.type == "gcs":
                    file_to_update["failure_reason"] = "Cannot upload GCS pointer files"
                    continue
                elif datafile.type == "virtual":
                    datafile = datafile.underlying_data_file

                if datafile.compressed_s3_key is None:
                    file_to_update["failure_reason"] = (
                        "Cannot upload files without compressed S3 file"
                    )
                    continue

                task = upload_datafile_to_figshare.delay(
                    figshare_dataset_version_link.figshare_article_id,
                    figshare_dataset_version_link.id,
                    file_to_update["file_name"],
                    datafile_id,
                    datafile.compressed_s3_key,
                    datafile.original_file_md5,
                    token,
                )

                file_to_update["task_id"] = task.id
            else:
                raise ValueError(f"Unrecognized action: {action}")

        r = figshare.update_article(article_id, description, token)

        return flask.jsonify({
            "article_id": figshare_dataset_version_link.figshare_article_id,
            "files": files_to_update,
        })
    except HTTPError as error:
        models_controller.delete_figshare_dataset_version_and_datafiles(
            figshare_dataset_version_link.id)
        return flask.abort(error.code, error.reason)