def upload_dataset_version_to_figshare(figshareDatasetVersionLink):
    dataset_version_id = figshareDatasetVersionLink["dataset_version_id"]
    article_name = figshareDatasetVersionLink["article_name"]
    article_description = figshareDatasetVersionLink["article_description"]
    article_license = figshareDatasetVersionLink.get("license", 0)
    article_categories = figshareDatasetVersionLink.get("categories", None)
    article_keywords = figshareDatasetVersionLink.get("keywords", None)
    article_references = figshareDatasetVersionLink.get("references", None)
    files_to_upload = figshareDatasetVersionLink["files_to_upload"]

    token = _fetch_figshare_token()
    if token is None:
        flask.abort(401)

    dataset_version = models_controller.get_dataset_version(dataset_version_id)

    # Create the Figshare article linked to this dataset version.
    figshare_dataset_version_link = figshare.create_article(
        dataset_version_id,
        article_name,
        article_description,
        article_license,
        article_categories,
        article_keywords,
        article_references,
        token,
    )

    from taiga2.tasks import upload_datafile_to_figshare

    # Kick off one upload task per file; files that cannot be uploaded are annotated
    # with a failure_reason instead.
    for file_to_upload in files_to_upload:
        datafile = models_controller.get_datafile(file_to_upload["datafile_id"])
        if datafile.type == "gcs":
            file_to_upload["failure_reason"] = "Cannot upload GCS pointer files"
            continue
        elif datafile.type == "virtual":
            datafile = datafile.underlying_data_file

        if datafile.compressed_s3_key is None:
            file_to_upload[
                "failure_reason"
            ] = "Cannot upload files without compressed S3 file"
            continue

        task = upload_datafile_to_figshare.delay(
            figshare_dataset_version_link.figshare_article_id,
            figshare_dataset_version_link.id,
            file_to_upload["file_name"],
            file_to_upload["datafile_id"],
            datafile.compressed_s3_key,
            datafile.original_file_md5,
            token,
        )
        file_to_upload["task_id"] = task.id

    return flask.jsonify(
        {
            "article_id": figshare_dataset_version_link.figshare_article_id,
            "files": files_to_upload,
        }
    )
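
# Illustrative request payload for upload_dataset_version_to_figshare (the values are
# hypothetical; only the key names are taken from the lookups in the handler above).
_EXAMPLE_UPLOAD_TO_FIGSHARE_PAYLOAD = {
    "dataset_version_id": "dataset-version-id",
    "article_name": "Example article",
    "article_description": "Example description",
    "license": 0,
    "categories": None,
    "keywords": None,
    "references": None,
    "files_to_upload": [
        {"datafile_id": "datafile-id", "file_name": "data.csv"},
    ],
}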
def _download_and_compress_s3_backfill(datafile_id: str, s3_key: str, mime_type: str):
    # TODO: Add progress?
    s3 = aws.s3
    datafile = models_controller.get_datafile(datafile_id)
    compressed_s3_key = models_controller.generate_compressed_key() + "-backfilled"

    bucket_name = flask.current_app.config["S3_BUCKET"]
    s3_object = s3.Object(bucket_name, s3_key)
    compressed_s3_object = s3.Object(bucket_name, compressed_s3_key)

    with tempfile.NamedTemporaryFile() as download_dest:
        with tempfile.NamedTemporaryFile() as compressed_dest:
            s3_object.download_fileobj(download_dest)
            _compress_and_upload_to_s3(
                s3_object,
                download_dest,
                compressed_dest,
                compressed_s3_object,
                mime_type,
                None,
            )

    models_controller.update_datafile_compressed_key_and_column_types(
        datafile_id, compressed_s3_key, None
    )
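
# Illustrative sketch only (not the project's _compress_and_upload_to_s3): one way a
# compress-and-upload step like the one called above could look, assuming gzip
# compression and the boto3 S3 resource API. The function and parameter names here
# are assumptions for illustration.
def _example_gzip_and_upload(download_dest, compressed_dest, compressed_s3_object, mime_type):
    import gzip
    import shutil

    # Compress the downloaded payload into the second temp file.
    download_dest.seek(0)
    with gzip.GzipFile(fileobj=compressed_dest, mode="wb") as gz_out:
        shutil.copyfileobj(download_dest, gz_out)
    compressed_dest.flush()
    compressed_dest.seek(0)

    # Upload the gzipped payload, keeping the original MIME type and recording the
    # gzip encoding so clients can decompress transparently.
    compressed_s3_object.upload_fileobj(
        compressed_dest,
        ExtraArgs={"ContentType": mime_type, "ContentEncoding": "gzip"},
    )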
def copy_datafile_to_google_bucket(self, datafile_id: str, dest_bucket: str, dest_gcs_path: str):
    s3 = aws.s3
    datafile = models_controller.get_datafile(datafile_id)
    compressed_s3_object = s3.Object(datafile.s3_bucket, datafile.compressed_s3_key)

    with tempfile.NamedTemporaryFile() as download_dest:
        compressed_s3_object.download_fileobj(download_dest)
        download_dest.seek(0)
        upload_from_file(
            download_dest,
            dest_bucket,
            dest_gcs_path,
            compressed_s3_object.content_type,
            compressed_s3_object.content_encoding,
        )
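
# Illustrative sketch only (not the project's upload_from_file helper): what a GCS
# upload with the same call shape as above could look like using the
# google-cloud-storage client. The client setup and function name are assumptions.
def _example_upload_from_file(fileobj, dest_bucket, dest_gcs_path, content_type, content_encoding):
    from google.cloud import storage

    client = storage.Client()
    blob = client.bucket(dest_bucket).blob(dest_gcs_path)
    # Preserve the S3 object's encoding (e.g. gzip) so GCS serves the file correctly.
    if content_encoding is not None:
        blob.content_encoding = content_encoding
    blob.upload_from_file(fileobj, content_type=content_type)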
def get_provenance_graph(gid):
    print("We received the graph Id: {}!".format(gid))
    provenance_full_graph_schema = schemas.ProvenanceGraphFullSchema()
    graph = models_controller.get_provenance_graph_by_id(gid)
    json_graph_data = provenance_full_graph_schema.dump(graph).data

    # We also need the url, so we add it to the json
    for provenance_node in json_graph_data["provenance_nodes"]:
        try:
            datafile = models_controller.get_datafile(provenance_node["datafile_id"])
            provenance_node["url"] = datafile.dataset_version_id
        except NoResultFound:
            log.info(
                "The node {} with datafile_id {} has been ignored because no matching datafile was found".format(
                    provenance_node["node_id"], provenance_node["datafile_id"]
                )
            )

    return flask.jsonify(json_graph_data)
def backfill_compressed_file(self, datafile_id: str, cache_entry_id: Optional[str]):
    """
    Get the S3 key from the ConversionCache (if the DataFile is not Raw) or from the
    s3_key field, then compress, upload to S3, and update the DataFile's
    compressed_s3_key and column types.

    Args:
        datafile_id (str): ID for the (S3) DataFile to update
        cache_entry_id (Optional[str]): ID for the ConversionCache entry to use, or
            None if the DataFile is a Raw DataFile
    """
    if cache_entry_id is None:
        datafile = models_controller.get_datafile(datafile_id)
        s3_key = datafile.s3_key
        mime_type = "text/plain"
    else:
        s3_key = _get_s3_key_from_conversion_cache_url(cache_entry_id)
        mime_type = "text/csv"

    _download_and_compress_s3_backfill(datafile_id, s3_key, mime_type)
def _backfill_compressed_file(datafile_id: str, delay=True):
    datafile = models_controller.get_datafile(datafile_id)
    # Nothing to backfill for non-S3 files or files that are already compressed.
    if datafile.type != "s3" or datafile.compressed_s3_key is not None:
        return flask.make_response(flask.jsonify(None), 304)

    from taiga2.tasks import (
        convert_and_backfill_compressed_file,
        backfill_compressed_file,
    )

    if datafile.format == S3DataFile.DataFileFormat.Raw:
        task = backfill_compressed_file.delay(datafile_id, None)
    else:
        is_new, entry = models_controller.get_conversion_cache_entry(
            datafile.dataset_version.id, datafile.name, "csv"
        )
        if is_new:
            task = convert_and_backfill_compressed_file.delay(datafile_id, entry.id)
        else:
            task = backfill_compressed_file.delay(datafile_id, entry.id)

    return flask.make_response(flask.jsonify(task.id), 202)
def convert_and_backfill_compressed_file(self, datafile_id: str, cache_entry_id: str):
    """
    Convert an HDF5 or Columnar file to CSV, then compress, upload to S3, and update
    the DataFile's compressed_s3_key and column types.

    Args:
        datafile_id (str): ID for the (S3) DataFile to update
        cache_entry_id (str): ID for the ConversionCache entry to update
    """
    datafile = models_controller.get_datafile(datafile_id)
    _start_conversion_task(
        self,
        Progress(self),
        datafile.s3_bucket,
        datafile.s3_key,
        str(datafile.format),
        "csv",
        cache_entry_id,
    )

    s3_key = _get_s3_key_from_conversion_cache_url(cache_entry_id)
    _download_and_compress_s3_backfill(datafile_id, s3_key, "text/csv")
def update_figshare_article_with_dataset_version(figshareDatasetVersionLink):
    dataset_version_id = figshareDatasetVersionLink["dataset_version_id"]
    description = figshareDatasetVersionLink["description"]
    article_id = figshareDatasetVersionLink["article_id"]
    current_article_version = figshareDatasetVersionLink["current_article_version"]
    files_to_update = figshareDatasetVersionLink["files_to_update"]

    token = _fetch_figshare_token()
    if token is None:
        flask.abort(401)

    dataset_version = models_controller.get_dataset_version(dataset_version_id)
    figshare_dataset_version_link = models_controller.add_figshare_dataset_version_link(
        dataset_version.id, article_id, current_article_version + 1
    )

    from taiga2.tasks import upload_datafile_to_figshare

    try:
        for file_to_update in files_to_update:
            action = file_to_update["action"]
            if action == "Delete":
                figshare.delete_file(
                    article_id, file_to_update["figshare_file_id"], token
                )
            elif action == "Add":
                datafile_id = file_to_update["datafile_id"]
                if datafile_id is None:
                    file_to_update[
                        "failure_reason"
                    ] = "Cannot add or replace file without datafile ID"
                    continue

                datafile = models_controller.get_datafile(datafile_id)
                if datafile.type == "gcs":
                    file_to_update["failure_reason"] = "Cannot upload GCS pointer files"
                    continue
                elif datafile.type == "virtual":
                    datafile = datafile.underlying_data_file

                if datafile.compressed_s3_key is None:
                    file_to_update[
                        "failure_reason"
                    ] = "Cannot upload files without compressed S3 file"
                    continue

                task = upload_datafile_to_figshare.delay(
                    figshare_dataset_version_link.figshare_article_id,
                    figshare_dataset_version_link.id,
                    file_to_update["file_name"],
                    datafile_id,
                    datafile.compressed_s3_key,
                    datafile.original_file_md5,
                    token,
                )
                file_to_update["task_id"] = task.id
            else:
                raise ValueError(f"Unrecognized action: {action}")

        r = figshare.update_article(article_id, description, token)

        return flask.jsonify(
            {
                "article_id": figshare_dataset_version_link.figshare_article_id,
                "files": files_to_update,
            }
        )
    except HTTPError as error:
        # Roll back the link (and its datafile links) if the Figshare update fails.
        models_controller.delete_figshare_dataset_version_and_datafiles(
            figshare_dataset_version_link.id
        )
        return flask.abort(error.code, error.reason)
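
# Illustrative request payload for update_figshare_article_with_dataset_version (the
# values are hypothetical; the key names and the "Delete"/"Add" actions come from the
# handler above).
_EXAMPLE_UPDATE_FIGSHARE_ARTICLE_PAYLOAD = {
    "dataset_version_id": "dataset-version-id",
    "description": "Updated description",
    "article_id": 12345,
    "current_article_version": 1,
    "files_to_update": [
        {"action": "Delete", "figshare_file_id": 67890},
        {"action": "Add", "datafile_id": "datafile-id", "file_name": "data.csv"},
    ],
}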