def citrine_upload(citrine_data, api_key, mdf_dataset, previous_id=None, public=True):
    """Upload PIF files to Citrination.

    Arguments:
    citrine_data (str): Path to a directory of PIF files to upload.
    api_key (str): A Citrination API key.
    mdf_dataset (dict): The dataset's MDF metadata, used for the Citrine
                        dataset's title and description.
    previous_id (int): The ID of an existing Citrine dataset to revise.
                       Default None, which creates a new dataset.
    public (bool): Whether to make the Citrine dataset public. Default True.

    Returns:
    dict: Success flag, the Citrine dataset ID, and upload counts.
    """
    import os
    from citrination_client import CitrinationClient
    cit_client = CitrinationClient(api_key).data

    source_id = mdf_dataset.get("mdf", {}).get("source_id", "NO_ID")
    try:
        cit_title = mdf_dataset["dc"]["titles"][0]["title"]
    except (KeyError, IndexError, TypeError):
        cit_title = "Untitled"
    try:
        cit_desc = " ".join([desc["description"]
                             for desc in mdf_dataset["dc"]["descriptions"]])
        if not cit_desc:
            raise KeyError
    except (KeyError, IndexError, TypeError):
        cit_desc = None

    # Create new version if dataset previously created
    if previous_id:
        try:
            rev_res = cit_client.create_dataset_version(previous_id)
            assert rev_res.number > 1
        except Exception:
            previous_id = "INVALID"
        else:
            cit_ds_id = previous_id
            cit_client.update_dataset(cit_ds_id,
                                      name=cit_title,
                                      description=cit_desc,
                                      public=False)
    # Create new dataset if not created
    if not previous_id or previous_id == "INVALID":
        try:
            cit_ds_id = cit_client.create_dataset(name=cit_title,
                                                  description=cit_desc,
                                                  public=False).id
            assert cit_ds_id > 0
        except Exception as e:
            print("{}: Citrine dataset creation failed: {}".format(source_id, repr(e)))
            if previous_id == "INVALID":
                return {
                    "success": False,
                    "error": "Unable to create revision or new dataset in Citrine"
                }
            else:
                return {
                    "success": False,
                    "error": "Unable to create Citrine dataset, possibly due to duplicate entry"
                }

    success = 0
    failed = 0
    for path, _, files in os.walk(os.path.abspath(citrine_data)):
        for pif in files:
            up_res = cit_client.upload(cit_ds_id, os.path.join(path, pif))
            if up_res.successful():
                success += 1
            else:
                print("{}: Citrine upload failure: {}".format(source_id, str(up_res)))
                failed += 1
    cit_client.update_dataset(cit_ds_id, public=public)

    return {
        "success": bool(success),
        "cit_ds_id": cit_ds_id,
        "success_count": success,
        "failure_count": failed
    }
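# Example usage of citrine_upload (a minimal, hypothetical sketch -- the API
# key, metadata, and directory path below are placeholder values, not values
# from this project):
#
#     result = citrine_upload(
#         citrine_data="/path/to/pif_dir",   # directory of PIF files
#         api_key="YOUR_CITRINATION_API_KEY",
#         mdf_dataset={"mdf": {"source_id": "example_source"},
#                      "dc": {"titles": [{"title": "Example Dataset"}]}},
#         previous_id=None,    # or an existing Citrine dataset ID to revise
#         public=False)
#     if result["success"]:
#         print(result["success_count"], "PIF(s) uploaded to dataset",
#               result["cit_ds_id"])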
def begin_convert(mdf_dataset, status_id):
    """Pull, back up, and convert metadata.

    Arguments:
    mdf_dataset (dict): The dataset's MDF submission, including the "data",
                        "index", and "services" blocks.
    status_id (str): The ID of this submission's status record.

    Returns:
    dict: Success flag and the status ID.
    """
    # NOTE: Relies on module-level imports and helpers defined elsewhere in
    # this file (os, json, requests, shutil, tempfile, app, toolbox,
    # omniparser, IngesterManager, CitrinationClient, cit_utils, group_files,
    # get_file_metadata, download_and_backup, pif_to_feedstock, add_dc,
    # pif_dump, globus_publish_data).
    # Setup
    creds = {
        "app_name": "MDF Open Connect",
        "client_id": app.config["API_CLIENT_ID"],
        "client_secret": app.config["API_CLIENT_SECRET"],
        "services": ["transfer", "publish"]
    }
    clients = toolbox.confidential_login(creds)
    mdf_transfer_client = clients["transfer"]
    globus_publish_client = clients["publish"]

    # Download data locally, back up on MDF resources
    dl_res = download_and_backup(mdf_transfer_client,
                                 mdf_dataset.pop("data", {}),
                                 status_id)
    if dl_res["success"]:
        local_path = dl_res["local_path"]
        backup_path = dl_res["backup_path"]
    else:
        raise IOError("No data downloaded")
    # TODO: Update status - data downloaded
    print("DEBUG: Data downloaded")

    print("DEBUG: Conversions started")
    # Pop indexing args
    parse_params = mdf_dataset.pop("index", {})
    add_services = mdf_dataset.pop("services", [])

    # TODO: Stream data into files instead of holding feedstock in memory
    feedstock = [mdf_dataset]
    # tags = [sub["subject"] for sub in mdf_dataset.get("dc", {}).get("subjects", [])]
    # key_info = get_key_matches(tags or None)

    # List of all files, for bag
    all_files = []

    # Citrination setup
    cit_manager = IngesterManager()
    cit_client = CitrinationClient(app.config["CITRINATION_API_KEY"])
    # Get title and description
    try:
        cit_title = mdf_dataset["dc"]["titles"][0]["title"]
    except (KeyError, IndexError):
        cit_title = "Untitled"
    try:
        cit_desc = " ".join([desc["description"]
                             for desc in mdf_dataset["dc"]["descriptions"]])
        if not cit_desc:
            raise KeyError
    except (KeyError, IndexError):
        cit_desc = None
    cit_ds = cit_client.create_data_set(name=cit_title,
                                        description=cit_desc,
                                        share=0).json()
    cit_ds_id = cit_ds["id"]
    print("DEBUG: Citrine dataset ID:", cit_ds_id)

    for path, _, files in os.walk(os.path.abspath(local_path)):
        # Separate files into groups, process each group as a unit
        for group in group_files(files):
            # Get all file metadata
            group_file_md = [get_file_metadata(
                                file_path=os.path.join(path, filename),
                                backup_path=os.path.join(
                                    backup_path,
                                    path.replace(os.path.abspath(local_path), ""),
                                    filename))
                             for filename in group]
            all_files.extend(group_file_md)

            group_paths = [os.path.join(path, filename) for filename in group]
            # MDF parsing
            mdf_res = omniparser.omniparse(group_paths, parse_params)

            # Citrine parsing
            cit_pifs = cit_manager.run_extensions(group_paths,
                                                  include=None, exclude=[],
                                                  args={"quality_report": False})
            if not isinstance(cit_pifs, list):
                cit_pifs = [cit_pifs]
            cit_full = []
            if len(cit_pifs) > 0:
                cit_res = []
                # Add UIDs
                cit_pifs = cit_utils.set_uids(cit_pifs)
                for pif in cit_pifs:
                    # Get PIF URL
                    pif_land_page = {
                        "mdf": {
                            "landing_page": cit_utils.get_url(pif, cit_ds_id)
                        }
                    } if cit_ds_id else {}
                    # Make PIF into feedstock and save
                    cit_res.append(toolbox.dict_merge(pif_to_feedstock(pif),
                                                      pif_land_page))
                    # Add DataCite metadata
                    pif = add_dc(pif, mdf_dataset.get("dc", {}))
                    cit_full.append(pif)
            else:  # No PIFs parsed
                # TODO: Send failed datatype to Citrine for logging
                # Pad cit_res to the same length as mdf_res for "merging"
                cit_res = [{} for _ in range(len(mdf_res))]

            # If the MDF parser failed to parse the group, pad mdf_res to match the PIF count
            if len(mdf_res) == 0:
                mdf_res = [{} for _ in range(len(cit_res))]

            # If only one MDF record was parsed, merge all PIFs into that record
            if len(mdf_res) == 1:
                merged_cit = {}
                for cr in cit_res:
                    merged_cit = toolbox.dict_merge(merged_cit, cr)
                mdf_records = [toolbox.dict_merge(mdf_res[0], merged_cit)]
            # If the same number of MDF records and Citrine PIFs were parsed, merge in order
            elif len(mdf_res) == len(cit_res):
                mdf_records = [toolbox.dict_merge(r_mdf, r_cit)
                               for r_mdf, r_cit in zip(mdf_res, cit_res)]
            # Otherwise, keep the MDF records only
            else:
                print("DEBUG: Record mismatch:\nMDF parsed", len(mdf_res), "records,",
                      "\nCitrine parsed", len(cit_res), "records,",
                      "\nPIFs discarded")
                # TODO: Update status/log - Citrine records discarded
                mdf_records = mdf_res

            # Filter null records, save the rest
            if not mdf_records:
                print("DEBUG: No MDF records in group:", group)
            for record in mdf_records:
                if record:
                    feedstock.append(toolbox.dict_merge(record,
                                                        {"files": group_file_md}))

            # Upload PIFs to Citrine
            for full_pif in cit_full:
                with tempfile.NamedTemporaryFile(mode="w+") as pif_file:
                    pif_dump(full_pif, pif_file)
                    pif_file.seek(0)
                    up_res = json.loads(cit_client.upload(cit_ds_id, pif_file.name))
                    if up_res["success"]:
                        print("DEBUG: Citrine upload success")
                    else:
                        print("DEBUG: Citrine upload failure, error",
                              up_res.get("status"))

    # TODO: Update status - indexing success
    print("DEBUG: Indexing success")

    # Pass feedstock to /ingest
    with tempfile.TemporaryFile(mode="w+") as stock:
        for entry in feedstock:
            json.dump(entry, stock)
            stock.write("\n")
        stock.seek(0)
        ingest_res = requests.post(app.config["INGEST_URL"],
                                   data={"status_id": status_id},
                                   files={"file": stock})
    if not ingest_res.json().get("success"):
        # TODO: Update status? Ingest failed
        # TODO: Fail everything, delete Citrine dataset, etc.
        raise ValueError("In convert - Ingest failed: " + str(ingest_res.json()))

    # Additional service integrations

    # Finalize Citrine dataset
    # TODO: Turn on public dataset ingest (share=1)
    if "citrine" in add_services:
        try:
            cit_client.update_data_set(cit_ds_id, share=0)
        except Exception as e:
            # TODO: Update status, notify Citrine - Citrine ds failure
            print("DEBUG: Citrination dataset not updated:", repr(e))

    # Globus Publish
    # TODO: Test after Publish API is fixed
    if "globus_publish" in add_services:
        try:
            fin_res = globus_publish_data(globus_publish_client,
                                          mdf_transfer_client,
                                          mdf_dataset, local_path)
        except Exception as e:
            # TODO: Update status - Publish failed
            print("Publish ERROR:", repr(e))
        else:
            # TODO: Update status - Publish success
            print("DEBUG: Publish success:", fin_res)

    # Remove local data
    shutil.rmtree(local_path)
    # TODO: Update status - everything done
    return {"success": True, "status_id": status_id}
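# Example invocation of begin_convert (a hypothetical sketch -- the submission
# below is a stub; a real submission would carry full "dc", "data", "index",
# and "services" blocks, and the Globus endpoint URL is a placeholder):
#
#     import uuid
#     status_id = str(uuid.uuid4())
#     submission = {
#         "dc": {"titles": [{"title": "Example Dataset"}]},
#         "data": {"globus": "globus://example-endpoint/example/path/"},
#         "index": {},
#         "services": ["citrine"]
#     }
#     result = begin_convert(submission, status_id)  # blocking; long-running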