class CitrinationPlugin(PawsPlugin):
    """Citrination API client plugin for Paws.

    Wraps a CitrinationClient and implements the PawsPlugin abc interface.
    Startup requires the web address of a Citrination instance and a file
    containing a valid API key for that instance.
    """

    def __init__(self):
        input_names = ['address','api_key_file']
        super(CitrinationPlugin,self).__init__(input_names)
        self.input_doc['address'] = 'web address of citrination instance'
        self.input_doc['api_key_file'] = 'path to a file in the local filesystem containing a valid citrination api key'
        self.inputs['address'] = 'http://citrination.com'
        # Client is created lazily in start(); None until then.
        self.ctn_client = None
        # Maps pif uid -> 1 (uploaded) or -1 (failed); filled by ship_dataset().
        self.return_codes = {}

    def start(self):
        """Read the API key from file and connect the Citrination client."""
        self.address = self.inputs['address']
        # Context manager guarantees the key file is closed even on error.
        with open(self.inputs['api_key_file'], 'r') as f:
            self.api_key = str(f.readline()).strip()
        self.ctn_client = CitrinationClient(api_key=self.api_key, site=self.address)

    def stop(self):
        """No persistent resources to release; the client needs no shutdown."""
        pass

    def content(self):
        """Expose the client and the plugin inputs to the rest of Paws."""
        return {'client':self.ctn_client,'inputs':self.inputs}

    def description(self):
        """Return a human-readable description of this plugin."""
        desc = str('Citrination API Client Plugin for Paws: '
            + 'This is a container for the Citrination Client module. '
            + 'The Citrination Client connects to a Citrination instance '
            + 'and exposes some parts of the Citrination API. '
            + 'Startup requires the web address of a Citrination instance '
            + 'and an API key that provides access to that instance.')
        return desc

    def ship_dataset(self, pifs):
        """Create a new Citrination data set and upload one json per pif.

        Parameters
        ----------
        pifs : iterable of pif records, each with a `uid` attribute.

        Records per-pif outcomes in self.return_codes:
        1 for a successful upload, -1 for any failure.
        """
        # Create the data set
        response = self.ctn_client.create_data_set()
        dsid = response.json()['id']
        # TODO: Note that the entire data set can be one json,
        # of an array of pif records, and this will lead to a faster upload.
        for p in pifs:
            try:
                json_file = pawstools.scratchdir + '/' + p.uid + '.json'
                # Use a context manager so the dump file handle is not leaked.
                with open(json_file, 'w') as f:
                    pif.dump(p, f)
                # BUG FIX: original called undefined name `cl`;
                # the client lives on self.ctn_client.
                self.ctn_client.upload_file(json_file, data_set_id=dsid)
                self.return_codes[p.uid] = 1
                # delete dataset json
                os.remove(json_file)
            except Exception:
                # Narrowed from a bare `except:` so KeyboardInterrupt and
                # SystemExit still propagate.
                # TODO: Pass along some return code from the server?
                self.return_codes[p.uid] = -1
def test_upload_pif():
    """Round-trip a minimal pif record through the Citrination upload API.

    Reads CITRINATION_API_KEY and CITRINATION_SITE from the environment,
    creates a private tutorial dataset, uploads one record, and asserts
    that the server reports a completed upload.
    """
    client = CitrinationClient(environ['CITRINATION_API_KEY'], environ['CITRINATION_SITE'])
    dataset = loads(client.create_data_set(name="Tutorial dataset",
                                           description="Dataset for tutorial",
                                           share=0).content.decode('utf-8'))['id']
    pif = System()
    pif.id = 0
    # BUG FIX: original wrote tmp.json into the CWD and never removed it,
    # polluting the working directory and racing with parallel test runs.
    # A TemporaryDirectory is cleaned up automatically.
    from tempfile import TemporaryDirectory
    from os.path import join
    with TemporaryDirectory() as tmpdir:
        tempname = join(tmpdir, "pif.json")
        with open(tempname, "w") as fp:
            dump(pif, fp)
        response = loads(client.upload_file(tempname, dataset))
    assert response["message"] == "Upload is complete."
def test_upload_pif():
    """Create a dataset on the staging site, upload one pif, verify the reply."""
    client = CitrinationClient(environ['CITRINATION_API_KEY'],
                               'https://stage.citrination.com')
    creation = client.create_data_set(name="Tutorial dataset",
                                      description="Dataset for tutorial",
                                      share=0)
    dataset = loads(creation.content.decode('utf-8'))['id']
    record = System()
    record.id = 0
    # Serialize the record into a throwaway directory, then hand it to the client.
    with TemporaryDirectory() as scratch:
        pif_path = join(scratch, "pif.json")
        with open(pif_path, "w") as handle:
            dump(record, handle)
        reply = loads(client.upload_file(pif_path, dataset))
    assert reply["message"] == "Upload is complete."
def test_upload_pif():
    """Upload a single pif record and assert the server accepts it.

    Uses CITRINATION_API_KEY and CITRINATION_SITE from the environment.
    """
    client = CitrinationClient(environ['CITRINATION_API_KEY'], environ['CITRINATION_SITE'])
    dataset = loads(
        client.create_data_set(name="Tutorial dataset",
                               description="Dataset for tutorial",
                               share=0).content.decode('utf-8'))['id']
    pif = System()
    pif.id = 0
    # BUG FIX: original left tmp.json behind in the CWD after every run and
    # could collide with concurrent tests; write into an auto-cleaned
    # temporary directory instead.
    from tempfile import TemporaryDirectory
    from os.path import join
    with TemporaryDirectory() as tmpdir:
        tempname = join(tmpdir, "pif.json")
        with open(tempname, "w") as fp:
            dump(pif, fp)
        response = loads(client.upload_file(tempname, dataset))
    assert response["message"] == "Upload is complete."
def begin_convert(mdf_dataset, status_id):
    """Pull, back up, and convert metadata.

    Downloads the dataset's data, parses it into MDF feedstock and Citrine
    PIFs, uploads PIFs to a new Citrination dataset, POSTs the feedstock to
    the ingest service, and optionally finalizes Citrine / Globus Publish
    integrations.

    NOTE(review): mdf_dataset is mutated in place (several keys are popped);
    status_id is forwarded to downstream services. Assumes mdf_dataset is a
    dict following the MDF metadata schema — TODO confirm against caller.
    """
    # Setup: acquire authenticated Globus transfer and publish clients.
    creds = {
        "app_name": "MDF Open Connect",
        "client_id": app.config["API_CLIENT_ID"],
        "client_secret": app.config["API_CLIENT_SECRET"],
        "services": ["transfer", "publish"]
    }
    clients = toolbox.confidential_login(creds)
    mdf_transfer_client = clients["transfer"]
    globus_publish_client = clients["publish"]

    # Download data locally, back up on MDF resources.
    # "data" is popped so it does not end up in the feedstock record below.
    dl_res = download_and_backup(mdf_transfer_client,
                                 mdf_dataset.pop("data", {}),
                                 status_id)
    if dl_res["success"]:
        local_path = dl_res["local_path"]
        backup_path = dl_res["backup_path"]
    else:
        raise IOError("No data downloaded")
    # TODO: Update status - data downloaded
    print("DEBUG: Data downloaded")
    print("DEBUG: Conversions started")

    # Pop indexing args (kept out of the dataset entry itself).
    parse_params = mdf_dataset.pop("index", {})
    add_services = mdf_dataset.pop("services", [])

    # TODO: Stream data into files instead of holding feedstock in memory
    # The dataset entry is always the first feedstock record.
    feedstock = [mdf_dataset]
    # tags = [sub["subject"] for sub in mdf_dataset.get("dc", {}).get("subjects", [])]
    # key_info = get_key_matches(tags or None)

    # List of all files, for bag
    all_files = []

    # Citrination setup
    cit_manager = IngesterManager()
    cit_client = CitrinationClient(app.config["CITRINATION_API_KEY"])

    # Get title and description for the Citrine dataset, with fallbacks
    # when the DataCite block is missing or incomplete.
    try:
        cit_title = mdf_dataset["dc"]["titles"][0]["title"]
    except (KeyError, IndexError):
        cit_title = "Untitled"
    try:
        cit_desc = " ".join([desc["description"]
                             for desc in mdf_dataset["dc"]["descriptions"]])
        if not cit_desc:
            # Empty description: reuse the KeyError path to fall back to None.
            raise KeyError
    except (KeyError, IndexError):
        cit_desc = None

    # Create the (private, share=0) Citrine dataset up front so each PIF
    # can link to its landing page.
    cit_ds = cit_client.create_data_set(name=cit_title,
                                        description=cit_desc,
                                        share=0).json()
    cit_ds_id = cit_ds["id"]
    print("DEBUG: Citrine dataset ID:", cit_ds_id)

    for path, dirs, files in os.walk(os.path.abspath(local_path)):
        # Separate files into groups, process group as unit
        for group in group_files(files):
            # Get all file metadata; backup_path mirrors the local tree
            # with the local prefix stripped.
            group_file_md = [
                get_file_metadata(
                    file_path=os.path.join(path, filename),
                    backup_path=os.path.join(
                        backup_path,
                        path.replace(os.path.abspath(local_path), ""),
                        filename))
                for filename in group
            ]
            all_files.extend(group_file_md)
            group_paths = [os.path.join(path, filename) for filename in group]

            # MDF parsing
            mdf_res = omniparser.omniparse(group_paths, parse_params)

            # Citrine parsing (quality report disabled for speed).
            cit_pifs = cit_manager.run_extensions(
                group_paths, include=None, exclude=[],
                args={"quality_report": False})
            # Normalize: a single PIF may come back bare, not in a list.
            if not isinstance(cit_pifs, list):
                cit_pifs = [cit_pifs]
            cit_full = []
            if len(cit_pifs) > 0:
                cit_res = []
                # Add UIDs
                cit_pifs = cit_utils.set_uids(cit_pifs)
                for pif in cit_pifs:
                    # Get PIF URL (only possible once the dataset exists).
                    pif_land_page = {
                        "mdf": {
                            "landing_page": cit_utils.get_url(pif, cit_ds_id)
                        }
                    } if cit_ds_id else {}
                    # Make PIF into feedstock and save
                    cit_res.append(
                        toolbox.dict_merge(pif_to_feedstock(pif),
                                           pif_land_page))
                    # Add DataCite metadata
                    pif = add_dc(pif, mdf_dataset.get("dc", {}))
                    cit_full.append(pif)
            else:  # No PIFs parsed
                # TODO: Send failed datatype to Citrine for logging
                # Pad cit_res to the same length as mdf_res for "merging"
                cit_res = [{} for i in range(len(mdf_res))]

            # If MDF parser failed to parse group, pad mdf_res to match PIF count
            if len(mdf_res) == 0:
                mdf_res = [{} for i in range(len(cit_res))]

            # If only one mdf record was parsed, merge all PIFs into that record
            if len(mdf_res) == 1:
                merged_cit = {}
                # NOTE(review): comprehension used for its side effect;
                # dict_merge presumably mutates merged_cit — confirm.
                [toolbox.dict_merge(merged_cit, cr) for cr in cit_res]
                mdf_records = [toolbox.dict_merge(mdf_res[0], merged_cit)]
            # If the same number of MDF records and Citrine PIFs were parsed, merge in order
            elif len(mdf_res) == len(cit_res):
                mdf_records = [
                    toolbox.dict_merge(r_mdf, r_cit)
                    for r_mdf, r_cit in zip(mdf_res, cit_res)
                ]
            # Otherwise, keep the MDF records only
            else:
                print("DEBUG: Record mismatch:\nMDF parsed", len(mdf_res),
                      "records",
                      "\nCitrine parsed", len(cit_res), "records"
                      "\nPIFs discarded")
                # TODO: Update status/log - Citrine records discarded
                mdf_records = mdf_res

            # Filter null records, save rest; each kept record is tagged
            # with the file metadata for its group.
            if not mdf_records:
                print("DEBUG: No MDF records in group:", group)
            [
                feedstock.append(
                    toolbox.dict_merge(record, {"files": group_file_md}))
                for record in mdf_records if record
            ]

            # Upload PIFs to Citrine, one temp file per PIF.
            for full_pif in cit_full:
                with tempfile.NamedTemporaryFile(mode="w+") as pif_file:
                    pif_dump(full_pif, pif_file)
                    # Rewind so the upload reads from the start of the file.
                    pif_file.seek(0)
                    up_res = json.loads(
                        cit_client.upload(cit_ds_id, pif_file.name))
                    if up_res["success"]:
                        print("DEBUG: Citrine upload success")
                    else:
                        print("DEBUG: Citrine upload failure, error",
                              up_res.get("status"))

    # TODO: Update status - indexing success
    print("DEBUG: Indexing success")

    # Pass feedstock to /ingest as newline-delimited JSON in a temp file.
    with tempfile.TemporaryFile(mode="w+") as stock:
        for entry in feedstock:
            json.dump(entry, stock)
            stock.write("\n")
        stock.seek(0)
        ingest_res = requests.post(app.config["INGEST_URL"],
                                   data={"status_id": status_id},
                                   files={'file': stock})
    if not ingest_res.json().get("success"):
        # TODO: Update status? Ingest failed
        # TODO: Fail everything, delete Citrine dataset, etc.
        raise ValueError("In convert - Ingest failed"
                         + str(ingest_res.json()))

    # Additional service integrations

    # Finalize Citrine dataset
    # TODO: Turn on public dataset ingest (share=1)
    if "citrine" in add_services:
        try:
            cit_client.update_data_set(cit_ds_id, share=0)
        except Exception as e:
            # TODO: Update status, notify Citrine - Citrine ds failure
            print("DEBUG: Citrination dataset not updated")

    # Globus Publish
    # TODO: Test after Publish API is fixed
    if "globus_publish" in add_services:
        try:
            fin_res = globus_publish_data(globus_publish_client,
                                          mdf_transfer_client,
                                          mdf_dataset, local_path)
        except Exception as e:
            # TODO: Update status - Publish failed
            print("Publish ERROR:", repr(e))
        else:
            # TODO: Update status - Publish success
            print("DEBUG: Publish success:", fin_res)

    # Remove local data now that everything is backed up and ingested.
    shutil.rmtree(local_path)
    # TODO: Update status - everything done
    return {"success": True, "status_id": status_id}