import os

from citrination_client import CitrinationClient
from pypif import pif

# relative imports assume this module lives in the paws plugins package
from .. import pawstools
from .PawsPlugin import PawsPlugin


class CitrinationPlugin(PawsPlugin):
    """
    Wrapper around a CitrinationClient,
    implementing the PawsPlugin abstract interface.
    """

    def __init__(self):
        input_names = ['address','api_key_file']
        super(CitrinationPlugin,self).__init__(input_names)
        self.input_doc['address'] = 'web address of citrination instance'
        self.input_doc['api_key_file'] = 'path to a file in the local filesystem containing a valid citrination api key'
        self.inputs['address'] = 'http://citrination.com' 
        self.ctn_client = None
        self.return_codes = {} 

    def start(self):
        self.address = self.inputs['address']
        with open(self.inputs['api_key_file'], 'r') as f:
            self.api_key = str(f.readline()).strip()
        self.ctn_client = CitrinationClient(api_key=self.api_key, site=self.address)

    def stop(self):
        pass

    def content(self): 
        return {'client':self.ctn_client,'inputs':self.inputs}

    def description(self):
        desc = str('Citrination API Client Plugin for Paws: '
            + 'This is a container for the Citrination Client module. '
            + 'The Citrination Client connects to a Citrination instance '
            + 'and exposes some parts of the Citrination API. '
            + 'Startup requires the web address of a Citrination instance '
            + 'and an API key that provides access to that instance.')
        return desc

    def ship_dataset(self,pifs):
        # Create the data set
        response = self.ctn_client.create_data_set()
        dsid = response.json()['id']
        # TODO: the entire data set can be uploaded as one json file
        # containing an array of pif records, which makes for a faster upload.
        for p in pifs:
            try:
                json_file = pawstools.scratchdir + '/' + p.uid + '.json'
                with open(json_file, 'w') as f:
                    pif.dump(p, f)
                self.ctn_client.upload_file(json_file, data_set_id=dsid)
                self.return_codes[p.uid] = 1
                # clean up the scratch json file after a successful upload
                os.remove(json_file)
            except Exception:
                # TODO: Pass along some return code from the server?
                self.return_codes[p.uid] = -1
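
A minimal usage sketch for the plugin above. The import path and the api key
file location are assumptions, not part of the source:

from paws.core.plugins.CitrinationPlugin import CitrinationPlugin  # assumed path

plugin = CitrinationPlugin()
plugin.inputs['address'] = 'https://citrination.com'
plugin.inputs['api_key_file'] = '/path/to/citrination_api_key.txt'  # hypothetical
plugin.start()

# content() exposes the wrapped client alongside the plugin inputs
client = plugin.content()['client']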

from json import loads
from os import environ
from citrination_client import CitrinationClient
from pypif.obj import System
from pypif.pif import dump


def test_upload_pif():
    client = CitrinationClient(environ['CITRINATION_API_KEY'],
                               environ['CITRINATION_SITE'])
    dataset = loads(client.create_data_set(name="Tutorial dataset",
                                           description="Dataset for tutorial",
                                           share=0).content.decode('utf-8'))['id']
    pif = System()
    pif.id = 0

    with open("tmp.json", "w") as fp:
        dump(pif, fp)
    response = loads(client.upload_file("tmp.json", dataset))
    assert response["message"] == "Upload is complete."

from json import loads
from os import environ
from os.path import join
from tempfile import TemporaryDirectory
from citrination_client import CitrinationClient
from pypif.obj import System
from pypif.pif import dump


def test_upload_pif():
    client = CitrinationClient(environ['CITRINATION_API_KEY'],
                               'https://stage.citrination.com')
    dataset = loads(client.create_data_set(name="Tutorial dataset",
                                           description="Dataset for tutorial",
                                           share=0).content.decode('utf-8'))['id']
    pif = System()
    pif.id = 0

    with TemporaryDirectory() as tmpdir:
        tempname = join(tmpdir, "pif.json")
        with open(tempname, "w") as fp:
            dump(pif, fp)
        response = loads(client.upload_file(tempname, dataset))
    assert response["message"] == "Upload is complete."
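
Unlike the first variant, this version writes the PIF into a TemporaryDirectory,
so the temporary json file is removed automatically instead of leaving tmp.json
behind in the working directory, and it targets the stage site through a
hardcoded URL rather than the CITRINATION_SITE environment variable.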

import json
import os
import shutil
import tempfile

import requests
from citrination_client import CitrinationClient
from pypif.pif import dump as pif_dump

# The project-local names used below (app, toolbox, omniparser, cit_utils,
# IngesterManager, download_and_backup, group_files, get_file_metadata,
# pif_to_feedstock, add_dc, globus_publish_data) are assumed to be importable
# from the surrounding MDF Connect service package.


def begin_convert(mdf_dataset, status_id):
    """Pull, back up, and convert metadata."""
    # Setup
    creds = {
        "app_name": "MDF Open Connect",
        "client_id": app.config["API_CLIENT_ID"],
        "client_secret": app.config["API_CLIENT_SECRET"],
        "services": ["transfer", "publish"]
    }
    clients = toolbox.confidential_login(creds)
    mdf_transfer_client = clients["transfer"]
    globus_publish_client = clients["publish"]

    # Download data locally, back up on MDF resources
    dl_res = download_and_backup(mdf_transfer_client,
                                 mdf_dataset.pop("data", {}), status_id)
    if dl_res["success"]:
        local_path = dl_res["local_path"]
        backup_path = dl_res["backup_path"]
    else:
        raise IOError("No data downloaded")
    # TODO: Update status - data downloaded
    print("DEBUG: Data downloaded")

    print("DEBUG: Conversions started")
    # Pop indexing args
    parse_params = mdf_dataset.pop("index", {})
    add_services = mdf_dataset.pop("services", [])

    # TODO: Stream data into files instead of holding feedstock in memory
    feedstock = [mdf_dataset]

    # tags = [sub["subject"] for sub in mdf_dataset.get("dc", {}).get("subjects", [])]
    # key_info = get_key_matches(tags or None)

    # List of all files, for bag
    all_files = []

    # Citrination setup
    cit_manager = IngesterManager()
    cit_client = CitrinationClient(app.config["CITRINATION_API_KEY"])
    # Get title and description
    try:
        cit_title = mdf_dataset["dc"]["titles"][0]["title"]
    except (KeyError, IndexError):
        cit_title = "Untitled"
    try:
        cit_desc = " ".join([
            desc["description"] for desc in mdf_dataset["dc"]["descriptions"]
        ])
        if not cit_desc:
            raise KeyError
    except (KeyError, IndexError):
        cit_desc = None
    cit_ds = cit_client.create_data_set(name=cit_title,
                                        description=cit_desc,
                                        share=0).json()
    cit_ds_id = cit_ds["id"]
    print("DEBUG: Citrine dataset ID:", cit_ds_id)

    for path, dirs, files in os.walk(os.path.abspath(local_path)):
        # Separate files into groups, process group as unit
        for group in group_files(files):
            # Get all file metadata
            group_file_md = [
                get_file_metadata(file_path=os.path.join(path, filename),
                                  backup_path=os.path.join(
                                      backup_path,
                                      path.replace(os.path.abspath(local_path),
                                                   ""), filename))
                for filename in group
            ]
            all_files.extend(group_file_md)

            group_paths = [os.path.join(path, filename) for filename in group]

            # MDF parsing
            mdf_res = omniparser.omniparse(group_paths, parse_params)

            # Citrine parsing
            cit_pifs = cit_manager.run_extensions(
                group_paths,
                include=None,
                exclude=[],
                args={"quality_report": False})
            if not isinstance(cit_pifs, list):
                cit_pifs = [cit_pifs]
            cit_full = []
            if len(cit_pifs) > 0:
                cit_res = []
                # Add UIDs
                cit_pifs = cit_utils.set_uids(cit_pifs)
                for pif in cit_pifs:
                    # Get PIF URL
                    pif_land_page = {
                        "mdf": {
                            "landing_page": cit_utils.get_url(pif, cit_ds_id)
                        }
                    } if cit_ds_id else {}
                    # Make PIF into feedstock and save
                    cit_res.append(
                        toolbox.dict_merge(pif_to_feedstock(pif),
                                           pif_land_page))
                    # Add DataCite metadata
                    pif = add_dc(pif, mdf_dataset.get("dc", {}))

                    cit_full.append(pif)
            else:  # No PIFs parsed
                # TODO: Send failed datatype to Citrine for logging
                # Pad cit_res to the same length as mdf_res for "merging"
                cit_res = [{} for _ in range(len(mdf_res))]

            # If MDF parser failed to parse group, pad mdf_res to match PIF count
            if len(mdf_res) == 0:
                mdf_res = [{} for _ in range(len(cit_res))]

            # If only one mdf record was parsed, merge all PIFs into that record
            if len(mdf_res) == 1:
                merged_cit = {}
                for cit_record in cit_res:
                    toolbox.dict_merge(merged_cit, cit_record)
                mdf_records = [toolbox.dict_merge(mdf_res[0], merged_cit)]
            # If the same number of MDF records and Citrine PIFs were parsed, merge in order
            elif len(mdf_res) == len(cit_res):
                mdf_records = [
                    toolbox.dict_merge(r_mdf, r_cit)
                    for r_mdf, r_cit in zip(mdf_res, cit_res)
                ]
            # Otherwise, keep the MDF records only
            else:
                print("DEBUG: Record mismatch:\nMDF parsed", len(mdf_res),
                      "records", "\nCitrine parsed", len(cit_res), "records"
                      "\nPIFs discarded")
                # TODO: Update status/log - Citrine records discarded
                mdf_records = mdf_res

            # Filter null records, save rest
            if not mdf_records:
                print("DEBUG: No MDF records in group:", group)
            for record in mdf_records:
                if record:
                    feedstock.append(
                        toolbox.dict_merge(record, {"files": group_file_md}))

            # Upload PIFs to Citrine
            for full_pif in cit_full:
                with tempfile.NamedTemporaryFile(mode="w+") as pif_file:
                    pif_dump(full_pif, pif_file)
                    pif_file.seek(0)
                    up_res = json.loads(
                        cit_client.upload(cit_ds_id, pif_file.name))
                    if up_res["success"]:
                        print("DEBUG: Citrine upload success")
                    else:
                        print("DEBUG: Citrine upload failure, error",
                              up_res.get("status"))

    # TODO: Update status - indexing success
    print("DEBUG: Indexing success")

    # Pass feedstock to /ingest
    with tempfile.TemporaryFile(mode="w+") as stock:
        for entry in feedstock:
            json.dump(entry, stock)
            stock.write("\n")
        stock.seek(0)
        ingest_res = requests.post(app.config["INGEST_URL"],
                                   data={"status_id": status_id},
                                   files={'file': stock})
    if not ingest_res.json().get("success"):
        # TODO: Update status? Ingest failed
        # TODO: Fail everything, delete Citrine dataset, etc.
        raise ValueError("In convert - Ingest failed" + str(ingest_res.json()))

    # Additional service integrations

    # Finalize Citrine dataset
    # TODO: Turn on public dataset ingest (share=1)
    if "citrine" in add_services:
        try:
            cit_client.update_data_set(cit_ds_id, share=0)
        except Exception as e:
            # TODO: Update status, notify Citrine - Citrine ds failure
            print("DEBUG: Citrination dataset not updated:", repr(e))

    # Globus Publish
    # TODO: Test after Publish API is fixed
    if "globus_publish" in add_services:
        try:
            fin_res = globus_publish_data(globus_publish_client,
                                          mdf_transfer_client, mdf_dataset,
                                          local_path)
        except Exception as e:
            # TODO: Update status - Publish failed
            print("Publish ERROR:", repr(e))
        else:
            # TODO: Update status - Publish success
            print("DEBUG: Publish success:", fin_res)

    # Remove local data
    shutil.rmtree(local_path)
    # TODO: Update status - everything done
    return {"success": True, "status_id": status_id}