def extract_pif(group, params=None):
    """Extract PIFs from a group of files using Citrine's extractors,
    and translate them into MDF feedstock records."""
    if not params:
        return []

    # Setup
    dc_md = params["dataset"]["dc"]
    cit_path = os.path.join(params["service_data"], "citrine")
    os.makedirs(cit_path, exist_ok=True)
    cit_manager = IngesterManager()
    mdf_records = []

    try:
        raw_pifs = cit_manager.run_extensions(group,
                                              include=params.get("include"),
                                              exclude=[])
    except Exception as e:
        logger.debug("Citrine pif-ingestor raised exception: " + repr(e))
        return []
    if not raw_pifs:
        return []
    elif isinstance(raw_pifs, System):
        raw_pifs = [raw_pifs]
    elif not isinstance(raw_pifs, list):
        raw_pifs = list(raw_pifs)
    id_pifs = cit_utils.set_uids(raw_pifs)

    for pif in id_pifs:
        try:
            pif_feed = pif_to_feedstock(pif)
        except Exception as e:
            logger.warning("PIF to feedstock failed: " + repr(e))
            raise
        try:
            mdf_pif = _translate_pif(pif_feed)
        except Exception as e:
            logger.warning("_translate_pif failed: " + repr(e))
            raise
        if mdf_pif:
            mdf_records.append(mdf_pif)

        pif_name = (pif.uid or str(ObjectId())) + ".pif"
        pif_path = os.path.join(cit_path, pif_name)
        try:
            with open(pif_path, 'w') as pif_file:
                pif_dump(add_dc(pif, dc_md), pif_file)
        except Exception as e:
            logger.warning("Could not save PIF: {}".format(repr(e)))
            try:
                os.remove(pif_path)
            except FileNotFoundError:
                pass

    return mdf_records
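
A minimal usage sketch (not from the source): it assumes this module is importable and that params carries the keys extract_pif actually reads, namely params["dataset"]["dc"], params["service_data"], and an optional params["include"] filter for the extractors. The paths below are hypothetical.

# Hypothetical invocation of extract_pif
params = {
    "dataset": {"dc": {"titles": [{"title": "Example dataset"}]}},
    "service_data": "/tmp/service_data",  # PIFs are saved under <service_data>/citrine
    "include": None  # passed to run_extensions to limit which extractors run
}
records = extract_pif(["/tmp/data/sample.cif"], params)
print(len(records), "MDF records extracted")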
Example #2
def test_set_uids():
    pifs = [System(names=["foo"]), System(names=["bar"])]
    set_uids(pifs)
    for pif in pifs:
        assert pif.uid
    assert len({p.uid for p in pifs}) == 2
Example #3
def test_explicit_uids():
    pifs = [System(names=["foo"]), System(names=["bar"])]
    uids = ["spam", "eggs"]
    set_uids(pifs, uids)
    assert [p.uid for p in pifs] == uids
Example #4
def test_dup_pifs():
    pifs = [System(names=["foo"]), System(names=["foo"])]
    set_uids(pifs)
    assert len({p.uid for p in pifs}) == 1
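
Taken together, the three tests above pin down set_uids's contract: every PIF gets a uid, an explicit uid list wins, and identical PIFs receive the same uid. A minimal sketch that satisfies the contract (a content-hash scheme; not necessarily the real cit_utils implementation):

import hashlib

from pypif.pif import dumps


def set_uids(pifs, uids=None):
    """Assign a uid to every PIF in pifs.

    Explicit uids are used when given; otherwise each uid is a hash of
    the serialized PIF, so duplicate PIFs share a uid.
    """
    if uids:
        for pif, uid in zip(pifs, uids):
            pif.uid = uid
    else:
        for pif in pifs:
            # Hash the serialized PIF; identical PIFs get identical uids
            pif.uid = hashlib.sha1(dumps(pif).encode("utf-8")).hexdigest()
    return pifs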
Example #5
def begin_convert(mdf_dataset, status_id):
    """Pull, back up, and convert metadata."""
    # Setup
    creds = {
        "app_name": "MDF Open Connect",
        "client_id": app.config["API_CLIENT_ID"],
        "client_secret": app.config["API_CLIENT_SECRET"],
        "services": ["transfer", "publish"]
    }
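    # confidential_login returns one authenticated client per requested service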
    clients = toolbox.confidential_login(creds)
    mdf_transfer_client = clients["transfer"]
    globus_publish_client = clients["publish"]

    # Download data locally, back up on MDF resources
    dl_res = download_and_backup(mdf_transfer_client,
                                 mdf_dataset.pop("data", {}), status_id)
    if dl_res["success"]:
        local_path = dl_res["local_path"]
        backup_path = dl_res["backup_path"]
    else:
        raise IOError("No data downloaded")
    # TODO: Update status - data downloaded
    print("DEBUG: Data downloaded")

    print("DEBUG: Conversions started")
    # Pop indexing args
    parse_params = mdf_dataset.pop("index", {})
    add_services = mdf_dataset.pop("services", [])

    # TODO: Stream data into files instead of holding feedstock in memory
    feedstock = [mdf_dataset]

    # tags = [sub["subject"] for sub in mdf_dataset.get("dc", {}).get("subjects", [])]
    # key_info = get_key_matches(tags or None)

    # List of all files, for bag
    all_files = []

    # Citrination setup
    cit_manager = IngesterManager()
    cit_client = CitrinationClient(app.config["CITRINATION_API_KEY"])
    # Get title and description
    try:
        cit_title = mdf_dataset["dc"]["titles"][0]["title"]
    except (KeyError, IndexError):
        cit_title = "Untitled"
    try:
        cit_desc = " ".join([
            desc["description"] for desc in mdf_dataset["dc"]["descriptions"]
        ])
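        # Treat an empty description string the same as a missing one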
        if not cit_desc:
            raise KeyError
    except (KeyError, IndexError):
        cit_desc = None
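    # Create a private (share=0) Citrination dataset to hold the extracted PIFs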
    cit_ds = cit_client.create_data_set(name=cit_title,
                                        description=cit_desc,
                                        share=0).json()
    cit_ds_id = cit_ds["id"]
    print("DEBUG: Citrine dataset ID:", cit_ds_id)

    for path, dirs, files in os.walk(os.path.abspath(local_path)):
        # Separate files into groups, process group as unit
        for group in group_files(files):
            # Get all file metadata
            group_file_md = [
                get_file_metadata(file_path=os.path.join(path, filename),
                                  backup_path=os.path.join(
                                      backup_path,
                                      path.replace(os.path.abspath(local_path),
                                                   ""), filename))
                for filename in group
            ]
            all_files.extend(group_file_md)

            group_paths = [os.path.join(path, filename) for filename in group]

            # MDF parsing
            mdf_res = omniparser.omniparse(group_paths, parse_params)

            # Citrine parsing
            cit_pifs = cit_manager.run_extensions(
                group_paths,
                include=None,
                exclude=[],
                args={"quality_report": False})
            if cit_pifs is None:
                cit_pifs = []
            elif not isinstance(cit_pifs, list):
                cit_pifs = [cit_pifs]
            cit_full = []
            if cit_pifs:
                cit_res = []
                # Add UIDs
                cit_pifs = cit_utils.set_uids(cit_pifs)
                for pif in cit_pifs:
                    # Get PIF URL
                    pif_land_page = {
                        "mdf": {
                            "landing_page": cit_utils.get_url(pif, cit_ds_id)
                        }
                    } if cit_ds_id else {}
                    # Make PIF into feedstock and save
                    cit_res.append(
                        toolbox.dict_merge(pif_to_feedstock(pif),
                                           pif_land_page))
                    # Add DataCite metadata
                    pif = add_dc(pif, mdf_dataset.get("dc", {}))

                    cit_full.append(pif)
            else:  # No PIFs parsed
                # TODO: Send failed datatype to Citrine for logging
                # Pad cit_res to the same length as mdf_res for "merging"
                cit_res = [{} for _ in range(len(mdf_res))]

            # If MDF parser failed to parse group, pad mdf_res to match PIF count
            if not mdf_res:
                mdf_res = [{} for _ in range(len(cit_res))]

            # If only one mdf record was parsed, merge all PIFs into that record
            if len(mdf_res) == 1:
                merged_cit = {}
                for cr in cit_res:
                    merged_cit = toolbox.dict_merge(merged_cit, cr)
                mdf_records = [toolbox.dict_merge(mdf_res[0], merged_cit)]
            # If the same number of MDF records and Citrine PIFs were parsed, merge in order
            elif len(mdf_res) == len(cit_res):
                mdf_records = [
                    toolbox.dict_merge(r_mdf, r_cit)
                    for r_mdf, r_cit in zip(mdf_res, cit_res)
                ]
            # Otherwise, keep the MDF records only
            else:
                print("DEBUG: Record mismatch:\nMDF parsed", len(mdf_res),
                      "records", "\nCitrine parsed", len(cit_res), "records"
                      "\nPIFs discarded")
                # TODO: Update status/log - Citrine records discarded
                mdf_records = mdf_res

            # Filter out null records; attach file metadata and save the rest
            if not mdf_records:
                print("DEBUG: No MDF records in group:", group)
            for record in mdf_records:
                if record:
                    feedstock.append(
                        toolbox.dict_merge(record, {"files": group_file_md}))

            # Upload PIFs to Citrine
            for full_pif in cit_full:
                with tempfile.NamedTemporaryFile(mode="w+") as pif_file:
                    pif_dump(full_pif, pif_file)
                    pif_file.seek(0)
                    up_res = json.loads(
                        cit_client.upload(cit_ds_id, pif_file.name))
                    if up_res["success"]:
                        print("DEBUG: Citrine upload success")
                    else:
                        print("DEBUG: Citrine upload failure, error",
                              up_res.get("status"))

    # TODO: Update status - indexing success
    print("DEBUG: Indexing success")

    # Pass feedstock to /ingest
    with tempfile.TemporaryFile(mode="w+") as stock:
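        # Write the feedstock as JSON lines: one record per line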
        for entry in feedstock:
            json.dump(entry, stock)
            stock.write("\n")
        stock.seek(0)
        ingest_res = requests.post(app.config["INGEST_URL"],
                                   data={"status_id": status_id},
                                   files={'file': stock})
    ingest_info = ingest_res.json()
    if not ingest_info.get("success"):
        # TODO: Update status? Ingest failed
        # TODO: Fail everything, delete Citrine dataset, etc.
        raise ValueError("In convert - Ingest failed: " + str(ingest_info))

    # Additional service integrations

    # Finalize Citrine dataset
    # TODO: Turn on public dataset ingest (share=1)
    if "citrine" in add_services:
        try:
            cit_client.update_data_set(cit_ds_id, share=0)
        except Exception as e:
            # TODO: Update status, notify Citrine - Citrine ds failure
            print("DEBUG: Citrination dataset not updated:", repr(e))

    # Globus Publish
    # TODO: Test after Publish API is fixed
    if "globus_publish" in add_services:
        try:
            fin_res = globus_publish_data(globus_publish_client,
                                          mdf_transfer_client, mdf_dataset,
                                          local_path)
        except Exception as e:
            # TODO: Update status - Publish failed
            print("Publish ERROR:", repr(e))
        else:
            # TODO: Update status - Publish success
            print("DEBUG: Publish success:", fin_res)

    # Remove local data
    shutil.rmtree(local_path)
    # TODO: Update status - everything done
    return {"success": True, "status_id": status_id}