Example #1
def get_files_dats_datasets(cache, dats_samples_d, sample_manifest, file_guids,
                            no_circular_links):
    file_datasets_l = []

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay",
                                        cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    snp_datatype = DatsObj(
        "DataType", [("information", util.get_annotation("SNP", cache)),
                     ("method", util.get_annotation("SNP analysis", cache))])

    cnv_datatype = DatsObj(
        "DataType", [("information", util.get_annotation("CNV", cache)),
                     ("method", util.get_annotation("CNV analysis", cache))])

    def get_snp_datatype():
        dkey = ".".join(["DataType", "SNP"])
        return cache.get_obj_or_ref(dkey, lambda: snp_datatype)

    def get_cnv_datatype():
        dkey = ".".join(["DataType", "CNV"])
        return cache.get_obj_or_ref(dkey, lambda: cnv_datatype)

    nhlbi_key = ":".join(["Organization", "NHLBI"])
    nhlbi = cache.get_obj_or_ref(nhlbi_key, lambda: NIH_NHLBI)
    creators = [nhlbi]

    def make_data_standard(format):
        return DatsObj("DataStandard",
                       [("name", format),
                        ("type", util.get_value_annotation("format", cache)),
                        ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_dstan = cache.get_obj_or_ref(cram_ds_key,
                                      lambda: make_data_standard("CRAM"))

    vcf_ds_key = ":".join(["DataStandard", "VCF"])
    vcf_dstan = cache.get_obj_or_ref(vcf_ds_key,
                                     lambda: make_data_standard("VCF"))

    n_samples = len(dats_samples_d)
    n_samples_found = 0

    for sample_id in dats_samples_d:
        dats_sample = dats_samples_d[sample_id]

        # look up corresponding file paths in manifest file
        if sample_id not in sample_manifest:
            logging.debug("sample not found in manifest - " + sample_id)
            continue

        n_samples_found += 1
        ms = sample_manifest[sample_id]

        material_type = 'DNA'
        wgs_type = get_wgs_datatype()
        snp_type = get_snp_datatype()
        cnv_type = get_cnv_datatype()

        # ------------------------------------------------
        # WGS sequence - CRAM and CRAI files
        # ------------------------------------------------

        def get_filename(gs_uri):
            m = re.match(r'^.*\/([^\/]+)$', gs_uri)
            if m is None:
                logging.fatal("unable to parse filename from " + gs_uri)
                sys.exit(1)
            filename = m.group(1)
            return filename

        gs_cram = ms['gs_cram']['mapped_value']
        gs_crai = ms['gs_crai']['mapped_value']

        # GUID lookup
        cram_file = get_filename(gs_cram)
        crai_file = get_filename(gs_crai)

        cram_doi = file_guids[cram_file]['Sodium_GUID']['raw_value']
        cram_size = file_guids[cram_file]['File size']['raw_value']
        cram_md5 = file_guids[cram_file]['md5sum']['raw_value']

        crai_doi = file_guids[crai_file]['Sodium_GUID']['raw_value']
        crai_md5 = file_guids[crai_file]['md5sum']['raw_value']

        # convert a file size string to int, handling values in scientific
        # notation (e.g. "3.2e+10")
        def filesize_to_int(size):
            if re.match(r'.*e.*', size):
                return int(float(size))
            return int(size)

        # Google Cloud Platform / Google Storage copy
        gs_cram_access = DatsObj("Access", [("accessURL", gs_cram)])
        gs_cram_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_cram_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", cram_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", crai_doi),
                             ("relationType", "cram_index")])
                ]),
                ("size", filesize_to_int(cram_size)),
                # TODO - add file size units
                ("conformsTo", [cram_dstan])
            ])

        # AWS / S3 copy
        s3_cram = ms['s3_cram']['mapped_value']
        s3_crai = ms['s3_crai']['mapped_value']
        s3_cram_access = DatsObj("Access", [("accessURL", s3_cram)])
        s3_cram_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_cram_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", cram_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", crai_doi),
                             ("relationType", "cram_index")])
                ]),
                ("size", filesize_to_int(cram_size)),
                # TODO - add file size units
                ("conformsTo", [cram_dstan])
            ])

        # filename was already parsed from the gs URI by get_filename() above
        filename = cram_file

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [cram_md5])])

        cram_dataset = DatsObj("Dataset", [
            ("distributions", [gs_cram_distro, s3_cram_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [wgs_type]),
            ("creators", creators),
        ])

        cram_da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_sample.getIdRef()])
             #            ("uses", [])                          # software used
             ])

        cram_dataset.set("producedBy", cram_da)
        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            cram_da.set("output", [cram_dataset.getIdRef()])
        file_datasets_l.append(cram_dataset)

        # ------------------------------------------------
        # Variant calls - VCF and CSI files
        # ------------------------------------------------

        gs_vcf = ms['gs_vcf']['mapped_value']
        gs_csi = ms['gs_csi']['mapped_value']

        if gs_vcf is None:
            logging.warning("no VCF file found for " + sample_id)
            continue

        # GUID lookup
        vcf_file = get_filename(gs_vcf)
        csi_file = get_filename(gs_csi)

        vcf_doi = file_guids[vcf_file]['Sodium_GUID']['raw_value']
        vcf_size = file_guids[vcf_file]['File size']['raw_value']
        vcf_md5 = file_guids[vcf_file]['md5sum']['raw_value']

        csi_doi = file_guids[csi_file]['Sodium_GUID']['raw_value']
        csi_md5 = file_guids[csi_file]['md5sum']['raw_value']

        # Google Cloud Platform / Google Storage copy
        gs_vcf_access = DatsObj("Access", [("accessURL", gs_vcf)])
        gs_vcf_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_vcf_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", vcf_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", csi_doi),
                             ("relationType", "vcf_index")])
                ]),
                ("size", filesize_to_int(vcf_size)),
                # TODO - add file size units
                ("conformsTo", [vcf_dstan])
            ])

        # AWS / S3 copy
        s3_vcf = ms['s3_vcf']['mapped_value']
        s3_csi = ms['s3_csi']['mapped_value']
        s3_vcf_access = DatsObj("Access", [("accessURL", s3_vcf)])
        s3_vcf_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_vcf_access),
                ("identifier", DatsObj("Identifier",
                                       [("identifier", vcf_doi)])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier", csi_doi),
                             ("relationType", "vcf_index")])
                ]),
                ("size", filesize_to_int(vcf_size)),
                # TODO - add file size units
                ("conformsTo", [vcf_dstan])
            ])

        # filename was already parsed from the gs URI by get_filename() above
        filename = vcf_file

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [vcf_md5])])

        vcf_dataset = DatsObj("Dataset", [
            ("distributions", [gs_vcf_distro, s3_vcf_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [snp_type, cnv_type]),
            ("creators", creators),
        ])

        vcf_da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_sample.getIdRef()])
             #            ("uses", [])                          # software used
             ])

        vcf_dataset.set("producedBy", vcf_da)
        # circular link back to enclosing Dataset as the output
        if not no_circular_links:
            vcf_da.set("output", [vcf_dataset.getIdRef()])
        file_datasets_l.append(vcf_dataset)

    logging.info("found " + str(n_samples_found) + " / " + str(n_samples) +
                 " sample(s) in TOPMed file manifest")
    return file_datasets_l
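
# A minimal, self-contained check of the two small helpers defined inside the loop
# above (restated here for illustration only; behavior matches the versions above,
# and the URI and size values are made-up examples, not real TOPMed data).
import re

def _get_filename(gs_uri):
    m = re.match(r'^.*\/([^\/]+)$', gs_uri)
    return m.group(1) if m else None

def _filesize_to_int(size):
    return int(float(size)) if re.match(r'.*e.*', size) else int(size)

assert _get_filename("gs://bucket/topmed/NWD000001.b38.irc.v1.cram") == "NWD000001.b38.irc.v1.cram"
assert _filesize_to_int("3.2e+10") == 32000000000
assert _filesize_to_int("1024") == 1024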
Example #2
def make_consent_group(args, group_name, group_index, subject_l,
                       dats_subject_d):

    # find DATS subject that corresponds to each named subject
    dats_subjects_l = []
    # parallel array in which existing subjects are represented by idref
    dats_subjects_idrefs_l = []

    for s in subject_l:
        if s['SUBJID'] not in dats_subject_d:
            logging.warning(
                "GTEx subject " + s['SUBJID'] +
                " not found in public metadata, creating new subject Material")

            # create a new placeholder Material, register it in the subject dict, and
            # include it in this consent group in full (not as an id reference)
            subject = DatsObj("Material",
                              [("name", s['SUBJID']), ("characteristics", []),
                               ("description", "GTEx subject " + s['SUBJID'])])
            dats_subject_d[s['SUBJID']] = subject
            dats_subjects_l.append(subject)
            dats_subjects_idrefs_l.append(subject)
        else:
            ds = dats_subject_d[s['SUBJID']]
            dats_subjects_l.append(ds)
            dats_subjects_idrefs_l.append(ds.getIdRef())

    # create StudyGroup and associated ConsentInfo

    # TODO - determine if/where to store group_index (0 or 1)

    # only 2 consent groups in GTEx study:
    #   0 - Subjects did not participate in the study, did not complete a consent document and
    #       are included only for the pedigree structure and/or genotype controls, such as HapMap subjects
    #   1 - General Research Use (GRU)
    consent_info = None
    if group_name == "General Research Use (GRU)":
        # Data Use Ontology for consent info - http://www.obofoundry.org/ontology/duo.html
        #  http://purl.obolibrary.org/obo/DUO_0000005 - "general research use and clinical care"
        #  "This primary category consent code indicates that use is allowed for health/medical/biomedical
        # purposes and other biological research, including the study of population origins or ancestry."
        consent_info = DatsObj(
            "ConsentInfo",
            [("name", group_name), ("abbreviation", "GRU"),
             ("description", group_name),
             ("relatedIdentifiers", [
                 DatsObj("RelatedIdentifier",
                         [("identifier",
                           "http://purl.obolibrary.org/obo/DUO_0000005")])
             ])])
    elif group_name == "Subjects did not participate in the study, did not complete a consent document and are included only for the pedigree structure and/or genotype controls, such as HapMap subjects":
        consent_info = DatsObj("ConsentInfo", [("name", group_name),
                                               ("description", group_name)])
    else:
        logging.fatal("unrecognized consent group " + group_name)
        sys.exit(1)

    group = DatsObj("StudyGroup", [("name", group_name),
                                   ("members", dats_subjects_idrefs_l),
                                   ("size", len(dats_subjects_idrefs_l)),
                                   ("consentInformation", [consent_info])])

    # create link back from each subject to the parent StudyGroup
    if args.no_circular_links:
        logging.warning(
            "not creating Subject level circular links because of --no_circular_links option"
        )
    else:
        for s in dats_subjects_l:
            cl = s.get("characteristics")
            cl.append(
                DatsObj("Dimension", [("name", "member of study group"),
                                      ("values", [group.getIdRef()])]))
    return group
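
# Hypothetical invocation sketch (assumes the module's DatsObj import and logging
# setup; the subject IDs and the args namespace below are illustrative stand-ins,
# not values taken from the real GTEx metadata).
import argparse

_args = argparse.Namespace(no_circular_links=False)
_subject_l = [{'SUBJID': 'GTEX-XXXX'}, {'SUBJID': 'GTEX-YYYY'}]
_dats_subject_d = {
    'GTEX-XXXX': DatsObj("Material", [("name", 'GTEX-XXXX'),
                                      ("characteristics", [])])
}
# GTEX-YYYY is not in the dict, so a placeholder Material is created for it.
_group = make_consent_group(_args, "General Research Use (GRU)", 1,
                            _subject_l, _dats_subject_d)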
Example #3
def main():

    # input
    parser = argparse.ArgumentParser(
        description='Create DATS JSON for dbGaP GTEx public metadata.')
    parser.add_argument(
        '--output_file',
        required=True,
        help=
        'Output file path for the DATS JSON file containing the top-level DATS Dataset.'
    )
    parser.add_argument(
        '--dbgap_public_xml_path',
        required=True,
        help=
        'Path to directory that contains public dbGaP metadata files e.g., *.data_dict.xml and *.var_report.xml'
    )
    parser.add_argument(
        '--dbgap_protected_metadata_path',
        required=False,
        help=
        'Path to directory that contains access-controlled dbGaP tab-delimited metadata files.'
    )
    parser.add_argument(
        '--max_output_samples',
        required=False,
        type=int,
        help=
        'Impose a limit on the number of sample Materials in the output DATS. For testing purposes only.'
    )
    parser.add_argument('--subject_phenotypes_path',
                        default=V7_SUBJECT_PHENOTYPES_FILE,
                        required=False,
                        help='Path to ' + V7_SUBJECT_PHENOTYPES_FILE)
    parser.add_argument('--sample_attributes_path',
                        default=V7_SAMPLE_ATTRIBUTES_FILE,
                        required=False,
                        help='Path to ' + V7_SAMPLE_ATTRIBUTES_FILE)
    parser.add_argument(
        '--data_stewards_repo_path',
        default='data-stewards',
        required=False,
        help='Path to local copy of https://github.com/dcppc/data-stewards')
    parser.add_argument(
        '--no_circular_links',
        action='store_true',
        help=
        'Whether to disallow circular links/paths within the JSON-LD output.')
    parser.add_argument(
        '--use_all_dbgap_subject_vars',
        action='store_true',
        help=
        'Whether to store all available dbGaP variable values as characteristics of the DATS subject Materials.'
    )
    #    parser.add_argument('--use_all_dbgap_sample_vars', action='store_true', help ='Whether to store all available dbGaP variable values as characteristics of the DATS sample Materials.')
    args = parser.parse_args()

    # logging
    logging.basicConfig(level=logging.INFO)
    #    logging.basicConfig(level=logging.DEBUG)

    # read portal metadata for subjects and samples
    p_subjects = portal_files.read_subject_phenotypes_file(
        args.subject_phenotypes_path)
    p_samples = portal_files.read_sample_attributes_file(
        args.sample_attributes_path)
    portal_files.link_samples_to_subjects(p_samples, p_subjects)

    # read id dump and manifest files from GitHub data-stewards repo

    # id dumps
    subject_id_file = args.data_stewards_repo_path + "/gtex/v7/id_dumps/gtex_v7_subject_ids.txt"
    gh_subjects = github_files.read_subject_id_file(subject_id_file)
    sample_id_file = args.data_stewards_repo_path + "/gtex/v7/id_dumps/gtex_v7_sample_ids.txt"
    gh_samples = github_files.read_sample_id_file(sample_id_file)
    tissue_id_file = args.data_stewards_repo_path + "/gtex/v7/id_dumps/gtex_v7_tissue_ids.txt"
    gh_tissues = github_files.read_tissue_id_file(tissue_id_file)

    # manifest files
    protected_rnaseq_manifest = args.data_stewards_repo_path + "/gtex/v7/manifests/protected_data/" + RNASEQ_MANIFEST_FILE
    protected_rnaseq_files = github_files.read_protected_rnaseq_manifest(
        protected_rnaseq_manifest)
    protected_wgs_manifest = args.data_stewards_repo_path + "/gtex/v7/manifests/protected_data/" + WGS_MANIFEST_FILE
    protected_wgs_files = github_files.read_protected_wgs_manifest(
        protected_wgs_manifest)

    # DOIs
    rnaseq_dois_file = args.data_stewards_repo_path + "/gtex/v7/manifests/protected_data/" + RNASEQ_DOIS_FILE
    rnaseq_dois = github_files.read_dois_manifest(rnaseq_dois_file)
    wgs_dois_file = args.data_stewards_repo_path + "/gtex/v7/manifests/protected_data/" + WGS_DOIS_FILE
    wgs_dois = github_files.read_dois_manifest(wgs_dois_file)

    # compare GitHub manifest files with GitHub id dumps
    cross_check_ids(gh_subjects, gh_samples, protected_rnaseq_files,
                    protected_rnaseq_manifest, "RNA-Seq", "GitHub id dumps")
    cross_check_ids(gh_subjects, gh_samples, protected_wgs_files,
                    protected_wgs_manifest, "WGS", "GitHub id dumps")

    # compare GitHub manifest files with GTEx Portal metadata files
    cross_check_ids(p_subjects, p_samples, protected_rnaseq_files,
                    protected_rnaseq_manifest, "RNA-Seq",
                    "GTEx Portal metadata")
    cross_check_ids(p_subjects, p_samples, protected_wgs_files,
                    protected_wgs_manifest, "WGS", "GTEx Portal metadata")

    # create top-level dataset
    gtex_dataset = ccmm.gtex.wgs_datasets.get_dataset_json()

    # index dbGaP study Datasets by id
    dbgap_study_datasets_by_id = {}
    for tds in gtex_dataset.get("hasPart"):
        dbgap_study_id = tds.get("identifier").get("identifier")
        if dbgap_study_id in dbgap_study_datasets_by_id:
            logging.fatal("encountered duplicate study_id " + dbgap_study_id)
            sys.exit(1)
        m = re.match(r'^(phs\d+\.v\d+)\.p\d+$', dbgap_study_id)
        if m is None:
            logging.fatal("unable to parse study_id " + dbgap_study_id)
            sys.exit(1)
        dbgap_study_datasets_by_id[m.group(1)] = tds

    # read public dbGaP metadata
    pub_xp = args.dbgap_public_xml_path
    restricted_mp = args.dbgap_protected_metadata_path
    # read public metadata
    dbgap_study_pub_md = ccmm.gtex.public_metadata.read_study_metadata(pub_xp)
    # there should be exactly one study
    study_ids = list(dbgap_study_pub_md.keys())
    n_study_ids = len(study_ids)
    if n_study_ids != 1:
        logging.fatal("read " + str(n_study_ids) + " dbGaP studies from " +
                      pub_xp)
        sys.exit(1)
    study_id = study_ids[0]

    dbgap_study_dataset = dbgap_study_datasets_by_id[study_id]
    dbgap_study_md = dbgap_study_pub_md[study_id]
    sv = ccmm.gtex.public_metadata.add_study_vars(dbgap_study_dataset,
                                                  dbgap_study_md)
    dbgap_study_md['id_to_var'] = sv['id_to_var']
    dbgap_study_md['type_name_cg_to_var'] = sv['type_name_cg_to_var']

    # set 2nd level types to be the same as the top-level types: WGS and RNA-Seq
    dbgap_study_dataset.set("types", gtex_dataset.get("types"))

    # cache used to minimize duplication of JSON objects in JSON-LD output
    cache = DatsObjCache()

    # --------------------------
    # subjects
    # --------------------------

    # create subjects based on GTEx Portal subject phenotype file and GitHub data-stewards id dump
    dats_subjects_d = ccmm.gtex.subjects.get_subjects_dats_materials(
        cache, p_subjects, gh_subjects,
        dbgap_study_md['type_name_cg_to_var']['Subject_Phenotypes'])
    # sorted list of subjects
    dats_subjects_l = sorted([dats_subjects_d[s] for s in dats_subjects_d],
                             key=lambda s: s.get("name"))

    # TODO - add consent groups, of which GTEx has 2: 0=didn't participate, 1=General Research Use (GRU)

    # create StudyGroup that lists all the subjects
    logging.info("creating 'all subjects' StudyGroup containing " +
                 str(len(dats_subjects_l)) +
                 " subject(s) from public metadata")
    all_subjects = DatsObj(
        "StudyGroup",
        [
            ("name", "all subjects"),
            # subjects appear in full here, but id references will be used elsewhere in the instance:
            ("members", dats_subjects_l),
            ("size", len(dats_subjects_l))
        ])

    # create link back from each subject to the parent StudyGroup
    if args.no_circular_links:
        logging.warning(
            "not creating Subject level circular links because of --no_circular_links option"
        )
    else:
        for s in dats_subjects_l:
            cl = s.get("characteristics")
            cl.append(
                DatsObj("Dimension", [("name", "member of study group"),
                                      ("values", [all_subjects.getIdRef()])]))

    dats_study = DatsObj("Study", [("name", "GTEx"),
                                   ("studyGroups", [all_subjects])])

    # link Study to Dataset
    dbgap_study_dataset.set("producedBy", dats_study)

    # --------------------------
    # sample Materials
    # --------------------------

    # create samples based on GTEx Portal sample attributes file and GitHub data-stewards id dump
    dats_samples_d = ccmm.gtex.samples.get_samples_dats_materials(
        cache, dats_subjects_d, p_samples, gh_samples,
        dbgap_study_md['type_name_cg_to_var']['Sample_Attributes'])
    # sorted list of samples
    dats_samples_l = sorted([dats_samples_d[s] for s in dats_samples_d],
                            key=lambda s: s.get("name"))
    if args.max_output_samples is not None:
        dats_samples_l = dats_samples_l[0:int(args.max_output_samples)]
        logging.warn("limiting output to " + str(len(dats_samples_l)) +
                     " sample(s) due to value of --max_output_samples")
    dbgap_study_dataset.set("isAbout", dats_samples_l)

    # --------------------------
    # file Datasets
    # --------------------------

    file_datasets_l = []

    # WGS CRAM
    wgs_dats_file_datasets_l = ccmm.gtex.samples.get_files_dats_datasets(
        cache, dats_samples_d, p_samples, gh_samples, protected_wgs_files,
        wgs_dois, args.no_circular_links)
    logging.info("adding Datasets for " + str(len(wgs_dats_file_datasets_l)) +
                 " WGS CRAM files")
    file_datasets_l.extend(wgs_dats_file_datasets_l)

    # RNA-Seq CRAM
    rnaseq_dats_file_datasets_l = ccmm.gtex.samples.get_files_dats_datasets(
        cache, dats_samples_d, p_samples, gh_samples, protected_rnaseq_files,
        rnaseq_dois, args.no_circular_links)
    logging.info("adding Datasets for " +
                 str(len(rnaseq_dats_file_datasets_l)) + " RNA-Seq CRAM files")
    file_datasets_l.extend(rnaseq_dats_file_datasets_l)

    dbgap_study_dataset.set("hasPart", file_datasets_l)

    # augment public (meta)data with restricted-access (meta)data
    if restricted_mp is not None:
        # create study groups and update subjects/samples with restricted phenotype data
        add_restricted_data(cache, args, dbgap_study_md, dats_subjects_l,
                            dats_samples_d, dats_study, study_id)

    # write Dataset to DATS JSON file
    with open(args.output_file, mode="w") as jf:
        jf.write(json.dumps(gtex_dataset, indent=2, cls=DATSEncoder))
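
# The original module presumably ends with a standard entry-point guard like the one
# below; the script name and paths in the comment are placeholders, not the real ones.
#
#   python gtex_to_dats.py \
#       --output_file gtex_v7_dats.json \
#       --dbgap_public_xml_path ./dbgap_public_xml \
#       --dbgap_protected_metadata_path ./dbgap_protected_metadata
if __name__ == '__main__':
    main()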
Example #4
def get_files_dats_datasets(cache, dats_samples_d, p_samples, gh_samples,
                            protected_cram_files, no_circular_links):
    file_datasets = []

    rnaseq_datatype = DatsObj("DataType", [
        ("information", util.get_annotation("transcription profiling", cache)),
        ("method", util.get_annotation("RNA-seq assay", cache)),
        ("platform", util.get_annotation("Illumina", cache))
    ])

    def get_rnaseq_datatype():
        dkey = ".".join(["DataType", "RNA-seq"])
        return cache.get_obj_or_ref(dkey, lambda: rnaseq_datatype)

    wgs_datatype = DatsObj(
        "DataType",
        [("information", util.get_annotation("DNA sequencing", cache)),
         ("method", util.get_annotation("whole genome sequencing assay",
                                        cache)),
         ("platform", util.get_annotation("Illumina", cache))])

    def get_wgs_datatype():
        dkey = ".".join(["DataType", "WGS"])
        return cache.get_obj_or_ref(dkey, lambda: wgs_datatype)

    broad_key = ":".join(["Organization", "Broad Institute"])
    broad = cache.get_obj_or_ref(
        broad_key,
        lambda: DatsObj("Organization", [("name", "Broad Institute")]))
    creators = [broad]

    def make_data_standard(format):
        return DatsObj("DataStandard",
                       [("name", format),
                        ("type", util.get_value_annotation("format", cache)),
                        ("description", format + " file format")])

    cram_ds_key = ":".join(["DataStandard", "CRAM"])
    cram_ds = cache.get_obj_or_ref(cram_ds_key,
                                   lambda: make_data_standard("CRAM"))

    crai_ds_key = ":".join(["DataStandard", "CRAI"])
    crai_ds = cache.get_obj_or_ref(crai_ds_key,
                                   lambda: make_data_standard("CRAI"))

    for sample_id in protected_cram_files:
        file = protected_cram_files[sample_id]
        material_type = None
        ds_types = None

        # determine file type
        if re.search(r'wgs\/', file['cram_file_aws']['raw_value']):
            material_type = 'DNA'
            ds_types = get_wgs_datatype()
            gcp_suffix = '_gcp'
        elif re.search(r'rnaseq\/', file['cram_file_aws']['raw_value']):
            material_type = 'RNA'
            ds_types = get_rnaseq_datatype()
            gcp_suffix = ''
        else:
            logging.fatal(
                "unable to determine material/sequence type from cram_file_aws="
                + file['cram_file_aws']['raw_value'])
            sys.exit(1)

        # RNA-Seq keys = sample_id	cram_file	cram_file_md5	cram_file_size	cram_index	cram_file_aws	cram_index_aws
        # WGS keys = same as above + firecloud_id
        cram_file = file['cram_file' + gcp_suffix]['raw_value']
        cram_file_md5 = file['cram_file_md5']['raw_value']

        # TODO - review the following encoding decisions:
        #  - storing .crai URI as relatedIdentifier of the DatasetDistribution for the .cram file
        #  - storing MD5 checksum of the .cram file as an extraProperty of the DatasetDistribution
        #  - storing firecloud_id as a relatedIdentifier of the Dataset (not the DatasetDistribution)

        # Google Cloud Platform / Google Storage copy
        gs_access = DatsObj(
            "Access",
            [("landingPage", file['cram_file' + gcp_suffix]['raw_value'])])
        gs_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", gs_access),
                ("identifier",
                 DatsObj("Identifier",
                         [("identifier",
                           file['cram_file' + gcp_suffix]['raw_value'])])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier",
                            [("identifier",
                              file['cram_index' + gcp_suffix]['raw_value']),
                             ("relationType", "cram_index")])
                ]),
                ("size", int(file['cram_file_size']['raw_value'])),
                # TODO - add unit for bytes, include IRI?
                #                ("unit", util.get_value_annotation("bytes", cache))
                ("conformsTo", [
                    cache.get_obj_or_ref(cram_ds_key,
                                         lambda: make_data_standard("CRAM"))
                ])
            ])

        # AWS / S3 copy
        s3_access = DatsObj(
            "Access", [("landingPage", file['cram_file_aws']['raw_value'])])
        s3_distro = DatsObj(
            "DatasetDistribution",
            [
                ("access", s3_access),
                ("identifier",
                 DatsObj("Identifier", [
                     ("identifier", file['cram_file_aws']['raw_value'])
                 ])),
                ("relatedIdentifiers", [
                    DatsObj("RelatedIdentifier", [
                        ("identifier", file['cram_index_aws']['raw_value']),
                        ("relationType", "cram_index")
                    ])
                ]),
                ("size", int(file['cram_file_size']['raw_value'])),
                # TODO - add unit for bytes, include IRI?
                #                ("unit", util.get_value_annotation("bytes", cache))
                ("conformsTo", [
                    cache.get_obj_or_ref(cram_ds_key,
                                         lambda: make_data_standard("CRAM"))
                ])
            ])

        m = re.match(r'^.*\/([^\/]+)$', cram_file)
        if m is None:
            logging.fatal("unable to parse filename from CRAM file URI " +
                          cram_file)
            sys.exit(1)
        filename = m.group(1)

        # TODO - replace this with DATS-specific MD5 checksum encoding (TBD)
        md5_dimension = DatsObj(
            "Dimension", [("name", util.get_value_annotation("MD5", cache)),
                          ("values", [file['cram_file_md5']['raw_value']])])

        ds = DatsObj("Dataset", [
            ("distributions", [gs_distro, s3_distro]),
            ("dimensions", [md5_dimension]),
            ("title", filename),
            ("types", [ds_types]),
            ("creators", creators),
        ])

        # add firecloud_id for WGS
        if 'firecloud_id' in file:
            f_id = DatsObj("RelatedIdentifier",
                           [("identifier", file['firecloud_id']['raw_value']),
                            ("identifierSource", "FireCloud")])
            ds.set("relatedIdentifiers", [f_id])

        # input RNA/DNA extract that was sequenced
        if sample_id not in dats_samples_d:
            logging.fatal("no sample exists for " + sample_id +
                          " found in file " +
                          file['cram_file_aws']['raw_value'])
            sys.exit(1)

        dats_sample = dats_samples_d[sample_id]
        dats_samp_key = ":".join(["Material", dats_sample.get("name")])
        dats_samp = cache.get_obj_or_ref(dats_samp_key, lambda: dats_sample)

        da = DatsObj(
            "DataAcquisition",
            [("name", filename), ("input", [dats_samp])
             #                ("uses", [])                          # software used
             ])

        if not no_circular_links:
            # circular link back to enclosing Dataset as the output
            da.set("output", [ds.getIdRef()])

        ds.set("producedBy", da)
        file_datasets.append(ds)

    return file_datasets
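
# Plausible stand-in illustrating the get_obj_or_ref() caching pattern used throughout
# these examples (the real DatsObjCache implementation is not shown here): the first
# lookup for a key materializes and stores the object, and later lookups return only
# an id reference, so shared objects appear once in the JSON-LD output. _DemoObj and
# _DemoCache are illustrative stubs, not part of the real codebase.
class _DemoObj(object):
    def __init__(self, name):
        self.name = name

    def getIdRef(self):
        return {"@id": "#" + self.name}

class _DemoCache(object):
    def __init__(self):
        self._objs = {}

    def get_obj_or_ref(self, key, factory):
        if key in self._objs:
            return self._objs[key].getIdRef()  # later callers get a reference only
        obj = factory()                        # first caller builds and caches the object
        self._objs[key] = obj
        return obj

_cache = _DemoCache()
_first = _cache.get_obj_or_ref("DataStandard:CRAM", lambda: _DemoObj("CRAM"))
_second = _cache.get_obj_or_ref("DataStandard:CRAM", lambda: _DemoObj("CRAM"))
assert isinstance(_first, _DemoObj)
assert _second == {"@id": "#CRAM"}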