# ---- Example 1 ----
    def __init__(self,
                 cohort,
                 expr_source,
                 var_source,
                 copy_source,
                 annot_file,
                 cv_prop=2.0 / 3,
                 cv_seed=None,
                 **coh_args):
        """Load expression and mutation data for a single cohort.

        Args:
            cohort: Name of the cohort to load.
            expr_source: Source of the expression data.
            var_source: Source of the variant call data.
            copy_source: Source of the copy number data.
            annot_file: Path to a GENCODE annotation file.
            cv_prop: Proportion of samples used for training.
            cv_seed: Random seed for the cross-validation split.
        """
        self.cohort = cohort

        expr = get_expr_data(cohort, expr_source, **coh_args)
        if 'annot_fields' in coh_args:
            annot_data = get_gencode(annot_file, coh_args['annot_fields'])
        else:
            annot_data = get_gencode(annot_file)

        # restructure annotation data around expression gene labels
        # BUG FIX: the original called get_gencode() again with no arguments
        # here, discarding `annot_data` and ignoring `annot_file`
        self.gene_annot = {
            at['gene_name']: {'Ens': ens, **at}
            for ens, at in annot_data.items()
            if at['gene_name'] in expr.columns
        }

        expr, variants = add_mutations(cohort, var_source, copy_source, expr,
                                       self.gene_annot, **coh_args)

        super().__init__(expr, variants, cv_prop, cv_seed)
# ---- Example 2 ----
def process_input_datasets(metabric_dir,
                           annot_dir,
                           use_types=None,
                           **data_args):
    """Assemble METABRIC expression, mutation, and copy number datasets.

    Args:
        metabric_dir: Directory containing the METABRIC data files.
        annot_dir: Directory containing the GENCODE annotation file.
        use_types: Optional subtypes used to filter the samples.

    Returns:
        Tuple of (expr_data, variants, copy_df, annot_dict).
    """
    samp_data = load_metabric_samps(metabric_dir)
    expr = drop_duplicate_genes(load_metabric_expression(metabric_dir))

    # restrict to invasive ductal breast carcinoma samples that also have
    # expression data
    use_samps = set(samp_data.SAMPLE_ID[
        (samp_data.CANCER_TYPE == 'Breast Cancer')
        & (samp_data.CANCER_TYPE_DETAILED == 'Breast Invasive Ductal Carcinoma'
           )]) & set(expr.index)

    annot_file = os.path.join(annot_dir, "gencode.v19.annotation.gtf.gz")
    if 'annot_fields' in data_args:
        annot_data = get_gencode(annot_file, data_args['annot_fields'])
    else:
        annot_data = get_gencode(annot_file)

    # restructure annotation data around expression gene labels
    annot_dict = {
        at['gene_name']: {'Ens': ens, **at}
        for ens, at in annot_data.items()
        if at['gene_name'] in set(expr.columns)
    }

    variants = load_metabric_variants(metabric_dir)
    copy_df = load_metabric_copies(metabric_dir)

    # keep only samples that also have copy number calls and appear in the
    # sequenced-samples header of the mutation file
    use_samps &= set(copy_df.index)
    with open(os.path.join(metabric_dir, "data_mutations_mskcc.txt"),
              'r') as f:
        use_samps &= set(f.readline().split("#Sequenced_Samples: ")[1].split(
            '\t')[0].split(' '))

    if use_types is not None:
        use_samps &= choose_subtypes(samp_data, use_types)

    # BUG FIX: pandas .loc does not accept a set as an indexer; a sorted
    # list also makes the row order deterministic across runs
    samp_list = sorted(use_samps)
    expr_data = expr.loc[samp_list, expr.columns.isin(annot_dict)]
    variants = variants.loc[variants.Sample.isin(use_samps)
                            & variants.Gene.isin(annot_dict)]

    copy_df = copy_df.loc[samp_list, copy_df.columns.isin(annot_dict)]
    copy_df = pd.DataFrame(copy_df.stack()).reset_index()
    copy_df.columns = ['Sample', 'Gene', 'Copy']

    # keep only non-neutral discretized copy number calls
    copy_df = copy_df.loc[copy_df.Copy != 0]
    copy_df.Copy = copy_df.Copy.map({
        -2: 'DeepDel',
        -1: 'ShalDel',
        1: 'ShalGain',
        2: 'DeepGain'
    })

    return expr_data, variants, copy_df, annot_dict
# ---- Example 3 ----
def process_input_datasets(cohort, expr_source, var_source, copy_source,
                           annot_dir, type_file, **data_args):
    """Assemble expression, mutation, and copy number data for a cohort.

    Args:
        cohort: Cohort label, optionally carrying a subtype suffix
            (eg. 'BRCA_LumA') parsed out by parse_subtypes.
        expr_source: Source of the expression data.
        var_source: Source of the variant call data.
        copy_source: Source of the copy number data.
        annot_dir: Directory containing the GENCODE annotation file.
        type_file: Table mapping samples to molecular subtypes.

    Returns:
        Tuple of (expr_data, variants, copy_df, annot_dict).
    """
    base_coh = cohort.split('_')[0]
    use_types = parse_subtypes(cohort)

    expr = drop_duplicate_genes(
        get_expr_data(base_coh, expr_source, **data_args))

    annot_file = os.path.join(annot_dir, "gencode.v19.annotation.gtf.gz")
    if 'annot_fields' in data_args:
        annot_data = get_gencode(annot_file, data_args['annot_fields'])
    else:
        annot_data = get_gencode(annot_file)

    # restructure annotation data around expression gene labels
    use_genes = set(expr.columns.get_level_values('Gene'))
    annot_dict = {
        at['gene_name']: {'Ens': ens, **at}
        for ens, at in annot_data.items() if at['gene_name'] in use_genes
    }

    expr, variants, copy_df = add_mutations(base_coh, var_source, copy_source,
                                            expr, annot_dict, **data_args)

    use_samps = set(expr.index)
    if use_types is not None:
        use_samps &= choose_subtypes(use_types, base_coh, type_file)

    # BUG FIX: pandas .loc does not accept a set as an indexer; sorting
    # also makes the row order deterministic across runs
    expr_data = expr.loc[sorted(use_samps)]
    variants = variants.loc[variants.Sample.isin(use_samps)]
    copy_df = copy_df.loc[copy_df.Sample.isin(use_samps)]

    return expr_data, variants, copy_df, annot_dict
# ---- Example 4 ----
def process_input_datasets(ccle_dir, annot_dir, expr_source, **coh_args):
    """Assemble CCLE expression, mutation, and copy number datasets.

    Args:
        ccle_dir: Directory containing the CCLE data files.
        annot_dir: Directory containing the GENCODE annotation file.
        expr_source: Source of the expression data.

    Returns:
        Tuple of (expr, variants, copy_df, annot_dict).
    """
    samp_data = load_ccle_samps(ccle_dir)
    expr = load_ccle_expression(ccle_dir, expr_source, **coh_args)
    variants = load_ccle_variants(ccle_dir)
    copies = load_ccle_copies(ccle_dir)

    # PERF: build the SAMPLE_ID -> canonical label lookup once instead of
    # scanning samp_data for every sample; first occurrence wins, which
    # matches the original `[0]` behaviour for duplicate SAMPLE_IDs
    samp_map = {}
    for lbl, smp_id in samp_data.SAMPLE_ID.items():
        samp_map.setdefault(smp_id, lbl)

    expr.index = [samp_map[smp] for smp in expr.index]
    copies.index = [samp_map[smp] for smp in copies.index]
    variants.Sample = [samp_map[smp] for smp in variants.Sample]

    use_samps = set(expr.index) & set(copies.index)

    # BUG FIX: pandas .loc does not accept a set as an indexer; a sorted
    # list also makes the row order deterministic across runs
    samp_list = sorted(use_samps)
    expr = drop_duplicate_genes(expr.loc[samp_list])

    annot_file = os.path.join(annot_dir, "gencode.v19.annotation.gtf.gz")
    annot_data = get_gencode(annot_file, ['transcript', 'exon'])

    # restructure annotation data around expression gene labels
    annot_dict = {
        at['gene_name']: {'Ens': ens, **at}
        for ens, at in annot_data.items()
        if at['gene_name'] in set(expr.columns)
    }

    expr = expr.loc[:, expr.columns.isin(annot_dict)]
    variants = variants.loc[variants.Sample.isin(use_samps)
                            & variants.Gene.isin(annot_dict)]

    copies = copies.loc[samp_list, copies.columns.isin(annot_dict)]
    copy_df = pd.DataFrame(copies.stack()).reset_index()
    copy_df.columns = ['Sample', 'Gene', 'Copy']

    # keep only non-neutral discretized copy number calls
    copy_df = copy_df.loc[copy_df.Copy != 0]
    copy_df.Copy = copy_df.Copy.map({
        -2: 'DeepDel',
        -1: 'ShalDel',
        1: 'ShalGain',
        2: 'DeepGain'
    })

    return expr, variants, copy_df, annot_dict
# ---- Example 5 ----
def process_input_datasets(baml_dir, annot_dir, syn, **data_args):
    """Assemble Beat-AML expression and mutation datasets.

    Args:
        baml_dir: Directory containing the Beat-AML data files.
        annot_dir: Directory containing the GENCODE annotation file.
        syn: A logged-in Synapse client used to download variant calls.

    Returns:
        Tuple of (expr_data, variants, annot_dict).
    """
    samp_data = pd.read_csv(os.path.join(baml_dir, "VarCalls",
                                         "TableS12_WES_samples.tsv"),
                            sep='\t')
    expr = load_beat_expression(baml_dir)

    annot_file = os.path.join(annot_dir, "gencode.v19.annotation.gtf.gz")
    if 'annot_fields' in data_args:
        annot_data = get_gencode(annot_file, data_args['annot_fields'])
    else:
        annot_data = get_gencode(annot_file)

    # expression columns are Ensembl gene IDs at this point, so the
    # annotation is filtered on the Ensembl key rather than the gene symbol
    annot_dict = {at['gene_name']: {'Ens': ens, **at}
                  for ens, at in annot_data.items()
                  if ens in set(expr.columns)}

    # TODO: incorporate supplemental mutation data, eg. laboratory-based
    # data for FLT3 ITDs found here:
    # https://www.nature.com/articles/s41586-018-0623-z
    variants = load_beat_variants(syn, **data_args)

    # harmonize sample labels across the expression and mutation tables
    variants['Sample'] = ['pid{}'.format(pid) for pid in variants.Sample.values]
    samp_data['Sample'] = ['pid{}'.format(pid) for pid in samp_data.patientId]
    use_samps = set(expr.index) & set(samp_data.Sample)

    # BUG FIX: pandas .loc does not accept a set as an indexer; sorting
    # also makes the row order deterministic across runs
    expr = expr.loc[sorted(use_samps), expr.columns.isin(annot_data)]
    expr_data = drop_duplicate_genes(expr.rename(
        columns={gn: annot_data[gn]['gene_name'] for gn in expr.columns}))

    # duplicates need to be filtered out here as they arise from two different
    # callers (varscan and mutect) being used to produce the mutation dataset
    variants = variants.loc[variants.Sample.isin(use_samps)
                            & variants.Gene.isin(annot_dict)]
    variants = variants.loc[~variants.duplicated()]

    return expr_data, variants, annot_dict
# ---- Example 6 ----
    def __init__(self,
                 cohorts, mut_genes, mut_levels,
                 expr_source, var_source, copy_source,
                 annot_file, domain_dir=None, type_file=None,
                 top_genes=100, samp_cutoff=None, cv_prop=2.0/3, cv_seed=None,
                 **coh_args):
        """Load expression and mutation data for several cohorts at once.

        Args:
            cohorts: The cohorts to load; the first cohort's annotation is
                used as the reference copy (see below).
            mut_genes: Genes whose mutations are of interest; may be falsy.
            mut_levels: Mutation annotation levels — mutated in place by
                inserting 'Scale' and 'Copy' levels below.
            expr_source: Source of the expression data.
            var_source: Source of the variant call data.
            copy_source: Source of the copy number data.
            annot_file: Path to a GENCODE annotation file.
            domain_dir: Optional directory of protein domain data.
            type_file: Optional tab-separated table with DISEASE and SUBTYPE
                columns, used with coh_args['use_types'] to filter samples.
            top_genes, samp_cutoff, cv_prop, cv_seed: Passed through to the
                parent class.
        """
        self.cohorts = cohorts

        expr_dict = {cohort: get_expr_data(cohort, expr_source, **coh_args)
                     for cohort in cohorts}

        if 'annot_fields' in coh_args:
            annot_data = get_gencode(annot_file, coh_args['annot_fields'])
        else:
            annot_data = get_gencode(annot_file)

        # restructure annotation data around expression gene labels,
        # separately for each cohort's set of expressed genes
        gene_annot = {cohort: {
            at['gene_name']: {**{'Ens': ens}, **at}
            for ens, at in annot_data.items()
            if at['gene_name'] in set(expr.columns.get_level_values('Gene'))
            }
            for cohort, expr in expr_dict.items()}

        # attach variant and copy number calls to each cohort's expression;
        # each entry is an (expression, variants, copy calls) sequence
        data_dict = {cohort: add_variant_data(cohort, var_source, copy_source,
                                              expr, gene_annot[cohort],
                                              **coh_args)
                     for cohort, expr in expr_dict.items()}

        # genes present in every cohort's expression data
        use_genes = reduce(and_,
                           [expr.columns for expr in expr_dict.values()])
        expr = pd.concat([data_list[0] for data_list in data_dict.values()])

        # keep annotation only for the shared genes; the first cohort's
        # annotation serves as the reference copy
        self.gene_annot = {gn: ant
                           for gn, ant in gene_annot[cohorts[0]].items()
                           if gn in use_genes}

        if (type_file is not None and 'use_types' in coh_args
                and coh_args['use_types'] is not None):
            type_data = pd.read_csv(type_file,
                                    sep='\t', index_col=0, comment='#')

            # collect the samples matching the requested subtypes for each
            # cohort; a cohort mapped to None keeps all of its samples
            use_samps = set()
            for cohort in cohorts:
                if coh_args['use_types'][cohort] is not None:
                    use_samps |= set(
                        type_data.index[(type_data.DISEASE == cohort)
                                        & (type_data.SUBTYPE.isin(
                                            coh_args['use_types'][cohort]))]
                        )

                else:
                    use_samps |= set(data_dict[cohort][0].index)

            use_samps &= set(expr.index)

        else:
            # NOTE(review): on this branch use_samps is a pandas Index
            # rather than a set as above — confirm the intersection with a
            # set below behaves as intended in the pandas version in use
            use_samps = expr.index

        expr = expr.loc[use_samps]
        self.cohort_samps = {coh: set(data_dict[coh][0].index) & use_samps
                             for coh in cohorts}

        variants = pd.concat([data_list[1]
                              for data_list in data_dict.values()])
        copy_df = pd.concat([data_list[2]
                             for data_list in data_dict.values()])

        # keep raw allele counts for the genes of interest
        if mut_genes:
            self.alleles = variants.loc[
                variants.Gene.isin(mut_genes),
                ['Sample', 'Protein', 'ref_count', 'alt_count']
                ]

        # add a mutation level indicating if a mutation is a CNA or not by
        # first figuring out where to situate it relative to the other
        # levels...
        if 'Gene' in mut_levels:
            scale_lvl = mut_levels.index('Gene') + 1
        else:
            scale_lvl = 0

        # ...and then inserting the new level, and adding its corresponding
        # values to the mutation and copy number alteration datasets
        mut_levels.insert(scale_lvl, 'Scale')
        mut_levels.insert(scale_lvl + 1, 'Copy')
        variants['Scale'] = 'Point'
        copy_df['Scale'] = 'Copy'

        super().__init__(expr, pd.concat([variants, copy_df], sort=True),
                         mut_genes, mut_levels, domain_dir,
                         top_genes, samp_cutoff, cv_prop, cv_seed)
# ---- Example 7 ----
    def __init__(self,
                 mut_genes,
                 mut_levels,
                 toil_dir,
                 sample_data,
                 tx_map,
                 syn,
                 copy_dir,
                 annot_file,
                 top_genes=None,
                 samp_cutoff=25,
                 cv_prop=0.75,
                 cv_seed=None,
                 **coh_args):
        """Combine TCGA-LAML and Beat-AML transcript-level expression with
        TCGA mutation and copy number calls.

        Args:
            mut_genes: Genes whose mutations are of interest.
            mut_levels: Mutation annotation levels — may be mutated in place
                by inserting 'Scale' and 'Copy' levels below.
            toil_dir: Directory with Toil-recomputed TCGA expression calls.
            sample_data: Directory containing the Beat-AML matrices.
            tx_map: Table mapping transcript IDs to gene labels.
            syn: A logged-in Synapse client used for mc3 variant calls.
            copy_dir: Directory with Firehose copy number data.
            annot_file: Path to a GENCODE annotation file.
            top_genes, samp_cutoff, cv_prop, cv_seed: Passed through to the
                parent class.
        """
        # TCGA LAML transcript-level expression; the first column holds
        # 'transcript|gene' labels
        tcga_expr = pd.read_csv(os.path.join(toil_dir, 'TCGA',
                                             'TCGA_LAML_tpm.tsv.gz'),
                                sep='\t')

        # index rows by the transcript part of the label, then drop the
        # label column itself
        tcga_expr.index = tcga_expr.iloc[:,
                                         0].str.split('|').apply(itemgetter(0))
        tcga_expr = tcga_expr.iloc[:, 1:]

        # map expression columns to TCGA aliquot barcodes using the LAML
        # rows of the ID table
        id_map = pd.read_csv(os.path.join(toil_dir, 'TCGA_ID_MAP.csv'),
                             sep=',',
                             index_col=0)
        id_map = id_map.loc[id_map['Disease'] == 'LAML']
        tcga_expr.columns = id_map.loc[tcga_expr.columns,
                                       'AliquotBarcode'].values

        # keep only transcripts present in the transcript->gene map, with
        # samples as rows after the transpose
        tx_annot = pd.read_csv(tx_map, sep='\t', index_col=0)
        tcga_expr.index.name = 'Transcript'
        tcga_expr = tcga_expr.transpose(
        ).loc[:, [tx in tx_annot.index for tx in tcga_expr.index]]

        # label columns with a (Gene, Transcript) MultiIndex
        tcga_expr.columns = pd.MultiIndex.from_arrays(
            [tx_annot.loc[tcga_expr.columns, 'gene'], tcga_expr.columns],
            names=['Gene', 'Transcript'])

        tcga_expr.sort_index(axis=1, level=['Gene'], inplace=True)
        annot_data = get_gencode(annot_file)

        # restructure annotation data around expression gene labels
        self.gene_annot = {
            at['gene_name']: {
                **{
                    'Ens': ens
                },
                **at
            }
            for ens, at in annot_data.items()
            if at['gene_name'] in tcga_expr.columns
        }

        # attach mc3 variant calls and Firehose copy number data to the
        # TCGA expression cohort
        expr, variants, copy_df = add_variant_data(
            cohort='LAML',
            var_source='mc3',
            copy_source='Firehose',
            syn=syn,
            expr=tcga_expr,
            copy_dir=copy_dir,
            gene_annot=self.gene_annot,
        )

        # Beat-AML transcript-level expression, transcripts as rows
        beataml_expr = pd.read_csv(os.path.join(sample_data, 'matrices',
                                                'CTD2_TPM_transcript.tsv'),
                                   sep='\t',
                                   index_col=0)

        # restrict both cohorts to their shared transcripts
        use_genes = sorted(
            set(beataml_expr.index)
            & set(expr.columns.get_level_values('Transcript')))
        expr = log_norm(expr.loc[:, (slice(None), use_genes)])

        # align the Beat-AML columns with the TCGA transcript order
        beataml_expr = log_norm(beataml_expr.transpose()).loc[:, use_genes]
        beataml_expr = beataml_expr.loc[:,
                                        expr.columns.
                                        get_level_values('Transcript')]

        beataml_expr.columns = pd.MultiIndex.from_arrays(
            [expr.columns.get_level_values('Gene'), beataml_expr.columns],
            names=['Gene', 'Transcript'])

        # rescale each transcript's expression to [0, 1] within each cohort
        # separately so the two cohorts become comparable
        beataml_expr = beataml_expr.apply(lambda x: (x - x.min()) /
                                          (x.max() - x.min())).fillna(0.0)
        expr = expr.apply(lambda x: (x - x.min()) /
                          (x.max() - x.min())).fillna(0.0)

        # add a mutation level distinguishing point mutations from CNAs,
        # placed just after the 'Gene' level when present
        if len(mut_levels) > 1 or mut_levels[0] != 'Gene':
            if 'Gene' in mut_levels:
                scale_lvl = mut_levels.index('Gene') + 1
            else:
                scale_lvl = 0

            mut_levels.insert(scale_lvl, 'Scale')
            mut_levels.insert(scale_lvl + 1, 'Copy')
            variants['Scale'] = 'Point'
            copy_df['Scale'] = 'Copy'

        super().__init__(pd.concat([expr, beataml_expr], sort=True),
                         pd.concat([variants, copy_df], sort=True), mut_genes,
                         mut_levels, top_genes, samp_cutoff, cv_prop, cv_seed)
# ---- Example 8 ----
    def __init__(self,
                 cohorts,
                 mut_levels,
                 mut_genes,
                 expr_sources,
                 var_sources,
                 copy_sources,
                 annot_file,
                 domain_dir=None,
                 type_file=None,
                 cv_seed=None,
                 test_prop=0,
                 **coh_args):
        """Load expression and mutation data for multiple cohorts.

        Args:
            cohorts: The cohorts to load.
            mut_levels: Mutation annotation levels — mutated in place by
                inserting 'Scale' and 'Copy' levels below.
            mut_genes: Genes whose mutations are of interest.
            expr_sources: Expression source(s), cycled across cohorts.
            var_sources: Variant source(s), cycled across cohorts; None
                falls back to the expression sources.
            copy_sources: Copy number source(s), cycled across cohorts.
            annot_file: Path to a GENCODE annotation file.
            domain_dir: Optional directory of protein domain data.
            type_file: Optional table mapping samples to subtypes.
            cv_seed: Random seed for the cross-validation split.
            test_prop: Proportion of samples held out for testing.
        """
        self.cohorts = cohorts

        if isinstance(expr_sources, str):
            expr_sources = [expr_sources]

        if var_sources is None:
            var_sources = expr_sources

        else:
            # BUG FIX: normalize a single source given as a string *before*
            # testing for 'mc3' — previously a bare 'mc3' string matched the
            # substring test first and was never wrapped in a list, so
            # cycle() below iterated over its characters
            if isinstance(var_sources, str):
                var_sources = [var_sources]

            if 'mc3' in var_sources:
                # mc3 variant calls are downloaded once here and shared
                # across cohorts through the keyword arguments
                coh_args = {**coh_args,
                            'mc3': get_variants_mc3(coh_args['syn'])}

        if isinstance(copy_sources, str):
            copy_sources = [copy_sources]

        # load expression data for each cohort, get gene annotation
        expr_raw = {
            coh: drop_duplicate_genes(get_expr_data(coh, expr_src, **coh_args))
            for coh, expr_src in zip(cohorts, cycle(expr_sources))
        }

        if 'annot_fields' in coh_args:
            annot_data = get_gencode(annot_file, coh_args['annot_fields'])
        else:
            annot_data = get_gencode(annot_file)

        # PERF: compute the genes shared by all cohorts once rather than
        # re-evaluating the reduce() for every annotation entry
        use_genes = set(reduce(and_, [
            expr.columns.get_level_values('Gene')
            for expr in expr_raw.values()
        ]))

        # restructure annotation data around the shared gene labels
        self.gene_annot = {
            at['gene_name']: {'Ens': ens, **at}
            for ens, at in annot_data.items()
            if at['gene_name'] in use_genes
        }

        # attach mutation calls to each cohort, folding the copy number
        # alterations into each cohort's variant table
        expr_dict = {cohort: None for cohort in cohorts}
        var_dict = {cohort: None for cohort in cohorts}
        for cohort, var_source, copy_source in zip(cohorts, cycle(var_sources),
                                                   cycle(copy_sources)):

            expr_dict[cohort], var_dict[cohort], copy_df = add_mutations(
                cohort, var_source, copy_source, expr_raw[cohort],
                self.gene_annot, **coh_args)

            var_dict[cohort].loc[:, 'Scale'] = 'Point'
            copy_df.loc[:, 'Scale'] = 'Copy'
            var_dict[cohort] = pd.concat([var_dict[cohort], copy_df],
                                         sort=True)

        # insert the level distinguishing point mutations from CNAs just
        # after the 'Gene' level when present
        if 'Gene' in mut_levels:
            scale_lvl = mut_levels.index('Gene') + 1
        else:
            scale_lvl = 0

        mut_levels.insert(scale_lvl, 'Scale')
        mut_levels.insert(scale_lvl + 1, 'Copy')

        # BUG FIX: the original call referenced `top_genes`, `samp_cutoff`,
        # and `cv_prop`, none of which are defined in this scope (NameError);
        # pass the split parameters this class actually receives instead
        # TODO(review): confirm against the parent class signature
        super().__init__(expr_dict, var_dict, mut_genes, mut_levels,
                         domain_dir, cv_seed, test_prop)
# ---- Example 9 ----
    def __init__(self,
                 mut_genes,
                 mut_levels,
                 expr_source,
                 var_source,
                 copy_source,
                 annot_file,
                 top_genes=100,
                 samp_cutoff=None,
                 cv_prop=2.0 / 3,
                 cv_seed=None,
                 **coh_args):
        """Load expression and mutation data pooled across all cohorts
        available from the given sources.

        Args:
            mut_genes: Genes whose mutations are of interest.
            mut_levels: Mutation annotation levels.
            expr_source: Source of the expression data.
            var_source: Source of the variant calls; None falls back to
                expr_source, 'mc3' uses the pooled mc3 call set.
            copy_source: Optional source of copy number data.
            annot_file: Path to a GENCODE annotation file.
            top_genes, samp_cutoff, cv_prop, cv_seed: Passed through to the
                parent class.
        """
        # BUG FIX: `expr_args` was undefined in the original; the keyword
        # arguments are collected in `coh_args`
        expr_cohorts = list_cohorts(expr_source, **coh_args)

        expr_dict = dict()
        for cohort in expr_cohorts:
            # BUG FIX: the bare `except:` also swallowed SystemExit and
            # KeyboardInterrupt; catch Exception instead
            try:
                expr_data = get_expr_data(cohort, expr_source, **coh_args)

                expr_dict[cohort] = pd.DataFrame(scale(expr_data),
                                                 index=expr_data.index,
                                                 columns=expr_data.columns)

            except Exception:
                print('no expression found for {}'.format(cohort))

        # removes samples that appear in more than one cohort, dropping the
        # overlap from whichever cohort retains the most unique samples
        for coh1, coh2 in combinations(expr_dict, 2):
            if len(expr_dict[coh1].index.intersection(expr_dict[coh2].index)):
                ovlp1 = expr_dict[coh1].index.isin(expr_dict[coh2].index)
                ovlp2 = expr_dict[coh2].index.isin(expr_dict[coh1].index)

                if np.all(ovlp1):
                    expr_dict[coh2] = expr_dict[coh2].loc[~ovlp2, :]
                elif np.all(ovlp2) or np.sum(~ovlp1) >= np.sum(~ovlp2):
                    expr_dict[coh1] = expr_dict[coh1].loc[~ovlp1, :]
                else:
                    expr_dict[coh2] = expr_dict[coh2].loc[~ovlp2, :]

        expr = pd.concat(list(expr_dict.values()))

        # restructure annotation data around expression gene labels
        # BUG FIX: the original rebuilt the annotation with a bare
        # get_gencode() call, ignoring the `annot_file` argument
        self.gene_annot = {
            at['gene_name']: {'Ens': ens, **at}
            for ens, at in get_gencode(annot_file).items()
            if at['gene_name'] in expr.columns
        }

        if var_source == 'mc3':
            variants, matched_samps = add_mutations(cohort=None,
                                                    var_source='mc3',
                                                    expr=expr,
                                                    gene_annot=self.gene_annot,
                                                    **coh_args)

        else:
            # NOTE(review): `matched_samps` is never assigned on this branch
            # even though it is used below — this path was broken in the
            # original as well; confirm before relying on non-mc3 sources
            if var_source is None:
                var_source = expr_source
                var_cohorts = expr_cohorts

            else:
                var_args = dict()
                if 'var_dir' in coh_args:
                    var_args['data_dir'] = coh_args['var_dir']
                if 'syn' in coh_args:
                    var_args['syn'] = coh_args['syn']

                var_cohorts = list_cohorts(var_source, **var_args)
                # BUG FIX: in-place set intersection with a dict raises a
                # TypeError; intersect with the dict's key set instead
                var_cohorts &= set(expr_dict)

            var_dict = dict()
            for cohort in var_cohorts:
                try:
                    # BUG FIX: the original passed `expr_source` here rather
                    # than the variant source resolved above
                    var_dict[cohort] = add_mutations(cohort, var_source,
                                                     **coh_args)

                    if copy_source is not None:
                        var_dict[cohort] = pd.concat([
                            var_dict[cohort],
                            get_copy_data(cohort, copy_source, **coh_args)
                        ])

                except Exception:
                    print('no variants found for {}'.format(cohort))

            variants = pd.concat(list(var_dict.values()))

        # relabel the expression samples with their matched mutation sample
        # labels and drop unmatched samples and unannotated genes
        expr = expr.loc[expr.index.isin(matched_samps[0]),
                        expr.columns.isin(list(self.gene_annot))]
        expr.index = [matched_samps[0][old_samp] for old_samp in expr.index]

        variants = variants.loc[variants['Sample'].isin(matched_samps[1]), :]
        variants['Sample'] = [
            matched_samps[1][old_samp] for old_samp in variants['Sample']
        ]

        # for each expression cohort, find the samples that were matched to
        # a sample in the mutation call data
        # BUG FIX: `expr_df.index & matched_samps[0]` relied on deprecated
        # Index set-operator behaviour; use an explicit membership test
        cohort_samps = {
            cohort: set(matched_samps[0][samp]
                        for samp in expr_df.index if samp in matched_samps[0])
            for cohort, expr_df in expr_dict.items()
        }

        # save a list of matched samples for each cohort as an attribute
        self.cohort_samps = {
            cohort: samps
            for cohort, samps in cohort_samps.items() if samps
        }
        copy_data = None
        super().__init__(expr, variants, copy_data, mut_genes, mut_levels,
                         top_genes, samp_cutoff, cv_prop, cv_seed)
# ---- Example 10 ----
    def __init__(self,
                 cohort,
                 copy_genes,
                 expr_source,
                 copy_source,
                 annot_file,
                 type_file,
                 annot_fields=None,
                 use_types=None,
                 cv_seed=None,
                 test_prop=0,
                 **coh_args):
        """Load expression and continuous copy number data for a cohort.

        Args:
            cohort: Name of the cohort to load.
            copy_genes: Genes whose copy number scores are of interest.
            expr_source: Source of the expression data.
            copy_source: Source of the copy number data; only 'Firehose'
                is supported.
            annot_file: Path to a GENCODE annotation file.
            type_file: Table mapping samples to molecular subtypes.
            annot_fields: Optional annotation fields to parse.
            use_types: Optional subtypes used to filter the samples.
            cv_seed: Random seed for the cross-validation split.
            test_prop: Proportion of samples held out for testing.

        Raises:
            ValueError: If `copy_source` is not recognized.
        """
        self.cohort = cohort

        # load expression and gene annotation datasets
        expr = drop_duplicate_genes(
            get_expr_data(cohort, expr_source, **coh_args))
        annot_data = get_gencode(annot_file, annot_fields)

        # restructure annotation data around expression gene labels
        self.gene_annot = {
            at['gene_name']: {'Ens': ens, **at}
            for ens, at in annot_data.items()
            if at['gene_name'] in set(expr.columns.get_level_values('Gene'))
        }

        if copy_source == 'Firehose':
            # fall back to the expression directory when no dedicated copy
            # number directory was given
            if 'copy_dir' not in coh_args:
                copy_dir = coh_args['expr_dir']
            else:
                copy_dir = coh_args['copy_dir']

            copies = get_copies_firehose(cohort, copy_dir, discrete=False)

        else:
            raise ValueError("Unrecognized source of copy number data!")

        expr_match, copy_match = match_tcga_samples(expr.index, copies.index)

        expr_df = expr.loc[
            expr.index.isin(expr_match),
            expr.columns.get_level_values('Gene').isin(self.gene_annot)]
        expr_df.index = [expr_match[old_samp] for old_samp in expr_df.index]

        # chromosome-arm-level features such as '3p' or '17q' are carried
        # along with the gene-level copy number scores
        self.event_feats = copies.columns[copies.columns.str.match(
            "[0-9]+(p|q).?")]
        copy_df = copies.loc[copies.index.isin(copy_match),
                             (copies.columns.isin(self.gene_annot)
                              | copies.columns.isin(self.event_feats))]

        # give the arm-level features minimal annotation entries
        # BUG FIX: pandas >= 2.0 treats str.replace patterns as literal
        # strings by default, so the regex flag must be given explicitly
        self.gene_annot.update(
            zip(self.event_feats,
                [{'Chr': "chr{}".format(lbl)}
                 for lbl in self.event_feats.str.replace("(p|q).*", "",
                                                         regex=True)]))

        copy_df.index = [copy_match[samp] for samp in copy_df.index]
        use_samps = set(expr_df.index)

        if use_types is not None:
            type_data = pd.read_csv(type_file,
                                    sep='\t',
                                    index_col=0,
                                    comment='#')
            type_data = type_data[type_data.DISEASE == cohort]

            use_samps &= set(
                type_data.index[type_data.SUBTYPE.isin(use_types)])

        # BUG FIX: pandas .loc does not accept set indexers for rows or
        # columns; sorted lists also make the ordering deterministic
        expr_df = expr_df.loc[sorted(use_samps)]
        use_cols = sorted((set(copy_genes) & set(copy_df.columns))
                          | set(self.event_feats))
        copy_df = copy_df.loc[copy_df.index.isin(use_samps), use_cols]
        copy_genes = [
            copy_gene for copy_gene in copy_genes
            if copy_gene in copy_df.columns
        ]

        super().__init__(expr_df, copy_df, copy_genes, cv_seed, test_prop)