Code example #1
0
    def get_annotation(var_df,
                       ds_name,
                       input_fp=None,
                       output_prefix='',
                       output_dir=None,
                       output_fp=None,
                       filter_condition=None,
                       reference_genome=None):
        """
        Run FatHMM (unless results already exist on disk) and merge its
        scores into the given variant dataframe.
        :param var_df: pandas dataframe with variants
        :param ds_name: dataset name, possibly related to given filter;
                        used for default naming of input and output files
        :param input_fp: absolute path to input file
        :param output_prefix: unused for FatHMM
        :param output_dir: directory for generated files (default: cwd)
        :param output_fp: absolute path to output file
        :param filter_condition: boolean mask restricting the dataframe,
                                 e.g. to one cancer type or one subject
        :param reference_genome: unused; FatHMM needs no reference genome
        :return: extended pandas dataframe with FATHMM scores
        """
        mask = var_df[FUNC_COL] if filter_condition is None \
            else (filter_condition & var_df[FUNC_COL])
        select_indices = np.where(mask)[0]

        if len(select_indices) == 0:
            logger.warning(
                'No variants selected for FATHMM analysis of case {}!'.format(
                    ds_name))

        if output_dir is None:
            output_dir = os.path.join('.')
        if input_fp is None:
            input_fp = os.path.join(
                output_dir, '{}{}'.format(ds_name, FatHMM.INPUT_SUFFIX))
        if output_fp is None:
            output_fp = os.path.join(
                output_dir, '{}{}'.format(ds_name, FatHMM.OUTPUT_SUFFIX))

        # create the input file only once
        if not os.path.isfile(input_fp):
            FatHMM.generate_input_file(input_fp, var_df.iloc[select_indices])

        # run FatHMM only if no previous results exist
        if not os.path.isfile(output_fp):
            FatHMM.run(os.path.abspath(input_fp), os.path.abspath(output_fp))

        fathmm_df = FatHMM.read_results(output_fp)

        # merge the FATHMM scores into the variant dataframe
        add_column(var_df, select_indices, FATHMM_KEY_COL,
                   FatHMM.SCORE_COL, fathmm_df, as_value=True,
                   sub_key=FatHMM.SCORE_COL, as_float=True)

        # sanity check: warn if far fewer predictions than IDs were found
        n_scores = var_df.loc[select_indices][FatHMM.SCORE_COL].count()
        n_ids = var_df.loc[select_indices][FATHMM_KEY_COL].count()
        if n_scores < 0.2 * n_ids + min(3.0, n_ids / 2.0):
            logger.warning(
                'Only {} FATHMM predictions found for case {} but {} IDs.'.
                format(n_scores, ds_name, n_ids))

        return var_df
Code example #2
0
File: candra.py — Project: johannesreiter/LiFD
    def get_annotation(var_df,
                       ds_name,
                       input_fp=None,
                       output_prefix='',
                       output_dir=None,
                       output_fp=None,
                       filter_condition=None,
                       reference_genome='hg19'):
        """
        Run CanDrA (unless results already exist on disk) and merge its
        scores into the given variant dataframe.
        :param var_df: pandas dataframe
        :param ds_name: dataset name, possibly related to given filter;
                        used for default naming of input and output files
        :param input_fp: path to input file
        :param output_prefix: output naming prefix
        :param output_dir: path to output directory
        :param output_fp: if given, CanDrA is not run and the output file
                          is assumed to exist already
        :param filter_condition: boolean mask restricting the dataframe,
                                 e.g. to one cancer type or one subject
        :param reference_genome: human reference genome, e.g. hg19 or hg20
        :return: extended pandas dataframe with CanDrA scores
        :raises RuntimeError: if no unique cancer type can be inferred
        """
        mask = var_df[FUNC_COL] if filter_condition is None \
            else (filter_condition & var_df[FUNC_COL])
        select_indices = np.where(mask)[0]

        if len(select_indices) == 0:
            logger.warning(
                'No variants selected for CanDrA analysis of case {}!'.format(
                    ds_name))

        if output_dir is None:
            output_dir = os.path.join('.')
        if input_fp is None:
            input_fp = os.path.join(
                output_dir, '{}{}'.format(ds_name, Candra.INPUT_SUFFIX))
        if output_fp is None:
            output_fp = os.path.join(
                output_dir, '{}{}'.format(ds_name, Candra.OUTPUT_SUFFIX))

        # create the input file only once
        if not os.path.isfile(input_fp):
            Candra.generate_input_file(input_fp,
                                       var_df.iloc[select_indices],
                                       chromosome_col='Chromosome',
                                       position_col='StartPosition',
                                       reference_col='ReferenceAllele',
                                       alternate_col='AlternateAllele')

        # run CanDrA only if no previous results exist
        if not os.path.isfile(output_fp):
            # CanDrA requires exactly one cancer type for the whole run
            cts = var_df.iloc[select_indices][CT_COL].unique()
            if len(cts) != 1:
                # var_df[var_df.Subject == sub_name][CT_COL].unique()
                raise RuntimeError(
                    'No cancer type could be inferred for subject {}: {} {}'.
                    format(ds_name, cts, len(cts)))

            Candra.run(os.path.abspath(input_fp), os.path.abspath(output_fp),
                       cts[0])

        candra_df = Candra.read_results(output_fp)

        # merge the CanDrA results into the variant dataframe
        Candra.add_amino_acid_change_column(var_df, candra_df)
        add_column(var_df, select_indices, NT_VAR_COL,
                   Candra.LIFD_SCORE_COL, candra_df, as_value=True,
                   sub_key=Candra.SCORE_COL, as_float=True)
        add_column(var_df, select_indices, NT_VAR_COL,
                   Candra.LIFD_CATEGORY_COL, candra_df, as_value=True,
                   sub_key=Candra.CATEGORY_COL)
        # normalize the literal string 'nan' to a proper missing value
        var_df['CanDrA_clf'].replace('nan', NAN, inplace=True)
        add_column(var_df, select_indices, NT_VAR_COL,
                   Candra.LIFD_SIGNIFICANCE_COL, candra_df, as_value=True,
                   sub_key=Candra.SIGNIFICANCE_COL, as_float=True)

        return var_df
Code example #3
0
File: cravat.py — Project: johannesreiter/LiFD
    def get_annotation(var_df,
                       ds_name,
                       input_fp=None,
                       output_prefix='',
                       output_dir=None,
                       output_fp=None,
                       filter_condition=None,
                       reference_genome='hg19'):
        """
        Add or also run CRAVAT and CHASMplus for the given dataframe.
        :param var_df: pandas dataframe
        :param ds_name: name of dataset, possibly related to given filter;
                        used for default naming of input and output files
        :param input_fp: path to input file
        :param output_prefix: output naming prefix
        :param output_dir: path to output directory
        :param output_fp: if output_fp is given then CRAVAT will not be run
                          and it is assumed to exist already
        :param filter_condition: select only a subset of given dataframe,
                                 for example a cancer type or a subject
        :param reference_genome: human reference genome, e.g. hg19 or hg20
        :return: extended pandas dataframe with CHASMplus pancancer and
                 cancer type specific scores and p-values
        :raises RuntimeError: if no unique cancer type can be inferred or
                              the expected CRAVAT output file is missing
        """

        # CRAVAT can not convert genomic positions of mitochondrial DNA
        if filter_condition is None:
            select_indices = np.where(var_df[FUNC_COL]
                                      & (var_df.Chromosome != 'MT'))[0]
        else:
            select_indices = np.where(filter_condition & var_df[FUNC_COL]
                                      & (var_df.Chromosome != 'MT'))[0]

        if len(select_indices) == 0:
            logger.warning(
                'No variants selected for CRAVAT analysis of dataset {}!'.
                format(ds_name))

        if output_dir is None:
            output_dir = os.path.join('.')
        if input_fp is None:
            input_fp = os.path.join(
                output_dir, '{}{}'.format(ds_name, Cravat.INPUT_SUFFIX))

        if not os.path.isfile(os.path.abspath(input_fp)):
            Cravat.generate_input_file(input_fp,
                                       var_df.iloc[select_indices],
                                       chromosome_col=CHR_COL,
                                       position_col=POS_START_COL,
                                       reference_col=REF_COL,
                                       alternate_col=ALT_COL,
                                       subject_col='Subject')
        else:
            logger.debug(
                'CRAVAT input file for dataset {} already exists.'.format(
                    ds_name))

        # get the unique cancer type of the selected variants
        # NOTE: a second uniqueness check previously followed this raise; it
        # was unreachable (this raise already guarantees len(cts) == 1) and
        # contained a malformed f-string ('f{cts}' inside the literal), so
        # it was removed
        cts = var_df.iloc[select_indices][CT_COL].unique()
        if len(cts) != 1:
            # var_df[var_df.Subject == sub_name][CT_COL].unique()
            raise RuntimeError(
                'No cancer type could be inferred for dataset {}: {}'.format(
                    ds_name, cts))
        ct = cts[0]

        if output_fp is None:
            output_fp = os.path.join(output_dir,
                                     f'{ds_name}{Cravat.OUTPUT_SUFFIX}')

            if not os.path.isfile(output_fp):
                # run CRAVAT
                Cravat.run(os.path.abspath(input_fp),
                           output_dir,
                           ct,
                           prefix=output_prefix,
                           sub_name=ds_name,
                           reference_genome=reference_genome)
            else:
                # bug fix: the format string was missing its placeholder, so
                # the file path never appeared in the log message
                logger.debug(
                    'CRAVAT output file already exists: {}'.format(output_fp))

        # os.path.isfile already implies existence
        if os.path.isfile(output_fp):
            if output_fp.endswith('.xlsx'):

                cravat_df = Cravat.read_results(
                    os.path.abspath(output_fp),
                    cancer_type=ct,
                    reference_genome=reference_genome)
                logger.debug('Read cravat results with {} entries: {}'.format(
                    len(cravat_df), output_fp))

                # pancancer CHASMplus p-value and score
                add_column(var_df,
                           select_indices,
                           NT_VAR_COL,
                           Cravat.CP_COL,
                           cravat_df,
                           sub_key=Cravat.CP_COL,
                           as_float=True)
                add_column(var_df,
                           select_indices,
                           NT_VAR_COL,
                           Cravat.CP_SCORE_COL,
                           cravat_df,
                           sub_key=Cravat.CP_SCORE_COL,
                           as_float=True)

                if ct == 'PANCAN':
                    # no specific cancer type: reuse the pancancer values
                    add_column(var_df,
                               select_indices,
                               NT_VAR_COL,
                               Cravat.CP_CT_COL,
                               cravat_df,
                               sub_key=Cravat.CP_COL,
                               as_float=True)
                    add_column(var_df,
                               select_indices,
                               NT_VAR_COL,
                               Cravat.CP_SCORE_CT_COL,
                               cravat_df,
                               sub_key=Cravat.CP_SCORE_COL,
                               as_float=True)
                elif ct is not None and not (isinstance(ct, float)
                                             and np.isnan(ct)):
                    # cancer-type-specific columns are named with the actual
                    # cancer type replacing the 'CT' placeholder
                    add_column(var_df,
                               select_indices,
                               NT_VAR_COL,
                               Cravat.CP_CT_COL,
                               cravat_df,
                               sub_key=Cravat.CP_CT_COL.replace('CT', ct),
                               as_float=True)
                    add_column(var_df,
                               select_indices,
                               NT_VAR_COL,
                               Cravat.CP_SCORE_CT_COL,
                               cravat_df,
                               sub_key=Cravat.CP_SCORE_CT_COL.replace(
                                   'CT', ct),
                               as_float=True)
                else:
                    logger.warning(
                        'No cancer type given for dataset {}: {}'.format(
                            ds_name, ct))

                # determine a minimum expected fraction of predictions;
                # skip the check entirely for very small selections
                if len(select_indices) < 10:
                    min_fraction = 0.0
                else:
                    min_fraction = 1.0

                if 'MutationEffect' in var_df.columns:
                    # CHASMplus only scores substitutions
                    df = var_df.iloc[select_indices, :]
                    min_fraction *= 0.9 * len(
                        df[df.MutationEffect == 'Substitution']) / len(
                            select_indices)
                else:
                    min_fraction *= 0.3

                # check if at least a minimum number of predictions were found
                assert (var_df.iloc[select_indices, :][Cravat.CP_COL].count() >
                        min_fraction * len(select_indices)), \
                    'Only {} Cravat/CHASMplus predictions found for dataset {} but {} indices.'.format(
                        var_df.iloc[select_indices, :][Cravat.CP_COL].count(), ds_name, len(select_indices))

            elif output_fp.endswith('.tsv'):
                cravat_vars = Cravat.read_results(
                    os.path.abspath(output_fp),
                    reference_genome=reference_genome)

                # add CHASM results to the dataframe
                add_column(var_df,
                           select_indices,
                           NT_VAR_COL,
                           'Chasm',
                           cravat_vars,
                           sub_key=0,
                           as_float=True)

                # bug fix: var_df[filter_condition] raised a KeyError when
                # filter_condition was None; count over the selected rows
                # instead (consistent with the xlsx branch above)
                n_chasm = var_df.iloc[select_indices]['Chasm'].count()
                assert n_chasm > 0.3 * len(select_indices), \
                    'Only {} Cravat/Chasm predictions found for dataset {} but {} indices.'.format(
                        n_chasm, ds_name, len(select_indices))
        else:
            logger.error(
                'Missing cravat/chasm output for dataset {}! No file {}'.
                format(ds_name, output_fp))
            raise RuntimeError(
                'Missing cravat/chasm output for dataset {}! No file {}'.
                format(ds_name, output_fp))

        return var_df
Code example #4
0
File: cgi.py — Project: johannesreiter/LiFD
    def get_annotation(var_df,
                       ds_name,
                       input_fp=None,
                       output_prefix='',
                       output_dir=None,
                       output_fp=None,
                       filter_condition=None,
                       reference_genome='hg19'):
        """
        Add CGI analysis results for whole dataset or a given subject's data.
        :param var_df: pandas dataframe
        :param ds_name: name of dataset, possibly related to given filter;
                        used for default naming of input and output files
        :param input_fp: path to input file
        :param output_prefix: output naming prefix
        :param output_dir: path to output directory
        :param output_fp: path to expected output file
        :param filter_condition: select only a subset of given dataframe
                for example CGI could be run separately for different cancer types or subjects
        :param reference_genome: human reference genome, e.g. hg19
        :return: extended pandas dataframe with columns 'cadd_phred', 'CGI_driver', 'CGI_known_driver',
                 'CGI_predicted_driver', 'CGI_driver_gene', 'CGI_driver_gene_source', 'CGI_gene_role',
                 'CGI_driver_mut_prediction'
        """

        if filter_condition is None:
            select_indices = np.where(var_df[FUNC_COL])[0]
        else:
            select_indices = np.where(filter_condition & var_df[FUNC_COL])[0]

        if output_dir is None:
            output_dir = os.path.join('.')
        if input_fp is None:
            input_fp = os.path.join(output_dir,
                                    '{}{}'.format(ds_name, Cgi.INPUT_SUFFIX))
        if output_fp is None:
            output_fp = os.path.join(output_dir,
                                     '{}{}'.format(ds_name, Cgi.OUTPUT_SUFFIX))

        # create the input file only once
        if not os.path.isfile(os.path.abspath(input_fp)):
            Cgi.generate_input_file(input_fp, var_df.iloc[select_indices])

        job_id = None
        if not os.path.isfile(output_fp) and os.path.isfile(input_fp):
            ct = var_df.iloc[select_indices][CT_COL].unique()[0]
            if ct == 'PANCAN':
                # CGI uses 'CANCER' as its pan-cancer identifier
                ct = 'CANCER'
            job_id = Cgi.run(os.path.abspath(input_fp),
                             os.path.abspath(output_fp), ct, ds_name)

        # CGI delivers its results as a zip archive
        if os.path.exists(output_fp) and zipfile.is_zipfile(output_fp):
            cgi_vars_df = Cgi.read_results(os.path.abspath(output_fp))
            # add CGI results to the dataframe
            # interpreting CADD scores:
            # >30 very high, >25 high, >20 medium, >10 low, <=10 very low
            add_column(var_df,
                       select_indices,
                       NT_VAR_COL,
                       'cadd_phred',
                       cgi_vars_df,
                       sub_key='cadd_phred',
                       as_float=True)
            # add three columns for whether variant is a known or predicted
            # driver, or other (passenger)
            add_column(var_df,
                       select_indices,
                       NT_VAR_COL,
                       Cgi.DRIVER_COL,
                       cgi_vars_df,
                       sub_key='driver',
                       as_float=False)
            add_column(var_df,
                       select_indices,
                       NT_VAR_COL,
                       Cgi.KNOWN_DRIVER_COL,
                       cgi_vars_df[cgi_vars_df.driver == 'known'],
                       sub_key='driver',
                       as_value=False)
            add_column(var_df,
                       select_indices,
                       NT_VAR_COL,
                       Cgi.PREDICTED_DRIVER_COL,
                       cgi_vars_df[cgi_vars_df.driver != 'other'],
                       sub_key='driver',
                       as_value=False)

            add_column(var_df,
                       select_indices,
                       NT_VAR_COL,
                       Cgi.DRIVER_GENE_COL,
                       cgi_vars_df,
                       sub_key='driver_gene',
                       as_float=False)
            add_column(var_df,
                       select_indices,
                       NT_VAR_COL,
                       Cgi.SOURCE_COL,
                       cgi_vars_df,
                       sub_key='driver_gene_source',
                       as_float=False)
            add_column(var_df,
                       select_indices,
                       NT_VAR_COL,
                       Cgi.GENE_ROLE_COL,
                       cgi_vars_df,
                       sub_key='gene_role',
                       as_float=False)
            add_column(var_df,
                       select_indices,
                       NT_VAR_COL,
                       Cgi.DRIVER_MUT_PREDICTION_COL,
                       cgi_vars_df,
                       sub_key='driver_mut_prediction',
                       as_float=False)

            # delete the CGI request once results were read successfully
            if job_id is not None:
                r = requests.delete('{}/{}'.format(Cgi.URL, job_id),
                                    headers=Cgi.HEADERS)
                r.json()
        else:
            logger.warning(
                f'Missing CGI output for case {ds_name} with input file: {input_fp}'
            )
            # bug fix: condition was inverted ('not in'), which guaranteed a
            # KeyError on existing_jobs[ds_name]; report the most recent job
            # only when a job for this dataset actually exists
            existing_jobs = Cgi._get_existing_jobs()
            if ds_name in existing_jobs:
                most_recent_date = sorted(existing_jobs[ds_name].keys())[-1]
                job_id = existing_jobs[ds_name][most_recent_date]['id']
                job_info = Cgi.get_job_info(job_id)
                logger.warning(job_info)

        return var_df
Code example #5
0
File: vep.py — Project: johannesreiter/LiFD
    def get_annotation(var_df,
                       ds_name,
                       input_fp=None,
                       output_prefix='',
                       output_dir=None,
                       output_fp=None,
                       filter_condition=None,
                       reference_genome='hg19'):
        """
        Run VEP (unless results already exist on disk) and merge PolyPhen,
        SIFT, impact, and FatHMM-ID annotations into the given dataframe.
        :param var_df: pandas dataframe with input data
        :param ds_name: dataset name, possibly related to given filter;
                        used for default naming of input and output files
        :param input_fp: path to VEP input file
        :param output_prefix: output naming prefix
        :param output_dir: path to output directory
        :param output_fp: path to VEP output file
        :param filter_condition: boolean mask restricting the dataframe
        :param reference_genome: human reference genome, e.g. hg19 or hg20
        :return: extended pandas dataframe with columns 'PolyPhen', 'Sift',
                 'impact', 'consequence', 'FATHMM_ID'
        :raises RuntimeError: if the VEP output file yields no results
        """
        mask = var_df[FUNC_COL] if filter_condition is None \
            else (filter_condition & (var_df[FUNC_COL]))
        select_indices = np.where(mask)[0]

        if output_dir is None:
            output_dir = os.path.join('.')
        if input_fp is None:
            input_fp = os.path.join(output_dir,
                                    '{}{}'.format(ds_name, Vep.INPUT_SUFFIX))
        if output_fp is None:
            output_fp = os.path.join(output_dir,
                                     '{}{}'.format(ds_name, Vep.OUTPUT_SUFFIX))

        # create the VEP input file only once
        if not os.path.isfile(input_fp):
            Vep.generate_input_file(input_fp,
                                    var_df.iloc[select_indices],
                                    chromosome_col='Chromosome',
                                    gene_col='GeneSymbol',
                                    start_col='StartPosition',
                                    end_col='EndPosition',
                                    reference_col='ReferenceAllele',
                                    alternate_col='AlternateAllele')

        # run VEP only if no previous results exist
        if not os.path.isfile(output_fp) and os.path.isfile(input_fp):
            Vep.run(os.path.abspath(input_fp),
                    os.path.abspath(output_fp),
                    reference_genome=reference_genome)

        # extract the relevant annotations from the VEP output and generate
        # FATHMM input data if necessary
        if os.path.isfile(output_fp):
            vep_df = Vep.read_results(output_fp)

            if vep_df is None:
                logger.error('No VEP results for case {}.'.format(ds_name))
                raise RuntimeError(
                    'No VEP results for case {}. '.format(ds_name))

            # PolyPhen and SIFT scores
            add_column(var_df, select_indices, NT_VAR_COL, PP_SCORE_COL,
                       vep_df, sub_key=PP_SCORE_COL, as_value=True,
                       as_float=True)
            add_column(var_df, select_indices, NT_VAR_COL, SIFT_SCORE_COL,
                       vep_df, sub_key=SIFT_SCORE_COL, as_value=True,
                       as_float=True)
            # predicted variant impact
            add_column(var_df, select_indices, NT_VAR_COL, Vep.IMPACT_COL,
                       vep_df, sub_key=Vep.IMPACT_COL, as_value=True)
            # FatHMM IDs, unless a previous tool already provided them
            if FATHMM_KEY_COL not in var_df.columns:
                add_column(var_df, select_indices, NT_VAR_COL,
                           FATHMM_KEY_COL, vep_df, sub_key=FATHMM_KEY_COL,
                           as_value=True)
                # literal 'nan' strings become real missing values
                var_df.replace({FATHMM_KEY_COL: 'nan'},
                               np.nan,
                               inplace=True)

        return var_df