Ejemplo n.º 1
0
    def setUpClass(self):
        ''' fixture setup: resolve the test-data paths, load the COSMIC and
            annotation tables plus the genome FASTA, and build the interval
            tree, protein variant predictor and amino-acid mutation finder.
            Everything is stored as an attribute on the receiving object. '''

        fixture_dir = os.path.abspath(
            __file__ + '/../' + 'data/test_find_peptide_variants/')
        self.data_path = fixture_dir

        # one input path per entry found in the vcf/ fixture directory
        vcf_dir = fixture_dir + '/vcf/'
        self.input_path = vcf_dir
        vcf_paths = []
        for vcf_name in os.listdir(vcf_dir):
            vcf_paths.append(vcf_dir + vcf_name)
        self.input_paths = vcf_paths

        # reference tables first, then the genome sequence
        self.cosmic_df = pd.read_csv(
            fixture_dir + '/CosmicGenomeScreensMutantExport.min.tsv',
            sep='\t')
        self.annotation_df = pd.read_csv(
            fixture_dir + '/gencode_min.gtf', sep='\t', skiprows=5)
        self.genome_faidx = Fasta(fixture_dir + '/GRCh38_limited_chr7.fa.gz')

        def feature_position(feat):
            # key function handed to the interval tree
            return feat.pos

        gff_features = (
            GFFFeature(row) for _, row in self.annotation_df.iterrows())
        self.annotation_genome_tree = GenomeIntervalTree(
            feature_position, gff_features)

        self.protein_variant_predictor = ProteinVariantPredictor(
            self.annotation_genome_tree, self.genome_faidx)

        self.aa_mutation_finder = AminoAcidMutationFinder(
            self.cosmic_df,
            self.annotation_df,
            self.genome_faidx,
            cov_bool=1)
Ejemplo n.º 2
0
    def setUpClass(self):
        ''' fixture setup for FindAAMutationsTester: records the fixture
            paths and run settings as attributes, then builds the
            AminoAcidMutationFinder (with None in place of a COSMIC table) '''
        base_dir = os.path.abspath(
            __file__ + '/../' + 'data/test_find_peptide_variants/')
        vcf_dir = base_dir + '/vcf/'

        # locations of the fixture files and of the output file
        self.data_path = base_dir
        self.cosmicdb_path = base_dir + '/cosmic_min.tsv'
        self.annotation_path = base_dir + '/hg38-plus.min.gtf'
        self.genomefa_path = base_dir + '/GRCh38_limited_chr7.fa.gz'
        self.outpath = base_dir + '/test_out.csv'

        # run settings
        self.cov_bool = 1
        self.num_processes = 2  # this is weird

        # one input path per entry found in the vcf/ fixture directory
        self.input_path = vcf_dir
        self.input_paths = [
            vcf_dir + entry for entry in os.listdir(vcf_dir)
        ]

        annotations = pd.read_csv(
            self.annotation_path, sep='\t', skiprows=5)
        genome = Fasta(self.genomefa_path)

        self.aa_mutation_finder = AminoAcidMutationFinder(
            None, annotations, genome, self.cov_bool)
Ejemplo n.º 3
0
    def test_extract_coverage(self):
        ''' testing the extract_coverage() subroutine within
            find_peptide_variants.find_cell_gene_aa_mutations()

            Every input VCF is run through a finder built with cov_bool=1.
            For the A1 and A2 files the coverage field of the first EGFR
            entry is checked; every other file must yield no genes. '''
        def init_process(curr_aa_mutation_finder):
            # publish the finder through a module-level global, which is
            # where process_cell() looks it up
            global current_process_aa_mutation_finder
            current_process_aa_mutation_finder = curr_aa_mutation_finder

        def process_cell(path):
            return current_process_aa_mutation_finder \
                .find_cell_gene_aa_mutations(path=path)

        def first_egfr_coverage(gene_aa_mutations):
            ''' coverage field (second comma-separated item) of the first
                EGFR entry in the result '''
            assert 'EGFR' in gene_aa_mutations
            first_entry = list(gene_aa_mutations.get('EGFR'))[0]
            return first_entry.split(',')[1]

        # need to rebuild this with larger version of COSMIC
        cosmicdb_path = self.data_path + '/cosmic_min.tsv'
        cosmic_df = pd.read_csv(cosmicdb_path, sep='\t')
        curr_aa_mutation_finder = AminoAcidMutationFinder(cosmic_df,
                                                          self.annotation_df,
                                                          self.genome_faidx,
                                                          cov_bool=1)

        # the same finder serves every file, so register it once
        init_process(curr_aa_mutation_finder)

        for vcf_path in self.input_paths:
            # str.strip() would treat the directory as a *set of characters*
            # to trim from both ends and could eat part of the file name;
            # take the file-name component instead
            curr_vcf = os.path.basename(vcf_path)
            gene_aa_mutations = process_cell(vcf_path)

            if 'A1' in curr_vcf:
                assert first_egfr_coverage(gene_aa_mutations) == '[2:0]'

            elif 'A2' in curr_vcf:
                assert first_egfr_coverage(gene_aa_mutations) in (
                    '[0:2]', '[3:2:1]')

            else:
                assert not list(gene_aa_mutations)
def test_basic_cmp():
    ''' does find_transcript_mutations() return w/o error, redux

        The result table is written to a CSV, read back, and its index,
        columns and the A1 / A2 EGFR cells are compared with the expected
        values. The CSV is removed afterwards even when a check fails. '''
    from cerebra.find_peptide_variants import AminoAcidMutationFinder

    data_path = os.path.abspath(__file__ + '/../' +
                                'data/test_find_peptide_variants/')
    genomefa_path = data_path + '/GRCh38_limited_chr7.fa'

    annotation_path = data_path + '/gencode_min.gtf'
    coverage = 0
    num_processes = 1  # want to include that multiprocessing module
    outpath = data_path + '/test_out.csv'

    input_path = data_path + '/vcf/'
    input_paths = [input_path + x for x in os.listdir(input_path)]

    cosmic_df = None
    annotation_df = pd.read_csv(annotation_path, sep='\t', skiprows=5)
    genome_faidx = Fasta(genomefa_path)
    aa_mutation_finder = AminoAcidMutationFinder(cosmic_df, annotation_df,
                                                 genome_faidx, coverage)

    results_df = aa_mutation_finder.find_transcript_mutations(
        paths=input_paths, processes=num_processes)

    results_df = results_df.sort_index()  # row sort
    results_df = results_df.sort_index(axis=1)  # column sort

    indel_ensps = [
        'ENSP00000275493.2:p.(Leu858delinsArgTrp)',
        'ENSP00000395243.3:p.(Leu813delinsArgTrp)',
        'ENSP00000415559.1:p.(Leu813delinsArgTrp)'
    ]

    snp_ensps = [
        'ENSP00000415559.1:p.(Leu813Arg)',
        'ENSP00000395243.3:p.(Leu813Arg)',
        'ENSP00000275493.2:p.(Leu858Arg)',
    ]

    try:
        results_df.to_csv(outpath)

        assert os.path.isfile(outpath)

        outdf = pd.read_csv(outpath, index_col=0)
        expect_index = ['A1', 'A2', 'A3', 'A4', 'A5']
        expect_cols = ['EGFR']

        assert list(outdf.index) == expect_index
        assert list(outdf.columns) == expect_cols

        a1_matches = outdf.loc['A1', 'EGFR']
        a2_matches = outdf.loc['A2', 'EGFR']

        # this is a bit backwards but ok considering the weird format of
        # the outfile: each cell is one string, so use substring checks
        for ensp in indel_ensps + snp_ensps:
            assert ensp in a1_matches

        for ensp in snp_ensps:
            assert ensp in a2_matches
    finally:
        # teardown -- runs even when an assertion above fails, so the
        # output file is not left behind in the fixture directory
        if os.path.isfile(outpath):
            os.remove(outpath)