def test_section_input_process(sections, lookup):
    """Run DarwinianShift with explicit sections and compare to stored results.

    Runs the full process using the CDF Monte Carlo and chi-square statistics,
    then compares ``d.scored_data`` and ``d.results`` with the pickled expected
    data frames stored in RESULTS_DIR.

    :param sections: Sections passed to DarwinianShift. Must expose
        ``.columns``; the presence of a 'start' column selects which set of
        expected result files is used.
    :param lookup: Lookup object passed to DarwinianShift.
    """
    np.random.seed(0)
    d = DarwinianShift(
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        lookup=lookup,
        sections=sections,
        statistics=[CDFMonteCarloTest(), ChiSquareTest()],
        testing_random_seed=1,
        verbose=True
    )
    d.run_all()

    test_name = 'sections1' if 'start' in sections.columns else 'sections2'
    scored_data_path = os.path.join(RESULTS_DIR, "scored_data_{}.pickle".format(test_name))
    results_path = os.path.join(RESULTS_DIR, "results_{}.pickle".format(test_name))

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # Be careful to only overwrite for the particular test(s) that you want.
    # This function runs with many parameters. Uncommenting and running all will overwrite all cases.
    # with open(scored_data_path, 'wb') as f:
    #     pickle.dump(d.scored_data, f)
    # with open(results_path, 'wb') as f:
    #     pickle.dump(d.results, f)

    # Context managers make sure the fixture files are closed after reading.
    with open(scored_data_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.scored_data), sort_dataframe(expected), check_dtype=False)

    with open(results_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.results), sort_dataframe(expected))
def test_read_vcf():
    """Check that the VCF reader returns the same SBS mutations as the TSV file.

    The TSV mutation file is filtered to rows where both 'ref' and 'mut' are
    single bases, and its 'chr', 'pos', 'ref' and 'mut' columns are compared
    with the output of ``read_sbs_from_vcf`` for the matching ``.vcf`` file.
    """
    tsv_df = pd.read_csv(MUTATION_DATA_FILE, sep="\t")
    bases = ['A', 'C', 'G', 'T']
    is_sbs = tsv_df['ref'].isin(bases) & tsv_df['mut'].isin(bases)
    # Copy the filtered rows so the column assignment below acts on an
    # independent frame rather than on a slice of the original.
    tsv_df = tsv_df[is_sbs].copy()

    # The VCF sits next to the mutation file; swap the 4-character extension.
    vcf_df = read_sbs_from_vcf(MUTATION_DATA_FILE[:-4] + '.vcf')

    tsv_df['chr'] = tsv_df['chr'].astype(str)
    assert_frame_equal(sort_dataframe(tsv_df[['chr', 'pos', 'ref', 'mut']]), sort_dataframe(vcf_df))
def compare_column_lists(df1, df2):
    """Assert two data frames agree on their shared columns.

    :param df1: First data frame.
    :param df2: Second data frame.
    :return: Tuple ``(only1, only2)`` of sets holding the column names found
        only in ``df1`` and only in ``df2`` respectively.
    :raises AssertionError: If the shared columns differ after sorting.
    """
    cols1 = set(df1.columns)
    cols2 = set(df2.columns)
    only1 = cols1.difference(cols2)
    only2 = cols2.difference(cols1)
    # Select with a list, not a set: set indexers raise TypeError in recent
    # pandas versions. Keeping df1's column order makes the selection
    # deterministic; dict.fromkeys drops repeated column labels.
    common = [col for col in dict.fromkeys(df1.columns) if col in cols2]
    assert_frame_equal(sort_dataframe(df1[common]), sort_dataframe(df2[common]))
    return only1, only2
def run_full_process(spectra, lookup, gene_list, transcript_list, deduplicate, excluded_positions,
                     use_longest_transcript_only, exclude_synonymous, exclude_nonsense):
    """Run DarwinianShift from start to finish and compare to stored results.

    This will be a very slow test.
    Checks that the process works from start to finish for all options.

    :param spectra: Mapping with a 'spectra' entry that is passed to DarwinianShift.
    :param lookup: Lookup object; its class name forms part of the expected-file name.
    :param gene_list: Gene list passed to DarwinianShift, or None.
    :param transcript_list: Transcript list passed to DarwinianShift, or None.
    :param deduplicate: Passed to DarwinianShift; also encoded in the file name.
    :param excluded_positions: Passed to DarwinianShift, or None.
    :param use_longest_transcript_only: Passed to DarwinianShift; encoded in the file name.
    :param exclude_synonymous: If true, 'synonymous' is added to the excluded mutation types.
    :param exclude_nonsense: If true, 'nonsense' is added to the excluded mutation types.
    """
    np.random.seed(0)

    # Results should be same if either list is defined. Always testing the same transcripts
    if gene_list is not None or transcript_list is not None:
        gene_list_name = '2genes'
    else:
        gene_list_name = 'None'

    ep = '1' if excluded_positions is not None else '0'

    excluded_mutations = []
    if exclude_synonymous:
        excluded_mutations.append('synonymous')
    if exclude_nonsense:
        excluded_mutations.append('nonsense')

    test_name = "_".join([
        lookup.__class__.__name__,
        gene_list_name,
        str(int(deduplicate)),
        ep,
        str(int(use_longest_transcript_only)),
        str(int(exclude_synonymous)),
        str(int(exclude_nonsense)),
    ])
    scored_data_path = os.path.join(RESULTS_DIR, "scored_data_{}.pickle".format(test_name))
    results_path = os.path.join(RESULTS_DIR, "results_{}.pickle".format(test_name))

    statistics = [
        CDFMonteCarloTest(num_draws=1000),
        ChiSquareTest(),
        MonteCarloTest(stat_function=np.mean, num_draws=1000),
        KSTest(),
    ]
    d = DarwinianShift(
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        reference_fasta=REFERENCE_FASTA_FILE,
        lookup=lookup,
        statistics=statistics,
        spectra=spectra['spectra'],
        gene_list=gene_list,
        transcript_list=transcript_list,
        deduplicate=deduplicate,
        excluded_positions=excluded_positions,
        use_longest_transcript_only=use_longest_transcript_only,
        excluded_mutation_types=excluded_mutations,
        testing_random_seed=1,
        verbose=True
    )
    d.run_all()

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # Be careful to only overwrite for the particular test(s) that you want.
    # This function runs with many parameters. Uncommenting and running all will overwrite all cases.
    # with open(scored_data_path, 'wb') as f:
    #     pickle.dump(d.scored_data, f)
    # with open(results_path, 'wb') as f:
    #     pickle.dump(d.results, f)

    # Context managers make sure the fixture files are closed after reading.
    with open(scored_data_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.scored_data), sort_dataframe(expected), check_dtype=False)

    with open(results_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.results), sort_dataframe(expected))
def test_global_kmerE(project_spectrum):
    """Compare the 'glob_k3_is' spectrum of the project with the stored result.

    :param project_spectrum: Project object providing ``get_spectrum``.
    """
    s = project_spectrum.get_spectrum('glob_k3_is')
    expected_path = os.path.join(FILE_DIR, "glob_kmer_E.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # with open(expected_path, 'wb') as f:
    #     pickle.dump(s.spectrum, f)

    # Context manager makes sure the fixture file is closed after reading.
    with open(expected_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(s.spectrum), sort_dataframe(expected))
def test_uniprot_annotation(seq_object):
    """Annotate observed mutations with UniprotLookup and compare to the stored result.

    :param seq_object: Object providing ``observed_mutations`` and ``transcript_id``.
    """
    uniprot_lookup = UniprotLookup(uniprot_directory=TEST_DATA_DIR)
    annotated_data = uniprot_lookup.annotate_dataframe(seq_object.observed_mutations,
                                                       seq_object.transcript_id)
    expected_path = os.path.join(FILE_DIR, "annotated_data.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # with open(expected_path, 'wb') as f:
    #     pickle.dump(annotated_data, f)

    # Context manager makes sure the fixture file is closed after reading.
    with open(expected_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(annotated_data), sort_dataframe(expected))
def test_uniprot_exploration4():
    """Run uniprot_exploration for NOTCH3 with match_variant_change=True.

    The returned data frame is compared with the stored expected result.
    """
    res = uniprot_exploration(
        genes=['NOTCH3'],
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        fasta_file=REFERENCE_FASTA_FILE,
        plot=False,
        uniprot_directory=TEST_DATA_DIR,
        match_variant_change=True,
    )
    expected_path = os.path.join(FILE_DIR, "uniprot_exploration4.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # with open(expected_path, 'wb') as f:
    #     pickle.dump(res, f)

    # Context manager makes sure the fixture file is closed after reading.
    with open(expected_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(res), sort_dataframe(expected))
def test_uniprot_exploration3():
    """Run uniprot_exploration for one transcript section with match_variant_change=False.

    The section covers positions 1378-1640 of transcript ENST00000263388.
    The returned data frame is compared with the stored expected result.
    """
    res = uniprot_exploration(
        sections=[{'transcript_id': 'ENST00000263388', 'start': 1378, 'end': 1640}],
        data=MUTATION_DATA_FILE,
        exon_file=EXON_FILE,
        fasta_file=REFERENCE_FASTA_FILE,
        plot=False,
        uniprot_directory=TEST_DATA_DIR,
        match_variant_change=False,
    )
    expected_path = os.path.join(FILE_DIR, "uniprot_exploration3.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # with open(expected_path, 'wb') as f:
    #     pickle.dump(res, f)

    # Context manager makes sure the fixture file is closed after reading.
    with open(expected_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(res), sort_dataframe(expected))
def test_pdbekb_exploration(project):
    """Run pdbe_kb_exploration with a PDBeKBLookup and compare to the stored result.

    :param project: Project object providing ``change_lookup``.
    """
    p = PDBeKBLookup(
        pdbekb_dir=TEST_DATA_DIR,
        transcript_uniprot_mapping={'ENST00000263388': 'ABC123'}  # Map to the fake test data
    )
    d = project.change_lookup(p)
    res = pdbe_kb_exploration(d, transcript_id='ENST00000263388')
    expected_path = os.path.join(FILE_DIR, "pdbekb_exploration.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # with open(expected_path, 'wb') as f:
    #     pickle.dump(res, f)

    # Context manager makes sure the fixture file is closed after reading.
    with open(expected_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(res), sort_dataframe(expected))
def test_get_overlapped_transcripts(project):
    """Check ``project.get_overlapped_transcripts`` against the stored result.

    The exon file is loaded, restricted to the project's chromosomes and to
    rows with a genomic coding start, and the coding start/end columns are
    converted to int before being passed to the method under test.

    :param project: Project object providing ``chromosomes``, ``data`` and
        ``get_overlapped_transcripts``.
    """
    exon_data = pd.read_csv(EXON_FILE, sep="\t")
    exon_data.loc[:, 'Chromosome/scaffold name'] = exon_data['Chromosome/scaffold name'].astype(str)
    exon_data = exon_data[exon_data['Chromosome/scaffold name'].isin(project.chromosomes)]
    # Copy the filtered rows so the column assignments below act on an
    # independent frame rather than on a slice of the original.
    exon_data = exon_data[~pd.isnull(exon_data['Genomic coding start'])].copy()
    exon_data['Genomic coding start'] = exon_data['Genomic coding start'].astype(int)
    exon_data['Genomic coding end'] = exon_data['Genomic coding end'].astype(int)

    res = project.get_overlapped_transcripts(project.data, exon_data)
    expected_path = os.path.join(RESULTS_DIR, "overlapped_transcripts.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # with open(expected_path, 'wb') as f:
    #     pickle.dump(res, f)

    # Context manager makes sure the fixture file is closed after reading.
    with open(expected_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(res), sort_dataframe(expected))
def test_transcript_kmerB(project_spectrum):
    """Build a k=1 TranscriptKmerSpectrum and compare its complete spectrum to the stored result.

    :param project_spectrum: Project object that the spectrum is attached to
        via ``set_project``.
    """
    s = TranscriptKmerSpectrum(
        deduplicate_spectrum=False,
        k=1,  # Size of kmer nucleotide context. Use 3 for trinucleotides.
        ignore_strand=False,
        missing_value=0,  # To replace missing values. Useful to make non-zero in some cases.
        name=None
    )
    s.set_project(project_spectrum)
    spectrum = s.get_complete_spectrum()
    expected_path = os.path.join(FILE_DIR, "transcript_kmer_B.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # Dumps `spectrum`, the object that is compared below.
    # with open(expected_path, 'wb') as f:
    #     pickle.dump(spectrum, f)

    # Context manager makes sure the fixture file is closed after reading.
    with open(expected_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(spectrum), sort_dataframe(expected))
def test_transcript_read():
    """Read a spectrum file with read_spectrum and compare it to the stored result."""
    s = read_spectrum(os.path.join(FILE_DIR, "transcript_kmer_A.spectrum"))

    # Context manager makes sure the fixture file is closed after reading.
    with open(os.path.join(FILE_DIR, "transcript_kmer_A.pickle"), 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(s.spectrum), sort_dataframe(expected))