def test_section_input_process(sections, lookup):
    """Run DarwinianShift with explicit ``sections`` and compare to stored results.

    Args:
        sections: Section definitions passed to ``DarwinianShift``. Must expose a
            ``columns`` attribute; whether it contains ``'start'`` selects which
            stored fixture ('sections1' or 'sections2') the output is compared to.
        lookup: Lookup object passed through to ``DarwinianShift``.

    Raises:
        AssertionError: If ``scored_data`` or ``results`` differ from the fixtures.
    """
    np.random.seed(0)

    d = DarwinianShift(data=MUTATION_DATA_FILE,
                       exon_file=EXON_FILE,
                       reference_fasta=REFERENCE_FASTA_FILE,
                       lookup=lookup,
                       sections=sections,
                       statistics=[CDFMonteCarloTest(), ChiSquareTest()],
                       testing_random_seed=1,
                       verbose=True
                       )
    d.run_all()

    test_name = 'sections1' if 'start' in sections.columns else 'sections2'
    scored_data_path = os.path.join(RESULTS_DIR, "scored_data_{}.pickle".format(test_name))
    results_path = os.path.join(RESULTS_DIR, "results_{}.pickle".format(test_name))

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # Be careful to only overwrite for the particular test(s) that you want.
    # This function runs with many parameters. Uncommenting and running all will overwrite all cases.
    # with open(scored_data_path, 'wb') as f:
    #     pickle.dump(d.scored_data, f)
    # with open(results_path, 'wb') as f:
    #     pickle.dump(d.results, f)

    # Context managers make sure the fixture files are closed even if unpickling fails.
    with open(scored_data_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.scored_data), sort_dataframe(expected), check_dtype=False)

    with open(results_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.results), sort_dataframe(expected))
# Example no. 2
# 0
def test_read_vcf():
    """Check that ``read_sbs_from_vcf`` matches the single-base rows of the TSV file.

    The VCF file is expected next to ``MUTATION_DATA_FILE``, with the same name
    but a ``.vcf`` extension.

    Raises:
        AssertionError: If the chr/pos/ref/mut columns of the two sources differ.
    """
    tsv_df = pd.read_csv(MUTATION_DATA_FILE, sep="\t")
    bases = ['A', 'C', 'G', 'T']
    is_single_base = tsv_df['ref'].isin(bases) & tsv_df['mut'].isin(bases)
    # Copy so the column assignment below acts on an independent frame, not on
    # a slice of the original (avoids pandas' chained-assignment warning).
    tsv_df = tsv_df[is_single_base].copy()
    tsv_df['chr'] = tsv_df['chr'].astype(str)

    # Swap the extension for '.vcf' without assuming it is three characters long.
    vcf_path = os.path.splitext(MUTATION_DATA_FILE)[0] + '.vcf'
    vcf_df = read_sbs_from_vcf(vcf_path)

    assert_frame_equal(sort_dataframe(tsv_df[['chr', 'pos', 'ref', 'mut']]), sort_dataframe(vcf_df))
def compare_column_lists(df1, df2):
    """Assert two DataFrames agree on their shared columns and report the others.

    Args:
        df1: First DataFrame.
        df2: Second DataFrame.

    Returns:
        tuple: ``(only1, only2)`` - the set of column names found only in
        ``df1`` and the set found only in ``df2``.

    Raises:
        AssertionError: If the shared columns differ after ``sort_dataframe``.
    """
    cols1 = set(df1.columns)
    cols2 = set(df2.columns)
    only1 = cols1 - cols2
    only2 = cols2 - cols1
    # pandas no longer accepts a set as a column indexer, so select with a list.
    # dict.fromkeys keeps df1's column order and drops repeated labels.
    common = [col for col in dict.fromkeys(df1.columns) if col in cols2]
    assert_frame_equal(sort_dataframe(df1[common]), sort_dataframe(df2[common]))
    return only1, only2
def run_full_process(spectra, lookup, gene_list, transcript_list, deduplicate, excluded_positions,
                      use_longest_transcript_only, exclude_synonymous, exclude_nonsense):
    """Run DarwinianShift end to end for one option combination and check the output.

    The fixture name is built from the class name of ``lookup`` and the option
    values, so each combination is compared against its own pair of pickle files
    in ``RESULTS_DIR``.

    Args:
        spectra: Mapping whose ``'spectra'`` entry is passed to ``DarwinianShift``.
        lookup: Lookup object; its class name is part of the fixture name.
        gene_list: Passed to ``DarwinianShift``. May be None.
        transcript_list: Passed to ``DarwinianShift``. May be None.
        deduplicate: Passed to ``DarwinianShift``; ``int()`` of it is part of the fixture name.
        excluded_positions: Passed to ``DarwinianShift``. May be None.
        use_longest_transcript_only: Passed to ``DarwinianShift``; part of the fixture name.
        exclude_synonymous: If truthy, 'synonymous' is added to the excluded mutation types.
        exclude_nonsense: If truthy, 'nonsense' is added to the excluded mutation types.

    Raises:
        AssertionError: If ``scored_data`` or ``results`` differ from the fixtures.
    """
    np.random.seed(0)
    # This will be a very slow test.
    # Checks that the process works from start to finish for all options.
    if gene_list is not None or transcript_list is not None:
        # Results should be same if any of these are defined. Always testing the same transcripts
        gene_list_name = '2genes'
    else:
        gene_list_name = 'None'
    ep = '1' if excluded_positions is not None else '0'

    excluded_mutations = []
    if exclude_synonymous:
        excluded_mutations.append('synonymous')
    if exclude_nonsense:
        excluded_mutations.append('nonsense')

    test_name = "_".join([lookup.__class__.__name__, gene_list_name, str(int(deduplicate)), ep,
                          str(int(use_longest_transcript_only)), str(int(exclude_synonymous)),
                          str(int(exclude_nonsense))])
    scored_data_path = os.path.join(RESULTS_DIR, "scored_data_{}.pickle".format(test_name))
    results_path = os.path.join(RESULTS_DIR, "results_{}.pickle".format(test_name))

    statistics = [CDFMonteCarloTest(num_draws=1000), ChiSquareTest(),
                  MonteCarloTest(stat_function=np.mean, num_draws=1000),
                  KSTest()]

    d = DarwinianShift(data=MUTATION_DATA_FILE,
                       exon_file=EXON_FILE,
                       reference_fasta=REFERENCE_FASTA_FILE,
                       lookup=lookup,
                       statistics=statistics,
                       spectra=spectra['spectra'],
                       gene_list=gene_list,
                       transcript_list=transcript_list,
                       deduplicate=deduplicate,
                       excluded_positions=excluded_positions,
                       use_longest_transcript_only=use_longest_transcript_only,
                       excluded_mutation_types=excluded_mutations,
                       testing_random_seed=1,
                       verbose=True
                       )
    d.run_all()

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # Be careful to only overwrite for the particular test(s) that you want.
    # This function runs with many parameters. Uncommenting and running all will overwrite all cases.
    # with open(scored_data_path, 'wb') as f:
    #     pickle.dump(d.scored_data, f)
    # with open(results_path, 'wb') as f:
    #     pickle.dump(d.results, f)

    # Context managers make sure the fixture files are closed even if unpickling fails.
    with open(scored_data_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.scored_data), sort_dataframe(expected), check_dtype=False)

    with open(results_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(d.results), sort_dataframe(expected))
# Example no. 5
# 0
def test_global_kmerE(project_spectrum):
    """Compare the 'glob_k3_is' spectrum of ``project_spectrum`` with its stored fixture.

    Args:
        project_spectrum: Object providing ``get_spectrum(name)``.

    Raises:
        AssertionError: If the spectrum differs from ``glob_kmer_E.pickle``.
    """
    s = project_spectrum.get_spectrum('glob_k3_is')
    fixture_path = os.path.join(FILE_DIR, "glob_kmer_E.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # with open(fixture_path, 'wb') as f:
    #     pickle.dump(s.spectrum, f)

    # Context manager makes sure the fixture file is closed after loading.
    with open(fixture_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(s.spectrum), sort_dataframe(expected))
# Example no. 6
# 0
def test_uniprot_annotation(seq_object):
    """Annotate observed mutations with ``UniprotLookup`` and compare to the fixture.

    Args:
        seq_object: Object providing ``observed_mutations`` and ``transcript_id``.

    Raises:
        AssertionError: If the annotated data differs from ``annotated_data.pickle``.
    """
    uniprot_lookup = UniprotLookup(uniprot_directory=TEST_DATA_DIR)
    annotated_data = uniprot_lookup.annotate_dataframe(seq_object.observed_mutations, seq_object.transcript_id)
    fixture_path = os.path.join(FILE_DIR, "annotated_data.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # with open(fixture_path, 'wb') as f:
    #     pickle.dump(annotated_data, f)

    # Context manager makes sure the fixture file is closed after loading.
    with open(fixture_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(annotated_data), sort_dataframe(expected))
# Example no. 7
# 0
def test_uniprot_exploration4():
    """Run ``uniprot_exploration`` for gene NOTCH3 with ``match_variant_change=True``.

    Raises:
        AssertionError: If the result differs from ``uniprot_exploration4.pickle``.
    """
    res = uniprot_exploration(genes=['NOTCH3'], data=MUTATION_DATA_FILE, exon_file=EXON_FILE,
                              fasta_file=REFERENCE_FASTA_FILE, plot=False, uniprot_directory=TEST_DATA_DIR,
                              match_variant_change=True)
    fixture_path = os.path.join(FILE_DIR, "uniprot_exploration4.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # with open(fixture_path, 'wb') as f:
    #     pickle.dump(res, f)

    # Context manager makes sure the fixture file is closed after loading.
    with open(fixture_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(res), sort_dataframe(expected))
# Example no. 8
# 0
def test_uniprot_exploration3():
    """Run ``uniprot_exploration`` on one explicit section with ``match_variant_change=False``.

    The section covers positions 1378-1640 of transcript ENST00000263388.

    Raises:
        AssertionError: If the result differs from ``uniprot_exploration3.pickle``.
    """
    section = {'transcript_id': 'ENST00000263388', 'start': 1378, 'end': 1640}
    res = uniprot_exploration(sections=[section],
                              data=MUTATION_DATA_FILE, exon_file=EXON_FILE,
                              fasta_file=REFERENCE_FASTA_FILE, plot=False, uniprot_directory=TEST_DATA_DIR,
                              match_variant_change=False)
    fixture_path = os.path.join(FILE_DIR, "uniprot_exploration3.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # with open(fixture_path, 'wb') as f:
    #     pickle.dump(res, f)

    # Context manager makes sure the fixture file is closed after loading.
    with open(fixture_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(res), sort_dataframe(expected))
# Example no. 9
# 0
def test_pdbekb_exploration(project):
    """Run ``pdbe_kb_exploration`` for ENST00000263388 using a ``PDBeKBLookup``.

    Args:
        project: Object providing ``change_lookup(lookup)``; the value it returns
            is what gets passed to ``pdbe_kb_exploration``.

    Raises:
        AssertionError: If the result differs from ``pdbekb_exploration.pickle``.
    """
    p = PDBeKBLookup(
        pdbekb_dir=TEST_DATA_DIR,
        transcript_uniprot_mapping={'ENST00000263388': 'ABC123'},  # Map to the fake test data
    )
    d = project.change_lookup(p)
    res = pdbe_kb_exploration(d, transcript_id='ENST00000263388')
    fixture_path = os.path.join(FILE_DIR, "pdbekb_exploration.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # with open(fixture_path, 'wb') as f:
    #     pickle.dump(res, f)

    # Context manager makes sure the fixture file is closed after loading.
    with open(fixture_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(res), sort_dataframe(expected))
def test_get_overlapped_transcripts(project):
    """Check ``project.get_overlapped_transcripts`` against the stored fixture.

    The exon table is read from ``EXON_FILE``, limited to the chromosomes in
    ``project.chromosomes`` and to rows with a non-null coding start, and its
    coding start/end columns are cast to int before the call.

    Args:
        project: Object providing ``chromosomes``, ``data`` and
            ``get_overlapped_transcripts(data, exon_data)``.

    Raises:
        AssertionError: If the result differs from ``overlapped_transcripts.pickle``.
    """
    chrom_col = 'Chromosome/scaffold name'
    start_col = 'Genomic coding start'
    end_col = 'Genomic coding end'

    exon_data = pd.read_csv(EXON_FILE, sep="\t")
    # Replace the column rather than writing through .loc, so a numeric
    # chromosome column is not asked to hold strings in place.
    exon_data[chrom_col] = exon_data[chrom_col].astype(str)
    keep = exon_data[chrom_col].isin(project.chromosomes) & exon_data[start_col].notnull()
    # Copy so the casts below act on an independent frame, not on a slice.
    exon_data = exon_data[keep].copy()
    exon_data[start_col] = exon_data[start_col].astype(int)
    exon_data[end_col] = exon_data[end_col].astype(int)
    res = project.get_overlapped_transcripts(project.data, exon_data)
    fixture_path = os.path.join(RESULTS_DIR, "overlapped_transcripts.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # with open(fixture_path, 'wb') as f:
    #     pickle.dump(res, f)

    # Context manager makes sure the fixture file is closed after loading.
    with open(fixture_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(res), sort_dataframe(expected))
# Example no. 11
# 0
def test_transcript_kmerB(project_spectrum):
    """Build a k=1 ``TranscriptKmerSpectrum`` and compare its complete spectrum to the fixture.

    Args:
        project_spectrum: Project object handed to the spectrum via ``set_project``.

    Raises:
        AssertionError: If the spectrum differs from ``transcript_kmer_B.pickle``.
    """
    s = TranscriptKmerSpectrum(
        deduplicate_spectrum=False,
        k=1,  # Size of kmer nucleotide context. Use 3 for trinucleotides.
        ignore_strand=False,
        missing_value=0,  # To replace missing values. Useful to make non-zero in some cases.
        name=None)
    s.set_project(project_spectrum)
    spectrum = s.get_complete_spectrum()
    fixture_path = os.path.join(FILE_DIR, "transcript_kmer_B.pickle")

    # output new test file. Do not uncomment unless results have changed and confident new results are correct
    # NOTE(review): this dumps `spectrum`, the object compared below; the previous
    # recipe dumped `s.spectrum` - confirm the two are equivalent.
    # with open(fixture_path, 'wb') as f:
    #     pickle.dump(spectrum, f)

    # Context manager makes sure the fixture file is closed after loading.
    with open(fixture_path, 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(spectrum), sort_dataframe(expected))
# Example no. 12
# 0
def test_transcript_read():
    """Read ``transcript_kmer_A.spectrum`` with ``read_spectrum`` and compare to the fixture.

    Raises:
        AssertionError: If the spectrum differs from ``transcript_kmer_A.pickle``.
    """
    s = read_spectrum(os.path.join(FILE_DIR, "transcript_kmer_A.spectrum"))

    # Context manager makes sure the fixture file is closed after loading.
    with open(os.path.join(FILE_DIR, "transcript_kmer_A.pickle"), 'rb') as f:
        expected = pickle.load(f)
    assert_frame_equal(sort_dataframe(s.spectrum), sort_dataframe(expected))