Esempio n. 1
0
def calc_values(seq_list):
    """
    Calculate per-sequence motif densities and normalise them against
    randomised controls.

    seq_list: mapping of identifier -> iterable of sequences.

    Returns a tuple (densities, nds). Both are defaultdicts keyed by
    identifier. densities holds the value returned by
    seqo.calc_motif_density for each sequence (using the module-level
    `stops`); nds holds (density - mean(randomised)) / mean(randomised)
    for the same sequences, in the same order.
    """

    densities = collections.defaultdict(list)
    for seq_id in seq_list:
        observed = [
            seqo.calc_motif_density([seq], stops) for seq in seq_list[seq_id]
        ]
        # only touch the defaultdict when there is something to store, so
        # identifiers without sequences never receive an empty entry
        if observed:
            densities[seq_id].extend(observed)

    # one randomisation job per identifier, 1000 iterations each
    ids = list(seq_list)
    sim_outputs = gen.run_in_parallel(ids, ["foo", seq_list, 1000],
                                      randomise_densities)

    # merge the per-worker dictionaries into a single lookup
    pooled = collections.defaultdict(list)
    for sim_output in sim_outputs:
        chunk = sim_output.get()
        for seq_id in chunk:
            pooled[seq_id].extend(chunk[seq_id])

    # plain dict from here on: a missing identifier must raise a KeyError
    # instead of silently producing an empty list
    random_densities = dict(pooled)

    nds = collections.defaultdict(list)
    for seq_id in densities:
        for index, exon_density in enumerate(densities[seq_id]):
            expected = np.mean(random_densities[seq_id][index])
            nds[seq_id].append(
                np.divide(exon_density - expected, expected))

    return densities, nds
Esempio n. 2
0
def generate_motifs_sets(motifs,
                         simulations_to_run,
                         output_file,
                         seed_list=None,
                         onebyone=None):
    '''
    Generate n sets of motifs based on the set of motifs provided and write
    them to output_file, one set per line with motifs joined by "|".

    motifs: the motifs the simulated sets are based on
    simulations_to_run: number of simulated sets to generate
    output_file: path of the file the simulated sets are written to
    seed_list: a list of seeds to use (must be of length greater or equal
        to the number of simulations)
    onebyone: passed through unchanged to gen.run_in_parallel

    Raises Exception if seed_list is given but holds fewer seeds than
    simulations_to_run.
    '''

    #check that there are enough seeds if the seed is set
    if seed_list and simulations_to_run > len(seed_list):
        message = 'The number of seeds must be at least equal to the number of simulations!'
        print(message)
        # carry the message on the exception too, so it is not lost when
        # stdout is not being watched
        raise Exception(message)

    #get dinucleotides
    dinucleotides = get_dinucleotides(motifs)
    #one entry per simulation to distribute over the workers
    input_list = list(range(simulations_to_run))
    #build processes
    processes = gen.run_in_parallel(input_list,
                                    ["foo", motifs, dinucleotides, seed_list],
                                    generate_motifs, onebyone)
    #collect the results and write them out; the context manager makes sure
    #the file is closed even if one of the workers raises
    with open(output_file, "w") as output:
        for process in processes:
            simulants = process.get()
            if simulants:
                for simulant in simulants:
                    output.write('{0}\n'.format("|".join(simulant)))
Esempio n. 3
0
def run_exon_simulation(motif_file, exon_fasta, output_dir, required_simulations, output_file):
    '''
    Compare the stop codon count of a motif set with counts obtained from
    simulations run on the exon sequences.

    motif_file: comma separated file, first field of each row is a motif;
        rows whose first field starts with "#" are skipped
    exon_fasta: fasta file with the exon sequences
    output_dir: not used by this function
    required_simulations: number of simulations to run
    output_file: csv written with a "real" row followed by one row per
        simulated count
    '''

    _, all_exons = gen.read_fasta(exon_fasta)
    # exons needs to be >= 16 to get the two exon ends
    exon_seqs = [seq for seq in all_exons if not len(seq) < 16]

    # first field of every row is the motif, header rows start with "#"
    motifs = []
    for fields in gen.read_many_fields(motif_file, ","):
        candidate = fields[0]
        if candidate[0] == "#":
            continue
        motifs.append(candidate)

    real_count = se.get_stop_codon_count(motifs)

    # one job per simulation, spread over the workers
    simulations = list(range(required_simulations))
    processes = gen.run_in_parallel(simulations, ["foo", exon_seqs, motifs], simulate_motifs)
    outputs = [count for process in processes for count in process.get()]

    lines = ['sim,count\n', 'real,{0}\n'.format(real_count)]
    for number, count in enumerate(outputs, start=1):
        lines.append('{0},{1}\n'.format(number, count))
    with open(output_file, "w") as outfile:
        outfile.writelines(lines)
Esempio n. 4
0
def ptc_snp_simulation(out_prefix,
                       simulation_output_folder,
                       ptc_file,
                       syn_nonsyn_file,
                       exon_junctions_file,
                       bam_files,
                       required_simulations,
                       exon_junctions_bam_output_folder,
                       use_old_sims=False):
    '''
    Prepare the output folders for the PTC simulations and run them.

    simulation_output_folder: folder for the simulation output; the string
        "None" selects "<out_prefix>_simulate_ptc_snps"
    required_simulations: number of simulations; more than one are run in
        parallel, otherwise a single simulation is run directly
    use_old_sims: if True, existing folders are kept (gen.create_directory)
        instead of being created with gen.create_strict_directory
    '''

    # pick the directory helper once: strict creation unless old
    # simulations are being reused
    if use_old_sims:
        make_directory = gen.create_directory
    else:
        make_directory = gen.create_strict_directory

    #simulation output folder
    if simulation_output_folder == "None":
        simulation_output_folder = "{0}_simulate_ptc_snps".format(out_prefix)
    make_directory(simulation_output_folder)

    #simulation bam analysis output folder
    simulation_bam_analysis_output_folder = "{0}__analysis_simulation_ptc_snps_bam_analysis".format(
        out_prefix)
    make_directory(simulation_bam_analysis_output_folder)

    #write the nonsynonymous snps to the simulation output folder
    nonsynonymous_snps_file = "{0}/nonsynonymous_snps.txt".format(
        simulation_output_folder)
    so.filter_by_snp_type(syn_nonsyn_file, nonsynonymous_snps_file, "non")

    #simulations are numbered from 1
    simulations = list(range(1, required_simulations + 1))

    #arguments shared by the parallel and the direct call
    shared_args = [
        out_prefix, simulation_output_folder,
        simulation_bam_analysis_output_folder, ptc_file,
        nonsynonymous_snps_file, exon_junctions_file, bam_files,
        exon_junctions_bam_output_folder
    ]

    if not required_simulations > 1:
        #a single simulation is run directly, with the parallel flag off
        run_ptc_simulation_instance([1], *shared_args, False, use_old_sims)
        return

    #several simulations: spread them over workers, with the parallel flag on
    processes = gen.run_in_parallel(
        simulations, ["foo"] + shared_args + [True, use_old_sims],
        run_ptc_simulation_instance)
    for process in processes:
        process.get()
Esempio n. 5
0
def large_effect_locations_sim():
    '''
    Compare where the large effect PTCs are located with locations
    obtained from 10000 simulations, and print the resulting p values.

    Input files are read from the hard-coded "results/clean_run_2" prefix.
    '''
    output_prefix = "results/clean_run_2/clean_run"
    relative_positions_file = "{0}_PTC_relative_exon_positions.bed".format(
        output_prefix)
    final_output_file = "{0}__analysis_final_output.txt".format(output_prefix)

    filtered_list = get_filtered_skipped_exons(final_output_file)
    large_effects, non_large_effects = get_large_effect_overlaps(
        filtered_list, 5, 0.025)

    # first row of the bed file is skipped; remaining rows are indexed by
    # the exon field as [relative position, start, stop]
    rows = gen.read_many_fields(relative_positions_file, "\t")
    rel_pos_list = {}
    for row in rows[1:]:
        exon_start = int(row[1])
        exon_stop = int(row[2])
        exon_id = row[3]
        rel_pos_list[exon_id] = [int(row[11]), exon_start, exon_stop]

    real_regions, real_ends = get_ptc_regions(large_effects, rel_pos_list)

    sims = list(range(10000))
    processes = gen.run_in_parallel(
        sims, ["foo", large_effects, non_large_effects, rel_pos_list],
        large_effects_locations_sim)

    # each worker returns (regions, ends); pool them over all workers
    regions = []
    ends = []
    for process in processes:
        worker_output = process.get()
        regions.extend(worker_output[0])
        ends.extend(worker_output[1])

    # number of simulants at least as extreme as the real value, plus one
    region_hits = sum(1 for region in regions if region[1] >= real_regions[1])
    ese_region_pval = np.divide(region_hits + 1, len(regions) + 1)
    end_hits = sum(1 for end in ends if end[0] >= real_ends[0])
    ends_pval = np.divide(end_hits + 1, len(ends) + 1)

    print("PTCs in regions (0-2,3-69,70+): {0}".format(real_regions))
    print("Ends (5', 3'): {0}".format(real_ends))
    print("ESE region compared with sims: {0}".format(ese_region_pval))
    print("ESE exon ends with sims: {0}".format(ends_pval))
Esempio n. 6
0
def main():
    """
    Simulate the GC rich and GC poor sets and print an empirical p value.

    100 simulations are run in parallel for each of high_source and
    low_source. 10000 random (high, low) pairs are then drawn from the
    simulated values, and the fraction of pairs whose difference exceeds
    the observed difference is printed as the p value.
    """

    #Get genomic flux rates and pTGA
    genomic_taa_tga, genomic_tga_taa, genomic_pTGA = get_nulls(full_source)

    #Leave one core free, but never ask for fewer than one worker:
    #os.cpu_count() can return None and is 1 on single-core machines
    workers = max(1, (os.cpu_count() or 1) - 1)
    sims = list(range(1, 101))

    def collect_sims(source):
        #run the simulations for one source and flatten the worker outputs
        processes = run_in_parallel(sims, ['foo', source, genomic_taa_tga, genomic_tga_taa], get_sims, workers=workers)
        collected = []
        for process in processes:
            collected.extend(process.get())
        return collected

    #Simulate GC rich set
    high_sims = collect_sims(high_source)

    #Simulate GC poor set
    low_sims = collect_sims(low_source)

    print (len(high_sims))

    #Now select random pairs to generate p value
    observed = 0.08544058 #Edit this observed difference according to the trio tested
    counter = 0
    no_sims = 0

    for _ in tqdm(range(1,10001)):

        random_high = np.random.choice(high_sims)
        random_low = np.random.choice(low_sims)
        if random_high - random_low > observed:
            counter += 1
        no_sims += 1

    p = counter / no_sims

    print (p)
Esempio n. 7
0
def retrieve_bams(ftp_site,
                  local_directory,
                  remote_directory,
                  password_file,
                  subset=None):
    '''
    List the .bam files at an ftp site and hand each of them to
    retrieve_bams_core, six at a time. Per the original notes the intent is
    to download each file, transfer it to a remote server and delete it.

    ftp_site: "<host>/<path>" address of the site that contains the files
    local_directory: the local directory where the files are temporarily stored
    remote_directory: path to directory on remote server where the files are transferred
    password_file: path to file that contains the Watson password
    subset: only retrieve this many .bam files (useful for testing)
    '''
    #make sure the local directory is there
    gen.create_directory(local_directory)

    #everything before the first "/" is the host, the rest is the path
    host, *path_parts = ftp_site.split("/")
    ftp_directory = "/".join(path_parts)
    user = "******"
    password = "******"

    #connect, list the .bam files and disconnect again
    ftp = gen.ftp_connect(host, user, password, directory=ftp_directory)
    all_files = [name for name in ftp.nlst() if name.endswith(".bam")]
    print(len(all_files))
    ftp = gen.ftp_check(ftp, host, user, password, ftp_directory)
    ftp.quit()

    #read the Watson password, dropping trailing newlines
    with open(password_file) as password_handle:
        expect_password = password_handle.read().rstrip("\n")

    #the transfer is done by running an expect script
    #this is the template for that script: "foo" is a placeholder for the
    #name of the file to transfer and is replaced for each file
    expect_string = "#!/usr/bin/expect\nset timeout -1\nspawn rsync {0}/foo {1}\nexpect \"rs949@bssv-watson's password:\"\nsend \"{2}\\n\";\nexpect eof\nexit".format(
        local_directory, remote_directory, expect_password)

    if subset:
        all_files = all_files[:subset]

    #retrieve and transfer .bams in parallel
    core_args = [
        "foo", local_directory, host, user, password, ftp_directory,
        expect_string
    ]
    processes = gen.run_in_parallel(all_files,
                                    core_args,
                                    retrieve_bams_core,
                                    workers=6)
    for process in processes:
        process.get()
Esempio n. 8
0
def simulate_exon_ese_hits(simulations, relative_positions, large_effects,
                           non_large_effects, exon_seqs, ese_list):
    '''
    Test whether the large effect PTCs hit ESEs more often than expected
    when picking from the non-large effect cases, and print the result.

    simulations: number of simulations to run
    relative_positions: PTC entries; field 3 of each entry is the exon
    large_effects: exons with a large effect
    non_large_effects: exons without a large effect
    exon_seqs: exon sequences, passed on to the overlap functions
    ese_list: ESE motifs, passed on to the overlap functions
    '''

    # get the information on the ptc and add to the two lists
    large_effect_info = []
    non_large_effect_info = []
    for ptc in relative_positions:
        exon = ptc[3]
        if exon in large_effects:
            large_effect_info.append(ptc)
        elif exon in non_large_effects:
            non_large_effect_info.append(ptc)

    real_ese_overlap = get_ese_overlap_count(large_effect_info,
                                             exon_seqs,
                                             ese_list,
                                             real=True)

    # the simulations are only run in parallel; the earlier extra serial
    # run of simulate_exon_ese_overlaps over all simulations was dropped
    # because its result was never used
    sims = list(range(simulations))
    processes = gen.run_in_parallel(
        sims,
        ["foo", large_effect_info, exon_seqs, ese_list, non_large_effect_info],
        simulate_exon_ese_overlaps)

    outputs = []
    for process in processes:
        outputs.extend(process.get())

    # simulants at least as extreme as the real count, with the usual +1
    pval = np.divide(
        len([i for i in outputs if i >= real_ese_overlap]) + 1,
        simulations + 1)
    print("Number of PTCs that hit an ESE in the real cases: {0}/{1}".format(
        real_ese_overlap, len(large_effects)))
    print(
        "Is this a significant number when picking non-large effect cases: {0}"
        .format(pval))
Esempio n. 9
0
def ese_hit_simulation(rel_pos_file, coding_exons_fasta, simulations, output_file, ese_file, window_start, window_end, exclude_cpg, clinvar=None):
    '''
    Count the ESE hits of PTCs located within a window of their exon and
    compare with simulated hit counts.

    rel_pos_file: file with the relative positions of the ptcs
    coding_exons_fasta: fasta file with the coding exons
    simulations: number of simulations to run
    output_file: csv written with the real count and one row per simulation
    ese_file: file containing the ESEs
    window_start, window_end: limits of the window within the exon
    exclude_cpg, clinvar: not used by this function
    '''

    # inputs: ptc relative positions, eses and the coding exons
    relative_positions_list = get_relative_position_list(rel_pos_file)
    ese_list = get_eses_from_file(ese_file)
    coding_exons = get_coding_exons(coding_exons_fasta)

    # restrict to exons selected by get_long_exons with a threshold of
    # twice the window end
    long_exons = get_long_exons(relative_positions_list, coding_exons, window_end*2)

    # ptcs inside the window and the number of ese hits they produce
    window_ptcs = get_ptcs_in_window(long_exons, window_start, window_end, coding_exons)
    real_ese_hits = get_ese_hits(window_ptcs, coding_exons, ese_list)

    # simulate the hit counts for nt matched mutations
    simulation_list = list(range(simulations))
    worker_args = ["foo", simulations, window_ptcs, coding_exons_fasta, ese_list, window_start, window_end]
    processes = gen.run_in_parallel(simulation_list, worker_args, simulate_ese_hits)

    # every worker returns a dictionary keyed by simulation number
    simulation_outputs = {}
    for process in processes:
        simulation_outputs.update(process.get())

    with open(output_file, "w") as outfile:
        outfile.write("simulation,ese_hit_count,cant_count\n")
        outfile.write("real,{0},0\n".format(real_ese_hits))
        for simulation in sorted(simulation_outputs):
            counts = simulation_outputs[simulation]
            # simulations are reported 1-based
            fields = gen.stringify([simulation+1, counts[0], counts[1]])
            outfile.write("{0}\n".format(",".join(fields)))
Esempio n. 10
0
def calc_values(seq_list):
    """
    Calculate stop density, GC content and motif density for every exon,
    together with stop densities normalised against randomised controls.

    seq_list: mapping of identifier -> list of exon sequences.

    Returns (densities, nds, gcs, ese). densities, gcs and ese are
    defaultdicts keyed by "<identifier>.<exon index>"; nds is whatever
    calc_nds returns for the densities and the randomised densities.
    """

    densities = collections.defaultdict(list)
    gcs = collections.defaultdict(list)
    ese = collections.defaultdict(list)

    for seq_id in seq_list:
        for index, exon in enumerate(seq_list[seq_id]):
            # the same key is shared by the three dictionaries
            key = '{0}.{1}'.format(seq_id, index)
            single_exon = [exon]
            densities[key].append(seqo.calc_motif_density(single_exon, stops))
            gcs[key].append(seqo.calc_gc_seqs_combined(single_exon))
            ese[key].append(seqo.calc_motif_density(single_exon, motifs))

    # randomise each identifier 1000 times, in parallel
    its = 1000
    sim_outputs = gen.run_in_parallel(list(seq_list), ["foo", seq_list, its],
                                      randomise_densities)

    nds = calc_nds(densities, process_densities(sim_outputs))

    return densities, nds, gcs, ese
Esempio n. 11
0
def ptc_location_simulation(rel_pos_file, coding_exons_fasta, simulations, output_file, ese_overlap_output_file, ese_file=None, only_ese=None, exclude_cpg=None):
    '''
    Simulate mutation locations of PTCs.
    Take the exon in which each PTC is located and randomly pick a site with
    the same nt composition.
    Locations of these matched mutations are used for null.

    rel_pos_file: file with the relative positions of the ptcs
    coding_exons_fasta: fasta file with the coding exons
    simulations: number of simulations to run
    output_file: csv for the real and simulated positions (skipped if only_ese)
    ese_overlap_output_file: csv for the real and simulated ese overlaps
    ese_file: file containing the ESEs
    only_ese: if set, only the ese overlap file is written
    exclude_cpg: passed on to simulate_mutation_locations
    '''

    # NOTE: the body of an older, disabled version of this function used to
    # follow the code below. Only its "def" line had been commented out, so
    # it still ran as part of this function and failed on names that are not
    # defined here. It has been removed.

    # get a list of the relative positions of the ptcs
    relative_positions_list = get_relative_position_list(rel_pos_file)
    # get a list of eses
    ese_list = get_eses_from_file(ese_file)
    # get the coding exons
    coding_exons = get_coding_exons(coding_exons_fasta)

    # get the positions of the ptcs
    real_positions = get_ptc_positions(relative_positions_list, coding_exons)
    # get the number of ptcs with ese overlaps
    real_ese_overlap = get_ptc_ese_overlap(relative_positions_list, coding_exons, ese_list)

    # now do the simulations, numbered from 1
    simulant_list = list(range(1, simulations+1))
    processes = gen.run_in_parallel(simulant_list, ["foo", simulations, relative_positions_list, coding_exons_fasta, ese_list, exclude_cpg], simulate_mutation_locations)

    # each worker returns (positions, ese overlaps), both keyed by simulant
    position_list = {}
    ese_overlap_list = {}
    for process in processes:
        result = process.get()
        position_list.update(result[0])
        ese_overlap_list.update(result[1])

    # ignore writing this to file if we just want the ese overlap
    if not only_ese:
        with open(output_file, "w") as outfile:
            outfile.write('simulation,0.2,3.69,70+\n')
            outfile.write('real,{0}\n'.format(",".join(gen.stringify(real_positions))))
            for simulant in position_list:
                outfile.write("{0},{1}\n".format(simulant, ",".join(gen.stringify(position_list[simulant]))))

    with open(ese_overlap_output_file, "w") as outfile:
        outfile.write('simulation,0.2,3.69,70+\n')
        outfile.write('real,{0}\n'.format(",".join(gen.stringify(real_ese_overlap))))
        for simulant in ese_overlap_list:
            outfile.write("{0},{1}\n".format(simulant, ",".join(gen.stringify(ese_overlap_list[simulant]))))
Esempio n. 12
0
def main():
    """
    Command line entry point of the PTC / exon skipping analysis.

    Parses the command line arguments and then, depending on the flags
    that were set, runs the following steps in order: filtering of the
    genome annotation, SNP extraction, SNP type determination, filtering of
    exon junctions, per-individual .bam processing, optional motif based
    filtering of the PTCs, the PSI calculation, the two PTC simulations and
    the PTC location analysis.

    Most file names are derived from out_prefix. out_prefix is rebound
    part-way through (out-of-frame suffix, motif suffix), so the position
    of a statement decides which prefix its file name gets.

    Raises a bare Exception when a simulation is requested without a number
    of simulations, or when files required by a step do not exist.
    """

    description = "Check whether PTCs are associated with greater rates of exon skipping."
    # NOTE(review): flags/ints look like indices into the list of argument
    # names (10-22 are the boolean switches, 7 is number_of_simulations) -
    # confirm against gen.parse_arguments
    args = gen.parse_arguments(
        description, [
            "gtf", "genome_fasta", "bams_folder", "vcf_folder", "panel_file",
            "out_prefix", "bam_analysis_folder", "number_of_simulations",
            "simulation_output_folder", "motif_file", "filter_genome_data",
            "get_SNPs", "process_bams", "simulate_ptc_snps",
            "motif_complement", "overwrite_intersect", "use_old_sims",
            "out_of_frame", "simulate_ptcs_with_monomorphic",
            "generate_monomorphic_indices", "ignore_determine_snp_type",
            "ignore_psi_calculation", "ptc_location_analysis"
        ],
        flags=[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22],
        ints=[7])
    # unpack every parsed argument into a local of the same name
    gtf, genome_fasta, bams_folder, vcf_folder, panel_file, out_prefix, bam_analysis_folder, number_of_simulations, simulation_output_folder, motif_file, filter_genome_data, get_SNPs, process_bams, simulate_ptc_snps, motif_complement, overwrite_intersect, use_old_sims, out_of_frame, simulate_ptcs_with_monomorphic, generate_monomorphic_indices, ignore_determine_snp_type, ignore_psi_calculation, ptc_location_analysis = args.gtf, args.genome_fasta, args.bams_folder, args.vcf_folder, args.panel_file, args.out_prefix, args.bam_analysis_folder, args.number_of_simulations, args.simulation_output_folder, args.motif_file, args.filter_genome_data, args.get_SNPs, args.process_bams, args.simulate_ptc_snps, args.motif_complement, args.overwrite_intersect, args.use_old_sims, args.out_of_frame, args.simulate_ptcs_with_monomorphic, args.generate_monomorphic_indices, args.ignore_determine_snp_type, args.ignore_psi_calculation, args.ptc_location_analysis

    # reference time passed to every gen.get_time call below
    start = time.time()

    # create any necessary output directories
    # (everything in out_prefix before the last "/" is the directory part)
    directory_splits = out_prefix.split('/')
    directory_paths = "/".join(directory_splits[:-1])
    gen.create_output_directories(directory_paths)
    gen.create_directory('temp_data/')

    # genome level files; these names use the original out_prefix, before
    # any suffix is appended to it further down
    CDS_fasta = "{0}_CDS.fasta".format(out_prefix)
    CDS_bed = "{0}_CDS.bed".format(out_prefix)
    exon_bed = "{0}_exons.bed".format(out_prefix)
    filtered_exon_bed = "{0}_filtered_exons.bed".format(out_prefix)
    exon_junctions_file = "{0}_exon_junctions.bed".format(out_prefix)
    coding_exon_bed = "{0}_coding_exons.bed".format(out_prefix)

    if filter_genome_data:
        #extract and filter CDS coordinates and sequences
        print("Extracting and filtering CDSs...")
        bo.extract_cds(gtf,
                       CDS_bed,
                       CDS_fasta,
                       genome_fasta,
                       all_checks=True,
                       uniquify=True,
                       clean_chrom_only=True,
                       full_chr_name=True)
        gen.get_time(start)

        #group the CDS sequences into families based on sequence similarity
        print("Grouping sequences into families...")
        names = gen.read_fasta(CDS_fasta)[0]
        gen.find_families_ensembl(
            "../source_data/GRCh37_ensembl_protein_families.txt", names,
            "{0}_families.txt".format(out_prefix))
        gen.get_time(start)

        print("Extracting and filtering exons...")
        #extract exon coordinates
        bo.extract_exons(gtf, exon_bed)
        #only leave exons from transcripts that passed quality control in the extract_cds step above.
        #also only leave a single gene per family
        bo.filter_bed_from_fasta(
            exon_bed,
            CDS_fasta,
            filtered_exon_bed,
            families_file="{0}_families.txt".format(out_prefix))
        gen.get_time(start)

        #extract exon-exon junction coordinates
        print("Extracting exon-exon junctions...")
        bo.extract_exon_junctions(exon_bed,
                                  exon_junctions_file,
                                  window_of_interest=2)
        gen.get_time(start)

        #make another exons bed that only contains fully coding exons.
        #This is because in the final analysis, we should only consider fully protein-coding exons.
        #However, for getting the exon junctions we need the full exons file because fully protein-coding exons might
        #be flanked by exons that are not. This is why we couldn't do this filtering step earlier.
        print(
            "Filtering out overlapping, non-coding and partially coding, as well as terminal exons..."
        )
        bo.check_coding(filtered_exon_bed,
                        CDS_bed,
                        coding_exon_bed,
                        remove_overlapping=True)
        gen.get_time(start)

    # the SNP file keeps the original prefix; every file named after this
    # point gets the "_out_of_frame" suffix when out_of_frame is set
    SNP_file = "{0}_SNP_file.txt".format(out_prefix)
    if out_of_frame:
        out_prefix = out_prefix + "_out_of_frame"
    PTC_file = "{0}_ptc_file.txt".format(out_prefix)
    syn_nonsyn_file = "{0}_syn_nonsyn_file.txt".format(out_prefix)
    # CDS fasta name with "_intervals" inserted before the extension
    # (not used again in this function)
    CDS_interval_file = "{0}_intervals{1}".format(
        os.path.splitext(CDS_fasta)[0],
        os.path.splitext(CDS_fasta)[1])
    #check which individuals were included in Geuvadis
    # (.bam files in bams_folder whose name does not contain "proc")
    full_sample_names = os.listdir(bams_folder)
    full_sample_names = [
        i for i in full_sample_names if i[-4:] == ".bam" and "proc" not in i
    ]
    # the sample name is the part of the file name before the first "."
    sample_names = [(i.split("."))[0] for i in full_sample_names]
    sample_names = [i for i in sample_names if len(i) > 0]
    print('{0} samples included in Geuvadis...'.format(len(sample_names)))
    #for some reason, 17 of the samples from Geuvadis don't appear in the 1000genomes vcf
    #I'm gonna have to get to the bottom of this at some point
    #but at the moment I'm just gonna filter them out

    with open("../source_data/samples_in_vcf.txt") as file:
        samples_in_vcf = file.readlines()
    samples_in_vcf = [i.rstrip("\n") for i in samples_in_vcf]
    sample_names = [i for i in sample_names if i in samples_in_vcf]
    print('{0} samples also in vcf...'.format(len(sample_names)))
    sample_file = "{0}_sample_file.txt".format(out_prefix)

    # create a fasta containing all sequences for exons with snp
    # (this runs on every invocation, it is not guarded by a flag)
    coding_exons_fasta = "{0}_coding_exons.fasta".format(out_prefix)
    bo.fasta_from_intervals(coding_exon_bed,
                            coding_exons_fasta,
                            genome_fasta,
                            names=True)

    if get_SNPs:
        #get SNPs for the sample
        intersect_file = "{0}_SNP_CDS_intersect.bed".format(out_prefix)
        print("Getting SNP data...")
        so.get_snps_in_cds(coding_exon_bed, CDS_bed, vcf_folder, panel_file,
                           sample_names, sample_file, intersect_file,
                           out_prefix)
        print("Calculating SNP positions...")
        so.get_snp_positions(sample_file, SNP_file, CDS_bed, intersect_file,
                             out_prefix)
        gen.get_time(start)

    # the SNP type step runs unless it is explicitly switched off
    if ignore_determine_snp_type:
        pass
    else:
        print("Determining SNP type...")
        so.get_snp_change_status(SNP_file,
                                 CDS_fasta,
                                 PTC_file,
                                 syn_nonsyn_file,
                                 out_of_frame=out_of_frame,
                                 ref_check=True,
                                 headers=True)
        gen.get_time(start)

    #filter the exon junctions file to only leave those junctions that flank exons retained in the previous step.
    print(
        "Filtering exon-exon junctions to only leave those that flank exons with a PTC variant..."
    )
    PTC_exon_junctions_file = "{0}_filtered_exon_junctions.bed".format(
        out_prefix)
    bo.filter_exon_junctions(exon_junctions_file, PTC_file,
                             PTC_exon_junctions_file)

    #make a list of all the .bam files and modify them to have the full path rather than just the file name
    # (only files whose sample name survived the vcf filter above are kept)
    bam_files = [
        "{0}/{1}".format(bams_folder, i) for i in full_sample_names
        if (i.split("."))[0] in sample_names
    ]

    #in parallel, do the processing on individual .bam files
    exon_junctions_bam_output_folder = "{0}__analysis_exon_junction_bams".format(
        out_prefix)
    # the string "None" means that no folder was given on the command line
    if bam_analysis_folder == "None":
        bam_analysis_folder = "{0}__analysis_bam_analysis".format(out_prefix)
    gen.create_directory(bam_analysis_folder)
    if process_bams:
        print("Processing RNA-seq data...")
        if out_of_frame:
            # drop the "_out_of_frame" suffix from the folder name again
            # NOTE(review): presumably so that the in-frame and out-of-frame
            # runs share the same exon junction bams - confirm
            splits = exon_junctions_bam_output_folder.split('/')
            splits[-1] = splits[-1].replace('_out_of_frame', '')
            exon_junctions_bam_output_folder = "/".join(splits)
        gen.create_directory(exon_junctions_bam_output_folder)
        #we have to do it like this because you can't pass flags into run_in_parallel
        keyword_dict = {"overwrite_intersect": overwrite_intersect}
        processes = gen.run_in_parallel(bam_files, [
            "foo", exon_junctions_file, PTC_exon_junctions_file,
            bam_analysis_folder, PTC_file, syn_nonsyn_file, out_prefix,
            exon_junctions_bam_output_folder, keyword_dict
        ],
                                        nao.process_bam_per_individual,
                                        workers=36)
        # wait for every worker to finish
        for process in processes:
            process.get()
        gen.get_time(start)

    #if required, filter PTCs to only leave ones that overlap motifs from a specified set
    # NOTE(review): motif_filtering is assigned here but never read in this function
    motif_filtering = False
    if motif_file != "None":
        print(
            "Filtering SNPs based on whether or not they overlap a motif from the specified set..."
        )
        # motif file name without directory and extension
        motif_suffix = ((motif_file.split("/"))[-1]).split(".")[0]
        # out_prefix is rebound again: everything named from here on also
        # carries the motif suffix
        if motif_complement:
            out_prefix = "{0}_{1}_complement".format(out_prefix, motif_suffix)
        else:
            out_prefix = "{0}_{1}".format(out_prefix, motif_suffix)
        filtered_ptc = "{0}_ptc_file.txt".format(out_prefix)
        so.filter_motif_SNPs(CDS_fasta,
                             PTC_file,
                             motif_file,
                             filtered_ptc,
                             complement=motif_complement)
        # the filtered PTC file replaces the full one for the steps below
        PTC_file = filtered_ptc

    final_file = "{0}__analysis_final_output.txt".format(out_prefix)
    # the PSI calculation runs unless it is explicitly switched off
    if ignore_psi_calculation:
        pass
    else:
        print("Calculating PSI...")
        bmo.compare_PSI(PTC_file, bam_analysis_folder, final_file)

    #run the simulation that swaps ptcs for nonsynonymous snps
    if simulate_ptc_snps:
        # NOTE(review): the simulate_ptc_snps part of this condition is
        # redundant inside this branch; the exception carries no message
        if simulate_ptc_snps and not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception
        nao.ptc_snp_simulation(out_prefix,
                               simulation_output_folder,
                               PTC_file,
                               syn_nonsyn_file,
                               exon_junctions_file,
                               bam_files,
                               number_of_simulations,
                               exon_junctions_bam_output_folder,
                               use_old_sims=use_old_sims)

    # run the simulation that picks monomorphic sites
    if simulate_ptcs_with_monomorphic:
        if simulate_ptcs_with_monomorphic and not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception

        # NOTE(review): this name is built from the current out_prefix. If
        # the motif filter above changed out_prefix, it differs from the
        # coding_exons_fasta written earlier and the existence check below
        # can fail - confirm that this is intended
        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        if not os.path.exists(coding_exon_fasta):
            print('Coding exon fasta is required...')
            raise Exception
        nao.ptc_monomorphic_simulation(
            out_prefix,
            simulation_output_folder,
            sample_file,
            genome_fasta,
            PTC_file,
            syn_nonsyn_file,
            coding_exon_bed,
            coding_exon_fasta,
            exon_junctions_file,
            bam_files,
            number_of_simulations,
            generate_indices=generate_monomorphic_indices,
            use_old_sims=use_old_sims)

    # get the locations of the ptcs
    if ptc_location_analysis:
        print("PTC locations analysis...")
        snp_relative_exon_position_file = "{0}_SNP_relative_exon_position.bed".format(
            out_prefix)
        ptc_location_analysis_output_file = "{0}_ptc_location_analysis.csv".format(
            out_prefix)
        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        # all three input files have to exist before the analysis can run
        if not os.path.exists(coding_exon_fasta) or not os.path.exists(
                snp_relative_exon_position_file) or not os.path.exists(
                    PTC_file):
            print("Please run --filter_genome_data and --get_SNPs first...")
            raise Exception
        # need to work out where and what the analysis outputs need to do
        so.ptc_locations(PTC_file, snp_relative_exon_position_file,
                         ptc_location_analysis_output_file)
Esempio n. 13
0
def run_ptc_monomorphic_simulation_instance(
        simulations,
        out_prefix,
        simulation_output_folder,
        simulation_bam_analysis_output_folder,
        ptc_file,
        syn_nonsyn_file,
        exon_junctions_file,
        bam_files,
        nt_indices_files,
        coding_exon_fasta,
        parallel=False,
        use_old_sims=False):
    '''
    Run the monomorphic-site PTC simulation once per entry in simulations.

    For each simulation number this:
      1. creates (or reuses) a per-simulation folder inside
         simulation_output_folder,
      2. generates a pseudo PTC file unless use_old_sims is set and the
         file already exists,
      3. filters the exon junctions file against that pseudo PTC file
         (same reuse rule),
      4. runs process_bam_per_individual over bam_files,
      5. calls bmo.compare_PSI to write
         final_output_simulation_<n>.txt in
         simulation_bam_analysis_output_folder.

    simulations: iterable of simulation numbers to run in this call.
    parallel: True when the simulations themselves are being run in
        parallel by the caller; the BAM files are then processed in a
        single direct call instead of being fanned out again.
    use_old_sims: reuse existing per-simulation folders and files
        rather than regenerating them.

    Returns None. Nothing is done when simulations is empty.
    '''

    simulations = list(simulations)
    if not simulations:
        return

    # One-time setup. The original redid this on every iteration, which
    # meant that from the second simulation on the (already rebound)
    # ptc_file was copied onto itself.
    real_ptcs_for_sim_file = "{0}/{1}".format(simulation_output_folder,
                                              ptc_file.split('/')[-1])
    gen.copy_file(ptc_file, real_ptcs_for_sim_file)
    ptc_file = real_ptcs_for_sim_file

    # the junction bam folder only depends on out_prefix
    exon_junctions_bam_output_folder = "{0}__analysis_exon_junction_bams".format(
        out_prefix)
    gen.create_directory(exon_junctions_bam_output_folder)

    #iterate over simulations
    counter = 0
    for simulation_number in simulations:

        counter = gen.update_counter(counter, 10, "SIMULATION ")

        #setup a folder to contain the individual simulation inside the simulations output
        simulation_instance_folder = "{0}/ptc_monomorphic_simulation_run_{1}".format(
            simulation_output_folder, simulation_number)
        if use_old_sims:
            gen.create_directory(simulation_instance_folder)
        else:
            # presumably wipes an existing folder first - confirm in gen
            gen.create_strict_directory(simulation_instance_folder)

        #generate pseudo ptc snps
        pseudo_monomorphic_ptc_file = "{0}/pseudo_monomorphic_ptc_file_{1}.txt".format(
            simulation_instance_folder, simulation_number)
        if not use_old_sims or not os.path.isfile(
                pseudo_monomorphic_ptc_file):
            # The exon list is only needed here, so it is read only when
            # the pseudo PTCs are actually regenerated. It is re-read per
            # simulation in case the generator modifies it.
            exon_list = bo.get_fasta_exon_intervals(coding_exon_fasta)
            so.generate_pseudo_monomorphic_ptcs(ptc_file, nt_indices_files,
                                                exon_list,
                                                pseudo_monomorphic_ptc_file)

        #filter the exon junctions file to only leave those junctions that flank exons retained in the previous step when generating pseudo ptcs
        pseudo_monomorphic_ptc_exon_junctions_file = "{0}/filtered_exon_junctions_{1}.bed".format(
            simulation_instance_folder, simulation_number)
        if not use_old_sims or not os.path.isfile(
                pseudo_monomorphic_ptc_exon_junctions_file):
            bo.filter_exon_junctions(
                exon_junctions_file, pseudo_monomorphic_ptc_file,
                pseudo_monomorphic_ptc_exon_junctions_file)

        #run the bam analysis for each
        #(don't parallelize if you're doing the simulations in parallel)
        kw_dict = {
            "ptc_snp_simulation": True,
            "simulation_instance_folder": simulation_instance_folder,
            "simulation_number": simulation_number
        }
        if parallel:
            process_bam_per_individual(
                bam_files, exon_junctions_file,
                pseudo_monomorphic_ptc_exon_junctions_file,
                simulation_bam_analysis_output_folder,
                pseudo_monomorphic_ptc_file, syn_nonsyn_file, out_prefix,
                exon_junctions_bam_output_folder, kw_dict)
        else:
            # "foo" is a placeholder for the first argument, which
            # appears to be filled in by gen.run_in_parallel - confirm
            processes = gen.run_in_parallel(bam_files, [
                "foo", exon_junctions_file,
                pseudo_monomorphic_ptc_exon_junctions_file,
                simulation_bam_analysis_output_folder,
                pseudo_monomorphic_ptc_file, syn_nonsyn_file, out_prefix,
                exon_junctions_bam_output_folder, kw_dict
            ],
                                            process_bam_per_individual,
                                            workers=36)
            # wait for every worker to finish
            for process in processes:
                process.get()

        #process final psi for simulation
        final_file = "{0}/final_output_simulation_{1}.txt".format(
            simulation_bam_analysis_output_folder, simulation_number)
        bmo.compare_PSI(pseudo_monomorphic_ptc_file,
                        simulation_bam_analysis_output_folder,
                        final_file,
                        sim_number=simulation_number)
Esempio n. 14
0
def ptc_monomorphic_simulation(out_prefix,
                               simulation_output_folder,
                               sample_file,
                               genome_fasta,
                               ptc_file,
                               syn_nonsyn_file,
                               coding_exon_bed,
                               coding_exon_fasta,
                               exon_junctions_file,
                               bam_files,
                               required_simulations,
                               generate_indices=False,
                               use_old_sims=False):
    '''
    Set up the monomorphic-site PTC simulations and then run them.

    simulation_output_folder: folder for the simulation files. Passing
        None or the string "None" selects the default
        "<out_prefix>_simulate_ptc_monomorphic_sites".
    required_simulations: number of simulations; they are numbered from 1.
        More than one simulation is distributed with gen.run_in_parallel;
        otherwise a single simulation (number 1) is run directly.
    generate_indices: when True (and use_old_sims is False) the output
        folder is created strictly and the per-nucleotide index files are
        regenerated with get_non_mutation_indices.
    use_old_sims: if True, don't pick new simulant SNPs from monomorphic
        sites; existing folders are kept rather than recreated.

    Returns None.
    '''

    print(
        "Running simulation picking monomorphic sites that have the same ancestral allele as a PTC snp..."
    )

    #setup up simulation output folder
    # accept a real None as well as the "None" sentinel string
    if simulation_output_folder is None or simulation_output_folder == "None":
        simulation_output_folder = "{0}_simulate_ptc_monomorphic_sites".format(
            out_prefix)

    # the same condition decides how the folder is created and whether
    # the index files are rebuilt
    regenerate_indices = generate_indices and not use_old_sims
    if regenerate_indices:
        #if the simulation folder we are specifying already exists, delete and start again
        gen.create_strict_directory(simulation_output_folder)
    else:
        gen.create_directory(simulation_output_folder)

    #setup up simulation bam analysis output folder
    simulation_bam_analysis_output_folder = "{0}_simulate_ptc_monomorphic_sites_bam_analysis".format(
        out_prefix)

    # create the filepaths to hold to positions of non mutated sites,
    # one file per nucleotide
    nt_indices_files = {
        nt: "{0}/nt_indices_no_mutations_{1}.fasta".format(
            simulation_output_folder, nt)
        for nt in "ACGT"
    }

    if regenerate_indices:
        get_non_mutation_indices(simulation_output_folder, sample_file,
                                 coding_exon_bed, out_prefix, genome_fasta,
                                 nt_indices_files)

    if use_old_sims:
        gen.create_directory(simulation_bam_analysis_output_folder)
    else:
        #if the simulation folder we are specifying already exists, delete and start again
        gen.create_strict_directory(simulation_bam_analysis_output_folder)

    #if you're only doing one simulation, don't parallelize the simulations
    #parallelize the processing of bams like for true data
    if required_simulations > 1:
        #create a list of simulations to iterate over
        simulations = list(range(1, required_simulations + 1))
        # "foo" is a placeholder for the first argument, which appears to
        # be filled in by gen.run_in_parallel - confirm. The trailing True
        # is the parallel flag of the instance runner.
        processes = gen.run_in_parallel(
            simulations, [
                "foo", out_prefix, simulation_output_folder,
                simulation_bam_analysis_output_folder, ptc_file,
                syn_nonsyn_file, exon_junctions_file, bam_files,
                nt_indices_files, coding_exon_fasta, True, use_old_sims
            ],
            run_ptc_monomorphic_simulation_instance,
            workers=36)
        # wait for every worker to finish
        for process in processes:
            process.get()
    else:
        run_ptc_monomorphic_simulation_instance(
            [1], out_prefix, simulation_output_folder,
            simulation_bam_analysis_output_folder, ptc_file, syn_nonsyn_file,
            exon_junctions_file, bam_files, nt_indices_files,
            coding_exon_fasta, False, use_old_sims)
Esempio n. 15
0
def large_effects_lengths_sim(output_prefix="results/clean_run_2/clean_run",
                              simulations=10000):
    '''
    Test the lengths of the large effects exons
    and whether they are biased of length 3

    The exon length and periodicity values of the large effect exons are
    compared with the values returned by sim_exon_length_info for each
    simulation, and the resulting empirical p-values are printed.

    output_prefix: prefix of the result files read
        ("<prefix>_PTC_relative_exon_positions.bed" and
        "<prefix>__analysis_final_output.txt").
    simulations: number of simulations to run.

    Returns None; the results are printed.
    '''

    relative_positions_file = "{0}_PTC_relative_exon_positions.bed".format(
        output_prefix)
    final_output_file = "{0}__analysis_final_output.txt".format(output_prefix)

    filtered_list = get_filtered_skipped_exons(final_output_file)
    large_effects, non_large_effects = get_large_effect_overlaps(
        filtered_list, 5, 0.025)

    relative_positions = gen.read_many_fields(relative_positions_file, "\t")

    # exon -> [relative position, start, stop]; the first row is skipped
    rel_pos_list = {}
    for ptc in relative_positions[1:]:
        start = int(ptc[1])
        stop = int(ptc[2])
        exon = ptc[3]
        rel_pos = int(ptc[11])
        rel_pos_list[exon] = [rel_pos, start, stop]

    real_lengths, real_periods = get_exon_length_info(large_effects,
                                                      rel_pos_list)

    sims = list(range(simulations))
    # "foo" is a placeholder for the first argument, which appears to be
    # filled in by gen.run_in_parallel - confirm
    processes = gen.run_in_parallel(
        sims, ["foo", large_effects, non_large_effects, rel_pos_list],
        sim_exon_length_info)

    # each worker returns (lengths, periods) for the simulations it ran
    lengths = []
    periods = []
    for process in processes:
        output = process.get()
        lengths.extend(output[0])
        periods.extend(output[1])

    # computed once rather than for every simulated sample
    mean_real_length = np.mean(real_lengths)

    # the +1 on both sides counts the real observation
    length_pval = np.divide(
        sum(1 for length in lengths
            if np.mean(length) <= mean_real_length) + 1,
        len(lengths) + 1)
    # for those exons of length 3
    period_pval = np.divide(
        sum(1 for period in periods if period[0] >= real_periods[0]) + 1,
        len(periods) + 1)

    print("Mean large effect exon length: {0}".format(mean_real_length))
    print("Real periodicity (0,1,2): {0}".format(real_periods))
    print("Lengths compared with sims: {0}".format(length_pval))
    print("Periodicity with sims: {0}".format(period_pval))