Esempio n. 1
0
def ptc_snp_simulation(out_prefix,
                       simulation_output_folder,
                       ptc_file,
                       syn_nonsyn_file,
                       exon_junctions_file,
                       bam_files,
                       required_simulations,
                       exon_junctions_bam_output_folder,
                       use_old_sims=False):
    '''
    Prepare the folders and input files for the PTC simulations and run them.

    out_prefix: prefix used to build the default output folder names
    simulation_output_folder: folder for the simulated SNP files; the string
    "None" means "derive the folder name from out_prefix"
    ptc_file, syn_nonsyn_file, exon_junctions_file, bam_files,
    exon_junctions_bam_output_folder: passed through to the filtering step
    and to run_ptc_simulation_instance
    required_simulations: number of simulations; more than one are run in
    parallel, otherwise a single simulation (number 1) is run directly
    use_old_sims: if True, keep existing folders so old simulants can be reused
    '''

    #pick the folder helper once: when reusing old simulations the plain
    #create_directory is used, otherwise the strict variant (which, per the
    #original notes, deletes an existing folder and starts again)
    if use_old_sims:
        make_folder = gen.create_directory
    else:
        make_folder = gen.create_strict_directory

    #folder holding the simulated SNP files
    if simulation_output_folder == "None":
        simulation_output_folder = "{0}_simulate_ptc_snps".format(out_prefix)
    make_folder(simulation_output_folder)

    #folder holding the bam analysis of the simulations
    simulation_bam_analysis_output_folder = "{0}__analysis_simulation_ptc_snps_bam_analysis".format(
        out_prefix)
    make_folder(simulation_bam_analysis_output_folder)

    #write all nonsynonymous snps into the simulation output folder
    nonsynonymous_snps_file = "{0}/nonsynonymous_snps.txt".format(
        simulation_output_folder)
    so.filter_by_snp_type(syn_nonsyn_file, nonsynonymous_snps_file, "non")

    #simulation numbers start at 1
    simulations = list(range(1, required_simulations + 1))

    #arguments shared by the parallel and the direct call
    shared_arguments = [
        out_prefix, simulation_output_folder,
        simulation_bam_analysis_output_folder, ptc_file,
        nonsynonymous_snps_file, exon_junctions_file, bam_files,
        exon_junctions_bam_output_folder
    ]

    if required_simulations > 1:
        #several simulations: spread them over parallel workers
        #("foo" is the placeholder slot used by gen.run_in_parallel)
        parallel_arguments = ["foo"] + shared_arguments + [True, use_old_sims]
        for process in gen.run_in_parallel(simulations, parallel_arguments,
                                           run_ptc_simulation_instance):
            process.get()
    else:
        #a single simulation is not parallelized at this level, so that the
        #bams can be processed in parallel like for the true data
        run_ptc_simulation_instance(
            [1], *(shared_arguments + [False, use_old_sims]))
Esempio n. 2
0
def retrieve_bams(ftp_site,
                  local_directory,
                  remote_directory,
                  password_file,
                  subset=None):
    '''
    List the .bam files available at an ftp site and hand each of them to
    retrieve_bams_core (in parallel) so that it is downloaded and transferred
    to a remote server.
    ftp_site: address of the ftp site, host first, then "/"-separated path
    local_directory: local directory used to temporarily store the files
    remote_directory: path to the directory on the remote server that receives the files
    password_file: path to the file that contains the Watson password
    subset: if set, only the first this many .bam files are handled (useful for testing)
    '''
    #make sure the local staging directory is there
    gen.create_directory(local_directory)

    #everything before the first "/" is the host, the remainder is the path on it
    host, *path_parts = ftp_site.split("/")
    ftp_directory = "/".join(path_parts)
    user = "******"
    password = "******"

    #list the remote directory and keep only the .bam files
    ftp = gen.ftp_connect(host, user, password, directory=ftp_directory)
    bam_names = [name for name in ftp.nlst() if name.endswith(".bam")]
    print(len(bam_names))
    ftp = gen.ftp_check(ftp, host, user, password, ftp_directory)
    ftp.quit()

    #read the Watson password, dropping trailing newlines
    with open(password_file) as password_handle:
        expect_password = password_handle.read().rstrip("\n")

    #the transfer is done by an expect script that drives rsync;
    #this is the text of that script, where "foo" stands for the name of
    #the file being transferred and is replaced for each file
    expect_template = (
        "#!/usr/bin/expect\n"
        "set timeout -1\n"
        "spawn rsync {0}/foo {1}\n"
        "expect \"rs949@bssv-watson's password:\"\n"
        "send \"{2}\\n\";\n"
        "expect eof\n"
        "exit"
    )
    expect_string = expect_template.format(local_directory, remote_directory,
                                           expect_password)

    if subset:
        bam_names = bam_names[:subset]

    #fetch and forward the .bams with six parallel workers
    core_arguments = [
        "foo", local_directory, host, user, password, ftp_directory,
        expect_string
    ]
    for process in gen.run_in_parallel(bam_names,
                                       core_arguments,
                                       retrieve_bams_core,
                                       workers=6):
        process.get()
Esempio n. 3
0
def run_ptc_simulation_instance(simulations,
                                out_prefix,
                                simulation_output_folder,
                                simulation_bam_analysis_output_folder,
                                ptc_file,
                                nonsynonymous_snps_file,
                                exon_junctions_file,
                                bam_files,
                                exon_junctions_bam_output_folder,
                                parallel=False,
                                use_old_sims=False):
    '''
    Run the ptc simulations for the required number.

    For every simulation number this creates a folder
    "<simulation_output_folder>/ptc_simulation_run_<n>", generates the pseudo
    PTC SNPs (plus the file of remaining SNPs) in it and filters the exon
    junctions file against those pseudo PTCs.

    simulations: iterable of simulation numbers to run
    simulation_output_folder: parent folder of the per-simulation folders
    ptc_file: file with the true PTC SNPs, passed to so.generate_pseudo_ptc_snps
    nonsynonymous_snps_file: pool of SNPs the pseudo PTCs are drawn from
    exon_junctions_file: exon junctions to filter for each simulation
    use_old_sims: if True, existing folders and files are reused and a step
    is only rerun when its output is missing or its input was regenerated
    out_prefix, simulation_bam_analysis_output_folder, bam_files,
    exon_junctions_bam_output_folder, parallel: accepted for the caller's
    positional argument list; not used in the part of the function shown here
    '''

    #iterate over simulations
    counter = 0
    for simulation_number in simulations:

        counter = gen.update_counter(counter, 10, "SIMULATION ")

        #setup a folder to contain the individual simulation inside the simulations output
        simulation_instance_folder = "{0}/ptc_simulation_run_{1}".format(
            simulation_output_folder, simulation_number)
        if not use_old_sims:
            gen.create_strict_directory(simulation_instance_folder)
        else:
            gen.create_directory(simulation_instance_folder)

        #generate pseudo ptc snps
        #also need to remove these snps from the file they started in so create a new remaining snps file
        #we can tweak these if we start running out of snps
        pseudo_ptc_file = "{0}/pseudo_ptc_file_{1}.txt".format(
            simulation_instance_folder, simulation_number)
        remaining_snps_file = "{0}/remaining_snps_file_{1}.txt".format(
            simulation_instance_folder, simulation_number)
        #remember whether the SNPs are (re)generated: everything derived from
        #them has to be rebuilt as well
        regenerate_snps = (not use_old_sims) or (
            not os.path.isfile(pseudo_ptc_file))
        if regenerate_snps:
            so.generate_pseudo_ptc_snps(ptc_file,
                                        nonsynonymous_snps_file,
                                        pseudo_ptc_file,
                                        remaining_snps_file,
                                        group_by_gene=False,
                                        without_replacement=True,
                                        match_allele_frequency=True,
                                        match_allele_frequency_window=0.05)

        #filter the exon junctions file to only leave those junctions that flank exons retained in the previous step when generating pseudo ptcs
        pseudo_ptc_exon_junctions_file = "{0}/filtered_exon_junctions_{1}.bed".format(
            simulation_instance_folder, simulation_number)
        #the check has to look at the junctions file itself: testing the
        #pseudo ptc file here (as before) is always false once the step above
        #has run, so with use_old_sims the junctions were never (re)created
        if regenerate_snps or not os.path.isfile(
                pseudo_ptc_exon_junctions_file):
            bo.filter_exon_junctions(exon_junctions_file, pseudo_ptc_file,
                                     pseudo_ptc_exon_junctions_file)
Esempio n. 4
0
def bam_quality_filter(input_bam,
                       output_bam,
                       quality_greater_than_equal_to=None,
                       quality_less_than_equal_to=None):
    '''
    Filters bam reads by quality using "samtools view".
    input_bam: path to the bam file to filter
    output_bam: path the retained reads are written to
    quality_greater_than_equal_to: the lower threshold, reads with a quality
    of at least this value are kept
    quality_less_than_equal_to: the upper threshold, reads with a quality
    of at most this value are kept
    At least one threshold must be given (None means "not specified", so a
    threshold of 0 is valid); a ValueError is raised otherwise.
    '''

    #compare against None rather than relying on truthiness so that a
    #threshold of 0 is not mistaken for a missing threshold
    has_lower = quality_greater_than_equal_to is not None
    has_upper = quality_less_than_equal_to is not None

    #if neither thresholds are specified
    if not has_lower and not has_upper:
        message = "You must specify one threshold to filter reads by."
        print(message)
        raise ValueError(message)

    samtools_args = ["samtools", "view", "-h"]
    #if both thresholds are specified
    if has_lower and has_upper:
        #create temp file
        gen.create_directory("temp_data/")
        temp_file = "temp_data/{0}.{1}.bam".format(
            os.path.split(output_bam)[1][:-4], random.random())
        #first get everything below the upper threshold
        #need to account for the fact samtools removes everything below threshold
        #so when inversing need to add 1 to total
        #(-U sends the reads that fail the -q filter to the temp file)
        args = samtools_args.copy()
        upper_limit = quality_less_than_equal_to + 1
        args.extend(["-q", upper_limit, input_bam, "-U", temp_file])
        gen.run_process(args)
        #second get everything above the lower threshold
        args = samtools_args.copy()
        args.extend(["-bq", quality_greater_than_equal_to, temp_file])
        gen.run_process(args, file_for_output=output_bam)
        #cleanup files
        gen.remove_file(temp_file)
    #if only the lower threshold is specified
    elif has_lower:
        samtools_args.extend(["-bq", quality_greater_than_equal_to, input_bam])
        gen.run_process(samtools_args, file_for_output=output_bam)
    #if only the upper threshold is specified
    else:
        #need to account for the fact samtools removes everything below threshold
        #so when inversing need to add 1 to total
        upper_limit = quality_less_than_equal_to + 1
        samtools_args.extend(["-q", upper_limit, input_bam, "-U", output_bam])
        gen.run_process(samtools_args)
Esempio n. 5
0
def fasta_from_intervals_temp_file(bed_file, output_fasta, genome_fasta, random_directory=None):
    '''
    Set up a (freshly created) temporary fasta directory and extract the
    sequences for the intervals in bed_file into output_fasta.

    bed_file: bed file with the intervals to extract
    output_fasta: path the extracted sequences are written to
    genome_fasta: genome fasta the sequences are extracted from
    random_directory: if truthy, a random number is appended to the temporary
    directory name

    Returns a tuple (path of the written fasta, path of the temporary directory).
    '''
    #two numbers are still drawn so the random stream is consumed exactly as before;
    #only the first one is used (for the directory name)
    random_int = np.random.randint(9999999, size=2)
    if random_directory:
        temp_directory_path = './temp_files/temp_fasta_files_{0}'.format(random_int[0])
    else:
        temp_directory_path = './temp_files/temp_fasta_files'
    #create temp directory if doesnt already exist
    gen.create_directory('./temp_files/')
    #delete temp fasta directory and create new
    gen.create_strict_directory(temp_directory_path)
    #NOTE(review): a fasta path inside the temporary directory used to be built
    #here and was then immediately overwritten with output_fasta, so the
    #sequences have always been written straight to output_fasta. That dead
    #computation has been removed; confirm whether writing into the temporary
    #directory was the actual intention before changing the behaviour.
    fasta_from_intervals(bed_file, output_fasta, genome_fasta, force_strand=True, names=True)
    return (output_fasta, temp_directory_path)
Esempio n. 6
0
def main():
    '''
    Parse the command line, work out which motif sets were requested,
    create the output folders and file names for each of them and pass
    the resulting list to run_simulations.
    '''

    description = "Check whether stop codons are depleted in motif sets by simulating the motif set."
    argument_names = [
        "required_simulations", "all_sets", "ESR", "Ke", "PESE", "RESCUE",
        "INT3", "RBP_motifs", "filter_RBPs", "split_RBPs"
    ]
    args = gen.parse_arguments(description,
                               argument_names,
                               flags=[1, 2, 3, 4, 5, 6, 7, 8, 9])
    required_simulations = args.required_simulations
    all_sets = args.all_sets
    ESR = args.ESR
    Ke = args.Ke
    PESE = args.PESE
    RESCUE = args.RESCUE
    INT3 = args.INT3
    RBP_motifs = args.RBP_motifs
    filter_RBPs = args.filter_RBPs
    split_rbps = args.split_RBPs

    #splitting by ND only makes sense for the filtered RBPs
    if split_rbps and not filter_RBPs:
        print('You must specify the filtered RBPs if you want to split by ND.')
        raise Exception

    if not required_simulations:
        print('You must specify the number of simulations you require.')
        raise Exception

    #top level output folder
    output_directory = "output_data"
    gen.create_directory(output_directory)

    #work out which motif sets to simulate
    if all_sets:
        required_sets = list(ese_sets)
    else:
        #the RBP flag maps to a different set name when the filter is on
        rbp_set_name = "RBP_motifs_filtered" if filter_RBPs else "RBP_motifs"
        flagged_sets = [
            (ESR, "ESR"),
            (Ke, "Ke400_ESEs"),
            (PESE, "PESE"),
            (RESCUE, "RESCUE"),
            (INT3, "INT3"),
            (RBP_motifs, rbp_set_name),
        ]
        required_sets = [name for chosen, name in flagged_sets if chosen]

    #nothing selected: list the available sets and stop
    if not required_sets:
        print("\nPlease choose a motif set to analyse:\n")
        for set_name in sorted(ese_sets):
            print("--{0}".format(set_name))
        print("\n")
        raise Exception

    #file name templates, ND sign and label for the two halves of a split run
    nd_variants = (
        ("{0}/{1}_simulants_pos_nd_{2}.txt",
         "{0}/{1}_stop_counts_pos_nd_{2}.csv", 1, "Positive ND"),
        ("{0}/{1}_simulants_neg_nd_{2}.txt",
         "{0}/{1}_stop_counts_neg_nd_{2}.csv", -1, "Negative ND"),
    )

    #build one entry (or two, when splitting by ND) per motif set
    simulation_sets = []
    for ese_set in required_sets:
        #the filtered RBP motifs share the folder of the unfiltered ones
        dir_name = "RBP_motifs" if ese_set == "RBP_motifs_filtered" else ese_set
        motif_output_directory = "{0}/{1}".format(output_directory, dir_name)
        gen.create_directory(motif_output_directory)

        if split_rbps:
            for simulants_template, counts_template, nd_sign, nd_label in nd_variants:
                simulants_path = simulants_template.format(
                    motif_output_directory, dir_name, required_simulations)
                counts_path = counts_template.format(
                    motif_output_directory, dir_name, required_simulations)
                simulation_sets.append(
                    [ese_set, simulants_path, counts_path, nd_sign, nd_label])
        else:
            #simulated set output and analysis output file
            simulants_path = "{0}/{1}_simulants_{2}.txt".format(
                motif_output_directory, dir_name, required_simulations)
            counts_path = "{0}/{1}_stop_counts_{2}.csv".format(
                motif_output_directory, dir_name, required_simulations)
            simulation_sets.append([ese_set, simulants_path, counts_path])

    run_simulations(simulation_sets, int(required_simulations))
Esempio n. 7
0
def intersect_bed(bed_file1,
                  bed_file2,
                  overlap=False,
                  overlap_rec=False,
                  write_both=False,
                  sort=False,
                  output_file=None,
                  force_strand=False,
                  no_name_check=False,
                  no_dups=True,
                  intersect=False,
                  hit_count=False,
                  bed_path=None,
                  intersect_bam=None,
                  write_zero=False,
                  write_bed=False,
                  subtract=None,
                  return_non_overlaps=None,
                  write_none=False):
    """
    Intersect the coordinates of two bed files with bedtools and give back
    the lines of bed file 1 that overlap intervals in bed file 2.
    Adapted from RS.

    The result is first written to a uniquely named scratch file in
    temp_data/. It is moved to output_file when one is given, otherwise it
    is read back and returned.

    Args:
        bed_file1 (str): path to the first bed file (a bam file when intersect_bam is set)
        bed_file2 (str): path to the second bed file
        overlap (float): minimum overlap, as a fraction of the intervals in bed file 1 (0.8 requires the overlap to cover at least 80% of the interval in bed file 1)
        overlap_rec (bool): also require the overlap, as a fraction of the intervals in bed file 2, to reach the threshold given for -f
        write_both (bool): append the overlapping interval from bed file 2 to each returned interval from bed file 1
        sort (bool): sort the bed files before intersecting
        output_file (str): if given, path the output is moved to
        force_strand (bool): require the feature and the bed interval to be on the same strand
        no_name_check (bool): if false, check that the chromosome names agree between the two bed files
        no_dups (bool): if true, report each interval once; if false, an interval in bed file 1 is reported once per overlapping element in bed file 2
        intersect (bool): return only the overlapping part of each interval instead of the whole interval
        hit_count (bool): for each element in bed file 1, return how many elements of bed file 2 it overlaps
        bed_path: passed through to run_bedtools unchanged
        intersect_bam (bool): intersect a bam file with a bed file; the bam file has to be given first
        write_zero (bool): like write_both, but intervals from bed file 1 without any overlap are written too
        write_bed (bool): when intersecting a bam file, write the output as bed
        subtract (bool): if true, use subtractBed
        return_non_overlaps (bool): if true, return only the entries of bed file 1 that do not overlap bed file 2
        write_none: passed through to run_bedtools unchanged

    Returns:
        The value returned by run_bedtools when output_file is given,
        otherwise the scratch file as read by gen.read_many_fields with
        tab as the delimiter.
    """

    gen.create_directory("temp_data/")
    scratch_path = "temp_data/temp_bed_file{0}.bed".format(random.random())

    # keyword options forwarded to run_bedtools; the output always goes to
    # the scratch file first
    bedtools_options = {
        "output_file": scratch_path,
        "intersect": intersect,
        "hit_number": hit_count,
        "bed_path": bed_path,
        "intersect_bam": intersect_bam,
        "write_zero": write_zero,
        "overlap_rec": overlap_rec,
        "write_bed": write_bed,
        "subtract": subtract,
        "return_non_overlaps": return_non_overlaps,
        "write_none": write_none,
    }
    result = run_bedtools(bed_file1, bed_file2, force_strand, write_both,
                          overlap, sort, no_name_check, no_dups,
                          **bedtools_options)

    if output_file:
        # keep the result: move the scratch file to its permanent location
        gen.run_process(["mv", scratch_path, output_file])
    else:
        # no permanent copy wanted: read the result back instead
        result = gen.read_many_fields(scratch_path, "\t")

    gen.remove_file(scratch_path)
    return result
Esempio n. 8
0
def intersect_bed(bed_file1,
                  bed_file2,
                  use_bedops=False,
                  overlap=False,
                  overlap_rec=False,
                  write_both=False,
                  sort=False,
                  output_file=None,
                  force_strand=False,
                  no_name_check=False,
                  no_dups=True,
                  chrom=None,
                  intersect=False,
                  hit_count=False,
                  bed_path=None,
                  intersect_bam=None,
                  write_zero=False,
                  write_bed=False,
                  subtract=None):
    '''Intersect the coordinates of two bed files with bedtools or bedops.
    Gives back the lines of bed file 1 that overlap intervals in bed file 2.
    The result is first written to a uniquely named scratch file in temp_data/;
    it is moved to output_file when one is given, otherwise it is read back and returned.
    OPTIONS
    output_file: move the output to this file
    use_bedops: run bedops instead of bedtools. Some options only apply to one of the two, see below.
    overlap: minimum overlap, as a fraction of the intervals in bed file 1 (0.8 requires the
    overlap to cover at least 80% of the interval in bed file 1).
    overlap_rec: also require the overlap, as a fraction of the intervals in bed file 2, to reach
    the threshold given for -f.
    write_both: if True, append the overlapping interval from bed file 2 to each returned interval
    from bed file 1 (bedtools only).
    sort: sort the bed files before intersecting
    force_strand: require the feature and the bed interval to be on the same strand (bedtools only)
    no_name_check: if False, check that the chromosome names agree between the two bed files (bedtools only)
    no_dups: if True, report each interval once. If False, an interval in bed file 1 is reported once
    per overlapping element in bed file 2.
    chrom: restrict the search to one chromosome (bedops only, can help efficiency)
    intersect: return only the overlapping part of each interval instead of the whole interval.
    hit_count: for each element in bed file 1, return how many elements of bed file 2 it overlaps (bedtools only)
    bed_path: passed through to run_bedtools unchanged
    intersect_bam: intersect a bam file with a bed file; the bam file has to be given first
    write_zero: like write_both, but intervals from bed file 1 without any overlap are written too
    write_bed: when intersecting a bam file, write the output as bed.
    subtract: passed through to run_bedtools unchanged'''
    gen.create_directory("temp_data/")
    scratch_path = "temp_data/temp_bed_file{0}.bed".format(random.random())

    #keyword options both back ends accept; the output always goes to the scratch file first
    shared_options = {
        "output_file": scratch_path,
        "intersect": intersect,
        "hit_number": hit_count,
        "intersect_bam": intersect_bam,
        "overlap_rec": overlap_rec,
    }

    if use_bedops:
        result = run_bedops(bed_file1, bed_file2, force_strand, write_both,
                            chrom, overlap, sort,
                            no_dups=no_dups,
                            **shared_options)
    else:
        result = run_bedtools(bed_file1, bed_file2, force_strand, write_both,
                              chrom, overlap, sort, no_name_check, no_dups,
                              bed_path=bed_path,
                              write_zero=write_zero,
                              write_bed=write_bed,
                              subtract=subtract,
                              **shared_options)

    if output_file:
        #keep the result: move the scratch file to its permanent location
        gen.run_process(["mv", scratch_path, output_file])
    else:
        #no permanent copy wanted: read the result back instead
        result = gen.read_many_fields(scratch_path, "\t")

    gen.remove_file(scratch_path)
    return result
Esempio n. 9
0
def main():

    description = "Check whether PTCs are associated with greater rates of exon skipping."
    args = gen.parse_arguments(
        description, [
            "gtf", "genome_fasta", "bams_folder", "vcf_folder", "panel_file",
            "out_prefix", "bam_analysis_folder", "number_of_simulations",
            "simulation_output_folder", "motif_file", "filter_genome_data",
            "get_SNPs", "process_bams", "simulate_ptc_snps",
            "motif_complement", "overwrite_intersect", "use_old_sims",
            "out_of_frame", "simulate_ptcs_with_monomorphic",
            "generate_monomorphic_indices", "ignore_determine_snp_type",
            "ignore_psi_calculation", "ptc_location_analysis"
        ],
        flags=[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22],
        ints=[7])
    gtf, genome_fasta, bams_folder, vcf_folder, panel_file, out_prefix, bam_analysis_folder, number_of_simulations, simulation_output_folder, motif_file, filter_genome_data, get_SNPs, process_bams, simulate_ptc_snps, motif_complement, overwrite_intersect, use_old_sims, out_of_frame, simulate_ptcs_with_monomorphic, generate_monomorphic_indices, ignore_determine_snp_type, ignore_psi_calculation, ptc_location_analysis = args.gtf, args.genome_fasta, args.bams_folder, args.vcf_folder, args.panel_file, args.out_prefix, args.bam_analysis_folder, args.number_of_simulations, args.simulation_output_folder, args.motif_file, args.filter_genome_data, args.get_SNPs, args.process_bams, args.simulate_ptc_snps, args.motif_complement, args.overwrite_intersect, args.use_old_sims, args.out_of_frame, args.simulate_ptcs_with_monomorphic, args.generate_monomorphic_indices, args.ignore_determine_snp_type, args.ignore_psi_calculation, args.ptc_location_analysis

    start = time.time()

    # create any necessary output diretories
    directory_splits = out_prefix.split('/')
    directory_paths = "/".join(directory_splits[:-1])
    gen.create_output_directories(directory_paths)
    gen.create_directory('temp_data/')

    CDS_fasta = "{0}_CDS.fasta".format(out_prefix)
    CDS_bed = "{0}_CDS.bed".format(out_prefix)
    exon_bed = "{0}_exons.bed".format(out_prefix)
    filtered_exon_bed = "{0}_filtered_exons.bed".format(out_prefix)
    exon_junctions_file = "{0}_exon_junctions.bed".format(out_prefix)
    coding_exon_bed = "{0}_coding_exons.bed".format(out_prefix)

    if filter_genome_data:
        #extract and filter CDS coordinates and sequences
        print("Extracting and filtering CDSs...")
        bo.extract_cds(gtf,
                       CDS_bed,
                       CDS_fasta,
                       genome_fasta,
                       all_checks=True,
                       uniquify=True,
                       clean_chrom_only=True,
                       full_chr_name=True)
        gen.get_time(start)

        #group the CDS sequences into families based on sequence similarity
        print("Grouping sequences into families...")
        names = gen.read_fasta(CDS_fasta)[0]
        gen.find_families_ensembl(
            "../source_data/GRCh37_ensembl_protein_families.txt", names,
            "{0}_families.txt".format(out_prefix))
        gen.get_time(start)

        print("Extracting and filtering exons...")
        #extract exon coordinates
        bo.extract_exons(gtf, exon_bed)
        #only leave exons from transcripts that passed quality control in the extract_cds step above.
        #also only leave a single gene per family
        bo.filter_bed_from_fasta(
            exon_bed,
            CDS_fasta,
            filtered_exon_bed,
            families_file="{0}_families.txt".format(out_prefix))
        gen.get_time(start)

        #extract exon-exon junction coordinates
        print("Extracting exon-exon junctions...")
        bo.extract_exon_junctions(exon_bed,
                                  exon_junctions_file,
                                  window_of_interest=2)
        gen.get_time(start)

        #make another exons bed that only contains fully coding exons.
        #This is because in the final analysis, we should only consider fully protein-coding exons.
        #However, for getting the exon junctions we need the full exons file because fully protein-coding exons might
        #be flanked by exons that are not. This is why we couldn't do this filtering step earlier.
        print(
            "Filtering out overlapping, non-coding and partially coding, as well as terminal exons..."
        )
        bo.check_coding(filtered_exon_bed,
                        CDS_bed,
                        coding_exon_bed,
                        remove_overlapping=True)
        gen.get_time(start)

    SNP_file = "{0}_SNP_file.txt".format(out_prefix)
    if out_of_frame:
        out_prefix = out_prefix + "_out_of_frame"
    PTC_file = "{0}_ptc_file.txt".format(out_prefix)
    syn_nonsyn_file = "{0}_syn_nonsyn_file.txt".format(out_prefix)
    CDS_interval_file = "{0}_intervals{1}".format(
        os.path.splitext(CDS_fasta)[0],
        os.path.splitext(CDS_fasta)[1])
    #check which individuals were included in Geuvadis
    full_sample_names = os.listdir(bams_folder)
    full_sample_names = [
        i for i in full_sample_names if i[-4:] == ".bam" and "proc" not in i
    ]
    sample_names = [(i.split("."))[0] for i in full_sample_names]
    sample_names = [i for i in sample_names if len(i) > 0]
    print('{0} samples included in Geuvadis...'.format(len(sample_names)))
    #for some reason, 17 of the samples from Geuvadis don't appear in the 1000genomes vcf
    #I'm gonna have to get to the bottom of this at some point
    #but at the moment I'm just gonna filter them out

    with open("../source_data/samples_in_vcf.txt") as file:
        samples_in_vcf = file.readlines()
    samples_in_vcf = [i.rstrip("\n") for i in samples_in_vcf]
    sample_names = [i for i in sample_names if i in samples_in_vcf]
    print('{0} samples also in vcf...'.format(len(sample_names)))
    sample_file = "{0}_sample_file.txt".format(out_prefix)

    # create a fasta containing all sequences for exons with snp
    coding_exons_fasta = "{0}_coding_exons.fasta".format(out_prefix)
    bo.fasta_from_intervals(coding_exon_bed,
                            coding_exons_fasta,
                            genome_fasta,
                            names=True)

    if get_SNPs:
        #get SNPs for the sample
        intersect_file = "{0}_SNP_CDS_intersect.bed".format(out_prefix)
        print("Getting SNP data...")
        so.get_snps_in_cds(coding_exon_bed, CDS_bed, vcf_folder, panel_file,
                           sample_names, sample_file, intersect_file,
                           out_prefix)
        print("Calculating SNP positions...")
        so.get_snp_positions(sample_file, SNP_file, CDS_bed, intersect_file,
                             out_prefix)
        gen.get_time(start)

    if ignore_determine_snp_type:
        pass
    else:
        print("Determining SNP type...")
        so.get_snp_change_status(SNP_file,
                                 CDS_fasta,
                                 PTC_file,
                                 syn_nonsyn_file,
                                 out_of_frame=out_of_frame,
                                 ref_check=True,
                                 headers=True)
        gen.get_time(start)

    #filter the exon junctions file to only leave those junctions that flank exons retained in the previous step.
    print(
        "Filtering exon-exon junctions to only leave those that flank exons with a PTC variant..."
    )
    PTC_exon_junctions_file = "{0}_filtered_exon_junctions.bed".format(
        out_prefix)
    bo.filter_exon_junctions(exon_junctions_file, PTC_file,
                             PTC_exon_junctions_file)

    #make a list of all the .bam files and modify them to have the full path rather than just the file name
    bam_files = [
        "{0}/{1}".format(bams_folder, i) for i in full_sample_names
        if (i.split("."))[0] in sample_names
    ]

    #in parallel, do the processing on individual .bam files
    exon_junctions_bam_output_folder = "{0}__analysis_exon_junction_bams".format(
        out_prefix)
    if bam_analysis_folder == "None":
        bam_analysis_folder = "{0}__analysis_bam_analysis".format(out_prefix)
    gen.create_directory(bam_analysis_folder)
    if process_bams:
        print("Processing RNA-seq data...")
        if out_of_frame:
            splits = exon_junctions_bam_output_folder.split('/')
            splits[-1] = splits[-1].replace('_out_of_frame', '')
            exon_junctions_bam_output_folder = "/".join(splits)
        gen.create_directory(exon_junctions_bam_output_folder)
        #we have to do it like this because you can't pass flags into run_in_parallel
        keyword_dict = {"overwrite_intersect": overwrite_intersect}
        processes = gen.run_in_parallel(bam_files, [
            "foo", exon_junctions_file, PTC_exon_junctions_file,
            bam_analysis_folder, PTC_file, syn_nonsyn_file, out_prefix,
            exon_junctions_bam_output_folder, keyword_dict
        ],
                                        nao.process_bam_per_individual,
                                        workers=36)
        for process in processes:
            process.get()
        gen.get_time(start)

    #if required, filter PTCs to only leave ones that overlap motifs from a specified set
    motif_filtering = False
    if motif_file != "None":
        print(
            "Filtering SNPs based on whether or not they overlap a motif from the specified set..."
        )
        motif_suffix = ((motif_file.split("/"))[-1]).split(".")[0]
        if motif_complement:
            out_prefix = "{0}_{1}_complement".format(out_prefix, motif_suffix)
        else:
            out_prefix = "{0}_{1}".format(out_prefix, motif_suffix)
        filtered_ptc = "{0}_ptc_file.txt".format(out_prefix)
        so.filter_motif_SNPs(CDS_fasta,
                             PTC_file,
                             motif_file,
                             filtered_ptc,
                             complement=motif_complement)
        PTC_file = filtered_ptc

    final_file = "{0}__analysis_final_output.txt".format(out_prefix)
    if ignore_psi_calculation:
        pass
    else:
        print("Calculating PSI...")
        bmo.compare_PSI(PTC_file, bam_analysis_folder, final_file)

    #run the simulation that swaps ptcs for nonsynonymous snps
    if simulate_ptc_snps:
        if simulate_ptc_snps and not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception
        nao.ptc_snp_simulation(out_prefix,
                               simulation_output_folder,
                               PTC_file,
                               syn_nonsyn_file,
                               exon_junctions_file,
                               bam_files,
                               number_of_simulations,
                               exon_junctions_bam_output_folder,
                               use_old_sims=use_old_sims)

    # run the simulation that picks monomorphic sites
    if simulate_ptcs_with_monomorphic:
        if simulate_ptcs_with_monomorphic and not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception

        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        if not os.path.exists(coding_exon_fasta):
            print('Coding exon fasta is required...')
            raise Exception
        nao.ptc_monomorphic_simulation(
            out_prefix,
            simulation_output_folder,
            sample_file,
            genome_fasta,
            PTC_file,
            syn_nonsyn_file,
            coding_exon_bed,
            coding_exon_fasta,
            exon_junctions_file,
            bam_files,
            number_of_simulations,
            generate_indices=generate_monomorphic_indices,
            use_old_sims=use_old_sims)

    # get the locations of the ptcs
    if ptc_location_analysis:
        print("PTC locations analysis...")
        snp_relative_exon_position_file = "{0}_SNP_relative_exon_position.bed".format(
            out_prefix)
        ptc_location_analysis_output_file = "{0}_ptc_location_analysis.csv".format(
            out_prefix)
        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        if not os.path.exists(coding_exon_fasta) or not os.path.exists(
                snp_relative_exon_position_file) or not os.path.exists(
                    PTC_file):
            print("Please run --filter_genome_data and --get_SNPs first...")
            raise Exception
        # need to work out where and what the analysis outputs need to do
        so.ptc_locations(PTC_file, snp_relative_exon_position_file,
                         ptc_location_analysis_output_file)
Esempio n. 10
0
def run_ptc_monomorphic_simulation_instance(
        simulations,
        out_prefix,
        simulation_output_folder,
        simulation_bam_analysis_output_folder,
        ptc_file,
        syn_nonsyn_file,
        exon_junctions_file,
        bam_files,
        nt_indices_files,
        coding_exon_fasta,
        parallel=False,
        use_old_sims=False):
    '''
    Run the monomorphic-site PTC simulation once per entry in simulations.

    For every simulation number this creates a per-simulation folder inside
    simulation_output_folder, generates a pseudo PTC file and the matching
    filtered exon junctions file in it, processes the bam files against
    them and finally writes the PSI comparison for that simulation to
    "<simulation_bam_analysis_output_folder>/final_output_simulation_<n>.txt".

    simulations: iterable of simulation numbers (used in folder/file names).
    out_prefix: prefix used to name the exon junction bams folder; also
        forwarded to process_bam_per_individual.
    simulation_output_folder: folder holding the per-simulation folders and
        the copy of the real PTC file.
    simulation_bam_analysis_output_folder: folder forwarded to the bam
        processing and to bmo.compare_PSI.
    ptc_file: path to the real PTC file; a copy placed in
        simulation_output_folder is what gets passed on.
    syn_nonsyn_file, exon_junctions_file: forwarded to the helpers unchanged.
    bam_files: the bam files to process.
    nt_indices_files: forwarded to so.generate_pseudo_monomorphic_ptcs.
    coding_exon_fasta: fasta handed to bo.get_fasta_exon_intervals.
    parallel: True if the simulations themselves are already being run in
        parallel; the bams are then processed in this process instead of
        being handed to gen.run_in_parallel.
    use_old_sims: if True, keep existing folders and reuse the pseudo PTC
        and filtered junction files when they already exist.

    Returns None. Nothing is touched if simulations is empty.
    '''

    #materialise the simulation numbers so an empty chunk (or an exhausted
    #iterator) can be detected before doing any of the shared setup
    simulations = list(simulations)
    if not simulations:
        return

    #copy the real ptc file into the simulation output folder
    #this is done once up front: inside the loop every iteration after the
    #first would be copying the file onto itself
    real_ptcs_for_sim_file = "{0}/{1}".format(simulation_output_folder,
                                              ptc_file.split('/')[-1])
    if ptc_file != real_ptcs_for_sim_file:
        gen.copy_file(ptc_file, real_ptcs_for_sim_file)
    ptc_file = real_ptcs_for_sim_file

    #the exon junction bams folder does not depend on the simulation number
    exon_junctions_bam_output_folder = "{0}__analysis_exon_junction_bams".format(
        out_prefix)
    gen.create_directory(exon_junctions_bam_output_folder)

    #iterate over simulations
    counter = 0
    for simulation_number in simulations:

        counter = gen.update_counter(counter, 10, "SIMULATION ")

        #setup a folder to contain the individual simulation inside the simulations output
        simulation_instance_folder = "{0}/ptc_monomorphic_simulation_run_{1}".format(
            simulation_output_folder, simulation_number)
        if not use_old_sims:
            gen.create_strict_directory(simulation_instance_folder)
        else:
            gen.create_directory(simulation_instance_folder)

        #generate pseudo ptc snps (unless an old file is being reused)
        pseudo_monomorphic_ptc_file = "{0}/pseudo_monomorphic_ptc_file_{1}.txt".format(
            simulation_instance_folder, simulation_number)
        if (not use_old_sims) or (
                not (os.path.isfile(pseudo_monomorphic_ptc_file))):
            #get list of exons; only needed when new pseudo ptcs are generated
            #NOTE(review): read per simulation rather than once because it is
            #not visible here whether the generator modifies the list - confirm
            exon_list = bo.get_fasta_exon_intervals(coding_exon_fasta)
            so.generate_pseudo_monomorphic_ptcs(ptc_file, nt_indices_files,
                                                exon_list,
                                                pseudo_monomorphic_ptc_file)

        #filter the exon junctions file to only leave those junctions that flank exons retained in the previous step when generating pseudo ptcs
        pseudo_monomorphic_ptc_exon_junctions_file = "{0}/filtered_exon_junctions_{1}.bed".format(
            simulation_instance_folder, simulation_number)
        if (not use_old_sims) or (not (
                os.path.isfile(pseudo_monomorphic_ptc_exon_junctions_file))):
            bo.filter_exon_junctions(
                exon_junctions_file, pseudo_monomorphic_ptc_file,
                pseudo_monomorphic_ptc_exon_junctions_file)

        #run the bam analysis for each
        #(don't parallelize if you're doing the simulations in parallel)
        kw_dict = {
            "ptc_snp_simulation": True,
            "simulation_instance_folder": simulation_instance_folder,
            "simulation_number": simulation_number
        }
        if parallel:
            #simulations are already spread over workers: process all bams here
            process_bam_per_individual(
                bam_files, exon_junctions_file,
                pseudo_monomorphic_ptc_exon_junctions_file,
                simulation_bam_analysis_output_folder,
                pseudo_monomorphic_ptc_file, syn_nonsyn_file, out_prefix,
                exon_junctions_bam_output_folder, kw_dict)
        else:
            #"foo" marks the argument slot that run_in_parallel fills with bam files
            processes = gen.run_in_parallel(bam_files, [
                "foo", exon_junctions_file,
                pseudo_monomorphic_ptc_exon_junctions_file,
                simulation_bam_analysis_output_folder,
                pseudo_monomorphic_ptc_file, syn_nonsyn_file, out_prefix,
                exon_junctions_bam_output_folder, kw_dict
            ],
                                            process_bam_per_individual,
                                            workers=36)
            #collect every worker result before moving on to the PSI step
            for process in processes:
                process.get()

        #process final psi for simulation
        final_file = "{0}/final_output_simulation_{1}.txt".format(
            simulation_bam_analysis_output_folder, simulation_number)
        bmo.compare_PSI(pseudo_monomorphic_ptc_file,
                        simulation_bam_analysis_output_folder,
                        final_file,
                        sim_number=simulation_number)
Esempio n. 11
0
def ptc_monomorphic_simulation(out_prefix,
                               simulation_output_folder,
                               sample_file,
                               genome_fasta,
                               ptc_file,
                               syn_nonsyn_file,
                               coding_exon_bed,
                               coding_exon_fasta,
                               exon_junctions_file,
                               bam_files,
                               required_simulations,
                               generate_indices=False,
                               use_old_sims=False):
    '''
    Prepare the folders and index files for the monomorphic-site PTC
    simulations, then run the requested number of simulations.

    With more than one required simulation the simulation numbers are
    distributed through gen.run_in_parallel; otherwise a single simulation
    (number 1) is run directly in this process.
    If use_old_sims is True, existing folders are kept and no new
    non-mutated site indices are generated.
    '''

    print(
        "Running simulation picking monomorphic sites that have the same ancestral allele as a PTC snp..."
    )

    #fall back to a folder named after the prefix when none was given
    if simulation_output_folder == "None":
        simulation_output_folder = "{0}_simulate_ptc_monomorphic_sites".format(
            out_prefix)

    #the output folder is only wiped when fresh indices are about to be generated
    rebuild_indices = generate_indices and not use_old_sims
    make_output_folder = (gen.create_strict_directory
                          if rebuild_indices else gen.create_directory)
    make_output_folder(simulation_output_folder)

    #folder that receives the bam analysis of the simulations
    bam_analysis_folder = "{0}_simulate_ptc_monomorphic_sites_bam_analysis".format(
        out_prefix)

    #one file path per nucleotide for the positions of non mutated sites
    nt_indices_files = {
        nt: "{0}/nt_indices_no_mutations_{1}.fasta".format(
            simulation_output_folder, nt)
        for nt in "ACGT"
    }

    if rebuild_indices:
        get_non_mutation_indices(simulation_output_folder, sample_file,
                                 coding_exon_bed, out_prefix, genome_fasta,
                                 nt_indices_files)

    #keep the previous bam analysis only when old simulations are reused
    if use_old_sims:
        gen.create_directory(bam_analysis_folder)
    else:
        gen.create_strict_directory(bam_analysis_folder)

    #simulation numbers start at 1
    simulations = list(range(1, required_simulations + 1))

    #positional arguments common to both ways of launching the simulations
    shared_args = [
        out_prefix, simulation_output_folder, bam_analysis_folder, ptc_file,
        syn_nonsyn_file, exon_junctions_file, bam_files, nt_indices_files,
        coding_exon_fasta
    ]

    if required_simulations > 1:
        #several simulations: parallelize over the simulations and tell each
        #instance (parallel=True) not to parallelize the bam processing
        jobs = gen.run_in_parallel(
            simulations, ["foo"] + shared_args + [True, use_old_sims],
            run_ptc_monomorphic_simulation_instance,
            workers=36)
        for job in jobs:
            job.get()
    else:
        #a single simulation: run it here and let it parallelize the bams
        run_ptc_monomorphic_simulation_instance([1], *shared_args, False,
                                                use_old_sims)