Example no. 1
def main():

    description = "Check whether stop codons are depleted in motif sets by simulating the motif set."
    args = gen.parse_arguments(description, ["motif_file", "output_dir", "results_dir", "required_simulations", "motif_simulation", "exon_simulation"], flags = [4,5], ints = [3])
    motif_file, output_dir, results_dir, required_simulations, motif_simulation, exon_simulation = args.motif_file,  args.output_dir, args.results_dir, args.required_simulations, args.motif_simulation, args.exon_simulation

    if not required_simulations:
        print('You must specify the number of simulations you require.')
        raise Exception

    gen.create_output_directories(output_dir)

    if motif_simulation:
        simulation_sets = []

        #create the output directory for the particular motif set
        motif_output_dir = "{0}/{1}".format(output_dir, ".".join(motif_file.split('.')[:-1]).split('/')[-1])
        gen.create_output_directories(motif_output_dir)


        simulated_motifs_output = "{0}/simulations_{1}.txt".format(motif_output_dir, required_simulations)
        output_file = "{0}/stop_counts_{1}.txt".format(motif_output_dir, required_simulations)

        # add the files to the required list
        simulation_sets.append([motif_file, simulated_motifs_output, output_file])

        # run the simulations
        run_simulations(simulation_sets, required_simulations)

    exon_hexamer_simulation = "{0}/region_hexamer_sim.csv".format(output_dir)
    if exon_simulation:
        exon_fasta = "{0}_CDS_intervals.fasta".format(results_dir)
        run_exon_simulation(motif_file, exon_fasta, output_dir, required_simulations, exon_hexamer_simulation)
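
Note on the parsing pattern: every example here funnels its command-line handling through gen.parse_arguments(description, arguments, flags=..., ints=..., opt_flags=...), whose implementation is not shown. Purely as a hypothetical sketch of what such a wrapper could look like on top of argparse (the real gen.parse_arguments may differ), with the indices in flags/ints/opt_flags referring to positions in the argument name list:

import argparse

def parse_arguments(description, arguments, flags=None, ints=None, opt_flags=None):
    # hypothetical reimplementation for illustration only
    flags, ints, opt_flags = flags or [], ints or [], opt_flags or []
    parser = argparse.ArgumentParser(description=description)
    for pos, name in enumerate(arguments):
        if pos in flags:
            # boolean switch, e.g. --motif_simulation
            parser.add_argument("--{0}".format(name), action="store_true")
        elif pos in opt_flags:
            # optional argument that takes a value, e.g. --controls_directory DIR
            parser.add_argument("--{0}".format(name), default=None)
        else:
            # positional argument; nargs="?" mirrors the scripts' own
            # "you must specify..." checks for missing values
            parser.add_argument(name, nargs="?", default=None,
                                type=int if pos in ints else str)
    return parser.parse_args()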
Example no. 2
def main():

    arguments = ["output_directory", "motif_file", "simulations", "controls_directory", "exons_fasta", "motifs_stop_density", "motif_stop_codon_densities_sim", "motif_codon_densities", "motif_densities_exon_dinucleotides", "generate_motif_controls", "match_density", "match_subs"]

    description = ""
    args = gen.parse_arguments(description, arguments, opt_flags=[2,3,4], flags = [5,6,7,8,9,10,11])
    output_directory, motif_file, simulations, controls_directory, exons_fasta, motifs_stop_density, motif_stop_codon_densities_sim, motif_codon_densities, motif_densities_exon_dinucleotides, generate_motif_controls, match_density, match_subs = args.output_directory, args.motif_file, args.simulations, args.controls_directory, args.exons_fasta, args.motifs_stop_density, args.motif_stop_codon_densities_sim, args.motif_codon_densities, args.motif_densities_exon_dinucleotides, args.generate_motif_controls, args.match_density, args.match_subs

    # convert the number of simulations to an integer
    if simulations:
        simulations = int(simulations)

    # create the global output directory
    global_output_directory = "{0}/motif_tests".format(output_directory)
    gen.create_output_directories(global_output_directory)

    # if we want to generate the controls
    if generate_motif_controls:
        simopc.generate_motif_dinucleotide_controls(motif_file, simulations, output_directory, match_density = match_density, match_subs = match_subs)

    # get the stop density of motifs and of non-motifs of the same length
    if motifs_stop_density:
        mtop.calc_stop_densities(motif_file)

    # calculate stop codon densities in the motif sets
    if motif_stop_codon_densities_sim:
        # create a local output directory
        local_output_directory = "{0}/motif_stop_density_simulations".format(global_output_directory)
        gen.create_output_directories(local_output_directory)
        # output filepath
        output_file = "{0}/{1}_stop_codon_densities.csv".format(local_output_directory, motif_file.split("/")[-1].split(".")[0])
        # make sure enough control simulants already exist
        if simulations > len(os.listdir(controls_directory)):
            print("Please create more simulants...")
            raise Exception
        # calculate densities
        mtop.motif_stop_codon_densities(motif_file, controls_directory, simulations, output_file)

    # calculate other codon densities in motif sets
    if motif_codon_densities:
        local_output_directory = "{0}/codon_combination_densities".format(global_output_directory)
        gen.create_output_directories(local_output_directory)
        # get all the possible sets of 3 unique codon combinations
        codon_combinations_file = "{0}/codon_combinations.txt".format(local_output_directory)
        if not os.path.isfile(codon_combinations_file):
            seqo.generate_all_motif_combinations(stops, codon_combinations_file)

        output_file = "{0}/{1}_codon_combination_densities.csv".format(local_output_directory, motif_file.split("/")[-1].split(".")[0])
        if simulations > len(os.listdir(controls_directory)):
            gen.remove_directory(controls_directory)
            simopc.generate_motif_controls(motif_file, simulations, controls_directory, match_density = False)
        mtop.motif_codon_densities(motif_file, codon_combinations_file, controls_directory, simulations, output_file)
Example no. 3
def main():
    description = "Compare expression parameters for transcripts that contain true PTCs vs transcripts that contain pseudo-PTCs."
    args = gen.parse_arguments(description, ["PTCs_file", "pseudo_PTCs_directory", "expression_file"])
    PTCs_file, pseudo_PTCs_directory, expression_file = args.PTCs_file, args.pseudo_PTCs_directory, args.expression_file

    expression = gen.read_many_fields(expression_file, "\t")

    #get median expression parameters for true PTCs
    true_values = PTCs_to_expression(PTCs_file, expression)

    #do the same for each of the simulant PTC files
    sim_files = os.listdir(pseudo_PTCs_directory)
    #I'm doing the first one separately so I can easily stack the outputs
    sim_values = PTCs_to_expression("{0}/{1}".format(pseudo_PTCs_directory, sim_files[0]), expression)
    for sim_file in sim_files[1:]:
        curr_sim_values = PTCs_to_expression("{0}/{1}".format(pseudo_PTCs_directory, sim_file), expression)
        sim_values = np.vstack((sim_values, curr_sim_values))

    display_comparison("breadth", true_values, sim_values, 0)
    display_comparison("maximum TPM", true_values, sim_values, 1)
    display_comparison("median TPM", true_values, sim_values, 2)
    display_comparison("median TPM (if expressed)", true_values, sim_values, 3)
Example no. 4
def main():

    description = "Miscellaneous tests."
    args = gen.parse_arguments(description, [
        "results_prefix", "disease_output_dir", "ese_file", "get_filtered",
        "get_info", "disease_locations_chisquare",
        "large_effect_ese_hits_simulation", "large_effect_locations",
        "large_effect_lengths"
    ],
                               flags=[3, 4, 5, 6, 7, 8])
    results_prefix, disease_output_dir, ese_file, get_filtered, get_info, disease_locations_chisquare, large_effect_ese_hits_simulation, large_effect_locations, large_effect_lengths = args.results_prefix, args.disease_output_dir, args.ese_file, args.get_filtered, args.get_info, args.disease_locations_chisquare, args.large_effect_ese_hits_simulation, args.large_effect_locations, args.large_effect_lengths

    if get_filtered:
        get_filtered_exons()

    # tests on the large effect cases
    if large_effect_ese_hits_simulation:
        ese_hits_simulation(ese_file)
    if large_effect_locations:
        large_effect_locations_sim()
    if large_effect_lengths:
        large_effects_lengths_sim()
Example no. 5
def main():

    description = "Check whether stop codons are depleted in motif sets by simulating the motif set."
    args = gen.parse_arguments(description, [
        "required_simulations", "all_sets", "ESR", "Ke", "PESE", "RESCUE",
        "INT3", "RBP_motifs", "filter_RBPs", "split_RBPs"
    ],
                               flags=[1, 2, 3, 4, 5, 6, 7, 8, 9])
    required_simulations, all_sets, ESR, Ke, PESE, RESCUE, INT3, RBP_motifs, filter_RBPs, split_rbps = args.required_simulations, args.all_sets, args.ESR, args.Ke, args.PESE, args.RESCUE, args.INT3, args.RBP_motifs, args.filter_RBPs, args.split_RBPs

    if split_rbps and not filter_RBPs:
        print('You must specify the filtered RBPs if you want to split by ND.')
        raise Exception

    if not required_simulations:
        print('You must specify the number of simulations you require.')
        raise Exception

    #create the output_directory
    output_directory = "output_data"
    gen.create_directory(output_directory)

    #set up the simulations we want
    required_sets = []
    if all_sets:
        required_sets.extend([i for i in ese_sets])
    else:
        if ESR:
            required_sets.append("ESR")
        if Ke:
            required_sets.append("Ke400_ESEs")
        if PESE:
            required_sets.append("PESE")
        if RESCUE:
            required_sets.append("RESCUE")
        if INT3:
            required_sets.append("INT3")
        if RBP_motifs and not filter_RBPs:
            required_sets.append("RBP_motifs")
        if RBP_motifs and filter_RBPs:
            required_sets.append("RBP_motifs_filtered")

    #check whether any sets have been chosen
    if len(required_sets) == 0:
        print("\nPlease choose a motif set to analyse:\n")
        [print("--{0}".format(i)) for i in sorted(ese_sets)]
        print("\n")
        raise Exception

    #create the necessary files
    simulation_sets = []
    for ese_set in required_sets:
        if ese_set == "RBP_motifs_filtered":
            dir_name = "RBP_motifs"
        else:
            dir_name = ese_set
        #create the output directory for the particular motif set
        motif_output_directory = "{0}/{1}".format(output_directory, dir_name)
        gen.create_directory(motif_output_directory)
        if split_rbps:
            #if splitting the RBP motifs by ND, two sets of outputs are needed
            simulated_set_output_pos_nd = "{0}/{1}_simulants_pos_nd_{2}.txt".format(
                motif_output_directory, dir_name, required_simulations)
            output_file_pos_nd = "{0}/{1}_stop_counts_pos_nd_{2}.csv".format(
                motif_output_directory, dir_name, required_simulations)
            simulation_sets.append([
                ese_set, simulated_set_output_pos_nd, output_file_pos_nd, 1,
                "Positive ND"
            ])
            simulated_set_output_neg_nd = "{0}/{1}_simulants_neg_nd_{2}.txt".format(
                motif_output_directory, dir_name, required_simulations)
            output_file_neg_nd = "{0}/{1}_stop_counts_neg_nd_{2}.csv".format(
                motif_output_directory, dir_name, required_simulations)
            simulation_sets.append([
                ese_set, simulated_set_output_neg_nd, output_file_neg_nd, -1,
                "Negative ND"
            ])
        else:
            #create simulated set output, analysis output file
            simulated_set_output = "{0}/{1}_simulants_{2}.txt".format(
                motif_output_directory, dir_name, required_simulations)
            output_file = "{0}/{1}_stop_counts_{2}.csv".format(
                motif_output_directory, dir_name, required_simulations)
            simulation_sets.append(
                [ese_set, simulated_set_output, output_file])

    run_simulations(simulation_sets, int(required_simulations))
Example no. 6
def main():

    arguments = [
        "input_bed", "input_fasta", "output_directory", "input_fasta2",
        "input_file", "required_simulations", "motif_file", "families_file",
        "output_prefix", "controls_dir", "extract_sequences", "calc_gc",
        "density_sim", "get_exon_dint_controls", "get_intron_dint_controls",
        "exon_region_density", "compare_stop_density", "sim_orf_lengths",
        "sim_orf_lengths_masked", "sim_stop_density",
        "sim_stop_density_introns", "sim_stop_density_within_genes",
        "sim_stop_density_removed_motifs",
        "sim_stop_density_removed_motifs_sim_seqs", "sim_stop_density_diff",
        "exon_intron_density", "motif_nd", "excess_test", "single_exon",
        "motif_overlap", "motif_overlap_density", "clean_alignments",
        "seq_hits_linc", "upstream_atg", "excess_length_thresholds",
        "density_regions", "extract_second", "seq_no"
    ]
    description = "Container for analysis on lincRNAs"
    args = gen.parse_arguments(description,
                               arguments,
                               flags=[
                                   10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
                                   21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
                                   32, 33, 34, 35, 36
                               ],
                               opt_flags=[3, 4, 5, 6, 7, 8, 9, 37])

    input_bed, \
    input_fasta, \
    output_directory, \
    input_fasta2, \
    input_file, \
    required_simulations, \
    motif_file, \
    families_file, \
    output_prefix, \
    controls_dir, \
    extract_sequences, \
    calc_gc, \
    density_sim,  \
    get_exon_dint_controls, \
    get_intron_dint_controls, \
    exon_region_density, \
    compare_stop_density, \
    sim_orf_lengths, \
    sim_orf_lengths_masked, \
    sim_stop_density, \
    sim_stop_density_introns, \
    sim_stop_density_within_genes, \
    sim_stop_density_removed_motifs, \
    sim_stop_density_removed_motifs_sim_seqs, \
    sim_stop_density_diff, \
    exon_intron_density, \
    motif_nd, \
    excess_test, \
    single_exon,\
    motif_overlap, \
    motif_overlap_density, \
    clean_alignments, \
    seq_hits_linc, \
    upstream_atg, \
    excess_length_thresholds, \
    density_regions, \
    extract_second, \
    seq_no = \
    args.input_bed, \
    args.input_fasta, \
    args.output_directory, \
    args.input_fasta2, \
    args.input_file, \
    args.required_simulations, \
    args.motif_file, \
    args.families_file, \
    args.output_prefix, \
    args.controls_dir, \
    args.extract_sequences, \
    args.calc_gc, \
    args.density_sim, \
    args.get_exon_dint_controls, \
    args.get_intron_dint_controls, \
    args.exon_region_density, \
    args.compare_stop_density, \
    args.sim_orf_lengths, \
    args.sim_orf_lengths_masked, \
    args.sim_stop_density, \
    args.sim_stop_density_introns, \
    args.sim_stop_density_within_genes, \
    args.sim_stop_density_removed_motifs, \
    args.sim_stop_density_removed_motifs_sim_seqs, \
    args.sim_stop_density_diff, \
    args.exon_intron_density, \
    args.motif_nd, \
    args.excess_test, \
    args.single_exon, \
    args.motif_overlap, \
    args.motif_overlap_density, \
    args.clean_alignments, \
    args.seq_hits_linc, \
    args.upstream_atg, \
    args.excess_length_thresholds, \
    args.density_regions, \
    args.extract_second, \
    args.seq_no

    # make required simulations an int
    required_simulations = int(
        required_simulations) if required_simulations else None
    # process output prefix
    output_prefix = output_prefix + "_" if output_prefix else ""
    seq_no = int(seq_no) if seq_no else None

    # create output directories
    global_output_directory = "{0}/tests/lincrna".format(output_directory)
    gen.create_output_directories(global_output_directory)

    # set a start time
    start = time.time()


    # get the sequences
    if extract_sequences:
        lincRNA_single_exon_bed = "{0}/lincrna/lincRNA.single_exon.bed".format(
            output_directory)
        lincRNA_single_exon_fasta = "{0}/lincrna/lincRNA.single_exon.fasta".format(
            output_directory)
        lincRNA_single_exon_families = "{0}/lincrna/lincRNA.single_exon_families.bed".format(
            output_directory)
        lincRNA_multi_exon_bed = "{0}/lincrna/lincRNA.multi_exon.bed".format(
            output_directory)
        lincRNA_multi_exon_intron_bed = "{0}/lincrna/lincRNA.multi_exon.introns.bed".format(
            output_directory)
        lincRNA_multi_exon_fasta = "{0}/lincrna/lincRNA.multi_exon.fasta".format(
            output_directory)
        lincRNA_multi_exon_exons_fasta = "{0}/lincrna/lincRNA.multi_exon.exons.fasta".format(
            output_directory)
        lincRNA_multi_exon_intron_fasta = "{0}/lincrna/lincRNA.multi_exon.introns.fasta".format(
            output_directory)
        lincRNA_multi_exon_families = "{0}/lincrna/lincRNA.multi_exon_families.bed".format(
            output_directory)
        cont.extract_lincRNA_sequences(input_bed,
                                       input_fasta,
                                       lincRNA_single_exon_bed,
                                       lincRNA_multi_exon_bed,
                                       lincRNA_single_exon_fasta,
                                       lincRNA_multi_exon_fasta,
                                       lincRNA_multi_exon_intron_bed,
                                       lincRNA_multi_exon_intron_fasta,
                                       lincRNA_single_exon_families,
                                       lincRNA_multi_exon_families,
                                       clean_run=None)

    # clean the alignments to get them into a usable form
    # might need this
    if clean_alignments:
        # the original left these format placeholders unfilled; assuming the global output directory here
        output_exon_file = "{0}/clean_exon_alignments.fasta".format(global_output_directory)
        output_intron_file = "{0}/clean_intron_alignments.fasta".format(global_output_directory)
        ltests.clean_alignments(input_bed, input_fasta, output_exon_file,
                                output_intron_file)

    if calc_gc:
        output_file = "{0}/{1}_gc.csv".format(global_output_directory,
                                              output_prefix)
        ltests.calc_gc(input_fasta, output_file, families_file=families_file)

    # orf length test
    if sim_orf_lengths:
        sim_orf_length_output_file = "{0}/{1}sim_orf_lengths.csv".format(
            global_output_directory, output_prefix)
        if families_file:
            sim_orf_length_z_file = "{0}/{1}sim_orf_lengths_zs_grouped.csv".format(
                global_output_directory, output_prefix)
        else:
            sim_orf_length_z_file = "{0}/{1}sim_orf_lengths_zs.csv".format(
                global_output_directory, output_prefix)
        # run the test
        simopc.sim_orf_length(input_fasta, required_simulations,
                              sim_orf_length_output_file)
        ltests.process_length_sim(sim_orf_length_output_file,
                                  sim_orf_length_z_file,
                                  families_file=families_file)

    if sim_orf_lengths_masked:
        masked_output_file = "{0}_{1}_masked.csv".format(
            input_file.split(".")[0],
            motif_file.split("/")[-1].split(".")[0])
        # run the test
        simopc.sim_orf_length_masked(input_fasta,
                                     required_simulations,
                                     motif_file,
                                     input_file,
                                     controls_dir,
                                     masked_output_file,
                                     families_file=families_file)

    # stop density test
    if sim_stop_density:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_simulation_all_genes_grouped_families".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_simulation_all_genes_grouped_families.csv".format(
                local_output_directory, output_prefix)
            runs = 10
        else:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_simulation_all_genes".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_simulation_all_genes.csv".format(
                local_output_directory, output_prefix)
            runs = 1

        gen.create_output_directories(sim_stop_density_output_dir)

        for run in list(range(runs)):
            output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_stop_density_output_dir, run + 1)
            ltests.sim_stop_density(input_fasta,
                                    output_file,
                                    simulations=int(required_simulations),
                                    families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_stop_density_output_dir,
                                                sim_stop_density_output_file)

    # within genes
    if sim_stop_density_within_genes:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_stop_density_within_gene_output_dir = "{0}/{1}_stop_density_simulation_within_genes_grouped_families".format(
                local_output_directory, output_prefix)
            sim_stop_density_within_gene_output_file = "{0}/{1}_stop_density_simulation_within_genes_grouped_families.csv".format(
                local_output_directory, output_prefix)
            runs = 10
        else:
            sim_stop_density_within_gene_output_dir = "{0}/{1}_stop_density_simulation_within_genes".format(
                local_output_directory, output_prefix)
            sim_stop_density_within_gene_output_file = "{0}/{1}_stop_density_simulation_within_genes.csv".format(
                local_output_directory, output_prefix)
            runs = 1
        gen.create_output_directories(sim_stop_density_within_gene_output_dir)
        for run in list(range(runs)):
            output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_stop_density_within_gene_output_dir, run + 1)
            ltests.sim_stop_density_within_genes(
                input_fasta,
                output_file,
                simulations=int(required_simulations),
                families_file=families_file)

        # process the outputs
        ltests.process_sim_stop_density_within_gene_outputs(
            sim_stop_density_within_gene_output_dir,
            sim_stop_density_within_gene_output_file)

    # stop density test in the introns
    if sim_stop_density_introns:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_introns_simulation_all_genes_grouped_families".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_introns_simulation_all_genes_grouped_families.csv".format(
                local_output_directory, output_prefix)
            runs = 1
        else:
            sim_stop_density_output_dir = "{0}/{1}_stop_density_introns_simulation_all_genes".format(
                local_output_directory, output_prefix)
            sim_stop_density_output_file = "{0}/{1}_stop_density_introns_simulation_all_genes.csv".format(
                local_output_directory, output_prefix)
            runs = 1
        gen.create_output_directories(sim_stop_density_output_dir)

        for run in list(range(runs)):
            output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_stop_density_output_dir, run + 1)
            ltests.sim_stop_density(input_fasta,
                                    output_file,
                                    simulations=int(required_simulations),
                                    families_file=families_file,
                                    introns=True,
                                    input_fasta2=input_fasta2)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_stop_density_output_dir,
                                                sim_stop_density_output_file)

    # remove motifs and test
    if sim_stop_density_removed_motifs:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_all_genes_grouped_families_removed_motifs".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_all_genes_grouped_families_removed_motifs.csv".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            runs = 10
        else:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_all_genes".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_all_genes.csv".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            runs = 1
        # remove any previous runs
        gen.remove_directory(sim_output_dir)
        gen.create_output_directories(sim_output_dir)

        for run in list(range(runs)):
            run_output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_output_dir, run + 1)
            ltests.sim_stop_density_removed_motifs(
                input_fasta,
                run_output_file,
                motif_file,
                simulations=int(required_simulations),
                families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_output_dir,
                                                sim_output_file,
                                                reverse=True)

    # remove motifs and test within seqs
    if sim_stop_density_removed_motifs_sim_seqs:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_grouped_families_removed_motifs_seq_sim".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_grouped_families_removed_motifs_seq_sim.csv".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            runs = 10
        else:
            sim_output_dir = "{0}/{1}_{2}_stop_density_simulation_all_genes_seq_sim".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            sim_output_file = "{0}/{1}_{2}_stop_density_simulation_all_genes_seq_sim.csv".format(
                local_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            runs = 1
        # remove any previous runs
        gen.remove_directory(sim_output_dir)
        gen.create_output_directories(sim_output_dir)

        for run in list(range(runs)):
            run_output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_output_dir, run + 1)
            ltests.sim_stop_density_removed_motifs_seq_sim(
                input_fasta,
                run_output_file,
                motif_file,
                controls_dir,
                simulations=int(required_simulations),
                families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_outputs(sim_output_dir,
                                                sim_output_file,
                                                reverse=True)

    if sim_stop_density_diff:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        if families_file:
            sim_output_dir = "{0}/{1}_{2}_stop_density_diff_grouped_families".format(
                global_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            sim_output_file = "{0}/{1}_{2}_stop_density_stop_density_diff_grouped_families.csv".format(
                global_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            runs = 10
        else:
            sim_output_dir = "{0}/{1}_{2}_stop_density_stop_density_diff_all_genes".format(
                global_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            sim_output_file = "{0}/{1}_{2}_stop_density_stop_density_diff_all_genes.csv".format(
                global_output_directory, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            runs = 1
        # remove any previous runs
        gen.remove_directory(sim_output_dir)
        gen.create_output_directories(sim_output_dir)

        for run in list(range(runs)):
            run_output_file = "{0}/stop_density_simulation_{1}.csv".format(
                sim_output_dir, run + 1)
            ltests.sim_stop_density_diff(input_fasta,
                                         run_output_file,
                                         motif_file,
                                         controls_dir,
                                         simulations=int(required_simulations),
                                         families_file=families_file)
        # process the outputs
        ltests.process_sim_stop_density_diffs(sim_output_dir,
                                              sim_output_file,
                                              greater_than=False)

    # get density in exons and introns
    if exon_intron_density:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        output_file = "{0}/exon_intron_stop_density.csv".format(
            local_output_directory)
        ltests.exon_intron_stop_density(input_fasta,
                                        input_fasta2,
                                        output_file,
                                        families_file=families_file)

    # test whether there is an excess in flanks
    if excess_test:
        gen.check_files_exists([input_fasta, motif_file])
        # local output directory
        local_output_directory = "{0}/stop_excesses".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        # if the families file exists, group by family
        if families_file:
            excess_test_output_file = "{0}/{1}_stop_codon_excesses_grouped.csv".format(
                local_output_directory,
                motif_file.split("/")[-1].split(".")[0])
        else:
            excess_test_output_file = "{0}/{1}_stop_codon_excesses.csv".format(
                local_output_directory,
                motif_file.split("/")[-1].split(".")[0])
        # run the test
        ltests.excess_test(input_fasta,
                           motif_file,
                           excess_test_output_file,
                           simulations=required_simulations,
                           families_file=families_file)

    # upstream from the atg
    if upstream_atg:
        output_file = "{0}/stop_density/upstream_atg_stop_density.csv".format(
            global_output_directory)
        ltests.upstream_atg(input_fasta,
                            output_file,
                            simulations=int(required_simulations),
                            families_file=families_file)

    # calculate the density in the different regions
    if density_regions:
        local_output_directory = "{0}/stop_density".format(
            global_output_directory)
        gen.create_output_directories(local_output_directory)
        output_file = "{0}/stop_density_regions_chisq.csv".format(
            local_output_directory)
        output_file1 = "{0}/stop_density_regions1.csv".format(
            local_output_directory)
        output_file2 = "{0}/stop_density_regions_per_seq.csv".format(
            local_output_directory)
        ltests.density_regions(input_fasta,
                               motif_file,
                               output_file,
                               output_file1,
                               output_file2,
                               required_simulations=required_simulations,
                               families_file=families_file)

    # test hits to seqs
    if seq_hits_linc:
        local_output_dir = "{0}/ese_hits".format(global_output_directory)
        if output_prefix:
            tests_output_dir = "{0}/{1}_{2}".format(
                local_output_dir, output_prefix[:-1],
                motif_file.split("/")[-1].split(".")[0])
            final_output_file = "{0}/{1}_{2}_processed2.csv".format(
                local_output_dir, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
        else:
            tests_output_dir = "{0}/{1}_{2}".format(
                local_output_dir, output_prefix,
                motif_file.split("/")[-1].split(".")[0])
            final_output_file = "{0}/{1}_{2}_processed2.csv".format(
                local_output_dir, output_prefix[:-1],
                motif_file.split("/")[-1].split(".")[0])
        gen.create_output_directories(tests_output_dir)

        runs = 10
        for run in range(runs):
            if output_prefix:
                output_file = "{0}/{1}_{2}_hits_{3}.csv".format(
                    tests_output_dir, output_prefix,
                    motif_file.split("/")[-1].split(".")[0], run + 1)
            else:
                output_file = "{0}/{1}_hits_{2}.csv".format(
                    tests_output_dir,
                    motif_file.split("/")[-1].split(".")[0], run + 1)
            mto.calc_seq_hits_linc(input_fasta,
                                   output_file,
                                   motif_file,
                                   controls_dir,
                                   required_simulations=required_simulations,
                                   families_file=families_file)
        mto.process_seq_hits_linc(tests_output_dir, final_output_file)

    if excess_length_thresholds:
        local_output_dir = "{0}/orf_length_thresholds".format(
            global_output_directory)
        gen.create_output_directories(local_output_dir)
        ltests.orf_exceed_length_threshold(
            input_fasta,
            local_output_dir,
            required_simulations=required_simulations,
            families_file=families_file)

    # extract second set
    if extract_second:
        local_output_dir = "{0}/genome_sequences/lincrna/{1}".format(
            output_directory, output_prefix)
        lmisco.extract_second_seqs(input_bed, input_file, input_fasta,
                                   local_output_dir)
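
A note on the 38-way backslash-continued unpack near the top of this example: it is behaviour-preserving but easy to get out of step with the arguments list. One compact alternative (a sketch, not what the script does) is to keep the parsed values in a dict keyed by argument name and drop the unpack entirely:

# access then becomes opts["input_bed"], opts["sim_stop_density"], etc.
opts = {name: getattr(args, name) for name in arguments}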
Example no. 7
def main():

    arguments = ["working_directory", "output_directory", "genome_path", "input_bed", "input_fasta", "clean_run", "extract_exon_intron_bed", "extract_exons", "extract_introns", "sort_by_exon_number", "build_transcripts", "extract_families", "orf_length_sim"]
    description = "Wrapper for miscellaneous operations on lincRNA"
    args = gen.parse_arguments(description, arguments, flags = [5,6,7,8,9,10,11,12,13], opt_flags = [2,3,4])
    working_directory, output_directory, genome_path, input_bed, input_fasta, clean_run, extract_exon_intron_bed, extract_exons, extract_introns, sort_by_exon_number, build_transcripts, extract_families, orf_length_sim = args.working_directory, args.output_directory, args.genome_path, args.input_bed, args.input_fasta, args.clean_run, args.extract_exon_intron_bed, args.extract_exons, args.extract_introns, args.sort_by_exon_number, args.build_transcripts, args.extract_families, args.orf_length_sim

    # create the directories
    gen.create_output_directories(working_directory)
    gen.create_output_directories(output_directory)

    # file paths
    exons_bed = "{0}/exons.bed".format(working_directory)
    single_exons_bed = "{0}/single_exons.bed".format(working_directory)
    multi_exons_bed = "{0}/multi_exons.bed".format(working_directory)
    exons_fasta = "{0}/exons.fasta".format(working_directory)
    single_exons_fasta = "{0}/single_exons.fasta".format(working_directory)
    multi_exons_fasta = "{0}/multi_exons.fasta".format(working_directory)
    introns_bed = "{0}/introns.bed".format(working_directory)
    introns_fasta = "{0}/introns.fasta".format(working_directory)
    transcript_sequences_fasta = "{0}/transcript_sequences.fasta".format(working_directory)
    multi_exon_transcript_sequences_fasta = "{0}/multi_exon_transcript_sequences.fasta".format(working_directory)
    multi_exon_blast_file = "{0}/multi_exons_blast_all_against_all.csv".format(working_directory)
    multi_exon_blast_database = "{0}/multi_exon_blast_all_against_all".format(working_directory)
    multi_exon_families_file = "{0}/multi_exon_families.txt".format(working_directory)

    # create the exons and introns files from bed
    if extract_exon_intron_bed:
        # copy the main file to the folder
        gen.copy_file(input_bed, "{0}/{1}".format(working_directory, input_bed.split("/")[-1]))
        # extract the features
        lmo.extract_bed_coordinates_block_format(input_bed, exons_bed, introns_bed)
    # split the exons into single-exon and multi-exon files
    if sort_by_exon_number:
        gen.check_files_exists([exons_bed])
        lmo.sort_by_exon_number(exons_bed, single_exons_bed, multi_exons_bed)

    # get exons
    if extract_exons:
        gen.check_files_exists([exons_bed])
        fo.fasta_from_intervals(exons_bed, exons_fasta, genome_path, names=True)
        # if the single exons bed file exists, get just the single exon sequences
        if os.path.isfile(single_exons_bed):
            lmo.sort_fasta_by_bed(single_exons_bed, exons_fasta, single_exons_fasta)
        # if the multi exons bed file exists, get just the multi exon sequences
        if os.path.isfile(multi_exons_bed):
            lmo.sort_fasta_by_bed(multi_exons_bed, exons_fasta, multi_exons_fasta)

    # get introns
    if extract_introns:
        gen.check_files_exists([introns_bed])
        fo.fasta_from_intervals(introns_bed, introns_fasta, genome_path, names=True)

    # build transcripts
    if build_transcripts:
        gen.check_files_exists([exons_fasta])
        lmo.build_transcripts(exons_fasta, transcript_sequences_fasta)
        # if the multi exons bed file exists, get just the multi exon sequences
        if os.path.isfile(multi_exons_bed):
            lmo.sort_fasta_by_bed(multi_exons_bed, transcript_sequences_fasta, multi_exon_transcript_sequences_fasta)

    # now group into paralogous families
    if extract_families:
        gen.check_files_exists([multi_exon_transcript_sequences_fasta])
        cons.filter_families(multi_exon_transcript_sequences_fasta, multi_exon_blast_file, multi_exon_families_file, database_path = multi_exon_blast_database, clean_run = clean_run)
Example no. 8
def main():

    description = "Look at disease snps."
    arguments = ["disease_snps_file", "output_directory", "results_prefix", "simulations", "ese_file", "intersect_snps", "get_relative_positions", "get_snp_status", "get_info", "simulate_ptc_location", "get_possible_ptc_locations", "required_simulations", "get_overlaps", "intersect_ptcs", "compare_ptcs" ,"get_introns", "compare_distances", "clinvar_ptc_locations", "location_simulation", "exclude_cpg", "ese_hit_simulation", "only_disease", "only_kgenomes", "only_ese", "get_unique_ptcs", "get_unique_rel_pos", "excess_test", "disease_locations_chisquare"]
    args = gen.parse_arguments(description, arguments, flags = [5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20, 21, 22, 23,24,25,26,27], ints=[3])
    disease_snps_file, output_directory, results_prefix, simulations, ese_file, intersect_snps, get_relative_positions, get_snp_status, get_info, simulate_ptc_location, get_possible_ptc_locations, required_simulations, get_overlaps, intersect_ptcs, compare_ptcs, get_introns, compare_distances, clinvar_ptc_locations, location_simulation, exclude_cpg, ese_hit_simulation, only_disease, only_kgenomes, only_ese, get_unique_ptcs, get_unique_rel_pos, excess_test, disease_locations_chisquare = args.disease_snps_file, args.output_directory, args.results_prefix, args.simulations, args.ese_file, args.intersect_snps, args.get_relative_positions, args.get_snp_status, args.get_info, args.simulate_ptc_location, args.get_possible_ptc_locations, args.required_simulations, args.get_overlaps, args.intersect_ptcs, args.compare_ptcs, args.get_introns, args.compare_distances, args.clinvar_ptc_locations, args.location_simulation, args.exclude_cpg, args.ese_hit_simulation, args.only_disease, args.only_kgenomes, args.only_ese, args.get_unique_ptcs, args.get_unique_rel_pos, args.excess_test, args.disease_locations_chisquare

    if simulations and not isinstance(simulations, int):
        print("\nERROR: Please provide the correct number for simulations.\n")
        raise Exception

    # create the output directory if it doesn't already exist
    gen.create_output_directories(output_directory)

    # disease_snps_file = "./source_data/clinvar_20180429.vcf.gz"
    disease_snps_index_file = "{0}.tbi".format(disease_snps_file)

    if not os.path.isfile(disease_snps_file) or not os.path.isfile(disease_snps_index_file):
        print("\nERROR: Please provide the required disease SNPs file(s).\n")
        raise Exception

    # intersect the coding exons with the disease snps
    exon_bed = "{0}_coding_exons.bed".format(results_prefix)
    disease_snp_intersect_file_vcf = "{0}/disease_snp_intersect.vcf".format(output_directory)
    disease_snp_intersect_file_bed = "{0}/disease_snp_intersect.bed".format(output_directory)
    if intersect_snps:
        print("Intersecting snps with exons")
        so.intersect_snps_parallel(exon_bed, disease_snps_file, disease_snp_intersect_file_vcf)
        so.intersect_vcf_to_bed(exon_bed, disease_snp_intersect_file_vcf, disease_snp_intersect_file_bed, change_names = True)

    # get relative positions of the snps in cds and exons
    full_bed = "{0}_CDS.bed".format(results_prefix)
    disease_snps_relative_exon_positions = "{0}/disease_snp_relative_exon_positions.bed".format(output_directory)
    disease_snps_relative_cds_positions = "{0}/disease_snp_relative_cds_positions.bed".format(output_directory)
    if get_relative_positions:
        print("Getting snp relative positions...")
        so.get_snp_relative_exon_position(disease_snp_intersect_file_bed, disease_snps_relative_exon_positions)
        # read into a variable because that is the input the function expects
        relative_positions = gen.read_many_fields(disease_snps_relative_exon_positions, "\t")
        so.get_snp_relative_cds_position(relative_positions, disease_snps_relative_cds_positions, full_bed)

    # get the change status of the snps to check them
    cds_fasta = "{0}_CDS.fasta".format(results_prefix)
    disease_ptcs_file = "{0}/disease_ptcs.txt".format(output_directory)
    disease_other_file = "{0}/disease_other_snps.txt".format(output_directory)
    if get_snp_status:
        print("Getting snp status...")
        so.get_snp_change_status(disease_snps_relative_cds_positions, cds_fasta, disease_ptcs_file, disease_other_file)

    # get intersect between the clinvar ptcs and 1000 genomes ptcs
    ptc_file = "{0}_ptc_file.txt".format(results_prefix)
    ptc_intersect_file = "{0}/ptc_intersect.bed".format(output_directory)
    if intersect_ptcs:
        temp_disease_ptc_file = "temp_data/{0}".format(random.random())
        dso.refactor_ptc_file(disease_ptcs_file, temp_disease_ptc_file)
        temp_k_genomes_ptc_file = "temp_data/{0}".format(random.random())
        dso.refactor_ptc_file(ptc_file, temp_k_genomes_ptc_file, header=True)
        bao.intersect_bed(temp_k_genomes_ptc_file, temp_disease_ptc_file, write_both = True, no_dups=False, output_file = ptc_intersect_file)
        gen.remove_file(temp_disease_ptc_file)
        gen.remove_file(temp_k_genomes_ptc_file)

    # get a list of ptcs unique to each dataset
    unique_ptcs = "{0}/disease_ptcs_no_intersect.bed".format(output_directory)
    unique_ptcs_kgenomes = "{0}/kgenomes_ptcs_no_intersect.bed".format(output_directory)
    if get_unique_ptcs:
        dso.get_unique_ptcs(disease_ptcs_file, ptc_file, ptc_intersect_file, unique_ptcs, unique_ptcs_kgenomes)

    # get the relative positions of the ptcs unique to each dataset
    unique_ptcs_rel_pos_file = "{0}/disease_ptcs_no_intersect_rel_pos.bed".format(output_directory)
    kgenomes_relative_positions = "{0}_PTC_relative_exon_positions.bed".format(results_prefix)
    kgenomes_unique_ptcs_rel_pos_file = "{0}/kgenomes_ptcs_no_intersect_rel_pos.bed".format(output_directory)
    if get_unique_rel_pos:
        dso.get_unique_rel_pos(unique_ptcs, disease_snps_relative_exon_positions, unique_ptcs_kgenomes, kgenomes_relative_positions, unique_ptcs_rel_pos_file, kgenomes_unique_ptcs_rel_pos_file)


    # get the ese file name
    ese_file_name = ese_file.split('/')[-1].split('.')[0]
    # get the coding exons fasta file path
    coding_exons_fasta = "{0}_coding_exons.fasta".format(results_prefix)

    # snp_relative_positions_file = "{0}_SNP_relative_exon_position.bed".format(results_prefix)

    # simulation picking random reference allele matched simulants
    clinvar_location_simulation_file = "{0}/clinvar_ptc_location_simulation.csv".format(output_directory)
    clinvar_location_simulation_ese_overlap_file = "{0}/clinvar_ptc_location_simulation_{1}_ese_overlaps.csv".format(output_directory, ese_file_name)
    kgenomes_location_simulation_file = "{0}/1000_genomes_simulations.csv".format(output_directory)
    kgenomes_location_simulation_ese_overlap_file = "{0}/1000_genomes_simulations_ese_overlaps.csv".format(output_directory)

    if location_simulation:
        if not only_kgenomes:
            print('Running ptc location simulation on disease PTCs...')
            dso.ptc_location_simulation(unique_ptcs_rel_pos_file, coding_exons_fasta, simulations, clinvar_location_simulation_file, clinvar_location_simulation_ese_overlap_file, ese_file, only_ese, exclude_cpg)
        if not only_disease:
            print('Running ptc location simulation on 1000 genomes PTCs...')
            dso.ptc_location_simulation(kgenomes_unique_ptcs_rel_pos_file, coding_exons_fasta, simulations, kgenomes_location_simulation_file, kgenomes_location_simulation_ese_overlap_file, ese_file, only_ese, exclude_cpg)


    window_start = 3
    window_end = 69
    clinvar_ese_hit_simulation_file = "{0}/clinvar_ese_hit_simulation_{1}_{2}_{3}.csv".format(output_directory, window_start, window_end, ese_file_name)
    kgenomes_ese_hit_simulation_file = "{0}/1000_genomes_ese_hit_simulation_{1}_{2}_{3}.csv".format(output_directory, window_start, window_end, ese_file_name)

    # do a simulation picking only sites from within the region
    if ese_hit_simulation:
        if not only_kgenomes:
            print("Simulating ESE hits on the {0}-{1} region for disease PTCs...".format(window_start, window_end))
            dso.ese_hit_simulation(unique_ptcs_rel_pos_file, coding_exons_fasta, simulations, clinvar_ese_hit_simulation_file, ese_file, window_start, window_end, exclude_cpg)
        if not only_disease:
            print("Simulating ESE hits on the {0}-{1} region for 1000 genomes PTCs...".format(window_start, window_end))
            dso.ese_hit_simulation(kgenomes_unique_ptcs_rel_pos_file, coding_exons_fasta, simulations, kgenomes_ese_hit_simulation_file, ese_file, window_start, window_end, exclude_cpg)


    excess_test_file = "{0}/clinvar_ptc_{1}_{2}_excesses.csv".format(output_directory, window_start, window_end)
    if excess_test:
        dso.excess_test(unique_ptcs_rel_pos_file, coding_exons_fasta, excess_test_file)

    location_test_file = "{0}/clinvar_locations_chisquare.csv".format(output_directory)
    if disease_locations_chisquare:
        dso.disease_ptc_location_test(unique_ptcs_rel_pos_file, coding_exons_fasta, location_test_file)
Example no. 9
def main():

    description = "Take an output file from prepare_FANTOM.py and make a file with the expression data for each gene."
    args = gen.parse_arguments(description, [
        "clean_fasta", "promoters_file_name", "cage_file_name", "out_prefix",
        "TPM_threshold"
    ],
                               ints=[4])
    [
        clean_fasta, promoters_file_name, cage_file_name, out_prefix,
        TPM_threshold
    ] = [
        args.clean_fasta, args.promoters_file_name, args.cage_file_name,
        args.out_prefix, args.TPM_threshold
    ]

    #extract transcript coordinates
    transcripts_file = "{0}_transcripts_clean.bed".format(out_prefix)
    bo.extract_features("../source_data/Homo_sapiens.GRCh37.87.gtf",
                        transcripts_file, ["transcript"])

    #get the names of the transcripts you're interested in
    names = gen.read_fasta(clean_fasta)[0]

    #write the coordinates of the promoter regions of those transcripts to file
    with open(promoters_file_name,
              "w") as out_file, open(transcripts_file, "r") as in_file:
        for line in in_file:
            parsed = (line.rstrip("\n")).split("\t")
            #parse out the transcript name
            name = parsed[3].split(".")[0]
            #skip transcripts that aren't among your transcripts of interest
            if name in names:
                #determine the coordinates of a 1001 bp region centered on the TSS (the supposed promoter region)
                if parsed[5] == "+":
                    current_line = [
                        "chr" + parsed[0],
                        int(parsed[1]) - 500,
                        int(parsed[1]) + 500 + 1, name, ".", parsed[5]
                    ]
                elif parsed[5] == "-":
                    current_line = [
                        "chr" + parsed[0],
                        int(parsed[2]) - 500 - 1,
                        int(parsed[2]) + 500, name, ".", parsed[5]
                    ]
                else:
                    raise RuntimeError("Invalid strand information!")
                out_file.write("\t".join([str(i) for i in current_line]))
                out_file.write("\n")

    #check which CAGE peaks overlap which promoters
    overlapping_peaks_file = "{0}_FANTOM_overlap_peaks.bed".format(out_prefix)
    bmo.intersect_bed(cage_file_name,
                      promoters_file_name,
                      output_file=overlapping_peaks_file,
                      force_strand=True,
                      write_both=True,
                      no_dups=False)

    #for each transcript, get all overlapping peaks
    #(store only the expression information)
    peaks_dict = {name: [] for name in names}
    with open(overlapping_peaks_file, "r") as peaks:
        for peak in peaks:
            peak = peak.split("\t")
            name = peak[9]
            peaks_dict[name].append(peak[3])

    #for each transcript,
    #store the mean TPM within each tissue (averaged over the different peaks
    #associated to that transcript)
    mean_dict = {}
    np.set_printoptions(suppress=True)
    for name in peaks_dict:
        if len(peaks_dict[name]) > 0:
            current_mat = np.array([[float(j) for j in i.split("|")]
                                    for i in peaks_dict[name]])
            means = np.mean(current_mat, axis=0)
            mean_dict[name] = means

    #calculate expression parameters
    final_dict = {}
    for gene in mean_dict:
        expressed = len([i for i in mean_dict[gene] if i > TPM_threshold])
        fraction = expressed / len(mean_dict[gene])
        maximum = np.max(mean_dict[gene])
        median_expr = np.median(mean_dict[gene])
        median_if_expressed = np.median(
            [i for i in mean_dict[gene] if i > TPM_threshold])
        final_dict[gene] = [
            fraction, maximum, median_expr, median_if_expressed
        ]

    output_file_name = "{0}_FANTOM_expression_per_transcript.txt".format(
        out_prefix)
    with open(output_file_name, "w") as file:
        file.write("gene\tbreadth\tmax\tmedian\tmedian_expr\n")
        for i in sorted(list(final_dict.keys())):
            if final_dict[i] is not None:
                file.write("\t".join([i] + [str(j) for j in final_dict[i]]))
                file.write("\n")
Example no. 10
def main():

    description = "Check whether PTCs are associated with greater rates of exon skipping."
    args = gen.parse_arguments(
        description, [
            "gtf", "genome_fasta", "bams_folder", "vcf_folder", "panel_file",
            "out_prefix", "bam_analysis_folder", "number_of_simulations",
            "simulation_output_folder", "motif_file", "filter_genome_data",
            "get_SNPs", "process_bams", "simulate_ptc_snps",
            "motif_complement", "overwrite_intersect", "use_old_sims",
            "out_of_frame", "simulate_ptcs_with_monomorphic",
            "generate_monomorphic_indices", "ignore_determine_snp_type",
            "ignore_psi_calculation", "ptc_location_analysis"
        ],
        flags=[10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22],
        ints=[7])
    gtf, genome_fasta, bams_folder, vcf_folder, panel_file, out_prefix, bam_analysis_folder, number_of_simulations, simulation_output_folder, motif_file, filter_genome_data, get_SNPs, process_bams, simulate_ptc_snps, motif_complement, overwrite_intersect, use_old_sims, out_of_frame, simulate_ptcs_with_monomorphic, generate_monomorphic_indices, ignore_determine_snp_type, ignore_psi_calculation, ptc_location_analysis = args.gtf, args.genome_fasta, args.bams_folder, args.vcf_folder, args.panel_file, args.out_prefix, args.bam_analysis_folder, args.number_of_simulations, args.simulation_output_folder, args.motif_file, args.filter_genome_data, args.get_SNPs, args.process_bams, args.simulate_ptc_snps, args.motif_complement, args.overwrite_intersect, args.use_old_sims, args.out_of_frame, args.simulate_ptcs_with_monomorphic, args.generate_monomorphic_indices, args.ignore_determine_snp_type, args.ignore_psi_calculation, args.ptc_location_analysis

    start = time.time()

    # create any necessary output directories
    directory_splits = out_prefix.split('/')
    directory_paths = "/".join(directory_splits[:-1])
    gen.create_output_directories(directory_paths)
    gen.create_directory('temp_data/')

    CDS_fasta = "{0}_CDS.fasta".format(out_prefix)
    CDS_bed = "{0}_CDS.bed".format(out_prefix)
    exon_bed = "{0}_exons.bed".format(out_prefix)
    filtered_exon_bed = "{0}_filtered_exons.bed".format(out_prefix)
    exon_junctions_file = "{0}_exon_junctions.bed".format(out_prefix)
    coding_exon_bed = "{0}_coding_exons.bed".format(out_prefix)

    if filter_genome_data:
        #extract and filter CDS coordinates and sequences
        print("Extracting and filtering CDSs...")
        bo.extract_cds(gtf,
                       CDS_bed,
                       CDS_fasta,
                       genome_fasta,
                       all_checks=True,
                       uniquify=True,
                       clean_chrom_only=True,
                       full_chr_name=True)
        gen.get_time(start)

        #group the CDS sequences into families based on sequence similarity
        print("Grouping sequences into families...")
        names = gen.read_fasta(CDS_fasta)[0]
        gen.find_families_ensembl(
            "../source_data/GRCh37_ensembl_protein_families.txt", names,
            "{0}_families.txt".format(out_prefix))
        gen.get_time(start)

        print("Extracting and filtering exons...")
        #extract exon coordinates
        bo.extract_exons(gtf, exon_bed)
        #only leave exons from transcripts that passed quality control in the extract_cds step above.
        #also only leave a single gene per family
        bo.filter_bed_from_fasta(
            exon_bed,
            CDS_fasta,
            filtered_exon_bed,
            families_file="{0}_families.txt".format(out_prefix))
        gen.get_time(start)

        #extract exon-exon junction coordinates
        print("Extracting exon-exon junctions...")
        bo.extract_exon_junctions(exon_bed,
                                  exon_junctions_file,
                                  window_of_interest=2)
        gen.get_time(start)

        #make another exons bed that only contains fully coding exons.
        #This is because in the final analysis, we should only consider fully protein-coding exons.
        #However, for getting the exon junctions we need the full exons file because fully protein-coding exons might
        #be flanked by exons that are not. This is why we couldn't do this filtering step earlier.
        print(
            "Filtering out overlapping, non-coding and partially coding, as well as terminal exons..."
        )
        bo.check_coding(filtered_exon_bed,
                        CDS_bed,
                        coding_exon_bed,
                        remove_overlapping=True)
        gen.get_time(start)

    SNP_file = "{0}_SNP_file.txt".format(out_prefix)
    if out_of_frame:
        out_prefix = out_prefix + "_out_of_frame"
    PTC_file = "{0}_ptc_file.txt".format(out_prefix)
    syn_nonsyn_file = "{0}_syn_nonsyn_file.txt".format(out_prefix)
    CDS_interval_file = "{0}_intervals{1}".format(
        os.path.splitext(CDS_fasta)[0],
        os.path.splitext(CDS_fasta)[1])
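    # e.g. with a hypothetical out_prefix of "results/run1", CDS_fasta is
    # "results/run1_CDS.fasta" and CDS_interval_file becomes
    # "results/run1_CDS_intervals.fasta"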
    #check which individuals were included in Geuvadis
    full_sample_names = os.listdir(bams_folder)
    full_sample_names = [
        i for i in full_sample_names if i[-4:] == ".bam" and "proc" not in i
    ]
    sample_names = [(i.split("."))[0] for i in full_sample_names]
    sample_names = [i for i in sample_names if len(i) > 0]
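    # e.g. a file named "HG00096.mapped.bam" (a made-up name) passes the filter
    # above and contributes the sample name "HG00096"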
    print('{0} samples included in Geuvadis...'.format(len(sample_names)))
    #17 of the Geuvadis samples don't appear in the 1000 Genomes vcf;
    #the reason is unclear, so for now they are simply filtered out

    with open("../source_data/samples_in_vcf.txt") as file:
        samples_in_vcf = file.readlines()
    samples_in_vcf = [i.rstrip("\n") for i in samples_in_vcf]
    sample_names = [i for i in sample_names if i in samples_in_vcf]
    print('{0} samples also in vcf...'.format(len(sample_names)))
    sample_file = "{0}_sample_file.txt".format(out_prefix)

    # create a fasta containing all coding exon sequences (used for the exons that carry SNPs)
    coding_exons_fasta = "{0}_coding_exons.fasta".format(out_prefix)
    bo.fasta_from_intervals(coding_exon_bed,
                            coding_exons_fasta,
                            genome_fasta,
                            names=True)

    if get_SNPs:
        #get SNPs for the sample
        intersect_file = "{0}_SNP_CDS_intersect.bed".format(out_prefix)
        print("Getting SNP data...")
        so.get_snps_in_cds(coding_exon_bed, CDS_bed, vcf_folder, panel_file,
                           sample_names, sample_file, intersect_file,
                           out_prefix)
        print("Calculating SNP positions...")
        so.get_snp_positions(sample_file, SNP_file, CDS_bed, intersect_file,
                             out_prefix)
        gen.get_time(start)

    if not ignore_determine_snp_type:
        print("Determining SNP type...")
        so.get_snp_change_status(SNP_file,
                                 CDS_fasta,
                                 PTC_file,
                                 syn_nonsyn_file,
                                 out_of_frame=out_of_frame,
                                 ref_check=True,
                                 headers=True)
        gen.get_time(start)

    #filter the exon junctions file to only leave those junctions that flank exons retained in the previous step.
    print(
        "Filtering exon-exon junctions to only leave those that flank exons with a PTC variant..."
    )
    PTC_exon_junctions_file = "{0}_filtered_exon_junctions.bed".format(
        out_prefix)
    bo.filter_exon_junctions(exon_junctions_file, PTC_file,
                             PTC_exon_junctions_file)

    #make a list of all the .bam files and modify them to have the full path rather than just the file name
    bam_files = [
        "{0}/{1}".format(bams_folder, i) for i in full_sample_names
        if (i.split("."))[0] in sample_names
    ]

    #in parallel, do the processing on individual .bam files
    exon_junctions_bam_output_folder = "{0}__analysis_exon_junction_bams".format(
        out_prefix)
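    # note: optional arguments that were not supplied appear to reach this point
    # as the literal string "None" (hence this string comparison and the
    # motif_file != "None" check further down) rather than as Python's None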
    if bam_analysis_folder == "None":
        bam_analysis_folder = "{0}__analysis_bam_analysis".format(out_prefix)
    gen.create_directory(bam_analysis_folder)
    if process_bams:
        print("Processing RNA-seq data...")
        if out_of_frame:
            splits = exon_junctions_bam_output_folder.split('/')
            splits[-1] = splits[-1].replace('_out_of_frame', '')
            exon_junctions_bam_output_folder = "/".join(splits)
        gen.create_directory(exon_junctions_bam_output_folder)
        #we have to do it like this because you can't pass flags into run_in_parallel
        keyword_dict = {"overwrite_intersect": overwrite_intersect}
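        # (an aside: functools.partial could bind the flag up front instead,
        # e.g. partial(nao.process_bam_per_individual, overwrite_intersect=overwrite_intersect),
        # but that assumes the worker accepts it as a keyword argument, so the
        # dict workaround is kept here)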
        processes = gen.run_in_parallel(bam_files, [
            "foo", exon_junctions_file, PTC_exon_junctions_file,
            bam_analysis_folder, PTC_file, syn_nonsyn_file, out_prefix,
            exon_junctions_bam_output_folder, keyword_dict
        ],
                                        nao.process_bam_per_individual,
                                        workers=36)
        for process in processes:
            process.get()
        gen.get_time(start)

    #if required, filter PTCs to only leave ones that overlap motifs from a specified set
    motif_filtering = False
    if motif_file != "None":
        print(
            "Filtering SNPs based on whether or not they overlap a motif from the specified set..."
        )
        motif_suffix = ((motif_file.split("/"))[-1]).split(".")[0]
        if motif_complement:
            out_prefix = "{0}_{1}_complement".format(out_prefix, motif_suffix)
        else:
            out_prefix = "{0}_{1}".format(out_prefix, motif_suffix)
        filtered_ptc = "{0}_ptc_file.txt".format(out_prefix)
        so.filter_motif_SNPs(CDS_fasta,
                             PTC_file,
                             motif_file,
                             filtered_ptc,
                             complement=motif_complement)
        PTC_file = filtered_ptc

    final_file = "{0}__analysis_final_output.txt".format(out_prefix)
    if not ignore_psi_calculation:
        print("Calculating PSI...")
        bmo.compare_PSI(PTC_file, bam_analysis_folder, final_file)

    #run the simulation that swaps ptcs for nonsynonymous snps
    if simulate_ptc_snps:
        if not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception
        nao.ptc_snp_simulation(out_prefix,
                               simulation_output_folder,
                               PTC_file,
                               syn_nonsyn_file,
                               exon_junctions_file,
                               bam_files,
                               number_of_simulations,
                               exon_junctions_bam_output_folder,
                               use_old_sims=use_old_sims)

    # run the simulation that picks monomorphic sites
    if simulate_ptcs_with_monomorphic:
        if not number_of_simulations:
            print("Please specify the number of simulations")
            raise Exception

        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        if not os.path.exists(coding_exon_fasta):
            print('Coding exon fasta is required...')
            raise Exception
        nao.ptc_monomorphic_simulation(
            out_prefix,
            simulation_output_folder,
            sample_file,
            genome_fasta,
            PTC_file,
            syn_nonsyn_file,
            coding_exon_bed,
            coding_exon_fasta,
            exon_junctions_file,
            bam_files,
            number_of_simulations,
            generate_indices=generate_monomorphic_indices,
            use_old_sims=use_old_sims)

    # get the locations of the ptcs
    if ptc_location_analysis:
        print("PTC locations analysis...")
        snp_relative_exon_position_file = "{0}_SNP_relative_exon_position.bed".format(
            out_prefix)
        ptc_location_analysis_output_file = "{0}_ptc_location_analysis.csv".format(
            out_prefix)
        coding_exon_fasta = "{0}_coding_exons.fasta".format(out_prefix)
        if not os.path.exists(coding_exon_fasta) or not os.path.exists(
                snp_relative_exon_position_file) or not os.path.exists(
                    PTC_file):
            print("Please run --filter_genome_data and --get_SNPs first...")
            raise Exception
        # need to work out where and what the analysis outputs need to do
        so.ptc_locations(PTC_file, snp_relative_exon_position_file,
                         ptc_location_analysis_output_file)
Esempio n. 11
0
def main():

    description = "Filter an osc file to only contain the samples that you want and format it as a bed file so that you could lift over the coordinates."
    arguments = ["input_file_name", "output_file_name", "filter_samples"]
    args = parse_arguments(description, arguments, flags=[2])
    input_file_name, output_file_name, filter_samples = [
        args.input_file_name, args.output_file_name, args.filter_samples
    ]

    #these are all the pooled samples except the brain subregions, which were removed
    #because otherwise roughly 11/40 of the tissues would have been brain; the retina was kept
    ones_I_want = [
        'of adipose tissue, adult, pool1', 'of adrenal gland, adult, pool1',
        'of aorta, adult, pool1', 'of bladder, adult, pool1',
        'of blood, adult, pool1', 'of brain, adult, pool1',
        'of cervix, adult, pool1', 'of colon, adult, pool1',
        'of esophagus, adult, pool1', 'of heart, adult, pool1',
        'of kidney, adult, pool1', 'of liver, adult, pool1',
        'of lung, adult, pool1', 'of ovary, adult, pool1',
        'of placenta, adult, pool1', 'of prostate, adult, pool1',
        'of retina, adult, pool1', 'of salivary gland, adult, pool1',
        'of skeletal muscle, adult, pool1', 'of small intestine, adult, pool1',
        'of smooth muscle, adult, pool1', 'of spleen, adult, pool1',
        'of testis, adult, pool1', 'of thymus, adult, pool1',
        'of thyroid, adult, pool1', 'of tonsil, adult, pool1',
        'of trachea, adult, pool1', 'of uterus, adult, pool1'
    ]

    IDs = []
    indices = []
    full_IDs = []

    counter = 0

    with open(input_file_name) as file, open(output_file_name,
                                             "w") as output_file:
        for line in file:
            counter = counter + 1
            if counter % 1000 == 0:
                print(counter)
            if line[0] == "#":
                if filter_samples:
                    if "adult, pool1" in line:
                        for search in ones_I_want:
                            if search in line:
                                ID = re.findall(r"CNhs[\d\.\-\w]*", line)[0]
                                IDs.append(ID)
            elif line[:6] == "00Anno":
                if filter_samples:
                    line = line.split("\t")
                    for pos, elem in enumerate(line):
                        for ID in IDs:
                            if ID in elem:
                                indices.append(pos)
                                full_IDs.append(elem)
            elif line[:3] == "chr":
                #store the actual data values in the name field of the bed record
                #so that they survive the CrossMap lift-over
                line = line.split("\t")
                coords = line[0]
                line[-1] = line[-1].rstrip("\n")
                if filter_samples:
                    line = [line[i] for i in indices]
                else:
                    line = line[1:]
                coords = coords.split("..")
                chrom = coords[0].split(":")[0]
                start = coords[0].split(":")[1]
                end = coords[1].split(",")[0]
                strand = coords[1].split(",")[1]
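                # e.g. a coordinate field like "chr1:564571..564600,+" (an
                # illustrative value) splits into chrom "chr1", start "564571",
                # end "564600" and strand "+"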
                name = "|".join(line)
                output_line = [chrom, start, end, name, ".", strand]
                output_file.write("\t".join(output_line))
                output_file.write("\n")
def main():

    arguments = [
        "output_directory", "genome_gtf", "genome_fasta", "ortholog_gtf",
        "ortholog_fasta", "input_file", "genome_fasta", "mapping_file",
        "codes_file", "ensembl_links", "extract_protein_coding",
        "extract_exons", "extract_introns", "extract_coding_exons",
        "extract_non_coding_exons", "extract_non_transcribed_regions",
        "extract_lincrna_seqs", "clean_run"
    ]

    description = ""
    args = gen.parse_arguments(description,
                               arguments,
                               opt_flags=[1, 2, 3, 4, 5, 6, 7, 8, 9],
                               flags=[10, 11, 12, 13, 14, 15, 16, 17])
    output_directory, genome_gtf, genome_fasta, ortholog_gtf, ortholog_fasta, input_file, genome_fasta, mapping_file, codes_file, ensembl_links, extract_protein_coding, extract_exons, extract_introns, extract_coding_exons, extract_non_coding_exons, extract_non_transcribed_regions, extract_lincrna_seqs, clean_run = args.output_directory, args.genome_gtf, args.genome_fasta, args.ortholog_gtf, args.ortholog_fasta, args.input_file, args.genome_fasta, args.mapping_file, args.codes_file, args.ensembl_links, args.extract_protein_coding, args.extract_exons, args.extract_introns, args.extract_coding_exons, args.extract_non_coding_exons, args.extract_non_transcribed_regions, args.extract_lincrna_seqs, args.clean_run

    # set a start time
    start = time.time()

    # create the output_directory if it doesn't already exist
    gen.create_output_directories(output_directory)

    # get the sequences
    if extract_protein_coding:
        # genome_gtf = gtf genome 1, genome_fasta = genome fasta 1, ortholog_gtf = gtf genome 2, ortholog_fasta = genome fasta 2, ensembl_links = orthologs file
        cont.extract_clean_sequences(genome_gtf,
                                     genome_fasta,
                                     ortholog_gtf,
                                     ortholog_fasta,
                                     ensembl_links,
                                     output_directory,
                                     clean_run=clean_run)

    full_exon_file = "{0}/genome_sequences/human/human.exons.bed".format(
        output_directory)
    if extract_exons:
        cont.extract_exons(genome_gtf,
                           genome_fasta,
                           output_directory,
                           full_exon_file,
                           clean_run=clean_run)
        sequo.clean_feature_file(full_exon_file)

    exons_bed = "{0}/genome_sequences/{1}/{1}.cds.clean_filtered_exons.bed".format(
        output_directory, "human")
    coding_exons_bed = "{0}/genome_sequences/{1}/{1}.cds.clean_coding_exons.bed".format(
        output_directory, "human")
    coding_exons_fasta = "{0}/genome_sequences/{1}/{1}.cds.clean_coding_exons.fasta".format(
        output_directory, "human")
    if extract_coding_exons:
        sequo.get_coding_exon_coordinates(full_exon_file, exons_bed,
                                          coding_exons_bed)
        fo.fasta_from_intervals(coding_exons_bed,
                                coding_exons_fasta,
                                genome_fasta,
                                names=True)

    if extract_non_coding_exons:
        non_coding_exons_bed = "{0}/genome_sequences/{1}/{1}.cds.clean_non_coding_exons.bed".format(
            output_directory, "human")
        non_coding_exons_fasta = "{0}/genome_sequences/{1}/{1}.cds.clean_non_coding_exons.fasta".format(
            output_directory, "human")
        sequo.get_non_coding_exon_coordinates(full_exon_file, exons_bed,
                                              non_coding_exons_bed)
        fo.fasta_from_intervals(non_coding_exons_bed,
                                non_coding_exons_fasta,
                                genome_fasta,
                                names=True)

    if extract_introns:
        intron_bed = "{0}/genome_sequences/human/human.clean_introns.bed".format(
            output_directory)
        intron_fasta = "{0}/genome_sequences/human/human.clean_introns.fasta".format(
            output_directory)
        sequo.get_intron_coordinates(coding_exons_bed, intron_bed)
        fo.fasta_from_intervals(intron_bed,
                                intron_fasta,
                                genome_fasta,
                                names=True)

    if extract_non_transcribed_regions:
        all_features_bed = "{0}/genome_sequences/human/human.all_features.bed".format(
            output_directory)
        non_transcribed_bed = "{0}/genome_sequences/human/human.non_transcribed.bed".format(
            output_directory)
        non_transcribed_fasta = "{0}/genome_sequences/human/human.non_transcribed.fasta".format(
            output_directory)
        seqo.get_non_transcribed_regions(genome_gtf, genome_fasta,
                                         all_features_bed, non_transcribed_bed,
                                         non_transcribed_fasta,
                                         output_directory)

    # extract sequences from source file
    if extract_lincrna_seqs:
        # set up the output fasta to contain the exon seqs
        lincrna_exons_bed = "{0}/lincRNA_exons.bed".format(output_directory)
        lincrna_exons_fasta = "{0}/lincRNA_exons.fasta".format(
            output_directory)
        lincrna_seqs_fasta = "{0}/lincRNA_seqs.fasta".format(output_directory)
        print("Extracting lincRNA seqs...")
        # hg38 and NONCODE are used below but are never parsed from the command
        # line above; they are assumed here to be boolean flags defaulting to
        # False so that the snippet runs as written
        hg38 = False
        NONCODE = False
        fo.extract_seqs(input_file,
                        genome_fasta,
                        lincrna_exons_bed,
                        lincrna_exons_fasta,
                        lincrna_seqs_fasta,
                        mapping_file,
                        codes_file,
                        exclude_XY=True,
                        hg38=hg38,
                        NONCODE=NONCODE)
        print("Use lincRNA_misc.py to do further filtering...")