コード例 #1
0
def intron_hexamer_test(input_fasta, motif_file, output_directory, output_file, required_simulations = None, families_file = None):
    """
    Generate random hexamers from introns and compare their purine and
    nucleotide content with that of the real motifs.

    Simulated motif files live in "<output_directory>/random_hexamers". If
    that directory holds fewer files than required_simulations, the missing
    simulations are generated from the sequences in input_fasta. The purine
    and nucleotide contents of the real motifs and of every simulated motif
    file are written to output_file as csv, after which the random_hexamers
    directory is removed.

    Args:
        input_fasta (str): path to intron fasta
        motif_file (str): path to file containing real motifs
        output_directory (str): path to output directory
        output_file (str): path to output file
        required_simulations (int): if set, the number of simulations to run.
            If None, no new simulations are generated.
        families_file (str): if set, path to families file
    """

    hexamers_dir = "{0}/random_hexamers".format(output_directory)
    gen.create_output_directories(hexamers_dir)
    # get the motifs
    motifs = sequo.read_motifs(motif_file)

    # if there are not enough simulations, generate them. The explicit None
    # check is needed because the default (None) cannot be ordered against
    # an int in Python 3.
    existing_simulations = len(os.listdir(hexamers_dir))
    if required_simulations is not None and existing_simulations < required_simulations:
        required = list(range(required_simulations - existing_simulations))
        names, seqs = gen.read_fasta(input_fasta)
        # group the sequences by the part of their name before the first "."
        seqs_list = collections.defaultdict(list)
        for name, seq in zip(names, seqs):
            seqs_list[name.split(".")[0]].append(seq)

        if families_file:
            seqs_list = sequo.pick_random_family_member(families_file, seqs_list)

        # flatten to a single string, with "X" separating the sequences
        # NOTE(review): presumably the separator lets the sampler reject
        # hexamers spanning two sequences - confirm in locate_random_motifs
        all_seqs = [seq for seq_id in seqs_list for seq in seqs_list[seq_id]]
        full_seq = "X".join(all_seqs)
        simopc.run_simulation_function(required, [full_seq, motifs, hexamers_dir], sequo.locate_random_motifs, sim_run = False)

    # calculate the purine contents
    real_purine_content = sequo.calc_purine_content(motifs)
    real_nt_content = sequo.calc_nucleotide_content(motifs)

    test_purine_content = []
    test_nt_content = []
    for file in os.listdir(hexamers_dir):
        filepath = "{0}/{1}".format(hexamers_dir, file)
        test_motifs = sequo.read_motifs(filepath)
        test_purine_content.append(sequo.calc_purine_content(test_motifs))
        test_nt_content.append(sequo.calc_nucleotide_content(test_motifs))

    with open(output_file, "w") as outfile:
        outfile.write("id,purine_content,a_content,c_content,g_content,t_content\n")
        # nucleotide contents are written in sorted key order
        real_nts = [real_nt_content[nt] for nt in sorted(real_nt_content)]
        outfile.write("real,{0},{1}\n".format(real_purine_content, ",".join(gen.stringify(real_nts))))
        for sim_id, (purine_content, nt_content) in enumerate(zip(test_purine_content, test_nt_content), start = 1):
            sim_nts = [nt_content[nt] for nt in sorted(nt_content)]
            outfile.write("{0},{1},{2}\n".format(sim_id, purine_content, ",".join(gen.stringify(sim_nts))))

    # remove the output directory
    gen.remove_directory(hexamers_dir)
コード例 #2
0
def motif_codon_density(motif_file, output_directory):
    """
    Run ops.calc_codon_density_in_motifs on the motif file for the stop
    codons and for every codon set listed in the GC matched combinations
    file, and collect the results in a single csv.

    The combinations are read from
    "<output_directory>/gc_matched_combinations.bed", which is generated
    first if it does not exist. Results are written to
    "<output_directory>/motif_densities/<motif file prefix>.csv".

    Args:
        motif_file (str): path to file containing the motifs
        output_directory (str): path to output directory
    """

    stop_codons = ["TAA", "TAG", "TGA"]
    matched_sets_path = "{0}/gc_matched_combinations.bed".format(
        output_directory)
    matched_sets_exist = os.path.isfile(matched_sets_path)
    if not matched_sets_exist:
        # only build the matched combinations once
        seqo.get_gc_matched_motifs(stop_codons, matched_sets_path)

    # scratch space for the per-set result files
    scratch_dir = "temp_motif_density"
    gen.create_output_directories(scratch_dir)

    # the stop codons themselves are run as the last set
    codon_sets = gen.read_many_fields(matched_sets_path, "\t")
    codon_sets.append(["TAA", "TAG", "TGA"])

    density_files = simoc.run_simulation_function(
        codon_sets,
        [motif_file, scratch_dir],
        ops.calc_codon_density_in_motifs,
        sim_run=False)

    densities_dir = "{0}/motif_densities".format(output_directory)
    gen.create_output_directories(densities_dir)

    # output is named after the motif file, up to its first "."
    motif_prefix = motif_file.rsplit("/", 1)[-1].partition(".")[0]
    output_file = "{0}/{1}.csv".format(densities_dir, motif_prefix)
    with open(output_file, "w") as outfile:
        outfile.write("id,motifs,density\n")
        for row_id, density_file in enumerate(sorted(density_files), start=1):
            # only the first line of each result file is used
            fields = gen.read_many_fields(density_file, ",")[0]
            outfile.write("{0},{1},{2}\n".format(row_id, fields[0], fields[1]))

    gen.remove_directory(scratch_dir)
コード例 #3
0
def motif_stop_codon_densities(motif_file, motif_controls_directory, required_simulations, output_file):
    """
    Run calculate_stop_codon_densities on the real motif file and on the
    control motif files, and gather the results in one csv.

    Args:
        motif_file (str): path to file containing the real motifs
        motif_controls_directory (str): path to directory holding the
            control motif files
        required_simulations (int): maximum number of control files to use
        output_file (str): path to output file
    """

    # take at most required_simulations control files, in listing order
    control_names = os.listdir(motif_controls_directory)[:required_simulations]

    # map each id ("real" or the control index) to its file path
    filelist = {"real": motif_file}
    filelist.update(
        (index, "{0}/{1}".format(motif_controls_directory, name))
        for index, name in enumerate(control_names)
    )
    file_ids = list(filelist)

    # NOTE: this directory is not removed at the end of the function
    temp_dir = "temp_motif_dir"
    gen.create_output_directories(temp_dir)

    result_files = simopc.run_simulation_function(file_ids, [filelist, temp_dir], calculate_stop_codon_densities, sim_run = False)

    with open(output_file, "w") as outfile:
        outfile.write("sim_id,stop_density\n")
        for result_file in result_files:
            # only the first line of each result file is used
            first_row = gen.read_many_fields(result_file, "\t")[0]
            outfile.write("{0}\n".format(",".join(first_row)))
コード例 #4
0
def cds_motif_test(cds_fasta, output_file):
    """
    Run ops.calc_motif_densities on the cds fasta for every combination
    of three distinct codons, and write the results to a csv ordered by
    gc value and then by motif.

    Args:
        cds_fasta (str): path to cds fasta
        output_file (str): path to output file
    """

    nts = ["A", "C", "G", "T"]

    # all 64 codons, then every sorted combination of three of them
    codon_list = sorted("".join(codon) for codon in it.product(nts, repeat=3))
    combinations = list(map(sorted, it.combinations(codon_list, 3)))

    # NOTE: this directory is not removed at the end of the function
    temp_dir = "temp_motif_densities"
    gen.create_output_directories(temp_dir)

    result_files = simoc.run_simulation_function(combinations,
                                                 [cds_fasta, temp_dir],
                                                 ops.calc_motif_densities,
                                                 sim_run=False)

    # densities[gc][motif] = density, with the values kept as read
    densities = {}
    for result_file in result_files:
        # the motif is the file name up to its first "."
        motif = result_file.rsplit("/", 1)[-1].partition(".")[0]
        first_row = gen.read_many_fields(result_file, ",")[0]
        gc, density = first_row[0], first_row[1]
        densities.setdefault(gc, {})[motif] = density

    with open(output_file, "w") as outfile:
        outfile.write("id,motif,gc,density\n")
        ordered_keys = (
            (gc, motif)
            for gc in sorted(densities)
            for motif in sorted(densities[gc])
        )
        for row_id, (gc, motif) in enumerate(ordered_keys, start=1):
            outfile.write("{0},{1},{2},{3}\n".format(
                row_id, motif, gc, densities[gc][motif]))
コード例 #5
0
def motif_codon_densities(motif_file, codon_combinations_file, motif_controls_directory, required_simulations, output_file):
    """
    Run calculate_motif_densities on the real motif file and on the control
    motif files for each codon set, then write the real density of each
    codon set together with its difference from the mean control density,
    divided by that mean.

    Args:
        motif_file (str): path to file containing the real motifs
        codon_combinations_file (str): path to tab separated file of
            codon sets
        motif_controls_directory (str): path to directory holding the
            control motif files
        required_simulations (int): maximum number of control files to use
        output_file (str): path to output file
    """

    # take at most required_simulations control files, in listing order
    control_names = os.listdir(motif_controls_directory)[:required_simulations]

    # map each id ("real" or the control index) to its file path
    filelist = {"real": motif_file}
    filelist.update(
        (index, "{0}/{1}".format(motif_controls_directory, name))
        for index, name in enumerate(control_names)
    )
    file_ids = list(filelist)

    codon_sets = gen.read_many_fields(codon_combinations_file, "\t")

    temp_dir = "temp_motif_dir"
    gen.create_output_directories(temp_dir)
    result_files = simopc.run_simulation_function(file_ids, [filelist, codon_sets, temp_dir], calculate_motif_densities, sim_run = False)

    real_densities = {}
    sim_densities = collections.defaultdict(list)

    for result_file in result_files:
        rows = gen.read_many_fields(result_file, "\t")
        # a result file counts as the real one if its path contains "real"
        if "real" in result_file:
            real_densities.update((row[0], float(row[1])) for row in rows)
        else:
            for row in rows:
                sim_densities[row[0]].append(float(row[1]))

    with open(output_file, "w") as outfile:
        outfile.write("codons,gc_content,purine_content,density,nd\n")
        for codon_set in sorted(real_densities):
            real_density = real_densities[codon_set]
            sim_mean = np.mean(sim_densities[codon_set])
            # difference from the control mean, divided by that mean
            nd = np.divide(real_density - sim_mean, sim_mean)
            codons = codon_set.split("_")
            fields = [
                codon_set,
                seqo.calc_gc_seqs_combined(codons),
                sequo.calc_purine_content(codon_set.split("_")),
                real_density,
                nd,
            ]
            outfile.write("{0}\n".format(",".join(gen.stringify(fields))))

    gen.remove_directory(temp_dir)
コード例 #6
0
        for i, iteration in enumerate(iterations):
            print("{0}/{1}".format(i + 1, len(iterations)))

            if iteration != "real":
                new_seqs = []
                for id in seq_list:
                    new_seqs.append(randomise_seq(seq_list[id]))
            else:
                new_seqs = [seq_list[i] for i in seq_list]

            iteration_densities = {}
            for codon_set in codon_sets:
                iteration_densities["_".join(
                    codon_set)] = seqo.calc_motif_density(new_seqs, codon_set)

            output_file = "{0}/{1}.csv".format(output_directory, iteration)
            with open(output_file, "w") as outfile:
                for codon_set in codon_sets:
                    outfile.write("{0},{1}\n".format(
                        "_".join(codon_set),
                        iteration_densities["_".join(codon_set)]))
    return outputs


# the "real" entry first, followed by the iteration numbers 0-999
iterations = ["real"]
iterations.extend(range(1000))

simulation_args = [seq_list, codon_sets, output_directory]
soc.run_simulation_function(iterations, simulation_args, run_simulations,
                            sim_run=False, workers=50)