Example #1
0
def get_unique_rel_pos(unique_ptcs, disease_snps_relative_exon_positions, kgenomes_ptcs_file, kgenomes_ptcs_exon_positions, unique_ptcs_rel_pos_file, kgenomes_ptcs_rel_pos_file):
    '''
    Fill in the relative exon position (column 12) of two sets of ptcs,
    matching each ptc to its relative position through the position held
    in column 8.

    Args:
        unique_ptcs (str): path to the tab separated ptcs to annotate
        disease_snps_relative_exon_positions (str): path to the tab separated
            file giving the relative position of each unique ptc position
        kgenomes_ptcs_file (str): path to the tab separated kgenomes ptcs to annotate
        kgenomes_ptcs_exon_positions (str): path to the tab separated file giving
            the relative position of each kgenomes ptc position (first line is skipped)
        unique_ptcs_rel_pos_file (str): path to the output file for the unique ptcs
        kgenomes_ptcs_rel_pos_file (str): path to the output file for the kgenomes ptcs
    '''
    # position -> relative position for the disease snps (every line is used)
    disease_lookup = collections.defaultdict(lambda: collections.defaultdict())
    for entry in gen.read_many_fields(disease_snps_relative_exon_positions, "\t"):
        disease_lookup[int(entry[7])] = int(entry[11])

    unique_entries = gen.read_many_fields(unique_ptcs, "\t")
    with open(unique_ptcs_rel_pos_file, "w") as outfile:
        for entry in unique_entries:
            # a position missing from the lookup yields an empty defaultdict
            # instead of raising
            entry[11] = disease_lookup[int(entry[7])]
            outfile.write("\t".join(gen.stringify(entry)) + "\n")

    # position -> relative position for the kgenomes ptcs (first line skipped)
    kgenomes_lookup = collections.defaultdict(lambda: collections.defaultdict())
    kgenomes_positions = gen.read_many_fields(kgenomes_ptcs_exon_positions, "\t")
    for entry in kgenomes_positions[1:]:
        kgenomes_lookup[int(entry[7])] = int(entry[11])

    kgenomes_entries = gen.read_many_fields(kgenomes_ptcs_file, "\t")
    with open(kgenomes_ptcs_rel_pos_file, "w") as outfile:
        for entry in kgenomes_entries:
            entry[11] = kgenomes_lookup[int(entry[7])]
            outfile.write("\t".join(gen.stringify(entry)) + "\n")
Example #2
0
def get_introns(exon_bed, output_file):
    '''
    Write the introns found between consecutively numbered exons of each
    transcript in a tab separated exon file.

    Args:
        exon_bed (str): path to the tab separated file of exon entries
        output_file (str): path to the output file; one line per intron with
            chromosome, start, stop, "<transcript>.<exon>-<next exon>", "." and strand
    '''

    # group the raw entries by transcript id and exon number
    entries_by_transcript = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict()))
    for entry in gen.read_many_fields(exon_bed, "\t"):
        parsed = Define_Exon(entry)
        # stop codons are filed under a number no real exon is expected to use
        number = 999999 if parsed.type == "stop_codon" else parsed.exon_no
        entries_by_transcript[parsed.transcript_id][number] = entry

    with open(output_file, "w") as outfile:
        for transcript_exons in entries_by_transcript.values():
            for number in sorted(transcript_exons):
                current = Define_Exon(transcript_exons[number])
                following_no = current.exon_no + 1
                # nothing to write unless the next exon number is present
                if following_no not in transcript_exons:
                    continue
                following = Define_Exon(transcript_exons[following_no])

                # on the minus strand the intron runs from the end of the
                # next exon to the start of the current one
                if current.strand == "-":
                    intron_start, intron_stop = following.stop, current.start
                else:
                    intron_start, intron_stop = current.stop, following.start

                intron_name = "{0}.{1}-{2}".format(current.transcript_id, current.exon_no, following.exon_no)
                fields = gen.stringify([current.chr, intron_start, intron_stop, intron_name, ".", current.strand])
                outfile.write("\t".join(fields) + "\n")
Example #3
0
def get_genome_bed_from_fasta_index(features_bed, fasta_index, output_file):
    """
    Given a list of features, get the genome coordinates as a bed file.

    Every fasta index entry whose name (column 1) matches a chromosome used in
    the features file produces two output lines spanning 0 to the entry
    length (column 2): one for the + strand and one for the - strand.

    Args:
        features_bed (str): path to bed file containing features
        fasta_index (str): path to fasta index file
        output_file (str): path to output file
    """

    # get all the chromosomes required (first column of the features file)
    awk_output = gen.run_process(["awk", "{print $1}"],
                                 file_for_input=features_bed)
    chromosomes = set()
    for name in awk_output.split("\n"):
        if not name:
            continue
        # remove only a leading "chr": str.strip("chr") treats its argument as
        # a set of characters and would also eat any leading or trailing
        # 'c', 'h' or 'r' belonging to the name itself
        chromosomes.add(name[3:] if name.startswith("chr") else name)

    # get index lines
    index = gen.read_many_fields(fasta_index, "\t")
    with open(output_file, "w") as outfile:
        for entry in index:
            if entry[0] not in chromosomes:
                continue
            start = 0
            length = int(entry[1])
            out_info = [entry[0], start, start + length, ".", "."]
            # the same coordinates are written once per strand
            outfile.write("{0}\t+\n{0}\t-\n".format("\t".join(
                gen.stringify(out_info))))
Example #4
0
def entries_to_bed(source_path, output_file, exclude_XY=None, hg38=None, NONCODE=None):
    """
    Generate a file containing the exon info from a bed file

    Each input line is expanded into one output line per feature, using the
    comma separated feature starts and sizes of the entry. Entries whose
    number of starts or sizes does not match the feature count are reported
    and skipped.

    Args:
        source_path (str): the source path for the origin .bed file
        output_file (str): output .bed file to contain the exon info
        exclude_XY (bool): if true, exclude cases on X and Y chr
        hg38 (bool): if true, using hg38; a leading "chr" is removed from
            the chromosome name
        NONCODE (bool): if true, the entry name is cut at the first "." and
            only entries whose name contains "NONHSAT" are written
    """
    # read the file in
    lines = gen.read_many_fields(source_path, "\t")

    # both spellings are listed so the exclusion still works once hg38 has
    # removed the "chr" prefix
    sex_chromosomes = {"chrX", "chrY", "X", "Y"}

    with open(output_file, "w") as outfile:
        for line in lines:
            features = gen.Bed_Feature(line)
            # the lists end with a trailing comma, hence the dropped last item
            starts = [int(start) for start in features.featureStarts.split(',')[:-1]]
            sizes = [int(size) for size in features.featureSizes.split(',')[:-1]]
            expected = features.featureCount
            if len(sizes) != expected or len(starts) != expected:
                print('Error in the number of features')
                continue

            # per-entry adjustments, done once rather than once per feature
            if hg38 and features.chr.startswith("chr"):
                # remove the literal prefix; str.strip("chr") strips a
                # character set from both ends instead
                features.chr = features.chr[3:]
            if NONCODE:
                features.name = features.name.split(".")[0]
                # only add if a transcript, used for NONCODE sequences
                if "NONHSAT" not in features.name:
                    continue
            if exclude_XY and features.chr in sex_chromosomes:
                continue

            # add the info if exists
            extra_info = features.info if hasattr(features, "info") else []

            for i, (offset, size) in enumerate(zip(starts, sizes)):
                # feature starts are relative to the start of the entry
                start_pos = features.start + offset
                end_pos = start_pos + size
                # create a list of the new bed line
                output_list = [features.chr, start_pos, end_pos, "{0}.{1}".format(features.name, i+1), ".", features.strand, features.thickStart, features.thickEnd, ".", features.featureCount, ".", "."]
                output_list.extend(extra_info)
                outfile.write("{0}\n".format("\t".join(gen.stringify(output_list))))
def intron_hexamer_test(input_fasta, motif_file, output_directory, output_file, required_simulations = None, families_file = None):
    """
    Generate random hexamers from introns and calculate purine content

    The purine and nucleotide content of the real motifs is written to the
    output file, followed by the content of each set of random motifs found
    in the "random_hexamers" directory under output_directory. That
    directory is removed once the output has been written.

    Args:
        input_fasta (str): path to intron fasta
        motif_file (str): path to file containing real motifs
        output_directory (str): path to output directory
        output_file (str): path to output file
        required_simulations (int): if set, the number of simulations to run
        families_file (str): if set, path to families file
    """

    hexamers_dir = "{0}/random_hexamers".format(output_directory)
    gen.create_output_directories(hexamers_dir)
    # get the motifs
    motifs = sequo.read_motifs(motif_file)

    # if there are not enough simulations, generate them; when
    # required_simulations is not set nothing is generated (comparing an int
    # with None would raise a TypeError)
    existing_simulations = len(os.listdir(hexamers_dir))
    if required_simulations and existing_simulations < required_simulations:
        required = list(range(required_simulations - existing_simulations))
        names, seqs = gen.read_fasta(input_fasta)
        # group the sequences by the part of the name before the first "."
        seqs_list = collections.defaultdict(list)
        for name, seq in zip(names, seqs):
            seqs_list[name.split(".")[0]].append(seq)

        if families_file:
            seqs_list = sequo.pick_random_family_member(families_file, seqs_list)

        all_seqs = []
        for seq_id in seqs_list:
            all_seqs.extend(seqs_list[seq_id])
        # join on "X" (presumably so that no motif can span two sequences)
        full_seq = "X".join(all_seqs)
        simopc.run_simulation_function(required, [full_seq, motifs, hexamers_dir], sequo.locate_random_motifs, sim_run = False)

    # calculate the purine contents
    real_purine_content = sequo.calc_purine_content(motifs)
    real_nt_content = sequo.calc_nucleotide_content(motifs)

    test_purine_content = []
    test_nt_content = []
    for filename in os.listdir(hexamers_dir):
        filepath = "{0}/{1}".format(hexamers_dir, filename)
        test_motifs = sequo.read_motifs(filepath)
        test_purine_content.append(sequo.calc_purine_content(test_motifs))
        test_nt_content.append(sequo.calc_nucleotide_content(test_motifs))

    with open(output_file, "w") as outfile:
        outfile.write("id,purine_content,a_content,c_content,g_content,t_content\n")
        # nucleotide contents are written in sorted key order
        real_nt_values = gen.stringify([real_nt_content[nt] for nt in sorted(real_nt_content)])
        outfile.write("real,{0},{1}\n".format(real_purine_content, ",".join(real_nt_values)))
        for i, (purine_content, nt_content) in enumerate(zip(test_purine_content, test_nt_content)):
            nt_values = gen.stringify([nt_content[nt] for nt in sorted(nt_content)])
            outfile.write("{0},{1},{2}\n".format(i+1, purine_content, ",".join(nt_values)))

    # remove the output directory
    gen.remove_directory(hexamers_dir)
Example #6
0
def write_features_to_bed(feature_list, output_file):
    """
    Write every feature held in a dictionary to a tab separated bed file.

    The fourth and fifth fields of each feature are merged into a single
    "<field 4>.<field 5>" name and the fifth field is set to ".". The
    features are modified in place, so the caller's lists are changed too.

    Args:
        feature_list (dict): dictionary containing features with transcripts as keys
        output_file (str): path to output file
    """

    with open(output_file, "w") as outfile:
        for features in feature_list.values():
            for entry in features:
                # fold the fifth field into the name, then blank it
                entry[3], entry[4] = "{0}.{1}".format(entry[3], entry[4]), "."
                outfile.write("\t".join(gen.stringify(entry)) + "\n")
Example #7
0
def get_filtered_exons():
    '''
    Write two files for the clean run:
    a ptc file holding only the filtered ptcs whose exon is not in the
    disease intersect file, and a list of the large effect exons that are
    in the disease intersect file.
    '''
    output_prefix = "results/clean_run_2/clean_run"
    disease_prefix = "results/clinvar"
    ptc_file = "{0}_ptc_file.txt".format(output_prefix)
    final_output_file = "{0}__analysis_final_output.txt".format(output_prefix)
    intersect_file = "{0}/ptc_intersect.bed".format(disease_prefix)

    # exon ids are the fourth field from the end of each intersect entry
    intersect_list = [
        entry[-4] for entry in gen.read_many_fields(intersect_file, "\t")
    ]

    # the filtered exons are read twice: once in full for the large effect
    # calculation and once to keep just the first field of each
    large_effects, _ = get_large_effect_overlaps(
        get_filtered_skipped_exons(final_output_file), 5, 0.025)
    filtered_list = [
        exon[0] for exon in get_filtered_skipped_exons(final_output_file)
    ]

    ptc_list = []
    le_overlap = []
    for ptc in gen.read_many_fields(ptc_file, "\t"):
        exon = ptc[3]
        in_intersect = exon in intersect_list
        if not in_intersect and exon in filtered_list:
            ptc_list.append(ptc)
        if in_intersect and exon in large_effects:
            le_overlap.append(ptc)

    large_effect_disease_overlap = "{0}_large_effect_disease_overlap.txt".format(
        output_prefix)
    with open(large_effect_disease_overlap, "w") as outfile:
        for ptc in le_overlap:
            # only the exon id is written for the overlaps
            outfile.write('{0}\n'.format(ptc[3]))

    filtered_no_overlaps = "{0}_ptc_file_filtered_no_disease.txt".format(
        output_prefix)
    with open(filtered_no_overlaps, "w") as outfile:
        for ptc in ptc_list:
            outfile.write("\t".join(gen.stringify(ptc)) + "\n")
Example #8
0
def get_relative_exon_positions(relative_positions_file, large_effect_exons,
                                ptc_file,
                                output_file="results/clean_run_2/clean_run_PTC_relative_exon_positions.bed"):
    '''
    Copy a ptc file, replacing the relative exon position (column 12) of
    each ptc with the one listed for the same id (column 9) in the relative
    positions file.

    The first line of the ptc file is treated as a header and copied
    unchanged. A ptc whose id is not in the relative positions file raises
    a KeyError.

    Args:
        relative_positions_file (str): path to the tab separated file
            holding the id and relative position of each snp
        large_effect_exons: not used by this function
        ptc_file (str): path to the tab separated ptc file, with a header line
        output_file (str): path to the output file; defaults to the
            location this function has always written to
    '''

    snp_pos = {}
    for snp in gen.read_many_fields(relative_positions_file, "\t"):
        snp_pos[snp[8]] = int(snp[11])

    # read the input fully before opening the output for writing
    ptcs = gen.read_many_fields(ptc_file, "\t")

    with open(output_file, "w") as outfile:
        # an empty input gives an empty output rather than an IndexError
        if not ptcs:
            return

        # the header needs its own line terminator, otherwise the first
        # ptc is appended to the header line
        outfile.write('{0}\n'.format("\t".join(ptcs[0])))

        for ptc in ptcs[1:]:
            ptc[11] = snp_pos[ptc[8]]
            outfile.write('{0}\n'.format("\t".join(gen.stringify(ptc))))
Example #9
0
def get_introns_from_bed(input_bed, output_file):
    '''
    Write the introns found between consecutively numbered exons of each
    transcript in a bed file.

    The name field of each exon (column 4) is read as
    "<transcript>.<exon number>", with anything from a "(" onwards in the
    exon number ignored.

    Args:
        input_bed (str): path to the tab separated exon bed file
        output_file (str): path to the output file; one line per intron with
            chromosome, start, end, "<transcript>.<exon>-<next exon>", "." and strand
    '''

    # coordinates per transcript and exon number, plus the strand and
    # chromosome last seen for each transcript
    coordinates = collections.defaultdict(
        lambda: collections.defaultdict(lambda: []))
    strand_of = {}
    chromosome_of = {}

    for entry in gen.read_many_fields(input_bed, "\t"):
        name_parts = entry[3].split(".")
        transcript = name_parts[0]
        exon_number = int(name_parts[1].split("(")[0])
        exon_start = int(entry[1])
        exon_end = int(entry[2])
        strand_of[transcript] = entry[5]
        chromosome_of[transcript] = entry[0]
        coordinates[transcript][exon_number] = [exon_start, exon_end]

    with open(output_file, "w") as outfile:
        for transcript, transcript_exons in coordinates.items():
            strand = strand_of[transcript]
            for exon_number in sorted(transcript_exons):
                following = exon_number + 1
                # nothing to write unless the next exon exists
                if following not in transcript_exons:
                    continue
                # any strand other than "+" is handled as the reverse strand
                if strand == "+":
                    intron_start = transcript_exons[exon_number][1]
                    intron_end = transcript_exons[following][0]
                else:
                    intron_start = transcript_exons[following][1]
                    intron_end = transcript_exons[exon_number][0]
                intron_name = "{0}.{1}-{2}".format(transcript, exon_number, following)
                fields = gen.stringify([
                    chromosome_of[transcript], intron_start, intron_end,
                    intron_name, ".", strand
                ])
                outfile.write("\t".join(fields) + "\n")
Example #10
0
def ese_hit_simulation(rel_pos_file, coding_exons_fasta, simulations, output_file, ese_file, window_start, window_end, exclude_cpg, clinvar=None):
    '''
    Count the ese hits of the ptcs lying within a window of their exon and
    compare with the hit counts returned by a set of simulations.

    The output file has a header, a "real" row and one row per simulation
    (numbered from 1) holding the two values returned for that simulation.
    exclude_cpg and clinvar are accepted but not used here.
    '''

    # inputs: ptc relative positions, eses and coding exons
    relative_positions_list = get_relative_position_list(rel_pos_file)
    ese_list = get_eses_from_file(ese_file)
    coding_exons = get_coding_exons(coding_exons_fasta)

    # restrict to the long exons (selected using twice the window end), then
    # to the ptcs that fall inside the window
    long_exons = get_long_exons(relative_positions_list, coding_exons, window_end*2)
    window_ptcs = get_ptcs_in_window(long_exons, window_start, window_end, coding_exons)
    real_ese_hits = get_ese_hits(window_ptcs, coding_exons, ese_list)

    # run the simulations in parallel and merge the dictionary returned by
    # each process
    simulation_list = list(range(simulations))
    simulation_args = ["foo", simulations, window_ptcs, coding_exons_fasta, ese_list, window_start, window_end]
    processes = gen.run_in_parallel(simulation_list, simulation_args, simulate_ese_hits)

    simulation_outputs = {}
    for process in processes:
        simulation_outputs.update(process.get())

    with open(output_file, "w") as outfile:
        outfile.write("simulation,ese_hit_count,cant_count\n")
        outfile.write("real,{0},0\n".format(real_ese_hits))
        for simulation in sorted(simulation_outputs):
            hit_count, cant_count = simulation_outputs[simulation][0], simulation_outputs[simulation][1]
            row = gen.stringify([simulation+1, hit_count, cant_count])
            outfile.write(",".join(row) + "\n")
def motif_codon_densities(motif_file, codon_combinations_file, motif_controls_directory, required_simulations, output_file):
    """
    Calculate motif densities for the real motifs and for control motif
    files, and write the real density of each codon set together with its
    difference from the mean control density, divided by that mean.

    Args:
        motif_file (str): path to file containing the real motifs
        codon_combinations_file (str): path to tab separated file of codon sets
        motif_controls_directory (str): path to directory of control motif files
        required_simulations (int): the number of control files to use
        output_file (str): path to output file
    """

    # the real motifs are keyed "real", the controls by their list index
    control_files = os.listdir(motif_controls_directory)[:required_simulations]
    filelist = {"real": motif_file}
    for index, name in enumerate(control_files):
        filelist[index] = "{0}/{1}".format(motif_controls_directory, name)
    file_ids = list(filelist)

    codon_sets = gen.read_many_fields(codon_combinations_file, "\t")

    temp_dir = "temp_motif_dir"
    gen.create_output_directories(temp_dir)
    result_files = simopc.run_simulation_function(file_ids, [filelist, codon_sets, temp_dir], calculate_motif_densities, sim_run = False)

    real_density_list = {}
    sim_density_list = collections.defaultdict(lambda: [])

    # a result file with "real" in its path holds the real densities, any
    # other file holds the densities of one control
    for result_file in result_files:
        rows = gen.read_many_fields(result_file, "\t")
        if "real" in result_file:
            for row in rows:
                real_density_list[row[0]] = float(row[1])
        else:
            for row in rows:
                sim_density_list[row[0]].append(float(row[1]))

    with open(output_file, "w") as outfile:
        outfile.write("codons,gc_content,purine_content,density,nd\n")
        for codon_set in sorted(real_density_list):
            real_density = real_density_list[codon_set]
            sim_mean = np.mean(sim_density_list[codon_set])
            nd = np.divide(real_density - sim_mean, sim_mean)
            codons = codon_set.split("_")
            row = [codon_set, seqo.calc_gc_seqs_combined(codons), sequo.calc_purine_content(codon_set.split("_")), real_density, nd]
            outfile.write(",".join(gen.stringify(row)) + "\n")

    gen.remove_directory(temp_dir)
Example #12
0
def ptc_location_simulation(rel_pos_file, coding_exons_fasta, simulations, output_file, ese_overlap_output_file, ese_file=None, only_ese=None, exclude_cpg=None):
    '''
    Simulation mutation locations of PTCs.
    Take the exon in which each PTC is located and randomly pick a site with the
    same nt composition.
    Locations of these matched mutations are used for null.

    Args:
        rel_pos_file (str): path to file of ptc relative positions
        coding_exons_fasta (str): path to coding exons fasta
        simulations (int): the number of simulations to run
        output_file (str): path to output file for the positions,
            not written if only_ese is set
        ese_overlap_output_file (str): path to output file for the ese overlaps
        ese_file (str): path to file containing eses
        only_ese (bool): if true, only write the ese overlap file
        exclude_cpg (bool): passed through to the simulation function
    '''

    # get a list of the relative positions of the ptcs
    relative_positions_list = get_relative_position_list(rel_pos_file)
    # get a list of eses
    ese_list = get_eses_from_file(ese_file)
    # get the coding exons
    coding_exons = get_coding_exons(coding_exons_fasta)

    # get the positions of the ptcs
    real_positions = get_ptc_positions(relative_positions_list, coding_exons)
    # get the number of ptcs with ese overlaps
    real_ese_overlap = get_ptc_ese_overlap(relative_positions_list, coding_exons, ese_list)

    # now do the simulations, numbered from 1
    simulant_list = list(range(1, simulations+1))
    processes = gen.run_in_parallel(simulant_list, ["foo", simulations, relative_positions_list, coding_exons_fasta, ese_list, exclude_cpg], simulate_mutation_locations)

    # each process returns the positions and the ese overlaps of its simulants
    position_list = {}
    ese_overlap_list = {}
    for process in processes:
        result = process.get()
        position_list.update(result[0])
        ese_overlap_list.update(result[1])

    # ignore writing this to file if we just want the ese overlap
    if not only_ese:
        with open(output_file, "w") as outfile:
            outfile.write('simulation,0.2,3.69,70+\n')
            outfile.write('real,{0}\n'.format(",".join(gen.stringify(real_positions))))
            for simulant in position_list:
                outfile.write("{0},{1}\n".format(simulant, ",".join(gen.stringify(position_list[simulant]))))

    with open(ese_overlap_output_file, "w") as outfile:
        outfile.write('simulation,0.2,3.69,70+\n')
        outfile.write('real,{0}\n'.format(",".join(gen.stringify(real_ese_overlap))))
        for simulant in ese_overlap_list:
            outfile.write("{0},{1}\n".format(simulant, ",".join(gen.stringify(ese_overlap_list[simulant]))))


# NOTE: older implementation, kept for reference only. Previously just its
# `def` line was commented out, which left the body below attached to the
# function above, where it raised a NameError (possible_positions_dir and the
# other names it uses are not parameters of that function).
#
# def ptc_location_simulation(snp_file, full_bed, cds_fasta, possible_positions_dir, output_directory, required_simulations, coding_exons_file):
#     '''
#     Simulate the snp location.
#     For each snp, pick another site that has the same reference allele and that would generate a ptc with the mutated allele.
#     Repeat n times.
#     '''
#
#     # return all the possible_locations
#     possible_locations = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(lambda: [])))
#     nts = ["A", "C", "G", "T"]
#     for nt in nts:
#         location_file = "{0}/possible_ptc_locations_{1}.fasta".format(possible_positions_dir, nt)
#         entry_names, entry_locations = gen.read_fasta(location_file)
#         for i, name in enumerate(entry_names):
#             exon = name.split(':')[0]
#             aa = name.split(':')[1][0]
#             ma = name.split(':')[1][-1]
#             possible_locations[exon][aa][ma].append(entry_locations[i])
#
#     # get a list of exons and their lengths
#     exons = gen.read_many_fields(coding_exons_file, "\t")
#     exon_list = {}
#     for exon in exons:
#         exon_list[exon[3]] = int(exon[2]) - int(exon[1])
#
#     # create a list of required simulations
#     simulations = list(range(1, int(required_simulations) + 1))
#     run_location_simulations(simulations, snp_file, possible_locations, exon_list, output_directory)
Example #13
0
def get_output(entry):
    """
    Flatten the values of a dictionary of lists, in key iteration order,
    and return them passed through gen.stringify.
    """
    flattened = [value for key in entry for value in entry[key]]
    return gen.stringify(flattened)
Example #14
0
# Script section: calculate the motif density of each single and multi
# sequence and write both sets of densities to a csv file.

# read the cds sequences and index them by name
cds_names, cds_seqs = gen.read_fasta(cds_fasta)
cds_list = {name: cds_seqs[i] for i, name in enumerate(cds_names)}

# read the multi sequences, keeping those longer than 50 characters whose
# name is also present in cds_list; each sequence is held in a one item list
multi_cds_names, multi_cds_seqs = gen.read_fasta(multi_utr)
multi_cds_list = {
    name: [multi_cds_seqs[i]]
    for i, name in enumerate(multi_cds_names)
    if len(multi_cds_seqs[i]) > 50 and name in cds_list
}
# presumably keeps one randomly chosen member per family - confirm
# against sequo.pick_random_family_member
multi_cds_list = sequo.pick_random_family_member(families_file, multi_cds_list)
#
# 6086
multi_densities, multi_nds = calc_values(multi_cds_list)

motifs = sequo.read_motifs(motif_file)
# the first (only) sequence of every entry
multi_seqs = [multi_cds_list[i][0] for i in multi_cds_list]
# NOTE(review): single_cds_list is not defined in this section; it must be
# defined earlier in the file
single_seqs = [single_cds_list[i][0] for i in single_cds_list]
# the density is calculated separately for each sequence
single_ese_desities = [
    seqo.calc_motif_density([i], motifs) for i in single_seqs
]
multi_ese_densities = [
    seqo.calc_motif_density([i], motifs) for i in multi_seqs
]

# one row per group: the group label followed by the density of each sequence
output_file = "clean_run/utr_ese_densities.csv"
with open(output_file, "w") as outfile:
    outfile.write("single,{0}\n".format(",".join(
        gen.stringify(single_ese_desities))))
    outfile.write("multi,{0}\n".format(",".join(
        gen.stringify(multi_ese_densities))))