def get_MSA_concat_list(input_file, output_file, min_species):
    '''
    Given the sequences and coordinates from a bunch of raw MSA objects (as returned by get_MSA_gene_list),
    filter them to only keep ones where you have a contiguous CDS of the same length as in human.
    Write an output file where each row corresponds to one CDS region and has the orthologous sequence from each of the species.
    '''
    with open(input_file) as file, open(output_file, "w") as outfile:
        current_string = []
        lines = line_count(input_file)
        strand_mapping = {
            "human_sense": ("1", "-1"),
            "human_antisense": ("-1", "1")
        }
        print("Total: {0}.".format(lines))
        for pos, line in enumerate(file):
            if pos % 10000 == 0:
                print(pos)
            #a line starting with "%" marks the beginning of a new human CDS record
            if line[0] == "%":
                line = line.rstrip("\n")
                global_name = line.split("|")
                strand = global_name[6]
                #the alignment block will always be fetched from the sense strand (with regard to the query species),
                #and the other species from whatever aligns to the reference strand in the query species,
                #so if the gene is antisense, everything has to be flipped
                if strand == "-":
                    mapping_tuple = strand_mapping["human_antisense"]
                else:
                    mapping_tuple = strand_mapping["human_sense"]
            #if it begins with neither a percentage sign nor an asterisk,
            #that means it must be a line of sequence
            #put all those in a list for each human CDS
            elif line[0] != "*":
                current_string.append(line)
            #you've reached the end of a CDS block
            else:
                #parse the stuff you've read in into a dictionary with the species as keys
                current_dict = get_species_dict(current_string)
                #loop over the different species
                for species in list(current_dict.keys()):
                    #check that the aligning region forms a contiguous block in this species
                    current_species_dict = filter_species(
                        current_dict[species], mapping_tuple)
                    if current_species_dict:
                        current_dict[species] = current_species_dict
                    else:
                        del current_dict[species]
                outstring = concat_sequences(current_dict, min_species,
                                             global_name)
                if outstring:
                    outfile.write(outstring)
                current_string = []
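# A minimal usage sketch (file names hypothetical; get_species_dict,
# filter_species, concat_sequences and line_count are assumed to come from
# the same module as the function above). The input file is expected to hold
# %-prefixed, pipe-delimited header lines (field 6 carrying the strand),
# sequence lines, and a "*" line closing each CDS block.
get_MSA_concat_list("raw_MSA_blocks.txt", "concat_CDS_regions.txt",
                    min_species=10)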
def get_reads_per_pos(reads_file, transcript_bed):
    """
    Given a BED file of reads and a BED file of transcript coordinates,
    make a dictionary with transcript IDs as keys, and the number of reads at each
    position (along with the absolute coordinates of the transcript) as values.
    :param reads_file: BED file with read coordinates
    :param transcript_bed: BED file with transcript coordinates
    :return: dictionary with numbers of reads per position
    """
    # intersect the transcripts and the reads, so you'd have an output file where
    # the transcript coordinates are followed by the overlapping read
    intermediate_file = "{0}_{1}read_per_pos_intermediate.bed".format(
        reads_file[:-4],
        transcript_bed.split("/")[-1][:-4])
    co.intersect_bed(transcript_bed,
                     reads_file,
                     force_strand=True,
                     write_both=True,
                     no_dups=False,
                     write_zero=False,
                     output_file=intermediate_file)
    reads_per_pos = {}
    total = hk.line_count(intermediate_file)
    print("Calculating the number of reads per position in each transcript...")
    with open(intermediate_file, newline="") as file:
        file_reader = csv.reader(file, delimiter="\t")
        for pos, line in enumerate(file_reader):
            if pos % 100000 == 0:
                print("{0}/{1}".format(pos, total))
            # prefix the chromosome and the strand to the transcript name because
            # you'll need them later
            trans_name = line[3]
            trans_name = "{0}.{1}.{2}".format(line[0], line[5], trans_name)
            reads_per_pos = hk.add_key(trans_name, {"reads": {}},
                                       reads_per_pos)
            strand = line[5]
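            # the intersect output lists the read's fields after the transcript's
            # six, so line[7]/line[8] are the read's start/end; keep the 0-based
            # coordinate of the read's 3' end: the last position on "+", the first on "-"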
            if strand == "+":
                position = int(line[8]) - 1
            else:
                position = int(line[7])
            reads_per_pos[trans_name]["reads"] = hk.add_key(
                position, 0, reads_per_pos[trans_name]["reads"])
            reads_per_pos[trans_name]["reads"][
                position] = reads_per_pos[trans_name]["reads"][position] + 1
            reads_per_pos[trans_name] = hk.add_key(
                "coords", (int(line[1]), int(line[2])),
                reads_per_pos[trans_name])
    hk.remove_file(intermediate_file)
    return reads_per_pos
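# A sketch of how the returned structure might be consumed (file names hypothetical):
reads_per_pos = get_reads_per_pos("NET_seq_3p_ends.bed", "transcripts.bed")
for trans_name, data in reads_per_pos.items():
    start, end = data["coords"]  # absolute transcript coordinates
    for position, count in data["reads"].items():
        pass  # count = number of reads whose 3' end falls at this position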
def extract_true_and_control_string(fasta_name, true_indices, control_indices):
    """
    Given a FASTA file and two tuples of indices, make two NumPy arrays with the sequence
    between the indices in each of the FASTA entries.
    :param fasta_name: input FASTA file
    :param true_indices: tuple indicating the start and end (0-based) of the first segment
    :param control_indices: tuple giving the start and end of the second segment
    :return: two numpy arrays with rows corresponding to FASTA entries and columns to positions
    in the segment.
    """
    expected_length_true = true_indices[1] - true_indices[0]
    expected_length_control = control_indices[1] - control_indices[0]
    # Get the number of lines in FASTA and divide by 2 to get the number of sequences
    fasta_length = hk.line_count(fasta_name) / 2
    # Pre-allocate two arrays, one for the true sequence and one for the control
    occ_mat_true = np.empty(
        (int(fasta_length), int(expected_length_true)), dtype="str")
    occ_mat_control = np.empty(
        (int(fasta_length), int(expected_length_control)), dtype="str")
    pos_in_fasta = 0
    error_counter = 0
    with open(fasta_name) as fasta:
        for line in fasta:
            if line[0] != ">":
                true_string = line[true_indices[0]:true_indices[1]]
                control_string = line[control_indices[0]:control_indices[1]]
                if (len(true_string) != expected_length_true) or (
                        len(control_string) != expected_length_control):
                    error_counter = error_counter + 1
                # fill the two rows independently so that a truncated control
                # string can't cause an IndexError when the true string is longer
                for pos, base in enumerate(true_string):
                    occ_mat_true[pos_in_fasta, pos] = base
                for pos, base in enumerate(control_string):
                    occ_mat_control[pos_in_fasta, pos] = base
                pos_in_fasta = pos_in_fasta + 1
    print("Errors: {0}.".format(error_counter))
    return (occ_mat_true, occ_mat_control)
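# A hedged example (file name and indices hypothetical): compare per-position
# base frequencies between a segment of interest and a length-matched control.
true_mat, control_mat = extract_true_and_control_string(
    "sequences.fa", (0, 50), (50, 100))
# fraction of T at each position of the "true" segment
t_freq = (true_mat == "T").sum(axis=0) / true_mat.shape[0]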
def main():

    description = "Call peaks in a BED file of NET-seq reads."
    help_info = [
        "BED file (at least a BED6) with NET-seq reads. Should be single-nucleotide resolution (each BED region is the 3' end of a read.).",
        "Ensembl GTF file for the relevant species. Ensure that chromosome names are formatted the same way in both the GTF and the BED file with reads!",
        "BED file with the coordinates of the transcripts to analyze. Only the name field is read, hence the others can hold placeholders. The name field must contain transcript IDs from the GTF file.",
        "Name of the output file (BED file with peak coordinates).",
        "Alpha value for calling a position as having a significantly higher local read denisty than expected by chance. Default: 0.01.",
        "Merge distance: adjacent peaks will be merged if they are closer than this many nucleotides. Default: 21.",
        "Minimum reads per peak. Default: 10.",
        "The number of times the read position randomization should be performed for each transcript. Higher values make the significance calculation (marginally) more robust, however, they also make the programme very slow. Default: 5.",
        "Minimum length of a peak in nucleotides. Default: 5.",
        "Size of the sliding window to use when calculating the local read density. It may be sensible to set this to the same value as the merge distance. Should be an odd integer. Default: 21",
        "The analysis will be performed this many times, with the output files numbered. Useful for running many negative control simulations at once. Default: 1.",
        "Read positions will be shuffled within each transcript before analysis. This should disrupt any signal and should give a flat peak density profile.",
        "Instead of a sliding window, adjacent non-overlapping windows will be used when calculating the local read density.",
        "When calling peaks in a given exon/intron, do not include that exon/intron in the read position randomization.",
        "When --exclude_focal is set, count an exon and its upstream intron as a single unit (except for the first exon).",
        "Don't filter out likely PCR duplicates (peaks where more than 90%% of the reads come from a single nucleotide position).)"
    ]
    defaults = {4: 0.01, 5: 21, 6: 10, 7: 5, 8: 5, 9: 21, 10: 1}
    args = hk.parse_arguments(description, [
        "reads_file", "gtf", "trans_active_file", "output_file",
        "significance_threshold", "merge", "min_reads_per_peak", "iterations",
        "min_peak_length", "window_size", "runs", "neg_control", "no_slide",
        "exclude_focal", "with_ups_intron", "no_PCR_filter"
    ],
                              floats=[4],
                              ints=[5, 6, 7, 8, 9, 10],
                              flags=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                              detailed_help=help_info,
                              defaults=defaults)
    (reads_file, gtf, trans_active_file, output_file, significance_threshold,
     merge, min_reads_per_peak, iterations, min_peak_length, window_size,
     runs, neg_control, no_slide, exclude_focal, with_ups_intron,
     no_PCR_filter) = (args.reads_file, args.gtf, args.trans_active_file,
                       args.output_file, args.significance_threshold,
                       args.merge, args.min_reads_per_peak, args.iterations,
                       args.min_peak_length, args.window_size, args.runs,
                       args.neg_control, args.no_slide, args.exclude_focal,
                       args.with_ups_intron, args.no_PCR_filter)

    print("Merge distance: {0}".format(merge))
    print("Minimum number of reads per peak: {0}".format(min_reads_per_peak))
    print("Minimum peak length: {0}".format(min_peak_length))
    print("Window size: {0}".format(window_size))
    print("Significance level: {0}".format(significance_threshold))
    print("Randomization iterations to perform: {0}".format(iterations))
    print("Runs: {0}".format(runs))

    neg_str = ""
    if neg_control:
        neg_str = "_neg_control"

    slide_str = ""
    if no_slide:
        slide_str = "_no_slide"
    intron_str = ""
    if with_ups_intron:
        intron_str = "w_ups_intr"

    # 0. make a BED file with the coordinates of transcripts

    transcripts_file = "{0}_transcripts.bed".format(gtf[:-4])
    co.get_transcripts(gtf, transcripts_file, add_chr=True)
    exons = rw.read_gtf(gtf, "exon")

    # 1. intersect the two files, then loop over the result and make a
    # dictionary of reads per position for each transcript that has reads

    reads_per_pos = get_reads_per_pos(reads_file, transcripts_file)
    # only leave transcriptionally active genes (one isoform per gene)
    trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:]
    # pull out the column with transcript IDs
    trans_active_genes = [i[3] for i in trans_active_genes]
    reads_per_pos = {
        i: reads_per_pos[i]
        for i in reads_per_pos if i.split(".")[-1] in trans_active_genes
    }

    for sim in range(runs):

        print("**********{0}**********".format(sim))

        # 2. for each transcript, randomly reshuffle the reads and calculate the
        # nth percentile depending on what the significance threshold is
        # keep positions that are higher than that threshold and write to BED file

        raw_peak_bed = "{0}_{1}_raw_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format(
            reads_file[:-4],
            gtf.split("/")[-1][:-4], iterations, min_reads_per_peak,
            window_size, neg_str, intron_str, slide_str, sim)
        read_count_file = "{0}_{1}_read_counts{2}_{3}{4}{5}_{6}_sim.txt".format(
            reads_file[:-4],
            gtf.split("/")[-1][:-4], iterations, window_size, neg_str,
            intron_str, sim)
        new_reads_file = write_raw_peaks(reads_per_pos,
                                         raw_peak_bed,
                                         read_count_file,
                                         exons,
                                         iterations=iterations,
                                         min_read_count=min_reads_per_peak,
                                         window_size=window_size,
                                         neg_control=neg_control,
                                         no_slide=no_slide,
                                         exclude_focal=exclude_focal,
                                         with_ups_intron=with_ups_intron)
        if neg_control:
            reads_file = new_reads_file

        # 3. merge peaks

        merged_peak_bed = "{0}_{1}_merged_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format(
            reads_file[:-4],
            gtf.split("/")[-1][:-4], iterations, window_size, merge, neg_str,
            slide_str, intron_str, sim)
        co.merge_bed(raw_peak_bed, merged_peak_bed, merge)
        print("Before filtering, there are {0} peaks.".format(
            hk.line_count(merged_peak_bed)))

        # 4. filter out peaks that don't have enough reads or are too short.
        # Write final results to file and also write a stats file with the size,
        # read count and overlapping transcript of the peaks

        stats_file = "{0}_stats_{1}_sim.txt".format(output_file[:-4], sim)
        filter_peaks(merged_peak_bed,
                     reads_file,
                     read_count_file,
                     "{0}_{1}_sim.bed".format(output_file[:-4], sim),
                     min_reads_per_peak,
                     min_peak_length,
                     stats_file,
                     no_PCR_filter=no_PCR_filter)
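# Hypothetical invocation, assuming this script is saved as call_peaks.py and
# that hk.parse_arguments exposes the argument names above as CLI options
# (all paths are made up):
#   python call_peaks.py reads.bed annotation.gtf trans_active.bed peaks.bed
if __name__ == "__main__":
    main()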
def main():
    description = "Record splicing distance."
    args = hk.parse_arguments(description, [
        "input_file", "gtf", "output_folder", "trans_active_file",
        "window_size", "intron_window_size", "outsuffix", "leave_terminal"
    ],
                              ints=[4, 5],
                              flags=[7])
    (input_file, gtf, output_folder, trans_active_file, window_size,
     intron_window_size, outsuffix, leave_terminal) = (
         args.input_file, args.gtf, args.output_folder,
         args.trans_active_file, args.window_size, args.intron_window_size,
         args.outsuffix, args.leave_terminal)

    if outsuffix == "None":
        outsuffix = ""

    bare_input_path = input_file.split("/")[-1]
    bed = "{0}.bed".format(input_file[:-4])
    # hk.convert2bed(input_file, bed)

    # get descriptive stats of the reads
    length_file = "{0}/{1}_read_lengths.txt".format(output_folder, bare_input_path[:-4])
    write_read_lengths(bed, length_file)

    # read in CDS coordinates
    exons = rw.read_gtf(gtf, "CDS", gene=False)
    # only leave transcriptionally active genes (one isoform per gene)
    trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:]
    # pull out the column with transcript IDs
    trans_active_genes = [i[3] for i in trans_active_genes]
    exons = {i: exons[i] for i in exons if i in trans_active_genes}
    terminal_suff = "_with_terminal"
    if not leave_terminal:
        # remove last exons
        exons = {i: exons[i][:-1] for i in exons}
        terminal_suff = ""
    # prepare exon-exon junctions
    exon_junctions_file = "{0}_exon_junctions{1}{2}.bed".format(gtf[:-4], outsuffix, terminal_suff)
    all_junctions = co.extract_3ss(exons, exon_junctions_file)

    out_bed = "{0}/{1}_first_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True)
    out_bed_end = "{0}/{1}_last_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True)
    intron_bed = "{0}/{1}_first_{2}_intronic_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], intron_window_size, outsuffix, terminal_suff)
    write_intron_starts(all_junctions, intron_bed, exons, intron_window_size, add_chr=True)
    out_bed = "{0}/{1}_first_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True, centre=True)
    out_bed_end = "{0}/{1}_last_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff)
    write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True, centre=True)
    out_bed_si = "{0}/{1}_si_pos{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si, exons, add_chr=True)
    out_bed_si_current = "{0}/{1}_si_pos_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    write_si_pos(all_junctions, out_bed_si_current, exons, add_chr=True, curr_exon=True)
    # check which junctions are associated with a splicing intermediate read
    snr_bed = "{0}_snr.bed".format(bed[:-4])
    co.snr_bed(bed, snr_bed)
    si_counts_bed = "{0}/{1}_si_counts{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_bed)
    si_counts_current_bed = "{0}/{1}_si_counts_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)
    co.intersect_bed(out_bed_si_current, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_current_bed)

    # filter out reads that don't overlap exon-exon junctions
    exon_junction_bed = "{0}_exon_junctions{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    co.intersect_bed(bed, exon_junctions_file, write_both=True,
                     output_file=exon_junction_bed,
                     force_strand=True, no_dups=False)

    spliced_bed = "{0}_spliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    unspliced_bed = "{0}_unspliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff)
    sr_distances = {}
    ur_distances = {}
    found_count = 0
    file_size = hk.line_count(exon_junction_bed)

    # will store all the intron names for which there are
    # either spliced or unspliced reads
    valid_junctions = []
    with open(exon_junction_bed) as file, open(spliced_bed, "w") as sfile, open(unspliced_bed, "w") as ufile:
        for pos, line in enumerate(file):

            if pos % 100000 == 0:
                print("{0}/{1}".format(pos, file_size))
                print("Found {0} spliced reads.".format(found_count))
                print("\n")

            line = line.split("\t")

            # reads that end at the last nucleotide of an exon (putative splicing intermediates)
            intermediate_read = NGS.check_intermediate_read(line, exons)
            intron_name = line[20]

            if not intermediate_read:

                # check that it ends within the exon just downstream of
                # the 3' ss that is being analyzed

                in_dwns_exon = NGS.check_position_in_exon(line, exons)

                if in_dwns_exon:

                    # "S" (spliced), "U" (unspliced) or None (can't analyze)
                    read_type = NGS.analyze_cigar(line, overhang=5)

                    if read_type:
                        if intron_name not in valid_junctions:
                            valid_junctions.append(intron_name)
                        splice_dist = NGS.get_splice_dist(line)
                        if read_type == "S":
                            sfile.write("\t".join([str(i) for i in line]))
                            found_count = found_count + 1
                            sr_distances = update_dist_dict(intron_name, sr_distances, splice_dist)
                        else:
                            ufile.write("\t".join([str(i) for i in line]))
                            ur_distances = update_dist_dict(intron_name, ur_distances, splice_dist)

    print("Proportion of spliced reads: {0}.".format(found_count/(pos + 1)))

    # for each valid junction, calculate the length of the exonic sequence
    # afterwards, so that you wouldn't consider intronic sequence in the distance
    # matrix
    lengths_dict = co.get_lengths(exons, valid_junctions)

    write_dist_mat(sr_distances, window_size,
                   "{0}/{1}_spliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_spliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   "{0}/{1}_spliced_read_first_spliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff))

    write_dist_mat(ur_distances, window_size,
                   "{0}/{1}_unspliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   lengths_dict,
                   "{0}/{1}_unspliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff),
                   "{0}/{1}_unspliced_read_first_unspliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff))