def get_MSA_concat_list(input_file, output_file, min_species):
    '''
    Given the sequences and coordinates from a bunch of raw MSA objects
    (as returned by get_MSA_gene_list), filter them to only keep ones where
    you have a contiguous CDS of the same length as in human.
    Write an output file where each row corresponds to one CDS region and
    has the orthologous sequence from each of the species.
    '''
    with open(input_file) as file, open(output_file, "w") as outfile:
        current_string = []
        lines = line_count(input_file)
        strand_mapping = {"human_sense": ("1", "-1"),
                          "human_antisense": ("-1", "1")}
        print("Total: {0}.".format(lines))
        for pos, line in enumerate(file):
            if pos % 10000 == 0:
                print(pos)
            seqs = {}
            # a line starting with a percentage sign means we've gotten to a
            # new human CDS record
            if line[0] == "%":
                line = line.rstrip("\n")
                global_name = line.split("|")
                strand = global_name[6]
                # it'll always fetch the sequence from the sense strand (with
                # regard to the query species) and the other species from
                # whatever aligns to the reference strand in the query species,
                # so if the gene is antisense, everything has to be flipped
                if strand == "-":
                    mapping_tuple = strand_mapping["human_antisense"]
                else:
                    mapping_tuple = strand_mapping["human_sense"]
            # if it begins with neither a percentage sign nor an asterisk,
            # it must be a line of sequence:
            # put all of those in a list for each human CDS
            elif line[0] != "*":
                current_string.append(line)
            # you've reached the end of a CDS block
            else:
                # parse what you've read in into a dictionary with the species
                # as keys
                current_dict = get_species_dict(current_string)
                # loop over the different species
                for species in list(current_dict.keys()):
                    # check that the aligning region forms a contiguous block
                    # in this species
                    current_species_dict = filter_species(current_dict[species],
                                                          mapping_tuple)
                    if current_species_dict:
                        current_dict[species] = current_species_dict
                    else:
                        del current_dict[species]
                outstring = concat_sequences(current_dict, min_species,
                                             global_name)
                if outstring:
                    outfile.write(outstring)
                current_string = []

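# Hedged usage sketch (not part of the original code; the file names are
# hypothetical placeholders): filter a raw MSA dump and keep only CDS regions
# represented in at least 10 species.
# get_MSA_concat_list("raw_MSA_records.txt", "MSA_concat_filtered.txt",
#                     min_species=10)
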
def get_reads_per_pos(reads_file, transcript_bed):
    """
    Given a BED file of reads and a BED file of transcript coordinates, make a
    dictionary with transcript IDs as keys and the number of reads per position,
    as well as the absolute coordinates of the nucleotides, as values.
    :param reads_file: BED file with read coordinates
    :param transcript_bed: BED file with transcript coordinates
    :return: dictionary with numbers of reads per position
    """
    # intersect the transcripts and the reads, so you'd have an output file where
    # the transcript coordinates are followed by the overlapping read
    intermediate_file = "{0}_{1}read_per_pos_intermediate.bed".format(
        reads_file[:-4], transcript_bed.split("/")[-1][:-4])
    co.intersect_bed(transcript_bed, reads_file, force_strand=True,
                     write_both=True, no_dups=False, write_zero=False,
                     output_file=intermediate_file)
    reads_per_pos = {}
    total = hk.line_count(intermediate_file)
    print("Calculating the number of reads per position in each transcript...")
    with open(intermediate_file, newline="") as file:
        file_reader = csv.reader(file, delimiter="\t")
        for pos, line in enumerate(file_reader):
            if pos % 100000 == 0:
                print("{0}/{1}".format(pos, total))
            # prefix the chromosome and the strand to the transcript name
            # because you'll need it later
            trans_name = line[3]
            trans_name = "{0}.{1}.{2}".format(line[0], line[5], trans_name)
            reads_per_pos = hk.add_key(trans_name, {"reads": {}}, reads_per_pos)
            strand = line[5]
            if strand == "+":
                position = int(line[8]) - 1
            else:
                position = int(line[7])
            reads_per_pos[trans_name]["reads"] = hk.add_key(
                position, 0, reads_per_pos[trans_name]["reads"])
            reads_per_pos[trans_name]["reads"][position] = \
                reads_per_pos[trans_name]["reads"][position] + 1
            reads_per_pos[trans_name] = hk.add_key(
                "coords", (int(line[1]), int(line[2])),
                reads_per_pos[trans_name])
    hk.remove_file(intermediate_file)
    return reads_per_pos

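# Hedged illustration (not part of the original module): the dictionary built
# above is expected to look roughly like the example below, with one entry per
# transcript keyed as "chromosome.strand.transcript_ID" (the ID and numbers
# here are made up):
# {
#     "chr1.+.ENST00000456328": {
#         "reads": {11872: 3, 11905: 1},    # 0-based position -> read count
#         "coords": (11868, 14409)          # transcript start and end
#     }
# }
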
def extract_true_and_control_string(fasta_name, true_indices, control_indices):
    """
    Given a FASTA file and two tuples of indices, make two numpy arrays with
    the sequence between the indices in each of the FASTA entries.
    :param fasta_name: input FASTA file
    :param true_indices: tuple indicating the start and end (0-based) of the
    first segment
    :param control_indices: tuple giving the start and end of the second segment
    :return: two numpy arrays with rows corresponding to FASTA entries and
    columns to positions in the segment.
    """
    expected_length_true = true_indices[1] - true_indices[0]
    expected_length_control = control_indices[1] - control_indices[0]
    # get the number of lines in the FASTA and divide by 2 to get the number
    # of sequences
    fasta_length = hk.line_count(fasta_name) / 2
    # pre-allocate two arrays, one for the true sequence and one for the control
    occ_mat_true = np.empty(
        (int(fasta_length), int(true_indices[1] - true_indices[0])),
        dtype="str")
    occ_mat_control = np.empty(
        (int(fasta_length), int(control_indices[1] - control_indices[0])),
        dtype="str")
    pos_in_fasta = 0
    error_counter = 0
    with open(fasta_name) as fasta:
        for line in fasta:
            if line[0] != ">":
                true_string = line[true_indices[0]:true_indices[1]]
                control_string = line[control_indices[0]:control_indices[1]]
                if (len(true_string) != expected_length_true) or \
                        (len(control_string) != expected_length_control):
                    error_counter = error_counter + 1
                for pos in range(len(true_string)):
                    occ_mat_true[pos_in_fasta, pos] = true_string[pos]
                    occ_mat_control[pos_in_fasta, pos] = control_string[pos]
                pos_in_fasta = pos_in_fasta + 1
    print("Errors: {0}.".format(error_counter))
    return (occ_mat_true, occ_mat_control)

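# Hedged usage sketch (hypothetical file name and indices, not from the
# original code): pull out positions 0-20 of each FASTA entry as the "true"
# segment and positions 50-70 as the control segment.
# occ_true, occ_control = extract_true_and_control_string("sequences.fa",
#                                                         (0, 20), (50, 70))
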
def main(): description = "Call peaks in a BED file of NET-seq reads." help_info = [ "BED file (at least a BED6) with NET-seq reads. Should be single-nucleotide resolution (each BED region is the 3' end of a read.).", "Ensembl GTF file for the relevant species. Ensure that chromosome names are formatted the same way in both the GTF and the BED file with reads!", "BED file with the coordinates of the transcripts to analyze. Only the name field is read, hence the others can hold placeholders. The name field must contain transcript IDs from the GTF file.", "Name of the output file (BED file with peak coordinates).", "Alpha value for calling a position as having a significantly higher local read denisty than expected by chance. Default: 0.01.", "Merge distance: adjacent peaks will be merged if they are closer than this many nucleotides. Default: 21.", "Minimum reads per peak. Default: 10.", "The number of times the read position randomization should be performed for each transcript. Higher values make the significance calculation (marginally) more robust, however, they also make the programme very slow. Default: 5.", "Minimum length of a peak in nucleotides. Default: 5.", "Size of the sliding window to use when calculating the local read density. It may be sensible to set this to the same value as the merge distance. Should be an odd integer. Default: 21", "The analysis will be performed this many times, with the output files numbered. Useful for running many negative control simulations at once. Default: 1.", "Read positions will be shuffled within each transcript before analysis. This should disrupt any signal and should give a flat peak density profile.", "Instead of a sliding window, adjacent non-overlapping windows will be used when calculating the local read density.", "When calling peaks in a given exon/intron, do not include that exon/intron in the read position randomization.", "When --exclude_focal is set, count an exon and its upstream intron as a single unit (except for the first exon).", "Don't filter out likely PCR duplicates (peaks where more than 90%% of the reads come from a single nucleotide position).)" ] defaults = {4: 0.01, 5: 21, 6: 10, 7: 5, 8: 5, 9: 21, 10: 1} args = hk.parse_arguments(description, [ "reads_file", "gtf", "trans_active_file", "output_file", "significance_threshold", "merge", "min_reads_per_peak", "iterations", "min_peak_length", "window_size", "runs", "neg_control", "no_slide", "exclude_focal", "with_ups_intron", "no_PCR_filter" ], floats=[4], ints=[5, 6, 7, 8, 9, 10], flags=[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], detailed_help=help_info, defaults=defaults) reads_file, gtf, trans_active_file, output_file, significance_threshold, merge, min_reads_per_peak, iterations, min_peak_length, window_size, runs, neg_control, no_slide, exclude_focal, with_ups_intron, no_PCR_filter = args.reads_file, args.gtf, args.trans_active_file, args.output_file, args.significance_threshold, args.merge, args.min_reads_per_peak, args.iterations, args.min_peak_length, args.window_size, args.runs, args.neg_control, args.no_slide, args.exclude_focal, args.with_ups_intron, args.no_PCR_filter print("Merge distance: {0}".format(merge)) print("Minimum number of reads per peak: {0}".format(min_reads_per_peak)) print("Minimum peak length: {0}".format(min_peak_length)) print("Window size: {0}".format(window_size)) print("Significance level: {0}".format(significance_threshold)) print("Randomization iterations to perform: {0}".format(iterations)) print("Runs: {0}".format(runs)) neg_str 
= "" if neg_control: neg_str = "_neg_control" slide_str = "" if no_slide: slide_str = "_no_slide" intron_str = "" if with_ups_intron: intron_str = "w_ups_intr" # 0. make a BED file with the coordinates of transcripts transcripts_file = "{0}_transcripts.bed".format(gtf[:-4]) co.get_transcripts(gtf, transcripts_file, add_chr=True) exons = rw.read_gtf(gtf, "exon") # 1. intersect the two files, loop over the result and make a # dictionary of reads per pos for each transcript, which has reads reads_per_pos = get_reads_per_pos(reads_file, transcripts_file) # only leave transcriptionally active genes (one isoform per gene) trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:] # pull out the column with transcript IDs trans_active_genes = [i[3] for i in trans_active_genes] reads_per_pos = { i: reads_per_pos[i] for i in reads_per_pos if i.split(".")[-1] in trans_active_genes } for sim in range(runs): print("**********{0}**********".format(sim)) # 2. for each transcript, randomly reshuffle the reads and calculate the # nth percentile depending on what the significance threshold is # keep positions that are higher than that threshold and write to BED file raw_peak_bed = "{0}_{1}_raw_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format( reads_file[:-4], gtf.split("/")[-1][:-4], iterations, min_reads_per_peak, window_size, neg_str, intron_str, slide_str, sim) read_count_file = "{0}_{1}_read_counts{2}_{3}{4}{5}_{6}_sim.txt".format( reads_file[:-4], gtf.split("/")[-1][:-4], iterations, window_size, neg_str, intron_str, sim) new_reads_file = write_raw_peaks(reads_per_pos, raw_peak_bed, read_count_file, exons, iterations=iterations, min_read_count=min_reads_per_peak, window_size=window_size, neg_control=neg_control, no_slide=no_slide, exclude_focal=exclude_focal, with_ups_intron=with_ups_intron) if neg_control: reads_file = new_reads_file # 3. merge peaks merged_peak_bed = "{0}_{1}_merged_peaks{2}_{3}_{4}{5}{6}{7}_{8}_sim.bed".format( reads_file[:-4], gtf.split("/")[-1][:-4], iterations, window_size, merge, neg_str, slide_str, intron_str, sim) co.merge_bed(raw_peak_bed, merged_peak_bed, merge) print("Before filtering, there are {0} peaks.".format( hk.line_count(merged_peak_bed))) # 4. filter out peaks that don't have enough reads or are too short. # Write final results to file and also write a stats file with the size, # read count and overlapping transcript of the peaks stats_file = "{0}_stats_{1}_sim.txt".format(output_file[:-4], sim) filter_peaks(merged_peak_bed, reads_file, read_count_file, "{0}_{1}_sim.bed".format(output_file[:-4], sim), min_reads_per_peak, min_peak_length, stats_file, no_PCR_filter=no_PCR_filter)
def main(): description = "Record splicing distance." args = hk.parse_arguments(description, ["input_file", "gtf", "output_folder", "trans_active_file", "window_size", "intron_window_size", "outsuffix", "leave_terminal"], ints = [4, 5], flags = [7]) input_file, gtf, output_folder, trans_active_file, window_size, intron_window_size, outsuffix, leave_terminal = args.input_file, args.gtf, args.output_folder, args.trans_active_file, args.window_size, args.intron_window_size, args.outsuffix, args.leave_terminal if outsuffix == "None": outsuffix = "" bare_input_path = input_file.split("/")[-1] bed = "{0}.bed".format(input_file[:-4]) # hk.convert2bed(input_file, bed) # get descriptive stats of the reads length_file = "{0}/{1}_read_lengths.txt".format(output_folder, bare_input_path[:-4]) write_read_lengths(bed, length_file) # read in CDS coordinates exons = rw.read_gtf(gtf, "CDS", gene=False) # only leave transcriptionally active genes (one isoform per gene) trans_active_genes = rw.read_many_fields(trans_active_file, "\t")[1:] # pull out the column with transcript IDs trans_active_genes = [i[3] for i in trans_active_genes] exons = {i: exons[i] for i in exons if i in trans_active_genes} terminal_suff = "_with_terminal" if not leave_terminal: # remove last exons exons = {i: exons[i][:-1] for i in exons} terminal_suff = "" # prepare exon-exon junctions exon_junctions_file = "{0}_exon_junctions{1}{2}.bed".format(gtf[:-4], outsuffix, terminal_suff) all_junctions = co.extract_3ss(exons, exon_junctions_file) out_bed = "{0}/{1}_first_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff) write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True) out_bed_end = "{0}/{1}_last_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff) write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True) intron_bed = "{0}/{1}_first_{2}_intronic_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], intron_window_size, outsuffix, terminal_suff) write_intron_starts(all_junctions, intron_bed, exons, intron_window_size, add_chr=True) out_bed = "{0}/{1}_first_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff) write_exon_starts(all_junctions, out_bed, exons, window_size, add_chr=True, centre=True) out_bed_end = "{0}/{1}_last_centred_{2}_bp{3}{4}.bed".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff) write_exon_starts(all_junctions, out_bed_end, exons, window_size, add_chr=True, from_end=True, centre=True) out_bed_si = "{0}/{1}_si_pos{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff) write_si_pos(all_junctions, out_bed_si, exons, add_chr=True) out_bed_si_current = "{0}/{1}_si_pos_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff) write_si_pos(all_junctions, out_bed_si_current, exons, add_chr=True, curr_exon=True) # check which junctions are associated with a splicing intermediate read snr_bed = "{0}_snr.bed".format(bed[:-4]) co.snr_bed(bed, snr_bed) si_counts_bed = "{0}/{1}_si_counts{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff) co.intersect_bed(out_bed_si, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_bed) si_counts_current_bed = "{0}/{1}_si_counts_current{2}{3}.bed".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff) 
co.intersect_bed(out_bed_si_current, snr_bed, force_strand=True, hit_count=True, no_dups=False, output_file=si_counts_current_bed) # filter out reads that don't overlap exon-exon junctions exon_junction_bed = "{0}_exon_junctions{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff) co.intersect_bed(bed, exon_junctions_file, write_both=True, output_file=exon_junction_bed, force_strand=True, no_dups=False) spliced_bed = "{0}_spliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff) unspliced_bed = "{0}_unspliced{1}{2}.bed".format(input_file[:-4], outsuffix, terminal_suff) sr_distances = {} ur_distances = {} found_count = 0 file_size = hk.line_count(exon_junction_bed) # will store all the intron names for which there are # either spliced or unspliced reads valid_junctions = [] with open(exon_junction_bed) as file, open(spliced_bed, "w") as sfile, open(unspliced_bed, "w") as ufile: for pos, line in enumerate(file): if pos % 100000 == 0: print("{0}/{1}".format(pos, file_size)) print("Found {0} spliced reads.".format(found_count)) print("\n") line = line.split("\t") # reads that end at the last nucleotide of an exon intermediate_read = NGS.check_intermediate_read(line, exons) intron_name = line[20] if not intermediate_read: # check that it ends within the exon just downstream of # the 3' ss that is being analyzed in_dwns_exon = NGS.check_position_in_exon(line, exons) if in_dwns_exon: # 'spliced', 'unspliced' or 'None' (=can't analyze) read_type = NGS.analyze_cigar(line, overhang = 5) if read_type: if intron_name not in valid_junctions: valid_junctions.append(intron_name) splice_dist = NGS.get_splice_dist(line) if read_type == "S": sfile.write("\t".join([str(i) for i in line])) found_count = found_count + 1 sr_distances = update_dist_dict(intron_name, sr_distances, splice_dist) else: ufile.write("\t".join([str(i) for i in line])) ur_distances = update_dist_dict(intron_name, ur_distances, splice_dist) print("Proportion of spliced reads: {0}.".format(found_count/(pos + 1))) # for each valid junction, calculate the length of the exonic sequence # afterwards, so that you wouldn't consider intronic sequence in the distance # matrix lengths_dict = co.get_lengths(exons, valid_junctions) write_dist_mat(sr_distances, window_size, "{0}/{1}_spliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff), lengths_dict, "{0}/{1}_spliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff), "{0}/{1}_spliced_read_first_spliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff)) write_dist_mat(ur_distances, window_size, "{0}/{1}_unspliced_read_distances_{2}{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff), lengths_dict, "{0}/{1}_unspliced_read_{2}_intron_names{3}{4}.txt".format(output_folder, bare_input_path[:-4], window_size, outsuffix, terminal_suff), "{0}/{1}_unspliced_read_first_unspliced{2}{3}.txt".format(output_folder, bare_input_path[:-4], outsuffix, terminal_suff))
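# Hedged sketch (for orientation only): update_dist_dict is defined elsewhere
# in the package; from the call sites above it is assumed to behave roughly
# like this, keeping one list of splicing distances per intron:
# def update_dist_dict(intron_name, dist_dict, splice_dist):
#     dist_dict.setdefault(intron_name, []).append(splice_dist)
#     return dist_dict
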