def getReadStartEnd(read_str):
    """
    Parse a read coordinate string with convertCoordStr and return the
    second and third parsed fields as a (start, end) pair.
    """
    parsed = convertCoordStr(read_str)
    read_start, read_end = parsed[1], parsed[2]
    return read_start, read_end
def getRegionCoord(region_coord_str):
    """
    Convert a region coordinate string into the value produced by
    convertCoordStr and return it unchanged.

    NOTE(review): presumably a (chr, start, end) tuple, since callers use
    the result as a dictionary key -- confirm against convertCoordStr.
    """
    return convertCoordStr(region_coord_str)
def getSSOrder(alt_start_or_end, inclusion_start, inclusion_end,
               exclusion_str_list):
    """
    Order the competing splice site positions of an event.

    When alt_start_or_end is "alt_start" the inclusion start is pooled
    with the start of every exclusion coordinate; for any other value the
    inclusion end is pooled with the exclusion ends.

    Returns (ordered_pos, idx): the pooled positions sorted ascending and
    the index of the first entry equal to the inclusion position.
    """
    compare_starts = alt_start_or_end == "alt_start"
    inclusion_pos = inclusion_start if compare_starts else inclusion_end

    positions = [inclusion_pos]
    for exclusion_str in exclusion_str_list:
        _chrom, excl_start, excl_end = convertCoordStr(exclusion_str)
        positions.append(excl_start if compare_starts else excl_end)

    ordered_pos = sorted(positions)
    return ordered_pos, ordered_pos.index(inclusion_pos)
def getAA_ADInclIsoformLen(event_str, jcn_seq_len):
    """
    Return the length of the inclusion isoform for the event described by
    a tab-delimited event line.

    Column 6 (index 5) holds the ";"-separated exclusion junctions and
    column 7 (index 6) the inclusion junction.  The splice sites are
    ordered with getSSOrder, per-isoform lengths are obtained from
    getAD_AA_isoform_lengths, and the entry at the inclusion junction's
    position in that ordering is returned.
    """
    fields = event_str.split("\t")
    exclusion_jcns = fields[5].split(";")
    _chrom, inclusion_start, inclusion_end = convertCoordStr(fields[6])

    site_type = determineAltStartOrEnd(inclusion_start, inclusion_end,
                                       exclusion_jcns)
    ordered_pos, inclusion_idx = getSSOrder(site_type,
                                            inclusion_start,
                                            inclusion_end,
                                            exclusion_jcns)
    lengths = getAD_AA_isoform_lengths(site_type, ordered_pos, jcn_seq_len)
    return lengths[inclusion_idx]
def parse_jcn_str(jcn_str):
    """
    Expand an intron coordinate string into a two-block junction region.

    Returns (chr, chrStart, chrEnd, blockLens, secondBlockStart) where
    chrStart = intron start - DEF_JCN_OVERHANG - 1,
    chrEnd = intron end + DEF_JCN_OVERHANG,
    blockLens = DEF_JCN_OVERHANG and
    secondBlockStart = (chrEnd - chrStart) - blockLens.
    No strand is returned.
    """
    chrom, raw_start, raw_end = convertCoordStr(jcn_str)
    intron_start = int(raw_start)
    intron_end = int(raw_end)

    overhang = DEF_JCN_OVERHANG
    region_start = intron_start - overhang - 1
    region_end = intron_end + overhang
    second_block_start = (region_end - region_start) - overhang

    return chrom, region_start, region_end, overhang, second_block_start
def getInclIsoformLen(event_str, jcn_seq_len):
    """
    Length of the inclusion isoform: the summed exon lengths plus
    jcn_seq_len for every inclusion junction.

    Junctions are taken from columns 7 and 10 (indices 6 and 9) of the
    tab-delimited event line and exons from column 9 (index 8).  Exon
    entries that are "" or "None" are skipped.
    """
    fields = event_str.split("\t")

    junctions = elems_split(fields[6])
    junctions.extend(elems_split(fields[9]))

    total_exon_len = 0
    for exon in elems_split(fields[8]):
        if exon in ("", "None"):
            continue
        _chrom, exon_start, exon_end = convertCoordStr(exon)
        # Exon coordinates are treated as inclusive on both ends.
        total_exon_len += exon_end - exon_start + 1

    return total_exon_len + jcn_seq_len * len(junctions)
def findLargestRegion(coords_string):
    """
    Return (leftmost, rightmost): the smallest start and largest end over
    every coordinate in coords_string.

    Coordinates are separated by ";" and, within each group, by ",".
    The running values begin at INFINITY and -1 respectively.
    """
    coords = [coord
              for group in coords_string.split(";")
              for coord in group.split(",")]

    leftmost = INFINITY
    rightmost = -1
    for coord in coords:
        _chrom, start, end = convertCoordStr(coord)
        leftmost = min(leftmost, start)
        rightmost = max(rightmost, end)

    return leftmost, rightmost
def disambiguateJcnStr(chr_seq, line_list, majority_rules):
    """
    Will use splice site sequence to infer strand.

    The intron position is read from line_list[3] and the intron sequence
    is sliced from chr_seq as chr_seq[start - 1:end].  The first matching
    splice site rule decides the strand, which is written in place to
    line_list[5].  When no rule matches, line_list[5] is left untouched
    and a message is printed unless majority_rules is set.

    Exits the program with status 1 if line_list[3] cannot be parsed as
    an intron position.  Returns None.
    """
    try:
        _chrom, start, end = convertCoordStr(line_list[3])
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not mistaken for a malformed BED line.
        print("Junction BED file must have intron position in 4th column.")
        sys.exit(1)

    intron_seq = chr_seq[start - 1:end]

    # (required prefix, required suffix, strand), highest priority first.
    # None means that end of the intron is not constrained by the rule.
    splice_site_rules = (
        ("GT", "AG", "+"),
        ("CT", "AC", "-"),
        # Other common splice site sequence
        ("GC", "AG", "+"),
        ("CT", "GC", "-"),
        # minor spliceosome
        ("AT", "AC", "+"),
        ("GT", "AT", "-"),
        # Priority to 5' splice site since there is more information
        # there
        ("GT", None, "+"),
        (None, "AC", "-"),
        (None, "AG", "+"),
        ("CT", None, "-"),
    )

    for prefix, suffix, strand in splice_site_rules:
        prefix_ok = prefix is None or intron_seq.startswith(prefix)
        if prefix_ok and (suffix is None or intron_seq.endswith(suffix)):
            line_list[5] = strand
            return

    if not majority_rules:
        # Strand will resolved later if majority_rules
        print("Cannot find strand for %s" % line_list[3])
def disambiguateJcnStr(chr_seq, line_list, majority_rules):
    """
    Will use splice site sequence to infer strand.

    The intron position is read from line_list[3] and the intron sequence
    is sliced from chr_seq as chr_seq[start - 1:end].  The first matching
    splice site rule decides the strand, which is written in place to
    line_list[5].  When no rule matches, line_list[5] is left untouched
    and a message is printed unless majority_rules is set.

    Exits the program with status 1 if line_list[3] cannot be parsed as
    an intron position.  Returns None.
    """
    try:
        _chrom, start, end = convertCoordStr(line_list[3])
    except Exception:
        # Exception (not a bare except) so that KeyboardInterrupt and
        # SystemExit are not mistaken for a malformed BED line.
        print("Junction BED file must have intron position in 4th column.")
        sys.exit(1)

    intron_seq = chr_seq[start - 1:end]

    # (required prefix, required suffix, strand), highest priority first.
    # None means that end of the intron is not constrained by the rule.
    splice_site_rules = (
        ("GT", "AG", "+"),
        ("CT", "AC", "-"),
        # Other common splice site sequence
        ("GC", "AG", "+"),
        ("CT", "GC", "-"),
        # minor spliceosome
        ("AT", "AC", "+"),
        ("GT", "AT", "-"),
        # Priority to 5' splice site since there is more information
        # there
        ("GT", None, "+"),
        (None, "AC", "-"),
        (None, "AG", "+"),
        ("CT", None, "-"),
    )

    for prefix, suffix, strand in splice_site_rules:
        prefix_ok = prefix is None or intron_seq.startswith(prefix)
        if prefix_ok and (suffix is None or intron_seq.endswith(suffix)):
            line_list[5] = strand
            return

    if not majority_rules:
        # Strand will resolved later if majority_rules
        print("Cannot find strand for %s" % line_list[3])
def main():
    """
    Count reads around the exon/intron junctions of a set of introns.

    1. Parse and validate the command line.  Exactly one of -i
       (tab-delimited intron file) or -b (BED intron file) must be given.
    2. For every intron build a "left" window
       (chr, start - flanking_dist, start + flanking_dist - 1) and a
       "right" window (chr, end - flanking_dist + 1, end + flanking_dist)
       and write the unique windows to
       <out_dir><prefix>_intron_exon_junction_coords.out.
    3. Run COUNTING_SCRIPT on those windows; it is expected to produce
       <out_dir><prefix>_intron_exon_junction_coords_w_read.out.
    4. For every read/window association tally the read end position
       under each intron owning that window, separately for left and
       right windows.
    5. A side of an intron is reported when the number of distinct read
       end positions on it is at least the -o value.  Write
       "intron<TAB>left_count<TAB>right_count" lines to
       <out_dir><prefix>_intron_exon_junction_counts.txt (introns with
       both counts 0 are skipped) and the supported junction positions
       to <out_dir><prefix>_confident_ie.txt.

    Exits with status 1 on invalid input and 0 on success.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option("-i",
                          dest="intron_coords",
                          type="string",
                          help="""File of intron coordinates. Format: type, chr, strand, start, end""",
                          default=None)
    opt_parser.add_option("-b",
                          dest="bed_intron_coords",
                          type="string",
                          help="BED file of intron coordinates.",
                          default=None)
    opt_parser.add_option("-a",
                          dest="read_alignments",
                          type="string",
                          help="""File of alignments to genome. Format: chr, start, strand""",
                          default=None)
    opt_parser.add_option("-f",
                          dest="flanking_dist",
                          type="int",
                          help="""Distance away from exon intron junction to check for reads in.""",
                          default=None)
    opt_parser.add_option("-o",
                          dest="offsets",
                          type="int",
                          help="""Minimum number of offsets required at each exon/intron junction. Default=1""",
                          default=1)
    # NOTE(review): -l is checked as required below but defaults to 1; if
    # check_required tests for None this check can never fail -- confirm.
    opt_parser.add_option("-l",
                          dest="read_length",
                          type="int",
                          help="Length of the reads.",
                          default=1)
    opt_parser.add_option("--out_dir",
                          dest="out_dir",
                          type="string",
                          help="Output files are put here.",
                          default=None)
    opt_parser.add_option("--out_prefix",
                          dest="prefix",
                          type="string",
                          help="Prefix attached to all output files.",
                          default=None)

    (options, _args) = opt_parser.parse_args()

    # validate the command line arguments
    for required_opt in ("-a", "-f", "-l", "--out_dir", "--out_prefix"):
        opt_parser.check_required(required_opt)

    # Check that the COUNTING_SCRIPT path is valid
    if not os.path.exists(COUNTING_SCRIPT):
        print("Please change COUNTING_SCRIPT path.")
        opt_parser.print_help()
        sys.exit(1)

    if options.intron_coords and options.bed_intron_coords:
        print("Only one type of intron coord can be used as input.")
        opt_parser.print_help()
        sys.exit(1)

    if (not options.intron_coords) and (not options.bed_intron_coords):
        print(" Need to specify intron coordinates. See options -i or -b")
        opt_parser.print_help()
        sys.exit(1)

    intron_coords = None
    isBedFormat = False
    if options.intron_coords:
        intron_coords = open(options.intron_coords)
    if options.bed_intron_coords:
        intron_coords = open(options.bed_intron_coords)
        isBedFormat = True

    read_alignments = options.read_alignments
    read_length = options.read_length
    flanking_dist = options.flanking_dist
    offsets = options.offsets
    prefix = options.prefix

    out_dir = options.out_dir
    if not out_dir.endswith("/"):
        out_dir += "/"
    if not os.path.exists(out_dir):
        print("Output directory does not exist")
        sys.exit(1)

    # Intermediate Output Files
    out_coords_file = out_dir + prefix + "_intron_exon_junction_coords.out"
    out_coords = open(out_coords_file, "w")
    out_read_assoc_file = (out_dir + prefix +
                           "_intron_exon_junction_coords_w_read.out")

    # Final output.  Opened (and truncated) up front, as before, so a
    # failed run never leaves results from an earlier run in place.
    out_file_name = out_dir + prefix + "_intron_exon_junction_counts.txt"
    out_file = open(out_file_name, "w")
    confident_ie_name = out_dir + prefix + "_confident_ie.txt"
    confident_ie_file = open(confident_ie_name, "w")

    # {(chr, start, end): [intron_coord_str, ...]}
    # Maps the window on the "left" (intron start) or "right" (intron end)
    # side of an exon/intron junction back to the introns it belongs to.
    left_region_coord2intron = {}
    right_region_coord2intron = {}

    # {intron_coord_str: {"left": {pos: count},
    #                     "right": {pos: count}}}
    intron_dict = {}

    regions_set = set()

    with intron_coords:
        for line in intron_coords:
            line = formatLine(line)

            if isBedFormat:
                if line.startswith("track"):
                    continue
                chrom, start_str, end_str = parseBEDLine(line)
            else:
                _type, chrom, _strand, start_str, end_str = line.split("\t")

            if chrom.startswith("chr"):
                chrom = chrom.replace("chr", "")

            intron_coord_str = "%s:%s-%s" % (chrom, start_str, end_str)
            if intron_coord_str not in intron_dict:
                intron_dict[intron_coord_str] = {"left": {}, "right": {}}

            start = int(start_str)
            end = int(end_str)

            left_coord = (chrom,
                          start - flanking_dist,
                          start + flanking_dist - 1)
            right_coord = (chrom,
                           end - flanking_dist + 1,
                           end + flanking_dist)

            updateDictOfLists(left_region_coord2intron, left_coord,
                              intron_coord_str)
            updateDictOfLists(right_region_coord2intron, right_coord,
                              intron_coord_str)

            regions_set.add(left_coord)
            regions_set.add(right_coord)

    # Print out regions out_coords
    with out_coords:
        for region_coord in regions_set:
            out_coords.write("%s\t%d\t%d\n" % (region_coord[0],
                                               region_coord[1],
                                               region_coord[2]))

    # Used to make unique name for tmp file in case a shared directory is
    # being used for runs.
    rand_num = random.randrange(1, 100000)
    tmp_file_name = "%stmp%d.txt" % (out_dir, rand_num)

    # Get Read Counts
    print("Getting Counts in Region")
    cmd = "python %s --reads %s -l %d --coords %s -o %stmp%d.txt --read_assoc %s" % (
        COUNTING_SCRIPT, read_alignments, read_length, out_coords_file,
        out_dir, rand_num, out_read_assoc_file)
    print(cmd)
    os.system(cmd)

    # Remove the tmp file.  A missing file is not fatal: only the
    # read-association output of the counting script is used below.
    try:
        os.remove(tmp_file_name)
    except OSError:
        pass

    print("Getting Left and Right Counts")
    # Parse read_assoc_file to get information
    with open(out_read_assoc_file) as read_assoc_file:
        for line in read_assoc_file:
            line = formatLine(line)
            line_list = line.split("\t")

            _read_start, read_end = getReadStartEnd(line_list[1])
            region_coord = getRegionCoord(line_list[2])

            # NOTE(review): the result was never used; the call is kept in
            # case the helper validates its input or has side effects.
            getIntronStartEnds(left_region_coord2intron,
                               right_region_coord2intron,
                               region_coord)

            # A window can be a left window of some introns and a right
            # window of others, so both maps are always consulted.
            for side, region2intron in (("left", left_region_coord2intron),
                                        ("right", right_region_coord2intron)):
                if region_coord not in region2intron:
                    continue
                for intron_str in region2intron[region_coord]:
                    pos2count = intron_dict[intron_str][side]
                    pos2count[read_end] = pos2count.get(read_end, 0) + 1

    # Print output
    confident_ie_set = set()
    with out_file, confident_ie_file:
        for intron_str in intron_dict:
            chrom, intron_start, intron_end = convertCoordStr(intron_str)
            left_pos2count = intron_dict[intron_str]["left"]
            right_pos2count = intron_dict[intron_str]["right"]

            # Get left_counts
            if len(left_pos2count) >= offsets:
                left_count = getTotalCounts(left_pos2count)
                confident_ie_set.add("%s:%d-%d" % (chrom,
                                                   intron_start - 1,
                                                   intron_start))
            else:
                left_count = 0

            # Get right counts
            if len(right_pos2count) >= offsets:
                right_count = getTotalCounts(right_pos2count)
                confident_ie_set.add("%s:%d-%d" % (chrom,
                                                   intron_end,
                                                   intron_end + 1))
            else:
                right_count = 0

            if left_count == 0 and right_count == 0:
                continue

            out_file.write("%s\t%d\t%d\n" % (intron_str,
                                             left_count,
                                             right_count))

        # Now print out confident set of ie
        for ie in confident_ie_set:
            confident_ie_file.write("%s\n" % ie)

    sys.exit(0)
def main():
    """
    Count reads around the exon/intron junctions of a set of introns.

    1. Parse and validate the command line.  Exactly one of -i
       (tab-delimited intron file) or -b (BED intron file) must be given.
    2. For every intron build a "left" window
       (chr, start - flanking_dist, start + flanking_dist - 1) and a
       "right" window (chr, end - flanking_dist + 1, end + flanking_dist)
       and write the unique windows to
       <out_dir><prefix>_intron_exon_junction_coords.out.
    3. Run COUNTING_SCRIPT on those windows; it is expected to produce
       <out_dir><prefix>_intron_exon_junction_coords_w_read.out.
    4. For every read/window association tally the read end position
       under each intron owning that window, separately for left and
       right windows.
    5. A side of an intron is reported when the number of distinct read
       end positions on it is at least the -o value.  Write
       "intron<TAB>left_count<TAB>right_count" lines to
       <out_dir><prefix>_intron_exon_junction_counts.txt (introns with
       both counts 0 are skipped) and the supported junction positions
       to <out_dir><prefix>_confident_ie.txt.

    Every print below passes a single argument in parentheses, which
    behaves identically as a Python 2 print statement and as a Python 3
    function call.

    Exits with status 1 on invalid input and 0 on success.
    """
    opt_parser = OptionParser()

    # Add Options. Required options should have default=None
    opt_parser.add_option("-i",
                          dest="intron_coords",
                          type="string",
                          help="""File of intron coordinates. Format: type, chr, strand, start, end""",
                          default=None)
    opt_parser.add_option("-b",
                          dest="bed_intron_coords",
                          type="string",
                          help="BED file of intron coordinates.",
                          default=None)
    opt_parser.add_option("-a",
                          dest="read_alignments",
                          type="string",
                          help="""File of alignments to genome. Format: chr, start, strand""",
                          default=None)
    opt_parser.add_option("-f",
                          dest="flanking_dist",
                          type="int",
                          help="""Distance away from exon intron junction to check for reads in.""",
                          default=None)
    opt_parser.add_option("-o",
                          dest="offsets",
                          type="int",
                          help="""Minimum number of offsets required at each exon/intron junction. Default=1""",
                          default=1)
    # NOTE(review): -l is checked as required below but defaults to 1; if
    # check_required tests for None this check can never fail -- confirm.
    opt_parser.add_option("-l",
                          dest="read_length",
                          type="int",
                          help="Length of the reads.",
                          default=1)
    opt_parser.add_option("--out_dir",
                          dest="out_dir",
                          type="string",
                          help="Output files are put here.",
                          default=None)
    opt_parser.add_option("--out_prefix",
                          dest="prefix",
                          type="string",
                          help="Prefix attached to all output files.",
                          default=None)

    (options, _args) = opt_parser.parse_args()

    # validate the command line arguments
    for required_opt in ("-a", "-f", "-l", "--out_dir", "--out_prefix"):
        opt_parser.check_required(required_opt)

    # Check that the COUNTING_SCRIPT path is valid
    if not os.path.exists(COUNTING_SCRIPT):
        print("Please change COUNTING_SCRIPT path.")
        opt_parser.print_help()
        sys.exit(1)

    if options.intron_coords and options.bed_intron_coords:
        print("Only one type of intron coord can be used as input.")
        opt_parser.print_help()
        sys.exit(1)

    if (not options.intron_coords) and (not options.bed_intron_coords):
        print(" Need to specify intron coordinates. See options -i or -b")
        opt_parser.print_help()
        sys.exit(1)

    intron_coords = None
    isBedFormat = False
    if options.intron_coords:
        intron_coords = open(options.intron_coords)
    if options.bed_intron_coords:
        intron_coords = open(options.bed_intron_coords)
        isBedFormat = True

    read_alignments = options.read_alignments
    read_length = options.read_length
    flanking_dist = options.flanking_dist
    offsets = options.offsets
    prefix = options.prefix

    out_dir = options.out_dir
    if not out_dir.endswith("/"):
        out_dir += "/"
    if not os.path.exists(out_dir):
        print("Output directory does not exist")
        sys.exit(1)

    # Intermediate Output Files
    out_coords_file = out_dir + prefix + "_intron_exon_junction_coords.out"
    out_coords = open(out_coords_file, "w")
    out_read_assoc_file = (out_dir + prefix +
                           "_intron_exon_junction_coords_w_read.out")

    # Final output.  Opened (and truncated) up front, as before, so a
    # failed run never leaves results from an earlier run in place.
    out_file_name = out_dir + prefix + "_intron_exon_junction_counts.txt"
    out_file = open(out_file_name, "w")
    confident_ie_name = out_dir + prefix + "_confident_ie.txt"
    confident_ie_file = open(confident_ie_name, "w")

    # {(chr, start, end): [intron_coord_str, ...]}
    # Maps the window on the "left" (intron start) or "right" (intron end)
    # side of an exon/intron junction back to the introns it belongs to.
    left_region_coord2intron = {}
    right_region_coord2intron = {}

    # {intron_coord_str: {"left": {pos: count},
    #                     "right": {pos: count}}}
    intron_dict = {}

    regions_set = set()

    with intron_coords:
        for line in intron_coords:
            line = formatLine(line)

            if isBedFormat:
                if line.startswith("track"):
                    continue
                chrom, start_str, end_str = parseBEDLine(line)
            else:
                _type, chrom, _strand, start_str, end_str = line.split("\t")

            if chrom.startswith("chr"):
                chrom = chrom.replace("chr", "")

            intron_coord_str = "%s:%s-%s" % (chrom, start_str, end_str)
            if intron_coord_str not in intron_dict:
                intron_dict[intron_coord_str] = {"left": {}, "right": {}}

            start = int(start_str)
            end = int(end_str)

            left_coord = (chrom,
                          start - flanking_dist,
                          start + flanking_dist - 1)
            right_coord = (chrom,
                           end - flanking_dist + 1,
                           end + flanking_dist)

            updateDictOfLists(left_region_coord2intron, left_coord,
                              intron_coord_str)
            updateDictOfLists(right_region_coord2intron, right_coord,
                              intron_coord_str)

            regions_set.add(left_coord)
            regions_set.add(right_coord)

    # Print out regions out_coords
    with out_coords:
        for region_coord in regions_set:
            out_coords.write("%s\t%d\t%d\n" % (region_coord[0],
                                               region_coord[1],
                                               region_coord[2]))

    # Used to make unique name for tmp file in case a shared directory is
    # being used for runs.
    rand_num = random.randrange(1, 100000)
    tmp_file_name = "%stmp%d.txt" % (out_dir, rand_num)

    # Get Read Counts
    print("Getting Counts in Region")
    cmd = "python %s --reads %s -l %d --coords %s -o %stmp%d.txt --read_assoc %s" % (
        COUNTING_SCRIPT, read_alignments, read_length, out_coords_file,
        out_dir, rand_num, out_read_assoc_file)
    print(cmd)
    os.system(cmd)

    # Remove the tmp file.  A missing file is not fatal: only the
    # read-association output of the counting script is used below.
    try:
        os.remove(tmp_file_name)
    except OSError:
        pass

    print("Getting Left and Right Counts")
    # Parse read_assoc_file to get information
    with open(out_read_assoc_file) as read_assoc_file:
        for line in read_assoc_file:
            line = formatLine(line)
            line_list = line.split("\t")

            _read_start, read_end = getReadStartEnd(line_list[1])
            region_coord = getRegionCoord(line_list[2])

            # NOTE(review): the result was never used; the call is kept in
            # case the helper validates its input or has side effects.
            getIntronStartEnds(left_region_coord2intron,
                               right_region_coord2intron,
                               region_coord)

            # A window can be a left window of some introns and a right
            # window of others, so both maps are always consulted.
            for side, region2intron in (("left", left_region_coord2intron),
                                        ("right", right_region_coord2intron)):
                if region_coord not in region2intron:
                    continue
                for intron_str in region2intron[region_coord]:
                    pos2count = intron_dict[intron_str][side]
                    pos2count[read_end] = pos2count.get(read_end, 0) + 1

    # Print output
    confident_ie_set = set()
    with out_file, confident_ie_file:
        for intron_str in intron_dict:
            chrom, intron_start, intron_end = convertCoordStr(intron_str)
            left_pos2count = intron_dict[intron_str]["left"]
            right_pos2count = intron_dict[intron_str]["right"]

            # Get left_counts
            if len(left_pos2count) >= offsets:
                left_count = getTotalCounts(left_pos2count)
                confident_ie_set.add("%s:%d-%d" % (chrom,
                                                   intron_start - 1,
                                                   intron_start))
            else:
                left_count = 0

            # Get right counts
            if len(right_pos2count) >= offsets:
                right_count = getTotalCounts(right_pos2count)
                confident_ie_set.add("%s:%d-%d" % (chrom,
                                                   intron_end,
                                                   intron_end + 1))
            else:
                right_count = 0

            if left_count == 0 and right_count == 0:
                continue

            out_file.write("%s\t%d\t%d\n" % (intron_str,
                                             left_count,
                                             right_count))

        # Now print out confident set of ie
        for ie in confident_ie_set:
            confident_ie_file.write("%s\n" % ie)

    sys.exit(0)