def upstream_and_downstream_seq(args):
    """Write the 1000-base flanks of the region in ``args.coords`` to FASTA.

    The region is parsed with ``split_coords``.  The 1000 bases before its
    start and the 1000 bases after its end are fetched from ``args.genome``
    with ``pysam.faidx``.  The flank before the start is written unchanged
    to ``<args.directory>tmp/downstream.fa``; the flank after the end is
    reverse-complemented and written to ``<args.directory>tmp/upstream.fa``.
    The ``tmp/`` directory is created when it does not exist.

    Args:
        args: object providing ``coords``, ``genome`` and ``directory``.
            ``directory`` is concatenated directly with ``"tmp/"``, so it
            must already end with a path separator.
    """
    flank = 1000

    chromosome = split_coords(args.coords)[0]
    start = str(split_coords(args.coords)[1])
    # Region coordinates are 1-based; clamp so that a start closer than
    # ``flank`` to the beginning of the chromosome cannot produce a negative
    # position in the region string.
    downstream = str(max(1, int(start) - flank))
    # Double quotes are stripped before the end coordinate is parsed.
    end = str(split_coords(args.coords.replace('"', ""))[2])
    upstream = str(int(end) + flank)

    # Using the samtools faidx function to take the appropriate sequence
    # from a reference genome.
    downstream_fa = Seq(
        pysam.faidx(args.genome, chromosome + ":" + downstream + "-" + start),
        generic_dna)
    upstream_fa = Seq(
        pysam.faidx(args.genome, chromosome + ":" + end + "-" + upstream),
        generic_dna)

    # Skip the first line (the header) and the final character, then
    # convert to uppercase.
    downstream_header_len = len(downstream_fa.split('\n')[0])
    downstream_seq = downstream_fa[downstream_header_len:-1].upper()

    # Same selection for the other flank, followed by reverse complement.
    upstream_header_len = len(upstream_fa.split('\n')[0])
    reverse_compliment_upstream_seq = (
        upstream_fa[upstream_header_len:-1].upper().reverse_complement())

    # Making sequence records with ID header and sequence.
    downstream_seq = SeqRecord(downstream_seq, id="downstream_sequence")
    reverse_compliment_upstream_seq = SeqRecord(
        reverse_compliment_upstream_seq, id="upstream_sequence")

    tmp_dir = args.directory + "tmp/"
    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)

    # Writing sequences to fasta files.  The context managers guarantee the
    # data is flushed and the handles are closed before returning.
    with open(os.path.join(tmp_dir, "downstream.fa"), "w") as downstream_outfile:
        downstream_outfile.write(
            ">" + str(downstream_seq.id) + "\n" + str(downstream_seq.seq))
    with open(os.path.join(tmp_dir, "upstream.fa"), "w") as upstream_outfile:
        upstream_outfile.write(
            ">" + str(reverse_compliment_upstream_seq.id) + "\n"
            + str(reverse_compliment_upstream_seq.seq))
def location_of_binding(args, outlist, exon_list):
    """Label each binding-site line with its position relative to a region.

    Every line of ``outlist`` is tab-split; the fields at index 9 and 10 are
    read as the integer start and end of the site.  They are compared with
    the start and end returned by ``split_coords(args.coords)`` and one
    label is appended to the line (tab-separated):

    * ``Downstream`` - site start is below the region start
    * ``Upstream``   - site start is above the region end
    * ``circRNA``    - site lies strictly inside the region
    * ``BSJ``        - every remaining case (the site touches or spans a
      region boundary)

    If the site also lies within any entry of ``exon_list`` (each parsed
    with ``split_coords``), ``Exon Binding`` is appended as a further
    tab-separated field.

    Args:
        args: object providing ``coords``.
        outlist: iterable of tab-separated lines with at least 11 fields.
        exon_list: iterable of coordinate strings accepted by
            ``split_coords``.

    Returns:
        list of annotated lines, exactly one per input line, in input order.
    """
    region = split_coords(args.coords)
    region_start = region[1]
    region_end = region[2]

    # Parse every exon once instead of once per (line, exon) pair.
    exon_bounds = []
    for exon in exon_list:
        exon_coords_parsed = split_coords(exon)
        exon_bounds.append((exon_coords_parsed[1], exon_coords_parsed[2]))

    finaloutlist = []
    for line in outlist:
        col = line.split("\t")
        site_start = int(col[9])
        site_end = int(col[10])

        if site_start < region_start:
            label = "Downstream"
        elif site_start > region_end:
            label = "Upstream"
        elif site_start > region_start and site_end < region_end:
            label = "circRNA"
        else:
            # Reaching this point means region_start <= site_start <=
            # region_end and the site is not strictly inside the region, so
            # it sits on or crosses a boundary.
            label = "BSJ"
        line = line + "\t" + label

        # Flag exon binding at most once, however many exons contain the site.
        for exon_start, exon_end in exon_bounds:
            if site_start >= exon_start and site_end <= exon_end:
                line = line + "\t" + "Exon Binding"
                break

        # Each input line is emitted exactly once, fully annotated.
        finaloutlist.append(line)
    return finaloutlist
def exon_coords(args):
    """List the exons found between the region's start and end exon lines.

    The tab-separated file ``args.exon`` is scanned for the first ``exon``
    line on the region's chromosome whose field at index 3 equals the region
    start, and the first such line whose field at index 4 equals the region
    end (region taken from ``split_coords(args.coords)``).  Every ``exon``
    line in that inclusive span of the file is turned into a
    ``"<field0>:<field3>-<field4>"`` string.

    Args:
        args: object providing ``exon`` (path to the annotation file) and
            ``coords``.

    Returns:
        The exon coordinate strings after ``remove_duplicates``.

    Raises:
        IndexError: if no exon line matches the region start or region end.
    """
    region = split_coords(args.coords)
    chromosome = region[0]
    start_field = str(region[1])
    end_field = str(region[2])

    with open(args.exon) as handle:
        exon_file = handle.read().splitlines()

    # Only the first match of each boundary is used, so track just those.
    first_start_line = None
    first_end_line = None
    for position, line in enumerate(exon_file):
        col = line.split("\t")
        if col[0] != chromosome or col[2] != "exon":
            continue
        if first_start_line is None and col[3] == start_field:
            first_start_line = position
        if first_end_line is None and col[4] == end_field:
            first_end_line = position
        if first_start_line is not None and first_end_line is not None:
            break

    if first_start_line is None:
        raise IndexError(
            "no exon starting at %s on %s found in %s"
            % (start_field, chromosome, args.exon))
    if first_end_line is None:
        raise IndexError(
            "no exon ending at %s on %s found in %s"
            % (end_field, chromosome, args.exon))

    # Making list of exons composing circRNA
    exon_list = []
    for line in exon_file[first_start_line:first_end_line + 1]:
        col = line.split("\t")
        # Lines too short to carry exon coordinates are skipped.
        if len(col) > 4 and col[2] == "exon":
            exon_list.append(col[0] + ":" + col[3] + "-" + col[4])
    exon_list = remove_duplicates(exon_list)
    return exon_list
def intron_binding(args, finaloutlist):
    """Count entries whose site start lies outside the region.

    Each element of ``finaloutlist`` is tab-split and its field at index 9
    is read as an integer.  The element is counted when that value is below
    the start or above the end returned by ``split_coords(args.coords)``.

    Args:
        args: object providing ``coords``.
        finaloutlist: iterable of tab-separated lines with at least 10
            fields.

    Returns:
        int: number of counted elements.
    """
    # The region does not change between elements, so parse it only once.
    region = split_coords(args.coords)
    region_start = region[1]
    region_end = region[2]

    yes_count = 0
    for element in finaloutlist:
        site_start = int(element.split("\t")[9])
        if site_start < region_start or site_start > region_end:
            yes_count += 1
    return yes_count
def check_circBase(args, circBase):
    """Return the circBase lines whose coordinates equal the region exactly.

    Lines containing ``#`` anywhere are ignored.  Every other line is
    tab-split; it matches when field 0 equals the region chromosome and the
    integer values of fields 1 and 2 equal the region start and end, all
    taken from ``split_coords(args.coords)``.

    Args:
        args: object providing ``coords``.
        circBase: iterable of tab-separated lines.

    Returns:
        list of the matching lines, unmodified, in input order.
    """
    # The region does not change between lines, so parse it only once.
    region = split_coords(args.coords)
    chromosome = region[0]
    region_start = region[1]
    region_end = region[2]

    match = []
    for line in circBase:
        if "#" in line:
            continue
        col = line.split("\t")
        # Chromosome is tested first so the numeric fields are only parsed
        # on lines for the right chromosome.
        if chromosome == col[0] \
                and region_start == int(col[1]) \
                and region_end == int(col[2]):
            match.append(line)
    return match
def index_gtf(args):
    """Locate exon lines that start or end on the region's boundaries.

    The tab-separated file ``args.A`` is read fully into memory.  For each
    ``exon`` line on the chromosome of ``split_coords(args.ID)``:

    * if field 3 equals the region start, its line index goes into the
      first list;
    * otherwise, if field 4 equals the region end, its line index goes
      into the second list.

    The index recorded for a line is that of the first occurrence of
    identical text in the file, so exact duplicate lines repeat one index.

    Args:
        args: object providing ``A`` (path to the file) and ``ID``
            (coordinate string accepted by ``split_coords``).

    Returns:
        tuple ``(line_number1, line_number2, gtf_file)``: the two index
        lists and the list of all lines of the file.
    """
    region = split_coords(args.ID)
    chromosome = region[0]
    start_field = str(region[1])
    end_field = str(region[2])

    with open(args.A) as handle:
        gtf_file = handle.read().splitlines()

    line_number1 = []
    line_number2 = []
    # Maps the text of a matching line to the index of its first occurrence.
    # Identical lines always match the same way, so tracking matches alone
    # is enough and avoids rescanning the file for every hit.
    first_index = {}
    for position, line in enumerate(gtf_file):
        col = line.split("\t")
        if col[0] != chromosome or col[2] != "exon":
            continue
        if col[3] == start_field:
            line_number1.append(first_index.setdefault(line, position))
        elif col[4] == end_field:
            line_number2.append(first_index.setdefault(line, position))
    return line_number1, line_number2, gtf_file
def up_intron(args):
    """Look up the value paired with the region end in the annotation file.

    Each line of the tab-separated file ``args.annotation`` is split; when
    the integer in field 1 equals the end returned by
    ``split_coords(args.coords)``, the integer in field 2 is taken as the
    result.  If several lines match, the last one wins.

    Args:
        args: object providing ``annotation`` (file path) and ``coords``.

    Returns:
        int: field 2 of the last matching line (presumably the boundary of
        the upstream intron - confirm against the annotation format).

    Raises:
        ValueError: if no line matches the region end, or if field 1 or
            field 2 of a scanned line is not an integer.
    """
    with open(args.annotation, "r") as handle:
        annotation = handle.read().splitlines()

    # The region does not change between lines, so parse it only once.
    region_end = split_coords(args.coords)[2]

    upstream_intron = None
    for line in annotation:
        col = line.split("\t")
        if int(col[1]) == region_end:
            upstream_intron = int(col[2])

    if upstream_intron is None:
        raise ValueError(
            "no entry with end coordinate %s found in %s"
            % (region_end, args.annotation))
    return upstream_intron
def ENCORI(args, downstream_intron, upstream_intron):
    """Select the lines of ``args.infile`` that fall inside a window.

    Lines containing ``#`` anywhere are ignored.  Every other line is
    tab-split and kept when field 8 equals the chromosome returned by
    ``split_coords(args.coords)``, the integer in field 9 is at least
    ``downstream_intron + 1`` and the integer in field 10 is at most
    ``upstream_intron``.

    Args:
        args: object providing ``infile`` (file path) and ``coords``.
        downstream_intron: number; the window starts one position after it.
        upstream_intron: number; inclusive upper bound of the window.

    Returns:
        list of the selected lines, unmodified, in file order.
    """
    with open(args.infile, "r") as handle:
        records = handle.read().splitlines()

    # Loop invariants: parse the chromosome and window start only once.
    chromosome = split_coords(args.coords)[0]
    window_start = downstream_intron + 1

    outlist = []
    for line in records:
        if "#" in line:
            continue
        col = line.split("\t")
        # Chromosome is tested first so the numeric fields are only parsed
        # on lines for the right chromosome.
        if col[8] == chromosome \
                and int(col[9]) >= window_start \
                and int(col[10]) <= upstream_intron:
            outlist.append(line)
    return outlist