def upstream_and_downstream_seq(args, flank=1000):
    """Write the sequences flanking ``args.coords`` to two FASTA files.

    The region ``flank`` bases before the start coordinate is written to
    ``downstream.fa`` and the reverse complement of the region ``flank``
    bases after the end coordinate is written to ``upstream.fa``. Both files
    are created in ``args.directory + "tmp/"``, which is created if missing.

    Args:
        args: Object exposing ``coords`` (passed to ``split_coords``, which
            must return an indexable of chromosome, start, end), ``genome``
            (reference passed to ``pysam.faidx``) and ``directory`` (output
            prefix; ``"tmp/"`` is appended by plain string concatenation).
        flank: Number of bases to take on each side of the coordinates.

    Returns:
        None. The two FASTA files are the only output.
    """
    # Strip stray double quotes once so chromosome, start and end are all
    # parsed from the same cleaned string.
    parts = split_coords(args.coords.replace('"', ""))
    chromosome = parts[0]
    start = int(parts[1])
    end = int(parts[2])

    # samtools regions are 1-based, so never request a position below 1.
    downstream = max(1, start - flank)
    upstream = end + flank

    # Use the samtools faidx wrapper to pull both flanks from the reference.
    downstream_fa = Seq(
        pysam.faidx(args.genome,
                    "{}:{}-{}".format(chromosome, downstream, start)),
        generic_dna)
    upstream_fa = Seq(
        pysam.faidx(args.genome,
                    "{}:{}-{}".format(chromosome, end, upstream)),
        generic_dna)

    # Keep everything from the end of the first line of the faidx output
    # (presumably the FASTA header -- confirm) up to, but excluding, the final
    # character, then upper-case it.
    downstream_seq = downstream_fa[
        len(downstream_fa.split('\n')[0]):-1].upper()
    # Same extraction for the other flank, followed by a reverse complement.
    upstream_rc_seq = upstream_fa[
        len(upstream_fa.split('\n')[0]):-1].upper().reverse_complement()

    # Wrap both sequences in records so each carries its FASTA identifier.
    downstream_record = SeqRecord(downstream_seq, id="downstream_sequence")
    upstream_record = SeqRecord(upstream_rc_seq, id="upstream_sequence")

    tmp_dir = args.directory + "tmp/"
    if not os.path.isdir(tmp_dir):
        os.mkdir(tmp_dir)

    # Context managers guarantee the files are flushed and closed.
    with open(os.path.join(tmp_dir, "downstream.fa"), "w") as downstream_outfile:
        downstream_outfile.write(
            ">" + str(downstream_record.id) + "\n" + str(downstream_record.seq))
    with open(os.path.join(tmp_dir, "upstream.fa"), "w") as upstream_outfile:
        upstream_outfile.write(
            ">" + str(upstream_record.id) + "\n" + str(upstream_record.seq))
def exonic_circRNA(exon_list, args):
    """Concatenate the sequences of the given exons into one string.

    Every entry of ``exon_list`` is fetched from the reference ``args.G``
    with ``pysam.faidx``. When ``args.s`` is ``"-"`` each fetched sequence is
    reverse-complemented; when it is ``"+"`` it is used as is. For any other
    value of ``args.s`` the exon is fetched but contributes nothing.

    Args:
        exon_list: Iterable of region strings accepted by ``pysam.faidx``.
        args: Object exposing ``G`` (reference genome) and ``s`` (strand).

    Returns:
        str: The upper-cased, newline-free concatenation of all exons.
    """
    pieces = []
    for region in exon_list:
        fetched = Seq(pysam.faidx(args.G, region), generic_dna)
        strand = args.s
        if strand != "-" and strand != "+":
            continue
        # Keep everything after the first line of the faidx output, minus the
        # final character.
        first_line_len = len(fetched.split('\n')[0])
        body = fetched[first_line_len:-1].upper()
        if strand == "-":
            body = body.reverse_complement()
        pieces.append(str(body))
    # Collapse the result onto a single line.
    return "".join(pieces).replace("\n", "")
def find_frame(s):
    """Return the reading frame (0, 1 or 2) with the longest stop-free run.

    Gap characters (``-``) are removed, the sequence is translated in the
    three forward frames, and each translation is split on the stop symbol
    ``*``. The frame whose longest fragment is the longest overall wins. On a
    tie a warning is printed and the lowest frame index is returned.

    Args:
        s: Nucleotide sequence as a string; may contain ``-`` gap characters.

    Returns:
        int: 0-based offset of the winning frame.
    """
    s = s.replace("-", "")

    # Longest stop-free stretch of the translation for each forward frame.
    longest_per_frame = []
    for offset in range(3):
        protein = Seq(s[offset:], generic_dna).translate()
        longest_per_frame.append(max(len(x) for x in protein.split("*")))

    L_max = max(longest_per_frame)
    frames_max = [i for i, x in enumerate(longest_per_frame) if x == L_max]
    if len(frames_max) > 1:
        # Parenthesised form prints identically on Python 2 and Python 3.
        print("Warning: more than one reading frame had max length ORF")
    return frames_max[0]
def upstream_and_downstream_seq(
        coords,
        out_path,
        genome="/Users/lachlan/Documents/Masters_Research_2019/Data/ICM/genome.fa",
        flank=500):
    """Write the sequences flanking ``coords`` to two FASTA files.

    The region ``flank`` bases before the start coordinate is written to
    ``downstream.fa`` and the reverse complement of the region ``flank``
    bases after the end coordinate is written to ``upstream.fa``, both
    inside ``out_path``.

    Args:
        coords: Region string of the form ``"chromosome:start-end"``.
        out_path: Existing directory that receives the two FASTA files.
        genome: Reference FASTA handed to ``pysam.faidx``. Defaults to the
            path that was previously hard-coded in this function.
        flank: Number of bases to take on each side of the coordinates.

    Returns:
        None. The two FASTA files are the only output.
    """
    # Parse the region once instead of re-splitting it for every field.
    fields = coords.split(":")
    chromosome = fields[0]
    span = fields[1].split("-")
    start = int(span[0])
    end = int(span[1])

    # samtools regions are 1-based, so never request a position below 1.
    downstream = max(1, start - flank)
    upstream = end + flank

    # Use the samtools faidx wrapper to pull both flanks from the reference.
    downstream_fa = Seq(
        pysam.faidx(genome,
                    "{}:{}-{}".format(chromosome, downstream, start)),
        generic_dna)
    upstream_fa = Seq(
        pysam.faidx(genome,
                    "{}:{}-{}".format(chromosome, end, upstream)),
        generic_dna)

    # Keep everything from the end of the first line of the faidx output
    # (presumably the FASTA header -- confirm) up to, but excluding, the final
    # character, then upper-case it.
    downstream_seq = downstream_fa[
        len(downstream_fa.split('\n')[0]):-1].upper()
    # Same extraction for the other flank, followed by a reverse complement.
    upstream_rc_seq = upstream_fa[
        len(upstream_fa.split('\n')[0]):-1].upper().reverse_complement()

    # Wrap both sequences in records so each carries its FASTA identifier.
    downstream_record = SeqRecord(downstream_seq, id="downstream_sequence")
    upstream_record = SeqRecord(upstream_rc_seq, id="upstream_sequence")

    # Context managers guarantee the files are flushed and closed.
    with open(os.path.join(out_path, "downstream.fa"), "w") as downstream_outfile:
        downstream_outfile.write(
            ">" + str(downstream_record.id) + "\n" + str(downstream_record.seq))
    with open(os.path.join(out_path, "upstream.fa"), "w") as upstream_outfile:
        upstream_outfile.write(
            ">" + str(upstream_record.id) + "\n" + str(upstream_record.seq))
def randomprotein(percentage, length=10000):
    """Return the longest stop-free stretch in a random translated sequence.

    A random nucleotide sequence of ``length`` bases is generated in which G
    and C each have probability ``percentage / 200`` and A and T share the
    remainder equally. The sequence is translated and split on the stop
    symbol ``*``; the length of the longest resulting fragment is returned.

    Args:
        percentage: Target GC content in percent (number or numeric string).
        length: Number of nucleotides to generate.

    Returns:
        int: Length of the longest fragment between stop symbols.
    """
    gcpercent = float(percentage) / 100
    atpercent = 1 - gcpercent

    bases = []
    for _ in range(length):
        rand = random.uniform(0, 1)
        # The unit interval is cut into four consecutive slices: G, C, A, T.
        if rand < gcpercent / 2:
            bases.append("G")
        elif rand < gcpercent:
            bases.append("C")
        elif rand < 1 - (atpercent / 2):
            bases.append("A")
        else:
            bases.append("T")

    protein = Seq("".join(bases)).translate()
    # Take the maximum over all fragments. The previous loop reset its
    # running best on every iteration and so reported only the last fragment.
    return max(len(fragment) for fragment in protein.split("*"))
# Read a DNA sequence, transcribe it to RNA and store the reversed RNA.
with open("data\\dna_chromosome_1.seq", "r") as dnaSeq:
    my_seq = dnaSeq.read()
print(my_seq)
RNA = Seq(my_seq).transcribe()
rev = RNA[::-1]
print(rev)
with open("reverse_RNAsequence_1.seq", "w") as reverseRNASeq:
    reverseRNASeq.write(str(rev))

# Task 6: print every 1-based position at which t occurs inside s.
with open("sampleData.txt", "r") as mySeq:
    my_seq = mySeq.read()
# Strip first so a trailing newline in the file does not break the unpacking.
s, t = my_seq.strip().split("\n")
print("s: " + s)
print("t: " + t)
# The "+ 1" makes the last window (t aligned with the end of s) part of the
# search; without it a match at the final position is never reported.
for n in range(len(s) - len(t) + 1):
    if t == s[n:(len(t) + n)]:
        print(n + 1)

# Task 7
with open("seqSampleData.txt", "r") as mySeq:
    my_seq = mySeq.read()
# Transcribe to RNA and keep the result as a plain string.
RNA = str(Seq(my_seq).transcribe())
startCodon = "AUG"
# Fetch the complete sequence for this hit and append it to the scaffold file.
rawseq = retr_db(database, elt[0])
clen = len(rawseq)
scaf.write('>' + elt[0] + '_' + rec[2] + '\n' + rawseq + '\n')

# Widen the hit by three nucleotides per unaligned query residue on each
# side; fall back to the original coordinate when the widened one would leave
# the sequence. (Presumably sstart/send and qstart/qend/qlen are BLAST-style
# subject and query coordinates -- confirm against the caller.)
if sens == '+':
    widened_start = sstart - 3 * (qstart - 1)
    widened_end = send + 3 * (qlen - qend)
    start = widened_start if widened_start > 1 else sstart
    end = widened_end if widened_end < clen else send
elif sens == '-':
    widened_start = sstart - 3 * (qlen - qend)
    widened_end = send + 3 * (qstart - 1)
    start = widened_start if widened_start > 1 else sstart
    end = widened_end if widened_end < clen else send

# Space-separated debug line; this form prints identically on Python 2 and 3.
print(' '.join(str(value) for value in (
    elt[0], elt[2], sens, sstart, send, qstart, qend, start, end, clen)))

hdseq = Seq(rawseq[start - 1:end], IUPAC.unambiguous_dna)
if sens == '-':
    hdseq = hdseq.reverse_complement()
# NOTE(review): truncation at the first 'N' is applied to both strands --
# confirm this matches the intended nesting.
hdseq = hdseq.split('N')[0]

# Translate once and reuse the result for the debug print and the output.
protein = hdseq.translate()
print(protein)

# An earlier header variant used elt[0].split('|')[1] instead of the '|ND' part.
head = ('>' + prefix + '_' + elt[2].split('_')[1] + '|ND'
        + elt[0].split('_')[1] + '@' + '-'.join(map(str, elt[1])) + '\n')
# str() replaces the deprecated Seq.tostring() and yields the same text.
prot.write(head + str(protein) + '\n')
nucl.write(head + str(hdseq) + '\n')
except: d = None orf_sequence = defaultdict(list) for rec in orfs: orf_sequence[str(rec.seq)].append(rec.id) export = [] mapping = {} count = 1 for seq in orf_sequence: id = 'orf_sequence_{}'.format(str(count)) ids = orf_sequence[seq] for i in ids: mapping[i] = id description = ';'.join(ids) seq = Seq(''.join(seq.split('*'))) record = SeqRecord(id=id, description=description, seq=seq) export.append(record) count += 1 SeqIO.write(export, path + '/nr_translated_pg_orfs.fasta', 'fasta') outpath = path + '/id_mapping.json' mapping = json.dumps(mapping) with open(outpath, 'w') as f: f.write(mapping)