def dig_finder():
    """Load the pZX vector sequence and create an empty digest-statistics CSV.

    Two things happen: the first record of ``./pZX-vector.fa`` is read and
    upper-cased, and ``chunk_dig_stat.csv`` is created (or truncated) inside
    ``stat_dir``.  The thresholds and enzyme tables are defined here but are
    not consumed by any code visible in this function.
    """
    dirn = "./150126_BaG_Seq/"
    stat_name = "chunk_dig_stat.csv"
    stat_dir = "/Users/xuz02/Google_Drive/workspace/Python/150126_BaG_Seq/"

    vector_seq = Fasta("./pZX-vector.fa").Seqs[0].upper()
    vector_size = len(vector_seq)

    min_frag = 100
    max_dig = 5
    min_dig = 2

    # Enzyme name -> recognition site.  "AgeI" was listed twice with the same
    # site; the duplicate key has been dropped.
    enzyme_dict = {
        "EcoRI": "GAATTC",
        "NotI": "GCGGCCGC",
        "BamHI": "GGATCC",
        "HindIII": "AAGCTT",
        "KpnI": "GGTACC",
        "SacI": "GAGCTC",
        "SalI": "GTCGAC",
        "SpeI": "ACTAGT",
        "NheI": "GCTAGC",
        "AgeI": "ACCGGT",
        "BsaI": "GGTCTC",
        "EcoRV": "GATATC",
        "NcoI": "CCATGG",
        "PstI": "CTGCAG",
        "XbaI": "TCTAGA",
    }
    enzyme_dict_II = {"BsaI": "GGTCTC", "BsmBI": "CGTCTC"}

    # Opening in "w" mode creates/truncates the file; nothing is written yet.
    with open(stat_dir + stat_name, "w"):
        pass
def chr04_pcrtag_stat():
    """Stat the pcrtags over the minichunks.

    ``PCRtags_syn.csv`` is split on carriage returns and commas into a flat
    list that is consumed as (name, sequence) pairs.  Each primer is looked
    up in every upper-cased record of ``synIV_mega.fa`` and one
    ``record, name, name, position`` row per hit is written to
    ``chr04_pcrtag_stat.csv``.
    """
    import re

    dirn = "/workspace/Python/161212_megachunk_csPCR/"
    primers = "PCRtags_syn.csv"
    mega = "synIV_mega.fa"
    output = "chr04_pcrtag_stat.csv"

    mega_f = Fasta(dirn + mega)
    # The primer file used to stay open for the life of the process.
    with open(dirn + primers, "r") as primers_fp:
        primer_ls = re.split("\r|,", primers_fp.read())

    # Floor division keeps the pair count an int on Python 3 too.
    size = len(primer_ls) // 2
    stats = []
    for i in range(size):
        tag_name = primer_ls[2 * i]
        raw_seq = primer_ls[2 * i + 1]
        if "synF" in tag_name:
            p_seq = raw_seq.upper()
        elif "\n" not in raw_seq:
            # Non-"synF" primers are searched as their reverse complement.
            p_seq = Seq_Analyzer(raw_seq).rcSeq().upper()
        # NOTE(review): when neither branch runs, p_seq keeps the value of the
        # previous iteration (or is unbound on the first one) -- confirm
        # whether such entries should be skipped instead.
        for j in range(len(mega_f.Names)):
            sq = mega_f.Seqs[j].upper()
            if p_seq in sq:
                # NOTE(review): the primer name is stored in both middle
                # columns -- confirm that this is intended.
                stats.append([mega_f.Names[j], tag_name, tag_name,
                              sq.find(p_seq)])
        print("%s done!" % tag_name)

    with open(dirn + output, "w") as op:
        for item in stats:
            op.write("%s, %s, %s, %d\n" % (item[0], item[1], item[2], item[3]))
    mega_f.close()
def info(args):
    """Print the headers and lengths of each given fasta file.

    >>> info(['tests/data/three_chrs.fasta'])
    <BLANKLINE>
    tests/data/three_chrs.fasta
    ===========================
    >chr3 length:3600
    >chr2 length:80
    >chr1 length:80
    <BLANKLINE>
    3760 basepairs in 3 sequences
    """
    usage = """\
   print headers and lengths of the given fasta file in order of length. e.g.:
        pyfasta info --gc some.fasta"""
    parser = optparse.OptionParser(usage)
    parser.add_option("-n", "--n", type="int", dest="nseqs",
                      help="max number of records to print. use -1 for all",
                      default=20)
    parser.add_option("--gc", dest="gc", help="show gc content",
                      action="store_true", default=False)
    options, fastas = parser.parse_args(args)
    if not fastas:
        sys.exit(parser.print_help())

    import operator
    by_length = operator.itemgetter(1)

    for fasta in fastas:
        f = Fasta(fasta)
        records = [(k, len(seq)) for k, seq in f.iteritems()]
        total_len = sum(length for _, length in records)
        nseqs = len(f)

        if options.nseqs > -1:
            # Longest first, truncated to the requested number of records.
            records = sorted(records, key=by_length,
                             reverse=True)[:options.nseqs]
        else:
            # -1 means "all"; plain tuple ordering sorts by key.
            records.sort()

        print("\n" + fasta)
        print("=" * len(fasta))
        for k, l in records:
            if options.gc:
                seq = str(f[k]).upper()
                gc_pct = 100.0 * (seq.count('G') + seq.count('C')) / float(l)
                gc = "gc:%.2f%%" % gc_pct
            else:
                gc = ""
            print((">%s length:%i " % (k, l)) + gc)

        if total_len > 1000000:
            total_len = "%.3fM" % (total_len / 1000000.)
        print()
        print("%s basepairs in %i sequences" % (total_len, nseqs))
def cds_fasta(self,genomefasta_path,cds_outpath):
    """Write the concatenated CDS sequence of each annotated CDS id to FASTA.

    Records from ``self.all_gff()`` are walked in iteration order.  An
    ``mRNA`` record resets the running sequence; every ``CDS`` record appends
    its genomic slice and stores the running sequence under the id captured
    by ``self.cds_pattern``.  Minus-strand entries are reverse-complemented
    and the result is written to ``cds_outpath`` sorted by id.

    :param genomefasta_path: path of the genome FASTA file.
    :param cds_outpath: path of the FASTA file to create.
    """
    genome = Fasta(genomefasta_path)
    genome.readFasta()
    genomeDict = genome._fasta

    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    forward = {}
    reverse = {}
    # Initialised up front so that a CDS record appearing before any mRNA
    # record no longer raises NameError.
    running = ''
    for record in self.all_gff().values():
        feature = record.feature()
        if feature == "mRNA":
            running = ''
        elif feature == "CDS":
            start, end = int(record.start()), int(record.end())
            key = self.cds_pattern.search(record.attribute()).group("id")
            # GFF coordinates are used as 1-based inclusive here.
            running += genomeDict[record.seqid()].get_seq()[start-1:end]
            if record.strand() == "+":
                forward[key] = running
            else:
                reverse[key] = running

    # Minus-strand entries override any plus-strand entry with the same id,
    # as before.  Characters other than A/T/C/G are left untouched.
    for key, seq in reverse.items():
        forward[key] = ''.join(complement.get(base, base)
                               for base in reversed(seq))

    with open(cds_outpath, 'w') as CDS:
        for key in sorted(forward):
            CDS.writelines(fasta_record(key, forward[key]).fasta_parse())
def linkerPCR_primers():
    """ Make Linker PCR Primers.

    For every chunk in ``yeast_chr01_chunks.FA`` this writes
    ``pZX4_<name[:19]>.fasta`` (chunk followed by the first backbone record)
    and appends one ``name, forward, reverse`` row to ``liner_primers.csv``.
    The forward primer is the last ``right`` bases of the chunk plus a fixed
    tail; the reverse primer is ``rcSeq()`` of the first ``left`` bases plus
    a fixed tail.
    """
    chunks = "yeast_chr01_chunks.FA"
    dirn = "/Users/xuz02/Google_Drive/workspace/Python/150218_Leslie/"
    bacbone = "pZX4_lin.fa"
    output = "liner_primers.csv"
    left = 58
    right = 57
    reverse = "ggccggccccagcttttgttc"
    forward = "cggccggccctatagtgagtcg"

    backbone_seq = Fasta(dirn + bacbone).Seqs[0]
    c_fasta = Fasta(dirn + chunks)

    # Context managers guarantee both files are closed if a chunk fails.
    with open(dirn + output, "w") as o_fp:
        o_fp.write("Name, Forward Primer, Reverse Primer\n")
        for name, chunk_seq in zip(c_fasta.Names, c_fasta.Seqs):
            f_primer = chunk_seq[-right:] + forward
            r_primer = Seq_Analyzer(chunk_seq[:left]).rcSeq() + reverse
            short_name = name[:19]
            with open(dirn + "pZX4_" + short_name + ".fasta", "w") as fp:
                fp.write(">%s\n" % name)
                fp.write(chunk_seq + backbone_seq)
            o_fp.write("%s, %s, %s\n" % (short_name, f_primer, r_primer))
def main():
    """Count differences between same-named sequences of two FASTA files.

    Sequences are matched on the first whitespace-delimited token of the
    header and compared case-insensitively.  The count (mismatching
    positions over the common length plus the length difference) is printed
    to stdout; the ids with at least one mismatching position are printed,
    sorted and tab-separated, to stderr.

    Raises KeyError when a header of the second file is absent from the first.
    """
    opts = options()

    fasta1 = Fasta(opts.fasta1)
    dScafs1 = {}
    for header, seq in zip(fasta1.headers, fasta1.seqs):
        dScafs1[header.split()[0]] = seq.upper()

    cnt = 0
    ids = set()
    fasta2 = Fasta(opts.fasta2)
    for header, seq in zip(fasta2.headers, fasta2.seqs):
        header = header.split()[0]
        query = seq.upper()
        reference = dScafs1[header]
        # zip() stops at the shorter sequence, like the old min() bound.
        mismatches = sum(1 for a, b in zip(query, reference) if a != b)
        if mismatches:
            # A pure length difference does not flag the id (as before).
            ids.add(header)
        cnt += mismatches + abs(len(query) - len(reference))

    # Parenthesised single-argument print and an explicit stderr write are
    # valid on both Python 2 and Python 3.
    print(cnt)
    sys.stderr.write('\t'.join(sorted(ids)) + '\n')
def extract(args):
    """Print selected sequences from a fasta file.

    With ``--file`` the first positional argument names a file holding one
    sequence name per line; with ``--exclude`` every sequence except the
    listed ones is printed, in sorted key order.

    >>> extract(['--fasta', 'tests/data/three_chrs.fasta', 'chr2'])
    TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT
    """
    parser = optparse.OptionParser("""extract some sequences from a fasta file. e.g.:
               pyfasta extract --fasta some.fasta --header at2g26540 at3g45640""")
    parser.add_option("--fasta", dest="fasta", help="path to the fasta file")
    parser.add_option("--header", dest="header", help="include headers",
                      action="store_true", default=False)
    parser.add_option("--exclude", dest="exclude",
                      help="extract all sequences EXCEPT those listed",
                      action="store_true", default=False)
    parser.add_option("--file", dest="file",
                      help=("if this flag is used, the sequences to extract"
                            " are read from the file specified in args"),
                      action="store_true", default=False)
    parser.add_option("--space", dest="space", action="store_true",
                      help="use the fasta identifier only up to the space as the key",
                      default=False)
    options, seqs = parser.parse_args(args)
    if not (options.fasta and len(seqs)):
        sys.exit(parser.print_help())

    key_fn = (lambda k: k.split()[0]) if options.space else None
    f = Fasta(options.fasta, key_fn=key_fn)

    if options.file:
        # Read the names eagerly so the handle is closed right away; it was
        # previously left open behind a generator.
        with open(seqs[0]) as names_fh:
            seqs = [x.strip() for x in names_fh]
    if options.exclude:
        seqs = sorted(frozenset(f.iterkeys()).difference(seqs))

    for seqname in seqs:
        seq = f[seqname]
        # Single-argument print() behaves the same on Python 2 and 3.
        if options.header:
            print(">%s" % seqname)
        print(seq)
def compare_CDS():
    """Load the codon table and the wild-type / synthetic chr04 gene sets.

    Only the loading is visible in this function; no comparison is performed
    in the code shown.
    """
    import os
    import Codon
    import re

    codon_table = Codon.c_table()
    dirn = "/Workplace/Python/PCRtagChange"
    # dirn has no trailing separator, so plain concatenation produced
    # ".../PCRtagChangeyeast_chr04_...".  Join the components instead.
    wt = Fasta(os.path.join(dirn, "yeast_chr04_0_00_genes.fa"))
    syn = Fasta(os.path.join(dirn, "yeast_chr04_3_66_genes.fa"))
def chr01_pcrtag_stat():
    """Stat the pcrtags over the minichunks.

    ``csPCR_primers.csv`` is split on carriage returns and commas into a flat
    list consumed three fields at a time.  The lower-cased second field is
    searched in every record of ``synI_mini_chunks.fasta`` and each hit is
    written to ``chr01_pcrtag_stat.csv`` as
    ``record, first field, third field, position``.
    """
    import re

    dirn = "./160521_JL/"
    primers = "csPCR_primers.csv"
    miniC = "synI_mini_chunks.fasta"
    output = "chr01_pcrtag_stat.csv"

    miniC_f = Fasta(dirn + miniC)
    # The primer file used to be left open.
    with open(dirn + primers, "r") as primers_fp:
        primer_ls = re.split("\r|,", primers_fp.read())

    # Floor division: a float here would make range() fail on Python 3.
    size = len(primer_ls) // 3
    stats = []
    for i in range(size):
        first_col = primer_ls[3 * i]
        third_col = primer_ls[3 * i + 2]
        # The search is case-sensitive against the raw records.
        # NOTE(review): presumably the chunk file is lower-case -- confirm.
        p_seq = primer_ls[3 * i + 1].lower()
        for chunk_name, chunk_seq in zip(miniC_f.Names, miniC_f.Seqs):
            if p_seq in chunk_seq:
                stats.append([chunk_name, first_col, third_col,
                              chunk_seq.find(p_seq)])

    with open(dirn + output, "w") as op:
        for item in stats:
            op.write("%s, %s, %s, %d\n" % (item[0], item[1], item[2], item[3]))
    miniC_f.close()
def main():
    """Print every record of the FASTA file named by ``opts.fasta``.

    Each record is emitted as a ``>header`` line followed by the value of
    ``fasta.seq(i)``.
    """
    opts = options()
    fasta = Fasta(opts.fasta)
    # range() and single-argument print() work on both Python 2 and 3; the
    # previous xrange / print-statement form was Python 2 only.
    for i in range(len(fasta.headers)):
        print(">%s" % fasta.header(i))
        print(fasta.seq(i))
def execute():
    """Print the first two sequences of ``rosalind_tran.txt`` and their ratio.

    Both sequences are upper-cased before being handed to ``compute_ratio``.
    """
    segments = Fasta( "rosalind_tran.txt" ).get_segments()
    s1, s2 = [segments[i].get_sequence().upper() for i in (0, 1)]
    for label, sequence in (("s1", s1), ("s2", s2)):
        print( label, sequence )
    ratio = compute_ratio( s1, s2 )
    print( "ratio=" + str(ratio) )
def readinput( path ):
    """ Read input from FASTA file.

    Returns the sequences of the first two segments as a ``(s, t)`` tuple.
    """
    segments = Fasta( path ).get_segments()
    return tuple( segments[i].get_sequence() for i in (0, 1) )
def execute():
    """Find restriction sites in ``rosalind_revp.txt`` and report them.

    The first sequence of the file is passed to ``find_restrictions``; the
    first two items of every result are printed and also written, one
    space-separated pair per line, to ``output_revp.txt``.
    """
    fasta = Fasta( "rosalind_revp.txt" )
    dna = fasta.get_segments()[0].get_sequence()
    restrictions = find_restrictions( dna )
    # The output file was never closed before; the context manager makes
    # sure buffered lines reach the disk.
    with open( "output_revp.txt", "w" ) as output:
        for site in restrictions:
            line = str(site[0]) + " " + str(site[1])
            print( line )
            output.write( line + "\n" )
def readinput( path ):
    """ Read all the segments from FASTA file.

    Every segment is echoed as its header, a newline and its sequence.
    Return Fasta object.
    """
    fasta = Fasta( path )
    for seg in fasta.get_segments():
        seq = seg.get_sequence()
        print( seg.get_header() + "\n" + seq )
    return fasta
def info(args):
    """Print the headers and lengths of each given fasta file.

    Records are listed longest first (at most ``-n`` of them); with ``-n -1``
    all records are listed in key order.  ``--gc`` adds the GC percentage.

    >>> info(['tests/data/three_chrs.fasta'])
    <BLANKLINE>
    tests/data/three_chrs.fasta
    ===========================
    >chr3 length:3600
    >chr2 length:80
    >chr1 length:80
    <BLANKLINE>
    3760 basepairs in 3 sequences
    """
    parser = optparse.OptionParser("""\
   print headers and lengths of the given fasta file in order of length. e.g.:
        pyfasta info --gc some.fasta""")
    parser.add_option("-n", "--n", type="int", dest="nseqs",
                      help="max number of records to print. use -1 for all",
                      default=20)
    parser.add_option("--gc", dest="gc", help="show gc content",
                      action="store_true", default=False)
    options, fastas = parser.parse_args(args)
    if not fastas:
        sys.exit(parser.print_help())

    import operator

    # Every print below is a single-argument call, so the output is the
    # same on Python 2 and Python 3 (the statement form was Python 2 only).
    for fasta in fastas:
        f = Fasta(fasta)
        lengths = [(k, len(seq)) for k, seq in f.iteritems()]
        total_len = sum(l for k, l in lengths)
        nseqs = len(f)
        if options.nseqs > -1:
            lengths = sorted(lengths, key=operator.itemgetter(1), reverse=True)
            lengths = lengths[:options.nseqs]
        else:
            lengths.sort()

        print("\n" + fasta)
        print("=" * len(fasta))
        for k, l in lengths:
            gc = ""
            if options.gc:
                seq = str(f[k]).upper()
                g = seq.count('G')
                c = seq.count('C')
                gc = "gc:%.2f%%" % (100.0 * (g + c) / float(l))
            print((">%s length:%i " % (k, l)) + gc)

        if total_len > 1000000:
            total_len = "%.3fM" % (total_len / 1000000.)
        print("")
        print("%s basepairs in %i sequences" % (total_len, nseqs))
def outputFasta(fishPath, Bait, Except):
    """ output fasta from fish pool

    Prints the records whose ID has a truthy entry in ``Bait``; when
    ``Except`` is truthy the selection is inverted.
    """
    records = Fasta(fishPath).readFasta()
    invert = bool(Except)
    for ID, record in records.items():
        line_out = record.fasta_parse()
        in_bait = bool(Bait.get(ID))
        # Print baited records normally, non-baited ones when inverted.
        if in_bait != invert:
            print(line_out.strip())
def cons(input_string):
    '''http://rosalind.info/problems/cons/

    Prints the consensus sequence followed by one "key: counts" line per
    row of the profile.
    '''
    fasta = Fasta(input_string)
    reads = [read_string for _, read_string in fasta.all()]
    profile = consensus_profile(reads)
    sequence = consensus_sequence(profile)

    print_lines = [''.join(sequence)]
    for i, read in enumerate(profile):
        counts = ' '.join(map(str, read))
        print_lines.append('{}: {}'.format(PROFILE_MATRIX_KEY_INDEXES[i], counts))
    print('\n'.join(print_lines))
def fill_bins(fasta_filename, index, out, prefix):
    """Append each indexed sequence to the FASTA file of every bin it is in.

    ``index`` maps a sequence id to an iterable of groups; a sequence goes
    to ``<out>/<prefix><group>.fa`` for each of its groups.  Ids that are
    missing from ``index`` are ignored.
    """
    fasta = Fasta(fasta_filename)
    for sequence in fasta.read():
        seq_id = sequence["id"]
        if seq_id not in index:
            continue
        # Files are opened in append mode, so bins accumulate across calls.
        for group in index[seq_id]:
            bin_path = "{}/{}{}.fa".format(out, prefix, group)
            with open(bin_path, "a") as fw:
                fw.write(">{}\n{}\n".format(seq_id, sequence["value"]))
def test_select_enzyme():
    """Run ``select_restriction_enyzme`` on a folder and dump the results.

    The first record of every file whose name ends in "fasta" (any case) is
    collected.  One ``re_<enzyme name>.csv`` is written per selected enzyme
    with a row of band sizes per sequence, and the names indexed by the
    fourth return value go to ``non_verified.csv``.
    """
    dirn = "/Users/xuz02/Google_Drive/workspace/Python/test_dig/"
    seq_name_ls = []
    seq_ls = []
    for fn in os.listdir(dirn):
        if fn[-5:].upper() != "FASTA":
            continue
        fa = Fasta(dirn + fn)
        seq_ls.append(fa.Seqs[0])
        seq_name_ls.append(fa.Names[0])

    enzymes, dig_band_ls, re_name_ls, n_verified_ls = \
        select_restriction_enyzme(seq_ls)

    # Parenthesised so the call is valid on both Python 2 and Python 3.
    print(enzymes)

    # Context managers close each CSV even if a row fails to format.
    for enzyme in enzymes:
        with open(dirn + "re_" + re_name_ls[enzyme] + ".csv", "w") as fop:
            for n, seq_name in enumerate(seq_name_ls):
                fop.write("%s, " % seq_name)
                for band in dig_band_ls[n][enzyme]:
                    fop.write("%d, " % band)
                fop.write("\n")

    with open(dirn + "non_verified.csv", "w") as fop:
        for i in n_verified_ls:
            fop.write(seq_name_ls[i] + "\n")
def main():
    """Apply the substitutions of a gzipped SNP table to a FASTA file.

    Columns used from each tab-separated line: 0 = scaffold id (reduced to
    its first two "_"-separated parts), 2 = 1-based position, 3 = original
    base, 4 = new base.  Lines with "." in column 3 or 4 are skipped.  A
    mismatch between the table and the sequence is reported on stderr but
    the substitution is applied anyway.  The edited sequences go to stdout
    and the substitution count to stderr.
    """
    opts = options()

    fasta = Fasta(opts.fasta)
    dScafs = {}
    for header, seq in zip(fasta.headers, fasta.seqs):
        # Lists of characters so single positions can be replaced in place.
        dScafs[header.split()[0]] = [nt.upper() for nt in seq]

    cnt = 0
    with gzip.open(opts.snps) as handle:
        for line in handle:
            if not isinstance(line, str):
                # gzip's default binary mode yields bytes on Python 3.
                line = line.decode()
            items = line.strip().split('\t')
            scafId, pos = items[0], int(items[2])
            scafId = '_'.join(scafId.split("_")[:2])
            if items[3] == "." or items[4] == ".":
                continue
            fromNt, toNt = items[3].upper(), items[4].upper()
            bases = dScafs[scafId]
            if bases[pos - 1].upper() != fromNt:
                sys.stderr.write("Does not match!!! %s vs %s\n"
                                 % (fromNt, bases[pos - 1]))
            # Both branches of the old if/else performed this assignment.
            bases[pos - 1] = toNt
            cnt += 1

    for key in dScafs:
        print(">%s\n%s" % (key, ''.join(dScafs[key])))
    sys.stderr.write("SNP count: %d\n" % cnt)
def batch_PCR_Primers_at_End():
    """Design end primers for every minichunk and save them as CSV.

    Every file in ``folder`` whose name contains "fasta" is loaded and each
    record is passed to ``Seq_Analyzer.Find_Primer_at_Ends``.  For each
    index ``n`` of ``Tm_ls`` with a truthy ``Tm_bin[n]`` one row is written:
    the name without its last character, items 0-3 of the forward primer
    entry, items 0-3 of the reverse primer entry and item 4 of the reverse
    entry, each followed by a comma.
    """
    folder = "/workspace/Python/170116_SynIV/minichunks/"
    savefile = "/workspace/Python/170116_SynIV/primers.csv"
    L_res = 2
    R_res = 2
    # A concrete list, so the helper receives the same type on Python 2
    # and Python 3.
    Tm_ls = list(range(52, 59))
    minLen = 20
    maxLen = 50

    primer_ls = []
    for fn in os.listdir(folder):
        if "fasta" not in fn:
            continue
        fa = Fasta(folder + fn)
        for name, seq in zip(fa.Names, fa.Seqs):
            F_primer, R_primer, Tm_bin = Seq_Analyzer(seq).\
                Find_Primer_at_Ends(L_res, R_res, Tm_ls, minLen, maxLen)
            primer_ls.append((name, F_primer, R_primer, Tm_bin))

    # The output handle was never closed before.
    with open(savefile, "w+") as output:
        for name, F_primer, R_primer, Tm_bin in primer_ls:
            for n in range(len(Tm_ls)):
                if not Tm_bin[n]:
                    continue
                # NOTE(review): name[:-1] drops the last character of the
                # record name -- presumably a trailing newline; confirm.
                fields = [str(name[:-1])]
                fields.extend(str(F_primer[n][j]) for j in range(4))
                fields.extend(str(R_primer[n][j]) for j in range(4))
                fields.append(str(R_primer[n][4]))
                output.write(",".join(fields) + ",\n")
def execute():
    """Print ``hamm_dist`` of the first two sequences of the input file.

    Both sequences are upper-cased first; an Exception is raised when their
    lengths differ.
    """
    from fasta import Fasta

    # NOTE(review): the file name says "tran" although this computes a
    # Hamming distance -- confirm the intended input file.
    segments = Fasta( "rosalind_tran.txt" ).get_segments()
    s, t = [segments[i].get_sequence().upper() for i in (0, 1)]
    if len(s) != len(t):
        raise Exception( "lengths do not match" )
    print( hamm_dist(s,t) )
def extract(args):
    """Print selected sequences from a fasta file.

    ``--file`` reads the names from the file given as first positional
    argument; ``--exclude`` prints everything but the listed names, in
    sorted key order; ``--header`` adds a ``>name`` line before each
    sequence.

    >>> extract(['--fasta', 'tests/data/three_chrs.fasta', 'chr2'])
    TAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAT
    """
    parser = optparse.OptionParser(
        """extract some sequences from a fasta file. e.g.:
               pyfasta extract --fasta some.fasta --header at2g26540 at3g45640"""
    )
    parser.add_option("--fasta", dest="fasta", help="path to the fasta file")
    parser.add_option("--header", dest="header", help="include headers",
                      action="store_true", default=False)
    parser.add_option("--exclude", dest="exclude",
                      help="extract all sequences EXCEPT those listed",
                      action="store_true", default=False)
    parser.add_option("--file", dest="file", help=\
                      "if this flag is used, the sequences to extract" \
                      " are read from the file specified in args",
                      action="store_true", default=False)
    parser.add_option("--space", dest="space", action="store_true", help=\
                      "use the fasta identifier only up to the space as the key",
                      default=False)

    options, seqs = parser.parse_args(args)
    if not (options.fasta and len(seqs)):
        sys.exit(parser.print_help())

    key_fn = (lambda k: k.split()[0]) if options.space else None
    f = Fasta(options.fasta, key_fn=key_fn)

    if options.file:
        # The name list is materialised inside a context manager; the old
        # generator kept the file handle open indefinitely.
        with open(seqs[0]) as listing:
            seqs = [x.strip() for x in listing]
    if options.exclude:
        seqs = sorted(frozenset(f.iterkeys()).difference(seqs))

    for seqname in seqs:
        seq = f[seqname]
        if options.header:
            print(">%s" % seqname)
        print(seq)
def openfasta(self, fn):
    """Load the first record of a FASTA file into the sequence-analyzer tab.

    ``fn`` is used as a path when its class is named "str"; any other
    object is resolved to ``self.dirn + "/" + str(fn.text())``.  The record
    name and sequence are pushed to the widgets, ``self.dirn`` is updated to
    the file's directory and the analyzer tab is brought to the front.
    """
    given_as_path = fn.__class__.__name__ == "str"
    if not given_as_path:
        fn = self.dirn + "/" + str(fn.text())

    loaded = Fasta(fn)
    first_seq = loaded.Seqs[0]
    self.lName.setText(loaded.Names[0])
    self.currentSeq = first_seq
    self.SeqEdit.setPlainText(first_seq)
    self.dirn = os.path.dirname(fn)
    self.tabWidget.setCurrentWidget(self.tab_seq_analyzer)
def flatten(args):
    """Open every given fasta file with ``flatten_inplace=True``.

    >>> flatten(['tests/data/three_chrs.fasta'])
    """
    usage = """flatten a fasta file *inplace* so all later access by pyfasta will use that flattend (but still viable) fasta file"""
    parser = optparse.OptionParser(usage)
    _, paths = parser.parse_args(args)
    for path in paths:
        # Constructed only for its side effect; the object itself is unused.
        f = Fasta(path, flatten_inplace=True)
def length_reporter():
    """Write ``name,length,`` rows for the records of the TWIST order file.

    The order directory is scanned and only the entry named
    ``20150819_TWIST_LabOrder.txt`` is parsed; its lengths are written to
    ``length_stat_0819.csv`` in the same directory.
    """
    dirn = "/Users/xuz02/Google_Drive/Project Data/ORDERS/"
    stat = "length_stat_0819.csv"
    # The context manager closes the report even if parsing fails.
    with open(dirn + stat, "w") as stat_fp:
        for fn in os.listdir(dirn):
            if fn != "20150819_TWIST_LabOrder.txt":
                continue
            fasta = Fasta(dirn + fn)
            for name, seq in zip(fasta.Names, fasta.Seqs):
                stat_fp.write(name + "," + str(len(seq)) + "," + "\n")
def n50(self, assembly):
    """Return the N50 of the assembly, or None if it cannot be determined.

    N50 is the length of the shortest sequence such that sequences of that
    length or longer contain at least half of ``fasta.totalLen``.

    :param assembly: path of the assembly FASTA file (formatted with %s).
    """
    fasta = Fasta("%s" % assembly)
    total = fasta.totalLen
    running = 0
    # Walk from the longest sequence down.  The previous ascending walk
    # returned the shorter neighbour whenever the halfway point fell exactly
    # between two sequences.
    for length in sorted((len(seq) for seq in fasta.seqs), reverse=True):
        running += length
        # Doubling the running sum avoids dividing an odd total, which
        # Python 2 integer division rounded down.
        if 2 * running >= total:
            return length
    return None
def main():
    """Report details for the scaffold and gene FASTA files of a genome.

    The reference genome is optional: if it cannot be read (IOError) the
    reference size and sequence count stay 0.  Ids listed in the
    contamination file (first tab-separated column, "#" lines ignored) are
    removed from the gene set before it is reported.
    """
    opts = options()

    refSize, refSeqCnt = 0, 0
    try:
        ref = Fasta(opts.refGen)
        refSize, refSeqCnt = ref.totalLen, len(ref.headers)
    except IOError:
        # Deliberate best effort: a missing reference is not an error.
        pass

    geneIds = set()
    if opts.contamination != "":
        with open(opts.contamination, 'r') as handle:
            for line in handle:
                myId = line.strip().split('\t')[0]
                # Blank lines give an empty id; indexing myId[0] on those
                # used to raise IndexError.
                if myId and not myId.startswith('#'):
                    geneIds.add(myId)
    geneIds = list(geneIds)

    myGeneIds = None
    genomeSize = None
    if opts.scafs != '':
        fastaS = Fasta(opts.scafs)
        genomeSize = detailsFasta("Genome Scaffolds", fastaS, refSize,
                                  refSeqCnt)
    if opts.genes != '':
        fastaG = Fasta(opts.genes)
        fastaG.rmGenes(geneIds)
        myGeneIds = set([header.split()[0] for header in fastaG.headers])
        detailsFasta("Genome genes", fastaG)
def renameFasta(self, iName, oName, mName, prefix="scaf"):
    """Rewrite a FASTA file with sequential headers and save the mapping.

    Record number ``k`` (starting at 1) gets the header ``<prefix><k>s``.

    :param iName: input FASTA path.
    :param oName: output FASTA path with the new headers.
    :param mName: output path of the tab-separated "new<TAB>old" mapping.
    :param prefix: text placed before the running number.
    """
    fasta = Fasta(iName)
    with open(mName, 'w') as handleM, open(oName, 'w') as handleW:
        # enumerate/zip replaces the Python 2 only xrange index loop and the
        # hand-maintained counter.
        for cnt, (header, seq) in enumerate(zip(fasta.headers, fasta.seqs), 1):
            newHeader = "%s%ds" % (prefix, cnt)
            handleW.write(">%s\n" % newHeader)
            handleW.write("%s\n" % seq)
            handleM.write("%s\t%s\n" % (newHeader, header))
def mRNA_fasta(self,genomefasta_path,mRNAfasta_path):
    """Write the genomic sequence of every mRNA feature to a FASTA file.

    For each ``mRNA`` record of ``self.all_gff()`` the slice
    ``[start-1:end]`` of its sequence is taken; records not on the "+"
    strand are reverse-complemented.  Entries are keyed by the id captured
    by ``self.mRNA_pattern``.

    :param genomefasta_path: path of the genome FASTA file.
    :param mRNAfasta_path: path of the FASTA file to create.
    """
    genome = Fasta(genomefasta_path)
    genome.readFasta()
    genomeDict = genome._fasta

    complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    forward = {}
    for record in self.all_gff().values():
        if record.feature() != "mRNA":
            continue
        start,end = int(record.start()),int(record.end())
        key = self.mRNA_pattern.search(record.attribute()).group("id")
        seq = genomeDict[record.seqid()].get_seq()[start-1:end]
        if record.strand() != "+":
            # Reverse complement; characters other than A/T/C/G are kept
            # unchanged, as with the earlier str.format based substitution.
            seq = ''.join(complement.get(base, base)
                          for base in reversed(seq))
        forward[key] = seq

    # Context manager closes the output even if a record fails to format.
    with open(mRNAfasta_path,'w') as mRNAfasta:
        for key in forward:
            mRNAfasta.writelines(fasta_record(key,forward[key]).fasta_parse())
def DL_search_primers():
    """Design tiling PCR primers for PigL and save them as CSV.

    The first records of the four ``pig*_3kbFLK.fa`` files are concatenated
    into the reference; the ``pigl`` record is the design target.  Forward
    primers are echoed to stdout, and both forward and reverse primers are
    written (first five fields each) to ``pigL_primers_v3.csv``.
    """
    dirn = "./150314_DL_PIG/"
    fn1 = "pign_3kbFLK.fa"
    fn2 = "piga_3kbFLK.fa"
    fn3 = "pigl_3kbFLK.fa"
    fn4 = "pigk_3kbFLK.fa"
    name = "PigL"
    output = "pigL_primers_v3.csv"

    # Each file is parsed once; the target file was previously read twice.
    first_seqs = [Fasta(dirn + fn).Seqs[0] for fn in (fn1, fn2, fn3, fn4)]
    ref = first_seqs[0] + first_seqs[1] + first_seqs[2] + first_seqs[3]
    target_seq = first_seqs[2]

    f_primer, r_primer = tilling_PCR_primers(seq=target_seq, name=name,
                                             ref=ref)

    with open(dirn + output, "w") as fp:
        for p in f_primer:
            # Single-argument print() is valid on Python 2 and Python 3.
            print(p)
            fp.write("%s,%s,%s,%s,%s\n" % (p[0], p[1], p[2], p[3], p[4]))
        for p in r_primer:
            fp.write("%s,%s,%s,%s,%s\n" % (p[0], p[1], p[2], p[3], p[4]))
def _inferScoreMatrix(self, refName, trimmedReads, identity):
    '''Build a score matrix file from random read subsets and return its name.

    Works inside ``self.wd``: 300 random subsets of 1000 reads are drawn
    (seeded with 0), each is split into two files by
    ``self._splitSeqsIn2Files`` and one ``lastz_D_Wrapper.pl`` command per
    subset is written to ``runs.txt``, which is then executed through GNU
    ``parallel`` with ``self.pCnt`` jobs.

    refName      -- name whose part before the first "." names the result
                    file ``<first>.<first>.q``
    trimmedReads -- FASTA file the subsets are sampled from
    identity     -- integer passed to ``--identity`` of the wrapper

    Returns the name of the copied score matrix (relative to ``self.wd``).
    Exits the process if ``scoreMatrix.q`` was not produced.
    '''
    # Remember the caller's directory; everything below uses relative paths.
    memDir = os.getcwd()
    os.chdir(self.wd)
    fraction = 0.1
    fasta = Fasta(trimmedReads)
    iterCnt, seqCnt = 300, 1000
    # Clear the results of earlier runs.
    self.shell("rm -f *.q runs.txt subset*")
    # Fixed seed so the same subsets are drawn on every run.
    random.seed(0)
    with open("runs.txt", "w") as handle:
        for i in xrange(iterCnt):
            subFasta = Fasta()
            # random.sample raises ValueError if fewer than seqCnt reads exist.
            subIdxs = random.sample(xrange(len(fasta.headers)), int(seqCnt))
            subFasta.headers, subFasta.seqs = [
                fasta.headers[j] for j in subIdxs
            ], [fasta.seqs[k] for k in subIdxs]
            subFasta.totalLen = sum([len(fasta.seqs[j]) for j in subIdxs])
            fName1, fName2 = self._splitSeqsIn2Files(
                subFasta, "subset%d" % (i + 1), fraction)
            handle.write(
                "lastz_D_Wrapper.pl --target=%s --query=%s --identity=%d\n" %
                (fName1, fName2, identity))
    # runs.txt is closed (flushed) before the commands are executed.
    self.shell("cat runs.txt | parallel -j %d" % self.pCnt,
               ignoreFailure=True)
    # NOTE(review): presumably _median() is what writes scoreMatrix.q from
    # the per-subset results -- confirm.
    self._median()
    self.shell("rm -f subset*")
    #self.shell("tail -n 5 *.q > scoreMatrix.q")
    if os.path.exists("scoreMatrix.q") == False:
        print >> sys.stderr, "# FATAL ERROR: could not create score matrix for HaploMerger. Exiting ..."
        # NOTE(review): exits with status 0 although this is reported as a
        # fatal error, and the working directory is not restored first.
        sys.exit(0)
    first = refName.split(".")[0]
    scoreMatrix = "%s.%s.q" % (first, first)
    print "cp scoreMatrix.q %s" % scoreMatrix
    self.shell("cp scoreMatrix.q %s" % scoreMatrix)
    # Back to the directory the caller was in.
    os.chdir(memDir)
    return scoreMatrix
def GC_tmp():
    """Plot the GC windows of every record that has GC outliers.

    Each record of ``dra_mt.fa`` is analysed with a 20-base window and the
    0.70 / 0.40 thresholds; records with at least one reported outlier are
    passed to the visual analyzer.
    """
    dirn = "/Users/xuz02/Downloads/"
    fn = "dra_mt.fa"
    fasta = Fasta(dirn + fn)
    for i in range(len(fasta.Seqs)):
        seq, name = fasta.Seqs[i], fasta.Names[i]
        GC_content, outlets = Seq_Analyzer(seq).GC_window_analyzer(
            20, 0.70, 0.40)
        if len(outlets) == 0:
            continue
        # NOTE(review): the plot uses 0.30 where the analysis above uses
        # 0.40 -- confirm that the difference is intended.
        Seq_Analyzer(seq).GC_window_analyzer_visual(20, 0.70, 0.30, name)
def count_loxP():
    """Write the positions of loxP sites that have a G on either flank.

    The first record of ``synIV.fa`` is upper-cased and scanned three times:
    G before and non-G after, non-G before and G after, and G on both
    sides.  The match start offsets (the flanking base, i.e. one position
    before the site) are written one per line to ``loci_g.csv``, grouped in
    that pattern order.
    """
    import re

    dirn = "/Workplace/Python/161122_loxp/"
    synIV = Fasta(dirn + "synIV.fa")
    loxP_seq = "ATAACTTCGTATAATGTACATTATACGAAGTTAT"
    # Built from loxP_seq so the site is spelled out only once; the
    # resulting patterns are identical to the previous literals.
    patterns = (
        r"[G]" + loxP_seq + r"[^G]",
        r"[^G]" + loxP_seq + r"[G]",
        r"G" + loxP_seq + r"G",
    )
    ref_seq = synIV.Seqs[0].upper()

    loci = []
    for pattern in patterns:
        loci.extend(m.start() for m in re.finditer(pattern, ref_seq))

    # The output file was never closed before.
    with open(dirn + "loci_g.csv", "w") as output:
        for locus in loci:
            output.write(str(locus) + "\n")
def minichunk_for_Twist():
    """Write GC-window and repeat statistics for the Twist order file.

    For each record the min, max, mean and standard deviation of
    ``GC_window_analyzer(100)`` and the items returned by
    ``Repetitive_Seq_analyzer(3, 4, 2)`` are collected.
    ``minichunk_stat.csv`` receives all GC rows
    (``name;min;max;mean;std;``) followed by all repeat rows
    (``name;item;...;;``).
    """
    dirn = "/Users/Zhuwei/Google_Drive/Project Data/ORDERS/Twist/"
    stat_name = "minichunk_stat.csv"
    fasta = Fasta("/Users/Zhuwei/Documents/Order_Twist_140607.fasta")

    GC_rows = []
    rep_rows = []
    for _n in range(fasta.size):
        name = fasta.Names[_n]
        analyzer_seq = fasta.Seqs[_n]

        _GC_ls = Seq_Analyzer(analyzer_seq).GC_window_analyzer(100)
        # The trailing "\n" in the literal yields an extra blank line, as
        # the original print statement did.
        print("pass GC\n")
        _mean = numpy.mean(_GC_ls)
        _min = min(_GC_ls)
        _max = max(_GC_ls)
        _std = numpy.std(_GC_ls)
        GC_rows.append([name, _min, _max, _mean, _std])

        _rep_ls = Seq_Analyzer(analyzer_seq).Repetitive_Seq_analyzer(3, 4, 2)
        print("pass rep\n")
        rep_rows.append([name] + list(_rep_ls))

    # The report was never closed before; the context manager flushes it.
    with open(dirn + stat_name, "w") as stat_fn:
        for row in GC_rows:
            stat_fn.write("".join(str(value) + ";" for value in row) + "\n")
        for row in rep_rows:
            # Each repeat row ends with an extra ";" before the newline.
            stat_fn.write("".join(str(value) + ";" for value in row) + ";\n")
def extract_terminial_seq(length=10000, prefix=""):
    """Save both ends of every "chr" record of ``52.fasta`` as FASTA files.

    Records whose name does not start with "chr" (any case) are skipped.
    For the others the first and the last ``length`` bases are written to
    ``<prefix><name>_Left.fa`` and ``<prefix><name>_Right.fa`` inside the
    ``52_ends/`` sub-directory.
    """
    dirn = "/home/zhuwei/cglabarata/comp/"
    fn = "52.fasta"
    outdirn = dirn + "52_ends/"
    fa = Fasta(dirn + fn)
    for n, _name in enumerate(fa.Names):
        if _name[:3].upper() != "CHR":
            continue
        _seq = fa.Seqs[n]
        # Only the first whitespace-delimited token of the name is used.
        output_prefix = prefix + _name.split()[0]
        ends = (
            ("_Left.fa", ">%s_left\n", _seq[:length]),
            ("_Right.fa", ">%s_right\n", _seq[-length:]),
        )
        for suffix, header_fmt, fragment in ends:
            with open(outdirn + output_prefix + suffix, "w") as op:
                op.write(header_fmt % output_prefix)
                op.write(fragment)
                op.write("\n")
class Model():
    """Encapsulate a secondary structure model informations.

    Attributes set by ``__init__``:
        name     -- model name, also the prefix of its data files
        rna      -- content of ``<name>_seq.txt`` as a string, or a key of
                    the shared FASTA sequences when that file is unusable
        position -- coordinates from ``<name>_pos.csv`` shifted to start at
                    0, scaled by their maximum and centred on the shorter
                    axis; ``zeros((300, 2))`` on failure
        app      -- content of ``<name>_app.csv``; ``zeros(300)`` on failure
    """
    # Shared by all models; parsed once, when the class body is executed.
    sequences = Fasta(options.FASTA_FILE)

    def __init__(self, name):
        """Load the sequence, position and pairing files of model *name*."""
        self.name = name
        try:
            self.rna = str(
                recfromtxt(options.MODEL_PATH + self.name + "_seq.txt"))
        except Exception:
            # list() is required on Python 3, where keys() is a view that
            # cannot be indexed.
            # NOTE(review): the original comment said "by default sequence
            # is the first of the fasta file", yet index -1 selects the last
            # key -- confirm which one is intended.
            self.rna = list(self.sequences.sequences.keys())[-1]

        try:
            self.position = recfromtxt(options.MODEL_PATH + self.name +
                                       "_pos.csv",
                                       delimiter=",",
                                       dtype=float)
            # Shift both axes to start at 0, then scale by the global max.
            self.position[:, 0] -= min(self.position[:, 0])
            self.position[:, 1] -= min(self.position[:, 1])
            self.position = self.position / amax(self.position)
            # Centre the axis that does not span the full unit range.
            if max(self.position[:, 0]) == 1:
                self.position[:, 1] += (1 - max(self.position[:, 1])) / 2
            else:
                self.position[:, 0] += (1 - max(self.position[:, 0])) / 2
        except Exception:
            # Exception (not a bare except) so Ctrl-C and SystemExit still
            # propagate.
            print("Position file of " + name + " model absent or corrupted")
            self.position = zeros((300, 2))

        try:
            self.app = recfromtxt(options.MODEL_PATH + self.name + "_app.csv")
        except Exception:
            print("Appariment file of " + name + " model absent or corrupted")
            self.app = zeros(300)

    def __str__(self):
        '''String representaion of objects'''
        return (self.name + " model")
def readinput(path):
    """Return the sequence of the first segment of the FASTA file at *path*."""
    first_segment = Fasta(path).get_segments()[0]
    return first_segment.get_sequence()
#!/usr/bin/env python
"""Prefix every FASTA header with the base name of the file it comes from.

For each input ``<root><ext>`` a copy named ``<root>-prefixed<ext>`` is saved
in the current working directory with each header rewritten to
``<root>_<header>``.  Files that cannot be read are reported and skipped.
"""
from fasta import Fasta
import sys
import os

if len(sys.argv) == 1 or sys.argv[1] in ('-h', '--help'):
    # Single-argument print() works on both Python 2 and Python 3.
    print("Usage: %s fasta_file..." % sys.argv[0])
    sys.exit(0)

for filename in sys.argv[1:]:
    # Only the base name is used: the output does not go next to the input.
    root, ext = os.path.splitext(os.path.basename(filename))
    output = root + '-prefixed' + ext

    f = Fasta()
    try:
        f.read_from(filename)
    except IOError as ioe:
        # "except ... as" replaces the Python 2 only comma form.
        print("Error: %s" % ioe)
        continue

    for seq in f:
        seq.header = root + '_' + seq.header
    f.save_to(output)
if seq2 == None: return seq1 if len(seq1[1]) > len(seq2[1]): return seq1 else: return seq2 ################## # BEGIN PROGRAM :) ################## verify_inputs() input_file = sys.argv[1] fasta = Fasta() file = open(input_file, "r") sys.stderr.write("Reading fasta ...") fasta.read(file) sys.stderr.write("Done.\n") current_seq = None for seq in fasta.entries: if not same_comp_gene(seq, current_seq): if current_seq != None: write_seq(current_seq) current_seq = seq else: current_seq = longer_of_the_two(seq, current_seq)