def extract_dbotu_otu_seqs(membershipfn, fasta_derep, otu_seqs_fn):
    """
    Write the OTU representative sequences named in a dbOTU membership file.

    Parameters
    ----------
    membershipfn
        membership filename, made by dbotu.call_otus
        has OTU representative seq IDs in first column, and OTU member
        seq IDs in rest of row (tab-separated)
    fasta_derep
        fasta dereplicated; sequence IDs should match those in membershipfn
    otu_seqs_fn
        output fasta file to write representative seqs to; each kept
        record is written with its ID prefixed by 'dbotu'

    Returns
    -------
    None
    """
    ## Parse membership file
    # A set gives O(1) membership tests in the loop below.  The line ending
    # is stripped first so that a row with a single column (no tab) still
    # yields a clean ID; blank lines are ignored.
    with open(membershipfn, 'r') as f:
        otu_reps = {
            line.rstrip('\r\n').split('\t')[0]
            for line in f
            if line.strip()
        }

    ## Grab OTU representative seqs in membership file from dereplicated fasta
    ## and write to output
    with open(otu_seqs_fn, 'w') as out:
        for record in util.iter_fst(fasta_derep):
            # record[0] is the header line; drop its first character
            # (presumably the leading '>' -- verify against util.iter_fst)
            sid = record[0][1:]
            seq = record[1]
            if sid in otu_reps:
                out.write('>dbotu' + sid + '\n' + seq + '\n')

    return None
def load_db(self):
    """
    Load an existing SeqDB from the FASTA file at ``self.fn``, if it exists.

    Each header is expected to look like ``>ID;size=N``.  For every record,
    ``self.db[int(ID)]`` is set to the sequence and ``self.size[int(ID)]``
    to ``int(N)``.  When ``self.fn`` does not exist nothing is loaded.

    Returns
    -------
    self, so calls can be chained.

    Raises
    ------
    ValueError
        if a header does not match the ``>ID;size=N`` pattern, or ID is not
        an integer.
    """
    if os.path.exists(self.fn):
        for tag, seq in util.iter_fst(self.fn):
            # Raw string: '\d' in a plain literal is an invalid escape.
            match = re.search(r'>(.*);size=(\d+)', tag)
            if match is None:
                # Previously this surfaced as an AttributeError on None.
                raise ValueError(
                    'malformed SeqDB header (expected ">ID;size=N"): %r' % (tag,))
            otu, size = match.groups()
            self.db[int(otu)] = seq
            self.size[int(otu)] = int(size)
    return self
def parse_files(f, q):
    """
    Parse fasta and quality files, f and q.

    Parameters
    ----------
    f
        fasta file; each header has its first character removed to give
        the sequence ID
    q
        quality file read with the same fasta iterator; each quality line
        is split on single spaces

    Returns
    -------
    (sids, seqs, quals)
        three parallel lists: sequence IDs, sequences, and per-record lists
        of quality tokens

    Raises
    ------
    ValueError
        if the two files do not contain the same number of records
    """
    sids = []
    seqs = []
    quals = []
    for sid, seq in util.iter_fst(f):
        sids.append(sid[1:])
        seqs.append(seq)
    for _, qual in util.iter_fst(q):
        quals.append(qual.split(' '))
    # sids and seqs are filled in lockstep, so only the fasta/quality
    # comparison is meaningful.  The previous chained form
    # `len(sids) != len(seqs) != len(quals)` could never be true.
    if len(seqs) != len(quals):
        raise ValueError('fasta and quality files are not the same length!')
    return sids, seqs, quals
def dist(fasta):
    """
    Dump log10 of the numeric header values of a fasta file to disk.

    Each header (minus its first character) is parsed as a float and its
    base-10 logarithm is collected.  Reading stops once the record index
    exceeds 100000.  The values are written to 'test100000.out'.
    """
    logfreqs = []
    for idx, rec in enumerate(util.iter_fst(fasta)):
        header, _ = rec[:2]
        logfreqs.append(np.log10(float(header[1:])))
        # the check runs after the append, so the record at the cut-off
        # index is still included
        if idx > 100000:
            break
    np.savetxt('test100000.out', np.array(logfreqs), delimiter='\t')
def dist(fasta):
    """
    Write the log10-transformed header values of `fasta` to 'test100000.out'.

    The header of every record, with its first character removed, is read
    as a float.  Iteration ends after the record whose index is the first
    to exceed 100000.
    """
    values = []
    numbered = enumerate(util.iter_fst(fasta))
    for count, entry in numbered:
        name, _unused = entry[:2]
        abundance = float(name[1:])
        values.append(np.log10(abundance))
        if count > 100000:
            # stop: enough records collected
            break
    values = np.array(values)
    np.savetxt('test100000.out', values, delimiter='\t')
def parse_index_file(index_fn, format='fasta'):
    """
    Map FASTQ sequence IDs to their barcodes.

    Parameters
    ----------
    index_fn
        path to the index file
    format
        'fasta': each record's header is the key and its sequence the
        barcode; 'tab': each line holds a sequence ID and a barcode
        separated by whitespace.  Any other value yields an empty map.

    Returns
    -------
    dict mapping sequence ID -> barcode
    """
    s2b = {}  # maps sequences to barcodes
    # Case 1: index file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(index_fn):
            s2b[s] = b
    # Case 2: index file is tab-delimited
    elif format == 'tab':
        # context manager so the handle is closed even if a line is malformed
        with open(index_fn) as handle:
            for line in handle:
                fields = line.rstrip().split()
                if not fields:
                    # tolerate blank lines (e.g. a trailing empty line)
                    continue
                [s, b] = fields
                s2b[s] = b
    return s2b
def parse_index_file(index_fn, format='fasta'):
    """
    Build a dictionary from sequence ID to barcode out of an index file.

    Parameters
    ----------
    index_fn
        index file name
    format
        'fasta' (header -> sequence of each record) or 'tab' (two
        whitespace-separated columns: sequence ID, barcode).  Unrecognised
        formats return an empty dictionary.

    Returns
    -------
    dict of {sequence ID: barcode}
    """
    s2b = {}  # maps sequences to barcodes

    # Case 1: index file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(index_fn):
            s2b[s] = b

    # Case 2: index file is tab-delimited
    elif format == 'tab':
        # `with` guarantees the file is closed; the bare open() leaked it
        with open(index_fn) as index_file:
            for line in index_file:
                columns = line.rstrip().split()
                if not columns:
                    continue  # skip empty lines instead of failing to unpack
                [s, b] = columns
                s2b[s] = b

    return s2b
def Freq_calculate(fasta, k, l, n, outfile):
    """
    Convert the count in each fasta header to a frequency and write it out.

    For every record the header (minus its first character) is read as a
    count C and the frequency is C / ((l - k + 1) * n).  Each output record
    is the frequency on a header line followed by the sequence.

    Parameters
    ----------
    fasta
        input fasta file whose headers are numeric counts
    k, l, n
        numbers (or numeric strings) used in the denominator above
    outfile
        path of the file to write
    """
    # The denominator does not depend on the record: compute it once.
    k = float(k)
    l = float(l)
    n = float(n)
    denominator = (l - k + 1) * n

    # `with` closes (and flushes) the output file; it was never closed before.
    with open(outfile, 'w') as f:
        for record in util.iter_fst(fasta):
            sid, seq = record[:2]
            C = float(sid[1:])
            freq = C / denominator
            # NOTE(review): the header marker written here is '<', not the
            # usual fasta '>' -- kept as-is; confirm this is intended.
            f.write('<' + str(freq) + '\n' + seq + '\n')
def Freq_calculate(fasta, k, l, n, outfile):
    """
    Rewrite a fasta file so each header holds a frequency instead of a count.

    The count C is parsed from the header with its first character removed;
    the frequency is C / ((l - k + 1) * n).

    Parameters
    ----------
    fasta
        input fasta file with numeric headers
    k, l, n
        values convertible to float that define the normalisation
    outfile
        output file name
    """
    # Loop-invariant: convert and combine the parameters a single time.
    k, l, n = float(k), float(l), float(n)
    scale = (l - k + 1) * n

    # Use a context manager so the output handle is always released.
    with open(outfile, 'w') as out:
        for record in util.iter_fst(fasta):
            sid, seq = record[:2]
            freq = float(sid[1:]) / scale
            # NOTE(review): records are emitted with a '<' marker rather
            # than '>'; left unchanged -- verify against downstream readers.
            out.write('<' + str(freq) + '\n' + seq + '\n')
def Freq_calculate(fasta, k, l, n, outfile):
    """
    Write frequency-labelled records, keeping only sufficiently frequent ones.

    For each record the header (minus its first character) is parsed as a
    count C and converted to freq = C / ((l - k + 1) * n).  A record is
    written (as '>freq' followed by the sequence) only when
    log10(freq) > -8.5.

    Parameters
    ----------
    fasta
        input fasta file whose headers are numeric counts
    k, l, n
        numbers (or numeric strings) used in the denominator
    outfile
        path of the fasta file to write
    """
    # Convert once; these do not change between records.
    k = float(k)
    l = float(l)
    n = float(n)
    denominator = (l - k + 1) * n

    # Context manager: the output file was previously left open.
    with open(outfile, 'w') as f:
        for record in util.iter_fst(fasta):
            sid, seq = record[:2]
            C = float(sid[1:])
            freq = C / denominator
            if np.log10(freq) > -8.5:
                f.write('>' + str(freq) + '\n' + seq + '\n')
def load_db(fn, trim_len):
    """
    Load an OTU database as a dict of {integer OTU id: sequence}.

    An empty/false `fn` gives an empty dict.  When `trim_len` is truthy,
    sequences shorter than `trim_len` are dropped and the rest are cut to
    their first `trim_len` characters.
    """
    db = {}
    if fn:
        for header, sequence in util.iter_fst(fn):
            # NOTE(review): the header is passed to int() unchanged, so it
            # must already be a bare integer string -- confirm that
            # util.iter_fst strips the '>' here.
            otu_id = int(header)
            if trim_len:
                if len(sequence) < trim_len:
                    continue
                sequence = sequence[:trim_len]
            db[otu_id] = sequence
    return db
def Freq_calculate(fasta, k, l, n, outfile):
    """
    Convert header counts to frequencies and keep records above a threshold.

    freq = C / ((l - k + 1) * n), where C is the header with its first
    character removed, parsed as a float.  Only records with
    log10(freq) > -8.5 are written, each as '>freq' plus the sequence.

    Parameters
    ----------
    fasta
        input fasta file with numeric headers
    k, l, n
        values convertible to float that define the normalisation
    outfile
        output fasta file name
    """
    # Hoisted out of the loop: the parameters are the same for every record.
    k, l, n = float(k), float(l), float(n)
    scale = (l - k + 1) * n

    # The handle is now closed deterministically when the block exits.
    with open(outfile, 'w') as out:
        for record in util.iter_fst(fasta):
            sid, seq = record[:2]
            freq = float(sid[1:]) / scale
            if np.log10(freq) > -8.5:
                out.write('>' + str(freq) + '\n' + seq + '\n')
def parse_barcodes_file(map_fn, format='fasta', rc=False):
    """
    Map barcodes to samples.

    Parameters
    ----------
    map_fn
        barcodes file
    format
        'fasta': header is the sample, sequence is the barcode;
        'tab': each line holds a sample and a barcode separated by
        whitespace.  Any other value yields an empty map.
    rc
        if true, barcodes are reverse-complemented before being used as keys

    Returns
    -------
    dict mapping barcode -> sample
    """
    b2s = {}  # maps barcodes to samples

    # Case 1: barcodes file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(map_fn):
            if rc:
                # Previously the sample ID was reverse-complemented into an
                # unused variable, so rc had no effect in this branch.
                b = reverse_complement(b)
            b2s[b] = s

    # Case 2: barcodes file is tab-delimited
    elif format == 'tab':
        # context manager: the handle used to be left open
        with open(map_fn) as handle:
            for line in handle:
                fields = line.rstrip().split()
                if not fields:
                    continue  # ignore blank lines
                [s, b] = fields
                if rc:
                    b = reverse_complement(b)
                b2s[b] = s

    # Return map of barcodes to samples
    return b2s
def remove_size_from_headers(raw_derep_in, raw_derep_out):
    """
    Strip the ';size=...' suffix from every header of a dereplicated fasta,
    e.g. 'seq;size=204' becomes 'seq'.

    Parameters
    ----------
    raw_derep_in
        fasta file with dereplicated, trimmed reads (the output of
        dereplicate_and_sort(), which calls 3.dereplicate.py)
    raw_derep_out
        name of the output fasta file that receives the renamed records
    """
    with open(raw_derep_out, 'w') as renamed:
        for entry in util.iter_fst(raw_derep_in):
            # keep only the part of the header before the first ';'
            header = entry[0].partition(';')[0]
            sequence = entry[1]
            renamed.write(header + '\n' + sequence + '\n')
    return None
def parse_barcodes_file(map_fn, format='fasta', rc=False):
    """
    Build a dictionary from barcode to sample name.

    Parameters
    ----------
    map_fn
        barcodes file name
    format
        'fasta' (header = sample, sequence = barcode) or 'tab' (two
        whitespace-separated columns: sample, barcode).  Unrecognised
        formats return an empty dictionary.
    rc
        when true, each barcode is reverse-complemented before it is stored

    Returns
    -------
    dict of {barcode: sample}
    """
    b2s = {}  # maps barcodes to samples

    # Case 1: barcodes file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(map_fn):
            if rc:
                # Reverse-complement the barcode itself; the old code
                # computed it from the sample ID and discarded the result.
                b = reverse_complement(b)
            b2s[b] = s

    # Case 2: barcodes file is tab-delimited
    elif format == 'tab':
        # The old code opened the undefined name `bcode_fn` (NameError);
        # the file to read is `map_fn`.
        with open(map_fn) as barcode_file:
            for line in barcode_file:
                columns = line.rstrip().split()
                if not columns:
                    continue  # skip empty lines
                [s, b] = columns
                if rc:
                    b = reverse_complement(b)
                b2s[b] = s

    # Return map of barcodes to samples
    return b2s
def Create_table(inlist):
    """
    Build a per-sequence table of header values across several fasta files.

    Parameters
    ----------
    inlist
        text file listing one fasta file name per line

    Returns
    -------
    dict mapping each sequence to a list with one entry per fasta file
    processed so far (in listing order).  An entry is the float parsed from
    the record's header (minus its first character), or -20 when the
    sequence was absent from that file.
    """
    # `table` instead of `dict`: do not shadow the builtin.
    table = {}
    with open(inlist) as listing:
        for i, line in enumerate(listing):
            # Strip only the line ending; slicing off the last character
            # truncated the final file name when it had no trailing newline.
            fastafile = line.rstrip('\r\n')
            print(i)
            # Every known sequence starts this file with the placeholder.
            for values in table.values():
                values.append(-20)
            for record in util.iter_fst(fastafile):
                sid, seq = record[:2]
                sfreq = float(sid[1:])
                if seq not in table:
                    # first sighting: pad the earlier files with placeholders
                    table[seq] = [-20] * i + [sfreq]
                else:
                    table[seq][-1] = sfreq
    return table
def Create_table(inlist):
    """
    Collect the header value of every sequence across a list of fasta files.

    Parameters
    ----------
    inlist
        file containing one fasta path per line

    Returns
    -------
    dict of {sequence: [value per file]}.  The value is the record header
    (first character removed) parsed as a float; files in which the
    sequence does not occur contribute -20.
    """
    # Renamed from `dict` so the builtin type is not shadowed.
    seq_table = {}
    with open(inlist) as fasta_list:
        for i, entry in enumerate(fasta_list):
            # rstrip instead of [:-1]: safe when the last line lacks '\n'.
            fastafile = entry.rstrip('\r\n')
            print(i)
            # Extend every existing row with the "absent" placeholder.
            for seq in seq_table:
                seq_table[seq].append(-20)
            for record in util.iter_fst(fastafile):
                sid, seq = record[:2]
                sfreq = float(sid[1:])
                if seq in seq_table:
                    seq_table[seq][-1] = sfreq
                else:
                    # new sequence: back-fill placeholders for earlier files
                    seq_table[seq] = [-20] * i + [sfreq]
    return seq_table
def fasta2table(fastaIn, tableOut):
    """
    Convert a set of fasta sequences into a two-column tab-delimited table.

    The first column (OTU_ID) is the header of each record with its first
    character removed; the second column (Sequence) is the sequence.  If an
    ID occurs more than once, the last sequence seen for it is written.

    Parameters
    ----------
    fastaIn
        input fasta file
    tableOut
        output table file; starts with the header line 'OTU_ID<TAB>Sequence'

    Returns
    -------
    None
    """
    # One dict is enough; the former parallel `keep` dict held the same keys.
    seqs = {}
    for [otu_number, seq] in util.iter_fst(fastaIn):
        seqs[otu_number[1:]] = seq

    # `with` closes the table even if a write fails.
    with open(tableOut, 'w') as fid:
        fid.write('OTU_ID' + '\t' + 'Sequence' + '\n')
        for otu_number in seqs:
            fid.write(str(otu_number) + '\t' + seqs[otu_number] + '\n')
    return None
def parse_index_file(index_fn, format='fasta'):
    """
    Map FASTQ sequence IDs to their barcodes.

    Parameters
    ----------
    index_fn
        path to the index file
    format
        'fasta': header -> sequence of each record;
        'tab': two whitespace-separated columns (sequence ID, barcode);
        'fastq': ID line -> sequence of each record, after removing
        everything from the last space onwards and the first character.
        Any other value yields an empty map.

    Returns
    -------
    dict mapping sequence ID -> barcode
    """
    s2b = {}  # maps sequences to barcodes

    # Case 1: index file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(index_fn):
            # note: I'm pretty sure this won't work for downstream, because you need
            # to remove the first character from sequence ID
            s2b[s] = b

    # Case 2: index file is tab-delimited
    elif format == 'tab':
        # context manager so the handle is closed even on a malformed line
        with open(index_fn) as handle:
            for line in handle:
                fields = line.rstrip().split()
                if not fields:
                    continue  # tolerate blank lines
                [s, b] = fields
                s2b[s] = b

    # Case 3: index file is FASTQ format
    elif format == 'fastq':
        for [s, b, _, _] in util.iter_fsq(index_fn):
            # If sequence ID has :Y:0: thing at the end (standard Illumina format), remove it
            # For this kind of fastq line: @SL-MAJ:AY3TB170104:AY3TB:1:1101:10000:7854 :N:0:
            s = s.rsplit(' ', 1)[0]
            s2b[s[1:]] = b

    return s2b
def parse_index_file(index_fn, format='fasta'):
    """
    Build a dictionary from sequence ID to barcode out of an index file.

    Parameters
    ----------
    index_fn
        index file name
    format
        one of 'fasta', 'tab' or 'fastq'.  For 'fasta' the record header
        is used as the key unchanged; for 'tab' each line is split on
        whitespace into (sequence ID, barcode); for 'fastq' the ID line is
        cut at its last space and its first character is dropped.
        Unrecognised formats return an empty dictionary.

    Returns
    -------
    dict of {sequence ID: barcode}
    """
    s2b = {}  # maps sequences to barcodes

    # Case 1: index file is FASTA format
    if format == 'fasta':
        for [s, b] in util.iter_fst(index_fn):
            # note: I'm pretty sure this won't work for downstream, because you need
            # to remove the first character from sequence ID
            s2b[s] = b

    # Case 2: index file is tab-delimited
    elif format == 'tab':
        # `with` guarantees the file is closed; the bare open() leaked it
        with open(index_fn) as index_file:
            for line in index_file:
                columns = line.rstrip().split()
                if not columns:
                    continue  # skip empty lines instead of failing to unpack
                [s, b] = columns
                s2b[s] = b

    # Case 3: index file is FASTQ format
    elif format == 'fastq':
        for [s, b, _, _] in util.iter_fsq(index_fn):
            # If sequence ID has :Y:0: thing at the end (standard Illumina format), remove it
            # For this kind of fastq line: @SL-MAJ:AY3TB170104:AY3TB:1:1101:10000:7854 :N:0:
            s = s.rsplit(' ', 1)[0]
            s2b[s[1:]] = b

    return s2b
# Print the fasta records whose ID appears in a list of OTU IDs.
import argparse, util

parser = argparse.ArgumentParser()
# The descriptions were previously passed as `default=`, which made argparse
# treat them as file names when an option was omitted.  They are help texts;
# both options are needed for the script to do anything.
parser.add_argument('--fst', help='input fasta file', required=True)
parser.add_argument('--otus', help='list of otus', required=True)
args = parser.parse_args()

# A set makes the per-record membership test O(1); the handle is closed
# as soon as the IDs are read.
with open(args.otus) as handle:
    otus = set(line.rstrip() for line in handle)

for [sid, seq] in util.iter_fst(args.fst):
    if sid in otus:
        # print() with a single pre-formatted string runs on Python 2 and 3
        print('>%s\n%s' % (sid, seq))
"GGA": "G", "TGG": "W", "CGG": "R", "AGG": "R", "GGG": "G" } def get_codons(nt): if len(nt) % 3 != 0: quit('error: len(nt) not divisible by 3') for i in range(int(len(nt) / 3)): beg = i * 3 end = beg + 3 yield nt[beg:end] def translate(nt): nt = nt.upper() aa = '' for codon in get_codons(nt): aa += codon_table[codon] return aa if __name__ == '__main__': import sys for record in util.iter_fst(sys.argv[1]): record[1] = translate(record[1]) print('\n'.join(record))
help='print command', default=False, action='store_true') args = parser.parse_args() # auto settings if args.lax == True: args.b1 = .5 args.b2 = .5 args.b3 = 10 args.b4 = 5 args.b5 = 'all' args.cut = 25 # fix arguments n = len([record for record in util.iter_fst(args.aln)]) args.b1 = int(args.b1 * n) + 1 args.b2 = int(args.b2 * n) + 1 args.b5 = {'none': 'N', 'half': 'H', 'all': 'A'}[args.b5] # encode gblocks input gmap = {} temp = open('%s.temp.aln' % (args.aln), 'w') print('writing alignmnet to %s.temp.aln' % (args.aln)) for i, record in enumerate(util.iter_fst(args.aln)): old = record[0] new = 'seq%d' % (i) gmap[new] = old record[0] = '>%s' % (new) temp.write('\n'.join(record) + '\n') temp.close()
cds = sorted(translate.codon_table.keys()) aas = sorted('G A L M F W K Q E S P V I C Y H R N D T'.split()) alphabet = nts + cds + aas header = ['gene'] + ['count_nt_%s' %(li) for li in nts] + ['count_cd_%s' %(li) for li in cds] + ['count_aa_%s' %(li) for li in aas] + \ ['freq_nt_%s' %(li) for li in nts] + ['freq_cd_%s' %(li) for li in cds] + ['freq_aa_%s' %(li) for li in aas] print('\t'.join(header)) # read sequence id map smap = {} if args.map: for line in open(args.map): line = line.rstrip().split() smap[line[0]] = line[1] # iterate over sequences for record in util.iter_fst(args.fst): sid, seq = record if len(seq) % 3 != 0: continue # map sequence ids sid = sid.split()[0][1:] if args.map: if sid in smap: sid = smap[sid] else: continue # count fna features # ------------------
args = parser.parse_args()
filemap = args.fmap

# The file map has one tab-separated row per input:
# column 0 is the fasta path, column 1 the new sample ID.
with open(filemap, 'r') as f:
    lines = f.readlines()
lines = [line.strip().split('\t') for line in lines]
fastas = [line[0] for line in lines]
sids = [line[1] for line in lines]

## Relabel sequences in individual fastas. Save each one as *.sb
## Also concatenate all of these fastas into one large fasta, named myDataset.raw_concat.fasta
concat_fasta = args.dataset + '.raw_concat.fasta'
with open(concat_fasta, 'w') as concat:
    for fasta, newsid in zip(fastas, sids):
        new_fasta = fasta + '.sb'
        counter = 0
        with open(new_fasta, 'w') as f:
            print(new_fasta)
            # number the records of each fasta from 1
            for counter, (oldsid, seq) in enumerate(util.iter_fst(fasta), 1):
                header = '>' + newsid + '_' + str(counter) + '\n'
                body = seq + '\n'
                f.write(header)
                f.write(body)
                # the same relabeled record goes into the concatenated
                # fasta in the same pass
                concat.write(header)
                concat.write(body)
""" Relabel sequences in *.raw_trimmed.fasta to have datasetID--sampleID_N """ import os import argparse import util parser = argparse.ArgumentParser() parser.add_argument('trimmed_dir', help='directory with *.raw_trimmed.fasta files') args = parser.parse_args() files = [ os.path.join(args.trimmed_dir, i) for i in os.listdir(args.trimmed_dir) if i.endswith('.raw_trimmed.fasta') ] for fname in files: dataset = fname.split('/')[-1].split('.')[0] with open(fname + '.relabeled', 'w') as fnew: for sid, seq in util.iter_fst(fname): sid = '>' + dataset + '--' + sid[1:] fnew.write('\n'.join([sid, seq]) + '\n')
help='Input fasta sequences (optional)', default='') parser.add_argument('--map', help='Input mapping file', required=True) parser.add_argument('--min_count', help='Minimum read count', type=int) parser.add_argument('--min_samples', help='Minimum number of samples', type=int) parser.add_argument('--out', help='Output counts matrix') # Parse command line arguments args = parser.parse_args() # Load valid fst seqs keep = {} if args.fst: for [otu, seq] in util.iter_fst(args.fst): otu = otu[1:] keep[otu] = 1 # Keep track of samples and otus samples = {} otus = {} # For every line in the mapping file for line in open(args.map): # Load otu name and table of sample counts otu, table = line.rstrip().split('\t') if len(keep) > 0 and otu not in keep: continue entries = table.split(' ') count = sum(
# Print the records of a FASTA or FASTQ file whose ID is in a subset list.
import argparse
import util

# parse args
parser = argparse.ArgumentParser()
parser.add_argument('-f', help='FASTA file')
parser.add_argument('-q', help='FASTQ file')
parser.add_argument('-s', help='Subset ids')
args = parser.parse_args()

# load subset
# A set gives O(1) lookups per record; the handle is closed once read.
with open(args.s) as handle:
    subset = set(line.rstrip() for line in handle)

# get iterator
# Empty default: with neither -f nor -q the loop below does nothing.
# If both are given, -q takes precedence.
iter_seq = ''
if args.f:
    iter_seq = util.iter_fst(args.f)
if args.q:
    iter_seq = util.iter_fsq(args.q)

# subset file
for record in iter_seq:
    # ID = first line without its first character, up to the first ';'
    sid = record[0][1:].split(';')[0]
    if sid in subset:
        # print() with one argument behaves the same on Python 2 and 3
        print('\n'.join(record))
parser.add_argument('--minlen', help='Minimum length', type=int, default=0) parser.add_argument('--num', help='Number to keep', type=int, default=0) parser.add_argument('--prefix', help='Prefix to add', type=str, default='') parser.add_argument('--prefix_sep', help='Prefix separator', type=str, default='.') parser.add_argument('--debug', help='Debug mode', action='store_true', default=False) args = parser.parse_args() # get iterator if args.fst: iter_seq = util.iter_fst(args.fst) elif args.fsq: iter_seq = util.iter_fsq(args.fsq) elif args.FST: iter_seq = util.iter_fst(sys.stdin) elif args.FSQ: iter_seq = util.iter_fsq(sys.stdin) else: quit('error: must specify fst, fsq, FST, or FSQ') # initialize variables keep = {} remove = {} # load IDs/coordinates to keep if args.keep:
# Trim every sequence of a fasta file to its first k characters and print
# it; sequences shorter than k are dropped.
# Usage: <script> <fasta file> <k>
import util, sys

fn = sys.argv[1]
k = int(sys.argv[2])

for [sid, seq] in util.iter_fst(fn):
    if len(seq) >= k:
        # print() with a single formatted string works on Python 2 and 3;
        # the bare print statement is a syntax error on Python 3
        print('>%s\n%s' % (sid, seq[:k]))
import util parser = argparse.ArgumentParser() parser.add_argument('--fst', help='Input fasta sequences (optional)', default='') parser.add_argument('--map', help='Input mapping file', required=True) parser.add_argument('--min_count', help='Minimum read count', type=int) parser.add_argument('--min_samples', help='Minimum number of samples', type=int) parser.add_argument('--out', help='Output counts matrix') # Parse command line arguments args = parser.parse_args() # Load valid fst seqs keep = {} if args.fst: for [otu, seq] in util.iter_fst(args.fst): otu = otu[1:] keep[otu] = 1 # Keep track of samples and otus samples = {} otus = {} # For every line in the mapping file for line in open(args.map): # Load otu name and table of sample counts otu, table = line.rstrip().split('\t') if len(keep) > 0 and otu not in keep: continue entries = table.split(' ') count = sum([int(entry.split(':')[1]) >= args.min_count for entry in entries])
print('Parsing dereplication map: {}'.format(args.map_file))
seq_sizes = {}
# Stream the map file line by line; there is no need to hold it all in memory.
with open(args.map_file, 'r') as f:
    for line in f:
        line = line.strip().split('\t')
        seqID = line[0]
        # Column 2 holds space-separated entries; the integer between
        # 'size=' and ':1' of each entry is summed.
        total_size = sum([int(i.split('size=')[1].split(':1')[0]) for i in line[1].split(' ')])
        seq_sizes[seqID] = total_size

## Read in the entire fasta file into a dict {seqID: sequence}
print('Reading fasta file: {}'.format(args.fasta_in))
fasta = {}
for sid, seq in util.iter_fst(args.fasta_in):
    # sid in the fasta is something like >444;size=8
    # sid.split(';')[0][1:] returns 444, which is a key in seq_sizes
    sid = sid.split(';')[0][1:]
    newsid = '>' + sid + ';size=' + str(seq_sizes[sid])
    fasta[sid] = {}
    fasta[sid]['new_sid'] = newsid
    fasta[sid]['seq'] = seq

## Get list of sequence IDs in descending size (i.e. largest first)
ordered_seqs = sorted(seq_sizes, key=lambda k: seq_sizes[k], reverse=True)

## Write new fasta file in descending size
print('Writing sorted and relabled fasta: {}'.format(args.fasta_out))
with open(args.fasta_out, 'w') as f:
    # One record at a time, each ending in a newline.  The earlier
    # join-based write left the last sequence without a line terminator,
    # which breaks plain concatenation of fasta files.
    for s in ordered_seqs:
        f.write(fasta[s]['new_sid'] + '\n' + fasta[s]['seq'] + '\n')