def test03(self):
    fa = path.join(datadir, 'F1_3xR1_36', 'trimmed.fasta')
    uc = path.join(datadir, 'F1_3xR1_36', 'trimmed.uc')
    groups = path.join(datadir, 'F1_3xR1_36', 'groups.csv.bz2')
    outdir = self.mkoutdir()
    limit = 500
    min_size = 2

    denoised = path.join(outdir, 'denoised.fasta')
    self.main([
        fa,
        '--clusters', uc,
        '--outfile', denoised,
        '--limit', limit,
        '--min-clust-size', min_size,
        '--weights', path.join(outdir, 'weights.csv')
    ])

    denoised_grouped = path.join(outdir, 'denoised.grouped.fasta')
    self.main([
        fa,
        '--clusters', uc,
        '--outfile', denoised_grouped,
        '--limit', limit,
        '--min-clust-size', min_size,
        '--groups', groups,
        '--weights', path.join(outdir, 'weights.grouped.csv')
    ])

    with open(denoised) as d, open(denoised_grouped) as g:
        ds = list(fastalite(d))
        gs = list(fastalite(g))
        self.assertEqual(set(s.seq for s in ds), set(s.seq for s in gs))
def test04(self): """ Include an rle file. """ out = self.mkoutdir() trimmed = path.join(out, 'trimmed.fasta') trimmed_rle = path.join(out, 'trimmed_rle.csv') args = [ self.data('rle_100.fasta'), '--left-aligns', self.data('rle_100_left_ssearch.csv.bz2'), '--right-zscore', '80', '--right-aligns', self.data('rle_100_right_ssearch.csv.bz2'), '--fasta-out', trimmed, '--rle', self.data('rle_100.csv.bz2'), '--rle-out', trimmed_rle ] self.main(args) with open(trimmed) as f: self.assertEqual(len(list(fastalite(f))), 66) with open(trimmed_rle) as f: self.assertEqual(len(f.readlines()), 67)
def action(args):
    if args.is_file:
        seqs = fastalite(opener(args.seqs))
        for s in seqs:
            seq = reversed(s.seq)
            seq = [rev_comp[se] for se in seq]
            seq = ''.join(seq)
            args.out_fasta.write('>{}\n{}\n'.format(s.description, seq))
    else:
        seq = [rev_comp[s] for s in args.seqs]
        seq = ''.join(seq)
        args.out.write(seq)
        args.out.write('\n')

    if args.rlefile and args.out_rle:
        reader = csv.reader(args.rlefile)
        writer = csv.writer(args.out_rle)

        # try to determine if the first row is a header; we'll assume
        # that the first row, second column is a run-length encoding
        # if it's at least half digits.
        name, rle = reader.next()
        if sum(c.isdigit() for c in rle) / float(len(rle)) > 0.5:
            writer.writerow([name, ''.join(reversed(rle))])
        else:
            assert [name, rle] == rle_fieldnames
            writer.writerow([name, rle])

        for name, rle in reader:
            writer.writerow([name, ''.join(reversed(rle))])
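A quick standalone sketch of the header heuristic used above, with toy
values (`looks_like_rle` is a hypothetical name, not part of the module):

def looks_like_rle(value):
    # assume the cell is a run-length encoding if at least
    # half of its characters are digits (sketch of the heuristic above)
    return sum(c.isdigit() for c in value) / float(len(value)) > 0.5

assert looks_like_rle('1312111412')   # a data row passes
assert not looks_like_rle('rle')      # a header cell does not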
def action(args):
    seqs = fastalite(args.fasta)
    pairs = list(all_pairwise(seqs))

    if args.distance:
        pairs = [(q, t, 1 - i) for q, t, i in pairs]

    if args.split_info and args.matrix_out:
        primary, secondary = args.primary_group, args.secondary_group
        split_info = list(csv.DictReader(args.split_info))
        info = {r['seqname']: r for r in split_info if r['seqname']}
        tax = {r['tax_id']: r for r in split_info}

        # mirror the pairs so both (left, right) and (right, left) appear
        pairs += map(itemgetter(1, 0, 2), pairs)

        def group(seqname):
            i = info[seqname]
            return (i[primary] or i[secondary]) if secondary else i[primary]

        pairs = ((group(left), group(right), score)
                 for left, right, score in pairs)

        # sort and group rows
        pairs = list(groupbyl(pairs, key=itemgetter(0)))

        matrix_out = csv.writer(args.matrix_out)

        # this is the tax_id order we will be using for columns
        tax_ids = map(itemgetter(0), pairs)

        # output the species names as the first row
        matrix_out.writerow([''] + [tax[t]['tax_name'] for t in tax_ids])

        # iterate through the sorted rows (pairs)
        for row_id, columns in pairs:
            # sort and group columns
            columns = dict(groupbyl(columns, key=itemgetter(1)))

            # start the row with the species name
            row = [tax[row_id]['tax_name']]

            for t in tax_ids:
                # if t is not in columns, only one sequence represents
                # the group, so the median distance is 0
                if t not in columns:
                    med = 0
                else:
                    col = columns[t]
                    med = median(map(itemgetter(2), col))

                # percent and round
                med = math.ceil(med * 100) / 100
                row.append(med)

            matrix_out.writerow(row)
    else:
        writer = csv.writer(args.out)
        writer.writerow(['query', 'target', 'identity'])
        writer.writerows(pairs)
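The matrix code above relies on a `groupbyl` helper whose definition is not
shown here; a minimal sketch of the assumed sort-then-group behavior,
realizing each group as a list so it can be indexed and reused:

from itertools import groupby
from operator import itemgetter

def groupbyl(iterable, key=None):
    # sketch only: sort first so groupby sees each key exactly once,
    # and materialize each group as a list
    return ((k, list(g))
            for k, g in groupby(sorted(iterable, key=key), key=key))

rows = [('a', 'x', 0.1), ('b', 'x', 0.2), ('a', 'y', 0.3)]
print(dict(groupbyl(rows, key=itemgetter(0))))
# {'a': [('a', 'x', 0.1), ('a', 'y', 0.3)], 'b': [('b', 'x', 0.2)]}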
def build_parser(parser):
    parser.add_argument(
        'clusters',
        type=Opener(),
        help='Clusters file (output of "usearch -uc")')
    parser.add_argument(
        '--fasta-in',
        type=lambda f: fastalite(Opener()(f)),
        help='input fasta file containing original clustered reads')
    parser.add_argument(
        '--fasta-out',
        type=Opener('w'),
        help='Output fasta containing centroids')
    parser.add_argument(
        '-g', '--groups',
        metavar='FILE',
        type=Opener(),
        help="""An optional file defining groups for partitioning input
                reads. If provided, cluster weights will be normalized to
                proportionally represent each group. File is a headerless
                csv with columns "seqname","group" (.csv.bz2)""")
    parser.add_argument(
        '--min-clust-size',
        type=int,
        default=1,
        help='[%(default)s]')
    parser.add_argument(
        '-o', '--out',
        type=Opener('w'),
        default=sys.stdout,
        help='Output file with columns (readname,centroidname)')
    parser.add_argument(
        '--specimenmap',
        type=Opener('w'),
        help='Output file with columns (clustername,samplename)')
    parser.add_argument(
        '--specimen',
        metavar='SAMPLENAME',
        help='provides samplename for mapfile')
    parser.add_argument(
        '-w', '--weights',
        type=Opener('w'),
        help='Output file with columns (clustername,weight)')
def action(args):
    # for debugging:
    # pd.set_option('display.max_columns', None)
    # pd.set_option('display.max_rows', None)

    if args.intype == 'fasta':
        fa = fastalite(args.infile, limit=args.limit)
        df = pd.Series(data={f.id: f.seq for f in fa}, name='seq')
        df = df.reset_index()
        df = df.set_index('index')
        df.index.name = 'id'
        df['length'] = df['seq'].apply(len)
        column = 'length'
    else:  # args.intype == 'csv'
        df = pd.read_csv(args.infile)
        column = args.column

    xticks = args.xaxis.split(',') if args.xaxis else None

    # do not display plots
    plt.use('Agg')

    pl = df[column].plot(kind='kde', title=args.title, xticks=xticks)

    log.info('printing to {}'.format(args.out))

    pl.get_figure().savefig(args.out)
def build_parser(parser):
    parser.add_argument(
        'raw_reads',
        type=lambda f: fastalite(Opener()(f)),
        help="""input fasta file containing original clustered reads
                (default stdin).""")
    parser.add_argument(
        'readmap',
        type=Opener('r'),
        help="""output of `bioy denoise --readmap` (csv file with columns
                readname,clustername)""")
    parser.add_argument(
        '-r', '--rlefile',
        type=Csv2Dict('name', 'rle', fieldnames=['name', 'rle']),
        help="""An optional file containing run length encoding for
                infile (.csv.bz2)""")
    parser.add_argument(
        '-d', '--outdir',
        help='output directory',
        default='.')
    parser.add_argument(
        '--pattern',
        help='A regular expression matching cluster names')
    parser.add_argument(
        '-N', '--sample',
        type=int,
        default=100,
        metavar='N',
        help='include no more than N reads [%(default)s]')
    parser.add_argument(
        '--name-suffix',
        help='string to insert into name before .fasta',
        default='aln')
    parser.add_argument(
        '--no-align',
        action='store_false',
        dest='align',
        default=True)
def test02(self):
    with open(self.data('two.fasta')) as f:
        seqs = list(sequtils.fastalite(f))

    pairs = list(sequtils.all_pairwise(seqs))
    self.assertEqual(len(pairs), (len(seqs) * (len(seqs) - 1)) / 2)
    self.assertEqual(
        [s.id for s in seqs],
        list(sequtils.names_from_pairs(pairs)))
def test04(self): """ test no clusters passing min_size 1) test file is actully created 2) test there are no seqs in file """ fa = path.join(datadir, 'F1_3', 'trimmed.fasta') uc = path.join(datadir, 'F1_3', 'trimmed.uc') outdir = self.mkoutdir() fa_out = path.join(outdir, 'denoised_empty.fasta') limit = 100 min_size = sys.maxint self.main([ fa, '--clusters', uc, '--outfile', fa_out, '--limit', limit, '--min-clust-size', min_size ]) # 1) self.assertTrue(os.path.isfile(fa_out)) with open(fa_out) as out: outseqs = list(fastalite(out)) # 2) self.assertEqual(len(outseqs), 0)
def test05(self): """ test if no cluster file all one cluster """ fa = path.join(datadir, '16S_random.fasta') outdir = self.mkoutdir() fa_out = fa_out = path.join(outdir, 'denoised.fasta') self.main([fa, '--outfile', fa_out]) reference = path.join(datadir, '16S_random_cons.fasta') with open(fa_out) as out, open(reference) as ref: outseqs = list(fastalite(out)) refseqs = list(fastalite(ref)) self.assertEqual(len(outseqs), len(refseqs)) self.assertEqual(set(s.seq for s in outseqs), set(s.seq for s in refseqs))
def test02(self):
    fa = path.join(datadir, 'F1_3', 'trimmed.fasta')
    uc = path.join(datadir, 'F1_3', 'trimmed.uc')
    outdir = self.mkoutdir()
    fa_out = path.join(outdir, 'denoised.fasta')
    limit = 100
    min_size = 2
    self.main([
        fa,
        '--clusters', uc,
        '--outfile', fa_out,
        '--limit', limit,
        '--min-clust-size', min_size
    ])

    reference = path.join(datadir, 'F1_3', 'test02_denoised.fasta')

    with open(fa_out) as out, open(reference) as ref:
        outseqs = list(fastalite(out))
        refseqs = list(fastalite(ref))
        self.assertEqual(len(outseqs), len(refseqs))
        self.assertEqual(set(s.seq for s in outseqs),
                         set(s.seq for s in refseqs))
def test05(self): """ test if no cluster file all one cluster """ fa = path.join(datadir, '16S_random.fasta') outdir = self.mkoutdir() fa_out = fa_out = path.join(outdir, 'denoised.fasta') self.main([fa, '--outfile', fa_out]) reference = path.join(datadir, '16S_random_cons.fasta') with open(fa_out) as out, open(reference) as ref: outseqs = list(fastalite(out)) refseqs = list(fastalite(ref)) self.assertEqual(len(outseqs), len(refseqs)) self.assertEqual(set(s.seq for s in outseqs), set(s.seq for s in refseqs))
def test02(self): fa = path.join(datadir, 'F1_3', 'trimmed.fasta') uc = path.join(datadir, 'F1_3', 'trimmed.uc') outdir = self.mkoutdir() fa_out = path.join(outdir, 'denoised.fasta') limit = 100 min_size = 2 self.main([ fa, '--clusters', uc, '--outfile', fa_out, '--limit', limit, '--min-clust-size', min_size ]) reference = path.join(datadir, 'F1_3', 'test02_denoised.fasta') with open(fa_out) as out, open(reference) as ref: outseqs = list(fastalite(out)) refseqs = list(fastalite(ref)) self.assertEqual(len(outseqs), len(refseqs)) self.assertEqual(set(s.seq for s in outseqs), set(s.seq for s in refseqs))
def build_parser(parser):
    parser.add_argument('fasta',
                        type=lambda f: fastalite(Opener()(f)),
                        help='input fasta file')
    parser.add_argument('-l', '--left-aligns',
                        type=Opener(),
                        help='left primer ssearch36 alignment results')
    parser.add_argument('-r', '--right-aligns',
                        type=Opener(),
                        help='right primer ssearch36 alignment results')
    parser.add_argument('--left-range',
                        metavar='START,STOP',
                        help='Range of acceptable left primer start positions')
    parser.add_argument('--left-zscore',
                        metavar='VALUE',
                        type=float,
                        help='Min acceptable left primer z-score')
    parser.add_argument('--right-range',
                        metavar='START,STOP',
                        help=('Range of acceptable right '
                              'primer start positions'))
    parser.add_argument('--right-zscore',
                        metavar='VALUE',
                        type=float,
                        help='Min acceptable right primer z-score')
    parser.add_argument('--left-expr',
                        help=('python expression defining '
                              'criteria for keeping left primer'))
    parser.add_argument('--right-expr',
                        help=('python expression defining criteria '
                              'for keeping right primer'))
    parser.add_argument('-o', '--fasta-out',
                        type=Opener('w'),
                        default=sys.stdout,
                        help='trimmed fasta output file')
    parser.add_argument('--rle',
                        type=Csv2Dict('name', 'rle',
                                      fieldnames=['name', 'rle']),
                        help='rle input file (required if --rle-out)')
    parser.add_argument(
        '--rle-out',
        type=lambda f: DictWriter(Opener('w')(f), fieldnames=['name', 'rle']),
        help='trimmed rle output file')
    parser.add_argument('-i', '--include-primer',
                        action='store_true',
                        default=False,
                        help='Include primer in trimmed sequence')
    parser.add_argument('--keep-all-seqs',
                        action='store_true',
                        help='keep seqs that fall outside the trimming thresholds')
def build_parser(parser):
    parser.add_argument(
        'seqs',
        type=lambda f: fastalite(Opener()(f), readfile=False),
        help='Input fasta file')
    parser.add_argument(
        'rle',
        type=Opener(),
        help='csv file (may be bzip encoded) containing columns "name","rle"')
    parser.add_argument(
        '-o', '--outfile',
        type=Opener('w'),
        default=sys.stdout,
        help='Name of output file')
def action(args):
    fieldnames = args.get or ['id', 'description', 'seq']

    # make into [[column name, new name] ...]
    fieldnames = [f.split(':') for f in fieldnames]
    fieldnames = [f * (2 if len(f) == 1 else 1) for f in fieldnames]

    out = csv.DictWriter(args.out,
                         fieldnames=map(operator.itemgetter(1), fieldnames),
                         extrasaction='ignore')
    out.writeheader()

    for f in fastalite(args.fasta):
        f = f._asdict()
        out.writerow({v: f[k] for k, v in fieldnames})
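The list-multiplication step above is what turns a bare column name into
an [old, new] pair; a toy illustration with made-up field specs:

specs = ['id', 'seq:sequence']
specs = [f.split(':') for f in specs]   # [['id'], ['seq', 'sequence']]
# single-element lists are doubled (list * 2) so every spec is [old, new]
specs = [f * (2 if len(f) == 1 else 1) for f in specs]
print(specs)                            # [['id', 'id'], ['seq', 'sequence']]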
def build_parser(parser):
    parser.add_argument(
        'fasta',
        type=lambda f: fastalite(opener(f)),
        help='input file containing raw reads')
    parser.add_argument(
        '--sample-id',
        help='sample id to pull reads for')
    parser.add_argument(
        '--map-file',
        type=Csv2Dict(value='sample_id',
                      fieldnames=['sequence_id', 'sample_id']),
        help='csv(.bz2) file containing sequence_id,sample_id in the rows.')
    parser.add_argument(
        '-o', '--out',
        type=Opener('w'),
        default=sys.stdout,
        help='fasta output file')
def test01(self):
    with open(self.data('five.fasta')) as f, \
            open(self.data('five.fasta')) as r:
        seqs = sequtils.fastalite(f)
        raw = r.read()

        fasta = ''
        for seq in seqs:
            fasta += '>{}\n{}\n'.format(seq.description, seq.seq)
            log.debug('{}'.format(seq))

        self.assertEqual(raw.replace('\n', ''), fasta.replace('\n', ''))
def test01(self):
    fa = path.join(datadir, 'F1_3', 'trimmed.fasta')
    uc = path.join(datadir, 'F1_3', 'trimmed.uc')
    outdir = self.mkoutdir()
    fa_out = path.join(outdir, 'denoised.fasta')
    limit = 100
    self.main(
        [fa, '--clusters', uc, '--outfile', fa_out, '--limit', limit])

    # cluster mass equals number of input sequences
    with open(fa_out) as f:
        cluster_mass = sum(
            int(line.split('_')[-1]) for line in f if line.startswith('>'))
        self.assertEqual(limit, cluster_mass)

    # regression test
    reference = path.join(datadir, 'F1_3', 'test01_denoised.fasta')

    with open(fa_out) as out, open(reference) as ref:
        outseqs = list(fastalite(out))
        refseqs = list(fastalite(ref))
        self.assertEqual(len(outseqs), len(refseqs))
        self.assertEqual(set(s.seq for s in outseqs),
                         set(s.seq for s in refseqs))
def action(args):
    if args.seqname:
        seqname = args.seqname
    else:
        seqname = 'consensus' if args.infile is sys.stdin \
            else splitext(basename(args.infile.name))[0]

    seqs = list(fastalite(args.infile))

    if args.rlefile:
        rledict = json.load(args.rlefile)
        rlelist = [rledict[s.id] for s in seqs]
        cons = consensus(seqs, rlelist, degap=not args.gaps)
    else:
        cons = consensus(seqs, degap=not args.gaps)

    args.outfile.write('>{}\n{}\n'.format(seqname, cons))
def test01(self):
    out = self.mkoutdir()
    trimmed = path.join(out, 'trimmed.fasta')
    args = [
        self.data('rle_100.fasta'),
        '--left-aligns', self.data('rle_100_left_ssearch.csv.bz2'),
        '--right-aligns', self.data('rle_100_right_ssearch.csv.bz2'),
        '--fasta-out', trimmed
    ]
    self.main(args)

    self.assertTrue(path.exists(trimmed))

    with open(trimmed) as f:
        self.assertEqual(len(list(fastalite(f))), 99)
def action(args):
    fasta = fastalite(args.fasta)

    spec_map = DictReader(args.specimen_map,
                          fieldnames=['readname', 'specimen'])
    spec_map = {s['readname']: s['specimen'] for s in spec_map}

    def by_specimen(f):
        return spec_map[f.id]

    groups = sorted(fasta, key=by_specimen)
    groups = groupby(groups, key=by_specimen)

    for spec, fasta in groups:
        fasta = ('>{}\n{}'.format(f.description, f.seq) for f in fasta)
        fasta = '\n'.join(fasta)
        filename = path.join(args.outdir, '{}.fasta.bz2'.format(spec))
        with opener(filename, 'w') as out:
            out.write(fasta)
def action(args):
    if args.clusters:
        _, file_ext = os.path.basename(args.clusters.name).split('.')
        if file_ext == 'uc':
            clusters = parse_uc(args.clusters)[0]
        else:
            clusters = {seq: tag for seq, tag in csv.reader(args.clusters)}
        by_clusters = lambda s: clusters.get(s.id, s.id)
    else:
        by_clusters = lambda _: 'all one cluster'

    seqs = fastalite(args.fastafile)
    seqs = islice(seqs, args.limit)
    seqs = sorted(seqs, key=by_clusters)
    grouped_seqs = groupby(seqs, key=by_clusters)
    chunks = ichunker((group for _, group in grouped_seqs),
                      args.rlefile,
                      args.min_clust_size,
                      args.max_clust_size)

    # calculate consensus for each cluster, then accumulate names of
    # each set of identical consensus sequences in `exemplars` (keys
    # are the consensus sequences themselves).
    exemplars = defaultdict(list)
    pool = Pool(processes=args.threads)
    for cluster, cons in map(align_and_consensus, enumerate(chunks, start=1)):
        exemplars[cons].extend([c.id for c in cluster])

    # calculate ratios of reads for the smallest group to each of the
    # other groups. outseqs is a list of (weight, consensus, list_of_names)
    if args.groups and exemplars:
        groups = dict(csv.reader(args.groups))
        group_counts = Counter(
            groups[name] for name in chain.from_iterable(exemplars.values()))
        most_common = group_counts.most_common()
        _, least_common = most_common[-1]
        weights = {k: float(least_common) / v for k, v in most_common}
        outseqs = [(sum(weights[groups[n]] for n in names), cons, names)
                   for cons, names in exemplars.items()]
    else:
        outseqs = [(len(names), cons, names)
                   for cons, names in exemplars.items()]

    # write each consensus sequence in descending order of weight
    outseqs.sort(reverse=True, key=itemgetter(0))
    for i, (weight, cons, names) in enumerate(outseqs, start=1):
        name_elements = [
            args.name_prefix,
            'cons{:04}'.format(i),
            '{:.0f}'.format(weight),
            args.name_suffix
        ]
        consname = args.name_delimiter.join([e for e in name_elements if e])

        log.debug('writing {}'.format(consname))
        args.outfile.write('>{}\n{}\n'.format(consname, cons))

        if args.readmap:
            args.readmap.writerows((name, consname) for name in names)
        if args.clustermap and args.specimen:
            args.clustermap.writerow((consname, args.specimen))
        if args.weights:
            args.weights.writerow((consname, weight))
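A worked toy example of the group-weight normalization above, using
made-up group labels; reads from the larger group are down-weighted so
that each group contributes equal total mass:

from collections import Counter

group_counts = Counter({'groupA': 40, 'groupB': 10})  # reads per group
most_common = group_counts.most_common()              # [('groupA', 40), ('groupB', 10)]
_, least_common = most_common[-1]                     # size of the smallest group: 10
weights = {k: float(least_common) / v for k, v in most_common}
print(weights)  # {'groupA': 0.25, 'groupB': 1.0}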
def action(args):
    if args.remote and not args.remote_database:
        log.error("bioy blast: error: please specify a remote database")
        return
    elif not args.remote and not args.database:
        log.error("bioy blast: error: please specify path to local database")
        return

    command = ['blastn']
    command += ['-query', args.fasta]

    if args.remote:
        command += ['-remote']
        command += ['-db', args.remote_database]
    else:
        command += ['-db', args.database]
        command += ['-num_threads', str(args.threads)]

    command += ['-perc_identity', args.id]
    command += ['-outfmt', '6 ' + args.outfmt.replace(',', ' ')]
    command += ['-strand', args.strand]

    if args.max:
        command += ['-max_target_seqs', args.max]

    log.info(' '.join(command))

    if args.dry_run:
        sys.exit(0)

    pipe = Popen(command, stdout=PIPE, stderr=PIPE)
    results, errors = pipe.communicate()

    if errors:
        log.error(errors)

    # split tab lines
    lines = (r.strip().split('\t') for r in StringIO(results))

    header = args.outfmt.split(',')

    # match with fieldnames
    lines = (zip(header, l) for l in lines)

    # make into dicts
    lines = [dict(l) for l in lines]

    # replace blast's local alignment query coverage with a global
    # coverage calculation
    if 'qcovs' in args.outfmt.split(',') or isinstance(args.coverage, float):
        for l in lines:
            l['qcovs'] = (float(l['qend']) - float(l['qstart']) + 1) \
                / float(l['qlen']) * 100
            l['qcovs'] = '{0:.2f}'.format(l['qcovs'])

    if isinstance(args.coverage, float):
        lines = [l for l in lines if float(l['qcovs']) >= args.coverage]

    if args.nohits:
        # to get nohits we first need to know about the hits
        qids = groupby(lines, key=itemgetter('qseqid'))
        qids = set(q for q, _ in qids)

        # now we can build a list of nohits
        nohits = []
        for q in fastalite(opener(args.fasta)):
            if q.id not in qids:
                nohits.append(q)

        # convert nohits into DictWriter format
        nohits = (dict(qseqid=q.id) for q in nohits)

        # append to lines
        lines = chain(lines, nohits)

    out = DictWriter(args.out, fieldnames=header, extrasaction='ignore')

    if args.header:
        out.writeheader()

    out.writerows(lines)
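The coverage replacement above computes global query coverage as
(qend - qstart + 1) / qlen * 100; a quick worked case with toy alignment
coordinates:

# a hit aligning query positions 21..120 of a 200 bp query
qstart, qend, qlen = 21.0, 120.0, 200.0
qcovs = (qend - qstart + 1) / qlen * 100
print('{0:.2f}'.format(qcovs))  # 50.00 -- 100 of 200 bases covered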
def setUp(self):
    self.outdir = self.mkoutdir()
    with open(self.data('two.fasta')) as f:
        self.seqs = list(sequtils.fastalite(f))
def test02(self):
    with open(self.data('five.fasta')) as f:
        seqs = sequtils.fastalite(f)
        for seq in seqs:
            pass
def action(args):
    seqs = fastalite(args.sequences)

    # sort seqs by group information
    if args.split_info:
        primary, secondary = args.primary_group, args.secondary_group
        info_reader = csv.DictReader(args.split_info)
        info = {r['seqname']: r for r in info_reader}

        # group tag sequences if info_file exists
        def group_tag(seq):
            i = info[seq.id]
            group = (i[primary] or i[secondary]) if secondary else i[primary]
            return dict(group=group, seq=seq)

        seqs = (group_tag(s) for s in seqs)

        # group the sequences by tags
        seqs = sorted(seqs, key=itemgetter('group'))
        seqs = groupby(seqs, key=itemgetter('group'))

        # just need the seqs
        seqs = ((pair['seq'] for pair in group) for _, group in seqs)
    else:
        seqs = (seqs,)

    # set up output files
    if args.out_info and args.split_info:
        info_out = csv.DictWriter(args.out_info,
                                  fieldnames=info_reader.fieldnames)
        info_out.writeheader()

    if args.out_map:
        map_out = csv.DictWriter(args.out_map, fieldnames=['kept', 'orig'])

    if args.out_weights:
        weights_out = csv.DictWriter(args.out_weights,
                                     fieldnames=['kept', 'kept', 'weight'])

    # dedup seqs by groups
    for group in seqs:
        weights = Counter()
        deduped = {}
        for orig in group:
            # checksums are faster to manage
            clean = orig.seq.replace('\n', '').upper()
            checksum = hashlib.sha1(clean).hexdigest()

            if checksum in deduped:
                kept = deduped[checksum]
            else:
                kept = deduped[checksum] = orig
                args.out.write('>{}\n{}\n'.format(kept.description, kept.seq))
                if args.out_info and args.split_info:
                    info_out.writerow(info[kept.id])

            if args.out_weights:
                weights[kept.id] += 1

            if args.out_map:
                map_out.writerow(dict(kept=kept.id, orig=orig.id))

        for kept_id, count in weights.items():
            weights_out.writerow(dict(kept=kept_id, weight=count))
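A minimal standalone sketch of the checksum-based dedup idea above, with
toy reads (the names are made up, and the real code keeps whole sequence
records rather than names):

import hashlib
from collections import Counter

reads = [('r1', 'ACGT'), ('r2', 'acgt'), ('r3', 'TTTT')]
deduped, weights = {}, Counter()
for name, seq in reads:
    # .encode() added so the sketch also runs on Python 3
    checksum = hashlib.sha1(seq.replace('\n', '').upper().encode()).hexdigest()
    kept = deduped.setdefault(checksum, name)  # first read with a given sequence wins
    weights[kept] += 1
print(weights)  # Counter({'r1': 2, 'r3': 1}) -- r2 collapses into r1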