def action(args):
    seqs = fastalite(args.fasta)
    pairs = list(all_pairwise(seqs))

    if args.distance:
        pairs = [(q, t, 1 - i) for q, t, i in pairs]

    if args.split_info and args.matrix_out:
        primary, secondary = args.primary_group, args.secondary_group
        split_info = list(csv.DictReader(args.split_info))
        info = {r['seqname']: r for r in split_info if r['seqname']}
        tax = {r['tax_id']: r for r in split_info}

        # mirror the pairs so both (query, target) and (target, query) exist
        pairs += map(itemgetter(1, 0, 2), pairs)

        def group(seqname):
            i = info[seqname]
            return (i[primary] or i[secondary]) if secondary else i[primary]

        pairs = ((group(left), group(right), score)
                 for left, right, score in pairs)

        # sort and group rows
        pairs = list(groupbyl(pairs, key=itemgetter(0)))

        matrix_out = csv.writer(args.matrix_out)

        # this is the tax_id order we will be using for columns
        tax_ids = map(itemgetter(0), pairs)

        # output the species names as the first row
        matrix_out.writerow([''] + [tax[t]['tax_name'] for t in tax_ids])

        # iterate through the sorted rows (pairs)
        for row_id, columns in pairs:
            # sort and group columns
            columns = dict(groupbyl(columns, key=itemgetter(1)))

            # the species name is the first cell of the row
            row = [tax[row_id]['tax_name']]

            for t in tax_ids:
                # if t is not in columns there is only one sequence
                # representing the group, so the median distance is 0
                if t not in columns:
                    med = 0
                else:
                    col = columns[t]
                    med = median(map(itemgetter(2), col))

                # round up to two decimal places
                med = math.ceil(med * 100) / 100
                row.append(med)

            matrix_out.writerow(row)
    else:
        writer = csv.writer(args.out)
        writer.writerow(['query', 'target', 'identity'])
        writer.writerows(pairs)
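# NOTE: the functions in this module lean heavily on a groupbyl() helper
# defined elsewhere in the package. The definition below is only a minimal
# sketch inferred from how it is called here (sort by key, then group,
# returning (key, list-of-items) pairs); it is not the package's actual
# implementation.
from itertools import groupby


def groupbyl(iterable, key=None):
    """Sort `iterable` by `key` and return a list of (key, group) pairs,
    each group materialized as a list."""
    rows = sorted(iterable, key=key)
    return [(k, list(g)) for k, g in groupby(rows, key=key)]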
def action(args):
    rows = csv.DictReader(args.clusters, delimiter='\t',
                          fieldnames=UCLUST_HEADERS)

    # group by centroid
    grouped = groupbyl(get_mapping(rows), key=itemgetter(0))
    clusters = {c: rows for c, rows in grouped
                if len(rows) >= args.min_clust_size}

    readmap = csv.writer(args.out)
    specimenmap = (csv.writer(args.specimenmap)
                   if (args.specimenmap and args.specimen) else None)
    weights = csv.writer(args.weights) if args.weights else None

    # Calculate the ratio of reads in the smallest group to each of the
    # other groups.
    if args.groups:
        groups = dict(csv.reader(args.groups))
        most_common = Counter(groups.values()).most_common()
        _, least_common = most_common[-1]
        wdict = {k: float(least_common) / v for k, v in most_common}

    for centroid, cluster in clusters.iteritems():
        log.info('writing {}'.format(centroid))

        for _centroid, read in cluster:
            readmap.writerow([read, centroid])

        if specimenmap:
            specimenmap.writerow((centroid, args.specimen))

        if weights:
            if args.groups:
                # normalize the weight of each cluster by the contribution
                # of reads from each group defined in --groups
                weights.writerow((centroid,
                                  sum(wdict[groups[r]] for c, r in cluster)))
            else:
                weights.writerow((centroid, len(cluster)))

    # filter out non-centroid seqs
    if args.fasta_in and args.fasta_out:
        for c in (c for c in args.fasta_in if c.id in clusters):
            args.fasta_out.write('>{}\n{}\n'.format(c.description, c.seq))
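# Illustration only (made-up read/group labels): how the --groups
# normalization above behaves. Reads from larger groups are down-weighted
# so that each group contributes as if it were the smallest one.
from collections import Counter

example_groups = {'r1': 'a', 'r2': 'a', 'r3': 'a', 'r4': 'b'}  # hypothetical read -> group map
example_most_common = Counter(example_groups.values()).most_common()  # [('a', 3), ('b', 1)]
_, example_least = example_most_common[-1]                            # size of the smallest group
example_wdict = {k: float(example_least) / v for k, v in example_most_common}
# example_wdict == {'a': 0.333..., 'b': 1.0}; a cluster's weight is then the
# sum of wdict[group-of-read] over its member reads.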
def condense(queries,
             floor_rank,
             max_size,
             ranks,
             rank_thresholds,
             target_rank=None):
    target_rank = target_rank or ranks[0]

    groups = list(groupbyl(queries, key=itemgetter(target_rank)))

    num_groups = len(groups)

    if rank_thresholds.get(target_rank, max_size) < num_groups:
        return queries

    # assign target_rank_ids where available; groups without a key value
    # remain assigned at the previous (higher) rank
    for g in (g for i, g in groups if i):
        for q in g:
            q['target_rank_id'] = q[target_rank]

    # return if we hit the floor
    if target_rank == floor_rank:
        return queries

    # else move down a rank
    target_rank = ranks[ranks.index(target_rank) + 1]

    # recurse down the tax tree
    condensed = []
    for _, g in groups:
        c = condense(g,
                     floor_rank,
                     max_size,
                     ranks,
                     rank_thresholds,
                     target_rank)
        condensed.extend(c)

    return condensed
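# Hypothetical usage of condense(): the ranks, thresholds, and query rows
# below are made up to show the calling convention only. condense() walks
# down the rank list, tagging each query with a 'target_rank_id' at the most
# specific rank whose group count stays within the threshold (max_size is
# used for ranks with no explicit threshold).
example_ranks = ['order', 'family', 'genus', 'species']
example_thresholds = {'species': 2}  # allow at most 2 species-level groups

example_queries = [
    {'order': 'o1', 'family': 'f1', 'genus': 'g1', 'species': 's1'},
    {'order': 'o1', 'family': 'f1', 'genus': 'g1', 'species': 's2'},
]

example_condensed = condense(example_queries,
                             floor_rank='species',
                             max_size=3,
                             ranks=example_ranks,
                             rank_thresholds=example_thresholds)
# both queries keep species-level ids ('s1', 's2'); with
# example_thresholds = {'species': 1} they would instead stay at 'g1'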
def action(args):
    ### format blast data and add additional available information
    fieldnames = None if args.has_header else sequtils.BLAST_HEADER_DEFAULT
    blast_results = DictReader(args.blast_file, fieldnames=fieldnames)
    blast_results = list(blast_results)

    sseqids = set(s['sseqid'] for s in blast_results)
    qseqids = set(s['qseqid'] for s in blast_results)

    # load seq_info and map file
    mapfile = DictReader(args.map, fieldnames=['name', 'specimen'])
    mapfile = {m['name']: m['specimen']
               for m in mapfile if m['name'] in qseqids}

    seq_info = DictReader(args.seq_info)
    seq_info = {s['seqname']: s
                for s in seq_info if s['seqname'] in sseqids}

    # pident
    def pident(b):
        return dict(b, pident=float(b['pident'])) if b['sseqid'] else b

    blast_results = (pident(b) for b in blast_results)

    # coverage
    def cov(b):
        if b['sseqid'] and b['qcovs']:
            b['coverage'] = float(b['qcovs'])
            return b
        elif b['sseqid']:
            c = coverage(b['qstart'], b['qend'], b['qlen'])
            return dict(b, coverage=c)
        else:
            return b

    blast_results = (cov(b) for b in blast_results)

    # seq info
    def info(b):
        return dict(seq_info[b['sseqid']], **b) if b['sseqid'] else b

    blast_results = (info(b) for b in blast_results)

    # tax info
    def tax_info(b):
        return dict(args.taxonomy[b['tax_id']], **b) if b['sseqid'] else b

    blast_results = (tax_info(b) for b in blast_results)

    ### output file headers
    fieldnames = ['specimen', 'max_percent', 'min_percent', 'max_coverage',
                  'min_coverage', 'assignment_id', 'assignment']

    if args.weights:
        weights = DictReader(args.weights, fieldnames=['name', 'weight'])
        weights = {d['name']: d['weight']
                   for d in weights if d['name'] in qseqids}
        fieldnames += ['clusters', 'reads', 'pct_reads']
    else:
        weights = {}

    if args.copy_numbers:
        copy_numbers = DictReader(args.copy_numbers)
        copy_numbers = {d['tax_id']: float(d['median']) for d in copy_numbers}
        fieldnames += ['corrected', 'pct_corrected']
    else:
        copy_numbers = {}

    # TODO: take out target_rank, hi, low and provide in pipeline using csvmod
    # TODO: option to include tax_ids (default no)
    fieldnames += ['target_rank', 'hi', 'low', 'tax_ids']

    ### Columns
    out = DictWriter(args.out, extrasaction='ignore', fieldnames=fieldnames)
    out.writeheader()

    if args.out_detail:
        args.out_detail.writeheader()

    def blast_hit(hit, args):
        return (hit['sseqid'] and
                hit[args.target_rank] and
                hit['coverage'] >= args.coverage and
                float(weights.get(hit['qseqid'], 1)) >= args.min_cluster_size and
                hit[args.target_rank] not in args.exclude_by_taxid and
                hit['qseqid'] != hit['sseqid'] and
                int(hit['ambig_count']) <= args.max_ambiguous)

    ### Rows
    etc = '[no blast result]'  # this category will hold all unmatched hits

    # groups are prioritized by their list position
    groups = [
        ('> {}%'.format(args.max_identity),
         lambda h: blast_hit(h, args) and h['pident'] > args.max_identity),
        (None,
         lambda h: (blast_hit(h, args) and
                    args.max_identity >= h['pident'] > args.min_identity)),
        ('<= {}%'.format(args.min_identity),
         lambda h: blast_hit(h, args) and h['pident'] <= args.min_identity),
    ]

    # used later for results output
    group_cats = map(itemgetter(0), groups)
    group_cats.append(etc)

    # assignment rank thresholds
    rank_thresholds = (d.split(':') for d in args.group_def)
    rank_thresholds = dict((k, int(v)) for k, v in rank_thresholds)
    # rt = {k: int(v) for k, v in (d.split(':') for d in args.group_def)}

    # group by specimen
    if args.map:
        specimen_grouper = lambda s: mapfile[s['qseqid']]
    elif args.all_one_group:
        specimen_grouper = lambda s: args.group_label
    else:
        specimen_grouper = lambda s: s['qseqid']

    blast_results = groupbyl(blast_results, key=specimen_grouper)

    assignments = []  # assignment list for assignment ids

    for specimen, hits in blast_results:
        categories = defaultdict(list)

        # clusters will hold the query ids as hits are matched to categories
        clusters = set()

        # filter out categories
        for cat, fltr in groups:
            matches = filter(fltr, hits)

            if cat:
                categories[cat] = matches
            else:
                # create sets of tax_rank_id
                query_group = groupbyl(matches, key=itemgetter('qseqid'))

                target_cats = defaultdict(list)
                for _, queries in query_group:
                    queries = condense(queries,
                                       args.target_rank,
                                       args.target_max_group_size,
                                       sequtils.RANKS,
                                       rank_thresholds)
                    cat = map(itemgetter('target_rank_id'), queries)
                    cat = frozenset(cat)
                    target_cats[cat].extend(queries)

                categories = dict(categories, **target_cats)

            # add query ids that were matched to a filter
            clusters |= set(map(itemgetter('qseqid'), matches))

            # remove all hits corresponding to a matched query id (cluster)
            hits = filter(lambda h: h['qseqid'] not in clusters, hits)

        # remaining hits go in the etc ('no match') category
        categories[etc] = hits

        # calculate read counts
        read_counts = dict()
        for k, v in categories.items():
            qseqids = set(map(itemgetter('qseqid'), v))
            weight = sum(float(weights.get(q, 1)) for q in qseqids)
            read_counts[k] = weight

        taxids = set()
        for k, v in categories.items():
            if k is not etc:
                for h in v:
                    taxids.add(h['tax_id'])

        ### list of assigned ids for count corrections
        assigned_ids = dict()
        for k, v in categories.items():
            if k is not etc and v:
                assigned_ids[k] = set(map(itemgetter('tax_id'), v))

        # correction counts
        corrected_counts = dict()
        for k, v in categories.items():
            if k is not etc and v:
                av = mean(copy_numbers.get(t, 1) for t in assigned_ids[k])
                corrected_counts[k] = ceil(read_counts[k] / av)

        # finally, use the root copy number for the etc category
        corrected_counts[etc] = ceil(read_counts[etc] /
                                     copy_numbers.get('1', 1))

        # totals for percent calculations later
        total_reads = sum(v for v in read_counts.values())
        total_corrected = sum(v for v in corrected_counts.values())

        # print classifications per specimen sorted by number of reads
        # in reverse (descending) order
        sort_by_reads_assign = lambda (c, h): corrected_counts.get(c, None)

        for cat, hits in sorted(categories.items(),
                                key=sort_by_reads_assign,
                                reverse=True):
            # only output categories that have hits
            if hits:
                # for incrementing assignment ids
                if cat not in assignments:
                    assignments.append(cat)

                assignment_id = assignments.index(cat)

                reads = read_counts[cat]
                reads_corrected = corrected_counts[cat]

                clusters = set(map(itemgetter('qseqid'), hits))

                results = dict(
                    hi=args.max_identity,
                    low=args.min_identity,
                    target_rank=args.target_rank,
                    specimen=specimen,
                    assignment_id=assignment_id,
                    reads=int(reads),
                    pct_reads='{0:.2f}'.format(reads / total_reads * 100),
                    corrected=int(reads_corrected),
                    pct_corrected='{0:.2f}'.format(
                        reads_corrected / total_corrected * 100),
                    clusters=len(clusters))

                if cat is etc:
                    assignment = etc
                    results = dict(results, assignment=assignment)
                else:
                    taxids = set(map(itemgetter('tax_id'), hits))
                    coverages = set(map(itemgetter('coverage'), hits))
                    percents = set(map(itemgetter('pident'), hits))

                    if cat in group_cats:
                        assignment = cat
                    else:
                        names = [args.taxonomy[h['target_rank_id']]['tax_name']
                                 for h in hits]
                        selectors = [h['pident'] >= args.asterisk
                                     for h in hits]
                        assignment = sequtils.format_taxonomy(
                            names, selectors, '*')

                    results = dict(
                        results,
                        assignment=assignment,
                        max_percent='{0:.2f}'.format(max(percents)),
                        min_percent='{0:.2f}'.format(min(percents)),
                        max_coverage='{0:.2f}'.format(max(coverages)),
                        min_coverage='{0:.2f}'.format(min(coverages)),
                        tax_ids=' '.join(taxids))

                out.writerow(results)

                if args.out_detail:
                    if not args.details_full:
                        # drop the no-hit rows
                        hits = [h for h in hits if 'tax_id' in h]

                        # only report the heaviest centroid
                        clusters_and_sizes = [(float(weights.get(c, 1.0)), c)
                                              for c in clusters]
                        _, largest = max(clusters_and_sizes)
                        hits = (h for h in hits if h['qseqid'] == largest)

                    for h in hits:
                        args.out_detail.writerow(dict(
                            specimen=specimen,
                            assignment=assignment,
                            assignment_id=assignment_id,
                            hi=args.max_identity,
                            low=args.min_identity,
                            target_rank=args.target_rank,
                            **h))
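# Illustration of the --group-def parsing used above (made-up values):
# each definition is a 'rank:threshold' string, collected into the
# rank_thresholds dict consumed by condense().
example_group_def = ['species:5', 'genus:3']  # hypothetical command-line values
example_rank_thresholds = dict(
    (k, int(v)) for k, v in (d.split(':') for d in example_group_def))
# example_rank_thresholds == {'species': 5, 'genus': 3}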