def test01(self):
    """parse_uc: most clusters in the test fixture are singletons."""
    infile = self.data('clusters.uc')
    with open(infile) as f:
        cluster_ids, cluster_sizes = sequtils.parse_uc(f)

    # Histogram of cluster sizes: size -> number of clusters of that
    # size. Counter(values) replaces the original manual loop over
    # cluster_sizes.items() that incremented counter[count] per cluster.
    counter = Counter(cluster_sizes.values())

    # most of the clusters are singletons
    # (assertEquals is a deprecated alias, removed in Python 3.12 --
    # use assertEqual)
    self.assertEqual(counter.most_common(1)[0][0], 1)
def action(args):
    """Write one consensus sequence per cluster of input reads.

    Reads sequences from ``args.fastafile``, groups them by cluster
    assignment (``args.clusters``: a usearch ``.uc`` file or a two-column
    csv of ``seq_id,cluster``), computes a consensus for each chunk of
    clustered reads, optionally weights each consensus by group
    representation (``args.groups``), and writes fasta output to
    ``args.outfile`` plus optional readmap/clustermap/weights csv rows.
    """
    if args.clusters:
        # Filename must have exactly two dot-separated parts
        # (e.g. 'clusters.uc'); the extension selects the parser.
        _, file_ext = os.path.basename(args.clusters.name).split('.')
        if file_ext == 'uc':
            clusters = parse_uc(args.clusters)[0]
        else:
            clusters = {seq: tag for seq, tag in csv.reader(args.clusters)}
        # Sequences absent from the mapping fall back to their own id,
        # i.e. each becomes its own singleton cluster.
        by_clusters = lambda s: clusters.get(s.id, s.id)
    else:
        by_clusters = lambda _: 'all one cluster'

    seqs = fastalite(args.fastafile)
    seqs = islice(seqs, args.limit)

    # groupby requires its input sorted by the same key
    seqs = sorted(seqs, key=by_clusters)
    grouped_seqs = groupby(seqs, key=by_clusters)
    chunks = ichunker((group for _, group in grouped_seqs),
                      args.rlefile, args.min_clust_size, args.max_clust_size)

    # calculate consensus for each cluster, then accumulate names of
    # each set of identical consensus sequences in `exemplars` (keys
    # are the consensus sequences themselves).
    exemplars = defaultdict(list)

    # BUG FIX: the pool was previously created but never used -- the
    # loop called the *builtin* map, so all work ran serially and
    # args.threads was silently ignored; the pool was also never
    # closed. Run the work through the pool and let the context
    # manager terminate it. imap preserves the lazy, ordered
    # semantics of the original builtin map.
    # NOTE(review): assumes align_and_consensus and the chunk items
    # are picklable -- confirm against ichunker's output type.
    with Pool(processes=args.threads) as pool:
        for cluster, cons in pool.imap(align_and_consensus,
                                       enumerate(chunks, start=1)):
            exemplars[cons].extend([c.id for c in cluster])

    # calculate ratios of reads for the smallest group to each of the
    # other groups. outseqs is a list of
    # (weight, consensus, list_of_names)
    if args.groups and exemplars:
        groups = dict(csv.reader(args.groups))
        group_counts = Counter(
            groups[name] for name in chain.from_iterable(exemplars.values()))
        most_common = group_counts.most_common()
        # most_common() is sorted descending, so the last entry is the
        # smallest group; its count normalizes all the weights.
        _, least_common = most_common[-1]
        weights = {k: float(least_common) / v for k, v in most_common}
        outseqs = [(sum(weights[groups[n]] for n in names), cons, names)
                   for cons, names in exemplars.items()]
    else:
        # No group weighting: weight is simply the number of reads
        # contributing to each consensus.
        outseqs = [(len(names), cons, names)
                   for cons, names in exemplars.items()]

    # write each consensus sequence in descending order of weight
    outseqs.sort(reverse=True, key=itemgetter(0))
    for i, (weight, cons, names) in enumerate(outseqs, start=1):
        name_elements = [args.name_prefix,
                         'cons{:04}'.format(i),
                         '{:.0f}'.format(weight),
                         args.name_suffix]
        # drop empty elements so optional prefix/suffix don't leave
        # stray delimiters
        consname = args.name_delimiter.join([e for e in name_elements if e])
        log.debug('writing {}'.format(consname))
        args.outfile.write('>{}\n{}\n'.format(consname, cons))
        if args.readmap:
            args.readmap.writerows((name, consname) for name in names)
        if args.clustermap and args.specimen:
            args.clustermap.writerow((consname, args.specimen))
        if args.weights:
            args.weights.writerow((consname, weight))
def action(args):
    """Emit a consensus sequence for each cluster of input reads.

    Sequences from ``args.fastafile`` are bucketed by the cluster
    mapping in ``args.clusters`` (usearch ``.uc`` output or a
    ``seq_id,cluster`` csv), a consensus is computed per chunk, results
    are optionally weighted by group membership (``args.groups``), and
    fasta records go to ``args.outfile`` along with any requested
    readmap/clustermap/weights csv rows.
    """
    if not args.clusters:
        key_fn = lambda _: 'all one cluster'
    else:
        _, extension = os.path.basename(args.clusters.name).split('.')
        if extension == 'uc':
            mapping = parse_uc(args.clusters)[0]
        else:
            mapping = dict(csv.reader(args.clusters))
        # unmapped ids become their own singleton cluster
        key_fn = lambda s: mapping.get(s.id, s.id)

    # groupby only groups adjacent records, so sort by the same key first
    records = sorted(islice(fastalite(args.fastafile), args.limit),
                     key=key_fn)
    chunks = ichunker((members for _, members in groupby(records, key=key_fn)),
                      args.rlefile, args.min_clust_size, args.max_clust_size)

    # Accumulate read names per distinct consensus sequence; the
    # consensus string itself is the key.
    exemplars = defaultdict(list)
    # NOTE(review): this pool is never used or closed -- the builtin
    # map below runs serially, so args.threads has no effect; confirm
    # whether pool.map was intended.
    pool = Pool(processes=args.threads)
    for cluster, cons in map(align_and_consensus,
                             enumerate(chunks, start=1)):
        exemplars[cons].extend([member.id for member in cluster])

    # Weight each consensus: with group data, scale per-read
    # contributions so the smallest group's reads count as 1.0;
    # otherwise weight is the raw read count. outseqs holds
    # (weight, consensus, list_of_names) triples.
    if args.groups and exemplars:
        groups = dict(csv.reader(args.groups))
        tallies = Counter(groups[name]
                          for name in chain.from_iterable(exemplars.values()))
        ranked = tallies.most_common()
        smallest = ranked[-1][1]
        scale = {grp: float(smallest) / count for grp, count in ranked}
        outseqs = [(sum(scale[groups[n]] for n in names), cons, names)
                   for cons, names in exemplars.items()]
    else:
        outseqs = [(len(names), cons, names)
                   for cons, names in exemplars.items()]

    # Emit consensus records heaviest-first.
    outseqs.sort(reverse=True, key=itemgetter(0))
    for i, (weight, cons, names) in enumerate(outseqs, start=1):
        parts = [args.name_prefix,
                 'cons{:04}'.format(i),
                 '{:.0f}'.format(weight),
                 args.name_suffix]
        # skip empty parts so optional prefix/suffix leave no stray
        # delimiters
        consname = args.name_delimiter.join([p for p in parts if p])
        log.debug('writing {}'.format(consname))
        args.outfile.write('>{}\n{}\n'.format(consname, cons))
        if args.readmap:
            args.readmap.writerows((name, consname) for name in names)
        if args.clustermap and args.specimen:
            args.clustermap.writerow((consname, args.specimen))
        if args.weights:
            args.weights.writerow((consname, weight))