"""Print the top ``editdist_counter`` key of every cluster in a CD-HIT file.

Command-line script.  It takes the path to a CD-HIT cluster file and the path
to the read index file, iterates over the clusters yielded by
``utils.ClusterIO.parse`` and, for each cluster, prints the key of the first
entry returned by ``cluster.editdist_counter.most_common()``.

NOTE(review): the argparse description and the comment inside the loop talk
about correcting representative sequences and writing clusters to a new file,
but the code visible here only prints.  The remaining logic presumably lies
outside this part of the file -- confirm before relying on this script.
"""
import argparse
from collections import Counter

from Bio import SeqIO

from utils.ClusterIO import sortby, parse, Cluster

# Setup parser
parser = argparse.ArgumentParser(
    description='Procedure to get the genuine representative sequences for each clusters in a CDHIT.clstr file')
parser.add_argument('infile', action='store',
                    help='Path to the input CD-HIT file for all reads')
parser.add_argument('indexfile', action='store',
                    help='Path to the index file for all reads')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
args = parser.parse_args()

# Run through all Clusters
cluster_gen = parse(args.infile, idx_file_path=args.indexfile,
                    edit_dist=False, similarity_count=True)

# Slow, Only do this once.
seqrec_lookup = SeqIO.index_db(args.indexfile)

# Neither counter is updated by the code visible here; both are kept because
# later parts of the script may rely on them.
rep_seq_correction_count = 0
clusters_count = 0

# Opening in 'wb' mode creates/truncates 'clusters.temp' even though nothing
# is written to it in the visible part of the loop.
with open('clusters.temp', 'wb') as clust_file:
    for cluster in cluster_gen:
        # if fraction of reads 100% similar to representative seq is in majority,
        # skip and write to new file
        seq_similarity_ranking = cluster.editdist_counter.most_common()

        # Call form with a single argument: prints the same text on Python 2
        # and stays valid syntax on Python 3 (the bare print statement is not).
        # NOTE(review): raises IndexError if a cluster's counter is empty.
        print(seq_similarity_ranking[0][0])
# Command-line script fragment: reads clusters from standard input via
# ``parse`` and tallies how many clusters there are of each size; per its
# argparse description it is meant to write FastQ output for the clusters.

# ``os`` and ``sys`` are used below but no import for them is visible in this
# part of the file; importing them again is harmless if they already exist.
import os
import sys

# Setup parser
parser = argparse.ArgumentParser(
    description='Procedure to write FastQ output file for clusters in a CDHIT.clstr file')
parser.add_argument('indexfile', action='store',
                    help='Path to the index file for all reads')
# NOTE(review): the argument name is spelled 'ouputdir' (sic).  It is kept
# unchanged because it is the attribute name on ``args``.
parser.add_argument('ouputdir', action='store',
                    help='Directory where outputs are stored')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')
args = parser.parse_args()

# Make sure the output directory exists before anything is written to it.
if not os.path.exists(args.ouputdir):
    os.makedirs(args.ouputdir)

# The positional argument is declared as 'indexfile', so that is the attribute
# argparse sets on the namespace; 'args.indexfilepath' does not exist and
# raised AttributeError.
db = SeqIO.index_db(args.indexfile)
# NOTE(review): for a bare or relative index path this is '' or a relative
# directory, so the comparison with os.getcwd() below cannot match -- confirm
# whether an absolute path was intended.
idx_file_dir = os.path.split(args.indexfile)[0]

# Clusters are read from standard input rather than from a file argument.
cluster_gen = parse(sys.stdin, idx_file_path=args.indexfile, edit_dist=False)

size_counter = Counter()  # keyed by str(cluster.size) -> number of clusters
count = 0                 # total number of clusters seen

for cluster in cluster_gen:
    count += 1

    # Write to output file
    size_counter[str(cluster.size)] += 1

    # Get the sequence records for the cluster
    seqs = []
    if os.getcwd() != idx_file_dir:
        # NOTE(review): the visible source ends right after this condition, so
        # the original body of this branch is unknown.  ``pass`` is only a
        # placeholder that keeps the block syntactically complete; restore the
        # real body from the full file.
        pass