Esempio n. 1
0
import argparse
from collections import Counter

from utils.ClusterIO import sortby, parse, Cluster
from Bio import SeqIO

# Setup parser
# Two positional paths (the CD-HIT cluster file and the index file for the
# reads) plus the standard --version flag.
parser = argparse.ArgumentParser(description='Procedure to get the genuine representative sequences for each clusters in a CDHIT.clstr file')
parser.add_argument('infile', action='store', help='Path to the input CD-HIT file for all reads')
parser.add_argument('indexfile', action='store', help='Path to the index file for all reads')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')

args = parser.parse_args()

# Run through all Clusters
# NOTE(review): parse() is called with edit_dist=False, yet the loop below
# reads cluster.editdist_counter -- presumably similarity_count=True is what
# populates it; confirm against utils.ClusterIO.
cluster_gen = parse(args.infile, idx_file_path=args.indexfile, edit_dist=False, similarity_count=True)

# Slow, Only do this once.
seqrec_lookup = SeqIO.index_db(args.indexfile)

# Running totals, initialised here for use while walking the clusters.
rep_seq_correction_count = 0
clusters_count = 0

with open('clusters.temp', 'wb') as clust_file:

    for cluster in cluster_gen:
        # if fraction of reads 100% similar to representative seq is in majority,
        # skip and write to new file
        # most_common() with no argument returns every (key, count) pair,
        # ordered from the most frequent key to the least frequent.
        seq_similarity_ranking = cluster.editdist_counter.most_common()

        # Show the most frequent key. print() with one parenthesised argument
        # behaves the same on Python 2 and also runs on Python 3, unlike the
        # bare print statement.
        print(seq_similarity_ranking[0][0])
Esempio n. 2
0
# Setup parser
# Two positional arguments (the read index file and the output directory)
# plus the standard --version flag.
parser = argparse.ArgumentParser(description='Procedure to write FastQ output file for clusters in a CDHIT.clstr file')
parser.add_argument('indexfile', action='store', help='Path to the index file for all reads')
# The argument keeps its historical spelling ('ouputdir') so existing
# references to args.ouputdir continue to work.
parser.add_argument('ouputdir', action='store', help='Directory where outputs are stored')
parser.add_argument('--version', action='version', version='%(prog)s 1.0')

args = parser.parse_args()

# Create the output directory (including missing parents) on first use.
if not os.path.exists(args.ouputdir):
    os.makedirs(args.ouputdir)

# The parser defines 'indexfile'; there is no 'indexfilepath' attribute on
# args, so reading it would raise AttributeError before any work is done.
db = SeqIO.index_db(args.indexfile)

# Directory part of the index file path; an empty string when the path has
# no directory component.
idx_file_dir = os.path.dirname(args.indexfile)

# The cluster data is passed to parse() via standard input rather than a path.
cluster_gen = parse(sys.stdin, idx_file_path=args.indexfile, edit_dist=False)

# Tally keyed by cluster size.
size_counter = Counter()

# Number of clusters processed so far.
count = 0
 
for cluster in cluster_gen:

    count += 1 
    
    # Write to output file
    size_counter[str(cluster.size)] += 1
    
    # Get the sequence records for the cluster 
    seqs = []
    if os.getcwd() != idx_file_dir: