def test_clusters_from_uc_file(self):
    """clusters_from_uc_file functions as expected"""
    # Seed s2 absorbs s3 into one cluster; s1 fails to cluster and
    # s2 is reported as the only new seed.
    observed = clusters_from_uc_file(self.uc_lines1)
    expected = (
        {'s2': ['s2', 's3']},  # clusters: seed -> member headers
        ['s1'],                # failures
        ['s2'],                # new seeds
    )
    self.assertEqual(observed, expected)
def test_clusters_from_uc_file(self):
    """clusters_from_uc_file functions as expected"""
    # Expected parse of self.uc_lines1: one cluster seeded by s2
    # containing s2 and s3, one failure (s1), one new seed (s2).
    self.assertEqual(
        clusters_from_uc_file(self.uc_lines1),
        ({"s2": ["s2", "s3"]}, ["s1"], ["s2"]),
    )
def test_clusters_from_uc_file_multiple_hits(self):
    """clusters_from_uc_file handles error_on_multiple_hits correctly"""
    # With error_on_multiple_hits=True, a query hitting more than one
    # seed must raise UclustParseError.
    self.assertRaises(
        UclustParseError,
        clusters_from_uc_file,
        self.uc_lines_w_multiple_hits_per_query,
        error_on_multiple_hits=True)

    # With error_on_multiple_hits=False, the multiply-hitting query (s3)
    # is listed under every cluster it hit.
    observed = clusters_from_uc_file(
        self.uc_lines_w_multiple_hits_per_query,
        error_on_multiple_hits=False)
    expected = (
        {"s2": ["s2", "s3"], "s4": ["s4", "s3"]},  # clusters
        ["s1"],                                    # failures
        ["s2", "s4"],                              # new seeds
    )
    self.assertEqual(observed, expected)
def test_clusters_from_uc_file_multiple_hits(self):
    """clusters_from_uc_file handles error_on_multiple_hits correctly"""
    # Strict mode: multiple hits for a single query is a parse error.
    self.assertRaises(
        UclustParseError,
        clusters_from_uc_file,
        self.uc_lines_w_multiple_hits_per_query,
        error_on_multiple_hits=True)

    # Permissive mode: the query appears in each cluster it hit.
    expected_clusters = {'s2': ['s2', 's3'], 's4': ['s4', 's3']}
    expected_failures = ['s1']
    expected_new_seeds = ['s2', 's4']
    self.assertEqual(
        clusters_from_uc_file(self.uc_lines_w_multiple_hits_per_query,
                              error_on_multiple_hits=False),
        (expected_clusters, expected_failures, expected_new_seeds))
def cluster_seqs(seqspath, simm, folderout='/tmp', gapopen=None, gapext=None):
    """Cluster the sequences in a fasta file with uclust.

    Parameters
    ----------
    seqspath : str
        Path to the input fasta file.
    simm : float or str
        Percent identity threshold passed to uclust as --id.
    folderout : str
        Directory where clusters.uc and clusters.log are written.
    gapopen, gapext : optional
        Gap-open / gap-extension penalties; only passed to uclust when
        not None.

    Returns
    -------
    dict mapping "cluster_<n>" to a list of (full_header, sequence)
    tuples for every member of that cluster.

    Fix: the original tested ``folderout[-1] != "/"`` which raises
    IndexError when folderout is an empty string; ``endswith`` handles
    that case while behaving identically for non-empty paths.
    """
    if not folderout.endswith("/"):
        folderout += "/"

    params = {
        '--usersort': True,
        '--id': float(simm),
        '--maxaccepts': 20,
        '--maxrejects': 500,
        '--stepwords': 20,
        '--hsp': 0,
        '--match': 1,
        '--mismatch': -1,
    }
    # gap penalties are optional uclust flags
    if gapopen is not None:
        params['--gapopen'] = gapopen
    if gapext is not None:
        params['--gapext'] = gapext

    uclust = Uclust(params, WorkingDir='/tmp')
    input_data = {
        '--input': seqspath,
        '--uc': folderout + "clusters.uc",
        '--log': folderout + "clusters.log",
    }
    result = uclust(input_data)
    clusters, failures, newseeds = clusters_from_uc_file(result['ClusterFile'])

    seqs = LoadSeqs(seqspath, aligned=False)
    # create dictionary to convert shortened headers to full headers
    # (uclust reports only the part of the header before the first space)
    convheader = {}
    for header in seqs.getSeqNames():
        convheader[header.split()[0]] = header

    # match headers in each cluster to seqs to create cluster tuples list
    clusterseqs = {}
    for num, cluster in enumerate(clusters):
        clusterseqs["cluster_" + str(num)] = []
        for header in clusters[cluster]:
            clusterseqs["cluster_" + str(num)].append(
                (convheader[header], seqs.getSeq(convheader[header])))
    return clusterseqs
def cluster_seqs(seqspath, simm, folderout='/tmp', gapopen=None, gapext=None):
    """Cluster the sequences in *seqspath* with uclust at identity *simm*.

    Writes clusters.uc and clusters.log into *folderout* and returns a
    dict mapping "cluster_<n>" to a list of (full_header, sequence)
    tuples for the members of that cluster.
    """
    if folderout[-1] != "/":
        folderout += "/"

    params = {
        '--usersort': True,
        '--id': float(simm),
        '--maxaccepts': 20,
        '--maxrejects': 500,
        '--stepwords': 20,
        '--hsp': 0,
        '--match': 1,
        '--mismatch': -1,
    }
    # only forward the optional gap penalties when they were supplied
    for flag, value in (('--gapopen', gapopen), ('--gapext', gapext)):
        if value is not None:
            params[flag] = value

    app = Uclust(params, WorkingDir='/tmp')
    app_result = app({
        '--input': seqspath,
        '--uc': folderout + "clusters.uc",
        '--log': folderout + "clusters.log",
    })
    clusters, failures, newseeds = clusters_from_uc_file(
        app_result['ClusterFile'])

    seqs = LoadSeqs(seqspath, aligned=False)
    # uclust truncates headers at the first space; map the short form
    # back to the full header found in the fasta file
    full_header = {}
    for name in seqs.getSeqNames():
        full_header[name.split()[0]] = name

    # build the per-cluster lists of (full_header, sequence) tuples
    clusterseqs = {}
    for idx, seed in enumerate(clusters):
        members = []
        for short in clusters[seed]:
            members.append(
                (full_header[short], seqs.getSeq(full_header[short])))
        clusterseqs["cluster_" + str(idx)] = members
    return clusterseqs
def create_clusters(fastain, folderout, simmilarity, minseqs=0):
    """Cluster *fastain* with uclust and write a tab-separated clusters file.

    Parameters
    ----------
    fastain : str
        Path to the input fasta file. Member headers are assumed to
        encode an abundance count after an underscore ("id_count") —
        TODO confirm against the callers that build these headers.
    folderout : str
        Output prefix directory/path for the .log/.uc/.txt files.
    simmilarity : float or str
        Percent identity threshold passed to uclust as --id.
    minseqs : int
        Clusters whose summed abundance is below this are dropped.

    Returns
    -------
    list of lists of (full_header, sequence) tuples, one inner list per
    retained cluster.

    Fixes relative to the original:
    - the abundance loop iterated ``cluster`` (the seed header string,
      i.e. its characters) instead of the member list
      ``clusters[cluster]``;
    - retained clusters were appended conditionally but indexed with the
      global cluster number (``otus[group]``), which raises IndexError
      once any cluster is skipped by *minseqs* — append to the last
      added list instead;
    - the input fasta handle and the output file are now closed
      deterministically.
    """
    params = {
        '--log': folderout + str(simmilarity) + "_clusters.log",
        '--usersort': False,
        '--id': float(simmilarity),
        '--maxaccepts': 20,
        '--maxrejects': 500,
        '--stepwords': 20,
        '--w': 12,
        '--gapopen': '10.0',
        '--gapext': '10.0',
    }
    uclust = Uclust(params, WorkingDir='/tmp')
    input_data = {
        '--input': fastain,
        '--uc': folderout + str(simmilarity) + "_clusters.uc",
    }
    result = uclust(input_data)
    clusters, failures, new_seeds = clusters_from_uc_file(
        result['ClusterFile'])

    # read in headers to rebuild full headers (uclust truncates at the
    # first whitespace)
    headers = {}
    fasta = open(fastain)
    try:
        for header, seq in MinimalFastaParser(fasta):
            headers[header.split()[0]] = (header, seq)
    finally:
        fasta.close()

    otus = []
    otusout = open(folderout + str(simmilarity) + "_clusters.txt", 'w')
    try:
        for group, cluster in enumerate(clusters):
            # sum the abundance encoded after the underscore in each
            # member header
            count = 0
            for member in clusters[cluster]:
                count += int(member.split("_")[1])
            if count >= minseqs:
                otusout.write(str(group) + "\t")
                otus.append([])
                # map headers back to original ones with counts
                for header in clusters[cluster]:
                    otus[-1].append(headers[header])
                    otusout.write(headers[header][0] + "\t")
                otusout.write("\n")
    finally:
        otusout.close()
    return otus
'--maxaccepts': 20, '--maxrejects': 500, '--stepwords': 20, '--w': 12, '--gapopen': '1.0/*TI', '--gapext': '1.0', '--hsp': 0 } uclust = Uclust(params, WorkingDir='/tmp') input_data = { '--input': argv[1], '--uc': argv[2] + argv[3] + "_clusters.uc", '--log': argv[2] + argv[3] + "_clusters.log" } result = uclust(input_data) clusters, failures, new_seeds = clusters_from_uc_file(result['ClusterFile']) print "RESULTS: ", len(clusters) headers = {} for header, seq in MinimalFastaParser(open(argv[1])): headers[header.split()[0]] = header otusout = open(argv[2] + argv[3] + "_clusters.txt", 'w') for group, cluster in enumerate(clusters): otusout.write(str(group) + "\t") #map headers back to orignal ones with counts for header in clusters[cluster]: otusout.write(headers[header] + "\t") otusout.write("\n") otusout.close() #log = open(argv[2] + argv[3] + "_clusters.log", 'w') #log.write('\n'.join(input_data) + '\n'.join(params) + # "clusters: " + str(len(clusters)) + "\n")