def test_matrix(self):
    """Run batch clustering end-to-end with cluster-matrix computation enabled.

    Preclusters three named batches, consumes the batched clustering
    results, then requests the cluster matrix and cleans up temp state.
    """
    vdj = datasets.vdjdb_beta()
    max_sequence_size = vdj.str.len().max()
    training_sample = vdj.sample(2000)
    times = 3
    size_per_time = 3000
    clustering = Clustering(
        faiss_training_data=training_sample,
        fitting_data_size=times * size_per_time,
        max_sequence_size=max_sequence_size,
    )
    # Feed the batches in one at a time, each under its own name.
    for batch_idx in range(times):
        batch = vdj.sample(size_per_time)
        clustering.batch_precluster(batch, name=f'time {batch_idx}')
    # Iterate the batched results; accessing clusters_df exercises the API.
    for clusters in clustering.batch_cluster(calc_cluster_matrix=True):
        df = clusters.clusters_df
    clustering.batch_cluster_matrix()
    clustering.batch_cleanup()
def test_batch_clustering_multiprocessing(self):
    """Run batch clustering with all CPUs enabled (n_cpus='all').

    Same flow as the plain batch test, but constructed with
    multiprocessing turned on and without named batches or a
    cluster matrix.
    """
    vdj = datasets.vdjdb_beta()
    max_sequence_size = vdj.str.len().max()
    training_sample = vdj.sample(2000)
    times = 3
    size_per_time = 3000
    clustering = Clustering(
        faiss_training_data=training_sample,
        fitting_data_size=times * size_per_time,
        max_sequence_size=max_sequence_size,
        n_cpus='all',
    )
    for _ in range(times):
        clustering.batch_precluster(vdj.sample(size_per_time))
    for clusters in clustering.batch_cluster():
        df = clusters.clusters_df
    clustering.batch_cleanup()
def evaluate_distance_metrics(start, end, step_size, replicates, filename=None):
    """Benchmark Hamming vs Levenshtein two-step clustering over sample sizes.

    For each sample size n in range(start, end, step_size), runs `replicates`
    repetitions: samples n beta CDR3 sequences, clusters them with both
    distance metrics, and records each run's metric summary together with
    the sample size, metric name, and wall-clock fit time.

    Parameters
    ----------
    start, end, step_size : int
        Arguments to range() defining the sample sizes to test.
    replicates : int
        Number of repetitions per sample size.
    filename : str, optional
        If given, the combined results are also written as TSV under
        ./results/.

    Returns
    -------
    pd.DataFrame
        Concatenated summaries for all runs (empty if nothing ran).
    """
    summaries = []
    for n in range(start, end, step_size):
        print('###################')
        print(n)
        print('###################')
        for _ in range(replicates):
            try:
                beta = datasets.vdjdb_beta().sample(n)
            except ValueError:
                # Requested sample larger than the dataset: stop replicating
                # at this (and implicitly any larger) size.
                break
            epi = datasets.vdjdb_beta(epitopes=True)
            epi = epi[epi.CDR3.isin(beta)]

            t = time.time()
            out_hd = Clustering(method='two-step', distance_metric='HAMMING').fit(beta)
            t_hd = time.time() - t

            t = time.time()
            out_ld = Clustering(method='two-step', distance_metric='LEVENSHTEIN').fit(beta)
            t_ld = time.time() - t

            summ_hd = out_hd.metrics(epi).summary()
            summ_hd['n'] = n
            summ_hd['dm'] = 'Hamming'
            summ_hd['t'] = t_hd

            summ_ld = out_ld.metrics(epi).summary()
            summ_ld['n'] = n
            summ_ld['dm'] = 'Levenshtein'
            summ_ld['t'] = t_ld

            summaries.append(summ_hd)
            summaries.append(summ_ld)

    # DataFrame.append was removed in pandas 2.0 and was O(n^2) when called
    # in a loop; collect the per-run frames and concatenate once instead.
    final = pd.concat(summaries) if summaries else pd.DataFrame()
    if filename is not None:
        final.to_csv(join('./results/', filename), sep='\t', index=False)
    return final
class ClusteringTest(TestBase):
    """Unit tests for the Clustering pipeline and downstream analysis."""

    def setUp(self):
        # Shared fixtures: test CDR3 sequences, their epitope annotations,
        # and a default Clustering fitted on the sequences.
        self.cdr3 = datasets.test_cdr3()
        self.epitopes = datasets.test_epitopes()
        self.clustering_result = Clustering().fit(self.cdr3)

    def make_features(self):
        # Helper: feature matrix (including pgen) for the fitted result.
        return self.clustering_result.compute_features(compute_pgen=True)

    def test_feature_generation(self):
        """Feature computation completes without error."""
        self.make_features()

    def test_pca(self):
        """PCA runs on the computed cluster features."""
        ClusterAnalysis(self.make_features()).pca()

    def test_prediction(self):
        """Quality prediction runs on the computed cluster features."""
        ClusterAnalysis(self.make_features()).predict_quality()

    def test_train_model(self):
        """Model training, evaluation and persistence round-trip."""
        model = ModelTraining(self.clustering_result.clusters_df, self.epitopes)
        fitted = model.fit_data()
        model.evaluate()
        model.save(fitted, 'test.pkl')
def test_summary(self):
    """Summary of a fitted clustering runs without error."""
    result = Clustering().fit(self.cdr3)
    result.summary()
def test_metrics(self):
    """All metric accessors on a fitted clustering run without error."""
    metrics = Clustering().fit(self.cdr3).metrics(self.epitopes)
    for metric in (
        metrics.purity,
        metrics.consistency,
        metrics.retention,
        metrics.purity_90,
        metrics.summary,
    ):
        metric()
def test_faiss_cluster_size(self):
    """Each method fits across a sweep of faiss_cluster_size values."""
    # Sizes tried: 2, 2002, 4002.
    for cluster_size in range(2, 6003, 2000):
        for approach in ['two-step', 'faiss', 'mcl']:
            Clustering(method=approach, faiss_cluster_size=cluster_size).fit(self.cdr3)
def test_multiprocessing(self):
    """Each method fits under every supported n_cpus setting."""
    for n_cpus in [-1, 0, 1, 2, 'all']:
        for approach in ['two-step', 'faiss', 'mcl']:
            Clustering(method=approach, n_cpus=n_cpus).fit(self.cdr3)
def test_alphabeta(self):
    """Paired alpha/beta chain clustering runs without error."""
    paired = datasets.vdjdb_paired()
    alpha = paired['CDR3_alpha']
    beta = paired['CDR3_beta']
    Clustering().fit(beta, alpha=alpha)
def test_faiss(self):
    """Clustering with the faiss method fits without error."""
    clustering = Clustering(method='faiss')
    clustering.fit(self.cdr3)
def test_mcl(self):
    """Clustering with the mcl method fits without error."""
    clustering = Clustering(method='mcl')
    clustering.fit(self.cdr3)
def test_quality(self):
    """Clustering quality metrics clear their minimum thresholds.

    Thresholds are empirical floors for the bundled test dataset.
    """
    metrics = Clustering().fit(self.cdr3).metrics(self.epitopes)
    purity = metrics.purity()[0]
    consistency = metrics.consistency()[0]
    retention = metrics.retention()
    purity_90 = metrics.purity_90()[0]
    self.assertGreater(purity, 0.6)
    self.assertGreater(consistency, 0.12)
    self.assertGreater(retention, 0.21)
    self.assertGreater(purity_90, 0.36)
def test_cluster_contents(self):
    """Listing cluster contents on a fitted clustering runs without error."""
    fitted = Clustering().fit(self.cdr3)
    fitted.cluster_contents()
def test_write_to_csv(self):
    """CSV export of a fitted clustering runs without error."""
    fitted = Clustering().fit(self.cdr3)
    fitted.write_to_csv()
def test_normal(self):
    """Default-configured Clustering fits the test data without error."""
    clustering = Clustering()
    clustering.fit(self.cdr3)
def setUp(self):
    """Build the shared fixtures used by this test class.

    Loads the CDR3 test sequences and their epitope annotations, and
    fits a default Clustering on the sequences.
    """
    self.cdr3 = datasets.test_cdr3()
    self.epitopes = datasets.test_epitopes()
    self.clustering_result = Clustering().fit(self.cdr3)