def test_greedy_secondary_clustering_2(self):
    """
    Check that greedy secondary clustering behaves when driven through `dereplicate`
    """
    plot_dir = self.test_dir

    # First attempt omits --S_algorithm; the controller is expected to raise NameError
    failing_cmd = [
        'dereplicate', self.wd_loc, '--greedy_secondary_clustering',
        '-sa', '0.99', '--ignoreGenomeQuality', '-d', '-g'
    ] + self.genomes
    failing_args = drep.argumentParser.parse_args(failing_cmd)
    try:
        drep.controller.Controller().parseArguments(failing_args)
    except NameError:
        pass
    else:
        assert False

    # Second attempt names fastANI explicitly and should complete
    working_args = drep.argumentParser.parse_args([
        'dereplicate', self.wd_loc, '--greedy_secondary_clustering',
        '--S_algorithm', 'fastANI', '--ignoreGenomeQuality',
        '-sa', '0.99', '-d'
    ])
    drep.controller.Controller().parseArguments(working_args)

    # Work directories: the one just produced and the stored solution
    result_wd = drep.WorkDirectory.WorkDirectory(self.wd_loc)
    solution_wd = drep.WorkDirectory.WorkDirectory(self.s_wd_loc)

    expected_cdb = solution_wd.get_db('Cdb')
    expected_cdb = expected_cdb.sort_values('genome').reset_index(drop=True)
    expected_ndb = solution_wd.get_db('Ndb')
    expected_sdb = solution_wd.get_db('Sdb')

    # The greedy run must not have produced the same number of comparisons
    observed_ndb = result_wd.get_db('Ndb')
    assert len(observed_ndb) != len(expected_ndb)

    # Scores are expected to differ from the solution (centrality is included)
    observed_sdb = result_wd.get_db('Sdb')
    assert not test_utils.compare_dfs2(observed_sdb, expected_sdb, verbose=True)

    # The clustering itself must still match once bookkeeping columns are dropped
    observed_cdb = result_wd.get_db('Cdb')
    observed_cdb = observed_cdb.sort_values('genome').reset_index(drop=True)
    assert 'greedy_representative' in observed_cdb.columns
    del observed_cdb['greedy_representative']
    for column in ['cluster_method', 'comparison_algorithm']:
        del observed_cdb[column]
        del expected_cdb[column]
    # Solution cluster names use a `_0` suffix where this run uses `_1`
    expected_cdb['secondary_cluster'] = [
        label.replace('_0', '_1') for label in expected_cdb['secondary_cluster']
    ]
    assert test_utils.compare_dfs2(expected_cdb, observed_cdb, verbose=True)

    # Plotting should run without raising
    drep.d_analyze.plot_secondary_dendrograms_from_wd(result_wd, plot_dir=plot_dir)
def test_centrality_1(self):
    """
    Exercise drep.d_choose.add_centrality and "choose_winners" using a small genome set
    """

    def _dereplicate_kwargs(*extra_flags):
        # Parse a dereplicate command line and return it as kwargs without 'genomes'
        cmd = ['dereplicate', self.working_wd_loc, '--ignoreGenomeQuality']
        cmd.extend(extra_flags)
        parsed = vars(argumentParser.parse_args(cmd))
        del parsed['genomes']
        return parsed

    wd = drep.WorkDirectory.WorkDirectory(self.working_wd_loc)
    default_kwargs = _dereplicate_kwargs()

    # Rewrite the secondary cluster labels in Cdb and persist the change
    cdb = wd.get_db('Cdb')
    cdb['secondary_cluster'] = [
        label.replace('1_2', '1_1') for label in cdb['secondary_cluster']
    ]
    wd.store_db(cdb, 'Cdb')

    # Build genome info and attach centrality
    bdb = wd.get_db('Bdb')
    Gdb = drep.d_filter.calc_genome_info(bdb['location'].tolist())
    Gdb = drep.d_choose.add_centrality(wd, Gdb, **default_kwargs)

    # Centrality must exist, have some positive values, never exceed 1, and have no NaN
    assert 'centrality' in list(Gdb.columns)
    centrality = Gdb['centrality']
    assert (centrality > 0).any()
    assert not (centrality > 1).any()
    assert not centrality.isna().any()

    # Baseline scores with the default settings
    Sdb, _ = drep.d_choose.choose_winners(cdb, Gdb, **default_kwargs)

    # With zero centrality weight the scores should change, but only slightly
    no_weight_kwargs = _dereplicate_kwargs('-centW', '0')
    Sdb2, _ = drep.d_choose.choose_winners(cdb, Gdb, **no_weight_kwargs)
    assert not test_utils.compare_dfs2(Sdb, Sdb2)
    assert abs(Sdb['score'].mean() - Sdb2['score'].mean()) < 1

    # Changing S_ani must also be reflected in the scores
    s_ani_kwargs = _dereplicate_kwargs('-sa', '0.95')
    Sdb2, _ = drep.d_choose.choose_winners(cdb, Gdb, **s_ani_kwargs)
    assert not test_utils.compare_dfs2(Sdb, Sdb2)
    assert abs(Sdb['score'].mean()) < Sdb2['score'].mean()
def test_multiround_primary_clustering_1(self):
    """
    Run `compare` clustering with multiround primary clustering and check the result
    """
    plot_dir = self.test_dir

    # Cluster with a primary chunk size of 3
    cmd = [
        'compare', self.wd_loc, '--primary_chunksize', '3',
        '--multiround_primary_clustering', '-pa', '0.95', '-d', '-g'
    ] + self.genomes
    cluster_kwargs = vars(drep.argumentParser.parse_args(cmd))
    drep.d_cluster.controller.d_cluster_wrapper(self.wd_loc, **cluster_kwargs)

    # Produced results and the stored solution
    result_wd = drep.WorkDirectory.WorkDirectory(self.wd_loc)
    solution_wd = drep.WorkDirectory.WorkDirectory(self.s_wd_loc)
    expected_cdb = solution_wd.get_db('Cdb')
    expected_cdb = expected_cdb.sort_values('genome').reset_index(drop=True)

    # Mdb must not hold 25 rows and must carry three distinct genome chunks
    mdb = result_wd.get_db('Mdb')
    assert len(mdb) != 25
    assert 'genome_chunk' in list(mdb.columns)
    chunk_labels = mdb['genome_chunk'].unique()
    assert len(chunk_labels) == 3

    # The final clustering must still agree with the solution
    observed_cdb = result_wd.get_db('Cdb')
    observed_cdb = observed_cdb.sort_values('genome').reset_index(drop=True)
    assert test_utils.compare_dfs2(expected_cdb, observed_cdb, verbose=True)

    # Plotting should run without raising
    drep.d_analyze.mash_dendrogram_from_wd(result_wd, plot_dir=plot_dir)
def test_unit_1(self):
    '''
    Run the cluster step end-to-end and compare Cdb / Ndb against the solution
    '''
    # Full run using ANImf as the secondary algorithm
    cmd = ['dereplicate', self.working_wd_loc, '--S_algorithm', 'ANImf', '-g']
    cmd = cmd + self.genomes
    kwargs = vars(argumentParser.parse_args(cmd))
    drep.d_cluster.controller.d_cluster_wrapper(self.working_wd_loc, **kwargs)

    solution_wd = WorkDirectory(self.s_wd_loc)
    result_wd = WorkDirectory(self.working_wd_loc)

    ndb_columns = ['ani', 'alignment_length', 'querry', 'reference']
    ndb_sort_keys = ['querry', 'reference']
    cdb_columns = ['genome', 'secondary_cluster']

    # Only Cdb and Ndb are checked here (Mdb is not compared)
    for db in ['Cdb', 'Ndb']:
        expected = solution_wd.get_db(db)
        observed = result_wd.get_db(db)

        if db == 'Ndb':
            # Drop precision before comparing; the original note says fastANI
            # is being compared with ANImf
            normalised = []
            for frame in (expected, observed):
                frame['ani'] = [round(value, 3) for value in frame['ani']]
                frame['alignment_length'] = [
                    round(value, -6) for value in frame['alignment_length']
                ]
                frame = frame[ndb_columns]
                frame = frame.sort_values(ndb_sort_keys).reset_index(drop=True)
                normalised.append(frame)
            expected, observed = normalised
        elif db == 'Cdb':
            # Only the genome -> secondary cluster assignment is compared
            expected = expected[cdb_columns].sort_values('genome').reset_index(drop=True)
            observed = observed[cdb_columns].sort_values('genome').reset_index(drop=True)

        assert test_utils.compare_dfs2(
            expected, observed, verbose=True), "{0} is not the same!".format(db)
def test_compare_16(BTO):
    """
    Check that passing an .stb to compare matches running genome_wide afterwards
    """

    def _read_genome_wide_table(profile):
        # Exactly one file in the output location may mention 'genomeWide'
        candidates = glob.glob(profile.get_location('output') + '*')
        matches = [path for path in candidates if 'genomeWide' in path]
        assert len(matches) == 1
        return pd.read_csv(matches[0], sep='\t')

    # Two steps: compare first, then genome_wide on the result
    sol_base = BTO.test_dir + 'testR'
    cmd = f"inStrain compare -i {BTO.IS1} {BTO.IS2} -o {sol_base} --store_mismatch_locations"
    print(cmd)
    compare_args = inStrain.argumentParser.parse_args(cmd.split(' ')[1:])
    inStrain.controller.Controller().main(compare_args)

    cmd = f"inStrain genome_wide -i {sol_base} -s {BTO.stb}"
    print(cmd)
    call(cmd, shell=True)

    two_step_table = _read_genome_wide_table(inStrain.SNVprofile.SNVprofile(sol_base))

    # One step: compare with the .stb supplied directly
    exp_base = BTO.test_dir + 'testSR'
    cmd = f"inStrain compare -i {BTO.IS1} {BTO.IS2} -o {exp_base} -s {BTO.stb} --store_mismatch_locations"
    print(cmd)
    compare_args = inStrain.argumentParser.parse_args(cmd.split(' ')[1:])
    inStrain.controller.Controller().main(compare_args)

    one_step_profile = inStrain.SNVprofile.SNVprofile(exp_base)
    one_step_table = _read_genome_wide_table(one_step_profile)

    # Both routes must give the same genome-wide table
    assert test_utils.compare_dfs2(one_step_table, two_step_table, verbose=True)

    # The one-step run must also have produced at least one figure
    figures = glob.glob(one_step_profile.get_location('figures') + '*')
    assert len(figures) > 0
def test_compare_13(BTO):
    """
    Re-run and ensure that the results are the same as a previous run

    Runs `inStrain compare` on BTO.IS1 / BTO.IS2 into `<test_dir>RC_test`, then
    checks the log directory, the output tables, and the stored attributes
    against the profile at BTO.v12_solution.
    """
    # Reload the logging module before running (presumably to reset logging
    # state left over from earlier tests -- confirm)
    importlib.reload(logging)

    # Run program
    base = BTO.test_dir + 'RC_test'

    # NOTE: the template only substitutes fields {1}, {2} and {3}; the `True`
    # and `BTO.scafflistF` arguments are passed to format() but never used
    cmd = "inStrain compare -i {1} {2} -o {3} --include_self_comparisons --store_mismatch_locations -d".format(
        True, BTO.IS1, BTO.IS2, base, BTO.scafflistF)
    print(cmd)
    #call(cmd, shell=True)
    inStrain.controller.Controller().main(inStrain.argumentParser.parse_args(cmd.split(' ')[1:]))

    exp_RC = inStrain.SNVprofile.SNVprofile(base)
    sol_RC = inStrain.SNVprofile.SNVprofile(BTO.v12_solution)

    # Print what the output of the solutions directory looks like
    # NOTE(review): the glob below reads from exp_RC (the run just produced),
    # not sol_RC -- confirm which one was intended
    if True:
        s_out_files = glob.glob(exp_RC.get_location('output') + os.path.basename(
            exp_RC.get('location')) + '_*')
        print("The output has {0} tables".format(len(s_out_files)))
        for f in s_out_files:
            name = os.path.basename(f)
            # File name framed above and below by a dashed rule of equal length
            print("{1}\n{0}\n{1}".format(name, '-' * len(name)))

            s = pd.read_csv(f, sep='\t')
            print(s.head())
            print()

    # Make sure log is working (exactly three entries expected under <base>/log/)
    assert len(glob.glob(base + '/log/*')) == 3, glob.glob(base + '/log/*')
    Ldb = exp_RC.get_parsed_log()
    print(Ldb)

    # Check output files
    e_out_files = glob.glob(exp_RC.get_location('output') + os.path.basename(
        exp_RC.get('location')) + '_*')
    s_out_files = glob.glob(sol_RC.get_location('output') + '*_*')
    # The solution is expected to hold exactly one output table
    assert len(s_out_files) == 1, sol_RC.get_location('output') + '*_*'

    for s_file in s_out_files:
        # Table name is whatever follows the 'RC_test_' prefix in the solution file name
        name = os.path.basename(s_file).split('RC_test_')[1]
        e_file = [e for e in e_out_files if name in os.path.basename(e)]

        print("checking {0}".format(name))

        if len(e_file) == 1:
            # print("Both have {0}!".format(name))

            e = pd.read_csv(e_file[0], sep='\t')
            s = pd.read_csv(s_file, sep='\t')

            if name == 'comparisonsTable.tsv':
                e = e.sort_values(['scaffold', 'name1', 'name2']
                                  ).reset_index(drop=True)
                s = s.sort_values(['scaffold', 'name1', 'name2']
                                  ).reset_index(drop=True)

                # These columns are excluded from the comparison
                changed_cols = ['consensus_SNPs', 'conANI']
                for c in changed_cols:
                    del e[c]
                    del s[c]

            # Same column set required; on failure report the difference both ways
            assert set(s.columns) == set(e.columns), \
                [set(s.columns) - set(e.columns),
                 set(e.columns) - set(s.columns), ]
            # Align the solution's column order with the experiment's
            s = s[list(e.columns)]
            assert test_utils.compare_dfs2(e, s, verbose=True), name

        else:
            # Zero or multiple matching experiment files is a failure
            assert False, name

    # Check attributes
    sAdb = sol_RC._get_attributes_file()

    for i, row in sAdb.iterrows():
        print("checking {0}".format(i))

        # These attributes are skipped rather than compared
        if i in ['location', 'version']:
            continue

        s = sol_RC.get(i)
        e = exp_RC.get(i)

        if i in ['comparisonsTable']:
            s = s.sort_values(['scaffold', 'name1', 'name2', 'mm']).reset_index(drop=True)
            e = e.sort_values(['scaffold', 'name1', 'name2', 'mm']).reset_index(drop=True)

            # These columns are excluded from the comparison
            changed_cols = ['consensus_SNPs', 'conANI']
            for c in changed_cols:
                del e[c]
                del s[c]

            # Re-arrange column order
            assert set(e.columns) == set(s.columns), \
                [i, set(e.columns) - set(s.columns), set(s.columns) - set(e.columns)]
            s = s[list(e.columns)]
            assert test_utils.compare_dfs2(e, s, verbose=True), i

        if i in ['pairwise_SNP_locations']:
            # Fix the solutions directory to remove the old errors (fixed in v1.3.0t)
            # Keeps rows that are not consensus SNPs, plus consensus SNP rows where,
            # for sample 1 or sample 2, con_base differs from ref_base and
            # con_base equals itself (presumably a not-NaN check -- confirm)
            s = s[ \
                ((s['consensus_SNP'] == True) &
                 (((s['ref_base_1'] != s['con_base_1']) & (s['con_base_1'] == s['con_base_1']))
                  | ((s['ref_base_2'] != s['con_base_2']) & (s['con_base_2'] == s['con_base_2'])))) \
                | (s['consensus_SNP'] == False)]

            # Make the solutions directory only have SNPs
            for c in ['consensus_SNP', 'population_SNP']:
                s[c] = s[c].astype(bool)
            s = s[s['consensus_SNP'] | s['population_SNP']]

            # Get rid of the junk columns
            s = s[e.columns]

            for c in ['position', 'mm']:
                s[c] = s[c].astype(int)

            s = s.sort_values(['scaffold', 'position', 'name1', 'name2']).reset_index(drop=True)
            e = e.sort_values(['scaffold', 'position', 'name1', 'name2']).reset_index(drop=True)

            assert set(e.columns) == set(s.columns), \
                [i, set(e.columns) - set(s.columns), set(s.columns) - set(e.columns)]
            s = s[list(e.columns)]
            assert test_utils.compare_dfs2(e, s, verbose=True), i

        elif i in ['scaffold2length']:
            # This attribute is compared as a dictionary rather than a table
            assert test_utils.compare_dicts(e, s), i
def test_compare_19(BTO):
    """
    Ensure that compare can generate clusters

    Runs `inStrain compare` with an .stb four times (default settings, a
    custom `-ani`, a custom `-cov`, and `--include_self_comparisons`) and
    checks the number of strain clusters each time. The genome-wide table of
    the default run is the reference for the later runs.
    """
    # Run it with default settings
    Scdb, Sndb = _run_compare_and_load_tables(BTO, 'testSR')
    assert len(Scdb['cluster'].unique()) == 3

    # Adjust the ani threshold: clustering changes, genome-wide table does not
    cdb, ndb = _run_compare_and_load_tables(BTO, 'testSR2', ['-ani', '0.999'])
    assert len(cdb['cluster'].unique()) == 2
    assert test_utils.compare_dfs2(ndb, Sndb)

    # Adjust the coverage threshold: clustering changes, genome-wide table does not
    cdb, ndb = _run_compare_and_load_tables(BTO, 'testSR3', ['-cov', '0.9999999999999'])
    assert len(cdb['cluster'].unique()) == 4
    assert test_utils.compare_dfs2(ndb, Sndb)

    # Include self: the genome-wide table is expected to differ from the default run
    cdb, ndb = _run_compare_and_load_tables(BTO, 'testSR4', ['--include_self_comparisons'])
    assert len(cdb['cluster'].unique()) == 3
    assert not test_utils.compare_dfs2(ndb, Sndb)


def _run_compare_and_load_tables(BTO, out_name, extra_args=()):
    """
    Run `inStrain compare` with an .stb and load the two tables it writes

    Each call returns freshly loaded tables and asserts that both files were
    found, so a missing output can never silently reuse a table loaded by an
    earlier run.

    Args:
        BTO: test fixture object; `test_dir`, `IS1`, `IS2` and `stb` are read
        out_name: suffix appended to `BTO.test_dir` to form the output location
        extra_args: extra command-line tokens appended after the base command

    Returns:
        (clusters, genome_wide): DataFrames read from the output files ending
        in `_strain_clusters.tsv` and `_genomeWide_compare.tsv`
    """
    clusters_suffix = '_strain_clusters.tsv'
    genome_wide_suffix = '_genomeWide_compare.tsv'

    exp_base = BTO.test_dir + out_name
    cmd = f"inStrain compare -i {BTO.IS1} {BTO.IS2} -o {exp_base} -s {BTO.stb}"
    # Drop the leading program name; the parser only wants the arguments
    argv = cmd.split(' ')[1:] + list(extra_args)
    print('inStrain ' + ' '.join(argv))
    inStrain.controller.Controller().main(inStrain.argumentParser.parse_args(argv))

    # Load output
    IS = inStrain.SNVprofile.SNVprofile(exp_base)
    files = glob.glob(IS.get_location('output') + '/*')
    assert len(files) == 3, files

    tables = {}
    for f in files:
        basename = os.path.basename(f)
        for suffix in (clusters_suffix, genome_wide_suffix):
            if basename.endswith(suffix):
                tables[suffix] = pd.read_csv(f, sep='\t')
                break

    # Fail here, with the file list, rather than later on an undefined or stale table
    for suffix in (clusters_suffix, genome_wide_suffix):
        assert suffix in tables, (suffix, files)

    return tables[clusters_suffix], tables[genome_wide_suffix]