def unit_tests_3(self):
    """Run 'cluster' with --SkipMash and confirm all result tables differ from the reference."""
    # Complete run, but skipping the Mash primary-clustering step
    args = argumentParser.parse_args(
        ['cluster', self.working_wd_loc, '-g'] + self.genomes + ['--SkipMash'])
    Controller().parseArguments(args)

    # Compare against the solution work directory
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)

    # Skipping Mash should change every one of these tables
    for table in ['Cdb', 'Ndb', 'Mdb']:
        solution = Swd.get_db(table)
        result = wd.get_db(table)
        assert not compare_dfs(solution, result), \
            "{0} is the same! (and shouldn't be)".format(table)
def test_dereplicate_5(self):
    """Check that greedy secondary clustering reproduces normal clustering results."""
    genomes = self.large_genome_set[:10]
    if not genomes:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    # Greedy run
    greedy_args = ['compare', self.wd_loc2, '--S_algorithm', 'fastANI',
                   '--SkipMash', '--greedy_secondary_clustering',
                   '-sa', '0.95', '-g'] + genomes
    Controller().parseArguments(argumentParser.parse_args(greedy_args))
    CSdb = WorkDirectory(self.wd_loc2).get_db('Cdb')

    # Normal (non-greedy) run
    normal_args = ['compare', self.wd_loc, '--S_algorithm', 'fastANI',
                   '--SkipMash', '-sa', '0.95', '-g'] + genomes
    Controller().parseArguments(argumentParser.parse_args(normal_args))
    Cdb = WorkDirectory(self.wd_loc).get_db('Cdb')

    # Cluster structure must match: same sizes and same cluster labels
    assert len(CSdb) == len(Cdb)
    for col in ['primary_cluster', 'secondary_cluster']:
        assert set(CSdb[col].value_counts().to_dict().values()) == set(
            Cdb[col].value_counts().to_dict().values()), col
        assert set(CSdb[col].value_counts().to_dict().keys()) == set(
            Cdb[col].value_counts().to_dict().keys()), col
    assert set(CSdb['genome'].tolist()) == set(Cdb['genome'].tolist())

    # Greedy mode adds exactly one extra column
    assert set(CSdb.columns) - set(Cdb.columns) == set(
        ['greedy_representative'])
def test_dereplicate_4(self):
    """Verify multi-round (chunked) primary clustering matches a single-round run."""
    genomes = self.large_genome_set
    if not genomes:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    # Baseline: one-shot primary clustering
    baseline_args = ['compare', self.wd_loc2, '--S_algorithm', 'fastANI',
                     '--SkipSecondary', '--primary_chunksize', '50',
                     '-g'] + genomes
    Controller().parseArguments(argumentParser.parse_args(baseline_args))
    CSdb = WorkDirectory(self.wd_loc2).get_db('Cdb')

    # Chunked: break the genome set into pieces of 50
    chunked_args = ['compare', self.wd_loc, '--S_algorithm', 'fastANI',
                    '--SkipSecondary', '--multiround_primary_clustering',
                    '--primary_chunksize', '50', '-g'] + genomes
    Controller().parseArguments(argumentParser.parse_args(chunked_args))
    Cdb = WorkDirectory(self.wd_loc).get_db('Cdb')

    # Same clusters either way
    assert len(CSdb) == len(Cdb)
    for col in ['primary_cluster', 'secondary_cluster']:
        assert set(CSdb[col].value_counts().to_dict().values()) == set(
            Cdb[col].value_counts().to_dict().values())
        assert set(CSdb[col].value_counts().to_dict().keys()) == set(
            Cdb[col].value_counts().to_dict().keys())
    assert set(CSdb['genome'].tolist()) == set(Cdb['genome'].tolist())

    # Chunked mode carries a few bookkeeping columns the baseline lacks
    assert set(Cdb.columns) - set(CSdb.columns) == set(
        ['length', 'subcluster', 'primary_representitive'])
def functional_test_1(self):
    """Cluster the 5 genomes with default settings and check Cdb against the solution."""
    args = argumentParser.parse_args(
        ['cluster', self.wd_loc, '-g'] + self.genomes)
    Controller().parseArguments(args)

    # Verify the produced Cdb against the solution work directory
    solution = WorkDirectory(self.s_wd_loc).get_db('Cdb')
    result = WorkDirectory(self.wd_loc).get_db('Cdb')
    assert compare_dfs(solution, result), "{0} is not the same!".format('Cdb')
def test_choose_2(self):
    """Exercise the --ignoreGenomeQuality (skip-CheckM) path of the choose step.

    Deletes the quality/score tables, re-runs choose directly through
    d_choose_wrapper, and confirms the regenerated tables differ from the
    solution (since genome quality was ignored) while scores stay sane.
    """
    wd_loc = self.working_wd_loc

    # Remove the tables choose would normally read, forcing regeneration
    os.remove(wd_loc + '/data_tables/Chdb.csv')
    os.remove(wd_loc + '/data_tables/Sdb.csv')
    os.remove(wd_loc + '/data_tables/Wdb.csv')

    # Run choose without genome quality information
    args = argumentParser.parse_args(
        ['dereplicate', wd_loc, '--ignoreGenomeQuality'])
    kwargs = vars(args)
    del kwargs['genomes']
    drep.d_choose.d_choose_wrapper(wd_loc, **kwargs)

    # Regenerated tables must NOT match the solution (quality was ignored)
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)
    for db in ['Sdb', 'Wdb', 'genomeInformation']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)
        assert not test_utils.compare_dfs(db1, db2), "{0} is the same!".format(db)

    # Scores should still fall in the expected range without CheckM input.
    # (Chained comparison replaces the old bitwise '(s > 0) & (s < 5)';
    # a stray 'Swd.get_db(db)' call on a stale loop variable was removed.)
    sdb = wd.get_db('Sdb')
    for s in sdb['score'].tolist():
        assert 0 < s < 5

    # Centrality must still be computed
    gdb = wd.get_db('genomeInformation')
    assert 'centrality' in gdb.columns
def skipsecondary_test(self):
    """Run 'cluster' with --SkipSecondary; the Ndb table should come back empty."""
    args = argumentParser.parse_args(
        ['cluster', self.wd_loc, '-g'] + self.genomes + ['--SkipSecondary'])
    Controller().parseArguments(args)

    # Verify
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.wd_loc)

    # Mdb comes from the primary (Mash) step, which still runs
    db1 = Swd.get_db('Mdb')
    db2 = wd.get_db('Mdb')
    #assert compare_dfs(db1, db2), "{0} is not the same!".format('Mdb')

    # With secondary clustering skipped, Ndb must not contain results
    db2 = wd.get_db('Ndb')
    assert db2.empty, 'Ndb is not empty'
def test_unit_3(self):
    """Run the cluster wrapper with --SkipMash; Cdb and Ndb should differ from the solution."""
    args = argumentParser.parse_args(
        ['dereplicate', self.working_wd_loc, '-g'] + self.genomes + ['--SkipMash'])
    drep.d_cluster.controller.d_cluster_wrapper(
        self.working_wd_loc, **vars(args))

    # Verify against the solution work directory
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)

    # Skipping Mash changes the clustering (Mdb intentionally excluded)
    for table in ['Cdb', 'Ndb']:  # , 'Mdb']:
        solution = Swd.get_db(table)
        result = wd.get_db(table)
        assert not test_utils.compare_dfs(solution, result), \
            "{0} is the same! (and shouldn't be)".format(table)
def test_taxonomy_4(self):
    """Run centrifuge taxonomy end-to-end with no pre-existing prodigal results.

    Skips cleanly when centrifuge is not installed. Compares the produced
    Bdb/Tdb against the prodigal-variant solution tables (BdbP/TdbP).
    """
    loc, works = drep.d_bonus.find_program('centrifuge')
    # Early-return guard replaces the old 'if works == False: ... else:' nesting
    if not works:
        print('Centrifuge not installed- skipping tests')
        return

    genomes = self.genomes
    wd_loc = self.wd_loc
    swd_loc = self.s_wd_loc

    # Remove previous run data so both centrifuge and prodigal must re-run
    shutil.rmtree(os.path.join(self.wd_loc, 'data', 'centrifuge'))
    shutil.rmtree(os.path.join(self.wd_loc, 'data', 'prodigal'))

    # Call the command
    args = argumentParser.parse_args(
        ['bonus', wd_loc, '-g'] + genomes
        + ['--run_tax', '--cent_index',
           '/home/mattolm/download/centrifuge/indices/b+h+v',
           '--tax_method', 'percent'])
    Controller().parseArguments(args)

    # Verify Bdb, ignoring machine-specific genome file locations
    Swd = WorkDirectory(swd_loc)
    wd = WorkDirectory(wd_loc)
    tdbS = Swd.get_db('BdbP')
    tdb = wd.get_db('Bdb')
    del tdbS['location']
    del tdb['location']
    assert test_utils.compare_dfs(
        tdb, tdbS), "{0} is not the same!".format('Bdb')

    # Verify Tdb
    tdbS = Swd.get_db('TdbP')
    tdb = wd.get_db('Tdb')
    assert test_utils.compare_dfs(
        tdb, tdbS), "{0} is not the same!".format('Tdb')
def functional_test_3(self):
    """Cluster the 5 genomes using the ANImf secondary algorithm."""
    args = argumentParser.parse_args(
        ['cluster', self.wd_loc, '--S_algorithm', 'ANImf', '-g'] + self.genomes)
    Controller().parseArguments(args)

    # Compare Cdb against the solution, ignoring the algorithm-name column
    db1 = WorkDirectory(self.s_wd_loc).get_db('Cdb')
    del db1['comparison_algorithm']
    db2 = WorkDirectory(self.wd_loc).get_db('Cdb')
    del db2['comparison_algorithm']
    assert compare_dfs(db1, db2), "{0} is not the same!".format('Cdb')
def unit_tests_1(self):
    """Run a normal 'cluster' and check Cdb/Ndb against the solution directory.

    On mismatch, prints a side-by-side ANI diff before failing. The old
    debug path used pd.Panel, which was removed in pandas 0.25 and would
    itself crash; it is replaced with a merge on (reference, querry). The
    assert now runs on the full, unmutated tables.
    """
    # normal complete run
    args = argumentParser.parse_args(
        ['cluster', self.working_wd_loc, '-g'] + self.genomes)
    Controller().parseArguments(args)

    # Verify
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)

    # Mdb is intentionally excluded from the comparison
    for db in ['Cdb', 'Ndb']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)
        if not compare_dfs(db1, db2):
            # Debugging aid: show ANI values from both tables side by side
            print("{0} is not the same!".format(db))
            d1 = db1[['reference', 'querry', 'ani']].rename(
                columns={'ani': 'ani1'})
            d2 = db2[['reference', 'querry', 'ani']].rename(
                columns={'ani': 'ani2'})
            mdb = pd.merge(d1, d2, on=['reference', 'querry'])
            print(mdb.sort_values(['reference', 'querry']))
        assert compare_dfs(db1, db2), "{0} is not the same!".format(db)
def test_unit_5(self):
    """Run the cluster wrapper with --S_algorithm gANI; skip if ANIcalculator is absent."""
    loc, works = drep.d_bonus.find_program('ANIcalculator')
    if not works:
        return

    # Normal complete run through the wrapper
    args = argumentParser.parse_args(
        ['dereplicate', self.working_wd_loc, '-g'] + self.genomes
        + ['--S_algorithm', 'gANI'])
    drep.d_cluster.controller.d_cluster_wrapper(
        self.working_wd_loc, **vars(args))

    # These tables should exactly match the solution
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)
    for table in ['Cdb', 'Mdb']:
        solution = Swd.get_db(table)
        result = wd.get_db(table)
        assert test_utils.compare_dfs(solution, result), \
            "{0} is not the same!".format(table)
def test_taxonomy_1(self):
    """Check the 'bonus --run_tax' taxonomy call with the 'max' method.

    Bdb must match the solution exactly (minus file locations); Tdb
    differences are reported but non-fatal, since they can come from
    centrifuge index drift.
    """
    genomes = self.genomes
    wd_loc = self.wd_loc
    swd_loc = self.s_wd_loc

    # Call the command
    args = argumentParser.parse_args(
        ['bonus', wd_loc, '-g'] + genomes
        + ['--run_tax', '--cent_index',
           '/home/mattolm/download/centrifuge/indices/b+h+v',
           '--tax_method', 'max'])
    Controller().parseArguments(args)

    # Verify Bdb, dropping machine-specific genome locations first
    Swd = WorkDirectory(swd_loc)
    wd = WorkDirectory(wd_loc)
    tdbS = Swd.get_db('Bdb')
    tdb = wd.get_db('Bdb')
    del tdbS['location']
    del tdb['location']
    assert test_utils.compare_dfs(tdb, tdbS), "{0} is not the same!".format('Bdb')

    # Tdb mismatch is only reported. The old side-by-side view used
    # pd.Panel, removed in pandas 0.25; a keyed concat shows the same thing.
    tdbS = Swd.get_db('Tdb')
    tdb = wd.get_db('Tdb')
    if not test_utils.compare_dfs(tdb, tdbS):
        print("{0} is not the same! May be due to centrifuge index issues".
              format('Tdb'))
        print(pd.concat({'df1': tdbS, 'df2': tdb}, axis=1))
    assert True
def test_dereplicate_6(self):
    """Run 'compare' on gzipped genomes with fastANI and sanity-check the ANI values."""
    test_utils.sanity_check(WorkDirectory(self.s_wd_loc))

    args = argumentParser.parse_args(
        ['compare', self.wd_loc, '--S_algorithm', 'fastANI', '-g']
        + self.zipped_genomes)
    Controller().parseArguments(args)

    # ANI values must be valid fractions and not all identical
    anis = WorkDirectory(self.wd_loc).get_db('Ndb')['ani'].tolist()
    assert max(anis) <= 1
    assert min(anis) >= 0
    assert len(set(anis)) > 1
def test_dereplicate_3(self):
    """Run 'compare' with the goANI algorithm and make sure Ndb is produced."""
    test_utils.sanity_check(WorkDirectory(self.s_wd_loc))

    args = argumentParser.parse_args(
        ['compare', self.wd_loc, '--S_algorithm', 'goANI', '-g'] + self.genomes)
    Controller().parseArguments(args)

    # Verify that results were generated
    Ndb = WorkDirectory(self.wd_loc).get_db('Ndb')
    assert len(Ndb) > 0

    # Sanity check again to make sure the solutions directory wasn't overwritten
    test_utils.sanity_check(WorkDirectory(self.s_wd_loc))
def unit_tests_1(self):
    """Run a normal 'cluster' and check Cdb/Ndb against the solution directory.

    ANI values in Ndb are rounded to 4 decimal places before comparison so
    float noise doesn't fail the test. On mismatch, prints a diff of the
    differing ANI rows before failing. The old debug path used pd.Panel,
    which was removed in pandas 0.25 and would itself crash; it is replaced
    with a merge on (reference, querry). Large swaths of commented-out
    debug code were removed, and the assert now runs on the full tables.
    """
    # normal complete run
    args = argumentParser.parse_args(
        ['cluster', self.working_wd_loc, '-g'] + self.genomes)
    Controller().parseArguments(args)

    # Verify
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)

    # Mdb is intentionally excluded from the comparison
    for db in ['Cdb', 'Ndb']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)

        # get rid of some precision on the ANI
        if db == 'Ndb':
            db1['ani'] = [float("{0:.4f}".format(x)) for x in db1['ani']]
            db2['ani'] = [float("{0:.4f}".format(x)) for x in db2['ani']]

        if not compare_dfs(db1, db2):
            # Debugging aid: show only the rows whose ANI values disagree
            print("{0} is not the same!".format(db))
            d1 = db1[['reference', 'querry', 'ani']].rename(
                columns={'ani': 'ani1'})
            d2 = db2[['reference', 'querry', 'ani']].rename(
                columns={'ani': 'ani2'})
            mdb = pd.merge(d1, d2, on=['reference', 'querry'])
            print(mdb[mdb['ani1'] != mdb['ani2']])

        assert compare_dfs(db1, db2), "{0} is not the same!".format(db)