def plot_6_test_1(self):
    ''' Test plot 6 with different things missing '''
    # Test with everything there
    args = argumentParser.parse_args(['analyze', self.working_wd_loc, '-pl', '6'])
    controller = Controller()
    controller.parseArguments(args)

    fig_dir = os.path.join(self.working_wd_loc, 'figures', '')
    figs = [os.path.basename(f) for f in glob.glob(fig_dir + '*')]

    FIGS = ['Winning_genomes.pdf']
    assert sorted(figs) == sorted(FIGS)
    for fig in glob.glob(fig_dir + '*'):
        assert os.path.getsize(fig) > 0

    # Test with removing Widb
    db_loc = os.path.join(self.working_wd_loc, 'data_tables', 'Widb.csv')
    os.remove(db_loc)
    for f in glob.glob(fig_dir + '*'):
        os.remove(f)

    args = argumentParser.parse_args(['analyze', self.working_wd_loc, '-pl', '6'])
    controller = Controller()
    controller.parseArguments(args)

    fig_dir = os.path.join(self.working_wd_loc, 'figures', '')
    figs = [os.path.basename(f) for f in glob.glob(fig_dir + '*')]

    FIGS = ['Winning_genomes.pdf']
    assert sorted(figs) == sorted(FIGS)
    for fig in glob.glob(fig_dir + '*'):
        assert os.path.getsize(fig) > 0
def test_centrality_1(self):
    """
    Test the methods drep.d_choose.add_centrality and "choose_winners" on a small set of genomes
    """
    wd = drep.WorkDirectory.WorkDirectory(self.working_wd_loc)
    kwargs = vars(
        argumentParser.parse_args(
            ['dereplicate', self.working_wd_loc, '--ignoreGenomeQuality']))
    del kwargs['genomes']

    # Modify Cdb
    cdb = wd.get_db('Cdb')
    cdb['secondary_cluster'] = [
        x.replace('1_2', '1_1') for x in cdb['secondary_cluster']
    ]
    wd.store_db(cdb, 'Cdb')

    # Run calculation
    bdb = wd.get_db('Bdb')
    Gdb = drep.d_filter.calc_genome_info(bdb['location'].tolist())
    Gdb = drep.d_choose.add_centrality(wd, Gdb, **kwargs)

    # Test result of add_centrality
    assert 'centrality' in list(Gdb.columns)
    assert len(Gdb[Gdb['centrality'] > 0]) > 0
    assert len(Gdb[Gdb['centrality'] > 1]) == 0
    assert len(Gdb[Gdb['centrality'].isna()]) == 0

    # Run choose winners
    Sdb, Wdb = drep.d_choose.choose_winners(cdb, Gdb, **kwargs)

    # Compare against choose winners with no centrality weight
    kwargs = vars(
        argumentParser.parse_args([
            'dereplicate', self.working_wd_loc, '--ignoreGenomeQuality',
            '-centW', '0'
        ]))
    del kwargs['genomes']
    Sdb2, Wdb2 = drep.d_choose.choose_winners(cdb, Gdb, **kwargs)

    # Make sure you get different values, and make sure they're not too different
    assert not test_utils.compare_dfs2(Sdb, Sdb2)
    assert abs(Sdb['score'].mean() - Sdb2['score'].mean()) < 1

    # Make sure S_ani is being loaded properly
    kwargs = vars(
        argumentParser.parse_args([
            'dereplicate', self.working_wd_loc, '--ignoreGenomeQuality',
            '-sa', '0.95'
        ]))
    del kwargs['genomes']
    Sdb2, Wdb2 = drep.d_choose.choose_winners(cdb, Gdb, **kwargs)
    assert not test_utils.compare_dfs2(Sdb, Sdb2)
    assert abs(Sdb['score'].mean()) < Sdb2['score'].mean()
def test_unit_7(self):
    ''' Test cluster with --SkipSecondary '''
    # run
    args = argumentParser.parse_args(['dereplicate', self.working_wd_loc, '-g'] +
                                     self.genomes + ['--SkipSecondary'])
    kwargs = vars(args)
    drep.d_cluster.controller.d_cluster_wrapper(self.working_wd_loc, **kwargs)

    # Verify
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)

    # Confirm the following are the same:
    # for db in ['Mdb']:
    #     db1 = Swd.get_db(db)
    #     db2 = wd.get_db(db)
    #     assert test_utils.compare_dfs(db1, db2), "{0} is not the same!".format(db)

    # Confirm the following are not the same:
    for db in ['Cdb', 'Ndb']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)
        assert not test_utils.compare_dfs(
            db1, db2), "{0} is the same! (and shouldn't be)".format(db)
def taxTest2(self):
    ''' Check the taxonomy call for percent method '''
    genomes = self.genomes
    wd_loc = self.wd_loc
    swd_loc = self.s_wd_loc

    # Call the command
    args = argumentParser.parse_args(
        ['bonus', wd_loc, '-g'] + genomes +
        ['--run_tax', '--cent_index',
         '/home/mattolm/download/centrifuge/indices/b+h+v',
         '--tax_method', 'percent'])
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    Swd = WorkDirectory(swd_loc)
    wd = WorkDirectory(wd_loc)

    tdbS = Swd.get_db('BdbP')
    tdb = wd.get_db('Bdb')
    del tdbS['location']
    del tdb['location']
    assert compare_dfs(tdb, tdbS), "{0} is not the same!".format('Bdb')

    tdbS = Swd.get_db('TdbP')
    tdb = wd.get_db('Tdb')
    assert compare_dfs(tdb, tdbS), "{0} is not the same!".format('Tdb')
def unit_tests_4(self):
    ''' Test changing cluster -pa '''
    # normal complete run
    args = argumentParser.parse_args(['cluster', self.working_wd_loc, '-g'] +
                                     self.genomes + ['-pa', '0.10'])
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)

    # Confirm the following are correct:
    # for db in ['Mdb']:
    #     db1 = Swd.get_db(db)
    #     db2 = wd.get_db(db)
    #     assert compare_dfs(db1, db2), "{0} is not the same!".format(db)

    # Confirm the following are not the same:
    for db in ['Ndb', 'Cdb']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)
        assert not compare_dfs(
            db1, db2), "{0} is the same! (and shouldn't be)".format(db)
def test_skipsecondary(self):
    genomes = self.genomes
    wd_loc = self.wd_loc
    s_wd_loc = self.s_wd_loc

    args = argumentParser.parse_args(['dereplicate', wd_loc, '-g'] + genomes +
                                     ['--SkipSecondary'])
    # controller = Controller()
    # controller.parseArguments(args)
    kwargs = vars(args)
    drep.d_cluster.controller.d_cluster_wrapper(wd_loc, **kwargs)

    # Verify
    Swd = WorkDirectory(s_wd_loc)
    wd = WorkDirectory(wd_loc)

    # Confirm Mdb.csv is correct
    db1 = Swd.get_db('Mdb')
    db2 = wd.get_db('Mdb')
    # assert compare_dfs(db1, db2), "{0} is not the same!".format('Mdb')

    # Confirm Ndb.csv doesn't exist
    db2 = wd.get_db('Ndb')
    assert db2.empty, 'Ndb is not empty'
def functional_test_2(self):
    ''' Cluster the 5 genomes using gANI '''
    genomes = self.genomes
    wd_loc = self.wd_loc
    s_wd_loc = self.s_wd_loc

    # Make sure gANI is installed
    loc, works = find_program('ANIcalculator')
    if loc is None or not works:
        print('Cannot locate the program {0} - skipping related tests'.format(
            'ANIcalculator (for gANI)'))
        return

    args = argumentParser.parse_args(['cluster', wd_loc, '--S_algorithm',
                                      'gANI', '-g'] + genomes)
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    Swd = WorkDirectory(s_wd_loc)
    wd = WorkDirectory(wd_loc)

    # Confirm Cdb.csv is correct
    db1 = Swd.get_db('Cdb')
    del db1['comparison_algorithm']
    db2 = wd.get_db('Cdb')
    del db2['comparison_algorithm']
    assert compare_dfs(db1, db2), "{0} is not the same!".format('Cdb')
def test_cluster_functional_1(self):
    ''' Cluster the 5 genomes using default settings '''
    genomes = self.genomes
    wd_loc = self.wd_loc
    s_wd_loc = self.s_wd_loc

    args = argumentParser.parse_args(['dereplicate', wd_loc, '-g'] + genomes)
    kwargs = vars(args)
    drep.d_cluster.controller.d_cluster_wrapper(wd_loc, **kwargs)

    # args = argumentParser.parse_args(['cluster', wd_loc, '-g'] + genomes)
    # controller = Controller()
    # controller.parseArguments(args)

    # Verify
    Swd = WorkDirectory(s_wd_loc)
    wd = WorkDirectory(wd_loc)

    # Confirm Cdb.csv is correct
    db1 = Swd.get_db('Cdb')
    db2 = wd.get_db('Cdb')
    assert test_utils.compare_dfs(db1, db2), "{0} is not the same!".format('Cdb')
def test_cluster_functional_4(self):
    ''' Cluster the 5 genomes using fastANI '''
    genomes = self.genomes
    wd_loc = self.wd_loc
    s_wd_loc = self.s_wd_loc

    args = argumentParser.parse_args(['dereplicate', wd_loc, '--S_algorithm',
                                      'fastANI', '-g'] + genomes)
    # controller = Controller()
    # controller.parseArguments(args)
    # args = argumentParser.parse_args(['dereplicate', wd_loc, '--S_algorithm', 'ANImf', '-g'] + genomes)
    kwargs = vars(args)
    drep.d_cluster.controller.d_cluster_wrapper(wd_loc, **kwargs)

    # Verify
    Swd = WorkDirectory(s_wd_loc)
    wd = WorkDirectory(wd_loc)

    # Confirm Cdb.csv is correct
    db1 = Swd.get_db('Cdb')
    del db1['comparison_algorithm']
    db2 = wd.get_db('Cdb')
    del db2['comparison_algorithm']
    assert test_utils.compare_dfs(db1, db2), "{0} is not the same!".format('Cdb')
def unit_test_2(self):
    ''' Try out the --noQualityFiltering argument for choose '''
    # Delete Chdb
    wd_loc = self.working_wd_loc
    os.remove(wd_loc + '/data_tables/Chdb.csv')
    os.remove(wd_loc + '/data_tables/Sdb.csv')
    os.remove(wd_loc + '/data_tables/Wdb.csv')

    # Run choose with --noQualityFiltering
    args = argumentParser.parse_args(
        ['choose', wd_loc, '--noQualityFiltering'])
    controller = Controller()
    controller.parseArguments(args)

    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)
    for db in ['Sdb', 'Wdb', 'genomeInformation']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)
        assert not compare_dfs(db1, db2), "{0} is the same!".format(db)

    sdb = wd.get_db('Sdb')
    for s in sdb['score'].tolist():
        assert (s > 0) & (s < 5)
def unit_test_1(self):
    ''' Ensure choose can handle when Chdb is not present, running checkM automatically '''
    # Delete Chdb
    wd_loc = self.working_wd_loc
    os.remove(wd_loc + '/data_tables/Chdb.csv')

    # Modify Bdb so the genome locations are right
    genomes = load_test_genomes()
    g2l = {os.path.basename(g): g for g in genomes}
    Bdb = pd.read_csv(wd_loc + '/data_tables/Bdb.csv')
    Bdb['location'] = Bdb['genome'].map(g2l)
    Bdb.to_csv(wd_loc + '/data_tables/Bdb.csv', index=False)

    # Run choose - this should re-run checkM and re-generate Chdb
    args = argumentParser.parse_args(['choose', wd_loc, '--checkM_method',
                                      'taxonomy_wf'])
    controller = Controller()
    controller.parseArguments(args)

    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)
    for db in ['Chdb', 'genomeInformation']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)
        assert compare_dfs(db1, db2), "{0} is not the same!".format(db)
def taxTest1(self):
    ''' Check the taxonomy call for max method '''
    genomes = self.genomes
    wd_loc = self.wd_loc
    swd_loc = self.s_wd_loc

    # Call the command
    args = argumentParser.parse_args(
        ['bonus', wd_loc, '-g'] + genomes +
        ['--run_tax', '--cent_index',
         '/home/mattolm/download/centrifuge/indices/b+h+v',
         '--tax_method', 'max'])
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    Swd = WorkDirectory(swd_loc)
    wd = WorkDirectory(wd_loc)

    tdbS = Swd.get_db('Bdb')
    tdb = wd.get_db('Bdb')
    del tdbS['location']
    del tdb['location']
    assert compare_dfs(tdb, tdbS), "{0} is not the same!".format('Bdb')

    tdbS = Swd.get_db('Tdb')
    tdb = wd.get_db('Tdb')
    if not compare_dfs(tdb, tdbS):
        print("{0} is not the same! May be due to centrifuge index issues".format('Tdb'))
        # Note: pd.Panel only exists in older pandas versions (it was removed in pandas 1.0)
        my_panel = pd.Panel(dict(df1=tdbS, df2=tdb))
        print(my_panel.apply(report_diff, axis=0))
    assert True
def test_choose_2(self):
    ''' Try out the --ignoreGenomeQuality argument for choose '''
    # Delete Chdb
    wd_loc = self.working_wd_loc
    os.remove(wd_loc + '/data_tables/Chdb.csv')
    os.remove(wd_loc + '/data_tables/Sdb.csv')
    os.remove(wd_loc + '/data_tables/Wdb.csv')

    # Run choose with --ignoreGenomeQuality
    args = argumentParser.parse_args(
        ['dereplicate', wd_loc, '--ignoreGenomeQuality'])
    kwargs = vars(args)
    del kwargs['genomes']
    drep.d_choose.d_choose_wrapper(wd_loc, **kwargs)

    # controller = Controller()
    # controller.parseArguments(args)

    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)
    for db in ['Sdb', 'Wdb', 'genomeInformation']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)
        assert not test_utils.compare_dfs(db1, db2), "{0} is the same!".format(db)

    sdb = wd.get_db('Sdb')
    for s in sdb['score'].tolist():
        assert (s > 0) & (s < 5)

    gdb = wd.get_db('genomeInformation')
    assert 'centrality' in gdb.columns
def functional_test_1(self):
    '''
    Call filter on 'Enterococcus_faecalis_T2.fna'

    Should call both prodigal and checkM
    '''
    genomes = self.genomes
    wd_loc = self.wd_loc

    # make sure calling it on the right genome
    genome = [
        g for g in genomes if g.endswith('Enterococcus_faecalis_T2.fna')
    ]
    assert len(genome) == 1
    genome = genome[0]

    args = argumentParser.parse_args(['filter', wd_loc, '-g', genome] +
                                     ['--checkM_method', 'taxonomy_wf'])
    controller = Controller()
    controller.parseArguments(args)

    # Confirm Chdb.csv is correct
    wd = drep.WorkDirectory.WorkDirectory(wd_loc)
    chdb = wd.get_db('Chdb')
    assert chdb['Completeness'].tolist()[0] == 98.28

    # Confirm genome is in genomeInfo
    Gdb = wd.get_db('genomeInfo')
    assert Gdb['completeness'].tolist()[0] == 98.28
def test_dereplicate_8(self):
    ''' Test greedy clustering with some primary clusters only having a single member '''
    if len(self.large_genome_set) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    genomes = [self.large_genome_set[0], self.large_genome_set[20]]
    wd_loc = self.wd_loc
    wd_loc2 = self.wd_loc2

    # Get greedy results
    args = argumentParser.parse_args([
        'compare', wd_loc2, '--S_algorithm', 'fastANI',
        '--multiround_primary_clustering', '--primary_chunksize', '50',
        '--greedy_secondary_clustering', '-sa', '0.95', '-pa', '0.99', '-g'
    ] + genomes)
    Controller().parseArguments(args)
    wd = WorkDirectory(wd_loc2)
    CSdb = wd.get_db('Cdb')

    # Run normal
    args = argumentParser.parse_args([
        'compare', wd_loc, '--S_algorithm', 'fastANI',
        '--multiround_primary_clustering', '--primary_chunksize', '50',
        '-sa', '0.95', '-pa', '0.99', '-g'
    ] + genomes)
    Controller().parseArguments(args)

    # Verify they're the same
    wd = WorkDirectory(wd_loc)
    Cdb = wd.get_db('Cdb')

    assert len(CSdb) == len(Cdb)
    for c in ['primary_cluster', 'secondary_cluster']:
        assert set(CSdb[c].value_counts().to_dict().values()) == set(
            Cdb[c].value_counts().to_dict().values()), c
        if c != 'secondary_cluster':
            assert set(CSdb[c].value_counts().to_dict().keys()) == set(
                Cdb[c].value_counts().to_dict().keys()
            )  # , [set(CSdb[c].value_counts().to_dict().keys()), set(Cdb[c].value_counts().to_dict().keys())]
    assert set(CSdb['genome'].tolist()) == set(Cdb['genome'].tolist())
    assert set(CSdb.columns) - set(Cdb.columns) == set(
        ['greedy_representative'])
def test_filer_functional_4(self):
    """
    Test some logging things
    """
    # Capture all logging (self._caplog is pytest's built-in caplog fixture,
    # attached to the test class elsewhere in the suite)
    self._caplog.set_level(0)

    args = argumentParser.parse_args(['dereplicate', self.wd_loc, '-g'] + self.genomes)
    kwargs = vars(args)

    # Run the "verify" thing
    bdb = drep.d_cluster.utils.load_genomes(kwargs['genomes'])
    drep.d_filter.sanity_check(bdb, **kwargs)

    for logger_name, log_level, message in self._caplog.record_tuples:
        assert message == '5 genomes were input to dRep'

    # Make sure it warns correctly
    self._caplog.clear()
    args = argumentParser.parse_args(
        ['dereplicate', self.wd_loc, '--primary_chunksize', '4', '-g'] + self.genomes)
    kwargs = vars(args)
    bdb = drep.d_cluster.utils.load_genomes(kwargs['genomes'])
    drep.d_filter.sanity_check(bdb, **kwargs)

    got = False
    for logger_name, log_level, message in self._caplog.record_tuples:
        if 'genomes and arent using greedy algorithms' in message:
            got = True
    assert got

    # Make sure it doesn't warn incorrectly
    self._caplog.clear()
    args = argumentParser.parse_args(
        ['dereplicate', self.wd_loc, '--primary_chunksize', '4',
         '--multiround_primary_clustering', '-g'] + self.genomes)
    kwargs = vars(args)
    bdb = drep.d_cluster.utils.load_genomes(kwargs['genomes'])
    drep.d_filter.sanity_check(bdb, **kwargs)

    got = False
    for logger_name, log_level, message in self._caplog.record_tuples:
        if 'genomes and arent using greedy algorithms' in message:
            got = True
    assert not got
def test_dereplicate_5(self):
    ''' Test greedy clustering '''
    genomes = self.large_genome_set[:10]
    wd_loc = self.wd_loc
    wd_loc2 = self.wd_loc2

    if len(genomes) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    # Get greedy results
    args = argumentParser.parse_args([
        'compare', wd_loc2, '--S_algorithm', 'fastANI', '--SkipMash',
        '--greedy_secondary_clustering', '-sa', '0.95', '-g'
    ] + genomes)
    Controller().parseArguments(args)
    wd = WorkDirectory(wd_loc2)
    CSdb = wd.get_db('Cdb')

    # Run normal
    args = argumentParser.parse_args([
        'compare', wd_loc, '--S_algorithm', 'fastANI', '--SkipMash',
        '-sa', '0.95', '-g'
    ] + genomes)
    Controller().parseArguments(args)

    # Verify they're the same
    wd = WorkDirectory(wd_loc)
    Cdb = wd.get_db('Cdb')

    assert len(CSdb) == len(Cdb)
    for c in ['primary_cluster', 'secondary_cluster']:
        assert set(CSdb[c].value_counts().to_dict().values()) == set(
            Cdb[c].value_counts().to_dict().values()), c
        assert set(CSdb[c].value_counts().to_dict().keys()) == set(
            Cdb[c].value_counts().to_dict().keys()), c
    assert set(CSdb['genome'].tolist()) == set(Cdb['genome'].tolist())
    assert set(CSdb.columns) - set(Cdb.columns) == set(
        ['greedy_representative'])
def test_dereplicate_4(self):
    ''' Test the ability of primary clustering to take a large genome set and break it into chunks '''
    genomes = self.large_genome_set
    wd_loc = self.wd_loc
    wd_loc2 = self.wd_loc2

    if len(genomes) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    # Get normal results
    args = argumentParser.parse_args([
        'compare', wd_loc2, '--S_algorithm', 'fastANI', '--SkipSecondary',
        '--primary_chunksize', '50', '-g'
    ] + genomes)
    Controller().parseArguments(args)
    wd = WorkDirectory(wd_loc2)
    CSdb = wd.get_db('Cdb')

    # Run with chunking
    args = argumentParser.parse_args([
        'compare', wd_loc, '--S_algorithm', 'fastANI', '--SkipSecondary',
        '--multiround_primary_clustering', '--primary_chunksize', '50', '-g'
    ] + genomes)
    Controller().parseArguments(args)

    # Verify they're the same
    wd = WorkDirectory(wd_loc)
    Cdb = wd.get_db('Cdb')

    assert len(CSdb) == len(Cdb)
    for c in ['primary_cluster', 'secondary_cluster']:
        assert set(CSdb[c].value_counts().to_dict().values()) == set(
            Cdb[c].value_counts().to_dict().values())
        assert set(CSdb[c].value_counts().to_dict().keys()) == set(
            Cdb[c].value_counts().to_dict().keys())
    assert set(CSdb['genome'].tolist()) == set(Cdb['genome'].tolist())
    assert set(Cdb.columns) - set(CSdb.columns) == set(
        ['length', 'subcluster', 'primary_representitive'])
def test_filer_functional_3(self):
    ''' Test the sanity check to make sure there are no duplicate genome names or things like that '''
    # Capture all logging
    self._caplog.set_level(0)

    wd_loc = self.wd_loc

    # Make a genome info
    genomes = self.genomes
    table = {}
    atts = ['completeness', 'contamination', 'strain_heterogeneity']
    for a in atts:
        table[a] = []
    table['genome'] = []
    table['location'] = []
    for g in genomes:
        table['genome'].append(os.path.basename(g))
        table['location'].append(g)
        for a in atts:
            table[a].append(10)
    Idb = pd.DataFrame(table)

    if not os.path.isdir(self.testdir):
        os.mkdir(self.testdir)
    GI_loc = os.path.join(self.testdir, 'genomeInfo.csv')
    Idb.to_csv(GI_loc, index=False)

    # Add a genome with the same name at a different location
    sgenomes = genomes + [self.stinker_genome]

    args = argumentParser.parse_args(['dereplicate', wd_loc, '-g'] + sgenomes +
                                     ['--genomeInfo', GI_loc])
    kwargs = vars(args)

    # Make sure it fails
    failed = True
    try:
        drep.d_filter.d_filter_wrapper(wd_loc, **kwargs)
        failed = False
    except:
        pass
    assert failed

    # Verify logs
    got = False
    for logger_name, log_level, message in self._caplog.record_tuples:
        if 'You have duplicate genome basenames!' in message:
            got = True
    assert got
def functional_test_2(self):
    ''' Ensure analyze crashes gracefully '''
    wd_loc = self.working_wd_loc

    wd = drep.WorkDirectory.WorkDirectory(wd_loc)
    os.remove(os.path.join(wd.get_dir('data_tables'), 'Mdb.csv'))
    os.remove(os.path.join(wd.get_dir('data_tables'), 'Cdb.csv'))
    os.remove(os.path.join(wd.get_dir('data_tables'), 'Bdb.csv'))

    args = argumentParser.parse_args(['analyze', self.working_wd_loc, '-pl'] + ['a'])
    controller = Controller()
    controller.parseArguments(args)
def test_list_genome_load(self):
    ''' Test inputting a list of genomes via a text file '''
    bdb = drep.d_cluster.utils.load_genomes(self.genomes)
    data_folder = self.test_dir

    # Make the list of genomes
    if not os.path.exists(data_folder):
        os.mkdir(data_folder)
    genome_loc = os.path.join(data_folder, 'genomes.txt')
    with open(genome_loc, 'w') as o:
        for i, row in bdb.iterrows():
            o.write(row['location'] + '\n')

    # Test it out
    wd_loc = self.wd_loc
    s_wd_loc = self.s_wd_loc

    # args = argumentParser.parse_args(['cluster', wd_loc, '--S_algorithm',
    #                                   'fastANI', '-g', genome_loc])
    # controller = Controller()
    # controller.parseArguments(args)

    args = argumentParser.parse_args(
        ['dereplicate', wd_loc, '--S_algorithm', 'fastANI', '-g', genome_loc])
    kwargs = vars(args)
    # del kwargs['genomes']
    # drep.d_cluster.d_cluster_wrapper(wd_loc, **kwargs)
    drep.d_cluster.controller.d_cluster_wrapper(wd_loc, **kwargs)

    # Verify
    Swd = WorkDirectory(s_wd_loc)
    wd = WorkDirectory(wd_loc)

    # Confirm Cdb.csv is correct
    db1 = Swd.get_db('Cdb')
    del db1['comparison_algorithm']
    db2 = wd.get_db('Cdb')
    del db2['comparison_algorithm']
    assert test_utils.compare_dfs(db1, db2), "{0} is not the same!".format('Cdb')

    Ndb = drep.d_cluster.compare_utils.compare_genomes(bdb, 'fastANI', data_folder)
    db = Ndb[(Ndb['reference'] == 'Enterococcus_faecalis_T2.fna')
             & (Ndb['querry'] == 'Enterococcus_casseliflavus_EC20.fasta')]
    assert (db['ani'].tolist()[0] > 0.7) & (db['ani'].tolist()[0] < 0.8)
def test_unit_1(self):
    ''' Test a normal run of cluster '''
    # normal complete run
    args = argumentParser.parse_args(
        ['dereplicate', self.working_wd_loc, '--S_algorithm', 'ANImf', '-g'] +
        self.genomes)
    kwargs = vars(args)
    drep.d_cluster.controller.d_cluster_wrapper(self.working_wd_loc, **kwargs)

    # Verify
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)

    # Confirm the following are correct:
    # for db in ['Cdb', 'Mdb', 'Ndb']:
    for db in ['Cdb', 'Ndb']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)

        # get rid of some precision on the ANI; you are comparing fastANI with ANImf
        if db == 'Ndb':
            db1['ani'] = [round(x, 3) for x in db1['ani']]
            db2['ani'] = [round(x, 3) for x in db2['ani']]
            db1['alignment_length'] = [
                round(x, -6) for x in db1['alignment_length']
            ]
            db2['alignment_length'] = [
                round(x, -6) for x in db2['alignment_length']
            ]
            # db1 = db1[db2.columns]
            db1 = db1[['ani', 'alignment_length', 'querry', 'reference']]
            db2 = db2[['ani', 'alignment_length', 'querry', 'reference']]
            db1 = db1.sort_values(['querry', 'reference']).reset_index(drop=True)
            db2 = db2.sort_values(['querry', 'reference']).reset_index(drop=True)

        if db == 'Cdb':
            db1 = db1[['genome', 'secondary_cluster'
                       ]].sort_values('genome').reset_index(drop=True)
            db2 = db2[['genome', 'secondary_cluster'
                       ]].sort_values('genome').reset_index(drop=True)

        assert test_utils.compare_dfs2(
            db1, db2, verbose=True), "{0} is not the same!".format(db)
def test_filer_functional_2(self):
    ''' Call filter on 'Enterococcus_faecalis_T2.fna' with GenomeInfo provided '''
    genomes = self.genomes
    wd_loc = self.wd_loc

    # make sure calling it on the right genome
    genome = [g for g in genomes if g.endswith('Enterococcus_faecalis_T2.fna')]
    assert len(genome) == 1
    genome = genome[0]

    table = {}
    atts = ['completeness', 'contamination', 'strain_heterogeneity']
    for a in atts:
        table[a] = []
    table['genome'] = []
    table['location'] = []
    for g in [genome]:
        table['genome'].append(os.path.basename(g))
        table['location'].append(g)
        for a in atts:
            table[a].append(10)
    Idb = pd.DataFrame(table)

    if not os.path.isdir(self.testdir):
        os.mkdir(self.testdir)
    GI_loc = os.path.join(self.testdir, 'genomeInfo.csv')
    Idb.to_csv(GI_loc, index=False)

    args = argumentParser.parse_args(['dereplicate', wd_loc, '-g', genome] +
                                     ['--genomeInfo', GI_loc])
    # controller = Controller()
    # controller.parseArguments(args)
    kwargs = vars(args)
    drep.d_filter.d_filter_wrapper(wd_loc, **kwargs)

    # Load the work directory
    wd = drep.WorkDirectory.WorkDirectory(wd_loc)

    # Confirm genome is in genomeInfo
    Gdb = wd.get_db('genomeInfo')
    assert Gdb['completeness'].tolist()[0] == 10
def unit_tests_5(self):
    ''' Test changing cluster --S_algorithm gANI '''
    # normal complete run
    args = argumentParser.parse_args(['cluster', self.working_wd_loc, '-g'] +
                                     self.genomes + ['--S_algorithm', 'gANI'])
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)

    # Confirm the following are correct:
    for db in ['Cdb', 'Mdb']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)
        assert compare_dfs(db1, db2), "{0} is not the same!".format(db)
def functional_test_1(self):
    ''' Ensure analyze produces all plots '''
    args = argumentParser.parse_args(['analyze', self.working_wd_loc, '-pl'] + ['a'])
    controller = Controller()
    controller.parseArguments(args)

    FIGS = ['Cluster_scoring.pdf', 'Clustering_scatterplots.pdf',
            'Primary_clustering_dendrogram.pdf', 'Secondary_clustering_dendrograms.pdf',
            'Winning_genomes.pdf', 'Secondary_clustering_MDS.pdf']

    fig_dir = os.path.join(self.working_wd_loc, 'figures', '')
    figs = [os.path.basename(f) for f in glob.glob(fig_dir + '*')]

    assert sorted(figs) == sorted(FIGS)
    for fig in glob.glob(fig_dir + '*'):
        assert os.path.getsize(fig) > 0
def unit_tests_3(self):
    ''' Test cluster with --SkipMash '''
    # normal complete run
    args = argumentParser.parse_args(['cluster', self.working_wd_loc, '-g'] +
                                     self.genomes + ['--SkipMash'])
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)

    # Confirm the following are not the same:
    for db in ['Cdb', 'Ndb', 'Mdb']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)
        assert not compare_dfs(
            db1, db2), "{0} is the same! (and shouldn't be)".format(db)
def functional_test_2(self):
    genomes = self.genomes
    wd_loc = self.wd_loc
    s_wd_loc = self.s_wd_loc
    sanity_check(WorkDirectory(s_wd_loc))

    args = argumentParser.parse_args(['compare', wd_loc, '-g'] + genomes)
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    s_wd = WorkDirectory(s_wd_loc)
    wd = WorkDirectory(wd_loc)
    ensure_identicle(s_wd, wd, skip=['Bdb', 'Chdb', 'Sdb', 'Wdb', 'Widb',
                                     'genomeInformation', 'Mdb'])

    # Perform sanity check to make sure the solutions directory isn't
    # being overwritten
    sanity_check(s_wd)
def functional_test_1(self):
    genomes = self.genomes
    wd_loc = self.wd_loc
    s_wd_loc = self.s_wd_loc
    sanity_check(WorkDirectory(s_wd_loc))

    args = argumentParser.parse_args(['dereplicate', wd_loc, '-g'] + genomes +
                                     ['--checkM_method', 'taxonomy_wf'])
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    s_wd = WorkDirectory(s_wd_loc)
    wd = WorkDirectory(wd_loc)
    ensure_identicle(s_wd, wd, skip=['Bdb', 'Mdb'])

    # Perform sanity check to make sure the solutions directory isn't
    # being overwritten
    sanity_check(s_wd)
def functional_test_1(self):
    ''' Cluster the 5 genomes using default settings '''
    genomes = self.genomes
    wd_loc = self.wd_loc
    s_wd_loc = self.s_wd_loc

    args = argumentParser.parse_args(['cluster', wd_loc, '-g'] + genomes)
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    Swd = WorkDirectory(s_wd_loc)
    wd = WorkDirectory(wd_loc)

    # Confirm Cdb.csv is correct
    db1 = Swd.get_db('Cdb')
    db2 = wd.get_db('Cdb')
    assert compare_dfs(db1, db2), "{0} is not the same!".format('Cdb')
def test_dereplicate_6(self):
    ''' Test zipped genomes '''
    genomes = self.zipped_genomes
    wd_loc = self.wd_loc
    s_wd_loc = self.s_wd_loc
    test_utils.sanity_check(WorkDirectory(s_wd_loc))

    args = argumentParser.parse_args(
        ['compare', wd_loc, '--S_algorithm', 'fastANI', '-g'] + genomes)
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    wd = WorkDirectory(wd_loc)
    anis = wd.get_db('Ndb')['ani'].tolist()
    assert max(anis) <= 1
    assert min(anis) >= 0
    assert len(set(anis)) > 1