def parseArguments(self, args):
    ''' Prepare the work directory and logger, then run the requested operation

    args is the parsed argument namespace. Its ``work_directory`` attribute
    locates the work directory, and its ``operation`` attribute selects the
    ``<operation>_operation`` method to call. Every attribute of args is
    forwarded to that method as a keyword argument. An operation name that
    is not in the known list triggers no workflow call.
    '''
    # Open the work directory from its absolute path
    work_dir = WorkDirectory(str(os.path.abspath(args.work_directory)))

    # Point the logger at the work directory's log location
    self.setup_logger(work_dir.get_loc('log'))
    logging.debug(str(args))

    # Dispatch to the workflow whose name matches the requested operation
    known_operations = (
        "dereplicate",
        "compare",
        "filter",
        "cluster",
        "analyze",
        "choose",
        "adjust",
        "bonus",
        "evaluate",
    )
    for name in known_operations:
        if args.operation == name:
            getattr(self, name + "_operation")(**vars(args))
def parseArguments(self, args):
    ''' Validate the parsed options and run the matching pipeline

    The ``check_dependencies`` operation is handled first and returns
    without touching the work directory. Otherwise the work directory is
    opened, the logger is set up, and ``dereplicate`` / ``compare`` are
    dispatched with every attribute of args passed as keyword arguments.

    Raises ValueError when tertiary clustering is requested for any
    operation other than ``dereplicate``.
    '''
    operation = args.operation

    # The dependency check needs neither a work directory nor a logger
    if operation == 'check_dependencies':
        drep.d_bonus.check_dependencies(print_out=True)
        return

    # Open the work directory from its absolute path
    work_dir = WorkDirectory(str(os.path.abspath(args.work_directory)))

    # Point the logger at the work directory's log location
    self.setup_logger(work_dir.get_loc('log'))
    logging.debug(str(args))

    # Tertiary clustering is only accepted together with dereplicate
    if args.run_tertiary_clustering and operation != "dereplicate":
        raise ValueError(
            "Can only run tertiary clustering with dereplicate")

    # Run the requested workflow
    if operation == "dereplicate":
        self.dereplicate_operation(**vars(args))
    elif operation == "compare":
        self.compare_operation(**vars(args))
def test_choose_2(self):
    ''' Run the choose wrapper with --ignoreGenomeQuality

    Chdb, Sdb and Wdb are deleted from the working directory first, then
    the choose wrapper is called with the arguments of a dereplicate run
    (minus ``genomes``). The regenerated Sdb, Wdb and genomeInformation
    tables are expected to differ from the solutions directory, every score
    must lie strictly between 0 and 5, and genomeInformation must contain a
    ``centrality`` column.
    '''
    # Delete the tables that choose has to regenerate
    wd_loc = self.working_wd_loc
    for table in ('Chdb', 'Sdb', 'Wdb'):
        os.remove(wd_loc + '/data_tables/' + table + '.csv')

    # Run choose with --ignoreGenomeQuality
    args = argumentParser.parse_args(
        ['dereplicate', wd_loc, '--ignoreGenomeQuality'])
    kwargs = vars(args)
    # 'genomes' is removed before the remaining options are forwarded
    del kwargs['genomes']
    drep.d_choose.d_choose_wrapper(wd_loc, **kwargs)

    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)

    # These tables must NOT match the solutions directory
    for db in ['Sdb', 'Wdb', 'genomeInformation']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)
        assert not test_utils.compare_dfs(db1, db2), "{0} is the same!".format(db)

    # Every score must lie strictly between 0 and 5
    sdb = wd.get_db('Sdb')
    for s in sdb['score'].tolist():
        assert 0 < s < 5, "score {0} is out of range".format(s)

    gdb = wd.get_db('genomeInformation')
    assert 'centrality' in gdb.columns
def taxTest1(self):
    ''' Check the taxonomy call for the max method

    Runs the bonus operation with ``--run_tax --tax_method max`` and checks
    Bdb (ignoring the ``location`` column) against the solutions directory.
    A Tdb mismatch is only reported, not asserted, because the message below
    attributes it to possible centrifuge index differences.
    '''
    genomes = self.genomes
    wd_loc = self.wd_loc
    swd_loc = self.s_wd_loc

    # Call the command
    # NOTE(review): the centrifuge index path is hard-coded to one machine
    args = argumentParser.parse_args(
        ['bonus', wd_loc, '-g'] + genomes
        + ['--run_tax', '--cent_index',
           '/home/mattolm/download/centrifuge/indices/b+h+v',
           '--tax_method', 'max'])
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    Swd = WorkDirectory(swd_loc)
    wd = WorkDirectory(wd_loc)

    # Bdb must match once the 'location' column is dropped from both sides
    tdbS = Swd.get_db('Bdb')
    tdb = wd.get_db('Bdb')
    del tdbS['location']
    del tdb['location']
    assert compare_dfs(tdb, tdbS), "{0} is not the same!".format('Bdb')

    # Tdb differences are reported but tolerated
    tdbS = Swd.get_db('Tdb')
    tdb = wd.get_db('Tdb')
    if not compare_dfs(tdb, tdbS):
        print("{0} is not the same! May be due to centrifuge index issues".
              format('Tdb'))
        # pd.Panel no longer exists in current pandas, so both tables are
        # printed instead of building a Panel-based diff.
        print("solution Tdb:")
        print(tdbS)
        print("generated Tdb:")
        print(tdb)
def taxTest2(self):
    ''' Check the taxonomy call for the percent method

    Runs the bonus operation with ``--run_tax --tax_method percent`` and
    compares the resulting Bdb (without ``location``) and Tdb against the
    BdbP and TdbP tables of the solutions directory.
    '''
    work_loc = self.wd_loc
    solutions_loc = self.s_wd_loc

    # Build and run the command line
    cli = ['bonus', work_loc, '-g'] + self.genomes
    cli += ['--run_tax', '--cent_index',
            '/home/mattolm/download/centrifuge/indices/b+h+v',
            '--tax_method', 'percent']
    Controller().parseArguments(argumentParser.parse_args(cli))

    solutions = WorkDirectory(solutions_loc)
    results = WorkDirectory(work_loc)

    # Bdb: compare with the 'location' column dropped from both tables
    expected = solutions.get_db('BdbP')
    observed = results.get_db('Bdb')
    for frame in (expected, observed):
        del frame['location']
    assert compare_dfs(observed, expected), "{0} is not the same!".format('Bdb')

    # Tdb: compare as loaded
    expected = solutions.get_db('TdbP')
    observed = results.get_db('Tdb')
    assert compare_dfs(observed, expected), "{0} is not the same!".format('Tdb')
def test_skipsecondary(self):
    ''' Run the cluster wrapper with --SkipSecondary and expect an empty Ndb '''
    work_loc = self.wd_loc
    solutions_loc = self.s_wd_loc

    cli = ['dereplicate', work_loc, '-g'] + self.genomes + ['--SkipSecondary']
    parsed = argumentParser.parse_args(cli)
    drep.d_cluster.controller.d_cluster_wrapper(work_loc, **vars(parsed))

    # Verify
    solutions = WorkDirectory(solutions_loc)
    results = WorkDirectory(work_loc)

    # Mdb is still loaded from both directories; its comparison is disabled
    solutions.get_db('Mdb')
    results.get_db('Mdb')

    # With secondary clustering skipped, Ndb must come back empty
    ndb = results.get_db('Ndb')
    assert ndb.empty, 'Ndb is not empty'
def unit_test_2(self):
    ''' Run choose with --noQualityFiltering after deleting Chdb, Sdb and Wdb

    The regenerated Sdb, Wdb and genomeInformation tables are expected to
    differ from the solutions directory, and every score must lie strictly
    between 0 and 5.
    '''
    work_loc = self.working_wd_loc

    # Remove the tables that choose has to regenerate
    os.remove(work_loc + '/data_tables/Chdb.csv')
    os.remove(work_loc + '/data_tables/Sdb.csv')
    os.remove(work_loc + '/data_tables/Wdb.csv')

    # Run choose without quality filtering
    parsed = argumentParser.parse_args(
        ['choose', work_loc, '--noQualityFiltering'])
    Controller().parseArguments(parsed)

    solutions = WorkDirectory(self.s_wd_loc)
    results = WorkDirectory(self.working_wd_loc)

    # None of these tables may match the solutions directory
    for table in ['Sdb', 'Wdb', 'genomeInformation']:
        expected = solutions.get_db(table)
        observed = results.get_db(table)
        assert not compare_dfs(expected, observed), "{0} is the same!".format(table)

    # Scores must fall strictly inside (0, 5)
    for score in results.get_db('Sdb')['score'].tolist():
        assert 0 < score < 5
def unit_tests_6(self):
    ''' Test drep call commands

    Runs a single ``mash dist`` shell command through ``drep.run_cmd`` and
    checks that the command-log folder then holds exactly three entries.
    '''
    work_dir = WorkDirectory(self.working_wd_loc)
    mash_dir = work_dir.get_dir('MASH')
    cmd_log_dir = work_dir.get_dir('cmd_logs')

    # The sketch file is compared against itself; output is shell-redirected
    sketch = mash_dir + 'ALL.msh'
    pieces = ['mash', 'dist', sketch, sketch, '>', mash_dir + 'MASH_table.tsv']
    drep.run_cmd(' '.join(pieces), shell=True, logdir=cmd_log_dir)

    logged = glob.glob(cmd_log_dir + '*')
    assert len(logged) == 3
def functional_test_2(self):
    ''' Run the compare operation and check its tables against the solutions

    The tables listed in ``skip`` are excluded from the comparison. A sanity
    check of the solutions directory runs before and after, to make sure it
    is not being overwritten.
    '''
    work_loc = self.wd_loc
    solutions_loc = self.s_wd_loc

    sanity_check(WorkDirectory(solutions_loc))

    parsed = argumentParser.parse_args(['compare', work_loc, '-g'] + self.genomes)
    Controller().parseArguments(parsed)

    # Verify
    solutions = WorkDirectory(solutions_loc)
    results = WorkDirectory(work_loc)
    skipped_tables = ['Bdb', 'Chdb', 'Sdb', 'Wdb', 'Widb',
                      'genomeInformation', 'Mdb']
    ensure_identicle(solutions, results, skip=skipped_tables)

    # The solutions directory must be untouched by the run
    sanity_check(solutions)
def functional_test_1(self):
    ''' Run dereplicate with the taxonomy_wf checkM method and check the result

    Everything except Bdb and Mdb is compared against the solutions
    directory. A sanity check of the solutions directory runs before and
    after, to make sure it is not being overwritten.
    '''
    work_loc = self.wd_loc
    solutions_loc = self.s_wd_loc

    sanity_check(WorkDirectory(solutions_loc))

    cli = ['dereplicate', work_loc, '-g'] + self.genomes
    cli += ['--checkM_method', 'taxonomy_wf']
    Controller().parseArguments(argumentParser.parse_args(cli))

    # Verify
    solutions = WorkDirectory(solutions_loc)
    results = WorkDirectory(work_loc)
    ensure_identicle(solutions, results, skip=['Bdb', 'Mdb'])

    # The solutions directory must be untouched by the run
    sanity_check(solutions)
def test_dereplicate_6(self):
    ''' Test zipped genomes

    Runs compare with fastANI on the zipped genome set and checks that the
    ANI values in Ndb stay within [0, 1] and are not all identical.
    '''
    work_loc = self.wd_loc
    test_utils.sanity_check(WorkDirectory(self.s_wd_loc))

    cli = ['compare', work_loc, '--S_algorithm', 'fastANI', '-g']
    cli += self.zipped_genomes
    Controller().parseArguments(argumentParser.parse_args(cli))

    # Verify
    ani_values = WorkDirectory(work_loc).get_db('Ndb')['ani'].tolist()
    assert max(ani_values) <= 1
    assert min(ani_values) >= 0
    distinct_values = set(ani_values)
    assert len(distinct_values) > 1
def unit_tests_4(self):
    ''' Test changing cluster -pa

    Runs cluster with ``-pa 0.10`` and expects Ndb and Cdb to differ from
    the solutions directory.
    '''
    cli = ['cluster', self.working_wd_loc, '-g'] + self.genomes
    cli += ['-pa', '0.10']
    Controller().parseArguments(argumentParser.parse_args(cli))

    # Verify
    solutions = WorkDirectory(self.s_wd_loc)
    results = WorkDirectory(self.working_wd_loc)

    # These tables must NOT match the solutions
    for table in ['Ndb', 'Cdb']:
        expected = solutions.get_db(table)
        observed = results.get_db(table)
        matches = compare_dfs(expected, observed)
        assert not matches, "{0} is the same! (and shouldn't be)".format(table)
def test_cluster_functional_4(self):
    ''' Cluster the 5 genomes using fastANI

    Cdb is compared against the solutions directory with the
    ``comparison_algorithm`` column removed from both tables.
    '''
    work_loc = self.wd_loc
    solutions_loc = self.s_wd_loc

    cli = ['dereplicate', work_loc, '--S_algorithm', 'fastANI', '-g']
    parsed = argumentParser.parse_args(cli + self.genomes)
    drep.d_cluster.controller.d_cluster_wrapper(work_loc, **vars(parsed))

    # Verify
    solutions = WorkDirectory(solutions_loc)
    results = WorkDirectory(work_loc)

    # Confirm Cdb.csv is correct, ignoring which algorithm produced it
    expected = solutions.get_db('Cdb')
    del expected['comparison_algorithm']
    observed = results.get_db('Cdb')
    del observed['comparison_algorithm']
    assert test_utils.compare_dfs(expected, observed), "{0} is not the same!".format('Cdb')
def unit_test_1(self):
    ''' Ensure choose can handle a missing Chdb

    Chdb is deleted and the genome locations in Bdb are rewritten from the
    test genome paths. Choose is then run with the taxonomy_wf checkM
    method, and Chdb and genomeInformation must match the solutions.
    '''
    work_loc = self.working_wd_loc

    # Delete Chdb so choose has to regenerate it
    os.remove(work_loc + '/data_tables/Chdb.csv')

    # Rewrite Bdb so the genome locations point at the test genomes
    location_by_name = {}
    for genome_path in load_test_genomes():
        location_by_name[os.path.basename(genome_path)] = genome_path
    bdb_path = work_loc + '/data_tables/Bdb.csv'
    bdb = pd.read_csv(bdb_path)
    bdb['location'] = bdb['genome'].map(location_by_name)
    bdb.to_csv(bdb_path, index=False)

    # Run choose
    parsed = argumentParser.parse_args(
        ['choose', work_loc, '--checkM_method', 'taxonomy_wf'])
    Controller().parseArguments(parsed)

    solutions = WorkDirectory(self.s_wd_loc)
    results = WorkDirectory(self.working_wd_loc)
    for table in ['Chdb', 'genomeInformation']:
        expected = solutions.get_db(table)
        observed = results.get_db(table)
        assert compare_dfs(expected, observed), "{0} is not the same!".format(table)
def functional_test_2(self):
    ''' Cluster the 5 genomes using gANI

    The test returns early, after printing a notice, when ANIcalculator
    cannot be located or does not work. Otherwise Cdb is compared against
    the solutions with the ``comparison_algorithm`` column removed.
    '''
    work_loc = self.wd_loc
    solutions_loc = self.s_wd_loc

    # Make sure gANI is installed
    program_loc, program_works = find_program('ANIcalculator')
    unavailable = (program_loc == None) or (program_works == False)
    if unavailable:
        notice = 'Cannot locate the program {0}- skipping related tests'
        print(notice.format('ANIcalculator (for gANI)'))
        return

    cli = ['cluster', work_loc, '--S_algorithm', 'gANI', '-g'] + self.genomes
    Controller().parseArguments(argumentParser.parse_args(cli))

    # Verify
    solutions = WorkDirectory(solutions_loc)
    results = WorkDirectory(work_loc)

    # Confirm Cdb.csv is correct, ignoring which algorithm produced it
    expected = solutions.get_db('Cdb')
    del expected['comparison_algorithm']
    observed = results.get_db('Cdb')
    del observed['comparison_algorithm']
    assert compare_dfs(expected, observed), "{0} is not the same!".format('Cdb')
def test_unit_7(self):
    ''' Test cluster with --SkipSecondary

    Cdb and Ndb produced with secondary clustering skipped are expected to
    differ from the solutions directory.
    '''
    work_loc = self.working_wd_loc
    cli = ['dereplicate', work_loc, '-g'] + self.genomes + ['--SkipSecondary']
    parsed = argumentParser.parse_args(cli)
    drep.d_cluster.controller.d_cluster_wrapper(self.working_wd_loc, **vars(parsed))

    # Verify
    solutions = WorkDirectory(self.s_wd_loc)
    results = WorkDirectory(self.working_wd_loc)

    # These tables must NOT match the solutions
    for table in ['Cdb', 'Ndb']:
        expected = solutions.get_db(table)
        observed = results.get_db(table)
        matches = test_utils.compare_dfs(expected, observed)
        assert not matches, "{0} is the same! (and shouldn't be)".format(table)
def test_cluster_functional_1(self):
    ''' Cluster the 5 genomes using default settings

    The resulting Cdb must match the one in the solutions directory.
    '''
    work_loc = self.wd_loc
    solutions_loc = self.s_wd_loc

    parsed = argumentParser.parse_args(
        ['dereplicate', work_loc, '-g'] + self.genomes)
    drep.d_cluster.controller.d_cluster_wrapper(work_loc, **vars(parsed))

    # Verify
    solutions = WorkDirectory(solutions_loc)
    results = WorkDirectory(work_loc)

    # Confirm Cdb.csv is correct
    expected = solutions.get_db('Cdb')
    observed = results.get_db('Cdb')
    assert test_utils.compare_dfs(expected, observed), "{0} is not the same!".format('Cdb')
def test_dereplicate_1(self):
    ''' Run dereplicate with taxonomy_wf, --debug and ANImf and check the result

    The tables listed in ``skip`` are excluded from the comparison with the
    solutions directory. A sanity check of the solutions directory runs
    before and after, to make sure it is not being overwritten.
    '''
    work_loc = self.wd_loc
    solutions_loc = self.s_wd_loc

    test_utils.sanity_check(WorkDirectory(solutions_loc))

    options = ['--checkM_method', 'taxonomy_wf', '--debug',
               '--S_algorithm', 'ANImf']
    cli = ['dereplicate', work_loc, '-g'] + self.genomes + options
    Controller().parseArguments(argumentParser.parse_args(cli))

    # Verify
    solutions = WorkDirectory(solutions_loc)
    results = WorkDirectory(work_loc)
    skipped_tables = ['Bdb', 'Mdb', 'Sdb', 'Wdb', 'genomeInformation', 'Widb']
    test_utils.ensure_identicle(solutions, results, skip=skipped_tables)

    # The solutions directory must be untouched by the run
    test_utils.sanity_check(solutions)
def test_dereplicate_3(self):
    ''' Use goANI

    Runs compare with the goANI algorithm and checks that Ndb is not empty.
    A sanity check of the solutions directory runs before and after, to
    make sure it is not being overwritten.
    '''
    work_loc = self.wd_loc
    solutions_loc = self.s_wd_loc

    test_utils.sanity_check(WorkDirectory(solutions_loc))

    cli = ['compare', work_loc, '--S_algorithm', 'goANI', '-g'] + self.genomes
    Controller().parseArguments(argumentParser.parse_args(cli))

    # Verify
    solutions = WorkDirectory(solutions_loc)
    results = WorkDirectory(work_loc)
    comparisons = results.get_db('Ndb')
    assert len(comparisons) > 0

    # The solutions directory must be untouched by the run
    test_utils.sanity_check(solutions)
def test_taxonomy_4(self):
    ''' Try actually running centrifuge without prodigal done

    The test returns early, after printing a notice, when centrifuge is
    reported as not working. Otherwise the existing centrifuge and prodigal
    data folders are removed, the bonus taxonomy workflow is run with the
    percent method, and Bdb (without ``location``) and Tdb are compared
    against BdbP and TdbP from the solutions directory.
    '''
    _loc, works = drep.d_bonus.find_program('centrifuge')
    if works == False:
        print('Centrifuge not installed- skipping tests')
        return

    work_loc = self.wd_loc
    solutions_loc = self.s_wd_loc

    # Remove previous data run
    for folder in ('centrifuge', 'prodigal'):
        shutil.rmtree(os.path.join(self.wd_loc, 'data', folder))

    # Call the command
    cli = ['bonus', work_loc, '-g'] + self.genomes
    cli += ['--run_tax', '--cent_index',
            '/home/mattolm/download/centrifuge/indices/b+h+v',
            '--tax_method', 'percent']
    Controller().parseArguments(argumentParser.parse_args(cli))

    # Verify
    solutions = WorkDirectory(solutions_loc)
    results = WorkDirectory(work_loc)

    # Bdb: compare with the 'location' column dropped from both tables
    expected = solutions.get_db('BdbP')
    observed = results.get_db('Bdb')
    for frame in (expected, observed):
        del frame['location']
    assert test_utils.compare_dfs(
        observed, expected), "{0} is not the same!".format('Bdb')

    # Tdb: compare as loaded
    expected = solutions.get_db('TdbP')
    observed = results.get_db('Tdb')
    assert test_utils.compare_dfs(
        observed, expected), "{0} is not the same!".format('Tdb')
def test_list_genome_load(self):
    ''' Test inputting a list of genomes via a text file

    The genome locations are written one per line to ``genomes.txt`` and
    that file is passed to ``-g``. Cdb is then compared against the
    solutions (without ``comparison_algorithm``), and the fastANI value for
    one specific reference/query pair must fall between 0.7 and 0.8.
    '''
    bdb = drep.d_cluster.utils.load_genomes(self.genomes)
    data_folder = self.test_dir

    # Write the genome list file, creating its folder when needed
    if not os.path.exists(data_folder):
        os.mkdir(data_folder)
    genome_loc = os.path.join(data_folder, 'genomes.txt')
    with open(genome_loc, 'w') as handle:
        handle.writelines(location + '\n' for location in bdb['location'])

    # Run the cluster wrapper with the list file standing in for the genomes
    work_loc = self.wd_loc
    solutions_loc = self.s_wd_loc
    parsed = argumentParser.parse_args(
        ['dereplicate', work_loc, '--S_algorithm', 'fastANI', '-g', genome_loc])
    drep.d_cluster.controller.d_cluster_wrapper(work_loc, **vars(parsed))

    # Verify
    solutions = WorkDirectory(solutions_loc)
    results = WorkDirectory(work_loc)

    # Confirm Cdb.csv is correct, ignoring which algorithm produced it
    expected = solutions.get_db('Cdb')
    del expected['comparison_algorithm']
    observed = results.get_db('Cdb')
    del observed['comparison_algorithm']
    assert test_utils.compare_dfs(expected, observed), "{0} is not the same!".format('Cdb')

    # Spot-check one pairwise comparison computed directly
    ndb = drep.d_cluster.compare_utils.compare_genomes(bdb, 'fastANI', data_folder)
    is_reference = ndb['reference'] == 'Enterococcus_faecalis_T2.fna'
    is_query = ndb['querry'] == 'Enterococcus_casseliflavus_EC20.fasta'
    pair = ndb[is_reference & is_query]
    ani = pair['ani'].tolist()[0]
    assert 0.7 < ani < 0.8
def test_unit_1(self):
    ''' Test a normal run of cluster

    Runs the cluster wrapper with ANImf and compares Cdb and Ndb against
    the solutions directory after reducing each table to the columns of
    interest and sorting it.
    '''
    cli = ['dereplicate', self.working_wd_loc, '--S_algorithm', 'ANImf', '-g']
    parsed = argumentParser.parse_args(cli + self.genomes)
    drep.d_cluster.controller.d_cluster_wrapper(self.working_wd_loc, **vars(parsed))

    # Verify
    solutions = WorkDirectory(self.s_wd_loc)
    results = WorkDirectory(self.working_wd_loc)

    def tidy_cdb(frame):
        # Keep only the cluster assignment per genome, in genome order
        subset = frame[['genome', 'secondary_cluster']]
        return subset.sort_values('genome').reset_index(drop=True)

    def tidy_ndb(frame):
        # get rid of some precision on the ANI; you are comparing fastANI with ANImf
        frame['ani'] = [round(value, 3) for value in frame['ani']]
        frame['alignment_length'] = [
            round(value, -6) for value in frame['alignment_length']
        ]
        subset = frame[['ani', 'alignment_length', 'querry', 'reference']]
        return subset.sort_values(['querry', 'reference']).reset_index(drop=True)

    # Cdb is checked first, then Ndb
    normalisers = {'Cdb': tidy_cdb, 'Ndb': tidy_ndb}
    for table, tidy in normalisers.items():
        expected = tidy(solutions.get_db(table))
        observed = tidy(results.get_db(table))
        assert test_utils.compare_dfs2(
            expected, observed, verbose=True), "{0} is not the same!".format(table)
def test_dereplicate_8(self):
    ''' Test greedy clustering with some primary clusters only having a single member

    Two genomes from the large genome set are compared twice, with and
    without greedy secondary clustering, and the two Cdb tables must agree
    on cluster sizes, primary cluster names and genomes. The greedy run may
    only add the ``greedy_representative`` column.
    '''
    if len(self.large_genome_set) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    genomes = [self.large_genome_set[0], self.large_genome_set[20]]
    normal_loc = self.wd_loc
    greedy_loc = self.wd_loc2

    clustering_flags = ['--S_algorithm', 'fastANI',
                        '--multiround_primary_clustering',
                        '--primary_chunksize', '50']
    threshold_flags = ['-sa', '0.95', '-pa', '0.99', '-g']

    # Get greedy results
    greedy_cli = (['compare', greedy_loc] + clustering_flags
                  + ['--greedy_secondary_clustering'] + threshold_flags + genomes)
    Controller().parseArguments(argumentParser.parse_args(greedy_cli))
    CSdb = WorkDirectory(greedy_loc).get_db('Cdb')

    # Run normal
    normal_cli = (['compare', normal_loc] + clustering_flags
                  + threshold_flags + genomes)
    Controller().parseArguments(argumentParser.parse_args(normal_cli))
    Cdb = WorkDirectory(normal_loc).get_db('Cdb')

    # Verify they're the same
    assert len(CSdb) == len(Cdb)
    for c in ['primary_cluster', 'secondary_cluster']:
        greedy_counts = CSdb[c].value_counts().to_dict()
        normal_counts = Cdb[c].value_counts().to_dict()
        assert set(greedy_counts.values()) == set(normal_counts.values()), c
        # Cluster names are only compared for the primary clusters
        if c != 'secondary_cluster':
            assert set(greedy_counts.keys()) == set(normal_counts.keys())

    assert set(CSdb['genome'].tolist()) == set(Cdb['genome'].tolist())
    extra_columns = set(CSdb.columns) - set(Cdb.columns)
    assert extra_columns == {'greedy_representative'}
def unit_tests_5(self):
    ''' Test changing cluster --S_algorithm gANI

    Cdb and Mdb produced with gANI must match the solutions directory.
    '''
    cli = ['cluster', self.working_wd_loc, '-g'] + self.genomes
    cli += ['--S_algorithm', 'gANI']
    Controller().parseArguments(argumentParser.parse_args(cli))

    # Verify
    solutions = WorkDirectory(self.s_wd_loc)
    results = WorkDirectory(self.working_wd_loc)

    # These tables must match the solutions
    for table in ['Cdb', 'Mdb']:
        expected = solutions.get_db(table)
        observed = results.get_db(table)
        assert compare_dfs(expected, observed), "{0} is not the same!".format(table)
def unit_tests_3(self):
    ''' Test cluster with --SkipMash

    Cdb, Ndb and Mdb produced with Mash skipped are expected to differ from
    the solutions directory.
    '''
    cli = ['cluster', self.working_wd_loc, '-g'] + self.genomes
    cli += ['--SkipMash']
    Controller().parseArguments(argumentParser.parse_args(cli))

    # Verify
    solutions = WorkDirectory(self.s_wd_loc)
    results = WorkDirectory(self.working_wd_loc)

    # These tables must NOT match the solutions
    for table in ['Cdb', 'Ndb', 'Mdb']:
        expected = solutions.get_db(table)
        observed = results.get_db(table)
        matches = compare_dfs(expected, observed)
        assert not matches, "{0} is the same! (and shouldn't be)".format(table)
def test_dereplicate_4(self):
    ''' Test the ability of primary clustering to take a large genome set and break it into chunks

    The large genome set is compared twice with secondary clustering
    skipped, once normally and once with multiround primary clustering. The
    two Cdb tables must agree on cluster sizes, cluster names and genomes,
    and the multiround run must add exactly three extra columns.
    '''
    genomes = self.large_genome_set
    chunked_loc = self.wd_loc
    normal_loc = self.wd_loc2

    if len(genomes) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    leading_flags = ['--S_algorithm', 'fastANI', '--SkipSecondary']
    trailing_flags = ['--primary_chunksize', '50', '-g']

    # Get normal results
    normal_cli = ['compare', normal_loc] + leading_flags + trailing_flags + genomes
    Controller().parseArguments(argumentParser.parse_args(normal_cli))
    CSdb = WorkDirectory(normal_loc).get_db('Cdb')

    # Run with chunking
    chunked_cli = (['compare', chunked_loc] + leading_flags
                   + ['--multiround_primary_clustering'] + trailing_flags + genomes)
    Controller().parseArguments(argumentParser.parse_args(chunked_cli))
    Cdb = WorkDirectory(chunked_loc).get_db('Cdb')

    # Verify they're the same
    assert len(CSdb) == len(Cdb)
    for c in ['primary_cluster', 'secondary_cluster']:
        normal_counts = CSdb[c].value_counts().to_dict()
        chunked_counts = Cdb[c].value_counts().to_dict()
        assert set(normal_counts.values()) == set(chunked_counts.values())
        assert set(normal_counts.keys()) == set(chunked_counts.keys())

    assert set(CSdb['genome'].tolist()) == set(Cdb['genome'].tolist())
    extra_columns = set(Cdb.columns) - set(CSdb.columns)
    assert extra_columns == {'length', 'subcluster', 'primary_representitive'}
def test_dereplicate_5(self):
    ''' Test greedy clustering

    The first ten genomes of the large genome set are compared twice with
    Mash skipped, with and without greedy secondary clustering. The two Cdb
    tables must agree on cluster sizes, cluster names and genomes. The
    greedy run may only add the ``greedy_representative`` column.
    '''
    genomes = self.large_genome_set[:10]
    normal_loc = self.wd_loc
    greedy_loc = self.wd_loc2

    if len(genomes) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    leading_flags = ['--S_algorithm', 'fastANI', '--SkipMash']
    trailing_flags = ['-sa', '0.95', '-g']

    # Get greedy results
    greedy_cli = (['compare', greedy_loc] + leading_flags
                  + ['--greedy_secondary_clustering'] + trailing_flags + genomes)
    Controller().parseArguments(argumentParser.parse_args(greedy_cli))
    CSdb = WorkDirectory(greedy_loc).get_db('Cdb')

    # Run normal
    normal_cli = ['compare', normal_loc] + leading_flags + trailing_flags + genomes
    Controller().parseArguments(argumentParser.parse_args(normal_cli))
    Cdb = WorkDirectory(normal_loc).get_db('Cdb')

    # Verify they're the same
    assert len(CSdb) == len(Cdb)
    for c in ['primary_cluster', 'secondary_cluster']:
        greedy_counts = CSdb[c].value_counts().to_dict()
        normal_counts = Cdb[c].value_counts().to_dict()
        assert set(greedy_counts.values()) == set(normal_counts.values()), c
        assert set(greedy_counts.keys()) == set(normal_counts.keys()), c

    assert set(CSdb['genome'].tolist()) == set(Cdb['genome'].tolist())
    extra_columns = set(CSdb.columns) - set(Cdb.columns)
    assert extra_columns == {'greedy_representative'}
def functional_test_1(self):
    ''' Cluster the 5 genomes using default settings

    The resulting Cdb must match the one in the solutions directory.
    '''
    work_loc = self.wd_loc
    solutions_loc = self.s_wd_loc

    parsed = argumentParser.parse_args(['cluster', work_loc, '-g'] + self.genomes)
    Controller().parseArguments(parsed)

    # Verify
    solutions = WorkDirectory(solutions_loc)
    results = WorkDirectory(work_loc)

    # Confirm Cdb.csv is correct
    expected = solutions.get_db('Cdb')
    observed = results.get_db('Cdb')
    assert compare_dfs(expected, observed), "{0} is not the same!".format('Cdb')
def unit_tests_1(self):
    ''' Test a normal run of cluster

    Runs the cluster operation and requires Cdb and Ndb to match the
    solutions directory. On a mismatch, diagnostics are printed before the
    assertion fails: a side-by-side ANI listing when both tables carry the
    ``reference`` / ``querry`` / ``ani`` columns, otherwise both tables.
    '''
    # normal complete run
    args = argumentParser.parse_args(
        ['cluster', self.working_wd_loc, '-g'] + self.genomes)
    controller = Controller()
    controller.parseArguments(args)

    # Verify
    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)

    ani_cols = ['reference', 'querry', 'ani']
    pair_key = ['reference', 'querry']

    # Confirm the following are correct:
    for db in ['Cdb', 'Ndb']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)

        same = compare_dfs(db1, db2)
        if not same:
            print("{0} is not the same!".format(db))
            has_ani_cols = all(
                col in db1.columns and col in db2.columns for col in ani_cols)
            if has_ani_cols:
                # rename() and sort_values() return new frames, so the
                # loaded tables are not modified by the diagnostics
                expected = (db1[ani_cols].rename(columns={'ani': 'ani1'})
                            .sort_values(pair_key))
                observed = (db2[ani_cols].rename(columns={'ani': 'ani2'})
                            .sort_values(pair_key))
                print(pd.merge(expected, observed, on=pair_key))
            else:
                # Tables without the ANI columns are printed in full
                print(db1)
                print(db2)

        assert same, "{0} is not the same!".format(db)
def functional_test_3(self):
    ''' Cluster the 5 genomes using ANImf

    Cdb is compared against the solutions directory with the
    ``comparison_algorithm`` column removed from both tables.
    '''
    work_loc = self.wd_loc
    solutions_loc = self.s_wd_loc

    cli = ['cluster', work_loc, '--S_algorithm', 'ANImf', '-g'] + self.genomes
    Controller().parseArguments(argumentParser.parse_args(cli))

    # Verify
    solutions = WorkDirectory(solutions_loc)
    results = WorkDirectory(work_loc)

    # Confirm Cdb.csv is correct, ignoring which algorithm produced it
    expected = solutions.get_db('Cdb')
    del expected['comparison_algorithm']
    observed = results.get_db('Cdb')
    del observed['comparison_algorithm']
    assert compare_dfs(expected, observed), "{0} is not the same!".format('Cdb')