Exemple #1
0
    def plot_6_test_1(self):
        '''
        Run plot 6 twice: first on the complete work directory, then again
        after Widb.csv and the old figures have been deleted. Each run must
        leave exactly one non-empty figure, Winning_genomes.pdf
        '''
        expected_figs = ['Winning_genomes.pdf']

        def run_plot_and_verify():
            # Run "analyze -pl 6" and check the contents of the figures folder
            parsed = argumentParser.parse_args(
                ['analyze', self.working_wd_loc, '-pl', '6'])
            Controller().parseArguments(parsed)

            pattern = os.path.join(self.working_wd_loc, 'figures', '') + '*'
            produced = [os.path.basename(path) for path in glob.glob(pattern)]
            assert sorted(produced) == sorted(expected_figs)
            for path in glob.glob(pattern):
                assert os.path.getsize(path) > 0

        # First pass: every data table is present
        run_plot_and_verify()

        # Second pass: drop Widb and clear out the previous figures
        os.remove(os.path.join(self.working_wd_loc, 'data_tables', 'Widb.csv'))
        stale_pattern = os.path.join(self.working_wd_loc, 'figures', '') + '*'
        for stale in glob.glob(stale_pattern):
            os.remove(stale)
        run_plot_and_verify()
Exemple #2
0
def test_centrality_1(self):
    """
    Exercise drep.d_choose.add_centrality and "choose_winners" on a small set of genomes
    """
    def choose_kwargs(extra_flags):
        # Parse a dereplicate command line into kwargs, minus the genome list
        parsed = vars(
            argumentParser.parse_args(
                ['dereplicate', self.working_wd_loc, '--ignoreGenomeQuality']
                + extra_flags))
        del parsed['genomes']
        return parsed

    work_dir = drep.WorkDirectory.WorkDirectory(self.working_wd_loc)
    kwargs = choose_kwargs([])

    # Rewrite Cdb so that secondary cluster 1_2 is folded into 1_1
    cdb = work_dir.get_db('Cdb')
    cdb['secondary_cluster'] = [
        name.replace('1_2', '1_1') for name in cdb['secondary_cluster']
    ]
    work_dir.store_db(cdb, 'Cdb')

    # Compute genome info and attach centrality to it
    locations = work_dir.get_db('Bdb')['location'].tolist()
    Gdb = drep.d_filter.calc_genome_info(locations)
    Gdb = drep.d_choose.add_centrality(work_dir, Gdb, **kwargs)

    # Centrality must exist, contain no NaN, include values above 0 and
    # never exceed 1
    assert 'centrality' in list(Gdb.columns)
    centrality = Gdb['centrality']
    assert (centrality > 0).any()
    assert not (centrality > 1).any()
    assert not centrality.isna().any()

    # Baseline winners
    Sdb, Wdb = drep.d_choose.choose_winners(cdb, Gdb, **kwargs)

    # Winners with the centrality weight set to zero: the scores must
    # differ, but the mean may not move by 1 or more
    no_cent_kwargs = choose_kwargs(['-centW', '0'])
    Sdb_no_cent, _ = drep.d_choose.choose_winners(cdb, Gdb, **no_cent_kwargs)
    assert not test_utils.compare_dfs2(Sdb, Sdb_no_cent)
    assert abs(Sdb['score'].mean() - Sdb_no_cent['score'].mean()) < 1

    # Winners with an explicit S_ani: the scores must differ and the mean
    # must be larger than the baseline
    s_ani_kwargs = choose_kwargs(['-sa', '0.95'])
    Sdb_s_ani, _ = drep.d_choose.choose_winners(cdb, Gdb, **s_ani_kwargs)
    assert not test_utils.compare_dfs2(Sdb, Sdb_s_ani)
    assert abs(Sdb['score'].mean()) < Sdb_s_ani['score'].mean()
Exemple #3
0
def test_unit_7(self):
    '''
    Run the cluster wrapper with --SkipSecondary and confirm that Cdb and
    Ndb no longer match the solutions directory
    '''
    # Build the command line and run clustering
    cli = ['dereplicate', self.working_wd_loc, '-g'] + self.genomes
    cli = cli + ['--SkipSecondary']
    kwargs = vars(argumentParser.parse_args(cli))
    drep.d_cluster.controller.d_cluster_wrapper(self.working_wd_loc, **kwargs)

    # Load the solutions and the freshly written results
    solution_wd = WorkDirectory(self.s_wd_loc)
    result_wd = WorkDirectory(self.working_wd_loc)

    # NOTE: Mdb is not checked here; only the tables expected to differ are
    for table in ('Cdb', 'Ndb'):
        expected = solution_wd.get_db(table)
        observed = result_wd.get_db(table)
        same = test_utils.compare_dfs(expected, observed)
        assert not same, "{0} is the same! (and shouldn't be)".format(table)
Exemple #4
0
    def taxTest2(self):
        '''
        Run the bonus taxonomy step with the percent method and compare the
        resulting Bdb / Tdb against the stored BdbP / TdbP solution tables
        '''
        out_loc = self.wd_loc
        solution_loc = self.s_wd_loc

        # Assemble and run the bonus command
        cli = ['bonus', out_loc, '-g'] + self.genomes
        cli = cli + ['--run_tax', '--cent_index',
                     '/home/mattolm/download/centrifuge/indices/b+h+v',
                     '--tax_method', 'percent']
        parsed = argumentParser.parse_args(cli)
        Controller().parseArguments(parsed)

        # Open both work directories
        solution_wd = WorkDirectory(solution_loc)
        result_wd = WorkDirectory(out_loc)

        # Bdb: the location column is dropped from both tables before comparing
        expected_bdb = solution_wd.get_db('BdbP')
        observed_bdb = result_wd.get_db('Bdb')
        del expected_bdb['location']
        del observed_bdb['location']
        assert compare_dfs(observed_bdb, expected_bdb), \
            "{0} is not the same!".format('Bdb')

        # Tdb: compared as-is
        expected_tdb = solution_wd.get_db('TdbP')
        observed_tdb = result_wd.get_db('Tdb')
        assert compare_dfs(observed_tdb, expected_tdb), \
            "{0} is not the same!".format('Tdb')
Exemple #5
0
    def unit_tests_4(self):
        '''
        Run cluster with -pa 0.10 and confirm that Ndb and Cdb differ from
        the solutions directory
        '''
        # Complete cluster run with the modified -pa
        cli = ['cluster', self.working_wd_loc, '-g'] + self.genomes
        cli = cli + ['-pa', '0.10']
        parsed = argumentParser.parse_args(cli)
        Controller().parseArguments(parsed)

        # Open the solutions and the results
        solution_wd = WorkDirectory(self.s_wd_loc)
        result_wd = WorkDirectory(self.working_wd_loc)

        # NOTE: Mdb is not checked here; only the tables expected to differ are
        for table in ('Ndb', 'Cdb'):
            expected = solution_wd.get_db(table)
            observed = result_wd.get_db(table)
            same = compare_dfs(expected, observed)
            assert not same, \
                "{0} is the same! (and shouldn't be)".format(table)
Exemple #6
0
def test_skipsecondary(self):
    '''
    Run the cluster wrapper with --SkipSecondary and confirm that the
    resulting Ndb is empty
    '''
    out_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    # Parse the command line and hand it straight to the cluster wrapper
    cli = ['dereplicate', out_loc, '-g'] + self.genomes + ['--SkipSecondary']
    kwargs = vars(argumentParser.parse_args(cli))
    drep.d_cluster.controller.d_cluster_wrapper(out_loc, **kwargs)

    # Open the solutions and the results
    solution_wd = WorkDirectory(solution_loc)
    result_wd = WorkDirectory(out_loc)

    # Mdb is still loaded from both directories, but it is not compared
    solution_wd.get_db('Mdb')
    result_wd.get_db('Mdb')

    # Skipping secondary clustering must leave Ndb empty
    ndb = result_wd.get_db('Ndb')
    assert ndb.empty, 'Ndb is not empty'
Exemple #7
0
    def functional_test_2(self):
        '''
        Cluster the 5 genomes using gANI

        Prints a notice and returns early (without asserting anything) when
        find_program cannot locate a working ANIcalculator executable.
        Otherwise the resulting Cdb must match the solutions Cdb once the
        comparison_algorithm column has been dropped from both.
        '''
        genomes = self.genomes
        wd_loc = self.wd_loc
        s_wd_loc = self.s_wd_loc

        # Make sure gANI is installed; identity test for None and plain
        # truthiness for the flag instead of `== None` / `== False`
        loc, works = find_program('ANIcalculator')
        if loc is None or not works:
            print('Cannot locate the program {0}- skipping related tests'\
                .format('ANIcalculator (for gANI)'))
            return

        args = argumentParser.parse_args(['cluster',wd_loc,'--S_algorithm',\
            'gANI','-g']+genomes)
        controller = Controller()
        controller.parseArguments(args)

        # Verify
        Swd = WorkDirectory(s_wd_loc)
        wd = WorkDirectory(wd_loc)

        # Confirm Cdb.csv is correct, ignoring which algorithm produced it
        db1 = Swd.get_db('Cdb')
        del db1['comparison_algorithm']
        db2 = wd.get_db('Cdb')
        del db2['comparison_algorithm']
        assert compare_dfs(db1, db2), "{0} is not the same!".format('Cdb')
Exemple #8
0
def test_cluster_functional_1(self):
    '''
    Run the cluster wrapper on the 5 genomes with default settings and
    confirm the resulting Cdb matches the solutions directory
    '''
    out_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    # Parse a plain dereplicate command and feed it to the cluster wrapper
    cli = ['dereplicate', out_loc, '-g'] + self.genomes
    kwargs = vars(argumentParser.parse_args(cli))
    drep.d_cluster.controller.d_cluster_wrapper(out_loc, **kwargs)

    # Open the solutions and the results
    solution_wd = WorkDirectory(solution_loc)
    result_wd = WorkDirectory(out_loc)

    # Cdb must be reproduced exactly
    expected = solution_wd.get_db('Cdb')
    observed = result_wd.get_db('Cdb')
    matches = test_utils.compare_dfs(expected, observed)
    assert matches, "{0} is not the same!".format('Cdb')
Exemple #9
0
def test_cluster_functional_4(self):
    '''
    Run the cluster wrapper on the 5 genomes with fastANI and confirm the
    resulting Cdb matches the solutions (ignoring comparison_algorithm)
    '''
    out_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    # Parse the fastANI command line and feed it to the cluster wrapper
    cli = ['dereplicate', out_loc, '--S_algorithm', 'fastANI', '-g']
    cli = cli + self.genomes
    kwargs = vars(argumentParser.parse_args(cli))
    drep.d_cluster.controller.d_cluster_wrapper(out_loc, **kwargs)

    # Open the solutions and the results
    solution_wd = WorkDirectory(solution_loc)
    result_wd = WorkDirectory(out_loc)

    # Compare Cdb after dropping the column naming the algorithm used
    expected = solution_wd.get_db('Cdb')
    del expected['comparison_algorithm']
    observed = result_wd.get_db('Cdb')
    del observed['comparison_algorithm']
    matches = test_utils.compare_dfs(expected, observed)
    assert matches, "{0} is not the same!".format('Cdb')
Exemple #10
0
    def unit_test_2(self):
        '''
        Run choose with --noQualityFiltering after deleting Chdb, Sdb and
        Wdb, then confirm the regenerated tables differ from the solutions
        and that every score lies strictly between 0 and 5
        '''
        wd_loc = self.working_wd_loc

        # Remove the tables that choose has to cope without / regenerate
        for table_file in ('/data_tables/Chdb.csv',
                           '/data_tables/Sdb.csv',
                           '/data_tables/Wdb.csv'):
            os.remove(wd_loc + table_file)

        # Run choose without quality filtering
        parsed = argumentParser.parse_args(
            ['choose', wd_loc, '--noQualityFiltering'])
        Controller().parseArguments(parsed)

        # None of these tables may match the solutions
        solution_wd = WorkDirectory(self.s_wd_loc)
        result_wd = WorkDirectory(self.working_wd_loc)
        for table in ('Sdb', 'Wdb', 'genomeInformation'):
            expected = solution_wd.get_db(table)
            observed = result_wd.get_db(table)
            same = compare_dfs(expected, observed)
            assert not same, "{0} is the same!".format(table)

        # Every score must fall in the open interval (0, 5)
        for score in result_wd.get_db('Sdb')['score'].tolist():
            assert 0 < score < 5
Exemple #11
0
    def unit_test_1(self):
        '''
        Delete Chdb, run choose with the taxonomy_wf checkM method, and
        confirm Chdb and genomeInformation match the solutions again
        '''
        wd_loc = self.working_wd_loc

        # Delete Chdb
        os.remove(wd_loc + '/data_tables/Chdb.csv')

        # Point Bdb at where the test genomes actually live
        location_by_name = {
            os.path.basename(path): path for path in load_test_genomes()
        }
        bdb_path = wd_loc + '/data_tables/Bdb.csv'
        Bdb = pd.read_csv(bdb_path)
        Bdb['location'] = Bdb['genome'].map(location_by_name)
        Bdb.to_csv(bdb_path, index=False)

        # Run choose; with Chdb gone this is expected to regenerate it
        parsed = argumentParser.parse_args(
            ['choose', wd_loc, '--checkM_method', 'taxonomy_wf'])
        Controller().parseArguments(parsed)

        # Both tables must match the solutions
        solution_wd = WorkDirectory(self.s_wd_loc)
        result_wd = WorkDirectory(self.working_wd_loc)
        for table in ('Chdb', 'genomeInformation'):
            expected = solution_wd.get_db(table)
            observed = result_wd.get_db(table)
            matches = compare_dfs(expected, observed)
            assert matches, "{0} is not the same!".format(table)
Exemple #12
0
    def taxTest1(self):
        '''
        Check the taxonomy call for max method

        Bdb (minus its location column) must match the solutions exactly.
        A Tdb mismatch is only reported, never failed on, because it may be
        caused by differences in the centrifuge index.
        '''
        genomes = self.genomes
        wd_loc = self.wd_loc
        swd_loc = self.s_wd_loc

        # Call the command
        args = argumentParser.parse_args(['bonus',wd_loc,'-g'] +genomes \
                + ['--run_tax','--cent_index','/home/mattolm/download/centrifuge/indices/b+h+v',\
                '--tax_method', 'max'])
        controller = Controller()
        controller.parseArguments(args)

        # Verify
        Swd = WorkDirectory(swd_loc)
        wd = WorkDirectory(wd_loc)

        tdbS = Swd.get_db('Bdb')
        tdb = wd.get_db('Bdb')
        del tdbS['location']
        del tdb['location']
        assert compare_dfs(tdb, tdbS), "{0} is not the same!".format('Bdb')

        tdbS = Swd.get_db('Tdb')
        tdb = wd.get_db('Tdb')

        if not compare_dfs(tdb, tdbS):
            print("{0} is not the same! May be due to centrifuge index issues".
                  format('Tdb'))
            # pd.Panel no longer exists in current pandas, so building one
            # here would crash the diagnostic path. Show the differing cells
            # with DataFrame.compare when available and the tables are
            # identically labelled; otherwise dump both tables.
            try:
                print(tdbS.compare(tdb))
            except (AttributeError, ValueError):
                print(tdbS)
                print(tdb)
Exemple #13
0
def test_choose_2(self):
    '''
    Run the choose wrapper with --ignoreGenomeQuality after deleting Chdb,
    Sdb and Wdb.

    The regenerated Sdb, Wdb and genomeInformation must differ from the
    solutions, every score must lie strictly between 0 and 5, and
    genomeInformation must carry a centrality column.
    '''
    # Delete Chdb, Sdb and Wdb
    wd_loc = self.working_wd_loc
    os.remove(wd_loc + '/data_tables/Chdb.csv')
    os.remove(wd_loc + '/data_tables/Sdb.csv')
    os.remove(wd_loc + '/data_tables/Wdb.csv')

    # Run choose with --ignoreGenomeQuality; the genome list is dropped
    # from the kwargs before they are passed to the wrapper
    args = argumentParser.parse_args(
        ['dereplicate', wd_loc, '--ignoreGenomeQuality'])
    kwargs = vars(args)
    del kwargs['genomes']
    drep.d_choose.d_choose_wrapper(wd_loc, **kwargs)

    Swd = WorkDirectory(self.s_wd_loc)
    wd = WorkDirectory(self.working_wd_loc)
    for db in ['Sdb', 'Wdb', 'genomeInformation']:
        db1 = Swd.get_db(db)
        db2 = wd.get_db(db)
        assert not test_utils.compare_dfs(db1,
                                          db2), "{0} is the same!".format(db)

    # Every score must fall in the open interval (0, 5)
    sdb = wd.get_db('Sdb')
    for s in sdb['score'].tolist():
        assert 0 < s < 5

    gdb = wd.get_db('genomeInformation')
    assert 'centrality' in gdb.columns
Exemple #14
0
    def functional_test_1(self):
        '''
        Call filter on 'Enterococcus_faecalis_T2.fna' with the taxonomy_wf
        checkM method and confirm the reported completeness is 98.28 in
        both Chdb and genomeInfo
        '''
        wd_loc = self.wd_loc

        # Pick out the single genome this test is meant to run on
        matches = [
            path for path in self.genomes
            if path.endswith('Enterococcus_faecalis_T2.fna')
        ]
        assert len(matches) == 1
        genome = matches[0]

        # Run filter on that genome
        cli = ['filter', wd_loc, '-g', genome]
        cli = cli + ['--checkM_method', 'taxonomy_wf']
        parsed = argumentParser.parse_args(cli)
        Controller().parseArguments(parsed)

        # Chdb must report the expected completeness
        result_wd = drep.WorkDirectory.WorkDirectory(wd_loc)
        chdb = result_wd.get_db('Chdb')
        assert chdb['Completeness'].tolist()[0] == 98.28

        # genomeInfo must report the same completeness
        genome_info = result_wd.get_db('genomeInfo')
        assert genome_info['completeness'].tolist()[0] == 98.28
Exemple #15
0
def test_dereplicate_8(self):
    '''
    Compare greedy and normal secondary clustering on two genomes, so that
    primary clusters end up with a single member
    '''
    if len(self.large_genome_set) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    genomes = [self.large_genome_set[0], self.large_genome_set[20]]
    normal_loc = self.wd_loc
    greedy_loc = self.wd_loc2

    # Greedy run
    greedy_cli = [
        'compare', greedy_loc, '--S_algorithm', 'fastANI',
        '--multiround_primary_clustering', '--primary_chunksize', '50',
        '--greedy_secondary_clustering', '-sa', '0.95', '-pa', '0.99', '-g'
    ] + genomes
    Controller().parseArguments(argumentParser.parse_args(greedy_cli))
    greedy_cdb = WorkDirectory(greedy_loc).get_db('Cdb')

    # Normal run
    normal_cli = [
        'compare', normal_loc, '--S_algorithm', 'fastANI',
        '--multiround_primary_clustering', '--primary_chunksize', '50', '-sa',
        '0.95', '-pa', '0.99', '-g'
    ] + genomes
    Controller().parseArguments(argumentParser.parse_args(normal_cli))
    normal_cdb = WorkDirectory(normal_loc).get_db('Cdb')

    # Both runs must cluster the genomes the same way
    assert len(greedy_cdb) == len(normal_cdb)
    for col in ('primary_cluster', 'secondary_cluster'):
        greedy_counts = greedy_cdb[col].value_counts().to_dict()
        normal_counts = normal_cdb[col].value_counts().to_dict()
        assert set(greedy_counts.values()) == set(normal_counts.values()), col
        # Cluster names are only compared for the primary clusters
        if col != 'secondary_cluster':
            assert set(greedy_counts.keys()) == set(normal_counts.keys())
    assert set(greedy_cdb['genome'].tolist()) == set(
        normal_cdb['genome'].tolist())

    # The greedy run adds exactly one extra column
    extra_columns = set(greedy_cdb.columns) - set(normal_cdb.columns)
    assert extra_columns == {'greedy_representative'}
Exemple #16
0
def test_filer_functional_4(self):
    """
    Check the log messages emitted by drep.d_filter.sanity_check for
    different --primary_chunksize / clustering flag combinations
    """
    chunk_warning = 'genomes and arent using greedy algorithms'

    def run_sanity_check(extra_flags):
        # Parse a dereplicate command line and run the sanity check on it
        parsed = argumentParser.parse_args(
            ['dereplicate', self.wd_loc] + extra_flags + ['-g'] + self.genomes)
        kwargs = vars(parsed)
        bdb = drep.d_cluster.utils.load_genomes(kwargs['genomes'])
        drep.d_filter.sanity_check(bdb, **kwargs)

    def chunk_warning_logged():
        # True when any captured record contains the chunk-size warning
        return any(chunk_warning in message
                   for _, _, message in self._caplog.record_tuples)

    # Capture all logging
    self._caplog.set_level(0)

    # Default settings: every captured record must be the genome count
    run_sanity_check([])
    for _, _, message in self._caplog.record_tuples:
        assert message == '5 genomes were input to dRep'

    # Small chunk size without multiround clustering: must warn
    self._caplog.clear()
    run_sanity_check(['--primary_chunksize', '4'])
    assert chunk_warning_logged()

    # Same chunk size with multiround clustering: must not warn
    self._caplog.clear()
    run_sanity_check(
        ['--primary_chunksize', '4', '--multiround_primary_clustering'])
    assert not chunk_warning_logged()
Exemple #17
0
def test_dereplicate_5(self):
    '''
    Compare greedy and normal secondary clustering on the first 10 genomes
    of the large genome set
    '''
    genomes = self.large_genome_set[:10]
    normal_loc = self.wd_loc
    greedy_loc = self.wd_loc2

    if len(genomes) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    # Greedy run
    greedy_cli = [
        'compare', greedy_loc, '--S_algorithm', 'fastANI', '--SkipMash',
        '--greedy_secondary_clustering', '-sa', '0.95', '-g'
    ] + genomes
    Controller().parseArguments(argumentParser.parse_args(greedy_cli))
    greedy_cdb = WorkDirectory(greedy_loc).get_db('Cdb')

    # Normal run
    normal_cli = [
        'compare', normal_loc, '--S_algorithm', 'fastANI', '--SkipMash', '-sa',
        '0.95', '-g'
    ] + genomes
    Controller().parseArguments(argumentParser.parse_args(normal_cli))
    normal_cdb = WorkDirectory(normal_loc).get_db('Cdb')

    # Both runs must produce the same cluster names and cluster sizes
    assert len(greedy_cdb) == len(normal_cdb)
    for col in ('primary_cluster', 'secondary_cluster'):
        greedy_counts = greedy_cdb[col].value_counts().to_dict()
        normal_counts = normal_cdb[col].value_counts().to_dict()
        assert set(greedy_counts.values()) == set(normal_counts.values()), col
        assert set(greedy_counts.keys()) == set(normal_counts.keys()), col
    assert set(greedy_cdb['genome'].tolist()) == set(
        normal_cdb['genome'].tolist())

    # The greedy run adds exactly one extra column
    extra_columns = set(greedy_cdb.columns) - set(normal_cdb.columns)
    assert extra_columns == {'greedy_representative'}
Exemple #18
0
def test_dereplicate_4(self):
    '''
    Compare single-round and multiround (chunked) primary clustering on the
    large genome set
    '''
    genomes = self.large_genome_set
    chunked_loc = self.wd_loc
    plain_loc = self.wd_loc2

    if len(genomes) == 0:
        print("*** THIS TEST ONLY WORKS ON MO'S DEVELOPMENT MACHINE ***")
        return

    # Run without multiround clustering
    plain_cli = [
        'compare', plain_loc, '--S_algorithm', 'fastANI', '--SkipSecondary',
        '--primary_chunksize', '50', '-g'
    ] + genomes
    Controller().parseArguments(argumentParser.parse_args(plain_cli))
    plain_cdb = WorkDirectory(plain_loc).get_db('Cdb')

    # Run with multiround (chunked) clustering
    chunked_cli = [
        'compare', chunked_loc, '--S_algorithm', 'fastANI', '--SkipSecondary',
        '--multiround_primary_clustering', '--primary_chunksize', '50', '-g'
    ] + genomes
    Controller().parseArguments(argumentParser.parse_args(chunked_cli))
    chunked_cdb = WorkDirectory(chunked_loc).get_db('Cdb')

    # Both runs must produce the same cluster names and cluster sizes
    assert len(plain_cdb) == len(chunked_cdb)
    for col in ('primary_cluster', 'secondary_cluster'):
        plain_counts = plain_cdb[col].value_counts().to_dict()
        chunked_counts = chunked_cdb[col].value_counts().to_dict()
        assert set(plain_counts.values()) == set(chunked_counts.values())
        assert set(plain_counts.keys()) == set(chunked_counts.keys())
    assert set(plain_cdb['genome'].tolist()) == set(
        chunked_cdb['genome'].tolist())

    # The chunked run adds exactly these columns
    extra_columns = set(chunked_cdb.columns) - set(plain_cdb.columns)
    assert extra_columns == {'length', 'subcluster', 'primary_representitive'}
Exemple #19
0
def test_filer_functional_3(self):
    '''
    Feed the filter wrapper a genome list with a duplicated basename and
    confirm that it fails and logs the duplicate-basename message
    '''
    # Capture all logging
    self._caplog.set_level(0)

    wd_loc = self.wd_loc
    genomes = self.genomes

    # Build a genome info table with every quality attribute set to 10
    atts = ['completeness', 'contamination', 'strain_heterogeneity']
    table = {att: [10 for _ in genomes] for att in atts}
    table['genome'] = [os.path.basename(path) for path in genomes]
    table['location'] = [path for path in genomes]
    Idb = pd.DataFrame(table)

    # Write it out as a CSV inside the test directory
    if not os.path.isdir(self.testdir):
        os.mkdir(self.testdir)
    GI_loc = os.path.join(self.testdir, 'genomeInfo.csv')
    Idb.to_csv(GI_loc, index=False)

    # Append the extra "stinker" genome to the input list
    sgenomes = genomes + [self.stinker_genome]
    cli = ['dereplicate', wd_loc, '-g'] + sgenomes + ['--genomeInfo', GI_loc]
    kwargs = vars(argumentParser.parse_args(cli))

    # The wrapper must not complete. BaseException is caught on purpose so
    # that any kind of abort (not just Exception subclasses) counts as a
    # failure
    try:
        drep.d_filter.d_filter_wrapper(wd_loc, **kwargs)
    except BaseException:
        failed = True
    else:
        failed = False
    assert failed

    # The duplicate-basename message must have been logged
    assert any('You have duplicate genome basenames!' in message
               for _, _, message in self._caplog.record_tuples)
Exemple #20
0
    def functional_test_2(self):
        '''
        Run analyze with all plots requested after Mdb, Cdb and Bdb have
        been deleted; the call itself must not raise
        '''
        work_dir = drep.WorkDirectory.WorkDirectory(self.working_wd_loc)

        # Delete the tables the plots would normally read
        for table_file in ('Mdb.csv', 'Cdb.csv', 'Bdb.csv'):
            os.remove(os.path.join(work_dir.get_dir('data_tables'), table_file))

        # Request every plot
        cli = ['analyze', self.working_wd_loc, '-pl'] + ['a']
        parsed = argumentParser.parse_args(cli)
        Controller().parseArguments(parsed)
Exemple #21
0
def test_list_genome_load(self):
    '''
    Pass the genomes to the cluster wrapper as a text file of paths and
    confirm the clustering result, then spot-check one fastANI value
    '''
    bdb = drep.d_cluster.utils.load_genomes(self.genomes)
    data_folder = self.test_dir

    # Write one genome location per line into genomes.txt
    if not os.path.exists(data_folder):
        os.mkdir(data_folder)
    genome_loc = os.path.join(data_folder, 'genomes.txt')
    with open(genome_loc, 'w') as handle:
        for location in bdb['location']:
            handle.write(location + '\n')

    out_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    # Run the cluster wrapper with the text file as the -g argument
    cli = ['dereplicate', out_loc, '--S_algorithm', 'fastANI', '-g',
           genome_loc]
    kwargs = vars(argumentParser.parse_args(cli))
    drep.d_cluster.controller.d_cluster_wrapper(out_loc, **kwargs)

    # Open the solutions and the results
    solution_wd = WorkDirectory(solution_loc)
    result_wd = WorkDirectory(out_loc)

    # Compare Cdb after dropping the column naming the algorithm used
    expected = solution_wd.get_db('Cdb')
    del expected['comparison_algorithm']
    observed = result_wd.get_db('Cdb')
    del observed['comparison_algorithm']
    matches = test_utils.compare_dfs(expected, observed)
    assert matches, "{0} is not the same!".format('Cdb')

    # Compare the genomes directly and check one pair's ANI is in (0.7, 0.8)
    Ndb = drep.d_cluster.compare_utils.compare_genomes(bdb, 'fastANI',
                                                       data_folder)
    is_reference = Ndb['reference'] == 'Enterococcus_faecalis_T2.fna'
    is_query = Ndb['querry'] == 'Enterococcus_casseliflavus_EC20.fasta'
    pair = Ndb[is_reference & is_query]

    assert 0.7 < pair['ani'].tolist()[0] < 0.8
Exemple #22
0
def test_unit_1(self):
    '''
    Run the cluster wrapper with ANImf and confirm that Cdb and Ndb match
    the solutions once they are reduced to comparable form
    '''
    def comparable_ndb(frame):
        # Shed precision on ANI and alignment length, keep only the shared
        # columns, and put the rows in a fixed order
        frame['ani'] = [round(value, 3) for value in frame['ani']]
        frame['alignment_length'] = [
            round(value, -6) for value in frame['alignment_length']
        ]
        frame = frame[['ani', 'alignment_length', 'querry', 'reference']]
        return frame.sort_values(['querry',
                                  'reference']).reset_index(drop=True)

    def comparable_cdb(frame):
        # Keep only the genome -> secondary cluster mapping, ordered by genome
        frame = frame[['genome', 'secondary_cluster']]
        return frame.sort_values('genome').reset_index(drop=True)

    normalizers = {'Cdb': comparable_cdb, 'Ndb': comparable_ndb}

    # Complete run with ANImf
    cli = ['dereplicate', self.working_wd_loc, '--S_algorithm', 'ANImf', '-g']
    cli = cli + self.genomes
    kwargs = vars(argumentParser.parse_args(cli))
    drep.d_cluster.controller.d_cluster_wrapper(self.working_wd_loc, **kwargs)

    # Open the solutions and the results
    solution_wd = WorkDirectory(self.s_wd_loc)
    result_wd = WorkDirectory(self.working_wd_loc)

    # NOTE: Mdb is not part of this comparison
    for table in ('Cdb', 'Ndb'):
        normalize = normalizers[table]
        expected = normalize(solution_wd.get_db(table))
        observed = normalize(result_wd.get_db(table))

        matches = test_utils.compare_dfs2(expected, observed, verbose=True)
        assert matches, "{0} is not the same!".format(table)
Exemple #23
0
def test_filer_functional_2(self):
    '''
    Call the filter wrapper on 'Enterococcus_faecalis_T2.fna' with a
    genomeInfo table provided, and confirm its completeness value is used
    '''
    wd_loc = self.wd_loc

    # Pick out the single genome this test is meant to run on
    matches = [
        path for path in self.genomes
        if path.endswith('Enterococcus_faecalis_T2.fna')
    ]
    assert len(matches) == 1
    genome = matches[0]

    # Build a one-row genome info table with every attribute set to 10
    atts = ['completeness', 'contamination', 'strain_heterogeneity']
    table = {att: [10] for att in atts}
    table['genome'] = [os.path.basename(genome)]
    table['location'] = [genome]
    Idb = pd.DataFrame(table)

    # Write it out as a CSV inside the test directory
    if not os.path.isdir(self.testdir):
        os.mkdir(self.testdir)
    GI_loc = os.path.join(self.testdir, 'genomeInfo.csv')
    Idb.to_csv(GI_loc, index=False)

    # Run the filter wrapper with the table supplied via --genomeInfo
    cli = ['dereplicate', wd_loc, '-g', genome] + ['--genomeInfo', GI_loc]
    kwargs = vars(argumentParser.parse_args(cli))
    drep.d_filter.d_filter_wrapper(wd_loc, **kwargs)

    # The stored genomeInfo must carry the completeness that was supplied
    result_wd = drep.WorkDirectory.WorkDirectory(wd_loc)
    genome_info = result_wd.get_db('genomeInfo')
    assert genome_info['completeness'].tolist()[0] == 10
Exemple #24
0
    def unit_tests_5(self):
        '''
        Run cluster with --S_algorithm gANI and confirm that Cdb and Mdb
        match the solutions directory
        '''
        # Complete cluster run using gANI
        cli = ['cluster', self.working_wd_loc, '-g'] + self.genomes
        cli = cli + ['--S_algorithm', 'gANI']
        parsed = argumentParser.parse_args(cli)
        Controller().parseArguments(parsed)

        # Open the solutions and the results
        solution_wd = WorkDirectory(self.s_wd_loc)
        result_wd = WorkDirectory(self.working_wd_loc)

        # Both tables must be reproduced
        for table in ('Cdb', 'Mdb'):
            expected = solution_wd.get_db(table)
            observed = result_wd.get_db(table)
            matches = compare_dfs(expected, observed)
            assert matches, "{0} is not the same!".format(table)
Exemple #25
0
    def functional_test_1(self):
        '''
        Run analyze with all plots requested and confirm that exactly the
        expected set of non-empty figures is written
        '''
        cli = ['analyze', self.working_wd_loc, '-pl'] + ['a']
        parsed = argumentParser.parse_args(cli)
        Controller().parseArguments(parsed)

        expected_figs = [
            'Cluster_scoring.pdf',
            'Clustering_scatterplots.pdf',
            'Primary_clustering_dendrogram.pdf',
            'Secondary_clustering_dendrograms.pdf',
            'Winning_genomes.pdf',
            'Secondary_clustering_MDS.pdf',
        ]

        # Everything in the figures folder, by file name
        pattern = os.path.join(self.working_wd_loc, 'figures', '') + '*'
        produced = [os.path.basename(path) for path in glob.glob(pattern)]

        assert sorted(produced) == sorted(expected_figs)
        for path in glob.glob(pattern):
            assert os.path.getsize(path) > 0
Exemple #26
0
    def unit_tests_3(self):
        '''
        Run cluster with --SkipMash and confirm that Cdb, Ndb and Mdb all
        differ from the solutions directory
        '''
        # Complete cluster run with Mash skipped
        cli = ['cluster', self.working_wd_loc, '-g'] + self.genomes
        cli = cli + ['--SkipMash']
        parsed = argumentParser.parse_args(cli)
        Controller().parseArguments(parsed)

        # Open the solutions and the results
        solution_wd = WorkDirectory(self.s_wd_loc)
        result_wd = WorkDirectory(self.working_wd_loc)

        # None of these tables may match the solutions
        for table in ('Cdb', 'Ndb', 'Mdb'):
            expected = solution_wd.get_db(table)
            observed = result_wd.get_db(table)
            same = compare_dfs(expected, observed)
            assert not same, \
                "{0} is the same! (and shouldn't be)".format(table)
Exemple #27
0
    def functional_test_2(self):
        '''
        Run compare on the test genomes and check the result against the
        solutions directory, skipping the tables listed below
        '''
        out_loc = self.wd_loc
        solution_loc = self.s_wd_loc

        # Check the solutions directory before the run
        sanity_check(WorkDirectory(solution_loc))

        cli = ['compare', out_loc, '-g'] + self.genomes
        parsed = argumentParser.parse_args(cli)
        Controller().parseArguments(parsed)

        # Compare the results with the solutions, minus the skipped tables
        solution_wd = WorkDirectory(solution_loc)
        result_wd = WorkDirectory(out_loc)
        skipped = ['Bdb', 'Chdb', 'Sdb', 'Wdb', 'Widb',
                   'genomeInformation', 'Mdb']
        ensure_identicle(solution_wd, result_wd, skip=skipped)

        # Check the solutions directory again after the run, to make sure
        # it is not being overwritten
        sanity_check(solution_wd)
Exemple #28
0
    def functional_test_1(self):
        '''
        Run dereplicate with the taxonomy_wf checkM method and check the
        result against the solutions directory (Bdb and Mdb are skipped)
        '''
        out_loc = self.wd_loc
        solution_loc = self.s_wd_loc

        # Check the solutions directory before the run
        sanity_check(WorkDirectory(solution_loc))

        cli = ['dereplicate', out_loc, '-g'] + self.genomes
        cli = cli + ['--checkM_method', 'taxonomy_wf']
        parsed = argumentParser.parse_args(cli)
        Controller().parseArguments(parsed)

        # Compare the results with the solutions, minus the skipped tables
        solution_wd = WorkDirectory(solution_loc)
        result_wd = WorkDirectory(out_loc)
        ensure_identicle(solution_wd, result_wd, skip=['Bdb', 'Mdb'])

        # Check the solutions directory again after the run, to make sure
        # it is not being overwritten
        sanity_check(solution_wd)
Exemple #29
0
    def functional_test_1(self):
        '''
        Run cluster on the 5 genomes with default settings and confirm the
        resulting Cdb matches the solutions directory
        '''
        out_loc = self.wd_loc
        solution_loc = self.s_wd_loc

        cli = ['cluster', out_loc, '-g'] + self.genomes
        parsed = argumentParser.parse_args(cli)
        Controller().parseArguments(parsed)

        # Open the solutions and the results
        solution_wd = WorkDirectory(solution_loc)
        result_wd = WorkDirectory(out_loc)

        # Cdb must be reproduced exactly
        expected = solution_wd.get_db('Cdb')
        observed = result_wd.get_db('Cdb')
        matches = compare_dfs(expected, observed)
        assert matches, "{0} is not the same!".format('Cdb')
Exemple #30
0
def test_dereplicate_6(self):
    '''
    Run compare with fastANI on the zipped genomes and confirm the ANI
    values in Ndb lie within [0, 1] and are not all identical
    '''
    out_loc = self.wd_loc
    solution_loc = self.s_wd_loc

    test_utils.sanity_check(WorkDirectory(solution_loc))

    cli = ['compare', out_loc, '--S_algorithm', 'fastANI', '-g']
    cli = cli + self.zipped_genomes
    parsed = argumentParser.parse_args(cli)
    Controller().parseArguments(parsed)

    # Inspect the ANI values that were written
    ani_values = WorkDirectory(out_loc).get_db('Ndb')['ani'].tolist()
    assert max(ani_values) <= 1
    assert min(ani_values) >= 0
    assert len(set(ani_values)) > 1