Beispiel #1
0
def test_greedy_secondary_clustering_2(self):
    """
    Check that greedy secondary clustering works through "dereplicate".

    The command is first run without "--S_algorithm" and must raise a
    NameError; it is then run with "--S_algorithm fastANI" and the tables
    written to self.wd_loc are compared with the solution work directory
    at self.s_wd_loc.
    """
    plot_dir = self.test_dir

    # First attempt: expected to abort with NameError (the original note
    # reads "crash gracefully if the algorithm isn't right").
    failing_argv = [
        'dereplicate', self.wd_loc, '--greedy_secondary_clustering', '-sa',
        '0.99', '--ignoreGenomeQuality', '-d', '-g'
    ] + self.genomes
    failing_args = drep.argumentParser.parse_args(failing_argv)
    try:
        drep.controller.Controller().parseArguments(failing_args)
    except NameError:
        pass
    else:
        # Finishing without an error means the guard did not trigger
        assert False

    # Second attempt: the normal, supported configuration.
    # NOTE(review): no genome list is passed here; presumably the work
    # directory set up by the first attempt is reused - confirm.
    working_argv = [
        'dereplicate', self.wd_loc, '--greedy_secondary_clustering',
        '--S_algorithm', 'fastANI', '--ignoreGenomeQuality', '-sa', '0.99',
        '-d'
    ]
    working_args = drep.argumentParser.parse_args(working_argv)
    drep.controller.Controller().parseArguments(working_args)

    # Work directories holding the fresh results and the stored solution
    result_wd = drep.WorkDirectory.WorkDirectory(self.wd_loc)
    solution_wd = drep.WorkDirectory.WorkDirectory(self.s_wd_loc)

    expected_cdb = solution_wd.get_db('Cdb')
    expected_cdb = expected_cdb.sort_values('genome').reset_index(drop=True)
    expected_ndb = solution_wd.get_db('Ndb')
    expected_sdb = solution_wd.get_db('Sdb')

    # The greedy run must not contain the same number of comparisons as the
    # solution ("make sure you didn't pairwise")
    observed_ndb = result_wd.get_db('Ndb')
    assert len(observed_ndb) != len(expected_ndb)

    # Scores must differ from the solution ("make sure you incorporated
    # centrality")
    observed_sdb = result_wd.get_db('Sdb')
    assert not test_utils.compare_dfs2(observed_sdb, expected_sdb,
                                       verbose=True)

    # The clustering itself must still agree with the solution
    observed_cdb = result_wd.get_db('Cdb')
    observed_cdb = observed_cdb.sort_values('genome').reset_index(drop=True)
    assert 'greedy_representative' in observed_cdb.columns

    # Columns that legitimately differ between the two runs are removed
    # from both tables before comparing
    method_columns = ['cluster_method', 'comparison_algorithm']
    observed_cdb = observed_cdb.drop(
        columns=['greedy_representative'] + method_columns)
    expected_cdb = expected_cdb.drop(columns=method_columns)

    # Rewrite '_0' to '_1' in the solution's secondary cluster labels.
    # NOTE(review): presumably the greedy run numbers these clusters from 1
    # where the solution numbers them from 0 - confirm.
    expected_cdb['secondary_cluster'] = [
        label.replace('_0', '_1')
        for label in expected_cdb['secondary_cluster']
    ]

    assert test_utils.compare_dfs2(expected_cdb, observed_cdb, verbose=True)

    # Make sure it handles plotting gracefully
    drep.d_analyze.plot_secondary_dendrograms_from_wd(result_wd,
                                                      plot_dir=plot_dir)
Beispiel #2
0
def test_centrality_1(self):
    """
    Exercise drep.d_choose.add_centrality and drep.d_choose.choose_winners
    on the small genome set stored in self.working_wd_loc.
    """

    def dereplicate_kwargs(*extra_flags):
        # Parse a "dereplicate" command line into a keyword dict, dropping
        # the 'genomes' entry before it is forwarded as **kwargs.
        argv = ['dereplicate', self.working_wd_loc, '--ignoreGenomeQuality']
        argv.extend(extra_flags)
        options = vars(argumentParser.parse_args(argv))
        options.pop('genomes')
        return options

    wd = drep.WorkDirectory.WorkDirectory(self.working_wd_loc)
    default_kwargs = dereplicate_kwargs()

    # Rewrite '1_2' to '1_1' in the secondary cluster labels and store the
    # modified table back into the work directory
    cdb = wd.get_db('Cdb')
    cdb['secondary_cluster'] = [
        label.replace('1_2', '1_1') for label in cdb['secondary_cluster']
    ]
    wd.store_db(cdb, 'Cdb')

    # Build the genome info table and add centrality to it
    genome_locations = wd.get_db('Bdb')['location'].tolist()
    Gdb = drep.d_filter.calc_genome_info(genome_locations)
    Gdb = drep.d_choose.add_centrality(wd, Gdb, **default_kwargs)

    # Centrality must exist, be positive somewhere, never exceed 1 and
    # never be missing
    assert 'centrality' in list(Gdb.columns)
    centrality = Gdb['centrality']
    assert (centrality > 0).any()
    assert not (centrality > 1).any()
    assert not centrality.isna().any()

    # Baseline: choose winners with the default settings
    Sdb, Wdb = drep.d_choose.choose_winners(cdb, Gdb, **default_kwargs)

    # With a centrality weight of 0 the scores must change, but the mean
    # score must stay within 1 of the baseline
    no_weight_kwargs = dereplicate_kwargs('-centW', '0')
    Sdb_no_weight, Wdb_no_weight = drep.d_choose.choose_winners(
        cdb, Gdb, **no_weight_kwargs)
    assert not test_utils.compare_dfs2(Sdb, Sdb_no_weight)
    mean_shift = Sdb['score'].mean() - Sdb_no_weight['score'].mean()
    assert abs(mean_shift) < 1

    # With '-sa 0.95' the scores must change again and the mean score must
    # be above the baseline ("make sure S_ani is being loaded properly")
    s_ani_kwargs = dereplicate_kwargs('-sa', '0.95')
    Sdb_s_ani, Wdb_s_ani = drep.d_choose.choose_winners(
        cdb, Gdb, **s_ani_kwargs)
    assert not test_utils.compare_dfs2(Sdb, Sdb_s_ani)
    assert abs(Sdb['score'].mean()) < Sdb_s_ani['score'].mean()
Beispiel #3
0
def test_multiround_primary_clustering_1(self):
    """
    Run the cluster step of "compare" with --multiround_primary_clustering
    and a primary chunk size of 3, then check the resulting Mdb and Cdb
    tables against the solution work directory.
    """
    plot_dir = self.test_dir

    # Run the clustering wrapper under normal conditions
    argv = [
        'compare', self.wd_loc, '--primary_chunksize', '3',
        '--multiround_primary_clustering', '-pa', '0.95', '-d', '-g'
    ] + self.genomes
    options = vars(drep.argumentParser.parse_args(argv))
    drep.d_cluster.controller.d_cluster_wrapper(self.wd_loc, **options)

    # Work directories holding the fresh results and the stored solution
    result_wd = drep.WorkDirectory.WorkDirectory(self.wd_loc)
    solution_wd = drep.WorkDirectory.WorkDirectory(self.s_wd_loc)
    expected_cdb = solution_wd.get_db('Cdb')
    expected_cdb = expected_cdb.sort_values('genome').reset_index(drop=True)

    # Make sure the comparison was chunked rather than run all-vs-all.
    # NOTE(review): 25 is presumably the row count of a full pairwise Mdb
    # for this genome set - confirm.
    mash_table = result_wd.get_db('Mdb')
    assert len(mash_table) != 25
    assert 'genome_chunk' in list(mash_table.columns)
    distinct_chunks = mash_table['genome_chunk'].unique()
    assert len(distinct_chunks) == 3

    # The clustering must still agree with the solution
    observed_cdb = result_wd.get_db('Cdb')
    observed_cdb = observed_cdb.sort_values('genome').reset_index(drop=True)
    assert test_utils.compare_dfs2(expected_cdb, observed_cdb, verbose=True)

    # Make sure it handles plotting gracefully
    drep.d_analyze.mash_dendrogram_from_wd(result_wd, plot_dir=plot_dir)
Beispiel #4
0
def test_unit_1(self):
    """
    Run the cluster step with the ANImf algorithm and check that the Cdb
    and Ndb tables agree with the solution work directory.
    """

    def normalise_ndb(table):
        # Drop some precision before comparing; the original note says the
        # solution was made with fastANI while this run uses ANImf.
        table['ani'] = [round(value, 3) for value in table['ani']]
        table['alignment_length'] = [
            round(value, -6) for value in table['alignment_length']
        ]
        # Keep only the compared columns, in a deterministic row order
        kept = table[['ani', 'alignment_length', 'querry', 'reference']]
        ordered = kept.sort_values(['querry', 'reference'])
        return ordered.reset_index(drop=True)

    def normalise_cdb(table):
        # Only the genome -> secondary cluster assignment is compared
        kept = table[['genome', 'secondary_cluster']]
        return kept.sort_values('genome').reset_index(drop=True)

    # Normal complete run of the clustering wrapper
    argv = ['dereplicate', self.working_wd_loc, '--S_algorithm', 'ANImf',
            '-g'] + self.genomes
    options = vars(argumentParser.parse_args(argv))
    drep.d_cluster.controller.d_cluster_wrapper(self.working_wd_loc,
                                                **options)

    solution_wd = WorkDirectory(self.s_wd_loc)
    result_wd = WorkDirectory(self.working_wd_loc)

    # Mdb is deliberately not part of the comparison
    normalisers = {'Cdb': normalise_cdb, 'Ndb': normalise_ndb}
    for db, normalise in normalisers.items():
        expected = normalise(solution_wd.get_db(db))
        observed = normalise(result_wd.get_db(db))

        assert test_utils.compare_dfs2(
            expected, observed,
            verbose=True), "{0} is not the same!".format(db)
Beispiel #5
0
def test_compare_16(BTO):
    """
    Test providing an .stb to compare.

    The genome-wide table produced in two steps ("compare", then
    "genome_wide" with the .stb) must equal the one produced in a single
    step ("compare" with "-s"), and the single-step run must write figures.
    """

    def run_in_process(command):
        # Echo the command, then execute it through the inStrain controller
        # (everything after the leading program name is parsed as arguments).
        print(command)
        _program, *argv = command.split(' ')
        parsed = inStrain.argumentParser.parse_args(argv)
        inStrain.controller.Controller().main(parsed)

    def read_genome_wide(profile):
        # Exactly one path in the output location may contain 'genomeWide';
        # load it as a tab-separated table.
        pattern = profile.get_location('output') + '*'
        candidates = [path for path in glob.glob(pattern)
                      if 'genomeWide' in path]
        assert len(candidates) == 1
        return pd.read_csv(candidates[0], sep='\t')

    # Two steps: compare without the .stb, then genome_wide with it
    sol_base = BTO.test_dir + 'testR'
    run_in_process(
        f"inStrain compare -i {BTO.IS1} {BTO.IS2} -o {sol_base} --store_mismatch_locations")

    genome_wide_cmd = f"inStrain genome_wide -i {sol_base} -s {BTO.stb}"
    print(genome_wide_cmd)
    call(genome_wide_cmd, shell=True)

    two_step_profile = inStrain.SNVprofile.SNVprofile(sol_base)
    two_step_table = read_genome_wide(two_step_profile)

    # One step: compare with the .stb supplied directly
    exp_base = BTO.test_dir + 'testSR'
    run_in_process(
        f"inStrain compare -i {BTO.IS1} {BTO.IS2} -o {exp_base} -s {BTO.stb} --store_mismatch_locations")

    one_step_profile = inStrain.SNVprofile.SNVprofile(exp_base)
    one_step_table = read_genome_wide(one_step_profile)

    # Both routes must give the same genome-wide table
    assert test_utils.compare_dfs2(one_step_table, two_step_table,
                                   verbose=True)

    # See if figures were made by the one-step run
    figure_paths = glob.glob(one_step_profile.get_location('figures') + '*')
    assert len(figure_paths) > 0
Beispiel #6
0
def test_compare_13(BTO):
    """
    Re-run "inStrain compare" and ensure the results match a previous run.

    The comparison of BTO.IS1 and BTO.IS2 is executed in-process. The output
    table written to disk and the attributes stored in the new profile are
    then checked against the stored solution profile BTO.v12_solution.

    :param BTO: test fixture object; this test reads test_dir, IS1, IS2 and
        v12_solution from it.
    """
    # Columns removed from every comparisonsTable before it is compared.
    # NOTE(review): presumably their values changed after the solution was
    # generated - confirm.
    changed_cols = ['consensus_SNPs', 'conANI']

    def assert_tables_match(e, s, label):
        # Both tables must have the same set of columns; the solution is then
        # re-arranged to the column order of the new table and compared.
        assert set(e.columns) == set(s.columns), \
            [label,
             set(e.columns) - set(s.columns),
             set(s.columns) - set(e.columns)]
        assert test_utils.compare_dfs2(e, s[list(e.columns)],
                                       verbose=True), label

    # NOTE(review): presumably reloads logging so that state left behind by
    # earlier tests does not affect the log check below - confirm.
    importlib.reload(logging)

    # Run program
    base = BTO.test_dir + 'RC_test'
    cmd = (f"inStrain compare -i {BTO.IS1} {BTO.IS2} -o {base} "
           "--include_self_comparisons --store_mismatch_locations -d")
    print(cmd)
    inStrain.controller.Controller().main(
        inStrain.argumentParser.parse_args(cmd.split(' ')[1:]))

    exp_RC = inStrain.SNVprofile.SNVprofile(base)
    sol_RC = inStrain.SNVprofile.SNVprofile(BTO.v12_solution)

    # Glob pattern matching the output tables written by this run
    exp_pattern = (exp_RC.get_location('output')
                   + os.path.basename(exp_RC.get('location')) + '_*')

    # Print what the output of the new run looks like
    printed_files = glob.glob(exp_pattern)
    print("The output has {0} tables".format(len(printed_files)))
    for f in printed_files:
        name = os.path.basename(f)
        print("{1}\n{0}\n{1}".format(name, '-' * len(name)))
        print(pd.read_csv(f, sep='\t').head())
        print()

    # Make sure log is working
    log_files = glob.glob(base + '/log/*')
    assert len(log_files) == 3, log_files
    Ldb = exp_RC.get_parsed_log()
    print(Ldb)

    # Check output files: the solution must hold exactly one table, and the
    # new run must hold exactly one table with a matching name
    e_out_files = glob.glob(exp_pattern)
    sol_pattern = sol_RC.get_location('output') + '*_*'
    s_out_files = glob.glob(sol_pattern)
    assert len(s_out_files) == 1, sol_pattern

    for s_file in s_out_files:
        name = os.path.basename(s_file).split('RC_test_')[1]
        print("checking {0}".format(name))

        e_file = [e for e in e_out_files if name in os.path.basename(e)]
        assert len(e_file) == 1, name

        e = pd.read_csv(e_file[0], sep='\t')
        s = pd.read_csv(s_file, sep='\t')

        if name == 'comparisonsTable.tsv':
            sort_keys = ['scaffold', 'name1', 'name2']
            e = e.sort_values(sort_keys).reset_index(drop=True)
            s = s.sort_values(sort_keys).reset_index(drop=True)
            e = e.drop(columns=changed_cols)
            s = s.drop(columns=changed_cols)

        assert_tables_match(e, s, name)

    # Check attributes: every attribute listed by the solution profile is
    # loaded from both profiles; only the three kinds below are compared
    sAdb = sol_RC._get_attributes_file()

    for attr in sAdb.index:
        print("checking {0}".format(attr))

        if attr in ['location', 'version']:
            continue

        s = sol_RC.get(attr)
        e = exp_RC.get(attr)

        if attr == 'comparisonsTable':
            sort_keys = ['scaffold', 'name1', 'name2', 'mm']
            s = s.sort_values(sort_keys).reset_index(drop=True)
            e = e.sort_values(sort_keys).reset_index(drop=True)
            e = e.drop(columns=changed_cols)
            s = s.drop(columns=changed_cols)

            assert_tables_match(e, s, attr)

        elif attr == 'pairwise_SNP_locations':
            # Fix the solutions directory to remove the old errors (fixed in
            # v1.3.0t): a consensus SNP is kept only when, for sample 1 or
            # sample 2, the consensus base differs from the reference base
            # and equals itself (a self-comparison is False for NaN).
            is_consensus = s['consensus_SNP'] == True  # noqa: E712
            not_consensus = s['consensus_SNP'] == False  # noqa: E712
            real_in_1 = ((s['ref_base_1'] != s['con_base_1'])
                         & (s['con_base_1'] == s['con_base_1']))
            real_in_2 = ((s['ref_base_2'] != s['con_base_2'])
                         & (s['con_base_2'] == s['con_base_2']))
            keep = (is_consensus & (real_in_1 | real_in_2)) | not_consensus
            # .copy() so the column assignments below write to an independent
            # frame instead of a slice (avoids SettingWithCopyWarning)
            s = s[keep].copy()

            # Make the solutions directory only have SNPs
            for c in ['consensus_SNP', 'population_SNP']:
                s[c] = s[c].astype(bool)
            s = s[s['consensus_SNP'] | s['population_SNP']]

            # Get rid of the junk columns
            s = s[e.columns].copy()
            for c in ['position', 'mm']:
                s[c] = s[c].astype(int)

            sort_keys = ['scaffold', 'position', 'name1', 'name2']
            s = s.sort_values(sort_keys).reset_index(drop=True)
            e = e.sort_values(sort_keys).reset_index(drop=True)

            assert_tables_match(e, s, attr)

        elif attr == 'scaffold2length':
            assert test_utils.compare_dicts(e, s), attr
Beispiel #7
0
def test_compare_19(BTO):
    """
    Ensure that compare can generate clusters.

    "inStrain compare" is run four times with an .stb file (default
    settings, a stricter ANI threshold, a stricter coverage threshold, and
    with self comparisons). Each run must write three output files; the
    number of distinct clusters and the genome-wide comparison table are
    checked against the default run.

    :param BTO: test fixture object; this test reads test_dir, IS1, IS2 and
        stb from it.
    """
    cluster_suffix = '_strain_clusters.tsv'
    genome_wide_suffix = '_genomeWide_compare.tsv'

    def run_compare(out_name, extra_args=''):
        # Run compare in-process into BTO.test_dir + out_name and return
        # (strain cluster table, genome-wide comparison table). Both tables
        # are looked up fresh for every run and asserted to exist, so a run
        # that fails to write one can never be checked against a table left
        # over from a previous run.
        exp_base = BTO.test_dir + out_name
        cmd = f"inStrain compare -i {BTO.IS1} {BTO.IS2} -o {exp_base} -s {BTO.stb}"
        if extra_args:
            cmd += ' ' + extra_args
        print(cmd)
        inStrain.controller.Controller().main(
            inStrain.argumentParser.parse_args(cmd.split(' ')[1:]))

        # Load output
        IS = inStrain.SNVprofile.SNVprofile(exp_base)
        files = glob.glob(IS.get_location('output') + '/*')
        assert len(files) == 3, files

        tables = {}
        for f in files:
            basename = os.path.basename(f)
            for suffix in (cluster_suffix, genome_wide_suffix):
                if basename.endswith(suffix):
                    tables[suffix] = pd.read_csv(f, sep='\t')

        missing = [suffix for suffix in (cluster_suffix, genome_wide_suffix)
                   if suffix not in tables]
        assert not missing, \
            "no output file ending in {0} among {1}".format(missing, files)
        return tables[cluster_suffix], tables[genome_wide_suffix]

    # Run it with default settings
    Scdb, Sndb = run_compare('testSR')
    assert len(Scdb['cluster'].unique()) == 3

    # Adjust the ani threshold: fewer clusters, same genome-wide table
    cdb, ndb = run_compare('testSR2', '-ani 0.999')
    assert len(cdb['cluster'].unique()) == 2
    assert test_utils.compare_dfs2(ndb, Sndb)

    # Adjust the coverage threshold: more clusters, same genome-wide table
    cdb, ndb = run_compare('testSR3', '-cov 0.9999999999999')
    assert len(cdb['cluster'].unique()) == 4
    assert test_utils.compare_dfs2(ndb, Sndb)

    # Include self: same cluster count, different genome-wide table
    cdb, ndb = run_compare('testSR4', '--include_self_comparisons')
    assert len(cdb['cluster'].unique()) == 3
    assert not test_utils.compare_dfs2(ndb, Sndb)