def test_setup_series_analysis():
    """Check the directory layout and YAML written by ``setup_series_analysis``.

    Inside the ``enter_temp_directory()`` context a fake series is created and
    the setup step is run. The test then asserts that the model directory and
    one sub-directory per protein exist, that ``series.yaml`` exists under the
    base directory, and that ``project.yaml`` in the model directory stores
    the same values that were passed in.
    """
    base_dir = "./fake_series"
    mdl_dir = os.path.join(base_dir, "new_mdl_dir")
    feature_dir = "feature_dir"
    series_name = "fake_series"

    protein_list = ["fake_kinase1", "fake_kinase2"]
    project_dict = {"fake_kinase1": ["fake_proj1", "fake_proj2"],
                    "fake_kinase2": ["fake_proj3"]}
    mdl_params = {'tica__n_components': 1, 'tica__lag_time': 2,
                  'tica__kinetic_mapping': True, 'tica__shrinkage': 0.01,
                  'cluster__n_clusters': 174}

    with enter_temp_directory():
        create_fake_series()
        setup_series_analysis(base_dir, mdl_dir, feature_dir,
                              series_name, protein_list,
                              project_dict, mdl_params)

        assert os.path.isdir(mdl_dir)
        for protein in protein_list:
            assert os.path.isdir(os.path.join(mdl_dir, protein))

        assert os.path.isfile(os.path.join(base_dir, "series.yaml"))

        # Use a context manager so the handle is closed even when an assert
        # below fails. safe_load is enough for the plain scalars, lists and
        # dicts compared here, and unlike a Loader-less yaml.load it is
        # accepted by every PyYAML release.
        with open(os.path.join(mdl_dir, "project.yaml"), 'r') as fin:
            yaml_file = yaml.safe_load(fin)

        assert yaml_file["base_dir"] == base_dir
        assert yaml_file["series_name"] == series_name
        assert yaml_file["protein_list"] == protein_list
        assert yaml_file["project_dict"] == project_dict
        assert yaml_file["mdl_params"] == mdl_params
def test_project():
    """Fit the whole pipeline on fake data and inspect the loaded series.

    After fitting, a ``ProteinSeries`` is built from the generated
    ``project.yaml``; the test checks its type, the type of its ``tica_mdl``
    attribute, and then delegates to the ``_test_*`` helper checks.
    """
    with enter_temp_directory():
        # Work from the absolute path of the current directory.
        base_dir = os.path.abspath(os.path.curdir)
        print(base_dir)
        print(type(base_dir))

        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {
            "kinase_1": ["fake_proj1"],
            "kinase_2": ["fake_proj2"],
        }
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__weighted_transform': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            'bootstrap__n_samples': 1,
        }

        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(
            base_dir, mdl_dir, feature_dir, series_name,
            protein_list, project_dict, mdl_params,
        )
        fit_pipeline(base_dir)

        project_yaml = os.path.join(mdl_dir, "project.yaml")
        prj = ProteinSeries(project_yaml)

        assert isinstance(prj, ProteinSeries)
        assert isinstance(prj.tica_mdl, tICA)

        # Each helper is expected to return a truthy value on success.
        assert _test_protein_without_project()
        assert _test_protein_with_project(prj)
        assert _test_tic_dict(prj)
        assert _test_obs_mapping(prj)
def _setup_test():
    """Reset ``mdl_dir`` under the module-level ``base_dir`` and rerun setup.

    Any model directory left behind by an earlier run is removed first, then
    ``setup_series_analysis`` is called with the fixed fake-series
    configuration below.
    """
    mdl_dir = os.path.abspath(os.path.join(base_dir, "mdl_dir"))

    # Best-effort cleanup: a missing directory is fine. ignore_errors only
    # silences filesystem errors, so KeyboardInterrupt and programming errors
    # are no longer swallowed the way the previous bare ``except`` did.
    shutil.rmtree(mdl_dir, ignore_errors=True)

    setup_series_analysis(base_dir=base_dir,
                          mdl_dir=mdl_dir,
                          feature_dir="features",
                          series_name="fake_series",
                          protein_list=["kinase_1", "kinase_2"],
                          project_dict={"kinase_1": ["fake_proj1", "fake_proj2"],
                                        "kinase_2": ["fake_proj3"]},
                          mdl_params={"cluster__n_clusters": 2,
                                      "msm__lag_time": 1,
                                      "tica__shrinkage": 0.005,
                                      "tica__lag_time": 1,
                                      "tica__n_components": 2,
                                      "tica__weighted_transform": True},
                          )
def test_multiple_mdls():
    """Run setup three times on one series and expect three model directories.

    The same arguments are passed on every call; afterwards exactly three
    ``project.yaml`` files must exist one level below ``./fake_series``.
    """
    base_dir = "./fake_series"
    mdl_dir = os.path.join(base_dir, "new_mdl_dir")
    feature_dir = "feature_dir"
    series_name = "fake_series"
    protein_list = ["fake_kinase1", "fake_kinase2"]
    project_dict = {
        "fake_kinase1": ["fake_proj1", "fake_proj2"],
        "fake_kinase2": ["fake_proj3"],
    }
    mdl_params = {
        'tica__n_components': 4,
        'tica__lag_time': 223,
        'tica__kinetic_mapping': True,
        'tica__gamma': 0.0121,
        'cluster__n_clusters': 212,
    }
    n_repeats = 3

    with enter_temp_directory():
        create_fake_series()
        for _ in range(n_repeats):
            setup_series_analysis(
                base_dir, mdl_dir, feature_dir, series_name,
                protein_list, project_dict, mdl_params,
            )
            # Presumably spaces the runs out so each gets a distinct
            # directory name -- confirm against setup_series_analysis.
            time.sleep(1)

        written_projects = glob.glob("./fake_series/*/project.yaml")
        assert len(written_projects) == n_repeats
Example #5
0
def test_pipeline():
    """Run each pipeline stage by hand and check the artifacts it leaves.

    The tICA model must report as many observations as there are rows in the
    raw feature files, and the expected pickle files must exist under the
    model directory.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1"], "kinase_2": ["fake_proj2"]}
        mdl_params = dict(
            tica__n_components=1,
            tica__lag_time=1,
            tica__kinetic_mapping=True,
            tica__shrinkage=0.01,
            cluster__n_clusters=2,
            msm__lag_time=1,
            bootstrap__n_samples=1,
        )

        create_fake_data(base_dir, protein_list, project_dict)
        yaml_file = setup_series_analysis(base_dir, mdl_dir, feature_dir,
                                          series_name, protein_list,
                                          project_dict, mdl_params)

        # Stages are order-dependent: fit before transform, tICA before
        # clustering, clustering before the MSM steps.
        pipeline_stages = (
            fit_protein_tica,
            transform_protein_tica,
            fit_protein_kmeans,
            transform_protein_kmeans,
            fit_msms,
            fit_bootstrap,
        )
        for stage in pipeline_stages:
            stage(yaml_file)

        # Count the rows of every raw feature file across all proteins.
        raw_count_obs = sum(
            verboseload(feature_file).shape[0]
            for protein in protein_list
            for feature_file in glob.glob(
                os.path.join(base_dir, protein, feature_dir, "*.jl"))
        )

        tica_mdl = verboseload(os.path.join(mdl_dir, "tica_mdl.pkl"))
        # The model should have seen all of the data; a stronger check would
        # be welcome here.
        assert tica_mdl.n_observations_ == raw_count_obs

        expected_artifacts = (
            "kinase_1/tica_data.pkl",
            "kinase_2/tica_data.pkl",
            "kinase_1/msm_mdl.pkl",
            "kinase_2/msm_mdl.pkl",
            "kinase_2/bootstrap_msm_mdl.pkl",
            "kmeans_mdl.pkl",
        )
        for relative_path in expected_artifacts:
            assert os.path.exists(os.path.join(mdl_dir, relative_path))
Example #6
0
def test_pipeline():
    """Drive the pipeline stage by stage and verify its on-disk outputs.

    Checks that the fitted tICA model's ``n_observations_`` equals the total
    number of rows in the raw feature files, and that the per-protein and
    series-level pickle files exist in the model directory.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {
            "kinase_1": ["fake_proj1"],
            "kinase_2": ["fake_proj2"],
        }
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__kinetic_mapping': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            'bootstrap__n_samples': 1,
        }

        create_fake_data(base_dir, protein_list, project_dict)
        yaml_file = setup_series_analysis(
            base_dir, mdl_dir, feature_dir, series_name,
            protein_list, project_dict, mdl_params,
        )

        # tICA first, then clustering, then the MSM and bootstrap fits.
        fit_protein_tica(yaml_file)
        transform_protein_tica(yaml_file)
        fit_protein_kmeans(yaml_file)
        transform_protein_kmeans(yaml_file)
        fit_msms(yaml_file)
        fit_bootstrap(yaml_file)

        # Gather every raw feature file, then total their row counts.
        feature_files = [
            path
            for protein in protein_list
            for path in glob.glob(
                os.path.join(base_dir, protein, feature_dir, "*.jl"))
        ]
        raw_count_obs = sum(verboseload(path).shape[0]
                            for path in feature_files)

        tica_mdl = verboseload(os.path.join(mdl_dir, "tica_mdl.pkl"))
        # Make sure the model saw all of the data; this could be a far
        # stronger test.
        assert tica_mdl.n_observations_ == raw_count_obs

        for artifact in ("kinase_1/tica_data.pkl",
                         "kinase_2/tica_data.pkl",
                         "kinase_1/msm_mdl.pkl",
                         "kinase_2/msm_mdl.pkl",
                         "kinase_2/bootstrap_msm_mdl.pkl",
                         "kmeans_mdl.pkl"):
            assert os.path.exists(os.path.join(mdl_dir, artifact))
def test_slicer():
    """Check that ``series_feature_slicer`` writes column-sliced features.

    For every protein, each file in ``sliced_feature_dir`` must equal the
    matching original feature file indexed with that protein's column list.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {
            "kinase_1": ["fake_proj1"],
            "kinase_2": ["fake_proj2"],
        }
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__kinetic_mapping': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            'bootstrap__n_samples': 1,
        }

        create_fake_data(base_dir, protein_list, project_dict)
        yaml_file = setup_series_analysis(
            base_dir, mdl_dir, feature_dir, series_name,
            protein_list, project_dict, mdl_params,
        )

        # Column indices to keep per protein; the second list repeats and
        # reorders columns on purpose.
        dict_feat_ind = {
            "kinase_1": [0, 2],
            "kinase_2": [1, 1, 0, 2],
        }
        series_feature_slicer(yaml_file, dict_feat_ind)

        for protein in protein_list:
            wanted_columns = dict_feat_ind[protein]
            with enter_protein_data_dir(yaml_file, protein):
                assert os.path.isdir("sliced_feature_dir")
                for fname in glob.glob("./%s/*.jl" % feature_dir):
                    sliced_path = "./%s/%s" % ("sliced_feature_dir",
                                               os.path.basename(fname))
                    expected_file = verboseload(fname)[:, wanted_columns]
                    written_file = verboseload(sliced_path)
                    assert (expected_file == written_file).all()
Example #8
0
def test_plotting_utils():
    """Exercise the tIC plotting helpers on a freshly fitted fake series.

    Fits the pipeline on fake data, loads both proteins, and then checks
    three things through nested helpers: the bin edges returned by
    ``global_tic_boundaries``, the per-state histograms returned by
    ``tica_histogram``, and the data frame returned by
    ``one_dim_tic_free_energy``.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {
            "kinase_1": [
                "fake_proj1",
            ],
            "kinase_2": ["fake_proj2"]
        }
        mdl_params = {
            'tica__n_components': 1,
            'tica__lag_time': 1,
            'tica__kinetic_mapping': True,
            'tica__shrinkage': 0.01,
            'cluster__n_clusters': 2,
            'msm__lag_time': 1,
            # Was misspelled 'bootrap__n_samples'; the other tests in this
            # file use the 'bootstrap__' prefix.
            'bootstrap__n_samples': 1
        }

        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(base_dir, mdl_dir, feature_dir, series_name,
                              protein_list, project_dict, mdl_params)

        fit_pipeline(base_dir)
        prj = ProteinSeries(os.path.join(mdl_dir, "project.yaml"))

        prt1 = Protein(prj, "kinase_1")
        prt2 = Protein(prj, "kinase_2")

        prt1._mlpt_fct = 0.0
        prt2._mlpt_fct = 0.0
        n_bins = 100

        lin_spaced_tic_dict = global_tic_boundaries([prt1, prt2],
                                                    range(prt1.n_tics_),
                                                    n_bins)

        def test_bounds():
            """Compare the returned bin edges with locally computed extrema."""
            locally_calc = {}
            for i in range(prt1.n_tics_):
                # The comprehension variable is named ``traj`` so it cannot
                # clobber the tIC index ``i`` (it would under Python 2).
                # NOTE(review): the extrema are taken over the whole arrays,
                # not over column ``i``; presumably fine because
                # n_components is 1 here -- confirm before adding more tICs.
                global_min = min(
                    min([min(traj) for traj in prt1.tica_data.values()]),
                    min([min(traj) for traj in prt2.tica_data.values()]))
                global_max = max(
                    max([max(traj) for traj in prt1.tica_data.values()]),
                    max([max(traj) for traj in prt2.tica_data.values()]))

                locally_calc[i] = [global_min, global_max]

            for i in range(prt1.n_tics_):
                assert (lin_spaced_tic_dict[i][0] == locally_calc[i][0])
                assert (lin_spaced_tic_dict[i][-1] == locally_calc[i][-1])
                assert (len(lin_spaced_tic_dict[i]) == n_bins)

            return True

        def test_histogram_data():
            """Check one randomly chosen state's histogram against NumPy."""
            H_dict, H_calc, _ = tica_histogram(prj,
                                               prt1, [0],
                                               x_array=lin_spaced_tic_dict[0],
                                               n_bins=None)
            assert (len(H_dict.keys()) == prt1.n_states_)
            assert (len(H_calc) == len(lin_spaced_tic_dict[0]) - 1)
            rnd_state = np.random.randint(0, prt1.n_states_)
            # ``density`` replaces the ``normed`` keyword, which current
            # NumPy releases no longer accept.
            assert (np.allclose(
                H_dict[rnd_state],
                np.histogram(prt1.tic_dict[0][rnd_state],
                             bins=lin_spaced_tic_dict[0],
                             density=True)[0]))
            return True

        def test_one_dim_free_energy():
            """Check the labels on the maximum-likelihood free-energy frame."""
            df = one_dim_tic_free_energy(prj,
                                         prt1,
                                         0,
                                         n_bins=None,
                                         lin_spaced_tic=lin_spaced_tic_dict[0],
                                         errorbars=False)

            assert ((df.protein_name == prt1.name).all())
            assert ((df.mdl_index == "mle").all())

            return True

        assert (test_bounds())
        assert (test_histogram_data())
        assert (test_one_dim_free_energy())
def test_plotting_utils():
    """Validate the plotting utilities against a fitted two-protein series.

    After fitting the pipeline on fake data, nested helpers check the edges
    from ``global_tic_boundaries``, the histograms from ``tica_histogram``
    and the labels of the frame from ``one_dim_tic_free_energy``.
    """
    with enter_temp_directory():
        base_dir = os.path.abspath(os.path.curdir)
        mdl_dir = os.path.join(base_dir, "mdl_dir")
        feature_dir = "feature_dir"
        series_name = "fake_series"
        protein_list = ["kinase_1", "kinase_2"]
        project_dict = {"kinase_1": ["fake_proj1"],
                        "kinase_2": ["fake_proj2"]}
        # The last key was misspelled 'bootrap__n_samples'; it now matches
        # the 'bootstrap__' prefix used by the other tests in this file.
        mdl_params = {'tica__n_components': 1, 'tica__lag_time': 1,
                      'tica__kinetic_mapping': True, 'tica__shrinkage': 0.01,
                      'cluster__n_clusters': 2, 'msm__lag_time': 1,
                      'bootstrap__n_samples': 1}

        create_fake_data(base_dir, protein_list, project_dict)
        setup_series_analysis(base_dir, mdl_dir, feature_dir,
                              series_name, protein_list,
                              project_dict, mdl_params)

        fit_pipeline(base_dir)
        prj = ProteinSeries(os.path.join(mdl_dir, "project.yaml"))

        prt1 = Protein(prj, "kinase_1")
        prt2 = Protein(prj, "kinase_2")

        prt1._mlpt_fct = 0.0
        prt2._mlpt_fct = 0.0
        n_bins = 100

        lin_spaced_tic_dict = global_tic_boundaries([prt1, prt2],
                                                    range(prt1.n_tics_), n_bins)

        def test_bounds():
            """First/last bin edges must equal the data's global min/max."""
            locally_calc = {}
            for i in range(prt1.n_tics_):
                # ``traj`` (rather than ``i``) is the comprehension variable
                # so the tIC index is never overwritten, which the old
                # spelling did under Python 2.
                # NOTE(review): min/max run over entire arrays rather than
                # column ``i``; presumably equivalent only while a single
                # tIC is fitted -- confirm.
                global_min = min(
                    min([min(traj) for traj in prt1.tica_data.values()]),
                    min([min(traj) for traj in prt2.tica_data.values()]))
                global_max = max(
                    max([max(traj) for traj in prt1.tica_data.values()]),
                    max([max(traj) for traj in prt2.tica_data.values()]))

                locally_calc[i] = [global_min, global_max]

            for i in range(prt1.n_tics_):
                assert(lin_spaced_tic_dict[i][0] == locally_calc[i][0])
                assert(lin_spaced_tic_dict[i][-1] == locally_calc[i][-1])
                assert(len(lin_spaced_tic_dict[i]) == n_bins)

            return True

        def test_histogram_data():
            """A random state's histogram must match ``np.histogram``."""
            H_dict, H_calc, _ = tica_histogram(prj, prt1, [0],
                                               x_array=lin_spaced_tic_dict[0],
                                               n_bins=None)
            assert(len(H_dict.keys()) == prt1.n_states_)
            assert(len(H_calc) == len(lin_spaced_tic_dict[0]) - 1)
            rnd_state = np.random.randint(0, prt1.n_states_)
            # ``normed`` is no longer accepted by current NumPy; ``density``
            # is its documented replacement.
            reference_hist = np.histogram(prt1.tic_dict[0][rnd_state],
                                          bins=lin_spaced_tic_dict[0],
                                          density=True)[0]
            assert(np.allclose(H_dict[rnd_state], reference_hist))
            return True

        def test_one_dim_free_energy():
            """Every row must carry the protein name and the "mle" index."""
            df = one_dim_tic_free_energy(prj, prt1, 0, n_bins=None,
                                         lin_spaced_tic=lin_spaced_tic_dict[0],
                                         errorbars=False)

            assert((df.protein_name == prt1.name).all())
            assert((df.mdl_index == "mle").all())

            return True

        assert(test_bounds())
        assert(test_histogram_data())
        assert(test_one_dim_free_energy())