Example #1
0
    def feature_selection(self, data_set, feature_selector, fit=True):
        """
        Perform feature selection on ``data_set.X``. Must be done before
        loading testing sets.

        When ``fit`` is true, ``flatten_train_set(data_set)`` is called and the
        selector is fitted on the resulting ``data_set.X`` / ``data_set.y``.
        The ``X``, ``y`` and ``ref_index`` held before flattening are always
        put back afterwards, including when flattening or fitting raises, so
        a failure never leaves the data set in its flattened state.

        Finally ``data_set.X`` is replaced by ``feature_selector.transform``
        applied to it; ``y`` and ``ref_index`` are left as they were.

        :param data_set: object exposing ``X`` (with a ``shape``), ``y`` and
            ``ref_index`` attributes; ``X`` is overwritten in place
        :param feature_selector: object providing ``transform`` and, when
            ``fit`` is true, ``fit(X, y)``
        :param fit: whether to fit the selector before transforming
        :return: None
        """
        assert hasattr(feature_selector, "transform"), \
            "feature_selector must provide a transform() method"

        clustering_logger.info("Pre feature selection: num features: {}".format(data_set.X.shape[1]))

        if fit:
            # flatten_train_set presumably rewrites X/y/ref_index on data_set
            # (the save/restore only makes sense if it does), so snapshot them.
            saved = (data_set.X, data_set.y, data_set.ref_index)
            try:
                flatten_train_set(data_set)
                feature_selector.fit(data_set.X, data_set.y)
            finally:
                # Restore even on failure so callers never observe the
                # temporary flattened representation.
                data_set.X, data_set.y, data_set.ref_index = saved

        selected_X = feature_selector.transform(data_set.X)

        clustering_logger.info("Post feature selection: num features: {}".format(selected_X.shape[1]))

        data_set.X = selected_X
Example #2
0
def load_training_testing(Xs, ys, ref_indexes, settings, train_set_index, test_set_index):
    """
    Load training and testing set based on indexes provided by crossvalidation.

    For the ``c``-th entry of ``settings`` a ``Training`` and a ``Testing``
    object are created and filled with ``Xs[c]``, ``ys[c]`` and
    ``ref_indexes[c]`` indexed by ``train_set_index`` / ``test_set_index``
    respectively. Every training set is then passed through
    ``flatten_train_set``. Finally the ``y`` and ``ref_index`` of all training
    sets, and of all testing sets, are checked to be identical across
    settings.

    :param Xs: per-setting feature containers, indexable by the set indexes
    :param ys: per-setting label containers, indexable by the set indexes
    :param ref_indexes: per-setting reference-index containers
    :param settings: iterable of setting objects exposing ``pickle_dir``
    :param train_set_index: index selecting the training samples
    :param test_set_index: index selecting the testing samples
    :return: List of train_set and test_set objs (both empty when
        ``settings`` is empty)
    :raises AttributeError: when the sets do not match across settings; the
        number in the message is 0 for training ``y``, 1 for training
        ``ref_index``, 2 for testing ``y`` and 3 for testing ``ref_index``
    """
    train_sets = []
    test_sets = []

    for c, setting in enumerate(settings):
        train_set = Training(setting, pickle_dir=setting.pickle_dir)
        train_set.X = Xs[c][train_set_index]
        train_set.y = ys[c][train_set_index]
        train_set.ref_index = ref_indexes[c][train_set_index]

        test_set = Testing(setting, pickle_dir=setting.pickle_dir)
        test_set.X = Xs[c][test_set_index]
        test_set.y = ys[c][test_set_index]
        test_set.ref_index = ref_indexes[c][test_set_index]

        train_sets.append(train_set)
        test_sets.append(test_set)

    # flatten training
    for train_set in train_sets:
        flatten_train_set(train_set)

    # make sure the sets match
    classification_logger.info("Checking the sets match")
    # Kept as plain lists (not stacked into one ndarray) so that entries of
    # differing length reach the comparison below instead of failing earlier.
    groups = (
        [train_set.y for train_set in train_sets],
        [train_set.ref_index for train_set in train_sets],
        [test_set.y for test_set in test_sets],
        [test_set.ref_index for test_set in test_sets],
    )

    for c, group in enumerate(groups):
        # np.array_equal is False on a shape mismatch, so such a mismatch is
        # reported through the AttributeError below rather than a broadcasting
        # error or a silent broadcast "match". group[0] is only evaluated when
        # there is something to compare, so empty/single groups pass.
        if any(not np.array_equal(group[0], other) for other in group[1:]):
            raise AttributeError("NOT MATCH FOR {} ELEMENT".format(c))

    return train_sets, test_sets


# def select_training_testing_sets(settings,Xs,y,ref_index,num,do_pickle=True):
#     """
#     Randomly choose from a super set of data and split it into a training set of size num. The remainder will become
#         the Test set. Uses _pick_random_samples
#
#     :param setting:
#     :param X:
#     :param y:
#     :param ref_index:
#     :param num:
#     :return: tuple_training,tuple_testing
#     """
#
#     selector=np.not_equal(ref_index,None)
#     ref_index=ref_index[selector]
#     Xs=[X[selector] for X in Xs]
#     y=y[selector]
#
#     train_Xs,train_y,train_ref_index,test_Xs,test_y,test_ref_index=_pick_random_samples(Xs,y,ref_index,num)
#
#     train_objs=[]
#     test_objs=[]
#     if do_pickle:
#         for c,setting in enumerate(settings):
#             train_X=train_Xs[c]
#             test_X=test_Xs[c]
#
#             _pickle_training_testing(setting,train_X,train_y,train_ref_index,test_X,test_y,test_ref_index)
#
#             training_obj=Training(label=setting,pickle_dir=setting.pickle_dir)
#             training_obj.set_data(train_X,train_y,train_ref_index)
#
#             testing_obj=Testing(label=setting,pickle_dir=setting.pickle_dir)
#             testing_obj.set_data(test_X,test_y,test_ref_index)
#
#             train_objs.append(training_obj)
#             test_objs.append(testing_obj)
#
#     return train_objs,test_objs