Esempio n. 1
0
def main():
    """Run the feature-selection experiments for one dataset.

    Command-line arguments:
        sys.argv[1]: dataset name; also the key looked up in CLUSTERS_NUMBERS.
        sys.argv[2]: number of features to select, parsed as an integer.
    """
    dataset_name = sys.argv[1]
    n_features = int(sys.argv[2])
    n_clusters = CLUSTERS_NUMBERS[dataset_name]

    app_logger.info(
        'STARTED {0} with {1} selected features'.format(
            dataset_name, n_features),
        extra=LOGGER_EXTRA_OBJECT)

    # These two selections are called without a feature count.
    all_tsfresh_selection.select(dataset_name, n_clusters)
    relevent_tsfresh_selection.select(dataset_name, n_clusters)

    # These three additionally receive the requested feature count.
    MCFS_selection.select(dataset_name, n_features, n_clusters)
    feature_agglomeration.agglomerate(dataset_name, n_features, n_clusters)
    corr_selection.select(dataset_name, n_features, n_clusters)

    app_logger.info(
        'ENDED {0} with {1} selected features'.format(
            dataset_name, n_features),
        extra=LOGGER_EXTRA_OBJECT)
def select(dataset, clusters_number):
    """Run the k-means test on the relevant tsfresh features of `dataset`.

    Loads the train and test pickles of relevant features, splits off the
    first column (logged as the target column) from the remaining feature
    columns, and passes the feature values to
    ``test_feature_selection.testFeatureSelectionWithKMeans``.

    Args:
        dataset: Dataset name; used to build the pickle file names.
        clusters_number: Number of clusters forwarded to the k-means test.
    """
    app_logger.info(
        'STARTED [RELEVANT TSFRESH Selection] on {0}'.format(dataset),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving relevant features extracted by tsfresh from the pickles on disk.
    # os.path.basename honours the platform's path separator; splitting the
    # working directory on '\\' only recognised the project root on Windows,
    # so elsewhere the '../Pickle' branch was always taken.
    current_dir = os.path.basename(os.getcwd())
    project_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    pickle_root = 'Pickle' if current_dir == project_dir else '../Pickle'

    relevant_features_train = pd.read_pickle(
        '{0}/RelevantFeatures/Train/{1}.pkl'.format(pickle_root, dataset))
    relevant_features_test = pd.read_pickle(
        '{0}/RelevantFeatures/Test/{1}.pkl'.format(pickle_root, dataset))

    app_logger.info(
        'Relevant features (including target column) trainset shape: {0}'.
        format(relevant_features_train.shape),
        extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'Relevant features (including target column) testset shape: {0}'.
        format(relevant_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)

    # Column 0 is treated as the target; every other column is a feature.
    independent_columns_train = relevant_features_train.iloc[:, 1:]
    independent_columns_test = relevant_features_test.iloc[:, 1:]
    known_labels_test = relevant_features_test.iloc[:, 0]

    # Running k-means on the dataframes obtained from the pickles. No feature
    # reduction happens here, so the reported feature count is the full width.
    test_feature_selection.testFeatureSelectionWithKMeans(
        'RELEVANT TSFRESH', independent_columns_train.shape[1], dataset,
        independent_columns_train.values, independent_columns_test.values,
        clusters_number, known_labels_test)

    app_logger.info(
        'ENDED [RELEVANT TSFRESH Selection] on {0}'.format(dataset),
        extra=LOGGER_EXTRA_OBJECT)


# Testing
# select('TwoPatterns', 4)
Esempio n. 3
0
def select(dataset, features_number, clusters_number):
    """Select features with MCFS and run the repeated k-means test on them.

    Loads the train and test pickles holding all extracted features, builds
    the affinity matrix W on the train features, lets MCFS weight and rank
    the features, keeps the first `features_number` of the ranking, and
    passes the reduced train/test values to
    ``test_feature_selection.testFeatureSelectionWithRepeatedKMeans``.

    Args:
        dataset: Dataset name; used to build the pickle file names.
        features_number: How many features to keep from the MCFS ranking.
        clusters_number: Number of clusters given to MCFS and to k-means.
    """
    app_logger.info(
        'STARTED [MCFS Selection] on {0} with features number = {1}'.format(
            dataset, features_number),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving all features extracted by tsfresh from the pickles on disk.
    # os.path.basename honours the platform's path separator; splitting the
    # working directory on '\\' only recognised the project root on Windows,
    # so elsewhere the '../Pickle' branch was always taken.
    current_dir = os.path.basename(os.getcwd())
    project_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    pickle_root = 'Pickle' if current_dir == project_dir else '../Pickle'

    all_features_train = pd.read_pickle(
        '{0}/AllFeatures/Train/{1}.pkl'.format(pickle_root, dataset))
    all_features_test = pd.read_pickle(
        '{0}/AllFeatures/Test/{1}.pkl'.format(pickle_root, dataset))

    app_logger.info(
        'All features (including target column) trainset shape: {0}'.format(
            all_features_train.shape),
        extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'All features (including target column) testset shape: {0}'.format(
            all_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)

    # np.savetxt(r'testDataFrame.txt', all_features_test.values, fmt='%d')

    # Column 0 is treated as the target; every other column is a feature.
    independent_columns_train = all_features_train.iloc[:, 1:]
    independent_columns_test = all_features_test.iloc[:, 1:]
    known_labels_test = all_features_test.iloc[:, 0]

    # Building matrix W for the MCFS algorithm.
    affinity_kwargs = {
        'metric': 'euclidean',
        'neighbor_mode': 'knn',
        'weight_mode': 'binary',
        'k': 3
        # 'weight_mode': 'heat_kernel',
        # 'k': 5,
        # 't': 1
    }
    W = construct_W.construct_W(independent_columns_train.values,
                                **affinity_kwargs)

    # MCFS gives a weight to each feature.
    mcfs_kwargs = {'W': W, 'n_clusters': clusters_number}
    weighted_features = MCFS.mcfs(independent_columns_train.values,
                                  features_number, **mcfs_kwargs)

    # Ordering the features according to their weight and keeping only the
    # first 'features_number' positions of that ranking.
    ordered_features = MCFS.feature_ranking(weighted_features)
    selected_features = ordered_features[:features_number]

    # Translating ranked column positions into column names so the same
    # columns can be picked from both the train and the test frame.
    names_selected_features = [
        independent_columns_train.columns[feature_index]
        for feature_index in selected_features
    ]

    # Keeping only the selected features on the train set.
    selected_features_train = independent_columns_train.loc[
        :, names_selected_features]
    app_logger.info('Selected features trainset: {0}'.format(
        selected_features_train.shape),
                    extra=LOGGER_EXTRA_OBJECT)

    # Keeping only the selected features on the test set.
    selected_features_test = independent_columns_test.loc[
        :, names_selected_features]
    app_logger.info('Selected features testset: {0}'.format(
        selected_features_test.shape),
                    extra=LOGGER_EXTRA_OBJECT)

    # Disabled export of the selected features ("Pickles for rfd"), kept for
    # reference:
    # if selected_features_train.shape[0] > 1000:
    #     print('Test-set')
    #     selected_features_test.to_pickle('../rfd/Pickle_rfd/MCFS/{0}.pkl'.format(dataset))
    # else:
    #     print('Train-set')
    #     selected_features_train.to_pickle('../rfd/Pickle_rfd/MCFS/{0}.pkl'.format(dataset))
    # exit()

    # Running k-means according to the selected features.
    test_feature_selection.testFeatureSelectionWithRepeatedKMeans(
        'MCFS', features_number, dataset, selected_features_train.values,
        selected_features_test.values, clusters_number, known_labels_test)

    app_logger.info('ENDED [MCFS Selection] on {0}'.format(dataset),
                    extra=LOGGER_EXTRA_OBJECT)


# Testing
#select('TwoPatterns', 10, 4)
def agglomerate(dataset, features_number, clusters_number):
    """Reduce features by agglomeration and run the repeated k-means test.

    Loads the train and test pickles holding all extracted features, fits a
    ``cluster.FeatureAgglomeration`` with ``n_clusters=features_number`` on
    the train features, transforms both sets with it, and passes the reduced
    data to
    ``test_feature_selection.testFeatureSelectionWithRepeatedKMeans``.

    Args:
        dataset: Dataset name; used to build the pickle file names.
        features_number: Number of feature clusters the agglomeration produces.
        clusters_number: Number of clusters forwarded to the k-means test.
    """
    app_logger.info(
        'STARTED [Feature Agglomeration] on {0} with features number = {1}'.
        format(dataset, features_number),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving all features extracted by tsfresh from the pickles on disk.
    # os.path.basename honours the platform's path separator; splitting the
    # working directory on '\\' only recognised the project root on Windows,
    # so elsewhere the '../Pickle' branch was always taken.
    current_dir = os.path.basename(os.getcwd())
    project_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    pickle_root = 'Pickle' if current_dir == project_dir else '../Pickle'

    all_features_train = pd.read_pickle(
        '{0}/AllFeatures/Train/{1}.pkl'.format(pickle_root, dataset))
    all_features_test = pd.read_pickle(
        '{0}/AllFeatures/Test/{1}.pkl'.format(pickle_root, dataset))

    app_logger.info(
        'All features (including target column) trainset shape: {0}'.format(
            all_features_train.shape),
        extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'All features (including target column) testset shape: {0}'.format(
            all_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)

    # Column 0 is treated as the target; every other column is a feature.
    independent_columns_train = all_features_train.iloc[:, 1:]
    independent_columns_test = all_features_test.iloc[:, 1:]
    known_labels_test = all_features_test.iloc[:, 0]

    # The agglomeration is fitted on the train features only and then applied
    # unchanged to the test features.
    agglomeration = cluster.FeatureAgglomeration(n_clusters=features_number)
    agglomeration.fit(independent_columns_train)
    reduced_train = agglomeration.transform(independent_columns_train)
    reduced_test = agglomeration.transform(independent_columns_test)

    # NOTE(review): these messages format the reduced data itself, not its
    # shape as the other log lines do -- confirm whether `.shape` was meant.
    app_logger.info('Reduced train set: {0}'.format(reduced_train),
                    extra=LOGGER_EXTRA_OBJECT)
    app_logger.info('Reduced test set: {0}'.format(reduced_test),
                    extra=LOGGER_EXTRA_OBJECT)

    # Running k-means according to the reduced features.
    test_feature_selection.testFeatureSelectionWithRepeatedKMeans(
        'AGGLOMERATION', features_number, dataset, reduced_train, reduced_test,
        clusters_number, known_labels_test)

    app_logger.info('ENDED [Feature Agglomeration] on {0}'.format(dataset),
                    extra=LOGGER_EXTRA_OBJECT)
def select(dataset, features_number, clusters_number):
    """Select the features most correlated with the target and test them.

    Loads the train and test pickles holding all extracted features, computes
    the correlation of every train feature with the 'target' column, keeps
    the `features_number` features with the largest absolute correlation,
    and passes the reduced train/test values to
    ``test_feature_selection.testFeatureSelectionWithRepeatedKMeans``.

    Args:
        dataset: Dataset name; used to build the pickle file names.
        features_number: How many top-correlated features to keep.
        clusters_number: Number of clusters forwarded to the k-means test.
    """
    app_logger.info(
        'STARTED [Corr Selection] on {0} with features number = {1}'.format(
            dataset, features_number),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving all features extracted by tsfresh from the pickles on disk.
    # os.path.basename honours the platform's path separator; splitting the
    # working directory on '\\' only recognised the project root on Windows,
    # so elsewhere the '../Pickle' branch was always taken.
    current_dir = os.path.basename(os.getcwd())
    project_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    pickle_root = 'Pickle' if current_dir == project_dir else '../Pickle'

    all_features_train = pd.read_pickle(
        '{0}/AllFeatures/Train/{1}.pkl'.format(pickle_root, dataset))
    all_features_test = pd.read_pickle(
        '{0}/AllFeatures/Test/{1}.pkl'.format(pickle_root, dataset))

    app_logger.info(
        'All features (including target column) trainset shape: {0}'.format(
            all_features_train.shape),
        extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'All features (including target column) testset shape: {0}'.format(
            all_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)

    # Column 0 is treated as the target; every other column is a feature.
    independent_columns_train = all_features_train.iloc[:, 1:]
    independent_columns_test = all_features_test.iloc[:, 1:]
    known_labels_test = all_features_test.iloc[:, 0]

    dfcolumns = pd.DataFrame(independent_columns_train.columns)

    # Correlation matrix. Everything is cast to float first; otherwise the
    # target column is not considered because its type is integer.
    data = all_features_train.astype(float)
    corrmat = data.corr()
    # Row 0 is skipped to drop the target's correlation with itself; this
    # relies on the column being named 'target' and sitting first.
    dfcorr_target = pd.DataFrame(corrmat[['target']].iloc[1:].values)

    # Creating a dataframe which pairs column names with correlation values
    # (both frames share the same positional order).
    dfscores = pd.concat([dfcolumns, dfcorr_target], axis=1)
    dfscores.columns = ['feature_name', 'target_corr']
    dfscores = dfscores.dropna(axis=0)

    # Only the strength of the correlation matters, not its sign.
    dfscores['target_corr'] = dfscores['target_corr'].abs()

    dfscores = dfscores.sort_values(by='target_corr', ascending=False)

    top_k_scores = dfscores.head(features_number)
    app_logger.info(top_k_scores, extra=LOGGER_EXTRA_OBJECT)

    selected_features_names = top_k_scores['feature_name'].values
    selected_features_train = independent_columns_train.loc[
        :, selected_features_names]
    selected_features_test = independent_columns_test.loc[
        :, selected_features_names]

    # Disabled export of the selected features ("Pickles for rfd"), kept for
    # reference:
    # if selected_features_train.shape[0] > 1000:
    #     print('Test-set')
    #     selected_features_test.to_pickle('../rfd/Pickle_rfd/Corr/{0}.pkl'.format(dataset))
    # else:
    #     print('Train-set')
    #     selected_features_train.to_pickle('../rfd/Pickle_rfd/Corr/{0}.pkl'.format(dataset))
    # exit()

    # Running k-means according to the selected features.
    test_feature_selection.testFeatureSelectionWithRepeatedKMeans(
        'CORRELATION', features_number, dataset,
        selected_features_train.values, selected_features_test.values,
        clusters_number, known_labels_test)

    app_logger.info('ENDED [Corr Selection] on {0}'.format(dataset),
                    extra=LOGGER_EXTRA_OBJECT)


# Testing
#select('TwoPatterns', 10, 4)