def main():
    """Run every feature-selection experiment for one dataset.

    Reads two command-line arguments from ``sys.argv``: the dataset name
    (also used as the key into ``CLUSTERS_NUMBERS``) and the number of
    features to select (parsed with ``int``). Each selection strategy is
    then executed in turn, bracketed by STARTED/ENDED log records.
    """
    dataset_name = sys.argv[1]
    n_features = int(sys.argv[2])
    n_clusters = CLUSTERS_NUMBERS[dataset_name]

    # Both banner messages are formatted from the same pair of values.
    run_label = (dataset_name, n_features)

    app_logger.info(
        'STARTED {0} with {1} selected features'.format(*run_label),
        extra=LOGGER_EXTRA_OBJECT)

    # The two tsfresh baselines do not take a feature count.
    all_tsfresh_selection.select(dataset_name, n_clusters)
    relevent_tsfresh_selection.select(dataset_name, n_clusters)

    # The remaining strategies reduce the data to ``n_features`` features.
    MCFS_selection.select(dataset_name, n_features, n_clusters)
    feature_agglomeration.agglomerate(dataset_name, n_features, n_clusters)
    corr_selection.select(dataset_name, n_features, n_clusters)

    app_logger.info(
        'ENDED {0} with {1} selected features'.format(*run_label),
        extra=LOGGER_EXTRA_OBJECT)
def select(dataset, clusters_number):
    """Evaluate k-means on the tsfresh "relevant" features of a dataset.

    Loads the train and test pickles stored under
    ``Pickle/RelevantFeatures``, takes column 0 of the test frame as the
    known labels and every other column as the features, and passes them to
    ``test_feature_selection.testFeatureSelectionWithKMeans``.

    Args:
        dataset: Dataset name; used as the pickle file stem.
        clusters_number: Forwarded unchanged to the k-means evaluation.

    Returns:
        None.
    """
    app_logger.info(
        'STARTED [RELEVANT TSFRESH Selection] on {0}'.format(dataset),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving relevant features extracted by tsfresh from the pickles on
    # the disk. When the working directory is not the project root, the
    # pickles are looked up one level above it.
    # os.path.basename honours the platform's path separator; splitting the
    # path manually on '\\' only yields the directory name on Windows.
    project_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    in_project_root = os.path.basename(os.getcwd()) == project_dir
    pickle_root = 'Pickle' if in_project_root else '../Pickle'

    relevant_features_train = pd.read_pickle(
        '{0}/RelevantFeatures/Train/{1}.pkl'.format(pickle_root, dataset))
    relevant_features_test = pd.read_pickle(
        '{0}/RelevantFeatures/Test/{1}.pkl'.format(pickle_root, dataset))

    app_logger.info(
        'Relevant features (including target column) trainset shape: {0}'.
        format(relevant_features_train.shape),
        extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'Relevant features (including target column) testset shape: {0}'.
        format(relevant_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving independent columns of both sets and the known labels of
    # the test set (column 0 holds the target).
    indipendent_columns_train = relevant_features_train.iloc[:, 1:]
    indipendent_columns_test = relevant_features_test.iloc[:, 1:]
    known_labels_test = relevant_features_test.iloc[:, 0]

    # Running k-means on the dataframes obtained from the pickles; the
    # reported feature count is simply the number of feature columns.
    test_feature_selection.testFeatureSelectionWithKMeans(
        'RELEVANT TSFRESH', indipendent_columns_train.shape[1], dataset,
        indipendent_columns_train.values, indipendent_columns_test.values,
        clusters_number, known_labels_test)

    app_logger.info(
        'ENDED [RELEVANT TSFRESH Selection] on {0}'.format(dataset),
        extra=LOGGER_EXTRA_OBJECT)


# Testing
# select('TwoPatterns', 4)
def select(dataset, features_number, clusters_number):
    """Select features with MCFS and evaluate them with repeated k-means.

    Loads the train and test pickles stored under ``Pickle/AllFeatures``,
    ranks the train-set feature columns with MCFS, keeps the first
    ``features_number`` ranked columns in both sets and passes them to
    ``test_feature_selection.testFeatureSelectionWithRepeatedKMeans``.

    Args:
        dataset: Dataset name; used as the pickle file stem.
        features_number: Number of features to keep.
        clusters_number: Passed to MCFS as ``n_clusters`` and forwarded to
            the k-means evaluation.

    Returns:
        None.
    """
    app_logger.info(
        'STARTED [MCFS Selection] on {0} with features number = {1}'.format(
            dataset, features_number),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving all features extracted by tsfresh from the pickles on the
    # disk. When the working directory is not the project root, the pickles
    # are looked up one level above it.
    # os.path.basename honours the platform's path separator; splitting the
    # path manually on '\\' only yields the directory name on Windows.
    project_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    in_project_root = os.path.basename(os.getcwd()) == project_dir
    pickle_root = 'Pickle' if in_project_root else '../Pickle'

    all_features_train = pd.read_pickle(
        '{0}/AllFeatures/Train/{1}.pkl'.format(pickle_root, dataset))
    all_features_test = pd.read_pickle(
        '{0}/AllFeatures/Test/{1}.pkl'.format(pickle_root, dataset))

    app_logger.info(
        'All features (including target column) trainset shape: {0}'.format(
            all_features_train.shape),
        extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'All features (including target column) testset shape: {0}'.format(
            all_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)

    # np.savetxt(r'testDataFrame.txt', all_features_test.values, fmt='%d')

    # Retrieving independent columns of both sets and the known labels of
    # the test set (column 0 holds the target).
    indipendent_columns_train = all_features_train.iloc[:, 1:]
    indipendent_columns_test = all_features_test.iloc[:, 1:]
    known_labels_test = all_features_test.iloc[:, 0]

    # Building matrix W for the MCFS algorithm.
    kwargs = {
        'metric': 'euclidean',
        'neighbor_mode': 'knn',
        'weight_mode': 'binary',
        'k': 3
        # 'weight_mode': 'heat_kernel',
        # 'k': 5,
        # 't': 1
    }
    W = construct_W.construct_W(indipendent_columns_train.values, **kwargs)

    # MCFS gives a weight to each feature.
    kwargs = {'W': W, 'n_clusters': clusters_number}
    weighted_features = MCFS.mcfs(indipendent_columns_train.values,
                                  features_number, **kwargs)

    # Ordering the features according to their weight.
    ordered_features = MCFS.feature_ranking(weighted_features)

    # Getting only the first 'features_number' features.
    selected_features = ordered_features[0:features_number]

    # Translating the ranked positions into column names so that the same
    # columns can be picked from both the train and the test frame.
    train_columns = indipendent_columns_train.columns
    names_selected_features = [
        train_columns[feature_index] for feature_index in selected_features
    ]

    # Keeping only the selected features on the train set.
    selected_features_train = indipendent_columns_train.loc[
        :, names_selected_features]
    app_logger.info('Selected features trainset: {0}'.format(
        selected_features_train.shape),
                    extra=LOGGER_EXTRA_OBJECT)

    # Keeping only the selected features on the test set.
    selected_features_test = indipendent_columns_test.loc[
        :, names_selected_features]
    app_logger.info('Selected features testset: {0}'.format(
        selected_features_test.shape),
                    extra=LOGGER_EXTRA_OBJECT)

    # Disabled export of the selected features ("Pickles for rfd"):
    # if selected_features_train.shape[0] > 1000:
    #     print('Test-set')
    #     selected_features_test.to_pickle(
    #         '../rfd/Pickle_rfd/MCFS/{0}.pkl'.format(dataset))
    # else:
    #     print('Train-set')
    #     selected_features_train.to_pickle(
    #         '../rfd/Pickle_rfd/MCFS/{0}.pkl'.format(dataset))
    # exit()

    # Running k-means according to the selected features.
    test_feature_selection.testFeatureSelectionWithRepeatedKMeans(
        'MCFS', features_number, dataset, selected_features_train.values,
        selected_features_test.values, clusters_number, known_labels_test)

    app_logger.info('ENDED [MCFS Selection] on {0}'.format(dataset),
                    extra=LOGGER_EXTRA_OBJECT)


# Testing
# select('TwoPatterns', 10, 4)
def agglomerate(dataset, features_number, clusters_number):
    """Reduce features by agglomeration and evaluate with repeated k-means.

    Loads the train and test pickles stored under ``Pickle/AllFeatures``,
    fits ``cluster.FeatureAgglomeration`` on the train-set feature columns,
    transforms both sets with the fitted model and passes the results to
    ``test_feature_selection.testFeatureSelectionWithRepeatedKMeans``.

    Args:
        dataset: Dataset name; used as the pickle file stem.
        features_number: Used as ``n_clusters`` of the feature
            agglomeration.
        clusters_number: Forwarded unchanged to the k-means evaluation.

    Returns:
        None.
    """
    app_logger.info(
        'STARTED [Feature Agglomeration] on {0} with features number = {1}'.
        format(dataset, features_number),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving all features extracted by tsfresh from the pickles on the
    # disk. When the working directory is not the project root, the pickles
    # are looked up one level above it.
    # os.path.basename honours the platform's path separator; splitting the
    # path manually on '\\' only yields the directory name on Windows.
    project_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    in_project_root = os.path.basename(os.getcwd()) == project_dir
    pickle_root = 'Pickle' if in_project_root else '../Pickle'

    all_features_train = pd.read_pickle(
        '{0}/AllFeatures/Train/{1}.pkl'.format(pickle_root, dataset))
    all_features_test = pd.read_pickle(
        '{0}/AllFeatures/Test/{1}.pkl'.format(pickle_root, dataset))

    app_logger.info(
        'All features (including target column) trainset shape: {0}'.format(
            all_features_train.shape),
        extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'All features (including target column) testset shape: {0}'.format(
            all_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving independent columns of both sets and the known labels of
    # the test set (column 0 holds the target).
    indipendent_columns_train = all_features_train.iloc[:, 1:]
    indipendent_columns_test = all_features_test.iloc[:, 1:]
    known_labels_test = all_features_test.iloc[:, 0]

    # The agglomeration is fitted on the train set only; the test set is
    # transformed with that same fitted model.
    agglomeration = cluster.FeatureAgglomeration(n_clusters=features_number)
    agglomeration.fit(indipendent_columns_train)
    reduced_train = agglomeration.transform(indipendent_columns_train)
    reduced_test = agglomeration.transform(indipendent_columns_test)

    # NOTE: these records contain the reduced data itself, not its shape.
    app_logger.info('Reduced train set: {0}'.format(reduced_train),
                    extra=LOGGER_EXTRA_OBJECT)
    app_logger.info('Reduced test set: {0}'.format(reduced_test),
                    extra=LOGGER_EXTRA_OBJECT)

    # Running k-means according to the reduced features.
    test_feature_selection.testFeatureSelectionWithRepeatedKMeans(
        'AGGLOMERATION', features_number, dataset, reduced_train,
        reduced_test, clusters_number, known_labels_test)

    app_logger.info('ENDED [Feature Agglomeration] on {0}'.format(dataset),
                    extra=LOGGER_EXTRA_OBJECT)
def select(dataset, features_number, clusters_number):
    """Select features by target correlation and evaluate with k-means.

    Loads the train and test pickles stored under ``Pickle/AllFeatures``,
    ranks the train-set feature columns by the absolute value of their
    correlation with the ``'target'`` column, keeps the top
    ``features_number`` columns in both sets and passes them to
    ``test_feature_selection.testFeatureSelectionWithRepeatedKMeans``.

    Args:
        dataset: Dataset name; used as the pickle file stem.
        features_number: Number of top-correlated features to keep.
        clusters_number: Forwarded unchanged to the k-means evaluation.

    Returns:
        None.
    """
    app_logger.info(
        'STARTED [Corr Selection] on {0} with features number = {1}'.format(
            dataset, features_number),
        extra=LOGGER_EXTRA_OBJECT)

    # Retrieving all features extracted by tsfresh from the pickles on the
    # disk. When the working directory is not the project root, the pickles
    # are looked up one level above it.
    # os.path.basename honours the platform's path separator; splitting the
    # path manually on '\\' only yields the directory name on Windows.
    project_dir = 'MCFS-Unsupervisioned-Feature-Selection'
    in_project_root = os.path.basename(os.getcwd()) == project_dir
    pickle_root = 'Pickle' if in_project_root else '../Pickle'

    all_features_train = pd.read_pickle(
        '{0}/AllFeatures/Train/{1}.pkl'.format(pickle_root, dataset))
    all_features_test = pd.read_pickle(
        '{0}/AllFeatures/Test/{1}.pkl'.format(pickle_root, dataset))

    app_logger.info(
        'All features (including target column) trainset shape: {0}'.format(
            all_features_train.shape),
        extra=LOGGER_EXTRA_OBJECT)
    app_logger.info(
        'All features (including target column) testset shape: {0}'.format(
            all_features_test.shape),
        extra=LOGGER_EXTRA_OBJECT)

    # Selecting the independent columns of the train set (column 0 holds
    # the target).
    indipendent_columns_train = all_features_train.iloc[:, 1:]

    # Selecting the independent columns and the target column of the test
    # set.
    indipendent_columns_test = all_features_test.iloc[:, 1:]
    known_labels_test = all_features_test.iloc[:, 0]

    dfcolumns = pd.DataFrame(indipendent_columns_train.columns)

    # Correlation matrix. Everything is cast to float first; otherwise the
    # target column is not considered because its type is integer (and not
    # float).
    data = all_features_train.astype(float)
    corrmat = data.corr()

    # Row 0 is skipped as the target's correlation with itself.
    # NOTE(review): this assumes the first column is the one named 'target'
    # — confirm against the code that writes the pickles.
    dfcorr_target = pd.DataFrame(corrmat[['target']].iloc[1:].values)

    # Creating a dataframe which contains the column names and their
    # correlation values; both parts share the default positional index.
    dfscores = pd.concat([dfcolumns, dfcorr_target], axis=1)
    dfscores.columns = ['feature_name', 'target_corr']
    dfscores = dfscores.dropna(axis=0)

    # Ranking by correlation magnitude, regardless of sign.
    dfscores[['target_corr']] = abs(dfscores[['target_corr']])
    dfscores = dfscores.sort_values(by='target_corr', ascending=False)

    top_k_scores = dfscores.head(features_number)
    app_logger.info(top_k_scores, extra=LOGGER_EXTRA_OBJECT)

    selected_features_names = top_k_scores['feature_name'].values
    selected_features_train = indipendent_columns_train.loc[
        :, selected_features_names]
    selected_features_test = indipendent_columns_test.loc[
        :, selected_features_names]

    # Disabled export of the selected features ("Pickles for rfd"):
    # if selected_features_train.shape[0] > 1000:
    #     print('Test-set')
    #     selected_features_test.to_pickle(
    #         '../rfd/Pickle_rfd/Corr/{0}.pkl'.format(dataset))
    # else:
    #     print('Train-set')
    #     selected_features_train.to_pickle(
    #         '../rfd/Pickle_rfd/Corr/{0}.pkl'.format(dataset))
    # exit()

    # Running k-means according to the selected features.
    test_feature_selection.testFeatureSelectionWithRepeatedKMeans(
        'CORRELATION', features_number, dataset,
        selected_features_train.values, selected_features_test.values,
        clusters_number, known_labels_test)

    app_logger.info('ENDED [Corr Selection] on {0}'.format(dataset),
                    extra=LOGGER_EXTRA_OBJECT)


# Testing
# select('TwoPatterns', 10, 4)