def test_make_matrix_row_stochastic_when_mostly_is_row_stochastic_already(self):
    matrix = np.array([[0.85, 0.05, 0.05, 0.05],
                       [0.35, 0.25, 0.35, 0.03],
                       [0.02, 0.01, 0.02, 0.02],
                       [0.25, 0.25, 0.25, 0.25]])
    expected = np.array([[0.85, 0.05, 0.05, 0.05],
                         [0.36, 0.26, 0.36, 0.03],
                         [0.29, 0.14, 0.29, 0.29],
                         [0.25, 0.25, 0.25, 0.25]])
    computed = utils.make_matrix_row_stochastic(matrix)
    np_testing.assert_array_almost_equal(expected, computed, decimal=2)
def get_influence_matrices2x2(
        self,
        make_it_row_stochastic: bool = True
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Gets influence matrices in 2 * 2 format.

    If one entry is empty or missing, it is filled with 100 minus the other
    entry. If both entries are empty or missing, both are filled with 50.
    """
    influence_matrices = []
    influences_from_data = []
    users = self.users
    questions = np.unique(self.influences.question)
    for question in questions:
        influences = []
        for user in users:
            for input in ['self', 'other']:
                this_influence = self.influences[
                    (self.influences.question == question)
                    & (self.influences.sender == user)
                    & (self.influences.input == input)]
                val = ''
                if len(this_influence.value) > 0:
                    # There might be multiple log entries for the same text
                    # box, so we take the last one.
                    val = list(this_influence.value)[-1]
                val = str(val).split('%')[0]
                influences.append(val)
        # Swap the last two entries so that self-influences sit on the
        # diagonal (row = reporter, column = target user).
        tmp = influences[2]
        influences[2] = influences[3]
        influences[3] = tmp
        # Use an object array so numeric fills below are not truncated to the
        # original string width.
        influences = np.reshape(influences, (2, 2)).astype(object)
        empty_strings = np.where(influences == '')
        influence_from_data = np.ones((2, 2), dtype=bool)
        for l in range(len(empty_strings[0])):
            i = empty_strings[0][l]
            j = empty_strings[1][l]
            if influences[i, 1 - j] == '':
                influences[i, 1 - j] = 50
                influence_from_data[i, 1 - j] = False
            influences[i, j] = 100 - float(influences[i, 1 - j])
            influence_from_data[i, j] = False
        influences = np.array(influences, dtype=float)
        if make_it_row_stochastic:
            influences = utils.make_matrix_row_stochastic(influences)
        influence_matrices.append(influences)
        influences_from_data.append(influence_from_data)
    question_names = [
        question[len('GD_influence_'):] for question in questions
    ]
    return question_names, np.array(influence_matrices), np.array(
        influences_from_data)
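# Hedged illustration (not part of the original class): the fill rule applied
# above to a single 2 x 2 influence matrix. If one entry of a row is missing,
# it becomes 100 minus the other entry; if both are missing, both become 50.
import numpy as np


def fill_missing_influence_pair_example():
    influences = np.array([['60', ''],
                           ['', '']], dtype=object)
    influence_from_data = np.ones((2, 2), dtype=bool)
    empty_rows, empty_cols = np.where(influences == '')
    for i, j in zip(empty_rows, empty_cols):
        if influences[i, 1 - j] == '':
            influences[i, 1 - j] = 50
            influence_from_data[i, 1 - j] = False
        influences[i, j] = 100 - float(influences[i, 1 - j])
        influence_from_data[i, j] = False
    # -> [[60., 40.], [50., 50.]]; influence_from_data marks entries the users
    # actually typed.
    return influences.astype(float), influence_from_data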
def sbt_model_func(X_train,
                   y_train,
                   X_validation_or_test,
                   y_validation_or_test,
                   feature_names=[],
                   estimation_name='influence_matrix',
                   lambdaa=[],
                   error_type_str='mse',
                   params={'mode': 1}):
    """Structural Balance Theory-inspired model (similar to Kulakowski et al., 2005)."""
    if 'mode' in params:
        mode = params['mode']
    else:
        mode = 1
    y_validation_or_test_predicted = []
    for item in X_validation_or_test:
        influence_matrix = item['previous_influence_matrix']
        n, m = influence_matrix.shape
        if n != m:
            raise ValueError('The matrix was not square.')
        next_influence_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                if i != j:
                    ks = list(set.difference(set(range(n)), [i, j]))
                    wij = 0
                    for k in ks:
                        wij += influence_matrix[i, k] * influence_matrix[k, j]
                    # wij /= (n - 2)
                    next_influence_matrix[i, j] = wij
        if mode == 1:
            # Fill the diagonal with the previous influence matrix's diagonal
            # and normalize to make it row-stochastic.
            np.fill_diagonal(next_influence_matrix, np.diag(influence_matrix))
            next_influence_matrix = utils.make_matrix_row_stochastic(
                next_influence_matrix)
        elif mode == 2:
            # Fill the diagonal with 1 - sum of the currently filled row.
            np.fill_diagonal(next_influence_matrix,
                             1 - np.sum(next_influence_matrix, axis=1))
        else:
            raise ValueError(
                'The input mode was wrong. It was {}.'.format(mode))
        y_validation_or_test_predicted.append(next_influence_matrix)
    validation_or_test_error = compute_error(
        y_train_or_validation_or_test_true=y_validation_or_test,
        y_train_or_validation_or_test_predicted=y_validation_or_test_predicted,
        estimation_name=estimation_name,
        error_type_str=error_type_str)
    return -1, validation_or_test_error
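# Minimal worked example of one SBT update step in mode 1 on a 3-person team,
# assuming utils.make_matrix_row_stochastic divides each row by its sum (the
# matrix values here are illustrative, not from the repo's data).
import numpy as np


def sbt_step_example():
    # Previous (row-stochastic) influence matrix.
    w = np.array([[0.5, 0.3, 0.2],
                  [0.2, 0.6, 0.2],
                  [0.1, 0.4, 0.5]])
    n = w.shape[0]
    w_next = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            if i != j:
                # Off-diagonal update: w_ij <- sum over k != i, j of w_ik * w_kj.
                ks = set(range(n)) - {i, j}
                w_next[i, j] = sum(w[i, k] * w[k, j] for k in ks)
    # Mode 1: carry over the previous self-weights, then renormalize each row.
    np.fill_diagonal(w_next, np.diag(w))
    w_next /= w_next.sum(axis=1, keepdims=True)
    return w_next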
def test_make_matrix_row_stochastic_when_already_row_stochastic(self):
    matrix = np.array([[0.11, 0.26, 0.34, 0.29],
                       [0.26, 0.21, 0.25, 0.28],
                       [0.05, 0.05, 0.85, 0.05],
                       [0.25, 0.25, 0.25, 0.25]])
    expected = matrix
    computed = utils.make_matrix_row_stochastic(matrix)
    np_testing.assert_array_almost_equal(expected, computed, decimal=2)
def test_make_matrix_row_stochastic_when_all_zeros(self):
    matrix = np.zeros((4, 4))
    expected = np.ones((4, 4)) * 0.25
    computed = utils.make_matrix_row_stochastic(matrix)
    np_testing.assert_array_almost_equal(expected, computed, decimal=2)
def test_make_matrix_row_stochastic(self):
    matrix = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
    expected = np.array([[0, 0.33, 0.67],
                         [0.25, 0.33, 0.42],
                         [0.29, 0.33, 0.38]])
    computed = utils.make_matrix_row_stochastic(matrix)
    np_testing.assert_array_almost_equal(expected, computed, decimal=2)
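# A reference sketch of utils.make_matrix_row_stochastic consistent with the
# tests above (the actual implementation lives in utils and may differ): each
# row is divided by its sum, and all-zero rows fall back to a uniform row.
import numpy as np


def make_matrix_row_stochastic_sketch(matrix):
    matrix = np.asarray(matrix, dtype=float)
    row_sums = matrix.sum(axis=1, keepdims=True)
    normalized = matrix / np.where(row_sums == 0, 1.0, row_sums)
    normalized[row_sums[:, 0] == 0] = 1.0 / matrix.shape[1]
    return normalized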
def generate_dataset(self):
    X = []
    y = []
    for team_id, team_log in self.data.items():
        if team_id in self.networks:
            print('In generate_dataset: processing team', team_id, '...')
            # First influence matrix:
            first_index = 0
            while first_index < len(self.networks[team_id]):
                influence_matrix = np.matrix(
                    team_log.member_influences[first_index])
                if self.skip_matrices_not_completely_from_members and np.sum(
                        team_log.member_influences_from_data[first_index]
                ) != 16:
                    print('E1: Index: {} was skipped.'.format(first_index))
                    first_index += 1
                    continue
                normalized_influence_matrix = utils.shuffle_matrix_in_given_order(
                    matrix=influence_matrix,
                    order=np.argsort(team_log.members)) / 100
                first_row_stochastic_normalized_influence_matrix = np.matrix(
                    utils.make_matrix_row_stochastic(
                        normalized_influence_matrix))
                previous_row_stochastic_normalized_influence_matrix = (
                    first_row_stochastic_normalized_influence_matrix.copy())
                break

            # Average of previous influence matrices:
            previous_influence_matrices_cnt = 1  # CHECK IF THIS IS NOTHING
            average_of_previous_influence_matrices = (
                first_row_stochastic_normalized_influence_matrix.copy())
            for index in range(first_index + 1, len(self.networks[team_id])):
                influence_matrix = np.matrix(
                    team_log.member_influences[index])
                if self.skip_matrices_not_completely_from_members and np.sum(
                        team_log.member_influences_from_data[index]) != 16:
                    print('E2: Index: {} was skipped.'.format(index))
                    continue

                # Individual performance:
                individual_performance = np.zeros(4)
                individual_performance_hardness_weighted = np.zeros(4)
                perf_rates = self.individual_performance_rates[team_id][index]
                for i, member in enumerate(sorted(team_log.members)):
                    individual_performance[i] = perf_rates[member][
                        'correct_rate_so_far']
                    individual_performance_hardness_weighted[i] = perf_rates[
                        member]['hardness_weighted_correct_rate_so_far']

                # Networks:
                network = self.networks[team_id][index]

                # Contents:
                contents_embedding = self.contents_embeddings[team_id][index]

                # Current influence matrix (reordered, normalized, and made
                # row-stochastic):
                normalized_influence_matrix = utils.shuffle_matrix_in_given_order(
                    matrix=influence_matrix,
                    order=np.argsort(team_log.members)) / 100
                row_stochastic_normalized_influence_matrix = np.matrix(
                    utils.make_matrix_row_stochastic(
                        normalized_influence_matrix))

                # Multi-class classification (who is (are) the most
                # influential individual(s)):
                most_influentials = utils.most_influential_on_others(
                    influence_matrix=row_stochastic_normalized_influence_matrix,
                    remove_self_influence=True)

                # Combining all features together:
                y.append({
                    'influence_matrix':
                        row_stochastic_normalized_influence_matrix,
                    'most_influentials': most_influentials
                })
                X.append({
                    'individual_performance':
                        individual_performance,
                    'individual_performance_hardness_weighted':
                        individual_performance_hardness_weighted,
                    'content_embedding_matrix':
                        contents_embedding,
                    'first_influence_matrix':
                        first_row_stochastic_normalized_influence_matrix,
                    'previous_influence_matrix':
                        previous_row_stochastic_normalized_influence_matrix,
                    'average_of_previous_influence_matrices':
                        average_of_previous_influence_matrices /
                        previous_influence_matrices_cnt,
                    'reply_duration':
                        nx.adj_matrix(network['reply_duration']).todense(),
                    'sentiment':
                        nx.adj_matrix(network['sentiment']).todense(),
                    'emotion_arousal':
                        nx.adj_matrix(network['emotion_arousal']).todense(),
                    'emotion_dominance':
                        nx.adj_matrix(network['emotion_dominance']).todense(),
                    'emotion_valence':
                        nx.adj_matrix(network['emotion_valence']).todense()
                })
                previous_row_stochastic_normalized_influence_matrix = (
                    row_stochastic_normalized_influence_matrix.copy())
                average_of_previous_influence_matrices += (
                    row_stochastic_normalized_influence_matrix)
                previous_influence_matrices_cnt += 1
    self.supervised_data = {'X': X, 'y': y}
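# Hedged sketch of utils.shuffle_matrix_in_given_order as used above, assuming
# it applies the same permutation to rows and columns so that entry (i, j) of
# the reordered matrix refers to the i-th and j-th members in sorted order
# (np.argsort(team_log.members)); the real implementation lives in utils.
import numpy as np


def shuffle_matrix_in_given_order_sketch(matrix, order):
    matrix = np.asarray(matrix)
    return matrix[np.ix_(order, order)]


# Example: members logged in the order ['c', 'a', 'b'] are reindexed
# alphabetically:
# reordered = shuffle_matrix_in_given_order_sketch(raw_matrix,
#                                                  np.argsort(['c', 'a', 'b']))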
def model_builder(X_train,
                  y_train,
                  X_test,
                  y_test,
                  feature_names,
                  estimation_name='influence_matrix',
                  error_type_str='normalized_frob_norm',
                  tune_hyperparameters_by_validation=True,
                  with_replication=True,
                  lambdas=[0, 0.1, 1, 10, 100, 1000],
                  model_func='average',
                  params={
                      'with_constraints': True,
                      'n_splits': 3,
                      'best_lambda': 0.1
                  }):
    # For the baseline models.
    if model_func == 'average':
        mats = []
        for i in range(len(y_train)):
            mats.append(y_train[i][estimation_name])
        y_baseline_predicted = [
            np.matrix(np.mean(mats, axis=0)) for _ in range(len(y_train))
        ]
    elif model_func == 'uniform':
        y_baseline_predicted = [
            np.matrix(np.ones((4, 4)) * 0.25) for _ in range(len(y_train))
        ]
    elif model_func == 'random':
        y_baseline_predicted = [
            np.matrix(utils.make_matrix_row_stochastic(np.random.rand(4, 4)))
            for _ in range(len(y_train))
        ]
    if model_func in ['average', 'uniform', 'random']:
        train_error = compute_error(y_train,
                                    y_baseline_predicted,
                                    estimation_name=estimation_name,
                                    error_type_str=error_type_str)
        test_error = compute_error(y_test,
                                   y_baseline_predicted,
                                   estimation_name=estimation_name,
                                   error_type_str=error_type_str)
        return train_error, test_error, None

    # For the proposed models.
    validation_errors = defaultdict(lambda: 0)
    if tune_hyperparameters_by_validation:
        print('{}-fold validation ...'.format(params['n_splits']))
        kf = KFold(n_splits=params['n_splits'])
        for train_index, validation_index in kf.split(X_train):
            X_train_subset, X_validation = X_train[train_index], X_train[
                validation_index]
            y_train_subset, y_validation = y_train[train_index], y_train[
                validation_index]
            if with_replication:
                print('Replicating ...')
                X_train_subset, y_train_subset = utils.replicate_matrices_in_train_dataset_with_reordering(
                    X_train_subset, y_train_subset)
                X_train_subset = np.array(X_train_subset)
                y_train_subset = np.array(y_train_subset)
            print('Shapes of train: {}, validation: {}, test: {}.'.format(
                X_train_subset.shape, X_validation.shape, X_test.shape))
            for lambdaa in lambdas:
                validation_errors[lambdaa] += model_func(
                    X_train=X_train_subset,
                    y_train=y_train_subset,
                    X_validation_or_test=X_validation,
                    y_validation_or_test=y_validation,
                    feature_names=feature_names,
                    estimation_name=estimation_name,
                    lambdaa=lambdaa,
                    error_type_str=error_type_str,
                    params=params)[1]
        best_lambda = min(validation_errors, key=validation_errors.get)
    else:
        best_lambda = params['best_lambda']
    print('Training with the best lambda: {} on the entire training set ...'.
          format(best_lambda))
    if with_replication:
        print('Replicating ...')
        X_train, y_train = utils.replicate_matrices_in_train_dataset_with_reordering(
            X_train, y_train)
        X_train = np.array(X_train)
        y_train = np.array(y_train)
    train_error, test_error = model_func(X_train=X_train,
                                         y_train=y_train,
                                         X_validation_or_test=X_test,
                                         y_validation_or_test=y_test,
                                         feature_names=feature_names,
                                         estimation_name=estimation_name,
                                         lambdaa=best_lambda,
                                         error_type_str=error_type_str,
                                         params=params)
    return train_error, test_error, validation_errors
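# Hedged usage sketch (not from the original repo): calling model_builder with
# the 'average' baseline on a tiny synthetic dataset. It assumes this module's
# utils and compute_error are in scope and that y entries are dicts keyed by
# the estimation name, as produced by generate_dataset.
import numpy as np


def example_model_builder_average_baseline():
    rng = np.random.RandomState(0)
    y = [{'influence_matrix':
          np.matrix(utils.make_matrix_row_stochastic(rng.rand(4, 4)))}
         for _ in range(6)]
    X = [{} for _ in range(6)]  # Features are unused by the baselines.
    train_error, test_error, _ = model_builder(
        X_train=X[:3], y_train=y[:3], X_test=X[3:], y_test=y[3:],
        feature_names=[], model_func='average')
    return train_error, test_error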
def concatinated_deep_neural_network_model_func(X_train,
                                                y_train,
                                                X_validation_or_test,
                                                y_validation_or_test,
                                                feature_names,
                                                estimation_name,
                                                lambdaa,
                                                error_type_str,
                                                params={
                                                    'n_epochs': 10,
                                                    'batch_size': 32
                                                }):
    flatten_X_train = []
    flatten_y_train = []
    for i in range(len(X_train)):
        features = X_train[i]
        label = y_train[i][estimation_name]
        feat_list = []
        for feature_name in feature_names:
            if len(features[feature_name].shape) == 1:
                feat_list.append(features[feature_name])
            else:
                feat_list.append(np.array(features[feature_name].flatten())[0])
        flatten_X_train.append(np.hstack(feat_list))
        flatten_y_train.append(np.array(label.flatten())[0])
    flatten_X_train = np.array(flatten_X_train)
    flatten_y_train = np.array(flatten_y_train)

    flatten_X_validation_or_test = []
    flatten_y_validation_or_test = []
    for i in range(len(X_validation_or_test)):
        features = X_validation_or_test[i]
        label = y_validation_or_test[i][estimation_name]
        feat_list = []
        for feature_name in feature_names:
            if len(features[feature_name].shape) == 1:
                feat_list.append(features[feature_name])
            else:
                feat_list.append(np.array(features[feature_name].flatten())[0])
        flatten_X_validation_or_test.append(np.hstack(feat_list))
        flatten_y_validation_or_test.append(np.array(label.flatten())[0])
    flatten_X_validation_or_test = np.array(flatten_X_validation_or_test)
    flatten_y_validation_or_test = np.array(flatten_y_validation_or_test)

    _, input_size = flatten_X_train.shape
    print('Input size for the neural network was: {}'.format(input_size))
    model = Sequential([
        Dense(units=32,
              kernel_initializer='he_normal',
              activation='elu',
              input_shape=(input_size, ),
              kernel_regularizer=regularizers.l1(lambdaa),
              activity_regularizer=regularizers.l1(lambdaa)),
        Dropout(0.5),
        Dense(units=64,
              kernel_initializer='he_normal',
              activation='elu',
              kernel_regularizer=regularizers.l1(lambdaa),
              activity_regularizer=regularizers.l1(lambdaa)),
        Dropout(0.5),
        Dense(units=32,
              kernel_initializer='he_normal',
              activation='elu',
              kernel_regularizer=regularizers.l1(lambdaa),
              activity_regularizer=regularizers.l1(lambdaa)),
        Dropout(0.5),
        Dense(16, kernel_initializer='glorot_uniform', activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(flatten_X_train,
              flatten_y_train,
              epochs=params['n_epochs'],
              batch_size=params['batch_size'])

    # Predicting and computing the train error.
    y_train_predicted = [
        utils.make_matrix_row_stochastic(np.matrix(np.reshape(element, (4, 4))))
        for element in model.predict(flatten_X_train)
    ]
    train_error = compute_error(
        y_train_or_validation_or_test_true=y_train,
        y_train_or_validation_or_test_predicted=y_train_predicted,
        estimation_name=estimation_name,
        error_type_str=error_type_str)

    # Predicting and computing the validation/test error.
    y_validation_or_test_predicted = [
        utils.make_matrix_row_stochastic(np.matrix(np.reshape(element, (4, 4))))
        for element in model.predict(flatten_X_validation_or_test)
    ]
    validation_or_test_error = compute_error(
        y_train_or_validation_or_test_true=y_validation_or_test,
        y_train_or_validation_or_test_predicted=y_validation_or_test_predicted,
        estimation_name=estimation_name,
        error_type_str=error_type_str)
    return train_error, validation_or_test_error
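# Hedged usage sketch: the learned model functions in this file (e.g. the
# network above or sbt_model_func) are meant to be passed to model_builder as
# model_func; the feature names and params below are illustrative, not
# prescribed by the repo.
#
# train_err, test_err, val_errs = model_builder(
#     X_train, y_train, X_test, y_test,
#     feature_names=['individual_performance', 'previous_influence_matrix'],
#     model_func=concatinated_deep_neural_network_model_func,
#     lambdas=[0.001, 0.01],
#     params={'n_splits': 3, 'n_epochs': 10, 'batch_size': 32})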