def test_make_matrix_row_stochastic_when_mostly_is_row_stochastic_already(
         self):
     matrix = np.array([[0.85, 0.05, 0.05, 0.05], [0.35, 0.25, 0.35, 0.03],
                        [0.02, 0.01, 0.02, 0.02], [0.25, 0.25, 0.25, 0.25]])
     expected = np.array([[0.85, 0.05, 0.05, 0.05], [0.36, 0.26, 0.36, 0.03],
                          [0.29, 0.14, 0.29, 0.29], [0.25, 0.25, 0.25, 0.25]])
     computed = utils.make_matrix_row_stochastic(matrix)
     np_testing.assert_array_almost_equal(expected, computed, decimal=2)
 def get_influence_matrices2x2(
     self,
     make_it_row_stochastic: bool = True
 ) -> Tuple[List[str], np.ndarray, np.ndarray]:
     """Gets influence matrices in 2 * 2 format.
     
     If empty or missing string, it fills with 100 - other one. If both
     empty or missing it fills both with 50.
     """
     influence_matrices = []
     influences_from_data = []
     users = self.users
     questions = np.unique(self.influences.question)
     for question in questions:
         influences = []
         for user in users:
             for input_type in ['self', 'other']:
                 this_influence = self.influences[
                     (self.influences.question == question)
                     & (self.influences.sender == user)
                     & (self.influences.input == input_type)]
                 val = ''
                 if len(this_influence.value) > 0:
                     # There might be multiple log entries for the same
                     # text box, so take the last one.
                     val = list(this_influence.value)[-1]
                 val = str(val).split('%')[0]
                 influences.append(val)
         # Reorder so that, after reshaping, each column corresponds to one
         # receiver (the diagonal holds self-influence).
         influences[2], influences[3] = influences[3], influences[2]
         influences = np.reshape(influences, (2, 2))
         empty_strings = np.where(influences == '')
         influence_from_data = np.ones((2, 2), dtype=bool)
         for i, j in zip(*empty_strings):
             if influences[i, 1 - j] == '':
                 influences[i, 1 - j] = 50
                 influence_from_data[i, 1 - j] = False
             influences[i, j] = 100 - float(influences[i, 1 - j])
             influence_from_data[i, j] = False
         influences = np.array(influences, dtype=float)
         if make_it_row_stochastic:
             influences = utils.make_matrix_row_stochastic(influences)
         influence_matrices.append(influences)
         influences_from_data.append(influence_from_data)
     question_names = [
         question[len('GD_influence_'):] for question in questions
     ]
     return question_names, np.array(influence_matrices), np.array(
         influences_from_data)
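
A minimal, self-contained sketch of the fill rule described in the docstring above (a hypothetical helper, not part of the repo): a missing cell is filled with 100 minus the other cell in its row, and a fully missing row becomes 50/50.

import numpy as np

def fill_missing_2x2(influences):
    """Hypothetical illustration of the empty-cell fill rule."""
    influences = influences.astype(object)
    for i in range(2):
        for j in range(2):
            if influences[i, j] == '':
                if influences[i, 1 - j] == '':
                    influences[i, 1 - j] = 50
                influences[i, j] = 100 - float(influences[i, 1 - j])
    return influences.astype(float)

# One row misses a single value; the other row is fully missing:
print(fill_missing_2x2(np.array([['80', ''], ['', '']], dtype=object)))
# [[80. 20.]
#  [50. 50.]]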
Example #3
def sbt_model_func(X_train,
                   y_train,
                   X_validation_or_test,
                   y_validation_or_test,
                   feature_names=[],
                   estimation_name='influence_matrix',
                   lambdaa=[],
                   error_type_str='mse',
                   params={'mode': 1}):
    """Structural Balance Theory model inspired (similar to Kulakowski et 2005).
    """
    mode = params.get('mode', 1)
    y_validation_or_test_predicted = []
    for item in X_validation_or_test:
        influence_matrix = item['previous_influence_matrix']
        n, m = influence_matrix.shape
        if n != m:
            raise ValueError('The influence matrix was not square.')
        next_influence_matrix = np.zeros((n, n))
        for i in range(n):
            for j in range(n):
                if i != j:
                    # Sum of two-step influence paths i -> k -> j.
                    ks = set(range(n)) - {i, j}
                    wij = 0
                    for k in ks:
                        wij += influence_matrix[i, k] * influence_matrix[k, j]
                    # wij /= (n - 2)  # Optional: average over intermediaries.
                    next_influence_matrix[i, j] = wij
        if mode == 1:
            # Mode 1: keep the previous diagonal, then renormalize rows.
            np.fill_diagonal(next_influence_matrix, np.diag(influence_matrix))
            next_influence_matrix = utils.make_matrix_row_stochastic(
                next_influence_matrix)
        elif mode == 2:
            # Mode 2: set each diagonal entry to 1 minus its row's off-diagonal sum.
            np.fill_diagonal(next_influence_matrix,
                             1 - np.sum(next_influence_matrix, axis=1))
        else:
            raise ValueError('Unknown mode: {}.'.format(mode))
        y_validation_or_test_predicted.append(next_influence_matrix)
    validation_or_test_error = compute_error(
        y_train_or_validation_or_test_true=y_validation_or_test,
        y_train_or_validation_or_test_predicted=y_validation_or_test_predicted,
        estimation_name=estimation_name,
        error_type_str=error_type_str)
    # This model has no training phase, so -1 fills the train-error slot.
    return -1, validation_or_test_error
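
A tiny worked instance of the update above (a hedged sketch in plain NumPy, independent of the repo's utils): each off-diagonal entry becomes the sum over intermediaries k of w_ik * w_kj; mode 1 then keeps the previous diagonal and renormalizes the rows.

import numpy as np

w = np.array([[0.6, 0.2, 0.2],
              [0.3, 0.4, 0.3],
              [0.1, 0.1, 0.8]])
nxt = np.zeros_like(w)
n = w.shape[0]
for i in range(n):
    for j in range(n):
        if i != j:
            # Sum of two-step influence paths i -> k -> j.
            nxt[i, j] = sum(w[i, k] * w[k, j] for k in set(range(n)) - {i, j})
np.fill_diagonal(nxt, np.diag(w))       # mode 1: keep the previous diagonal
nxt /= nxt.sum(axis=1, keepdims=True)   # renormalize rows to sum to 1
print(nxt.round(2))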
 def test_make_matrix_row_stochastic_when_already_row_stochastic(self):
     matrix = np.array([[0.11, 0.26, 0.34, 0.29], [0.26, 0.21, 0.25, 0.28],
                        [0.05, 0.05, 0.85, 0.05], [0.25, 0.25, 0.25, 0.25]])
     expected = matrix
     computed = utils.make_matrix_row_stochastic(matrix)
     np_testing.assert_array_almost_equal(expected, computed, decimal=2)
 def test_make_matrix_row_stochastic_when_all_zeros(self):
     matrix = np.zeros((4, 4))
     expected = np.ones((4, 4)) * 0.25
     computed = utils.make_matrix_row_stochastic(matrix)
     np_testing.assert_array_almost_equal(expected, computed, decimal=2)
 def test_make_matrix_row_stochastic(self):
     matrix = np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])
     expected = np.array([[0, 0.33, 0.67], [0.25, 0.33, 0.42],
                          [0.29, 0.33, 0.38]])
     computed = utils.make_matrix_row_stochastic(matrix)
     np_testing.assert_array_almost_equal(expected, computed, decimal=2)
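
The tests above pin down the contract of utils.make_matrix_row_stochastic. A minimal sketch consistent with them (an assumption about its behavior, not the repo's actual implementation):

import numpy as np

def make_matrix_row_stochastic(matrix):
    """Divide each row by its sum; all-zero rows become uniform."""
    matrix = np.asarray(matrix, dtype=float)
    row_sums = matrix.sum(axis=1, keepdims=True)
    uniform = np.full_like(matrix, 1.0 / matrix.shape[1])
    safe_sums = np.where(row_sums == 0, 1.0, row_sums)  # avoid dividing by zero
    return np.where(row_sums == 0, uniform, matrix / safe_sums)

print(make_matrix_row_stochastic(np.array([[0, 1, 2], [3, 4, 5], [6, 7, 8]])).round(2))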
    def generate_dataset(self):
        X = []
        y = []
        for team_id, team_log in self.data.items():
            if team_id in self.networks:
                print("In generate_dataset: processing team", team_id, '...')

                # First influence matrix:
                first_index = 0
                while first_index < len(self.networks[team_id]):
                    influence_matrix = np.matrix(
                        team_log.member_influences[first_index])
                    # Keep only matrices whose 16 entries all came from members.
                    if (self.skip_matrices_not_completely_from_members
                            and np.sum(team_log.member_influences_from_data[
                                first_index]) != 16):
                        print('E1: Index: {} was skipped.'.format(first_index))
                        first_index += 1
                        continue
                    normalized_influence_matrix = utils.shuffle_matrix_in_given_order(
                        matrix=influence_matrix,
                        order=np.argsort(team_log.members)) / 100
                    first_row_stochastic_normalized_influence_matrix = np.matrix(
                        utils.make_matrix_row_stochastic(
                            normalized_influence_matrix))
                    previous_row_stochastic_normalized_influence_matrix = (
                        first_row_stochastic_normalized_influence_matrix.copy())
                    break

                if first_index == len(self.networks[team_id]):
                    # All matrices for this team were skipped; nothing usable.
                    continue

                # Average of previous influence matrices:
                previous_influence_matrices_cnt = 1
                average_of_previous_influence_matrices = (
                    first_row_stochastic_normalized_influence_matrix.copy())
                for index in range(first_index + 1,
                                   len(self.networks[team_id])):
                    influence_matrix = np.matrix(
                        team_log.member_influences[index])
                    # Keep only matrices fully provided by members (see above).
                    if (self.skip_matrices_not_completely_from_members
                            and np.sum(team_log.member_influences_from_data[
                                index]) != 16):
                        print('E2: Index: {} was skipped.'.format(index))
                        continue

                    # Individual performance:
                    individual_performance = np.zeros(4)
                    individual_performance_hardness_weighted = np.zeros(4)
                    perf_rates = self.individual_performance_rates[team_id][
                        index]
                    for i, member in enumerate(sorted(team_log.members)):
                        individual_performance[i] = perf_rates[member][
                            'correct_rate_so_far']
                        individual_performance_hardness_weighted[
                            i] = perf_rates[member][
                                'hardness_weighted_correct_rate_so_far']

                    # Networks:
                    network = self.networks[team_id][index]

                    # Contents:
                    contents_embedding = self.contents_embeddings[team_id][
                        index]

                    # Average of previous influence matrices:
                    normalized_influence_matrix = utils.shuffle_matrix_in_given_order(
                        matrix=influence_matrix,
                        order=np.argsort(team_log.members)) / 100
                    row_stochastic_normalized_influence_matrix = np.matrix(
                        utils.make_matrix_row_stochastic(
                            normalized_influence_matrix))

                    # Multi-class label: the most influential individual(s).
                    most_influentials = utils.most_influential_on_others(
                        influence_matrix=row_stochastic_normalized_influence_matrix,
                        remove_self_influence=True)

                    # Combining all features together:
                    y.append({
                        'influence_matrix':
                        row_stochastic_normalized_influence_matrix,
                        'most_influentials': most_influentials
                    })
                    X.append({
                        'individual_performance':
                        individual_performance,
                        'individual_performance_hardness_weighted':
                        individual_performance_hardness_weighted,
                        'content_embedding_matrix':
                        contents_embedding,
                        'first_influence_matrix':
                        first_row_stochastic_normalized_influence_matrix,
                        'previous_influence_matrix':
                        previous_row_stochastic_normalized_influence_matrix,
                        'average_of_previous_influence_matrices':
                        average_of_previous_influence_matrices /
                        previous_influence_matrices_cnt,
                        'reply_duration':
                        nx.adjacency_matrix(network['reply_duration']).todense(),
                        'sentiment':
                        nx.adjacency_matrix(network['sentiment']).todense(),
                        'emotion_arousal':
                        nx.adjacency_matrix(network['emotion_arousal']).todense(),
                        'emotion_dominance':
                        nx.adjacency_matrix(network['emotion_dominance']).todense(),
                        'emotion_valence':
                        nx.adjacency_matrix(network['emotion_valence']).todense()
                    })
                    previous_row_stochastic_normalized_influence_matrix = (
                        row_stochastic_normalized_influence_matrix.copy())
                    average_of_previous_influence_matrices += row_stochastic_normalized_influence_matrix
                    previous_influence_matrices_cnt += 1

        self.supervised_data = {'X': X, 'y': y}
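
For orientation, a sketch of one supervised sample that generate_dataset emits (shapes inferred from the code above; the embedding dimension and the most_influentials format are guesses, and all values here are placeholders):

import numpy as np

sample_X = {
    'individual_performance': np.zeros(4),
    'individual_performance_hardness_weighted': np.zeros(4),
    'content_embedding_matrix': np.zeros((4, 300)),  # embedding dim: a guess
    'first_influence_matrix': np.full((4, 4), 0.25),
    'previous_influence_matrix': np.full((4, 4), 0.25),
    'average_of_previous_influence_matrices': np.full((4, 4), 0.25),
    'reply_duration': np.zeros((4, 4)),
    'sentiment': np.zeros((4, 4)),
    'emotion_arousal': np.zeros((4, 4)),
    'emotion_dominance': np.zeros((4, 4)),
    'emotion_valence': np.zeros((4, 4)),
}
sample_y = {
    'influence_matrix': np.full((4, 4), 0.25),
    'most_influentials': [0],  # format: a guess
}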
Example #8
def model_builder(X_train,
                  y_train,
                  X_test,
                  y_test,
                  feature_names,
                  estimation_name='influence_matrix',
                  error_type_str='normalized_frob_norm',
                  tune_hyperparameters_by_validation=True,
                  with_replication=True,
                  lambdas=[0, 0.1, 1, 10, 100, 1000],
                  model_func='average',
                  params={
                      'with_constraints': True,
                      'n_splits': 3,
                      'best_lambda': 0.1
                  }):

    # For the baseline models.
    if model_func == 'average':
        mats = []
        for i in range(len(y_train)):
            mats.append(y_train[i][estimation_name])
        y_baseline_predicted = [
            np.matrix(np.mean(mats, axis=0)) for _ in range(len(y_train))
        ]
    elif model_func == 'uniform':
        y_baseline_predicted = [
            np.matrix(np.ones((4, 4)) * 0.25) for _ in range(len(y_train))
        ]
    elif model_func == 'random':
        y_baseline_predicted = [
            np.matrix(utils.make_matrix_row_stochastic(np.random.rand(4, 4)))
            for _ in range(len(y_train))
        ]
    if model_func in ['average', 'uniform', 'random']:
        train_error = compute_error(y_train,
                                    y_baseline_predicted,
                                    estimation_name=estimation_name,
                                    error_type_str=error_type_str)
        test_error = compute_error(y_test,
                                   y_baseline_predicted,
                                   estimation_name=estimation_name,
                                   error_type_str=error_type_str)
        return train_error, test_error, None

    # For the proposed models.
    validation_errors = defaultdict(lambda: 0)
    if tune_hyperparameters_by_validation:
        print('{}-fold validation ...'.format(params['n_splits']))
        kf = KFold(n_splits=params['n_splits'])
        for train_index, validation_index in kf.split(X_train):
            X_train_subset, X_validation = X_train[train_index], X_train[
                validation_index]
            y_train_subset, y_validation = y_train[train_index], y_train[
                validation_index]
            if with_replication:
                print('Replicating ...')
                X_train_subset, y_train_subset = utils.replicate_matrices_in_train_dataset_with_reordering(
                    X_train_subset, y_train_subset)
                X_train_subset = np.array(X_train_subset)
                y_train_subset = np.array(y_train_subset)
            print('Shapes of train: {}, validation: {}, test: {}.'.format(
                X_train_subset.shape, X_validation.shape, X_test.shape))
            for lambdaa in lambdas:
                validation_errors[lambdaa] += model_func(
                    X_train=X_train_subset,
                    y_train=y_train_subset,
                    X_validation_or_test=X_validation,
                    y_validation_or_test=y_validation,
                    feature_names=feature_names,
                    estimation_name=estimation_name,
                    lambdaa=lambdaa,
                    error_type_str=error_type_str,
                    params=params)[1]
        best_lambda = min(validation_errors, key=validation_errors.get)
    else:
        best_lambda = params['best_lambda']
    print('Training with the best lambda: {} on entire training set...'.format(
        best_lambda))
    if with_replication:
        print('Replicating ...')
        X_train, y_train = utils.replicate_matrices_in_train_dataset_with_reordering(
            X_train, y_train)
        X_train = np.array(X_train)
        y_train = np.array(y_train)
    train_error, test_error = model_func(X_train=X_train,
                                         y_train=y_train,
                                         X_validation_or_test=X_test,
                                         y_validation_or_test=y_test,
                                         feature_names=feature_names,
                                         estimation_name=estimation_name,
                                         lambdaa=best_lambda,
                                         error_type_str=error_type_str,
                                         params=params)
    return train_error, test_error, validation_errors
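
A self-contained toy run of the three baselines above (a hedged sketch: the training labels are fabricated here, and a plain Frobenius norm stands in for the repo's compute_error):

import numpy as np

rng = np.random.default_rng(0)

def _row_stochastic(m):
    return m / m.sum(axis=1, keepdims=True)

# Hypothetical training labels: five 4 x 4 row-stochastic influence matrices.
mats = [_row_stochastic(rng.random((4, 4))) for _ in range(5)]

predictions = {
    'average': np.mean(mats, axis=0),              # mean of the training labels
    'uniform': np.full((4, 4), 0.25),              # maximally uninformative
    'random': _row_stochastic(rng.random((4, 4))),
}
for name, pred in predictions.items():
    error = np.mean([np.linalg.norm(m - pred) for m in mats])
    print(name, round(float(error), 3))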
Example #9
def concatinated_deep_neural_network_model_func(X_train,
                                                y_train,
                                                X_validation_or_test,
                                                y_validation_or_test,
                                                feature_names,
                                                estimation_name,
                                                lambdaa,
                                                error_type_str,
                                                params={
                                                    'n_epochs': 10,
                                                    'batch_size': 32
                                                }):

    # Flatten each training sample: 1-D features are used as-is; matrix-valued
    # features come from np.matrix, so flatten() keeps a 1 x k shape and the
    # trailing [0] extracts the row as a 1-D array.
    flatten_X_train = []
    flatten_y_train = []
    for i in range(len(X_train)):
        features = X_train[i]
        label = y_train[i][estimation_name]
        feat_list = []
        for feature_name in feature_names:
            if len(features[feature_name].shape) == 1:
                feat_list.append(features[feature_name])
            else:
                feat_list.append(np.array(features[feature_name].flatten())[0])
        flatten_X_train.append(np.hstack(feat_list))
        flatten_y_train.append(np.array(label.flatten())[0])
    flatten_X_train = np.array(flatten_X_train)
    flatten_y_train = np.array(flatten_y_train)

    # The same flattening, applied to the validation/test split.
    flatten_X_validation_or_test = []
    flatten_y_validation_or_test = []
    for i in range(len(X_validation_or_test)):
        features = X_validation_or_test[i]
        label = y_validation_or_test[i][estimation_name]
        feat_list = []
        for feature_name in feature_names:
            if len(features[feature_name].shape) == 1:
                feat_list.append(features[feature_name])
            else:
                feat_list.append(np.array(features[feature_name].flatten())[0])
        flatten_X_validation_or_test.append(np.hstack(feat_list))
        flatten_y_validation_or_test.append(np.array(label.flatten())[0])
    flatten_X_validation_or_test = np.array(flatten_X_validation_or_test)
    flatten_y_validation_or_test = np.array(flatten_y_validation_or_test)

    _, input_size = flatten_X_train.shape
    print('Input size for the neural network: {}'.format(input_size))

    model = Sequential([
        Dense(units=32,
              kernel_initializer='he_normal',
              activation='elu',
              input_shape=(input_size, ),
              kernel_regularizer=regularizers.l1(lambdaa),
              activity_regularizer=regularizers.l1(lambdaa)),
        Dropout(0.5),
        Dense(units=64,
              kernel_initializer='he_normal',
              activation='elu',
              kernel_regularizer=regularizers.l1(lambdaa),
              activity_regularizer=regularizers.l1(lambdaa)),
        Dropout(0.5),
        Dense(units=32,
              kernel_initializer='he_normal',
              activation='elu',
              kernel_regularizer=regularizers.l1(lambdaa),
              activity_regularizer=regularizers.l1(lambdaa)),
        Dropout(0.5),
        Dense(16, kernel_initializer='glorot_uniform', activation='sigmoid')
    ])
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(flatten_X_train,
              flatten_y_train,
              epochs=params['n_epochs'],
              batch_size=params['batch_size'])

    # Predicting and computing train error.
    y_train_predicted = [
        utils.make_matrix_row_stochastic(np.matrix(np.reshape(element, (4, 4))))
        for element in model.predict(flatten_X_train)
    ]
    train_error = compute_error(
        y_train_or_validation_or_test_true=y_train,
        y_train_or_validation_or_test_predicted=y_train_predicted,
        estimation_name=estimation_name,
        error_type_str=error_type_str)

    # Predicting and computing validation/test error.
    y_validation_or_test_predicted = [
        utils.make_matrix_row_stochastic(np.matrix(np.reshape(element, (4, 4))))
        for element in model.predict(flatten_X_validation_or_test)
    ]
    validation_or_test_error = compute_error(
        y_train_or_validation_or_test_true=y_validation_or_test,
        y_train_or_validation_or_test_predicted=y_validation_or_test_predicted,
        estimation_name=estimation_name,
        error_type_str=error_type_str)

    return train_error, validation_or_test_error