Ejemplo n.º 1
0
 def construct_measured_vector(self, train_subset):
     """
     Build a single-row matrix of measured values for the rows of a training subset.

     Each row of ``train_subset`` is converted to its string form and used as the
     key into ``self.sequence_matrix.seq_to_measurement``.

     :param train_subset: 2-D array/matrix whose rows are looked up by ``str(row)``
     :return: 1 x num_rows(train_subset) matrix of floats; column ``i`` holds the
              measurement looked up for row ``i``
     """
     num_seqs = num_rows(train_subset)
     # builtin float replaces the np.float alias, which was removed in NumPy 1.24
     measured_vals = np.matlib.zeros((1, num_seqs), dtype=float)
     for ctr in range(num_seqs):
         # the string form of the row is the lookup key
         # NOTE(review): presumably a dict, so an unknown row raises KeyError - confirm
         seq = str(train_subset[ctr, :])
         measured_vals[0, ctr] = self.sequence_matrix.seq_to_measurement[seq]
     return measured_vals
Ejemplo n.º 2
0
    def update_dictionaries(self):
        """
        Rebuild the sequence-keyed lookup tables from the current training data.

        Call this whenever the training data changes (features or examples added/removed),
        because the keys are derived from the contents of each training row.

        # keys are the string form of each training row, so sequences must be unique in the
        # training data; a repeated row overwrites the entry stored for the earlier one
        # a dictionary is the simplest way to keep sequences associated with their measured
        # values and inequalities across shuffling and CV splits

        :return:
        """
        # drop every stale key before repopulating
        self.seq_to_measurement.clear()
        self.seq_to_inequality.clear()

        total_rows = num_rows(self.training_data)
        for row_index in range(total_rows):
            row_key = str(self.training_data[row_index, :])
            inequality = self.inequalities[row_index]
            self.seq_to_inequality[row_key] = inequality
            # measured values are read from column 0 of the matching row
            measurement = self.measured_values[row_index, 0]
            self.seq_to_measurement[row_key] = measurement
Ejemplo n.º 3
0
    def distance(self, log_lambdas):
        """
        Compute the cross-validation distance averaged over all blind examples.

        For every fold, ``solve_x`` is run on that fold's training data and the value
        returned by ``x_distance`` for the fold's blind data is accumulated. The sum is
        divided by the total number of blind rows across all folds.

        :param log_lambdas: passed through unchanged to ``solve_x``
        :return: accumulated distance divided by the total blind row count
        """
        print('calculating distance')
        total_distance = 0.0
        blind_examples = 0
        for fold_num in range(self.num_cv_folds):
            train, blind = self.cv_train_data[fold_num], self.cv_blind_data[fold_num]
            # progress message is only printed for the first fold
            if fold_num == 0:
                print('starting solve_x for fold ', fold_num)
            self.solve_x(train, log_lambdas, fold_num)
            # x_distance returns a 2-D result; its [0, 0] entry is the fold's distance
            fold_distance = self.x_distance(blind, fold_num)[0, 0]
            total_distance = total_distance + fold_distance
            # each blind row counts as one example in the average
            blind_examples = blind_examples + num_rows(blind)

        mean_distance = total_distance / float(blind_examples)
        print('dist: ' + str(mean_distance) + '\n')
        print('\n')
        sys.stdout.flush()
        return mean_distance
Ejemplo n.º 4
0
    def add_pair_coeffs(self, pairs):
        """
        Append one binary pair-feature column per pair to the training data.

        For every training row, the column for a pair is set to 1.0 when the row has a 1
        at both feature indices of that pair, and left at 0.0 otherwise. The new columns
        are concatenated to the right of ``self.training_data`` and the sequence lookup
        dictionaries are rebuilt, since the rows have changed.

        :param pairs: sequence of pairs; ``pair[0]`` and ``pair[1]`` are column indices
                      into a training row
        :return: None (``self.training_data`` is replaced by the widened matrix)
        """
        num_pairs = len(pairs)
        # extend columns for pair coeff binary features
        # builtin float replaces the np.float alias, which was removed in NumPy 1.24
        coeff_feature_matrix = np.matlib.zeros((num_rows(self.training_data), num_pairs), dtype=float)

        # each seq is indexed as a 2-D single row, hence the leading 0 index
        for seq_index, seq in enumerate(self.training_data):
            for offset, pair in enumerate(pairs):
                if seq[0, pair[0]] == 1 and seq[0, pair[1]] == 1:
                    coeff_feature_matrix[seq_index, offset] = 1.0

        self.training_data = np.concatenate((self.training_data, coeff_feature_matrix), axis=1)
        # row contents changed, so the string keys of the lookup dictionaries are stale
        self.update_dictionaries()
Ejemplo n.º 5
0
    def create_pair_training_matrix(self, pairs):
        """
        Build a feature matrix holding a bias column followed by one column per pair.

        Column 0 is the bias and is 1.0 for every training row, so the 0th pair is stored
        in column 1. A pair's column is 1.0 for a row when the row has a 1 at both feature
        indices of the pair, and 0.0 otherwise. ``self.training_data`` is not modified.

        :param pairs: sequence of pairs; ``pair[0]`` and ``pair[1]`` are column indices
                      into a training row
        :return: num_rows(self.training_data) x (1 + len(pairs)) matrix of floats
        """
        num_pairs = len(pairs)
        # builtin float replaces the np.float alias, which was removed in NumPy 1.24
        coeff_feature_matrix = np.matlib.zeros((num_rows(self.training_data), 1 + num_pairs), dtype=float)

        # can't use np.ndenumerate() because that is by element, not row
        for seq_index, seq in enumerate(self.training_data):
            coeff_feature_matrix[seq_index, 0] = 1.0  # bias column
            # pair is tuple of index locations of binary pair feature
            for offset, pair in enumerate(pairs):
                if seq[0, pair[0]] == 1 and seq[0, pair[1]] == 1:  # does seq contain the pair
                    # plus 1 to account for bias being first column
                    coeff_feature_matrix[seq_index, offset + 1] = 1.0

        return coeff_feature_matrix