import sys

import numpy as np
import scipy.io as sp  # sp.loadmat below assumes sp is scipy.io
from scipy.special import expit

# Bidirectional_RNNLM_Weight and Vector_Math are assumed to be defined elsewhere
# in this project: the weight container and the matrix-math mixin used below.

class Bidirectional_Recurrent_Neural_Network_Language_Model(Vector_Math, object):
    """features are stored in format max_seq_len x nseq x nvis where n_max_obs is the maximum number of observations per sequence
    and nseq is the number of sequences
    weights are stored as nvis x nhid at feature level
    biases are stored as 1 x nhid
    rbm_type is either rbm_gaussian_bernoulli, rbm_bernoulli_bernoulli, logistic"""

    def __init__(self, config_dictionary):  # completed
        """variables for Neural Network: feature_file_name(read from)
        required_variables - required variables for running system
        all_variables - all valid variables for each type"""
        self.feature_file_name = self.default_variable_define(config_dictionary, "feature_file_name", arg_type="string")
        self.features, self.feature_sequence_lens = self.read_feature_file()
        self.model = Bidirectional_RNNLM_Weight()
        self.output_name = self.default_variable_define(config_dictionary, "output_name", arg_type="string")

        self.required_variables = dict()
        self.all_variables = dict()
        self.required_variables["train"] = ["mode", "feature_file_name", "output_name"]
        self.all_variables["train"] = self.required_variables["train"] + [
            "label_file_name",
            "num_hiddens",
            "weight_matrix_name",
            "initial_weight_max",
            "initial_weight_min",
            "initial_bias_max",
            "initial_bias_min",
            "save_each_epoch",
            "do_pretrain",
            "pretrain_method",
            "pretrain_iterations",
            "pretrain_learning_rate",
            "pretrain_batch_size",
            "do_backprop",
            "backprop_method",
            "backprop_batch_size",
            "l2_regularization_const",
            "num_epochs",
            "num_line_searches",
            "armijo_const",
            "wolfe_const",
            "steepest_learning_rate",
            "momentum_rate",
            "conjugate_max_iterations",
            "conjugate_const_type",
            "truncated_newton_num_cg_epochs",
            "truncated_newton_init_damping_factor",
            "krylov_num_directions",
            "krylov_num_batch_splits",
            "krylov_num_bfgs_epochs",
            "second_order_matrix",
            "krylov_use_hessian_preconditioner",
            "krylov_eigenvalue_floor_const",
            "fisher_preconditioner_floor_val",
            "use_fisher_preconditioner",
            "structural_damping_const",
            "validation_feature_file_name",
            "validation_label_file_name",
        ]
        self.required_variables["test"] = ["mode", "feature_file_name", "weight_matrix_name", "output_name"]
        self.all_variables["test"] = self.required_variables["test"] + ["label_file_name"]

    def dump_config_vals(self):
        no_attr_key = list()
        print "********************************************************************************"
        print "Neural Network configuration is as follows:"

        for key in self.all_variables[self.mode]:
            if hasattr(self, key):
                print key, "=", eval("self." + key)
            else:
                no_attr_key.append(key)

        print "********************************************************************************"
        print "Undefined keys are as follows:"
        for key in no_attr_key:
            print key, "not set"
        print "********************************************************************************"

    def default_variable_define(
        self,
        config_dictionary,
        config_key,
        arg_type="string",
        default_value=None,
        error_string=None,
        exit_if_no_default=True,
        acceptable_values=None,
    ):
        # arg_type is either int, float, string, int_comma_string, float_comma_string, boolean
        try:
            if arg_type == "int_comma_string":
                return self.read_config_comma_string(config_dictionary[config_key], needs_int=True)
            elif arg_type == "float_comma_string":
                return self.read_config_comma_string(config_dictionary[config_key], needs_int=False)
            elif arg_type == "int":
                return int(config_dictionary[config_key])
            elif arg_type == "float":
                return float(config_dictionary[config_key])
            elif arg_type == "string":
                return config_dictionary[config_key]
            elif arg_type == "boolean":
                if (
                    config_dictionary[config_key] == "False"
                    or config_dictionary[config_key] == "0"
                    or config_dictionary[config_key] == "F"
                ):
                    return False
                elif (
                    config_dictionary[config_key] == "True"
                    or config_dictionary[config_key] == "1"
                    or config_dictionary[config_key] == "T"
                ):
                    return True
                else:
                    print config_dictionary[
                        config_key
                    ], "is not valid for boolean type... Acceptable values are True, False, 1, 0, T, or F... Exiting now"
                    sys.exit()
            else:
                print arg_type, "is not a valid type, arg_type can be either int, float, string, int_comma_string, float_comma_string... exiting now"
                sys.exit()
        except KeyError:
            if error_string != None:
                print error_string
            else:
                print "No", config_key, "defined,",
            if default_value == None and exit_if_no_default:
                print "since", config_key, "must be defined... exiting now"
                sys.exit()
            else:
                if acceptable_values != None and (default_value not in acceptable_values):
                    print default_value, "is not an acceptable input, acceptable inputs are", acceptable_values, "... Exiting now"
                    sys.exit()
                if error_string == None:
                    print "setting", config_key, "to", default_value
                return default_value
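
    # A sketch of how the arg_type coercion behaves (values are illustrative):
    #
    #   self.default_variable_define({"num_epochs": "10"}, "num_epochs", arg_type="int")
    #   # -> 10
    #   self.default_variable_define({"do_pretrain": "T"}, "do_pretrain", arg_type="boolean")
    #   # -> True
    #   self.default_variable_define({}, "momentum_rate", arg_type="float", default_value=0.0)
    #   # prints "No momentum_rate defined, setting momentum_rate to 0.0" and returns 0.0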

    def read_feature_file(self, feature_file_name=None):  # completed
        if feature_file_name is None:
            feature_file_name = self.feature_file_name
        try:
            feature_data = sp.loadmat(feature_file_name)
            features = feature_data["features"].astype(np.int32)
            sequence_len = feature_data["feature_sequence_lengths"]
            sequence_len = np.reshape(sequence_len, (sequence_len.size,))
            return features, sequence_len  # in MATLAB format
        except IOError:
            print "Unable to open ", feature_file_name, "... Exiting now"
            sys.exit()
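
    # The feature .mat file is expected to contain a "features" matrix of integer
    # word ids in the max_seq_len x nseq layout described in the class docstring,
    # plus a "feature_sequence_lengths" vector with one entry per sequence.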

    def read_label_file(self, label_file_name=None):  # completed
        """label file is a two-column file in the form
        sent_id label_1
        sent_id label_2
        ...
        """
        if label_file_name is None:
            label_file_name = self.label_file_name
        try:
            label_data = sp.loadmat(label_file_name)["labels"].astype(np.int32)
            return label_data  # [:,1], label_data[:,0]#in MATLAB format
        except IOError:
            print "Unable to open ", label_file_name, "... Exiting now"
            sys.exit()

    def batch_size(self, feature_sequence_lens):
        return np.sum(feature_sequence_lens)

    def read_config_comma_string(self, input_string, needs_int=False):
        output_list = []
        for elem in input_string.split(","):
            if "*" in elem:
                elem_list = elem.split("*")
                if needs_int:
                    output_list.extend([int(elem_list[1])] * int(elem_list[0]))
                else:
                    output_list.extend([float(elem_list[1])] * int(elem_list[0]))
            else:
                if needs_int:
                    output_list.append(int(elem))
                else:
                    output_list.append(float(elem))
        return output_list
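
    # The comma string supports a "count*value" shorthand (illustrative):
    #
    #   self.read_config_comma_string("3*0.1,0.01")             # -> [0.1, 0.1, 0.1, 0.01]
    #   self.read_config_comma_string("2*5,7", needs_int=True)  # -> [5, 5, 7]
    #
    # which is convenient for per-epoch settings such as steepest_learning_rate.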

    def levenshtein_string_edit_distance(self, string1, string2):  # completed
        dist = dict()
        string1_len = len(string1)
        string2_len = len(string2)

        for idx in range(-1, string1_len + 1):
            dist[(idx, -1)] = idx + 1
        for idx in range(-1, string2_len + 1):
            dist[(-1, idx)] = idx + 1

        for idx1 in range(string1_len):
            for idx2 in range(string2_len):
                if string1[idx1] == string2[idx2]:
                    cost = 0
                else:
                    cost = 1
                dist[(idx1, idx2)] = min(
                    dist[(idx1 - 1, idx2)] + 1,  # deletion
                    dist[(idx1, idx2 - 1)] + 1,  # insertion
                    dist[(idx1 - 1, idx2 - 1)] + cost,  # substitution
                )
                if idx1 and idx2 and string1[idx1] == string2[idx2 - 1] and string1[idx1 - 1] == string2[idx2]:
                    dist[(idx1, idx2)] = min(dist[(idx1, idx2)], dist[idx1 - 2, idx2 - 2] + cost)  # transposition
        return dist[(string1_len - 1, string2_len - 1)]
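
    # Despite the name, this is the Damerau-Levenshtein distance: the final check in
    # the loop also counts an adjacent transposition as a single edit. For example:
    #
    #   self.levenshtein_string_edit_distance("num_hidens", "num_hiddens")   # -> 1 (insertion)
    #   self.levenshtein_string_edit_distance("num_hiddesn", "num_hiddens")  # -> 1 (transposition)
    #
    # check_keys() uses it to suggest the closest valid config key for a typo.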

    def check_keys(self, config_dictionary):  # completed
        print "Checking config keys...",
        exit_flag = False

        config_dictionary_keys = config_dictionary.keys()

        if self.mode == "train":
            correct_mode = "train"
            incorrect_mode = "test"
        elif self.mode == "test":
            correct_mode = "test"
            incorrect_mode = "train"

        for req_var in self.required_variables[correct_mode]:
            if req_var not in config_dictionary_keys:
                print req_var, "needs to be set for", correct_mode, "but is not."
                if exit_flag == False:
                    print "Because of above error, will exit after checking rest of keys"
                    exit_flag = True

        for var in config_dictionary_keys:
            if var not in self.all_variables[correct_mode]:
                print var, "in the config file given is not a valid key for", correct_mode
                if var in self.all_variables[incorrect_mode]:
                    print "but", var, "is a valid key for", incorrect_mode, "so either the mode or key is incorrect"
                else:
                    string_distances = np.array(
                        [
                            self.levenshtein_string_edit_distance(var, string2)
                            for string2 in self.all_variables[correct_mode]
                        ]
                    )
                    print "perhaps you meant ***", self.all_variables[correct_mode][
                        np.argmin(string_distances)
                    ], "\b*** (levenshtein string edit distance", np.min(
                        string_distances
                    ), "\b) instead of ***", var, "\b***?"
                if not exit_flag:
                    print "Because of above error, will exit after checking rest of keys"
                    exit_flag = True

        if exit_flag:
            print "Exiting now"
            sys.exit()
        else:
            print "seems copacetic"

    def check_labels(self):  # want to prune non-contiguous labels, might be expensive
        # TODO: check sentids to make sure seqences are good
        print "Checking labels..."
        if len(self.labels.shape) != 2:
            print "labels need to be in (n_samples,2) format and the shape of labels is ", self.labels.shape, "... Exiting now"
            sys.exit()
        if self.labels.shape[0] != sum(self.feature_sequence_lens):
            print "Number of examples in feature file: ", sum(self.feature_sequence_lens), " does not equal number of labels in label file, ", self.labels.shape[0], "... Exiting now"
            sys.exit()
        #        if [i for i in np.unique(self.labels)] != range(np.max(self.labels)+1):
        #            print "Labels need to be in the form 0,1,2,....,n,... Exiting now"
        #            sys.exit()
        #        label_counts = np.bincount(np.ravel(self.labels[:,1])) #[self.labels.count(x) for x in range(np.max(self.labels)+1)]
        #        print "distribution of labels is:"
        #        for x in range(len(label_counts)):
        #            print "#", x, "\b's:", label_counts[x]
        print "labels seem copacetic"

    def forward_layer(
        self, inputs, weights, biases, weight_type, secondary_inputs=None, secondary_weights=None
    ):  # completed
        #        raise ValueError("forward_layer() not implemented yet")
        if weight_type == "logistic":
            return self.softmax(
                self.weight_matrix_multiply(inputs, weights, biases) + np.dot(secondary_inputs, secondary_weights)
            )
        elif weight_type == "rbm_gaussian_bernoulli" or weight_type == "rbm_bernoulli_bernoulli":
            return self.sigmoid(
                weights[(inputs), :] + self.weight_matrix_multiply(secondary_inputs, secondary_weights, biases)
            )
        # added to test finite differences calculation for pearlmutter forward pass
        elif weight_type == "linear":  # only used for the logistic layer
            return self.weight_matrix_multiply(inputs, weights, biases) + np.dot(secondary_inputs, secondary_weights)
        else:
            print "weight_type", weight_type, "is not a valid layer type.",
            print "Valid layer types are", self.model.valid_layer_types, "Exiting now..."
            sys.exit()

    def forward_pass_single_batch(self, inputs, model=None, return_hiddens=False, linear_output=False):
        """forward pass for single batch size. Mainly for speed in this case
        """
        if model == None:
            model = self.model
        num_observations = inputs.size
        hiddens_forward = model.weights["visible_hidden"][(inputs), :]
        hiddens_forward[:1, :] += self.weight_matrix_multiply(
            model.init_hiddens["forward"], model.weights["hidden_hidden_forward"], model.bias["hidden_forward"]
        )
        expit(hiddens_forward[0, :], hiddens_forward[0, :])

        hiddens_backward = model.weights["visible_hidden"][(inputs), :]
        hiddens_backward[-1:, :] += self.weight_matrix_multiply(
            model.init_hiddens["backward"], model.weights["hidden_hidden_backward"], model.bias["hidden_backward"]
        )
        expit(hiddens_backward[-1, :], hiddens_backward[-1, :])

        for time_step in range(1, num_observations):
            hiddens_forward[time_step : time_step + 1, :] += self.weight_matrix_multiply(
                hiddens_forward[time_step - 1 : time_step, :],
                model.weights["hidden_hidden_forward"],
                model.bias["hidden_forward"],
            )
            expit(hiddens_forward[time_step, :], hiddens_forward[time_step, :])  # sigmoid

            hiddens_backward[
                num_observations - time_step - 1 : num_observations - time_step, :
            ] += self.weight_matrix_multiply(
                hiddens_backward[num_observations - time_step : num_observations - time_step + 1, :],
                model.weights["hidden_hidden_backward"],
                model.bias["hidden_backward"],
            )
            expit(
                hiddens_backward[num_observations - time_step - 1, :],
                hiddens_backward[num_observations - time_step - 1, :],
            )  # sigmoid

        outputs = self.forward_layer(
            hiddens_forward,
            model.weights["hidden_output_forward"],
            model.bias["output"],
            model.weight_type["hidden_output"],
            hiddens_backward,
            model.weights["hidden_output_backward"],
        )

        if return_hiddens:
            return outputs, hiddens_forward, hiddens_backward
        else:
            del hiddens_forward, hiddens_backward
            return outputs
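
    # In the loop above, for a sequence of word ids w_1..w_T, with E the
    # "visible_hidden" embedding rows and sigma the logistic sigmoid, the two chains are
    #
    #   h_fwd[t] = sigma(E[w_t] + h_fwd[t-1] . W_hh_fwd + b_fwd)    running t = 1..T
    #   h_bwd[t] = sigma(E[w_t] + h_bwd[t+1] . W_hh_bwd + b_bwd)    running t = T..1
    #
    # and forward_layer() then combines both directions at the output:
    #
    #   y[t] = softmax(h_fwd[t] . W_out_fwd + h_bwd[t] . W_out_bwd + b_out)
    #
    # expit(x, out) is scipy's in-place logistic sigmoid, used here to avoid
    # allocating temporaries inside the time-step loop.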

    def forward_pass(
        self, inputs, feature_sequence_lens, model=None, return_hiddens=False, linear_output=False
    ):  # completed
        """forward pass each layer starting with feature level
        inputs in the form n_max_obs x n_seq x n_vis"""
        raise ValueError("forward_pass() not implemented yet")
        if model == None:
            model = self.model
        architecture = self.model.get_architecture()
        max_sequence_observations = inputs.shape[0]
        num_sequences = inputs.shape[1]
        num_hiddens = architecture[1]
        num_outs = architecture[2]
        hiddens_forward = np.zeros((max_sequence_observations, num_sequences, num_hiddens))
        hiddens_backward = np.zeros((max_sequence_observations, num_sequences, num_hiddens))
        outputs = np.zeros((max_sequence_observations, num_sequences, num_outs))
        # propagate hiddens
        hiddens_forward[0, :, :] = self.forward_layer(
            inputs[0, :],
            model.weights["visible_hidden"],
            model.bias["hidden"],
            model.weight_type["visible_hidden"],
            model.init_hiddens["forward"],
            model.weights["hidden_hidden_forward"],
        )

        hiddens_backward[0, :, :] = self.forward_layer(
            inputs[0, :],
            model.weights["visible_hidden"],
            model.bias["hidden"],
            model.weight_type["visible_hidden"],
            model.init_hiddens["backward"],
            model.weights["hidden_hidden_backward"],
        )
        if linear_output:
            outputs[0, :, :] = self.forward_layer(
                hiddens_forward[0, :, :], model.weights["hidden_output"], model.bias["output"], "linear"
            )
        else:
            outputs[0, :, :] = self.forward_layer(
                hiddens[0, :, :],
                model.weights["hidden_output"],
                model.bias["output"],
                model.weight_type["hidden_output"],
            )
        for sequence_index in range(1, max_sequence_observations):
            sequence_input = inputs[sequence_index, :]
            hiddens[sequence_index, :, :] = self.forward_layer(
                sequence_input,
                model.weights["visible_hidden"],
                model.bias["hidden"],
                model.weight_type["visible_hidden"],
                hiddens[sequence_index - 1, :, :],
                model.weights["hidden_hidden"],
            )
            if linear_output:
                outputs[sequence_index, :, :] = self.forward_layer(
                    hiddens[sequence_index, :, :], model.weights["hidden_output"], model.bias["output"], "linear"
                )
            else:
                outputs[sequence_index, :, :] = self.forward_layer(
                    hiddens[sequence_index, :, :],
                    model.weights["hidden_output"],
                    model.bias["output"],
                    model.weight_type["hidden_output"],
                )
            # find the observations where the sequence has ended,
            # and then zero out hiddens and outputs, so nothing horrible happens during backprop, etc.
            zero_input = np.where(feature_sequence_lens <= sequence_index)
            hiddens[sequence_index, zero_input, :] = 0.0
            outputs[sequence_index, zero_input, :] = 0.0
        if return_hiddens:
            return outputs, hiddens
        else:
            del hiddens
            return outputs

    def flatten_output(self, output, feature_sequence_lens=None):
        """outputs in the form of max_obs_seq x n_seq x n_outs  get converted to form
        n_data x n_outs, so we can calculate classification accuracy and cross-entropy
        """
        if feature_sequence_lens is None:
            feature_sequence_lens = self.feature_sequence_lens
        num_outs = output.shape[2]
        #        num_seq = output.shape[1]
        flat_output = np.zeros((self.batch_size(feature_sequence_lens), num_outs))
        cur_index = 0
        for seq_index, num_obs in enumerate(feature_sequence_lens):
            # slice assignment already copies, so copy.deepcopy is unnecessary here
            flat_output[cur_index : cur_index + num_obs, :] = output[:num_obs, seq_index, :]
            cur_index += num_obs

        return flat_output
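
    # Shape sketch (illustrative): with feature_sequence_lens = [3, 2] and output of
    # shape (3, 2, num_outs), the result has shape (5, num_outs) with rows ordered
    # [seq0_t0, seq0_t1, seq0_t2, seq1_t0, seq1_t1]; padding frames are dropped.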

    def calculate_log_perplexity(self, output, flat_labels):  # completed, expensive, should be compiled
        """calculates perplexity with flat labels
        """
        return -np.sum(
            np.log2(np.clip(output, a_min=1e-12, a_max=1.0))[np.arange(flat_labels.shape[0]), flat_labels[:, 1]]
        )

    def calculate_cross_entropy(self, output, flat_labels):  # completed, expensive, should be compiled
        """calculates perplexity with flat labels
        """
        return -np.sum(
            np.log(np.clip(output, a_min=1e-12, a_max=1.0))[np.arange(flat_labels.shape[0]), flat_labels[:, 1]]
        )
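
    # Both of the above return sums over tokens; a per-token perplexity follows as
    # (illustrative, mirroring the base-2 log used in calculate_log_perplexity):
    #
    #   log_ppl = self.calculate_log_perplexity(flat_output, flat_labels)
    #   perplexity = 2.0 ** (log_ppl / flat_labels.shape[0])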

    def calculate_classification_accuracy(self, flat_output, labels):  # completed, possibly expensive
        prediction = flat_output.argmax(axis=1).reshape(labels.shape)
        classification_accuracy = np.sum(prediction == labels) / float(labels.size)
        return classification_accuracy
    def backprop_adagrad_single_batch(self):
        print "Starting backprop using adagrad"
        adagrad_weight = Bidirectional_RNNLM_Weight()
        adagrad_weight.init_zero_weights(self.model.get_architecture())

        buffer_weight = Bidirectional_RNNLM_Weight()
        buffer_weight.init_zero_weights(self.model.get_architecture())

        fudge_factor = 1.0
        adagrad_weight = adagrad_weight + fudge_factor
        gradient = Bidirectional_RNNLM_Weight()
        gradient.init_zero_weights(self.model.get_architecture())
        if self.validation_feature_file_name is not None:
            cross_entropy, perplexity, num_correct, num_examples, loss = self.calculate_classification_statistics(
                self.validation_features, self.validation_labels,
                self.validation_fsl, self.model)
            print "cross-entropy before steepest descent is", cross_entropy
            print "perplexity is", perplexity
            if self.l2_regularization_const > 0.0:
                print "regularized loss is", loss
            print "number correctly classified is", num_correct, "of", num_examples

#        excluded_keys = {'bias':['0'], 'weights':[]}
#        frame_table = np.cumsum(self.feature_sequence_lens)
        for epoch_num in range(len(self.steepest_learning_rate)):
            print "At epoch", epoch_num + 1, "of", len(
                self.steepest_learning_rate
            ), "with learning rate", self.steepest_learning_rate[epoch_num]
            start_frame = 0
            end_frame = 0
            cross_entropy = 0.0
            num_examples = 0
            #            if hasattr(self, 'momentum_rate'):
            #                momentum_rate = self.momentum_rate[epoch_num]
            #                print "momentum is", momentum_rate
            #            else:
            #                momentum_rate = 0.0
            for batch_index, feature_sequence_len in enumerate(self.feature_sequence_lens):
                end_frame = start_frame + feature_sequence_len
                batch_features = self.features[:feature_sequence_len, batch_index]
                batch_labels = self.labels[start_frame:end_frame, 1]
                #                print ""
                #                print batch_index
                #                print batch_features
                #                print batch_labels
                cur_xent = self.calculate_gradient_single_batch(
                    batch_features,
                    batch_labels,
                    gradient,
                    return_cross_entropy=True,
                    check_gradient=False)
                #                print self.model.norm()
                #                print gradient.norm()
                if self.l2_regularization_const > 0.0:
                    buffer_weight.assign_weights(self.model)
                    buffer_weight *= self.l2_regularization_const
                    gradient += buffer_weight
                buffer_weight.assign_weights(gradient)
                #                print gradient.init_hiddens
                buffer_weight **= 2.0
                adagrad_weight += buffer_weight
                #                print adagrad_weight.init_hiddens
                buffer_weight.assign_weights(adagrad_weight)
                buffer_weight **= 0.5
                #                print buffer_weight.init_hiddens
                gradient /= buffer_weight
                #                print gradient.init_hiddens
                cross_entropy += cur_xent
                per_done = float(batch_index) / self.num_sequences * 100
                sys.stdout.write("\r                                                                \r")  # clear line
                sys.stdout.write("\r%.1f%% done " % per_done), sys.stdout.flush()
                ppp = cross_entropy / end_frame
                sys.stdout.write("train X-ent: %f " % ppp), sys.stdout.flush()
                gradient *= -self.steepest_learning_rate[epoch_num]
                self.model += gradient  #/ batch_size
                #                if momentum_rate > 0.0:
                #                    prev_step *= momentum_rate
                #                    self.model += prev_step
                #                prev_step.assign_weights(gradient)
                #                prev_step *= -self.steepest_learning_rate[epoch_num]

                start_frame = end_frame

            if self.validation_feature_file_name is not None:
                cross_entropy, perplexity, num_correct, num_examples, loss = self.calculate_classification_statistics(
                    self.validation_features, self.validation_labels,
                    self.validation_fsl, self.model)
                print "cross-entropy at the end of the epoch is", cross_entropy
                print "perplexity is", perplexity
                if self.l2_regularization_const > 0.0:
                    print "regularized loss is", loss
                print "number correctly classified is", num_correct, "of", num_examples

            sys.stdout.write("\r100.0% done \r")
            sys.stdout.write(
                "\r                                                                \r"
            )  #clear line
            if self.save_each_epoch:
                self.model.write_weights(''.join(
                    [self.output_name, '_epoch_',
                     str(epoch_num + 1)]))
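
    # The loop above is plain AdaGrad with one update per sequence: with gradient g_t
    # and accumulator G_t = fudge_factor + sum_{s<=t} g_s**2 (element-wise, via the
    # overloaded **=, +=, and /= on Bidirectional_RNNLM_Weight), each step applies
    #
    #   theta_{t+1} = theta_t - learning_rate * g_t / sqrt(G_t)
    #
    # A minimal numpy sketch of the same rule for a single weight matrix (names are
    # illustrative, not part of this class):
    #
    #   G = np.ones_like(W)            # fudge_factor = 1.0
    #   for g in per_batch_gradients:  # one gradient per sequence
    #       G += g ** 2
    #       W -= learning_rate * g / np.sqrt(G)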
    def calculate_gradient_single_batch(self,
                                        batch_inputs,
                                        batch_labels,
                                        gradient_weights,
                                        hiddens_forward=None,
                                        hiddens_backward=None,
                                        outputs=None,
                                        check_gradient=False,
                                        model=None,
                                        l2_regularization_const=0.0,
                                        return_cross_entropy=False):
        # need to check regularization
        # calculate the gradient with a particular Neural Network model; if model is None, the current weights (self.model) are used
        batch_size = batch_labels.size
        if model is None:
            model = self.model
        if hiddens_forward is None or hiddens_backward is None or outputs is None:
            outputs, hiddens_forward, hiddens_backward = self.forward_pass_single_batch(batch_inputs, model, return_hiddens=True)
        #derivative of log(cross-entropy softmax)
        batch_indices = np.arange(batch_size)
        gradient_weights *= 0.0
        backward_inputs = outputs

        if return_cross_entropy:
            cross_entropy = -np.sum(np.log2(backward_inputs[batch_indices, batch_labels]))
        backward_inputs[batch_indices, batch_labels] -= 1.0

        np.sum(backward_inputs, axis=0, out=gradient_weights.bias['output'][0])

        np.dot(hiddens_forward.T, backward_inputs, out=gradient_weights.weights['hidden_output_forward'])
        pre_nonlinearity_hiddens_forward = np.dot(backward_inputs[batch_size - 1, :], model.weights['hidden_output_forward'].T)
        pre_nonlinearity_hiddens_forward *= hiddens_forward[batch_size - 1, :]
        pre_nonlinearity_hiddens_forward *= 1 - hiddens_forward[batch_size - 1, :]

        np.dot(hiddens_backward.T, backward_inputs, out=gradient_weights.weights['hidden_output_backward'])
        pre_nonlinearity_hiddens_backward = np.dot(backward_inputs[0, :], model.weights['hidden_output_backward'].T)
        pre_nonlinearity_hiddens_backward *= hiddens_backward[0, :]
        pre_nonlinearity_hiddens_backward *= 1 - hiddens_backward[0, :]

        if batch_size > 1:
            gradient_weights.weights['visible_hidden'][batch_inputs[batch_size - 1]] += pre_nonlinearity_hiddens_forward
            gradient_weights.weights['hidden_hidden_forward'] += np.outer(hiddens_forward[batch_size - 2, :], pre_nonlinearity_hiddens_forward)
            gradient_weights.bias['hidden_forward'][0] += pre_nonlinearity_hiddens_forward

            gradient_weights.weights['visible_hidden'][batch_inputs[0]] += pre_nonlinearity_hiddens_backward
            gradient_weights.weights['hidden_hidden_backward'] += np.outer(hiddens_backward[1, :], pre_nonlinearity_hiddens_backward)
            gradient_weights.bias['hidden_backward'][0] += pre_nonlinearity_hiddens_backward

        for index in range(batch_size - 2):
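            #each interior time step's delta combines the error injected
            #directly by the output layer with the error flowing back through
            #the recurrent weights, scaled by the logistic derivative h*(1-h);
            #the forward chain unrolls from the sequence end backwards and the
            #backward chain from the sequence start forwards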
            backward_index = batch_size - 2 - index
            forward_index = index + 1
            pre_nonlinearity_hiddens_forward = (
                (np.dot(backward_inputs[backward_index, :],
                        model.weights['hidden_output_forward'].T) +
                 np.dot(pre_nonlinearity_hiddens_forward,
                        model.weights['hidden_hidden_forward'].T)) *
                hiddens_forward[backward_index, :] *
                (1 - hiddens_forward[backward_index, :]))

            pre_nonlinearity_hiddens_backward = (
                (np.dot(backward_inputs[forward_index, :],
                        model.weights['hidden_output_backward'].T) +
                 np.dot(pre_nonlinearity_hiddens_backward,
                        model.weights['hidden_hidden_backward'].T)) *
                hiddens_backward[forward_index, :] *
                (1 - hiddens_backward[forward_index, :]))

            gradient_weights.weights['visible_hidden'][batch_inputs[
                backward_index]] += pre_nonlinearity_hiddens_forward  #+= np.dot(visibles[observation_index,:,:].T, pre_nonlinearity_hiddens)
            gradient_weights.weights['hidden_hidden_forward'] += np.outer(
                hiddens_forward[backward_index - 1, :],
                pre_nonlinearity_hiddens_forward)
            gradient_weights.bias['hidden_forward'][
                0] += pre_nonlinearity_hiddens_forward

            gradient_weights.weights['visible_hidden'][batch_inputs[
                forward_index]] += pre_nonlinearity_hiddens_backward  #+= np.dot(visibles[observation_index,:,:].T, pre_nonlinearity_hiddens)
            gradient_weights.weights['hidden_hidden_backward'] += np.outer(
                hiddens_backward[forward_index + 1, :],
                pre_nonlinearity_hiddens_backward)
            gradient_weights.bias['hidden_backward'][
                0] += pre_nonlinearity_hiddens_backward

        if batch_size > 1:
            pre_nonlinearity_hiddens_forward = (
                (np.dot(backward_inputs[0, :],
                        model.weights['hidden_output_forward'].T) +
                 np.dot(pre_nonlinearity_hiddens_forward,
                        model.weights['hidden_hidden_forward'].T)) *
                hiddens_forward[0, :] * (1 - hiddens_forward[0, :]))
            pre_nonlinearity_hiddens_backward = (
                (np.dot(backward_inputs[-1, :],
                        model.weights['hidden_output_backward'].T) +
                 np.dot(pre_nonlinearity_hiddens_backward,
                        model.weights['hidden_hidden_backward'].T)) *
                hiddens_backward[-1, :] * (1 - hiddens_backward[-1, :]))

        gradient_weights.weights['visible_hidden'][batch_inputs[
            0]] += pre_nonlinearity_hiddens_forward  # += np.dot(visibles[0,:,:].T, pre_nonlinearity_hiddens)
        gradient_weights.weights['hidden_hidden_forward'] += np.outer(
            model.init_hiddens['forward'], pre_nonlinearity_hiddens_forward
        )  #np.dot(np.tile(model.init_hiddens, (pre_nonlinearity_hiddens.shape[0],1)).T, pre_nonlinearity_hiddens)
        gradient_weights.bias['hidden_forward'][
            0] += pre_nonlinearity_hiddens_forward
        gradient_weights.init_hiddens['forward'][0] = np.dot(
            pre_nonlinearity_hiddens_forward,
            model.weights['hidden_hidden_forward'].T)

        gradient_weights.weights['visible_hidden'][batch_inputs[
            -1]] += pre_nonlinearity_hiddens_backward  # += np.dot(visibles[0,:,:].T, pre_nonlinearity_hiddens)
        gradient_weights.weights['hidden_hidden_backward'] += np.outer(
            model.init_hiddens['backward'], pre_nonlinearity_hiddens_backward
        )  #np.dot(np.tile(model.init_hiddens, (pre_nonlinearity_hiddens.shape[0],1)).T, pre_nonlinearity_hiddens)
        gradient_weights.bias['hidden_backward'][
            0] += pre_nonlinearity_hiddens_backward
        gradient_weights.init_hiddens['backward'][0] = np.dot(
            pre_nonlinearity_hiddens_backward,
            model.weights['hidden_hidden_backward'].T)
        #        gradient_weights = self.backward_pass(backward_inputs, hiddens, batch_inputs, model)
        backward_inputs[batch_indices, batch_labels] += 1.0
        gradient_weights /= batch_size

        if l2_regularization_const > 0.0:
            gradient_weights += model * l2_regularization_const
        # the gradient has been written into gradient_weights in place; when
        # not checking it, return here (with the cross-entropy if requested)
        if not check_gradient:
            if return_cross_entropy:
                return cross_entropy
            return

        ### below block checks gradient... only to be used if you think the gradient is incorrectly calculated ##############
        else:
            # scale back up to the summed (per-sequence) gradient and remove
            # the L2 term, since the finite-difference losses computed below
            # are unregularized sums over the batch
            gradient_weights *= batch_size
            if l2_regularization_const > 0.0:
                gradient_weights += model * (-l2_regularization_const *
                                             batch_size)
            sys.stdout.write(
                "\r                                                                \r"
            )
            print "checking gradient..."
            finite_difference_model = Bidirectional_RNNLM_Weight()
            finite_difference_model.init_zero_weights(
                self.model.get_architecture(), verbose=False)

            direction = Bidirectional_RNNLM_Weight()
            direction.init_zero_weights(self.model.get_architecture(),
                                        verbose=False)
            epsilon = 1E-5
            print "at initial hiddens"
            for key in direction.init_hiddens.keys():
                for index in range(direction.init_hiddens[key].size):
                    direction.init_hiddens[key][0][index] = epsilon
                    forward_loss = -np.sum(
                        np.log(
                            self.forward_pass_single_batch(
                                batch_inputs, model=model +
                                direction)[batch_indices, batch_labels]))
                    backward_loss = -np.sum(
                        np.log(
                            self.forward_pass_single_batch(
                                batch_inputs, model=model -
                                direction)[batch_indices, batch_labels]))
                    finite_difference_model.init_hiddens[key][0][index] = (
                        forward_loss - backward_loss) / (2 * epsilon)
                    direction.init_hiddens[key][0][index] = 0.0
            for key in direction.bias.keys():
                print "at bias key", key
                for index in range(direction.bias[key].size):
                    direction.bias[key][0][index] = epsilon
                    #print direction.norm()
                    forward_loss = -np.sum(
                        np.log(
                            self.forward_pass_single_batch(
                                batch_inputs, model=model +
                                direction)[batch_indices, batch_labels]))
                    backward_loss = -np.sum(
                        np.log(
                            self.forward_pass_single_batch(
                                batch_inputs, model=model -
                                direction)[batch_indices, batch_labels]))
                    finite_difference_model.bias[key][0][index] = (
                        forward_loss - backward_loss) / (2 * epsilon)
                    direction.bias[key][0][index] = 0.0
            for key in direction.weights.keys():
                print "at weight key", key
                for index0 in range(direction.weights[key].shape[0]):
                    for index1 in range(direction.weights[key].shape[1]):
                        direction.weights[key][index0][index1] = epsilon
                        forward_loss = -np.sum(
                            np.log(
                                self.forward_pass_single_batch(
                                    batch_inputs, model=model +
                                    direction)[batch_indices, batch_labels]))
                        backward_loss = -np.sum(
                            np.log(
                                self.forward_pass_single_batch(
                                    batch_inputs, model=model -
                                    direction)[batch_indices, batch_labels]))
                        finite_difference_model.weights[key][index0][
                            index1] = (forward_loss -
                                       backward_loss) / (2 * epsilon)
                        direction.weights[key][index0][index1] = 0.0

            print "calculated gradient for forward initial hiddens"
            print gradient_weights.init_hiddens['forward']
            print "finite difference approximation for forward initial hiddens"
            print finite_difference_model.init_hiddens['forward']

            print "calculated gradient for backward initial hiddens"
            print gradient_weights.init_hiddens['backward']
            print "finite difference approximation for backward initial hiddens"
            print finite_difference_model.init_hiddens['backward']

            print "calculated gradient for forward hidden bias"
            print gradient_weights.bias['hidden_forward']
            print "finite difference approximation for forward hidden bias"
            print finite_difference_model.bias['hidden_forward']

            print "calculated gradient for backward hidden bias"
            print gradient_weights.bias['hidden_backward']
            print "finite difference approximation for backward hidden bias"
            print finite_difference_model.bias['hidden_backward']

            print "calculated gradient for output bias"
            print gradient_weights.bias['output']
            print "finite difference approximation for output bias"
            print finite_difference_model.bias['output']

            print "calculated gradient for visible_hidden layer"
            print gradient_weights.weights['visible_hidden']
            print "finite difference approximation for visible_hidden layer"
            print finite_difference_model.weights['visible_hidden']
            print np.sum((finite_difference_model.weights['visible_hidden'] -
                          gradient_weights.weights['visible_hidden'])**2)

            print "calculated gradient for hidden_hidden_forward layer"
            print gradient_weights.weights['hidden_hidden_forward']
            print "finite difference approximation for hidden_hidden_forward layer"
            print finite_difference_model.weights['hidden_hidden_forward']

            print "calculated gradient for hidden_hidden_backward layer"
            print gradient_weights.weights['hidden_hidden_backward']
            print "finite difference approximation for hidden_hidden_backward layer"
            print finite_difference_model.weights['hidden_hidden_backward']

            print "calculated gradient for hidden_output_forward layer"
            print gradient_weights.weights['hidden_output_forward']
            print "finite difference approximation for hidden_output_forward layer"
            print finite_difference_model.weights['hidden_output_forward']

            print "calculated gradient for hidden_output_backward layer"
            print gradient_weights.weights['hidden_output_backward']
            print "finite difference approximation for hidden_output_backward layer"
            print finite_difference_model.weights['hidden_output_backward']

            sys.exit()
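# ----------------------------------------------------------------------------
# A minimal, self-contained sketch of the central-difference gradient check
# performed above, reduced to a toy softmax-regression loss so the pattern is
# easy to verify in isolation; the names here (toy_loss, toy_grad,
# check_gradient_toy) are illustrative and not part of the class.
import numpy as np

def _softmax(activations):
    exponentials = np.exp(activations - np.max(activations))
    return exponentials / np.sum(exponentials)

def toy_loss(weights, visible, label):
    # negative log-likelihood of one example under softmax regression
    return -np.log(_softmax(np.dot(weights, visible))[label])

def toy_grad(weights, visible, label):
    # analytic gradient: (softmax(W v) - onehot(label)) outer v, the same
    # p - y delta used in calculate_gradient_single_batch
    probabilities = _softmax(np.dot(weights, visible))
    probabilities[label] -= 1.0
    return np.outer(probabilities, visible)

def check_gradient_toy(weights, visible, label, epsilon=1e-5):
    # perturb one parameter at a time by +/- epsilon, exactly as the
    # direction/epsilon loops above do for every weight, bias, and
    # initial hidden entry
    numeric = np.zeros_like(weights)
    for index0 in range(weights.shape[0]):
        for index1 in range(weights.shape[1]):
            weights[index0, index1] += epsilon
            forward_loss = toy_loss(weights, visible, label)
            weights[index0, index1] -= 2 * epsilon
            backward_loss = toy_loss(weights, visible, label)
            weights[index0, index1] += epsilon  # restore the weight
            numeric[index0, index1] = (forward_loss -
                                       backward_loss) / (2 * epsilon)
    return np.max(np.abs(numeric - toy_grad(weights, visible, label)))

# e.g. check_gradient_toy(np.random.randn(5, 3), np.random.randn(3), 2)
# returns a value near zero (order epsilon**2) for a correct gradient, the
# same criterion the printouts above are judged against by eye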
    def backprop_adagrad_single_batch(self):
        print "Starting backprop using adagrad"
        adagrad_weight = Bidirectional_RNNLM_Weight()
        adagrad_weight.init_zero_weights(self.model.get_architecture())
        
        buffer_weight = Bidirectional_RNNLM_Weight()
        buffer_weight.init_zero_weights(self.model.get_architecture())
        
        fudge_factor = 1.0
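        # seeding the accumulator with the fudge factor bounds the first
        # 1 / sqrt(accumulated squares) scaling (avoids dividing by ~zero)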
        adagrad_weight = adagrad_weight + fudge_factor
        gradient = Bidirectional_RNNLM_Weight()
        gradient.init_zero_weights(self.model.get_architecture())
        if self.validation_feature_file_name is not None:
            cross_entropy, perplexity, num_correct, num_examples, loss = self.calculate_classification_statistics(self.validation_features, self.validation_labels, self.validation_fsl, self.model)
            print "cross-entropy before steepest descent is", cross_entropy
            print "perplexity is", perplexity
            if self.l2_regularization_const > 0.0:
                print "regularized loss is", loss
            print "number correctly classified is", num_correct, "of", num_examples
        
#        excluded_keys = {'bias':['0'], 'weights':[]}
#        frame_table = np.cumsum(self.feature_sequence_lens)
        for epoch_num in range(len(self.steepest_learning_rate)):
            print "At epoch", epoch_num+1, "of", len(self.steepest_learning_rate), "with learning rate", self.steepest_learning_rate[epoch_num]
            start_frame = 0
            end_frame = 0
            cross_entropy = 0.0
            num_examples = 0
#            if hasattr(self, 'momentum_rate'):
#                momentum_rate = self.momentum_rate[epoch_num]
#                print "momentum is", momentum_rate
#            else:
#                momentum_rate = 0.0
            for batch_index, feature_sequence_len in enumerate(self.feature_sequence_lens):
                end_frame = start_frame + feature_sequence_len
                batch_features = self.features[:feature_sequence_len, batch_index]
                batch_labels = self.labels[start_frame:end_frame,1]
#                print ""
#                print batch_index
#                print batch_features
#                print batch_labels
                cur_xent = self.calculate_gradient_single_batch(
                    batch_features, batch_labels, gradient,
                    return_cross_entropy=True, check_gradient=False)
#                print self.model.norm()
#                print gradient.norm()
                if self.l2_regularization_const > 0.0:
                    buffer_weight.assign_weights(self.model)
                    buffer_weight *= self.l2_regularization_const
                    gradient += buffer_weight
                buffer_weight.assign_weights(gradient)
#                print gradient.init_hiddens
                buffer_weight **= 2.0
                adagrad_weight += buffer_weight
#                print adagrad_weight.init_hiddens
                buffer_weight.assign_weights(adagrad_weight)
                buffer_weight **= 0.5
#                print buffer_weight.init_hiddens
                gradient /= buffer_weight
#                print gradient.init_hiddens
                cross_entropy += cur_xent
                per_done = float(batch_index) / self.num_sequences * 100
                sys.stdout.write("\r                                                                \r") #clear line
                sys.stdout.write("\r%.1f%% done " % per_done)
                sys.stdout.flush()
                avg_cross_entropy = cross_entropy / end_frame  #running per-frame average
                sys.stdout.write("train X-ent: %f " % avg_cross_entropy)
                sys.stdout.flush()
                gradient *= -self.steepest_learning_rate[epoch_num]
                self.model += gradient #/ batch_size
#                if momentum_rate > 0.0:
#                    prev_step *= momentum_rate
#                    self.model += prev_step
#                prev_step.assign_weights(gradient)
#                prev_step *= -self.steepest_learning_rate[epoch_num]
                
                start_frame = end_frame
                
            if self.validation_feature_file_name is not None:
                cross_entropy, perplexity, num_correct, num_examples, loss = self.calculate_classification_statistics(self.validation_features, self.validation_labels, self.validation_fsl, self.model)
                print "cross-entropy at the end of the epoch is", cross_entropy
                print "perplexity is", perplexity
                if self.l2_regularization_const > 0.0:
                    print "regularized loss is", loss
                print "number correctly classified is", num_correct, "of", num_examples
                
            sys.stdout.write("\r100.0% done \r")
            sys.stdout.write("\r                                                                \r") #clear line
            if self.save_each_epoch:
                self.model.write_weights(''.join([self.output_name, '_epoch_', str(epoch_num+1)]))  
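# ----------------------------------------------------------------------------
# A compact sketch of the AdaGrad update performed per batch above, written
# for a flat numpy parameter vector instead of the Bidirectional_RNNLM_Weight
# container; the names here (adagrad_step, parameters, accumulator) are
# illustrative, not part of the class.
import numpy as np

def adagrad_step(parameters, gradient, accumulator, learning_rate):
    # accumulate squared gradients, then scale the step elementwise by
    # 1 / sqrt(accumulated squares); this mirrors buffer_weight **= 2.0,
    # adagrad_weight += buffer_weight, and gradient /= buffer_weight above
    accumulator += gradient ** 2
    parameters -= learning_rate * gradient / np.sqrt(accumulator)

# usage sketch: the accumulator starts at the fudge factor (1.0 above) so
# the first division is well conditioned
#   parameters = np.zeros(100)
#   accumulator = np.ones_like(parameters)
#   for gradient in per_batch_gradients:
#       adagrad_step(parameters, gradient, accumulator, learning_rate=0.1)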