def input_nub_generator(variable, transformed_observations):
    """
    Generate an input layer and input 'nub' for a Keras network, for a text variable.

     - input_layer: The input layer accepts data from the outside world.
     - input_nub: The input nub will always include the input_layer as its first layer. It may also include
       other layers for handling the data type in specific ways

    :param variable: Name of the variable
    :type variable: str
    :param transformed_observations: A dataframe, containing either the specified variable, or derived variables
    :type transformed_observations: pandas.DataFrame
    :return: A tuple containing the input layer, and the last layer of the nub
    """
    logging.info('Creating input nub for: {}'.format(variable))

    # Get transformed data for shaping. One column per token: either the variable itself, or
    # every derived column whose name starts with '<variable>_'.
    if variable in transformed_observations.columns:
        variable_list = [variable]
    else:
        variable_name_prefix = variable + '_'
        variable_list = list(filter(lambda x: x.startswith(variable_name_prefix),
                                    transformed_observations.columns))
    logging.info('Determined variable list: {}'.format(variable_list))

    # Pull transformed data as a numpy matrix. NOTE: DataFrame.as_matrix() was removed in
    # pandas 1.0; .values is the drop-in replacement.
    transformed = transformed_observations[variable_list].values

    # Determine sequence length
    if len(transformed.shape) >= 2:
        # If we have multiple columns, it's one column per word
        input_sequence_length = int(transformed.shape[1])
    else:
        # If there are not multiple columns, there is only one word
        input_sequence_length = 1

    # Determine vocabulary size (number of rows in the embedding). The additional offsets are due to 1 for len
    # vs indexing w/ 0, 1 for unknown token, and the others for something else?
    vocab_size = int(numpy.max(transformed)) + 4

    # Determine embedding output size
    # TODO There must be a better heuristic
    embedding_output_dim = 200

    logging.info('Creating embedding for text_var: {}, with input_sequence_length: {}, vocab size: {}, '
                 'and embedding_output_dim: {}'.format(variable, input_sequence_length, vocab_size,
                                                       embedding_output_dim))

    # Create and stack layers
    input_layer = keras.Input(shape=(input_sequence_length,),
                              name=lib.namespace_conversion('input_{}'.format(variable)))
    x = input_layer
    x = Embedding(input_dim=vocab_size,
                  output_dim=embedding_output_dim,
                  input_length=input_sequence_length,
                  name=lib.namespace_conversion('embedding_{}'.format(variable)))(x)
    # Layer name typo fixed ('bidirectiona_' -> 'bidirectional_') to match the sibling
    # numeric input_nub_generator's naming convention.
    x = Bidirectional(LSTM(128, name=lib.namespace_conversion('lstm_{}'.format(variable))),
                      name=lib.namespace_conversion('bidirectional_lstm_{}'.format(variable)))(x)
    input_nub = x

    # Return
    return input_layer, input_nub
def test_namespace_conversion(self):
    """Smoke-test lib.namespace_conversion by feeding a variety of raw names to placeholder()."""
    # A mix of punctuation-heavy strings, a purely numeric string, and the iris
    # dataset's human-readable variable names.
    raw_names = [
        ' asdf @$@#$@#',
        'asdf @$ @#$@#',
        '12342342',
        'sepal length in cm',
        'sepal width in cm',
        'petal length in cm',
        'petal width in cm',
        'class',
        'Iris Setosa',
        'Iris Versicolour',
        'Iris Virginica',
    ]

    # Each call should succeed without raising — the converted name must be a
    # legal placeholder name.
    for raw_name in raw_names:
        placeholder(name=lib.namespace_conversion(raw_name))
def input_nub_generator(variable, transformed_observations):
    """
    Generate an input layer and input 'nub' for a Keras network, for a numeric sequence variable.

     - input_layer: The input layer accepts data from the outside world.
     - input_nub: The input nub will always include the input_layer as its first layer. It may also include
       other layers for handling the data type in specific ways

    :param variable: Name of the variable
    :type variable: str
    :param transformed_observations: A dataframe, containing either the specified variable, or derived variables
    :type transformed_observations: pandas.DataFrame
    :return: A tuple containing the input layer, and the last layer of the nub
    """
    # Get transformed data for shaping: either the variable itself, or every derived
    # column whose name starts with '<variable>_'.
    if variable in transformed_observations.columns:
        variable_list = [variable]
    else:
        variable_name_prefix = variable + '_'
        variable_list = list(
            filter(lambda x: x.startswith(variable_name_prefix),
                   transformed_observations.columns))

    # NOTE: DataFrame.as_matrix() was removed in pandas 1.0; .values is the drop-in replacement.
    transformed = transformed_observations[variable_list].values

    # Set up sequence length for input_layer
    if len(transformed.shape) >= 2:
        input_sequence_length = int(transformed.shape[1])
    else:
        input_sequence_length = 1

    logging.info(
        'For variable: {}, using input_sequence_length: {}'.format(
            variable, input_sequence_length))

    # Create and stack layers
    input_layer = keras.Input(shape=(input_sequence_length, ),
                              name=lib.namespace_conversion(
                                  'input_{}'.format(variable)))
    x = input_layer
    # Add a trailing feature dimension so the LSTM sees (timesteps, 1)
    x = Reshape((input_sequence_length, 1))(x)
    x = Bidirectional(LSTM(32, name=lib.namespace_conversion(
        'lstm_{}'.format(variable))),
        name=lib.namespace_conversion(
        'bidirectional_lstm_{}'.format(variable)))(x)
    input_nub = x

    return input_layer, input_nub
def input_nub_generator(variable, transformed_observations):
    """
    Generate an input layer and input 'nub' for a Keras network, for a categorical variable.

     - input_layer: The input layer accepts data from the outside world.
     - input_nub: The input nub will always include the input_layer as its first layer. It may also include
       other layers for handling the data type in specific ways

    :param variable: Name of the variable
    :type variable: str
    :param transformed_observations: A dataframe, containing either the specified variable, or derived variables
    :type transformed_observations: pandas.DataFrame
    :return: A tuple containing the input layer, and the last layer of the nub
    """
    # Get transformed data for shaping.
    # NOTE: DataFrame.as_matrix() was removed in pandas 1.0; .values is the drop-in replacement.
    transformed = transformed_observations[variable].values

    # Set up dimensions for input_layer layer
    if len(transformed.shape) >= 2:
        input_sequence_length = int(transformed.shape[1])
    else:
        input_sequence_length = 1

    # numpy.max reduces over all elements of the (possibly 2-D) array; the builtin max()
    # would iterate over rows and raise on an ambiguous array comparison. +2 offsets for
    # 0-based indexing and the unknown-level token.
    categorical_num_levels = int(numpy.max(transformed)) + 2
    embedding_output_dim = int(min((categorical_num_levels + 1) / 2, 50))

    logging.info(
        'Creating embedding for cat_var: {}, with input_sequence_length: {}, categorical_num_levels: {}, '
        'and embedding_output_dim: {}'.format(variable,
                                              input_sequence_length,
                                              categorical_num_levels,
                                              embedding_output_dim))

    input_layer = keras.Input(shape=(input_sequence_length, ),
                              name=lib.namespace_conversion(
                                  'input_{}'.format(variable)))

    x = input_layer
    x = Embedding(input_dim=categorical_num_levels,
                  output_dim=embedding_output_dim,
                  input_length=input_sequence_length,
                  name=lib.namespace_conversion(
                      'embedding_{}'.format(variable)))(x)
    # Flatten (timesteps, embedding_dim) into a single feature vector for downstream dense layers
    x = Flatten(name=lib.namespace_conversion(
        'flatten_embedding_{}'.format(variable)))(x)

    input_nub = x

    return input_layer, input_nub
def input_nub_generator(self, variable, transformed_observations):
    """
    Generate an input layer and input 'nub' for a Keras network. This default variant adds no
    type-specific layers: the nub is the input layer itself.

     - input_layer: The input layer accepts data from the outside world.
     - input_nub: The input nub will always include the input_layer as its first layer. It may also include
       other layers for handling the data type in specific ways

    :param variable: Name of the variable
    :type variable: str
    :param transformed_observations: A dataframe, containing either the specified variable, or derived variables
    :type transformed_observations: pandas.DataFrame
    :return: A tuple containing the input layer, and the last layer of the nub
    """
    # NOTE: DataFrame.as_matrix() was removed in pandas 1.0; .values is the drop-in replacement.
    transformed = transformed_observations[variable].values

    # Set up dimensions for input_layer layer
    if len(transformed.shape) >= 2:
        input_sequence_length = int(transformed.shape[1])
    else:
        input_sequence_length = 1

    # Create input_layer layer
    input_layer = keras.Input(shape=(input_sequence_length,),
                              name=lib.namespace_conversion('input_{}'.format(variable)))
    input_nub = input_layer

    # Return, in format of input_layer, last variable-specific layer
    return input_layer, input_nub