def features_standardization(dataframe, fit=False):
   """
    Performs feature standardization using a Standard scaler.
    After standardization, features will have zero means and
    unit standard deviation, changing the original distribution.
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with encoded data
        fit: boolean
            Indicates if we should train or load a scaler
    Returns
        dataframe: pd.DataFrame
            Dataframe with scaled features
    """
    # Train or load a scaler    
    if fit:
        scaler = StandardScaler()
        scaler.fit(dataframe)

        pickle_obj(scaler, 'standard_scaler')
    else:
        scaler = unpickle_obj('standard_scaler')

    # Transform data and recreate dataframe from np.array
    X = scaler.transform(dataframe)
    df = pd.DataFrame(X, columns=dataframe.columns)

    return df
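The pickle_obj and unpickle_obj helpers used by these preprocessing snippets are not part of this excerpt. A minimal sketch, assuming they store objects by label under a fixed artifacts directory (the MODELS_DIR name and '.pkl' suffix are assumptions; some snippets below pass full paths instead of labels, so the real helpers may differ):

import os
import pickle

MODELS_DIR = 'models'  # assumed artifact location

def pickle_obj(obj, label):
    # Serialize obj to MODELS_DIR/<label>.pkl (assumed naming convention)
    os.makedirs(MODELS_DIR, exist_ok=True)
    with open(os.path.join(MODELS_DIR, label + '.pkl'), 'wb') as f:
        pickle.dump(obj, f)

def unpickle_obj(label):
    # Load a previously serialized object by label
    with open(os.path.join(MODELS_DIR, label + '.pkl'), 'rb') as f:
        return pickle.load(f)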
def save_feature_set(dataframe, attributes_df, label='features_of_interest', save_original_features=True):
  """
    Save list of features using their original or current names
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data
        attributes_df: pd.DataFrame
            Dataframe with feature attributes, based on the data dictionary
        label: string
            Filename for serialization
        save_original_features: boolean
            Flag indicating if we should save original or current feature names
    Returns
        None
    """
    # get current feature names
    renamed_features = set(dataframe.columns.values)
    # retrieve original feature names, using attributes dataframe
    original_features = attributes_df[attributes_df.Renaming.isin(renamed_features)].Feature.values
    
    # decide which feature set to save based on save_original_features flag
    features = original_features if save_original_features else renamed_features
    
    # serialize list of features
    pickle_obj(features, label)
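A hypothetical usage sketch (the attribute table and values below are invented for illustration; only the Feature and Renaming columns are implied by the code above):

import pandas as pd

attributes_df = pd.DataFrame({'Feature': ['V1', 'V2'],
                              'Renaming': ['age', 'income']})
data_df = pd.DataFrame({'age': [25, 40], 'income': [30000, 52000]})

# Serializes the original names ['V1', 'V2'] under the 'features_of_interest' label
save_feature_set(data_df, attributes_df, save_original_features=True)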
def encode_high_cardinality_categorical_df(dataframe, fit=False):
   """
    Encode high cardinality categorical features using Binary Encoding and dropping invariant features
    In Binary Encoding, features are converted to a binary representation and binary digits are used as new
    features.
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), high card. categorical features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Train or load an encoder    
    if fit:
        encoder = BinaryEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)
        
        pickle_obj(encoder, 'high_card_categorical_encoder')
    else:
        encoder = unpickle_obj('high_card_categorical_encoder')

    # transform data
    return encoder.transform(dataframe)
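A small illustration of the encoder, assuming BinaryEncoder comes from the category_encoders package (the 'city' column and its values are invented):

import pandas as pd
import category_encoders as ce

df = pd.DataFrame({'city': ['lisbon', 'porto', 'braga', 'faro']})
encoder = ce.BinaryEncoder(cols=['city'], drop_invariant=True)
# Each category is mapped to a binary code spread over a few 0/1 columns (city_0, city_1, ...)
print(encoder.fit_transform(df))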
def features_normalization(dataframe, fit=False):
   """
    Performs feature normalization using a MinMax scaler.
    After normalization, features values will be in the range [0:1]
    while preserving original distribution.
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with encoded data
        fit: boolean
            Indicates if we should train or load a scaler
    Returns
        dataframe: pd.DataFrame
            Dataframe with scaled features
    """
    # Train or load a scaler
    if fit:
        scaler = MinMaxScaler()
        scaler.fit(dataframe)

        pickle_obj(scaler, 'minmax_scaler')
    else:
        scaler = unpickle_obj('minmax_scaler')

    # Transform data and recreate dataframe from np.array
    X = scaler.transform(dataframe)
    df = pd.DataFrame(X, columns=dataframe.columns)

    return df
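For reference, MinMaxScaler rescales each feature with x' = (x - min) / (max - min), so the smallest observed value maps to 0 and the largest to 1. A toy illustration (values invented):

import pandas as pd
from sklearn.preprocessing import MinMaxScaler

toy = pd.DataFrame({'income': [30000, 40000, 50000]})
print(MinMaxScaler().fit_transform(toy))  # -> [[0.0], [0.5], [1.0]]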
def ordinal_values_imputation(ordinal_dataframe, fit=False, nan_value=-1):
    """
    Perform imputation of missing values for ordinal features using the median value.
    ---
    Arguments
        ordinal_dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), ordinal features only
        fit: boolean
            Indicates if we should train or load an imputer
        nan_value: Any
            Value to be considered as missing value
    Returns
        dataframe: pd.DataFrame
            Dataframe with missing values imputed
    """
    # Train or load a simple imputer, responsible for filling missing values with feature median
    if fit:
        imputer = SimpleImputer(missing_values=nan_value, strategy='median')
        imputer.fit(ordinal_dataframe)
        
        pickle_obj(imputer, 'ordinal_imputer')
    else:
        imputer = unpickle_obj('ordinal_imputer')

    # impute missing values
    transformed = imputer.transform(ordinal_dataframe)

    # construct a dataframe from np.array values and original column names    
    return pd.DataFrame(transformed, columns=ordinal_dataframe.columns.values)
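A toy illustration of how the -1 sentinel is handled (values invented; assumes SimpleImputer is imported from sklearn.impute and the pickle helpers are available):

import pandas as pd
from sklearn.impute import SimpleImputer

ordinal_df = pd.DataFrame({'education_level': [1, 3, -1, 5]})
# The -1 entry is replaced by the median of the remaining values (3.0 here)
print(ordinal_values_imputation(ordinal_df, fit=True))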
def train_and_save():
    # logging to file
    logger = logging.getLogger('tensorflow')
    logger.setLevel(logging.DEBUG)

    fhandler = logging.FileHandler('tensorflow.log')
    for hdlr in logger.handlers:
        logger.removeHandler(hdlr)
    logger.addHandler(fhandler)

    # Preprocess the files
    sgm_preprocessor = preprocess.SkipGramPreprocess(config['nltk_packages'],
                                                     config['tokenizer_path'],
                                                     config['data_root'],
                                                     config['vocab_size'])
    # Generate context target pairs
    context_target_pairs = preprocess.SkipGramContextTargetPair(
        sgm_preprocessor, seed=42).get_context_target_pairs(20)

    np.random.seed(42)
    np.random.shuffle(context_target_pairs)

    contexts = context_target_pairs[:, 0]
    targets = np.expand_dims(context_target_pairs[:, 1], 1)
    input_fn_ = lambda: input_fn(
        contexts, targets, batch_size=config['batch_size'])
    w2v = tf.estimator.Estimator(model_fn=word2vec,
                                 model_dir=config['model_dir'],
                                 params={
                                     'vocab_size': config['vocab_size'],
                                     'embedding_size':
                                     config['embedding_size'],
                                     'num_sampled': config['num_neg_samples']
                                 })

    steps = config['epochs'] * contexts.shape[0] // config['batch_size']
    print('* Starting to train')
    print('\t- Number of epochs: {0:,}'.format(config['epochs']))
    print('\t- Number of steps : {0:,}'.format(steps))
    w2v.train(input_fn=input_fn_, steps=steps)
    print('* End of training')
    print('\t- For training logs see tensorflow.log')

    print('* Collecting Embedding matrix')
    input_fn_ = lambda: input_fn(contexts[:10], targets[:10], repeat=1)
    embedding_matrix = next(w2v.predict(input_fn_))['embedding_matrix']

    # Save embeddings
    print('* Saving Embeddings')
    if not os.path.isdir(config['w2v_root']):
        os.makedirs(config['w2v_root'])
    pickle_obj(sgm_preprocessor.word_to_ids,
               os.path.join(config['w2v_root'], 'word_ids.pickle'))
    np.save(os.path.join(config['w2v_root'], 'embedding_matrix.npy'),
            embedding_matrix)
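The input_fn referenced here is not included in the excerpt. A minimal sketch, assuming it builds a tf.data pipeline over the (context, target) pairs (the shuffle buffer size and defaults are assumptions):

import tensorflow as tf

def input_fn(contexts, targets, batch_size=128, repeat=None):
    # repeat=None cycles indefinitely; the Estimator stops after the requested number of steps
    dataset = tf.data.Dataset.from_tensor_slices((contexts, targets))
    return dataset.shuffle(10000).repeat(repeat).batch(batch_size)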
def main():
    print('* Preprocessing Raw Corpus')
    preprocessor = preprocess.LangModelPreprocess(config['nltk_packages'],
                                                  config['tokenizer_path'],
                                                  config['data_root'],
                                                  config['vocab_size'],
                                                  config['ppdata_root'])
    print('* Generating Sorted File')
    sort_seq_len(os.path.join(config['ppdata_root'], 'pp.txt'),
                 os.path.join(config['ppdata_root'], 'pp_sorted.txt'))

    print('* Build Logger')
    logger = get_tensorflow_logger('tensorflow.log')

    print('* Estimator Instance Created')
    train_1_input_fn = lambda: input_fn(glob.glob(os.path.join(config['ppdata_root'], 'pp_sorted.txt')),
                                        batch_size=config['batch_size'],
                                        padding_val=config['vocab_size']-1,
                                        shuffle=False)
    train_input_fn = lambda: input_fn(glob.glob(os.path.join(config['ppdata_root'], 'pp.txt')),
                                      batch_size=config['batch_size'],
                                      padding_val=config['vocab_size']-1)
    lang_model = tf.estimator.Estimator(model_fn,
                                        model_dir=config['model_dir'],
                                        params={
                                            'lr': config['lr'],
                                            'vocab_size': config['vocab_size'],
                                            'embedding_size': config['embedding_size'],
                                            'hidden_units': config['hidden_units'],
                                            'keep_rate': config['keep_rate'],
                                            'num_layers': config['num_layers'],
                                            'max_gradient_norm': config['max_gradient_norm']
                                        })

    print('* Start Training - Training logs to tensorflow.log')
    print('\t-Training 1 Epoch over sorted sequences')
    lang_model.train(train_1_input_fn,
                     steps=config['steps_per_epoch']*1)
    print('\t-Training {} Epochs over random sequences'.format(config['epochs']-1))
    lang_model.train(train_input_fn,
                     steps=config['steps_per_epoch']*(config['epochs'] - 1))

    print('* Saving word id map')
    if os.path.isfile(os.path.join(config['ppdata_root'], 'word_ids.pickle')):
        print('\t-File {} already present'.format(os.path.join(config['ppdata_root'], 'word_ids.pickle')))
    else:
        pickle_obj(preprocessor.word_to_ids,
                   os.path.join(config['ppdata_root'], 'word_ids.pickle'))
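The get_tensorflow_logger helper called in main() is not shown; judging from the inline logger setup in train_and_save above, it could look roughly like this:

import logging

def get_tensorflow_logger(log_path):
    # Route the 'tensorflow' logger to a file, replacing any existing handlers
    logger = logging.getLogger('tensorflow')
    logger.setLevel(logging.DEBUG)
    for hdlr in list(logger.handlers):
        logger.removeHandler(hdlr)
    logger.addHandler(logging.FileHandler(log_path))
    return logger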
def get_low_and_high_cardinality_categorical_dfs(dataframe, attributes_df, threshold=5, fit=False):
    """
    Returns a tuple of dataframes containing categorical features only:
    - low cardinality: features with number of unique categories less than or equal to threshold
    - high cardinality: features with number of unique categories higher than threshold
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataset dataframe with pre-processed data (i.e. renamed features)
        attributes_df: pd.DataFrame
            Dataframe with feature attributes, based on the data dictionary
        threshold: int
            Threshold to consider high cardinality, based on number of categories 
        fit: boolean
            Indicates if we should measure cardinality or consider previously measured data
    Returns
        tuple: (pd.DataFrame, pd.DataFrame)
            (Dataset view with low cardinality features, Dataset view with high cardinality features)
    """
    # retrieve categorical features
    categorical_df = get_categorical_dataframe(dataframe, attributes_df)
    features = categorical_df.columns.values

    cardinality_count = {}
    # measure or read features cardinality
    if fit:
        for col in features:
            cardinality_count[col] = len(categorical_df[col].unique())
        
        pickle_obj(cardinality_count, 'cardinality_count')
    else:
        cardinality_count = unpickle_obj('cardinality_count')

    # split low and high cardinality features, based on threshold        
    high_cardinality_features = [feature for feature, cardinality in cardinality_count.items() if cardinality > threshold]
    low_cardinality_features = set(features)-set(high_cardinality_features)
    
    # create cardinality views
    low_cardinality_cat_df = categorical_df.loc[:, low_cardinality_features]
    high_cardinality_cat_df = categorical_df.loc[:, high_cardinality_features]
    
    return low_cardinality_cat_df, high_cardinality_cat_df
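The get_categorical_dataframe helper is also outside this excerpt. A sketch, assuming attributes_df carries a type column (named 'Type' here purely as an assumption) alongside the 'Renaming' column used in save_feature_set:

def get_categorical_dataframe(dataframe, attributes_df):
    # Restrict the dataframe to features flagged as categorical in the (assumed) 'Type' column
    categorical = set(attributes_df[attributes_df.Type == 'categorical'].Renaming.values)
    cols = [c for c in dataframe.columns if c in categorical]
    return dataframe.loc[:, cols]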
def encode_low_cardinality_categorical_df(dataframe, fit=False):
   """
    Encode low cardinality categorical features using OneHot Encoding and dropping invariant features
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), low card. categorical features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Train or load an encoder    
    if fit:
        encoder = OneHotEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)
        
        pickle_obj(encoder, 'low_card_categorical_encoder')
    else:
        encoder = unpickle_obj('low_card_categorical_encoder')

    # transform data
    return encoder.transform(dataframe)
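For contrast with the Binary Encoding example above, One-Hot Encoding creates one indicator column per category, which is why it is reserved for low cardinality features here (the 'color' column is invented; assumes category_encoders):

import pandas as pd
import category_encoders as ce

df = pd.DataFrame({'color': ['red', 'green', 'blue']})
# Produces one 0/1 column per category (color_1, color_2, color_3 by default)
print(ce.OneHotEncoder(cols=['color'], drop_invariant=True).fit_transform(df))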
def encode_ordinal_df(dataframe, fit=False):
   """
    Encode ordinal features, preserving the notion of order and dropping invariant features
    ---
    Arguments
        dataframe: pd.DataFrame
            Dataframe with pre-processed data (i.e. renamed features), ordinal features only
        fit: boolean
            Indicates if we should train or load an encoder
    Returns
        dataframe: pd.DataFrame
            Dataframe with encoded data
    """
    # Train or load an encoder    
    if fit:
        encoder = OrdinalEncoder(cols=dataframe.columns.values, drop_invariant=True)
        encoder.fit(dataframe)
        
        pickle_obj(encoder, 'ordinal_encoder')
    else:
        encoder = unpickle_obj('ordinal_encoder')

    # transform data
    return encoder.transform(dataframe)
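Taken together, the categorical helpers suggest a fit-time flow roughly like the sketch below; the ordering, column handling, and choice of scaler are assumptions, not the original pipeline:

def fit_categorical_preprocessing(dataframe, attributes_df):
    low_card_df, high_card_df = get_low_and_high_cardinality_categorical_dfs(
        dataframe, attributes_df, threshold=5, fit=True)
    encoded = pd.concat([encode_low_cardinality_categorical_df(low_card_df, fit=True),
                         encode_high_cardinality_categorical_df(high_card_df, fit=True)],
                        axis=1)
    # Scale encoded features; swap in features_standardization if the model prefers zero-mean inputs
    return features_normalization(encoded, fit=True)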
Example #11
def save_activations(idx, activation, dump_path):
    myutils.mkdir(dump_path)
    myutils.pickle_obj(
        activation, os.path.join(dump_path,
                                 'model_{}_activations'.format(idx)))