Esempio n. 1
0
    def __init__(self,
                 modelpath,
                 input_max_len=INPUT_MAX_LEN,
                 output_max_len=OUTPUT_MAX_LEN):
        """Restore a trained attention model from *modelpath*.

        Hyperparameters are parsed from the underscore-separated fields of
        the weight file's basename; vocabulary indices are loaded from the
        ``<model_name>_input_index.csv`` / ``<model_name>_output_index.csv``
        files.
        """
        self.modelpath = modelpath

        # Basename layout (underscore-separated): field 0 is the model name,
        # fields 4-8 carry hyperparameters.  The [:-5] slice strips the
        # 5-character file extension from the last field (presumably
        # '.hdf5' — TODO confirm against the training-side naming code).
        parts = os.path.basename(modelpath).split('_')
        self.model_name = parts[0]
        self.x_max_len = input_max_len
        self.y_max_len = output_max_len
        self.x_ix_to_word = load_index(f'{self.model_name}_input_index.csv')
        self.y_ix_to_word = load_index(f'{self.model_name}_output_index.csv')
        self.embedding_dim = int(parts[4])
        self.hidden_dim = int(parts[5])
        self.layer_num = int(parts[6])
        self.dropout = float(parts[7])
        self.learning_rate = float(parts[8][:-5])

        # Rebuild the architecture from the parsed hyperparameters, then
        # restore the trained weights.
        self.model = AttentionModel(self.model_name,
                                    self.x_max_len, len(self.x_ix_to_word),
                                    self.y_max_len, len(self.y_ix_to_word),
                                    self.hidden_dim, self.layer_num,
                                    self.learning_rate, self.dropout,
                                    self.embedding_dim)
        self.model.load_weights(self.modelpath)
Esempio n. 2
0
class CardinalLSTMTransformer(TransformerMixin, BaseEstimator):
    """Scikit-learn transformer that predicts the spoken ('after') form of
    tokens classified as CARDINAL, using a pre-trained seq2seq attention
    model.

    The model's hyperparameters are parsed from the underscore-separated
    basename of the weight file; vocabulary indices are read from the
    ``<model_name>_input_index.csv`` / ``<model_name>_output_index.csv``
    files.
    """

    def __init__(self,
                 modelpath,
                 input_max_len=INPUT_MAX_LEN,
                 output_max_len=OUTPUT_MAX_LEN):
        """Load the pre-trained model and its vocabularies from *modelpath*."""
        self.modelpath = modelpath

        # Basename layout (underscore-separated): field 0 is the model name,
        # fields 4-8 carry hyperparameters.  The [0:-5] slice strips the
        # 5-character file extension from the last field (presumably
        # '.hdf5' — TODO confirm against the training-side naming code).
        meta = os.path.basename(self.modelpath).split('_')
        self.model_name = meta[0]
        self.x_max_len = input_max_len
        self.y_max_len = output_max_len
        self.x_ix_to_word = load_index(f'{self.model_name}_input_index.csv')
        self.y_ix_to_word = load_index(f'{self.model_name}_output_index.csv')
        self.embedding_dim = int(meta[4])
        self.hidden_dim = int(meta[5])
        self.layer_num = int(meta[6])
        self.learning_rate = float(meta[8][0:-5])
        self.dropout = float(meta[7])

        # Rebuild the architecture, then restore the trained weights.
        self.model = AttentionModel(self.model_name, self.x_max_len,
                                    len(self.x_ix_to_word), self.y_max_len,
                                    len(self.y_ix_to_word), self.hidden_dim,
                                    self.layer_num, self.learning_rate,
                                    self.dropout, self.embedding_dim)
        self.model.load_weights(self.modelpath)

    def fit(self, X: pd.DataFrame, y=None, *args, **kwargs):
        """No-op: the model is pre-trained, so there is nothing to fit."""
        return self

    def transform(self, df: pd.DataFrame, y=None, *args, **kwargs):
        """Fill the 'after' column for rows whose 'class' is 'CARDINAL'.

        Builds a lowercase context string per CARDINAL row — two tokens on
        each side, with the target 'before' token split into
        space-separated characters — runs the model on it, and writes the
        decoded predictions into 'after' (preserving any existing non-null
        'after' values).  Non-CARDINAL rows are left untouched.
        """
        cardinal_ixs = df[df['class'] == 'CARDINAL'].index

        # ' '.join applied to a string yields its characters space-separated
        # (equivalent to ' '.join(list(s)), without the throwaway list).
        x_series = (df.loc[cardinal_ixs, 'prev_prev'].map(str) + ' '
                    + df.loc[cardinal_ixs, 'prev'].map(str) + ' '
                    + df.loc[cardinal_ixs, 'before'].map(' '.join) + ' '
                    + df.loc[cardinal_ixs, 'next'].map(str) + ' '
                    + df.loc[cardinal_ixs, 'next_next'].map(str)).str.lower()

        x, _, _ = prepare_matrix(x_series, self.x_max_len,
                                 len(self.x_ix_to_word),
                                 f'{self.model_name}_input_index.csv')
        del x_series  # free the intermediate before running the model

        y_predict = words_list(self.model.test(x), self.y_ix_to_word)
        del x

        if 'after' in df.columns:
            # Keep existing non-null 'after' values; only fill the gaps
            # with the model's predictions.
            return df.assign(after=df['after'].combine_first(
                pd.Series(y_predict, index=cardinal_ixs)))
        else:
            return df.assign(
                after=pd.Series(y_predict, index=cardinal_ixs))
def _print_sparse_stats(label, m):
    """Print dtype, shape, density and memory footprint of sparse matrix *m*."""
    print(f'{label} type={m.dtype}, '
          f'size={m.shape}, '
          f'density={m.nnz / m.shape[0] / m.shape[1]},'
          f'{sparse_memory_usage(m):9.3} Mb')


def train(model_name,
          df: pd.DataFrame,
          input_max_len, input_vocab_len, output_max_len, output_vocab_len,
          hidden_dim, layer_num, learning_rate, dropout, embedding_dim,
          epochs, mem_size, batch_size):
    """Train an attention model on the 'before' -> 'after' pairs in *df*.

    Encodes both columns into sparse index matrices (persisting the
    vocabulary index CSVs as a side effect), holds out 4% of rows for
    evaluation, trains, prints array- and string-level accuracies, and
    returns a DataFrame with 'before', 'actual' and 'predict' columns for
    the test split.
    """
    X, X_ix_to_word, X_word_to_ix = prepare_matrix(df['before'],
                                                   input_max_len,
                                                   input_vocab_len,
                                                   f'{model_name}_input_index.csv')

    y, y_ix_to_word, y_word_to_ix = prepare_matrix(df['after'],
                                                   output_max_len,
                                                   output_vocab_len,
                                                   f'{model_name}_output_index.csv')

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.04)
    _print_sparse_stats('x train', X_train)
    _print_sparse_stats('y train', y_train)
    _print_sparse_stats('x test', X_test)
    _print_sparse_stats('y test', y_test)

    # Release the full matrices before training to keep peak memory down.
    del X, y
    gc.collect()

    model = AttentionModel(model_name, input_max_len, len(X_ix_to_word), output_max_len, len(y_ix_to_word),
                           hidden_dim, layer_num, learning_rate, dropout, embedding_dim)

    model.train(X_train, y_train, X_test, y_test, epochs, mem_size, batch_size)

    # Exact-match accuracy on the raw index arrays (every position equal).
    y_predict = model.test(X_test)
    print('array acc', np.mean(np.all(y_predict == y_test, axis=1)))

    # Decode indices back to words for a human-readable comparison.
    y_predict_str = words_list(y_predict, y_ix_to_word)
    X_str = words_list(X_test.toarray(), X_ix_to_word)
    y_str = words_list(y_test.toarray(), y_ix_to_word)
    result_df = pd.DataFrame(data={'before': X_str, 'actual': y_str, 'predict': y_predict_str})

    print('str acc', len(result_df[result_df['actual'] == result_df['predict']])/len(result_df))

    return result_df