Code Example #1
def load_data():
    x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
    vocabulary_inv = {
        key: value
        for key, value in enumerate(vocabulary_inv_list)
    }
    y = y.argmax(axis=1)
    # Shuffle data
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x = x[shuffle_indices]
    y = y[shuffle_indices]
    train_len = int(len(x) * 0.9)
    x_train = x[:train_len]
    y_train = y[:train_len]
    x_test = x[train_len:]
    y_test = y[train_len:]
    embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                       vocabulary_inv,
                                       num_features=300,
                                       min_word_count=1,
                                       context=10)
    x_train = np.array([[embedding_weights[j] for j in i] for i in x_train])
    x_test = np.array([[embedding_weights[j] for j in i] for i in x_test])

    return x_train, y_train, x_test, y_test, vocabulary_inv
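
Every example on this page delegates to a train_word2vec helper from a w2v module. Implementations vary across these projects, but the call sites agree on the contract: an index-encoded sentence matrix goes in, and a mapping from word index to embedding vector comes out. A minimal sketch, assuming gensim 4.x and the {index: word} dict form of vocabulary_inv used in Code Example #1 (the real helpers typically also cache the trained model to disk):

import numpy as np
from gensim.models import Word2Vec

def train_word2vec(sentence_matrix, vocabulary_inv,
                   num_features=300, min_word_count=1, context=10):
    """Train word2vec on index-encoded sentences; return {word index: vector}."""
    # Convert rows of word indices back into token lists for gensim.
    sentences = [[vocabulary_inv[idx] for idx in row] for row in sentence_matrix]
    model = Word2Vec(sentences,
                     vector_size=num_features,  # `size=` before gensim 4.0
                     min_count=min_word_count,
                     window=context)
    # Words pruned by min_count fall back to small random vectors.
    return {idx: model.wv[word] if word in model.wv
            else np.random.uniform(-0.25, 0.25, num_features).astype(np.float32)
            for idx, word in vocabulary_inv.items()}
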
Code Example #2
def get_embedding_weights(train_x,
                          test_x,
                          vocabulary_inv,
                          min_count=1,
                          context=10):
    x = np.concatenate((train_x, test_x), axis=0)
    # w2c_len (the embedding size) is presumably a module-level constant
    # in the source project.
    return train_word2vec(x, vocabulary_inv, w2c_len, min_count, context)
Code Example #3
def create_model():
    # `slash` (path separator) and `vocabulary_inv` are presumably
    # module-level globals in the source project.
    final_paths = "files_path.txt"
    model_path = "models" + slash + "model_5E.model"
    embedding_weights = train_word2vec(vocabulary_inv,
                                       final_paths=final_paths,
                                       model_paths=model_path,
                                       option="load")

    model = make_configuration()
    model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

    initialize_weights(model, embedding_weights)

    model.save("trained_models" + slash + "model_one")
    model.save_weights("trained_models" + slash + "model_one_weight")

    return model
Code Example #4
import numpy as np
import keras
# NB: this example relies on legacy APIs (the old Keras Graph container,
# nb_filter/filter_length layer arguments, sklearn.cross_validation)
# that are gone from current releases.
from keras.models import Sequential, Graph
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.layers.recurrent import LSTM
from keras.utils import np_utils, generic_utils
import data_helpers
from w2v import train_word2vec
from sklearn.cross_validation import StratifiedKFold

(cnn_train, lstm_train, Y_train,
 cnn_vocabulary, cnn_vocabulary_inv,
 lstm_vocabulary, lstm_vocabulary_inv) = data_helpers.load_data()
cnn_embedding_weights, lstm_embedding_weights = train_word2vec(
    cnn_train, cnn_vocabulary_inv, lstm_train, lstm_vocabulary_inv)
#cnn_train=cnn_embedding_weights[0][cnn_train]
#lstm_train=lstm_embedding_weights[0][lstm_train]

shuffle_indices = np.random.permutation(np.arange(len(Y_train)))
cnn_shuffled = cnn_train[shuffle_indices]
lstm_shuffled = lstm_train[shuffle_indices]
Y_train = Y_train[shuffle_indices]
#Y_train_f=np_utils.to_categorical(Y_train,27)

filter_sizes = (3, 4)
num_filters = 150
hidden_dims = 150

cnn_graph = Graph()
cnn_graph.add_input(name='input', input_shape=(32, 300))
for fsz in filter_sizes:
    conv = Convolution1D(nb_filter=num_filters,
                         filter_length=fsz,
                         border_mode='valid',
                         activation='relu',
                         subsample_length=1)
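
Code Example #4 is truncated and targets the long-removed Keras Graph container. For reference, here is a minimal sketch of the same parallel-convolution pattern in the functional API; the shapes and hyperparameters reuse the values defined above, and the modern layer names are an update of mine, not the project's own code:

from keras.layers import Input, Conv1D, MaxPooling1D, Flatten, Concatenate
from keras.models import Model

graph_in = Input(shape=(32, 300))  # (sequence_length, embedding_dim)
branches = []
for fsz in filter_sizes:
    conv = Conv1D(filters=num_filters,
                  kernel_size=fsz,
                  padding='valid',
                  activation='relu',
                  strides=1)(graph_in)
    pool = MaxPooling1D(pool_size=2)(conv)
    branches.append(Flatten()(pool))
merged = Concatenate()(branches) if len(branches) > 1 else branches[0]
cnn_graph = Model(graph_in, merged)
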
Code Example #5
File: train.py  Project: Zoloman/Keras-NLC
import numpy as np
from keras.models import Sequential
from keras.layers import *
import data_helpers
from w2v import train_word2vec

embedding_dim = 150  # The size of the word vectors

batch_size = 32  # Batch Size for Neural Network training
num_epochs = 1  # Number of Epochs to run training
val_split = 0.2  # Percentage of data to be used for validation

min_word_count = 1  # Minimum word count in sentences
context_window_size = 12  # Size for context window

x, y, vocabulary, vocabulary_inv = data_helpers.load_data()  # Load training data and vocabulary from data_helpers

embedding_weights = train_word2vec(
    x, vocabulary_inv, embedding_dim, min_word_count,
    context_window_size)  # Train or Load weights for Word Vectors

# Shuffle the data
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices].argmax(axis=1)

print("Vocabulary Size: " + str(len(vocabulary)))

# Declare the model
model = Sequential()
model.add(
    Embedding(len(vocabulary),
              embedding_dim,
              input_length=119,
Code Example #6
num_epochs = 100
val_split = 0.1

# Word2Vec parameters, see train_word2vec
min_word_count = 1  # Minimum word count                        
context = 10        # Context window size    

# Data Preparation
# ==================================================
#
# Load data
print("Loading data...")
x, y, vocabulary, vocabulary_inv = data_helpers.load_data()

if model_variation=='CNN-non-static' or model_variation=='CNN-static':
    embedding_weights = train_word2vec(x, vocabulary_inv, embedding_dim, min_word_count, context)
    if model_variation=='CNN-static':
        x = embedding_weights[0][x]
elif model_variation=='CNN-rand':
    embedding_weights = None
else:
    raise ValueError('Unknown model variation')    

# Shuffle data
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices].argmax(axis=1)

print("Vocabulary Size: {:d}".format(len(vocabulary)))

# Building model
Code Example #7
def create_model(x_train, event_x_train, ners_x_train, sent_x_train, y_train,
                 x_test, event_x_test, ners_x_test, sent_x_test, y_test,
                 vocabulary, vocabulary_inv):

    global sequence_length, events_seq_length, ners_seq_length, sent2vec_seq_length

    #adjust seq lengths
    if sequence_length != x_test.shape[1]:
        print("Adjusting sequence length for actual size: {:d}".format(
            x_test.shape[1]))
        sequence_length = x_test.shape[1]

    if events_seq_length != event_x_test.shape[1]:
        print("Adjusting event sequence length for actual size: {:d}".format(
            event_x_test.shape[1]))
        events_seq_length = event_x_test.shape[1]

    if ners_seq_length != ners_x_test.shape[1]:
        print("Adjusting ners sequence length for actual size: {:d}".format(
            ners_x_test.shape[1]))
        ners_seq_length = ners_x_test.shape[1]

    if sent2vec_seq_length != sent_x_test.shape[1]:
        print(
            "Adjusting sent2vec sequence length for actual size: {:d}".format(
                sent_x_test.shape[1]))
        sent2vec_seq_length = sent_x_test.shape[1]

    # Prepare embedding layer weights and convert inputs for static model
    embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                       vocabulary_inv,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context)

    # Build model
    input_shape = (sequence_length, )
    model_input = Input(shape=input_shape, name="Input_Layer")
    embed = Embedding(len(vocabulary_inv),
                      embedding_dim,
                      input_length=sequence_length,
                      name="embedding_layer")(model_input)
    embed = Dropout(dropout_prob[0], name="embedding_dropout_layer")(embed)

    # Convolutional block
    conv_blocks = []
    for sz in filter_sizes:
        conv = Convolution1D(filters=num_filters,
                             kernel_size=sz,
                             padding="valid",
                             activation="relu",
                             strides=1,
                             name="conv_layer_" + str(sz))(embed)
        conv = MaxPooling1D(pool_size=2,
                            name="conv_maxpool_layer_" + str(sz))(conv)
        conv = Flatten(name="conv_flatten_layer_" + str(sz))(conv)
        conv_blocks.append(conv)

    #concatenate conv layers
    conv_layers = Concatenate(name="conv_concate_layer")(
        conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
    flated_conv_layers = Dropout(dropout_prob[1],
                                 name="concate_dropout_layer")(conv_layers)

    #add events input layer
    events_input_layer = Input(shape=(events_seq_length, ),
                               name="event_input_layer")
    events_dense = Dense(int((events_seq_length / 2)),
                         activation="relu",
                         name="event_dense_layer")(events_input_layer)

    #add ners input layer
    ners_input_layer = Input(shape=(ners_seq_length, ), name="ner_input_layer")
    ners_dense = Dense(int((ners_seq_length / 2)),
                       activation="relu",
                       name="ners_dense_layer")(ners_input_layer)

    #add sent2vec input layer
    sent2vec_input_layer = Input(shape=(700, ), name="sent2vec_input_layer")
    sent2vec_dense_layer = Dense(
        350, activation="relu",
        name="sent2vec_dense_layer")(sent2vec_input_layer)

    #merge all input layers
    merged = concatenate(
        [flated_conv_layers, events_dense, ners_dense, sent2vec_dense_layer],
        name="conv_event_ner_sent2vec_merge_layer")

    #convolution layer for the concatenated features
    merged_reshaped = Reshape((6, 357))(merged)
    merged_conv = Convolution1D(filters=10,
                                kernel_size=3,
                                padding="valid",
                                activation="relu",
                                strides=1,
                                name="merged_conv_layer_" +
                                str(3))(merged_reshaped)
    merged_conv = MaxPooling1D(pool_size=2,
                               name="merged_conv_maxpool_layer_" +
                               str(3))(merged_conv)
    merged_conv = Flatten(name="merged_conv_flatten_layer_" +
                          str(3))(merged_conv)

    #dense layer
    dense = Dense(hidden_dims,
                  activation="relu",
                  name="conv_event_merge_dense_layer")(merged_conv)
    model_output = Dense(num_labels, activation="softmax",
                         name="Output_layer")(dense)

    #create model
    model = Model([
        model_input, events_input_layer, ners_input_layer, sent2vec_input_layer
    ], model_output)
    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])

    return model, embedding_weights
Code Example #8
def run(embedding_dim, filter_sizes, num_filters, dropout_prob, hidden_dims,
        batch_size, num_epochs, sequence_length, max_words, min_word_count,
        context):
    # Data Preparation
    print("Load data...")
    x_train, y_train, x_test, y_test, vocabulary_inv = load_data(
        data_source, max_words, sequence_length)
    print(x_test.shape[1])
    print(sequence_length)
    if sequence_length != x_test.shape[1]:
        print("Adjusting sequence length for actual size")
        sequence_length = x_test.shape[1]

    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)
    print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

    # Prepare embedding layer weights and convert inputs for static model
    print("Model type is", model_type)
    if model_type in ["CNN-non-static", "CNN-static"]:
        embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                           vocabulary_inv,
                                           num_features=embedding_dim,
                                           min_word_count=min_word_count,
                                           context=context)
        if model_type == "CNN-static":
            x_train = np.stack([
                np.stack([embedding_weights[word] for word in sentence])
                for sentence in x_train
            ])
            x_test = np.stack([
                np.stack([embedding_weights[word] for word in sentence])
                for sentence in x_test
            ])
            print("x_train static shape:", x_train.shape)
            print("x_test static shape:", x_test.shape)

    elif model_type == "CNN-rand":
        embedding_weights = None
    else:
        raise ValueError("Unknown model type")

    # Build model
    if model_type == "CNN-static":
        input_shape = (sequence_length, embedding_dim)
    else:
        input_shape = (sequence_length, )

    model_input = Input(shape=input_shape)

    # Static model does not have embedding layer
    if model_type == "CNN-static":
        z = model_input
    else:
        z = Embedding(len(vocabulary_inv),
                      embedding_dim,
                      input_length=sequence_length,
                      name="embedding")(model_input)

    z = Dropout(dropout_prob[0])(z)

    # Convolutional block
    conv_blocks = []
    for sz in filter_sizes:
        conv = Convolution1D(filters=num_filters,
                             kernel_size=sz,
                             padding="valid",
                             activation="relu",
                             strides=1)(z)
        conv = MaxPooling1D(pool_size=2)(conv)
        conv = Flatten()(conv)
        conv_blocks.append(conv)
    z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

    z = Dropout(dropout_prob[1])(z)
    z = Dense(hidden_dims, activation="relu")(z)
    model_output = Dense(1, activation="sigmoid")(z)

    model = Model(model_input, model_output)
    model.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])

    # Initialize weights with word2vec
    if model_type == "CNN-non-static":
        weights = np.array([v for v in embedding_weights.values()])
        print("Initializing embedding layer with word2vec weights, shape",
              weights.shape)
        embedding_layer = model.get_layer("embedding")
        embedding_layer.set_weights([weights])

    # Train the model
    model.fit(x_train,
              y_train,
              batch_size=batch_size,
              epochs=num_epochs,
              validation_data=(x_test, y_test),
              verbose=2)
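
A hypothetical invocation of run(); every value below is an illustrative placeholder, and note that data_source and model_type are read as module-level globals inside the function:

# Illustrative values only; data_source and model_type must be defined
# at module level before calling run().
run(embedding_dim=50,
    filter_sizes=(3, 8),
    num_filters=10,
    dropout_prob=(0.5, 0.8),
    hidden_dims=50,
    batch_size=64,
    num_epochs=10,
    sequence_length=400,
    max_words=5000,
    min_word_count=1,
    context=10)
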
Code Example #9
    def build_model(self):
        print("=" * 120)
        print("Model type is", self.model_type)
        if self.model_type in ["non-static", "static"]:
            embedding_weights = train_word2vec(
                self.data_source,
                np.vstack((self.x_train, self.x_test)),
                self.vocabulary_inv,
                num_features=self.embedding_dim,
                min_word_count=self.min_word_count,
                context=self.context)
            if self.model_type == "static":
                self.x_train = np.stack([
                    np.stack([embedding_weights[word] for word in sentence])
                    for sentence in self.x_train
                ])
                self.x_test = np.stack([
                    np.stack([embedding_weights[word] for word in sentence])
                    for sentence in self.x_test
                ])
                print("x_train static shape:", self.x_train.shape)
                print("x_test static shape:", self.x_test.shape)

        elif self.model_type == "rand":
            embedding_weights = None
        else:
            raise ValueError("Unknown model type")

        # Build the model
        if self.model_type == "static":
            input_shape = (self.sequence_length, self.embedding_dim)
        else:
            input_shape = (self.sequence_length, )

        model_input = Input(shape=input_shape)

        # Static model has no embedding layer
        if self.model_type == "static":
            z = model_input
        else:
            z = Embedding(len(self.vocabulary_inv),
                          self.embedding_dim,
                          input_length=self.sequence_length,
                          name="embedding")(model_input)

        z = Dropout(self.dropout_prob[0])(z)

        # Convolutional blocks
        conv_blocks = []
        # Bidirectional LSTM blocks
        lstm_blocks = []
        for sz in self.filter_sizes:
            convo = Convolution1D(filters=self.num_filters,
                                  kernel_size=sz,
                                  padding="valid",
                                  activation="relu",
                                  use_bias=True,
                                  bias_initializer='zeros',
                                  strides=1)(z)
            conv = MaxPooling1D(pool_size=2)(convo)
            lstm = Bidirectional(LSTM(self.hidden_dims,
                                      dropout=self.dropout_prob[1]),
                                 input_shape=(int(
                                     (self.sequence_length - sz + 1) / 2),
                                              self.num_filters))(conv)
            conv = Flatten()(conv)
            conv_blocks.append(conv)

            lstm_blocks.append(lstm)
        z = Concatenate()(
            conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
        l = Concatenate()(
            lstm_blocks) if len(lstm_blocks) > 1 else lstm_blocks[0]

        z = Dropout(self.dropout_prob[2])(z)
        z = Dense(self.hidden_dims, activation="relu")(z)
        z = Concatenate()([z, l])
        # z = l
        model_output = Dense(1, activation="sigmoid")(z)

        self.model = Model(model_input, model_output)
        self.model.compile(loss="binary_crossentropy",
                           optimizer="adam",
                           metrics=["accuracy"])

        # Initialize weights with word2vec
        if self.model_type == "non-static":
            weights = np.array([v for v in embedding_weights.values()])
            print("Initializing embedding layer with word2vec weights, shape",
                  weights.shape)
            embedding_layer = self.model.get_layer("embedding")
            embedding_layer.set_weights([weights])
Code Example #10
# Data Preparation
print("Load data...")
x_train, y_train, x_test, y_test, vocabulary_inv = load_data(data_source)

if sequence_length != x_test.shape[1]:
    print("Adjusting sequence length for actual size")
    sequence_length = x_test.shape[1]

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

# Prepare embedding layer weights and convert inputs for static model
print("Model type is", model_type)
if model_type in ["CNN-non-static", "CNN-static"]:
    embedding_weights = train_word2vec(np.vstack((x_train, x_test)), vocabulary_inv, num_features=embedding_dim,
                                       min_word_count=min_word_count, context=context)
    if model_type == "CNN-static":
        x_train = np.stack([np.stack([embedding_weights[word] for word in sentence]) for sentence in x_train])
        x_test = np.stack([np.stack([embedding_weights[word] for word in sentence]) for sentence in x_test])
        print("x_train static shape:", x_train.shape)
        print("x_test static shape:", x_test.shape)

elif model_type == "CNN-rand":
    embedding_weights = None
else:
    raise ValueError("Unknown model type")

# Build model
if model_type == "CNN-static":
    input_shape = (sequence_length, embedding_dim)
else:
Code Example #11
File: train.py  Project: belya/troll2vec
    save_parameters(vocabulary_inv, max_length)

    if sequence_length != x_test.shape[1]:
        print("Adjusting sequence length for actual size")
        sequence_length = x_test.shape[1]

    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)
    print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

    # Prepare embedding layer weights and convert inputs for static model
    print("Model type is", model_type)
    if model_type in ["CNN-non-static", "CNN-static"]:
        embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                           vocabulary_inv,
                                           num_features=embedding_dim,
                                           min_word_count=min_word_count,
                                           context=context,
                                           gensim_model=gensim_model)
        if model_type == "CNN-static":
            x_train = np.stack([
                np.stack([embedding_weights[word] for word in sentence])
                for sentence in x_train
            ])
            x_test = np.stack([
                np.stack([embedding_weights[word] for word in sentence])
                for sentence in x_test
            ])
            print("x_train static shape:", x_train.shape)
            print("x_test static shape:", x_test.shape)

    elif model_type == "CNN-rand":
Code Example #12
#     loaded_model_json = json_file.read()
#     json_file.close()
#     model = model_from_json(loaded_model_json)
#     # load weights into new model
#     model.load_weights('model.h5')
#     print('Loaded model from disk')
#     model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])

if True:
    print('Loading Data...')
    start = time.time()
    X, Y, vocab, vocab_inv, max_length, unique_answers = data_helpers.load_data()
    end = time.time()
    print('Loaded data successfully; took', end - start, 'seconds')
    embedding_weights = w2v.train_word2vec(X, vocab_inv, embedding_dim,
                                           min_word_count, context)
    # vocab = json.load(open('json/vocab.json'))
    # vocab_inv = json.load(open('json/vocab_inv.json'))
    # unique_answers = json.load(open('json/answers.json'))
    # max_length = vocab["max_length"]
    # vocab.pop("max_length")
    # embedding_weights = w2v.load_model(vocab_inv, embedding_dim, min_word_count, context)
    graph_in = Input(shape=(max_length, embedding_dim))
    convolutions = []
    for fsz in filter_sizes:
        convolution = Convolution1D(nb_filter=num_filters,
                                    filter_length=fsz,
                                    border_mode='valid',
                                    activation='relu',
                                    subsample_length=1)(graph_in)
        pool = MaxPooling1D(pool_length=2)(convolution)
Code Example #13
File: cnn_TW.py  Project: tommasoc80/evalita2018-rug
print(len(Xtest), 'test samples')

print("Xtrain shape:", Xtrain.shape)
print("Xtest shape:", Xtest.shape)
assert Xtrain.shape[1] == Xtest.shape[1]

if sequence_length != Xtrain.shape[1]:
    print("Adjusting sequence length for actual size")
    sequence_length = Xtrain.shape[1]

# Prepare embedding layer weights and convert inputs for static model
print("Model type is", model_type)
if model_type in ["CNN-non-static", "CNN-static"]:
    embedding_weights = train_word2vec(X,
                                       vocabulary_inv,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context)
    # Below if-clause does not concern us since we will use CNN-non-static model
    if model_type == "CNN-static":
        x = np.stack([
            np.stack([embedding_weights[word] for word in sentence])
            for sentence in x
        ])
        # x_test = np.stack([np.stack([embedding_weights[word] for word in sentence]) for sentence in x_test])
        print("x static shape:", x.shape)
        # print("x_test static shape:", x_test.shape)

elif model_type == "CNN-rand":
    embedding_weights = None
else:
Code Example #14
vocabulary_inv = {key: value for key, value in enumerate(vocabulary_inv_list)}
shuffle_indices = np.random.permutation(np.arange(len(y)))
x = x[shuffle_indices]
y = y[shuffle_indices]
train_len = int(len(x) * split)
x_train = x[:train_len]
y_train = y[:train_len]
x_test = x[train_len:]
y_test = y[train_len:]
print('Input data built! W2V')
### Train word2vec
from w2v import train_word2vec
min_word_count = 3
context = 2
embedding_dim = 30
embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                   'node_embeddings',
                                   vocabulary_inv,
                                   num_features=embedding_dim,
                                   min_word_count=min_word_count,
                                   context=context)

### NN
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding, ZeroPadding1D
from keras.layers.merge import Concatenate
from keras import callbacks
from sklearn import metrics
# Model Hyperparameters
filter_sizes = (3, 8)
num_filters = 20 #10
dropout_prob = (0.5, 0.8)
hidden_dims = 30
batch_size = 64
num_epochs = 1
sequence_length = 3 #400
Code Example #15
        print("Word index : \n", len(word_index))

        if sequence_length != x_test.shape[1]:
            print("Adjusting sequence length for actual size")
            sequence_length = x_test.shape[1]

        print("x_train shape:", x_train.shape)
        print("x_test shape:", x_test.shape)
        print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

        # Prepare embedding layer weights and convert inputs for static model
        print("Model type is", model_type)
        if model_type in ["CNN-non-static", "CNN-static"]:
            embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                               vocabulary_inv,
                                               file_head=target,
                                               num_features=embedding_dim,
                                               min_word_count=min_word_count,
                                               context=context)

            if model_type == "CNN-static":
                x_train = np.stack([
                    np.stack([embedding_weights[word] for word in sentence])
                    for sentence in x_train
                ])
                x_test = np.stack([
                    np.stack([embedding_weights[word] for word in sentence])
                    for sentence in x_test
                ])
                print("x_train static shape:", x_train.shape)
                print("x_test static shape:", x_test.shape)
Code Example #16
    return x_train, y_train, x_test, y_test, vocabulary_inv


print("Loading Data")
x_train, y_train, x_test, y_test, vocabulary_inv = load_data(dataSource)

if sequenceLength != x_test.shape[1]:
    print("Adjusting sequence length for actual size")
    sequenceLength = x_test.shape[1]

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))
embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                   vocabulary_inv,
                                   num_features=embeddingDim,
                                   minWordCount=minWordCount,
                                   context=contextWindow)

input_shape = (sequenceLength, )

model_input = Input(shape=input_shape)
z = Embedding(len(vocabulary_inv),
              embeddingDim,
              input_length=sequenceLength,
              name="embedding")(model_input)

z = Dropout(dropoutProb[0])(z)

conv_blocks = []
for sz in filterSizes:
Code Example #17
    context = 10  # Context window size

    # Data Preparation
    # ==================================================
    #
    # Load data
    print("Loading data...")
    neg_train_path = './data/imdb_train.neg'
    pos_train_path = './data/imdb_train.pos'

    x, y, vocabulary, vocabulary_inv = data_helpers.load_data(
        pos_train_path, neg_train_path)

    if model_variation == 'CNN-non-static' or model_variation == 'CNN-static':
        embedding_weights = train_word2vec(x, vocabulary_inv, model_variation,
                                           embedding_dim, min_word_count,
                                           context)
        if model_variation == 'CNN-static':
            x = embedding_weights[0][x]
    elif model_variation == 'CNN-rand':
        embedding_weights = None
    else:
        raise ValueError('Unknown model variation')

    # Shuffle data
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices].argmax(axis=1)

    print("Vocabulary Size: {:d}".format(len(vocabulary)))
Code Example #18
File: sammy.py  Project: positivedefinite/sammy
    print("Stat file created!")

### Part 2B: Network definition & word2vec training
### Make sure to delete the existing word2vec model if you want to update it
if sequence_length != x_test.shape[1]:
    print("Adjusting sequence length for actual size")
    sequence_length = x_test.shape[1]

print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

# Prepare embedding layer weights and convert inputs for static model
print("Model type is", model_type)
if model_type in ["CNN-non-static", "CNN-static"]:
    print('Initiating word2vec.')
    embedding_weights = train_word2vec(np.vstack((x_train, x_test, neutral_tweets)),
                                       settings['dataset_name'],
                                       vocabulary_inv,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context)
    print('Word2vec done.')
    if model_type == "CNN-static":
        x_train = np.stack([np.stack([embedding_weights[word] for word in sentence]) for sentence in x_train])
        x_test = np.stack([np.stack([embedding_weights[word] for word in sentence]) for sentence in x_test])
        print("x_train static shape:", x_train.shape)
        print("x_test static shape:", x_test.shape)

elif model_type == "CNN-rand":
    embedding_weights = None
else:
    raise ValueError("Unknown model type")


# Build model
if model_type == "CNN-static":
Code Example #19
    val_acc_nd = []
    val_loss_nd = []
    epoch_nr_nd = []

    for run in range(0, 10, 1):
        # Data Preparation
        print("Load data...")
        x_train, y_train, x_test, y_test, vocabulary_inv = load_data(
            data_source, datalength)

        print("x_train[0]: ", x_train[0])

        # train_word2vec returns embedding_weights of the trained word2vec embedding (see w2v.py)
        embedding_weights = train_word2vec(datalength,
                                           np.vstack((x_train, x_test)),
                                           vocabulary_inv,
                                           num_features=embedding_dim,
                                           min_word_count=min_word_count,
                                           context=context)

        print("x_train static shape:", x_train.shape)
        print("x_test static shape:", x_test.shape)

        print("x_train[0]: ", x_train[0])

        if sequence_length != x_test.shape[1]:
            print("Adjusting sequence length for actual size")
            sequence_length = x_test.shape[1]

        model = create_model(sequence_length)
        history = model.fit(x_train,
                            y_train,
Code Example #20
File: train_CNN_IMDB.py  Project: legalto/NLP-CNN
    # Word2Vec parameters, see train_word2vec
    min_word_count = 1  # Minimum word count
    context = 10        # Context window size

    # Data Preparation
    # ==================================================
    #
    # Load data
    print("Loading data...")
    neg_train_path = './data/imdb_train.neg'
    pos_train_path = './data/imdb_train.pos'

    x, y, vocabulary, vocabulary_inv = data_helpers.load_data(pos_train_path,neg_train_path)

    if model_variation=='CNN-non-static' or model_variation=='CNN-static':
        embedding_weights = train_word2vec(x, vocabulary_inv, model_variation, embedding_dim, min_word_count, context)
        if model_variation=='CNN-static':
            x = embedding_weights[0][x]
    elif model_variation=='CNN-rand':
        embedding_weights = None
    else:
        raise ValueError('Unknown model variation')

    # Shuffle data
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices].argmax(axis=1)

    print("Vocabulary Size: {:d}".format(len(vocabulary)))
Code Example #21
num_epochs = 100
val_split = 0.1

# Word2Vec parameters, see train_word2vec
min_word_count = 1  # Minimum word count
context = 10  # Context window size

# Data Preparation
# ==================================================
#
# Load data
print("Loading data...")
x, y, vocabulary, vocabulary_inv = data_helpers.load_data()

if model_variation == 'CNN-non-static' or model_variation == 'CNN-static':
    embedding_weights = train_word2vec(x, vocabulary_inv, embedding_dim,
                                       min_word_count, context)
    if model_variation == 'CNN-static':
        x = embedding_weights[0][x]
elif model_variation == 'CNN-rand':
    embedding_weights = None
else:
    raise ValueError('Unknown model variation')

# Shuffle data
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices].argmax(axis=1)

print("Vocabulary Size: {:d}".format(len(vocabulary)))

# Building model
Code Example #22
def create_model(x_train,
                 x_test,
                 vocabulary_inv,
                 model_type,
                 embedding_dim,
                 min_word_count,
                 context,
                 sequence_length,
                 dropout_prob,
                 filter_sizes,
                 num_filters,
                 hidden_dims,
                 optimizer="Adagrad"):
    # Prepare embedding layer weights and convert inputs for static model
    print("Model type is", model_type)
    if model_type in ["CNN-non-static", "CNN-static"]:
        embedding_weights = train_word2vec(np.vstack((x_train, x_test)),
                                           vocabulary_inv,
                                           num_features=embedding_dim,
                                           min_word_count=min_word_count,
                                           context=context)
        if model_type == "CNN-static":
            x_train = np.stack([
                np.stack([embedding_weights[word] for word in sentence])
                for sentence in x_train
            ])
            x_test = np.stack([
                np.stack([embedding_weights[word] for word in sentence])
                for sentence in x_test
            ])
            print("x_train static shape:", x_train.shape)
            print("x_test static shape:", x_test.shape)

    elif model_type == "CNN-rand":
        embedding_weights = None
    else:
        raise ValueError("Unknown model type")

    # Build model
    if model_type == "CNN-static":
        input_shape = (sequence_length, embedding_dim)
    else:
        input_shape = (sequence_length, )
    model_input = Input(shape=input_shape)
    # Static model does not have embedding layer
    if model_type == "CNN-static":
        z = model_input
    else:
        z = Embedding(len(vocabulary_inv),
                      embedding_dim,
                      input_length=sequence_length,
                      name="embedding")(model_input)
    z = Dropout(dropout_prob[0])(z)
    # Convolutional block
    conv_blocks = []
    for sz in filter_sizes:
        conv = Convolution1D(filters=num_filters,
                             kernel_size=sz,
                             padding="valid",
                             activation="relu",
                             strides=1)(z)
        conv = MaxPooling1D(pool_size=2)(conv)
        conv = Flatten()(conv)
        conv_blocks.append(conv)
    z = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]
    z = Dropout(dropout_prob[1])(z)
    z = Dense(hidden_dims, activation="relu")(z)
    model_output = Dense(1, activation="sigmoid")(z)
    model = Model(model_input, model_output)
    model.compile(loss="binary_crossentropy",
                  optimizer=optimizer,
                  metrics=["accuracy"])
    # Initialize weights with word2vec
    if model_type == "CNN-non-static":
        weights = np.array([v for v in embedding_weights.values()])
        print("Initializing embedding layer with word2vec weights, shape",
              weights.shape)
        embedding_layer = model.get_layer("embedding")
        embedding_layer.set_weights([weights])
    return model
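
For completeness, a hedged usage sketch of create_model(): x_train, x_test, and vocabulary_inv would come from a load_data() helper as in the earlier examples, and every hyperparameter value below is an illustrative assumption, not a setting from the source project:

# Hypothetical call; all values are illustrative assumptions.
model = create_model(x_train, x_test, vocabulary_inv,
                     model_type="CNN-non-static",
                     embedding_dim=50,
                     min_word_count=1,
                     context=10,
                     sequence_length=x_train.shape[1],
                     dropout_prob=(0.5, 0.8),
                     filter_sizes=(3, 8),
                     num_filters=10,
                     hidden_dims=50)
model.fit(x_train, y_train,
          batch_size=64, epochs=10,
          validation_data=(x_test, y_test), verbose=2)
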