def get_model(encoding):
    return CSM(
        layers=[
            DictionaryEncoding(vocabulary=encoding),

            WordEmbedding(
                dimension={{embedding_dimension}},
                vocabulary_size=len(encoding),
                padding=encoding['PADDING']),

            {% for layer in word_layers %}
            {% set layer_index = loop.index0 %}

            SentenceConvolution(
                n_feature_maps={{layer.n_feature_maps}},
                kernel_width={{layer.kernel_width}},
                n_channels={{layer.n_channels}},
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps={{layer.n_feature_maps}}),

            KMaxPooling(
                k={{layer.k_pooling}},
                k_dynamic={{layer.k_dynamic}} if {{layer.k_dynamic}} > 0 else None),

            {{layer.nonlinearity}}(),

            {% endfor %}

            ReshapeForDocuments(),

            {% for layer in sentence_layers %}
            {% set layer_index = loop.index0 %}

            SentenceConvolution(
                n_feature_maps={{layer.n_feature_maps}},
                kernel_width={{layer.kernel_width}},
                n_channels={{layer.n_channels}},
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps={{layer.n_feature_maps}}),

            KMaxPooling(
                k={{layer.k_pooling}},
                k_dynamic={{layer.k_dynamic}} if {{layer.k_dynamic}} > 0 else None),

            {{layer.nonlinearity}}(),

            {% endfor %}

            {% if dropout %}
            Dropout(('b', 'd', 'f', 'w'), 0.5),
            {% endif %}

            Softmax(
                n_classes={{n_classes}},
                n_input_dimensions={{softmax_input_dimensions}}),
        ])
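# The factory above is a Jinja2 template rather than plain Python: the {{...}}
# and {%...%} markers are substituted before the resulting source is executed.
# Below is a minimal rendering sketch. The template filename and the concrete
# hyperparameter values are illustrative assumptions (the word-layer values
# mirror the hand-written document model further down), not something the repo
# ships.
import jinja2

with open("model_template.tmpl") as template_file:  # hypothetical filename
    template_source = template_file.read()

model_source = jinja2.Template(template_source).render(
    embedding_dimension=20,
    word_layers=[
        {'n_feature_maps': 10, 'kernel_width': 15, 'n_channels': 20,
         'k_pooling': 7, 'k_dynamic': 0.5, 'nonlinearity': 'Tanh'},
    ],
    sentence_layers=[
        {'n_feature_maps': 30, 'kernel_width': 9, 'n_channels': 10,
         'k_pooling': 5, 'k_dynamic': -1, 'nonlinearity': 'Tanh'},
    ],
    dropout=True,
    n_classes=2,
    softmax_input_dimensions=150,  # illustrative; depends on the layer sizes
)
# model_source now contains an ordinary Python get_model definition.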
def model_one_layer_variant_2(alphabet):
    return CSM(layers=[
        DictionaryEncoding(vocabulary=alphabet),
        WordEmbedding(
            dimension=42,
            vocabulary_size=len(alphabet)),
        SentenceConvolution(
            n_feature_maps=5,
            kernel_width=6,
            n_channels=1,
            n_input_dimensions=42),
        SumFolding(),
        KMaxPooling(k=4),
        Bias(
            n_input_dims=21,
            n_feature_maps=5),
        Tanh(),
        Softmax(
            n_classes=2,
            n_input_dimensions=420),
    ])
def model_one_layer_large_embedding(alphabet):
    return CSM(layers=[
        DictionaryEncoding(vocabulary=alphabet),
        WordEmbedding(
            dimension=32 * 4,
            vocabulary_size=len(alphabet)),
        SentenceConvolution(
            n_feature_maps=5,
            kernel_width=10,
            n_channels=1,
            n_input_dimensions=32 * 4),
        Relu(),
        SumFolding(),
        SumFolding(),
        SumFolding(),
        KMaxPooling(k=7),
        Bias(
            n_input_dims=16,
            n_feature_maps=5),
        Tanh(),
        MaxFolding(),
        Softmax(
            n_classes=2,
            n_input_dimensions=280),
    ])
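# Not part of the repo: a small sanity check on the Softmax input sizes used in
# the two factories above, assuming each SumFolding/MaxFolding layer halves the
# embedding dimension and KMaxPooling(k) keeps k positions per feature map.
def softmax_input_size(n_feature_maps, k, embedding_dimension, n_foldings):
    return n_feature_maps * k * (embedding_dimension // 2 ** n_foldings)

assert softmax_input_size(5, 4, 42, 1) == 420      # model_one_layer_variant_2
assert softmax_input_size(5, 7, 32 * 4, 4) == 280  # model_one_layer_large_embedding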
def txtnets_model_from_gensim_word2vec(gensim_model):
    # build vocabulary mapping
    encoding = {}
    for index, word in enumerate(gensim_model.index2word):
        encoding[word] = index
    encoding['PADDING'] = len(encoding)

    vocabulary_size = len(encoding)
    embedding_dim = gensim_model.syn0.shape[1]

    E = np.concatenate(
        [gensim_model.syn0, np.zeros((1, embedding_dim))],
        axis=0)

    txtnets_model = CSM(layers=[
        DictionaryEncoding(vocabulary=encoding),
        WordEmbedding(
            vocabulary_size=vocabulary_size,
            dimension=embedding_dim,
            padding=encoding['PADDING'],
            E=E,
        )
    ])

    return txtnets_model
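# A usage sketch for the converter above. It assumes an older gensim release in
# which Word2Vec exposes index2word and syn0 directly (the converter already
# relies on both); the model path is hypothetical.
from gensim.models import Word2Vec

gensim_model = Word2Vec.load("word2vec.model")  # hypothetical path
txtnets_model = txtnets_model_from_gensim_word2vec(gensim_model)
# The resulting CSM looks words up in the pretrained embedding matrix E;
# the PADDING row appended by the converter is all zeros.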
# This model expects lists of words.
X = [x.split(" ") for x in X]

train_data_provider = LabelledSequenceMinibatchProvider(
    X=X[:-500],
    Y=Y[:-500],
    batch_size=100)

print train_data_provider.batches_per_epoch

validation_data_provider = LabelledSequenceMinibatchProvider(
    X=X[-500:],
    Y=Y[-500:],
    batch_size=500)

word_embedding_model = CSM(layers=[
    WordEmbedding(  # really a character embedding
        dimension=16,
        vocabulary_size=len(alphabet)),
    SentenceConvolution(
        n_feature_maps=10,
        kernel_width=5,
        n_channels=1,
        n_input_dimensions=16),
    SumFolding(),
    KMaxPooling(k=2),
    MaxFolding(),
    Tanh(),
])

word_embedding = WordFromCharacterEmbedding(
    embedding_model=word_embedding_model,
    alphabet_encoding=alphabet)

# print word_embedding.fprop(X, meta)
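# A sketch of driving the character-level word embedding from the provider
# above. That next_batch() returns (X, Y, meta) is an assumption based on how
# the minibatch providers are used in the training scripts; the commented-out
# fprop call above shows the intended entry point.
X_batch, Y_batch, meta_batch = train_data_provider.next_batch()
print word_embedding.fprop(X_batch, meta_batch)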
    os.path.join(os.environ['DATA'], "words", "words.alphabet.encoding.json"))

train_data_provider = PaddedSequenceMinibatchProvider(
    X=data,
    padding=alphabet['PADDING'],
    batch_size=100)

embedding_dimension = 8
vocabulary_size = len(alphabet)
n_feature_maps = 8
kernel_width = 5
pooling_size = 2
n_epochs = 1

model = CSM(layers=[
    WordEmbedding(
        dimension=embedding_dimension,
        vocabulary_size=len(alphabet)),
    SentenceConvolution(
        n_feature_maps=n_feature_maps,
        kernel_width=kernel_width,
        n_channels=1,
        n_input_dimensions=embedding_dimension),
    SumFolding(),
    KMaxPooling(k=pooling_size),
    # Bias(
    #     n_input_dims=embedding_dimension / 2,
    #     n_feature_maps=n_feature_maps),
    Linear(
        n_input=n_feature_maps * pooling_size * embedding_dimension / 2,
        n_output=64),
    Tanh(),
    Linear(
        n_output=1,
        n_input=64),
])
X_valid = data['valid'] - 1
lengths_valid = data['valid_lbl'][:, 1]
Y_valid = data['valid_lbl'][:, 0] - 1
Y_valid = np.equal.outer(Y_valid, np.arange(n_classes)).astype(np.float)
assert np.all(np.sum(Y_valid, axis=1) == 1)

validation_data_provider = BatchDataProvider(
    X=X_valid,
    Y=Y_valid,
    lengths=lengths_valid)

## BUILD THE MODEL

model = CSM(
    layers=[
        WordEmbedding(
            dimension=embedding_dimension,
            vocabulary_size=vocabulary_size),
        SentenceConvolution(
            n_feature_maps=n_feature_maps,
            kernel_width=kernel_width,
            n_channels=1,
            n_input_dimensions=embedding_dimension),
        SumFolding(),
        KMaxPooling(k=pooling_size * 2),
        Bias(
            n_input_dims=embedding_dimension / 2,
            n_feature_maps=n_feature_maps),
        Tanh(),
        # Softmax(
        #     n_classes=n_classes,
        #     n_input_dimensions=420),
        SentenceConvolution(
            n_feature_maps=n_feature_maps,
            kernel_width=3,
    fixed_n_words=50)

print train_data_provider.batches_per_epoch

validation_data_provider = LabelledDocumentMinibatchProvider(
    X=X[-n_validation:],
    Y=Y[-n_validation:],
    batch_size=batch_size,
    padding='PADDING',
    fixed_n_sentences=15,
    fixed_n_words=50)

model = CSM(layers=[
    DictionaryEncoding(vocabulary=encoding),
    WordEmbedding(
        dimension=20,
        vocabulary_size=len(encoding),
        padding=encoding['PADDING']),
    Dropout(('b', 'w', 'f'), 0.2),
    SentenceConvolution(
        n_feature_maps=10,
        kernel_width=15,
        n_channels=20,
        n_input_dimensions=1),
    Bias(
        n_input_dims=1,
        n_feature_maps=10),
    KMaxPooling(k=7, k_dynamic=0.5),
    Tanh(),
    SentenceConvolution(
        n_feature_maps=30,
        kernel_width=9,
        n_channels=10,
        n_input_dimensions=1),
    Bias(
        n_input_dims=1,
        n_feature_maps=30),
    KMaxPooling(k=5),