def build_model_and_objective(n_classes, n_input_dimensions, X, Y):
    model = CSM(
        layers=[
            Softmax(
                n_classes=n_classes,
                n_input_dimensions=n_input_dimensions),
        ],
    )

    lengths = np.zeros(X.shape[0])
    data_provider = BatchDataProvider(
        X=X,
        Y=Y,
        lengths=lengths)

    cost_function = CrossEntropy()

    objective = CostMinimizationObjective(cost=cost_function, data_provider=data_provider)

    update_rule = AdaGradUpdateRule(
        gamma=0.1,
        model_template=model)

    optimizer = SGD(model=model, objective=objective, update_rule=update_rule)

    return model, objective, optimizer, data_provider
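# A minimal usage sketch for the helper above (hypothetical helper name; it
# assumes the same project classes are importable). The optimizer returned by
# build_model_and_objective is an iterable of per-batch iteration_info dicts,
# which is how the training loops in the scripts below consume it.
def train_for_n_batches(optimizer, n_batches):
    costs = []
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])
        if batch_index + 1 >= n_batches:
            break
    return costs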
def run():
    # random.seed(435)
    # np.random.seed(2342)
    np.set_printoptions(linewidth=100)

    data_dir = os.path.join("../data", "europarlv7")

    with open(os.path.join(
            data_dir, "europarl-v7.de-en.en.tokens.clean.json")) as data_file:
        english_data = json.load(data_file)

    with open(os.path.join(
            data_dir,
            "europarl-v7.de-en.en.tokens.clean.dictionary.encoding.json")) as dictionary_file:
        english_dictionary = json.load(dictionary_file)

    with open(os.path.join(
            data_dir, "europarl-v7.de-en.de.tokens.clean.json")) as data_file:
        german_data = json.load(data_file)

    with open(os.path.join(
            data_dir,
            "europarl-v7.de-en.de.tokens.clean.dictionary.encoding.json")) as dictionary_file:
        german_dictionary = json.load(dictionary_file)

    # english_data = english_data[:10000]
    # german_data = german_data[:10000]

    english_data = replace_unknowns(english_data, english_dictionary, 'UNKNOWN')
    german_data = replace_unknowns(german_data, german_dictionary, 'UNKNOWN')

    batch_size = 100

    assert len(english_data) == len(german_data)

    print len(english_data) / batch_size

    parallel_en_de_provider = PaddedParallelSequenceMinibatchProvider(
        X1=list(english_data),
        X2=list(german_data),
        batch_size=batch_size,
        padding='PADDING',
    )

    multilingual_parallel_provider = TaggedProviderCollection({
        ('en', 'de'): parallel_en_de_provider
    })

    english_model = CSM(layers=[
        DictionaryEncoding(vocabulary=english_dictionary),

        WordEmbedding(dimension=40, vocabulary_size=len(english_dictionary)),

        AxisReduction(axis='w'),

        # SentenceConvolution(
        #     n_feature_maps=15,
        #     kernel_width=10,
        #     n_channels=1,
        #     n_input_dimensions=12),
        #
        # SumFolding(),
        #
        # KMaxPooling(k=17),
        #
        # Bias(
        #     n_input_dims=6,
        #     n_feature_maps=15),
        #
        # Tanh(),
    ])

    german_model = CSM(layers=[
        DictionaryEncoding(vocabulary=german_dictionary),

        WordEmbedding(dimension=40, vocabulary_size=len(german_dictionary)),

        AxisReduction(axis='w'),

        # SentenceConvolution(
        #     n_feature_maps=15,
        #     kernel_width=10,
        #     n_channels=1,
        #     n_input_dimensions=12),
        #
        # SumFolding(),
        #
        # KMaxPooling(k=17),
        #
        # Bias(
        #     n_input_dims=6,
        #     n_feature_maps=15),
        #
        # Tanh(),
    ])

    print english_model
    print german_model

    model = TaggedModelCollection({
        'en': english_model,
        'de': german_model,
    })

    # regularizer = L2Regularizer(lamb=1e-4)

    objective = ContrastiveMultilingualEmbeddingObjective(
        tagged_parallel_sequence_provider=multilingual_parallel_provider,
        n_contrastive_samples=10,
        margin=40.0)

    # objective = CostMinimizationObjective(
    #     cost=cost_function,
    #     data_provider=train_data_provider,
    #     regularizer=regularizer)

    update_rule = AdaGrad(
        # gamma=0.01,
        gamma=0.1,
        model_template=model)

    optimizer = SGD(model=model, objective=objective, update_rule=update_rule)

    time_start = time.time()

    costs = []
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])
        # print costs[-1]

        if batch_index % 10 == 0:
            print "B: {}, E: {}, C: {}, Param size: {}".format(
                batch_index,
                # This epoch count will be inaccurate when I move to multilingual
                (batch_index // parallel_en_de_provider.batches_per_epoch) + 1,
                costs[-1],
                np.mean(np.abs(model.pack())))

        if batch_index % 100 == 0:
            with open("model.pkl", 'w') as model_file:
                pickle.dump(model.move_to_cpu(), model_file, protocol=-1)

        # if batch_index % 1000 == 0 and batch_index > 0:
        #     with open("model_optimization.pkl", 'w') as model_file:
        #         pickle.dump(optimizer, model_file, protocol=-1)

        # if batch_index == 500:
        #     break

    time_end = time.time()

    print "Time elapsed: {}s".format(time_end - time_start)
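# Sketch: reloading a checkpoint written by the loop above. This assumes the
# gpu.model.host_device_component_mapping helper used in the evaluation script
# further down; it may need adapting for the TaggedModelCollection saved here.
def load_checkpoint(path="model.pkl"):
    with open(path) as model_file:
        cpu_model = pickle.load(model_file)
    return gpu.model.host_device_component_mapping.move_to_gpu(cpu_model)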
__author__ = 'mdenil'
def main():
    random.seed(435)
    np.random.seed(23421)
    np.set_printoptions(linewidth=100)

    data_dir = os.path.join("/data/mulga/mdenil/amazon-reviews", "shards")

    batch_size = 100

    with open(os.path.join(data_dir, "dictionary.sentences.clean.encoding.json")) as encoding_file:
        encoding = json.load(encoding_file)

    print(len(encoding))

    # pretrained_lut = load_word2vec_embeddings(
    #     os.path.join("/data/brown/mdenil/amazon-reviews/word2vec-embeddings", "word-embeddings-30.txt"),
    #     encoding)

    train_data_provider = ShardedLabelledDocumentMinibatchProvider(
        shard_dir=os.path.join(data_dir, "train"),
        shard_pattern="shard_[0-9]*.sentences.clean.projected.json.gz",
        batch_size=batch_size,
        padding='PADDING',
        n_labels=5,
        # n_labels=2,
        fixed_n_sentences=15,
        fixed_n_words=25)

    validation_data_provider = ShardedLabelledDocumentMinibatchProvider(
        shard_dir=os.path.join(data_dir, "test"),
        shard_pattern="shard_[0-9]*.sentences.clean.projected.json.gz",
        batch_size=batch_size,
        padding='PADDING',
        n_labels=5,
        # n_labels=2,
        fixed_n_sentences=15,
        fixed_n_words=25)

    model = CSM(
        layers=[
            DictionaryEncoding(vocabulary=encoding),

            WordEmbedding(
                dimension=30,
                vocabulary_size=len(encoding),
                padding=encoding['PADDING']),

            # WordEmbedding(
            #     dimension=pretrained_lut.shape[1],
            #     vocabulary_size=len(encoding),
            #     padding=encoding['PADDING'],
            #     E=pretrained_lut),

            # Dropout(('b', 'w', 'f'), 0.2),

            SentenceConvolution(
                n_feature_maps=10,
                kernel_width=3,
                n_channels=30,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=10),

            # KMaxPooling(k=7, k_dynamic=0.5),
            #
            # Tanh(),
            #
            # SentenceConvolution(
            #     n_feature_maps=30,
            #     kernel_width=3,
            #     n_channels=10,
            #     n_input_dimensions=1),
            #
            # Bias(
            #     n_input_dims=1,
            #     n_feature_maps=30),

            KMaxPooling(k=5),

            Tanh(),

            ReshapeForDocuments(),

            SentenceConvolution(
                n_feature_maps=20,
                kernel_width=3,
                n_channels=50,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=20),

            KMaxPooling(k=5),

            Tanh(),

            # Dropout(('b', 'd', 'f', 'w'), 0.5),

            # Softmax(
            #     # n_classes=2,
            #     n_classes=5,
            #     n_input_dimensions=100),

            Linear(
                n_input=100,
                n_output=1)
        ]
    )

    print(model)

    # cost_function = CrossEntropy()
    cost_function = SquaredError()

    regularizer = L2Regularizer(lamb=1e-5)

    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=train_data_provider,
        regularizer=regularizer)

    update_rule = AdaGrad(
        gamma=0.1,
        model_template=model)

    optimizer = SGD(
        model=model,
        objective=objective,
        update_rule=update_rule)

    n_epochs = 1
    # n_batches = train_data_provider.batches_per_epoch * n_epochs

    time_start = time.time()

    best_acc = -1.0

    progress = []
    costs = []
    prev_weights = model.pack()
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])

        if batch_index % 10 == 0:
            Y_hat = []
            Y_valid = []
            for _ in xrange(1):
                X_valid_batch, Y_valid_batch, meta_valid = validation_data_provider.next_batch()
                Y_valid.append(Y_valid_batch)
                Y_hat.append(model.fprop(X_valid_batch, meta=meta_valid))

            Y_valid = Y_valid[0].get()
            Y_hat = Y_hat[0].get()
            # Y_valid = np.concatenate(Y_valid, axis=0)
            # Y_hat = np.concatenate(Y_hat, axis=0)

            # assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

            # acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))
            # With the SquaredError/Linear head this is a mean absolute error
            # (lower is better), despite the name.
            acc = np.mean(np.abs(Y_valid - Y_hat))

            # if acc > best_acc:
            #     best_acc = acc
            #     with open("/home/mdenil/model.pkl", 'w') as model_file:
            #         pickle.dump(model, model_file, protocol=-1)

            current = dict()
            current['B'] = batch_index
            current['A'] = acc
            current['C'] = costs[-1].get()
            current['Prop'] = np.argmax(Y_hat, axis=1).mean()
            current['Params'] = np.mean(np.abs(model.pack()))

            progress.append(current)
            print(current)

            with open("progress.pkl", 'w') as progress_file:
                pickle.dump(progress, progress_file, protocol=-1)

        # if batch_index == 100:
        #     break

        if batch_index % 100 == 0:
            with open("model.pkl", 'w') as model_file:
                pickle.dump(model, model_file, protocol=-1)

    time_end = time.time()

    print("Time elapsed: {}s".format(time_end - time_start))
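# Sketch: mean absolute error over several validation batches, generalising the
# single-batch check in the loop above (the commented-out np.concatenate lines
# point at the same idea). Assumes the provider and model behave as in main().
def validation_mae(model, provider, n_batches=10):
    y_true = []
    y_pred = []
    for _ in xrange(n_batches):
        X_batch, Y_batch, meta = provider.next_batch()
        y_true.append(Y_batch.get())
        y_pred.append(model.fprop(X_batch, meta=meta).get())
    return np.mean(np.abs(np.concatenate(y_true, axis=0) - np.concatenate(y_pred, axis=0)))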
        Softmax(n_classes=2, n_input_dimensions=28 * 5),
    ])

    print model

    cost_function = CrossEntropy()

    regularizer = L2Regularizer(lamb=1e-4)

    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=train_data_provider,
        regularizer=regularizer)

    update_rule = AdaGrad(gamma=0.01, model_template=model)

    optimizer = SGD(model=model, objective=objective, update_rule=update_rule)

    n_epochs = 1
    n_batches = train_data_provider.batches_per_epoch * n_epochs

    time_start = time.time()

    best_acc = -1.0

    progress = []
    costs = []
    prev_weights = model.pack()
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])

        if batch_index % 10 == 0:
def run():
    random.seed(435)
    np.random.seed(2342)
    np.set_printoptions(linewidth=100)

    tweets_dir = os.path.join("../data", "sentiment140_2")  # _2 truncates at <3, normal truncates at <5

    with open(os.path.join(tweets_dir, "sentiment140.train.clean.json")) as data_file:
        data = json.loads(data_file.read())
    random.shuffle(data)
    X, Y = map(list, zip(*data))
    Y = [[":)", ":("].index(y) for y in Y]

    with open(os.path.join(tweets_dir, "sentiment140.train.clean.dictionary.encoding.json")) as alphabet_file:
        alphabet = json.loads(alphabet_file.read())

    with open(os.path.join(tweets_dir, "sentiment140.test.clean.json")) as data_file:
        data = json.loads(data_file.read())
    X_test, Y_test = map(list, zip(*data))
    Y_test = [[":)", ":("].index(y) for y in Y_test]

    print len(alphabet)

    # X = X[:1000]
    # Y = Y[:1000]

    # lists of words
    # replace unknowns with an unknown character
    tokenizer = WordPunctTokenizer()
    new_X = []
    for x in X:
        new_X.append([w if w in alphabet else 'UNKNOWN' for w in tokenizer.tokenize(x)])
    X = new_X

    new_X_test = []
    for x in X_test:
        new_X_test.append([w if w in alphabet else 'UNKNOWN' for w in tokenizer.tokenize(x)])
    X_test = new_X_test

    batch_size = 50

    train_data_provider = LabelledSequenceMinibatchProvider(
        X=X,
        Y=Y,
        batch_size=batch_size,
        fixed_length=50,
        padding='PADDING')

    print train_data_provider.batches_per_epoch

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X_test,
        Y=Y_test,
        batch_size=len(X_test),
        fixed_length=50,
        padding='PADDING',
        shuffle=False)

    # model = CSM(
    #     layers=[
    #         DictionaryEncoding(vocabulary=encoding),
    #
    #         WordEmbedding(
    #             dimension=32,
    #             vocabulary_size=len(encoding)),
    #
    #         SentenceConvolution(
    #             n_feature_maps=5,
    #             kernel_width=10,
    #             n_channels=1,
    #             n_input_dimensions=32),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=7),
    #
    #         Bias(
    #             n_input_dims=16,
    #             n_feature_maps=5),
    #
    #         Tanh(),
    #
    #         SumFolding(),
    #
    #         Softmax(
    #             n_classes=2,
    #             n_input_dimensions=280),
    #     ]
    # )

    # Approximately Nal's model
    #
    # model = CSM(
    #     layers=[
    #         DictionaryEncoding(vocabulary=encoding),
    #
    #         WordEmbedding(
    #             dimension=12,
    #             vocabulary_size=len(encoding)),
    #
    #         SentenceConvolution(
    #             n_feature_maps=6,
    #             kernel_width=7,
    #             n_channels=1,
    #             n_input_dimensions=12),
    #
    #         Bias(
    #             n_input_dims=12,
    #             n_feature_maps=6),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4, k_dynamic=0.5),
    #
    #         Tanh(),
    #
    #         SentenceConvolution(
    #             n_feature_maps=14,
    #             kernel_width=5,
    #             n_channels=6,
    #             n_input_dimensions=6),
    #
    #         Bias(
    #             n_input_dims=6,
    #             n_feature_maps=14),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4),
    #
    #         Tanh(),
    #
    #         Softmax(
    #             n_classes=2,
    #             n_input_dimensions=168),
    #     ]
    # )

    tweet_model = CSM(layers=[
        DictionaryEncoding(vocabulary=alphabet),

        WordEmbedding(
            dimension=60,
            vocabulary_size=len(alphabet),
            padding=alphabet['PADDING']),

        Dropout(('b', 'w', 'f'), 0.5),

        SentenceConvolution(
            n_feature_maps=6,
            kernel_width=7,
            n_channels=60,
            n_input_dimensions=1),

        Bias(
            n_input_dims=1,
            n_feature_maps=6),

        KMaxPooling(k=4, k_dynamic=0.5),

        Tanh(),

        SentenceConvolution(
            n_feature_maps=14,
            kernel_width=5,
            n_channels=6,
            n_input_dimensions=1),

        Bias(
            n_input_dims=1,
            n_feature_maps=14),

        KMaxPooling(k=4),

        Tanh(),

        # Dropout(('b', 'd', 'f', 'w'), 0.5),
        #
        # Linear(n_input=4*40, n_output=4*40),
        #
        # Bias(
        #     n_input_dims=4*40,
        #     n_feature_maps=1),

        Dropout(('b', 'd', 'f', 'w'), 0.5),

        Softmax(n_classes=2, n_input_dimensions=4 * 14),
    ])

    # model = CSM(
    #     layers=[
    #         # cpu.model.encoding.
    #         DictionaryEncoding(vocabulary=encoding),
    #
    #         # cpu.model.embedding.
    #         WordEmbedding(
    #             dimension=28,
    #             vocabulary_size=len(encoding)),
    #
    #         # HostToDevice(),
    #
    #         SentenceConvolution(
    #             n_feature_maps=6,
    #             kernel_width=7,
    #             n_channels=1,
    #             n_input_dimensions=28),
    #
    #         Bias(
    #             n_input_dims=28,
    #             n_feature_maps=6),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4, k_dynamic=0.5),
    #
    #         Tanh(),
    #
    #         SentenceConvolution(
    #             n_feature_maps=14,
    #             kernel_width=5,
    #             n_channels=6,
    #             n_input_dimensions=14),
    #
    #         Bias(
    #             n_input_dims=14,
    #             n_feature_maps=14),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4),
    #
    #         Tanh(),
    #
    #         Softmax(
    #             n_classes=2,
    #             n_input_dimensions=392),
    #     ]
    # )

    print tweet_model

    cost_function = CrossEntropy()

    regularizer = L2Regularizer(lamb=1e-5)

    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=train_data_provider,
        regularizer=regularizer)

    update_rule = AdaGrad(
        gamma=0.1,
        model_template=tweet_model)

    # update_rule = AdaDelta(
    #     rho=0.99,
    #     epsilon=1e-6,
    #     model_template=model)

    optimizer = SGD(model=tweet_model, objective=objective, update_rule=update_rule)

    gradient_checker = ModelGradientChecker(
        CostMinimizationObjective(
            cost=cost_function,
            data_provider=validation_data_provider,
            regularizer=regularizer))

    time_start = time.time()

    best_acc = -1.0

    costs = []
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])

        if batch_index % 30 == 0:
            X_valid, Y_valid, meta_valid = validation_data_provider.next_batch()

            test_model = gpu.model.dropout.remove_dropout(tweet_model)
            Y_hat = test_model.fprop(X_valid, meta=meta_valid)
            del test_model

            Y_hat = Y_hat.get()
            assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

            # grad_check = gradient_checker.check(model)
            grad_check = "skipped"

            time_now = time.time()
            examples_per_hr = (batch_index * batch_size) / (time_now - time_start) * 3600

            acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid.get(), axis=1))

            if acc > best_acc:
                best_acc = acc
                with open("model_best_tweets.pkl", 'w') as model_file:
                    pickle.dump(tweet_model.move_to_cpu(), model_file, protocol=-1)
                # with open("model_best_optimization.pkl", 'w') as model_file:
                #     pickle.dump(optimizer, model_file, protocol=-1)

            print "B: {}, A: {}, C: {}, Prop1: {}, Param size: {}, EPH: {}, best acc: {}".format(
                batch_index,
                acc,
                costs[-1],
                np.argmax(Y_hat, axis=1).mean(),
                np.mean(np.abs(tweet_model.pack())),
                examples_per_hr,
                best_acc)

        # if batch_index % 2500 == 0 and batch_index > 0:
        #     update_rule.gamma *= 0.5

        # if batch_index == 1000:
        #     break

        # if batch_index % 100 == 0:
        #     with open("model.pkl", 'w') as model_file:
        #         pickle.dump(model.move_to_cpu(), model_file, protocol=-1)

        # if batch_index % 1000 == 0 and batch_index > 0:
        #     with open("model_optimization.pkl", 'w') as model_file:
        #         pickle.dump(optimizer, model_file, protocol=-1)

    time_end = time.time()

    print "Time elapsed: {}s".format(time_end - time_start)
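# Sketch: the dropout-free evaluation pattern used in the loop above, wrapped
# as a helper. Assumes gpu.model.dropout.remove_dropout as used in this script.
def evaluate_accuracy(model, X_valid, Y_valid, meta_valid):
    test_model = gpu.model.dropout.remove_dropout(model)
    Y_hat = test_model.fprop(X_valid, meta=meta_valid).get()
    del test_model
    return np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid.get(), axis=1))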
def run():
    # random.seed(435)
    # np.random.seed(2342)
    np.set_printoptions(linewidth=100)

    parser = argparse.ArgumentParser(
        description="Evaluate a trained network on the sentiment140 test set")
    parser.add_argument("--model_file", help="pickle file to load the model from")
    parser.add_argument("--best_file", help="pickle file to write the best fine-tuned model to")
    args = parser.parse_args()

    tweets_dir = os.path.join("../data", "sentiment140_2")  # _2 truncates at <3, normal truncates at <5

    with open(os.path.join(tweets_dir, "sentiment140.train.clean.json")) as data_file:
        data = json.loads(data_file.read())
    random.shuffle(data)
    X, Y = map(list, zip(*data))
    Y = [[":)", ":("].index(y) for y in Y]

    with open(os.path.join(tweets_dir, "sentiment140.train.clean.dictionary.encoding.json")) as alphabet_file:
        alphabet = json.loads(alphabet_file.read())

    with open(os.path.join(tweets_dir, "sentiment140.test.clean.json")) as data_file:
        data = json.loads(data_file.read())
    X_test, Y_test = map(list, zip(*data))
    Y_test = [[":)", ":("].index(y) for y in Y_test]

    print len(alphabet)

    # X = X[:1000]
    # Y = Y[:1000]

    # lists of words
    # replace unknowns with an unknown character
    tokenizer = WordPunctTokenizer()
    new_X = []
    for x in X:
        new_X.append([w if w in alphabet else 'UNKNOWN' for w in tokenizer.tokenize(x)])
    X = new_X

    new_X_test = []
    for x in X_test:
        new_X_test.append([w if w in alphabet else 'UNKNOWN' for w in tokenizer.tokenize(x)])
    X_test = new_X_test

    batch_size = 5

    train_data_provider = LabelledSequenceMinibatchProvider(
        X=X,
        Y=Y,
        batch_size=batch_size,
        fixed_length=50,
        padding='PADDING')

    print train_data_provider.batches_per_epoch

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X_test,
        Y=Y_test,
        batch_size=len(X_test),
        fixed_length=50,
        padding='PADDING',
        shuffle=False)

    with open(args.model_file) as model_file:
        tweet_model = gpu.model.host_device_component_mapping.move_to_gpu(
            pickle.load(model_file))

    # tweet_model = gpu.model.dropout.remove_dropout(tweet_model)

    print tweet_model

    X_valid, Y_valid, meta_valid = validation_data_provider.next_batch()

    test_model = gpu.model.dropout.remove_dropout(tweet_model)
    Y_hat = test_model.fprop(X_valid, meta=meta_valid)
    del test_model

    best_acc = np.mean(np.argmax(Y_hat.get(), axis=1) == np.argmax(Y_valid.get(), axis=1))

    print "Acc at start:", best_acc

    cost_function = CrossEntropy()

    regularizer = L2Regularizer(lamb=0.0)

    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=train_data_provider,
        regularizer=regularizer)

    update_rule = AdaGrad(
        gamma=2e-3,
        model_template=tweet_model)

    optimizer = SGD(model=tweet_model, objective=objective, update_rule=update_rule)

    gradient_checker = ModelGradientChecker(
        CostMinimizationObjective(
            cost=cost_function,
            data_provider=validation_data_provider,
            regularizer=regularizer))

    time_start = time.time()

    costs = []
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])

        X_valid, Y_valid, meta_valid = validation_data_provider.next_batch()

        test_model = gpu.model.dropout.remove_dropout(tweet_model)
        Y_hat = test_model.fprop(X_valid, meta=meta_valid)
        del test_model

        Y_hat = Y_hat.get()
        assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

        time_now = time.time()
        examples_per_hr = (batch_index * batch_size) / (time_now - time_start) * 3600

        acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid.get(), axis=1))

        if acc > best_acc:
            best_acc = acc
            with open(args.best_file, 'w') as model_file:
                pickle.dump(tweet_model.move_to_cpu(), model_file, protocol=-1)

        print "B: {}, A: {}, C: {}, Prop1: {}, Param size: {}, EPH: {}, best acc: {}".format(
            batch_index,
            acc,
            costs[-1],
            np.argmax(Y_hat, axis=1).mean(),
            np.mean(np.abs(tweet_model.pack())),
            examples_per_hr,
            best_acc)

    time_end = time.time()

    print "Time elapsed: {}s".format(time_end - time_start)
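# Example invocation of the script above (the script name is a placeholder;
# model_best_tweets.pkl is the checkpoint written by the training script):
#   python evaluate_tweet_model.py --model_file model_best_tweets.pkl --best_file model_finetuned_best.pkl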