import argparse
import json
import os
import pickle
import random

import numpy as np
import pyprind

# Project-specific imports (module paths assumed from the txtnets layout):
# CrossEntropy, LabelledDocumentMinibatchProvider, and
# get_sentence_importance_scores are expected to be importable from txtnets.


def main():
    random.seed(665243)
    np.random.seed(61734)
    np.set_printoptions(linewidth=100)

    parser = argparse.ArgumentParser(description="Create summaries from w2vec model.")
    parser.add_argument('--size', type=int, help="number of sentences to keep")
    args = parser.parse_args()

    data_dir = os.path.join("/users/mdenil/code/txtnets/txtnets_deployed/data", "stanfordmovie")

    # The model file contains two pickled objects stored back to back;
    # open in binary mode for pickle.
    with open("model_w2vec_logreg.pkl", 'rb') as model_file:
        embedding_model = pickle.load(model_file)
        logistic_regression = pickle.load(model_file)

    with open(os.path.join(data_dir, "stanfordmovie.test.sentences.clean.projected.json")) as data_file:
        data = json.load(data_file)

    # random.shuffle(data)

    X, Y = map(list, zip(*data))
    Y = [[":)", ":("].index(y) for y in Y]

    objective = CrossEntropy()  # constructed but unused below

    test_data_provider = LabelledDocumentMinibatchProvider(
        X=X,
        Y=Y,
        batch_size=1,
        padding=None,
        shuffle=False)

    prog_bar = pyprind.ProgBar(test_data_provider.batches_per_epoch)

    summaries = []
    for _ in range(test_data_provider.batches_per_epoch):
        x_batch, y_batch, meta_batch = test_data_provider.next_batch()
        label = [":)", ":("][int(y_batch[0, 1])]

        sentence_importance_scores = get_sentence_importance_scores(
            embedding_model, logistic_regression, x_batch)

        # np.argsort is ascending, so the scores are assumed to rank
        # "smaller = more important"; the first args.size indexes are kept,
        # then re-sorted so the summary preserves document order.
        most_important_sentence_indexes = np.argsort(sentence_importance_scores)
        most_important_sentence_indexes = most_important_sentence_indexes[:args.size]
        most_important_sentence_indexes.sort()

        summary = [x_batch[i] for i in most_important_sentence_indexes]
        summaries.append([summary, label])

        prog_bar.update()

    with open("summaries_{}.json".format(args.size), 'w') as summaries_file:
        json.dump(summaries, summaries_file)
        summaries_file.write("\n")
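# get_sentence_importance_scores is not shown above. Below is a minimal
# sketch of one plausible implementation, not the author's actual method:
# score each sentence by how much the document-level prediction moves when
# that sentence is left out. embed_sentence and predict_positive_proba are
# hypothetical helpers standing in for whatever the pickled embedding_model
# and logistic_regression actually expose. Scores are negated so that
# "smaller = more important", matching the ascending np.argsort in main().
def get_sentence_importance_scores(embedding_model, logistic_regression, sentences):
    def doc_score(sents):
        # Average the per-sentence embeddings and score the whole document.
        vectors = np.vstack([embed_sentence(embedding_model, s) for s in sents])
        return predict_positive_proba(logistic_regression, vectors.mean(axis=0))

    full = doc_score(sentences)
    scores = []
    for i in range(len(sentences)):
        reduced = sentences[:i] + sentences[i + 1:]  # leave sentence i out
        scores.append(-abs(full - doc_score(reduced)))
    return np.asarray(scores)

# With a definition like the sketch above, the script would be invoked
# along the lines of
#   python make_summaries.py --size 3     (script name assumed)
# which writes the selected sentences and gold label to summaries_3.json.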
# Fragment of a training loop; model, optimizer, validation_data_provider,
# maybe_get, and gradient_checker are defined elsewhere in this script.
time_start = time.time()

best_acc = -1.0
progress = []
costs = []
prev_weights = model.pack()
for batch_index, iteration_info in enumerate(optimizer):
    costs.append(iteration_info['cost'])

    if batch_index % 10 == 0:
        Y_hat = []
        Y_valid = []
        for _ in xrange(validation_data_provider.batches_per_epoch):
            X_valid_batch, Y_valid_batch, meta_valid = validation_data_provider.next_batch()
            X_valid_batch = maybe_get(X_valid_batch)
            Y_valid_batch = maybe_get(Y_valid_batch)
            Y_valid.append(Y_valid_batch)
            Y_hat.append(maybe_get(model.fprop(X_valid_batch, meta=meta_valid)))
        Y_valid = np.concatenate(Y_valid, axis=0)
        Y_hat = np.concatenate(Y_hat, axis=0)

        # Each row of Y_hat should be a probability distribution.
        assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

        # This is really slow:
        # grad_check = gradient_checker.check(model)
        grad_check = "skipped"

        acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))

        if acc > best_acc:
            best_acc = acc
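# maybe_get is not defined in this fragment. Its use on batches and on
# model.fprop output suggests it copies device (GPU) arrays back to the
# host while passing NumPy arrays through unchanged. A minimal sketch
# under that assumption, not the project's actual definition:
def maybe_get(x):
    # GPU array types such as pycuda.gpuarray.GPUArray expose .get() to
    # copy data into host memory; anything without .get() is returned as-is.
    return x.get() if hasattr(x, 'get') else x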
import json
import os
import pickle
import random
import time

import numpy as np

# Project-specific imports (module paths assumed from the txtnets layout):
# CrossEntropy, L2Regularizer, CostMinimizationObjective, AdaGrad, SGD,
# LabelledDocumentMinibatchProvider, cpu.model.dropout, experiment_config.


def run():
    with open("{{train_data_json}}") as data_file:
        data = json.load(data_file)

    random.shuffle(data)
    X, Y = map(list, zip(*data))
    Y = [[":)", ":("].index(y) for y in Y]

    with open("{{train_encoding_json}}") as encoding_file:
        encoding = json.load(encoding_file)

    n_validation = {{n_validation}}
    batch_size = {{batch_size}}

    # Hold out the last n_validation documents for validation.
    train_data_provider = LabelledDocumentMinibatchProvider(
        X=X[:-n_validation],
        Y=Y[:-n_validation],
        batch_size=batch_size,
        padding='PADDING',
        fixed_n_sentences={{fixed_n_sentences}},
        fixed_n_words={{fixed_n_words}})

    print train_data_provider.batches_per_epoch

    validation_data_provider = LabelledDocumentMinibatchProvider(
        X=X[-n_validation:],
        Y=Y[-n_validation:],
        batch_size=batch_size,
        padding='PADDING',
        fixed_n_sentences={{fixed_n_sentences}},
        fixed_n_words={{fixed_n_words}})

    model = experiment_config.get_model(encoding)

    print model

    cost_function = CrossEntropy()
    regularizer = L2Regularizer(lamb={{regularizer}})

    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=train_data_provider,
        regularizer=regularizer)

    update_rule = AdaGrad(
        gamma={{adagrad_gamma}},
        model_template=model)

    optimizer = SGD(model=model, objective=objective, update_rule=update_rule)

    n_epochs = {{n_epochs}}
    n_batches = train_data_provider.batches_per_epoch * n_epochs

    time_start = time.time()

    best_acc = -1.0
    progress = []
    for batch_index, iteration_info in enumerate(optimizer):
        if batch_index % {{validation_frequency}} == 0:
            # Evaluate with dropout removed (weights rescaled accordingly).
            model_nodropout = cpu.model.dropout.remove_dropout(model)

            Y_hat = []
            Y_valid = []
            for _ in xrange(validation_data_provider.batches_per_epoch):
                X_valid_batch, Y_valid_batch, meta_valid = validation_data_provider.next_batch()
                Y_valid.append(Y_valid_batch)
                Y_hat.append(model_nodropout.fprop(X_valid_batch, meta=meta_valid))
            Y_valid = np.concatenate(Y_valid, axis=0)
            Y_hat = np.concatenate(Y_hat, axis=0)

            # Each row of Y_hat should be a probability distribution.
            assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

            acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))

            if acc > best_acc:
                best_acc = acc
                with open(os.path.join("{{job_dir}}", "model_best.pkl"), 'wb') as model_file:
                    pickle.dump(model, model_file, protocol=-1)

            if batch_index % {{save_frequency}} == 0:
                with open(os.path.join("{{job_dir}}", "model_{:05}.pkl".format(batch_index)), 'wb') as model_file:
                    pickle.dump(model, model_file, protocol=-1)

            print "B: {}, A: {}, C: {}, Prop1: {}, Param size: {}, best: {}".format(
                batch_index,
                acc,
                iteration_info['cost'],
                np.argmax(Y_hat, axis=1).mean(),
                np.mean(np.abs(model.pack())),
                best_acc)

            time_now = time.time()
            examples_per_hr = (batch_index * batch_size) / (time_now - time_start) * 3600

            progress.append({
                'batch': batch_index,
                'validation_accuracy': acc,
                'best_validation_accuracy': best_acc,
                'cost': iteration_info['cost'],
                'examples_per_hr': examples_per_hr,
            })

            with open(os.path.join("{{job_dir}}", "progress.pkl"), 'wb') as progress_file:
                pickle.dump(progress, progress_file, protocol=-1)

        if batch_index >= n_batches:
            break

    time_end = time.time()

    print "Time elapsed: {}s".format(time_end - time_start)
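# The {{...}} placeholders indicate that run() above is a template rendered
# before execution. A minimal sketch of filling it from a separate driver
# script using jinja2; the file names and parameter values below are
# illustrative assumptions, not values from the source:
#
#     from jinja2 import Template
#
#     with open("train_script.py.template") as f:  # template file name assumed
#         source = f.read()
#
#     rendered = Template(source).render(
#         train_data_json="stanfordmovie.train.sentences.clean.projected.json",
#         train_encoding_json="encoding.json",
#         n_validation=500,
#         batch_size=25,
#         fixed_n_sentences=30,
#         fixed_n_words=50,
#         regularizer=1e-4,
#         adagrad_gamma=0.01,
#         n_epochs=10,
#         validation_frequency=50,
#         save_frequency=500,
#         job_dir="jobs/experiment_001",
#     )
#
#     with open("train_script.py", "w") as f:
#         f.write(rendered)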