def build_model_and_objective(n_classes, n_input_dimensions, X, Y):
    model = CSM(
        layers=[
            Softmax(
                n_classes=n_classes,
                n_input_dimensions=n_input_dimensions),
        ],
    )

    lengths = np.zeros(X.shape[0])
    data_provider = BatchDataProvider(
        X=X,
        Y=Y,
        lengths=lengths)

    cost_function = CrossEntropy()

    objective = CostMinimizationObjective(cost=cost_function, data_provider=data_provider)

    update_rule = AdaGrad(
        gamma=0.1,
        model_template=model)

    optimizer = SGD(model=model, objective=objective, update_rule=update_rule)

    return model, objective, optimizer, data_provider
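# A hypothetical toy invocation of build_model_and_objective (not part of the
# original scripts): the array shapes and label format below are assumptions,
# chosen only to show which arguments the helper expects.
def _example_build_model():
    n_examples, n_dims, n_classes = 100, 20, 2
    X_toy = np.random.randn(n_examples, n_dims)
    # one-hot labels (assumed format)
    Y_toy = np.zeros((n_examples, n_classes))
    Y_toy[np.arange(n_examples), np.random.randint(n_classes, size=n_examples)] = 1
    return build_model_and_objective(
        n_classes=n_classes, n_input_dimensions=n_dims, X=X_toy, Y=Y_toy)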
def main():
    random.seed(665243)
    np.random.seed(61734)
    np.set_printoptions(linewidth=100)

    parser = argparse.ArgumentParser(description="Create summaries from a word2vec model.")
    parser.add_argument('--size', type=int, help="number of sentences to keep")
    args = parser.parse_args()

    data_dir = os.path.join("/users/mdenil/code/txtnets/txtnets_deployed/data", "stanfordmovie")

    with open("model_w2vec_logreg.pkl") as model_file:
        embedding_model = pickle.load(model_file)
        logistic_regression = pickle.load(model_file)

    with open(os.path.join(data_dir, "stanfordmovie.test.sentences.clean.projected.json")) as data_file:
        data = json.load(data_file)

    # random.shuffle(data)

    X, Y = map(list, zip(*data))
    Y = [[":)", ":("].index(y) for y in Y]

    objective = CrossEntropy()

    test_data_provider = LabelledDocumentMinibatchProvider(
        X=X,
        Y=Y,
        batch_size=1,
        padding=None,
        shuffle=False)

    prog_bar = pyprind.ProgBar(test_data_provider.batches_per_epoch)

    summaries = []
    for _ in range(test_data_provider.batches_per_epoch):
        x_batch, y_batch, meta_batch = test_data_provider.next_batch()
        label = [":)", ":("][int(y_batch[0, 1])]

        # score each sentence of the document
        sentence_importance_scores = get_sentence_importance_scores(
            embedding_model, logistic_regression, x_batch)

        # select the sentences to keep and restore document order
        most_important_sentence_indexes = np.argsort(sentence_importance_scores)
        most_important_sentence_indexes = most_important_sentence_indexes[:args.size]
        most_important_sentence_indexes.sort()

        summary = []
        for i in most_important_sentence_indexes:
            summary.append(x_batch[i])

        summaries.append([summary, label])

        prog_bar.update()

    with open("summaries_{}.json".format(args.size), 'w') as summaries_file:
        json.dump(summaries, summaries_file)
        summaries_file.write("\n")
def get_sentence_importance_scores(embedding_model, logistic_regression, x):
    objective = CrossEntropy()

    # concatenate all sentences into a single word sequence
    x_combined = [w for s in x for w in s]
    meta_combined = {
        'lengths': np.asarray([len(x_combined)]),
        'space_below': cpu.space.CPUSpace(
            axes=('b', 'w'),
            extents={'b': 1, 'w': len(x_combined)})
    }
    x_combined = np.asarray(x_combined).reshape((1, -1))

    embeddings, embeddings_meta, embeddings_state = embedding_model.fprop(
        x_combined, meta=dict(meta_combined), return_state=True)
    embeddings_meta['space_below'] = embeddings_meta['space_above']

    y_hat, y_hat_meta, log_reg_state = logistic_regression.fprop(
        embeddings, meta=dict(embeddings_meta), return_state=True)
    y_hat_meta['space_below'] = y_hat_meta['space_above']

    # backpropagate the loss against the label that maximizes the error
    loss, loss_meta, loss_state = objective.fprop(
        y_hat, max_error_label(y_hat), meta=dict(y_hat_meta))

    delta, delta_meta = objective.bprop(
        y_hat, max_error_label(y_hat), meta=dict(loss_meta), fprop_state=loss_state)

    delta = logistic_regression.bprop(
        delta, meta=dict(delta_meta), fprop_state=log_reg_state)

    # pool word-level deltas and embeddings into sentence-level vectors
    C = combiner_matrix(map(len, x))
    sentence_delta = np.dot(delta, C)
    sentence_embedding = np.dot(embeddings, C)

    # normalize for cosine distance
    sentence_delta /= np.sqrt(np.sum(sentence_delta**2, axis=1, keepdims=True))
    sentence_embedding /= np.sqrt(np.sum(sentence_embedding**2, axis=1, keepdims=True))

    sentence_importance_scores = np.abs(np.sum(sentence_delta * sentence_embedding, axis=0))

    return sentence_importance_scores
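# combiner_matrix and max_error_label are used above but are not part of this
# excerpt. The versions below are assumed sketches of their behavior, not the
# original implementations.

def combiner_matrix(sentence_lengths):
    # Indicator matrix C of shape (n_words, n_sentences): column j is 1 on the
    # rows belonging to sentence j, so np.dot(word_level_matrix, C) pools
    # word-level vectors into per-sentence sums.
    sentence_lengths = list(sentence_lengths)
    C = np.zeros((sum(sentence_lengths), len(sentence_lengths)))
    start = 0
    for j, length in enumerate(sentence_lengths):
        C[start:start + length, j] = 1.0
        start += length
    return C


def max_error_label(y_hat):
    # One-hot target on the class the model currently finds least likely,
    # i.e. the label that maximizes the cross-entropy of y_hat.
    label = np.zeros_like(y_hat)
    label[np.arange(y_hat.shape[0]), np.argmin(y_hat, axis=1)] = 1.0
    return label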
def get_model_output(model, X, Y):
    # Initialize the data provider
    data_provider = cpu.optimize.data_provider.LabelledSequenceBatchProvider(
        X=X, Y=Y, padding='PADDING')

    # Define the cost function
    cEntr = CrossEntropy()

    # Get data and use the model to predict
    X, Y, meta = data_provider.next_batch()
    Y_hat, meta, model_state = model.fprop(X, meta=meta, return_state=True)

    # Create a Y that maximizes the error of the model
    Y_inverted = enforce_error(Y_hat)

    # Bookkeep the spaces and bprop to get the deltas
    meta['space_below'] = meta['space_above']
    cost, meta, cost_state = cEntr.fprop(Y_hat, Y_inverted, meta=meta)
    delta, meta = cEntr.bprop(Y_hat, Y_inverted, meta=meta, fprop_state=cost_state)
    delta, meta = model.bprop(delta, meta=meta, fprop_state=model_state,
                              return_state=True, num_layers=-1)
    delta, space = meta['space_below'].transform(delta, ('b', 'w'))

    return Y_hat, Y_inverted, delta
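# enforce_error is referenced above but not defined in this excerpt. The
# sketch below is an assumed implementation: it builds the target that the
# current predictions get maximally wrong, so the subsequent bprop produces
# the largest-error deltas.
def enforce_error(Y_hat):
    Y_inverted = np.zeros_like(Y_hat)
    Y_inverted[np.arange(Y_hat.shape[0]), np.argmin(Y_hat, axis=1)] = 1.0
    return Y_inverted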
        # Tanh(),
        MaxFolding(),
        Softmax(n_classes=2, n_input_dimensions=700),
    ])

print tweet_model

# X, Y, meta = train_data_provider.next_batch()
# Y, meta, fprop_state = model.fprop(X, meta, return_state=True)
# print meta['lengths']
# print Y.shape, meta['space_above']
# print [p.shape for p in model.params()]

cost_function = CrossEntropy()
objective = CostMinimizationObjective(cost=cost_function, data_provider=train_data_provider)
update_rule = AdaGrad(gamma=0.1, model_template=tweet_model)
optimizer = SGD(model=tweet_model, objective=objective, update_rule=update_rule)

n_epochs = 1
n_batches = train_data_provider.batches_per_epoch * n_epochs

costs = []
prev_weights = tweet_model.pack()
def run():
    with open("{{train_data_json}}") as data_file:
        data = json.load(data_file)

    random.shuffle(data)
    X, Y = map(list, zip(*data))
    Y = [[":)", ":("].index(y) for y in Y]

    with open("{{train_encoding_json}}") as encoding_file:
        encoding = json.load(encoding_file)

    n_validation = {{n_validation}}
    batch_size = {{batch_size}}

    train_data_provider = LabelledDocumentMinibatchProvider(
        X=X[:-n_validation],
        Y=Y[:-n_validation],
        batch_size=batch_size,
        padding='PADDING',
        fixed_n_sentences={{fixed_n_sentences}},
        fixed_n_words={{fixed_n_words}})

    print train_data_provider.batches_per_epoch

    validation_data_provider = LabelledDocumentMinibatchProvider(
        X=X[-n_validation:],
        Y=Y[-n_validation:],
        batch_size=batch_size,
        padding='PADDING',
        fixed_n_sentences={{fixed_n_sentences}},
        fixed_n_words={{fixed_n_words}})

    model = experiment_config.get_model(encoding)
    print model

    cost_function = CrossEntropy()
    regularizer = L2Regularizer(lamb={{regularizer}})
    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=train_data_provider,
        regularizer=regularizer)
    update_rule = AdaGrad(gamma={{adagrad_gamma}}, model_template=model)
    optimizer = SGD(model=model, objective=objective, update_rule=update_rule)

    n_epochs = {{n_epochs}}
    n_batches = train_data_provider.batches_per_epoch * n_epochs

    time_start = time.time()
    best_acc = -1.0

    progress = []
    for batch_index, iteration_info in enumerate(optimizer):
        if batch_index % {{validation_frequency}} == 0:
            # evaluate a dropout-free copy of the model on the validation set
            model_nodropout = cpu.model.dropout.remove_dropout(model)

            Y_hat = []
            Y_valid = []
            for _ in xrange(validation_data_provider.batches_per_epoch):
                X_valid_batch, Y_valid_batch, meta_valid = validation_data_provider.next_batch()
                Y_valid.append(Y_valid_batch)
                Y_hat.append(model_nodropout.fprop(X_valid_batch, meta=meta_valid))
            Y_valid = np.concatenate(Y_valid, axis=0)
            Y_hat = np.concatenate(Y_hat, axis=0)
            assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

            acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))

            if acc > best_acc:
                best_acc = acc
                with open(os.path.join("{{job_dir}}", "model_best.pkl"), 'w') as model_file:
                    pickle.dump(model, model_file, protocol=-1)

            if batch_index % {{save_frequency}} == 0:
                with open(os.path.join("{{job_dir}}", "model_{:05}.pkl".format(batch_index)), 'w') as model_file:
                    pickle.dump(model, model_file, protocol=-1)

            print "B: {}, A: {}, C: {}, Prop1: {}, Param size: {}, best: {}".format(
                batch_index,
                acc,
                iteration_info['cost'],
                np.argmax(Y_hat, axis=1).mean(),
                np.mean(np.abs(model.pack())),
                best_acc)

            time_now = time.time()
            examples_per_hr = (batch_index * batch_size) / (time_now - time_start) * 3600

            progress.append({
                'batch': batch_index,
                'validation_accuracy': acc,
                'best_validation_accuracy': best_acc,
                'cost': iteration_info['cost'],
                'examples_per_hr': examples_per_hr,
            })

            with open(os.path.join("{{job_dir}}", "progress.pkl"), 'w') as progress_file:
                pickle.dump(progress, progress_file, protocol=-1)

        if batch_index >= n_batches:
            break

    time_end = time.time()

    print "Time elapsed: {}s".format(time_end - time_start)
def main():
    random.seed(34532)
    np.random.seed(675)
    np.set_printoptions(linewidth=100)

    data_dir = os.path.join("/users/mdenil/code/txtnets/txtnets_deployed/data", "stanfordmovie")

    trainer = Word2Vec(
        train=os.path.join(data_dir, "stanfordmovie.train.sentences.clean.projected.txt"),
        output="stanford-movie-vectors.bin",
        cbow=1,
        size=300,
        window=8,
        negative=25,
        hs=0,
        sample=1e-4,
        threads=20,
        binary=1,
        iter=15,
        min_count=1)

    trainer.train()

    gensim_model = gensim.models.Word2Vec.load_word2vec_format(
        "/users/mdenil/code/txtnets/txtnets_deployed/code/stanford-movie-vectors.bin",
        binary=True)

    # print(gensim_model.most_similar(["refund"]))
    # print(gensim_model.most_similar(["amazing"]))

    embedding_model = txtnets_model_from_gensim_word2vec(gensim_model)

    with open(os.path.join(data_dir, "stanfordmovie.train.sentences.clean.projected.flat.json")) as data_file:
        data = json.load(data_file)

    random.shuffle(data)
    X, Y = map(list, zip(*data))
    Y = [[":)", ":("].index(y) for y in Y]

    batch_size = 100
    n_validation = 500

    train_data_provider = LabelledSequenceMinibatchProvider(
        X=X[:-n_validation],
        Y=Y[:-n_validation],
        batch_size=batch_size,
        padding='PADDING')

    transformed_train_data_provider = TransformedLabelledDataProvider(
        data_source=train_data_provider,
        transformer=embedding_model)

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X[-n_validation:],
        Y=Y[-n_validation:],
        batch_size=batch_size,
        padding='PADDING')

    transformed_validation_data_provider = TransformedLabelledDataProvider(
        data_source=validation_data_provider,
        transformer=embedding_model)

    # a linear classifier on top of summed word vectors
    logistic_regression = CSM(layers=[
        Sum(axes=['w']),
        Softmax(n_input_dimensions=gensim_model.syn0.shape[1], n_classes=2),
    ])

    cost_function = CrossEntropy()
    regularizer = L2Regularizer(lamb=1e-4)
    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=transformed_train_data_provider,
        regularizer=regularizer)
    update_rule = AdaGrad(gamma=0.1, model_template=logistic_regression)
    optimizer = SGD(model=logistic_regression, objective=objective, update_rule=update_rule)

    for batch_index, iteration_info in enumerate(optimizer):
        if batch_index % 100 == 0:
            # print(iteration_info['cost'])

            Y_hat = []
            Y_valid = []
            for _ in xrange(transformed_validation_data_provider.batches_per_epoch):
                X_valid_batch, Y_valid_batch, meta_valid = transformed_validation_data_provider.next_batch()
                Y_valid.append(get(Y_valid_batch))
                Y_hat.append(get(logistic_regression.fprop(X_valid_batch, meta=meta_valid)))
            Y_valid = np.concatenate(Y_valid, axis=0)
            Y_hat = np.concatenate(Y_hat, axis=0)

            acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))

            print("B: {}, A: {}, C: {}".format(batch_index, acc, iteration_info['cost']))

    with open("model_w2vec_logreg.pkl", 'w') as model_file:
        pickle.dump(embedding_model.move_to_cpu(), model_file, protocol=-1)
        pickle.dump(logistic_regression.move_to_cpu(), model_file, protocol=-1)
def optimize_and_save(model, alphabet, n_batches, data_file_name, chars_or_words, result_file_name):
    print result_file_name

    with gzip.open(data_file_name) as data_file:
        data = json.loads(data_file.read())

    X, Y = map(list, zip(*data))

    # shuffle
    combined = zip(X, Y)
    random.shuffle(combined)
    X, Y = map(list, zip(*combined))

    # map labels to something useful
    Y = [[":)", ":("].index(y) for y in Y]

    if chars_or_words == 'chars':
        X = [list(x) for x in X]
    elif chars_or_words == 'words':
        # replace unknowns with an unknown character
        tokenizer = WordPunctTokenizer()
        new_X = []
        for x in X:
            new_X.append([w if w in alphabet else 'UNKNOWN' for w in tokenizer.tokenize(x)])
        X = new_X
    else:
        raise ValueError("I don't know what that means :(")

    train_data_provider = LabelledSequenceMinibatchProvider(
        X=X[:-500],
        Y=Y[:-500],
        batch_size=50,
        padding='PADDING')

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X[-500:],
        Y=Y[-500:],
        batch_size=500,
        padding='PADDING')

    cost_function = CrossEntropy()
    objective = CostMinimizationObjective(cost=cost_function, data_provider=train_data_provider)
    update_rule = AdaGrad(gamma=0.05, model_template=model)
    regularizer = L2Regularizer(lamb=1e-4)
    optimizer = SGD(
        model=model,
        objective=objective,
        update_rule=update_rule,
        regularizer=regularizer)

    print model

    monitor_info = []
    iteration_info = []
    for batch_index, info in enumerate(optimizer):
        iteration_info.append(info)

        if batch_index % 10 == 0:
            X_valid, Y_valid, meta_valid = validation_data_provider.next_batch()

            Y_hat = model.fprop(X_valid, meta=meta_valid)
            assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

            acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))
            prop_1 = np.argmax(Y_hat, axis=1).mean()

            monitor_info.append({
                'batch_index': batch_index,
                'acc': acc,
                'prop_1': prop_1,
            })

            print "B: {}, A: {}, C: {}, Prop1: {}, Param size: {}".format(
                batch_index, acc, info['cost'], prop_1, np.mean(np.abs(model.pack())))

        if batch_index == n_batches - 1:
            break

    result = {
        'model': model,
        'iteration_info': iteration_info,
        'monitor_info': monitor_info,
    }

    with open(result_file_name, 'w') as result_file:
        pickle.dump(result, result_file, protocol=-1)