def build_model(weights=None, embedding_size=256, recurrent_gate_size=512, n_features=5, dropout=0.4):
    """
    build_model

    Inputs:
        weights - Path to a weights file to load, or None if the model should be built from scratch
        embedding_size - Size of the embedding layer
        recurrent_gate_size - Size of the gated recurrent layer
        n_features - Number of features for the embedding layer
        dropout - Dropout value

    Returns:
        A model object ready for training (or evaluation if a previous model was loaded via `weights`)
    """
    # vvvvv
    # Modify this if you want to change the structure of the network!
    # ^^^^^
    model_layers = [
        Embedding(size=embedding_size, n_features=n_features),
        GatedRecurrent(size=recurrent_gate_size, p_drop=dropout),
        Dense(size=1, activation='sigmoid', p_drop=dropout)
    ]

    model = RNN(layers=model_layers, cost='BinaryCrossEntropy', verbose=2, updater='Adam')

    if weights:
        # Just load the provided model instead, I guess?
        model = load(weights)

    return model
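# A minimal usage sketch for build_model (assumed workflow, not part of the original
# snippet). The Tokenizer and fit() calls follow the stock Passage API;
# `train_texts` and `train_labels` are placeholder names for your own data.
from passage.preprocessing import Tokenizer

tokenizer = Tokenizer(min_df=10, max_features=100000)
train_tokens = tokenizer.fit_transform(train_texts)   # raw strings -> token id sequences
model = build_model(n_features=tokenizer.n_features)  # size the embedding to the vocabulary
model.fit(train_tokens, train_labels, n_epochs=10)    # train_labels: 0/1 sentiment labels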
def main(ptrain, ntrain, ptest, ntest, out, modeltype):
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]

    print("Using the %s model ..." % modeltype)
    print("Loading data ...")
    trX, trY = load_data(ptrain, ntrain)
    teX, teY = load_data(ptest, ntest)

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    trX = tokenizer.fit_transform(trX)
    teX = tokenizer.transform(teX)

    print("Training ...")
    if modeltype == "gated_recurrent":
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            LstmRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                          init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(trX, trY, n_epochs=10)

    # Predicting the probabilities of positive labels
    print("Predicting ...")
    pr_teX = model.predict(teX).flatten()

    predY = np.ones(len(teY))
    predY[pr_teX < 0.5] = -1

    with open(out, "w") as f:
        for lab, pos_pr, neg_pr in zip(predY, pr_teX, 1 - pr_teX):
            f.write("%d %f %f\n" % (lab, pos_pr, neg_pr))
def rnn(train_text, train_label):
    tokenizer = Tokenizer()
    train_tokens = tokenizer.fit_transform(train_text)

    layers = [
        Embedding(size=50, n_features=tokenizer.n_features),
        GatedRecurrent(size=128),
        Dense(size=1, activation='sigmoid')
    ]
    # print "train_tokens=", train_tokens
    model = RNN(layers=layers, cost='BinaryCrossEntropy')
    model.fit(train_tokens, train_label)
    return model
def train_RNN(tokenizer, tokens, labels):
    """
    INPUT: Trained tokenizer class, array of tokenized critic reviews, and the
           corresponding label array
    Returns a trained Recurrent Neural Network class object
    """
    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                       init='orthogonal', seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal')
    ]

    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    path_snapshots = 'model_snapshots'

    print "Begin fitting RNN"
    model.fit(tokens, labels, n_epochs=12)

    return model
def build_model(weights=None, embedding_size=128, recurrent_gate_size=256, n_features=5, dropout=0.1):
    """
    build_model

    Inputs:
        weights - Path to a weights file to load, or None if the model should be built from scratch
        embedding_size - Size of the embedding layer
        recurrent_gate_size - Size of the gated recurrent layer
        n_features - Number of features for the embedding layer
        dropout - Dropout value

    Returns:
        A model object ready for training (or evaluation if a previous model was loaded via `weights`)
    """
    # vvvvv
    # Modify this if you want to change the structure of the network!
    # ^^^^^
    model_layers = [
        Embedding(size=embedding_size, n_features=n_features),
        GatedRecurrent(size=recurrent_gate_size, p_drop=dropout),
        Dense(size=1, activation='sigmoid', p_drop=dropout)
    ]

    # Note: RNN() does not accept an `embedding_size` keyword; the embedding size is
    # already set on the Embedding layer above, so only pass arguments RNN understands.
    args = {
        'layers': model_layers,
        'cost': 'BinaryCrossEntropy',
        'verbose': 2,
        'updater': Adadelta(lr=0.5)
    }

    model = RNN(**args)

    if weights:
        # Just load the provided model instead, I guess?
        print "Loading previously created weights file: ", weights
        model = load(weights)

    return model
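# A hedged round-trip sketch for the `weights` argument (assumed usage, not from the
# original file): passage.utils.save/load serialize the whole model, so a previously
# saved file can be handed back to build_model() for evaluation.
from passage.utils import save

model = build_model(n_features=5000)          # 5000 is a placeholder vocabulary size
# ... model.fit(...) ...
save(model, 'model_checkpoint.pkl')           # hypothetical path
restored = build_model(weights='model_checkpoint.pkl')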
def train(args):
    zero_words = cPickle.load(gzip.open("zero_shot.pkl.gz")) if args.zero_shot else set()

    def maybe_zero(s, i):
        overlap = set(tokenize(s)).intersection(zero_words)
        if args.zero_shot and len(overlap) > 0:
            return numpy.zeros(i.shape)
        else:
            return i

    dataset = args.dataset
    tok_path = args.tokenizer
    model_path = args.model
    d = dp.getDataProvider(dataset)
    pairs = list(d.iterImageSentencePair(split='train'))
    if args.shuffle:
        numpy.random.shuffle(pairs)
    output_size = len(pairs[0]['image']['feat'])
    embedding_size = args.embedding_size if args.embedding_size is not None else args.hidden_size
    tokenizer = cPickle.load(gzip.open(args.init_tokenizer)) \
        if args.init_tokenizer else Tokenizer(min_df=args.word_freq_threshold, character=args.character)
    sentences, images = zip(*[ (pair['sentence']['raw'], maybe_zero(pair['sentence']['raw'], pair['image']['feat']))
                               for pair in pairs ])
    scaler = StandardScaler() if args.scaler == 'standard' else NoScaler()
    images = scaler.fit_transform(images)
    tokens = [ [tokenizer.encoder['PAD']] + sent + [tokenizer.encoder['END']]
               for sent in tokenizer.fit_transform(sentences) ]
    tokens_inp = [ token[:-1] for token in tokens ]
    tokens_out = [ token[1:] for token in tokens ]
    cPickle.dump(tokenizer, gzip.open(tok_path, 'w'))
    cPickle.dump(scaler, gzip.open('scaler.pkl.gz', 'w'))

    # Validation data
    valid_pairs = list(d.iterImageSentencePair(split='val'))
    valid_sents, valid_images = zip(*[ (pair['sentence']['raw'], pair['image']['feat'])
                                       for pair in valid_pairs ])
    valid_images = scaler.transform(valid_images)
    valid_tokens = [ [tokenizer.encoder['PAD']] + sent + [tokenizer.encoder['END']]
                     for sent in tokenizer.transform(valid_sents) ]
    valid_tokens_inp = [ token[:-1] for token in valid_tokens ]
    valid_tokens_out = [ token[1:] for token in valid_tokens ]
    valid = (valid_tokens_inp, valid_tokens_out, valid_images)

    updater = passage.updates.Adam(lr=args.rate, clipnorm=args.clipnorm)

    if args.cost == 'MeanSquaredError':
        z_cost = MeanSquaredError
    elif args.cost == 'CosineDistance':
        z_cost = CosineDistance
    else:
        raise ValueError("Unknown cost")

    if args.hidden_type == 'gru':
        Recurrent = GatedRecurrent
    elif args.hidden_type == 'lstm':
        Recurrent = LstmRecurrent
    else:
        Recurrent = GatedRecurrent

    # if args.init_model is not None:
    #     model_init = cPickle.load(open(args.init_model))
    #     def values(ps):
    #         return [ p.get_value() for p in ps ]
    #     # FIXME enable this for shared only embeddings
    #     layers = [ Embedding(size=args.hidden_size, n_features=tokenizer.n_features,
    #                          weights=values(model_init.layers[0].params)),
    #                Recurrent(seq_output=True, size=args.hidden_size, activation=args.activation,
    #                          weights=values(model_init.layers[1].params)),
    #                Combined(left=Dense(size=tokenizer.n_features, activation='softmax', reshape=True,
    #                                    weights=values(model_init.layers[2].left.params)),
    #                         right=Dense(size=output_size, activation=args.out_activation,
    #                                     weights=values(model_init.layers[2].right.params))
    #                         ) ]
    # else:
    # FIXME implement proper pretraining FIXME
    interpolated = True if not args.non_interpolated else False

    if args.model_type in ['add', 'mult', 'matrix']:
        if args.model_type == 'add':
            layer0 = Direct(size=embedding_size, n_features=tokenizer.n_features, op=Add)
        elif args.model_type == 'mult':
            layer0 = Direct(size=embedding_size, n_features=tokenizer.n_features, op=Mult)
        elif args.model_type == 'matrix':
            sqrt_size = embedding_size ** 0.5
            if not sqrt_size.is_integer():
                raise ValueError("Sqrt of embedding_size not integral for matrix model")
            layer0 = Direct(size=embedding_size, n_features=tokenizer.n_features, op=MatrixMult)
        layers = [ layer0,
                   Dense(size=output_size, activation=args.out_activation, reshape=False) ]
        valid = (valid_tokens_inp, valid_images)
        model = RNN(layers=layers, updater=updater, cost=z_cost,
                    iterator=SortedPadded(shuffle=False), verbose=1)
        model.fit(tokens_inp, images, n_epochs=args.iterations, batch_size=args.batch_size,
                  len_filter=None, snapshot_freq=args.snapshot_freq, path=model_path, valid=valid)

    elif args.model_type == 'simple':
        layers = [ Embedding(size=embedding_size, n_features=tokenizer.n_features),
                   Recurrent(seq_output=False, size=args.hidden_size, activation=args.activation),
                   Dense(size=output_size, activation=args.out_activation, reshape=False) ]
        valid = (valid_tokens_inp, valid_images)
        model = RNN(layers=layers, updater=updater, cost=z_cost,
                    iterator=SortedPadded(shuffle=False), verbose=1)
        model.fit(tokens_inp, images, n_epochs=args.iterations, batch_size=args.batch_size,
                  len_filter=None, snapshot_freq=args.snapshot_freq, path=model_path, valid=valid)
        # FIXME need validation

    elif args.model_type == 'deep-simple':
        layers = [ Embedding(size=embedding_size, n_features=tokenizer.n_features),
                   Recurrent(seq_output=True, size=args.hidden_size, activation=args.activation),
                   Recurrent(seq_output=False, size=args.hidden_size, activation=args.activation),
                   Dense(size=output_size, activation=args.out_activation, reshape=False) ]
        valid = (valid_tokens_inp, valid_images)
        model = RNN(layers=layers, updater=updater, cost=z_cost,
                    iterator=SortedPadded(shuffle=False), verbose=1)
        model.fit(tokens_inp, images, n_epochs=args.iterations, batch_size=args.batch_size,
                  len_filter=None, snapshot_freq=args.snapshot_freq, path=model_path, valid=valid)
        # FIXME need validation

    elif args.model_type == 'shared_all':
        if args.zero_shot:
            raise NotImplementedError  # FIXME zero_shot not implemented
        layers = [ Embedding(size=embedding_size, n_features=tokenizer.n_features),
                   Recurrent(seq_output=True, size=args.hidden_size, activation=args.activation),
                   Combined(left=Dense(size=tokenizer.n_features, activation='softmax', reshape=True),
                            right=Dense(size=output_size, activation=args.out_activation, reshape=False)) ]
        model = ForkedRNN(layers=layers, updater=updater, cost_y=CategoricalCrossEntropySwapped,
                          cost_z=z_cost, alpha=args.alpha, size_y=tokenizer.n_features,
                          verbose=1, interpolated=interpolated)
        model.fit(tokens_inp, tokens_out, images, n_epochs=args.iterations, batch_size=args.batch_size,
                  snapshot_freq=args.snapshot_freq, path=model_path, valid=valid)

    elif args.model_type == 'shared_embeddings':
        layers = [ Embedding(size=embedding_size, n_features=tokenizer.n_features),
                   Combined(left=Stacked([Recurrent(seq_output=True, size=args.hidden_size,
                                                    activation=args.activation),
                                          Dense(size=tokenizer.n_features, activation='softmax',
                                                reshape=True)]),
                            left_type='id',
                            right=Stacked([Recurrent(seq_output=False, size=args.hidden_size,
                                                     activation=args.activation),
                                           Dense(size=output_size, activation=args.out_activation,
                                                 reshape=False)]),
                            right_type='id') ]
        model = ForkedRNN(layers=layers, updater=updater, cost_y=CategoricalCrossEntropySwapped,
                          cost_z=z_cost, alpha=args.alpha, size_y=tokenizer.n_features,
                          verbose=1, interpolated=interpolated, zero_shot=args.zero_shot)
        model.fit(tokens_inp, tokens_out, images, n_epochs=args.iterations, batch_size=args.batch_size,
                  snapshot_freq=args.snapshot_freq, path=model_path, valid=valid)

    cPickle.dump(model, gzip.open(model_path, "w"))
from passage.layers import Embedding
from passage.layers import GatedRecurrent
from passage.layers import LstmRecurrent
from passage.layers import Dense
from passage.models import RNN
from passage.utils import save, load

print("Loading data...")
num_training = int((1.0 - 0.2) * len(xs))
X_train, y_train, X_test, y_test = xs[:num_training], ys[:num_training], xs[num_training:], ys[num_training:]

num_feats = generator.max_id() + 1
layers = [
    Embedding(size=128, n_features=num_feats),
    #LstmRecurrent(size=32),
    #NOTE - to use a deep RNN, you need all but the final layers with seq_output=True
    #GatedRecurrent(size=128, seq_output=True),
    #GatedRecurrent(size=256, direction='backward' if REVERSE else 'forward'),
    GatedRecurrent(size=128, seq_output=True),
    GatedRecurrent(size=128),
    #Dense(size=64, activation='sigmoid'),
    Dense(size=len(lst_freq_tags), activation='sigmoid'),
]
#emd 128, gru 32/64 is good - 0.70006 causer

print("Creating Model")
model = RNN(layers=layers, cost='bce')
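# A hedged training/evaluation sketch (assumed continuation; the original snippet stops
# after building the model). fit() and predict() are the standard Passage RNN methods,
# and X_train / y_train / X_test come from the split above; the epoch count is a placeholder.
model.fit(X_train, y_train, n_epochs=10)
predictions = model.predict(X_test)   # per-tag sigmoid probabilities
print("Predictions shape: %s" % str(predictions.shape))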
import sys

# ---

# ---

print 'loading dataset'
d = Dataset(settings['FN_DATASET'], settings['FN_VOCABULARY'])
d.load()

print 'generating labeled training set'
train_text, train_labels = d.getNextWordPredTrainset(10)
#for t,l in zip(train_text,train_labels):
#    print t,'->',l

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)
save(train_tokens, settings['FN_TRAINED_TOKENIZER'])

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=128),
    Dense(size=1, activation='sigmoid')
]
model = RNN(layers=layers, cost='BinaryCrossEntropy')
model.fit(train_tokens, train_labels)
save(model, settings['FN_MODEL_NEXTWORDPRED'])
if __name__ == "__main__":
    tr_data = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
    trX = clean(tr_data['review'].values)
    trY = tr_data['sentiment'].values
    print("Training data loaded and cleaned.")

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    trX = tokenizer.fit_transform(trX)
    print("Training data tokenized.")

    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                       init='orthogonal', seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal')
    ]

    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(trX, trY, n_epochs=10)

    te_data = pd.read_csv('testData.tsv', delimiter='\t')
    ids = te_data['id'].values
    teX = clean(te_data['review'].values)
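    # A hedged continuation sketch (the original snippet ends after loading the test set):
    # tokenize the cleaned test reviews with the already-fitted tokenizer and write a
    # Kaggle-style submission. The output filename is a placeholder.
    teX = tokenizer.transform(teX)
    pred = model.predict(teX).flatten()
    submission = pd.DataFrame({'id': ids, 'sentiment': (pred >= 0.5).astype(int)})
    submission.to_csv('submission.csv', index=False)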
from passage.layers import Embedding
from passage.layers import GatedRecurrent
from passage.layers import LstmRecurrent
from passage.layers import Dense
from passage.models import RNN
from passage.utils import save, load

print("Loading data...")
num_training = int((1.0 - TEST_SPLIT) * len(xs))
X_train, y_train, X_test, y_test = xs[:num_training], ys[:num_training], xs[num_training:], ys[num_training:]

num_feats = generator.max_id() + 1
layers = [
    Embedding(size=64, n_features=num_feats),
    #LstmRecurrent(size=32),
    #NOTE - to use a deep RNN, you need all but the final layers with seq_output=True
    #GatedRecurrent(size=64, seq_output=True),
    GatedRecurrent(size=64, direction='backward' if REVERSE else 'forward'),
    #LstmRecurrent(size=128),
    Dense(size=1, activation='sigmoid'),
]
#emd 64, gru 64 is good - 0.70833 causer (0 prev sents)

print("Creating Model")
model = RNN(layers=layers, cost='bce')

def find_cutoff(y_test, predictions):
def train_model(modeltype, delta):
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]

    print "Begin Training"
    df_imdb_reviews = pd.read_csv('../data/imdb_review_data.tsv', escapechar='\\', delimiter='\t')
    X = clean(df_imdb_reviews['review'].values)
    y = df_imdb_reviews['sentiment'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

    print "Tokenize"
    tokenizer = Tokenizer(min_df=10, max_features=100000)
    X_train = tokenizer.fit_transform(X_train)
    X_train = [[float(x) for x in y] for y in X_train]
    X_test = tokenizer.transform(X_test)
    X_test = [[float(x) for x in y] for y in X_test]
    print "Number of features: {}".format(tokenizer.n_features)

    print "Training model"
    if modeltype == "gated_recurrent":
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=True, p_drop=0.5),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            LstmRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                          init='orthogonal', seq_output=True, p_drop=0.5),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # bce is the classification loss for binary classification with a sigmoid output
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=delta))
    model.fit(X_train, y_train, n_epochs=20)

    # Persist the tokenizer and model (pickle.dump returns None, so don't rebind the
    # variables we still need for the metrics below)
    with open('../data/{}_tokenizer_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'w') as f:
        pickle.dump(tokenizer, f)
    with open('../data/{}_model_delta_{}._pdrop_0.5.pkl'.format(modeltype, delta), 'w') as f:
        pickle.dump(model, f)

    try:
        y_pred_te = model.predict(X_test).flatten() >= 0.5
        y_pred_tr = model.predict(X_train).flatten() >= 0.5
        print 'Test Accuracy: {}'.format(accuracy_score(y_test, y_pred_te))
        print 'Test Precision: {}'.format(precision_score(y_test, y_pred_te))
        print 'Test Recall: {}'.format(recall_score(y_test, y_pred_te))
        print 'Train Accuracy: {}'.format(accuracy_score(y_train, y_pred_tr))
        print 'Train Precision: {}'.format(precision_score(y_train, y_pred_tr))
        print 'Train Recall: {}'.format(recall_score(y_train, y_pred_tr))
    except:
        print "Unable to perform metrics"

    return tokenizer, model
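# A hedged inference sketch (assumed usage, not from the original file): reload the
# pickled tokenizer and model written by train_model() and score new review text.
# The file paths are placeholders that follow the naming pattern used above.
import pickle

with open('../data/gated_recurrent_tokenizer_delta_0.5_pdrop_0.5.pkl') as f:
    tokenizer = pickle.load(f)
with open('../data/gated_recurrent_model_delta_0.5._pdrop_0.5.pkl') as f:
    model = pickle.load(f)

new_X = tokenizer.transform(["an example review to score"])
print model.predict(new_X).flatten()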
def train_and_save_passage_tokenizer_and_rnn_model(x_train, y_train, x_test, character_model=False):
    """Train and save Passage tokenizer and Passage RNN model.

    x_train and x_test should each be a series that's already been pre-processed:
    html->text, lowercase, removed punct/#s

    x_train+x_test are used to build the tokenizer.

    Note that the character-based RNN is a work-in-progress and not actually implemented as of now.
    """
    # Note that we assume we have train/test reviews that have been preprocessed: html->text,
    # lowercased, removed punct/#s
    # Note in https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py they only
    # extract text from html, lowercase and strip (no punctuation removal)

    # Tokenization: Assign each word in the reviews an ID to be used in all reviews
    tokenizer = Tokenizer(min_df=10, max_features=100000, character=character_model)

    train_reviews_list = x_train.tolist()
    tokenizer.fit(train_reviews_list + x_test.tolist())

    # Tokenize training reviews (so we can fit the RNN model on them)
    train_reviews_tokenized = tokenizer.transform(train_reviews_list)

    # Based on https://github.com/vinhkhuc/kaggle-sentiment-popcorn/blob/master/scripts/passage_nn.py
    # which is based on https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py

    # RNN Network:
    # -Each tokenized review will be converted into a sequence of words, where each word has an
    #  embedding representation (256)
    # -RNN layer (GRU) attempts to find patterns in the sequence of words
    # -Final dense layer is used as a logistic classifier to turn RNN output into a probability/prediction
    if not character_model:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            # May replace with LstmRecurrent for LSTM layer
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        # Character-level RNN
        # Idea is to convert character tokenizations into one-hot encodings, in which case
        # the embedding layer is no longer needed
        train_reviews_tokenized = map(
            lambda r_indexes: pd.get_dummies(
                r_indexes, columns=range(tokenizer.n_features + 1)).values,
            train_reviews_tokenized)
        layers = [
            # May replace with LstmRecurrent for LSTM layer
            GatedRecurrent(size=100, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # The RNN classifier uses Binary Cross-Entropy as the cost function
    classifier = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))

    NUM_EPOCHS = 10  # 10 epochs may take 10+ hours to run depending on machine
    classifier.fit(train_reviews_tokenized, y_train.tolist(), n_epochs=NUM_EPOCHS)

    # Store model and tokenizer
    if character_model:
        passage.utils.save(classifier, PASSAGE_CHAR_RNN_MODEL)
        _ = joblib.dump(tokenizer, PASSAGE_CHAR_TOKENIZER, compress=9)
    else:
        passage.utils.save(classifier, PASSAGE_RNN_MODEL)
        _ = joblib.dump(tokenizer, PASSAGE_TOKENIZER, compress=9)
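# A hedged inference sketch (assumed usage, not from the original file): reload the
# artifacts saved above and score new, already-preprocessed review text. passage.utils.load
# and joblib.load mirror the save/dump calls in the function; the PASSAGE_* constants are
# the same ones referenced there, and joblib is imported however the original module does it.
import passage.utils
import joblib

classifier = passage.utils.load(PASSAGE_RNN_MODEL)
tokenizer = joblib.load(PASSAGE_TOKENIZER)

new_tokens = tokenizer.transform(["a preprocessed review to score"])
probabilities = classifier.predict(new_tokens).flatten()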