def RNN(data_train, labels_train, data_test, labels_test, n_features):
    """
    Adapted from Passage's sentiment.py at
    https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py
    License: MIT
    """
    import numpy as np
    from passage.models import RNN
    from passage.updates import Adadelta
    from passage.layers import Embedding, GatedRecurrent, Dense
    from passage.preprocessing import Tokenizer

    layers = [
        Embedding(size=128, n_features=n_features),
        GatedRecurrent(size=128, activation='tanh', gate_activation='steeper_sigmoid',
                       init='orthogonal', seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal')
    ]

    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))

    tokenizer = Tokenizer(min_df=10)
    X = tokenizer.fit_transform(data_train)
    model.fit(X, labels_train, n_epochs=10)

    # Predict on the test data, encoded with the same fitted tokenizer
    predi = model.predict(tokenizer.transform(data_test)).flatten()
    labels_predicted = np.ones(len(data_test))
    labels_predicted[predi < 0.5] = 0
    return labels_predicted
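# Usage sketch for the RNN() helper above -- not part of the original source. The
# TSV file names follow the Kaggle word2vec-nlp-tutorial layout used elsewhere in
# this section; swap in your own data. Since the helper fits its Tokenizer with
# min_df=10, it needs a reasonably sized corpus to keep any vocabulary, and
# n_features must be at least tokenizer.n_features (100000 is a generous bound).
import pandas as pd

train_df = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
test_df = pd.read_csv('testData.tsv', delimiter='\t')
labels_predicted = RNN(train_df['review'].values, train_df['sentiment'].values,
                       test_df['review'].values, labels_test=None, n_features=100000)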
from passage.preprocessing import Tokenizer


def tokenize(train):
    """
    INPUT: Array - Text documents (reviews) to train sentiment on

    Returns a trained tokenizer.
    """
    tokenizer = Tokenizer(min_df=10, max_features=100000)
    print "Training tokenizer on reviews"
    tokenizer.fit(train)
    return tokenizer
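# Usage sketch (not in the original): fit once on the training reviews, then reuse
# the same fitted tokenizer on every split so token IDs stay consistent. Toy lists
# shown; with min_df=10 a token must occur in at least 10 documents to survive.
train_reviews = ['great movie'] * 20 + ['awful film'] * 20
test_reviews = ['pretty great movie']
tokenizer = tokenize(train_reviews)
train_tokens = tokenizer.transform(train_reviews)
test_tokens = tokenizer.transform(test_reviews)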
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN


def rnn(train_text, train_label):
    tokenizer = Tokenizer()
    train_tokens = tokenizer.fit_transform(train_text)
    layers = [
        Embedding(size=50, n_features=tokenizer.n_features),
        GatedRecurrent(size=128),
        Dense(size=1, activation='sigmoid')
    ]
    model = RNN(layers=layers, cost='BinaryCrossEntropy')
    model.fit(train_tokens, train_label)
    return model
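# Usage sketch (not in the original). Note that rnn() fits its Tokenizer
# internally and does not return it, so a caller cannot encode new text with
# consistent token IDs; returning (model, tokenizer) would be the more reusable
# design.
model = rnn(['good stuff', 'bad stuff'], [1, 0])  # toy training data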
import numpy as np

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, LstmRecurrent, Dense
from passage.models import RNN
from passage.updates import Adadelta


def main(ptrain, ntrain, ptest, ntest, out, modeltype):
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]

    print("Using the %s model ..." % modeltype)
    print("Loading data ...")
    # load_data is defined elsewhere in this project
    trX, trY = load_data(ptrain, ntrain)
    teX, teY = load_data(ptest, ntest)

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    trX = tokenizer.fit_transform(trX)
    teX = tokenizer.transform(teX)

    print("Training ...")
    if modeltype == "gated_recurrent":
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            LstmRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                          init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(trX, trY, n_epochs=10)

    # Predict the probabilities of positive labels
    print("Predicting ...")
    pr_teX = model.predict(teX).flatten()
    predY = np.ones(len(teY))
    predY[pr_teX < 0.5] = -1

    with open(out, "w") as f:
        for lab, pos_pr, neg_pr in zip(predY, pr_teX, 1 - pr_teX):
            f.write("%d %f %f\n" % (lab, pos_pr, neg_pr))
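# Hypothetical invocation of main() above (the file names are invented for
# illustration): positive/negative train and test files, an output path, and one
# of the two asserted model types.
if __name__ == "__main__":
    main("train-pos.txt", "train-neg.txt", "test-pos.txt", "test-neg.txt",
         "predictions.txt", "gated_recurrent")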
def train(args):
    zero_words = cPickle.load(gzip.open("zero_shot.pkl.gz")) if args.zero_shot else set()

    def maybe_zero(s, i):
        overlap = set(tokenize(s)).intersection(zero_words)
        if args.zero_shot and len(overlap) > 0:
            return numpy.zeros(i.shape)
        else:
            return i

    dataset = args.dataset
    tok_path = args.tokenizer
    model_path = args.model
    d = dp.getDataProvider(dataset)
    pairs = list(d.iterImageSentencePair(split='train'))
    if args.shuffle:
        numpy.random.shuffle(pairs)
    output_size = len(pairs[0]['image']['feat'])
    embedding_size = args.embedding_size if args.embedding_size is not None else args.hidden_size
    tokenizer = cPickle.load(gzip.open(args.init_tokenizer)) if args.init_tokenizer \
        else Tokenizer(min_df=args.word_freq_threshold, character=args.character)
    sentences, images = zip(*[(pair['sentence']['raw'],
                               maybe_zero(pair['sentence']['raw'], pair['image']['feat']))
                              for pair in pairs])
    scaler = StandardScaler() if args.scaler == 'standard' else NoScaler()
    images = scaler.fit_transform(images)
    tokens = [[tokenizer.encoder['PAD']] + sent + [tokenizer.encoder['END']]
              for sent in tokenizer.fit_transform(sentences)]
    tokens_inp = [token[:-1] for token in tokens]
    tokens_out = [token[1:] for token in tokens]
    cPickle.dump(tokenizer, gzip.open(tok_path, 'w'))
    cPickle.dump(scaler, gzip.open('scaler.pkl.gz', 'w'))

    # Validation data
    valid_pairs = list(d.iterImageSentencePair(split='val'))
    valid_sents, valid_images = zip(*[(pair['sentence']['raw'], pair['image']['feat'])
                                      for pair in valid_pairs])
    valid_images = scaler.transform(valid_images)
    valid_tokens = [[tokenizer.encoder['PAD']] + sent + [tokenizer.encoder['END']]
                    for sent in tokenizer.transform(valid_sents)]
    valid_tokens_inp = [token[:-1] for token in valid_tokens]
    valid_tokens_out = [token[1:] for token in valid_tokens]
    valid = (valid_tokens_inp, valid_tokens_out, valid_images)

    updater = passage.updates.Adam(lr=args.rate, clipnorm=args.clipnorm)
    if args.cost == 'MeanSquaredError':
        z_cost = MeanSquaredError
    elif args.cost == 'CosineDistance':
        z_cost = CosineDistance
    else:
        raise ValueError("Unknown cost")
    if args.hidden_type == 'gru':
        Recurrent = GatedRecurrent
    elif args.hidden_type == 'lstm':
        Recurrent = LstmRecurrent
    else:
        Recurrent = GatedRecurrent

    # if args.init_model is not None:
    #     model_init = cPickle.load(open(args.init_model))
    #     def values(ps):
    #         return [p.get_value() for p in ps]
    #     # FIXME enable this for shared only embeddings
    #     layers = [Embedding(size=args.hidden_size, n_features=tokenizer.n_features,
    #                         weights=values(model_init.layers[0].params)),
    #               Recurrent(seq_output=True, size=args.hidden_size, activation=args.activation,
    #                         weights=values(model_init.layers[1].params)),
    #               Combined(left=Dense(size=tokenizer.n_features, activation='softmax', reshape=True,
    #                                   weights=values(model_init.layers[2].left.params)),
    #                        right=Dense(size=output_size, activation=args.out_activation,
    #                                    weights=values(model_init.layers[2].right.params)))]
    # else:
    # FIXME implement proper pretraining
    interpolated = not args.non_interpolated
    if args.model_type in ['add', 'mult', 'matrix']:
        if args.model_type == 'add':
            layer0 = Direct(size=embedding_size, n_features=tokenizer.n_features, op=Add)
        elif args.model_type == 'mult':
            layer0 = Direct(size=embedding_size, n_features=tokenizer.n_features, op=Mult)
        elif args.model_type == 'matrix':
            sqrt_size = embedding_size ** 0.5
            if not sqrt_size.is_integer():
                raise ValueError("Sqrt of embedding_size not integral for matrix model")
            layer0 = Direct(size=embedding_size, n_features=tokenizer.n_features, op=MatrixMult)
        layers = [layer0,
                  Dense(size=output_size, activation=args.out_activation, reshape=False)]
        valid = (valid_tokens_inp, valid_images)
        model = RNN(layers=layers, updater=updater, cost=z_cost,
                    iterator=SortedPadded(shuffle=False), verbose=1)
        model.fit(tokens_inp, images, n_epochs=args.iterations, batch_size=args.batch_size,
                  len_filter=None, snapshot_freq=args.snapshot_freq, path=model_path,
                  valid=valid)
    elif args.model_type == 'simple':
        layers = [Embedding(size=embedding_size, n_features=tokenizer.n_features),
                  Recurrent(seq_output=False, size=args.hidden_size, activation=args.activation),
                  Dense(size=output_size, activation=args.out_activation, reshape=False)]
        valid = (valid_tokens_inp, valid_images)
        model = RNN(layers=layers, updater=updater, cost=z_cost,
                    iterator=SortedPadded(shuffle=False), verbose=1)
        model.fit(tokens_inp, images, n_epochs=args.iterations, batch_size=args.batch_size,
                  len_filter=None, snapshot_freq=args.snapshot_freq, path=model_path,
                  valid=valid)  # FIXME need validation
    elif args.model_type == 'deep-simple':
        layers = [Embedding(size=embedding_size, n_features=tokenizer.n_features),
                  Recurrent(seq_output=True, size=args.hidden_size, activation=args.activation),
                  Recurrent(seq_output=False, size=args.hidden_size, activation=args.activation),
                  Dense(size=output_size, activation=args.out_activation, reshape=False)]
        valid = (valid_tokens_inp, valid_images)
        model = RNN(layers=layers, updater=updater, cost=z_cost,
                    iterator=SortedPadded(shuffle=False), verbose=1)
        model.fit(tokens_inp, images, n_epochs=args.iterations, batch_size=args.batch_size,
                  len_filter=None, snapshot_freq=args.snapshot_freq, path=model_path,
                  valid=valid)  # FIXME need validation
    elif args.model_type == 'shared_all':
        if args.zero_shot:
            raise NotImplementedError  # FIXME zero_shot not implemented
        layers = [Embedding(size=embedding_size, n_features=tokenizer.n_features),
                  Recurrent(seq_output=True, size=args.hidden_size, activation=args.activation),
                  Combined(left=Dense(size=tokenizer.n_features, activation='softmax',
                                      reshape=True),
                           right=Dense(size=output_size, activation=args.out_activation,
                                       reshape=False))]
        model = ForkedRNN(layers=layers, updater=updater, cost_y=CategoricalCrossEntropySwapped,
                          cost_z=z_cost, alpha=args.alpha, size_y=tokenizer.n_features,
                          verbose=1, interpolated=interpolated)
        model.fit(tokens_inp, tokens_out, images, n_epochs=args.iterations,
                  batch_size=args.batch_size, snapshot_freq=args.snapshot_freq,
                  path=model_path, valid=valid)
    elif args.model_type == 'shared_embeddings':
        layers = [Embedding(size=embedding_size, n_features=tokenizer.n_features),
                  Combined(left=Stacked([Recurrent(seq_output=True, size=args.hidden_size,
                                                   activation=args.activation),
                                         Dense(size=tokenizer.n_features, activation='softmax',
                                               reshape=True)]),
                           left_type='id',
                           right=Stacked([Recurrent(seq_output=False, size=args.hidden_size,
                                                    activation=args.activation),
                                          Dense(size=output_size, activation=args.out_activation,
                                                reshape=False)]),
                           right_type='id')]
        model = ForkedRNN(layers=layers, updater=updater, cost_y=CategoricalCrossEntropySwapped,
                          cost_z=z_cost, alpha=args.alpha, size_y=tokenizer.n_features,
                          verbose=1, interpolated=interpolated, zero_shot=args.zero_shot)
        model.fit(tokens_inp, tokens_out, images, n_epochs=args.iterations,
                  batch_size=args.batch_size, snapshot_freq=args.snapshot_freq,
                  path=model_path, valid=valid)
    cPickle.dump(model, gzip.open(model_path, "w"))
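# A sketch of the argument object train() expects, assembled from the attributes
# the function actually reads. The attribute names come from the code above, but
# every value here is a guess for illustration only.
from argparse import Namespace

args = Namespace(
    dataset='flickr8k', tokenizer='tokenizer.pkl.gz', model='model.pkl.gz',
    shuffle=True, zero_shot=False, character=False,
    embedding_size=None, hidden_size=512, word_freq_threshold=10,
    init_tokenizer=None, scaler='standard',
    rate=0.0002, clipnorm=5.0, cost='CosineDistance', hidden_type='gru',
    model_type='simple', activation='tanh', out_activation='linear',
    alpha=0.5, non_interpolated=False,
    iterations=10, batch_size=64, snapshot_freq=1)
train(args)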
import io
import random
import sys
from itertools import product

from sklearn import cross_validation

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save, load

random.seed(0)

textfile, labelfile = sys.argv[1:]
train_text, train_labels = [], []
with io.open(textfile, "r", encoding="utf8") as txtfin, io.open(labelfile, "r") as labelfin:
    for text, label in zip(txtfin, labelfin):
        train_text.append(text.strip())
        train_labels.append(int(label.strip()))

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)

embedding_sizes = [10, 20, 50, 100, 200, 1000]
gru_sizes = [10, 20, 50, 100, 200, 1000]
epochs = [1, 3, 5, 7, 10]

for embedding_size, gru_size, num_epochs in product(embedding_sizes, gru_sizes, epochs):
    X_train, X_test, y_train, y_test = cross_validation.train_test_split(
        train_text, train_labels, test_size=0.1, random_state=0)
    layers = [
        Embedding(size=embedding_size, n_features=tokenizer.n_features),
        GatedRecurrent(size=gru_size),
        Dense(size=1, activation="sigmoid"),
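# The snippet above is cut off inside the layers list. A sketch of how the loop
# body might continue (an assumption, not recovered from the original): close the
# list, train on the tokenized split, and score the held-out fold.
#
#     ]
#     model = RNN(layers=layers, cost="BinaryCrossEntropy")
#     model.fit(tokenizer.transform(X_train), y_train, n_epochs=num_epochs)
#     preds = model.predict(tokenizer.transform(X_test)).flatten() > 0.5
#     print(embedding_size, gru_size, num_epochs,
#           sum(p == y for p, y in zip(preds, y_test)) / float(len(y_test)))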
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save, load

train_text = ['hello world', 'foo bar']
train_labels = [0, 1]
test_text = ['hello you', 'not']

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)

# layers = [
#     Embedding(size=128, n_features=tokenizer.n_features),
#     GatedRecurrent(size=128),
#     Dense(size=1, activation='sigmoid')
# ]
# model = RNN(layers=layers, cost='BinaryCrossEntropy')
# model.fit(train_tokens, train_labels)
# model.predict(tokenizer.transform(test_text))
# save(model, 'save_test.pkl')
# model = load('save_test.pkl')

print train_tokens
import pandas as pd
from lxml import html

from passage.models import RNN
from passage.updates import Adadelta
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.preprocessing import Tokenizer

# download data at kaggle.com/c/word2vec-nlp-tutorial/data


def clean(texts):
    return [html.fromstring(text).text_content().lower().strip() for text in texts]


if __name__ == "__main__":
    tr_data = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
    trX = clean(tr_data['review'].values)
    trY = tr_data['sentiment'].values
    print("Training data loaded and cleaned.")

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    trX = tokenizer.fit_transform(trX)
    print("Training data tokenized.")

    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                       init='orthogonal', seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal')
    ]

    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(trX, trY, n_epochs=10)

    te_data = pd.read_csv('testData.tsv', delimiter='\t')
    ids = te_data['id'].values
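    # A sketch of how this truncated script might finish (an assumption, not
    # recovered from the original): tokenize the cleaned test reviews with the
    # fitted tokenizer, predict, and write a Kaggle-style submission keyed by ids.
    teX = tokenizer.transform(clean(te_data['review'].values))
    pr_teX = model.predict(teX).flatten()
    pd.DataFrame({'id': ids, 'sentiment': (pr_teX > 0.5).astype(int)}).to_csv(
        'submission.csv', index=False)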
import os

import pandas as pd
from sklearn import metrics

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import load, save
from load import load_gender_data

trX, teX, trY, teY = load_gender_data(ntrain=10000)  # Can increase up to 250K or so

tokenizer = Tokenizer(min_df=10, max_features=50000)
print trX[1]  # see a blog example
trX = tokenizer.fit_transform(trX)
teX = tokenizer.transform(teX)
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce')  # bce is the classification loss for binary classification with sigmoid output

for i in range(2):
    model.fit(trX, trY, n_epochs=1)
    tr_preds = model.predict(trX[:len(teY)])
    te_preds = model.predict(teX)
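    # Hedged addition (not in the snippet above): report train/held-out accuracy
    # after each pass, using the sklearn metrics module already imported.
    tr_acc = metrics.accuracy_score(trY[:len(teY)], tr_preds > 0.5)
    te_acc = metrics.accuracy_score(teY, te_preds > 0.5)
    print i, tr_acc, te_acc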
import sys

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save

# ---

# ---

# Dataset and settings come from this project's own modules
print 'loading dataset'
d = Dataset(settings['FN_DATASET'], settings['FN_VOCABULARY'])
d.load()

print 'generating labeled training set'
train_text, train_labels = d.getNextWordPredTrainset(10)
# for t, l in zip(train_text, train_labels):
#     print t, '->', l

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)
# Save the fitted tokenizer itself so new text can be encoded consistently later
save(tokenizer, settings['FN_TRAINED_TOKENIZER'])

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=128),
    Dense(size=1, activation='sigmoid')
]

model = RNN(layers=layers, cost='BinaryCrossEntropy')
model.fit(train_tokens, train_labels)
save(model, settings['FN_MODEL_NEXTWORDPRED'])
#!/usr/bin/env python
# coding=utf-8
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save, load

train_text = ['hello world', 'foo bar']
train_labels = [0, 1]
test_text = ['good man']

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=128),
    Dense(size=1, activation='sigmoid')
]

model = RNN(layers=layers, cost='BinaryCrossEntropy')
model.fit(train_tokens, train_labels)

print model.predict(tokenizer.transform(test_text))
save(model, 'save_test.pkl')
model = load('save_test.pkl')
    fullpath = os.path.join(file_loc, relative_path)
    data = pd.read_csv(fullpath, nrows=ntrain + ntest)
    X = data['text'].values
    X = [str(x) for x in X]  # ugly NaN cleaner
    Y = data['gender'].values
    trX = X[:-ntest]
    teX = X[-ntest:]
    trY = Y[:-ntest]
    teY = Y[-ntest:]
    return trX, teX, trY, teY

trX, teX, trY, teY = load_gender_data(ntrain=10000)  # Can increase up to 250K or so

tokenizer = Tokenizer(min_df=10, max_features=50000)
print trX[1:2]  # see a blog example
trX = tokenizer.fit_transform(trX)
teX = tokenizer.transform(teX)
print tokenizer.inverse_transform(trX[1:2])  # see what words are kept
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
import joblib
import pandas as pd

import passage.utils
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.updates import Adadelta


def train_and_save_passage_tokenizer_and_rnn_model(x_train, y_train, x_test,
                                                   character_model=False):
    """Train and save Passage tokenizer and Passage RNN model.

    x_train and x_test should each be a series that has already been preprocessed:
    html->text, lowercased, punctuation/numbers removed.

    x_train + x_test are used to build the tokenizer.

    Note that the character-based RNN is a work in progress and not actually
    implemented as of now.
    """
    # Note that we assume the train/test reviews have been preprocessed:
    # html->text, lowercased, punctuation/numbers removed.
    # Note that in https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py
    # they only extract text from html, lowercase and strip (no punctuation removal).

    # Tokenization: assign each word in the reviews an ID to be used in all reviews
    tokenizer = Tokenizer(min_df=10, max_features=100000, character=character_model)
    train_reviews_list = x_train.tolist()
    tokenizer.fit(train_reviews_list + x_test.tolist())

    # Tokenize training reviews (so we can fit the RNN model on them)
    train_reviews_tokenized = tokenizer.transform(train_reviews_list)

    # Based on https://github.com/vinhkhuc/kaggle-sentiment-popcorn/blob/master/scripts/passage_nn.py
    # which is based on
    # https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py
    # RNN network:
    # - Each tokenized review is converted into a sequence of words, where each
    #   word has an embedding representation (256)
    # - The RNN layer (GRU) attempts to find patterns in the sequence of words
    # - The final dense layer is used as a logistic classifier to turn the RNN
    #   output into a probability/prediction
    if not character_model:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            # May replace with LstmRecurrent for an LSTM layer
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        # Character-level RNN
        # The idea is to convert character tokenizations into one-hot encodings,
        # in which case the embedding layer is no longer needed
        train_reviews_tokenized = map(
            lambda r_indexes: pd.get_dummies(
                r_indexes, columns=range(tokenizer.n_features + 1)).values,
            train_reviews_tokenized)
        layers = [
            # May replace with LstmRecurrent for an LSTM layer
            GatedRecurrent(size=100, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # The RNN classifier uses binary cross-entropy as the cost function
    classifier = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    NUM_EPOCHS = 10  # 10 epochs may take 10+ hours to run depending on the machine
    classifier.fit(train_reviews_tokenized, y_train.tolist(), n_epochs=NUM_EPOCHS)

    # Store model and tokenizer (the PASSAGE_* paths are module-level constants
    # defined elsewhere)
    if character_model:
        passage.utils.save(classifier, PASSAGE_CHAR_RNN_MODEL)
        _ = joblib.dump(tokenizer, PASSAGE_CHAR_TOKENIZER, compress=9)
    else:
        passage.utils.save(classifier, PASSAGE_RNN_MODEL)
        _ = joblib.dump(tokenizer, PASSAGE_TOKENIZER, compress=9)
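# Usage sketch (not part of the original): the function expects pandas Series of
# already-preprocessed review text plus binary labels. The toy Series below only
# illustrate the expected types; with min_df=10 a real corpus is needed before the
# tokenizer keeps any vocabulary.
x_train = pd.Series(['great movie', 'awful film'])
y_train = pd.Series([1, 0])
x_test = pd.Series(['pretty good'])
train_and_save_passage_tokenizer_and_rnn_model(x_train, y_train, x_test)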
import pickle

import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, LstmRecurrent, Dense
from passage.models import RNN
from passage.updates import Adadelta


def train_model(modeltype, delta):
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]
    print "Begin Training"

    df_imdb_reviews = pd.read_csv('../data/imdb_review_data.tsv', escapechar='\\',
                                  delimiter='\t')
    # clean() is this project's HTML-stripping helper, defined elsewhere
    X = clean(df_imdb_reviews['review'].values)
    y = df_imdb_reviews['sentiment'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                        random_state=42)

    print "Tokenize"
    tokenizer = Tokenizer(min_df=10, max_features=100000)
    X_train = tokenizer.fit_transform(X_train)
    X_train = [[float(tok) for tok in seq] for seq in X_train]
    X_test = tokenizer.transform(X_test)
    X_test = [[float(tok) for tok in seq] for seq in X_test]
    print "Number of features: {}".format(tokenizer.n_features)

    print "Training model"
    if modeltype == "gated_recurrent":
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=True, p_drop=0.5),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            LstmRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                          init='orthogonal', seq_output=True, p_drop=0.5),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # bce is the classification loss for binary classification with sigmoid output
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=delta))
    model.fit(X_train, y_train, n_epochs=20)

    with open('../data/{}_tokenizer_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'w') as f:
        pickle.dump(tokenizer, f)
    with open('../data/{}_model_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'w') as f:
        pickle.dump(model, f)

    try:
        y_pred_te = model.predict(X_test).flatten() >= 0.5
        y_pred_tr = model.predict(X_train).flatten() >= 0.5
        print 'Test Accuracy: {}'.format(accuracy_score(y_test, y_pred_te))
        print 'Test Precision: {}'.format(precision_score(y_test, y_pred_te))
        print 'Test Recall: {}'.format(recall_score(y_test, y_pred_te))
        print 'Train Accuracy: {}'.format(accuracy_score(y_train, y_pred_tr))
        print 'Train Precision: {}'.format(precision_score(y_train, y_pred_tr))
        print 'Train Recall: {}'.format(recall_score(y_train, y_pred_tr))
    except Exception:
        print "Unable to perform metrics"

    return tokenizer, model
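# Usage sketch (not in the original): sweep both supported model types over a
# couple of candidate Adadelta learning rates.
for delta in [0.5, 1.0]:
    for modeltype in ["gated_recurrent", "lstm_recurrent"]:
        tokenizer, model = train_model(modeltype, delta)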
from sklearn.datasets import fetch_20newsgroups

# `categories` is assumed to be defined earlier (the names of the newsgroups to load)
newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'),
                                      categories=categories)
newsgroups_test = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'),
                                     categories=categories)
print len(newsgroups_train.data), len(newsgroups_test.data)

from sklearn import metrics
from passage.preprocessing import Tokenizer
from passage.layers import Embedding, GatedRecurrent, Dense
from passage.models import RNN
from passage.utils import save

tokenizer = Tokenizer(min_df=10, max_features=50000)
X_train = tokenizer.fit_transform(newsgroups_train.data)
X_test = tokenizer.transform(newsgroups_test.data)
Y_train = newsgroups_train.target
Y_test = newsgroups_test.target
print tokenizer.n_features

layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=256, activation='tanh', gate_activation='steeper_sigmoid',
                   init='orthogonal', seq_output=False),
    Dense(size=1, activation='sigmoid', init='orthogonal')  # sigmoid for binary classification
]

model = RNN(layers=layers, cost='bce')  # bce is the classification loss for binary classification with sigmoid output
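# A hedged continuation (the original is truncated here): train briefly and report
# held-out accuracy. This presumes `categories` named exactly two newsgroups,
# since the single sigmoid output unit only supports binary labels.
model.fit(X_train, Y_train, n_epochs=10)
te_preds = model.predict(X_test).flatten()
print metrics.accuracy_score(Y_test, te_preds > 0.5)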
def predict(model, tokenizer, test_text):
    # Encode with the tokenizer fitted at training time; a freshly fitted
    # Tokenizer would assign token IDs inconsistent with the trained model.
    result = model.predict(tokenizer.transform(test_text))
    return result
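# Usage sketch (not in the original): `model` and `tokenizer` are the objects
# produced together at training time, e.g. by one of the training snippets above.
probs = predict(model, tokenizer, ['what a fantastic movie']).flatten()
print probs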