def tokenize(train):
    """Fit a ``Tokenizer`` on the given documents and return it.

    Args:
        train: Array of text documents (reviews) to train sentiment on;
            passed unchanged to ``Tokenizer.fit``.

    Returns:
        The fitted ``Tokenizer``, created with ``min_df=10`` and
        ``max_features=100000``.
    """
    tokenizer = Tokenizer(min_df=10, max_features=100000)
    # A parenthesised single string prints identically under the Python 2
    # print statement and the Python 3 print function.
    print("Training tokenizer on reviews")
    tokenizer.fit(train)
    return tokenizer
def train_and_save_passage_tokenizer_and_rnn_model(x_train, y_train, x_test, character_model=False,
                                                   n_epochs=10):
    """Train and save a Passage tokenizer and a Passage RNN model.

    x_train and x_test should each be a series that has already been
    pre-processed: html->text, lowercased, punctuation/numbers removed.
    x_train + x_test together are used to build the tokenizer; only x_train
    (with y_train) is used to fit the RNN.

    Note that the character-based RNN is a work-in-progress and not actually
    implemented as of now.

    Args:
        x_train: Training reviews; must provide ``.tolist()``.
        y_train: Training labels; must provide ``.tolist()``.
        x_test: Test reviews; must provide ``.tolist()``. Used only to fit
            the tokenizer.
        character_model: Passed to the tokenizer as ``character=``. When
            true, the tokenized reviews are one-hot encoded, the embedding
            layer is dropped, and the results are written to
            PASSAGE_CHAR_RNN_MODEL / PASSAGE_CHAR_TOKENIZER instead of
            PASSAGE_RNN_MODEL / PASSAGE_TOKENIZER.
        n_epochs: Number of training epochs passed to ``RNN.fit``. The
            default of 10 may take 10+ hours to run depending on machine.

    Returns:
        None. The trained classifier is written with ``passage.utils.save``
        and the tokenizer with ``joblib.dump``.
    """
    # Note that we assume we have train/test reviews that had been preprocessed: html->text, lowercased, removed
    # punct/#s
    # Note in https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py they only
    # extract text from html, lowercase and strip (no punctuation removal)

    # Tokenization: Assign each word in the reviews an ID to be used in all reviews
    tokenizer = Tokenizer(min_df=10, max_features=100000, character=character_model)
    train_reviews_list = x_train.tolist()
    tokenizer.fit(train_reviews_list + x_test.tolist())

    # Tokenize training reviews (so can use to fit RNN model on)
    train_reviews_tokenized = tokenizer.transform(train_reviews_list)

    # Based on https://github.com/vinhkhuc/kaggle-sentiment-popcorn/blob/master/scripts/passage_nn.py which is based
    # on https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py
    # RNN Network:
    # -Each tokenized review will be converted into a sequence of words, where each word has an embedding
    #  representation (256)
    # -RNN layer (GRU) attempts to find pattern in sequence of words
    # -Final dense layer is used as a logistic classifier to turn RNN output into a probability/prediction
    if not character_model:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            # May replace with LstmRecurrent for LSTM layer
            GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal'),
        ]
    else:
        # Character-level RNN
        # Idea is to convert character tokenizations into one-hot encodings in which case
        # the embeddings layer is no longer needed.
        #
        # get_dummies() ignores ``columns=`` for list input and only emits a column per
        # value actually present, so each review would get a different width. Encoding
        # through a Categorical with a fixed category list yields one column per id.
        # The "+ 1" is carried over from the original code; presumably it leaves room
        # for one extra id -- TODO confirm against the tokenizer.
        token_ids = list(range(tokenizer.n_features + 1))
        # A list (not a lazy map object) so the data can be iterated more than once.
        train_reviews_tokenized = [
            pd.get_dummies(pd.Categorical(r_indexes, categories=token_ids)).values
            for r_indexes in train_reviews_tokenized
        ]
        layers = [
            # May replace with LstmRecurrent for LSTM layer
            GatedRecurrent(size=100, activation='tanh', gate_activation='steeper_sigmoid',
                           init='orthogonal', seq_output=False, p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal'),
        ]

    # RNN classifier uses Binary Cross-Entropy as the cost function
    classifier = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    classifier.fit(train_reviews_tokenized, y_train.tolist(), n_epochs=n_epochs)

    # Store model and tokenizer
    if character_model:
        model_path, tokenizer_path = PASSAGE_CHAR_RNN_MODEL, PASSAGE_CHAR_TOKENIZER
    else:
        model_path, tokenizer_path = PASSAGE_RNN_MODEL, PASSAGE_TOKENIZER
    passage.utils.save(classifier, model_path)
    joblib.dump(tokenizer, tokenizer_path, compress=9)