Esempio n. 1
0
def tokenize(train):
    """Fit a new Tokenizer on the given documents and return it.

    Args:
        train: Array of text documents (reviews) to train sentiment on;
            passed unchanged to ``Tokenizer.fit``.

    Returns:
        The fitted ``Tokenizer`` instance, constructed with ``min_df=10``
        and ``max_features=100000``.
    """
    tokenizer = Tokenizer(min_df=10, max_features=100000)
    # Parenthesised single-argument form prints the same text under both
    # Python 2 (print statement) and Python 3 (print function).
    print("Training tokenizer on reviews")
    tokenizer.fit(train)
    return tokenizer
def train_and_save_passage_tokenizer_and_rnn_model(x_train,
                                                   y_train,
                                                   x_test,
                                                   character_model=False,
                                                   n_epochs=10):
    """Train and save a Passage tokenizer and a Passage RNN model.

    x_train and x_test should each be a series that has already been
    pre-processed: html->text, lowercased, punctuation/numbers removed.
    (The upstream example at
    https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py
    only extracts text from html, lowercases and strips; it does not remove
    punctuation.)

    Note that the character-based RNN is a work in progress and is not
    actually implemented as of now.

    Args:
        x_train: Training reviews; must provide ``.tolist()``.
        y_train: Training labels; must provide ``.tolist()``.
        x_test: Test reviews; must provide ``.tolist()``. Only used, together
            with x_train, to fit the tokenizer.
        character_model: If True, build the tokenizer with ``character=True``
            and train the (experimental) character-level network; the results
            are stored under the PASSAGE_CHAR_* paths instead of the
            word-level ones.
        n_epochs: Number of training epochs passed to ``RNN.fit``. With the
            default of 10, training may take 10+ hours depending on machine.

    Returns:
        None. The trained classifier is written with ``passage.utils.save``
        and the tokenizer with ``joblib.dump``.
    """
    # Tokenization: assign each word in the reviews an ID to be used in all
    # reviews. The tokenizer is fitted on train + test reviews.
    tokenizer = Tokenizer(min_df=10,
                          max_features=100000,
                          character=character_model)

    train_reviews_list = x_train.tolist()
    tokenizer.fit(train_reviews_list + x_test.tolist())

    # Tokenize training reviews (so they can be used to fit the RNN model).
    train_reviews_tokenized = tokenizer.transform(train_reviews_list)

    # Based on
    # https://github.com/vinhkhuc/kaggle-sentiment-popcorn/blob/master/scripts/passage_nn.py
    # which is based on
    # https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py

    # RNN network:
    # - Each tokenized review is converted into a sequence of words, where
    #   each word has an embedding representation (256).
    # - The RNN layer (GRU) attempts to find patterns in the word sequence.
    # - The final dense layer is used as a logistic classifier to turn the
    #   RNN output into a probability/prediction.
    if not character_model:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            # May replace with LstmRecurrent for an LSTM layer
            GatedRecurrent(size=512,
                           activation='tanh',
                           gate_activation='steeper_sigmoid',
                           init='orthogonal',
                           seq_output=False,
                           p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        # Character-level RNN.
        # Idea is to convert character tokenizations into one-hot encodings,
        # in which case the embedding layer is no longer needed.
        #
        # get_dummies() ignores ``columns`` for non-DataFrame input, so the
        # full set of token ids is supplied through a Categorical instead.
        # That gives every review a matrix of the same width
        # (n_features + 1) rather than one column per id that happens to
        # occur in that review.
        one_hot_categories = list(range(tokenizer.n_features + 1))
        # A list (not a lazy ``map`` object) so the result can be indexed
        # and measured on Python 3 as well as Python 2.
        train_reviews_tokenized = [
            pd.get_dummies(
                pd.Categorical(r_indexes,
                               categories=one_hot_categories)).values
            for r_indexes in train_reviews_tokenized
        ]
        layers = [
            # May replace with LstmRecurrent for an LSTM layer
            GatedRecurrent(size=100,
                           activation='tanh',
                           gate_activation='steeper_sigmoid',
                           init='orthogonal',
                           seq_output=False,
                           p_drop=0.75),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # RNN classifier uses binary cross-entropy as the cost function.
    classifier = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    classifier.fit(train_reviews_tokenized,
                   y_train.tolist(),
                   n_epochs=n_epochs)

    # Store model and tokenizer under the paths matching the model type.
    if character_model:
        model_path = PASSAGE_CHAR_RNN_MODEL
        tokenizer_path = PASSAGE_CHAR_TOKENIZER
    else:
        model_path = PASSAGE_RNN_MODEL
        tokenizer_path = PASSAGE_TOKENIZER
    passage.utils.save(classifier, model_path)
    joblib.dump(tokenizer, tokenizer_path, compress=9)