Example #1
0
def build_model(weights=None,
                embedding_size=256,
                recurrent_gate_size=512,
                n_features=5,
                dropout=0.4):
    """
    Build a new recurrent binary classifier, or load a previously saved one.

    Inputs:
        weights - Path to a weights file to load, or None if the model should be built from scratch
        embedding_size - Size of the embedding layer
        recurrent_gate_size - Size of the gated recurrent layer
        n_features - Number of features for the embedding layer
        dropout - Dropout value (applied to the recurrent and the dense layer)

    Returns:
        A model object ready for training (or evaluation if a previous model was loaded via `weights`)
    """
    # A saved model replaces anything built here, so load it up front rather
    # than constructing a network that would be discarded immediately.
    # The architecture arguments are not used on this path.
    if weights:
        return load(weights)

    # vvvvv
    #Modify this if you want to change the structure of the network!
    # ^^^^^
    model_layers = [
        Embedding(size=embedding_size, n_features=n_features),
        GatedRecurrent(size=recurrent_gate_size, p_drop=dropout),
        Dense(size=1, activation='sigmoid', p_drop=dropout)
    ]
    return RNN(layers=model_layers,
               cost='BinaryCrossEntropy',
               verbose=2,
               updater='Adam')
Example #2
0
def main(ptrain, ntrain, ptest, ntest, out, modeltype):
    """
    Train a recurrent classifier and write its test-set predictions to `out`.

    Inputs:
        ptrain, ntrain - Passed to load_data to obtain the training texts and labels
                         (presumably positive / negative example sources - confirm)
        ptest, ntest - Passed to load_data to obtain the test texts and labels
        out - Path of the text file the predictions are written to
        modeltype - "gated_recurrent" or "lstm_recurrent"; selects the recurrent layer

    Each output line holds: predicted label (1 or -1), positive probability,
    negative probability.
    """
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]

    print("Using the %s model ..." % modeltype)
    print("Loading data ...")
    train_texts, train_labels = load_data(ptrain, ntrain)
    test_texts, test_labels = load_data(ptest, ntest)

    # The vocabulary is learned from the training texts only.
    tokenizer = Tokenizer(min_df=10, max_features=100000)
    train_tokens = tokenizer.fit_transform(train_texts)
    test_tokens = tokenizer.transform(test_texts)

    print("Training ...")
    # Both architectures are identical apart from the recurrent layer class.
    recurrent_cls = GatedRecurrent if modeltype == "gated_recurrent" else LstmRecurrent
    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        recurrent_cls(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                      init='orthogonal', seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal')
    ]

    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(train_tokens, train_labels, n_epochs=10)

    # Predicting the probabilities of positive labels
    print("Predicting ...")
    pos_probs = model.predict(test_tokens).flatten()
    neg_probs = 1 - pos_probs

    # Default every prediction to +1, then flip those below the 0.5 threshold.
    pred_labels = np.ones(len(test_labels))
    pred_labels[pos_probs < 0.5] = -1

    with open(out, "w") as f:
        for row in zip(pred_labels, pos_probs, neg_probs):
            f.write("%d %f %f\n" % row)
Example #3
0
def rnn(train_text, train_label):
    """
    Fit a small GRU binary classifier on raw text and return the trained model.

    A fresh Tokenizer is fitted on `train_text`; its n_features sizes the
    embedding layer. `train_label` is passed to fit() unchanged.
    """
    tokenizer = Tokenizer()
    token_ids = tokenizer.fit_transform(train_text)

    # Embedding -> GRU -> single sigmoid unit.
    embedding = Embedding(size=50, n_features=tokenizer.n_features)
    recurrent = GatedRecurrent(size=128)
    readout = Dense(size=1, activation='sigmoid')

    classifier = RNN(layers=[embedding, recurrent, readout],
                     cost='BinaryCrossEntropy')
    classifier.fit(token_ids, train_label)
    return classifier
Example #4
0
def train_RNN(tokenizer, tokens, labels):
    """
    Fit a GRU-based binary classifier on tokenized reviews.

    INPUT:
        tokenizer - Trained tokenizer; only its n_features is read, to size the embedding layer
        tokens - The tokenized critic reviews
        labels - The labels corresponding to `tokens`
    Returns a trained Recurrent Neural Network class object
    """
    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        GatedRecurrent(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                       init='orthogonal', seq_output=False, p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal')
    ]

    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))

    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Begin fitting RNN")

    model.fit(tokens, labels, n_epochs=12)

    return model
Example #5
0
def build_model(weights=None,
                embedding_size=128,
                recurrent_gate_size=256,
                n_features=5,
                dropout=0.1):
    """
    Build a new recurrent binary classifier, or load a previously saved one.

    Inputs:
        weights - Path to a weights file to load, or None if the model should be built from scratch
        embedding_size - Size of the embedding layer
        recurrent_gate_size - Size of the gated recurrent layer
        n_features - Number of features for the embedding layer
        dropout - Dropout value (applied to the recurrent and the dense layer)

    Returns:
        A model object ready for training (or evaluation if a previous model was loaded via `weights`)
    """
    # A saved model replaces anything built here, so load it up front rather
    # than constructing a network that would be discarded immediately.
    # The architecture arguments are not used on this path.
    if weights:
        # The two spaces after the colon reproduce the output of the original
        # two-item print statement; this form also runs on Python 3.
        print("Loading previously created weights file:  %s" % (weights,))
        return load(weights)

    # vvvvv
    #Modify this if you want to change the structure of the network!
    # ^^^^^
    model_layers = [
        Embedding(size=embedding_size, n_features=n_features),
        GatedRecurrent(size=recurrent_gate_size, p_drop=dropout),
        Dense(size=1, activation='sigmoid', p_drop=dropout)
    ]
    args = {
        'layers': model_layers,
        'cost': 'BinaryCrossEntropy',
        'verbose': 2,
        'updater': Adadelta(lr=0.5),
        # NOTE(review): assumes the RNN class in use accepts an
        # `embedding_size` keyword - confirm against its constructor.
        'embedding_size': embedding_size
    }
    return RNN(**args)
Example #6
0
def _load_gz_pickle(path):
    """Return the object unpickled from the gzip-compressed file at `path`."""
    # NOTE: unpickling can execute arbitrary code; only use with trusted files.
    with gzip.open(path) as handle:
        return cPickle.load(handle)


def _dump_gz_pickle(obj, path):
    """Pickle `obj` into a gzip-compressed file at `path`, closing it when done."""
    with gzip.open(path, 'w') as handle:
        cPickle.dump(obj, handle)


def train(args):
    """
    Train a sentence -> image-feature model and save it with its tokenizer and scaler.

    `args` is an options object read by attribute. Attributes used:
        dataset, shuffle                - data provider name; whether to shuffle the training pairs
        zero_shot                       - if set, "zero_shot.pkl.gz" is loaded and the image vector
                                          of every sentence containing one of its words is zeroed
        init_tokenizer                  - optional gzip pickle of an existing tokenizer
        word_freq_threshold, character  - Tokenizer settings when no init_tokenizer is given
        scaler                          - 'standard' selects StandardScaler, anything else NoScaler
        embedding_size, hidden_size     - layer sizes (embedding_size falls back to hidden_size)
        hidden_type                     - 'lstm' selects LstmRecurrent, anything else GatedRecurrent
        activation, out_activation      - recurrent / output layer activations
        cost                            - 'MeanSquaredError' or 'CosineDistance'
        rate, clipnorm                  - Adam updater settings
        model_type                      - 'add', 'mult', 'matrix', 'simple', 'deep-simple',
                                          'shared_all' or 'shared_embeddings'
        alpha, non_interpolated         - ForkedRNN settings (shared_* model types only)
        iterations, batch_size, snapshot_freq - passed to fit()
        tokenizer, model                - output paths for the tokenizer and the model

    Side effects:
        Writes the tokenizer to args.tokenizer, the scaler to 'scaler.pkl.gz'
        and the trained model to args.model (all gzip-compressed pickles).

    Raises:
        ValueError - unknown args.cost, unknown args.model_type, or a 'matrix'
                     model whose embedding size is not a perfect square
        NotImplementedError - model_type 'shared_all' combined with zero_shot
    """
    zero_words = _load_gz_pickle("zero_shot.pkl.gz") if args.zero_shot else set()

    def maybe_zero(s, i):
        # Zero the image vector when the sentence mentions a held-out word.
        # The sentence is only tokenized when zero-shot mode is on.
        if args.zero_shot and set(tokenize(s)).intersection(zero_words):
            return numpy.zeros(i.shape)
        return i

    tok_path = args.tokenizer
    model_path = args.model
    d = dp.getDataProvider(args.dataset)
    pairs = list(d.iterImageSentencePair(split='train'))
    if args.shuffle:
        numpy.random.shuffle(pairs)
    # The size of the image feature vector fixes the size of the output layer.
    output_size = len(pairs[0]['image']['feat'])
    embedding_size = args.embedding_size if args.embedding_size is not None else args.hidden_size
    if args.init_tokenizer:
        tokenizer = _load_gz_pickle(args.init_tokenizer)
    else:
        tokenizer = Tokenizer(min_df=args.word_freq_threshold, character=args.character)
    sentences, images = zip(*[(pair['sentence']['raw'],
                               maybe_zero(pair['sentence']['raw'], pair['image']['feat']))
                              for pair in pairs])
    scaler = StandardScaler() if args.scaler == 'standard' else NoScaler()
    images = scaler.fit_transform(images)

    # Fit first: the PAD/END entries are read from the encoder only after
    # fit_transform has run, exactly as in the original comprehension.
    encoded = tokenizer.fit_transform(sentences)
    pad, end = tokenizer.encoder['PAD'], tokenizer.encoder['END']
    tokens = [[pad] + sent + [end] for sent in encoded]
    # Input drops the final token, target drops the first one (shifted by one).
    tokens_inp = [token[:-1] for token in tokens]
    tokens_out = [token[1:] for token in tokens]

    _dump_gz_pickle(tokenizer, tok_path)
    _dump_gz_pickle(scaler, 'scaler.pkl.gz')

    # Validation data
    valid_pairs = list(d.iterImageSentencePair(split='val'))
    valid_sents, valid_images = zip(*[(pair['sentence']['raw'], pair['image']['feat'])
                                      for pair in valid_pairs])
    valid_images = scaler.transform(valid_images)
    valid_tokens = [[tokenizer.encoder['PAD']] + sent + [tokenizer.encoder['END']]
                    for sent in tokenizer.transform(valid_sents)]
    valid_tokens_inp = [token[:-1] for token in valid_tokens]
    valid_tokens_out = [token[1:] for token in valid_tokens]

    updater = passage.updates.Adam(lr=args.rate, clipnorm=args.clipnorm)
    if args.cost == 'MeanSquaredError':
        z_cost = MeanSquaredError
    elif args.cost == 'CosineDistance':
        z_cost = CosineDistance
    else:
        raise ValueError("Unknown cost")
    # Anything other than 'lstm' (including 'gru') uses the gated recurrent layer.
    Recurrent = LstmRecurrent if args.hidden_type == 'lstm' else GatedRecurrent

    # TODO: initialising the layers from a pretrained model (args.init_model)
    # is not implemented.
    interpolated = not args.non_interpolated
    n_features = tokenizer.n_features
    model_type = args.model_type

    # forked_kwargs stays None for models with a single (image) output; the
    # shared_* models set it to the extra ForkedRNN keyword arguments.
    forked_kwargs = None
    if model_type in ('add', 'mult', 'matrix'):
        if model_type == 'add':
            op = Add
        elif model_type == 'mult':
            op = Mult
        else:
            sqrt_size = embedding_size ** 0.5
            if not sqrt_size.is_integer():
                raise ValueError("Sqrt of embedding_size not integral for matrix model")
            op = MatrixMult
        layers = [Direct(size=embedding_size, n_features=n_features, op=op),
                  Dense(size=output_size, activation=args.out_activation, reshape=False)]
    elif model_type == 'simple':
        layers = [Embedding(size=embedding_size, n_features=n_features),
                  Recurrent(seq_output=False, size=args.hidden_size, activation=args.activation),
                  Dense(size=output_size, activation=args.out_activation, reshape=False)]
    elif model_type == 'deep-simple':
        layers = [Embedding(size=embedding_size, n_features=n_features),
                  Recurrent(seq_output=True, size=args.hidden_size, activation=args.activation),
                  Recurrent(seq_output=False, size=args.hidden_size, activation=args.activation),
                  Dense(size=output_size, activation=args.out_activation, reshape=False)]
    elif model_type == 'shared_all':
        if args.zero_shot:
            raise NotImplementedError  # FIXME zero_shot not implemented
        layers = [Embedding(size=embedding_size, n_features=n_features),
                  Recurrent(seq_output=True, size=args.hidden_size, activation=args.activation),
                  Combined(left=Dense(size=n_features, activation='softmax', reshape=True),
                           right=Dense(size=output_size, activation=args.out_activation,
                                       reshape=False))]
        forked_kwargs = {}
    elif model_type == 'shared_embeddings':
        layers = [Embedding(size=embedding_size, n_features=n_features),
                  Combined(left=Stacked([Recurrent(seq_output=True, size=args.hidden_size,
                                                   activation=args.activation),
                                         Dense(size=n_features, activation='softmax',
                                               reshape=True)]),
                           left_type='id',
                           right=Stacked([Recurrent(seq_output=False, size=args.hidden_size,
                                                    activation=args.activation),
                                          Dense(size=output_size, activation=args.out_activation,
                                                reshape=False)]),
                           right_type='id')]
        forked_kwargs = {'zero_shot': args.zero_shot}
    else:
        # Previously this fell through and failed with a NameError on `model`.
        raise ValueError("Unknown model type: %r" % (model_type,))

    fit_kwargs = dict(n_epochs=args.iterations, batch_size=args.batch_size,
                      snapshot_freq=args.snapshot_freq, path=model_path)
    if forked_kwargs is None:
        model = RNN(layers=layers, updater=updater, cost=z_cost,
                    iterator=SortedPadded(shuffle=False), verbose=1)
        model.fit(tokens_inp, images, len_filter=None,
                  valid=(valid_tokens_inp, valid_images), **fit_kwargs)
    else:
        model = ForkedRNN(layers=layers, updater=updater, cost_y=CategoricalCrossEntropySwapped,
                          cost_z=z_cost, alpha=args.alpha, size_y=n_features,
                          verbose=1, interpolated=interpolated, **forked_kwargs)
        model.fit(tokens_inp, tokens_out, images,
                  valid=(valid_tokens_inp, valid_tokens_out, valid_images), **fit_kwargs)

    _dump_gz_pickle(model, model_path)
Example #7
0
from passage.layers import GatedRecurrent
from passage.layers import LstmRecurrent
from passage.layers import Dense

from passage.models import RNN
from passage.utils import save, load

print("Loading data...")
# Keep the first 80% of the examples for training; the rest is the test split.
num_training = int((1.0 - 0.2) * len(xs))

X_train, X_test = xs[:num_training], xs[num_training:]
y_train, y_test = ys[:num_training], ys[num_training:]

# Presumably ids are zero-based, hence the + 1 - confirm against `generator`.
num_feats = generator.max_id() + 1

layers = [
    Embedding(size=128, n_features=num_feats),
    # NOTE - to use a deep RNN, every recurrent layer except the final one
    # needs seq_output=True.
    # Alternatives tried here: LstmRecurrent(size=32); a single
    # GatedRecurrent(size=256, direction='backward' if REVERSE else 'forward');
    # an extra Dense(size=64, activation='sigmoid') before the output layer.
    GatedRecurrent(size=128, seq_output=True),
    GatedRecurrent(size=128),
    # One sigmoid output per entry of lst_freq_tags.
    Dense(size=len(lst_freq_tags), activation='sigmoid'),
]

# Experiment note: emd 128, gru 32/64 is good - 0.70006 causer

print("Creating Model")
model = RNN(layers=layers, cost='bce')
import sys

# ---

# ---

# Parenthesised single-argument prints behave the same on Python 2 and 3.
print('loading dataset')
d = Dataset(settings['FN_DATASET'], settings['FN_VOCABULARY'])
d.load()

print('generating labeled training set')
train_text, train_labels = d.getNextWordPredTrainset(10)

tokenizer = Tokenizer()
train_tokens = tokenizer.fit_transform(train_text)
# NOTE(review): this stores the token sequences, not the tokenizer object,
# under the FN_TRAINED_TOKENIZER path - confirm that is intended.
save(train_tokens, settings['FN_TRAINED_TOKENIZER'])

# Embedding -> GRU -> single sigmoid unit.
layers = [
    Embedding(size=128, n_features=tokenizer.n_features),
    GatedRecurrent(size=128),
    Dense(size=1, activation='sigmoid'),
]

model = RNN(layers=layers, cost='BinaryCrossEntropy')
model.fit(train_tokens, train_labels)

save(model, settings['FN_MODEL_NEXTWORDPRED'])
Example #9
0

if __name__ == "__main__":
    # Script entry point: train a GRU classifier on the labeled reviews, then
    # load and clean the test reviews.
    tr_data = pd.read_csv('labeledTrainData.tsv', delimiter='\t')
    trX = clean(tr_data['review'].values)
    trY = tr_data['sentiment'].values

    print("Training data loaded and cleaned.")

    # Fit the tokenizer on the training reviews and replace them with its output.
    tokenizer = Tokenizer(min_df=10, max_features=100000)
    trX = tokenizer.fit_transform(trX)

    print("Training data tokenized.")

    # Embedding -> GRU -> single sigmoid unit; the embedding is sized from the
    # fitted tokenizer.
    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        GatedRecurrent(size=512,
                       activation='tanh',
                       gate_activation='steeper_sigmoid',
                       init='orthogonal',
                       seq_output=False,
                       p_drop=0.75),
        Dense(size=1, activation='sigmoid', init='orthogonal')
    ]

    # 'bce' cost with an Adadelta updater, trained for 10 epochs.
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    model.fit(trX, trY, n_epochs=10)

    # Test set: keep the ids and clean the review text.
    # NOTE(review): teX is not tokenized and no prediction is made in the
    # visible code - the remainder of this script appears to be cut off.
    te_data = pd.read_csv('testData.tsv', delimiter='\t')
    ids = te_data['id'].values
    teX = clean(te_data['review'].values)
from passage.layers import LstmRecurrent
from passage.layers import Dense

from passage.models import RNN
from passage.utils import save, load

print("Loading data...")
# TEST_SPLIT is the fraction of examples held out (taken from the end) for testing.
num_training = int((1.0 - TEST_SPLIT) * len(xs))

X_train, X_test = xs[:num_training], xs[num_training:]
y_train, y_test = ys[:num_training], ys[num_training:]

# Presumably ids are zero-based, hence the + 1 - confirm against `generator`.
num_feats = generator.max_id() + 1

layers = [
    Embedding(size=64, n_features=num_feats),
    # NOTE - to use a deep RNN, every recurrent layer except the final one
    # needs seq_output=True.
    # Alternatives tried here: LstmRecurrent(size=32), LstmRecurrent(size=128),
    # or an extra GatedRecurrent(size=64, seq_output=True) in front.
    # REVERSE selects a backward pass over the sequence instead of a forward one.
    GatedRecurrent(size=64, direction='backward' if REVERSE else 'forward'),
    Dense(size=1, activation='sigmoid'),
]

# Experiment note: emd 64, gru 64 is good - 0.70833 causer (0 prev sents)

print("Creating Model")
model = RNN(layers=layers, cost='bce')


def find_cutoff(y_test, predictions):
def train_model(modeltype, delta):
    """
    Train a recurrent classifier on the IMDB review data, pickle the fitted
    tokenizer and model, and print accuracy / precision / recall.

    Inputs:
        modeltype - "gated_recurrent" or "lstm_recurrent"; selects the recurrent layer
        delta - Learning rate passed to the Adadelta updater; also embedded in
                the names of the pickle files written under ../data/

    Returns:
        (tokenizer, model) - the fitted tokenizer and the trained RNN
    """
    assert modeltype in ["gated_recurrent", "lstm_recurrent"]
    print("Begin Training")

    df_imdb_reviews = pd.read_csv('../data/imdb_review_data.tsv', escapechar='\\', delimiter='\t')

    X = clean(df_imdb_reviews['review'].values)
    y = df_imdb_reviews['sentiment'].values

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)
    print("Tokenize")

    tokenizer = Tokenizer(min_df=10, max_features=100000)
    # Distinct loop names: the original reused `y`, which shadows the labels
    # (and, on Python 2, overwrites them once the comprehension finishes).
    X_train = [[float(tok) for tok in seq] for seq in tokenizer.fit_transform(X_train)]
    X_test = [[float(tok) for tok in seq] for seq in tokenizer.transform(X_test)]

    print("Number of featers: {}".format(tokenizer.n_features))

    print("Training model")

    # Both architectures are identical apart from the recurrent layer class.
    recurrent_cls = GatedRecurrent if modeltype == "gated_recurrent" else LstmRecurrent
    # NOTE(review): seq_output=True looks unusual with one label per review -
    # confirm it is intended.
    layers = [
        Embedding(size=256, n_features=tokenizer.n_features),
        recurrent_cls(size=512, activation='tanh', gate_activation='steeper_sigmoid',
                      init='orthogonal', seq_output=True, p_drop=0.5),
        Dense(size=1, activation='sigmoid', init='orthogonal')
    ]

    # bce is classification loss for binary classification and sigmoid output
    model = RNN(layers=layers, cost='bce', updater=Adadelta(lr=delta))
    model.fit(X_train, y_train, n_epochs=20)

    # pickle.dump() returns None, so its result must not be bound back onto
    # `model`. Binary mode keeps the pickles valid on every platform.
    # NOTE(review): the model file name has a '.' before '_pdrop' that the
    # tokenizer file name lacks; left unchanged in case loaders rely on it.
    with open('../data/{}_tokenizer_delta_{}_pdrop_0.5.pkl'.format(modeltype, delta), 'wb') as f:
        pickle.dump(tokenizer, f)
    with open('../data/{}_model_delta_{}._pdrop_0.5.pkl'.format(modeltype, delta), 'wb') as f:
        pickle.dump(model, f)

    # Metrics are best-effort: a failure here is reported, and the trained
    # model is still returned.
    try:
        y_pred_te = model.predict(X_test).flatten() >= 0.5
        y_pred_tr = model.predict(X_train).flatten() >= 0.5
        print('Test Accuracy: {}'.format(accuracy_score(y_test, y_pred_te)))
        print('Test Precision: {}'.format(precision_score(y_test, y_pred_te)))
        print('Test Recall: {}'.format(recall_score(y_test, y_pred_te)))
        print('Train Accuracy: {}'.format(accuracy_score(y_train, y_pred_tr)))
        print('Train Precision: {}'.format(precision_score(y_train, y_pred_tr)))
        print('Train Recall: {}'.format(recall_score(y_train, y_pred_tr)))
    except Exception as err:
        print("Unable to perform metrics: {}".format(err))

    return tokenizer, model
def train_and_save_passage_tokenizer_and_rnn_model(x_train,
                                                   y_train,
                                                   x_test,
                                                   character_model=False):
    """Fit a Passage tokenizer and a GRU classifier, then save both to disk.

    x_train and x_test should each be a series of reviews that has already
    been pre-processed (html->text, lowercased, punctuation/numbers removed).
    The tokenizer vocabulary is built from x_train + x_test; the classifier
    is fitted on x_train / y_train only.

    With character_model=True the character-level files are written instead
    of the word-level ones. The character-based RNN is a work in progress and
    not actually implemented as of now.
    """
    # For reference, the upstream example
    # https://github.com/IndicoDataSolutions/Passage/blob/master/examples/sentiment.py
    # only extracts text from html, lowercases and strips (no punctuation removal).

    # Tokenization: every word gets an id that is shared across all reviews.
    tokenizer = Tokenizer(min_df=10,
                          max_features=100000,
                          character=character_model)

    train_texts = x_train.tolist()
    tokenizer.fit(train_texts + x_test.tolist())

    # Token ids of the training reviews - the input the RNN is fitted on.
    train_reviews_tokenized = tokenizer.transform(train_texts)

    # Network, based on
    # https://github.com/vinhkhuc/kaggle-sentiment-popcorn/blob/master/scripts/passage_nn.py
    # (itself based on the Passage sentiment example linked above):
    #   - embedding: each word id becomes a 256-dimensional vector
    #   - GRU: looks for patterns in the sequence of words
    #     (LstmRecurrent may be substituted for an LSTM layer)
    #   - dense sigmoid unit: logistic classifier on the GRU output
    gru_options = dict(activation='tanh',
                       gate_activation='steeper_sigmoid',
                       init='orthogonal',
                       seq_output=False,
                       p_drop=0.75)
    if character_model:
        # Character-level RNN: the idea is to one-hot encode the character
        # ids, in which case the embedding layer is no longer needed.
        train_reviews_tokenized = [
            pd.get_dummies(review_ids,
                           columns=range(tokenizer.n_features + 1)).values
            for review_ids in train_reviews_tokenized
        ]
        layers = [
            GatedRecurrent(size=100, **gru_options),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]
    else:
        layers = [
            Embedding(size=256, n_features=tokenizer.n_features),
            GatedRecurrent(size=512, **gru_options),
            Dense(size=1, activation='sigmoid', init='orthogonal')
        ]

    # Binary cross-entropy is the cost function of the classifier.
    classifier = RNN(layers=layers, cost='bce', updater=Adadelta(lr=0.5))
    # 10 epochs may take 10+ hours to run depending on machine
    NUM_EPOCHS = 10
    classifier.fit(train_reviews_tokenized,
                   y_train.tolist(),
                   n_epochs=NUM_EPOCHS)

    # Pick the destination pair once, then store the model and the tokenizer.
    if character_model:
        model_path, tokenizer_path = PASSAGE_CHAR_RNN_MODEL, PASSAGE_CHAR_TOKENIZER
    else:
        model_path, tokenizer_path = PASSAGE_RNN_MODEL, PASSAGE_TOKENIZER
    passage.utils.save(classifier, model_path)
    joblib.dump(tokenizer, tokenizer_path, compress=9)