Code Example #1
def build_model_and_objective(n_classes, n_input_dimensions, X, Y):
    model = CSM(
        layers=[
            Softmax(
                n_classes=n_classes,
                n_input_dimensions=n_input_dimensions),
        ],
        )

    lengths = np.zeros(X.shape[0])
    data_provider = BatchDataProvider(
        X=X,
        Y=Y,
        lengths=lengths)

    cost_function = CrossEntropy()

    objective = CostMinimizationObjective(cost=cost_function, data_provider=data_provider)

    update_rule = AdaGradUpdateRule(
        gamma=0.1,
        model_template=model)

    optimizer = SGD(model=model, objective=objective, update_rule=update_rule)

    return model, objective, optimizer, data_provider
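
# --- Hypothetical usage sketch ---
# One way build_model_and_objective might be called, assuming the CSM, Softmax,
# BatchDataProvider, CrossEntropy, AdaGradUpdateRule and SGD classes referenced above
# are importable from the surrounding library. The toy data and the label encoding
# below are illustrative assumptions; the exact formats BatchDataProvider expects are
# not shown in this file.
def _usage_sketch():
    import numpy as np

    n_classes = 2
    n_input_dimensions = 8
    X = np.random.randn(100, n_input_dimensions)  # toy inputs (assumed shape)
    Y = np.random.randint(n_classes, size=100)    # toy labels (assumed format)

    model, objective, optimizer, data_provider = build_model_and_objective(
        n_classes, n_input_dimensions, X, Y)

    # Same training-loop pattern as the longer examples in this collection:
    for batch_index, iteration_info in enumerate(optimizer):
        print iteration_info['cost']
        if batch_index == 10:
            break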
Code Example #2
def remove_dropout(model):
    new_model = []
    ratio = 0
    for layer in model.layers:
        if layer.__class__.__name__ == 'Dropout':
            ratio = layer.dropout_rate
        else:
            if ratio == 0:
                new_model.append(layer)
            else:
                new_model.append(__function_mapping[layer.__class__.__name__](
                    layer, ratio))
                ratio = 0

    return CSM(layers=new_model)
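
# --- Illustrative sketch of the assumed __function_mapping ---
# remove_dropout relies on a module-level __function_mapping that is not shown here.
# The sketch below is an assumption about its shape: it maps a layer class name to a
# function returning a test-time copy of that layer with its weights rescaled by the
# dropout ratio of the preceding Dropout layer. Whether the right factor is `ratio`
# or `1 - ratio` depends on whether dropout_rate stores the drop or the keep
# probability in this library, so treat the scaling below as a placeholder.
import copy

def _rescale_weights(layer, ratio):
    # Assumes the layer exposes a weight array attribute W (purely illustrative).
    rescaled = copy.deepcopy(layer)
    rescaled.W = rescaled.W * (1.0 - ratio)
    return rescaled

__function_mapping = {
    'Linear': _rescale_weights,
    'SentenceConvolution': _rescale_weights,
}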
Code Example #3
def run():
    # random.seed(435)
    # np.random.seed(2342)
    np.set_printoptions(linewidth=100)

    data_dir = os.path.join("../data", "europarlv7")

    with open(os.path.join(
            data_dir, "europarl-v7.de-en.en.tokens.clean.json")) as data_file:
        english_data = json.load(data_file)

    with open(
            os.path.join(
                data_dir,
                "europarl-v7.de-en.en.tokens.clean.dictionary.encoding.json")
    ) as dictionary_file:
        english_dictionary = json.load(dictionary_file)

    with open(os.path.join(
            data_dir, "europarl-v7.de-en.de.tokens.clean.json")) as data_file:
        german_data = json.load(data_file)

    with open(
            os.path.join(
                data_dir,
                "europarl-v7.de-en.de.tokens.clean.dictionary.encoding.json")
    ) as dictionary_file:
        german_dictionary = json.load(dictionary_file)

    # english_data = english_data[:10000]
    # german_data = german_data[:10000]

    english_data = replace_unknowns(english_data, english_dictionary,
                                    'UNKNOWN')
    german_data = replace_unknowns(german_data, german_dictionary, 'UNKNOWN')

    batch_size = 100

    assert len(english_data) == len(german_data)
    # Number of full minibatches per epoch (Python 2 integer division).
    print len(english_data) / batch_size

    parallel_en_de_provider = PaddedParallelSequenceMinibatchProvider(
        X1=list(english_data),
        X2=list(german_data),
        batch_size=batch_size,
        padding='PADDING',
    )

    multilingual_parallel_provider = TaggedProviderCollection({
        ('en', 'de'):
        parallel_en_de_provider
    })

    english_model = CSM(layers=[
        DictionaryEncoding(vocabulary=english_dictionary),
        WordEmbedding(dimension=40, vocabulary_size=len(english_dictionary)),
        AxisReduction(axis='w'),

        # SentenceConvolution(
        #     n_feature_maps=15,
        #     kernel_width=10,
        #     n_channels=1,
        #     n_input_dimensions=12),
        #
        # SumFolding(),
        #
        # KMaxPooling(k=17),
        #
        # Bias(
        #     n_input_dims=6,
        #     n_feature_maps=15),
        #
        # Tanh(),
    ])

    german_model = CSM(layers=[
        DictionaryEncoding(vocabulary=german_dictionary),
        WordEmbedding(dimension=40, vocabulary_size=len(german_dictionary)),
        AxisReduction(axis='w'),

        # SentenceConvolution(
        #     n_feature_maps=15,
        #     kernel_width=10,
        #     n_channels=1,
        #     n_input_dimensions=12),
        #
        # SumFolding(),
        #
        # KMaxPooling(k=17),
        #
        # Bias(
        #     n_input_dims=6,
        #     n_feature_maps=15),
        #
        # Tanh(),
    ])

    print english_model
    print german_model

    model = TaggedModelCollection({
        'en': english_model,
        'de': german_model,
    })

    # regularizer = L2Regularizer(lamb=1e-4)

    objective = ContrastiveMultilingualEmbeddingObjective(
        tagged_parallel_sequence_provider=multilingual_parallel_provider,
        n_contrastive_samples=10,
        margin=40.0)

    # objective = CostMinimizationObjective(
    #     cost=cost_function,
    #     data_provider=train_data_provider,
    #     regularizer=regularizer)

    update_rule = AdaGrad(
        # gamma=0.01,
        gamma=0.1,
        model_template=model)

    optimizer = SGD(model=model, objective=objective, update_rule=update_rule)

    time_start = time.time()

    costs = []
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])

        # print costs[-1]

        if batch_index % 10 == 0:
            print "B: {}, E: {}, C: {}, Param size: {}".format(
                batch_index,
                # This epoch count will be inaccurate when I move to multilingual
                (batch_index // parallel_en_de_provider.batches_per_epoch) + 1,
                costs[-1],
                np.mean(np.abs(model.pack())))

        if batch_index % 100 == 0:
            with open("model.pkl", 'w') as model_file:
                pickle.dump(model.move_to_cpu(), model_file, protocol=-1)

        # if batch_index % 1000 == 0 and batch_index > 0:
        #     with open("model_optimization.pkl", 'w') as model_file:
        #         pickle.dump(optimizer, model_file, protocol=-1)

        # if batch_index == 500:
        #     break

    time_end = time.time()

    print "Time elapsed: {}s".format(time_end - time_start)
Code Example #4
def run():
    random.seed(435)
    np.random.seed(2342)
    np.set_printoptions(linewidth=100)

    tweets_dir = os.path.join("../data", "sentiment140_2")  # _2 truncates at <3, normal truncates at <5

    with open(os.path.join(tweets_dir, "sentiment140.train.clean.json")) as data_file:
        data = json.loads(data_file.read())
        random.shuffle(data)
        X, Y = map(list, zip(*data))
        Y = [[":)", ":("].index(y) for y in Y]

    with open(os.path.join(tweets_dir, "sentiment140.train.clean.dictionary.encoding.json")) as alphabet_file:
        alphabet = json.loads(alphabet_file.read())

    with open(os.path.join(tweets_dir, "sentiment140.test.clean.json")) as data_file:
        data = json.loads(data_file.read())
        X_test, Y_test = map(list, zip(*data))
        Y_test = [[":)", ":("].index(y) for y in Y_test]

    print len(alphabet)

    # X = X[:1000]
    # Y = Y[:1000]

    # lists of words
    # replace unknowns with an unknown character
    tokenizer = WordPunctTokenizer()
    new_X = []
    for x in X:
        new_X.append([w if w in alphabet else 'UNKNOWN' for w in tokenizer.tokenize(x)])
    X = new_X

    new_X_test = []
    for x in X_test:
        new_X_test.append([w if w in alphabet else 'UNKNOWN' for w in tokenizer.tokenize(x)])
    X_test = new_X_test


    batch_size = 50

    train_data_provider = LabelledSequenceMinibatchProvider(
        X=X,
        Y=Y,
        batch_size=batch_size,
        fixed_length=50,
        padding='PADDING')

    print train_data_provider.batches_per_epoch

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X_test,
        Y=Y_test,
        batch_size=len(X_test),
        fixed_length=50,
        padding='PADDING',
        shuffle=False)


    # model = CSM(
    #     layers=[
    #         DictionaryEncoding(vocabulary=encoding),
    #
    #         WordEmbedding(
    #             dimension=32,
    #             vocabulary_size=len(encoding)),
    #
    #         SentenceConvolution(
    #             n_feature_maps=5,
    #             kernel_width=10,
    #             n_channels=1,
    #             n_input_dimensions=32),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=7),
    #
    #         Bias(
    #             n_input_dims=16,
    #             n_feature_maps=5),
    #
    #         Tanh(),
    #
    #         SumFolding(),
    #
    #         Softmax(
    #             n_classes=2,
    #             n_input_dimensions=280),
    #         ]
    #     )

    # Approximately Nal's model
    #
    # model = CSM(
    #     layers=[
    #         DictionaryEncoding(vocabulary=encoding),
    #
    #         WordEmbedding(
    #             dimension=12,
    #             vocabulary_size=len(encoding)),
    #
    #         SentenceConvolution(
    #             n_feature_maps=6,
    #             kernel_width=7,
    #             n_channels=1,
    #             n_input_dimensions=12),
    #
    #         Bias(
    #             n_input_dims=12,
    #             n_feature_maps=6),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4, k_dynamic=0.5),
    #
    #         Tanh(),
    #
    #         SentenceConvolution(
    #             n_feature_maps=14,
    #             kernel_width=5,
    #             n_channels=6,
    #             n_input_dimensions=6),
    #
    #         Bias(
    #             n_input_dims=6,
    #             n_feature_maps=14),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4),
    #
    #         Tanh(),
    #
    #         Softmax(
    #             n_classes=2,
    #             n_input_dimensions=168),
    #         ]
    # )

    tweet_model = CSM(
        layers=[
            DictionaryEncoding(vocabulary=alphabet),

            WordEmbedding(
                dimension=60,
                vocabulary_size=len(alphabet),
                padding=alphabet['PADDING']),

            Dropout(('b', 'w', 'f'), 0.5),

            SentenceConvolution(
                n_feature_maps=6,
                kernel_width=7,
                n_channels=60,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=6),

            KMaxPooling(k=4, k_dynamic=0.5),

            Tanh(),

            SentenceConvolution(
                n_feature_maps=14,
                kernel_width=5,
                n_channels=6,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=14),

            KMaxPooling(k=4),

            Tanh(),

            # Dropout(('b', 'd', 'f', 'w'), 0.5),
            #
            # Linear(n_input=4*40, n_output=4*40),
            #
            # Bias(
            #     n_input_dims=4*40,
            #     n_feature_maps=1),

            Dropout(('b', 'd', 'f', 'w'), 0.5),

            Softmax(
                n_classes=2,
                n_input_dimensions=4*14),
            ]
    )

    # model = CSM(
    #     layers=[
    #         # cpu.model.encoding.
    #         DictionaryEncoding(vocabulary=encoding),
    #
    #         # cpu.model.embedding.
    #         WordEmbedding(
    #             dimension=28,
    #             vocabulary_size=len(encoding)),
    #
    #         # HostToDevice(),
    #
    #         SentenceConvolution(
    #             n_feature_maps=6,
    #             kernel_width=7,
    #             n_channels=1,
    #             n_input_dimensions=28),
    #
    #         Bias(
    #             n_input_dims=28,
    #             n_feature_maps=6),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4, k_dynamic=0.5),
    #
    #         Tanh(),
    #
    #         SentenceConvolution(
    #             n_feature_maps=14,
    #             kernel_width=5,
    #             n_channels=6,
    #             n_input_dimensions=14),
    #
    #         Bias(
    #             n_input_dims=14,
    #             n_feature_maps=14),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4),
    #
    #         Tanh(),
    #
    #         Softmax(
    #             n_classes=2,
    #             n_input_dimensions=392),
    #         ]
    # )

    print tweet_model

    cost_function = CrossEntropy()

    regularizer = L2Regularizer(lamb=1e-5)

    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=train_data_provider,
        regularizer=regularizer)

    update_rule = AdaGrad(
        gamma=0.1,
        model_template=tweet_model)

    # update_rule = AdaDelta(
    #     rho=0.99,
    #     epsilon=1e-6,
    #     model_template=model)

    optimizer = SGD(
        model=tweet_model,
        objective=objective,
        update_rule=update_rule)

    gradient_checker = ModelGradientChecker(
        CostMinimizationObjective(
            cost=cost_function,
            data_provider=validation_data_provider,
            regularizer=regularizer))

    time_start = time.time()

    best_acc = -1.0

    costs = []
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])

        if batch_index % 30 == 0:
            X_valid, Y_valid, meta_valid = validation_data_provider.next_batch()

            # Strip Dropout layers for evaluation (see the remove_dropout helper in Code Example #2).
            test_model = gpu.model.dropout.remove_dropout(tweet_model)
            Y_hat = test_model.fprop(X_valid, meta=meta_valid)
            del test_model

            Y_hat = Y_hat.get()
            assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

            # grad_check = gradient_checker.check(model)
            grad_check = "skipped"

            time_now = time.time()
            examples_per_hr = (batch_index * batch_size) / (time_now - time_start) * 3600

            acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid.get(), axis=1))

            if acc > best_acc:
                best_acc = acc
                with open("model_best_tweets.pkl", 'w') as model_file:
                    pickle.dump(tweet_model.move_to_cpu(), model_file, protocol=-1)
                # with open("model_best_optimization.pkl", 'w') as model_file:
                #     pickle.dump(optimizer, model_file, protocol=-1)


            print "B: {}, A: {}, C: {}, Prop1: {}, Param size: {}, EPH: {}, best acc: {}".format(
                batch_index,
                acc,
                costs[-1],
                np.argmax(Y_hat, axis=1).mean(),
                np.mean(np.abs(tweet_model.pack())),
                examples_per_hr,
                best_acc)

        # if batch_index % 2500 == 0 and batch_index > 0:
        #     update_rule.gamma *= 0.5

        # if batch_index == 1000:
        #     break

        # if batch_index % 100 == 0:
        #     with open("model.pkl", 'w') as model_file:
        #         pickle.dump(model.move_to_cpu(), model_file, protocol=-1)

        # if batch_index % 1000 == 0 and batch_index > 0:
        #     with open("model_optimization.pkl", 'w') as model_file:
        #         pickle.dump(optimizer, model_file, protocol=-1)


    time_end = time.time()

    print "Time elapsed: {}s".format(time_end - time_start)
Code Example #5
__author__ = 'mdenil'
Code Example #6
    model = CSM(
        layers=[
            DictionaryEncoding(vocabulary=encoding),

            WordEmbedding(
                dimension=20,
                vocabulary_size=len(encoding),
                padding=encoding['PADDING']),

            Dropout(('b', 'f', 'w'), 0.2),

            SentenceConvolution(
                n_feature_maps=12,
                kernel_width=15,
                n_channels=20,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=12),

            KMaxPooling(k=7, k_dynamic=0.5),

            SentenceConvolution(
                n_feature_maps=13,
                kernel_width=6,
                n_channels=12,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=13),

            KMaxPooling(k=5),

            Tanh(),

            ReshapeForDocuments(),

            SentenceConvolution(
                n_feature_maps=28,
                kernel_width=13,
                n_channels=13*5,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=28),

            KMaxPooling(k=5),

            Tanh(),

            Dropout(('b', 'd', 'f', 'w'), 0.5),

            Linear(n_input=28*5, n_output=28*5),

            Bias(n_input_dims=28*5, n_feature_maps=1),

            Dropout(('b', 'd', 'f', 'w'), 0.5),

            Softmax(
                n_classes=2,
                n_input_dimensions=28*5),
            ]
    )
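
    # --- Size check (illustrative) ---
    # Assuming the flattened representation fed to the fully connected layers is
    # (feature maps) x (pooled positions), as the Linear(n_input=28*5, ...) layer
    # above already suggests, the sizes line up as follows:
    n_final_feature_maps = 28  # last SentenceConvolution
    k_final = 5                # last KMaxPooling keeps 5 positions per feature map
    assert n_final_feature_maps * k_final == 28 * 5  # = 140, the Linear/Softmax input size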
Code Example #7
def main():
    random.seed(435)
    np.random.seed(23421)
    np.set_printoptions(linewidth=100)

    data_dir = os.path.join("/data/mulga/mdenil/amazon-reviews", "shards")

    batch_size = 100

    with open(os.path.join(data_dir, "dictionary.sentences.clean.encoding.json")) as encoding_file:
        encoding = json.load(encoding_file)

    print(len(encoding))


    # pretrained_lut = load_word2vec_embeddings(
    #     os.path.join("/data/brown/mdenil/amazon-reviews/word2vec-embeddings", "word-embeddings-30.txt"),
    #     encoding)


    train_data_provider = ShardedLabelledDocumentMinibatchProvider(
        shard_dir=os.path.join(data_dir, "train"),
        shard_pattern="shard_[0-9]*.sentences.clean.projected.json.gz",
        batch_size=batch_size,
        padding='PADDING',
        n_labels=5,
        # n_labels=2,
        fixed_n_sentences=15,
        fixed_n_words=25)

    validation_data_provider = ShardedLabelledDocumentMinibatchProvider(
        shard_dir=os.path.join(data_dir, "test"),
        shard_pattern="shard_[0-9]*.sentences.clean.projected.json.gz",
        batch_size=batch_size,
        padding='PADDING',
        n_labels=5,
        # n_labels=2,
        fixed_n_sentences=15,
        fixed_n_words=25)

    model = CSM(
        layers=[
            DictionaryEncoding(vocabulary=encoding),

            WordEmbedding(
                dimension=30,
                vocabulary_size=len(encoding),
                padding=encoding['PADDING']),

            # WordEmbedding(
            #     dimension=pretrained_lut.shape[1],
            #     vocabulary_size=len(encoding),
            #     padding=encoding['PADDING'],
            #     E=pretrained_lut),

            # Dropout(('b', 'w', 'f'), 0.2),

            SentenceConvolution(
                n_feature_maps=10,
                kernel_width=3,
                n_channels=30,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=10),

            # KMaxPooling(k=7, k_dynamic=0.5),
            #
            # Tanh(),
            #
            # SentenceConvolution(
            #     n_feature_maps=30,
            #     kernel_width=3,
            #     n_channels=10,
            #     n_input_dimensions=1),
            #
            # Bias(
            #     n_input_dims=1,
            #     n_feature_maps=30),

            KMaxPooling(k=5),

            Tanh(),

            ReshapeForDocuments(),

            SentenceConvolution(
                n_feature_maps=20,
                kernel_width=3,
                n_channels=50,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=20),

            KMaxPooling(k=5),

            Tanh(),

            # Dropout(('b', 'd', 'f', 'w'), 0.5),

            # Softmax(
            #     # n_classes=2,
            #     n_classes=5,
            #     n_input_dimensions=100),

            Linear(
                n_input=100,
                n_output=1)
            ]
    )

    print(model)


    # cost_function = CrossEntropy()
    cost_function = SquaredError()

    regularizer = L2Regularizer(lamb=1e-5)

    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=train_data_provider,
        regularizer=regularizer)

    update_rule = AdaGrad(
        gamma=0.1,
        model_template=model)

    optimizer = SGD(
        model=model,
        objective=objective,
        update_rule=update_rule)

    n_epochs = 1
    # n_batches = train_data_provider.batches_per_epoch * n_epochs

    time_start = time.time()

    best_acc = -1.0


    progress = []
    costs = []
    prev_weights = model.pack()
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])

        if batch_index % 10 == 0:

            Y_hat = []
            Y_valid = []
            for _ in xrange(1):
                X_valid_batch, Y_valid_batch, meta_valid = validation_data_provider.next_batch()
                Y_valid.append(Y_valid_batch)
                Y_hat.append(model.fprop(X_valid_batch, meta=meta_valid))
            Y_valid = Y_valid[0].get()
            Y_hat = Y_hat[0].get()
            # Y_valid = np.concatenate(Y_valid, axis=0)
            # Y_hat = np.concatenate(Y_hat, axis=0)
            # assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

            # acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))
            # With a Linear(n_output=1) model and SquaredError cost this is a regression
            # setup, so "acc" here is really the mean absolute error (lower is better).
            acc = np.mean(np.abs(Y_valid - Y_hat))

            # if acc > best_acc:
            #     best_acc = acc
            # with open("/home/mdenil/model.pkl", 'w') as model_file:
            #     pickle.dump(model, model_file, protocol=-1)

            current = dict()
            current['B']=batch_index
            current['A']=acc
            current['C']=costs[-1].get()
            current['Prop']=np.argmax(Y_hat, axis=1).mean()
            current['Params']=np.mean(np.abs(model.pack()))

            progress.append(current)
            print(current)
            with open("progress.pkl", 'w') as progress_file:
                pickle.dump(progress, progress_file, protocol=-1)

        # if batch_index == 100:
        #     break

        if batch_index % 100 == 0:
            with open("model.pkl", 'w') as model_file:
                pickle.dump(model, model_file, protocol=-1)

    time_end = time.time()

    print("Time elapsed: {}s".format(time_end - time_start))
Code Example #8
 model = CSM(layers=[
     DictionaryEncoding(vocabulary=encoding),
     WordEmbedding(dimension=20,
                   vocabulary_size=len(encoding),
                   padding=encoding['PADDING']),
     Dropout(('b', 'f', 'w'), 0.2),
     SentenceConvolution(n_feature_maps=12,
                         kernel_width=15,
                         n_channels=20,
                         n_input_dimensions=1),
     Bias(n_input_dims=1, n_feature_maps=12),
     KMaxPooling(k=7, k_dynamic=0.5),
     SentenceConvolution(n_feature_maps=13,
                         kernel_width=6,
                         n_channels=12,
                         n_input_dimensions=1),
     Bias(n_input_dims=1, n_feature_maps=13),
     KMaxPooling(k=5),
     Tanh(),
     ReshapeForDocuments(),
     SentenceConvolution(n_feature_maps=28,
                         kernel_width=13,
                         n_channels=13 * 5,
                         n_input_dimensions=1),
     Bias(n_input_dims=1, n_feature_maps=28),
     KMaxPooling(k=5),
     Tanh(),
     Dropout(('b', 'd', 'f', 'w'), 0.5),
     Linear(n_input=28 * 5, n_output=28 * 5),
     Bias(n_input_dims=28 * 5, n_feature_maps=1),
     Dropout(('b', 'd', 'f', 'w'), 0.5),
     Softmax(n_classes=2, n_input_dimensions=28 * 5),
 ])
Code Example #9
def run():
    random.seed(435)
    np.random.seed(2342)
    np.set_printoptions(linewidth=100)

    tweets_dir = os.path.join(
        "../data",
        "sentiment140_2")  # _2 truncates at <3, normal truncates at <5

    with open(os.path.join(tweets_dir,
                           "sentiment140.train.clean.json")) as data_file:
        data = json.loads(data_file.read())
        random.shuffle(data)
        X, Y = map(list, zip(*data))
        Y = [[":)", ":("].index(y) for y in Y]

    with open(
            os.path.join(tweets_dir,
                         "sentiment140.train.clean.dictionary.encoding.json")
    ) as alphabet_file:
        alphabet = json.loads(alphabet_file.read())

    with open(os.path.join(tweets_dir,
                           "sentiment140.test.clean.json")) as data_file:
        data = json.loads(data_file.read())
        X_test, Y_test = map(list, zip(*data))
        Y_test = [[":)", ":("].index(y) for y in Y_test]

    print len(alphabet)

    # X = X[:1000]
    # Y = Y[:1000]

    # lists of words
    # replace unknowns with an unknown character
    tokenizer = WordPunctTokenizer()
    new_X = []
    for x in X:
        new_X.append(
            [w if w in alphabet else 'UNKNOWN' for w in tokenizer.tokenize(x)])
    X = new_X

    new_X_test = []
    for x in X_test:
        new_X_test.append(
            [w if w in alphabet else 'UNKNOWN' for w in tokenizer.tokenize(x)])
    X_test = new_X_test

    batch_size = 50

    train_data_provider = LabelledSequenceMinibatchProvider(
        X=X, Y=Y, batch_size=batch_size, fixed_length=50, padding='PADDING')

    print train_data_provider.batches_per_epoch

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X_test,
        Y=Y_test,
        batch_size=len(X_test),
        fixed_length=50,
        padding='PADDING',
        shuffle=False)

    # model = CSM(
    #     layers=[
    #         DictionaryEncoding(vocabulary=encoding),
    #
    #         WordEmbedding(
    #             dimension=32,
    #             vocabulary_size=len(encoding)),
    #
    #         SentenceConvolution(
    #             n_feature_maps=5,
    #             kernel_width=10,
    #             n_channels=1,
    #             n_input_dimensions=32),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=7),
    #
    #         Bias(
    #             n_input_dims=16,
    #             n_feature_maps=5),
    #
    #         Tanh(),
    #
    #         SumFolding(),
    #
    #         Softmax(
    #             n_classes=2,
    #             n_input_dimensions=280),
    #         ]
    #     )

    # Approximately Nal's model
    #
    # model = CSM(
    #     layers=[
    #         DictionaryEncoding(vocabulary=encoding),
    #
    #         WordEmbedding(
    #             dimension=12,
    #             vocabulary_size=len(encoding)),
    #
    #         SentenceConvolution(
    #             n_feature_maps=6,
    #             kernel_width=7,
    #             n_channels=1,
    #             n_input_dimensions=12),
    #
    #         Bias(
    #             n_input_dims=12,
    #             n_feature_maps=6),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4, k_dynamic=0.5),
    #
    #         Tanh(),
    #
    #         SentenceConvolution(
    #             n_feature_maps=14,
    #             kernel_width=5,
    #             n_channels=6,
    #             n_input_dimensions=6),
    #
    #         Bias(
    #             n_input_dims=6,
    #             n_feature_maps=14),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4),
    #
    #         Tanh(),
    #
    #         Softmax(
    #             n_classes=2,
    #             n_input_dimensions=168),
    #         ]
    # )

    tweet_model = CSM(layers=[
        DictionaryEncoding(vocabulary=alphabet),
        WordEmbedding(dimension=60,
                      vocabulary_size=len(alphabet),
                      padding=alphabet['PADDING']),
        Dropout(('b', 'w', 'f'), 0.5),
        SentenceConvolution(n_feature_maps=6,
                            kernel_width=7,
                            n_channels=60,
                            n_input_dimensions=1),
        Bias(n_input_dims=1, n_feature_maps=6),
        KMaxPooling(k=4, k_dynamic=0.5),
        Tanh(),
        SentenceConvolution(n_feature_maps=14,
                            kernel_width=5,
                            n_channels=6,
                            n_input_dimensions=1),
        Bias(n_input_dims=1, n_feature_maps=14),
        KMaxPooling(k=4),
        Tanh(),

        # Dropout(('b', 'd', 'f', 'w'), 0.5),
        #
        # Linear(n_input=4*40, n_output=4*40),
        #
        # Bias(
        #     n_input_dims=4*40,
        #     n_feature_maps=1),
        Dropout(('b', 'd', 'f', 'w'), 0.5),
        Softmax(n_classes=2, n_input_dimensions=4 * 14),
    ])

    # model = CSM(
    #     layers=[
    #         # cpu.model.encoding.
    #         DictionaryEncoding(vocabulary=encoding),
    #
    #         # cpu.model.embedding.
    #         WordEmbedding(
    #             dimension=28,
    #             vocabulary_size=len(encoding)),
    #
    #         # HostToDevice(),
    #
    #         SentenceConvolution(
    #             n_feature_maps=6,
    #             kernel_width=7,
    #             n_channels=1,
    #             n_input_dimensions=28),
    #
    #         Bias(
    #             n_input_dims=28,
    #             n_feature_maps=6),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4, k_dynamic=0.5),
    #
    #         Tanh(),
    #
    #         SentenceConvolution(
    #             n_feature_maps=14,
    #             kernel_width=5,
    #             n_channels=6,
    #             n_input_dimensions=14),
    #
    #         Bias(
    #             n_input_dims=14,
    #             n_feature_maps=14),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4),
    #
    #         Tanh(),
    #
    #         Softmax(
    #             n_classes=2,
    #             n_input_dimensions=392),
    #         ]
    # )

    print tweet_model

    cost_function = CrossEntropy()

    regularizer = L2Regularizer(lamb=1e-5)

    objective = CostMinimizationObjective(cost=cost_function,
                                          data_provider=train_data_provider,
                                          regularizer=regularizer)

    update_rule = AdaGrad(gamma=0.1, model_template=tweet_model)

    # update_rule = AdaDelta(
    #     rho=0.99,
    #     epsilon=1e-6,
    #     model_template=model)

    optimizer = SGD(model=tweet_model,
                    objective=objective,
                    update_rule=update_rule)

    gradient_checker = ModelGradientChecker(
        CostMinimizationObjective(cost=cost_function,
                                  data_provider=validation_data_provider,
                                  regularizer=regularizer))

    time_start = time.time()

    best_acc = -1.0

    costs = []
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])

        if batch_index % 30 == 0:
            X_valid, Y_valid, meta_valid = validation_data_provider.next_batch(
            )

            test_model = gpu.model.dropout.remove_dropout(tweet_model)
            Y_hat = test_model.fprop(X_valid, meta=meta_valid)
            del test_model

            Y_hat = Y_hat.get()
            assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

            # grad_check = gradient_checker.check(model)
            grad_check = "skipped"

            time_now = time.time()
            examples_per_hr = (batch_index * batch_size) / (time_now -
                                                            time_start) * 3600

            acc = np.mean(
                np.argmax(Y_hat, axis=1) == np.argmax(Y_valid.get(), axis=1))

            if acc > best_acc:
                best_acc = acc
                with open("model_best_tweets.pkl", 'w') as model_file:
                    pickle.dump(tweet_model.move_to_cpu(),
                                model_file,
                                protocol=-1)
                # with open("model_best_optimization.pkl", 'w') as model_file:
                #     pickle.dump(optimizer, model_file, protocol=-1)

            print "B: {}, A: {}, C: {}, Prop1: {}, Param size: {}, EPH: {}, best acc: {}".format(
                batch_index, acc, costs[-1],
                np.argmax(Y_hat, axis=1).mean(),
                np.mean(np.abs(tweet_model.pack())), examples_per_hr, best_acc)

        # if batch_index % 2500 == 0 and batch_index > 0:
        #     update_rule.gamma *= 0.5

        # if batch_index == 1000:
        #     break

        # if batch_index % 100 == 0:
        #     with open("model.pkl", 'w') as model_file:
        #         pickle.dump(model.move_to_cpu(), model_file, protocol=-1)

        # if batch_index % 1000 == 0 and batch_index > 0:
        #     with open("model_optimization.pkl", 'w') as model_file:
        #         pickle.dump(optimizer, model_file, protocol=-1)

    time_end = time.time()

    print "Time elapsed: {}s".format(time_end - time_start)