# Python 2 code. Imports assumed by both scripts below; the model components
# (CSM, DictionaryEncoding, WordEmbedding, SentenceConvolution, Bias, KMaxPooling,
# SumFolding, Tanh, Softmax, Dropout, Linear, CrossEntropy, L2Regularizer,
# CostMinimizationObjective, AdaGrad, SGD, ModelGradientChecker,
# LabelledSequenceMinibatchProvider) and the gpu.model package come from the
# surrounding repository.
import os
import json
import random
import time
import argparse
import cPickle as pickle  # assumption: the plain pickle module works equally well here
import numpy as np
from nltk.tokenize import WordPunctTokenizer


def run():
    random.seed(435)
    np.random.seed(2342)
    np.set_printoptions(linewidth=100)

    tweets_dir = os.path.join("../data", "sentiment140_2")  # _2 truncates at <3, normal truncates at <5

    with open(os.path.join(tweets_dir, "sentiment140.train.clean.json")) as data_file:
        data = json.loads(data_file.read())
        random.shuffle(data)
        X, Y = map(list, zip(*data))
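        # Map the emoticon labels to class indices: ':)' -> 0 and ':(' -> 1.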
        Y = [[":)", ":("].index(y) for y in Y]

    with open(os.path.join(tweets_dir, "sentiment140.train.clean.dictionary.encoding.json")) as alphabet_file:
        alphabet = json.loads(alphabet_file.read())

    with open(os.path.join(tweets_dir, "sentiment140.test.clean.json")) as data_file:
        data = json.loads(data_file.read())
        X_test, Y_test = map(list, zip(*data))
        Y_test = [[":)", ":("].index(y) for y in Y_test]

    print "Vocabulary size:", len(alphabet)

    # X = X[:1000]
    # Y = Y[:1000]

    # Tokenize each tweet into a list of words and replace any token that is not
    # in the vocabulary with the 'UNKNOWN' marker.
    tokenizer = WordPunctTokenizer()
    new_X = []
    for x in X:
        new_X.append([w if w in alphabet else 'UNKNOWN' for w in tokenizer.tokenize(x)])
    X = new_X

    new_X_test = []
    for x in X_test:
        new_X_test.append([w if w in alphabet else 'UNKNOWN' for w in tokenizer.tokenize(x)])
    X_test = new_X_test


    batch_size = 50

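    # Both providers pad (or truncate) every tweet to a fixed length of 50 tokens;
    # the validation provider serves the whole test set as a single, unshuffled batch.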
    train_data_provider = LabelledSequenceMinibatchProvider(
        X=X,
        Y=Y,
        batch_size=batch_size,
        fixed_length=50,
        padding='PADDING')

    print "Batches per epoch:", train_data_provider.batches_per_epoch

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X_test,
        Y=Y_test,
        batch_size=len(X_test),
        fixed_length=50,
        padding='PADDING',
        shuffle=False)


    # model = CSM(
    #     layers=[
    #         DictionaryEncoding(vocabulary=encoding),
    #
    #         WordEmbedding(
    #             dimension=32,
    #             vocabulary_size=len(encoding)),
    #
    #         SentenceConvolution(
    #             n_feature_maps=5,
    #             kernel_width=10,
    #             n_channels=1,
    #             n_input_dimensions=32),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=7),
    #
    #         Bias(
    #             n_input_dims=16,
    #             n_feature_maps=5),
    #
    #         Tanh(),
    #
    #         SumFolding(),
    #
    #         Softmax(
    #             n_classes=2,
    #             n_input_dimensions=280),
    #         ]
    #     )

    # Approximately Nal's model
    #
    # model = CSM(
    #     layers=[
    #         DictionaryEncoding(vocabulary=encoding),
    #
    #         WordEmbedding(
    #             dimension=12,
    #             vocabulary_size=len(encoding)),
    #
    #         SentenceConvolution(
    #             n_feature_maps=6,
    #             kernel_width=7,
    #             n_channels=1,
    #             n_input_dimensions=12),
    #
    #         Bias(
    #             n_input_dims=12,
    #             n_feature_maps=6),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4, k_dynamic=0.5),
    #
    #         Tanh(),
    #
    #         SentenceConvolution(
    #             n_feature_maps=14,
    #             kernel_width=5,
    #             n_channels=6,
    #             n_input_dimensions=6),
    #
    #         Bias(
    #             n_input_dims=6,
    #             n_feature_maps=14),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4),
    #
    #         Tanh(),
    #
    #         Softmax(
    #             n_classes=2,
    #             n_input_dimensions=168),
    #         ]
    # )

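    # Active model: 60-dimensional word embeddings with dropout, two blocks of
    # (convolution -> bias -> k-max pooling -> tanh), dropout, and a 2-class softmax.
    # The first pooling layer uses a dynamic k (k_dynamic=0.5); the second keeps a fixed k=4.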
    tweet_model = CSM(
        layers=[
            DictionaryEncoding(vocabulary=alphabet),

            WordEmbedding(
                dimension=60,
                vocabulary_size=len(alphabet),
                padding=alphabet['PADDING']),

            Dropout(('b', 'w', 'f'), 0.5),

            SentenceConvolution(
                n_feature_maps=6,
                kernel_width=7,
                n_channels=60,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=6),

            KMaxPooling(k=4, k_dynamic=0.5),

            Tanh(),

            SentenceConvolution(
                n_feature_maps=14,
                kernel_width=5,
                n_channels=6,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=14),

            KMaxPooling(k=4),

            Tanh(),

            # Dropout(('b', 'd', 'f', 'w'), 0.5),
            #
            # Linear(n_input=4*40, n_output=4*40),
            #
            # Bias(
            #     n_input_dims=4*40,
            #     n_feature_maps=1),

            Dropout(('b', 'd', 'f', 'w'), 0.5),

            Softmax(
                n_classes=2,
                n_input_dimensions=4*14),
            ]
    )
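
    # The final pooling keeps k=4 positions for each of the 14 feature maps, which
    # gives the 4*14 inputs expected by the softmax layer.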

    # model = CSM(
    #     layers=[
    #         # cpu.model.encoding.
    #         DictionaryEncoding(vocabulary=encoding),
    #
    #         # cpu.model.embedding.
    #         WordEmbedding(
    #             dimension=28,
    #             vocabulary_size=len(encoding)),
    #
    #         # HostToDevice(),
    #
    #         SentenceConvolution(
    #             n_feature_maps=6,
    #             kernel_width=7,
    #             n_channels=1,
    #             n_input_dimensions=28),
    #
    #         Bias(
    #             n_input_dims=28,
    #             n_feature_maps=6),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4, k_dynamic=0.5),
    #
    #         Tanh(),
    #
    #         SentenceConvolution(
    #             n_feature_maps=14,
    #             kernel_width=5,
    #             n_channels=6,
    #             n_input_dimensions=14),
    #
    #         Bias(
    #             n_input_dims=14,
    #             n_feature_maps=14),
    #
    #         SumFolding(),
    #
    #         KMaxPooling(k=4),
    #
    #         Tanh(),
    #
    #         Softmax(
    #             n_classes=2,
    #             n_input_dimensions=392),
    #         ]
    # )

    print tweet_model

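    # Training setup: cross-entropy loss with a small L2 penalty, optimized with
    # minibatch SGD using AdaGrad per-parameter learning rates.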
    cost_function = CrossEntropy()

    regularizer = L2Regularizer(lamb=1e-5)

    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=train_data_provider,
        regularizer=regularizer)

    update_rule = AdaGrad(
        gamma=0.1,
        model_template=tweet_model)

    # update_rule = AdaDelta(
    #     rho=0.99,
    #     epsilon=1e-6,
    #     model_template=model)

    optimizer = SGD(
        model=tweet_model,
        objective=objective,
        update_rule=update_rule)

    # NOTE: only used by the (currently disabled) gradient check inside the training loop.
    gradient_checker = ModelGradientChecker(
        CostMinimizationObjective(
            cost=cost_function,
            data_provider=validation_data_provider,
            regularizer=regularizer))

    time_start = time.time()

    best_acc = -1.0

    costs = []
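    # Training loop: every 30 minibatches, evaluate on the full test set and
    # checkpoint the model whenever the test accuracy improves.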
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])

        if batch_index % 30 == 0:
            X_valid, Y_valid, meta_valid = validation_data_provider.next_batch()

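            # Evaluate on a copy of the model with the dropout layers removed.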
            test_model = gpu.model.dropout.remove_dropout(tweet_model)
            Y_hat = test_model.fprop(X_valid, meta=meta_valid)
            del test_model

            Y_hat = Y_hat.get()
            assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

            # grad_check = gradient_checker.check(tweet_model)
            grad_check = "skipped"  # gradient checking disabled for speed

            time_now = time.time()
            examples_per_hr = (batch_index * batch_size) / (time_now - time_start) * 3600

            acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid.get(), axis=1))

            if acc > best_acc:
                best_acc = acc
                with open("model_best_tweets.pkl", 'w') as model_file:
                    pickle.dump(tweet_model.move_to_cpu(), model_file, protocol=-1)
                # with open("model_best_optimization.pkl", 'w') as model_file:
                #     pickle.dump(optimizer, model_file, protocol=-1)


            print "B: {}, A: {}, C: {}, Prop1: {}, Param size: {}, EPH: {}, best acc: {}".format(
                batch_index,
                acc,
                costs[-1],
                np.argmax(Y_hat, axis=1).mean(),
                np.mean(np.abs(tweet_model.pack())),
                examples_per_hr,
                best_acc)

        # if batch_index % 2500 == 0 and batch_index > 0:
        #     update_rule.gamma *= 0.5

        # if batch_index == 1000:
        #     break

        # if batch_index % 100 == 0:
        #     with open("model.pkl", 'w') as model_file:
        #         pickle.dump(model.move_to_cpu(), model_file, protocol=-1)

        # if batch_index % 1000 == 0 and batch_index > 0:
        #     with open("model_optimization.pkl", 'w') as model_file:
        #         pickle.dump(optimizer, model_file, protocol=-1)


    time_end = time.time()

    print "Time elapsed: {}s".format(time_end - time_start)


__author__ = 'mdenil'


# Second script: evaluate a saved model on the sentiment140 test set and continue
# training it, keeping the snapshot with the best test accuracy.
def run():
    # random.seed(435)
    # np.random.seed(2342)
    np.set_printoptions(linewidth=100)

    parser = argparse.ArgumentParser(
        description="Evaluate a trained network on the sentiment140 test set")
    parser.add_argument("--model_file", help="pickle file to load the model from")
    parser.add_argument("--best_file", help="html file to write the output to")
    args = parser.parse_args()
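    # Hypothetical invocation (script name and paths are illustrative only):
    #   python evaluate_sentiment140.py --model_file model_best_tweets.pkl --best_file model_finetuned.pkl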

    tweets_dir = os.path.join("../data", "sentiment140_2")  # _2 truncates at <3, normal truncates at <5

    with open(os.path.join(tweets_dir, "sentiment140.train.clean.json")) as data_file:
        data = json.loads(data_file.read())
        random.shuffle(data)
        X, Y = map(list, zip(*data))
        Y = [[":)", ":("].index(y) for y in Y]

    with open(os.path.join(tweets_dir, "sentiment140.train.clean.dictionary.encoding.json")) as alphabet_file:
        alphabet = json.loads(alphabet_file.read())

    with open(os.path.join(tweets_dir, "sentiment140.test.clean.json")) as data_file:
        data = json.loads(data_file.read())
        X_test, Y_test = map(list, zip(*data))
        Y_test = [[":)", ":("].index(y) for y in Y_test]

    print "Vocabulary size:", len(alphabet)

    # X = X[:1000]
    # Y = Y[:1000]

    # Tokenize each tweet into a list of words and replace any token that is not
    # in the vocabulary with the 'UNKNOWN' marker.
    tokenizer = WordPunctTokenizer()
    new_X = []
    for x in X:
        new_X.append([w if w in alphabet else 'UNKNOWN' for w in tokenizer.tokenize(x)])
    X = new_X

    new_X_test = []
    for x in X_test:
        new_X_test.append([w if w in alphabet else 'UNKNOWN' for w in tokenizer.tokenize(x)])
    X_test = new_X_test


    batch_size = 5

    train_data_provider = LabelledSequenceMinibatchProvider(
        X=X,
        Y=Y,
        batch_size=batch_size,
        fixed_length=50,
        padding='PADDING')

    print "Batches per epoch:", train_data_provider.batches_per_epoch

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X_test,
        Y=Y_test,
        batch_size=len(X_test),
        fixed_length=50,
        padding='PADDING',
        shuffle=False)

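    # Load the pickled (CPU-side) model and move its components onto the GPU.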
    with open(args.model_file, 'rb') as model_file:
        tweet_model = gpu.model.host_device_component_mapping.move_to_gpu(pickle.load(model_file))
        # tweet_model = gpu.model.dropout.remove_dropout(tweet_model)

    print tweet_model

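    # Baseline: accuracy of the loaded model on the full test set before any further training.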
    X_valid, Y_valid, meta_valid = validation_data_provider.next_batch()
    test_model = gpu.model.dropout.remove_dropout(tweet_model)
    Y_hat = test_model.fprop(X_valid, meta=meta_valid)
    del test_model
    best_acc = np.mean(np.argmax(Y_hat.get(), axis=1) == np.argmax(Y_valid.get(), axis=1))
    print "Acc at start:", best_acc

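    # Fine-tuning setup: no L2 penalty and a much smaller AdaGrad step size than
    # the original training run (2e-3 instead of 0.1).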
    cost_function = CrossEntropy()

    regularizer = L2Regularizer(lamb=0.0)

    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=train_data_provider,
        regularizer=regularizer)

    update_rule = AdaGrad(
        gamma=2e-3,
        model_template=tweet_model)

    optimizer = SGD(
        model=tweet_model,
        objective=objective,
        update_rule=update_rule)

    # NOTE: this gradient checker is constructed but never used in this script.
    gradient_checker = ModelGradientChecker(
        CostMinimizationObjective(
            cost=cost_function,
            data_provider=validation_data_provider,
            regularizer=regularizer))

    time_start = time.time()

    costs = []
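    # Fine-tuning loop: evaluate on the full test set after every minibatch and
    # save the model to --best_file whenever the test accuracy improves.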
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])

        X_valid, Y_valid, meta_valid = validation_data_provider.next_batch()
        test_model = gpu.model.dropout.remove_dropout(tweet_model)
        Y_hat = test_model.fprop(X_valid, meta=meta_valid)
        del test_model

        Y_hat = Y_hat.get()
        assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

        time_now = time.time()
        examples_per_hr = (batch_index * batch_size) / (time_now - time_start) * 3600

        acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid.get(), axis=1))

        if acc > best_acc:
            best_acc = acc
            with open(args.best_file, 'wb') as model_file:
                pickle.dump(tweet_model.move_to_cpu(), model_file, protocol=-1)


        print "B: {}, A: {}, C: {}, Prop1: {}, Param size: {}, EPH: {}, best acc: {}".format(
            batch_index,
            acc,
            costs[-1],
            np.argmax(Y_hat, axis=1).mean(),
            np.mean(np.abs(tweet_model.pack())),
            examples_per_hr,
            best_acc)


    time_end = time.time()

    print "Time elapsed: {}s".format(time_end - time_start)