Example 1
def build_model_and_objective(n_classes, n_input_dimensions, X, Y):
    model = CSM(
        layers=[
            Softmax(
                n_classes=n_classes,
                n_input_dimensions=n_input_dimensions),
        ],
        )

    lengths = np.zeros(X.shape[0])
    data_provider = BatchDataProvider(
        X=X,
        Y=Y,
        lengths=lengths)

    cost_function = CrossEntropy()

    objective = CostMinimizationObjective(cost=cost_function, data_provider=data_provider)

    update_rule = AdaGrad(
        gamma=0.1,
        model_template=model)

    optimizer = SGD(model=model, objective=objective, update_rule=update_rule)

    return model, objective, optimizer, data_provider
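For context, a minimal driver sketch for the function above (hypothetical: X_train, Y_train, the argument values and the 1000-batch cap are illustrative; the enumerate(optimizer) pattern mirrors the main() example further down):

# Build the model, then step the SGD optimizer and report the cost periodically.
model, objective, optimizer, data_provider = build_model_and_objective(
    n_classes=2, n_input_dimensions=300, X=X_train, Y=Y_train)

for batch_index, iteration_info in enumerate(optimizer):
    if batch_index % 100 == 0:
        print("B: {}, C: {}".format(batch_index, iteration_info['cost']))
    if batch_index >= 1000:
        break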
Example 2
def get_model(encoding):

    return CSM(
        layers=[
            DictionaryEncoding(vocabulary=encoding),

            WordEmbedding(
                dimension={{embedding_dimension}},
                vocabulary_size=len(encoding),
                padding=encoding['PADDING']),

            {% for layer in word_layers %}
            {% set layer_index = loop.index0 %}

            SentenceConvolution(
                n_feature_maps={{layer.n_feature_maps}},
                kernel_width={{layer.kernel_width}},
                n_channels={{layer.n_channels}},
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps={{layer.n_feature_maps}}),

            KMaxPooling(k={{layer.k_pooling}}, k_dynamic={{layer.k_dynamic}} if {{layer.k_dynamic}} > 0 else None),

            {{layer.nonlinearity}}(),

            {% endfor %}

            ReshapeForDocuments(),

            {% for layer in sentence_layers %}
            {% set layer_index = loop.index0 %}

            SentenceConvolution(
                n_feature_maps={{layer.n_feature_maps}},
                kernel_width={{layer.kernel_width}},
                n_channels={{layer.n_channels}},
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps={{layer.n_feature_maps}}),

            KMaxPooling(k={{layer.k_pooling}}, k_dynamic={{layer.k_dynamic}} if {{layer.k_dynamic}} > 0 else None),

            {{layer.nonlinearity}}(),

            {% endfor %}

            {% if dropout %}
            Dropout(('b', 'd', 'f', 'w'), 0.5),
            {% endif %}

            Softmax(
                n_classes={{n_classes}},
                n_input_dimensions={{softmax_input_dimensions}}),
            ])
def model_one_layer_variant_2(alphabet):
    return CSM(layers=[
        DictionaryEncoding(vocabulary=alphabet),
        WordEmbedding(dimension=42, vocabulary_size=len(alphabet)),
        SentenceConvolution(n_feature_maps=5,
                            kernel_width=6,
                            n_channels=1,
                            n_input_dimensions=42),
        SumFolding(),
        KMaxPooling(k=4),
        Bias(n_input_dims=21, n_feature_maps=5),
        Tanh(),
        Softmax(n_classes=2, n_input_dimensions=420),
    ])
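A quick check of the Softmax width in the model above (a sketch of the dimension bookkeeping, assuming SumFolding halves the 42-dimensional embedding, which matches the Bias(n_input_dims=21) that follows it):

# 42 dims -> SumFolding -> 21 dims; KMaxPooling keeps k=4 positions;
# 5 feature maps flatten to 5 * 4 * 21 = 420 Softmax inputs.
folded_dims = 42 // 2
assert 5 * 4 * folded_dims == 420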
Example 4
def remove_dropout(model):
    new_model = []
    ratio = 0
    for layer in model.layers:
        if layer.__class__.__name__ == 'Dropout':
            ratio = layer.dropout_rate
        else:
            if ratio == 0:
                new_model.append(layer)
            else:
                new_model.append(__function_mapping[layer.__class__.__name__](
                    layer, ratio))
                ratio = 0

    return CSM(layers=new_model)
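Presumably this is used to turn a trained model into a test-time model; a hedged usage sketch (trained_model is a hypothetical CSM containing Dropout layers, and the module-level __function_mapping that adjusts the layer following each Dropout is not shown here):

# Strip Dropout layers from a trained CSM before evaluation.
test_model = remove_dropout(trained_model)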
def model_one_layer_large_embedding(alphabet):
    return CSM(layers=[
        DictionaryEncoding(vocabulary=alphabet),
        WordEmbedding(dimension=32 * 4, vocabulary_size=len(alphabet)),
        SentenceConvolution(n_feature_maps=5,
                            kernel_width=10,
                            n_channels=1,
                            n_input_dimensions=32 * 4),
        Relu(),
        SumFolding(),
        SumFolding(),
        SumFolding(),
        KMaxPooling(k=7),
        Bias(n_input_dims=16, n_feature_maps=5),
        Tanh(),
        MaxFolding(),
        Softmax(n_classes=2, n_input_dimensions=280),
    ])
def txtnets_model_from_gensim_word2vec(gensim_model):
    # build vocabulary mapping
    encoding = {}
    for index, word in enumerate(gensim_model.index2word):
        encoding[word] = index
    encoding['PADDING'] = len(encoding)

    vocabulary_size = len(encoding)
    embedding_dim = gensim_model.syn0.shape[1]

    E = np.concatenate(
        [gensim_model.syn0, np.zeros((1, embedding_dim))], axis=0)

    txtnets_model = CSM(layers=[
        DictionaryEncoding(vocabulary=encoding),
        WordEmbedding(
            vocabulary_size=vocabulary_size,
            dimension=embedding_dim,
            padding=encoding['PADDING'],
            E=E,
        )
    ])

    return txtnets_model
    tweet_model = CSM(
        layers=[
            # cpu.model.encoding.
            DictionaryEncoding(vocabulary=alphabet),

            # cpu.model.embedding.
            WordEmbedding(
                dimension=28,
                vocabulary_size=len(alphabet)),

            # HostToDevice(),

            SentenceConvolution(
                n_feature_maps=6,
                kernel_width=7,
                n_channels=1,
                n_input_dimensions=28),

            Bias(
                n_input_dims=28,
                n_feature_maps=6),

            SumFolding(),

            KMaxPooling(k=4, k_dynamic=0.5),

            Tanh(),

            SentenceConvolution(
                n_feature_maps=14,
                kernel_width=5,
                n_channels=6,
                n_input_dimensions=14),

            Bias(
                n_input_dims=14,
                n_feature_maps=14),

            SumFolding(),

            KMaxPooling(k=4),

            Tanh(),

            Softmax(
                n_classes=2,
                n_input_dimensions=392),
            ]
    )
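The final Softmax width of this tweet model can be checked against the earlier layers (a sketch, assuming each SumFolding halves the feature dimension, consistent with the second convolution's n_input_dimensions=14):

# 28 dims -> SumFolding -> 14 -> SumFolding -> 7; with 14 feature maps and k=4,
# the flattened Softmax input is 14 * 4 * 7 = 392.
assert 14 * 4 * (28 // 2 // 2) == 392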
def main():
    random.seed(34532)
    np.random.seed(675)
    np.set_printoptions(linewidth=100)

    data_dir = os.path.join("/users/mdenil/code/txtnets/txtnets_deployed/data", "stanfordmovie")


    trainer = Word2Vec(
        train=os.path.join(data_dir, "stanfordmovie.train.sentences.clean.projected.txt"),
        output="stanford-movie-vectors.bin",
        cbow=1,
        size=300,
        window=8,
        negative=25,
        hs=0,
        sample=1e-4,
        threads=20,
        binary=1,
        iter=15,
        min_count=1)

    trainer.train()

    gensim_model = gensim.models.Word2Vec.load_word2vec_format(
        "/users/mdenil/code/txtnets/txtnets_deployed/code/stanford-movie-vectors.bin",
        binary=True)

    # print(gensim_model.most_similar(["refund"]))
    # print(gensim_model.most_similar(["amazing"]))

    embedding_model = txtnets_model_from_gensim_word2vec(gensim_model)

    with open(os.path.join(data_dir, "stanfordmovie.train.sentences.clean.projected.flat.json")) as data_file:
        data = json.load(data_file)

    random.shuffle(data)
    X, Y = map(list, zip(*data))
    Y = [[":)", ":("].index(y) for y in Y]

    batch_size = 100
    n_validation = 500

    train_data_provider = LabelledSequenceMinibatchProvider(
        X=X[:-n_validation],
        Y=Y[:-n_validation],
        batch_size=batch_size,
        padding='PADDING')

    transformed_train_data_provider = TransformedLabelledDataProvider(
        data_source=train_data_provider,
        transformer=embedding_model)

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X[-n_validation:],
        Y=Y[-n_validation:],
        batch_size=batch_size,
        padding='PADDING')

    transformed_validation_data_provider = TransformedLabelledDataProvider(
        data_source=validation_data_provider,
        transformer=embedding_model)

    logistic_regression = CSM(
        layers=[
            Sum(axes=['w']),

            Softmax(
                n_input_dimensions=gensim_model.syn0.shape[1],
                n_classes=2)
        ]
    )

    cost_function = CrossEntropy()
    regularizer = L2Regularizer(lamb=1e-4)
    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=transformed_train_data_provider,
        regularizer=regularizer)
    update_rule = AdaGrad(
        gamma=0.1,
        model_template=logistic_regression)

    optimizer = SGD(
        model=logistic_regression,
        objective=objective,
        update_rule=update_rule)


    for batch_index, iteration_info in enumerate(optimizer):
        if batch_index % 100 == 0:
            # print(iteration_info['cost'])

            Y_hat = []
            Y_valid = []
            for _ in xrange(transformed_validation_data_provider.batches_per_epoch):
                X_valid_batch, Y_valid_batch, meta_valid = transformed_validation_data_provider.next_batch()
                Y_valid.append(get(Y_valid_batch))
                Y_hat.append(get(logistic_regression.fprop(X_valid_batch, meta=meta_valid)))
            Y_valid = np.concatenate(Y_valid, axis=0)
            Y_hat = np.concatenate(Y_hat, axis=0)

            acc = np.mean(np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))

            print("B: {}, A: {}, C: {}".format(
                batch_index,
                acc,
                iteration_info['cost']))

            with open("model_w2vec_logreg.pkl", 'w') as model_file:
                pickle.dump(embedding_model.move_to_cpu(), model_file, protocol=-1)
                pickle.dump(logistic_regression.move_to_cpu(), model_file, protocol=-1)
    model = CSM(
        layers=[
            WordEmbedding(
                dimension=embedding_dimension,
                vocabulary_size=len(alphabet)),

            SentenceConvolution(
                n_feature_maps=n_feature_maps,
                kernel_width=kernel_width,
                n_channels=1,
                n_input_dimensions=embedding_dimension),

            SumFolding(),

            KMaxPooling(k=pooling_size),

            # Bias(
            #     n_input_dims=embedding_dimension / 2,
            #     n_feature_maps=n_feature_maps),

            Linear(
                n_input=n_feature_maps*pooling_size*embedding_dimension / 2,
                n_output=64
            ),

            Tanh(),

            Linear(
                n_output=1,
                n_input=64),
        ]
    )
Example 10
    vocabulary_size = len(alphabet)
    n_feature_maps = 8
    kernel_width = 5
    pooling_size = 2

    n_epochs = 1

    model = CSM(layers=[
        WordEmbedding(dimension=embedding_dimension,
                      vocabulary_size=len(alphabet)),
        SentenceConvolution(n_feature_maps=n_feature_maps,
                            kernel_width=kernel_width,
                            n_channels=1,
                            n_input_dimensions=embedding_dimension),
        SumFolding(),
        KMaxPooling(k=pooling_size),

        # Bias(
        #     n_input_dims=embedding_dimension / 2,
        #     n_feature_maps=n_feature_maps),
        Linear(n_input=n_feature_maps * pooling_size * embedding_dimension / 2,
               n_output=64),
        Tanh(),
        Linear(n_output=1, n_input=64),
    ])

    print model

    cost_function = LargeMarginCost(0.1)
    noise_model = RandomAlphabetCorruption(alphabet)

    objective = NoiseContrastiveObjective(cost=cost_function,
Example 11
model = CSM(layers=[
    DictionaryEncoding(vocabulary=encoding),
    WordEmbedding(dimension=20,
                  vocabulary_size=len(encoding),
                  padding=encoding['PADDING']),
    Dropout(('b', 'w', 'f'), 0.2),
    SentenceConvolution(n_feature_maps=10,
                        kernel_width=15,
                        n_channels=20,
                        n_input_dimensions=1),
    Bias(n_input_dims=1, n_feature_maps=10),
    KMaxPooling(k=7, k_dynamic=0.5),
    Tanh(),
    SentenceConvolution(n_feature_maps=30,
                        kernel_width=9,
                        n_channels=10,
                        n_input_dimensions=1),
    Bias(n_input_dims=1, n_feature_maps=30),
    KMaxPooling(k=5),
    Tanh(),
    ReshapeForDocuments(),
    SentenceConvolution(n_feature_maps=20,
                        kernel_width=11,
                        n_channels=30 * 5,
                        n_input_dimensions=1),
    Bias(n_input_dims=1, n_feature_maps=20),
    KMaxPooling(k=5),
    Tanh(),
    Dropout(('b', 'd', 'f', 'w'), 0.5),
    Softmax(n_classes=2, n_input_dimensions=100),
])
Example 12
def main():
    random.seed(34532)
    np.random.seed(675)
    np.set_printoptions(linewidth=100)

    data_dir = os.path.join("/users/mdenil/code/txtnets/txtnets_deployed/data",
                            "stanfordmovie")

    trainer = Word2Vec(train=os.path.join(
        data_dir, "stanfordmovie.train.sentences.clean.projected.txt"),
                       output="stanford-movie-vectors.bin",
                       cbow=1,
                       size=300,
                       window=8,
                       negative=25,
                       hs=0,
                       sample=1e-4,
                       threads=20,
                       binary=1,
                       iter=15,
                       min_count=1)

    trainer.train()

    gensim_model = gensim.models.Word2Vec.load_word2vec_format(
        "/users/mdenil/code/txtnets/txtnets_deployed/code/stanford-movie-vectors.bin",
        binary=True)

    # print(gensim_model.most_similar(["refund"]))
    # print(gensim_model.most_similar(["amazing"]))

    embedding_model = txtnets_model_from_gensim_word2vec(gensim_model)

    with open(
            os.path.join(
                data_dir,
                "stanfordmovie.train.sentences.clean.projected.flat.json")
    ) as data_file:
        data = json.load(data_file)

    random.shuffle(data)
    X, Y = map(list, zip(*data))
    Y = [[":)", ":("].index(y) for y in Y]

    batch_size = 100
    n_validation = 500

    train_data_provider = LabelledSequenceMinibatchProvider(
        X=X[:-n_validation],
        Y=Y[:-n_validation],
        batch_size=batch_size,
        padding='PADDING')

    transformed_train_data_provider = TransformedLabelledDataProvider(
        data_source=train_data_provider, transformer=embedding_model)

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X[-n_validation:],
        Y=Y[-n_validation:],
        batch_size=batch_size,
        padding='PADDING')

    transformed_validation_data_provider = TransformedLabelledDataProvider(
        data_source=validation_data_provider, transformer=embedding_model)

    logistic_regression = CSM(layers=[
        Sum(axes=['w']),
        Softmax(n_input_dimensions=gensim_model.syn0.shape[1], n_classes=2)
    ])

    cost_function = CrossEntropy()
    regularizer = L2Regularizer(lamb=1e-4)
    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=transformed_train_data_provider,
        regularizer=regularizer)
    update_rule = AdaGrad(gamma=0.1, model_template=logistic_regression)

    optimizer = SGD(model=logistic_regression,
                    objective=objective,
                    update_rule=update_rule)

    for batch_index, iteration_info in enumerate(optimizer):
        if batch_index % 100 == 0:
            # print(iteration_info['cost'])

            Y_hat = []
            Y_valid = []
            for _ in xrange(
                    transformed_validation_data_provider.batches_per_epoch):
                X_valid_batch, Y_valid_batch, meta_valid = transformed_validation_data_provider.next_batch(
                )
                Y_valid.append(get(Y_valid_batch))
                Y_hat.append(
                    get(
                        logistic_regression.fprop(X_valid_batch,
                                                  meta=meta_valid)))
            Y_valid = np.concatenate(Y_valid, axis=0)
            Y_hat = np.concatenate(Y_hat, axis=0)

            acc = np.mean(
                np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))

            print("B: {}, A: {}, C: {}".format(batch_index, acc,
                                               iteration_info['cost']))

            with open("model_w2vec_logreg.pkl", 'w') as model_file:
                pickle.dump(embedding_model.move_to_cpu(),
                            model_file,
                            protocol=-1)
                pickle.dump(logistic_regression.move_to_cpu(),
                            model_file,
                            protocol=-1)
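Since the loop above dumps two objects into the same pickle file, reading them back later would look roughly like this (a sketch; sequential pickle.load calls return objects in the order they were dumped, first the embedding CSM and then the logistic regression CSM):

import pickle

# Restore the saved models for later evaluation, in dump order.
with open("model_w2vec_logreg.pkl", 'rb') as model_file:
    embedding_model = pickle.load(model_file)
    logistic_regression = pickle.load(model_file)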
Example 13
    ## BUILD THE MODEL

    model = CSM(
        layers=[
            WordEmbedding(dimension=embedding_dimension,
                          vocabulary_size=vocabulary_size),
            SentenceConvolution(n_feature_maps=n_feature_maps,
                                kernel_width=kernel_width,
                                n_channels=1,
                                n_input_dimensions=embedding_dimension),
            SumFolding(),
            KMaxPooling(k=pooling_size * 2),
            Bias(n_input_dims=embedding_dimension / 2,
                 n_feature_maps=n_feature_maps),
            Tanh(),

            # Softmax(
            #     n_classes=n_classes,
            #     n_input_dimensions=420),
            SentenceConvolution(n_feature_maps=n_feature_maps,
                                kernel_width=3,
                                n_channels=n_feature_maps,
                                n_input_dimensions=embedding_dimension / 2),
            KMaxPooling(k=pooling_size),
            Bias(n_input_dims=embedding_dimension / 2,
                 n_feature_maps=n_feature_maps),
            Tanh(),
            Softmax(n_classes=n_classes, n_input_dimensions=420),
        ], )

    print model
Example 14
    model = CSM(
        layers=[
            WordEmbedding(
                dimension=embedding_dimension,
                vocabulary_size=vocabulary_size),

            SentenceConvolution(
                n_feature_maps=n_feature_maps,
                kernel_width=kernel_width,
                n_channels=1,
                n_input_dimensions=embedding_dimension),

            # KMaxPooling(k=pooling_size),

            # TODO: make a bias that runs along the w dimension
            Bias(
                n_input_dims=embedding_dimension,
                n_feature_maps=n_feature_maps),

            MaxFolding(),

            SentenceConvolution(
                n_feature_maps=3,
                kernel_width=5,
                n_channels=n_feature_maps,
                n_input_dimensions=embedding_dimension / 2),


            MaxFolding(),
            MaxFolding(),

            Softmax(
                n_classes=n_classes,
                n_input_dimensions=3*(context_length + kernel_width - 1 + 5 - 1)*embedding_dimension / 8),
            ]
    )
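The Softmax width expression above unpacks as follows (a sketch, assuming a wide convolution of a length-L input with a width-w kernel yields L + w - 1 positions, which is what the expression implies):

# The first conv extends the length to context_length + kernel_width - 1; the
# second (width 5) adds another 5 - 1. Three MaxFoldings divide the embedding
# dimension by 8, and the final conv has 3 feature maps.
length = context_length + kernel_width - 1 + 5 - 1
softmax_inputs = 3 * length * embedding_dimension // 8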
Example 15
    model = CSM(
        layers=[
            DictionaryEncoding(vocabulary=encoding),

            WordEmbedding(
                dimension=20,
                vocabulary_size=len(encoding),
                padding=encoding['PADDING']),

            Dropout(('b', 'w', 'f'), 0.2),

            SentenceConvolution(
                n_feature_maps=10,
                kernel_width=15,
                n_channels=20,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=10),

            KMaxPooling(k=7, k_dynamic=0.5),

            Tanh(),

            SentenceConvolution(
                n_feature_maps=30,
                kernel_width=9,
                n_channels=10,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=30),

            KMaxPooling(k=5),

            Tanh(),

            ReshapeForDocuments(),

            SentenceConvolution(
                n_feature_maps=20,
                kernel_width=11,
                n_channels=30*5,
                n_input_dimensions=1),

            Bias(
                n_input_dims=1,
                n_feature_maps=20),

            KMaxPooling(k=5),

            Tanh(),

            Dropout(('b', 'd', 'f', 'w'), 0.5),

            Softmax(
                n_classes=2,
                n_input_dimensions=100),
            ]
    )
Example 16
    train_data_provider = LabelledSequenceMinibatchProvider(X=X[:-500],
                                                            Y=Y[:-500],
                                                            batch_size=100)

    print train_data_provider.batches_per_epoch

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X[-500:], Y=Y[-500:], batch_size=500)

    word_embedding_model = CSM(layers=[
        WordEmbedding(  # really a character embedding
            dimension=16,
            vocabulary_size=len(alphabet)),
        SentenceConvolution(n_feature_maps=10,
                            kernel_width=5,
                            n_channels=1,
                            n_input_dimensions=16),
        SumFolding(),
        KMaxPooling(k=2),
        MaxFolding(),
        Tanh(),
    ])

    word_embedding = WordFromCharacterEmbedding(
        embedding_model=word_embedding_model, alphabet_encoding=alphabet)

    # print word_embedding.fprop(X, meta)

    # tweet_model = CSM(layers=[
    #     word_embedding,
    #     SentenceConvolution(n_feature_maps=5,
    #         ]
    # )

    tweet_model = CSM(layers=[
        # cpu.model.encoding.
        DictionaryEncoding(vocabulary=alphabet),

        # cpu.model.embedding.
        WordEmbedding(dimension=28, vocabulary_size=len(alphabet)),

        # HostToDevice(),
        SentenceConvolution(n_feature_maps=6,
                            kernel_width=7,
                            n_channels=1,
                            n_input_dimensions=28),
        Bias(n_input_dims=28, n_feature_maps=6),
        SumFolding(),
        KMaxPooling(k=4, k_dynamic=0.5),
        Tanh(),
        SentenceConvolution(n_feature_maps=14,
                            kernel_width=5,
                            n_channels=6,
                            n_input_dimensions=14),
        Bias(n_input_dims=14, n_feature_maps=14),
        SumFolding(),
        KMaxPooling(k=4),
        Tanh(),
        Softmax(n_classes=2, n_input_dimensions=392),
    ])

    print tweet_model