Example #1
def build_model_and_objective(n_classes, n_input_dimensions, X, Y):
    model = CSM(
        layers=[
            Softmax(
                n_classes=n_classes,
                n_input_dimensions=n_input_dimensions),
        ],
        )

    lengths = np.zeros(X.shape[0])
    data_provider = BatchDataProvider(
        X=X,
        Y=Y,
        lengths=lengths)

    cost_function = CrossEntropy()

    objective = CostMinimizationObjective(cost=cost_function, data_provider=data_provider)

    update_rule = AdaGrad(
        gamma=0.1,
        model_template=model)

    optimizer = SGD(model=model, objective=objective, update_rule=update_rule)

    return model, objective, optimizer, data_provider
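
The returned optimizer is iterable; each step yields an iteration_info dict with at least a 'cost' entry (the same pattern appears in the later examples on this page). A minimal driver sketch, assuming one-hot labels and illustrative array shapes; none of these values come from the original code:

# Hypothetical driver for build_model_and_objective.  The shapes, the one-hot
# label encoding, and the stopping point are illustrative assumptions.
import numpy as np

X = np.random.randn(100, 50)                      # 100 examples, 50 input dimensions
Y = np.zeros((100, 2))
Y[np.arange(100), np.random.randint(2, size=100)] = 1

model, objective, optimizer, data_provider = build_model_and_objective(
    n_classes=2, n_input_dimensions=50, X=X, Y=Y)

costs = []
for batch_index, iteration_info in enumerate(optimizer):
    costs.append(iteration_info['cost'])
    if batch_index >= 10:
        break
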
Example #2
def main():
    random.seed(665243)
    np.random.seed(61734)
    np.set_printoptions(linewidth=100)

    parser = argparse.ArgumentParser(description="Create summaries from w2vec model.")
    parser.add_argument('--size', type=int, help="number of sentences to keep")
    args = parser.parse_args()

    data_dir = os.path.join("/users/mdenil/code/txtnets/txtnets_deployed/data", "stanfordmovie")


    with open("model_w2vec_logreg.pkl") as model_file:
        embedding_model = pickle.load(model_file)
        logistic_regression = pickle.load(model_file)


    with open(os.path.join(data_dir, "stanfordmovie.test.sentences.clean.projected.json")) as data_file:
        data = json.load(data_file)

    # random.shuffle(data)
    X, Y = map(list, zip(*data))
    Y = [[":)", ":("].index(y) for y in Y]

    objective = CrossEntropy()

    test_data_provider = LabelledDocumentMinibatchProvider(
        X=X,
        Y=Y,
        batch_size=1,
        padding=None,
        shuffle=False)

    prog_bar = pyprind.ProgBar(test_data_provider.batches_per_epoch)

    summaries = []

    for _ in range(test_data_provider.batches_per_epoch):
        x_batch, y_batch, meta_batch = test_data_provider.next_batch()
        label = [":)", ":("][int(y_batch[0,1])]

        sentence_importance_scores = get_sentence_importance_scores(
            embedding_model, logistic_regression, x_batch)

        most_important_sentence_indexes = np.argsort(sentence_importance_scores)
        most_important_sentence_indexes = most_important_sentence_indexes[:args.size]
        most_important_sentence_indexes.sort()

        summary = []
        for i in most_important_sentence_indexes:
            summary.append(x_batch[i])

        summaries.append([summary, label])

        prog_bar.update()


    with open("summaries_{}.json".format(args.size), 'w') as summaries_file:
        json.dump(summaries, summaries_file)
        summaries_file.write("\n")
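
The output file summaries_{size}.json holds a list of [summary, label] pairs, where each summary is the list of sentences kept from one review and label is ":)" or ":(". A small sketch of reading it back, assuming the script was run with --size 3 (the value is a placeholder):

# Hypothetical consumer of the output; "summaries_3.json" assumes --size 3 was used.
import json

with open("summaries_3.json") as summaries_file:
    summaries = json.load(summaries_file)

for summary, label in summaries[:5]:
    print("{}: {} sentences kept".format(label, len(summary)))
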
Example #3
def get_sentence_importance_scores(embedding_model, logistic_regression, x):
    objective = CrossEntropy()

    x_combined = [w for s in x for w in s]

    meta_combined = {
        'lengths': np.asarray([len(x_combined)]),
        'space_below': cpu.space.CPUSpace(
            axes=('b', 'w'),
            extents={'b': 1, 'w': len(x_combined)})
        }

    x_combined = np.asarray(x_combined).reshape((1, -1))

    embeddings, embeddings_meta, embeddings_state = embedding_model.fprop(
        x_combined, meta=dict(meta_combined), return_state=True)
    embeddings_meta['space_below'] = embeddings_meta['space_above']
    y_hat, y_hat_meta, log_reg_state = logistic_regression.fprop(
        embeddings, meta=dict(embeddings_meta), return_state=True)
    y_hat_meta['space_below'] = y_hat_meta['space_above']

    loss, loss_meta, loss_state = objective.fprop(
        y_hat, max_error_label(y_hat), meta=dict(y_hat_meta))

    delta, delta_meta = objective.bprop(
        y_hat, max_error_label(y_hat), meta=dict(loss_meta), fprop_state=loss_state)

    delta = logistic_regression.bprop(
        delta, meta=dict(delta_meta), fprop_state=log_reg_state)

    C = combiner_matrix(map(len, x))

    sentence_delta = np.dot(delta, C)
    sentence_embedding = np.dot(embeddings, C)

    # normalize for cosine distance
    sentence_delta /= np.sqrt(np.sum(sentence_delta**2, axis=1, keepdims=True))
    sentence_embedding /= np.sqrt(np.sum(sentence_embedding**2, axis=1, keepdims=True))

    sentence_importance_scores = np.abs(np.sum(sentence_delta * sentence_embedding, axis=0))

    return sentence_importance_scores
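
combiner_matrix is not defined on this page. From its use above (np.dot(delta, C) and np.dot(embeddings, C), built from the per-sentence lengths of x), it plausibly constructs a words-by-sentences indicator matrix that sums word-level columns into sentence-level ones. A sketch of that idea, offered as an assumption rather than the original helper:

# Hypothetical stand-in for combiner_matrix: column j selects the word
# positions belonging to sentence j, so a dot product sums word-level
# quantities into sentence-level ones.  Not the original implementation.
import numpy as np

def combiner_matrix(sentence_lengths):
    sentence_lengths = list(sentence_lengths)
    n_words = sum(sentence_lengths)
    C = np.zeros((n_words, len(sentence_lengths)))
    start = 0
    for j, length in enumerate(sentence_lengths):
        C[start:start + length, j] = 1.0
        start += length
    return C
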
def get_model_output(model, X, Y):
    # Initialize the data provider
    data_provider = cpu.optimize.data_provider.LabelledSequenceBatchProvider(
        X=X, Y=Y, padding='PADDING')


    # Define the cost function
    cEntr = CrossEntropy()

    # Get data and use the model to predict
    X, Y, meta = data_provider.next_batch()
    Y_hat, meta, model_state = model.fprop(X, meta=meta, return_state=True)

    # Create a Y that maximizes the model's error
    Y_inverted = enforce_error(Y_hat)

    # Bookkeep the spaces and bprop to get the deltas
    meta['space_below'] = meta['space_above']
    cost, meta, cost_state = cEntr.fprop(Y_hat, Y_inverted, meta=meta)
    delta, meta = cEntr.bprop(Y_hat, Y_inverted, meta=meta, fprop_state=cost_state)
    delta, meta = model.bprop(delta, meta=meta, fprop_state=model_state, return_state=True, num_layers=-1)
    delta, space = meta['space_below'].transform(delta, ('b', 'w'))

    return Y_hat, Y_inverted, delta
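
Neither max_error_label (Example #3) nor enforce_error (above) is defined on this page; both appear to build a target that maximizes the cross-entropy of the current prediction so that bprop yields large, informative deltas. A minimal sketch of that idea, offered as an assumption rather than the original helpers:

# Hypothetical sketch of an error-maximizing one-hot target: put all the label
# mass on the class the model currently considers least likely.  This is a
# guess at the intent, not the original enforce_error / max_error_label.
import numpy as np

def enforce_error(Y_hat):
    worst = np.argmin(Y_hat, axis=1)              # least probable class per row
    Y_inverted = np.zeros_like(Y_hat)
    Y_inverted[np.arange(Y_hat.shape[0]), worst] = 1.0
    return Y_inverted
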
Example #6
        # Tanh(),
        MaxFolding(),
        Softmax(n_classes=2, n_input_dimensions=700),
    ])

    print tweet_model

    # X, Y, meta = train_data_provider.next_batch()
    # Y, meta, fprop_state = model.fprop(X, meta, return_state=True)

    # print meta['lengths']
    # print Y.shape, meta['space_above']

    # print [p.shape for p in model.params()]

    cost_function = CrossEntropy()

    objective = CostMinimizationObjective(cost=cost_function,
                                          data_provider=train_data_provider)

    update_rule = AdaGrad(gamma=0.1, model_template=tweet_model)

    optimizer = SGD(model=tweet_model,
                    objective=objective,
                    update_rule=update_rule)

    n_epochs = 1
    n_batches = train_data_provider.batches_per_epoch * n_epochs

    costs = []
    prev_weights = tweet_model.pack()
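
The snippet above is cut off before its training loop. A hedged continuation, following the optimizer-iteration pattern of Examples #7 and #8; the weight-change check is an assumption suggested by prev_weights, not part of the original:

    # Hypothetical continuation of the truncated snippet above.
    for batch_index, iteration_info in enumerate(optimizer):
        costs.append(iteration_info['cost'])

        if batch_index % 10 == 0:
            current_weights = tweet_model.pack()
            weight_change = np.mean(np.abs(current_weights - prev_weights))
            prev_weights = current_weights
            print "B: {}, C: {}, dW: {}".format(
                batch_index, iteration_info['cost'], weight_change)

        if batch_index >= n_batches:
            break
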
Example #7
def run():

    with open("{{train_data_json}}") as data_file:
        data = json.load(data_file)
        random.shuffle(data)
        X, Y = map(list, zip(*data))
        Y = [[":)", ":("].index(y) for y in Y]

    with open("{{train_encoding_json}}") as encoding_file:
        encoding = json.load(encoding_file)

    n_validation = {{n_validation}}
    batch_size = {{batch_size}}

    train_data_provider = LabelledDocumentMinibatchProvider(
        X=X[:-n_validation],
        Y=Y[:-n_validation],
        batch_size=batch_size,
        padding='PADDING',
        fixed_n_sentences={{fixed_n_sentences}},
        fixed_n_words={{fixed_n_words}})

    print train_data_provider.batches_per_epoch

    validation_data_provider = LabelledDocumentMinibatchProvider(
        X=X[-n_validation:],
        Y=Y[-n_validation:],
        batch_size=batch_size,
        padding='PADDING',
        fixed_n_sentences={{fixed_n_sentences}},
        fixed_n_words={{fixed_n_words}})

    model = experiment_config.get_model(encoding)

    print model

    cost_function = CrossEntropy()

    regularizer = L2Regularizer(lamb={{regularizer}})

    objective = CostMinimizationObjective(cost=cost_function,
                                          data_provider=train_data_provider,
                                          regularizer=regularizer)

    update_rule = AdaGrad(gamma={{adagrad_gamma}}, model_template=model)

    optimizer = SGD(model=model, objective=objective, update_rule=update_rule)

    n_epochs = {{n_epochs}}
    n_batches = train_data_provider.batches_per_epoch * n_epochs

    time_start = time.time()

    best_acc = -1.0

    progress = []

    for batch_index, iteration_info in enumerate(optimizer):
        if batch_index % {{validation_frequency}} == 0:

            model_nodropout = cpu.model.dropout.remove_dropout(model)
            Y_hat = []
            Y_valid = []
            for _ in xrange(validation_data_provider.batches_per_epoch):
                X_valid_batch, Y_valid_batch, meta_valid = validation_data_provider.next_batch()
                Y_valid.append(Y_valid_batch)
                Y_hat.append(
                    model_nodropout.fprop(X_valid_batch, meta=meta_valid))
            Y_valid = np.concatenate(Y_valid, axis=0)
            Y_hat = np.concatenate(Y_hat, axis=0)
            assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

            acc = np.mean(
                np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))

            if acc > best_acc:
                best_acc = acc
                with open(os.path.join("{{job_dir}}", "model_best.pkl"),
                          'w') as model_file:
                    pickle.dump(model, model_file, protocol=-1)

            if batch_index % {{save_frequency}} == 0:
                with open(
                        os.path.join("{{job_dir}}",
                                     "model_{:05}.pkl".format(batch_index)),
                        'w') as model_file:
                    pickle.dump(model, model_file, protocol=-1)

            print "B: {}, A: {}, C: {}, Prop1: {}, Param size: {}, best: {}".format(
                batch_index, acc, iteration_info['cost'],
                np.argmax(Y_hat, axis=1).mean(), np.mean(np.abs(model.pack())),
                best_acc)

            time_now = time.time()

            examples_per_hr = (batch_index * batch_size) / (time_now -
                                                            time_start) * 3600

            progress.append({
                'batch': batch_index,
                'validation_accuracy': acc,
                'best_validation_accuracy': best_acc,
                'cost': iteration_info['cost'],
                'examples_per_hr': examples_per_hr,
            })

            with open(os.path.join("{{job_dir}}", "progress.pkl"),
                      'w') as progress_file:
                pickle.dump(progress, progress_file, protocol=-1)

        if batch_index >= n_batches:
            break

    time_end = time.time()

    print "Time elapsed: {}s".format(time_end - time_start)
Example #8
def main():
    random.seed(34532)
    np.random.seed(675)
    np.set_printoptions(linewidth=100)

    data_dir = os.path.join("/users/mdenil/code/txtnets/txtnets_deployed/data",
                            "stanfordmovie")

    trainer = Word2Vec(
        train=os.path.join(data_dir, "stanfordmovie.train.sentences.clean.projected.txt"),
        output="stanford-movie-vectors.bin",
        cbow=1,
        size=300,
        window=8,
        negative=25,
        hs=0,
        sample=1e-4,
        threads=20,
        binary=1,
        iter=15,
        min_count=1)

    trainer.train()

    gensim_model = gensim.models.Word2Vec.load_word2vec_format(
        "/users/mdenil/code/txtnets/txtnets_deployed/code/stanford-movie-vectors.bin",
        binary=True)

    # print(gensim_model.most_similar(["refund"]))
    # print(gensim_model.most_similar(["amazing"]))

    embedding_model = txtnets_model_from_gensim_word2vec(gensim_model)

    data_path = os.path.join(data_dir, "stanfordmovie.train.sentences.clean.projected.flat.json")
    with open(data_path) as data_file:
        data = json.load(data_file)

    random.shuffle(data)
    X, Y = map(list, zip(*data))
    Y = [[":)", ":("].index(y) for y in Y]

    batch_size = 100
    n_validation = 500

    train_data_provider = LabelledSequenceMinibatchProvider(
        X=X[:-n_validation],
        Y=Y[:-n_validation],
        batch_size=batch_size,
        padding='PADDING')

    transformed_train_data_provider = TransformedLabelledDataProvider(
        data_source=train_data_provider, transformer=embedding_model)

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X[-n_validation:],
        Y=Y[-n_validation:],
        batch_size=batch_size,
        padding='PADDING')

    transformed_validation_data_provider = TransformedLabelledDataProvider(
        data_source=validation_data_provider, transformer=embedding_model)

    logistic_regression = CSM(layers=[
        Sum(axes=['w']),
        Softmax(n_input_dimensions=gensim_model.syn0.shape[1], n_classes=2)
    ])

    cost_function = CrossEntropy()
    regularizer = L2Regularizer(lamb=1e-4)
    objective = CostMinimizationObjective(
        cost=cost_function,
        data_provider=transformed_train_data_provider,
        regularizer=regularizer)
    update_rule = AdaGrad(gamma=0.1, model_template=logistic_regression)

    optimizer = SGD(model=logistic_regression,
                    objective=objective,
                    update_rule=update_rule)

    for batch_index, iteration_info in enumerate(optimizer):
        if batch_index % 100 == 0:
            # print(iteration_info['cost'])

            Y_hat = []
            Y_valid = []
            for _ in xrange(
                    transformed_validation_data_provider.batches_per_epoch):
                X_valid_batch, Y_valid_batch, meta_valid = transformed_validation_data_provider.next_batch()
                Y_valid.append(get(Y_valid_batch))
                Y_hat.append(
                    get(
                        logistic_regression.fprop(X_valid_batch,
                                                  meta=meta_valid)))
            Y_valid = np.concatenate(Y_valid, axis=0)
            Y_hat = np.concatenate(Y_hat, axis=0)

            acc = np.mean(
                np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))

            print("B: {}, A: {}, C: {}".format(batch_index, acc,
                                               iteration_info['cost']))

            with open("model_w2vec_logreg.pkl", 'w') as model_file:
                pickle.dump(embedding_model.move_to_cpu(),
                            model_file,
                            protocol=-1)
                pickle.dump(logistic_regression.move_to_cpu(),
                            model_file,
                            protocol=-1)
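
Note that the two pickle.dump calls above write the embedding model and the logistic regression sequentially into a single file, so the dump order is part of the file format: Example #2 reads model_w2vec_logreg.pkl back with two pickle.load calls in exactly the same order.
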
def optimize_and_save(model, alphabet, n_batches, data_file_name,
                      chars_or_words, result_file_name):

    print result_file_name

    with gzip.open(data_file_name) as data_file:
        data = json.loads(data_file.read())
        X, Y = map(list, zip(*data))

        # shuffle
        combined = zip(X, Y)
        random.shuffle(combined)
        X, Y = map(list, zip(*combined))

        # map labels to something useful
        Y = [[":)", ":("].index(y) for y in Y]

    if chars_or_words == 'chars':
        X = [list(x) for x in X]
    elif chars_or_words == 'words':
        # replace unknowns with an unknown character
        tokenizer = WordPunctTokenizer()
        new_X = []
        for x in X:
            new_X.append([
                w if w in alphabet else 'UNKNOWN'
                for w in tokenizer.tokenize(x)
            ])
        X = new_X
    else:
        raise ValueError("I don't know what that means :(")

    train_data_provider = LabelledSequenceMinibatchProvider(X=X[:-500],
                                                            Y=Y[:-500],
                                                            batch_size=50,
                                                            padding='PADDING')

    validation_data_provider = LabelledSequenceMinibatchProvider(
        X=X[-500:], Y=Y[-500:], batch_size=500, padding='PADDING')

    cost_function = CrossEntropy()

    objective = CostMinimizationObjective(cost=cost_function,
                                          data_provider=train_data_provider)

    update_rule = AdaGrad(gamma=0.05, model_template=model)

    regularizer = L2Regularizer(lamb=1e-4)

    optimizer = SGD(model=model,
                    objective=objective,
                    update_rule=update_rule,
                    regularizer=regularizer)

    print model

    monitor_info = []
    iteration_info = []
    for batch_index, info in enumerate(optimizer):
        iteration_info.append(info)

        if batch_index % 10 == 0:
            X_valid, Y_valid, meta_valid = validation_data_provider.next_batch()

            Y_hat = model.fprop(X_valid, meta=meta_valid)
            assert np.all(np.abs(Y_hat.sum(axis=1) - 1) < 1e-6)

            acc = np.mean(
                np.argmax(Y_hat, axis=1) == np.argmax(Y_valid, axis=1))
            prop_1 = np.argmax(Y_hat, axis=1).mean()

            monitor_info.append({
                'batch_index': batch_index,
                'acc': acc,
                'prop_1': prop_1,
            })

            print "B: {}, A: {}, C: {}, Prop1: {}, Param size: {}".format(
                batch_index, acc, info['cost'], prop_1,
                np.mean(np.abs(model.pack())))

        if batch_index == n_batches - 1:
            break

    result = {
        'model': model,
        'iteration_info': iteration_info,
        'monitor_info': monitor_info,
    }

    with open(result_file_name, 'w') as result_file:
        pickle.dump(result, result_file, protocol=-1)
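
For completeness, a hedged sketch of how optimize_and_save might be invoked. Every value below is a placeholder, and a practical model would need a fuller layer stack than the single Softmax shown; the constructor pattern is borrowed from Examples #1 and #8:

# Hypothetical invocation of optimize_and_save; all arguments are stand-ins.
model = CSM(layers=[
    Softmax(n_classes=2, n_input_dimensions=300),   # a real model would add embedding/conv layers
])
alphabet = set(["the", "movie", "was", "great", "awful", "PADDING", "UNKNOWN"])

optimize_and_save(
    model=model,
    alphabet=alphabet,
    n_batches=1000,
    data_file_name="sentiment_data.json.gz",        # placeholder file name
    chars_or_words='words',
    result_file_name="result_words.pkl")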