Example 1
def main():
    import tensorflow as tf
    from data import processing
    # gen_data, generate_batch and word2vec are assumed to be defined
    # elsewhere in this module.
    batch_size = 128
    embedding_size = 128  # Dimension of the embedding vector.
    skip_window = 1  # How many words to consider left and right.
    num_skips = 2  # How many times to reuse an input to generate a label.
    num_sampled = 64 * 10  # Number of negative examples to sample for NCE.
    num_epochs = 4
    filename = '/home/dlian/data/location_prediction/gowalla/Gowalla_totalCheckins.txt'
    loc_seq_index = processing(filename)
    data, vocabulary_size = gen_data(loc_seq_index[:1])

    train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
    loss, embed = word2vec(train_inputs, train_labels, embedding_size,
                           num_sampled, vocabulary_size)
    train_op = tf.train.AdagradOptimizer(1.0).minimize(loss)

    num_steps = num_epochs * len(data) // batch_size

    saver = tf.train.Saver({embed.name: embed})
    with tf.Session() as session:
        # We must initialize all variables before we use them.
        tf.global_variables_initializer().run()
        print('Initialized')

        average_loss = 0
        for step in range(num_steps):
            batch_inputs, batch_labels = generate_batch(
                data, batch_size, num_skips, skip_window)
            feed_dict = {
                train_inputs: batch_inputs,
                train_labels: batch_labels
            }
            _, loss_val = session.run([train_op, loss], feed_dict=feed_dict)
            average_loss += loss_val

            if step % 2000 == 0:
                if step > 0:
                    average_loss /= 2000
                # The average loss is an estimate of the loss over the last 2000 batches.
                print('Average loss at step ', step, ' out of ', num_steps,
                      ' : ', average_loss)
                average_loss = 0

        saver.save(
            session,
            '/home/dlian/data/location_prediction/gowalla/logdir/model.ckpt')
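The word2vec helper is not shown in these examples. A minimal sketch of what it presumably builds, modeled on the classic TensorFlow skip-gram tutorial (every name and initialization below is an assumption, not the author's code):

def word2vec(train_inputs, train_labels, embedding_size, num_sampled,
             vocabulary_size):
    # One embedding row per location id, looked up for each input in the batch.
    embeddings = tf.Variable(
        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
    embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Output-layer weights and biases for the NCE objective.
    nce_weights = tf.Variable(
        tf.truncated_normal([vocabulary_size, embedding_size],
                            stddev=embedding_size ** -0.5))
    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # NCE loss contrasts each true label against num_sampled random negatives.
    loss = tf.reduce_mean(
        tf.nn.nce_loss(weights=nce_weights,
                       biases=nce_biases,
                       labels=train_labels,
                       inputs=embed,
                       num_sampled=num_sampled,
                       num_classes=vocabulary_size))
    return loss, embeddings

Returning the embedding variable itself (rather than the lookup tensor) is what lets the Saver above checkpoint it.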
Example 2
def main():
    import tensorflow as tf
    from data import processing
    # get_test, prepare_batches, classifier_seq and get_batch are assumed
    # to be defined elsewhere in this module.
    filename = '/home/dlian/data/location_prediction/gowalla/Gowalla_totalCheckins.txt'
    loc_seq_index = processing(filename)
    loc_seq_index = loc_seq_index[:1000]
    num_locations = max(l for (u, time_loc) in loc_seq_index
                        for t, l in time_loc) + 1
    print('{0} locations, {1} users'.format(num_locations, len(loc_seq_index)))
    batch_size = 32
    max_seq_len = 10
    epochs = 50
    embedding_size = 50
    learning_rate = 0.1
    print(
        'embed_size:{0}, max sequence length:{1}, batch size:{2}, learn_rate:{3}'
        .format(embedding_size, max_seq_len, batch_size, learning_rate))
    ratio = 0.8
    test = get_test(loc_seq_index, max_seq_len, ratio)
    batches = prepare_batches(loc_seq_index, num_locations - 1, batch_size,
                              max_seq_len, ratio)

    seq_input = tf.placeholder(tf.int32, shape=[None, None], name='input_seq')
    class_output = tf.placeholder(tf.int32, shape=[None], name='output_class')
    seq_len = tf.placeholder(tf.int32, shape=[None], name='sequence_length')
    weight_mask = tf.placeholder(tf.float32,
                                 shape=[None, None],
                                 name='weight_mask')
    keep_prob = tf.placeholder(tf.float32)
    loss, acc_op, pred_top_op = classifier_seq(seq=seq_input,
                                               labels=class_output,
                                               weight_mask=weight_mask,
                                               num_loc=num_locations,
                                               embed_size=embedding_size,
                                               seq_len=seq_len,
                                               k=50,
                                               num_samples=-1,
                                               keep_prob=keep_prob)
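    # num_samples=-1 presumably selects a full softmax rather than a sampled
    # loss, and k=50 looks like the cutoff for the top-k prediction and
    # accuracy ops; both readings are assumptions about classifier_seq.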

    train_op = tf.train.AdagradOptimizer(
        learning_rate=learning_rate).minimize(loss)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        total_loss = 0
        for epoch in range(epochs):
            for batch_index in range(len(batches)):
                batch = batches[batch_index]
                X, Y, length, weight = get_batch(loc_seq_index, batch)
                _, loss_value = sess.run(
                    [train_op, loss],
                    feed_dict={
                        seq_input: X,
                        class_output: Y,
                        seq_len: length,
                        weight_mask: weight,
                        keep_prob: 0.5
                    })
                total_loss += loss_value

            X, Y, length, weight = test
            acc, pred = sess.run(
                [acc_op, pred_top_op],
                feed_dict={
                    seq_input: X,
                    class_output: Y,
                    seq_len: length,
                    weight_mask: weight,
                    keep_prob: 1
                })

            print(total_loss, acc)
            total_loss = 0
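get_batch and the weight-mask convention are not shown. A padding scheme consistent with the placeholders above would look like this (a sketch; pad_batch is a hypothetical helper, not the author's code):

import numpy as np

def pad_batch(sequences, max_seq_len, pad_id=0):
    # Pad variable-length id sequences to a fixed width, and build the float
    # mask fed to weight_mask: 1.0 over real tokens, 0.0 over padding.
    X = np.full((len(sequences), max_seq_len), pad_id, dtype=np.int32)
    weight = np.zeros((len(sequences), max_seq_len), dtype=np.float32)
    length = np.zeros(len(sequences), dtype=np.int32)
    for i, seq in enumerate(sequences):
        n = min(len(seq), max_seq_len)
        X[i, :n] = seq[:n]
        weight[i, :n] = 1.0
        length[i] = n
    return X, weight, length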
Example 3
from data import processing

path = r'data_csv/ner_dataset.csv'

processing(path_to_data=path)
Example 4
def main():
    import tensorflow as tf
    from data import processing
    # get_test, prepare_batches, classifier_seq and get_batch are assumed
    # to be defined elsewhere in this module.
    filename = '/home/dlian/data/location_prediction/gowalla/Gowalla_totalCheckins.txt'
    loc_seq_index = processing(filename)
    loc_seq_index = loc_seq_index[:1000]
    num_locations = max(l for (u, time_loc) in loc_seq_index
                        for t, l in time_loc) + 1
    print('{0} locations, {1} users'.format(num_locations, len(loc_seq_index)))
    batch_size = 64
    max_seq_len = 10
    epochs = 50  # note: the training loop below only runs 3 epochs
    embedding_size = 50
    learning_rate = 0.1
    print(
        'embed_size:{0}, max sequence length:{1}, batch size:{2}, learn_rate:{3}'
        .format(embedding_size, max_seq_len, batch_size, learning_rate))

    test = get_test(loc_seq_index, max_seq_len)
    batches = prepare_batches(loc_seq_index, -1, batch_size, max_seq_len)

    seq_input = tf.placeholder(tf.int32, shape=[None, None], name='input_seq')
    class_output = tf.placeholder(tf.int32, shape=[None], name='output_class')
    seq_len = tf.placeholder(tf.int32, shape=[None], name='sequence_length')
    weight_mask = tf.placeholder(tf.float32,
                                 shape=[None, None],
                                 name='weight_mask')
    keep_prob = tf.placeholder(tf.float32)
    loss, acc_op, pred_top_op = classifier_seq(seq=seq_input,
                                               labels=class_output,
                                               weight_mask=weight_mask,
                                               num_loc=num_locations,
                                               embed_size=embedding_size,
                                               seq_len=seq_len,
                                               k=50,
                                               num_samples=-1,
                                               keep_prob=keep_prob)
    merged = tf.summary.merge_all()
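    # tf.summary.merge_all() returns None if no summary nodes exist, which
    # would break the sess.run calls below; classifier_seq is assumed to
    # register at least one summary, e.g. tf.summary.scalar('loss', ...).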

    train_op = tf.train.AdagradOptimizer(
        learning_rate=learning_rate).minimize(loss)
    with tf.Session() as sess:
        train_writer = tf.summary.FileWriter(
            '/home/dlian/data/location_prediction/gowalla/train', sess.graph)
        test_writer = tf.summary.FileWriter(
            '/home/dlian/data/location_prediction/gowalla/test')
        sess.run(tf.global_variables_initializer())
        total_loss = 0
        for epoch in range(3):
            summary = None
            for batch_index in range(len(batches)):
                batch = batches[batch_index]
                X, Y, length, weight = get_batch(loc_seq_index, batch)
                _, loss_value, summary = sess.run(
                    [train_op, loss, merged],
                    feed_dict={
                        seq_input: X,
                        class_output: Y,
                        seq_len: length,
                        weight_mask: weight,
                        keep_prob: 0.5
                    })
                total_loss += loss_value

            train_writer.add_summary(summary, epoch)

            X, Y, length, weight = test
            acc, pred, summary = sess.run(
                [acc_op, pred_top_op, merged],
                feed_dict={
                    seq_input: X,
                    class_output: Y,
                    seq_len: length,
                    weight_mask: weight,
                    keep_prob: 1
                })
            test_writer.add_summary(summary, epoch)
            print(total_loss, acc)
            total_loss = 0

            with open(
                    '/home/dlian/data/location_prediction/gowalla/pred{0}.txt'.
                    format(epoch), 'w') as fout:
                # Log every misprediction: index, true label, prediction, input.
                for ii, (x, p, y) in enumerate(zip(X, pred[:, 0], Y)):
                    if p != y:
                        fout.write('{3}, {0}, {1}, {2}\n'.format(y, p, x, ii))
        train_writer.close()
        test_writer.close()
Example 5
def main():
    import tensorflow as tf
    from data import processing
    # get_test, get_batch and classifier_seq are assumed to be defined
    # elsewhere in this module.
    filename = '/home/dlian/data/location_prediction/gowalla/Gowalla_totalCheckins.txt'
    loc_seq_index = processing(filename)
    loc_seq_index = loc_seq_index[:1000]
    num_locations = max(l for (u, time_loc) in loc_seq_index
                        for t, l in time_loc) + 1
    batch_size = 60
    max_seq_len = 10
    epochs = 50
    embedding_size = 50

    test = get_test(loc_seq_index, max_seq_len)

    seq_input = tf.placeholder(tf.int32, shape=[None, None], name='input_seq')
    class_output = tf.placeholder(tf.int32, shape=[None], name='output_class')
    seq_len = tf.placeholder(tf.int32, shape=[None], name='sequence_length')
    weight_mask = tf.placeholder(tf.float32,
                                 shape=[None, None],
                                 name='weight_mask')
    loss, ndcg_op, acc_op = classifier_seq(seq=seq_input,
                                           labels=class_output,
                                           weight_mask=weight_mask,
                                           num_loc=num_locations,
                                           embed_size=embedding_size,
                                           seq_len=seq_len,
                                           k=50,
                                           num_samples=-1)

    train_op = tf.train.AdagradOptimizer(learning_rate=0.1).minimize(loss)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            total_loss = 0
            for u in range(len(loc_seq_index)):
                X, Y, length, weight = get_batch(loc_seq_index[u][1],
                                                 batch_size, max_seq_len,
                                                 num_locations - 1)
                _, loss_value = sess.run(
                    [train_op, loss],
                    feed_dict={
                        seq_input: X,
                        class_output: Y,
                        seq_len: length,
                        weight_mask: weight
                    })
                total_loss += loss_value
            print(total_loss)

            X, Y, length, weight = test
            ndcg, acc = sess.run(
                [ndcg_op, acc_op],
                feed_dict={
                    seq_input: X,
                    class_output: Y,
                    seq_len: length,
                    weight_mask: weight
                })

            print(ndcg, acc)
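classifier_seq here also reports NDCG. With a single ground-truth location per example, NDCG@k reduces to 1/log2(rank + 1) when the true location appears at (1-based) position rank of the top-k list, and 0 otherwise. A NumPy sketch of that reduction (ndcg_at_k is a hypothetical helper):

import numpy as np

def ndcg_at_k(scores, true_idx, k=50):
    # scores: model scores over all locations; true_idx: ground-truth id.
    topk = np.argsort(-scores)[:k]
    hits = np.where(topk == true_idx)[0]
    if hits.size == 0:
        return 0.0
    return 1.0 / np.log2(hits[0] + 2)  # hits[0] is 0-based, so rank + 1 = hits[0] + 2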
Example 6
def main():
    import tensorflow as tf
    from data import processing
    # get_train_batches, get_test_batches and seq_classifer are assumed to
    # be defined elsewhere in this module.
    filename = '/home/dlian/data/location_prediction/gowalla/Gowalla_totalCheckins.txt'
    loc_seq_index = processing(filename)
    loc_seq_index = sorted(loc_seq_index, key=lambda e: len(e[1]))
    loc_seq_index = loc_seq_index[:100]
    num_locations = max(l for (u, time_loc) in loc_seq_index
                        for t, l in time_loc) + 1
    batch_size = 1
    epochs = 100
    embedding_size = 100

    train_batches = get_train_batches(loc_seq_index, batch_size)
    test_batches = get_test_batches(loc_seq_index, 100)

    seq_input = tf.placeholder(tf.int32, shape=[None, None], name='x')
    class_output = tf.placeholder(tf.int32, shape=[None, None], name='y')
    weight_mask = tf.placeholder(tf.float32, shape=[None, None], name='weight')
    seq_len = tf.placeholder(tf.int32, shape=[None], name='sequence_length')
    loss, ndcg_op, acc_op = seq_classifer(source=seq_input,
                                          target=class_output,
                                          weight_mask=weight_mask,
                                          num_loc=num_locations,
                                          seq_len=seq_len,
                                          embed_size=embedding_size,
                                          k=50,
                                          num_samples=-1)
    global_step = tf.Variable(0, name='global_step', trainable=False)
    train_op = tf.train.AdagradOptimizer(learning_rate=0.2).minimize(
        loss, global_step=global_step)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        for epoch in range(epochs):
            loss_value_total = 0
            for batch_index in range(len(train_batches)):
                X, Y, weight, length = train_batches[batch_index]
                #print length
                _, loss_value = sess.run(
                    [train_op, loss],
                    feed_dict={
                        seq_input: X,
                        class_output: Y,
                        weight_mask: weight,
                        seq_len: length
                    })
                #print('{0}th batch, loss:{1}, batch_size:{2}'.format(batch_index, loss_value, len(length)))
                loss_value_total += loss_value
            print(loss_value_total)

            count = 0
            ndcg_total = 0
            acc_total = 0

            for batch_index in range(len(test_batches)):
                X, Y, weight, length = test_batches[batch_index]
                ndcg, acc = sess.run(
                    [ndcg_op, acc_op],
                    feed_dict={
                        seq_input: X,
                        class_output: Y,
                        weight_mask: weight,
                        seq_len: length
                    })
                ndcg_total += ndcg
                acc_total += acc
                count += 1
            # Report mean NDCG and accuracy across the test batches.
            print(ndcg_total / count, acc_total / count)
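Unlike the earlier examples, class_output here is 2-D: there is a target location for every timestep. A plausible (assumed) way the batching helpers derive such input/target pairs is by shifting each check-in sequence one step:

def make_xy(loc_ids):
    # loc_ids: one user's location ids in time order. Each position is
    # trained to predict the next check-in, so targets are the inputs
    # shifted left by one. (Hypothetical helper; get_train_batches is not shown.)
    return loc_ids[:-1], loc_ids[1:]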
Example 7
    # Tail of get_train_dataset; its opening lines are truncated in this snippet.
    return tf.contrib.data.Dataset.from_tensor_slices((uid, loc, target))


def get_test_dataset(loc_seq_index):
    uid = []
    loc = []
    target = []
    for u, time_loc in loc_seq_index:
        uid.append(u)
        loc.append(time_loc[-2][1])
        target.append(time_loc[-1][1])
    return tf.contrib.data.Dataset.from_tensor_slices((uid, loc, target))


import tensorflow as tf
from data import processing

filename = '/home/dlian/data/location_prediction/gowalla/Gowalla_totalCheckins.txt'
loc_seq_index = processing(filename)
num_loc = max(l for (u, time_loc) in loc_seq_index for t, l in time_loc) + 1
num_users = len(loc_seq_index)
num_sampled = 20
batch_size = 128
embedding_size = 128
epochs = 20
k = 50

graph = tf.Graph()
with graph.as_default():

    train_set = get_train_dataset(loc_seq_index)
    test_set = get_test_dataset(loc_seq_index)

    test = test_set.batch(num_users)
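The snippet ends before the datasets are consumed. With the TF 1.x tf.contrib.data API used above, a typical continuation would be along these lines (an assumed sketch; none of it is in the original):

    train = train_set.shuffle(buffer_size=100000).batch(batch_size)
    train_iter = train.make_initializable_iterator()
    uid_batch, loc_batch, target_batch = train_iter.get_next()
    init_op = tf.global_variables_initializer()

with tf.Session(graph=graph) as sess:
    sess.run(init_op)
    for epoch in range(epochs):
        sess.run(train_iter.initializer)
        while True:
            try:
                # A real training step would feed these batch tensors into a
                # model; here each run call just pulls the next batch.
                sess.run([uid_batch, loc_batch, target_batch])
            except tf.errors.OutOfRangeError:
                break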