capturing_data = True
    else:
        if not line:
            raise ValueError('Empty line during capturing phase')

        if line.startswith('END_DATA'):
            stream("Capture finished")
            capturing_data = False
            break
        else:
            decodeData(line)

stream("Decoding finished")

x_text, y = data_helpers.load_data_labels(datasets)

# Parameters
# ==================================================

# Data loading params
tf.flags.DEFINE_float("dev_sample_percentage", .1,
                      "Percentage of the training data to use for validation")
tf.flags.DEFINE_string("positive_data_file",
                       "./data/rt-polaritydata/rt-polarity.pos",
                       "Data source for the positive data.")
tf.flags.DEFINE_string("negative_data_file",
                       "./data/rt-polaritydata/rt-polarity.neg",
                       "Data source for the negative data.")

# Model Hyperparameters
# Example #2
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    # x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)
    x_raw, y_test = data_helpers.load_data_labels(FLAGS.data_file,
                                                  FLAGS.label_file)
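    # argmax collapses the one-hot label rows into integer class indices for scoring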
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
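# Restoring the processor saved at training time maps test text to the same word ids the model was trained with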
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
test2 = "/home/xxliu10/bigdata/classification/originnewsdata/2.test"
test3 = "/home/xxliu10/bigdata/classification/originnewsdata/3.test"
test4 = "/home/xxliu10/bigdata/classification/originnewsdata/4.test"
test5 = "/home/xxliu10/bigdata/classification/originnewsdata/5.test"

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
datasets = data_helpers.get_datasets_textinline(train1, train2, train3, train4,
                                                train5)
testdatasets = data_helpers.get_datasets_textinline(test1, test2, test3, test4,
                                                    test5)

x_train_ns, y_train_ns = data_helpers.load_data_labels(datasets)
x_dev_ns, y_dev_ns = data_helpers.load_data_labels(testdatasets)
x_text = x_train_ns + x_dev_ns
print(len(x_train_ns))
print(len(x_dev_ns))
print(len(x_text))
# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
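# VocabularyProcessor pads (and truncates) every document to max_document_length word ids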
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x_tr = np.array(list(vocab_processor.fit_transform(x_train_ns)))
x_te = np.array(list(vocab_processor.transform(x_dev_ns)))  # transform only: reuse the vocabulary fitted on the training split

# Randomly shuffle data
np.random.seed(10)
tr_shuffle_indices = np.random.permutation(np.arange(len(y_train_ns)))
te_shuffle_indices = np.random.permutation(np.arange(len(y_dev_ns)))
# Example #4
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc', 'talk.religion.misc']

training_classes = ['comp.graphics', 'alt.atheism', 'comp.sys.mac.hardware', 'misc.forsale', 'rec.autos']


# load data
print("Loading data...")
if dataset == "20newsgroup":
    datasets = data_helpers.get_datasets_20newsgroup(subset='train', categories=training_classes, remove=()) # TODO: use the remove parameter
    x_text, y_train = data_helpers.load_data_labels_remove_SW(datasets)
else:
    dataset = data_helpers.get_datasets_localdata("./data/20newsgroup", categories=None) # TODO: tweak parameters in the future
    x_text, y_train = data_helpers.load_data_labels(dataset)  # text is stored in x_text; labels are stored in y_train

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text]) # TODO: should be hardcoded to save time
print("Max document length: {}".format(max_document_length))
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x_train = np.array(list(vocab_processor.fit_transform(x_text)))


# Randomly shuffle data
# np.random.seed(10)
# shuffle_indices = np.random.permutation(np.arange(len(y)))
# x_shuffled = x[shuffle_indices]
# y_shuffled = y[shuffle_indices]

print(x_train.shape)
# Example #5
dataset_name = cfg["datasets"]["default"]

# Data Preparation
# ==================================================

# Load data
print("Loading data...Cool Joey :-)")
datasets = None

if dataset_name == "HAR_small":
    datasets_train = data_helpers.get_datasets(
        cfg["datasets"][dataset_name]["training_data_file"]["path"])
    datasets_test = data_helpers.get_datasets(
        cfg["datasets"][dataset_name]["testing_data_file"]["path"])

x, y = data_helpers.load_data_labels(datasets_train)
x_test, y_test = data_helpers.load_data_labels(datasets_test)

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/dev set
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
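# dev_sample_index is negative, so the slices below keep the last dev_sample_percentage of the shuffled data for validation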
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
print("Train/Test split: {:d}/{:d}".format(len(y_train), len(y_test)))
# Example #6
# Misc Parameters
tf.flags.DEFINE_boolean("allow_soft_placement", True,
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# CHANGE THIS: Load data. Load your own data here
if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_labels()
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
# Example #7
print("")

datasets = None

# CHANGE THIS: Load data. Load your own data here
dataset_name = cfg["datasets"]["default"]
if FLAGS.eval_train:
    if dataset_name == "mrpolarity":
        datasets = data_helpers.get_datasets_mrpolarity(
            cfg["datasets"][dataset_name]["positive_data_file"]["path"],
            cfg["datasets"][dataset_name]["negative_data_file"]["path"])
    elif dataset_name == "20newsgroup":
        datasets = data_helpers.get_datasets_20newsgroup(
            subset="test",
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    x_raw, y_test = data_helpers.load_data_labels(datasets)
    y_test = np.argmax(y_test, axis=1)
    print("Total number of test examples: {}".format(len(y_test)))
else:
    if dataset_name == "mrpolarity":
        datasets = {"target_names": ['positive_examples', 'negative_examples']}
        x_raw = ["a masterpiece four years in the making", "everything is off."]
        y_test = [1, 0]
    else:
        datasets = {"target_names": ['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']}
        x_raw = ["The number of reported cases of gonorrhea in Colorado increased",
                 "I am in the market for a 24-bit graphics card for a PC"]
        y_test = [2, 1]

x_words_raw, x_tags, x_labels, x_trees, x_indices, y, y_labels = data_helpers.load_data_labels('/u/a/n/anant/Dropbox/539_project/generated_test_data/')
x_words = x_words_raw
# Example #8
import numpy as np
import tensorflow as tf
from tensorflow.contrib import learn

import config
import data_helpers
from text_cnn import TextCNN

# params
print("\nparameters config:")
for k, v in config.config.items():
    print("{}={}".format(k, v))

# load data
x_test, y = data_helpers.load_data_labels(config.config["positive_data_file"],
                                          config.config["negative_data_file"])

# build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_test])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_test)))

# randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# split train/test set
dev_sample_index = -1 * int(config.config["dev_sample_percentage"] * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
def main():
    import time
    start_time = time.time()

    FLAGS = flagClass()

    with open("config.yml", 'r') as ymlfile:
        cfg = yaml.load(ymlfile, Loader=yaml.SafeLoader)

    dataset_name = cfg["datasets"]["default"]
    if FLAGS.enable_word_embeddings and cfg['word_embeddings'][
            'default'] is not None:
        embedding_name = cfg['word_embeddings']['default']
        embedding_dimension = cfg['word_embeddings'][embedding_name][
            'dimension']
    else:
        embedding_dimension = FLAGS.embedding_dim

    # Data Preparation
    # ==================================================

    # Load data

    print("Loading data...")
    datasets = None
    if dataset_name == "mrpolarity":
        datasets = data_helpers.get_datasets_mrpolarity(
            cfg["datasets"][dataset_name]["positive_data_file"]["path"],
            cfg["datasets"][dataset_name]["negative_data_file"]["path"])
    elif dataset_name == 'spamham':
        datasets = data_helpers.get_datasets_mrpolarity(
            cfg["datasets"][dataset_name]["spam_file"]["path"],
            cfg["datasets"][dataset_name]["ham_file"]["path"])
    elif dataset_name == "20newsgroup":
        datasets = data_helpers.get_datasets_20newsgroup(
            subset="train",
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    elif dataset_name == "dbpedia":
        datasets = data_helpers.get_datasets_dbpedia(
            cfg["datasets"][dataset_name]["train_file"]["path"],
            cfg["datasets"][dataset_name]["train_file"]["limit"])
    elif dataset_name == "email":
        datasets = data_helpers.get_datasets_email(
            container_path=cfg["datasets"][dataset_name]["container_path"],
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    elif dataset_name == "localdata":
        datasets = data_helpers.get_datasets_localdata(
            container_path=cfg["datasets"][dataset_name]["container_path"],
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    x_text, y = data_helpers.load_data_labels(datasets)

    # Build vocabulary

    # To limit memory usage, you can cut the input text off at the first 40 words.
    # Other research has shown that the first 40 words of a text (IMDB dataset?)
    # are representative of the sentence content for classification purposes.
    # Comment out one of the two lines below.

    # max_document_length = max([len(x.split(" ")) for x in x_text])
    max_document_length = 40  # read up to 40 words from each sentence
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[
        dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[
        dev_sample_index:]
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    print('Sequence_length={}'.format(x_train.shape[1]))

    # Training
    # ==================================================

    with tf.Graph().as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            cnn = TextCNN(sequence_length=x_train.shape[1],
                          num_classes=y_train.shape[1],
                          vocab_size=len(vocab_processor.vocabulary_),
                          embedding_size=embedding_dimension,
                          filter_sizes=list(
                              map(int, FLAGS.filter_sizes.split(","))),
                          num_filters=FLAGS.num_filters,
                          l2_reg_lambda=FLAGS.l2_reg_lambda)

            # Define Training procedure
            global_step = tf.Variable(0, name="global_step", trainable=False)
            optimizer = tf.train.AdamOptimizer(cnn.learning_rate)
            grads_and_vars = optimizer.compute_gradients(cnn.loss)
            train_op = optimizer.apply_gradients(grads_and_vars,
                                                 global_step=global_step)

            # Keep track of gradient values and sparsity (optional)
            grad_summaries = []
            for g, v in grads_and_vars:
                if g is not None:
                    grad_hist_summary = tf.summary.histogram(
                        "{}/grad/hist".format(v.name), g)
                    sparsity_summary = tf.summary.scalar(
                        "{}/grad/sparsity".format(v.name),
                        tf.nn.zero_fraction(g))
                    grad_summaries.append(grad_hist_summary)
                    grad_summaries.append(sparsity_summary)
            grad_summaries_merged = tf.summary.merge(grad_summaries)

            # Output directory for models and summaries
            timestamp = str(int(time.time()))
            out_dir = os.path.abspath(
                os.path.join(os.path.curdir, "runs", timestamp))
            print("Writing to {}\n".format(out_dir))

            # Summaries for loss and accuracy
            loss_summary = tf.summary.scalar("loss", cnn.loss)
            acc_summary = tf.summary.scalar("accuracy", cnn.accuracy)

            # Train Summaries
            train_summary_op = tf.summary.merge(
                [loss_summary, acc_summary, grad_summaries_merged])
            train_summary_dir = os.path.join(out_dir, "summaries", "train")
            train_summary_writer = tf.summary.FileWriter(
                train_summary_dir, sess.graph)

            # Dev summaries
            dev_summary_op = tf.summary.merge([loss_summary, acc_summary])
            dev_summary_dir = os.path.join(out_dir, "summaries", "dev")
            dev_summary_writer = tf.summary.FileWriter(dev_summary_dir,
                                                       sess.graph)

            # Checkpoint directory. TensorFlow's Saver will not create this directory, so create it here
            checkpoint_dir = os.path.abspath(
                os.path.join(out_dir, "checkpoints"))
            checkpoint_prefix = os.path.join(checkpoint_dir, "model")
            if not os.path.exists(checkpoint_dir):
                os.makedirs(checkpoint_dir)
            saver = tf.train.Saver(tf.global_variables(),
                                   max_to_keep=FLAGS.num_checkpoints)

            # Write vocabulary
            vocab_processor.save(os.path.join(out_dir, "vocab"))

            # Initialize all variables
            sess.run(tf.global_variables_initializer())
            if FLAGS.enable_word_embeddings and cfg['word_embeddings'][
                    'default'] is not None:
                vocabulary = vocab_processor.vocabulary_
                initW = None
                if embedding_name == 'word2vec':
                    # load embedding vectors from the word2vec
                    print("Load word2vec file {}".format(
                        cfg['word_embeddings']['word2vec']['path']))
                    initW = data_helpers.load_embedding_vectors_word2vec(
                        vocabulary, cfg['word_embeddings']['word2vec']['path'],
                        cfg['word_embeddings']['word2vec']['binary'])
                    print("word2vec file has been loaded")
                elif embedding_name == 'glove':
                    # load embedding vectors from the glove
                    print("Load glove file {}".format(
                        cfg['word_embeddings']['glove']['path']))
                    initW = data_helpers.load_embedding_vectors_glove(
                        vocabulary, cfg['word_embeddings']['glove']['path'],
                        embedding_dimension)
                    print("glove file has been loaded\n")
                if initW is not None:
                    sess.run(cnn.W.assign(initW))

            def train_step(x_batch, y_batch, learning_rate):
                """
                A single training step
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: FLAGS.dropout_keep_prob,
                    cnn.learning_rate: learning_rate
                }
                _, step, summaries, loss, accuracy = sess.run([
                    train_op, global_step, train_summary_op, cnn.loss,
                    cnn.accuracy
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}, learning_rate {:g}".
                      format(time_str, step, loss, accuracy, learning_rate))
                train_summary_writer.add_summary(summaries, step)

            def dev_step(x_batch, y_batch, writer=None):
                """
                Evaluates model on a dev set
                """
                feed_dict = {
                    cnn.input_x: x_batch,
                    cnn.input_y: y_batch,
                    cnn.dropout_keep_prob: 1.0
                }
                step, summaries, loss, accuracy, gr = sess.run([
                    global_step, dev_summary_op, cnn.loss, cnn.accuracy,
                    cnn.grad
                ], feed_dict)
                time_str = datetime.datetime.now().isoformat()
                print("{}: step {}, loss {:g}, acc {:g}, gr {}".format(
                    time_str, step, loss, accuracy, gr))
                if writer:
                    writer.add_summary(summaries, step)

            # Generate batches
            batches = data_helpers.batch_iter(list(zip(x_train, y_train)),
                                              FLAGS.batch_size,
                                              FLAGS.num_epochs)
            print("Number of epochs: {}".format(FLAGS.num_epochs))
            num_batches_per_epoch = int(
                (len(list(zip(x_train, y_train))) - 1) / FLAGS.batch_size) + 1
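            # (N - 1) // batch_size + 1 is ceiling division: the final, possibly smaller, batch still counts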
            print("Batches per epoch: {}".format(num_batches_per_epoch))
            print("Batch size: {}".format(FLAGS.batch_size))
            # Use a dynamic learning rate: start high to speed up training, then decay toward a minimum
            max_learning_rate = 0.005
            min_learning_rate = 0.0001
            decay_speed = FLAGS.decay_coefficient * len(
                y_train) / FLAGS.batch_size
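            # learning_rate below decays exponentially from max_learning_rate toward
            # min_learning_rate; decay_speed sets how many batches the decay is stretched over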
            # Training loop. For each batch...
            counter = 0
            for batch in batches:
                learning_rate = min_learning_rate + (
                    max_learning_rate - min_learning_rate) * math.exp(
                        -counter / decay_speed)
                counter += 1
                x_batch, y_batch = zip(*batch)
                train_step(x_batch, y_batch, learning_rate)
                current_step = tf.train.global_step(sess, global_step)
                if current_step % FLAGS.evaluate_every == 0:
                    print("\nEvaluation:")
                    dev_step(x_dev, y_dev, writer=dev_summary_writer)
                    print("")
                if current_step % FLAGS.checkpoint_every == 0:
                    path = saver.save(sess,
                                      checkpoint_prefix,
                                      global_step=current_step)
                    print("Saved model checkpoint to {}\n".format(path))
    print("runtime was " + str(time.time() - start_time))
# Example #10
                        "Allow device soft device placement")
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()

print("\nPamaeters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper, value))

print("")
# CHANGE THIS:load data

if FLAGS.eval_train:
    x_raw, y_test = data_helpers.load_data_labels(FLAGS.positive_data_file,
                                                  FLAGS.negative_data_file)
    y_test = np.argmax(y_test, axis=1)
else:
    x_raw = ["a masterpiece four years in the making", "everything is off."]
    y_test = [1, 0]

# Map data into vocabulary
vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

checkpoint_file = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
graph = tf.Graph()
# Example #11
# FLAGS._parse_flags() is deprecated; use flag_values_dict() instead
FLAGS.flag_values_dict()
print('\nParameters:')

for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), str(value.value)))


# ===============================
# Data preprocessing
# Load data
print('loading data...')
x_text, y = data_helpers.load_data_labels(FLAGS.positive_data_file, FLAGS.negative_data_file)

# Build vocabulary: use the largest word count over all documents
max_document_length = max([len(x.split(' ')) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length=max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set (the dev split is taken from the end)
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
        cfg["datasets"][dataset_name]["positive_data_file"]["path"],
        cfg["datasets"][dataset_name]["negative_data_file"]["path"])
elif dataset_name == "20newsgroup":
    datasets = data_helpers.get_datasets_20newsgroup(
        subset="train",
        categories=cfg["datasets"][dataset_name]["categories"],
        shuffle=cfg["datasets"][dataset_name]["shuffle"],
        random_state=cfg["datasets"][dataset_name]["random_state"])
elif dataset_name == "localdata":
    datasets = data_helpers.get_datasets_localdata(
        container_path=cfg["datasets"][dataset_name]["container_path"],
        categories=cfg["datasets"][dataset_name]["categories"],
        shuffle=cfg["datasets"][dataset_name]["shuffle"],
        random_state=cfg["datasets"][dataset_name]["random_state"])
x_text, y = data_helpers.load_data_labels(FLAGS.anger_dir, FLAGS.disgust_dir,
                                          FLAGS.fear_dir, FLAGS.neutral_dir,
                                          FLAGS.sadness_dir,
                                          FLAGS.surprise_dir)

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
# Example #13
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    datasets = data_helpers.get_datasets_political_parties()
    x_text, y = data_helpers.load_data_labels(datasets)
    #print('x_text',x_text)
    #print('labels',y)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    if args.pre_trained:
        print('Load pre-trained word vectors')
        with open('fasttext_vocab_en.dat', 'rb') as fr:
            vocab = pickle.load(fr)
        embedding = np.load('fasttext_embedding_en.npy')

        pretrain = vocab_processor.fit(vocab.keys())
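        # Fitting on the pre-trained vocabulary above is presumably done so word ids line up with
        # the rows of the loaded embedding matrix (this assumes the pickled vocab and the .npy
        # rows were saved in matching order)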
        x = np.array(list(vocab_processor.transform(x_text)))

        embedding_size = FLAGS.fasttext_embedding_dim
        vocab_size = len(vocab)
    else:
        x = np.array(list(vocab_processor.fit_transform(x_text)))
        embedding = None  # no pre-trained vectors in this branch
        embedding_size = FLAGS.embedding_dim
        vocab_size = len(vocab_processor.vocabulary_)
    #print('VocabPr',x)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    #print('x_shuffled', x_shuffled)
    #print('y_shuffled', y_shuffled)
    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    #dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    #x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    #y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    
    train_frac = 0.7
    val_frac = 0.2
    test_frac = 0.1
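    # Only train_frac and val_frac appear in the slicing helpers below;
    # the remaining ~10% of examples implicitly become the test split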


    def train_test_val_split(x_shuffled):
        return (x_shuffled[:int(len(x_shuffled) * train_frac)],
                x_shuffled[int(len(x_shuffled) * train_frac):
                           (int(len(x_shuffled) * train_frac) +
                            int(len(x_shuffled) * val_frac))],
                x_shuffled[(int(len(x_shuffled) * train_frac) +
                            int(len(x_shuffled) * val_frac)):])

    def train_test_val_labels(y_shuffled):
        return (y_shuffled[:int(len(y_shuffled) * train_frac)],
                y_shuffled[int(len(y_shuffled) * train_frac):
                           (int(len(y_shuffled) * train_frac) +
                            int(len(y_shuffled) * val_frac))],
                y_shuffled[(int(len(y_shuffled) * train_frac) +
                            int(len(y_shuffled) * val_frac)):])

    x_train, x_dev, x_test = train_test_val_split(x_shuffled)
    y_train, y_dev, y_test = train_test_val_labels(y_shuffled)
    #print('shape',x_train.shape)
    #print("Vocabulary". vocab_processor.vocabulary_)
    #print("Vocabulary",vocab_processor.vocabulary_._mapping)
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    print('x_train',x_train.shape)
    print('y_train',y_train.shape)
    return x_train, y_train, vocab_processor,vocab_size, embedding_size, embedding, x_dev, y_dev, x_test, y_test
# Example #14
            cfg["datasets"][dataset_name]["positive_data_file"]["path"],
            cfg["datasets"][dataset_name]["negative_data_file"]["path"])
    elif dataset_name == "20newsgroup":
        datasets = data_helpers.get_datasets_20newsgroup(
            subset="train",
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    elif dataset_name == "localdata":
        datasets = data_helpers.get_datasets_localdata(
            container_path=cfg["datasets"][dataset_name]["container_path"],
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])

x_text, y = data_helpers.load_data_labels(datasets)  # x_text holds the comments
# y is 0 or 1 for negative/positive

# Build vocabulary
max_document_length = max([len(x.split(" ")) for x in x_text])
vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
x = np.array(list(vocab_processor.fit_transform(x_text)))

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
# TODO: This is very crude, should use cross-validation
# Example #15
tf.flags.DEFINE_boolean("log_device_placement", False,
                        "Log placement of ops on devices")

FLAGS = tf.flags.FLAGS
FLAGS._parse_flags()
print("\nParameters:")
for attr, value in sorted(FLAGS.__flags.items()):
    print("{}={}".format(attr.upper(), value))
print("")

# Data Preparation
# ==================================================

# Load data
print("Loading data...")
x, y, vocab_processor = data_helpers.load_data_labels(FLAGS.data_file,
                                                      FLAGS.label_file)

# Randomly shuffle data
# np.random.seed(10)
# shuffle_indices = np.random.permutation(np.arange(len(y)))
# x_shuffled = x[shuffle_indices]
# y_shuffled = y[shuffle_indices]

x_shuffled = x
y_shuffled = y
# Split train/test set
# TODO: This is very crude, should use cross-validation
dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
def preprocess():
    # Data Preparation
    # ==================================================

    # Load data
    print("Loading data...")
    datasets = data_helpers.get_datasets_tobacco()
    x_text, y = data_helpers.load_data_labels(datasets)
    #print('x_text',x_text)
    #print('labels',y)

    # Build vocabulary
    max_document_length = max([len(x.split(" ")) for x in x_text])
    #print('max_document_length',max_document_length)
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length)
    #print("vocab_processor",vocab_processor)
    x = np.array(list(vocab_processor.fit_transform(x_text)))
    #print('VocabPr',x)

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    #print('x_shuffled', x_shuffled)
    #print('y_shuffled', y_shuffled)
    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    #dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    #x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    #y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]

    train_frac = 0.7
    val_frac = 0.2
    test_frac = 0.1

    def train_test_val_split(x_shuffled):
        return (x_shuffled[:int(len(x_shuffled) * train_frac)],
                x_shuffled[int(len(x_shuffled) *
                               train_frac):(int(len(x_shuffled) * train_frac) +
                                            int(len(x_shuffled) * val_frac))],
                x_shuffled[(int(len(x_shuffled) * train_frac) +
                            int(len(x_shuffled) * val_frac)):])

    def train_test_val_labels(y_shuffled):
        return (y_shuffled[:int(len(y_shuffled) * train_frac)],
                y_shuffled[int(len(y_shuffled) *
                               train_frac):(int(len(y_shuffled) * train_frac) +
                                            int(len(y_shuffled) * val_frac))],
                y_shuffled[(int(len(y_shuffled) * train_frac) +
                            int(len(y_shuffled) * val_frac)):])

    x_train, x_dev, x_test = train_test_val_split(x_shuffled)
    y_train, y_dev, y_test = train_test_val_labels(y_shuffled)
    #print('shape',x_train.shape)
    #print("Vocabulary". vocab_processor.vocabulary_)
    #print("Vocabulary",vocab_processor.vocabulary_._mapping)
    print("Vocabulary Size: {:d}".format(len(vocab_processor.vocabulary_)))
    print("Train/Dev split: {:d}/{:d}".format(len(y_train), len(y_dev)))
    print('x_train', x_train.shape)
    print('y_train', y_train.shape)
    return x_train, y_train, vocab_processor, x_dev, y_dev, x_test, y_test
# Example #17
if FLAGS.eval_train:
    if dataset_name == "mrpolarity":
        datasets = data_helpers.get_datasets_mrpolarity(
            cfg["datasets"][dataset_name]["positive_data_file"]["path"],
            cfg["datasets"][dataset_name]["negative_data_file"]["path"])
    elif dataset_name == "20newsgroup":
        datasets = data_helpers.get_datasets_20newsgroup(
            subset="test",
            categories=cfg["datasets"][dataset_name]["categories"],
            shuffle=cfg["datasets"][dataset_name]["shuffle"],
            random_state=cfg["datasets"][dataset_name]["random_state"])
    elif dataset_name == "abstract":
        datasets = data_helpers.get_datasets_abstract("data/")
    elif dataset_name == 'intents':
        datasets = data_helpers.get_datasets_intentst("data/")
    x_text, y = data_helpers.load_data_labels(datasets)
    x_raw, y_test = data_helpers.load_data_labels(datasets)
    y_test = np.argmax(y_test, axis=1)
    print("Total number of test examples: {}".format(len(y_test)))
else:
    if dataset_name == "mrpolarity":
        x_raw = [
            "a masterpiece four years in the making", "everything is off."
        ]
        y_test = [1, 0]
    else:
        x_raw = [
            "Experimental results on a large number of real-world data sets show that the proposed algorithm outperforms existing HMC methods",
            "In this paper, we overcome these deficiencies by proposing a hierarchy-aware loss function that is more appropriate for HMC."
        ]
        y_test = [2, 1]
# Example #18
import os
import sys
import json
import time
import datetime
import numpy as np
import tensorflow as tf
import data_helpers
from text_cnn import TextCNN
from tensorflow.contrib import learn
import csv
from sklearn import metrics

params = json.loads(open('./parameters.json').read())

checkpoint_dir = sys.argv[1]
datasets = None

filename = '20_news_group_to_test.csv.zip'

x_raw, y_test, labels = data_helpers.load_data_labels(filename, 0)
y_test = np.argmax(y_test, axis=1)
print("Total number of test examples: {}".format(len(y_test)))

# Map data into vocabulary
vocab_path = os.path.join(checkpoint_dir, "..", "vocab")
vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
x_test = np.array(list(vocab_processor.transform(x_raw)))

print("\nEvaluating...\n")

# Evaluation
# ==================================================
checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
graph = tf.Graph()
with graph.as_default():