for task in tasks:
    print("Running for task", task)
    checkpoint_dir = FLAGS.checkpoint_dir.format(task)

    # CHANGE THIS: load your own test data here
    x_raw, y_test = data_helpers.load_data_and_labels(FLAGS.data_dir, task,
                                                      'test')
    #y_test = np.argmax(y_test, axis=1)
    #print y_test

    # Map data into vocabulary
    #vocab_path = os.path.join(FLAGS.checkpoint_dir, "..", "vocab")
    #vocab_processor = learn.preprocessing.VocabularyProcessor.restore(vocab_path)
    #x_test = np.array(list(vocab_processor.transform(x_raw)))
    x_test, vocab_vector = data_helpers.build_vocabulary(x_raw)
    #np.save('tmp/x_test.data', x_test)
    #x_test = np.load('tmp/x_test.data.npy')
    #vocab_vector = np.load('tmp/vocab_vector.data.npy')

    print("\nEvaluating...\n")

    # Evaluation
    # ==================================================
    print(checkpoint_dir)
    checkpoint_file = tf.train.latest_checkpoint(checkpoint_dir)
    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=FLAGS.allow_soft_placement,
            log_device_placement=FLAGS.log_device_placement)
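        # NOTE: the snippet breaks off here. Below is a minimal sketch of how such a
        # TF 1.x evaluation block typically continues -- restore the latest checkpoint
        # and run batched inference. The tensor names ("input_x", "dropout_keep_prob",
        # "output/predictions") and the data_helpers.batch_iter helper are assumptions,
        # not confirmed by this snippet.
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Restore the saved graph definition and weights
            saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Look up input and prediction tensors by (assumed) name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name("dropout_keep_prob").outputs[0]
            predictions = graph.get_operation_by_name("output/predictions").outputs[0]

            # Predict in batches and collect the results
            batches = data_helpers.batch_iter(list(x_test), FLAGS.batch_size, 1, shuffle=False)
            all_predictions = []
            for x_batch in batches:
                batch_predictions = sess.run(predictions,
                                             {input_x: x_batch, dropout_keep_prob: 1.0})
                all_predictions = np.concatenate([all_predictions, batch_predictions])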
Example #2
# gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.333)
run_config = tf.ConfigProto()
run_config.gpu_options.allow_growth = True

# Output directory for models and summaries
out_dir = data_helpers.mkdir_if_not_exist("./runs")

# Load true_sentences and build vocab
true_sentences = data_helpers.read_and_clean_file(FLAGS.true_data_file)
padding_true_sentences = data_helpers.padding_sentences(true_sentences,
                                                        FLAGS.padding_token, FLAGS.max_sentences_length)
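
# Note: data_helpers.padding_sentences is project-specific and not shown in this
# snippet. The illustrative sketch below shows what such a helper usually does --
# pad or truncate each tokenized sentence to a fixed length; the real
# implementation may differ.
def padding_sentences_sketch(sentences, padding_token, max_len):
    """Pad every tokenized sentence to max_len with padding_token (truncate if longer)."""
    padded = []
    for tokens in sentences:
        tokens = list(tokens)[:max_len]                       # truncate long sentences
        tokens += [padding_token] * (max_len - len(tokens))   # pad short ones
        padded.append(tokens)
    return padded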

# Question: should we build the vocabulary from the true sentences only, or from a
# full Chinese word dictionary? Here we use the true sentences only.
voc, voc_size = data_helpers.build_vocabulary(padding_true_sentences, './runs/vocab')
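
# Note: data_helpers.build_vocabulary is also project-specific. The sketch below is
# an illustrative token-to-index vocabulary built from the padded sentences and
# written to disk; the real helper's file format and return values may differ.
def build_vocabulary_sketch(sentences, vocab_path):
    """Assign an integer id to every distinct token and persist the mapping."""
    voc = {}
    for tokens in sentences:
        for token in tokens:
            if token not in voc:
                voc[token] = len(voc)
    with open(vocab_path, 'w', encoding='utf-8') as f:
        for token, idx in sorted(voc.items(), key=lambda kv: kv[1]):
            f.write("{}\t{}\n".format(token, idx))
    return voc, len(voc)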

true_data = np.array(data_helpers.sentence2matrix(true_sentences, FLAGS.max_sentences_length, voc))
# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(true_sentences)))
true_data_shuffled = true_data[shuffle_indices]


#fake_factors = fake_factor_dist.sample((FLAGS.batch_size, FLAGS.max_sentences_length,FLAGS.embedding_dim))

global_graph = tf.Graph()

with global_graph.as_default():
    sess = tf.Session(graph=global_graph)
    gan_model = GANModel(batch_size=FLAGS.batch_size,
                         # ... (the remaining GANModel arguments and the rest of this example were truncated)
Example #3

#tasks = ['anger', 'fear', 'joy', 'sadness']

tasks = ['joy', 'sadness']

for task in tasks:
    # Load data
    print("Loading data...")
    x_text, y = data_helpers.load_data_and_labels(FLAGS.data_dir, task, 'train')

    # Build vocabulary
    #max_document_length = max([len(x.split(" ")) for x in x_text])
    #vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length)
    #x = np.array(list(vocab_processor.fit_transform(x_text)))

    x, vocab_vector = data_helpers.build_vocabulary(x_text)
    #np.save('tmp/x.data', x)
    #np.save('tmp/vocab_vector.data', vocab_vector)
    #x = np.load('tmp/x.data.npy')
    #vocab_vector = np.load('tmp/vocab_vector.data.npy')

    # Randomly shuffle data
    np.random.seed(10)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/test set
    # TODO: This is very crude, should use cross-validation
    dev_sample_index = -1 * int(FLAGS.dev_sample_percentage * float(len(y)))
    x_train, x_dev = x_shuffled[:dev_sample_index], x_shuffled[dev_sample_index:]
    y_train, y_dev = y_shuffled[:dev_sample_index], y_shuffled[dev_sample_index:]
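
    # The TODO above calls this percentage split crude and suggests cross-validation.
    # Below is an illustrative k-fold alternative using scikit-learn; scikit-learn is
    # an assumption here and is not imported by this snippet.
    def crossval_splits_sketch(x, y, n_splits=5, seed=10):
        """Yield (x_train, x_dev, y_train, y_dev) for each of n_splits folds."""
        from sklearn.model_selection import KFold
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
        for train_idx, dev_idx in kf.split(x):
            yield x[train_idx], x[dev_idx], y[train_idx], y[dev_idx]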
Example #4

# Load data (the head of this call was lost in the original snippet; the helper name
# below is assumed -- only the three file-path arguments and the x_text/y outputs survive)
x_text, y = data_helpers.load_data_and_labels(FLAGS.shizheng_data_file,
                                              FLAGS.tiyu_data_file,
                                              FLAGS.yule_data_file)
sentences = data_helpers.padding_sentences(x_text, FLAGS.padding_token, max_sentence_len)

print("len(x_text)",len(x_text))
print("len(y)",len(y))
# Build vocabulary: reload the saved vocabulary if it already exists (e.g. when the
# session is being restored), otherwise build it from the padded sentences and save it
if os.path.exists('./runs/vocab'):
    voc, vocsize = data_helpers.read_vocabulary('./runs/vocab')
else:
    voc, vocsize = data_helpers.build_vocabulary(sentences, './runs/vocab')

x = np.array(data_helpers.sentence2matrix(sentences, max_sentence_len, voc))
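
# Note: data_helpers.sentence2matrix is project-specific. The sketch below shows the
# usual idea -- map each token to its vocabulary id to form a fixed-width index
# matrix -- assuming voc is a token-to-id dict with 0 as the fallback/unknown id;
# the real helper may handle unknown tokens and padding differently.
def sentence2matrix_sketch(sentences, max_len, voc):
    """Convert tokenized sentences into a (num_sentences, max_len) index matrix."""
    matrix = np.zeros((len(sentences), max_len), dtype=np.int32)
    for i, tokens in enumerate(sentences):
        for j, token in enumerate(tokens[:max_len]):
            matrix[i, j] = voc.get(token, 0)   # 0 is the assumed fallback id
    return matrix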

# Randomly shuffle data
np.random.seed(10)
shuffle_indices = np.random.permutation(np.arange(len(y)))
x_shuffled = x[shuffle_indices]
y_shuffled = y[shuffle_indices]

# Split train/test set
data_len = len(x_shuffled)
x_train, x_dev, y_train, y_dev = train_test_split(
    x_shuffled, y_shuffled, test_size=FLAGS.dev_per, random_state=42)
print("Total/Train/Dev: {:d}/{:d}/{:d}".format(data_len, len(y_train), len(y_dev)))

# Training