def init_test():
    mode_train, mode_test = 'TR', 'TE'
    dict_obj = set_dict.Dictionary()

    # train object
    params_train = set_params.ParamsClass(mode=mode_train)
    dir_train = set_dir.Directory(mode_train)
    params_train.num_classes = len(dict_obj.label_dict)

    # test object
    params_test = set_params.ParamsClass(mode=mode_test)
    dir_test = set_dir.Directory(mode_test)
    params_test.num_instances, params_test.indices = get_length(dir_test.data_filename)
    params_test.batch_size = 1
    params_test.num_classes = len(dict_obj.label_dict)

    word_emb_path = dir_train.word_embedding
    word_emb_matrix = np.float32(np.genfromtxt(word_emb_path, delimiter=' '))
    params_train.vocab_size = params_test.vocab_size = len(word_emb_matrix)

    print('***** INITIALIZING TF GRAPH *****')

    session = tf.Session()
    with tf.variable_scope("classifier", reuse=None):
        test_obj = model.DeepAttentionClassifier(params_test, dir_test)

    model_saver = tf.train.Saver()
    print('Loading model ...')
    model_saver.restore(session, set_dir.Directory('TE').test_model)
    print('**** MODEL LOADED ****\n')

    return session, test_obj
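A possible entry point for interactive testing follows; this is a sketch rather than part of the original script, and it assumes the checkpoint referenced by set_dir.Directory('TE').test_model already exists on disk.

# Hypothetical usage sketch: restore the test graph once, then reuse the session.
if __name__ == '__main__':
    test_session, test_classifier = init_test()
    # test_classifier exposes the same placeholders used in run_epoch
    # (word_input, seq_length, label), so batches produced by
    # reader.data_iterator can be fed directly for inference.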
def generate_indexed_labels():
    label_hash = {}
    input_file = open(set_dir.Directory('TR').label_filename).readlines()
    curr_count = 0
    for each_label in input_file:
        curr_label = each_label.strip()
        if curr_label not in label_hash:
            label_hash[curr_label] = curr_count
            curr_count += 1

    label_map_file = open(set_dir.Directory('TR').label_map_dict, 'wb')
    pickle.dump(label_hash, label_map_file, protocol=pickle.HIGHEST_PROTOCOL)
    label_map_file.close()  # close so the pickle is flushed before other modules read it
    print('Total classes %d' % (len(label_hash)))
def main():
    set_dir_obj = set_dir.Directory()
    class_arr = get_imagenet_class()

    with tf.Session() as sess:
        img_vgg_obj = ImagePredictVGG(set_dir_obj.weights_dir)
        test_folder = set_dir_obj.sample_test_dir

        gold_label, images = [], []
        for filename in os.listdir(test_folder):
            img_arr = resize_image(read_image(test_folder + '/' + filename), img_size=(224, 224))
            images.append(img_arr)
            gold_label.append(filename)

        prob = sess.run(img_vgg_obj.prob, feed_dict={img_vgg_obj.img_batch: np.asarray(images)})

        for idx, each_prob in enumerate(prob):
            max_idx = np.argmax(each_prob)  # renamed from 'max' to avoid shadowing the built-in
            plt.text(0.5, -4.5, 'Predicted: ' + class_arr[max_idx].strip() + ', Prob.: ' + str(each_prob[max_idx]), fontsize=12)
            plot_image(images[idx])
            print(gold_label[idx], max_idx, class_arr[max_idx].strip(), each_prob[max_idx])
def run_epoch(session, eval_op, model_obj, dict_obj, verbose=False):
    print('\nrun epoch')
    output_file = open(set_dir.Directory('TE').log_emb_path + '/word_embedding.csv', 'w')

    params = model_obj.params
    dir_obj = model_obj.dir_obj
    data_filename = dir_obj.data_filename
    label_filename = dir_obj.label_filename

    for step, (input_seq_arr, length_arr, label_arr) \
            in enumerate(reader.data_iterator(params, data_filename, label_filename, model_obj.params.indices, dict_obj)):
        feed_dict = {model_obj.word_input: input_seq_arr,
                     model_obj.seq_length: length_arr,
                     model_obj.label: label_arr}
        emb_matrix, logits, _ = session.run([model_obj.word_emb_matrix, model_obj.logits, eval_op], feed_dict=feed_dict)
        for each_emb in emb_matrix:
            output_file.write(' '.join(str(x) for x in each_emb).strip() + '\n')
        # the embedding matrix is identical for every batch, so one pass is enough
        break

    output_file.close()
    print('Embedding file written ...')
def __init__(self, mode='TR'):
    """
    :param mode: 'TR' for train, 'TE' for test, 'VA' for valid
    """
    self.mode = mode
    self.rel_dir = set_dir.Directory(mode)
    # gloveDict = rel_dir.glove_path
    self.word_dict = pickle.load(open(self.rel_dir.glove_present_training_word_vocab, 'rb'))
    self.word_emb = self.rel_dir.word_embedding
    self.glove_present_word_csv = np.float32(np.genfromtxt(self.word_emb, delimiter=' '))
    self.label_dict = pickle.load(open(self.rel_dir.label_map_dict, 'rb'))
def generate_vocab(self, training_file):
    word_dict = {}
    word_counter = 2
    max_sequence_length = 0
    training_file_pointer = open(training_file, 'r')
    print('\nReading Training File .... ')
    tokenized_training_file = open(self.dataDir + '/tokenized_training', 'w')

    for line in training_file_pointer:
        line = line.rstrip()
        string = re.split(r'\t', line)
        size = len(string)
        tokenized_training_string = ''
        for j in range(size):
            string[j] = re.sub(r'#[0-9]+', r'', string[j].strip())
            # tokenized_sent = nltk.word_tokenize((string[j]).decode('utf-8'))
            # tokenized_sent.append('<eos>')
            tokenized_sent = string[j].split(" ")
            tokenized_string = ' '.join(tokenized_sent)
            tokenized_training_string += tokenized_string + '\t'
            for token in tokenized_sent:
                if token not in word_dict:
                    word_dict[token] = word_counter
                    word_counter += 1
        # tokenized_training_file.write(tokenized_training_string.encode('utf-8').rstrip('\t'))
        tokenized_training_file.write(tokenized_training_string.rstrip('\t'))
        tokenized_training_file.write('\n')
        curr_seq_length = len(tokenized_training_string.split())
        if curr_seq_length > max_sequence_length:
            max_sequence_length = curr_seq_length

    word_vocab = open(set_dir.Directory('TR').word_vocab_dict, 'wb')
    pickle.dump(word_dict, word_vocab, protocol=cPickle.HIGHEST_PROTOCOL)
    word_vocab.close()
    training_file_pointer.close()
    tokenized_training_file.close()

    print('Reading Completed \n ========================== \n'
          ' Unique tokens (excluding padding and unknown words): %d \n'
          ' Max. sequence length: %d\n ==========================\n'
          % (word_counter - 2, max_sequence_length))
    # print(word_dict)
    return word_dict
def generate_indexed_labels():
    label_hash = {}
    # input_file = open(set_dir.Directory('TR').label_filename).readlines()
    # curr_count = 0
    # for each_label in input_file:
    #     curr_label = each_label.strip()
    #     if curr_label not in label_hash:
    #         label_hash[curr_label] = curr_count
    #         curr_count += 1

    label_hash["joy"] = 0
    label_hash["sadness"] = 1
    label_hash["disgust"] = 2
    label_hash["anger"] = 3
    label_hash["fear"] = 4
    label_hash["surprise"] = 5
    label_hash["neutral"] = 6

    label_map_file = open(set_dir.Directory('TR').label_map_dict, 'wb')
    pickle.dump(label_hash, label_map_file, protocol=pickle.HIGHEST_PROTOCOL)
    label_map_file.close()
    print('Total classes %d' % (len(label_hash)))
def run_epoch(session, eval_op, model_obj, dict_obj, verbose=False):
    epoch_combined_loss = 0.0
    total_correct = 0.0
    total_instances = 0.0

    print('\nrun epoch')
    output_file = open(set_dir.Directory('TE').test_cost_path, 'w')

    params = model_obj.params
    dir_obj = model_obj.dir_obj
    data_filename = dir_obj.data_filename
    label_filename = dir_obj.label_filename

    for step, (input_seq_arr, length_arr, label_arr) \
            in enumerate(reader.data_iterator(params, data_filename, label_filename, model_obj.params.indices, dict_obj)):
        feed_dict = {model_obj.word_input: input_seq_arr,
                     model_obj.seq_length: length_arr,
                     model_obj.label: label_arr}
        loss, prediction, probabilities, _ = session.run(
            [model_obj.loss, model_obj.prediction, model_obj.probabilities, eval_op],
            feed_dict=feed_dict)

        total_correct += np.sum(prediction == label_arr)
        total_instances += params.batch_size
        epoch_combined_loss += loss

        for each_pred in prediction:
            output_file.write(str(each_pred + 1) + '\n')

    output_file.close()
    print('CE loss: %.4f, Total instances: %d, Correct: %d, Accuracy: %.4f'
          % (epoch_combined_loss, total_instances, total_correct, (total_correct / total_instances) * 100))
    return epoch_combined_loss
import os

import numpy as np
import tensorflow as tf
from tensorflow.contrib.tensorboard.plugins import projector

from global_module.settings_module import set_dir

# TODO: clean code

LOG_DIR = set_dir.Directory('TE').log_emb_path
metadata = LOG_DIR + '/word_metadata.tsv'
wordemb = LOG_DIR + '/word_embedding.csv'

emb = tf.Variable(np.genfromtxt(wordemb), name='word_emb')

with tf.Session() as sess:
    saver = tf.train.Saver([emb])
    sess.run(emb.initializer)
    saver.save(sess, os.path.join(LOG_DIR, 'emb.ckpt'))

    config = projector.ProjectorConfig()
    # One can add multiple embeddings.
    embedding = config.embeddings.add()
    embedding.tensor_name = emb.name
    # Link this tensor to its metadata file (e.g. labels).
    embedding.metadata_path = metadata
    # Saves a config file that TensorBoard will read during startup.
    projector.visualize_embeddings(tf.summary.FileWriter(LOG_DIR), config)
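Once the checkpoint and projector config are written, the embedding can be inspected in the TensorBoard projector. The note below is a sketch and assumes TensorBoard is installed alongside this TensorFlow version.

# To view the saved embedding, run from a shell (assumption: tensorboard is on PATH):
#   tensorboard --logdir <value of LOG_DIR above>
# The projector tab reads the config written by visualize_embeddings from LOG_DIR and
# pairs each embedding row with the corresponding line of word_metadata.tsv.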
def __init__(self):
    self.dataDir = set_dir.Directory('TR').data_path
    self.vocabDir = set_dir.Directory('TR').vocab_path
    self.gloveDict = set_dir.Directory('TR').glove_path
    self.config = set_params.ParamsClass('TR')
def util(self):
    training_file = set_dir.Directory('TR').data_filename
    self.generate_vocab(training_file)
    vocab_size = self.extract_glove_vectors(set_dir.Directory('TR').word_vocab_dict, self.gloveDict)
    return vocab_size
import sys

from global_module.settings_module import set_dir

# root_folder =
dir_obj = set_dir.Directory('TE')

cost_file = open(dir_obj.test_cost_path, 'r')
output = open(dir_obj.test_pred_path, 'w')
output_seq = open(dir_obj.test_seq_op_path, 'w')
test_seq_file = open(dir_obj.data_filename, 'r')

count_iter = 1
min_pos = -1
step_val = 6
pred_ans = ''
pred_goal = ''
pred_slot = ''
multiplier = -1.0  # -1.0 if the file holds probabilities, 1.0 if it holds costs
min_cost = sys.float_info.max

for costLine, pred_line in zip(cost_file, test_seq_file):
    costLine = costLine.rstrip()
    ans = pred_line.rstrip()
    cost = multiplier * float(costLine)

    if count_iter < step_val:
        if min_cost > cost:
            min_cost = cost
            min_pos = count_iter
            pred_ans = ans
def get_imagenet_class():
    class_file = open(set_dir.Directory().imagenet_class_dir, 'r')
    class_arr = class_file.readlines()
    class_file.close()
    return class_arr
def run_train(dict_obj):
    mode_train, mode_valid, mode_all = 'TR', 'VA', 'ALL'

    # train object
    params_train = set_params.ParamsClass(mode=mode_train)
    dir_train = set_dir.Directory(mode_train)
    params_train.num_instances, params_train.indices = get_length(dir_train.data_filename)

    # valid object
    params_valid = set_params.ParamsClass(mode=mode_valid)
    dir_valid = set_dir.Directory(mode_valid)
    params_valid.num_instances, params_valid.indices = get_length(dir_valid.data_filename)

    params_train.num_classes = params_valid.num_classes = len(dict_obj.label_dict)

    if params_train.enable_shuffle:
        random.shuffle(params_train.indices)
        random.shuffle(params_valid.indices)

    min_loss = sys.float_info.max

    word_emb_path = dir_train.word_embedding
    word_emb_matrix = np.float32(np.genfromtxt(word_emb_path, delimiter=' '))
    params_train.vocab_size = params_valid.vocab_size = len(word_emb_matrix)

    print('***** INITIALIZING TF GRAPH *****')

    timestamp = str(int(time.time()))
    train_out_dir = os.path.abspath(os.path.join(dir_train.log_path, "train", timestamp))
    valid_out_dir = os.path.abspath(os.path.join(dir_train.log_path, "valid", timestamp))
    print("Writing to {}\n".format(train_out_dir))

    with tf.Graph().as_default(), tf.Session() as session:
        # random_normal_initializer = tf.random_normal_initializer()
        # random_uniform_initializer = tf.random_uniform_initializer(-params_train.init_scale, params_train.init_scale)
        xavier_initializer = tf.contrib.layers.xavier_initializer(uniform=True, seed=None, dtype=tf.float32)

        with tf.variable_scope("classifier", reuse=None, initializer=xavier_initializer):
            train_obj = model.DeepAttentionClassifier(params_train, dir_train)

        train_writer = tf.summary.FileWriter(train_out_dir, session.graph)
        valid_writer = tf.summary.FileWriter(valid_out_dir)

        if not params_train.enable_checkpoint:
            session.run(tf.global_variables_initializer())

        if params_train.enable_checkpoint:
            ckpt = tf.train.get_checkpoint_state(dir_train.model_path)
            if ckpt and ckpt.model_checkpoint_path:
                print("Loading model from: %s" % ckpt.model_checkpoint_path)
                tf.train.Saver().restore(session, ckpt.model_checkpoint_path)
        elif not params_train.use_random_initializer:
            session.run(tf.assign(train_obj.word_emb_matrix, word_emb_matrix, name="word_embedding_matrix"))

        with tf.variable_scope("classifier", reuse=True, initializer=xavier_initializer):
            valid_obj = model.DeepAttentionClassifier(params_valid, dir_valid)

        print('**** TF GRAPH INITIALIZED ****')

        start_time = time.time()
        for i in range(params_train.max_max_epoch):
            lr_decay = params_train.lr_decay ** max(i - params_train.max_epoch, 0.0)
            train_obj.assign_lr(session, params_train.learning_rate * lr_decay)
            # print(params_train.learning_rate * lr_decay)

            print('\n++++++++=========+++++++\n')
            print("Epoch: %d Learning rate: %.5f" % (i + 1, session.run(train_obj.lr)))

            train_loss, _ = run_epoch(session, train_writer, train_obj.train_op, min_loss, train_obj, dict_obj, i, verbose=True)
            print("Epoch: %d Train loss: %.3f" % (i + 1, train_loss))

            valid_loss, curr_loss = run_epoch(session, valid_writer, tf.no_op(), min_loss, valid_obj, dict_obj, i)
            if curr_loss < min_loss:
                min_loss = curr_loss
            print("Epoch: %d Valid loss: %.3f" % (i + 1, valid_loss))

            curr_time = time.time()
            print('1 epoch run takes ' + str(((curr_time - start_time) / (i + 1)) / 60) + ' minutes.')

        train_writer.close()
        valid_writer.close()
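A minimal way to kick off training is sketched below; it is an assumption, not part of the original script, and it presumes set_dict is imported here the same way it is in the test path (init_test) and that the vocabulary and label-map pickles already exist.

if __name__ == '__main__':
    # Hypothetical entry point: build the shared dictionary and start training.
    run_train(set_dict.Dictionary())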
def util():
    raw_training_file = set_dir.Directory('TR').raw_train_path
    training_file = set_dir.Directory('TR').data_filename
    sample_train_file(raw_training_file, training_file, set_params.ParamsClass().sampling_threshold)
def extract_glove_vectors(word_vocab_file, glove_file):
    glove_vocab_dict = pickle.load(open(glove_file, 'rb'))
    word_vocab_dict = pickle.load(open(word_vocab_file, 'rb'))

    length_word_vector = 0
    glove_present_training_word_vocab_dict = collections.OrderedDict()
    glove_present_training_word_counter = 2  # 3
    # glove_present_training_word_counter = 1
    glove_present_word_vector_dict = collections.OrderedDict()

    glove_present_training_word_vocab_dict['PAD'] = 0
    glove_present_training_word_vocab_dict['UNK'] = 1  # 2
    glove_present_word_vector_dict[1] = glove_vocab_dict['food']

    if length_word_vector == 0:
        length_word_vector = len(glove_vocab_dict['food'].split(' '))

    for key, value in word_vocab_dict.items():
        if config.all_lowercase:
            if key.lower() in glove_vocab_dict:
                key = key.lower()
            elif key in glove_vocab_dict:
                key = key
            elif key.title() in glove_vocab_dict:
                key = key.title()
            elif key.upper() in glove_vocab_dict:
                key = key.upper()
            else:
                key = key.lower()

        if key not in glove_present_training_word_vocab_dict:
            if config.use_unknown_word:
                if key in glove_vocab_dict and not config.use_random_initializer:
                    if key != 'UNK':
                        glove_present_training_word_vocab_dict[key] = glove_present_training_word_counter
                        glove_present_word_vector_dict[glove_present_training_word_counter] = glove_vocab_dict.get(key)
                        glove_present_training_word_counter += 1
                else:
                    glove_present_training_word_vocab_dict[key] = glove_present_training_word_counter
                    vec_str = ''
                    for i in range(length_word_vector):
                        vec_str += str(round(random.uniform(-0.9, 0.9), 6)) + ' '
                    glove_present_word_vector_dict[glove_present_training_word_counter] = vec_str.strip()
                    glove_present_training_word_counter += 1
            elif key in glove_vocab_dict and not config.use_random_initializer and not config.use_unknown_word:
                if key != 'UNK':
                    glove_present_training_word_vocab_dict[key] = glove_present_training_word_counter
                    glove_present_word_vector_dict[glove_present_training_word_counter] = glove_vocab_dict.get(key)
                    glove_present_training_word_counter += 1
            elif config.use_random_initializer:
                glove_present_training_word_vocab_dict[key] = glove_present_training_word_counter
                glove_present_word_vector_dict[glove_present_training_word_counter] = glove_vocab_dict.get('UNK')
                glove_present_training_word_counter += 1
            # else:
            #     print('Error')

    word_vector_file = open(set_dir.Directory('TR').word_embedding, 'w')
    writer = csv.writer(word_vector_file)

    string = ''
    for i in range(length_word_vector):
        string += '0 '
    word_vector_file.write(string.rstrip(' ') + '\n')
    # word_vector_file.write(string.rstrip(' ') + '\n')  # zeros vector (id 1)

    for key, value in glove_present_word_vector_dict.items():
        writer.writerow([value])

    glove_present_training_word_vocab = open(set_dir.Directory('TR').glove_present_training_word_vocab, 'wb')
    pickle.dump(glove_present_training_word_vocab_dict, glove_present_training_word_vocab)

    print(glove_present_training_word_vocab_dict)
    print('Glove_present_unique_training_tokens, Total unique tokens, Glove token size')
    print(len(glove_present_training_word_vocab_dict), len(word_vocab_dict), len(glove_vocab_dict))

    word_vector_file.close()

    print('\nVocab Size:')
    # print(len(glove_present_word_vector_dict)+2)
    print(len(glove_present_training_word_vocab_dict))

    glove_present_training_word_vocab.close()
    # return(len(glove_present_word_vector_dict)+2)

    #####
    # WORD METADATA
    #####
    meta_file = open(dirObj.word_emb_tsv, 'w')
    # meta_file.write('Word' + '\t' + 'Id' + '\n')
    for key, value in glove_present_training_word_vocab_dict.items():
        # meta_file.write(key + '\t' + str(value) + '\n')
        meta_file.write(key + '\n')
    meta_file.close()
    #####

    return len(glove_present_word_vector_dict) + 1
def extract_glove_vectors(self, word_vocab_file, glove_file):
    glove_vocab_dict = cPickle.load(open(glove_file, 'rb'))
    word_vocab_dict = cPickle.load(open(word_vocab_file, 'rb'))

    length_word_vector = 0
    glove_present_training_word_vocab_dict = {}
    glove_present_training_word_counter = 2  # 3
    # glove_present_training_word_counter = 1
    glove_present_word_vector_dict = {}

    glove_present_training_word_vocab_dict['UNK'] = 1  # 2
    glove_present_word_vector_dict[1] = glove_vocab_dict.get('UNK')

    if length_word_vector == 0:
        length_word_vector = len(glove_vocab_dict.get('the').split(' '))

    for key, value in word_vocab_dict.items():
        if self.config.all_lowercase:
            if key.lower() in glove_vocab_dict:
                key = key.lower()
            elif key in glove_vocab_dict:
                key = key
            elif key.title() in glove_vocab_dict:
                key = key.title()
            elif key.upper() in glove_vocab_dict:
                key = key.upper()
            else:
                key = key.lower()

        if key not in glove_present_training_word_vocab_dict:
            if self.config.use_unknown_word:
                if key in glove_vocab_dict and not self.config.use_random_initializer:
                    if key != 'UNK':
                        glove_present_training_word_vocab_dict[key] = glove_present_training_word_counter
                        glove_present_word_vector_dict[glove_present_training_word_counter] = glove_vocab_dict.get(key)
                        glove_present_training_word_counter += 1
                else:
                    glove_present_training_word_vocab_dict[key] = glove_present_training_word_counter
                    vec_str = ''
                    for i in range(length_word_vector):
                        vec_str += str(round(random.uniform(-0.9, 0.9), 6)) + ' '
                    glove_present_word_vector_dict[glove_present_training_word_counter] = vec_str.strip()
                    glove_present_training_word_counter += 1
            elif key in glove_vocab_dict and not self.config.use_random_initializer and not self.config.use_unknown_word:
                if key != 'UNK':
                    glove_present_training_word_vocab_dict[key] = glove_present_training_word_counter
                    glove_present_word_vector_dict[glove_present_training_word_counter] = glove_vocab_dict.get(key)
                    glove_present_training_word_counter += 1
            elif self.config.use_random_initializer:
                glove_present_training_word_vocab_dict[key] = glove_present_training_word_counter
                glove_present_word_vector_dict[glove_present_training_word_counter] = glove_vocab_dict.get('UNK')
                glove_present_training_word_counter += 1
            # else:
            #     print('Error')

    word_vector_file = open(set_dir.Directory('TR').word_embedding, 'w')
    writer = csv.writer(word_vector_file)

    string = ''
    for i in range(length_word_vector):
        string += '0 '
    word_vector_file.write(string.rstrip(' ') + '\n')
    # word_vector_file.write(string.rstrip(' ') + '\n')  # zeros vector (id 1)

    for key, value in glove_present_word_vector_dict.items():
        writer.writerow([value])

    glove_present_training_word_vocab = open(set_dir.Directory('TR').glove_present_training_word_vocab, 'wb')
    pickle.dump(glove_present_training_word_vocab_dict, glove_present_training_word_vocab, protocol=cPickle.HIGHEST_PROTOCOL)

    print(glove_present_training_word_vocab_dict)
    print('Glove_present_unique_training_tokens, Total unique tokens, Glove token size')
    print(len(glove_present_word_vector_dict), len(word_vocab_dict), len(glove_vocab_dict))

    word_vector_file.close()

    print('\nVocab Size:')
    # print(len(glove_present_word_vector_dict)+2)
    print(len(glove_present_word_vector_dict) + 1)

    glove_present_training_word_vocab.close()
    # return(len(glove_present_word_vector_dict)+2)
    return len(glove_present_word_vector_dict) + 1
def __init__(self):
    self.glove_dict = cPickle.load(open(set_dir.Directory('TR').glove_path, 'rb'))
    self.config = set_params.ParamsClass('TR')
# id = 1 for unknown words
# word_vocab.pkl -> map of unique training words and ids
# glove_present_training_word_vocab.pkl -> map of unique training words that are present in glove data and their new ids
# word_embedding.csv -> word embedding corresponding to glove_present_words

import collections
import csv
import pickle
import random
import re

from global_module.settings_module import set_dir, set_params

dirObj = set_dir.Directory('TR')
dataDir = dirObj.data_path
vocabDir = dirObj.vocab_path
gloveDict = dirObj.glove_path
config = set_params.ParamsClass('TR')


def generate_vocab(training_file):
    word_dict = collections.OrderedDict()
    word_counter = 2
    max_sequence_length = 0
    training_file_pointer = open(training_file, 'r')
    print('\nReading Training File .... ')
# id = 0 for padding
# id = 1 for unknown words
# word_vocab.pkl -> map of unique training words and ids
# glove_present_training_word_vocab.pkl -> map of unique training words that are present in glove data and their new ids
# word_embedding.csv -> word embedding corresponding to glove_present_words

import cPickle
import csv
import pickle
import random
import re

from global_module.settings_module import set_dir, set_params

dataDir = set_dir.Directory('TR').data_path
vocabDir = set_dir.Directory('TR').vocab_path
gloveDict = set_dir.Directory('TR').glove_path
config = set_params.ParamsClass('TR')


def generate_vocab(training_file):
    word_dict = {}
    word_counter = 2
    max_sequence_length = 0
    training_file_pointer = open(training_file, 'r')
    print('\nReading Training File .... ')
    tokenized_training_file = open(dataDir + '/tokenized_training', 'w')
import cPickle as pickle

from global_module.settings_module import set_dir

rel_dir = set_dir.Directory('TR')


def convert(test_filename):
    label_dict = pickle.load(open(rel_dir.label_map_dict, 'rb'))
    test_file = open(test_filename, 'r')
    op_file = open(test_filename + '_output.txt', 'w')

    # invert the label map: mapped id -> original label
    new_map = {}
    for actual_id, mapped_id in label_dict.items():
        new_map[mapped_id] = actual_id

    for line in test_file:
        line = line.strip()
        op_file.write(new_map[int(line) - 1] + '\n')

    op_file.close()
    test_file.close()

# convert('/home/aykumar/aykumar_home/self/deep-text-classifier/global_module/utility_dir/folder1/output/dummy_rnn.txt')
def main():
    params = set_params.ParamsClass(mode='TR')
    dir_obj = set_dir.Directory('TR')
    classifier_obj = DeepAttentionClassifier(params, dir_obj)
import cPickle
import re

from global_module.settings_module import set_dir, set_params

glove_dict = cPickle.load(open(set_dir.Directory('TR').glove_path, 'rb'))
config = set_params.ParamsClass('TR')


def sample_train_file(raw_training_file, training_file, threshold):
    raw_training_file_pointer = open(raw_training_file, 'r')
    training_file_pointer = open(training_file, 'w')
    word_dict = {}

    print('\nReading raw training file .... ')

    for line in raw_training_file_pointer:
        line = line.rstrip()
        # line = line.lower()
        string = re.split(r'\t', line)
        size = len(string)
        tokenized_training_string = ''
        for j in range(size):
            # string[j] = re.sub(r'#[0-9]+', r'', string[j].strip())
            tokenized_sent = string[j].split(" ")
            tokenized_string = ' '.join(tokenized_sent)
            tokenized_training_string += tokenized_string + '\t'
            for token in tokenized_sent:
                if token not in word_dict:
def main():
    training_file = set_dir.Directory('TR').data_filename
    word_dict = generate_vocab(training_file)
    vocab_size = extract_glove_vectors(set_dir.Directory('TR').word_vocab_dict, gloveDict)
    return vocab_size
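A hedged example of invoking the vocabulary and embedding preprocessing as a script follows; it assumes the 'TR' paths configured in set_dir exist and that the GloVe pickle has already been built.

if __name__ == '__main__':
    vocab_size = main()
    # ids 0 and 1 are reserved for PAD and UNK in the written embedding file
    print('Final vocabulary size: %d' % vocab_size)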