Example 1
def createName(self, title, currentPage, maxPage, extension=".png"):
    # zero-pad the page number to the digit width of maxPage
    page = utility.getZeroFillNumberString(currentPage, maxPage)
    # strip spaces so the title is safe to use as a directory name
    title = re.sub(" ", "", title)
    name = os.path.join(self.destination, title,
                        f"{self.prefix}{page}{self.suffix}{extension}")
    # create the per-title directory before handing the path back
    utility.makeDir(os.path.split(name)[0])
    return name
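Every example on this page leans on the same small utility module. Its source isn't shown here, but the call sites pin down the behavior: getZeroFillNumberString zero-pads a counter to the digit width of a maximum, and makeDir creates a directory tree and returns the path (Example 6 assigns its return value). A minimal sketch consistent with those call sites, not the module's actual code:

import os

def getZeroFillNumberString(current, maximum):
    # pad `current` with zeros to the digit width of `maximum`,
    # e.g. (7, 250) -> "007"
    return str(current).zfill(len(str(maximum)))

def makeDir(path):
    # create `path` (parents included) if it is missing and return it,
    # so callers can assign the result as Example 6 does
    os.makedirs(path, exist_ok=True)
    return path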
Example 2
def createLogDirectories(log_dir):
	# the unix timestamp gives each run its own directory under log_dir
	timestamp = int(time.time())
	main_dir = os.path.join(log_dir, str(timestamp))
	train_log_dir = os.path.join(main_dir, "train")
	checkpoint_dir = os.path.join(main_dir, "checkpoints")
	utility.makeDir(train_log_dir)
	utility.makeDir(checkpoint_dir)
	return {"main_dir": main_dir, "train_log_dir": train_log_dir, "checkpoint_dir": checkpoint_dir}
Example 3
def generateLogDirectories(log_directory):
	summary_directory = os.path.join(log_directory, "summaries")
	train_log_directory = os.path.join(summary_directory, "train")
	valid_log_directory = os.path.join(summary_directory, "valid")
	train_model_directory = os.path.join(log_directory, "models")
	# checkpoint path prefix for tf.train.Saver; the saver writes the
	# actual files, so only the directories are created here
	train_model_file = os.path.join(train_model_directory, 'checkpoint')
	utility.makeDir(train_log_directory)
	utility.makeDir(train_model_directory)
	utility.makeDir(valid_log_directory)
	return {
			'summary_directory': summary_directory,
			'train_log_directory': train_log_directory,
			'valid_log_directory': valid_log_directory,
			'train_model_directory': train_model_directory,
			'train_model_file': train_model_file
			}
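How the returned paths are typically consumed (a sketch assuming a TF1 graph, a tf.train.Saver named saver, and a summary already evaluated into summary; note that train_model_file is a checkpoint prefix to which the saver appends the global step):

paths = generateLogDirectories("classifier_runs/1714070000")
writer = tf.summary.FileWriter(paths['train_log_directory'], graph=graph)
# inside the training loop:
writer.add_summary(summary, step)
saver.save(session, paths['train_model_file'], global_step=step)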
Example 4
def executeTraining(train_dataset_merged, valid_dataset_merged, num_epochs, batch_size, vocabulary_size, embedding_size, num_labels, hidden_size,
					summary_frequency, embeddings_file, log_directory, num_checkpoints=5):
	graph = tf.Graph()
	with graph.as_default():
		tg = TrainingGraph(vocabulary_size, embedding_size, num_labels, hidden_size)
		precision_tf = tf.placeholder(shape=[], dtype=tf.float32, name='precision')
		recall_tf = tf.placeholder(shape=[], dtype=tf.float32, name='recall')
		f1_tf = tf.placeholder(shape=[], dtype=tf.float32, name='f1')
		# the summary ops must live in the same graph as the placeholders
		# they read, so they are built inside the graph context
		precision_summary = tf.summary.scalar('precision_summary', precision_tf)
		recall_summary = tf.summary.scalar('recall_summary', recall_tf)
		f1_summary = tf.summary.scalar('f1_summary', f1_tf)
		stat_summary = tf.summary.merge([precision_summary, recall_summary, f1_summary])
	train_batch = BatchGenerator(train_dataset_merged, batch_size, num_labels)
	valid_batch = BatchGenerator(valid_dataset_merged, len(valid_dataset_merged), num_labels)
	stat_dict = {}

	summary_directory = os.path.join(log_directory,"summaries")
	train_log_directory = os.path.join(summary_directory,"train")
	valid_log_directory = os.path.join(summary_directory, "valid")
	train_model_directory = os.path.join(log_directory,"models")
	train_model_file = os.path.join(train_model_directory, 'checkpoint')
	utility.makeDir(train_log_directory)
	utility.makeDir(train_model_directory)
	utility.makeDir(valid_log_directory)
	train_summary_writer = tf.summary.FileWriter(train_log_directory, graph = graph)
	
	num_iters = (len(train_dataset_merged) // batch_size) * num_epochs
	feed_dict={}
	embeddings = utility.loadEmbeddings(embeddings_file)
	feed_dict[tg.embeddings] = embeddings
	print("Will take {} iters".format(num_iters))
	# accumulated loss of the system as a whole
	overall_avg_loss = 0.0
	with tf.Session(graph=graph) as session:
		session.run(tf.global_variables_initializer())
		saver = tf.train.Saver(tf.global_variables(),max_to_keep = num_checkpoints)
		for i in range(num_iters):
			batch = train_batch.next_batch()
			feed_dict[tg.inp_x], lbls_batch = createInpOutListsFromBatch(batch)
			feed_dict[tg.labels] = np.transpose(lbls_batch,[1,0])
			num_classifiers_to_test = len(tg.classifiers)

			# store each classifier's predictions; losses are summed so the
			# batch loss covers the system as a whole
			classifier_ops = []
			# total loss across all classifiers for this batch
			net_loss = 0.0
			train_summary = None
			for j in range(num_classifiers_to_test):
				if j == num_classifiers_to_test - 1:
					# fetch the merged graph summaries along with the last classifier
					cl, prediction, _, train_summary = session.run([tg.classifiers[j].loss, tg.classifiers[j].prediction, tg.classifiers[j].optimizer, tg.all_summaries], feed_dict=feed_dict)
				else:
					cl, prediction, _ = session.run([tg.classifiers[j].loss, tg.classifiers[j].prediction, tg.classifiers[j].optimizer], feed_dict=feed_dict)
				classifier_ops.append(prediction)
				# accumulate inside the loop so every classifier's loss counts,
				# not just the last one
				net_loss += cl
			overall_avg_loss += net_loss
			# classifier_ops is num_classifiers x batch_size x 1;
			# transpose it to batch_size x num_classifiers x 1
			classifier_ops = np.transpose(classifier_ops, [1, 0, 2])
			# store the one-hot vectors in a separate list to drop the third
			# dimension while generating them: a concatenate for free, at the
			# cost of extra memory
			pred_hot_vec = []
			for j in range(len(classifier_ops)):
				pred_hot_vec.append(hotEncodeDistribution(classifier_ops[j]))
			# memory cleanup, a bit aggressive
			del classifier_ops
			train_summary_writer.add_summary(train_summary, i)
			f1, precision, recall = get_accuracy(pred_hot_vec, lbls_batch)
			stat_dict[precision_tf] = precision
			stat_dict[recall_tf] = recall
			stat_dict[f1_tf] = f1
			pre, rec, ef1, ss = session.run([precision_tf, recall_tf, f1_tf, stat_summary],feed_dict = stat_dict)
			train_summary_writer.add_summary(ss,i)

			print("step {}/{}: loss: {}, f1:{}".format(i,num_iters,net_loss, f1))
			if i%summary_frequency==0:
				save_loc = saver.save(session, train_model_file, global_step = i)
				print("Saving model at {}".format(save_loc))
				print((len(pred_hot_vec),len(pred_hot_vec[0])), (len(lbls_batch),len(lbls_batch[0])))
				for j in range(len(pred_hot_vec)):
					print(pred_hot_vec[j],"--",lbls_batch[j][:num_classifiers_to_test])
					print()
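The directory block near the top of executeTraining repeats Example 3's generateLogDirectories line for line; if both live in the same module, those eight lines could collapse to the following (same behavior, just reusing the helper):

paths = generateLogDirectories(log_directory)
train_model_file = paths['train_model_file']
train_summary_writer = tf.summary.FileWriter(paths['train_log_directory'], graph=graph)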
Example 5
parser = argparse.ArgumentParser()
parser.add_argument("--embedding-file",help="embeddings txt file to read from", required=True)
args = parser.parse_args()

res = loadDataset("../data/nodes.csv","../data/groups.csv","../data/group-edges.csv")
dataset = res['node2labels']
nodes=res['nodes']
labels = res['labels']
split_ratio = 0.75
# shuffle, then hold out the last 25% of the examples for validation
random.shuffle(dataset)
splitBorder = int(len(dataset) * split_ratio)
train_dataset = dataset[:splitBorder]
valid_dataset = dataset[splitBorder:]
train_dataset_merged = collectNodesAndLabels(train_dataset)
valid_dataset_merged = collectNodesAndLabels(valid_dataset)
num_epochs = 1
batch_size = 5
hidden_size = 50
embedding_size = 128
summary_frequency = 10
timestamp = int(time.time())
log_directory = os.path.join("classifier_runs",str(timestamp))
utility.makeDir(log_directory)
write_metadata = os.path.join(log_directory,"metadata.txt")
writeMeta(write_metadata, args.embedding_file, hidden_size)
executeTraining(train_dataset_merged, valid_dataset_merged, num_epochs, batch_size, len(nodes), embedding_size, len(labels), hidden_size, 
					summary_frequency, args.embedding_file, log_directory)
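writeMeta isn't shown on this page. Example 5 calls it with positional values and Example 7 with keyword arguments, so a sketch reconciling both call sites might look like this (an assumption, not the project's actual implementation):

def writeMeta(path, *values, **named):
    # dump whatever configuration the caller passes, one item per line,
    # so the run directory documents how it was produced
    with open(path, "w") as f:
        for value in values:
            f.write("{}\n".format(value))
        for key, value in named.items():
            f.write("{}: {}\n".format(key, value))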
Example 6
def setDestination(self, entry, dest):
    # default to the source path; a non-empty dest overrides it
    self.destination = self.path
    if dest != "":
        # makeDir returns the path it created, so assign the result
        self.destination = utility.makeDir(dest)
Example 7
	configfile = os.path.join(args.model_directory,'classify_config.txt')
	config_dict = ast.literal_eval(readFile(configfile))
	config = utility.ConfigProvider()
	config.setDict(config_dict)
	embedding_size = config.getOption('embedding_size')
	
	model_directory = os.path.join(args.model_directory,'models')
	timestamp = int(time.time())
	log_dir = os.path.join(args.log_dir,str(timestamp))
	eval_log_dir = os.path.join(log_dir,'eval_summaries')
	nodeToLabel = None
	
	if args.eval_file is not None:
		nodeToLabel = readLabelsFile(args.eval_file, args.eval_file_delim)
		utility.makeDir(eval_log_dir)
		meta_file_path = os.path.join(log_dir,"meta.txt")
		classifier.writeMeta(meta_file_path,num_nodes = vocabulary_size, num_labels=num_labels,hidden_size=hidden_size,
							 embedding_size=embedding_size,learned_from=args.model_directory,eval_file=args.eval_file)
	
	graph = tf.Graph()
	with graph.as_default():
		tg = classifier.TrainingGraph(vocabulary_size, embedding_size, num_labels, hidden_size)
		f1_tf = tf.placeholder(shape=[],name='f1',dtype=tf.float32)
		prec_tf = tf.placeholder(shape=[],name='precision',dtype=tf.float32)
		rec_tf =tf.placeholder(shape=[],name='recall',dtype=tf.float32)
		avg_f1_tf = tf.placeholder(shape=[],name='avg_f1',dtype=tf.float32)
		avg_prec_tf = tf.placeholder(shape=[],name='avg_prec',dtype=tf.float32)
		avg_rec_tf = tf.placeholder(shape=[],name='avg_rec',dtype=tf.float32)

		f1_summary = tf.summary.scalar("f1_summary",f1_tf)
Example 8
def createName(self, entry):
    # zero-pad the running counter to the digit width of self.num
    number = utility.getZeroFillNumberString(self.createCount, self.num)
    parent = os.path.basename(entry.parent)
    name = os.path.join(self.destination, parent,
                        f"{self.prefix}{parent}_{number}{self.suffix}{entry.suffix}")
    # create the per-parent directory before handing the path back
    utility.makeDir(os.path.split(name)[0])
    self.createCount += 1
    return name
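For concreteness, with hypothetical state destination='out', prefix='', suffix='', num=120, createCount=7 and an entry whose parent is 'pages/ch1' and whose suffix is '.png', a call on an instance obj would create out/ch1/ and return:

name = obj.createName(entry)
# -> 'out/ch1/ch1_007.png'; createCount is then bumped to 8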