def train(self, train_file, iteration):
    print(
        "________________________________________________________________________________________________train starts"
    )
    model = multi_class_perceptron()
    c = DataHelper()
    instances = []
    sentence_count = 0
    for sentence in c.read_sentence(train_file):
        sentence_count += 1
        for token in sentence:
            feature = token.feature_extracter(model.feature_constructor)
            # print feature
            instances.append(
                (feature, model.pos_constructor(token.gold_pos)))
    weights_statistics = model.weights_constructor()
    self.table_statistics.append([
        str(weights_statistics[1]),
        str(weights_statistics[0]),
        str(len(instances)),
        str(sentence_count)
    ])
    table = AsciiTable(self.table_statistics)
    print(table.table)
    for iter_round in range(iteration):
        start = time.time()
        for (feature, pos_tag) in instances:
            # print (feature, pos_tag)
            score = model.weight_scores(feature)
            # print score
            predicted_tag = model.predict(score)
            # print predicted_tag
            if predicted_tag != pos_tag:
                model.update(feature, pos_tag, predicted_tag)
        end = time.time()
        print 'Iteration' + '\t' + str(
            iter_round + 1) + '\t' + 'done.', " runs at:", end - start, "seconds"
        model_file = 'dumps\\model_' + str(iter_round + 1) + '.dump'
        model.save(model_file)
    print(
        "________________________________________________________________________________________________train ends"
    )
class BulkVisualizer(Visualizer):
    """Draws consecutive visualisations for consecutively created areas"""

    def __init__(self, area, algorithm, runs):
        """Initiate all elements necessary to run an algorithm consecutively
        and create consecutive visualizations for them.

        Keyword arguments:
        area -- the area that should be visualised
        algorithm -- the algorithm by which the given area is filled
        runs -- the amount of times the algorithm should be run and
                the amount of visualizations to be made
        """
        super().__init__(area, algorithm)
        self.originalArea = copy.deepcopy(area)
        self.originalAlgorithm = copy.copy(algorithm)
        self.runs = 0
        self.allTimeHigh = 0
        self.dataHelper = DataHelper()
        self.maxRuns = runs

    def on_render(self):
        """Runs and visualizes the algorithm"""
        while self.runs < self.maxRuns:
            # continue running until algorithm is done
            super().on_render()

            # track highest found area value
            if self.area.price > self.allTimeHigh:
                self.allTimeHigh = self.area.price

            # when a run is finished...
            if self.algorithm.isDone is True and self.runs < self.maxRuns:
                print('🎉🎉Run {} is complete! 🎉🎉'.format(self.runs))

                # ...save values to csv file
                self.dataHelper.writeArea(self.area)

                # ...restore to a fresh state (empty area)
                self.area = copy.deepcopy(self.originalArea)
                self.algorithm = copy.copy(self.originalAlgorithm)
                self.algorithm.area = self.area
                self.runs += 1

            if self.runs == self.maxRuns:
                print('✨✨ I successfully ran {} times! ✨✨'.format(
                    self.runs))
class NoDrawBulkVisualizer:
    """Runs an algorithm several times without visualization"""

    def __init__(self, area, algorithm, runs):
        """Initiate all elements necessary to run an algorithm consecutively
        without visualizations.

        Keyword arguments:
        area -- the area that should be visualised
        algorithm -- the algorithm by which the given area is filled
        runs -- the amount of times the algorithm should run
        """
        self.area = area
        self.algorithm = algorithm
        self.originalArea = copy.deepcopy(area)
        self.originalAlgorithm = copy.copy(algorithm)
        self.runs = 0
        self.dataHelper = DataHelper()
        self.maxRuns = runs

    def on_execute(self):
        """Executes an algorithm several times"""
        while self.runs < self.maxRuns:
            # continue running until algorithm is done
            while self.algorithm.isDone is False:
                self.algorithm.execute()
                self.dataHelper.writeCSVLine(self.area)

            # if algorithm completes one run, reset state
            if self.algorithm.isDone is True and self.runs < self.maxRuns:
                print('Run {} is complete! 🎉'.format(self.runs))

                # save area to csv
                self.dataHelper.writeArea(self.area)
                self.dataHelper = DataHelper()

                # reset area and algorithm
                self.area = copy.deepcopy(self.originalArea)
                self.algorithm = copy.copy(self.originalAlgorithm)
                self.algorithm.area = self.area
                self.runs += 1

            if self.runs == self.maxRuns:
                print('✨✨ I successfully ran {} '
                      'times! ✨✨'.format(self.runs))
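# Hypothetical usage sketch (not from the original sources): it combines the
# class above with the Area and RandomAlgorithm constructors used in main()
# further below; all argument values are placeholders.
area = Area()
algorithm = RandomAlgorithm(area, 12, 5, 3, 1, 1, True)
visualizer = NoDrawBulkVisualizer(area, algorithm, 10)
visualizer.on_execute()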
class Model_Performance():
    '''
    '''

    def __init__(self, checkpoint_path, model_name):
        self.chkpoint = checkpoint_path
        self.app = Application(load_path=checkpoint_path,
                               model_name=model_name)
        self.datahelper = DataHelper()

    def CrossArchTestFast(self, arch1, arch2):
        '''
        :param arch1: path to sqlite database
        :param arch2: path to sqlite database
        :return:
        '''
        pairs = self.datahelper.get_cross_archtecture_pair(arch1, arch2)
        result = []
        lables = []
        pool = mp.Pool(processes=10)
        start_time = datetime.now()
        pairs = pairs[0:10]
        for (source, target, label) in tqdm(pairs, desc="Sim Computing."):
            source_tree = source[-1]
            target_tree = target[-1]
            lables.append(label)
            result.append(
                pool.apply_async(self.app.similarity_tree, (
                    source_tree,
                    target_tree,
                )))
        pool.close()
        pool.join()
        predictions = []
        for res in result:
            predictions.append(res.get())
        time_cost = (datetime.now() - start_time).seconds
        fpr, tpr, thresholds = roc_curve(np.array(lables),
                                         np.array(predictions))
        logger.info("===> architecture info : %s vs %s" % (arch1, arch2))
        logger.info("===> time: %s" % (datetime.now()))
        logger.info("model: %s" % self.chkpoint)
        logger.info(
            "compute %d function pairs, which cost %d seconds; %f pairs per sec"
            % (len(pairs), time_cost, len(pairs) * 1.0 / time_cost))
        logger.info("predictions= %s" % json.dumps(predictions))
        logger.info("labels= %s" % json.dumps(lables))
        logger.info("fpr= %s" % json.dumps(fpr.tolist()))
        logger.info("tpr= %s" % json.dumps(tpr.tolist()))
        logger.info("thresholds= %s" % json.dumps(thresholds.tolist()))
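# Hypothetical usage sketch (not from the original sources): the checkpoint
# path, model name and the two architecture databases are placeholders.
perf = Model_Performance("checkpoints/best.pt", "treelstm")
perf.CrossArchTestFast("db/arm.sqlite", "db/x86.sqlite")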
class NoDrawVisualizer:
    """Runs an algorithm without visualization"""

    def __init__(self, area, algorithm):
        """Initiate all elements necessary to run an algorithm
        without visualization

        Keyword arguments:
        area -- the area that should be used in the algorithm
        algorithm -- the algorithm by which the given area is filled
        """
        self.area = area
        self.algorithm = algorithm
        self.dataHelper = DataHelper()

    def on_execute(self):
        """Starts and executes the algorithm"""
        # while algorithm is not done, run it
        while self.algorithm.isDone is False:
            self.algorithm.execute()
            # save area state between every step
            self.dataHelper.writeArea(self.area)
def viterbi_tagger(self, test_file):
    print "________________________________________________________________________________________________viterbi_tagger starts"
    c_3 = DataHelper("dataset\\train.col")
    stream_emission_matrix = gzip.open("dumps\\emission_matrix.dump", 'rb')
    emission_matrix = cPickle.load(stream_emission_matrix)
    stream_emission_matrix.close()
    stream_transition_matrix = gzip.open("dumps\\transition_matrix.dump",
                                         'rb')
    transition_matrix = cPickle.load(stream_transition_matrix)
    stream_transition_matrix.close()
    for x in transition_matrix:
        for p in transition_matrix[x]:
            if transition_matrix[x][p] > 0.2:
                print p, x, transition_matrix[x][p]
    sentence_count = 0
    word_count = 0
    output = open('dumps\\dev-predicted-viterbi.col', 'w')
    for sentence in c_3.read_sentence(test_file):
        observation = sentence.word_list()
        sentence_count += 1
        word_count += len(observation)
        # print observation
        states = sentence.tag_list()
        # print states
        # for word in observation:
        #     if word in emission_matrix:
        #         states = states + emission_matrix[word].keys()
        #     else:
        #         states = states + ['NN']
        states = list(set(states))
        # states.insert(0, '<S>')
        # start = time.time()
        prediction = self.viterbi_smoothing(observation, states,
                                            emission_matrix,
                                            transition_matrix)
        # end = time.time()
        # print 'Sentence '+str(sentence_count)+' at', end - start
        for i in range(len(prediction[0])):
            output.write('%s\t%s\n' % (prediction[1][i], prediction[0][i]))
        output.write('\n')
    output.close()
    Ctag = DataHelper("dataset\\test.col")
    TagSet = Ctag.tagSet(Ctag)
    self.table_statistics.append([
        str(emission_matrix.__len__()),
        str(len(TagSet)),
        str(word_count),
        str(sentence_count)
    ])
    table = AsciiTable(self.table_statistics)
    print(table.table)
    print "________________________________________________________________________________________________viterbi_tagger ends"
    Cgold = DataHelper("dataset\\test.col")
    GoldWordTagList = Cgold.Tokenize(Cgold)
    Cpred = DataHelper("dumps\\dev-predicted-viterbi.col")
    PredWordTagList = Cpred.Tokenize(Cpred)
    eval = Evaluation()
    per_tag = False
    f_measure = eval.Evaluate(per_tag, GoldWordTagList, PredWordTagList,
                              TagSet)
    print 'F-Measure Micro:' + '\t' + f_measure[0]
    print 'F-Measure Macro:' + '\t' + f_measure[1]
    print
    final_eval = Evaluation()
    f_per_tag = True
    per_tag_table = final_eval.Evaluate(f_per_tag, GoldWordTagList,
                                        PredWordTagList, TagSet)
    print per_tag_table
def tagger(self, filename, iteration):
    print "________________________________________________________________________________________________Perceptron tagger starts"
    for iter_round in range(iteration):
        model_file = 'dumps\\model_' + str(iter_round + 1) + '.dump'
        print 'Reading from file' + '\t' + model_file.split('\\')[1]
        model = multi_class_perceptron(model_file)
        c = DataHelper()
        output = open('dumps\\dev-predicted.col', 'w')
        for sentence in c.read_sentence(filename):
            for token in sentence:
                feature = token.feature_extracter(model.return_features)
                score = model.weight_scores(feature)
                predicted_tag = model.predict(score)
                pos_tag = model.pos_constructor(token.gold_pos)
                output.write(
                    '%s\t%s\n' %
                    (token.word, model.return_pos_reverse(predicted_tag)))
            output.write('\n')
        output.close()
        Cgold = DataHelper("dataset\\test.col")
        GoldWordTagList = Cgold.Tokenize(Cgold)
        Cpred = DataHelper("dumps\\dev-predicted.col")
        PredWordTagList = Cpred.Tokenize(Cpred)
        Ctag = DataHelper("dataset\\test.col")
        TagSet = Ctag.tagSet(Ctag)
        eval = Evaluation()
        per_tag = False
        f_measure = eval.Evaluate(per_tag, GoldWordTagList, PredWordTagList,
                                  TagSet)
        print 'F-Measure Micro:' + '\t' + f_measure[0]
        print 'F-Measure Macro:' + '\t' + f_measure[1]
        print
        final_eval = Evaluation()
        f_per_tag = True
        per_tag_table = final_eval.Evaluate(f_per_tag, GoldWordTagList,
                                            PredWordTagList, TagSet)
        print per_tag_table
    print "________________________________________________________________________________________________Perceptron tagger ends"
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

from network import SimpleNet, SimpleRNN
from datahelper import DataHelper

x = SimpleNet.x
cnn = SimpleNet.build_cnn(x, reuse=None, raw_out=True)
rnn = SimpleRNN.build_graph(cnn)
infer = tf.nn.softmax(rnn)
y = SimpleRNN.y

dh = DataHelper()
saver = tf.train.Saver()

# print('Recovering')
# saver.recover_last_checkpoints('../models')

with tf.Session() as sess:
    print('Restoring')
    saver.restore(sess, '../models/cnn-rnn-100.ckpt')

    while True:
        img, lbl = dh.get_next_example(split_channels=True)
        _y = sess.run(infer, feed_dict={x: img})
        print(f'Label:{lbl}\nGuess:{_y}')
def main():
    """Present the user with choices about the algorithm to run"""
    # set some values
    placementOrder = None
    waterAmountChoice = None

    print("----------------------")
    print("WELCOME TO AMSTELHAEGE \n")

    # let the user choose a new or existing grid
    gridChoice = int(
        input('Do you want to load in a grid '
              'or start from scratch?\n'
              '1: Load a grid\n'
              '2: Start from scratch\n'
              'Your choice: '))
    print("")

    # let the user choose an algorithm
    algorithmChoice = int(
        input('What algorithm do you want to run?\n'
              '1: Random \n'
              '2: Greedy \n'
              '3: SpeedRandom\n'
              '4: HillClimbing\n'
              '5: Simulated Annealing\n'
              '6: Do Nothing (show only)\n'
              'Your choice: '))

    if algorithmChoice == 4 or algorithmChoice == 5:
        print("")
        totalIterations = int(
            input('How many steps should the algorithm make?\n'
                  'Your choice: '))

    if algorithmChoice == 5:
        print("")
        typeOfSimulatedAnnealing = int(
            input('What type of Simulated'
                  ' Annealing?\n'
                  '1: linear \n'
                  '2: exponential\n'
                  '3: sigmoidal\n'
                  'Your choice: '))

    if algorithmChoice == 5:
        print("")
        beginTemp = int(
            input('What is the begin temperature? (Try "50")\n'
                  'Your choice: '))
        print("")
        endTemp = int(
            input('What is the end temperature? (Try "0")\n'
                  'Your choice: '))
        print("")
        correctionShortening = int(
            input('What correction factor for'
                  ' shortening would you like'
                  ' to use? (Try "1000")\n'
                  'Your choice: '))
        print("")

    # let the user choose a type of visualization.
    # Normal visualizer renders a single visualisation
    # Bulk visualizer can render several maps after one another
    visualizerChoice = int(
        input('What visualizer do you want?\n'
              '1: Normal visualizer\n'
              '2: Bulk visualizer\n'
              '3: No-draw normal visualizer\n'
              '4: No-draw bulk visualizer\n'
              'Your choice: '))
    print("")

    isEmpty = True
    fhAmount = 0
    bAmount = 0
    mAmount = 0

    if gridChoice == 1:
        # ask the user for the file where the existing grid is stored
        fileName = str(input('Please provide a file name: \n'
                             'Your choice: '))
        area = DataHelper(fileName).getArea()
        isEmpty = False
    else:
        # or create a new grid
        area = Area()
        houseAmountChoice = int(
            input('How many houses do you want? \n'
                  '1: 20\n'
                  '2: 40\n'
                  '3: 60\n'
                  'Your choice: '))
        print("")

        # set the correct ratio of house types
        # for different amounts of houses
        if houseAmountChoice == 1:
            fhAmount = 12
            bAmount = 5
            mAmount = 3
        elif houseAmountChoice == 2:
            fhAmount = 24
            bAmount = 10
            mAmount = 6
        elif houseAmountChoice == 3:
            fhAmount = 36
            bAmount = 15
            mAmount = 9

    # if applicable for the algorithm chosen, provide further choices
    if algorithmChoice != 2 and gridChoice != 1:
        placementOrder = int(
            input('In what order do you want houses '
                  'to be placed on the map?\n'
                  '1: Random \n'
                  '2: First Mansions, then Bungalows, '
                  'then Family homes \n'
                  'Your choice: '))
        print("")

        waterAmountChoice = int(
            input('How many water areas'
                  ' do you want on the map? \n'
                  '1: 1 Area \n'
                  '2: 2 Area\'s \n'
                  '3: 3 Area\'s \n'
                  '4: 4 Area\'s \n'
                  '5: Random amount of Area\'s \n'
                  'Your choice: '))
        if waterAmountChoice == 5:
            waterAmountChoice = "Random"
        print("")

    # initiate the algorithm chosen by the user
    if algorithmChoice == 1:
        algorithm = RandomAlgorithm(area, fhAmount, bAmount, mAmount,
                                    placementOrder, waterAmountChoice,
                                    isEmpty)
    elif algorithmChoice == 2:
        algorithm = GreedyAlgorithm(area, fhAmount, bAmount, mAmount, isEmpty)
    elif algorithmChoice == 3:
        algorithm = SpeedRandomAlgorithm(area, fhAmount, bAmount, mAmount,
                                         placementOrder, waterAmountChoice,
                                         isEmpty)
    elif algorithmChoice == 4:
        algorithm = HillClimbingAlgorithm(area, fhAmount, bAmount, mAmount,
                                          placementOrder, waterAmountChoice,
                                          isEmpty, totalIterations)
    elif algorithmChoice == 5:
        algorithm = HillClimbingAlgorithm(area, fhAmount, bAmount, mAmount,
                                          placementOrder, waterAmountChoice,
                                          isEmpty, totalIterations, beginTemp,
                                          endTemp, typeOfSimulatedAnnealing,
                                          correctionShortening)
    elif algorithmChoice == 6:
        algorithm = Algorithm(area, fhAmount, bAmount, mAmount, isEmpty)

    # initiate the visualization requested by the user
    if visualizerChoice == 1:
        # enable downward graphing
        if algorithmChoice == 5:
            visualizer = Visualizer(area, algorithm, True)
        else:
            visualizer = Visualizer(area, algorithm, False)
    elif visualizerChoice == 2:
        runs = int(input('How many runs do you want to do? \n'
                         'Your choice: '))
        visualizer = BulkVisualizer(area, algorithm, runs)
    elif visualizerChoice == 3:
        visualizer = NoDrawVisualizer(area, algorithm)
    elif visualizerChoice == 4:
        runs = int(input('How many runs do you want to do? \n'
                         'Your choice: '))
        visualizer = NoDrawBulkVisualizer(area, algorithm, runs)

    # notify the user of the end of the menu
    print("Starting your Algorithm...")
    print("----------------------")

    visualizer.on_execute()
parser = argparse.ArgumentParser()
parser.add_argument("--more_news_times", type=int, default=3,
                    help="Number of times to click the 'more news' button")
parser.add_argument("--threading_num", type=int, default=3,
                    help="Number of threads")
parser.add_argument('-o', "--output_dir", type=str, default="../output")
parser.add_argument('-i', "--ip_list_file", type=str,
                    default="../input/alive_ip_list.txt")
parser.add_argument('-v', "--visited_url", type=str,
                    default="../input/visited_url.txt")
parser.add_argument('-u', "--unvisited_url", type=str,
                    default="../input/unvisited_url.txt")
parser.add_argument('-r', "--root_url", type=str,
                    default="../input/root_url.txt")
args = parser.parse_args()

datahelper = DataHelper()
spider = Spider(datahelper=datahelper, args=args)
spider.start()
def _train_network(net, eval_net):
    global params
    global x
    global y

    iters = tf.Variable(1, trainable=False)

    learning_rate = None
    if params['decay_steps']:
        learning_rate = tf.train.exponential_decay(
            params['start_learning_rate'], iters, params['decay_steps'],
            params['decay_base'])
    else:
        learning_rate = tf.Variable(params['start_learning_rate'],
                                    trainable=False)

    with tf.name_scope('loss'):
        #loss_weights = 1.003 - tf.reduce_max(y, axis=1)
        kl = lambda p, q: tf.losses.softmax_cross_entropy(
            p, q, reduction=tf.losses.Reduction.MEAN)
        hs_kl = lambda p, q: tf.multiply(0.5, tf.square(kl(p, q)))

        loss = tf.losses.softmax_cross_entropy(
            y, net, weights=1.0, reduction=tf.losses.Reduction.MEAN)
        #loss = tf.nn.softmax_cross_entropy_with_logits(logits=net,
        #                                               labels=y,
        #                                               weights=loss_weights,
        #                                               reduction=tf.losses.Reduction.MEAN)

    #optimizer = tf.train.MomentumOptimizer(learning_rate=learning_rate, momentum=params['momentum'])
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
    grads = optimizer.compute_gradients(loss)

    with tf.name_scope('clipping'):
        grads = [(tf.clip_by_value(grad, -1.5, 1.5), var)
                 for grad, var in grads]

    update = optimizer.apply_gradients(grads, global_step=iters)

    # with tf.name_scope('grads'):
    #     for grad, var in grads:
    #         tf.summary.histogram(f"{var.name.split(':')[0]}", grad)

    # with tf.name_scope('weights'):
    #     for grad, var in grads:
    #         tf.summary.histogram(f"{var.name.split(':')[0]}", var)

    learning_rate_reduce = params['learning_rate_reduce']

    # this should have a more general implementation, we chose 0 because
    # accuracy will grow as it improves
    top_result = 0.0

    dh = DataHelper(batch_size=params['batch_size'],
                    train_size=params['train_size'],
                    label_noise=params['label_noise'],
                    bands=params['bands'],
                    transform_func=eval(params['trans_func'])
                    if params['trans_func'] else None)

    with tf.name_scope('metrics'):
        evaluate.evaluate_tensorboard(eval_net, y)

    summaries = tf.summary.merge_all()
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    with tf.Session(config=config) as sess:
        if params['restore']:
            saver.restore(sess,
                          tf.train.latest_checkpoint(params['model_dir']))
        else:
            sess.run(init)

        trainWriter = tf.summary.FileWriter(params['tf_train_dir'],
                                            graph=sess.graph)
        testWriter = tf.summary.FileWriter(params['tf_test_dir'],
                                           graph=sess.graph)

        # run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        # run_metadata = tf.RunMetadata()
        run_options = None
        run_metadata = None

        top_result = 0
        while iters.eval() < params['iter_limit']:
            current_iter = iters.eval()
            if learning_rate_reduce and current_iter in learning_rate_reduce:
                sess.run(learning_rate.assign(learning_rate.eval() / 10))

            if params['print']:
                tf.logging.info(f"Training iter:{current_iter}")

            batch_xs, batch_ys = dh.get_next_batch(iter_based=True)
            batch = {x: batch_xs, y: batch_ys}
            sess.run(update, feed_dict=batch)

            if current_iter % 10 == 0:
                if params['print']:
                    tf.logging.info("Evaluating")
                s = sess.run(summaries, feed_dict=batch)
                trainWriter.add_summary(s, current_iter)

            if current_iter % 100 == 0:
                if params['print']:
                    tf.logging.info('Testing')

                batch_xs, batch_ys = dh.get_next_batch(force_test=True)
                batch[x] = batch_xs
                batch[y] = batch_ys

                s = sess.run(summaries, feed_dict=batch)
                testWriter.add_summary(s, current_iter)

                evals = evaluate.evaluate(sess, eval_net, x, y, batch_xs,
                                          batch_ys, params['test_progress'])

                if params['save_progress'] and evals[0] > top_result:
                    if params['print']:
                        tf.logging.info('Saving checkpoint')
                    model_path = os.path.join(params['model_dir'],
                                              'res-net.ckpt')
                    saver.save(sess, model_path, global_step=iters)
                    top_result = evals[0]

    # This needs to be printed so that the async trainer can see the result
    if params['rtrn_eval']:
        print(top_result)
        'name': 'mask',
        'value': Mask.RANDOM
    },
    # {'name': 'learning_rate', 'value': 0.01},
    # (parameters for NECTR)
    # {'name': 'nectr_n_hidden_layers', 'min': 1, 'max': 2, 'step': 1},
    # {'name': 'nectr_n_neurons', 'min': 5, 'max': 45, 'step': 10}]
    # {'name': 'nectr_poisson', 'value': True},
    # {'name': 'nectr_item_counts', 'value': True},
    # {'name': 'nectr_train_tf_on_solutions', 'value': False},
    # {'name': 'nectr_learning_rate', 'value': 0.1},
    # {'name': 'nectr_nn_regularization_type', 'value': 'l1'},
    # {'name': 'nectr_nn_regularization', 'type': 'exp', 'min': 1e-2, 'max': 1e-2},
    # {'name': 'nectr_lambda_completion', 'type': 'exp', 'min': 2e-2, 'max': 2e-0},
    {
        'name': 'nectr_n_epoch_completion',
        'min': 2,
        'max': 2,
        'step': 2
    }
]

# Setup DataHelper utils
DataHelper.setup(PATH_TO_DATA)

# Setup cross validation
cv = CrossValidation(datahelper=DataHelper, parameters=parameters)

# Run the cross validation pipeline
cv.run_pipeline(model, PATH_TO_RESULTS, plot=False)
def _train_network(net):
    global params
    global x
    global y

    iters = tf.Variable(0, trainable=False)

    learning_rate = None
    if params['decay_steps']:
        learning_rate = tf.train.exponential_decay(
            params['start_learning_rate'], iters, params['decay_steps'],
            params['decay_base'])
    else:
        learning_rate = tf.Variable(params['start_learning_rate'])

    cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(net, y))

    # find a way to parametrize the optimizer
    optimizer = tf.train.MomentumOptimizer(learning_rate, params['momentum'],
                                           params['nesterov'])
    optimize = optimizer.minimize(cost, global_step=iters)

    init = tf.initialize_all_variables()
    saver = tf.train.Saver()

    learning_rate_reduce = params['learning_rate_reduce']

    start = time.time()

    # this should have a more general implementation, we chose 0 because
    # accuracy will grow as it improves
    top_result = 0.0

    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(1, params['epoch_limit'] + 1):
            if params['print']:
                print epoch

            dh = DataHelper(batch_size=params['batch_size'],
                            train_size=params['train_size'],
                            label_noise=params['label_noise'],
                            bands=params['bands'],
                            transform_func=eval(params['trans_func'])
                            if params['trans_func'] else None)

            if learning_rate_reduce and epoch in learning_rate_reduce:
                sess.run(learning_rate.assign(learning_rate.eval() / 10.0))

            while dh.training:
                batch_xs, batch_ys = dh.get_next_batch()
                sess.run(optimize, feed_dict={x: batch_xs, y: batch_ys})

                if iters.eval() % 20 == 0:
                    evaluate.evaluate(sess, net, x, y, batch_xs, batch_ys,
                                      params['train_progress'])

            # testing
            batch_xs, batch_ys = dh.get_next_batch()
            results = evaluate.evaluate(sess, net, x, y, batch_xs, batch_ys,
                                        params['test_progress'])

            if params['save_progress'] and results[0] > top_result:
                if params['print']:
                    print 'Saving checkpoint'
                saver.save(sess, params['model_dir'], global_step=iters)
                top_result = results[0]

            if params['print']:
                print 'Epoch took {} seconds'.format(time.time() - start)

    if params['rtrn_eval']:
        print top_result
def p_drop(a, p):
    a[a < np.percentile(a, p)] *= 1e-3
    return a


scaler = MinMaxScaler()
#scaler = MinMaxScaler(feature_range=(-1, 1))

valid_img = lambda a: a.sum() > 0 and np.isfinite(a).sum() == np.prod(a.shape)
scale = lambda a: scaler.fit_transform(a.reshape(-1, 1)).reshape(84, 84).astype(np.float32)
mean_subtraction = lambda a: a - a.mean()
drop_percentile = lambda a, p: p_drop(a, p)
identity = lambda a: a
b_trans = lambda a: scale(drop_percentile(a, 50)) if valid_img(a) else a

dh = DataHelper(batch_size=batch_size, band_transform_func=b_trans)

epoch = 1
len_epoch = len(dh._train_imgs)
print(f'Epoch length = {len_epoch}')

summaries = tf.summary.merge_all()
saver = tf.train.Saver()

with tf.Session() as sess:
    if restore_file:
        saver.restore(sess, restore_file)
    else:
        sess.run(init)

    trainWriter = tf.summary.FileWriter('../report/tf-log/train',
                                        graph=sess.graph)
    testWriter = tf.summary.FileWriter('../report/tf-log/test',
                                       graph=sess.graph)
def _keys(self, fn):
    helper = DataHelper(fn)
    if not os.path.isfile('../data/middle/entity2id.txt'):
        helper.id2file()
    return list(helper.node2id.keys())
class GraphMes:
    def __init__(self, graph=None, file=None, start=0, ints=False):
        if graph == None and file != None:
            self.helper = DataHelper(file, NP=False)
            self.samples = self.helper.GetSamples()
            if ints == True:
                samples = []
                for i in self.samples:
                    samples.append([int(i[0]), int(i[1]), int(i[2])])
                self.samples = np.array(samples)
            self.G = self.readGraph(file, ints=ints)
            self.uG = self.readGraph(file, ints=ints, unweight=True)
        elif graph != None and file == None:
            self.G = graph
            self.uG = nx.Graph(self.G)
            self.samples = []
            for edge in self.G.edges():
                for i in self.G[edge[0]][edge[1]]:
                    self.samples.append([
                        edge[0], self.G[edge[0]][edge[1]][i]['attr'], edge[1]
                    ])
        else:
            raise Exception
        self.start = start
        self.node2id, self.id2node = self._node2id()
        self.edge2id, self.id2edge = self._edge2id()

    def readGraph(self, sf, ints=False, unweight=False):
        self.SamplesCnt = len(self.samples)
        if unweight == True:
            G = nx.Graph()
            for sample in self.samples:
                G.add_edge(sample[0], sample[2])
        else:
            G = nx.MultiDiGraph()
            for sample in self.samples:
                G.add_edge(sample[0], sample[2], attr=sample[1])
        return G

    def graph2id(self, of):
        with open(of, 'w') as f:
            for h, r, t in self.samples:
                f.write(str(self.node2id[h]) + ' ' + str(self.edge2id[r]) +
                        ' ' + str(self.node2id[t]) + '\n')

    def _node2id(self):
        node2id = dict()
        id2node = dict()
        index = 0
        for node in self.G.nodes():
            node2id.update({node: self.start + index})
            id2node.update({self.start + index: node})
            index += 1
        return node2id, id2node

    def _edge2id(self):
        edge2id = dict()
        id2edge = dict()
        self.attrs = set()
        for edge in self.G.edges():
            for i in self.G[edge[0]][edge[1]]:
                # print(self.G[edge[0]][edge[1]][i]['attr'])
                self.attrs.add(self.G[edge[0]][edge[1]][i]['attr'])
        index = 0
        for attr in self.attrs:
            edge2id.update({attr: self.start + index})
            id2edge.update({self.start + index: attr})
            index += 1
        return edge2id, id2edge

    def id2file(self, nodefn, edgefn):
        with open(nodefn, 'w') as nf:
            for i in range(len(self.node2id)):
                nf.write(self.id2node[i] + ' ' + str(i) + '\n')
        with open(edgefn, 'w') as ef:
            for i in range(len(self.edge2id)):
                ef.write(self.id2edge[i] + ' ' + str(i) + '\n')

    def _update_margin(self, searched, margin):
        margin_backup = copy.copy(margin)
        for i in margin_backup:
            for j in self.G.neighbors(i):
                if j not in searched:
                    margin.add(j)
        for i in margin_backup:
            margin.remove(i)
            searched.add(i)
        if len(margin) == 0:
            random_sampling = np.random.randint(0, len(self.nodes) - 1)
            while (random_sampling not in searched and len(margin) == 0):
                margin.add(random_sampling)
                random_sampling = np.random.randint(0, len(self.nodes) - 1)

    def cohesive(self, windowSize):
        all_bs = nx.eigenvector_centrality(self.uG)
        searched = set()
        margin = set()
        windows = set()
        center = np.random.randint(0, len(self.nodes) - 1)
        windows.add(center)
        margin.add(center)
        searched.add(center)
        while (len(windows) < windowSize):
            margin_bs = {}
            self._update_margin(searched, margin)
            for i in margin:
                margin_bs.update({i: all_bs[i]})
            margin_bs_sort = texthelper.sortDict(margin_bs, By="value",
                                                 reverse=True)
            for j in margin_bs_sort:
                windows.add(j[0])
                if len(windows) >= windowSize:
                    break
        return windows

    @property
    def nodes(self):
        return list(self.G.nodes)

    @property
    def nodeCnt(self):
        return len(self.G.nodes)

    @property
    def samplesCnt(self):
        return len(self.samples)

    @property
    def edges(self):
        return list(self.attrs)

    @property
    def edgeCnt(self):
        return len(self.attrs)
class Asteria():
    '''
    Functions:
    1. calculate the similarity between functions
    2. calculate the similarity between asts
    3. calculate the similarity between ast encodings
    '''

    def __init__(self, checkpoint_path, model_selector, cuda=False):
        #cuda = True
        self.dh = DataHelper()  # database reader
        self.checkpoint_path = checkpoint_path
        self.model_selector = model_selector
        self.cuda = cuda
        l.info("[I] Model Loading....")
        self.compute_app = Application(checkpoint_path, cuda=cuda,
                                       model_name=model_selector)
        l.info("[I] Model loaded...")

    def ast_encode_similarity(self, sources=[], targets=[], threshold=0):
        '''
        :param sources:list: source asts
        :param targets:list: target asts
        :return: dict: key is function_name, value is a dict:
            {'rank':[], 'info':(function_name, elf_path, elf_file_name,
                                caller, callee, ast_encode)}
        '''
        result = defaultdict(dict)
        for (function_name, elf_path, elf_name, scaller, scallee,
             ast_encode), _ in tqdm(sources):
            res = []
            pool = Pool(processes=cpu_count() - 2)
            for (tfunction_name, telf_path, telf_name, tcaller, tcallee,
                 tast_encode), _ in targets:
                if tast_encode is None:
                    print("%s encode does not exist" % tfunction_name)
                res.append((pool.apply_async(
                    self.compute_app.similarity_treeencoding_with_correction,
                    (json.loads(ast_encode), json.loads(tast_encode),
                     (scaller, scallee), (tcaller, tcallee))),
                            tfunction_name, telf_path, telf_name))
            pool.close()
            pool.join()
            similarity_list = []
            for r in res:
                sim = r[0].get()
                if sim >= threshold:
                    similarity_list.append(((r[1], r[2]), sim))
            similarity_list.sort(key=lambda x: x[1], reverse=True)  # sort by similarity
            result[function_name]['rank'] = similarity_list
            result[function_name]['info'] = (function_name, elf_path,
                                             telf_name)
        return result

    def prefilter(self, ast1, ast2):
        '''
        :param ast1:
        :param ast2:
        :return: if ast1 and ast2 are too different, return 1.
        '''
        c1 = ast1.num_children
        c2 = ast2.num_children
        if abs(c1 - c2) > 30:
            return 1
        if c1 / c2 > 3 or c2 / c1 > 3:
            return 1
        return 0

    def ast_similarity(self, sources=[], targets=[], astfilter=None,
                       threshold=0):
        '''
        :param sources: list: source asts
        :param targets: list: target asts
            func_info: [function_name, elf_path, elf_file_name, caller,
                        callee, ast_encode]
        :param astfilter: a filter function to filter out ast pairs which
            are too different.
        :return: dict: key {'rank':[], 'info':(function_name, elf_path,
            elf_file_name, caller, callee, ast_encode)}
        '''
        result = {}
        N = len(sources)
        i = 0
        TN = len(targets)
        astfilter = self.prefilter
        if astfilter:
            l.error("Filter Function is applied.")
        for s_func_info, s_ast in sources:
            i += 1
            result[s_func_info[0]] = {'rank': '', 'info': ''}
            res = []
            with tqdm(targets, desc="[%d] of %d" % (i, N),
                      dynamic_ncols=True) as t:
                for func_info, t_ast in t:
                    if astfilter and astfilter(s_ast, t_ast):
                        res.append([func_info, 0])
                    else:
                        res.append([
                            func_info,
                            self.compute_app.similarity_tree_with_correction(
                                s_ast, t_ast,
                                [s_func_info[-3], s_func_info[-2]],
                                [func_info[-3], func_info[-2]])
                        ])
            res = list(filter(lambda x: x[1] > threshold, res))
            res.sort(key=lambda x: x[1], reverse=True)  # sort by similarity
            result[s_func_info[0]]['rank'] = res
            result[s_func_info[0]]['info'] = s_func_info
        return result

    def db_similarity(self, source_db, target_db, ast, threshold,
                      start=-1, end=-1):
        '''
        :param source_db: ought to be the vulnerability database path
        :param target_db: firmware function database
        :param ast: True: compute similarity directly on the asts;
            False: compute similarity on the encoded vectors of the asts
        :param threshold: float: 0~1
        :param start/end: the position for select in sql limit
        :return:
        '''
        source_asts = []
        target_asts = []
        elf_names = set()
        where_suffix = " limit 0,20"
        # the number of vulnerability functions does not exceed 100
        for func in list(
                self.dh.get_functions(source_db, where_suffix=where_suffix)):
            # limit vul function number
            source_asts.append(func)
            elf_names.add("'" + func[0][2].split('.')[0] + "%'")
        elf_files = " or ".join(elf_names)
        # where_suffix = " where elf_file_name like %s" % elf_files
        # l.info("[DB] the firmware select filter is %s" % where_suffix)
        where_suffix = ""
        for func in self.dh.get_functions(target_db, start=start, end=end,
                                          where_suffix=where_suffix):
            target_asts.append(func)

        if ast:
            return self.ast_similarity(source_asts, target_asts,
                                       threshold=threshold)
        else:
            return self.ast_encode_similarity(source_asts, target_asts,
                                              threshold=threshold)
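# Hypothetical usage sketch (not from the original sources): the checkpoint
# path, model selector name and the two sqlite databases are placeholders.
asteria = Asteria("checkpoints/saved_model.pt", "treelstm", cuda=False)
scores = asteria.db_similarity("db/vul_functions.sqlite",
                               "db/firmware_functions.sqlite",
                               ast=True, threshold=0.8)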
import tensorflow as tf
import numpy as np
import sys

from datahelper import DataHelper

VOCAB_SIZE = 10000
EMBEDDING_SIZE = 1
LEARNING_RATE = 1e-3
MINI_BATCH_SIZE = 256
NORMALIZE_LAYER = 0

data_helper = DataHelper(_voc_size=VOCAB_SIZE)
data_helper.load_train_ins_and_process("data/train.50_51.ins")
data_helper.load_eval_ins("data/eval.52.ins")
print "data loaded"


def eval_auc(eval_res, eval_label):
    sorted_res = np.argsort(eval_res, axis=0)
    m = 0
    n = 0
    rank = 0
    for k in range(sorted_res.shape[0]):
        idx = sorted_res[k][0]
        if eval_label[idx][0] == 1:
            m += 1
            rank += k + 1
class GraphMes:
    def __init__(self, logging, graph=None, file=None, start=0, ints=False):
        self.logging = logging
        if not os.path.isdir(logging):
            os.mkdir(logging)
        if graph == None and file != None:
            self.helper = DataHelper(file, NP=False)
            self.samples = self.helper.GetSamples()
            if ints == True:
                samples = []
                for i in self.samples:
                    samples.append([int(i[0]), int(i[1]), int(i[2])])
                self.samples = np.array(samples)
            self.G = self.readGraph(file, ints=ints)
            self.uG = self.readGraph(file, ints=ints, unweight=True)
        elif graph != None and file == None:
            self.G = graph
            self.uG = nx.Graph(self.G)
            self.samples = []
            for edge in self.G.edges():
                for i in self.G[edge[0]][edge[1]]:
                    self.samples.append([
                        edge[0], self.G[edge[0]][edge[1]][i]['attr'], edge[1]
                    ])
        else:
            raise Exception
        self.start = start
        self.node2id, self.id2node = self._node2id()
        self.edge2id, self.id2edge = self._edge2id()

    def readGraph(self, sf, ints=False, unweight=False):
        self.SamplesCnt = len(self.samples)
        if unweight == True:
            G = nx.Graph()
            for sample in self.samples:
                G.add_edge(sample[0], sample[2])
        else:
            G = nx.MultiDiGraph()
            for sample in self.samples:
                G.add_edge(sample[0], sample[2], attr=sample[1])
        return G

    def graph2id(self, of):
        with open(of, 'w') as f:
            for h, r, t in self.samples:
                f.write(
                    str(self.node2id[h]) + ' ' + str(self.edge2id[r]) + ' ' +
                    str(self.node2id[t]) + '\n')

    def _node2id(self):
        node2id = dict()
        id2node = dict()
        index = 0
        for node in self.G.nodes():
            node2id.update({node: self.start + index})
            id2node.update({self.start + index: node})
            index += 1
        return node2id, id2node

    def _edge2id(self):
        edge2id = dict()
        id2edge = dict()
        self.attrs = set()
        for edge in self.G.edges():
            for i in self.G[edge[0]][edge[1]]:
                # print(self.G[edge[0]][edge[1]][i]['attr'])
                self.attrs.add(self.G[edge[0]][edge[1]][i]['attr'])
        index = 0
        for attr in self.attrs:
            edge2id.update({attr: self.start + index})
            id2edge.update({self.start + index: attr})
            index += 1
        return edge2id, id2edge

    def id2file(self, nodefn, edgefn):
        with open(nodefn, 'w') as nf:
            for i in range(len(self.node2id)):
                nf.write(self.id2node[i] + ' ' + str(i) + '\n')
        with open(edgefn, 'w') as ef:
            for i in range(len(self.edge2id)):
                ef.write(self.id2edge[i] + ' ' + str(i) + '\n')

    def zipf(self, plot=True):
        print('-------------')
        x, y = [], []
        degree = nx.degree_histogram(self.G)
        for i in range(len(degree)):
            if degree[i] != 0:
                y.append(degree[i] / float(sum(degree)))
                x.append(i)
        xdata = np.array(x)
        ydata = np.array(y)
        fita, fitb = optimize.curve_fit(powerLaw, xdata, ydata)
        print(fita, fitb)
        if plot == False:
            return fita, fitb
        else:
            # x = np.linspace(xdata.min(),xdata.max(),50)
            # y = fita[1]*powerNp(x,-fita[0])
            plt.figure()
            plt.title("Degree distribution curve fitting\n")
            matplotlib.rc('xtick', labelsize=30)
            matplotlib.rc('ytick', labelsize=30)
            plt.text(max(xdata) * 0.4, max(ydata) * 0.4,
                     'y=' + "{:.3f}".format(fita[1]) + '*x^-' +
                     "{:.3f}".format(fita[0]),
                     ha='center')
            plt.plot(xdata, ydata, '.')
            # plt.plot(xdata,ydata,label='data')
            plt.xlabel('k(rank order)')
            plt.ylabel('p(k)')
            plt.savefig(self.logging + '/zipf.png')
            plt.close(0)

            plt.figure()
            plt.title("Degree distribution curve fitting (log)\n")
            plt.text(max(xdata) * 0.4, max(ydata) * 0.4,
                     'y=' + "{:.3f}".format(fita[1]) + '*x^-' +
                     "{:.3f}".format(fita[0]),
                     ha='center')
            plt.xlabel('k(rank order)')
            plt.ylabel('p(k)')
            plt.loglog(xdata, ydata, '.')
            # plt.loglog(xdata,ydata,'g',label='data')
            plt.savefig(self.logging + '/zipf_log.png')
            return fita, fitb

    def zipf_coeffi(self, plot=True):
        # print(nx.average_clustering(graphmes.uG))
        degree = {}
        zipf_coeffi = {}
        for i in self.uG.nodes():
            if self.uG.degree(i) in degree:
                degree[self.uG.degree(i)].append(i)
            else:
                degree.update({self.uG.degree(i): [i]})
        for i in degree:
            zipf_coeffi.update({i: 0})
            for node in degree[i]:
                zipf_coeffi[i] += nx.clustering(self.uG, node)
            zipf_coeffi[i] /= len(degree[i])
        zipf_coeffi = np.array(texthelper.sortDict(zipf_coeffi, By="key"))
        if plot == False:
            return zipf_coeffi
        else:
            xdata = zipf_coeffi[:, 0]
            ydata = zipf_coeffi[:, 1]
            fita, fitb = optimize.curve_fit(powerLaw, xdata, ydata)
            plt.figure()
            plt.title("Degree-Clustering distribution curve fitting\n")
            plt.text(max(xdata) * 0.4, max(ydata) * 0.4,
                     'y=' + "{:.2f}".format(fita[1]) + '*x^-' +
                     "{:.2f}".format(fita[0]),
                     ha='center')
            plt.plot(xdata, ydata, '.')
            # plt.plot(xdata,ydata,'.', label='data')
            plt.xlabel('k')
            plt.ylabel('clustering')
            plt.savefig(self.logging + '/zipf_coeffi.png')
            plt.close(0)

            plt.figure()
            plt.text(max(xdata) * 0.4, max(ydata) * 0.4,
                     'y=' + "{:.2f}".format(fita[1]) + '*x^-' +
                     "{:.2f}".format(fita[0]),
                     ha='center')
            plt.title("Degree-Clustering distribution curve fitting (log)\n")
            plt.loglog(xdata, ydata, '.')
            # plt.loglog(xdata,ydata,'.', label='data')
            plt.xlabel('log(k)')
            plt.ylabel('log(clustering)')
            plt.savefig(self.logging + '/zipf_coeffi_log.png')
            plt.close(0)
            return zipf_coeffi

    def record(self, additional=True):
        with open(self.logging + '/info.txt', 'w') as f:
            f.write(" Number of nodes :" + str(len(self.nodes)) + '\n')
            f.write(" Number of edges :" + str(len(self.edges)) + '\n')
            f.write(" Number of samples :" + str(self.samplesCnt) + '\n')
            if additional:
                uG = nx.Graph(self.G)
                connectedCnt = nx.number_connected_components(uG)
                f.write(" number_connected_components :" +
                        str(connectedCnt) + '\n')
                if connectedCnt == 1:
                    f.write(" Diameter :" + str(nx.diameter(uG)) + '\n')
                    f.write(" Radius :" + str(nx.radius(uG)) + '\n')
                    f.write(" average_shortest_path_length :" +
                            str(nx.average_shortest_path_length(uG)) + '\n')
                f.write(" Density :" + str(nx.density(uG)) + '\n')
                f.write(" average_clustering :" +
                        str(nx.average_clustering(uG)) + '\n')
                f.write(" node_connectivity :" +
                        str(nx.node_connectivity(self.G)) + '\n')
                f.write(" global_efficiency :" +
                        str(nx.global_efficiency(uG)) + '\n')

    @property
    def nodes(self):
        return list(self.G.nodes)

    @property
    def nodeCnt(self):
        return len(self.G.nodes)

    @property
    def samplesCnt(self):
        return len(self.samples)

    @property
    def edges(self):
        return list(self.attrs)

    @property
    def edgeCnt(self):
        return len(self.attrs)
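# Hypothetical usage sketch (not from the original sources): the triple file
# and the logging directory are placeholders.
graphmes = GraphMes("logs/graph_stats", file="data/triples.txt", ints=False)
graphmes.record(additional=True)       # write basic graph statistics
fita, fitb = graphmes.zipf(plot=True)  # fit and plot the degree distribution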
if args.balanced_real_images:
    if args.split == 'train':
        image_prefix = "COCO_train2014_000000"
    else:
        image_prefix = "COCO_val2014_000000"
    image_postfix = ".jpg"
elif args.abstract_scene_images:
    if args.split == 'train':
        image_prefix = "abstract_v002_train2015_0000000"
    else:
        raise NotImplementedError()
    image_postfix = ".png"

helper = DataHelper(args.annot_file, args.ques_file)

# Write dataset to file
with open(args.output_file, "w") as output_file:
    for i in range(len(helper.dataset['annotations'])):
        imd_id = helper.dataset['annotations'][i]['image_id']
        img_name = image_prefix + pad_with_zero(imd_id, args) + image_postfix
        ques_id = helper.dataset['annotations'][i]['question_id']
        question = helper.qqa[ques_id]['question']
        # Convert to comma-separated token string
        question = ','.join(question.strip().split())
        answer = helper.dataset['annotations'][i]['multiple_choice_answer']
print W_out.get_shape()
print b_out.get_shape()
print out.get_shape()

# No changes to old network.py beyond this. Will be updating this soon.
cost = tf.reduce_mean(tf.squared_difference(out, y_))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
#optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
rmse = tf.sqrt(tf.reduce_mean(tf.squared_difference(out, y_)))

# Initialize
init = tf.initialize_all_variables()
dh = DataHelper(batch_size, test_idx=test_start)
saver = tf.train.Saver()

with tf.Session() as sess:
    sess.run(init)
    print sess.run(W_conv1), sess.run(b_conv1), sess.run(W_conv2), sess.run(
        b_conv2)

    test_data, test_labels = dh.get_test_data(test_size)

    epoch = 1
    train_start = time.time()
    while epoch <= epochs:
        epoch_start = time.time()
        print 'Training Epoch {}...'.format(epoch)

        # get data, test_idx = 19000 is ~83% train test split
        dh = DataHelper(batch_size, test_idx=test_start)

        # test data
def setUp(self):
    """ Read the data file and create model with training data """
    data = DataHelper().read_data()
    self.x_train, self.x_test, self.y_train, self.y_test = \
        DataHelper().split_data(data)
    self.model = self.mh.create_model(self.x_train, self.y_train)
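# Hypothetical companion test (not in the original source): it assumes this
# method lives in the same TestCase as setUp above and that the model created
# there exposes a scikit-learn style predict().
def test_model_predicts_for_every_test_row(self):
    """The trained model should return one prediction per test sample."""
    predictions = self.model.predict(self.x_test)
    self.assertEqual(len(predictions), len(self.x_test))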