def all_launcher(rts, logger):
    '''
    Run the entire data processing chain; this will take a couple of hours
    (at least) to complete.
    '''
    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'all', stopwatch, event='start')
    print 'Start of building %s %s dataset.' % (rts.language.name, rts.project)
    functions = ordered_dict.OrderedDict(((downloader_launcher, 'download'),
                                          (extract_launcher, 'extract'),
                                          #(sort_launcher, 'sort'),
                                          (store_launcher, 'store'),
                                          (transformer_launcher, 'transform')))
    for function, callname in functions.iteritems():
        if callname not in rts.ignore:
            print 'Launching %s' % function.func_name
            res = function(rts, logger)
            if res is False:
                sys.exit(False)
            elif res is None:
                pass
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'all', stopwatch, event='finish')
def __init__(self, process_type):
    if process_type == 'train':
        self.anno_path = cfg.YOLOv2.TRAIN_DATA
        self.batch_size = cfg.TRAIN.BATCH_SIZE
        self.is_training = True
        self.data_type = loader_cfg.TRAINING_LOADER_FLAGS
    else:
        self.anno_path = cfg.YOLOv2.TEST_DATA
        self.batch_size = cfg.EVAL.BATCH_SIZE
        self.is_training = False
    self.dataset_loader = simone_loader.SimoneDatasetLoader(
        loader_cfg.DATASET_DIR, loader_cfg.TRAINING_LOADER_FLAGS, True)
    self.num_samples = self.dataset_loader.get_total_num()
    self.num_batchs = int(np.ceil(self.num_samples / self.batch_size) - 2)
    self.batch_count = 0
    self.is_use_thread = cfg.YOLOv2.IS_USE_THREAD
    self.img_anchors = self.load_anchors(cfg.IMG.ANCHORS)
    self.loader_need_exit = 0
    self.timer = timer.Timer()
    self.per_step_ano = []
    if self.is_use_thread:
        self.prepr_data = []
        self.max_cache_size = 10
        self.loader_processing = threading.Thread(target=self.loader)
        self.loader_processing.start()
def __init__(self, preprocessor, dataset_type):
    if dataset_type == 'train':
        self.anno_path = cfg.CONTFUSE.TRAIN_DATA
        self.batch_size = cfg.TRAIN.BATCH_SIZE
        self.is_data_aug = cfg.TRAIN.IS_DATA_AUG
    elif dataset_type == 'val':
        self.anno_path = cfg.CONTFUSE.VAL_DATA
        self.batch_size = cfg.EVAL.BATCH_SIZE
        self.is_data_aug = False
    elif dataset_type == 'test':
        self.anno_path = cfg.CONTFUSE.TEST_DATA
        self.batch_size = cfg.EVAL.BATCH_SIZE
        self.is_data_aug = False
    self.img_anchors = loader.load_anchors(cfg.IMAGE.ANCHORS)
    self.bev_anchors = loader.load_anchors(cfg.BEV.ANCHORS)
    self.annotations = loader.load_annotations(self.anno_path)
    self.num_samples = len(self.annotations)
    self.num_batchs = int(np.ceil(self.num_samples / self.batch_size))
    self.batch_count = 0
    self.is_use_thread = cfg.CONTFUSE.IS_USE_THREAD
    self.cuda_preprocessor = preprocessor.preprocessor
    self.loader_need_exit = 0
    self.timer = timer.Timer()
    if self.is_use_thread:
        self.prepr_data = []
        self.max_cache_size = 10
        self.loader_processing = threading.Thread(target=self.loader)
        self.loader_processing.start()
def run():
    with timer.Timer("reading dataset"):
        dataset_train = util.read_float_dataset(train_filename)
        header_train = util.read_header(train_filename)
        dataset_test = util.read_float_dataset(test_filename)
        header_test = util.read_header(test_filename)

    with timer.Timer("reading trees"):
        tree_states = []
        for filename in sorted(glob.glob(tree_basename.replace('%d', '*[0-9]'))):
            tree = pygv.AGraph(filename)
            tree_state = TreeState(tree, filename)
            tree_states.append(tree_state)
            #tree.layout(prog='dot')
            #tree.draw(filename+".png")
        #num_trees = len(tree_states)

    with timer.Timer("extracting binary variables"):
        domain, pairs_dict = binarize(tree_states, header_test)
        for tree_state in tree_states:
            tree_state.domain = domain
            tree_state.pairs_dict = pairs_dict
        binarize_dataset(dataset_train, domain, pairs_dict, header_train,
                         binarized_train_filename)
        binarize_dataset(dataset_test, domain, pairs_dict, header_test,
                         binarized_test_filename)
        discretize_dataset(dataset_train, domain, pairs_dict, header_train,
                           discretized_train_filename)
        discretize_dataset(dataset_test, domain, pairs_dict, header_test,
                           discretized_test_filename)

    with timer.Timer("binarizing trees"):
        binarize_tree_states(tree_states, domain, pairs_dict,
                             binarized_tree_basename)

    with timer.Timer("writing constraints"):
        write_constraints(
            domain, pairs_dict,
            [constraint_filename_working, constraint_filename_output],
            constraint_sdd_filename, constraint_vtree_filename)

    print "\tdiscretization: "
    for k, v in pairs_dict.iteritems():
        print "\t", k, v
def diff_launcher(rts, logger):
    print 'Start creating diff dataset'
    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'diff', stopwatch, event='start')
    log.to_csv(logger, rts, 'Start', 'Diff', diff_launcher)
    differ.launcher(rts)
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'diff', stopwatch, event='finish')
    log.to_csv(logger, rts, 'Finish', 'Diff', diff_launcher)
def downloader_launcher(rts, logger):
    '''
    This launcher calls the dump downloader to download a Wikimedia dump
    file.
    '''
    print 'Start downloading'
    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'download', stopwatch, event='start')
    downloader.launcher(rts, logger)
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'download', stopwatch, event='finish')
def _run(self, cmd, soft_limit, *args, **kwargs):
    result = ExecutorResult(cmd)
    self._open_streams()

    kw = self.kwargs.copy()
    kw['stdin'] = self.stdin_fp
    kw['stdout'] = self.stdout_fp
    kw['stderr'] = self.stderr_fp
    kw.update(**kwargs)

    with timer.Timer() as t:
        # try to execute the command
        try:
            logger.debug('running cmd {}, {}, {}', cmd, args, kw)
            process = subprocess.Popen(cmd, *args, **kw)
        except FileNotFoundError as ex:
            duration = t.duration
            self.decrease_timepool(duration)
            result.message = 'File not found'
            self._close_streams()
            return result(status=ExecutorStatus.FILE_NOT_FOUND,
                          error=ex, duration=duration)

        # try to wait for the command to finish
        try:
            rc = process.wait(self._time_left)
        except subprocess.TimeoutExpired as ex:
            duration = t.duration
            self.decrease_timepool(duration)
            process.kill()
            result.message = 'Terminated: global timeout was reached'
            self._close_streams()
            return result(status=ExecutorStatus.GLOBAL_TIMEOUT,
                          error=ex, duration=duration)

    # decrease limit
    duration = t.duration
    self.decrease_timepool(duration)

    result.stdin = self.stdin_path
    result.stdout = self.stdout_path
    result.stderr = self.stderr_path

    # determine result
    if rc == 0:
        status = ExecutorStatus.OK
        if soft_limit and t.duration > soft_limit:
            status = ExecutorStatus.SOFT_TIMEOUT
    else:
        status = ExecutorStatus.ERROR_WHILE_RUNNING

    return result(status=status, returncode=rc, duration=duration)
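# --- Illustrative sketch (not part of the original source): both _run
# variants in this section assume a context-manager Timer whose `duration`
# can be read inside the `with` block (in the except handlers) as well as
# after it. A minimal compatible implementation could look like this; the
# real `timer.Timer` may well differ.
import time as _time

class _SketchTimer(object):
    def __enter__(self):
        self._start = _time.time()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        return False  # never swallow exceptions

    @property
    def duration(self):
        # seconds elapsed since __enter__; valid at any point after entry
        return _time.time() - self._start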
def transformer_launcher(rts, logger):
    '''
    This function derives a number of variables from the editors_raw
    collection; this will significantly improve processing speed.
    '''
    print 'Start transforming dataset'
    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'transform', stopwatch, event='start')
    log.to_csv(logger, rts, 'Start', 'Transform', transformer_launcher)
    #transformer.transform_editors_multi_launcher(rts)
    transformer.transform_editors_single_launcher(rts)
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'transform', stopwatch, event='finish')
    log.to_csv(logger, rts, 'Finish', 'Transform', transformer_launcher)
def store_launcher(rts, logger):
    '''
    Once the sort step has completed, the data is ready to be stored. This
    function starts storing data in MongoDB.
    '''
    print 'Start storing data in %s' % rts.storage
    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'store', stopwatch, event='start')
    log.to_csv(logger, rts, 'Start', 'Store', store_launcher)
    store.launcher(rts)
    store.launcher_articles(rts)
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'store', stopwatch, event='finish')
    log.to_csv(logger, rts, 'Finish', 'Store', store_launcher)
def __init__(self, n, lm_path='/path/to/project/resources/LMs/sample.lm'):
    #'/proj/fluke/resources/LMs/en.giga.noUN.5gram.lm.bin'
    """Initialize language models.
    """
    # Record the maximum size of ngrams in the stored LM
    self.n = n
    with timer.Timer():
        print "Initializing language models",
        self.lm = srilm.initLM(n)
        srilm.readLM(self.lm, lm_path)
        print
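# --- Illustrative usage (not part of the original source): assuming the
# SWIG srilm wrapper that provides initLM/readLM also exposes getNgramProb,
# a scoring call after construction might look like the following. Both
# the enclosing class name and getNgramProb are assumptions here.
#
#   lm_wrapper = LanguageModel(5)  # hypothetical class name
#   logprob = srilm.getNgramProb(lm_wrapper.lm, "the quick brown fox jumps", 5)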
def _run(self, cmd, soft_limit=0, *args, **kwargs):
    cp = self.kwargs.copy()
    cp.update(dict(
        stdin=self.stdin_fp,
        stdout=self.stdout_fp,
        stderr=self.stderr_fp,
    ))
    cp.update(kwargs)
    result = ExecutorResult(cmd)

    with timer.Timer() as t:
        # try to execute the command
        try:
            print(cmd, args, cp)
            process = sp.Popen(cmd, *args, **cp)
        except FileNotFoundError as ex:
            duration = t.duration
            self._time_left -= duration
            self.message = 'File not found'
            return result(status=ExecutorStatus.FILE_NOT_FOUND,
                          error=ex, duration=duration)

        # try to wait for the command to finish
        try:
            rc = process.wait(self._time_left)
        except sp.TimeoutExpired as ex:
            duration = t.duration
            self._time_left -= duration
            process.kill()
            self.message = 'Terminated: global timeout was reached'
            return result(status=ExecutorStatus.GLOBAL_TIMEOUT,
                          error=ex, duration=duration)

    # decrease limit
    duration = t.duration
    self._time_left -= duration

    # determine result
    if rc == 0:
        status = ExecutorStatus.OK
        if soft_limit and t.duration > soft_limit:
            status = ExecutorStatus.SOFT_TIMEOUT
    else:
        status = ExecutorStatus.ERROR_WHILE_RUNNING

    return result(status=status, returncode=rc, duration=duration)
def generate_chart_data(rts, func, **kwargs):
    '''
    This is the entry function to be called to generate data for creating
    charts.
    '''
    stopwatch = timer.Timer()
    plugin = retrieve_plugin(func)

    if not plugin:
        available_plugins = inventory.available_analyses()
        raise exceptions.UnknownPluginError(plugin, available_plugins)
    plugin = getattr(plugin, func)

    feedback(func, rts)

    tasks = JoinableQueue()
    result = JoinableQueue()

    mgr = Manager()
    lock = mgr.RLock()
    obs = dict()
    obs_proxy = mgr.dict(obs)

    db = storage.init_database(rts.storage, rts.dbname, rts.collection)
    editors = db.retrieve_distinct_keys('editor')
    #editors = editors[:500]

    if rts.collection.find('editors_dataset') > -1:
        min_year, max_year = determine_project_year_range(db, 'new_wikipedian')
        kwargs['min_year'] = min_year
        kwargs['max_year'] = max_year

    fmt = kwargs.pop('format', 'long')
    time_unit = kwargs.pop('time_unit', 'year')
    var = dataset.Variable('count', time_unit, lock, obs_proxy, **kwargs)

    try:
        print 'Determining whether plugin requires preloaded data...'
        preloader = getattr(plugin, 'preload')
        print 'Preloading data...'
        data = preloader(rts)
    except Exception, error:
        data = None
def extract_launcher(rts, logger):
    '''
    The extract launcher is used to extract the required variables from a
    dump file. If the file is a known archive type, it will first launch
    the unzip launcher.
    '''
    print 'Extracting data from XML'
    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'extract', stopwatch, event='start')
    log.to_csv(logger, rts, 'Start', 'Extract', extract_launcher)
    #remove output from previous run.
    file_utils.delete_file(rts.txt, None, directory=True)
    file_utils.create_directory(rts.txt)
    extracter.launcher(rts)
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'extract', stopwatch, event='finish')
    log.to_csv(logger, rts, 'Finish', 'Extract', extract_launcher)
def test(args, dataset, scaffold, logger):
    sess_cfg = tf.ConfigProto(allow_soft_placement=True,
                              gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=sess_cfg)
    fetches = scaffold["fetches"]
    metrics = scaffold["metrics"]

    # Load checkpoint if possible
    saver = scaffold["saver"]
    if args.ckpt and Path(args.ckpt + ".index").exists():
        checkpoint_path = args.ckpt
    else:
        latest_filename = "best_checkpoint" if args.load_best_ckpt else None
        ckpt = tf.train.get_checkpoint_state(args.model_dir, latest_filename)
        if ckpt and ckpt.model_checkpoint_path:
            checkpoint_path = ckpt.model_checkpoint_path
        else:
            raise ValueError("Missing checkpoint for restoring.")
    saver.restore(sess, checkpoint_path)
    logger.info("Restoring parameters from %s", checkpoint_path)

    test_loss_acc = tools.Accumulator()
    ti = timer.Timer()
    logger.info("Start testing ...")

    # Test
    sess.run([dataset["parent_iter"].initializer,
              tf.local_variables_initializer()])
    while True:
        try:
            ti.tic()
            total_loss_val, _ = sess.run(
                [fetches["total_loss"], metrics["acc_up"]])
            ti.toc()
            test_loss_acc.update(total_loss_val)
        except tf.errors.OutOfRangeError:
            break
    acc_val = sess.run(metrics["acc"])
    logger.info("Test loss: %.4f, Test acc: %.4f, %.2f step/s",
                test_loss_acc.avg, acc_val, ti.speed)
    sess.close()
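# --- Illustrative sketch (not part of the original source): the test and
# train loops in this section assume a tic/toc-style Timer with `calls`,
# `speed` and `reset()`. A minimal compatible version, for reference only:
import time as _time

class _SketchTicTocTimer(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.calls = 0      # completed tic/toc steps
        self._total = 0.0   # accumulated seconds
        self._tic = None

    def tic(self):
        self._tic = _time.time()

    def toc(self):
        self._total += _time.time() - self._tic
        self.calls += 1

    @property
    def speed(self):
        # steps per second over all completed steps
        return self.calls / self._total if self._total else 0.0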
def dataset_launcher(rts, logger):
    '''
    Dataset launcher is the entry point to generate datasets from the
    command line.
    '''
    print 'Start generating dataset'
    stopwatch = timer.Timer()
    log.to_db(rts, 'dataset', 'export', stopwatch, event='start')
    for plugin in rts.plugins:
        #cProfile.runctx('analyzer.generate_chart_data(rts, plugin, **rts.keywords)',
        #                globals(), locals(), filename="analyzer.cprof")
        analyzer.generate_chart_data(rts, plugin, **rts.keywords)
        log.to_csv(logger, rts, 'Start', 'Dataset', dataset_launcher,
                   plugin=plugin, dbname=rts.dbname,
                   collection=rts.editors_dataset)
    stopwatch.elapsed()
    log.to_db(rts, 'dataset', 'export', stopwatch, event='finish')
    log.to_csv(logger, rts, 'Finish', 'Dataset', dataset_launcher)
def annotate(sents, *annotator_names):
    """Annotate one or more sentences with the given annotators.
    """
    if len(annotator_names) == 0:
        print "WARNING: no annotator specified"
        return
    if len(annotator_names) == 1 and \
            (isinstance(annotator_names[0], tuple) or
             isinstance(annotator_names[0], list)):
        # Annotators may be provided in a list or tuple
        annotator_names = annotator_names[0]
    if not isinstance(sents, tuple) and not isinstance(sents, list):
        # One or more sentences may be provided
        sents = [sents]
    for annotator_name in annotator_names:
        annotator = load_annotator(annotator_name)
        with timer.Timer():
            annotator.run_on_corpus(sents)
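# --- Illustrative usage (not part of the original source; the annotator
# names are hypothetical and depend on what load_annotator recognizes):
#
#   annotate(sents, 'tokenizer', 'parser')    # several annotators
#   annotate(sents, ['tokenizer', 'parser'])  # or a single list/tuple
#   annotate(single_sent, 'tokenizer')        # a lone sentence is wrapped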
def __init__(self):
    self.initial_weight = cfg.EVAL.WEIGHT
    self.time = time.strftime('%Y-%m-%d-%H-%M-%S',
                              time.localtime(time.time()))
    self.moving_ave_decay = cfg.YOLOv2.MOVING_AVE_DECAY
    self.eval_logdir = "./data/logs/eval"
    self.evalset = dataset.Dataset('test')
    self.output_dir = cfg.EVAL.OUTPUT_PRED_PATH
    self.img_anchors = loader.load_anchors(cfg.IMG.ANCHORS)
    with tf.name_scope('model'):
        self.model = yolov2_network.YOLOv2Network()
        self.net = self.model.load()
        self.img_pred = self.net['img_pred']
    config = ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = InteractiveSession(config=config)
    self.saver = tf.train.Saver()  #ema_obj.variables_to_restore()
    self.saver.restore(self.sess, self.initial_weight)
    self.timer = timer.Timer()
def __init__(self, args):
    torch.manual_seed(args.seed)
    np.random.seed(args.seed)
    self.args = args
    self.device = hp.assign_device(args.device)
    self.run_id = str(round(time.time() % 1e7))
    print(f"Run id: {self.run_id}")
    self.log_dir, self.checkpoint_dir, self.samples_dir = hp.init_logs(
        args, self.run_id, self.log_dir_formatter(args))
    print(f"Process id: {str(os.getpid())} | hostname: {socket.gethostname()}")
    print(f"Run id: {self.run_id}")
    print(f"Time: {datetime.now()}")
    self.pp = pprint.PrettyPrinter(indent=4)
    self.pp.pprint(vars(args))
    print('==> Building model..')
    self.timer = timer.Timer()
    self.mode = args.mode
    self.build_model()
def __init__(self, sess, config):
    """
    Args:
        sess: TensorFlow session
        config: The configuration; see main.py for entries
    """
    if config.learning_rate_D < 0:
        config.learning_rate_D = config.learning_rate
    self.format = 'NCHW'
    self.timer = timer.Timer()
    self.dataset = config.dataset
    if config.architecture == 'dc128':
        config.output_size = 128
    elif config.architecture in ['dc64', 'dcgan64']:
        config.output_size = 64
    output_size = config.output_size
    self.sess = sess
    if config.real_batch_size == -1:
        config.real_batch_size = config.batch_size
    self.config = config
    self.is_grayscale = (config.c_dim == 1)
    self.batch_size = config.batch_size
    self.real_batch_size = config.real_batch_size
    self.sample_size = 64 if self.config.is_train else config.batch_size
    #self.sample_size = batch_size
    self.output_size = output_size
    self.data_dir = config.data_dir
    self.z_dim = self.config.z_dim
    self.gf_dim = config.gf_dim
    self.df_dim = config.df_dim
    self.dof_dim = self.config.dof_dim
    self.c_dim = config.c_dim
    self.input_dim = self.output_size * self.output_size * self.c_dim

    discriminator_desc = '_dc'
    if self.config.learning_rate_D == self.config.learning_rate:
        lr = 'lr%.8f' % self.config.learning_rate
    else:
        lr = 'lr%.8fG%fD' % (self.config.learning_rate,
                             self.config.learning_rate_D)
    arch = '%dx%d' % (self.config.gf_dim, self.config.df_dim)
    self.description = (
        "%s%s_%s%s_%sd%d-%d-%d_%s_%s_%s" %
        (self.dataset, arch, self.config.architecture, discriminator_desc,
         self.config.model + '-' + self.config.kernel, self.config.dsteps,
         self.config.start_dsteps, self.config.gsteps, self.batch_size,
         self.output_size, lr))
    if self.config.dof_dim > 1:
        self.description += '_dof{}'.format(self.config.dof_dim)
    if self.config.batch_norm:
        self.description += '_bn'
    self.max_to_keep = 5
    self._ensure_dirs()

    self.with_labels = config.with_labels
    if self.with_labels:
        self.num_classes = 1000

    stdout = sys.stdout
    if self.config.log:
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr
        self.log_file = open(os.path.join(self.sample_dir, 'log.txt'),
                             'w', buffering=1)
        print('Execution start time: %s' % time.ctime())
        print('Log file: %s' % self.log_file)
        stdout = self.log_file
        sys.stdout = self.log_file
        sys.stderr = self.log_file
    if config.compute_scores:
        self.scorer = scorer.Scorer(self.sess, self.dataset,
                                    config.MMD_lr_scheduler, stdout=stdout)
    print('Execution start time: %s' % time.ctime())
    pprint.PrettyPrinter().pprint(vars(self.config))
    #if self.config.multi_gpu:
    #    self.build_model_multi_gpu()
    #else:
    self.build_model()
    self.initialized_for_sampling = config.is_train
def load_treebank(self, treebank_path):
    """Load dependencies from a file containing Stanford-style dependency
    parses.
    """
    with timer.Timer():
        num_sents = 0
        with open(treebank_path) as f:
            started_sent = False
            for line in f:
                if line == '\n':
                    # Ignore line but note if a sentence was just completed
                    if started_sent:
                        num_sents += 1
                        sys.stdout.write("Loading treebank sentences: " +
                                         str(num_sents) + "\r")
                    started_sent = False
                    continue
                started_sent = True

                match = re.match(self.dep_re, line)
                if match is None:
                    print "ERROR: Unexpected Stanford dependency format"
                    print line
                    continue
                label, token0, t0, token1, t1 = match.groups()

                # In the Stanford typed dependency format, token0 is the
                # governor/head and token1 is the dependent
                direction = None
                if t0 > t1:
                    # Head follows dependent: left attachment
                    direction = -1
                elif t0 < t1:
                    # Head precedes dependent: right attachment
                    direction = 1
                else:
                    print "ERROR: Unexpected token indices"
                    print line
                    continue

                # Note counts of words
                token0 = token0.lower() if token0 != 'ROOT' else token0
                token1 = token1.lower()
                self.add_to_counter(self.word_counts, label,
                                    token0, token1, direction)

                # Note counts of stems
                stem0 = porter2.stem(token0)
                stem1 = porter2.stem(token1)
                self.add_to_counter(self.stem_counts, label,
                                    stem0, stem1, direction)

                # Note total number of unique labels, words and stems
                self.all_labels.add(label)
                self.all_words.update((token0, token1))
                self.all_stems.update((stem0, stem1))
    print
    self.num_labels = len(self.all_labels)
    self.num_words = len(self.all_words)
    self.num_stems = len(self.all_stems)
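# --- Note (not part of the original source): load_treebank expects
# Stanford typed-dependency lines of the form
#
#   nsubj(loves-2, Mary-1)
#
# where the five captured groups are the relation label, the governor
# token and its index, and the dependent token and its index; the exact
# pattern lives in self.dep_re, which is defined elsewhere in the class.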
def chunk_paragraphs(self, tokenizer, model_name, preprocess_step,
                     data_set_range):
    c_unknown = 0
    c_known = 0
    dis = 0
    timer1 = timer.Timer()
    for i, ex in tqdm(enumerate(self.examples[::])):
        total = min([len(self.examples[start_idx::]), number_of_part1])
        # if i > total:
        #     break
        if (i + 1) % 5000 == 0:
            self.save_tokenizer(tokenizer, data_set_range)
            self.save_coqa_dataset(data_set_range)  #chunked_examples
            print(timer1.remains(total, i))
        question_length = len(ex['annotated_question']['word'])
        if question_length > 350:
            continue
        doc_length_available = 512 - question_length - 3
        if model_name == 'RoBERTa':
            doc_length_available = doc_length_available - 3
        paragraph = self.paragraphs[ex['paragraph_id']]['annotated_context']['word']
        paragraph = preprocess(paragraph)
        if model_name != 'RoBERTa' and model_name != 'SpanBERT':
            paragraph = [p.lower() for p in paragraph]
        paragraph_length = len(paragraph)
        start_offset = 0
        doc_spans = []
        while start_offset < paragraph_length:
            length = paragraph_length - start_offset
            if length > doc_length_available:
                length = doc_length_available - 1
                doc_spans.append([start_offset, length, 1])
            else:
                doc_spans.append([start_offset, length, 0])
            if start_offset + length == paragraph_length:
                break
            start_offset += length
        for spans in doc_spans:
            segment_ids = []
            tokens = []
            if model_name == 'RoBERTa':
                tokens.append('<s>')
            for q in ex['annotated_question']['word']:
                segment_ids.append(0)
                if model_name == 'RoBERTa' or model_name == 'SpanBERT':
                    tokens.append(q)
                    tokenizer.add_tokens([q])
                else:
                    tokens.append(q.lower())
                    tokenizer.add_tokens([q.lower()])
                    # save_object([q.lower()], filename)
            if model_name == 'RoBERTa':
                tokens.extend(['</s>', '</s>'])
            else:
                tokens.append('[SEP]')
            segment_ids.append(0)
            tokenizer.add_tokens(paragraph[spans[0]:spans[0] + spans[1]])
            # save_object(paragraph[spans[0]:spans[0] + spans[1]], filename)
            tokens.extend(paragraph[spans[0]:spans[0] + spans[1]])
            segment_ids.extend([1] * spans[1])
            yes_index = len(tokens)
            tokens.append('yes')
            segment_ids.append(1)
            no_index = len(tokens)
            tokens.append('no')
            segment_ids.append(1)
            if spans[2] == 1:
                tokens.append('<unknown>')
                tokenizer.add_tokens(['<unknown>'])
                # save_object(['<unknown>'], filename)
                segment_ids.append(1)
            if model_name == 'RoBERTa':
                tokens.append('</s>')
            input_mask = [1] * len(tokens)
            input_ids = tokenizer.convert_tokens_to_ids(tokens)
            converted_to_string = tokenizer.convert_ids_to_tokens(input_ids)
            input_ids.extend([0] * (512 - len(tokens)))
            input_mask.extend([0] * (512 - len(tokens)))
            segment_ids.extend([0] * (512 - len(tokens)))
            start = ex['answer_span'][0]
            end = ex['answer_span'][1]
            if start >= spans[0] and end <= spans[1]:
                c_known += 1
                start = question_length + 1 + start
                end = question_length + 1 + end
            else:
                c_unknown += 1
                start = len(tokens) - 1
                end = len(tokens) - 1
            if ex['answer'] == 'yes' and tokens[start] != 'yes':
                start = yes_index
                end = yes_index
            if ex['answer'] == 'no' and tokens[start] != 'no':
                start = no_index
                end = no_index
            _example = {
                'tokens': tokens,
                'answer': tokens[start:end + 1],
                'actual_answer': ex['answer'],
                'input_tokens': input_ids,
                'input_mask': input_mask,
                'segment_ids': segment_ids,
                'start': start,
                'end': end,
                'turn_id': ex['turn_id'],
                'paragraph_id': self.paragraphs[ex['paragraph_id']]['id']
            }
            self.chunked_examples.append(_example)
            #save_object(_example, sname)
    print("Chunk paragraphs end.\ntokenizer number: {}".format(len(tokenizer)))
    # if preprocess_step == PREPROCESS_STEP.SPLIT_DATA_AND_SAVE:
    self.save_tokenizer(tokenizer, data_set_range)
    self.save_coqa_dataset(data_set_range)  #chunked_examples
def train(args, dataset, scaffold, logger):
    sess_cfg = tf.ConfigProto(allow_soft_placement=True,
                              gpu_options=tf.GPUOptions(allow_growth=True))
    sess = tf.Session(config=sess_cfg)
    fetches = scaffold["fetches"]
    optimizer = scaffold["optimizer"]
    saver = scaffold["saver"]
    writer = scaffold["writer"]
    metrics = scaffold["metrics"]
    summaries = scaffold["summaries"]
    require_val = not args.no_val
    logger.info(optimizer)

    # After create session
    sess.run(tf.global_variables_initializer())
    logger.info("Global variable initialized")
    local_var_init = tf.local_variables_initializer()
    sess.run(local_var_init)
    logger.info("Local variable initialized")

    # Load checkpoint if possible
    ckpt = tf.train.get_checkpoint_state(args.model_dir)
    if ckpt and ckpt.model_checkpoint_path:
        saver.restore(sess, ckpt.model_checkpoint_path)
        logger.info("Restoring parameters from %s", ckpt.model_checkpoint_path)

    tr_feed_dict = {K.backend.learning_phase(): 1}
    val_feed_dict = {K.backend.learning_phase(): 0}
    if require_val:
        # Get train/val string handler
        train_handler, val_handler = sess.run([
            dataset["train"]["iter"].string_handle(),
            dataset["val"]["iter"].string_handle()
        ])
        tr_feed_dict[dataset["handler"]] = train_handler
        val_feed_dict[dataset["handler"]] = val_handler

    finished_epoch = sess.run(optimizer.global_step) // dataset["train"]["steps"]
    total_epochs = args.epochs or args.total_epochs - finished_epoch
    log_loss_acc = tools.Accumulator()
    total_loss_acc = tools.Accumulator()
    regu_loss_acc = tools.Accumulator()
    val_loss_acc = tools.Accumulator()
    best_acc = 0.
    best_epoch = 0.
    ti = timer.Timer()
    logger.info("Start training ...")

    for i in range(total_epochs):
        # Train
        sess.run(dataset["train"]["iter"].initializer)
        logger.info("Epoch %d/%d - Learning rate: %.4g", i + 1, total_epochs,
                    sess.run(optimizer.lr))
        while True:
            try:
                ti.tic()
                if ti.calls == 0:
                    summary_val, fetches_val = sess.run([summaries, fetches],
                                                        tr_feed_dict)
                    # Here we refer global step to #epoch
                    writer.add_summary(summary_val, global_step=i)
                else:
                    fetches_val = sess.run(fetches, tr_feed_dict)
                ti.toc()
                total_loss_acc.update(fetches_val["total_loss"])
                log_loss_acc.update(fetches_val["total_loss"])
                regu_loss_acc.update(fetches_val["regu_loss"])
                if ti.calls % args.log_step == 0:
                    logger.info(
                        "Epoch %d/%d Step %d/%d - Train loss: %.4f - %.2f step/s",
                        i + 1, total_epochs, ti.calls,
                        dataset["train"]["steps"], log_loss_acc.pop(),
                        ti.speed)
            except tf.errors.OutOfRangeError:
                break

        # At epoch end
        val_summ = collections.OrderedDict()
        if require_val:
            sess.run(dataset["val"]["iter"].initializer)
            while True:
                try:
                    ti.tic()
                    total_loss_val, _ = sess.run(
                        [fetches["total_loss"], metrics["acc_up"]],
                        val_feed_dict)
                    ti.toc()
                    val_loss_acc.update(total_loss_val)
                except tf.errors.OutOfRangeError:
                    break
            acc_val = sess.run(metrics["acc"])
            if acc_val > best_acc:
                best_acc = acc_val
                best_epoch = i + 1
                if args.save_best_ckpt:
                    save_path = scaffold["best_saver"].save(
                        sess, args.model_dir + "/best_" + args.tag, i,
                        write_meta_graph=False,
                        latest_filename="best_checkpoint")
                    logger.info("Save (best) checkpoint to %s", save_path)
            val_summ["acc"] = acc_val
            val_summ["val_loss"] = val_loss_acc.pop()
            # Reset accuracy local variables 'count' and 'total'
            sess.run(local_var_init)
            logger.info(
                "Epoch %d/%d - Train loss: %.4f, Val loss: %.4f, "
                "Val acc: %.4f, %.2f step/s",
                i + 1, total_epochs, total_loss_acc.avg,
                val_summ["val_loss"], val_summ["acc"], ti.speed)
        else:
            logger.info("Epoch %d/%d - Train loss: %.4f, %.2f step/s",
                        i + 1, total_epochs, total_loss_acc.avg, ti.speed)

        summary_kits.summary_scalar(
            writer, i,
            ["train_loss", "regu_loss"] + list(val_summ.keys()),
            [total_loss_acc.pop(), regu_loss_acc.pop()] + list(val_summ.values()))
        save_path = saver.save(sess, args.model_dir + "/" + args.tag, i,
                               write_meta_graph=False)
        logger.info("Save checkpoint to %s", save_path)
        ti.reset()

    logger.info("Best val acc: %.4f in epoch %d.", best_acc, best_epoch)
    sess.close()
    writer.close()
def __init__(self):
    self.learn_rate_init = cfg.TRAIN.LEARN_RATE_INIT
    self.learn_rate_end = cfg.TRAIN.LEARN_RATE_END
    self.first_stage_epochs = cfg.TRAIN.FRIST_STAGE_EPOCHS
    self.second_stage_epochs = cfg.TRAIN.SECOND_STAGE_EPOCHS
    self.warmup_periods = cfg.TRAIN.WARMUP_EPOCHS
    self.initial_weight = cfg.TRAIN.PRETRAIN_WEIGHT
    self.time = time.strftime('%Y-%m-%d-%H-%M-%S',
                              time.localtime(time.time()))
    self.moving_ave_decay = cfg.YOLOv2.MOVING_AVE_DECAY
    self.train_logdir = "./data/log/train"
    self.trainset = dataset.Dataset('train')
    self.valset = dataset.Dataset('val')
    self.steps_per_period = len(self.trainset)
    config = ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = InteractiveSession(config=config)
    self.timer = timer.Timer()
    # self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    with tf.name_scope('model'):
        self.model = yolov2_network.YOLOv2Network()
        self.net = self.model.load()
        self.net_var = tf.global_variables()
        self.loss = self.net["yolov2_loss"]

    with tf.name_scope('learn_rate'):
        self.global_step = tf.Variable(1.0, dtype=tf.float64,
                                       trainable=False, name='global_step')
        warmup_steps = tf.constant(self.warmup_periods * self.steps_per_period,
                                   dtype=tf.float64, name='warmup_steps')
        train_steps = tf.constant(
            (self.first_stage_epochs + self.second_stage_epochs) *
            self.steps_per_period,
            dtype=tf.float64, name='train_steps')
        self.learn_rate = tf.cond(
            pred=self.global_step < warmup_steps,
            true_fn=lambda: self.global_step / warmup_steps *
            self.learn_rate_init,
            false_fn=lambda: self.learn_rate_end + 0.5 *
            (self.learn_rate_init - self.learn_rate_end) *
            (1 + tf.cos((self.global_step - warmup_steps) /
                        (train_steps - warmup_steps) * np.pi)))
        global_step_update = tf.assign_add(self.global_step, 1.0)

    with tf.name_scope("define_weight_decay"):
        moving_ave = tf.train.ExponentialMovingAverage(
            self.moving_ave_decay).apply(tf.trainable_variables())

    with tf.name_scope("define_first_stage_train"):
        self.first_stage_trainable_var_list = []
        for var in tf.trainable_variables():
            var_name = var.op.name
            var_name_mess = str(var_name).split('/')
            if var_name_mess[0] in ["yolov2_headnet"]:
                self.first_stage_trainable_var_list.append(var)
        first_stage_optimizer = tf.train.AdamOptimizer(
            self.learn_rate).minimize(
                self.loss, var_list=self.first_stage_trainable_var_list)
        with tf.control_dependencies(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            with tf.control_dependencies(
                    [first_stage_optimizer, global_step_update]):
                with tf.control_dependencies([moving_ave]):
                    self.train_op_with_frozen_variables = tf.no_op()

    with tf.name_scope("define_second_stage_train"):
        second_stage_trainable_var_list = tf.trainable_variables()
        second_stage_optimizer = tf.train.AdamOptimizer(
            self.learn_rate).minimize(
                self.loss, var_list=second_stage_trainable_var_list)
        with tf.control_dependencies(
                tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
            with tf.control_dependencies(
                    [second_stage_optimizer, global_step_update]):
                with tf.control_dependencies([moving_ave]):
                    self.train_op_with_all_variables = tf.no_op()

    with tf.name_scope('loader_and_saver'):
        self.loader = tf.train.Saver(self.net_var)
        self.saver = tf.train.Saver(tf.global_variables(), max_to_keep=10)

    with tf.name_scope('summary'):
        tf.summary.scalar("learn_rate", self.learn_rate)
        tf.summary.scalar("yolov2_loss", self.net["yolov2_loss"])
        tf.summary.scalar("img_obj_loss", self.net["img_obj_loss"])
        tf.summary.scalar("img_cls_loss", self.net["img_cls_loss"])
        tf.summary.scalar("img_bbox_loss", self.net["img_bbox_loss"])
        logdir = "../logs/tensorboard"
        if os.path.exists(logdir):
            shutil.rmtree(logdir)
        os.mkdir(logdir)
        self.write_op = tf.summary.merge_all()
        self.summary_writer = tf.summary.FileWriter(logdir,
                                                    graph=self.sess.graph)
        img_pred_dir = cfg.YOLOv2.LOG_DIR + "/pred/img_pred/"
        if os.path.exists(img_pred_dir):
            shutil.rmtree(img_pred_dir)
        os.mkdir(img_pred_dir)
def run(): with timer.Timer("reading dataset"): dataset = util.read_binary_dataset(test_filename) domain = util.read_header(test_filename) ''' if OPTIONS.majority_circuit_opt: l = len(domain) for k in xrange(num_trees): domain["Tree_%d" % k] = l+k ''' with timer.Timer("initializing manager"): # start sdd manager var_count = len(domain) - 1 vtree = sdd.sdd_vtree_new(var_count, "balanced") manager = sdd.sdd_manager_new(vtree) #sdd.sdd_manager_auto_gc_and_minimize_on(manager) #sdd.sdd_manager_auto_gc_and_minimize_off(manager) sdd_state = SddState(vtree, manager) with timer.Timer("reading constraints"): constraint_sdd, constraint_info = encode_logical_constraints( constraint_filename, manager, domain) sdd.sdd_ref(constraint_sdd, manager) with timer.Timer("reading trees"): tree_states = [] for filename in sorted(glob.glob(tree_basename.replace('%d', '*'))): tree = pygv.AGraph(filename) tree_state = TreeState(tree, domain, constraint_info) tree_states.append(tree_state) #tree.layout(prog='dot') #tree.draw(filename+".png") #num_trees = len(tree_states) with timer.Timer("compiling trees"): forest_sdds, _ = izip(*forest_sdds_iter(tree_states, sdd_state)) #forest_sdds = list(forest_sdds_iter(tree_states,sdd_state)) forest_sdds = [ (tree_state, tree_sdd) for tree_state, tree_sdd in zip(tree_states, forest_sdds) ] cmpf = lambda x, y: cmp(sdd.sdd_size(x[1]), sdd.sdd_size(y[1])) forest_sdds.sort(cmp=cmpf) tree_states = [tree_state for tree_state, tree_sdd in forest_sdds] #ACACAC sdd.sdd_manager_auto_gc_and_minimize_off(manager) sdd.sdd_manager_minimize_limited(manager) stats = SddSizeStats() for tree_state, tree_sdd in forest_sdds: stats.update(tree_sdd) sdd.sdd_deref(tree_sdd, manager) sdd.sdd_manager_garbage_collect(manager) forest_sdds, used_vars_list = izip( *forest_sdds_iter(tree_states, sdd_state)) print stats with timer.Timer("compiling all", prefix="| "): alpha = compile_all(forest_sdds, used_vars_list, num_trees, domain, manager, constraint_sdd) with timer.Timer("evaluating"): msg = util.evaluate_dataset_all_sdd(dataset, alpha, manager) print "| trees : %d" % num_trees print "--- evaluating majority vote on random forest (compiled):" print msg print "| all size :", sdd.sdd_size(alpha) print "| all count:", sdd.sdd_count(alpha) print " model count:", sdd.sdd_global_model_count(alpha, manager) with timer.Timer("checking monotonicity"): result = is_monotone(alpha, manager) print "Is monotone?", result #for tree_sdd in forest_sdds: sdd.sdd_deref(tree_sdd,manager) print "====================" print "before garbage collecting..." print "live size:", sdd.sdd_manager_live_count(manager) print "dead size:", sdd.sdd_manager_dead_count(manager) print "garbage collecting..." sdd.sdd_manager_garbage_collect(manager) print "live size:", sdd.sdd_manager_live_count(manager) print "dead size:", sdd.sdd_manager_dead_count(manager) vtree = sdd.sdd_manager_vtree(manager) print "Writing sdd file %s and vtree file %s" % (sdd_filename, vtree_filename) sdd.sdd_save(sdd_filename, alpha) sdd.sdd_vtree_save(vtree_filename, vtree) print "Writing constraint sdd file %s and constraint vtree file %s" % ( constraint_sdd_filename, constraint_vtree_filename) sdd.sdd_save(constraint_sdd_filename, constraint_sdd) sdd.sdd_vtree_save(constraint_vtree_filename, vtree)
def __init__(self, sess, config, batch_size=64, output_size=64,
             z_dim=100, c_dim=3, data_dir='./data'):
    """
    Args:
        sess: TensorFlow session
        batch_size: The size of batch. Should be specified before training.
        output_size: (optional) The resolution in pixels of the images. [64]
        z_dim: (optional) Dimension of dim for Z. [100]
        gf_dim: (optional) Dimension of gen filters in first conv layer. [64]
        df_dim: (optional) Dimension of discrim filters in first conv layer. [64]
        gfc_dim: (optional) Dimension of gen units for fully connected layer. [1024]
        dfc_dim: (optional) Dimension of discrim units for fully connected layer. [1024]
        c_dim: (optional) Dimension of image color. For grayscale input, set to 1. [3]
    """
    if config.learning_rate_D < 0:
        config.learning_rate_D = config.learning_rate
    self.timer = timer.Timer()
    self.dataset = config.dataset
    if config.architecture == 'dc128':
        output_size = 128
    if config.architecture in ['dc64', 'dcgan64']:
        output_size = 64
    self.sess = sess
    if config.real_batch_size == -1:
        config.real_batch_size = config.batch_size
    self.config = config
    self.is_grayscale = (c_dim == 1)
    self.batch_size = batch_size
    self.real_batch_size = config.real_batch_size
    self.sample_size = 64 if self.config.is_train else batch_size
    self.output_size = output_size
    self.data_dir = data_dir
    self.z_dim = z_dim
    self.gf_dim = config.gf_dim
    self.df_dim = config.df_dim
    self.dof_dim = self.config.dof_dim
    self.c_dim = c_dim

    discriminator_desc = '_dc'
    if self.config.learning_rate_D == self.config.learning_rate:
        lr = 'lr%.8f' % self.config.learning_rate
    else:
        lr = 'lr%.8fG%fD' % (self.config.learning_rate,
                             self.config.learning_rate_D)
    arch = '%dx%d' % (self.config.gf_dim, self.config.df_dim)
    self.description = (
        "%s%s_%s%s_%sd%d-%d-%d_%s_%s_%s" %
        (self.dataset, arch, self.config.architecture, discriminator_desc,
         self.config.kernel, self.config.dsteps, self.config.start_dsteps,
         self.config.gsteps, self.batch_size, self.output_size, lr))
    if self.config.batch_norm:
        self.description += '_bn'
    self._ensure_dirs()

    stdout = sys.stdout
    if self.config.log:
        self.old_stdout = sys.stdout
        self.old_stderr = sys.stderr
        self.log_file = open(os.path.join(self.sample_dir, 'log.txt'),
                             'w', buffering=1)
        print('Execution start time: %s' % time.ctime())
        print('Log file: %s' % self.log_file)
        stdout = self.log_file
        sys.stdout = self.log_file
        sys.stderr = self.log_file
    if config.compute_scores:
        self.scorer = scorer.Scorer(self.dataset, config.MMD_lr_scheduler,
                                    stdout=stdout)
    print('Execution start time: %s' % time.ctime())
    pprint.PrettyPrinter().pprint(self.config.__dict__['__flags'])
    self.build_model()
    self.initialized_for_sampling = config.is_train
def run(basename, train_filename, test_filename,
        num_trees=100, tree_depth=0, class_index=0):
    with timer.Timer("loading data"):
        training = read_dataset(train_filename, class_index=class_index)
        testing = read_dataset(test_filename, class_index=class_index)

    """
    print "====== naive Bayes ====="
    with timer.Timer("training"):
        nb = NaiveBayes()
        nb.buildClassifier(training)
    with timer.Timer("testing"):
        eval_training = evaluate_dataset(nb, training)
        eval_testing = evaluate_dataset(nb, testing)
    print "=== evaluation (training):"
    print eval_training.toSummaryString()
    print "=== evaluation (testing):"
    print eval_testing.toSummaryString()
    """

    print "====== random forest ====="
    with timer.Timer("training"):
        rf = RandomForest()
        #rf.setOptions([
        #    u'-P', u'100', u'-I', u'100', u'-num-slots', u'1', u'-K', u'0',
        #    u'-M', u'1.0', u'-V', u'0.001', u'-S', u'1',
        #    u'-num-decimal-places', u'6'
        #])
        rf.setNumIterations(num_trees)
        if tree_depth:
            rf.setMaxDepth(tree_depth)
        rf.buildClassifier(training)
    with timer.Timer("testing"):
        eval_training = evaluate_dataset(rf, training)
        eval_testing = evaluate_dataset(rf, testing)
    print "=== evaluation (training):"
    print eval_training.toSummaryString()
    print "=== evaluation (testing):"
    print eval_testing.toSummaryString()
    #print rf.getmembers()

    num_classifiers = len(rf.m_Classifiers)
    for i, tree in enumerate(rf.m_Classifiers):
        options_arr = tree.getOptions()
        options_arr_python = [x for x in options_arr]
        options_arr_python += [u'-num-decimal-places', u'6']
        tree.setOptions(options_arr_python)
        #print tree.toString()
        #binarize(tree)
        filename = basename % i
        with open(filename, "w") as f:
            f.writelines(tree.graph())

    correct, incorrect = 0, 0
    for instance in testing:
        pos, neg = 0, 0
        for tree in rf.m_Classifiers:
            #print tree.classifyInstance(instance)
            if tree.classifyInstance(instance) >= 0.5:
                pos += 1
            else:
                neg += 1
        my_label = 1.0 if pos >= neg else 0.0
        if my_label == instance.classValue():
            correct += 1
        else:
            incorrect += 1
    print "    trees : %d" % num_trees
    print "--- evaluating majority vote on random forest:"
    print "  correct : %d" % correct
    print "incorrect : %d" % incorrect