def run(self, corner, subvol_size):
  """Runs FFN inference over a subvolume.

  Args:
    corner: start of the subvolume (z, y, x)
    subvol_size: size of the subvolume (z, y, x)

  Returns:
    Canvas object with the segmentation or None if the canvas could not
    be created or the segmentation subvolume already exists.
  """
  self.counters.reset()

  seg_path = storage.segmentation_path(
      self.request.segmentation_output_dir, corner)
  prob_path = storage.object_prob_path(
      self.request.segmentation_output_dir, corner)
  cpoint_path = storage.checkpoint_path(
      self.request.segmentation_output_dir, corner)

  if gfile.Exists(seg_path):
    if pyborgletinfo.RunningUnderBorglet():
      pywrapborgletlib.BorgletLib.SetStatusMsg(
          'Segmentation already complete; exiting.')
    return None

  canvas, alignment = self.make_canvas(corner, subvol_size)
  if canvas is None:
    return None

  if gfile.Exists(cpoint_path):
    canvas.restore_checkpoint(cpoint_path)

  if self.request.alignment_options.save_raw:
    image_path = storage.subvolume_path(
        self.request.segmentation_output_dir, corner, 'align')
    with storage.atomic_file(image_path) as fd:
      np.savez_compressed(fd, im=canvas.image)

  canvas.segment_all(
      seed_policy=self.get_seed_policy(corner, subvol_size))
  self.save_segmentation(canvas, alignment, seg_path, prob_path)

  # Attempt to remove the checkpoint file now that we no longer need it.
  try:
    gfile.Remove(cpoint_path)
  except:  # pylint: disable=bare-except
    pass

  return canvas
def threshold_segmentation(segmentation_dir, corner, labels, threshold):
  prob_path = object_prob_path(segmentation_dir, corner)
  if not gfile.Exists(prob_path):
    prob_path = legacy_object_prob_path(segmentation_dir, corner)
    if not gfile.Exists(prob_path):
      raise ValueError('Cannot find probability map %s' % prob_path)

  with gfile.Open(prob_path, 'rb') as f:
    data = np.load(f)
    if 'qprob' not in data:
      raise ValueError('Invalid FFN probability map.')

    prob = dequantize_probability(data['qprob'])
    labels[prob < threshold] = 0
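# A minimal usage sketch for threshold_segmentation above; the segmentation
# directory, corner, and the pre-loaded `labels` volume are hypothetical.
corner = (0, 0, 0)
labels = np.load('/tmp/example_segmentation.npy')  # label volume, same shape as the prob map
threshold_segmentation('/tmp/ffn_segmentation', corner, labels, threshold=0.6)
# Voxels whose object probability is below 0.6 are now zeroed out in `labels`.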
def get_mldata(dataset):
  # Use scikit to grab datasets and save them to save_dir.
  save_dir = FLAGS.save_dir
  filename = os.path.join(save_dir, dataset[1] + '.pkl')

  if not gfile.Exists(save_dir):
    gfile.MkDir(save_dir)
  if not gfile.Exists(filename):
    if dataset[0][-3:] == 'csv':
      data = get_csv_data(dataset[0])
    elif dataset[0] == 'breast_cancer':
      data = load_breast_cancer()
    elif dataset[0] == 'iris':
      data = load_iris()
    elif dataset[0] == 'newsgroup':
      # Remove header information to make sure that no newsgroup-identifying
      # information is included in the data.
      data = fetch_20newsgroups_vectorized(subset='all', remove=('headers',))
      tfidf = TfidfTransformer(norm='l2')
      X = tfidf.fit_transform(data.data)
      data.data = X
    elif dataset[0] == 'rcv1':
      sklearn.datasets.rcv1.URL = (
          'http://www.ai.mit.edu/projects/jmlr/papers/'
          'volume5/lewis04a/a13-vector-files/lyrl2004_vectors')
      sklearn.datasets.rcv1.URL_topics = (
          'http://www.ai.mit.edu/projects/jmlr/papers/'
          'volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz')
      data = sklearn.datasets.fetch_rcv1(data_home='/tmp')
    elif dataset[0] == 'wikipedia_attack':
      data = get_wikipedia_talk_data()
    elif dataset[0] == 'cifar10':
      data = get_cifar10()
    elif 'keras' in dataset[0]:
      data = get_keras_data(dataset[0])
    else:
      try:
        data = fetch_mldata(dataset[0])
      except:
        raise Exception('ERROR: failed to fetch data from mldata.org')

    X = data.data
    y = data.target
    if X.shape[0] != y.shape[0]:
      X = np.transpose(X)
    assert X.shape[0] == y.shape[0]

    data = {'data': X, 'target': y}
    pickle.dump(data, gfile.GFile(filename, 'w'))
def train(working_dir):
  model_num, model_name = fsdb.get_latest_model()

  games = gfile.Glob(os.path.join(fsdb.selfplay_dir(), model_name, '*.zz'))
  if len(games) < MIN_GAMES_PER_GENERATION:
    print("{} doesn't have enough games to train a new model yet ({})".format(
        model_name, len(games)))
    print("Sleeping...")
    time.sleep(10 * 60)
    print("Done...")
    sys.exit(1)

  print("Training on gathered game data, initializing from {}".format(
      model_name))
  new_model_num = model_num + 1
  new_model_name = shipname.generate(new_model_num)
  print("New model will be {}".format(new_model_name))
  training_file = os.path.join(
      fsdb.golden_chunk_dir(), str(new_model_num) + '.tfrecord.zz')
  while not gfile.Exists(training_file):
    print("Waiting for", training_file)
    time.sleep(1 * 60)
  print("Using Golden File:", training_file)

  save_file = os.path.join(fsdb.models_dir(), new_model_name)
  try:
    main.train(working_dir, [training_file], save_file,
               generation_num=model_num + 1)
  except:
    logging.exception("Train error")
def preprocess(source, chunksize):
  reader = read_csv(source, header=0, chunksize=chunksize)
  if gfile.Exists(prep_data_path):
    gfile.Remove(prep_data_path)

  for data in reader:
    data = data.fillna(0)
    data.replace(('yes', 'no'), (1, 0), inplace=True)

    product_type, sub_area, ecology = (
        data['product_type'].values,
        data['sub_area'].values,
        data['ecology'].values)

    data['product_type'] = np.reshape([
        one_hot(x, n=np.unique(product_type).shape[0] + 1, filters='')
        for x in product_type
    ], product_type.shape)

    sub_area = np.array([
        s.replace(' ', '').replace('-', '').replace('\'', '').replace(',', '')
        for s in sub_area
    ])
    data['sub_area'] = np.reshape(
        [one_hot(x, n=np.unique(sub_area).shape[0] + 1) for x in sub_area],
        sub_area.shape)

    ecology = np.array([
        s.replace(' ', '').replace('-', '').replace('\'', '').replace(',', '')
        for s in ecology
    ])
    data['ecology'] = np.reshape(
        [one_hot(x, n=np.unique(ecology).shape[0] + 1) for x in ecology],
        ecology.shape)

    if not isfile(prep_data_path):
      data.to_csv(prep_data_path)
    else:
      data.to_csv(prep_data_path, mode='a', header=False)
def inference(reader, checkpoint_file, train_dir, data_pattern,
              out_file_location, batch_size, top_k):
  with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess, \
       gfile.Open(out_file_location, "w+") as out_file:
    video_id_batch, video_batch, num_frames_batch = get_input_data_tensors(
        reader, data_pattern, batch_size)

    if checkpoint_file:
      if not gfile.Exists(checkpoint_file + ".meta"):
        logging.fatal("Unable to find checkpoint file at provided location '%s'"
                      % checkpoint_file)
      latest_checkpoint = checkpoint_file
    else:
      latest_checkpoint = tf.train.latest_checkpoint(train_dir)

    if latest_checkpoint is None:
      raise Exception("unable to find a checkpoint at location: %s" % train_dir)
    else:
      meta_graph_location = latest_checkpoint + ".meta"
      logging.info("loading meta-graph: " + meta_graph_location)
    saver = tf.train.import_meta_graph(meta_graph_location, clear_devices=True)
    logging.info("restoring variables from " + latest_checkpoint)
    saver.restore(sess, latest_checkpoint)
    input_tensor = tf.get_collection("input_batch_raw")[0]
    num_frames_tensor = tf.get_collection("num_frames")[0]
    predictions_tensor = tf.get_collection("predictions")[0]

    # Workaround for num_epochs issue.
    def set_up_init_ops(variables):
      init_op_list = []
      for variable in list(variables):
        if "train_input" in variable.name:
          init_op_list.append(tf.assign(variable, 1))
          variables.remove(variable)
      init_op_list.append(tf.variables_initializer(variables))
      return init_op_list

    sess.run(set_up_init_ops(tf.get_collection_ref(
        tf.GraphKeys.LOCAL_VARIABLES)))

    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    num_examples_processed = 0
    start_time = time.time()
    out_file.write("VideoId,LabelConfidencePairs\n")

    try:
      while not coord.should_stop():
        video_id_batch_val, video_batch_val, num_frames_batch_val = sess.run(
            [video_id_batch, video_batch, num_frames_batch])
        predictions_val, = sess.run(
            [predictions_tensor],
            feed_dict={input_tensor: video_batch_val,
                       num_frames_tensor: num_frames_batch_val})
        now = time.time()
        num_examples_processed += len(video_batch_val)
        num_classes = predictions_val.shape[1]
        logging.info("num examples processed: " + str(num_examples_processed) +
                     " elapsed seconds: " + "{0:.2f}".format(now - start_time))
        for line in format_lines(video_id_batch_val, predictions_val, top_k):
          out_file.write(line)
        out_file.flush()

    except tf.errors.OutOfRangeError:
      logging.info('Done with inference. The output file was written to ' +
                   out_file_location)
    finally:
      coord.request_stop()
      coord.join(threads)

    sess.close()
def load_from_config_path(config_paths, default_model_configs=None):
  """ Loads configurations from files of yaml format.

  Args:
    config_paths: A string (each file name is separated by ",") or
      a list of strings (file names).
    default_model_configs: A dictionary of model configurations or None.

  Returns: A dictionary of model configurations, parsed from config files.
  """
  if isinstance(config_paths, six.string_types):
    config_paths = config_paths.strip().split(",")
  assert isinstance(config_paths, list) or isinstance(config_paths, tuple)
  model_configs = default_model_configs if default_model_configs else dict()
  for config_path in config_paths:
    config_path = config_path.strip()
    if not config_path:
      continue
    if not gfile.Exists(config_path):
      raise OSError("config file does not exist: {}".format(config_path))
    config_path = os.path.abspath(config_path)
    tf.logging.info("loading configurations from {}".format(config_path))
    with gfile.GFile(config_path, "r") as config_file:
      config_flags = yaml.load(config_file)
      model_configs = deep_merge_dict(model_configs, config_flags)
  return model_configs
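# A minimal usage sketch for load_from_config_path above; the YAML file paths
# and the default config dict are hypothetical.
defaults = {"batch_size": 32}
model_configs = load_from_config_path(
    "configs/model.yml,configs/train.yml", default_model_configs=defaults)
# Values from later files override earlier ones via deep_merge_dict.
print(model_configs["batch_size"])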
def __init__(self, filename, bpe_codes_file=None, reverse_seq=False):
  """ Initializes the object.

  Args:
    filename: Path to a vocabulary file containing one word per line.
      Each word is mapped to its line number (starting from 0).
    bpe_codes_file: Path to a BPE code file. If provided, do BPE before
      feature mapping.
    reverse_seq: Whether to reverse the sequence when encoding the words
      to ids.

  Raises:
    ValueError: if `filename` or `bpe_codes_file` does not exist.
  """
  self.vocab_dict, self.vocab_r_dict, _ = create_vocabulary_lookup_table_numpy(
      filename)
  self._sos_id = self.vocab_dict[Constants.SEQUENCE_START]
  self._eos_id = self.vocab_dict[Constants.SEQUENCE_END]
  self._unk_id = self.vocab_dict[Constants.UNKOWN]
  self._vocab_size = len(self.vocab_dict)
  self._reverse_seq = reverse_seq
  self._bpe = None
  if bpe_codes_file and not bpe_codes_file == "":
    if not gfile.Exists(bpe_codes_file):
      raise ValueError(
          "bpe_codes_file: {} does not exist".format(bpe_codes_file))
    self._bpe = BPE(bpe_codes_file, vocab=filename)
def _get_or_create_trial(self, uri, uri_info=None):
  """Create a new trial or get the previous one.

  The previous trials are recreated by looking at the content of the
  cached dir.

  Args:
    uri (str): Uri to create the trial for.
    uri_info (obj): Object containing additional info about the download
      or extraction (UrlInfo or ExtractInfo).

  Returns:
    trial (UriTrial): Result of the trial containing the destination,
      status, timestamp,...
  """
  # The generation is deterministic, so generating keys for the same uri will
  # always give the same result.
  trial_id = '{}_{}'.format(
      util.escape_uri(uri),
      util.hash_uri(uri),
  )
  log = util.build_log(prefix=trial_id)

  # Generate a new trial to eventually use
  trial = download_pb2.UriTrial(
      id=trial_id,
      status=download_pb2.UriTrial.IN_PROGRESS,
      output_path=os.path.join(self._cache_dir, trial_id),
  )
  add_uri_info(trial, uri, uri_info)

  if gfile.Exists(trial.output_path):
    # If the directory exists, the previous trial was complete (as it was
    # renamed successfully from ".incomplete")
    if self._mode == util.GenerateMode.FORCE_REDOWNLOAD:
      log('Cleanup previous trial: {}', trial.output_path)
      gfile.DeleteRecursively(trial.output_path)
    else:
      log('Reusing previously cached data...')
      # Try to reuse the previous download
      trial.status = download_pb2.UriTrial.COMPLETED

      # For the downloads, the output_path contains the file
      # TODO(epot): Should instead write the meta-data on disk (in a
      # ._trial.json) and replace ListDirectory() by a version which filters
      # the metadata file.
      is_dl = not any(uri.startswith(p) for p in ('local://', 'extract://'))
      is_gz = (
          uri.startswith('extract://') and
          uri.endswith('.gz') and
          not uri.endswith('tar.gz')
      )
      if is_dl or is_gz:
        trial.output_path = get_download_filepath(trial)
  else:
    log('No cached value found.')

  return trial
def get_meta_filename(self, start_new_model, train_dir):
  if start_new_model:
    logging.info("%s: Flag 'start_new_model' is set. Building a new model.",
                 task_as_string(self.task))
    return None

  if FLAGS.checkpoint_file == '':
    latest_checkpoint = tf.train.latest_checkpoint(train_dir)
  else:
    latest_checkpoint = os.path.join(FLAGS.train_dir, FLAGS.checkpoint_file)

  if not latest_checkpoint:
    logging.info("%s: No checkpoint file found. Building a new model.",
                 task_as_string(self.task))
    return None

  meta_filename = latest_checkpoint + ".meta"
  if not gfile.Exists(meta_filename):
    logging.info("%s: No meta graph file found. Building a new model.",
                 task_as_string(self.task))
    return None
  else:
    return meta_filename
def create_vocabulary_lookup_table(filename, default_value=None):
  """Creates a lookup table for a vocabulary file.

  Args:
    filename: Path to a vocabulary file containing one word per line.
      Each word is mapped to its line number.
    default_value: UNK tokens will be mapped to this id. If None,
      UNK tokens will be mapped to [vocab_size]

  Returns:
    A tuple (vocab_to_id_table, id_to_vocab_table, word_to_count_table,
    vocab_size). The vocab size does not include the UNK token.
  """
  if not gfile.Exists(filename):
    raise ValueError("File does not exist: {}".format(filename))

  # Load vocabulary into memory
  with gfile.GFile(filename) as file:
    vocab = list(line.strip("\n") for line in file)
  vocab_size = len(vocab)

  has_counts = len(vocab[0].split("\t")) == 2
  if has_counts:
    vocab, counts = zip(*[_.split("\t") for _ in vocab])
    counts = [float(_) for _ in counts]
    vocab = list(vocab)
  else:
    counts = [-1. for _ in vocab]

  # Add special vocabulary items
  special_vocab = get_special_vocab(vocab_size)
  vocab += list(special_vocab._fields)
  vocab_size += len(special_vocab)
  counts += [-1. for _ in list(special_vocab._fields)]

  if default_value is None:
    default_value = special_vocab.UNK

  tf.logging.info("Creating vocabulary lookup table of size %d", vocab_size)

  vocab_tensor = tf.constant(vocab)
  count_tensor = tf.constant(counts, dtype=tf.float32)
  vocab_idx_tensor = tf.range(vocab_size, dtype=tf.int64)

  # Create ID -> word mapping
  id_to_vocab_init = tf.contrib.lookup.KeyValueTensorInitializer(
      vocab_idx_tensor, vocab_tensor, tf.int64, tf.string)
  id_to_vocab_table = tf.contrib.lookup.HashTable(id_to_vocab_init, "UNK")

  # Create word -> id mapping
  vocab_to_id_init = tf.contrib.lookup.KeyValueTensorInitializer(
      vocab_tensor, vocab_idx_tensor, tf.string, tf.int64)
  vocab_to_id_table = tf.contrib.lookup.HashTable(vocab_to_id_init,
                                                  default_value)

  # Create word -> count mapping
  word_to_count_init = tf.contrib.lookup.KeyValueTensorInitializer(
      vocab_tensor, count_tensor, tf.string, tf.float32)
  word_to_count_table = tf.contrib.lookup.HashTable(word_to_count_init, -1)

  return vocab_to_id_table, id_to_vocab_table, word_to_count_table, vocab_size
def main(argv):
  del argv  # Unused.

  if not gfile.Exists(FLAGS.save_dir):
    gfile.MkDir(FLAGS.save_dir)

  charting_filepath = os.path.join(FLAGS.save_dir,
                                   FLAGS.dataset + '_charts.pdf')
  sampling_methods = FLAGS.sampling_methods.split(',')
  scoring_methods = FLAGS.scoring_methods.split(',')
  files = gfile.Glob(
      os.path.join(FLAGS.source_dir, FLAGS.dataset + '*/results*.pkl'))
  files = [
      f for f in files
      if (get_sampling_method(FLAGS.dataset, f) in sampling_methods and
          get_scoring_method(f) in scoring_methods and
          get_normalize(f) == FLAGS.normalize and
          get_standardize(f) == FLAGS.standardize)
  ]

  print('Reading in %d files...' % len(files))
  all_results = combine_results(files)
  pdf = PdfPages(charting_filepath)

  print('Plotting charts...')
  plt.style.use('ggplot')
  for m in scoring_methods:
    plot_results(
        all_results,
        m,
        FLAGS.normalize,
        FLAGS.standardize,
        sampler_filter=sampling_methods)
    plt.title('Dataset: %s, Score Method: %s' % (FLAGS.dataset, m))
    pdf.savefig()
    plt.close()
  pdf.close()
def train(load_dir=MODELS_DIR, save_dir=MODELS_DIR):
  model_num, model_name = get_latest_model()

  games = gfile.Glob(os.path.join(SELFPLAY_DIR, model_name, '*.zz'))
  if len(games) < MIN_GAMES_PER_GENERATION:
    print("{} doesn't have enough games to train a new model yet ({})".format(
        model_name, len(games)))
    print("Sleeping...")
    time.sleep(10 * 60)
    print("Done...")
    sys.exit(1)

  print("Training on gathered game data, initializing from {}".format(
      model_name))
  new_model_num = model_num + 1
  new_model_name = shipname.generate(new_model_num)
  print("New model will be {}".format(new_model_name))
  training_file = os.path.join(GOLDEN_CHUNK_DIR,
                               str(new_model_num) + '.tfrecord.zz')
  while not gfile.Exists(training_file):
    print("Waiting for", training_file)
    time.sleep(1 * 60)
  print("Using Golden File:", training_file)

  load_file = os.path.join(load_dir, model_name)
  save_file = os.path.join(save_dir, new_model_name)
  try:
    main.train(ESTIMATOR_WORKING_DIR, [training_file], save_file,
               generation_num=model_num + 1)
  except:
    logging.exception("Train error")
def get_meta_filename(self, start_new_model, train_dir):
  if start_new_model:
    logging.info("%s: Flag 'start_new_model' is set. Building a new model.",
                 task_as_string(self.task))
    return None

  latest_checkpoint = tf.train.latest_checkpoint(train_dir)
  # latest_checkpoint = "model.ckpt-137964"
  print("...............................", latest_checkpoint)
  if not latest_checkpoint:
    logging.info("%s: No checkpoint file found. Building a new model.",
                 task_as_string(self.task))
    return None

  meta_filename = latest_checkpoint + ".meta"
  print("metafile:...........................................", meta_filename)
  if not gfile.Exists(meta_filename):
    logging.info("%s: No meta graph file found. Building a new model.",
                 task_as_string(self.task))
    return None
  else:
    return meta_filename
def get_meta_filename(self, start_new_model, train_dir):
  task_str = task_as_string(self.task)
  if start_new_model:
    logging.info(
        "{}: Flag 'start_new_model' is set. Building a new model.".format(
            task_str))
    return None

  # Find the filename of the latest saved checkpoint file. Returns the FULL
  # path to the latest checkpoint, or None if no checkpoint was found.
  latest_checkpoint = tf.train.latest_checkpoint(train_dir)
  if not latest_checkpoint:
    logging.info(
        "{}: No checkpoint file found. Building a new model.".format(task_str))
    return None

  meta_filename = latest_checkpoint + ".meta"
  if not gfile.Exists(meta_filename):
    logging.info(
        "{}: No meta graph file found. Building a new model.".format(task_str))
    return None
  else:
    return meta_filename
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(tf.compat.as_bytes(line), vocab,
                                            tokenizer, normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
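# A minimal usage sketch for data_to_token_ids above; the file paths are
# hypothetical. Each line of train.txt is tokenized with the default
# basic_tokenizer and written out as space-separated token ids.
data_to_token_ids(
    data_path="/tmp/train.txt",
    target_path="/tmp/train.ids.txt",
    vocabulary_path="/tmp/vocab.txt")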
def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [tf.compat.as_bytes(line.strip()) for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  raise ValueError("Vocabulary file %s not found." % vocabulary_path)
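# A minimal usage sketch for initialize_vocabulary above, assuming a
# hypothetical /tmp/vocab.txt with one item per line ("dog", then "cat").
# Note the entries are bytes because of tf.compat.as_bytes.
vocab, rev_vocab = initialize_vocabulary("/tmp/vocab.txt")
# vocab     -> {b"dog": 0, b"cat": 1}
# rev_vocab -> [b"dog", b"cat"]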
def prover_pipeline(self, tasks: List[proof_assistant_pb2.ProverTask], root):
  """Make a prover pipeline for the given task and this round."""
  current_round = self.loop_meta.status.current_round
  prover_options = deephol_pb2.ProverOptions()
  prover_options.CopyFrom(self.config.prover_options)
  prover_options.action_generator_options.random_tactic_probability = (
      get_random_tactic_probability(self.config, current_round))
  checkpoint = self.checkpoint_monitor.get_checkpoint()
  assert checkpoint, 'Model checkpoint is not present.'
  # Update prover options to utilize the latest checkpoint present. We also
  # make sure to utilize the embedding store.
  prover_options.path_model_prefix = checkpoint
  prover_options.theorem_embeddings = checkpoint + '.npy'
  assert gfile.Exists(prover_options.theorem_embeddings), (
      'Missing embeddings file "%s".' % prover_options.theorem_embeddings)
  output_dir = self.loop_meta.make_proof_logs_dir(current_round)
  output_prefix = os.path.join(output_dir, 'logs')
  logging.info('Prover options:\n%s',
               text_format.MessageToString(prover_options))
  io_util.write_text_proto(
      str(os.path.join(output_dir, 'prover_options.pbtxt')), prover_options)
  return prover_runner.make_pipeline(tasks, prover_options,
                                     output_prefix)(root)
def _find_conf(conf, save_path):
  if conf.startswith('s3'):
    if not gfile.Exists(conf):
      conf = conf.strip('/')
      conf_name = conf.rsplit('/', 1)[1]
      save_path = os.path.join(save_path, conf_name) + '/'
      gfile.MakeDirs(save_path)
      check = os.system(
          'aws s3 cp {} {} --recursive'.format(conf + '/', save_path))
      conf = save_path
      if check:
        assert False, "can't find conf in: {}".format(conf)
    return conf
  conf_path = os.path.abspath(conf)
  if not gfile.Exists(conf_path):
    conf_path = os.path.join(_project_path, 'conf', conf)
  return conf_path
def dump_object(object_to_dump, output_path):
  if not gfile.Exists(output_path):
    gfile.MakeDirs(os.path.dirname(output_path))
  with gfile.Open(output_path, 'w') as wf:
    joblib.dump(object_to_dump, wf)
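# A minimal usage sketch for dump_object above; the estimator and output path
# are hypothetical. Any object joblib can serialize works here.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
dump_object(model, '/tmp/artifacts/model.joblib')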
def save_config_file(config_file, dest_dir):
  if not gfile.Exists(dest_dir):
    gfile.MakeDirs(dest_dir)

  config_file_dest = os.path.join(dest_dir, 'blueoil_config.yaml')

  # HACK: This is a workaround for a tensorflow bug.
  # We can remove the following 2 lines once it's been resolved in tensorflow.
  # issue link: https://github.com/tensorflow/tensorflow/issues/28508
  if gfile.Exists(config_file_dest):
    gfile.Remove(config_file_dest)

  return gfile.Copy(
      config_file,
      config_file_dest
  )
def manual_dir(self):
  """Returns the directory containing the manually extracted data."""
  if not gfile.Exists(self._manual_dir):
    raise AssertionError(
        'Manual directory {} does not exist. Create it and download/extract '
        'dataset artifacts in there.'.format(self._manual_dir))
  return self._manual_dir
def train(working_dir):
  model_num, model_name = fsdb.get_latest_model()

  print("Training on gathered game data, initializing from {}".format(
      model_name))
  new_model_num = model_num + 1
  new_model_name = shipname.generate(new_model_num)
  print("New model will be {}".format(new_model_name))
  training_file = os.path.join(
      fsdb.golden_chunk_dir(), str(new_model_num) + '.tfrecord.zz')
  while not gfile.Exists(training_file):
    print("Waiting for", training_file)
    time.sleep(1 * 60)
  print("Using Golden File:", training_file)

  try:
    save_file = os.path.join(fsdb.models_dir(), new_model_name)
    print("Training model")
    dual_net.train(training_file)
    print("Exporting model to ", save_file)
    dual_net.export_model(working_dir, save_file)
  except Exception as e:
    import traceback
    logging.error(traceback.format_exc())
    print(traceback.format_exc())
    logging.exception("Train error")
    sys.exit(1)
def _download(self, trial):
  """Downloads a single url given by the trial (thread safe).

  Args:
    trial (UriTrial): Object containing info about download.

  Raises:
    ValueError: If the destination dir is not empty
  """
  log = util.build_log(prefix=trial.id)

  # Check the download dir is empty
  if (gfile.Exists(trial.output_path) and
      gfile.ListDirectory(trial.output_path)):
    raise ValueError('Download dir {} should be empty'.format(
        trial.output_path))
  gfile.MakeDirs(trial.output_path)

  log('Start downloading...')
  self._backend.download(trial)

  # TODO(epot): Compute the checksum

  # Update the output path
  trial.output_path = get_download_filepath(trial)

  log('Download complete at {}', trial.output_path)
def main(unused_argv):
  logging.set_verbosity(tf.logging.INFO)
  print("tensorflow version: %s" % tf.__version__)
  is_chief = (FLAGS.task == 0)

  # Recover session
  saver = None
  latest_checkpoint = tf.train.latest_checkpoint(FLAGS.train_dir)
  if FLAGS.start_new_model:
    logging.info("'start_new_model' flag is set. Removing existing train dir.")
    try:
      gfile.DeleteRecursively(FLAGS.train_dir)
    except:
      logging.error(
          "Failed to delete directory " + FLAGS.train_dir +
          " when starting a new model. Please delete it manually and" +
          " try again.")
  elif not latest_checkpoint:
    logging.info("No checkpoint file found. Building a new model.")
  else:
    meta_filename = latest_checkpoint + ".meta"
    if not gfile.Exists(meta_filename):
      logging.info("No meta graph file found. Building a new model.")
    else:
      logging.info("Restoring from meta graph file %s", meta_filename)
      saver = tf.train.import_meta_graph(meta_filename)

  if not saver:
    # convert feature_names and feature_sizes to lists of values
    feature_names, feature_sizes = utils.GetListOfFeatureNamesAndSizes(
        FLAGS.feature_names, FLAGS.feature_sizes)

    if FLAGS.frame_features:
      reader = readers.YT8MFrameFeatureReader(
          feature_names=feature_names, feature_sizes=feature_sizes)
    else:
      reader = readers.YT8MAggregatedFeatureReader(
          feature_names=feature_names, feature_sizes=feature_sizes)

    model = find_class_by_name(FLAGS.model,
                               [frame_level_models, video_level_models])()
    label_loss_fn = find_class_by_name(FLAGS.label_loss, [losses])()
    optimizer_class = find_class_by_name(FLAGS.optimizer, [tf.train])

    build_graph(reader=reader,
                model=model,
                optimizer_class=optimizer_class,
                train_data_pattern=FLAGS.train_data_pattern,
                label_loss_fn=label_loss_fn,
                base_learning_rate=FLAGS.base_learning_rate,
                regularization_penalty=FLAGS.regularization_penalty,
                num_readers=FLAGS.num_readers,
                batch_size=FLAGS.batch_size)
    logging.info("built graph")
    saver = tf.train.Saver()

  train_loop(is_chief=is_chief,
             train_dir=FLAGS.train_dir,
             saver=saver,
             master=FLAGS.master)
def get_target_path(request, point_num):
  """Computes the output path for a specific point.

  Args:
    request: ResegmentationRequest proto
    point_num: index of the point of interest within the proto

  Returns:
    path to the output file where resegmentation results will be saved
  """
  # Prepare the output directory.
  output_dir = request.output_directory

  id_a = request.points[point_num].id_a
  id_b = request.points[point_num].id_b

  if request.subdir_digits > 1:
    m = hashlib.md5()
    m.update(str(id_a))
    m.update(str(id_b))
    output_dir = os.path.join(output_dir,
                              m.hexdigest()[:request.subdir_digits])
  gfile.MakeDirs(output_dir)

  # Terminate early if the output already exists.
  dp = request.points[point_num].point
  target_path = os.path.join(
      output_dir, '%d-%d_at_%d_%d_%d.npz' % (id_a, id_b, dp.x, dp.y, dp.z))
  if gfile.Exists(target_path):
    logging.info('Output already exists: %s', target_path)
    return

  return target_path
def _prepare(self):
  """ Prepares for evaluation.

  Builds the model with reuse=True, mode=EVAL and preprocesses
  data file(s).
  """
  text_inputter = TextLineInputter(
      dataset=self._dataset,
      data_field_name="eval_features_file",
      batch_size=self._batch_size)
  self._eval_feeding_data = text_inputter.make_feeding_data()
  self._model_configs = update_infer_params(  # update inference parameters
      self._model_configs,
      beam_size=self._beam_size,
      maximum_labels_length=self._maximum_labels_length,
      length_penalty=self._length_penalty)
  estimator_spec = model_fn(model_configs=self._model_configs,
                            mode=ModeKeys.INFER,
                            dataset=self._dataset,
                            name=self._model_name,
                            reuse=True,
                            verbose=False)
  self._predict_ops = estimator_spec.predictions
  tmp_trans_dir = os.path.join(self._model_configs["model_dir"],
                               GlobalNames.TMP_TRANS_DIRNAME)
  if not gfile.Exists(tmp_trans_dir):
    gfile.MakeDirs(tmp_trans_dir)
  self._tmp_trans_file_prefix = os.path.join(
      tmp_trans_dir, GlobalNames.TMP_TRANS_FILENAME_PREFIX)
  self._read_ckpt_bleulog()
  self._eval_labels_file = self._dataset.eval_labels_file
  self._check_bleu_script()
  self._estop_patience = 0
  self._best_bleu_score = 0.
def set_latest_checkpoint(dirname: str, chkpt: str):
  """Set the latest checkpoint in the checkpoint file.

  Args:
    dirname: Directory in which the checkpoint is located.
    chkpt: Checkpoint prefix.
  """
  chkpt_file = os.path.join(dirname, 'checkpoint')
  lines = []
  if gfile.Exists(chkpt_file):
    logging.info('Loading preexisting checkpoint file "%s"', chkpt_file)
    with gfile.Open(chkpt_file) as f:
      lines = [
          l.strip() for l in f.readlines()
          if l.startswith(b'all_model_checkpoint_paths:')
      ]
  else:
    logging.info('No preexisting checkpoint file "%s"', chkpt_file)

  with gfile.Open(chkpt_file, 'w') as f:
    lines = [
        '%s\n' % l.strip() for l in ([
            'model_checkpoint_path: "%s"' % chkpt,
            'all_model_checkpoint_paths: "%s"' % chkpt
        ] + lines)
    ]
    f.writelines(lines)
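# A minimal usage sketch for set_latest_checkpoint above; the directory and
# checkpoint prefix are hypothetical. After the call, /tmp/train_dir/checkpoint
# contains TensorFlow's text-format checkpoint state, starting with:
#   model_checkpoint_path: "model.ckpt-42"
#   all_model_checkpoint_paths: "model.ckpt-42"
# followed by any previously listed all_model_checkpoint_paths entries.
set_latest_checkpoint('/tmp/train_dir', 'model.ckpt-42')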
def create_vocabulary_lookup_table_numpy(filename):
  """Creates a lookup table from a vocabulary file.

  Args:
    filename: Path to a vocabulary file containing one word per line.
      Each word is mapped to its line number (starting from 0).

  Returns:
    A tuple `(word_to_id_mapping, id_to_word_mapping, special_fields)`
  """
  if not gfile.Exists(filename):
    raise ValueError("File does not exist: {}".format(filename))

  # Load vocabulary into memory
  with open_file(filename, encoding="utf-8") as file:
    vocab = list(line.strip("\n") for line in file)
  vocab_size = len(vocab)

  has_counts = len(vocab[0].split("\t")) == 2
  if has_counts:
    vocab, counts = zip(*[_.split("\t") for _ in vocab])
    counts = [float(_) for _ in counts]
    vocab = list(vocab)
  else:
    counts = [-1. for _ in vocab]

  # Add special vocabulary items
  special_vocab = get_special_vocab(vocab_size)
  vocab += list(special_vocab._fields)
  vocab_size += len(special_vocab)
  counts += [-1. for _ in list(special_vocab._fields)]

  return {v: k for k, v in enumerate(vocab)}, \
         {k: v for k, v in enumerate(vocab)}, \
         special_vocab._fields
def get_mldata(data_dir, name):
  """Loads data from data_dir.

  Looks for the file in data_dir.
  Assumes that data is in pickle format with dictionary fields data and target.

  Args:
    data_dir: directory to look in
    name: dataset name, assumes data is saved in the save_dir with filename
      <name>.pkl

  Returns:
    data and targets

  Raises:
    NameError: dataset not found in data folder.
  """
  dataname = name
  if dataname == "checkerboard":
    X, y = create_checker_unbalanced(split=[1. / 5, 4. / 5], n=10000,
                                     grid_size=4)
  else:
    filename = os.path.join(data_dir, dataname + ".pkl")
    if not gfile.Exists(filename):
      raise NameError("ERROR: dataset not available")
    data = pickle.load(gfile.GFile(filename, "r"))
    X = data["data"]
    y = data["target"]
    if "keras" in dataname:
      X = X / 255
      y = y.flatten()
  return X, y
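# A minimal usage sketch for get_mldata above; the data directory is
# hypothetical and is expected to contain <name>.pkl saved by the companion
# download code.
X, y = get_mldata('/tmp/data', 'iris')
print(X.shape, y.shape)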