def make_needed_dirs(config):
  """Creates the input/success/error/result/log directories if missing."""
  if not gfile.Exists(config.dir_in):
    gfile.MkDir(config.dir_in)
  if not gfile.Exists(config.dir_success):
    gfile.MkDir(config.dir_success)
  if not gfile.Exists(config.dir_error):
    gfile.MkDir(config.dir_error)
  if not gfile.Exists(config.dir_result):
    gfile.MkDir(config.dir_result)
  if not gfile.Exists(config.dir_log):
    gfile.MkDir(config.dir_log)

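# A more compact sketch of make_needed_dirs() above, iterating over the same
# config attributes; behavior is intended to match, and gfile is assumed to be
# tensorflow's gfile module as in the surrounding snippets.
def make_needed_dirs_compact(config):
  for d in (config.dir_in, config.dir_success, config.dir_error,
            config.dir_result, config.dir_log):
    if not gfile.Exists(d):
      gfile.MkDir(d)
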
def main(FLAGS):
  output_dir = FLAGS.output_dir
  if FLAGS.verbose:
    print('output_dir', output_dir)
    print('data_file', FLAGS.data_file)
    print('kb_file', FLAGS.kb_file)
    print('output_prefix', FLAGS.output_prefix)
  if not tf.io.gfile.isdir(output_dir):
    gfile.MkDir(output_dir)
  input_data_file = FLAGS.data_file
  input_kb_file = FLAGS.kb_file
  if not FLAGS.output_prefix.strip():
    FLAGS.output_prefix = ''
  output_data_pattern = output_dir + '/{0}data.json'
  output_kb_pattern = output_dir + '/{0}kb.json'
  # Load data and drop incorrect samples.
  raw_data, raw_kb = load_and_drop(
      input_data_file,
      input_kb_file,
      drop_incorrect=not FLAGS.keep_incorrect,
      verbose=FLAGS.verbose)
  write_infer_json(
      raw_data, raw_kb,
      output_data_pattern.format(FLAGS.output_prefix + '_infer_src_'),
      output_data_pattern.format(FLAGS.output_prefix + '_infer_tgt_'),
      output_kb_pattern.format(FLAGS.output_prefix + '_infer_'))

def main(argv):
  del argv  # Unused.
  if not gfile.Exists(FLAGS.save_dir):
    gfile.MkDir(FLAGS.save_dir)
  charting_filepath = os.path.join(FLAGS.save_dir,
                                   FLAGS.dataset + '_charts.pdf')
  sampling_methods = FLAGS.sampling_methods.split(',')
  scoring_methods = FLAGS.scoring_methods.split(',')
  files = gfile.Glob(
      os.path.join(FLAGS.source_dir, FLAGS.dataset + '*/results*.pkl'))
  files = [
      f for f in files
      if (get_sampling_method(FLAGS.dataset, f) in sampling_methods and
          get_scoring_method(f) in scoring_methods and
          get_normalize(f) == FLAGS.normalize and
          get_standardize(f) == FLAGS.standardize)
  ]
  print('Reading in %d files...' % len(files))
  all_results = combine_results(files)
  pdf = PdfPages(charting_filepath)
  print('Plotting charts...')
  plt.style.use('ggplot')
  for m in scoring_methods:
    plot_results(
        all_results,
        m,
        FLAGS.normalize,
        FLAGS.standardize,
        sampler_filter=sampling_methods)
    plt.title('Dataset: %s, Score Method: %s' % (FLAGS.dataset, m))
    pdf.savefig()
    plt.close()
  pdf.close()

def save_config_file(config_file, dest_dir):
  if not gfile.Exists(dest_dir):
    gfile.MkDir(dest_dir)
  return gfile.Copy(
      config_file,
      os.path.join(dest_dir, 'blueoil_config.yaml'))

def write_production():
  """Copies staged templates to the production directory.

  This function assumes that the template and associated metadata files are
  stored in a folder of the form gs://<template_staging_bucket>/<release_name>.
  It copies the templates from the <release_name> folder to two new locations:
  gs://<prod_bucket>/<release_name> and gs://<prod_bucket>/latest. Both folders
  contain identical contents; the <release_name> folder allows customers to pin
  to a specific release, and the `latest` folder gives the UI a location at
  which to point.

  Raises:
    GOSError if there was an error reading or writing a file.
  """
  prod_root = FLAGS.template_prod_bucket
  template_staging_root = FLAGS.template_staging_bucket
  template_dir = os.path.join(template_staging_root, FLAGS.candidate_name)
  if not gfile.IsDirectory(template_dir):
    logging.fatal(
        'Template staging directory %s does not exist or is not a '
        'directory.', template_dir)

  release_dir = os.path.join(prod_root, FLAGS.release_name)
  if gfile.IsDirectory(release_dir):
    logging.fatal(
        'Template release directory %s already exists. Aborting.', release_dir)

  logging.info('Copying folder from %s to %s.', template_dir, release_dir)
  gfile.MkDir(release_dir)
  CopyRecursively(template_dir, release_dir)

  # TODO: If we ever delete templates, they will stick around in
  # `latest`; evaluate something rsync-like in the future.
  latest_dir = os.path.join(prod_root, LATEST_FOLDER_NAME)
  if gfile.Exists(latest_dir):
    if not gfile.IsDirectory(latest_dir):
      gfile.Remove(latest_dir)
      gfile.MkDir(latest_dir)
  else:
    gfile.MkDir(latest_dir)
  logging.info('Copying folder from %s to %s.', template_dir, latest_dir)
  CopyRecursively(template_dir, latest_dir, overwrite=True)

def CopyRecursively(src, dst, overwrite=False):
  """Recursively copies the directory tree at src into dst using gfile."""
  entries = gfile.ListDirectory(src)
  for entry in entries:
    src_path = os.path.join(src, entry)
    dst_path = os.path.join(dst, entry)
    if gfile.IsDirectory(src_path):
      gfile.MkDir(dst_path)
      CopyRecursively(src_path, dst_path, overwrite)
    else:
      gfile.Copy(src_path, dst_path, overwrite)

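# For illustration only: invoking CopyRecursively() above on two hypothetical
# GCS paths (bucket and folder names are placeholders, not from the original
# code), mirroring how write_production() copies a release into `latest`.
CopyRecursively('gs://my-staging-bucket/release-1.2.3',
                'gs://my-prod-bucket/latest',
                overwrite=True)
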
def save_params_and_step(params, step, output_dir):
  """Save params and step in output dir."""
  if output_dir is not None:
    if not gfile.Exists(output_dir):
      log("Creating directory %s" % output_dir)
      gfile.MkDir(output_dir)
    params_file = os.path.join(output_dir, "model.pkl")
    with gfile.Open(params_file, "wb") as f:
      pickle.dump((params, step), f)
    log("Model saved to %s" % params_file, stdout=False)

def make_dir(d):
  """Make dir with tensorflow.gfile engine.

  Args:
    d: directory
  """
  if not gfile.Exists(d):
    try:
      gfile.MkDir(d)
    except:
      print(('WARNING: error creating save directory, '
             'directory most likely already created.'))

def save_config_file(config_file, dest_dir):
  if not gfile.Exists(dest_dir):
    gfile.MkDir(dest_dir)

  config_file_dest = os.path.join(dest_dir, 'blueoil_config.yaml')

  # HACK: This is a workaround for a tensorflow bug.
  # We can remove the following 2 lines once it's been resolved in tensorflow.
  # Issue link: https://github.com/tensorflow/tensorflow/issues/28508
  if gfile.Exists(config_file_dest):
    gfile.Remove(config_file_dest)

  return gfile.Copy(config_file, config_file_dest)

def get_mldata(dataset):
  # Use scikit to grab datasets and save them to save_dir.
  save_dir = FLAGS.save_dir
  filename = os.path.join(save_dir, dataset[1] + '.pkl')

  if not gfile.Exists(save_dir):
    # Note: MkDir cannot create nested directories; os.makedirs() (or
    # gfile.MakeDirs) could be used instead.
    gfile.MkDir(save_dir)
  if not gfile.Exists(filename):
    if dataset[0][-3:] == 'csv':
      data = get_csv_data(dataset[0])
    elif dataset[0] == 'breast_cancer':
      data = load_breast_cancer()
    elif dataset[0] == 'iris':
      data = load_iris()
    elif dataset[0] == 'newsgroup':
      # Removing header information to make sure that no newsgroup identifying
      # information is included in data
      data = fetch_20newsgroups_vectorized(subset='all', remove=('headers'))
      tfidf = TfidfTransformer(norm='l2')
      X = tfidf.fit_transform(data.data)
      data.data = X
    elif dataset[0] == 'rcv1':
      sklearn.datasets.rcv1.URL = (
          'http://www.ai.mit.edu/projects/jmlr/papers/'
          'volume5/lewis04a/a13-vector-files/lyrl2004_vectors')
      sklearn.datasets.rcv1.URL_topics = (
          'http://www.ai.mit.edu/projects/jmlr/papers/'
          'volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz')
      data = sklearn.datasets.fetch_rcv1(data_home='/tmp')
    elif dataset[0] == 'wikipedia_attack':
      data = get_wikipedia_talk_data()
    elif dataset[0] == 'cifar10':
      data = get_cifar10()
    elif 'keras' in dataset[0]:
      data = get_keras_data(dataset[0])
    else:
      try:
        data = fetch_mldata(dataset[0])
      except:
        raise Exception('ERROR: failed to fetch data from mldata.org')
    X = data.data
    y = data.target
    if X.shape[0] != y.shape[0]:
      # transpose() swaps the row and column indices, i.e. takes the matrix
      # transpose, so that samples are along the first axis.
      X = np.transpose(X)
    # assert raises an AssertionError when the condition is false.
    assert X.shape[0] == y.shape[0]
    data = {'data': X, 'target': y}
    pickle.dump(data, gfile.GFile(filename, 'w'))

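# As noted in the comment above, gfile.MkDir only creates a single directory
# level. A minimal sketch of the recursive alternative; the nested path below
# is hypothetical, and gfile is assumed to be tensorflow's gfile module
# imported as in the surrounding snippets.
nested_save_dir = '/tmp/example/save_dir/run_01'
if not gfile.Exists(nested_save_dir):
  gfile.MakeDirs(nested_save_dir)  # Creates missing parent directories too.
# Equivalent for local filesystems without tensorflow:
# os.makedirs(nested_save_dir, exist_ok=True)
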
def main(argv):
  del argv  # Unused.
  if not gfile.Exists(FLAGS.save_dir):
    gfile.MkDir(FLAGS.save_dir)
  print('------- Save dir is made\n')
  # charting_filepath = os.path.join(FLAGS.save_dir,
  #                                  FLAGS.dataset + '_charts.pdf')
  charting_filepath = 'heart_charts.pdf'
  print('---------- Charting filepath is {}\n'.format(charting_filepath))
  sampling_methods = FLAGS.sampling_methods.split(',')
  scoring_methods = FLAGS.scoring_methods.split(',')
  # files = gfile.Glob(
  #     os.path.join(FLAGS.source_dir,
  #                  FLAGS.dataset + '_' + FLAGS.sampling_methods + '*/results*.pkl'))
  files = [
      'tmp/toy_experiments/heart_margin/results_score_logistic_select_linear_svm_norm_False_stand_True_000.pkl'
  ]
  print('------------- Files in chart data script are {}\n'.format(files))
  files = [
      f for f in files
      if (get_sampling_method(FLAGS.dataset, f) in sampling_methods and
          get_scoring_method(f) in scoring_methods and
          get_normalize(f) == FLAGS.normalize and
          get_standardize(f) == FLAGS.standardize)
  ]
  print('Reading in %d files...' % len(files))
  all_results = combine_results(files)
  pdf = PdfPages(charting_filepath)
  print('---------- PDF file is made\n')
  print('Plotting charts...')
  plt.style.use('ggplot')
  for m in scoring_methods:
    plot_results(all_results, m, FLAGS.normalize, FLAGS.standardize,
                 sampler_filter=sampling_methods)
    plt.title('Dataset: %s, Score Method: %s' % (FLAGS.dataset, m))
    pdf.savefig()
    plt.close()
  print('---------- Finished plotting')
  pdf.close()

def handle_profile_api(profile_path, profile_cnt):
  """Handles profile API requests."""
  options = json.loads(flask.request.args.get('options'))

  # Determine view and output format.
  if options['view'] == 'pprof':
    output_format = 'pprof'
  elif options['view'] == 'graph':
    output_format = 'timeline'
  else:
    output_format = 'file'

  profile_dir = os.path.realpath(profile_path)
  resources_dir = os.path.join(profile_dir, 'resources')
  if not os.path.isdir(resources_dir):
    gfile.MkDir(resources_dir)

  if output_format == 'pprof':
    return produce_pprof_profile(profile_dir, resources_dir, profile_cnt,
                                 options)
  elif output_format == 'timeline':
    return produce_timeline_profile(profile_dir, resources_dir, profile_cnt,
                                    options)
  else:
    return produce_other_profile(profile_dir, resources_dir, profile_cnt,
                                 options)

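# For illustration only: a hypothetical request showing the JSON-encoded
# 'options' query parameter that handle_profile_api() decodes above. Only the
# 'view' key is read by the code; the endpoint path here is a placeholder.
import json
import urllib.parse

options = {'view': 'graph'}  # 'pprof' -> pprof, 'graph' -> timeline, else raw file
query = urllib.parse.urlencode({'options': json.dumps(options)})
print('/profile?' + query)  # e.g. /profile?options=%7B%22view%22%3A+%22graph%22%7D
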
def main(argv):
  del argv

  if not gfile.Exists(FLAGS.save_dir):
    try:
      gfile.MkDir(FLAGS.save_dir)
    except:
      print(('WARNING: error creating save directory, '
             'directory most likely already created.'))

  save_dir = os.path.join(FLAGS.save_dir,
                          FLAGS.dataset + '_' + FLAGS.sampling_method)

  if FLAGS.do_save == "True":
    if not gfile.Exists(save_dir):
      try:
        gfile.MkDir(save_dir)
      except:
        print(('WARNING: error creating save directory, '
               'directory most likely already created.'))
    # Set up logging.
    filename = os.path.join(
        save_dir, "log-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".txt")
    sys.stdout = utils.Logger(filename)

  X, y = utils.get_mldata(FLAGS.data_dir, FLAGS.dataset)  # Load dataset.
  starting_seed = FLAGS.seed
  all_results = {}

  for seed in range(starting_seed, starting_seed + FLAGS.trials):
    sampler = get_AL_sampler(FLAGS.sampling_method)  # Load sampler.
    score_model = utils.get_model(FLAGS.score_method, seed)  # Load score model.
    # Load select model.
    if (FLAGS.select_method == "None" or
        FLAGS.select_method == FLAGS.score_method):
      select_model = None
    else:
      select_model = utils.get_model(FLAGS.select_method, seed)
    results, sampler_state = generate_one_curve(
        X=X,
        y=y,
        sampler=sampler,
        score_model=score_model,
        seed=seed,
        warmstart_size=FLAGS.warmstart_size,
        batch_size=FLAGS.batch_size,
        select_model=select_model,
        max_points=FLAGS.max_dataset_size)
    key = (FLAGS.dataset, FLAGS.sampling_method, FLAGS.score_method,
           FLAGS.select_method, FLAGS.warmstart_size, FLAGS.batch_size, seed)
    # sampler_output = sampler_state.to_dict()
    # results['sampler_output'] = sampler_output
    results['sampler_output'] = None
    all_results[key] = results

  fields = [
      'dataset', 'sampling_methods', 'score_method', 'select_method',
      'warmstart size', 'batch size', 'seed'
  ]
  all_results['tuple_keys'] = fields

  if FLAGS.do_save == "True":
    filename = ("results_score_" + FLAGS.score_method +
                "_select_" + FLAGS.select_method)
    existing_files = gfile.Glob(os.path.join(save_dir, filename + "*.pkl"))
    filename = os.path.join(
        save_dir, filename + "_" + str(1000 + len(existing_files))[1:] + ".pkl")
    pickle.dump(all_results, gfile.GFile(filename, "w"))
    sys.stdout.flush_file()

def main(argv):
  del argv

  if not gfile.Exists(FLAGS.save_dir):
    try:
      gfile.MkDir(FLAGS.save_dir)
    except:
      print(('WARNING: error creating save directory, '
             'directory most likely already created.'))

  save_dir = os.path.join(
      FLAGS.save_dir, FLAGS.dataset + "_" + FLAGS.sampling_method)
  do_save = FLAGS.do_save == "True"

  if do_save:
    if not gfile.Exists(save_dir):
      try:
        gfile.MkDir(save_dir)
      except:
        print(('WARNING: error creating save directory, '
               'directory most likely already created.'))
    # Set up logging.
    filename = os.path.join(
        save_dir, "log-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".txt")
    sys.stdout = utils.Logger(filename)

  confusions = [float(t) for t in FLAGS.confusions.split(" ")]
  mixtures = [float(t) for t in FLAGS.active_sampling_percentage.split(" ")]
  all_results = {}
  max_dataset_size = None if FLAGS.max_dataset_size == "0" else int(
      FLAGS.max_dataset_size)
  normalize_data = FLAGS.normalize_data == "True"
  standardize_data = FLAGS.standardize_data == "True"
  X, y = utils.get_mldata(FLAGS.data_dir, FLAGS.dataset)
  starting_seed = FLAGS.seed

  for c in confusions:
    for m in mixtures:
      for seed in range(starting_seed, starting_seed + FLAGS.trials):
        sampler = get_AL_sampler(FLAGS.sampling_method)
        score_model = utils.get_model(FLAGS.score_method, seed)
        if (FLAGS.select_method == "None" or
            FLAGS.select_method == FLAGS.score_method):
          select_model = None
        else:
          select_model = utils.get_model(FLAGS.select_method, seed)
        results, sampler_state = generate_one_curve(
            X, y, sampler, score_model, seed, FLAGS.warmstart_size,
            FLAGS.batch_size, select_model, c, m, max_dataset_size,
            standardize_data, normalize_data, FLAGS.train_horizon)
        key = (FLAGS.dataset, FLAGS.sampling_method, FLAGS.score_method,
               FLAGS.select_method, m, FLAGS.warmstart_size, FLAGS.batch_size,
               c, standardize_data, normalize_data, seed)
        sampler_output = sampler_state.to_dict()
        results["sampler_output"] = sampler_output
        all_results[key] = results

  fields = [
      "dataset", "sampler", "score_method", "select_method",
      "active percentage", "warmstart size", "batch size", "confusion",
      "standardize", "normalize", "seed"
  ]
  all_results["tuple_keys"] = fields

  if do_save:
    filename = ("results_score_" + FLAGS.score_method +
                "_select_" + FLAGS.select_method +
                "_norm_" + str(normalize_data) +
                "_stand_" + str(standardize_data))
    existing_files = gfile.Glob(os.path.join(save_dir, filename + "*.pkl"))
    filename = os.path.join(
        save_dir,
        filename + "_" + str(1000 + len(existing_files))[1:] + ".pkl")
    pickle.dump(all_results, gfile.GFile(filename, "w"))
    sys.stdout.flush_file()

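# For reference: the filename suffix expression above, str(1000 + n)[1:],
# zero-pads the run index to three digits by prepending 1000 and dropping the
# leading '1'. A quick standalone check with illustrative values:
for n in (0, 7, 42):
  print(str(1000 + n)[1:])  # -> '000', '007', '042'
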
def main(FLAGS):
  all_jobs = process_job_type(FLAGS.job_type, FLAGS.input_type)
  output_dir = FLAGS.output_dir
  if FLAGS.verbose:
    print('all_jobs', all_jobs)
    print('input_type', FLAGS.input_type)
    print('output_dir', output_dir)
    print('data_file', FLAGS.data_file)
    print('kb_file', FLAGS.kb_file)
    print('output_prefix', FLAGS.output_prefix)
    print('skip_standardize', FLAGS.skip_standardize)
    print('keep_incorrect', FLAGS.keep_incorrect)
    print('word_cutoff', FLAGS.word_cutoff)
    print('gen_voc', FLAGS.gen_voc)
  if not tf.io.gfile.isdir(output_dir):
    gfile.MkDir(output_dir)

  input_data_file = FLAGS.data_file
  input_kb_file = FLAGS.kb_file
  if not FLAGS.output_prefix.strip():
    FLAGS.output_prefix = ''

  # output_vab = output_dir + '/{0}.vocab'.format(FLAGS.output_prefix)
  output_vab = output_dir + '/vocab.txt'
  output_all_vab = output_dir + '/{0}.full.vocab'.format(FLAGS.output_prefix)
  all_token_file = output_dir + '/{0}.special.vocab'.format(
      FLAGS.output_prefix)
  first_name_cats_file = output_dir + '/{0}.firstname.cat'.format(
      FLAGS.output_prefix)
  last_name_cats_file = output_dir + '/{0}.lastname.cat'.format(
      FLAGS.output_prefix)
  flight_cats_file = output_dir + '/{0}.flight.cat'.format(
      FLAGS.output_prefix)
  status_cats_file = output_dir + '/{0}.status.cat'.format(
      FLAGS.output_prefix)
  output_data_pattern = output_dir + '/{0}data'
  output_kb_pattern = output_dir + '/{0}kb'

  nltk_path = FLAGS.ntlk_data
  nltk.data.path.append(nltk_path)
  sent_tokenize = nltk.sent_tokenize

  vocal_map = {}
  # Load data and do standardization.
  if not FLAGS.skip_standardize:
    raw_data, raw_kb = standardize_and_drop(
        input_data_file,
        input_kb_file,
        drop_incorrect=not FLAGS.keep_incorrect,
        verbose=FLAGS.verbose)
  else:
    raw_data, raw_kb = load_and_drop(
        input_data_file,
        input_kb_file,
        drop_incorrect=not FLAGS.keep_incorrect,
        verbose=FLAGS.verbose)

  # The KB has to be there no matter what.
  if FLAGS.verbose:
    print('processing kb')
  processed_kb, vocal_map = process_kb(raw_kb, vocal_map)

  # If dialogue, everything will be there.
  # If context, only intents, actions, vocal_map will be there.
  if FLAGS.verbose:
    print('processing data')
  result = process_main_data(
      raw_data,
      sent_tokenize,
      word_tokenize,
      vocal_map,
      input_type=FLAGS.input_type)
  (intents, actions, expected_actions, dialogues, vocal_map, boundaries1,
   boundaries2, cats) = result

  frequency_cutoff = FLAGS.word_cutoff
  # 3 is the number of special tokens.
  if FLAGS.verbose:
    print('vocabulary before cutoff', len(vocal_map) + 3)
  if not FLAGS.gen_voc:
    # If we choose not to generate a vocabulary file, we set output_vab to None.
    output_vab = None
  if not FLAGS.gen_voc_map:
    output_all_vab = None
  vocal_map = write_vocabulary(output_vab, output_all_vab, vocal_map,
                               frequency_cutoff, FLAGS.keep_non_ascii)

  if FLAGS.gen_cat:
    if FLAGS.verbose:
      print('writing category')
    cat_files = [
        first_name_cats_file, last_name_cats_file, flight_cats_file,
        status_cats_file
    ]
    write_cat(cat_files, cats)

  if FLAGS.verbose:
    print('frequency_cutoff= {0}, vocabulary after cutoff'.format(
        frequency_cutoff), len(vocal_map))

  data = reorganize_data(intents, actions, expected_actions, dialogues,
                         processed_kb, boundaries1, boundaries2)

  if 'train' in all_jobs:
    if FLAGS.verbose:
      print('writing train data')
    write_data(data, output_data_pattern.format(FLAGS.output_prefix + '.'),
               output_kb_pattern.format(FLAGS.output_prefix + '.'))
  if 'eval' in all_jobs:
    if FLAGS.verbose:
      print('writing eval data')
    write_data(data,
               output_data_pattern.format(FLAGS.output_prefix + '.eval.'),
               output_kb_pattern.format(FLAGS.output_prefix + '.eval.'))
  if 'infer' in all_jobs:
    if FLAGS.verbose:
      print('writing infer data')
    write_completion(
        data,
        output_data_pattern.format(FLAGS.output_prefix + '.infer.src.'),
        output_data_pattern.format(FLAGS.output_prefix + '.infer.tar.'),
        output_kb_pattern.format(FLAGS.output_prefix + '.infer.'))
  if 'sp-train' in all_jobs:
    if FLAGS.verbose:
      print('writing self play training data')
    write_self_play(
        data,
        output_data_pattern.format(FLAGS.output_prefix + '.selfplay.'),
        output_kb_pattern.format(FLAGS.output_prefix + '.selfplay.'))
  if 'sp-eval' in all_jobs:
    if FLAGS.verbose:
      print('writing self play eval data')
    write_self_play(
        data,
        output_data_pattern.format(FLAGS.output_prefix + '.selfplay.eval.'),
        output_kb_pattern.format(FLAGS.output_prefix + '.selfplay.eval.'))

  if FLAGS.gen_special_token:
    # Write all special tokens to a file.
    f_tokens = gfile.Open(all_token_file, 'w')
    for token in list(list_of_action_tokens_except_name):
      f_tokens.write(token + '\n')
    f_tokens.close()