Example #1
def make_needed_dirs(config):
    if not gfile.Exists(config.dir_in):
        gfile.MkDir(config.dir_in)
    if not gfile.Exists(config.dir_success):
        gfile.MkDir(config.dir_success)
    if not gfile.Exists(config.dir_error):
        gfile.MkDir(config.dir_error)
    if not gfile.Exists(config.dir_result):
        gfile.MkDir(config.dir_result)
    if not gfile.Exists(config.dir_log):
        gfile.MkDir(config.dir_log)
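The helper above repeats the same Exists/MkDir check five times. A minimal sketch of an equivalent loop over the same config attributes (an illustrative refactor, not part of the original example):

def make_needed_dirs(config):
    # Sketch: iterate over the directory attributes instead of repeating
    # the Exists/MkDir check; behavior matches the example above.
    for d in (config.dir_in, config.dir_success, config.dir_error,
              config.dir_result, config.dir_log):
        if not gfile.Exists(d):
            gfile.MkDir(d)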
Example #2
def main(FLAGS):
    output_dir = FLAGS.output_dir
    if FLAGS.verbose:
        print('output_dir', output_dir)
        print('data_file', FLAGS.data_file)
        print('kb_file', FLAGS.kb_file)
        print('output_prefix', FLAGS.output_prefix)

    if not tf.io.gfile.isdir(output_dir):
        gfile.MkDir(output_dir)

    input_data_file = FLAGS.data_file
    input_kb_file = FLAGS.kb_file
    if not FLAGS.output_prefix.strip():
        FLAGS.output_prefix = ''

    output_data_pattern = output_dir + '/{0}data.json'
    output_kb_pattern = output_dir + '/{0}kb.json'

    # load data and do standardization
    raw_data, raw_kb = load_and_drop(input_data_file,
                                     input_kb_file,
                                     drop_incorrect=not FLAGS.keep_incorrect,
                                     verbose=FLAGS.verbose)

    write_infer_json(
        raw_data, raw_kb,
        output_data_pattern.format(FLAGS.output_prefix + '_infer_src_'),
        output_data_pattern.format(FLAGS.output_prefix + '_infer_tgt_'),
        output_kb_pattern.format(FLAGS.output_prefix + '_infer_'))
Example #3
def main(argv):
  del argv  # Unused.
  if not gfile.Exists(FLAGS.save_dir):
    gfile.MkDir(FLAGS.save_dir)
  charting_filepath = os.path.join(FLAGS.save_dir,
                                   FLAGS.dataset + '_charts.pdf')
  sampling_methods = FLAGS.sampling_methods.split(',')
  scoring_methods = FLAGS.scoring_methods.split(',')
  files = gfile.Glob(
      os.path.join(FLAGS.source_dir, FLAGS.dataset + '*/results*.pkl'))
  files = [
      f for f in files
      if (get_sampling_method(FLAGS.dataset, f) in sampling_methods and
          get_scoring_method(f) in scoring_methods and
          get_normalize(f) == FLAGS.normalize and
          get_standardize(f) == FLAGS.standardize)
  ]

  print('Reading in %d files...' % len(files))
  all_results = combine_results(files)
  pdf = PdfPages(charting_filepath)

  print('Plotting charts...')
  plt.style.use('ggplot')
  for m in scoring_methods:
    plot_results(
        all_results,
        m,
        FLAGS.normalize,
        FLAGS.standardize,
        sampler_filter=sampling_methods)
    plt.title('Dataset: %s, Score Method: %s' % (FLAGS.dataset, m))
    pdf.savefig()
    plt.close()
  pdf.close()
Example #4
def save_config_file(config_file, dest_dir):
    if not gfile.Exists(dest_dir):
        gfile.MkDir(dest_dir)

    return gfile.Copy(
        config_file,
        os.path.join(dest_dir, 'blueoil_config.yaml')
    )
Example #5
def write_production():
    """Copies staged templates to production directory.

  This function assumes that the template and associated metadata files are
  stored in a folder of the form gs://<template_staging_bucket>/<release_name>.
  It copies the templates from the <release_name> folder to two new locations:
  gs://<prod_bucket>/<release_name> and gs://<prod_bucket>/latest. Both
  folders contain identical contents; the <release_name> bucket is to allow
  customers to pin to a specific release and the `latest` folder gives the UI
  a location at which to point.

  Raises:
    GOSError if there was an error reading or writing a file.
  """
    prod_root = FLAGS.template_prod_bucket
    template_staging_root = FLAGS.template_staging_bucket

    template_dir = os.path.join(template_staging_root, FLAGS.candidate_name)
    if not gfile.IsDirectory(template_dir):
        logging.fatal(
            'Template staging directory %s does not exist or is not a '
            'directory.', template_dir)

    release_dir = os.path.join(prod_root, FLAGS.release_name)
    if gfile.IsDirectory(release_dir):
        logging.fatal(
            'Template release directory %s already exists. Aborting.',
            release_dir)

    logging.info('Copying folder from %s to %s.', template_dir, release_dir)
    gfile.MkDir(release_dir)
    CopyRecursively(template_dir, release_dir)

    # TODO: If we ever delete templates, they will stick around in
    # `latest`; evaluate something rsync-like in the future.
    latest_dir = os.path.join(prod_root, LATEST_FOLDER_NAME)
    if gfile.Exists(latest_dir):
        if not gfile.IsDirectory(latest_dir):
            gfile.Remove(latest_dir)
            gfile.MkDir(latest_dir)
    else:
        gfile.MkDir(latest_dir)

    logging.info('Copying folder from %s to %s.', template_dir, latest_dir)
    CopyRecursively(template_dir, latest_dir, overwrite=True)
Example #6
def CopyRecursively(src, dst, overwrite=False):
    entries = gfile.ListDirectory(src)
    for entry in entries:
        src_path = os.path.join(src, entry)
        dst_path = os.path.join(dst, entry)
        if gfile.IsDirectory(src_path):
            gfile.MkDir(dst_path)
            CopyRecursively(src_path, dst_path, overwrite)
        else:
            gfile.Copy(src_path, dst_path, overwrite)
Example #7
def save_params_and_step(params, step, output_dir):
  """Save params and step in output dir."""
  if output_dir is not None:
    if not gfile.Exists(output_dir):
      log("Creating directory %s" % output_dir)
      gfile.MkDir(output_dir)
    params_file = os.path.join(output_dir, "model.pkl")
    with gfile.Open(params_file, "wb") as f:
      pickle.dump((params, step), f)
    log("Model saved to %s" % params_file, stdout=False)
Example #8
def make_dir(d):
    """Make dir with tensorflow.gfile engine.

    Args:
        d: directory
    """
    if not gfile.Exists(d):
        try:
            gfile.MkDir(d)
        except:
            print(('WARNING: error creating save directory, '
                   'directory most likely already created.'))
Example #9
def save_config_file(config_file, dest_dir):
    if not gfile.Exists(dest_dir):
        gfile.MkDir(dest_dir)

    config_file_dest = os.path.join(dest_dir, 'blueoil_config.yaml')

    # HACK: This is for tensorflow bug workaround.
    # We can remove following 2 lines once it's been resolved in tensorflow
    # issue link: https://github.com/tensorflow/tensorflow/issues/28508
    if gfile.Exists(config_file_dest):
        gfile.Remove(config_file_dest)

    return gfile.Copy(config_file, config_file_dest)
Example #10
def get_mldata(dataset):
    # Use scikit to grab datasets and save them save_dir.
    save_dir = FLAGS.save_dir
    filename = os.path.join(save_dir, dataset[1] + '.pkl')

    if not gfile.Exists(save_dir):
        gfile.MkDir(save_dir)  # Note: this call cannot create nested directories; os.makedirs() could be used instead
    if not gfile.Exists(filename):
        if dataset[0][-3:] == 'csv':
            data = get_csv_data(dataset[0])
        elif dataset[0] == 'breast_cancer':
            data = load_breast_cancer()
        elif dataset[0] == 'iris':
            data = load_iris()
        elif dataset[0] == 'newsgroup':
            # Removing header information to make sure that no newsgroup identifying
            # information is included in data
            data = fetch_20newsgroups_vectorized(subset='all',
                                                 remove=('headers',))
            tfidf = TfidfTransformer(norm='l2')
            X = tfidf.fit_transform(data.data)
            data.data = X
        elif dataset[0] == 'rcv1':
            sklearn.datasets.rcv1.URL = (
                'http://www.ai.mit.edu/projects/jmlr/papers/'
                'volume5/lewis04a/a13-vector-files/lyrl2004_vectors')
            sklearn.datasets.rcv1.URL_topics = (
                'http://www.ai.mit.edu/projects/jmlr/papers/'
                'volume5/lewis04a/a08-topic-qrels/rcv1-v2.topics.qrels.gz')
            data = sklearn.datasets.fetch_rcv1(data_home='/tmp')
        elif dataset[0] == 'wikipedia_attack':
            data = get_wikipedia_talk_data()
        elif dataset[0] == 'cifar10':
            data = get_cifar10()
        elif 'keras' in dataset[0]:
            data = get_keras_data(dataset[0])
        else:
            try:
                data = fetch_mldata(dataset[0])
            except:
                raise Exception('ERROR: failed to fetch data from mldata.org')
        X = data.data
        y = data.target
        if X.shape[0] != y.shape[0]:
            X = np.transpose(X)  # transpose() swaps the array's row and column indices, i.e. the matrix transpose
        assert X.shape[0] == y.shape[0]  # assert raises an exception when the condition is False

        data = {'data': X, 'target': y}
        pickle.dump(data, gfile.GFile(filename, 'wb'))
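As the comment in get_mldata notes, gfile.MkDir only creates a single directory level. If save_dir may be nested, a short sketch using the recursive variant (gfile.MakeDirs also creates missing parent directories; os.makedirs would work for local paths):

    if not gfile.Exists(save_dir):
        # MakeDirs creates intermediate directories as well, unlike MkDir.
        gfile.MakeDirs(save_dir)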
Example #11
def main(argv):
    del argv  # Unused.
    if not gfile.Exists(FLAGS.save_dir):
        gfile.MkDir(FLAGS.save_dir)
        print('------- Save dir is made\n')
    #charting_filepath = os.path.join(FLAGS.save_dir,
    #FLAGS.dataset + '_charts.pdf')
    charting_filepath = 'heart_charts.pdf'
    print('---------- Charting filepath is {}\n'.format(charting_filepath))
    sampling_methods = FLAGS.sampling_methods.split(',')
    scoring_methods = FLAGS.scoring_methods.split(',')
    #files = gfile.Glob(
    #os.path.join(FLAGS.source_dir, FLAGS.dataset + '_' + FLAGS.sampling_methods + '*/results*.pkl'))
    files = [
        'tmp/toy_experiments/heart_margin/results_score_logistic_select_linear_svm_norm_False_stand_True_000.pkl'
    ]
    print('------------- Files in CHart Data script are {}\n'.format(files))
    files = [
        f for f in files
        if (get_sampling_method(FLAGS.dataset, f) in sampling_methods
            and get_scoring_method(f) in scoring_methods and get_normalize(f)
            == FLAGS.normalize and get_standardize(f) == FLAGS.standardize)
    ]

    print('Reading in %d files...' % len(files))
    all_results = combine_results(files)
    pdf = PdfPages(charting_filepath)
    print('---------- PDF file is made\n')

    print('Plotting charts...')
    plt.style.use('ggplot')
    for m in scoring_methods:
        plot_results(all_results,
                     m,
                     FLAGS.normalize,
                     FLAGS.standardize,
                     sampler_filter=sampling_methods)
        plt.title('Dataset: %s, Score Method: %s' % (FLAGS.dataset, m))
        pdf.savefig()
        plt.close()
    print('---------- Finished plotting')
    pdf.close()
Example #12
def handle_profile_api(profile_path, profile_cnt):
  """Handles profile API requests."""
  options = json.loads(flask.request.args.get('options'))

  # Determine view and output format.
  if options['view'] == 'pprof':
    output_format = 'pprof'
  elif options['view'] == 'graph':
    output_format = 'timeline'
  else:
    output_format = 'file'

  profile_dir = os.path.realpath(profile_path)
  resources_dir = os.path.join(profile_dir, 'resources')
  if not os.path.isdir(resources_dir):
    gfile.MkDir(resources_dir)

  if output_format == 'pprof':
    return produce_pprof_profile(profile_dir, resources_dir, profile_cnt, options)
  elif output_format == 'timeline':
    return produce_timeline_profile(profile_dir, resources_dir, profile_cnt, options)
  else:
    return produce_other_profile(profile_dir, resources_dir, profile_cnt, options)
Example #13
def main(argv):
    del argv

    if not gfile.Exists(FLAGS.save_dir):
        try:
            gfile.MkDir(FLAGS.save_dir)
        except:
            print(('WARNING: error creating save directory, '
                   'directory most likely already created.'))

    save_dir = os.path.join(FLAGS.save_dir,
                            FLAGS.dataset + '_' + FLAGS.sampling_method)

    if FLAGS.do_save == "True":
        if not gfile.Exists(save_dir):
            try:
                gfile.MkDir(save_dir)
            except:
                print(('WARNING: error creating save directory, '
                       'directory most likely already created.'))

        # Set up logging
        filename = os.path.join(
            save_dir,
            "log-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".txt")
        sys.stdout = utils.Logger(filename)

    X, y = utils.get_mldata(FLAGS.data_dir, FLAGS.dataset)  #load dataset!
    starting_seed = FLAGS.seed

    all_results = {}

    for seed in range(starting_seed, starting_seed + FLAGS.trials):
        sampler = get_AL_sampler(FLAGS.sampling_method)  #load sampler!
        score_model = utils.get_model(FLAGS.score_method,
                                      seed)  #load score model!
        if (FLAGS.select_method == "None" or  #load select model!
                FLAGS.select_method == FLAGS.score_method):
            select_model = None
        else:
            select_model = utils.get_model(FLAGS.select_method, seed)

        results, sampler_state = generate_one_curve(
            X=X,
            y=y,
            sampler=sampler,
            score_model=score_model,
            seed=seed,
            warmstart_size=FLAGS.warmstart_size,
            batch_size=FLAGS.batch_size,
            select_model=select_model,
            max_points=FLAGS.max_dataset_size)

        key = (FLAGS.dataset, FLAGS.sampling_method, FLAGS.score_method,
               FLAGS.select_method, FLAGS.warmstart_size, FLAGS.batch_size,
               seed)

        #sampler_output = sampler_state.to_dict()
        #results['sampler_output'] = sampler_output
        results['sampler_output'] = None
        all_results[key] = results

    fields = [
        'dataset', 'sampling_methods', 'score_method', 'select_method',
        'warmstart size', 'batch size', 'seed'
    ]
    all_results['tuple_keys'] = fields

    if FLAGS.do_save == "True":
        filename = ("results_score_" + FLAGS.score_method + "_select_" +
                    FLAGS.select_method)
        existing_files = gfile.Glob(os.path.join(save_dir, filename + "*.pkl"))
        filename = os.path.join(
            save_dir,
            filename + "_" + str(1000 + len(existing_files))[1:] + ".pkl")
        pickle.dump(all_results, gfile.GFile(filename, "wb"))
        sys.stdout.flush_file()
Example #14
def main(argv):
  del argv

  if not gfile.Exists(FLAGS.save_dir):
    try:
      gfile.MkDir(FLAGS.save_dir)
    except:
      print(('WARNING: error creating save directory, '
             'directory most likely already created.'))

  save_dir = os.path.join(
      FLAGS.save_dir,
      FLAGS.dataset + "_" + FLAGS.sampling_method)
  do_save = FLAGS.do_save == "True"

  if do_save:
    if not gfile.Exists(save_dir):
      try:
        gfile.MkDir(save_dir)
      except:
        print(('WARNING: error creating save directory, '
               'directory most likely already created.'))
    # Set up logging
    filename = os.path.join(
        save_dir, "log-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime()) + ".txt")
    sys.stdout = utils.Logger(filename)

  confusions = [float(t) for t in FLAGS.confusions.split(" ")]
  mixtures = [float(t) for t in FLAGS.active_sampling_percentage.split(" ")]
  all_results = {}
  max_dataset_size = None if FLAGS.max_dataset_size == "0" else int(
      FLAGS.max_dataset_size)
  normalize_data = FLAGS.normalize_data == "True"
  standardize_data = FLAGS.standardize_data == "True"
  X, y = utils.get_mldata(FLAGS.data_dir, FLAGS.dataset)
  starting_seed = FLAGS.seed

  for c in confusions:
    for m in mixtures:
      for seed in range(starting_seed, starting_seed + FLAGS.trials):
        sampler = get_AL_sampler(FLAGS.sampling_method)
        score_model = utils.get_model(FLAGS.score_method, seed)
        if (FLAGS.select_method == "None" or
            FLAGS.select_method == FLAGS.score_method):
          select_model = None
        else:
          select_model = utils.get_model(FLAGS.select_method, seed)
        results, sampler_state = generate_one_curve(
            X, y, sampler, score_model, seed, FLAGS.warmstart_size,
            FLAGS.batch_size, select_model, c, m, max_dataset_size,
            standardize_data, normalize_data, FLAGS.train_horizon)
        key = (FLAGS.dataset, FLAGS.sampling_method, FLAGS.score_method,
               FLAGS.select_method, m, FLAGS.warmstart_size, FLAGS.batch_size,
               c, standardize_data, normalize_data, seed)
        sampler_output = sampler_state.to_dict()
        results["sampler_output"] = sampler_output
        all_results[key] = results
  fields = [
      "dataset", "sampler", "score_method", "select_method",
      "active percentage", "warmstart size", "batch size", "confusion",
      "standardize", "normalize", "seed"
  ]
  all_results["tuple_keys"] = fields

  if do_save:
    filename = ("results_score_" + FLAGS.score_method +
                "_select_" + FLAGS.select_method +
                "_norm_" + str(normalize_data) +
                "_stand_" + str(standardize_data))
    existing_files = gfile.Glob(os.path.join(save_dir, filename + "*.pkl"))
    filename = os.path.join(save_dir,
                            filename + "_" + str(1000+len(existing_files))[1:] + ".pkl")
    pickle.dump(all_results, gfile.GFile(filename, "wb"))
    sys.stdout.flush_file()
Example #15
def main(FLAGS):
    all_jobs = process_job_type(FLAGS.job_type, FLAGS.input_type)
    output_dir = FLAGS.output_dir
    if FLAGS.verbose:
        print('all_jobs', all_jobs)
        print('input_type', FLAGS.input_type)
        print('output_dir', output_dir)
        print('data_file', FLAGS.data_file)
        print('kb_file', FLAGS.kb_file)
        print('output_prefix', FLAGS.output_prefix)
        print('skip_standardize', FLAGS.skip_standardize)
        print('keep_incorrect', FLAGS.keep_incorrect)
        print('word_cutoff', FLAGS.word_cutoff)
        print('gen_voc', FLAGS.gen_voc)

    if not tf.io.gfile.isdir(output_dir):
        gfile.MkDir(output_dir)

    input_data_file = FLAGS.data_file
    input_kb_file = FLAGS.kb_file
    if not FLAGS.output_prefix.strip():
        FLAGS.output_prefix = ''
    # output_vab = output_dir + '/{0}.vocab'.format(FLAGS.output_prefix)
    output_vab = output_dir + '/vocab.txt'
    output_all_vab = output_dir + '/{0}.full.vocab'.format(FLAGS.output_prefix)
    all_token_file = output_dir + '/{0}.special.vocab'.format(
        FLAGS.output_prefix)
    first_name_cats_file = output_dir + '/{0}.firstname.cat'.format(
        FLAGS.output_prefix)
    last_name_cats_file = output_dir + '/{0}.lastname.cat'.format(
        FLAGS.output_prefix)
    flight_cats_file = output_dir + '/{0}.flight.cat'.format(
        FLAGS.output_prefix)
    status_cats_file = output_dir + '/{0}.status.cat'.format(
        FLAGS.output_prefix)

    output_data_pattern = output_dir + '/{0}data'
    output_kb_pattern = output_dir + '/{0}kb'

    nltk_path = FLAGS.ntlk_data
    nltk.data.path.append(nltk_path)
    sent_tokenize = nltk.sent_tokenize

    vocal_map = {}
    # load data and do standardization
    if not FLAGS.skip_standardize:
        raw_data, raw_kb = standardize_and_drop(
            input_data_file,
            input_kb_file,
            drop_incorrect=not FLAGS.keep_incorrect,
            verbose=FLAGS.verbose)
    else:
        raw_data, raw_kb = load_and_drop(
            input_data_file,
            input_kb_file,
            drop_incorrect=not FLAGS.drop_incorrect,
            verbose=FLAGS.verbose)
    # has to be there no matter what
    if FLAGS.verbose: print('processing kb')
    processed_kb, vocal_map = process_kb(raw_kb, vocal_map)
    # if dialogue, everything will be there.
    # if context, only intents, actions, vocal_map will be there
    if FLAGS.verbose: print('processing data')
    result = process_main_data(raw_data,
                               sent_tokenize,
                               word_tokenize,
                               vocal_map,
                               input_type=FLAGS.input_type)
    intents, actions, expected_actions, dialogues, vocal_map, boundaries1, boundaries2, cats = result
    frequency_cutoff = FLAGS.word_cutoff
    # 3 is the number of special tokens
    if FLAGS.verbose: print('vocabulary before cutoff', len(vocal_map) + 3)
    if not FLAGS.gen_voc:
        # if we choose not to generate vocabulary file, we set output_vab to None
        output_vab = None
    if not FLAGS.gen_voc_map:
        output_all_vab = None
    vocal_map = write_vocabulary(output_vab, output_all_vab, vocal_map,
                                 frequency_cutoff, FLAGS.keep_non_ascii)
    if FLAGS.gen_cat:
        if FLAGS.verbose: print('writing category')
        cat_files = [
            first_name_cats_file, last_name_cats_file, flight_cats_file,
            status_cats_file
        ]
        write_cat(cat_files, cats)

    if FLAGS.verbose:
        print('frequency_cutoff= {0}, vocabulary after cutoff'.format(
            frequency_cutoff), len(vocal_map))
    data = reorganize_data(intents, actions, expected_actions, dialogues,
                           processed_kb, boundaries1, boundaries2)

    if 'train' in all_jobs:
        if FLAGS.verbose:
            print('writing train data')
        write_data(data, output_data_pattern.format(FLAGS.output_prefix + '.'),
                   output_kb_pattern.format(FLAGS.output_prefix + '.'))
    if 'eval' in all_jobs:
        if FLAGS.verbose:
            print('writing eval data')
        write_data(data,
                   output_data_pattern.format(FLAGS.output_prefix + '.eval.'),
                   output_kb_pattern.format(FLAGS.output_prefix + '.eval.'))
    if 'infer' in all_jobs:
        if FLAGS.verbose: print('writing infer data')
        write_completion(
            data,
            output_data_pattern.format(FLAGS.output_prefix + '.infer.src.'),
            output_data_pattern.format(FLAGS.output_prefix + '.infer.tar.'),
            output_kb_pattern.format(FLAGS.output_prefix + '.infer.'))
    if 'sp-train' in all_jobs:
        if FLAGS.verbose: print('writing self play training data')
        write_self_play(
            data,
            output_data_pattern.format(FLAGS.output_prefix + '.selfplay.'),
            output_kb_pattern.format(FLAGS.output_prefix + '.selfplay.'))
    if 'sp-eval' in all_jobs:
        if FLAGS.verbose: print('writing self play eval data')
        write_self_play(
            data,
            output_data_pattern.format(FLAGS.output_prefix +
                                       '.selfplay.eval.'),
            output_kb_pattern.format(FLAGS.output_prefix + '.selfplay.eval.'))

    if FLAGS.gen_special_token:
        # write all token file.
        f_tokens = gfile.Open(all_token_file, 'w')
        for token in list(list_of_action_tokens_except_name):
            f_tokens.write(token + '\n')
        f_tokens.close()