Code Example #1
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    news_config = GroverConfig.from_json_file(FLAGS.config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    print(FLAGS.input_file.split(","))
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(
        news_config,
        init_checkpoint=FLAGS.init_checkpoint,
        learning_rate=FLAGS.learning_rate,
        num_train_steps=FLAGS.num_train_steps,
        num_warmup_steps=FLAGS.num_warmup_steps,
        use_tpu=FLAGS.use_tpu,
    )

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.train_batch_size,
        params={'model_dir': FLAGS.output_dir})

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    train_input_fn = input_fn_builder(input_files=input_files,
                                      seq_length=FLAGS.max_seq_length,
                                      is_training=True)

    print("Start trainning.............................................")
    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)
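
Note on Example #1: `input_fn_builder` is called without a batch size because `TPUEstimator` passes the effective batch size to the input_fn through `params` at run time. Below is a minimal sketch of an input_fn written that way, assuming TFRecords with a single `input_ids` feature of length `seq_length + 1`; the feature name and record layout are assumptions, not taken from the original builder.

def input_fn_builder(input_files, seq_length, is_training):
    """Returns an input_fn whose batch size is read from params (set by TPUEstimator)."""
    name_to_features = {
        # Assumed record layout; the real builder may differ.
        "input_ids": tf.FixedLenFeature([seq_length + 1], tf.int64),
    }

    def input_fn(params):
        batch_size = params["batch_size"]  # injected by TPUEstimator
        d = tf.data.TFRecordDataset(input_files)
        if is_training:
            d = d.repeat().shuffle(buffer_size=100)
        d = d.map(lambda record: tf.parse_single_example(record, name_to_features))
        return d.batch(batch_size, drop_remainder=True)

    return input_fn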
Code Example #2
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)
    news_config = GroverConfig.from_json_file(FLAGS.config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    my_per_process_gpu_memory_fraction = 1.0
    gpu_options = tf.GPUOptions(
        per_process_gpu_memory_fraction=my_per_process_gpu_memory_fraction)
    sess_config = tf.ConfigProto(gpu_options=gpu_options)
    run_config = tf.estimator.RunConfig(
        model_dir=FLAGS.output_dir,
        session_config=sess_config,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=None)

    model_fn = model_fn_builder(news_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.estimator.Estimator(model_fn=model_fn, config=run_config)

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    train_input_fn = input_fn_builder(input_files=input_files,
                                      seq_length=FLAGS.max_seq_length,
                                      is_training=True,
                                      batch_size=FLAGS.train_batch_size)

    print("Start trainning.............................................")
    estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)
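
Both variants rely on module-level FLAGS that are not part of these excerpts. The following is a minimal sketch of how such flags are typically declared with `tf.flags` in TF 1.x; the flag names follow the usage above, but every default value here is an assumption.

flags = tf.flags
FLAGS = flags.FLAGS

flags.DEFINE_string("input_file", None, "Comma-separated list of input TFRecord patterns.")
flags.DEFINE_string("output_dir", None, "Directory for checkpoints and logs.")
flags.DEFINE_string("config_file", None, "Path to the GroverConfig JSON file.")
flags.DEFINE_string("init_checkpoint", None, "Optional checkpoint to warm-start from.")
flags.DEFINE_integer("max_seq_length", 1024, "Sequence length (assumed default).")
flags.DEFINE_integer("train_batch_size", 32, "Training batch size (assumed default).")
flags.DEFINE_integer("num_train_steps", 100000, "Total training steps (assumed default).")
flags.DEFINE_integer("num_warmup_steps", 10000, "Warmup steps (assumed default).")
flags.DEFINE_float("learning_rate", 1e-4, "Learning rate (assumed default).")
flags.DEFINE_integer("save_checkpoints_steps", 1000, "Checkpoint save interval (assumed default).")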
Code Example #3
File: demo1.py  Project: r2dh4t/Sense_art
def extract_generated_target(output_tokens, tokenizer):
    """Given generated tokens, return the decoded text with its start/end indices."""
    start_ind = 0
    end_ind = output_tokens.shape[0]
    return {
        'extraction':
        tokenization.printable_text(''.join(
            tokenizer.convert_ids_to_tokens(output_tokens))),
        'start_ind':
        start_ind,
        'end_ind':
        end_ind,
    }


args = parser.parse_args()
proj_root_path = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
vocab_file_path = os.path.join(proj_root_path, "tokenization/clue-vocab.txt")
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path,
                                       do_lower_case=True)
news_config = GroverConfig.from_json_file(args.config_fn)

# We might have to split the batch into multiple chunks if the batch size is too large
default_mbs = {12: 32, 24: 16, 48: 3}
max_batch_size = args.max_batch_size if args.max_batch_size is not None else default_mbs[
    news_config.num_hidden_layers]

# factorize args.batch_size = (num_chunks * batch_size_per_chunk) s.t. batch_size_per_chunk < max_batch_size
num_chunks = int(np.ceil(args.batch_size / max_batch_size))
batch_size_per_chunk = int(np.ceil(args.batch_size / num_chunks))

# This controls the top p for each generation.
top_p = np.ones(
    (num_chunks, batch_size_per_chunk), dtype=np.float32) * args.top_p

tf_config = tf.ConfigProto(allow_soft_placement=True)
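
The chunking code in Example #3 splits the requested batch into equal chunks that each fit under `max_batch_size`. Here is a small worked example with assumed numbers (batch_size=20 and a 24-layer config, so max_batch_size=16):

import numpy as np

batch_size = 20        # assumed value, for illustration only
max_batch_size = 16    # default_mbs entry for num_hidden_layers == 24

num_chunks = int(np.ceil(batch_size / max_batch_size))        # ceil(20 / 16) = 2
batch_size_per_chunk = int(np.ceil(batch_size / num_chunks))  # ceil(20 / 2)  = 10

# Two chunks of 10 samples each: 2 * 10 = 20 covers the requested batch,
# and each chunk stays below max_batch_size.
print(num_chunks, batch_size_per_chunk)  # -> 2 10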
Code Example #4
def main(_):
    tf.logging.set_verbosity(tf.logging.INFO)

    news_config = GroverConfig.from_json_file(FLAGS.config_file)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    input_files = []
    for input_pattern in FLAGS.input_file.split(","):
        input_files.extend(tf.gfile.Glob(input_pattern))

    tf.logging.info("*** Input Files ***")
    for input_file in input_files:
        tf.logging.info("  %s" % input_file)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.compat.v1.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.compat.v1.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        keep_checkpoint_max=None,
        tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    model_fn = model_fn_builder(news_config, init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=FLAGS.num_train_steps,
                                num_warmup_steps=FLAGS.num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                )

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.train_batch_size,
        params={'model_dir': FLAGS.output_dir}
    )

    tf.logging.info("***** Running training *****")
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    train_input_fn = input_fn_builder(
        input_files=input_files,
        seq_length=FLAGS.max_seq_length,
        is_training=True)

    try:
        estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps)
    except KeyboardInterrupt:
        def serving_input_receiver_fn():
            """Serving input_fn that builds features from placeholders

            Returns
            -------
            tf.estimator.export.ServingInputReceiver
            """
            number = tf.placeholder(dtype=tf.int32, shape=[FLAGS.max_seq_length + 1], name='feature')
            receiver_tensors = {'input_ids': number}
            return tf.estimator.export.ServingInputReceiver(number, receiver_tensors)
        
        export_path = estimator.export_saved_model("./model_save", serving_input_receiver_fn)
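
Example #4 exports a SavedModel when training is interrupted. A minimal sketch of loading that export for inference with `tf.contrib.predictor` (TF 1.x) follows: the feed key `input_ids` matches the `receiver_tensors` above, but the output key names depend on the `export_outputs` defined inside `model_fn_builder` and are not known from this excerpt.

import numpy as np
from tensorflow.contrib import predictor

# export_path is the (bytes) directory path returned by export_saved_model above.
predict_fn = predictor.from_saved_model(export_path.decode("utf-8"))

dummy_input = np.zeros([FLAGS.max_seq_length + 1], dtype=np.int32)
outputs = predict_fn({"input_ids": dummy_input})
print(outputs.keys())  # inspect the actual output signature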
Code Example #5
File: gpt_predict.py  Project: yyht/gpt2_ml_my
    def __init__(self):
        """ Configuration adapter for `ez_bert_feat`
            It adapts user command args to configuration protocol of `ez_transfer` engine
        """
        input_table = FLAGS.tables
        output_table = FLAGS.outputs

        all_input_col_names = get_all_columns_name(input_table)
        
        first_sequence = _APP_FLAGS.firstSequence
        assert first_sequence in all_input_col_names, "The first sequence should be in input schema"
        second_sequence = _APP_FLAGS.secondSequence
        if second_sequence not in all_input_col_names:
            second_sequence = ""
        append_columns = [t for t in _APP_FLAGS.appendCols.split(",") if t and t in all_input_col_names] \
                          if _APP_FLAGS.appendCols else []
        tf.logging.info(input_table)
        
        selected_cols_set = [first_sequence]
        if second_sequence:
            selected_cols_set.append(second_sequence)
        selected_cols_set.extend(append_columns)
        selected_cols_set = set(selected_cols_set)
        input_schema = get_selected_columns_schema(input_table, selected_cols_set)
        
        output_schema = _APP_FLAGS.outputSchema
        for column_name in append_columns:
            output_schema += "," + column_name

        config_json = {
            "preprocess_config": {
                "input_schema": input_schema,
                "output_schema": output_schema,
                "first_sequence": first_sequence,
                "second_sequence": second_sequence,
                'sequence_length': _APP_FLAGS.sequenceLength,
            },
            "model_config": {
                "my_vocab_path": "oss://alg-misc/BERT/bert_pretrain/open_domain/gpt/mega_clue_vocab/clue-vocab.txt",
            },
            "predict_config": {
                "predict_input_fp": None,
                "predict_batch_size": 1,
                "predict_output_fp": None
            }
        }
        config_json["worker_hosts"] = FLAGS.worker_hosts
        config_json["task_index"] = FLAGS.task_index
        config_json["job_name"] = FLAGS.job_name
        config_json["num_gpus"] = FLAGS.workerGPU
        config_json["num_workers"] = FLAGS.workerCount

        self.worker_hosts = str(config_json["worker_hosts"])
        self.task_index = int(config_json["task_index"])
        self.job_name = str(config_json["job_name"])
        self.num_gpus = int(config_json["num_gpus"])
        self.num_workers = int(config_json["num_workers"])

        self.input_schema = config_json['preprocess_config']['input_schema']
        self.label_name = config_json['preprocess_config'].get('label_name', None)
        self.label_enumerate_values = config_json['preprocess_config'].get('label_enumerate_values', None)
        self.output_schema = config_json['preprocess_config'].get('output_schema', None)
        self.sequence_length = config_json['preprocess_config']['sequence_length']
        self.first_sequence = config_json['preprocess_config']['first_sequence']
        self.second_sequence = config_json['preprocess_config']['second_sequence']

        self.vocab_file_path = config_json['model_config']['my_vocab_path']

        self.predict_input_fp = config_json['predict_config']['predict_input_fp']
        self.predict_output_fp = config_json['predict_config'].get('predict_output_fp', None)
        self.predict_batch_size = config_json['predict_config']['predict_batch_size']

        self.news_config = GroverConfig.from_json_file('oss://alg-misc/BERT/bert_pretrain/open_domain/gpt/mega_clue_vocab/mega.json')
        self.ckpt_fn = "oss://alg-misc/BERT/bert_pretrain/open_domain/gpt/mega_clue_vocab/model.ckpt-220000"
Code Example #6
File: app_gpt2.py  Project: Holden-Lin/gpt2_chinese
def predict():

    ##### temporarily silence TF deprecation warnings
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

    # macOS-specific setting; comment this out when running on other systems
    os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

    tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.DEBUG)
    from tensorflow.python.util import deprecation
    deprecation._PRINT_DEPRECATION_WARNINGS = False
    try:
        from tensorflow.python.util import module_wrapper as deprecation
    except ImportError:
        from tensorflow.python.util import deprecation_wrapper as deprecation
    deprecation._PER_MODULE_WARNING_LIMIT = 0
    #####

    parser = argparse.ArgumentParser(description='Contextual generation (i.e. given some metadata we will generate articles)')
    parser.add_argument(
        '-metadata_fn',
        dest='metadata_fn',
        type=str,
        help='Path to a JSONL containing metadata',
    )
    parser.add_argument(
        '-out_fn',
        dest='out_fn',
        type=str,
        help='Out jsonl, which will contain the completed jsons',
    )
    parser.add_argument(
        '-input',
        dest='input',
        type=str,
        help='Text to complete',
    )
    parser.add_argument(
        '-model_config_fn',
        dest='model_config_fn',
        default='configs/mega.json',
        type=str,
        help='Configuration JSON for the model',
    )
    parser.add_argument(
        '-model_ckpt',
        dest='model_ckpt',
        default='model.ckpt-220000',
        type=str,
        help='checkpoint file for the model',
    )
    parser.add_argument(
        '-target',
        dest='target',
        default='article',
        type=str,
        help='What to generate for each item in metadata_fn. can be article (body), title, etc.',
    )
    parser.add_argument(
        '-batch_size',
        dest='batch_size',
        default=1,
        type=int,
        help='How many things to generate per context. will split into chunks if need be',
    )
    parser.add_argument(
        '-num_folds',
        dest='num_folds',
        default=1,
        type=int,
        help='Number of folds. useful if we want to split up a big file into multiple jobs.',
    )
    parser.add_argument(
        '-fold',
        dest='fold',
        default=0,
        type=int,
        help='which fold we are on. useful if we want to split up a big file into multiple jobs.'
    )
    parser.add_argument(
        '-max_batch_size',
        dest='max_batch_size',
        default=None,
        type=int,
        help='max batch size. You can leave this out and we will infer one based on the number of hidden layers',
    )
    parser.add_argument(
        '-top_p',
        dest='top_p',
        default=0.95,
        type=float,
        help='p to use for top-p sampling. If this isn\'t None, use it for everything'
    )
    parser.add_argument(
        '-min_len',
        dest='min_len',
        default=1024,
        type=int,
        help='min length of sample',
    )
    parser.add_argument(
        '-eos_token',
        dest='eos_token',
        default=60000,
        type=int,
        help='eos token id',
    )
    parser.add_argument(
        '-samples',
        dest='samples',
        default=5,
        type=int,
        help='num_samples',
    )

    def extract_generated_target(output_tokens, tokenizer):
        """
        Given some tokens that were generated, extract the target
        :param output_tokens: [num_tokens] thing that was generated
        :param encoder: how they were encoded
        :param target: the piece of metadata we wanted to generate!
        :return:
        """
        # Filter out first instance of start token
        assert output_tokens.ndim == 1

        start_ind = 0
        end_ind = output_tokens.shape[0]

        return {
            'extraction': tokenization.printable_text(''.join(tokenizer.convert_ids_to_tokens(output_tokens))),
            'start_ind': start_ind,
            'end_ind': end_ind,
        }

    # args = parser.parse_args()
    args, unknown = parser.parse_known_args()
    proj_root_path = os.path.dirname(os.path.realpath(__file__))
    vocab_file_path = os.path.join(proj_root_path, "tokenization/clue-vocab.txt")

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file_path, do_lower_case=True)
    news_config = GroverConfig.from_json_file(args.model_config_fn)

    # We might have to split the batch into multiple chunks if the batch size is too large
    default_mbs = {12: 32, 24: 16, 48: 3}
    max_batch_size = args.max_batch_size if args.max_batch_size is not None else default_mbs[news_config.num_hidden_layers]

    # factorize args.batch_size = (num_chunks * batch_size_per_chunk) s.t. batch_size_per_chunk < max_batch_size
    num_chunks = int(np.ceil(args.batch_size / max_batch_size))
    batch_size_per_chunk = int(np.ceil(args.batch_size / num_chunks))

    # This controls the top p for each generation.
    top_p = np.ones((num_chunks, batch_size_per_chunk), dtype=np.float32) * args.top_p

    tf_config = tf.ConfigProto(allow_soft_placement=True)

    with tf.Session(config=tf_config, graph=tf.Graph()) as sess:
        initial_context = tf.placeholder(tf.int32, [batch_size_per_chunk, None])
        p_for_topp = tf.placeholder(tf.float32, [batch_size_per_chunk])
        eos_token = tf.placeholder(tf.int32, [])
        min_len = tf.placeholder(tf.int32, [])
        tokens, probs = sample(news_config=news_config, initial_context=initial_context,
                            eos_token=eos_token, min_len=min_len, ignore_ids=None, p_for_topp=p_for_topp,
                            do_topk=False)

        saver = tf.train.Saver()
        saver.restore(sess, args.model_ckpt)

        '''
        When deploying to the web, none of the print calls are needed.
        `input` is replaced by the message returned from the web form.
        The while loop is not needed.
        The final "\n".join(l) is returned in a variable and rendered on the web page.
        The main parameters (number of articles, length) should be entered by the user
        on the web page, or hard-coded here -- they have default values.

        To be resolved:
        There are 5 samples; the loop below runs prediction 5 times. How should these
        5 results be displayed on the page?
        min_len has no effect: even at 1024 the generated article may only be one or
        two hundred characters long.
        '''

        # print('🍺Model loaded. \nInput something please:⬇️')

        text = ""  # default when the request is not a POST (avoids a NameError below)
        if request.method == 'POST':
            text = request.form['message']
            # data = [text]  (code from the original spam-detection app; probably not needed here)

        for i in range(args.samples):
            # print("Sample,", i + 1, " of ", args.samples)
            line = tokenization.convert_to_unicode(text)
            bert_tokens = tokenizer.tokenize(line)
            encoded = tokenizer.convert_tokens_to_ids(bert_tokens)
            context_formatted = []
            context_formatted.extend(encoded)
            # Format context end

            gens = []
            gens_raw = []
            gen_probs = []
            final_result = []

            for chunk_i in range(num_chunks):
                tokens_out, probs_out = sess.run([tokens, probs],
                                                feed_dict={initial_context: [context_formatted] * batch_size_per_chunk,
                                                            eos_token: args.eos_token, min_len: args.min_len,
                                                            p_for_topp: top_p[chunk_i]})

                for t_i, p_i in zip(tokens_out, probs_out):
                    extraction = extract_generated_target(output_tokens=t_i, tokenizer=tokenizer)
                    gens.append(extraction['extraction'])

            l = re.findall('.{1,70}', gens[0].replace('[UNK]', '').replace('##', ''))
            # The value built on the next line is what should eventually be returned
            # print("\n".join(l))
            # returning from inside a for loop:
            # https://stackoverflow.com/questions/44564414/how-to-use-a-return-statement-in-a-for-loop
            final_result.append("\n".join(l))
            

    return render_template('result.html', prediction=final_result)
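
Example #6 reads `request.form['message']` and finishes with `render_template`, so `predict()` is clearly meant to run as a Flask view, but the surrounding app setup is not part of the excerpt. Below is a minimal sketch of the wiring this function appears to assume; the module layout, route path, and port are assumptions.

from flask import Flask, request, render_template

app = Flask(__name__)

# Register predict() for both showing the form (GET) and generating text (POST).
app.add_url_rule("/", endpoint="predict", view_func=predict, methods=["GET", "POST"])

if __name__ == "__main__":
    # Note: predict() rebuilds the graph and restores the checkpoint on every request;
    # a real deployment would do that once at startup instead.
    app.run(host="0.0.0.0", port=5000)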