Example #1
def save(args):
    """Save inference model."""
    gpu_id = 0
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    model.save_inference_model(args.inference_model_path)
    return
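The `save` entry point above writes a static inference program to `args.inference_model_path`. As a rough sketch of the other side of that workflow, this is how a saved fluid inference model is typically loaded back with the classic `fluid.io` API; it assumes the directory follows the standard fluid inference-model layout, and the path is only a placeholder.

import paddle.fluid as fluid

place = fluid.CUDAPlace(0)
exe = fluid.Executor(place)
# Classic fluid 1.x API: returns the inference program plus its feed/fetch lists.
inference_program, feed_names, fetch_targets = fluid.io.load_inference_model(
    dirname="path/to/inference_model",  # placeholder for args.inference_model_path
    executor=exe)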
Example #2
def evaluate(args):
    """Evaluation main function."""
    if args.is_distributed:
        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        phase = "distributed_test"
    else:
        dev_count = 1
        gpu_id = 0
        phase = "test"
    place = fluid.CUDAPlace(gpu_id)

    # setup task and model
    task = tasks.create_task(args)
    model = models.create_model(args, place)

    # setup dataset
    eval_generator = task.get_data_loader(
        model,
        input_file=args.eval_file,
        num_part=model.topo.data_info.size,
        part_id=model.topo.data_info.rank,
        phase=phase
    )
    if model.topo.pp_info.size != 1:
        raise ValueError("Cannot support pipeline in evaluation now!")
    if model.topo.world.size > dev_count:
        raise ValueError("Cannot support evaluation on multiple nodes now!")

    evaluate_dataset(
        task,
        model,
        eval_generator,
        args,
        dev_count,
        gpu_id,
        training_step=0,
        tag="test"
    )
    return
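The distributed/single-GPU setup at the top of `evaluate` reappears almost verbatim in the inference, training, and interaction entry points below. A minimal sketch of how that block could be factored into one helper; `select_device` is a hypothetical name, and passing the CUDA device count in as an argument just keeps the sketch free of a hard Paddle dependency.

import os

def select_device(is_distributed, cuda_device_count=1, base_phase="test"):
    """Hypothetical helper mirroring the setup block of Examples #2, #3 and #5-#8.

    `cuda_device_count` should be fluid.core.get_cuda_device_count() when Paddle
    is available; FLAGS_selected_gpus is the per-worker environment variable set
    by Paddle's distributed launcher.
    """
    if is_distributed:
        dev_count = cuda_device_count
        gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        phase = f"distributed_{base_phase}"
    else:
        dev_count, gpu_id, phase = 1, 0, base_phase
    return dev_count, gpu_id, phase

# Usage matching the head of Example #2:
# dev_count, gpu_id, phase = select_device(args.is_distributed,
#                                          fluid.core.get_cuda_device_count())
# place = fluid.CUDAPlace(gpu_id)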
Example #3
def infer(args):
    """Inference main function."""
    if args.is_distributed:
        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        phase = "distributed_test"
    else:
        dev_count = 1
        gpu_id = 0
        phase = "test"
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)

    # setup dataset
    infer_generator = task.get_data_loader(model,
                                           input_file=args.infer_file,
                                           num_part=model.topo.data_info.size,
                                           part_id=model.topo.data_info.rank,
                                           phase=phase,
                                           is_infer=True)
    if model.topo.pp_info.size != 1:
        raise ValueError("Cannot support pipeline in inference now!")
    if model.topo.sharding_info.size != 1:
        raise ValueError("Cannot support sharding in inference now!")
    if model.topo.world.size > dev_count:
        raise ValueError("Cannot support evaluation on multiple nodes now!")

    # run inference
    timer = Timer()
    timer.start()
    infer_out = {}
    step = 0  # handle the case where there is no input data.
    for step, data in enumerate(infer_generator(), 1):
        predictions = task.infer_step(model, data)
        for pred in predictions:
            infer_out[pred["data_id"]] = pred
        if step % args.log_steps == 0:
            time_cost = timer.pass_time
            print(f"\tstep: {step}, time: {time_cost:.3f}, "
                  f"queue size: {infer_generator.queue.size()}, "
                  f"speed: {step / time_cost:.3f} steps/s")

    time_cost = timer.pass_time
    print(f"[infer] steps: {step} time cost: {time_cost}, "
          f"speed: {step / time_cost} steps/s")

    if args.is_distributed:
        # merge inference outputs in distributed mode.
        part_file = os.path.join(args.save_path,
                                 f"inference_output.part_{gpu_id}")
        with open(part_file, "w") as fp:
            json.dump(infer_out, fp, ensure_ascii=False, indent=2)
        part_finish_file = os.path.join(
            args.save_path, f"inference_output.part_{gpu_id}.finish")
        with open(part_finish_file, "w"):
            pass

    # Only run on master GPU in each node
    if gpu_id != 0:
        return

    if args.is_distributed:
        part_files = f"inference_output.part_*.finish"
        while True:
            ret = subprocess.getoutput(
                f"find {args.save_path} -maxdepth 1 -name {part_files}")
            num_completed = len(ret.split("\n"))
            if num_completed != dev_count:
                time.sleep(1)
                continue
            infer_out = {}
            for dev_id in range(dev_count):
                part_file = os.path.join(args.save_path,
                                         f"inference_output.part_{dev_id}")
                with open(part_file, "r") as fp:
                    part_infer_out = json.load(fp)
                    for data_id in part_infer_out:
                        infer_out[data_id] = part_infer_out[data_id]
            break
        subprocess.getoutput(
            "rm " + os.path.join(args.save_path, f"inference_output.part*"))

    # save inference outputs
    inference_output = os.path.join(args.save_path, "inference_output.txt")
    with open(inference_output, "w") as f:
        for data_id in sorted(infer_out.keys(), key=lambda x: int(x)):
            f.write("\t".join(
                map(str, [
                    infer_out[data_id][name]
                    for name in args.output_name.split(",")
                ])) + "\n")
    print(f"save inference result into: {inference_output}")

    return
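Examples #3 and #6 share the same rendezvous trick for merging distributed outputs: every rank dumps its predictions to `inference_output.part_{gpu_id}` plus an empty `.finish` marker, and GPU 0 polls until all markers exist before merging and cleaning up. A stdlib-only sketch of that pattern, using `glob` in place of the `find`/`rm` shell calls; the file names are kept from the example, everything else is an assumption.

import json
import os
import time
from glob import glob

def write_part(save_path, rank, outputs):
    """Each rank writes its own part plus a .finish marker (as in Example #3)."""
    part_file = os.path.join(save_path, f"inference_output.part_{rank}")
    with open(part_file, "w") as fp:
        json.dump(outputs, fp, ensure_ascii=False, indent=2)
    open(part_file + ".finish", "w").close()

def merge_parts(save_path, num_ranks, poll_interval=1.0):
    """Rank 0: wait for all .finish markers, merge the parts, remove them."""
    while len(glob(os.path.join(save_path, "inference_output.part_*.finish"))) < num_ranks:
        time.sleep(poll_interval)
    merged = {}
    for rank in range(num_ranks):
        part_file = os.path.join(save_path, f"inference_output.part_{rank}")
        with open(part_file, "r") as fp:
            merged.update(json.load(fp))
    for path in glob(os.path.join(save_path, "inference_output.part*")):
        os.remove(path)
    return merged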
Example #4
def infer(args):
    """Main inference function."""
    place = fluid.CUDAPlace(0)

    task = DialogGeneration(args)
    model = models.create_model(args, place)
    task.debug()

    empty_ds_seq = "<ds/> " + " ".join(flatten_ds({})) + " </ds>"
    post_process = PostProcess(args.db_file,
                               normalization=args.normalization,
                               db_guidance=args.db_guidance)

    # record original order and init status
    output_order = []
    # {"dial_id": {"prev_ds": "", "turns": [], "cur_turn_idx": 0}}
    dial_status = defaultdict(dict)
    with open(args.infer_file, "r") as fin:
        next(fin)
        for line in fin:
            dial_id, turn_idx, utt = line.strip().split("\t")
            output_order.append(f"{dial_id}-{turn_idx}")
            if dial_id not in dial_status:
                dial_status[dial_id]["prev_ds"] = empty_ds_seq
                dial_status[dial_id]["turns"] = []
                dial_status[dial_id]["cur_turn_idx"] = 0
            dial_status[dial_id]["turns"].append({
                "utts": utt,
                "turn_idx": turn_idx
            })
    dial_ids = sorted(list(dial_status.keys()))

    # batch inference
    outputs = {}
    timer = Timer()
    batch_idx = 0
    while len(dial_ids) > 0:
        logger.info(f"Batch index: {batch_idx}")
        batch_idx += 1
        timer.start()
        cur_dial_ids = dial_ids[:args.dial_batch_size]

        cur_inputs = {}
        for cur_dial_id in cur_dial_ids:
            cur_dial_turn = dial_status[cur_dial_id]["turns"][
                dial_status[cur_dial_id]["cur_turn_idx"]]
            cur_utt = cur_dial_turn["utts"]
            prev_ds = dial_status[cur_dial_id]["prev_ds"]
            src = f"{cur_utt} [SEP] {prev_ds}\x010"
            cur_inputs[f"{cur_dial_id}-{cur_dial_turn['turn_idx']}"] = src
        cur_outputs = generate(cur_inputs, model, task)
        time_cost_infer = timer.pass_time
        logger.debug(f"Time cost (prediction): {time_cost_infer}")

        # post process
        cur_outputs_postprocess = {}
        for dial_turn_tag, pred_ds in cur_outputs.items():
            dial_id, _ = dial_turn_tag.split("-")
            cur_dial_turn = dial_status[dial_id]["turns"][dial_status[dial_id]
                                                          ["cur_turn_idx"]]
            cur_utt_ls = cur_dial_turn["utts"].split("[SEP]")
            postprocessed_pred_ds = post_process.run(
                pred_ds,
                prev_ds=dial_status[dial_id]["prev_ds"],
                utt_list=cur_utt_ls)
            cur_outputs_postprocess[dial_turn_tag] = postprocessed_pred_ds
        outputs.update(cur_outputs_postprocess)
        time_cost_postprocess = timer.pass_time - time_cost_infer
        logger.debug(f"Time cost (postprocess): {time_cost_postprocess}")

        # update `cur_turn_idx` and `prev_ds`
        for dial_turn_tag in cur_outputs:
            dial_id, _ = dial_turn_tag.split("-")
            dial_status[dial_id]["cur_turn_idx"] += 1
            if dial_status[dial_id]["cur_turn_idx"] >= len(
                    dial_status[dial_id]["turns"]):
                dial_ids.remove(dial_id)
            else:
                dial_status[dial_id]["prev_ds"] = outputs[dial_turn_tag]
        timer.reset()

    # reorder and output
    sample_indices = []
    with open(args.session_to_sample_mapping_file, "r") as fin:
        for line in fin:
            line = line.strip()
            if line:
                sample_indices.append(int(line))
    pred_seqs = [outputs[dial_turn_tag] for dial_turn_tag in output_order]
    pred_sample_labels = [None] * len(pred_seqs)
    for pred_ds_seq, sample_idx in zip(pred_seqs, sample_indices):
        pred_ds_dict = parse_ds(pred_ds_seq, date_prefix="$")
        pred_sample_labels[sample_idx] = pred_ds_dict

    out_seq_file = os.path.join(args.save_path, "inference_output.txt")
    out_sample_label_file = os.path.join(args.save_path,
                                         "inference_labels.json")
    with open(out_seq_file, "w") as fout_seq, open(out_sample_label_file,
                                                   "w") as fout_label:
        fout_seq.write("\n".join(pred_seqs))
        json.dump(pred_sample_labels, fout_label, indent=2)
    logger.info(f"Save inference sequences to `{out_seq_file}`")
    logger.info(f"Save inference sample labels to `{out_sample_label_file}`")
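The while-loop in Example #4 advances a whole batch of dialogues one turn at a time, feeding each dialogue's predicted state back in as `prev_ds` for its next turn and dropping a dialogue from the pool once all of its turns are decoded. A toy sketch of that control flow with a stub in place of `generate(cur_inputs, model, task)`; the field names mirror the example, while the stub generator and the sample data are purely illustrative.

def fake_generate(inputs):
    """Stand-in for generate(...): pretend the model returns a dialogue-state string."""
    return {tag: f"<ds/> turns_seen={src.count('[SEP]')} </ds>" for tag, src in inputs.items()}

def rolling_decode(dial_status, dial_batch_size=2):
    outputs = {}
    dial_ids = sorted(dial_status)
    while dial_ids:
        cur_dial_ids = dial_ids[:dial_batch_size]
        inputs = {}
        for dial_id in cur_dial_ids:
            turn = dial_status[dial_id]["turns"][dial_status[dial_id]["cur_turn_idx"]]
            src = f"{turn['utts']} [SEP] {dial_status[dial_id]['prev_ds']}"
            inputs[f"{dial_id}-{turn['turn_idx']}"] = src
        outputs.update(fake_generate(inputs))
        # advance each decoded dialogue; finished dialogues leave the pool
        for tag in inputs:
            dial_id, _ = tag.split("-")
            dial_status[dial_id]["cur_turn_idx"] += 1
            if dial_status[dial_id]["cur_turn_idx"] >= len(dial_status[dial_id]["turns"]):
                dial_ids.remove(dial_id)
            else:
                dial_status[dial_id]["prev_ds"] = outputs[tag]
    return outputs

# Usage with one two-turn dialogue:
# status = {"d1": {"prev_ds": "<ds/> </ds>", "cur_turn_idx": 0,
#                  "turns": [{"utts": "book a table", "turn_idx": "0"},
#                            {"utts": "book a table [SEP] for two", "turn_idx": "1"}]}}
# print(rolling_decode(status))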
Example #5
def train(args):
    """The main function of training."""
    if args.is_distributed:
        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
    else:
        dev_count = 1
        gpu_id = 0
    place = fluid.CUDAPlace(gpu_id)

    # setup task and model
    task = tasks.create_task(args)
    model = models.create_model(args, place)

    global need_save
    need_save = model.topo.dp_info.rank == 0

    # setup datasets
    train_generator = task.get_data_loader(
        model,
        input_file=args.train_file,
        num_epochs=args.num_epochs,
        num_part=model.topo.data_info.size,
        part_id=model.topo.data_info.rank,
        phase="train"
    )
    if model.topo.pp_info.size == 1:
        assert model.topo.mp_info.size <= dev_count and dev_count % model.topo.mp_info.size == 0
        valid_num_part = dev_count // model.topo.mp_info.size
        valid_part_id = gpu_id // model.topo.mp_info.size
    else:
        raise ValueError("Cannot support pipeline in training now!")
    print("# part in validation:", valid_num_part)
    print("part id in validation:", valid_part_id)
    valid_tags = []
    valid_generators = []
    for valid_file in args.valid_file.split(","):
        if ":" in valid_file:
            valid_tag, valid_file = valid_file.split(":")
        else:
            valid_tag = "valid"
        valid_tags.append(valid_tag)
        valid_generators.append(task.get_data_loader(
            model,
            input_file=valid_file,
            num_part=valid_num_part,
            part_id=valid_part_id,
            phase="distributed_valid" if args.is_distributed else "valid"
        ))

    # maintain best metric (init)
    best_metric = -1e10
    if args.eval_metric.startswith("-"):
        scale = -1.0
        eval_metric = args.eval_metric[1:]
    else:
        scale = 1.0
        eval_metric = args.eval_metric

    # start training
    timer = Timer()
    timer.start()
    print("Training is started.")
    for step, data in enumerate(train_generator(), args.start_step + 1):
        outputs = task.train_step(model, data)
        timer.pause()

        if step % args.log_steps == 0:
            time_cost = timer.pass_time
            current_epoch, current_file_index, total_file = task.reader.get_train_progress()
            current_lr = outputs.pop('scheduled_lr')
            print(f"[train][{current_epoch}] progress: {current_file_index}/{total_file} "
                  f"step: {step}, time: {time_cost:.3f}, "
                  f"queue size: {train_generator.queue.size()}, "
                  f"speed: {args.log_steps / time_cost:.3f} steps/s")
            print(f"\tcurrent lr: {current_lr:.7f}")
            metrics = task.get_metrics(outputs)
            print("\t" + ", ".join(f"{k}: {v:.4f}" for k, v in metrics.items()))
            timer.reset()

        if step % args.validation_steps == 0:
            for valid_tag, valid_generator in zip(valid_tags, valid_generators):
                eval_metrics = evaluate(task, model, valid_generator, args, dev_count, gpu_id, step, tag=valid_tag)
                if valid_tag == "valid":
                    valid_metrics = eval_metrics

            # save latest model
            if args.save_steps <= 0:
                save_model(model, args.save_path, "latest", dev_count, gpu_id, args)
            # maintain best metric (update)
            if valid_metrics[eval_metric] * scale > best_metric:
                best_metric = valid_metrics[eval_metric] * scale
                print(f"Get better valid metric: {eval_metric} = {valid_metrics[eval_metric]}")
                # save best model (with best evaluation metric)
                save_model(model, args.save_path, "best", dev_count, gpu_id, args)

        if args.save_steps > 0 and step % args.save_steps == 0:
            save_model(model, args.save_path, f"step_{step}", dev_count, gpu_id, args)

        timer.start()
    print("Training is completed.")

    return
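Example #5 encodes "lower is better" metrics by allowing a leading `-` in `--eval_metric` and folding the sign into a `scale` factor, so the comparison against `best_metric` is always "greater is better". A small self-contained sketch of that convention; `parse_eval_metric` is a hypothetical name.

def parse_eval_metric(eval_metric):
    """Return (metric_name, scale); a leading '-' means the metric is minimized."""
    if eval_metric.startswith("-"):
        return eval_metric[1:], -1.0
    return eval_metric, 1.0

# Usage, matching the update rule in Example #5:
# name, scale = parse_eval_metric("-loss")     # track the lowest loss
# best = -1e10
# for valid_metrics in ({"loss": 2.0}, {"loss": 1.5}, {"loss": 1.8}):
#     if valid_metrics[name] * scale > best:
#         best = valid_metrics[name] * scale   # updates on loss 2.0, then 1.5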
Example #6
def infer(args):
    """Inference main function."""
    if args.is_distributed:
        fleet.init(is_collective=True)

        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
        phase = "distributed_test"
    else:
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
        phase = "test"
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    infer_generator = task.get_data_loader(model,
                                           input_file=args.infer_file,
                                           num_part=trainers_num,
                                           part_id=trainer_id,
                                           phase=phase,
                                           is_infer=True)

    # run inference
    timer = Timer()
    timer.start()
    infer_out = {}
    step = 0
    for step, data in enumerate(infer_generator(), 1):
        predictions = task.infer_step(model, data)
        for pred in predictions:
            infer_out[pred["data_id"]] = pred
        if step % args.log_steps == 0:
            time_cost = timer.pass_time
            print(f"\tstep: {step}, time: {time_cost:.3f}, "
                  f"queue size: {infer_generator.queue.size()}, "
                  f"speed: {step / time_cost:.3f} steps/s")

    time_cost = timer.pass_time
    print(f"[infer] steps: {step} time cost: {time_cost}, "
          f"speed: {step / time_cost} steps/s")

    if args.is_distributed:
        # merge inference outputs in distributed mode.
        part_file = os.path.join(args.save_path,
                                 f"inference_output.part_{gpu_id}")
        with open(part_file, "w") as fp:
            json.dump(infer_out, fp, ensure_ascii=False, indent=2)
        part_finish_file = os.path.join(
            args.save_path, f"inference_output.part_{gpu_id}.finish")
        with open(part_finish_file, "w"):
            pass

    # Only run on master GPU in each node
    if gpu_id != 0:
        return

    if args.is_distributed:
        part_files = f"inference_output.part_*.finish"
        while True:
            ret = subprocess.getoutput(
                f"find {args.save_path} -maxdepth 1 -name {part_files}")
            num_completed = len(ret.split("\n"))
            if num_completed != dev_count:
                time.sleep(1)
                continue
            infer_out = {}
            for dev_id in range(dev_count):
                part_file = os.path.join(args.save_path,
                                         f"inference_output.part_{dev_id}")
                with open(part_file, "r") as fp:
                    part_infer_out = json.load(fp)
                    for data_id in part_infer_out:
                        infer_out[data_id] = part_infer_out[data_id]
            break
        subprocess.getoutput(
            "rm " + os.path.join(args.save_path, f"inference_output.part*"))

    # save inference outputs
    inference_output = os.path.join(args.save_path, args.save_name)
    save_array = []
    for i in range(len(infer_out)):
        save_array.append(infer_out[str(i)]["emb"])
    np_array = np.array(save_array)
    np.save(inference_output, np_array)

    return
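Example #6 stacks the per-sample `emb` vectors in `data_id` order (the keys are `str(i)`), so row `i` of the saved array corresponds to input sample `i`. A short usage sketch for reading the result back; the file name is a placeholder for whatever `args.save_path`/`args.save_name` pointed to, and `np.save` adds the `.npy` suffix if it is missing.

import numpy as np

embs = np.load("output/embeddings.npy")  # placeholder path
print(embs.shape)   # (num_samples, emb_dim); row i belongs to data_id i
print(embs[0])      # embedding of the first input sample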
Example #7
def interact(args):
    """Interaction main function."""
    if args.is_distributed:
        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))

    else:
        dev_count = 1
        gpu_id = 0
    place = fluid.CUDAPlace(gpu_id)

    task = DialogGeneration(args)
    model = models.create_model(args, place)

    if model.topo.pp_info.size != 1:
        raise ValueError("Cannot support pipeline in inference now!")
    if model.topo.sharding_info.size != 1:
        raise ValueError("Cannot support sharding in inference now!")
    if model.topo.world.size > dev_count:
        raise ValueError("Cannot support evaluation on multiple nodes now!")

    if args.is_distributed:
        if gpu_id > 0:
            Example = namedtuple("Example", ["src", "data_id"])
            context = []
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                host, port = "127.0.0.1", args.port + gpu_id
                s.bind((host, port))
                s.listen()
                while True:
                    conn, addr = s.accept()
                    with conn:
                        data = conn.recv(1024)
                        if data.decode("utf8") == "[EXIT]":
                            break
                        example = Example(src=data.decode("utf8"), data_id=0)
                        task.reader.features[0] = example
                        try:
                            record = task.reader._convert_example_to_record(
                                example, is_infer=True)
                        except ValueError as e:
                            print(f"[FATAL] {e}")
                            raise e
                        data = task.reader._pad_batch_records([record],
                                                              is_infer=True)
                        pred = task.infer_step(model, data)[0]
                        bot_response = pred["response"]
                        context.append(bot_response)
            return
        else:

            def send_request(dst_id, src):
                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                    host, port = "127.0.0.1", args.port + dst_id
                    s.connect((host, port))
                    data = src.encode("utf8")
                    s.sendall(data)

    Example = namedtuple("Example", ["src", "data_id"])
    context = []
    start_info = "Enter [EXIT] to quit the interaction, [NEXT] to start a new conversation."
    cprint(start_info, "yellow", attrs=["bold"])
    while True:
        if args.is_distributed:
            print(colored("[Human]:", "red", attrs=["bold"]))
            user_utt = input().strip()
        else:
            user_utt = input(colored("[Human]: ", "red",
                                     attrs=["bold"])).strip()
        if user_utt == "[EXIT]":
            if args.is_distributed:
                threads = []
                for i in range(1, dev_count):
                    thread = threading.Thread(target=send_request,
                                              args=(i, "[EXIT]"))
                    thread.start()
                    threads.append(thread)
            break
        elif user_utt == "[NEXT]":
            context = []
            cprint(start_info, "yellow", attrs=["bold"])
        else:
            context.append(user_utt)
            src = " [SEP] ".join(context)

            if args.is_distributed:
                threads = []
                for i in range(1, dev_count):
                    thread = threading.Thread(target=send_request,
                                              args=(i, src))
                    thread.start()
                    threads.append(thread)

            example = Example(src=src, data_id=0)
            task.reader.features[0] = example
            try:
                record = task.reader._convert_example_to_record(example,
                                                                is_infer=True)
            except ValueError as e:
                print(f"[FATAL] {e}")
                raise e
            data = task.reader._pad_batch_records([record], is_infer=True)
            pred = task.infer_step(model, data)[0]
            bot_response = pred["response"]
            if args.is_distributed:
                print(colored("[Bot]:", "blue", attrs=["bold"]))
                print(colored(bot_response, attrs=["bold"]))
            else:
                print(colored("[Bot]:", "blue", attrs=["bold"]),
                      colored(bot_response, attrs=["bold"]))
            context.append(bot_response)

            if args.is_distributed:
                for thread in threads:
                    thread.join()

    return
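In the distributed branch of Example #7, GPU 0 reads the user input and fans it out over local TCP sockets (port `args.port + gpu_id`) so every other worker runs the same `infer_step` in lockstep. A framework-free sketch of that fan-out; the port number and the in-process demo are assumptions, not part of the original code.

import socket
import threading
import time

BASE_PORT = 18500  # hypothetical; plays the role of args.port in Example #7

def serve_once(port, inbox):
    """Worker side (gpu_id > 0 in the example): accept one connection, keep the text."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind(("127.0.0.1", port))
        s.listen()
        conn, _ = s.accept()
        with conn:
            inbox[port] = conn.recv(1024).decode("utf8")

def send_request(port, text):
    """Master side (gpu_id == 0): push the current source string to one worker."""
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.connect(("127.0.0.1", port))
        s.sendall(text.encode("utf8"))

if __name__ == "__main__":
    inbox = {}
    workers = [threading.Thread(target=serve_once, args=(BASE_PORT + i, inbox))
               for i in range(1, 3)]
    for t in workers:
        t.start()
    time.sleep(0.5)  # crude startup synchronization; Example #7 relies on process launch order
    senders = [threading.Thread(target=send_request, args=(BASE_PORT + i, "hello [SEP] hi there"))
               for i in range(1, 3)]
    for t in senders:
        t.start()
    for t in workers + senders:
        t.join()
    print(inbox)  # both simulated workers received the same utterance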
Example #8
def infer_dst(args):
    """Inference main function."""
    if args.is_distributed:
        fleet.init(is_collective=True)

        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
        phase = "distributed_test"
    else:
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
        phase = "test"
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    # task.debug()

    schema = get_schema(args.dataset)
    empty_ds_seq = "<ds/> " + " ".join(flatten_ds({}, schema)) + " </ds>"

    # record original order and init status
    output_order = []
    # {"dial_id": {"prev_ds": "", "turns": [{"utts": utts, "turn_idx": turn_idx}], "cur_idx": 0}}
    dial_status = defaultdict(dict)
    with open(args.infer_file, "r") as fin:
        next(fin)
        for line in fin:
            dial_id, turn_idx, utts = line.strip().split("\t")
            output_order.append(f"{dial_id}-{turn_idx}")
            if dial_id not in dial_status:
                dial_status[dial_id]["prev_ds"] = empty_ds_seq
                dial_status[dial_id]["turns"] = []
                dial_status[dial_id]["cur_idx"] = 0
            dial_status[dial_id]["turns"].append({
                "utts": utts,
                "turn_idx": turn_idx
            })
    dial_ids = list(dial_status.keys())

    # batch inference
    outputs = {}
    timer = Timer()
    while len(dial_ids) > 0:
        timer.start()
        cur_dial_ids = dial_ids[:args.dial_batch_size]
        logger.info(f"Sampled dialogue ids: {cur_dial_ids}")

        # 1st: basic generation
        basic_inputs = {}
        for cur_dial_id in cur_dial_ids:
            cur_idx = dial_status[cur_dial_id]["cur_idx"]
            cur_dial_turn = dial_status[cur_dial_id]["turns"][cur_idx]
            cur_utts = cur_dial_turn["utts"]
            prev_ds = dial_status[cur_dial_id]["prev_ds"]
            src = f"<gen/> {cur_utts} [SEP] {prev_ds} </gen>\x010"
            basic_inputs[f"{cur_dial_id}-{cur_dial_turn['turn_idx']}"] = src
        basic_outputs = generate(basic_inputs, model, task)

        # 2nd: amending generation
        amending_inputs = {}
        for cur_dial_id in cur_dial_ids:
            cur_idx = dial_status[cur_dial_id]["cur_idx"]
            cur_dial_turn = dial_status[cur_dial_id]["turns"][cur_idx]
            cur_utts = cur_dial_turn["utts"]
            basic_ds = basic_outputs[
                f"{cur_dial_id}-{cur_dial_turn['turn_idx']}"]
            src = f"<amend/> {cur_utts} [SEP] {basic_ds} </amend>\x010"
            amending_inputs[f"{cur_dial_id}-{cur_dial_turn['turn_idx']}"] = src
        amending_outputs = generate(amending_inputs, model, task)

        outputs.update(amending_outputs)
        time_cost_infer = timer.pass_time
        logger.info(f"Time cost: {time_cost_infer}")

        # debug info
        for dial_turn_tag in basic_inputs:
            logger.debug(f"[basic input]: {basic_inputs[dial_turn_tag]}")
            logger.debug(f"[basic output]: {basic_outputs[dial_turn_tag]}")
            logger.debug(f"[amending input]: {amending_inputs[dial_turn_tag]}")
            logger.debug(
                f"[amending output]: {amending_outputs[dial_turn_tag]}")

        # update dial_status
        for dial_turn_tag in amending_outputs:
            dial_id, _ = dial_turn_tag.split("-")
            dial_status[dial_id]["cur_idx"] += 1
            if dial_status[dial_id]["cur_idx"] >= len(
                    dial_status[dial_id]["turns"]):
                dial_ids.remove(dial_id)
            else:
                dial_status[dial_id]["prev_ds"] = outputs[dial_turn_tag]
        timer.reset()

    # reorder and output
    if gpu_id == 0:
        pred_seqs = []
        pred_labels = []
        for dial_turn_tag in output_order:
            pred_seqs.append(outputs[dial_turn_tag])
            pred_label = parse_ds(outputs[dial_turn_tag], schema)
            pred_labels.append(pred_label)

        out_seq_file = os.path.join(args.save_path, "inference_output.txt")
        out_label_file = os.path.join(args.save_path, "inference_labels.json")
        with open(out_seq_file, "w") as fout_seq, open(out_label_file,
                                                       "w") as fout_label:
            fout_seq.write("\n".join(pred_seqs))
            json.dump(pred_labels, fout_label, indent=2)
        logger.info(f"Save inference sequences to `{out_seq_file}`")
        logger.info(f"Save inference labels to `{out_label_file}`")
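Example #8 decodes each turn in two passes: a `<gen/> ... </gen>` pass produces a first dialogue state from the utterances and the previous state, then an `<amend/> ... </amend>` pass revises that draft, and only the amended state is stored as `prev_ds` and as the final prediction. A toy sketch of that chaining with a stub generator; the markers and the `\x010` suffix come from the example, the stub itself is purely illustrative.

def stub_generate(inputs):
    """Stand-in for generate(inputs, model, task); just labels which pass produced the state."""
    return {tag: ("<ds/> amended state </ds>" if src.startswith("<amend/>")
                  else "<ds/> basic state </ds>")
            for tag, src in inputs.items()}

def two_pass_decode(tag, utts, prev_ds):
    """One turn of Example #8: basic generation, then an amending pass over its output."""
    basic = stub_generate({tag: f"<gen/> {utts} [SEP] {prev_ds} </gen>\x010"})[tag]
    amended = stub_generate({tag: f"<amend/> {utts} [SEP] {basic} </amend>\x010"})[tag]
    return amended  # becomes prev_ds for the next turn and the final prediction

# print(two_pass_decode("dial_1-0", "user: book a cheap hotel", "<ds/> </ds>"))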