Code Example #1
def seq2vec(sentence):
    # Assumes the surrounding script provides the usual extract_features.py
    # context: torch, collections, tokenization, BertConfig, BertModel,
    # read_examples, convert_examples_to_features, and the torch.utils.data
    # classes (TensorDataset, DataLoader, SequentialSampler, DistributedSampler).
    input_file = '/tmp/input.txt'
    output_file = '/tmp/output.json'  # unused: the JSON-writing path is disabled below
    vocab_file = '/home/yanai-lab/sugiya-y/space/research/bert/pretrain/uncased_L-12_H-768_A-12/vocab.txt'
    bert_config_file = '/home/yanai-lab/sugiya-y/space/research/bert/pretrain/uncased_L-12_H-768_A-12/bert_config.json'
    init_checkpoint = '/home/yanai-lab/sugiya-y/space/research/bert/pretrain/uncased_L-12_H-768_A-12/pytorch_model.bin'
    layers = '-1,-2,-3,-4'
    max_seq_length = 128
    do_lower_case = True
    batch_size = 8
    local_rank = -1
    no_cuda = False

    # Write the sentence to a temporary file so read_examples() can consume it.
    with open(input_file, 'w') as writer:
        writer.write(sentence)

    if local_rank == -1 or no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        # torch.distributed.init_process_group(backend='nccl')

    layer_indexes = [int(x) for x in layers.split(",")]

    bert_config = BertConfig.from_json_file(bert_config_file)

    tokenizer = tokenization.FullTokenizer(
        vocab_file=vocab_file, do_lower_case=do_lower_case)

    examples = read_examples(input_file)

    features = convert_examples_to_features(
        examples=examples, seq_length=max_seq_length, tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model = BertModel(bert_config)
    if init_checkpoint is not None:
        model.load_state_dict(
            torch.load(init_checkpoint, map_location='cpu'))
    model.to(device)

    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
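    # Index tensor used to map each batch row back to its source feature.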
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(
        eval_data, sampler=eval_sampler, batch_size=batch_size)

    # Inference only: eval() disables dropout.
    model.eval()
    # with open(output_file, "w", encoding='utf-8') as writer:
    for input_ids, input_mask, example_indices in eval_dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)

        # Forward pass: all_encoder_layers is a list of [batch, seq_len, hidden]
        # tensors, one per transformer layer.
        all_encoder_layers, _ = model(
            input_ids, token_type_ids=None, attention_mask=input_mask)

        for b, example_index in enumerate(example_indices):
            feature = features[example_index.item()]
            unique_id = int(feature.unique_id)
            # feature = unique_id_to_feature[unique_id]
            output_json = collections.OrderedDict()
            output_json["linex_index"] = unique_id
            all_out_features = []
            for (i, token) in enumerate(feature.tokens):
                all_layers = []
                for (j, layer_index) in enumerate(layer_indexes):
                    layer_output = all_encoder_layers[int(
                        layer_index)].detach().cpu().numpy()
                    layer_output = layer_output[b]
                    layers = collections.OrderedDict()
                    layers["index"] = layer_index
                    layers["values"] = [
                        round(x.item(), 6)
                        for x in layer_output[i]  # this appears to be the sentence feature
                    ]
                # The per-token JSON assembly and file output of the original
                # script are disabled here; `layers` is overwritten on every
                # iteration, so only the last assignment survives.
    # Returns the hidden vector of the last token at the last index in
    # layer_indexes: a [768]-dim list, independent of sentence length.
    return layers["values"]
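
For reference, a minimal usage sketch (the input sentence is illustrative, and it assumes the imports and helper definitions from extract_features.py are in scope):

vec = seq2vec("the quick brown fox jumps over the lazy dog")
print(len(vec))  # 768 for BERT-base, regardless of sentence length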
Code Example #2
File: extract_features.py  Project: xixiareone/GDPNet
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--input_file", default=None, type=str, required=True)
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_file", default=None, type=str, required=True)
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        required=True,
        help="Initial checkpoint (usually from a pre-trained BERT model).")

    ## Other parameters
    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded."
    )
    # Note: with default=True and action='store_true' this flag can never be
    # switched off from the command line; it is effectively always True.
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size for predictions.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    # Use %-style placeholders: passing bare extra args to logger.info with a
    # placeholder-free message raises a logging error.
    logger.info("device: %s n_gpu: %d distributed training: %r", device, n_gpu,
                bool(args.local_rank != -1))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab_file,
                                           do_lower_case=args.do_lower_case)

    examples = read_examples(args.input_file)

    features = convert_examples_to_features(examples=examples,
                                            seq_length=args.max_seq_length,
                                            tokenizer=tokenizer)

    unique_id_to_feature = {}
    for feature in features:
        unique_id_to_feature[feature.unique_id] = feature

    model = BertModel(bert_config)
    if args.init_checkpoint is not None:
        model.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    all_input_ids = torch.tensor([f.input_ids for f in features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features],
                                  dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
    if args.local_rank == -1:
        eval_sampler = SequentialSampler(eval_data)
    else:
        eval_sampler = DistributedSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.batch_size)

    model.eval()
    with open(args.output_file, "w", encoding='utf-8') as writer:
        for input_ids, input_mask, example_indices in eval_dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)

            all_encoder_layers, _ = model(input_ids,
                                          token_type_ids=None,
                                          attention_mask=input_mask)

            for b, example_index in enumerate(example_indices):
                feature = features[example_index.item()]
                unique_id = int(feature.unique_id)
                # feature = unique_id_to_feature[unique_id]
                output_json = collections.OrderedDict()
                output_json["linex_index"] = unique_id
                all_out_features = []
                for (i, token) in enumerate(feature.tokens):
                    all_layers = []
                    for (j, layer_index) in enumerate(layer_indexes):
                        layer_output = all_encoder_layers[int(
                            layer_index)].detach().cpu().numpy()
                        layer_output = layer_output[b]
                        layers = collections.OrderedDict()
                        layers["index"] = layer_index
                        layers["values"] = [
                            round(x.item(), 6) for x in layer_output[i]
                        ]
                        all_layers.append(layers)
                    out_features = collections.OrderedDict()
                    out_features["token"] = token
                    out_features["layers"] = all_layers
                    all_out_features.append(out_features)
                output_json["features"] = all_out_features
                writer.write(json.dumps(output_json) + "\n")
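
The loop above emits one JSON object per input line. A minimal sketch of reading a record back, using the field names written above (the output path is illustrative):

import json

with open('/tmp/output.json', encoding='utf-8') as f:
    record = json.loads(f.readline())
# First requested layer (-1 with the default --layers) for the first token,
# which is typically [CLS]:
first_token = record["features"][0]
cls_vec = first_token["layers"][0]["values"]
print(first_token["token"], len(cls_vec))  # e.g. [CLS] 768 for BERT-base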
Code Example #3
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--data_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The input data dir")
    parser.add_argument(
        "--vocab_file",
        default=None,
        type=str,
        required=True,
        help="The vocabulary file that the BERT model was trained on.")
    parser.add_argument("--output_dir", default=None, type=str, required=True)
    parser.add_argument(
        "--bert_config_file",
        default=None,
        type=str,
        required=True,
        help="The config json file corresponding to the pre-trained BERT model. "
        "This specifies the model architecture.")
    parser.add_argument(
        "--init_checkpoint",
        default=None,
        type=str,
        required=True,
        help="Initial checkpoint (usually from a pre-trained BERT model).")
    parser.add_argument('--gpu_id', default=0, type=int, help='')
    # type=bool is an argparse pitfall (bool('False') is True); a store_true
    # flag gives the intended behavior.
    parser.add_argument('--no_cuda', default=False, action='store_true', help='')
    ## Other parameters
    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences longer "
        "than this will be truncated, and sequences shorter than this will be padded."
    )
    # Same caveat as in Example #2: with default=True this flag is always True.
    parser.add_argument(
        "--do_lower_case",
        default=True,
        action='store_true',
        help="Whether to lower case the input text. Should be True for uncased "
        "models and False for cased models.")
    parser.add_argument("--batch_size",
                        default=32,
                        type=int,
                        help="Batch size for predictions.")

    args = parser.parse_args()

    if not args.no_cuda:
        device = torch.device("cuda", args.gpu_id)
        n_gpu = 1  # torch.cuda.device_count()
    else:
        device = torch.device('cpu')
        n_gpu = 0
    logger.info("device {} n_gpu {}".format(device, n_gpu))

    layer_indexes = [int(x) for x in args.layers.split(",")]

    bert_config = BertConfig.from_json_file(args.bert_config_file)

    cache_path = os.path.join(args.output_dir, 'tmp_data.pkl')
    if not os.path.exists(cache_path):
        tokenizer = tokenization.FullTokenizer(
            vocab_file=args.vocab_file, do_lower_case=args.do_lower_case)

        processor = AtecProcessor()
        label_list = processor.get_labels()

        train_examples = processor.get_train_examples(args.data_dir)
        train_features = convert_examples_to_siamese_features(
            train_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Dataset info *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.batch_size)

        train_dataloader = convert_siamese_features_to_dataset(
            train_features, args.batch_size)

        dev_examples = processor.get_dev_examples(args.data_dir)
        dev_features = convert_examples_to_siamese_features(
            dev_examples, label_list, args.max_seq_length, tokenizer)
        logger.info("***** Dataset info *****")
        logger.info("  Num examples = %d", len(dev_examples))
        logger.info("  Batch size = %d", args.batch_size)
        dev_dataloader = convert_siamese_features_to_dataset(
            dev_features, args.batch_size)

        with open(cache_path, 'wb') as f:
            pickle.dump([train_dataloader, dev_dataloader], f)
    else:
        logger.info("load data from cache file: {}".format(cache_path))
        with open(cache_path, 'rb') as f:
            train_dataloader, dev_dataloader = pickle.load(f)

    model = BertModel(bert_config)
    if args.init_checkpoint is not None:
        model.load_state_dict(
            torch.load(args.init_checkpoint, map_location='cpu'))
    model.to(device)
    model.eval()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    logger.info('extract train features.')
    bert_feature(os.path.join(args.output_dir, "train.npz"), model,
                 train_dataloader, device)
    logger.info('extract dev features.')
    bert_feature(os.path.join(args.output_dir, "dev.npz"), model,
                 dev_dataloader, device)
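
bert_feature() is not defined in this excerpt. A hypothetical sketch of what such a helper might look like, assuming each batch yields (input_ids, input_mask) pairs and that final-layer [CLS] vectors are what gets saved; the actual project may differ:

import numpy as np
import torch

def bert_feature(output_path, model, dataloader, device):
    vecs = []
    with torch.no_grad():
        for input_ids, input_mask in dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            all_layers, _ = model(input_ids, token_type_ids=None,
                                  attention_mask=input_mask)
            # Final encoder layer, [CLS] position (token 0).
            vecs.append(all_layers[-1][:, 0].cpu().numpy())
    np.savez(output_path, features=np.concatenate(vecs, axis=0))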