def predict(in_file, out_file):
    xvals, yvals = utils.load_hot(in_file)

    network = build_network()
    predictions = utils.predict_hot(xvals, network, 'complicated1.tflearn')
    print('Accuracy: {}%'.format(utils.get_accuracy_hot(yvals, predictions)))
    utils.write_predictions(xvals, predictions, out_file)
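The utils helpers and build_network above are project code that is not shown here. As a rough illustration, a one-hot accuracy helper in the spirit of utils.get_accuracy_hot might look like the following (a hypothetical sketch, not the project's actual implementation):

import numpy as np

def get_accuracy_hot(yvals, predictions):
    # Compare the argmax of one-hot labels against the argmax of predicted probabilities
    correct = np.argmax(yvals, axis=1) == np.argmax(predictions, axis=1)
    return 100.0 * float(np.mean(correct))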
Example #2
def predict(dataset):
    """Generate predictions for audio tagging and sound event detection.

    This function uses an ensemble of trained models to generate the
    predictions, with the averaging function being an arithmetic mean.
    Computed predictions are then saved to disk.

    Args:
        dataset: Dataset to generate predictions for.
    """
    import capsnet

    # Load (standardized) input data and associated file names
    test_x, _, names = _load_data(dataset)

    # Predict class probabilities for each model (epoch)
    at_preds, sed_preds = [], []

    for epoch in _determine_epochs(cfg.prediction_epochs):
        model = _load_model(epoch)
        at_pred, sed_pred = utils.timeit(
            lambda: capsnet.gccaps_predict(test_x, model),
            '[Epoch %d] Predicted class probabilities' % epoch)

        at_preds.append(at_pred)
        sed_preds.append(sed_pred)

    # Average predictions to give an overall output
    total_at_pred = np.mean(at_preds, axis=0)
    total_sed_pred = np.mean(sed_preds, axis=0)

    # Ensure output directory exists and set file path format
    os.makedirs(os.path.dirname(cfg.predictions_path), exist_ok=True)
    predictions_path = cfg.predictions_path.format('%s', dataset.name)

    # Save free parameters to disk
    utils.log_parameters({'prediction_epochs': cfg.prediction_epochs},
                         os.path.join(os.path.dirname(cfg.predictions_path),
                                      'parameters.json'))

    # Write predictions to disk
    utils.write_predictions(names, total_at_pred, predictions_path % 'at')
    utils.write_predictions(names, total_sed_pred, predictions_path % 'sed')
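Note how cfg.predictions_path is formatted in two stages: dataset.name is substituted right away, while the literal '%s' survives the .format call and is only filled in per output type ('at' or 'sed') when writing. For example, assuming a template such as 'predictions/{}_{}.csv' (an illustrative value, not necessarily what cfg uses):

predictions_path = 'predictions/{}_{}.csv'.format('%s', 'dcase')  # -> 'predictions/%s_dcase.csv'
at_path = predictions_path % 'at'                                 # -> 'predictions/at_dcase.csv'
sed_path = predictions_path % 'sed'                               # -> 'predictions/sed_dcase.csv'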
Example #3
def evaluate(args, model, tokenizer_a, tokenizer_b, is_double=True):
    dataset, examples, features = load_and_cached_examples(
        args,
        tokenizer_a,
        tokenizer_b,
        evaluate=True,
        output_examples=True,
        is_double=is_double)

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # Eval!
    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_a_id': batch[0],
                'input_b_id': batch[1],
                'input_a_mask': batch[2],
                'input_b_mask': batch[3],
                'input_a_length': batch[4],
                'input_b_length': batch[5],
                'label': batch[6]
            }
            outputs = model(**inputs)
            loss = outputs[0]
            predict = torch.argmax(outputs[1], -1)
            all_results.extend(to_list(predict.view(-1)))
    output_prediction_file = os.path.join(args.output_dir, "predictions.json")
    all_labels = write_predictions(examples, all_results,
                                   output_prediction_file)
    accuracy, confusion = eval_cross(all_labels, all_results)
    return accuracy, confusion
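Here write_predictions and to_list come from the surrounding project. In evaluation scripts of this kind, to_list is usually just a thin tensor-to-list helper; a plausible sketch (an assumption, not the project's exact definition):

def to_list(tensor):
    # Detach from the graph, move to CPU, and convert to a plain Python list
    return tensor.detach().cpu().tolist()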
Example #4
    scores = cross_validation.cross_val_score(clf,
                                              data,
                                              result_vect,
                                              cv=10,
                                              scoring=rmsle_scorer)
    scores = -scores
    cv_means.append(np.mean(scores))
    cv_stds.append(np.std(scores))

# Plots mean and std depending on number of trees
plt.subplot(211)
plt.plot(n_trees_list, cv_means)
plt.xlabel("Number of trees")
plt.ylabel("Mean RMSLE")

plt.subplot(212)
plt.plot(n_trees_list, cv_stds)
plt.xlabel("Number of trees")
plt.ylabel("Standard dev RMSLE")

plt.tight_layout()
plt.show()

# Make predictions according to best result
clf = RandomForestRegressor(n_estimators=n_trees_list[np.argmin(cv_means)])
test_data = utils.get_data("test.csv")
clf.fit(data, result_vect)
pred_test = clf.predict(test_data)

utils.write_predictions(pred_test, "res_RF_all.csv")
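The rmsle_scorer passed to cross_val_score at the top of this example is not defined in the excerpt. Because the scores are negated afterwards, it is presumably built with greater_is_better=False; a plausible sketch (hypothetical, for illustration):

import numpy as np
from sklearn.metrics import make_scorer

def rmsle(y_true, y_pred):
    # Root mean squared logarithmic error
    return np.sqrt(np.mean((np.log1p(y_pred) - np.log1p(y_true)) ** 2))

# sklearn negates the score when greater_is_better=False, hence `scores = -scores` above
rmsle_scorer = make_scorer(rmsle, greater_is_better=False)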
#utils.plot_tune_results('Perceptron', 'T', Ts, *pct_tune_results)
#utils.plot_tune_results('Avg Perceptron', 'T', Ts, *avg_pct_tune_results)
#utils.plot_tune_results('Avg Passive-Aggressive', 'T', Ts, *avg_pa_tune_results_T)
#utils.plot_tune_results('Avg Passive-Aggressive', 'L', Ls, *avg_pa_tune_results_L)

print "(train accuracy, test accuracy) after modification"
print p1.average_passive_aggressive_accuracy(train_final_features,test_final_features,train_labels,test_labels,best_T,best_L)

#-------------------------------------------------------------------------------
#
#-------------------------------------------------------------------------------
#
#
#-------------------------------------------------------------------------------

submit_texts = [sample['text'] for sample in utils.load_data('reviews_submit.tsv')]

# 1. Extract the preferred features from the train and submit data
dictionary = p1.modified_bag_of_words(submit_texts)
train_final_features = p1.extract_final_features(train_texts, dictionary)
submit_final_features = p1.extract_final_features(submit_texts, dictionary)

# 2. Train the most accurate classifier
final_thetas = p1.average_passive_aggressive(train_final_features, train_labels, 2,50)

# 3. Classify and write out the submit predictions.
submit_predictions = p1.classify(submit_final_features, *final_thetas)
utils.write_predictions('reviews_submit.tsv', submit_predictions)

#-------------------------------------------------------------------------------
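p1.classify and the feature extraction helpers are project code not shown here. For a linear classifier of this shape, classify typically labels each example by the sign of the linear score; a minimal sketch (an assumption about the interface, not the project's code):

import numpy as np

def classify(feature_matrix, theta, theta_0):
    # Label each row +1 or -1 according to the sign of theta . x + theta_0
    scores = feature_matrix @ theta + theta_0
    return np.where(scores > 0, 1, -1)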
Example #6
def main():
    parser = argparse.ArgumentParser()
    # # Required arguments
    parser.add_argument('--task',
                        default='multi',
                        type=str,
                        help='Task affecting load data and vectorize feature')
    parser.add_argument(
        '--loss_type',
        default='double',
        type=str,
        help='Select loss double or single, only for multi task'
    )  # Only effective for the multi task
    parser.add_argument(
        "--bert_model",
        default="bert-base-uncased",
        type=str,
        help=
        "Bert pre-trained model selected in the list: bert-base-uncased, bert-large-uncased, "
        "bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, bert-base-chinese, "
        "bert-base-multilingual-cased.")  # Selects the pre-trained model
    parser.add_argument("--debug",
                        default=False,
                        help="Whether run on small dataset")  # 正常情况下都应该选择false
    parser.add_argument(
        "--output_dir",
        default="./SQuAD/output/",
        type=str,
        help=
        "The output directory where the model checkpoints and predictions will be written."
    )

    # # Other arguments
    parser.add_argument("--train_file",
                        default="./SQuAD/version/train.json",
                        type=str,
                        help="SQuAD json for training. E.g., train-v1.1.json")
    parser.add_argument(
        "--predict_file",
        default="./SQuAD/version/prediction.json",
        type=str,
        help=
        "SQuAD json for predictio ns. E.g., dev-v1.1.json or test-v1.1.json")

    parser.add_argument(
        "--max_seq_length",
        default=384,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. Sequences "
        "longer than this will be truncated, and sequences shorter than this will be padded."
    )
    parser.add_argument(
        "--doc_stride",
        default=128,
        type=int,
        help=
        "When splitting up a long document into chunks, how much stride to take between chunks."
    )
    parser.add_argument(
        "--max_query_length",
        default=64,
        type=int,
        help=
        "The maximum number of tokens for the question. Questions longer than this will be "
        "truncated to this length.")

    # # Control arguments
    parser.add_argument("--do_train",
                        default=True,
                        help="Whether to run training.")
    parser.add_argument("--do_predict",
                        default=True,
                        help="Whether to run eval on the dev set.")

    parser.add_argument("--train_batch_size",
                        default=18,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--predict_batch_size",
                        default=18,
                        type=int,
                        help="Total batch size for predictions.")

    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for.")
    parser.add_argument(
        "--n_best_size",
        default=20,
        type=int,
        help=
        "The total number of n-best predictions to generate in the nbest_predictions.json file."
    )
    parser.add_argument(
        "--max_answer_length",
        default=30,
        type=int,
        help=
        "The maximum length of an answer that can be generated.This is needed because the start "
        "and end predictions are not conditioned on one another.")
    parser.add_argument(
        "--verbose_logging",
        default=False,
        help=
        "If true, all of the warnings related to data processing will be printed.A number of "
        "warnings are expected for a normal SQuAD evaluation.")
    parser.add_argument("--no_cuda",
                        default=False,
                        help="Whether not to use CUDA when available")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        "--do_lower_case",
        default=True,
        help=
        "Whether to lower case the input text. True for uncased models, False for cased models."
    )
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument(
        '--fp16',
        default=False,
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.Positive power of 2: static loss scaling value.\n"
    )
    parser.add_argument(
        '--version_2_with_negative',
        default=False,
        help=
        'If true, the SQuAD examples contain some that do not have an answer.')
    parser.add_argument(
        '--null_score_diff_threshold',
        type=float,
        default=0.0,
        help=
        "If null_score - best_non_null is greater than the threshold predict null."
    )
    args = parser.parse_args()

    # The if branch is the single-machine setup, the else branch the distributed one; since we have no distributed system, we train on a single machine with multiple GPUs (10.24)
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')

    # Boilerplate: the next few statements just configure the logging output format (10.24)
    logging.basicConfig(
        format='%(asctime)s-%(levelname)s-%(name)s-%(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
    logger.info(
        "device:{}, n_gpu:{}, distributed training:{}, 16-bits training:{}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))
    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    # The following lines set up derived parameters and random seeds (10.24)
    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
    random.seed(args.seed)  # Set the random seed
    np.random.seed(args.seed)  # Set the random seed
    torch.manual_seed(args.seed)  # Seed the CPU RNG so that results are reproducible
    if n_gpu > 0:  # With multiple GPUs, torch.cuda.manual_seed_all() seeds every GPU
        torch.cuda.manual_seed_all(args.seed)

    # More boilerplate: sanity checks on the do_train/do_predict arguments (10.24)
    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")
    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    # The next two lines create output_dir if it does not exist (the commented-out check below feels counterproductive, since it requires an empty directory) (10.24)
    # if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train:
    #     raise ValueError("Output directory () already exists and is not empty.")
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    # Initialize the tokenizer (loaded via the tokenization module) (10.24)
    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # How the data is read: either the single-queue (squad) reader or the multi-channel (multi) reader (10.24)
    if args.task == 'squad':
        read_examples = read_squad_examples
    elif args.task == 'multi':
        read_examples = read_multi_examples

    # Load the training examples and work out the number of optimization steps (10.24)
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_examples(
            input_file=args.train_file,
            is_training=True,
            version_2_with_negative=args.version_2_with_negative)
        if args.debug:
            train_examples = train_examples[:100]
        num_train_optimization_steps = \
            int(len(train_examples)/args.train_batch_size/args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare the model (10.24)
    model = BertForQuestionAnswering.from_pretrained(
        args.bert_model,
        cache_dir=os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                               'distributed_{}'.format(args.local_rank)))

    # model = torch.nn.DataParallel(model).cuda()
    # Decide whether to use float16 precision (10.24)
    if args.fp16:
        # model.half().cuda()
        model.half()
        # Move the model to the appropriate CPU or GPU (10.24)
    model.to(device)

    # Configure the optimizer and related functions (10.24)
    if args.do_train:
        param_optimizer = list(model.named_parameters())

        # hack to remove pooler, which is not used
        # thus it produces None grads that break apex
        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.01
        }, {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]

        if args.fp16:
            try:
                # from apex.optimizers import FP16_Optimizer
                from apex.fp16_utils import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
                )

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=True)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer,
                                           static_loss_scale=args.loss_scale)
            warmup_linear = WarmupLinearSchedule(
                warmup=args.warmup_proportion,
                t_total=num_train_optimization_steps)
        else:
            optimizer = BertAdam(optimizer_grouped_parameters,
                                 lr=args.learning_rate,
                                 warmup=args.warmup_proportion,
                                 t_total=num_train_optimization_steps)

    # Train the model (10.24)
    global_step = 0
    if args.do_train:
        # Feature extraction for the training corpus
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)

        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)
        all_start_vector = torch.tensor(
            [f.start_vector for f in train_features], dtype=torch.float)
        all_end_vector = torch.tensor([f.end_vector for f in train_features],
                                      dtype=torch.float)
        all_content_vector = torch.tensor(
            [f.content_vector for f in train_features], dtype=torch.float)

        # # Alternative construction of all_start_positions and all_end_positions
        # all1_start_positions = []
        # for i in range(len(train_features)):
        #     for j in range(len(train_features[i].start_position)):
        #         all1_start_positions.append(train_features[i].start_position[j])
        # all_start_positions = torch.tensor([k for k in all1_start_positions], dtype=torch.long)
        # all1_end_positions = []
        # for i in range(len(train_features)):
        #     for j in range(len(train_features[i].end_position)):
        #         all1_end_positions.append(train_features[i].end_position[j])
        # all_end_positions = torch.tensor([k for k in all1_end_positions], dtype=torch.long)
        # ####################################################################

        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_start_positions,
                                   all_end_positions, all_start_vector,
                                   all_end_vector, all_content_vector)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)  # Random sampler
        else:
            train_sampler = DistributedSampler(train_data)

        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for ep in trange(int(args.num_train_epochs), desc="Epoch"):
            # Re-wrap the model every epoch so it is distributed for multi-GPU training
            model = torch.nn.DataParallel(model).cuda()
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Iteration",
                         disable=args.local_rank not in [-1, 0])):

                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering itself
                input_ids, input_mask, segment_ids, start_positions, end_positions, start_vector, end_vector, content_vector = batch

                loss = model(input_ids, segment_ids, input_mask,
                             start_positions, end_positions, start_vector,
                             end_vector, content_vector, args.loss_type)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                    print("loss率为:{}".format(loss))
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()

                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used and handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear.get_lr(
                            global_step, args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

            print("\n")
            print(ep)
            output_model_file = os.path.join(args.output_dir,
                                             str(ep) + WEIGHTS_NAME)
            output_config_file = os.path.join(args.output_dir,
                                              str(ep) + CONFIG_NAME)

            torch.save(model.state_dict(), output_model_file)
            if isinstance(model, torch.nn.DataParallel):
                model = model.module
            model.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(args.output_dir)

    # Load the fine-tuned model so it can be used for prediction (10.25)
    if args.do_train and (args.local_rank == -1
                          or torch.distributed.get_rank() == 0):
        # Save a trained model, configuration and tokenizer
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself

        # If we save using the predefined names, we can load using `from_pretrained`
        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)

        torch.save(model_to_save.state_dict(), output_model_file)
        model_to_save.config.to_json_file(output_config_file)
        tokenizer.save_vocabulary(args.output_dir)

        # Load a trained model and vocabulary that you have fine-tuned
        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)
    else:
        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)

    # Move the model to the GPU again (10.25)
    model.to(device)

    # Run prediction (generates the prediction files)
    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        eval_examples = \
            read_examples(input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
        if args.debug:
            eval_examples = eval_examples[:100]
        eval_features = convert_examples_to_features(
            examples=eval_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=False)

        logger.info("***** Running predictions *****")
        logger.info("  Num orig examples = %d", len(eval_examples))
        logger.info("  Num split examples = %d", len(eval_features))
        logger.info("  Batch size = %d", args.predict_batch_size)

        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_example_index = torch.arange(all_input_ids.size(0),
                                         dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_example_index)

        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.predict_batch_size)

        model.eval()
        all_results = []
        logger.info("Start evaluating")
        for input_ids, input_mask, segment_ids, example_indices in tqdm(
                eval_dataloader,
                desc="Evaluating",
                disable=args.local_rank not in [-1, 0]):
            if len(all_results) % 1000 == 0:
                logger.info("Processing example: %d" % (len(all_results)))
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            with torch.no_grad():
                batch_start_logits, batch_end_logits = model(
                    input_ids, segment_ids, input_mask)
            for i, example_index in enumerate(example_indices):
                start_logits = batch_start_logits[i].detach().cpu().tolist()
                end_logits = batch_end_logits[i].detach().cpu().tolist()
                eval_feature = eval_features[example_index.item()]
                unique_id = int(eval_feature.unique_id)
                all_results.append(
                    RawResult(unique_id=unique_id,
                              start_logits=start_logits,
                              end_logits=end_logits))

        middle_result = os.path.join(args.output_dir, 'middle_result.pkl')
        pickle.dump([eval_examples, eval_features, all_results],
                    open(middle_result, 'wb'))

        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir,
                                                 "null_odds.json")

        if (args.loss_type == 'double'):
            write_predictions_couple_labeling(
                eval_examples, eval_features, all_results, args.n_best_size,
                args.max_answer_length, args.do_lower_case,
                output_prediction_file, output_nbest_file,
                output_null_log_odds_file, args.verbose_logging,
                args.version_2_with_negative, args.null_score_diff_threshold)
        elif (args.loss_type == 'single'):
            write_predictions_single_labeling(
                eval_examples, eval_features, all_results, args.n_best_size,
                args.max_answer_length, args.do_lower_case,
                output_prediction_file, output_nbest_file,
                output_null_log_odds_file, args.verbose_logging,
                args.version_2_with_negative, args.null_score_diff_threshold)
        elif (args.loss_type == 'origin') or (args.task == 'multi'
                                              and args.loss_type == 'squad'):
            write_predictions(eval_examples, eval_features, all_results,
                              args.n_best_size, args.max_answer_length,
                              args.do_lower_case, output_prediction_file,
                              output_nbest_file, output_null_log_odds_file,
                              args.verbose_logging,
                              args.version_2_with_negative,
                              args.null_score_diff_threshold)
        else:
            raise ValueError('{} dataset and {} loss is not supported'.format(
                args.task, args.loss_type))
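Both this script and Example #12 below adjust the learning rate manually in fp16 mode with a linear warmup schedule. The schedule itself is not shown; a minimal sketch in the style of the classic pytorch-pretrained-bert helper (an assumption about the exact implementation):

def warmup_linear(x, warmup=0.002):
    # Ramp the LR multiplier linearly up to 1.0 during warmup, then decay it linearly towards 0
    if x < warmup:
        return x / warmup
    return 1.0 - x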
def predict(in_file, out_file):
    xvals, yvals = utils.load_data(in_file)
    network = build_network()
    predictions = utils.predict(xvals, network, 'circle.tflearn')
    print('Accuracy: {}%'.format(utils.get_accuracy(yvals, predictions)))
    utils.write_predictions(xvals, predictions, out_file)
#-------------------------------------------------------------------------------
# dictionary = p1.bag_of_words(train_texts)
#
# train_final_features = p1.extract_final_features(train_texts, dictionary)
# val_final_features   = p1.extract_final_features(val_texts, dictionary)
# test_final_features  = p1.extract_final_features(test_texts, dictionary)
#-------------------------------------------------------------------------------
#
#-------------------------------------------------------------------------------
# Section 3.13
#
# Modify the code below to extract your best features from the submission data
# and then classify it using your most accurate classifier.
#-------------------------------------------------------------------------------
submit_texts = [
    sample['text'] for sample in utils.load_data('reviews_submit.tsv')
]
#
# # 1. Extract your preferred features from the train and submit data
dictionary = p1.bag_of_words(submit_texts)
train_final_features = p1.extract_final_features(train_texts, dictionary)
submit_final_features = p1.extract_final_features(submit_texts, dictionary)
#
# # 2. Train your most accurate classifier
final_thetas = p1.average_perceptron(train_final_features, train_labels, T=50)
#
# # 3. Classify and write out the submit predictions.
submit_predictions = p1.classify(submit_final_features, *final_thetas)
utils.write_predictions('reviews_submit.tsv', submit_predictions)

#-------------------------------------------------------------------------------
def evaluate(args, model, tokenizer, prefix=""):
    if prefix == 'test':
        eval_file = args.test_file
    else:
        eval_file = args.dev_file

    DatasetClass = WeakSupervisorDataset
    dataset = DatasetClass(eval_file,
                           args.max_seq_length,
                           tokenizer,
                           args.load_small,
                           is_training=False)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)
    predict_dir = os.path.join(args.output_dir, 'predictions')
    if not os.path.exists(predict_dir) and args.local_rank in [-1, 0]:
        os.makedirs(predict_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    # eval_sampler = SequentialSampler(
    #     dataset) if args.local_rank == -1 else DistributedSampler(dataset)
    eval_sampler = SequentialSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size)

    # multi-gpu evaluate
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    start_time = timeit.default_timer()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()

        example_ids = batch['example_id']
        batch = {
            k: v.to(args.device)
            for k, v in batch.items() if k != 'example_id'
        }
        with torch.no_grad():
            inputs = {
                'input_ids': batch['input_ids'],
                'attention_mask': batch['input_mask']
            }
            if args.model_type != 'distilbert':
                # XLM don't use segment_ids
                inputs[
                    'token_type_ids'] = None if args.model_type == 'xlm' else batch[
                        'segment_ids']
            # example_ids = batch['example_id']
            outputs = model(**inputs)

        for i, example_id in enumerate(example_ids):
            result = RawResult(
                unique_id=example_id,
                start_logits=to_list(outputs[0][i]),
                end_logits=to_list(outputs[1][i]),
                retrieval_logits=[1])  # retrieval_logits is not used
            all_results.append(result)

    examples = dataset.all_examples
    features = dataset.all_features
    # assert len(examples) == len(dataset), (len(examples), len(dataset))
    # assert len(features) == len(dataset), (len(features), len(dataset))

    evalTime = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                evalTime, evalTime / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(
        predict_dir, "instance_predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        predict_dir, "instance_nbest_predictions_{}.json".format(prefix))
    output_final_prediction_file = os.path.join(
        predict_dir, "final_predictions_{}.json".format(prefix))
    if args.version_2_with_negative:
        output_null_log_odds_file = os.path.join(
            predict_dir, "instance_null_odds_{}.json".format(prefix))
    else:
        output_null_log_odds_file = None

    all_predictions = write_predictions(
        examples, features, all_results, args.n_best_size,
        args.max_answer_length, args.do_lower_case, output_prediction_file,
        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
        args.version_2_with_negative, args.null_score_diff_threshold)
    write_weak_supervisor_predictions(all_predictions,
                                      output_final_prediction_file)
    eval_metrics = weak_supervisor_eval(eval_file,
                                        output_final_prediction_file)

    metrics_file = os.path.join(predict_dir, "metrics_{}.json".format(prefix))
    with open(metrics_file, 'w') as fout:
        json.dump(eval_metrics, fout)

    return eval_metrics
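RawResult is instantiated above with unique_id, start_logits, end_logits and retrieval_logits fields, but its definition is not part of this snippet. In similar scripts it is a plain namedtuple, roughly (an assumption):

import collections

# Hypothetical definition matching the fields used above
RawResult = collections.namedtuple(
    "RawResult",
    ["unique_id", "start_logits", "end_logits", "retrieval_logits"])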
Example #10
def evaluate(args, model, tokenizer, max_depth, prefix=""):
    r"""
    Evaluate the model
    """
    dataset, examples, features = load_and_cache_examples(args,
                                                          tokenizer,
                                                          max_depth=max_depth,
                                                          evaluate=True,
                                                          output_examples=True)

    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
        os.makedirs(args.output_dir)

    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
    # Note that DistributedSampler samples randomly
    eval_sampler = SequentialSampler(
        dataset) if args.local_rank == -1 else DistributedSampler(dataset)
    eval_dataloader = DataLoader(dataset,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 num_workers=args.dataloader_workers)

    # multi-gpu evaluate
    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
        model = torch.nn.DataParallel(model)

    # Eval!
    logger.info("***** Running evaluation {} *****".format(prefix))
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Batch size = %d", args.eval_batch_size)
    all_results = []
    start_time = timeit.default_timer()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        model.eval()
        batch = tuple(t.to(args.device) for t in batch)
        with torch.no_grad():
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2],
                'xpath_tags_seq': batch[4],
                'xpath_subs_seq': batch[5],
            }
            feature_indices = batch[3]
            outputs = model(**inputs)

        for i, feature_index in enumerate(feature_indices):
            eval_feature = features[feature_index.item()]
            unique_id = int(eval_feature.unique_id)
            result = RawResult(unique_id=unique_id,
                               start_logits=to_list(outputs[0][i]),
                               end_logits=to_list(outputs[1][i]))
            all_results.append(result)

    eval_time = timeit.default_timer() - start_time
    logger.info("  Evaluation done in total %f secs (%f sec per example)",
                eval_time, eval_time / len(dataset))

    # Compute predictions
    output_prediction_file = os.path.join(args.output_dir,
                                          "predictions_{}.json".format(prefix))
    output_tag_prediction_file = os.path.join(
        args.output_dir, "tag_predictions_{}.json".format(prefix))
    output_nbest_file = os.path.join(
        args.output_dir, "nbest_predictions_{}.json".format(prefix))
    output_result_file = os.path.join(
        args.output_dir, "qas_eval_results_{}.json".format(prefix))
    output_file = os.path.join(args.output_dir,
                               "eval_matrix_results_{}".format(prefix))

    write_predictions(examples, features, all_results, args.n_best_size,
                      args.max_answer_length, args.do_lower_case,
                      output_prediction_file, output_tag_prediction_file,
                      output_nbest_file, args.verbose_logging, tokenizer)

    # Evaluate
    evaluate_options = EvalOpts(data_file=args.predict_file,
                                root_dir=args.root_dir,
                                pred_file=output_prediction_file,
                                tag_pred_file=output_tag_prediction_file,
                                result_file=output_result_file,
                                out_file=output_file)
    results = evaluate_on_squad(evaluate_options)
    return results
Example #11
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--input_dir', required=True)
    parser.add_argument('--model_dir', required=True)
    parser.add_argument('--output_dir', required=True)
    parser.add_argument('--gpu_num', default=1, type=int)
    parser.add_argument('--batch_size', default=10, type=int)
    parser.add_argument('--n_best_size', default=20, type=int)
    parser.add_argument('--max_answer_length', default=30, type=int)

    conf = parser.parse_args()

    if os.path.isfile(conf.model_dir):
        model_path = conf.model_dir
    else:
        model_path = get_best_model_path(conf.model_dir)
    with open(os.path.join(os.path.dirname(model_path), "config.json"),
              "r") as f:
        model_conf = json.load(f)
        max_seq_length = model_conf['max_seq_length']
        doc_stride = model_conf['doc_stride']
        max_query_length = model_conf['max_query_length']
        do_lower_case = model_conf['do_lower_case']

    examples = read_squad_examples(conf.input_dir)
    tokenizer = FullTokenizer(os.path.join(os.path.dirname(model_path),
                                           'vocab.txt'),
                              do_lower_case=do_lower_case)
    generator = data_generator(tokenizer,
                               examples,
                               conf.batch_size,
                               max_seq_length,
                               doc_stride,
                               max_query_length,
                               for_predict=True)
    data_list = [f for f in generator]
    data_size = len(data_list)
    if conf.gpu_num == 1:
        results = predict(data_list, model_path, show_summary=True, gpuid=None)
    else:
        with concurrent.futures.ProcessPoolExecutor(
                max_workers=conf.gpu_num) as executor:
            per_data = int(ceil(data_size / conf.gpu_num))
            futures = []
            for idx in range(conf.gpu_num):
                sub_data_list = data_list[per_data *
                                          idx:min(per_data *
                                                  (idx + 1), data_size)]
                future = executor.submit(predict,
                                         sub_data_list,
                                         model_path,
                                         show_summary=idx == 0,
                                         gpuid=idx)
                futures.append(future)
            results = []
            for future in futures:
                results.extend(future.result())

    features = convert_examples_to_features(examples=examples,
                                            tokenizer=tokenizer,
                                            max_seq_length=max_seq_length,
                                            doc_stride=doc_stride,
                                            max_query_length=max_query_length,
                                            insert_unk=False)

    output_prediction_file = os.path.join(conf.output_dir, "predictions.json")
    output_nbest_file = os.path.join(conf.output_dir, "nbest_predictions.json")
    output_prediction_with_answer_file = os.path.join(conf.output_dir,
                                                      "ans_predictions.json")

    write_predictions(examples, features, results, conf.n_best_size,
                      conf.max_answer_length, do_lower_case,
                      output_prediction_file, output_nbest_file,
                      output_prediction_with_answer_file)
Example #12
def main(args):

    # set up logging and device
    args.save_dir = utils.get_save_dir(args.save_dir, args.name, training=True)
    logger = utils.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train and not args.do_predict:
        raise ValueError(
            "At least one of `do_train` or `do_predict` must be True.")

    if args.do_train:
        if not args.train_file:
            raise ValueError(
                "If `do_train` is True, then `train_file` must be specified.")
    if args.do_predict:
        if not args.predict_file:
            raise ValueError(
                "If `do_predict` is True, then `predict_file` must be specified."
            )

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory () already exists and is not empty.")
    os.makedirs(args.output_dir, exist_ok=True)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    # Generating the dictionaries
    dep_dict, pos_dict, ent_dict, total_features = generate_dictionary(
        args.train_ling_features_file, args.eval_ling_features_file,
        args.test_ling_features_file)
    #    from IPython import embed; embed()
    # Generating total_dictionary
    total_dict = convert_string_features_to_array(total_features, dep_dict,
                                                  pos_dict, ent_dict)

    #    from IPython import embed; embed()
    train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        train_examples = read_squad_examples(
            input_file=args.train_file,
            is_training=True,
            version_2_with_negative=args.version_2_with_negative,
            total_dictionary=total_dict)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    # Prepare model
    model = BertForQuestionAnsweringLing.from_pretrained(
        args.bert_model,
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE /
        'distributed_{}'.format(args.local_rank))

    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    param_optimizer = list(model.named_parameters())

    # hack to remove pooler, which is not used
    # thus it produces None grads that break apex
    param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]

    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    if args.fp16:
        try:
            from apex.fp16_utils import FP16_Optimizer
            from apex.optimizers import FusedAdam
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
            )

        optimizer = FusedAdam(optimizer_grouped_parameters,
                              lr=args.learning_rate,
                              bias_correction=False,
                              max_grad_norm=1.0)
        if args.loss_scale == 0:
            optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
        else:
            optimizer = FP16_Optimizer(optimizer,
                                       static_loss_scale=args.loss_scale)
    else:
        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)

    global_step = 0

    # load training features
    cached_train_features_file = args.train_file + '_{0}_{1}_{2}_{3}'.format(
        list(filter(None, args.bert_model.split('/'))).pop(),
        str(args.max_seq_length), str(args.doc_stride),
        str(args.max_query_length))
    train_features = None
    print(cached_train_features_file)
    try:
        with open(cached_train_features_file, "rb") as reader:
            train_features = pickle.load(reader)
    except:
        train_features = convert_examples_to_features(
            examples=train_examples,
            tokenizer=tokenizer,
            max_seq_length=args.max_seq_length,
            doc_stride=args.doc_stride,
            max_query_length=args.max_query_length,
            is_training=True)
        if args.local_rank == -1 or torch.distributed.get_rank() == 0:
            logger.info("  Saving train features into cached file %s",
                        cached_train_features_file)
            with open(cached_train_features_file, "wb") as writer:
                pickle.dump(train_features, writer)

    # load eval features
    eval_examples = read_squad_examples(
        input_file=args.predict_file,
        is_training=False,
        version_2_with_negative=args.version_2_with_negative,
        total_dictionary=total_dict)
    eval_features = convert_examples_to_features(
        examples=eval_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False)

    test_examples = read_squad_examples(
        input_file=args.test_file,
        is_training=False,
        version_2_with_negative=args.version_2_with_negative,
        total_dictionary=total_dict)
    test_features = convert_examples_to_features(
        examples=test_examples,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        doc_stride=args.doc_stride,
        max_query_length=args.max_query_length,
        is_training=False)

    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num orig examples = %d", len(train_examples))
        logger.info("  Num split examples = %d", len(train_features))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_start_positions = torch.tensor(
            [f.start_position for f in train_features], dtype=torch.long)
        all_end_positions = torch.tensor(
            [f.end_position for f in train_features], dtype=torch.long)
        # from IPython import embed; embed()
        all_ling_features = torch.tensor(
            [f.ling_features for f in train_features], dtype=torch.float)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_ling_features,
                                   all_start_positions, all_end_positions)
        steps_till_eval = args.eval_steps
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()

        best_F1 = 0
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                if n_gpu == 1:
                    batch = tuple(
                        t.to(device)
                        for t in batch)  # multi-gpu does scattering itself
                input_ids, input_mask, segment_ids, ling_features, start_positions, end_positions = batch
                # from IPython import embed; embed()
                loss = model(input_ids, segment_ids, input_mask, ling_features,
                             start_positions, end_positions)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    if args.fp16:
                        # modify learning rate with special warm up BERT uses
                        # if args.fp16 is False, BertAdam is used and handles this automatically
                        lr_this_step = args.learning_rate * warmup_linear(
                            global_step / num_train_optimization_steps,
                            args.warmup_proportion)
                        for param_group in optimizer.param_groups:
                            param_group['lr'] = lr_this_step
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1
                # add to tensorboard
                loss_val = loss.item()
                tbx.add_scalar('train/NLL', loss_val, global_step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               global_step)

                steps_till_eval -= args.train_batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    logger.info('Evaluating at step {}...'.format(step))
                    # ema.assign(model)
                    results, _ = evaluate(model, eval_examples, eval_features,
                                          device, args, logger,
                                          args.version_2_with_negative,
                                          args.dev_eval_file)
                    # saver.save(step, model, results[args.metric_name], device)
                    # ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    logger.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    logger.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, global_step)
                    """
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
                    """
                    if results['F1'] > best_F1:
                        best_F1 = results['F1']
                        # only save the model itself (unwrap DataParallel if wrapped)
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_model_file = os.path.join(
                            args.output_dir, "pytorch_model_best.bin")
                        torch.save(model_to_save.state_dict(),
                                   output_model_file)

    # Save a trained model
    """
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(args.output_dir, "pytorch_model.bin")
    if args.do_train:
        torch.save(model_to_save.state_dict(), output_model_file)
        # Load a trained model that you have fine-tuned
        model_state_dict = torch.load(output_model_file)
        model = BertForQuestionAnsweringLing.from_pretrained(args.bert_model, state_dict=model_state_dict)
    else:
        model = BertForQuestionAnsweringLing.from_pretrained(args.bert_model)

    model.to(device)
    """

    # load the best trained model and evaluate on the dev and test sets
    best_model_file = os.path.join(args.output_dir, "pytorch_model_best.bin")
    model_state_dict = torch.load(best_model_file)
    model = BertForQuestionAnsweringLing.from_pretrained(
        args.bert_model, state_dict=model_state_dict)
    model.to(device)

    if args.do_predict and (args.local_rank == -1
                            or torch.distributed.get_rank() == 0):
        logger.info('Evaluating the best model on the dev set')
        results, all_results = evaluate(model, eval_examples, eval_features,
                                        device, args, logger,
                                        args.version_2_with_negative,
                                        args.dev_eval_file)

        logger.info('Writing the best dev set results')
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        output_null_log_odds_file = os.path.join(args.output_dir,
                                                 "null_odds.json")
        write_predictions(eval_examples, eval_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          args.verbose_logging, args.version_2_with_negative,
                          args.null_score_diff_threshold, 'dev')

        logger.info('Evaluating the best model on the test set')
        results, all_results = evaluate(model, test_examples, test_features,
                                        device, args, logger,
                                        args.version_2_with_negative,
                                        args.test_eval_file)

        logger.info('Writing the best test set results')
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions_test.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions_test.json")
        output_null_log_odds_file = os.path.join(args.output_dir,
                                                 "null_odds_test.json")
        write_predictions(test_examples, test_features, all_results,
                          args.n_best_size, args.max_answer_length,
                          args.do_lower_case, output_prediction_file,
                          output_nbest_file, output_null_log_odds_file,
                          args.verbose_logging, args.version_2_with_negative,
                          args.null_score_diff_threshold, 'test')
        """
plt.subplot(223)
plt.plot(n_trees_list, cv_means_cas)
plt.xlabel("Number of trees")
plt.ylabel("Mean RMSLE for casual")

plt.subplot(224)
plt.plot(n_trees_list, cv_stds_cas)
plt.xlabel("Number of trees")
plt.ylabel("Standard dev RMSLE for casual")

plt.tight_layout()
plt.show()

# Make predictions according to best result
cv_summary = np.add(cv_means_cas, cv_means_reg)
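# the tree count that minimises the summed casual + registered CV error is
# reused for both regressors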

clf_reg = RandomForestRegressor(n_estimators=n_trees_list[np.argmin(cv_summary)])
clf_cas = RandomForestRegressor(n_estimators=n_trees_list[np.argmin(cv_summary)])

test_data = utils.get_data("test.csv")

clf_reg.fit(data, registered_result)
clf_cas.fit(data, casual_result)

pred_test_reg = clf_reg.predict(test_data)
pred_test_cas = clf_cas.predict(test_data)

pred_test = np.add(pred_test_reg, pred_test_cas)

utils.write_predictions(pred_test, "res_RF_reg_casual.csv")
plt.xlabel("Number of trees")
plt.ylabel("Mean RMSLE for casual")

plt.subplot(224)
plt.plot(n_trees_list, cv_stds_cas)
plt.xlabel("Number of trees")
plt.ylabel("Standard dev RMSLE for casual")

plt.tight_layout()
plt.show()

# Make predictions according to best result
cv_summary = np.add(cv_means_cas, cv_means_reg)

clf_reg = RandomForestRegressor(
    n_estimators=n_trees_list[np.argmin(cv_summary)])
clf_cas = RandomForestRegressor(
    n_estimators=n_trees_list[np.argmin(cv_summary)])

test_data = utils.get_data("test.csv")

clf_reg.fit(data, registered_result)
clf_cas.fit(data, casual_result)

pred_test_reg = clf_reg.predict(test_data)
pred_test_cas = clf_cas.predict(test_data)

pred_test = np.add(pred_test_reg, pred_test_cas)

utils.write_predictions(pred_test, "res_RF_reg_casual.csv")
Beispiel #15
0
def exec_demo(demo_params):
    """
    Train the CRF with different sizes of the train set,
    tune the hyperparameter (delta) over the development set,
    then test the best model.
    :param demo_params:
    :return:
    """
    logger = log.setup_logger(__name__)

    #ignore this line... It's a long story.
    feature_type = "ver1"

    # extract base parameters
    demo_id, name, train_file, dev_file, test_file, output_folder = utils.extract_base_demo_params(
        demo_params)
    print_log(
        logger,
        "\n".join([str((key, demo_params[key])) for key in list(demo_params)]))

    different_sizes_perc = list(range(10, 101, 10))
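    # train-set fractions to try: 10%, 20%, ..., 100%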

    # define the scoring function for the grid search
    my_scorer = sklearn.metrics.make_scorer(metrics.my_scorer.get_evaluation)

    # track some results from the search used for tuning the hyperparameter delta
    size_evaluations = {}
    train_data_partitions = {}
    fscores = {}

    # pre-processing the data (remove tags and other stuff)
    print_log(logger, "Making datasets...")
    train_data = data_parsers.make_dataset.parse_file(open(train_file))
    dev_data = data_parsers.make_dataset.parse_file(open(dev_file))
    test_data = data_parsers.make_dataset.parse_file(open(test_file))

    # compute the maximum delta possible (from the length of the longest word
    # in the train and development set)
    max_delta = max(utils.find_max_len(train_data),
                    utils.find_max_len(dev_data))
    print_log(logger,
              "max delta: %s, len train set:%s" % (max_delta, len(train_data)))

    # train the model for different train sizes
    for size in different_sizes_perc:
        print_log(
            logger,
            "train the model with percentage of the train set: %02d%%" % size)

        train_data_shuffled = copy.deepcopy(train_data)
        random.shuffle(train_data_shuffled)
        current_size = round(len(train_data) * size / 100)
        print_log(logger, "current train set size: %d" % current_size)
        train_data_partition = train_data_shuffled[:current_size]
        print_log(
            logger, "train set: " +
            "; ".join(list(map(str, train_data_partition[0:5]))) + "...")

        size_evaluations[size] = {}
        train_data_partitions[size] = train_data_partition

        current_max_delta = utils.find_max_len(train_data_partition)
        print_log(logger, "current max delta: %s" % current_max_delta)

        for delta in range(1, current_max_delta + 1):
            print_log(logger, "train the model with delta: %d" % delta)

            X_train, y_train = features.extract_features.get_features_and_labels(
                train_data_partition, delta, feature_type)
            X_dev, y_dev = features.extract_features.get_features_and_labels(
                dev_data, delta, feature_type)
            X_test, y_test = features.extract_features.get_features_and_labels(
                test_data, delta, feature_type)

            crf = sklearn_crfsuite.CRF(
                algorithm='ap',
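                # 'ap' selects CRFsuite's averaged perceptron training algorithm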
                all_possible_transitions=True,
                all_possible_states=False,
            )
            crf.fit(X_train, y_train)
            y_dev_pred = crf.predict(X_dev)
            delta_evaluation = metrics.evaluation.get_evaluation(
                feature_type, y_dev, y_dev_pred)

            print_log(
                logger,
                "F-score on development set: %s" % delta_evaluation["F-score"])
            size_evaluations[size][delta] = (delta_evaluation["Precision"],
                                             delta_evaluation["Recall"],
                                             delta_evaluation["F-score"])

    # find delta that yields best F-score
    sizes = list(size_evaluations.keys())
    sizes.sort()
    deltas = []
    for size in sizes:
        # entries are (Precision, Recall, F-score) tuples, so select on F-score
        max_delta_for_size = max(
            size_evaluations[size],
            key=lambda d: size_evaluations[size][d][2])
        deltas.append(max_delta_for_size)
        print_log(
            logger, "\nBest delta=%s for train size perc=%s%%. "
            "\nOn development set:"
            "\n\tPrecision=%s"
            "\n\tRecall=%s"
            "\n\tF-score=%s" % (max_delta_for_size, size,
                                size_evaluations[size][max_delta_for_size][0],
                                size_evaluations[size][max_delta_for_size][1],
                                size_evaluations[size][max_delta_for_size][2]))

    test_evaluations = {}
    print_log(logger, "Test models with different sizes of training set")
    for size, best_delta in zip(sizes, deltas):
        print_log(logger,
                  "Train with size: %d and delta: %s" % (size, best_delta))
        cur_train_set = train_data_partitions[size]
        print_log(
            logger, "train set: " +
            "; ".join(list(map(str, cur_train_set[0:5]))) + "...")
        X_train, y_train = features.extract_features.get_features_and_labels(
            cur_train_set, best_delta, feature_type)
        X_test, y_test = features.extract_features.get_features_and_labels(
            test_data, best_delta, feature_type)

        crf = sklearn_crfsuite.CRF(
            algorithm='ap',
            all_possible_transitions=True,
            all_possible_states=False,
        )
        crf.fit(X_train, y_train)

        y_test_pred = crf.predict(X_test)
        delta_evaluation = metrics.evaluation.get_evaluation(
            feature_type, y_test, y_test_pred)

        test_evaluations[size] = (delta_evaluation["Precision"],
                                  delta_evaluation["Recall"],
                                  delta_evaluation["F-score"])
        print_log(
            logger, "test score (delta=%s): F-score: %s" %
            (best_delta, delta_evaluation["F-score"]))

        # save some results from the tests
        curpath = output_folder + "/size_%02d_delta_%02d" % (size, best_delta)
        os.makedirs(curpath)

        curpath = (curpath + "/" + name + "_"
                   + "size_%02d_delta_%02d" % (size, best_delta))
        utils.write_model(crf, open(curpath + ".model", "wb+"))
        utils.write_predictions(feature_type, open(test_file), y_test_pred,
                                open(curpath + ".pred", "w+"))
        utils.write_evaluation(delta_evaluation, open(curpath + ".eval", "w+"))
        utils.write_fails(open(test_file), y_test, y_test_pred,
                          open(curpath + ".fails", "w+"), feature_type)
        details.print_details(crf, file=open(curpath + ".details", "w+"))

    freport = open(output_folder + "/report.txt", "w+")
    for size, best_delta in zip(sizes, deltas):
        print(
            "Best delta=%s for train size perc=%s%%. "
            "\nOn development set:"
            "\n\tPrecision=%s"
            "\n\tRecall=%s"
            "\n\tF-score=%s"
            "\nOn test set:"
            "\n\tPrecision=%s"
            "\n\tRecall=%s"
            "\n\tF-score=%s" %
            (best_delta, size, size_evaluations[size][best_delta][0],
             size_evaluations[size][best_delta][1],
             size_evaluations[size][best_delta][2], test_evaluations[size][0],
             test_evaluations[size][1], test_evaluations[size][2]) + "\n" +
            "-" * 50,
            file=freport)

# Cross-validate a RandomForestRegressor for each candidate number of trees
cv_means, cv_stds = [], []
for n_trees in n_trees_list:
    clf = RandomForestRegressor(n_estimators=n_trees)

    # cross-validation evaluation
    scores = cross_validation.cross_val_score(clf, data, result_vect, cv=10, scoring=rmsle_scorer)
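    # rmsle_scorer is presumably built elsewhere with
    # sklearn.metrics.make_scorer(..., greater_is_better=False), so
    # cross_val_score returns negated errors; the sign flip below restores them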
    scores = -scores
    cv_means.append(np.mean(scores))
    cv_stds.append(np.std(scores))


# Plot mean and std RMSLE as a function of the number of trees
plt.subplot(211)
plt.plot(n_trees_list, cv_means)
plt.xlabel("Number of trees")
plt.ylabel("Mean RMSLE")

plt.subplot(212)
plt.plot(n_trees_list, cv_stds)
plt.xlabel("Number of trees")
plt.ylabel("Standard dev RMSLE")

plt.tight_layout()
plt.show()

# Make predictions according to best result
clf = RandomForestRegressor(n_estimators=n_trees_list[np.argmin(cv_means)])
test_data = utils.get_data("test.csv")
clf.fit(data, result_vect)
pred_test = clf.predict(test_data)

utils.write_predictions(pred_test, "res_RF_all.csv")
Beispiel #17
0
def exec_demo(demo_params):
    """
    Execute grid search over the param_grid defined in demo_params,
    using the data from the crowd-sourced annotations.
    :param demo_params:
    :return:
    """
    logger = log.setup_logger(__name__)

    #ignore this line... It's a long story.
    feature_type = "ver1"

    # extract base parameters
    demo_id, name, train_file, dev_file, test_file, output_folder = utils.extract_base_demo_params(
        demo_params)
    print_log(
        logger,
        "\n".join([str((key, demo_params[key])) for key in list(demo_params)]))

    train_file_extra_points = demo_params["train_file_extra_points"]
    param_grid = demo_params["param_grid"]

    # define the scoring function for the grid search
    my_scorer = sklearn.metrics.make_scorer(metrics.my_scorer.get_evaluation)

    # track some results from the grid search used for tuning the hyperparameter delta
    fscores = {}
    epsilons_list = {}
    max_iterations_list = {}
    best_eval = {"F-score": 0}

    # pre-processing the data (remove tags and other stuff)
    print_log(logger, "Making datasets...")
    task1_train_data = data_parsers.make_dataset.parse_file(open(train_file))
    dev_data = data_parsers.make_dataset.parse_file(open(dev_file))
    test_data = data_parsers.make_dataset.parse_file(open(test_file))
    extra_points_train_data = data_parsers.make_dataset.parse_file(
        open(train_file_extra_points))

    train_data = task1_train_data + extra_points_train_data

    print_log(logger, "train data size: %s" % len(train_data))
    print_log(logger, "development data size: %s" % len(dev_data))
    print_log(logger, "test data size: %s" % len(test_data))

    # compute the maximum delta possible (from the length of the longest word
    # in the train and development set)
    max_delta = max(utils.find_max_len(train_data),
                    utils.find_max_len(dev_data))
    if max_delta > settings.MAX_ALLOWABLE_DELTA:
        max_delta = settings.MAX_ALLOWABLE_DELTA
    print_log(logger, "max delta: %s" % max_delta)

    # repeat the grid search for each possible value of delta
    for delta in range(1, max_delta + 1):
        os.makedirs(output_folder + "/%02d" % delta, exist_ok=True)

        print_log(logger, "Training with delta=%s" % delta)
        X_train, y_train = features.extract_features.get_features_and_labels(
            train_data, delta, feature_type)
        X_dev, y_dev = features.extract_features.get_features_and_labels(
            dev_data, delta, feature_type)
        X_test, y_test = features.extract_features.get_features_and_labels(
            test_data, delta, feature_type)

        model = utils.run_grid_search(X_train, y_train, X_dev, y_dev,
                                      param_grid, my_scorer)
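        # run_grid_search presumably fits one CRF per grid point, scores each on
        # the dev split with my_scorer, and returns the fitted search object
        # (hence .best_params_ and .best_score_ below)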

        best_cv_epsilon = model.best_params_["epsilon"]
        best_cv_max_iterations = model.best_params_["max_iterations"]

        # the best score will be considered in order to pick the best model
        fscores[delta] = model.best_score_
        epsilons_list[delta] = best_cv_epsilon
        max_iterations_list[delta] = best_cv_max_iterations

        print_log(
            logger,
            "Best params for delta %02d: max_iterations=%d\tepsilon=%.2E" %
            (delta, best_cv_max_iterations, best_cv_epsilon))
        print_log(logger, "Best CV score: " + str(model.best_score_))

        # test the model on the test set. NOTICE: the result will not be considered for the choice
        # of the hyperparameter delta!
        print_log(logger, "***Predict test with the grid search model:***")

        y_test_pred = model.predict(X_test)
        test_eval = metrics.evaluation.get_evaluation(feature_type, y_test,
                                                      y_test_pred)
        print_log(
            logger, "F-score on test (grid search with delta=%s): %s" %
            (delta, test_eval["F-score"]))

        # save some results from the grid search
        curpath = (output_folder + "/%02d" % delta
                   + "/" + name + "_" + "%02d" % delta)
        utils.write_model(model, open(curpath + "_gridsearch.model", "wb+"))
        utils.write_predictions(feature_type, open(test_file), y_test_pred,
                                open(curpath + ".pred", "w+"))
        utils.write_evaluation(test_eval, open(curpath + ".eval", "w+"))
        utils.write_fails(open(test_file), y_test, y_test_pred,
                          open(curpath + ".fails", "w+"), feature_type)
        details.print_gridsearch_details(model,
                                         file=open(
                                             curpath + "_gridsearch.details",
                                             "w+"))

        print_log(logger, "#" * 50)

    print_log(logger, "-" * 50)
    max_fscore = max(fscores.values())
    max_fscore_delta = [i for i in fscores.keys()
                        if fscores[i] == max_fscore][0]
    best_model_num = max_fscore_delta
    best_epsilon = epsilons_list[best_model_num]
    best_max_iterations = max_iterations_list[best_model_num]

    freport = open(output_folder + "/report.txt", "w+")
    print_log(
        logger,
        "The best model found is the one with delta: %s" % best_model_num)
    print_log(
        logger, "With best parameters: max_iterations=%s, epsilon=%s" %
        (best_max_iterations, best_epsilon))
    print_log(logger, "CV F-score: %s" % max_fscore)
    print("The best model found is the one with delta: %s" % best_model_num,
          file=freport)
    print("With best parameters: max_iterations=%s, epsilon=%s" %
          (best_max_iterations, best_epsilon),
          file=freport)
    print("CV F-score: %s" % max_fscore, file=freport)

    best_model_path = (output_folder + "/%02d" % best_model_num + "/" + name
                       + "_" + "%02d" % best_model_num + "_gridsearch.model")
    best_model = pickle.load(open(best_model_path, "rb"))
    X_test, y_test = features.extract_features.get_features_and_labels(
        test_data, best_model_num, feature_type)

    y_pred = best_model.predict(X_test)
    delta_evaluation = metrics.evaluation.get_evaluation(
        feature_type, y_test, y_pred)
    print_log(
        logger, "delta: %s\tF-score: %s" %
        (best_model_num, delta_evaluation["F-score"]))
    print("F-score on test set: %s" % delta_evaluation["F-score"],
          file=freport)
'''Trains a support vector machine classifier and uses it to write predictions.
'''
from sklearn import svm

from utils import load_data, write_predictions

if __name__ == "__main__":
    ids, data, labels = load_data()
    clf = svm.SVC().fit(data, labels)
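    # write_predictions presumably loads the test split itself and writes the
    # classifier's predictions to disk; its signature differs from the other
    # examples on this page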
    write_predictions(clf)