Example 1
def main():
    py_utils.add_stdout_logger()
    logging.info("Checking word vectors..")
    download_wordvecs()
    logging.info("Checking MNLI...")
    download_mnli()
    logging.info("Checking SQUAD...")
    download_squad()
    logging.info("Checking TriviaQA-CP...")
    download_triviaqa_cp()
    logging.info("Done! All data should be ready")
Example 2
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument("model")
  parser.add_argument("--n_processes", "-n", type=int, default=1)
  parser.add_argument("--nocache", action="store_true")
  parser.add_argument("--dataset", choices=["dev", "hans", "both"],
                      default="both")
  args = parser.parse_args()
  py_utils.add_stdout_logger()

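  # The two booleans choose whether to score the MNLI dev set and/or HANS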
  compute_scores(
    args.model, args.dataset in ["dev", "both"], args.dataset in ["hans", "both"],
    not args.nocache, args.n_processes)
Example 3
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument("output_dir")
  parser.add_argument("--nocache", action="store_true")
  parser.add_argument("--datasets", default=None, help="Comma separated list of datasets")
  args = parser.parse_args()
  py_utils.add_stdout_logger()

  if args.datasets is None:
    datasets = ["dev", "add_sent", "add_one_sent"]
  else:
    datasets = args.datasets.split(",")
    for ds in datasets:
      if ds not in squad.DATASETS:
        raise ValueError("Unsupported dataset %s" % ds)

  compute_all_scores(args.output_dir, datasets, not args.nocache)
Example 4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("output_dir")
    parser.add_argument("--nocache", action="store_true")
    parser.add_argument("--dataset",
                        choices=["location", "person"],
                        required=True,
                        help="Dataset to test on")
    parser.add_argument("--parts",
                        default=None,
                        help="Comma seperated list of parts to test on")
    args = parser.parse_args()
    py_utils.add_stdout_logger()

    if args.parts is None:
        parts = ["dev", "test"]
    else:
        parts = args.parts.split(",")
        for ds in parts:
            if ds not in ["dev", "test", "train"]:
                raise ValueError("Unsupported dataset %s" % ds)

    show_scores(args.output_dir, args.dataset, parts, not args.nocache)
Example 5
def main():
  parser = argparse.ArgumentParser()

  ## Required parameters
  parser.add_argument("--bert_model", default="bert-base-uncased", type=str,
                      help="Bert pre-trained model selected in the list: bert-base-uncased, "
                           "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
                           "bert-base-multilingual-cased, bert-base-chinese.")
  parser.add_argument("--output_dir",
                      default=None,
                      type=str,
                      required=True,
                      help="The output directory where the model predictions and checkpoints will be written.")
  parser.add_argument("--cache_dir",
                      default="",
                      type=str,
                      help="Where do you want to store the pre-trained models downloaded from s3")
  parser.add_argument("--max_seq_length",
                      default=128,
                      type=int,
                      help="The maximum total input sequence length after WordPiece tokenization. \n"
                           "Sequences longer than this will be truncated, and sequences shorter \n"
                           "than this will be padded.")
  parser.add_argument("--do_train",
                      action='store_true',
                      help="Whether to run training.")
  parser.add_argument("--do_eval",
                      action='store_true',
                      help="Whether to run eval on the dev set.")
  parser.add_argument("--train_batch_size",
                      default=32,
                      type=int,
                      help="Total batch size for training.")
  parser.add_argument("--seed",
                      default=None,
                      type=int,
                      help="Seed for randomized elements in the training")
  parser.add_argument("--eval_batch_size",
                      default=16,
                      type=int,
                      help="Total batch size for eval.")
  parser.add_argument("--learning_rate",
                      default=5e-5,
                      type=float,
                      help="The initial learning rate for Adam.")
  parser.add_argument("--num_train_epochs",
                      default=3.0,
                      type=float,
                      help="Total number of training epochs to perform.")
  parser.add_argument("--warmup_proportion",
                      default=0.1,
                      type=float,
                      help="Proportion of training to perform linear learning rate warmup for. "
                           "E.g., 0.1 = 10%% of training.")
  parser.add_argument("--no_cuda",
                      action='store_true',
                      help="Whether not to use CUDA when available")
  parser.add_argument("--local_rank",
                      type=int,
                      default=-1,
                      help="local_rank for distributed training on gpus")
  parser.add_argument('--gradient_accumulation_steps',
                      type=int,
                      default=1,
                      help="Number of updates steps to accumulate before performing a backward/update pass.")
  parser.add_argument('--fp16',
                      action='store_true',
                      help="Whether to use 16-bit float precision instead of 32-bit")
  parser.add_argument('--loss_scale',
                      type=float, default=0,
                      help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                           "0 (default value): dynamic loss scaling.\n"
                           "Positive power of 2: static loss scaling value.\n")

  ## Our arguments
  parser.add_argument("--mode", choices=["bias_product", "none", "learned_mixin", "reweight"],
                      default="learned_mixin", help="Kind of debiasing method to use")
  parser.add_argument("--penalty", type=float, default=0.03,
                      help="Penalty weight for the learn_mixin model")
  parser.add_argument("--n_processes", type=int, default=4,
                      help="Processes to use for pre-processing")
  parser.add_argument("--debug", action="store_true")
  parser.add_argument("--sorted", action="store_true",
                      help='Sort the data so most batches have the same input length,'
                           ' makes things about 2x faster. Our experiments did not actually'
                           ' use this in the end (not sure if it makes a difference) so '
                            "it's off by default.")

  args = parser.parse_args()

  py_utils.add_stdout_logger()

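  # Map the --mode flag to the corresponding debiasing loss function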
  if args.mode == "none":
    loss_fn = clf_debias_loss_functions.Plain()
  elif args.mode == "reweight":
    loss_fn = clf_debias_loss_functions.ReweightByInvBias()
  elif args.mode == "bias_product":
    loss_fn = clf_debias_loss_functions.BiasProduct()
  elif args.mode == "learned_mixin":
    loss_fn = clf_debias_loss_functions.LearnedMixin(args.penalty)
  else:
    raise RuntimeError()

  output_dir = args.output_dir

  if args.do_train:
    if exists(output_dir):
      if len(os.listdir(output_dir)) > 0:
        raise ValueError("Output dir exists and is non-empty")
    else:
      os.makedirs(output_dir)

  print("Saving model to %s" % output_dir)

  if args.local_rank == -1 or args.no_cuda:
    device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
    n_gpu = torch.cuda.device_count()
  else:
    torch.cuda.set_device(args.local_rank)
    device = torch.device("cuda", args.local_rank)
    n_gpu = 1
    # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
    torch.distributed.init_process_group(backend='nccl')
  logging.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
    device, n_gpu, bool(args.local_rank != -1), args.fp16))

  if args.gradient_accumulation_steps < 1:
    raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
      args.gradient_accumulation_steps))

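  # Gradients are accumulated over gradient_accumulation_steps mini-batches, so reduce the
  # per-step batch size to keep the effective batch size equal to --train_batch_size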
  args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

  if args.seed is not None:
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
      torch.cuda.manual_seed_all(args.seed)
    
  if not args.do_train and not args.do_eval:
    raise ValueError("At least one of `do_train` or `do_eval` must be True.")

  if os.path.exists(output_dir) and os.listdir(output_dir) and args.do_train:
    raise ValueError("Output directory ({}) already exists and is not empty.".format(output_dir))
  if not os.path.exists(output_dir):
    os.makedirs(output_dir)

  # It's way too easy to forget whether this is being set by a command line flag
  if "-uncased" in args.bert_model:
    do_lower_case = True
  elif "-cased" in args.bert_model:
    do_lower_case = False
  else:
    raise NotImplementedError(args.bert_model)

  tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=do_lower_case)

  num_train_optimization_steps = None
  train_examples = None
  if args.do_train:
    train_examples = load_mnli(True, 2000 if args.debug else None)
    num_train_optimization_steps = int(
      len(train_examples) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
    if args.local_rank != -1:
      num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

  # Prepare model
  cache_dir = args.cache_dir if args.cache_dir else os.path.join(str(PYTORCH_PRETRAINED_BERT_CACHE),
                                                                 'distributed_{}'.format(args.local_rank))

  model = BertWithDebiasLoss.from_pretrained(
    args.bert_model, cache_dir=cache_dir, num_labels=3, loss_fn=loss_fn)

  if args.fp16:
    model.half()
  model.to(device)
  if args.local_rank != -1:
    try:
      from apex.parallel import DistributedDataParallel as DDP
    except ImportError:
      raise ImportError(
        "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

    model = DDP(model)
  elif n_gpu > 1:
    model = torch.nn.DataParallel(model)

  # Prepare optimizer
  param_optimizer = list(model.named_parameters())
  no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
  optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
  ]
  if args.fp16:
    try:
      from apex.optimizers import FP16_Optimizer
      from apex.optimizers import FusedAdam
    except ImportError:
      raise ImportError(
        "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

    optimizer = FusedAdam(optimizer_grouped_parameters,
                          lr=args.learning_rate,
                          bias_correction=False,
                          max_grad_norm=1.0)
    if args.loss_scale == 0:
      optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
    else:
      optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

  else:
    optimizer = BertAdam(optimizer_grouped_parameters,
                         lr=args.learning_rate,
                         warmup=args.warmup_proportion,
                         t_total=num_train_optimization_steps)

  global_step = 0
  nb_tr_steps = 0
  tr_loss = 0

  if args.do_train:
    train_features: List[InputFeatures] = convert_examples_to_features(
      train_examples, args.max_seq_length, tokenizer, args.n_processes)

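    # Attach the precomputed bias-only model predictions to each training feature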
    bias_map = load_bias("train")
    for fe in train_features:
      fe.bias = bias_map[fe.example_id].astype(np.float32)

    logging.info("***** Running training *****")
    logging.info("  Num examples = %d", len(train_examples))
    logging.info("  Batch size = %d", args.train_batch_size)
    logging.info("  Num steps = %d", num_train_optimization_steps)

    train_dataloader = build_train_dataloader(train_features, args.train_batch_size, args.seed, args.sorted)

    model.train()
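    # Track an exponential moving average of the training loss; its bias-corrected value is shown in the progress bar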
    loss_ema = 0
    total_steps = 0
    decay = 0.99

    for _ in trange(int(args.num_train_epochs), desc="Epoch", ncols=100):
      tr_loss = 0
      nb_tr_examples, nb_tr_steps = 0, 0
      pbar = tqdm(train_dataloader, desc="loss", ncols=100)
      for step, batch in enumerate(pbar):
        batch = tuple(t.to(device) for t in batch)
        if bias_map is not None:
          input_ids, input_mask, segment_ids, label_ids, bias = batch
        else:
          bias = None
          input_ids, input_mask, segment_ids, label_ids = batch

        logits, loss = model(input_ids, segment_ids, input_mask, label_ids, bias)

        total_steps += 1
        loss_ema = loss_ema * decay + loss.cpu().detach().numpy() * (1 - decay)
        descript = "loss=%.4f" % (loss_ema / (1 - decay**total_steps))
        pbar.set_description(descript, refresh=False)

        if n_gpu > 1:
          loss = loss.mean()  # mean() to average on multi-gpu.
        if args.gradient_accumulation_steps > 1:
          loss = loss / args.gradient_accumulation_steps

        if args.fp16:
          optimizer.backward(loss)
        else:
          loss.backward()

        tr_loss += loss.item()
        nb_tr_examples += input_ids.size(0)
        nb_tr_steps += 1
        if (step + 1) % args.gradient_accumulation_steps == 0:
          if args.fp16:
            # modify learning rate with special warm up BERT uses
            # if args.fp16 is False, BertAdam is used that handles this automatically
            lr_this_step = args.learning_rate * warmup_linear(global_step / num_train_optimization_steps,
                                                              args.warmup_proportion)
            for param_group in optimizer.param_groups:
              param_group['lr'] = lr_this_step
          optimizer.step()
          optimizer.zero_grad()
          global_step += 1

    # Save a trained model and the associated configuration
    model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model itself
    output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
    torch.save(model_to_save.state_dict(), output_model_file)
    output_config_file = os.path.join(output_dir, CONFIG_NAME)
    with open(output_config_file, 'w') as f:
      f.write(model_to_save.config.to_json_string())

    # Record the args as well
    arg_dict = {}
    for arg in vars(args):
      arg_dict[arg] = getattr(args, arg)
    with open(join(output_dir, "args.json"), 'w') as out_fh:
      json.dump(arg_dict, out_fh)

    # Load a trained model and config that you have fine-tuned
    config = BertConfig(output_config_file)
    model = BertWithDebiasLoss(config, num_labels=3, loss_fn=loss_fn)
    model.load_state_dict(torch.load(output_model_file))
  else:
    output_config_file = os.path.join(output_dir, CONFIG_NAME)
    config = BertConfig.from_json_file(output_config_file)
    output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
    model = BertWithDebiasLoss(config, num_labels=3, loss_fn=loss_fn)
    model.load_state_dict(torch.load(output_model_file))

  model.to(device)

  if not args.do_eval:
    return
  if not (args.local_rank == -1 or torch.distributed.get_rank() == 0):
    return

  model.eval()

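  # Evaluate on the MNLI dev set and on the HANS challenge set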
  eval_datasets = [("dev", load_mnli(False)), ("hans", load_hans())]
  for name, eval_examples in eval_datasets:
    logging.info("***** Running evaluation on %s *****" % name)
    logging.info("  Num examples = %d", len(eval_examples))
    logging.info("  Batch size = %d", args.eval_batch_size)
    eval_features = convert_examples_to_features(
      eval_examples, args.max_seq_length, tokenizer)
    eval_features.sort(key=lambda x: len(x.input_ids))
    all_label_ids = np.array([x.label_id for x in eval_features])
    eval_dataloader = build_eval_dataloader(eval_features, args.eval_batch_size)

    eval_loss = 0
    nb_eval_steps = 0
    probs = []

    for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating", ncols=100):
      input_ids = input_ids.to(device)
      input_mask = input_mask.to(device)
      segment_ids = segment_ids.to(device)
      label_ids = label_ids.to(device)

      with torch.no_grad():
        logits = model(input_ids, segment_ids, input_mask)

      # Compute the eval loss and the other metrics required by the task
      loss_fct = CrossEntropyLoss()
      tmp_eval_loss = loss_fct(logits.view(-1, 3), label_ids.view(-1))

      eval_loss += tmp_eval_loss.mean().item()
      nb_eval_steps += 1
      probs.append(torch.nn.functional.softmax(logits, 1).detach().cpu().numpy())

    probs = np.concatenate(probs, 0)
    eval_loss = eval_loss / nb_eval_steps

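    # HANS is scored as entailment vs. non-entailment: fold the two non-entailment probabilities into a single column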
    if name == "hans":
      probs[:, 0] += probs[:, 2]
      probs = probs[:, :2]

    preds = np.argmax(probs, axis=1)

    result = {"acc": simple_accuracy(preds, all_label_ids)}
    loss = tr_loss / nb_tr_steps if args.do_train else None

    result['eval_loss'] = eval_loss
    result['global_step'] = global_step
    result['loss'] = loss

    output_eval_file = os.path.join(output_dir, "eval_%s_results.txt" % name)
    with open(output_eval_file, "w") as writer:
      logging.info("***** Eval results *****")
      for key in sorted(result.keys()):
        logging.info("  %s = %s", key, str(result[key]))
        writer.write("%s = %s\n" % (key, str(result[key])))

    output_answer_file = os.path.join(output_dir, "eval_%s_answers.json" % name)
    answers = {ex.example_id: [float(x) for x in p] for ex, p in zip(eval_features, probs)}
    with open(output_answer_file, "w") as f:
      json.dump(answers, f)
Example 6
def build_mnli_bias_only(out_dir, cache_examples=None, w2v_cache=None):
  """Builds our bias-only MNLI model and saves its predictions

  :param out_dir: Directory to save the predictions
  :param cache_examples: Cache examples to this file
  :param w2v_cache: Cache w2v features to this file
  """
  py_utils.add_stdout_logger()

  tok = NltkAndPunctTokenizer()

  # Load the data we want to use
  if cache_examples and exists(cache_examples):
    tf.logging.info("Loading cached examples")
    with open(cache_examples, "rb") as f:
      dataset_to_examples = pickle.load(f)
  else:
    dataset_to_examples = {}
    dataset_to_examples["hans"] = tokenize_examples(load_hans(), tok, 5)
    dataset_to_examples["train"] = tokenize_examples(load_mnli(True), tok, 5)
    dataset_to_examples["dev"] = tokenize_examples(load_mnli(False), tok, 5)
    if cache_examples:
      with open(cache_examples, "wb") as f:
        pickle.dump(dataset_to_examples, f)

  # Our models will only distinguish entailment vs (neutral/contradict)
  for examples in dataset_to_examples.values():
    for i, ex in enumerate(examples):
      if ex.label == 2:
        examples[i] = ex._replace(label=0)

  # Load the pre-normalized word vectors to use when building features
  if w2v_cache and exists(w2v_cache):
    tf.logging.info("Loading cached word vectors")
    with open(w2v_cache, "rb") as f:
      w2v = pickle.load(f)
  else:
    logging.info("Loading word vectors")
    voc = set()
    for v in dataset_to_examples.values():
      for ex in v:
        voc.update(ex.hypothesis)
        voc.update(ex.premise)
    words, vecs = load_word_vectors("crawl-300d-2M", voc)
    w2v = {w: v/np.linalg.norm(v) for w, v in zip(words, vecs)}
    if w2v_cache:
      with open(w2v_cache, "wb") as f:
        pickle.dump(w2v, f)

  # Build the features and store them as a pandas DataFrame
  dataset_to_features = {}
  for name, examples in dataset_to_examples.items():
    tf.logging.info("Building features for %s.." % name)
    features = []
    for example in examples:
      h = [x.lower() for x in example.hypothesis]
      p = [x.lower() for x in example.premise]
      p_words = set(p)
      n_words_in_p = sum(x in p_words for x in h)
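      # Hand-crafted hypothesis/premise overlap features for the bias-only classifier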
      fe = {
        "h-is-subseq": is_subseq(h, p),
        "all-in-p": n_words_in_p == len(h),
        "percent-in-p": n_words_in_p / len(h),
        "log-len-diff": np.log(max(len(p) - len(h), 1)),
        "label": example.label
      }

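      # Similarity features: each hypothesis word's best cosine match against the premise (the word vectors are pre-normalized)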
      h_vecs = [w2v[w] for w in example.hypothesis if w in w2v]
      p_vecs = [w2v[w] for w in example.premise if w in w2v]
      if len(h_vecs) > 0 and len(p_vecs) > 0:
        h_vecs = np.stack(h_vecs, 0)
        p_vecs = np.stack(p_vecs, 0)
        # [h_size, p_size]
        similarities = np.matmul(h_vecs, p_vecs.T)
        # [h_size]
        similarities = np.max(similarities, 1)
        similarities.sort()
        fe["average-sim"] = similarities.sum() / len(h)
        fe["min-similarity"] = similarities[0]
        if len(similarities) > 1:
          fe["min2-similarity"] = similarities[1]

      features.append(fe)

    dataset_to_features[name] = pd.DataFrame(features)
    dataset_to_features[name].fillna(0.0, inplace=True)

  # Train the model
  tf.logging.info("Fitting...")
  train_df = dataset_to_features["train"]
  feature_cols = [x for x in train_df.columns if x != "label"]

  # class_weight='balanced' will weight the entailment/non-entailment examples equally
  # C=100 means essentially no regularization
  lr = LogisticRegression(multi_class="auto", solver="liblinear",
                          class_weight='balanced', C=100)
  lr.fit(train_df[feature_cols].values, train_df.label.values)

  # Save the model predictions
  if not exists(out_dir):
    mkdir(out_dir)

  for name, ds in dataset_to_features.items():
    tf.logging.info("Predicting for %s" % name)
    examples = dataset_to_examples[name]
    pred = lr.predict_log_proba(ds[feature_cols].values).astype(np.float32)
    y = ds.label.values

    bias = {}
    for i in range(len(pred)):
      if examples[i].id in bias:
        raise RuntimeError("non-unique IDs?")
      bias[examples[i].id] = pred[i]

    acc = np.mean(y == np.argmax(pred, 1))
    print("%s two-class accuracy: %.4f (size=%d)" % (name, acc, len(examples)))

    with open(join(out_dir, "%s.pkl" % name), "wb") as f:
      pickle.dump(bias, f)
Example 7
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument("--stratify", type=int, default=None)
  parser.add_argument("--dataset", choices=["location", "person"], default="location")
  cli_utils.add_general_args(parser)
  cli_utils.add_loss_args(parser, default_penalty=None)
  args = parser.parse_args()

  if args.stratify is None:
    if args.mode == "learned_mixin":
      # Not sure if this actually makes a difference, but I turned this on
      # for the learned_mixin case, so we do it here for exactness
      args.stratify = 6

  if args.penalty is None:
    if args.dataset == "person":
      args.penalty = 0.2
    else:
      args.penalty = 0.4

  dbg = args.debug

  if dbg:
    epoch_size = 50
  else:
    epoch_size = 1200

  opt = AdamOptimizer(decay_steps=50, max_grad_norm=3.0)
  batcher = QuantileBatcher(45, 10, 400, 4, 12)
  evaluator = Evaluator("triviaqa")

  trainer = Trainer(
    batcher, opt, evaluator,
    eval_batch_size=90,
    num_epochs=30, epoch_size=epoch_size,
    log_period=100,
    prefetch=5, loss_ema=0.999,
    n_processes=args.n_processes
  )

  if dbg:
    dataset = AnnotatedTriviaQACPLoader(
      args.dataset, sample_train=1000, stratify=args.stratify)
  else:
    dataset = AnnotatedTriviaQACPLoader(
      args.dataset, sample_train_eval=8000, stratify=args.stratify)

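  # Reading-comprehension model: word + character encoder, highway LSTM layers, bi-attention fusion,
  # and the debiasing loss selected by the command-line arguments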
  dim = 128
  recurrent_layer = CudnnLSTMRecurrentDropout(dim, 0.2)
  model = TextPairQaDebiasingModel(
    None,  # Assume pre-tokenized data
    text_encoder=WordAndCharEncoder(
      "glove.6B.50d" if dbg else "crawl-300d-2M",
      first_n=500000,
      char_embed_dim=24,
      character_mapper=Conv1d(100, 5, None),
      character_pooler=MaxPooler(),
      word_length=30
    ),
    map_embed=seq(
      Dropout(0.3),
      HighwayLayer(recurrent_layer),
    ),
    fuse_layer=BiAttention(WeightedDot()),
    post_process_layer=seq(
      VariationalDropout(0.2),
      FullyConnected(dim * 2, activation="relu"),
      VariationalDropout(0.2),
      HighwayLayer(recurrent_layer),
      VariationalDropout(0.2),
      HighwayLayer(recurrent_layer),
      VariationalDropout(0.2),
    ),
    debias_loss_fn=cli_utils.get_qa_loss_fn(args)
  )

  with open(__file__) as f:
    notes = f.read()

  py_utils.add_stdout_logger()

  trainer.train(dataset, model, args.output_dir, notes)

  if args.output_dir:
    logging.info("Evaluating...")
    eval_debiased_triviaqa_cp.show_scores(args.output_dir, args.dataset, ["dev", "test"])
Example 8
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument("--bias", choices=["indicator", "excluder", "dependent"], default="indicator")
  cli_utils.add_general_args(parser)
  cli_utils.add_loss_args(parser, default_penalty=None)
  args = parser.parse_args()

  if args.penalty is None:
    if args.bias == "indicator":
      args.penalty = 0.01
    else:
      args.penalty = 0.005

  dbg = args.debug

  if dbg:
    epoch_size = 200
  else:
    epoch_size = 6000

  opt = AdamOptimizer(max_grad_norm=5.0)
  batcher = QuantileBatcher(32, 10, 160, 4, 12)
  evaluator = Evaluator(mode="clf")

  trainer = Trainer(
    batcher, opt, evaluator,
    eval_batch_size=64,
    num_epochs=30, epoch_size=epoch_size,
    log_period=100,
    prefetch=5, loss_ema=0.999,
    n_processes=args.n_processes,
  )

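  # Per-variant synthetic-bias probability; i_prob is passed as indicator_noise and is only used by the 'dependent' bias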
  if args.bias == "indicator":
    bias_prob, i_prob = 0.8, None
  elif args.bias == "excluder":
    bias_prob, i_prob = 0.03, None
  elif args.bias == "dependent":
    bias_prob, i_prob = 0.9, 0.8
  else:
    raise RuntimeError()

  if dbg:
    dataset = MnliWithSyntheticBiasLoading(
      bias_prob, n_train_eval=200, n_train_sample=1000,
      n_dev_sample=200, indicator_noise=i_prob)
  else:
    dataset = MnliWithSyntheticBiasLoading(bias_prob, n_train_eval=10000, indicator_noise=i_prob)

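  # Text-pair classifier: word + character encoder, bi-attention fusion between the two texts, max pooling,
  # and the debiasing loss selected by the command-line arguments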
  dim = 50 if dbg else 200
  recurrent_layer = CudnnLSTMRecurrentDropout(dim, 0.2)
  model = TextPairClfDebiasingModel(
    NltkAndPunctTokenizer(),
    WordAndCharEncoder(
      "glove.6B.50d" if dbg else "crawl-300d-2M",
      first_n=None,
      char_embed_dim=24,
      character_mapper=mseq(Dropout(0.1), Conv1d(100, 5, None)),
      character_pooler=MaxPooler(),
      word_length=30,
    ),
    map_embed=seq(
      VariationalDropout(0.2),
      recurrent_layer
    ),
    bifuse_layer=AttentionBiFuse(WeightedDot()),
    post_process_layer=seq(
      recurrent_layer,
      VariationalDropout(0.2),
    ),
    pool_layer=MaxPooler(),
    processs_joint=mseq(
      FullyConnected(100),
      Dropout(0.2)
    ),
    n_classes=3,
    debias_loss_fn=cli_utils.get_clf_loss_fn(args)
  )

  with open(__file__) as f:
    notes = f.read()

  py_utils.add_stdout_logger()

  trainer.train(dataset, model, args.output_dir, notes)

  if args.output_dir:
    logging.info("Evaluating...")
    show_scores(args.output_dir, args.bias, [False, True], n_processes=args.n_processes)