Example #1
def get_data():
    """Load official THYME data"""

    xml_regex = cfg.get('data', 'xml_regex')

    train_xml_dir = os.path.join(base, cfg.get('data', 'train_xml'))
    train_text_dir = os.path.join(base, cfg.get('data', 'train_text'))

    dev_xml_dir = os.path.join(base, cfg.get('data', 'dev_xml'))
    dev_text_dir = os.path.join(base, cfg.get('data', 'dev_text'))

    train_data = dtrdata.DTRData(train_xml_dir, train_text_dir, xml_regex,
                                 cfg.getint('args', 'context_chars'))
    dev_data = dtrdata.DTRData(dev_xml_dir, dev_text_dir, xml_regex,
                               cfg.getint('args', 'context_chars'))

    x_train, y_train = train_data()
    x_dev, y_dev = dev_data()

    vectorizer = TfidfVectorizer(tokenizer=tokenizer,
                                 token_pattern=None,
                                 ngram_range=(1, 3))
    x_train = vectorizer.fit_transform(x_train)
    x_dev = vectorizer.transform(x_dev)

    return x_train, y_train, x_dev, y_dev
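
get_data() above only builds the TF-IDF features. The following is a minimal sketch of how its output could be consumed, using scikit-learn's LogisticRegression as a hypothetical stand-in for whatever classifier the project actually trains:

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

def run_tfidf_baseline():
    """Hypothetical driver around the get_data() defined above"""

    x_train, y_train, x_dev, y_dev = get_data()

    # stand-in linear classifier on top of the TF-IDF features
    classifier = LogisticRegression(max_iter=1000)
    classifier.fit(x_train, y_train)

    predictions = classifier.predict(x_dev)
    print('dev macro f1: %.4f' % f1_score(y_dev, predictions, average='macro'))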
Example #2
def main():
    """Fine-tune bert"""

    train_data = dtrdata.DTRData(os.path.join(base, cfg.get('data',
                                                            'xmi_dir')),
                                 partition='train',
                                 n_files=cfg.get('data', 'n_files'))
    tr_texts, tr_labels = train_data.read()
    train_loader = utils.make_data_loader(tr_texts, tr_labels,
                                          cfg.getint('model', 'batch_size'),
                                          None, 'train', utils.to_lstm_inputs)

    val_data = dtrdata.DTRData(os.path.join(base, cfg.get('data', 'xmi_dir')),
                               partition='dev',
                               n_files=cfg.get('data', 'n_files'))
    val_texts, val_labels = val_data.read()
    val_loader = utils.make_data_loader(val_texts, val_labels,
                                        cfg.getint('model', 'batch_size'),
                                        None, 'dev', utils.to_lstm_inputs)

    model = LstmClassifier()

    # inverse-frequency class weights: n_samples / (n_classes * class_count)
    label_counts = torch.bincount(torch.IntTensor(tr_labels))
    weights = len(tr_labels) / (2.0 * label_counts)

    train(model, train_loader, val_loader, weights)
    evaluate(model, val_loader, weights)
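
The weights passed to train() and evaluate() above are inverse-frequency class weights. Since train() is not shown here, this is only a minimal sketch of how such weights are commonly applied, assuming a standard weighted cross-entropy setup rather than the project's actual loss:

import torch.nn as nn

def weighted_loss(logits, labels, weights):
    """Hypothetical illustration, not the project's train() function"""

    # weights[c] = n_samples / (n_classes * count(c)), as computed above
    criterion = nn.CrossEntropyLoss(weight=weights.float())
    return criterion(logits, labels)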
Example #3
def main():
  """Fine-tune bert"""

  train_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'xmi_dir')),
    partition='train',
    n_files=cfg.get('data', 'n_files'))
  tr_texts, tr_labels = train_data.read()
  train_loader = make_data_loader(tr_texts, tr_labels, RandomSampler)

  val_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'xmi_dir')),
    partition='dev',
    n_files=cfg.get('data', 'n_files'))
  val_texts, val_labels = val_data.read()
  val_loader = make_data_loader(val_texts, val_labels, SequentialSampler)

  model = BertClassifier.from_pretrained(
    'bert-base-uncased',
    num_labels=2)

  label_counts = torch.bincount(torch.IntTensor(tr_labels))
  weights = len(tr_labels) / (2.0 * label_counts)
  print('class weights:', weights)

  train(model, train_loader, val_loader, weights)
  evaluate(model, val_loader, weights)
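
make_data_loader() is not shown in this example. Below is a minimal sketch of what such a helper might look like, assuming a recent transformers version, a BERT tokenizer, and a plain TensorDataset; the project's real helper may differ:

import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer

def make_data_loader(texts, labels, sampler_class, batch_size=32, max_len=128):
  """Hypothetical stand-in for the helper used above"""

  tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
  encoded = tokenizer(list(texts),
                      padding='max_length',
                      truncation=True,
                      max_length=max_len,
                      return_tensors='pt')

  dataset = TensorDataset(encoded['input_ids'],
                          encoded['attention_mask'],
                          torch.LongTensor(labels))

  # RandomSampler shuffles for training; SequentialSampler keeps order for evaluation
  return DataLoader(dataset,
                    sampler=sampler_class(dataset),
                    batch_size=batch_size)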
Example #4
def main():
    """Fine-tune bert"""

    train_data = dtrdata.DTRData(os.path.join(base, cfg.get('data',
                                                            'xmi_dir')),
                                 partition='train',
                                 n_files=cfg.get('data', 'n_files'))
    tr_texts, tr_labels = train_data.read()
    train_loader = utils.make_data_loader(tr_texts, tr_labels,
                                          cfg.getint('model', 'batch_size'),
                                          cfg.getint('data', 'max_len'),
                                          'train', utils.to_token_id_sequences)

    val_data = dtrdata.DTRData(os.path.join(base, cfg.get('data', 'xmi_dir')),
                               partition='dev',
                               n_files=cfg.get('data', 'n_files'))
    val_texts, val_labels = val_data.read()
    val_loader = utils.make_data_loader(val_texts, val_labels,
                                        cfg.getint('model', 'batch_size'),
                                        cfg.getint('data', 'max_len'), 'dev',
                                        utils.to_token_id_sequences)

    model = BagOfEmbeddings()

    label_counts = torch.bincount(torch.IntTensor(tr_labels))
    weights = len(tr_labels) / (2.0 * label_counts)

    train(model, train_loader, val_loader, weights)
    evaluate(model, val_loader, weights)
Example #5
def main():
    """Fine-tune bert"""

    model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                          num_labels=4)

    if torch.cuda.is_available():
        device = torch.device('cuda')
        model.cuda()
    else:
        device = torch.device('cpu')
        model.cpu()

    optimizer, scheduler = make_optimizer_and_scheduler(model)

    train_data = dtrdata.DTRData(os.path.join(base, cfg.get('data',
                                                            'xmi_dir')))
    train_loader = make_data_loader(train_data, RandomSampler)

    for epoch in trange(cfg.getint('bert', 'num_epochs'), desc='epoch'):
        model.train()

        train_loss, num_train_examples, num_train_steps = 0, 0, 0

        for step, batch in enumerate(train_loader):
            batch = tuple(t.to(device) for t in batch)
            batch_inputs, batch_masks, batch_labels = batch
            optimizer.zero_grad()

            loss, logits = model(batch_inputs,
                                 attention_mask=batch_masks,
                                 labels=batch_labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            train_loss += loss.item()
            num_train_examples += batch_inputs.size(0)
            num_train_steps += 1

        print('epoch: %d, loss: %.4f' % (epoch, train_loss / num_train_steps))

    dev_data = dtrdata.DTRData(os.path.join(base, cfg.get('data', 'xmi_dir')),
                               partition='dev',
                               xml_ref_dir=os.path.join(
                                   base, cfg.get('data', 'ref_xml_dir')),
                               xml_out_dir=cfg.get('data', 'out_xml_dir'))

    dev_loader = make_data_loader(dev_data, sampler=SequentialSampler)
    predictions = evaluate(model, dev_loader, device)
    dev_data.write(predictions)
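
evaluate() is referenced above but not shown. A minimal sketch of an evaluation loop that returns predictions in the order dev_data.write() would expect, assuming the same (inputs, masks, labels) batch layout as the training loop:

import torch

def evaluate(model, data_loader, device):
    """Hypothetical sketch of the evaluation helper used above"""

    model.eval()
    predictions = []

    with torch.no_grad():
        for batch in data_loader:
            batch = tuple(t.to(device) for t in batch)
            batch_inputs, batch_masks, _ = batch

            # forward pass without labels; the logits come first in the output
            logits = model(batch_inputs, attention_mask=batch_masks)[0]
            predictions.extend(torch.argmax(logits, dim=1).tolist())

    return predictions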
Example #6
def main():
  """Fine-tune bert"""

  train_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'xmi_dir')),
    partition='train',
    n_files=cfg.get('data', 'n_files'))
  tr_texts, tr_labels = train_data.read()
  train_loader = utils.make_data_loader(
    tr_texts,
    tr_labels,
    cfg.getint('model', 'batch_size'),
    cfg.getint('data', 'max_len'),
    'train',
    utils.to_transformer_inputs)

  val_data = dtrdata.DTRData(
    os.path.join(base, cfg.get('data', 'xmi_dir')),
    partition='dev',
    n_files=cfg.get('data', 'n_files'))
  val_texts, val_labels = val_data.read()
  val_loader = utils.make_data_loader(
    val_texts,
    val_labels,
    cfg.getint('model', 'batch_size'),
    cfg.getint('data', 'max_len'),
    'dev',
    utils.to_transformer_inputs)

  print('loaded %d training and %d validation samples' %
        (len(tr_texts), len(val_texts)))

  model = TransformerClassifier()

  label_counts = torch.bincount(torch.IntTensor(tr_labels))
  weights = len(tr_labels) / (2.0 * label_counts)

  train(model, train_loader, val_loader, weights)
  evaluate(model, val_loader, weights, suppress_output=False)
Example #7
def main():
    """Fine-tune bert"""

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = BertForSequenceClassification.from_pretrained('bert-base-uncased',
                                                          num_labels=4)
    if torch.cuda.is_available():
        model.cuda()
    else:
        model.cpu()

    # exclude bias and LayerNorm weights from weight decay,
    # which is standard practice when fine-tuning BERT
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() \
          if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
        {'params': [p for n, p in model.named_parameters() \
          if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=cfg.getfloat('bert', 'lr'),
                      eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=100,
                                                num_training_steps=1000)

    train_data = dtrdata.DTRData(
        os.path.join(base, cfg.get('data', 'train_xml')),
        os.path.join(base, cfg.get('data', 'train_text')),
        cfg.get('data', 'xml_regex'), cfg.get('data', 'out_dir'),
        cfg.getint('args', 'context_chars'), cfg.getint('bert', 'max_len'))

    train_loader = make_data_loader(train_data, sampler=RandomSampler)

    for epoch in trange(cfg.getint('bert', 'num_epochs'), desc='epoch'):
        model.train()

        train_loss, num_train_examples, num_train_steps = 0, 0, 0

        for step, batch in enumerate(train_loader):

            batch = tuple(t.to(device) for t in batch)
            batch_inputs, batch_masks, batch_labels = batch
            optimizer.zero_grad()

            loss, logits = model(batch_inputs,
                                 attention_mask=batch_masks,
                                 labels=batch_labels)

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            scheduler.step()

            train_loss += loss.item()
            num_train_examples += batch_inputs.size(0)
            num_train_steps += 1

        print('epoch: %d, loss: %.4f' % (epoch, train_loss / num_train_steps))

    dev_data = dtrdata.DTRData(os.path.join(base, cfg.get('data', 'dev_xml')),
                               os.path.join(base, cfg.get('data', 'dev_text')),
                               cfg.get('data', 'xml_regex'),
                               cfg.get('data', 'out_dir'),
                               cfg.getint('args', 'context_chars'),
                               cfg.getint('bert', 'max_len'))

    dev_loader = make_data_loader(dev_data, sampler=SequentialSampler)
    predictions = evaluate(model, dev_loader, device)
    dev_data.write(predictions)
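
The scheduler above is built with hard-coded num_warmup_steps=100 and num_training_steps=1000. A common alternative, sketched here under the assumption that the same cfg and train_loader names are in scope, is to derive both values from the data:

# derive the schedule from the loader size instead of fixed constants
num_epochs = cfg.getint('bert', 'num_epochs')
num_training_steps = len(train_loader) * num_epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),
    num_training_steps=num_training_steps)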