Example #1
    def __init__(self, miner: AbstractTripletMiner, *, margin=1.0, weights=(1.0, 1.0), temperature=1.0):
        super(TripletLossWithMiner, self).__init__()
        self._cross_entropy = CrossEntropy(temperature=temperature)
        # nn.CosineSimilarity returns a similarity (larger = closer); wrap it as
        # 1 - similarity so TripletMarginWithDistanceLoss receives a true distance.
        self._cosine_similarity = nn.CosineSimilarity()
        self._triplet_loss = nn.TripletMarginWithDistanceLoss(
            margin=margin,
            distance_function=lambda x, y: 1.0 - self._cosine_similarity(x, y))
        self._miner = miner
        self._weights = weights
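For reference, here is a minimal, self-contained sketch of driving nn.TripletMarginWithDistanceLoss with a cosine distance, as Example #1 intends; the tensor shapes and values below are placeholders, not part of the example above.

import torch
import torch.nn as nn

# Placeholder embeddings: a batch of 8 vectors with 128 dimensions each.
anchor = torch.randn(8, 128)
positive = torch.randn(8, 128)
negative = torch.randn(8, 128)

cos = nn.CosineSimilarity(dim=-1)
loss_fn = nn.TripletMarginWithDistanceLoss(
    margin=1.0,
    distance_function=lambda x, y: 1.0 - cos(x, y),  # distance in [0, 2]
)
loss = loss_fn(anchor, positive, negative)  # scalar, mean over the batch
print(loss.item())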
Example #2
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        self.bertmodel = BertModelInvertibleEmbeddings(self.devbert_config,
                                                       add_pooling_layer=False)
        # self.criterion = nn.L1Loss(reduction='sum')
        self.criterion = nn.TripletMarginWithDistanceLoss(reduction='mean',
                                                          margin=1)
        self.distance = nn.PairwiseDistance()
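A note on Example #2: self.distance = nn.PairwiseDistance() is constructed but never passed to the loss, which therefore falls back to its default pairwise L2 distance. If the module were meant to be wired in explicitly, a sketch (attribute names assumed, not taken from the original project) could look like this:

import torch.nn as nn

# Illustrative only: pass the PairwiseDistance module as the distance function.
distance = nn.PairwiseDistance(p=2)
criterion = nn.TripletMarginWithDistanceLoss(
    reduction='mean',
    margin=1.0,
    distance_function=distance,  # callable taking (x1, x2) -> per-row distances
)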
Example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--exp", type=int, default=7)
    parser.add_argument("--save",
                        type=str,
                        default="./model2_best_diverse_mean_maskedLM.pt")
    args = parser.parse_args()

    # Data and Tokenization
    device = "cuda" if torch.cuda.is_available() else "cpu"
    tokenizer = DistilBertTokenizerFast.from_pretrained(
        "distilbert-base-uncased")

    batch_size = 4
    train_dataset = TorchDataset(
        file_name="./data/diverse.triplets.train.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    train_dataloader = DataLoader(train_dataset,
                                  batch_size=batch_size,
                                  shuffle=True)

    dev_dataset = TorchDataset(
        file_name="./data/diverse.triplets.dev.tsv",
        queries_path="./data/diverse.queries.all.tsv",
        passages_path="./data/diverse.passages.all.tsv",
    )
    dev_dataloader = DataLoader(dev_dataset,
                                batch_size=batch_size,
                                shuffle=False)

    # Model Training and Evaluation
    NUM_EPOCHS = 1
    LEARNING_RATE = 0.00003

    # load model
    model = DistilBertForMaskedLM.from_pretrained("distilbert-base-uncased")

    # if args.exp == 1:
    #     pass
    # elif args.exp == 2:
    #
    # elif args.exp == 3:

    if args.exp == 7:
        # For Experiment7: average
        model = DistilBertForMaskedLM.from_pretrained(
            "distilbert-base-uncased")
        triplet_loss = nn.TripletMarginLoss(margin=1.0)

    elif args.exp == 6:
        # For Experiment6: base + cosine
        triplet_loss = nn.TripletMarginWithDistanceLoss(
            distance_function=lambda x, y: 1 - F.cosine_similarity(
                x, y, dim=-1),
            margin=1.0,
        )

    elif args.exp == 5:
        # For Experiment5: base + margin = 0.1
        triplet_loss = nn.TripletMarginLoss(margin=0.1)

    elif args.exp == 4:
        # For Experiment4: base
        triplet_loss = nn.TripletMarginLoss(margin=1.0)
    else:
        raise ValueError(f"Unsupported --exp value: {args.exp}")

    model.to(device)
    model.train()
    optimizer = torch.optim.Adam(model.distilbert.parameters(),
                                 lr=LEARNING_RATE)

    def evaluate(inputs, model, tokenizer):
        encodings = tokenizer(
            inputs,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512,
        )
        ids, masks = encodings["input_ids"], encodings["attention_mask"]
        outputs = model.distilbert(ids.to(device), masks.to(device))
        if args.exp < 7:
            # Experiment: using the first index of the last layers
            outputs_hidden = outputs.last_hidden_state[:, 0]
        else:
            # Averaging last layers
            outputs_hidden = outputs.last_hidden_state.mean(dim=1)

        # `inputs` is the concatenation of queries, positives and negatives,
        # so reshape to (3, batch, hidden) regardless of which loader is used.
        return outputs_hidden.view(3, len(inputs) // 3, -1)

    dataloader = train_dataloader
    N = len(dataloader)
    lowest_loss = float("inf")
    start = time.time()
    learning_curve_y = []
    learning_curve_x = []

    for epoch in range(NUM_EPOCHS):
        epoch_loss = 0
        for i, (queries, pos_docs, neg_docs) in enumerate(dataloader):
            # readability
            # train()
            # evaluate()
            # print()
            optimizer.zero_grad()  # set gradient to zero
            anchors, positives, negatives = evaluate(
                inputs=list(queries + pos_docs + neg_docs),
                model=model,
                tokenizer=tokenizer,
            )

            loss = triplet_loss(anchors, positives, negatives)
            loss.backward()
            optimizer.step()

            epoch_loss += float(loss)

            if i % 10 == 0:
                elapsed_time = time.time() - start
                remaining_time = elapsed_time * (N / (i + 1) - 1)  # time per step * steps left
                print(
                    f"{i}: remaining time: {remaining_time:.1f} | est. epoch loss: {epoch_loss / (i + 1):.4f}"
                )

            if i % 100 == 0:
                with torch.no_grad():
                    correct = total = 0
                    val_start = time.time()
                    for dq, dp, dn in dev_dataloader:
                        anchors, positives, negatives = evaluate(
                            inputs=list(dq + dp + dn),
                            model=model,
                            tokenizer=tokenizer,
                        )
                        if args.exp == 6:
                            # cosine distance
                            pos_dist = 1 - F.cosine_similarity(
                                anchors, positives, dim=-1)
                            neg_dist = 1 - F.cosine_similarity(
                                anchors, negatives, dim=-1)
                        else:
                            # using l2 norm
                            pos_dist = (anchors - positives).norm(
                                dim=-1)  # B distances
                            neg_dist = (anchors - negatives).norm(
                                dim=-1)  # B distances

                        correct += float((pos_dist < neg_dist).sum())
                        total += len(dq)
                        if time.time() - val_start > 15:
                            break
                    print(
                        f"{i}: est. validation accuracy: {correct / total:.4f}"
                    )
                    learning_curve_y.append(correct / total)
                    learning_curve_x.append(i * batch_size)  # x-axis: samples seen so far (epochs in a longer run)

            if (epoch_loss / (i + 1)) < lowest_loss:
                if args.exp == 4:
                    torch.save(model.state_dict(),
                               "model2_best_diverse_base.pt")
                elif args.exp == 5:
                    torch.save(model.state_dict(),
                               "model2_best_diverse_margin.pt")
                elif args.exp == 6:
                    torch.save(model.state_dict(),
                               "model2_best_diverse_cosine.pt")
                elif args.exp == 7:
                    torch.save(model.state_dict(),
                               "model2_best_diverse_mean_maskedLM.pt")

                lowest_loss = epoch_loss / (i + 1)

        print(f"loss for epoch {epoch} is {epoch_loss}")

        generate_data_for_plot(learning_curve_y, learning_curve_x)
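Example #3 depends on a project-specific TorchDataset that yields (query, positive_passage, negative_passage) text triples and on a generate_data_for_plot helper. A rough, purely illustrative sketch of such a dataset, assuming tab-separated id/text files, might be:

import csv
from torch.utils.data import Dataset

class TorchDataset(Dataset):
    """Illustrative sketch only: yields (query, positive, negative) text triples."""

    def __init__(self, file_name, queries_path, passages_path):
        def read_tsv(path):
            with open(path) as f:
                return {row[0]: row[1] for row in csv.reader(f, delimiter="\t")}

        self.queries = read_tsv(queries_path)    # id -> query text
        self.passages = read_tsv(passages_path)  # id -> passage text
        with open(file_name) as f:
            self.triplets = list(csv.reader(f, delimiter="\t"))

    def __len__(self):
        return len(self.triplets)

    def __getitem__(self, idx):
        qid, pos_id, neg_id = self.triplets[idx][:3]
        return self.queries[qid], self.passages[pos_id], self.passages[neg_id]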
Example #4
def load_dummy_data(query_input_size, document_input_size):
    # random query/document features used as dummy training data
    query_inputs = np.random.rand(100, query_input_size).astype(np.float32)
    positive_document_inputs = np.random.rand(
        100, document_input_size).astype(np.float32)
    negative_document_inputs = np.random.rand(
        100, document_input_size).astype(np.float32)
    data_loader = utils.data.DataLoader(Dataset(torch.from_numpy(query_inputs), torch.from_numpy(positive_document_inputs), torch.from_numpy(negative_document_inputs)),
                                        batch_size=50, shuffle=True, num_workers=2)
    return data_loader


if __name__ == '__main__':
    # number of features for query encoder and document encoder
    query_input_size = 20
    document_input_size = 15
    # Encoder initialization
    query_encoder = QueryEncoder(query_input_size).to(device)
    document_encoder = DocumentEncoder(document_input_size).to(device)
    # Optimizer initialization
    query_optimizer = torch.optim.Adam(
        query_encoder.parameters(), lr=learning_rate)
    document_optimizer = torch.optim.Adam(
        document_encoder.parameters(), lr=learning_rate)
    # Triplet loss
    triplet_loss = nn.TripletMarginWithDistanceLoss(
        distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y), margin=margin)
    # Trainer initialization
    trainer = Trainer(query_encoder, document_encoder,
                      triplet_loss, query_optimizer, document_optimizer)
    # load dummy data
    data_loader = load_dummy_data(query_input_size, document_input_size)
    for epoch in range(num_epochs):
        trainer.train(data_loader, epoch)
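Example #4 references QueryEncoder, DocumentEncoder, Trainer, a Dataset wrapper, and module-level constants (device, learning_rate, margin, num_epochs) that are defined elsewhere in its source project. The following is a minimal illustrative sketch of those pieces under those assumptions; all names and hyperparameter values here are placeholders, not the project's actual code.

import torch
import torch.nn as nn

# Placeholder module-level settings assumed by the snippet above.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
learning_rate = 1e-3
margin = 0.5
num_epochs = 5

class QueryEncoder(nn.Module):
    """Toy encoder: a single linear projection of the query features."""
    def __init__(self, input_size, embedding_size=32):
        super().__init__()
        self.fc = nn.Linear(input_size, embedding_size)

    def forward(self, x):
        return self.fc(x)

class DocumentEncoder(nn.Module):
    """Toy encoder: a single linear projection of the document features."""
    def __init__(self, input_size, embedding_size=32):
        super().__init__()
        self.fc = nn.Linear(input_size, embedding_size)

    def forward(self, x):
        return self.fc(x)

class Dataset(torch.utils.data.Dataset):
    """Wraps three aligned tensors into (query, pos_doc, neg_doc) items."""
    def __init__(self, queries, pos_docs, neg_docs):
        self.queries, self.pos_docs, self.neg_docs = queries, pos_docs, neg_docs

    def __len__(self):
        return len(self.queries)

    def __getitem__(self, idx):
        return self.queries[idx], self.pos_docs[idx], self.neg_docs[idx]

class Trainer:
    """Runs one epoch of triplet training over (query, pos_doc, neg_doc) batches."""
    def __init__(self, query_encoder, document_encoder, triplet_loss,
                 query_optimizer, document_optimizer):
        self.query_encoder = query_encoder
        self.document_encoder = document_encoder
        self.triplet_loss = triplet_loss
        self.query_optimizer = query_optimizer
        self.document_optimizer = document_optimizer

    def train(self, data_loader, epoch):
        for queries, pos_docs, neg_docs in data_loader:
            self.query_optimizer.zero_grad()
            self.document_optimizer.zero_grad()
            loss = self.triplet_loss(self.query_encoder(queries.to(device)),
                                     self.document_encoder(pos_docs.to(device)),
                                     self.document_encoder(neg_docs.to(device)))
            loss.backward()
            self.query_optimizer.step()
            self.document_optimizer.step()
        print(f"epoch {epoch}: last batch loss {loss.item():.4f}")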
Example #5
    def __init__(self,
                 model,
                 dataset_name="mrpc",
                 batch_size=12,
                 epochs=30,
                 epoch_size=80):
        self.model = model
        self.siam = True
        self.use_triplet = False
        if self.siam and self.use_triplet:
            self.triplet_loss = nn.TripletMarginWithDistanceLoss(
                distance_function=lambda x, y: 1.0 - F.cosine_similarity(x, y),
                margin=0.5)
        self.semi_siam = False
        self.CLS = False
        self.dataset_name = dataset_name
        self.batch_size = batch_size
        self.epochs = epochs
        self.epoch_size = epoch_size
        self.pretrain_head = not self.siam
        self.init_epochs = 10
        # self.evaluate_softmax = True if self.model.output_size == 2 else False
        self.device = torch.device("cuda")

        self.model = torch.load(
            "/home/ihor/University/DiplomaProject/Program/models/roberta_base_cls_20210401-180654.pt"
        )
        self.model.attach_head("siam")
        self.model.to(self.device)

        self.evaluate_softmax = True if self.model.output_size == 2 else False

        # data = tfds.load('glue/mrpc')
        # self.train_dataset = glue_convert_examples_to_features(data['train'], self.tokenizer, max_length=128, task='mrpc')
        # self.train_dataset = self.train_dataset.shuffle(100).batch(32).repeat(2)
        # print(type(self.train_dataset))

        if self.dataset_name == "mrpc":
            self.train_dataset = load_dataset("glue", "mrpc", split="train")
            self.train_data_loader = torch.utils.data.DataLoader(
                self.train_dataset, batch_size=self.batch_size, shuffle=True)

            self.test_dataset = load_dataset(
                "csv",
                data_files={
                    "test":
                    "/home/ihor/University/DiplomaProject/Program/datasets/MRPC/msr_paraphrase_test.txt"
                },
                skip_rows=1,
                delimiter='\t',
                quote_char=False,
                column_names=[
                    'label', 'idx1', 'idx2', 'sentence1', 'sentence2'
                ],
                split="test")
        elif self.dataset_name == "qqp":
            self.train_dataset = load_dataset("glue", "qqp", split="train")
            self.train_dataset = self.train_dataset.map(
                lambda examples: {
                    'sentence1': examples['question1'],
                    'sentence2': examples['question2']
                },
                batched=True)
            self.train_data_loader = torch.utils.data.DataLoader(
                self.train_dataset, batch_size=self.batch_size, shuffle=True)

            self.test_dataset = load_dataset(
                "csv",
                data_files={
                    "test":
                    "/home/ihor/University/DiplomaProject/Program/datasets/Quora/dev.tsv"
                },
                skip_rows=1,
                delimiter='\t',
                quote_char=False,
                split="test")
            self.test_dataset = self.test_dataset.map(
                lambda examples: {
                    'sentence1': examples['question1'],
                    'sentence2': examples['question2'],
                    'label': examples['is_duplicate']
                },
                batched=True)
        else:
            raise Exception("Unsupported dataset!")
        self.nrof_train_samples = len(self.train_dataset["label"])

        print("Samples in train_set: {}".format(self.nrof_train_samples))
        self.nrof_test_samples = len(self.test_dataset["label"])
        print("Samples in test_set: {}".format(self.nrof_test_samples))

        # for logits of size (batch_size, nrof_classes)
        # self.criterion = nn.CrossEntropyLoss()

        # no_decay = ['bias', 'LayerNorm.weight']
        # optimizer_grouped_parameters = [
        #     {'params': [p for n, p in self.model.named_parameters() if not any(nd in n for nd in no_decay)],
        #      'weight_decay': 0.01},
        #     {'params': [p for n, p in self.model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        # ]
        # # print(self.model.named_parameters())
        # self.optimizer = AdamW(optimizer_grouped_parameters, lr=0.001)
        print("Nrof parameters: {}".format(len(list(self.model.parameters()))))

        self.optimizer = optim.SGD(self.model.parameters(),
                                   lr=0.0005,
                                   momentum=0.9,
                                   weight_decay=0.01,
                                   nesterov=True)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer,
                                                   step_size=5,
                                                   gamma=0.5)
Example #6
def main_worker(gpu, ngpus_per_node, args):
    global best_loss
    global train_loss
    global valid_loss
    global lr_log
    global f1_log
    global mAP_log
    global mrr_log
    args.gpu = gpu

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)

    model = TripletShopeeImageEmbeddingNet(model=args.model_name)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs
        model = torch.nn.DataParallel(model).cuda()

    train_csv = pd.read_csv(
        os.path.join(args.data_dir, 'split_data', 'train.csv'))
    train_csv['image'] = args.data_dir + 'train_images/' + train_csv['image']
    tmp = train_csv.groupby('label_group').posting_id.agg('unique').to_dict()
    train_csv['target'] = train_csv.label_group.map(tmp)

    val_csv = pd.read_csv(os.path.join(args.data_dir, 'split_data', 'val.csv'))
    val_csv['image'] = args.data_dir + 'train_images/' + val_csv['image']
    tmp = val_csv.groupby('label_group').posting_id.agg('unique').to_dict()
    val_csv['target'] = val_csv.label_group.map(tmp)

    test_csv = pd.read_csv(
        os.path.join(args.data_dir, 'split_data', 'test.csv'))
    test_csv['image'] = args.data_dir + 'train_images/' + test_csv['image']
    tmp = test_csv.groupby('label_group').posting_id.agg('unique').to_dict()
    test_csv['target'] = test_csv.label_group.map(tmp)

    train_dataset = TripletShopeeImageDataset(
        train_csv,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        train=True)

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler)

    val_dataset = TripletShopeeImageDataset(
        val_csv,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        train=True)

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    test_dataset = TripletShopeeImageDataset(
        test_csv,
        transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        train=False)

    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              num_workers=args.workers,
                                              pin_memory=True)

    # state = {'model': args.model_name,
    #           'state_dict': model.state_dict()}
    # torch.save(state, '/home/jhj/PR-project/shopee/data/best_triplet_d201.pth')
    #
    # assert 1==0

    if args.resume:
        checkpoint = torch.load(args.resume, map_location='cpu')
        model.load_state_dict(checkpoint['state_dict'])

    if args.test:
        topns = [2]
        for topn in topns:
            args.topn = topn
            f1, mAP, mrr = test(test_loader, test_csv, model, args)

        return

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay,
                                nesterov=True)
    # NOTE: TripletMarginWithDistanceLoss expects a distance (smaller = closer);
    # if cosine_similarity returns a raw similarity, pass 1 - similarity instead.
    criterion = nn.TripletMarginWithDistanceLoss(
        distance_function=cosine_similarity,
        margin=args.tripletloss_margin).cuda(args.gpu)

    torch.autograd.set_detect_anomaly(True)
    cudnn.benchmark = True

    epoch_time = AverageMeter('Epoch Time', ':6.3f')
    start_time = time.time()
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        ### Train for one epoch
        tr_loss, lr = train(train_loader, model, criterion, optimizer, epoch,
                            args)

        ### Evaluate on validation set
        val_loss = validate(val_loader, model, criterion, args)

        ### Remember best (lowest) validation loss and save checkpoint
        is_best = val_loss < best_loss
        best_loss = min(val_loss, best_loss)

        f1, mAP, mrr = test(test_loader, test_csv, model, args)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            train_loss.append(tr_loss)
            valid_loss.append(val_loss)
            lr_log.append(lr)
            f1_log.append(f1)
            mAP_log.append(mAP)
            mrr_log.append(mrr)

            df = pd.DataFrame({
                'train_loss': train_loss,
                'valid_loss': valid_loss,
                'lr_log': lr_log
            })
            log_file = os.path.join(args.log_dir, args.log_name)

            with open(log_file, "w") as f:
                df.to_csv(f)

            save_checkpoint(
                {
                    'epoch': epoch,
                    'model': args.model_name,
                    'state_dict': model.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'train_loss': train_loss,
                    'valid_loss': valid_loss,
                    'lr_log': lr_log,
                },
                args,
                is_best,
                filename='checkpoint_epoch{}.pth.tar'.format(epoch))

            epoch_time.update(time.time() - start_time, 1)
            print('Duration: %4f H, Left Time: %4f H' %
                  (epoch_time.sum / 3600, epoch_time.avg *
                   (args.epochs - epoch - 1) / 3600))
            start_time = time.time()

            df = pd.DataFrame({
                'f1_log': f1_log,
                'mAP': mAP_log,
                'mrr': mrr_log
            })

            log_file = os.path.join(args.log_dir, 'test_result.txt')
            with open(log_file, "w") as f:
                df.to_csv(f)

    return
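If the cosine_similarity passed to the criterion in Example #6 returns a raw similarity rather than a distance, a small helper like the following (an illustrative sketch, not part of the original code) converts it into the distance the loss expects:

import torch.nn.functional as F

def cosine_distance(x, y):
    """1 - cosine similarity: 0 for identical directions, 2 for opposite ones."""
    return 1.0 - F.cosine_similarity(x, y, dim=-1)

# criterion = nn.TripletMarginWithDistanceLoss(distance_function=cosine_distance,
#                                              margin=args.tripletloss_margin)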