Example #1
def get_data():
    train_data = get_dataset('url-versions-2015-06-14-clean-train.csv')
    X, y = split_data(train_data)
    X = p.pipeline.fit_transform(X)
    
    # hold out the first 1489 rows for training and the remainder as a validation split
    train_data1 = train_data[:1489]
    train_data2 = train_data[1489:]
    X1 = X[:1489]
    X2 = X[1489:]

    test_data = get_dataset('url-versions-2015-06-14-clean-test.csv')
    X_test, y_test = split_data(test_data)
    X_test = p.pipeline.transform(X_test)
    
    # return train / validation / test splits
    return (train_data1, X1, train_data2, X2, test_data, X_test)
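A hedged usage sketch of the splits returned above. The classifier choice and the label column (articleHeadlineStance, which other examples in this collection also use) are illustrative assumptions, not part of the original snippet.

from sklearn.linear_model import LogisticRegression

train_df, X_train, val_df, X_val, test_df, X_test = get_data()

# illustrative model fit on the train split, scored on validation and test
clf = LogisticRegression(max_iter=1000).fit(X_train, train_df.articleHeadlineStance)
print('validation accuracy:', clf.score(X_val, val_df.articleHeadlineStance))
print('test accuracy:', clf.score(X_test, test_df.articleHeadlineStance))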
Example #2
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache', help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)",
                        choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    # parser.add_argument("--model", type=str, default="gpt2", help="Model type (openai-gpt or gpt2)",
    #                     choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str, default="drive/My Drive/GPT-2_Text_Generation/model_checkpoint", help="Path, url or short name of the model")
    # parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2, help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")

    parser.add_argument("--no_sample", action='store_true', help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20, help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1, help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=int, default=0.7, help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0, help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (
        (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2'
        else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    )
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    personality_decoded = [tokenizer.decode(x) for x in personality]
    database.push_personality(personality_decoded)
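The function above is a script entry point; the assumed invocation (not shown in the truncated snippet) is the usual one:

if __name__ == '__main__':
    run()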
Example #3
def get_data_loaders(args, tokenizer):
    """ Prepare the dataset for training and evaluation """
    personachat = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for dataset_name, dataset in personachat.items():
        num_candidates = len(dataset[0]["utterances"][0]["candidates"])
        if args.num_candidates > 0 and dataset_name == 'train':
            num_candidates = min(args.num_candidates, num_candidates)
        for dialog in dataset:
            persona = dialog["personality"].copy()
            for _ in range(args.personality_permutations):
                for utterance in dialog["utterances"]:
                    history = utterance["history"][-(2 * args.max_history + 1):]
                    for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
                        lm_labels = bool(j == num_candidates - 1)
                        instance = build_input_from_segments(persona, history, candidate, tokenizer, lm_labels)
                        for input_name, input_array in instance.items():
                            datasets[dataset_name][input_name].append(input_array)
                    datasets[dataset_name]["mc_labels"].append(num_candidates - 1)
                    datasets[dataset_name]["n_candidates"] = num_candidates
                persona = [persona[-1]] + persona[:-1]  # permuted personalities

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            if input_name != "mc_labels":
                tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
            tensor_datasets[dataset_name].append(tensor)

    logger.info("Build train and validation dataloaders")
    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
                              shuffle=(not args.distributed))
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, shuffle=False)

    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
    return train_loader, valid_loader, train_sampler, valid_sampler
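A short usage sketch for the loaders built above (it assumes the same args and tokenizer objects the function already receives); the shape check mirrors the two log lines at the end of the function.

train_loader, valid_loader, train_sampler, valid_sampler = get_data_loaders(args, tokenizer)

# each batch carries one tensor per entry in MODEL_INPUTS,
# shaped (batch, n_candidates, seq_len) except for mc_labels
batch = next(iter(train_loader))
print([tuple(t.shape) for t in batch])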
Example #4
import os
import sys
try:
    import cPickle as pickle
except ImportError:
    import pickle

# make ../src importable, as in the sibling scripts in this collection
sys.path.append(os.path.join('..', 'src'))

from aligner import align
from model.utils import get_dataset, get_tokenized_lemmas


def _get_unaligned_tokens(tokens, alignment):
    aligned = [a - 1 for (a, _) in alignment]
    unaligned = [i for i in range(len(tokens)) if i not in aligned]
    return [tokens[i] for i in unaligned]


if __name__ == "__main__":
    df = get_dataset()
    data = {}

    for _, row in df.iterrows():
        article_hl_tok = get_tokenized_lemmas(row.articleHeadline)
        claim_hl_tok = get_tokenized_lemmas(row.claimHeadline)
        try:
            alignment = align(claim_hl_tok, article_hl_tok)
            data[(row.claimId, row.articleId)] = [(s - 1, t - 1)
                                                  for (s, t) in alignment[0]]
        except Exception:
            print('Unable to align', article_hl_tok, 'and', claim_hl_tok)
            print(row.articleId, row.claimId)

    with open(os.path.join('..', 'data', 'pickled', 'aligned-data.pickle'),
              'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
Example #5
_dataset_base_files = [
    'url-versions-2015-06-14-clean-test.csv',
    'url-versions-2015-06-14-clean-train.csv'
]

_no_folds = 10

_dataset_files = list(_dataset_base_files)
for fold in range(1, _no_folds + 1):
    _dataset_files.append(
        'url-versions-2015-06-14-clean-train-fold-{0:d}.csv'.format(fold))
    _dataset_files.append(
        'url-versions-2015-06-14-clean-test-fold-{0:d}.csv'.format(fold))

if __name__ == '__main__':
    for ds_base_filename in _dataset_base_files:
        df = get_dataset(ds_base_filename)
        df_fa = df.drop(df[df.articleHeadlineStance == 'observing'].index)
        output_filename = '{0:s}-rte-fa.xml'.format(
            ds_base_filename.split('.')[0])
        with open(os.path.join('..', 'data', 'emergent', output_filename),
                  'w') as f:
            f.write(_generate_xml(df_fa))

    for ds_filename in _dataset_files:
        df = get_dataset(ds_filename)
        output_filename = '{0:s}-rte.xml'.format(ds_filename.split('.')[0])
        with open(os.path.join('..', 'data', 'emergent', output_filename),
                  'w') as f:
            f.write(_generate_xml(df))
Example #6
import os

from model.utils import get_dataset, split_data
from model.cross_validation import ClaimKFold

if __name__ == '__main__':
    train_data = get_dataset('url-versions-2015-06-14-clean-train.csv')
    X, y = split_data(train_data)

    ckf = ClaimKFold(X)

    fold = 1
    for train_index, test_index in ckf:
        Z_test = X.iloc[test_index, :].copy()
        Z_test['articleHeadlineStance'] = y.iloc[test_index]
        Z_test.to_csv(os.path.join('..', 'data', 'emergent',
                                   'url-versions-2015-06-14-clean-test-fold-{0:d}.csv'.format(fold)))

        Z_train = X.iloc[train_index, :].copy()
        Z_train['articleHeadlineStance'] = y.iloc[train_index]
        Z_train.to_csv(os.path.join('..', 'data', 'emergent',
                                    'url-versions-2015-06-14-clean-train-fold-{0:d}.csv'.format(fold)))

        fold += 1

Example #7
    model_dict = model.state_dict()
    if args.init_weights is not None:
        pretrained_dict = torch.load(args.init_weights)['params']
        # remove weights for FC
        # pretrained_dict = {'encoder.'+k: v for k, v in pretrained_dict.items()}
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items()
            if k in model_dict and 'fc' not in k
        }
        print(pretrained_dict.keys())
        model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)
    model.eval()

    trainset = get_dataset(args.dataset, 'train', args.unsupervised, args)
    loader = DataLoader(dataset=trainset,
                        batch_size=args.batch_size,
                        shuffle=False,
                        num_workers=args.num_workers,
                        pin_memory=True)

    embs = []
    labels = []
    with torch.no_grad():
        for i, batch in tqdm(enumerate(loader, 1),
                             total=len(loader),
                             desc='embedding'):
            if torch.cuda.is_available():
                data, label = batch[0].cuda(), batch[1]
            else:
                # no GPU available: keep the batch on the CPU (assumed
                # continuation; the original example is truncated here)
                data, label = batch[0], batch[1]
Example #8
import os
import operator as op
import sys
try:
    import cPickle as pickle
except ImportError:
    import pickle

sys.path.append(os.path.join('..', 'src'))

import numpy as np

from model.utils import get_dataset, get_w2v_model, convert_text_to_vec, W2VEC_SIZE

if __name__ == "__main__":
    df = get_dataset('url-versions-2015-06-14-clean-with-body.csv')
    data = ({}, {})
    add_data, mult_data = data
    model = get_w2v_model()

    for _, row in df.iterrows():
        add_data[row.claimId] = convert_text_to_vec(model, row.claimHeadline)
        add_data[row.articleId] = convert_text_to_vec(model,
                                                      row.articleHeadline)

        grp_mult = (np.ones(W2VEC_SIZE), op.mul)
        mult_data[row.claimId] = convert_text_to_vec(model, row.claimHeadline,
                                                     grp_mult)
        mult_data[row.articleId] = convert_text_to_vec(model,
                                                       row.articleHeadline,
                                                       grp_mult)
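The example stops before the vectors are written out; a plausible continuation, hedged as an assumption and mirroring the pickling pattern the other scripts in this collection use (the output filename is illustrative), would be:

    with open(os.path.join('..', 'data', 'pickled', 'w2v-vectors.pickle'),
              'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)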
Example #9
import sys
import os

sys.path.append(os.path.join('..', 'src'))

from model.utils import get_dataset, split_data, run_test
from model.baseline.baseline_predictors import ProbabilityPredictor, ChancePredictor, \
    MajorityPredictor, WordOverlapBaselinePredictor


if __name__ == '__main__':
    train_data = get_dataset('url-versions-2015-06-14-clean-train.csv')
    X, y = split_data(train_data)
    test_data = get_dataset('url-versions-2015-06-14-clean-test.csv')

    print('\n>> Chance predictor <<\n')
    print(run_test(X, y, test_data, ChancePredictor()))

    print('\n>> Majority predictor <<\n')
    print(run_test(X, y, test_data, MajorityPredictor()))

    print('\n>> Probability predictor <<\n')
    print(run_test(X, y, test_data, ProbabilityPredictor()))

    print('\n>> Word overlap predictor <<\n')
    print(run_test(X, y, test_data, WordOverlapBaselinePredictor()))
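The same comparison can be expressed as a loop over a dict of baselines instead of four repeated print/run_test pairs; this is a purely illustrative restructuring with identical behaviour:

    baselines = {
        'Chance': ChancePredictor(),
        'Majority': MajorityPredictor(),
        'Probability': ProbabilityPredictor(),
        'Word overlap': WordOverlapBaselinePredictor(),
    }
    for name, predictor in baselines.items():
        print('\n>> {} predictor <<\n'.format(name))
        print(run_test(X, y, test_data, predictor))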
Example #10
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:

        def print_pass(*args):
            pass

        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend,
                                init_method=args.dist_url,
                                world_size=args.world_size,
                                rank=args.rank)
    # create model
    print("=> creating model '{}'".format(args.backbone_class))
    if args.backbone_class == 'ConvNet':
        from model.networks.convnet import ConvNet as base_encoder
        hdim = 64
    elif args.backbone_class == 'Res12':
        hdim = 640
        from model.networks.res12 import ResNet as base_encoder
    elif args.backbone_class == 'Res18':
        hdim = 512
        from model.networks.res18 import ResNet as base_encoder
    else:
        hdim = 640
        from model.networks.WRN28 import Wide_ResNet as base_encoder

    model = moco.builder.MoCo(base_encoder, hdim, args.moco_k, args.moco_m,
                              args.moco_t, args.mlp)
    print(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    # traindir = os.path.join(args.data, 'train')
    # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
    #                                  std=[0.229, 0.224, 0.225])
    # if args.aug_plus:
    #     # MoCo v2's aug: similar to SimCLR https://arxiv.org/abs/2002.05709
    #     augmentation = [
    #         transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
    #         transforms.RandomApply([
    #             transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)  # not strengthened
    #         ], p=0.8),
    #         transforms.RandomGrayscale(p=0.2),
    #         transforms.RandomApply([moco.loader.GaussianBlur([.1, 2.])], p=0.5),
    #         transforms.RandomHorizontalFlip(),
    #         transforms.ToTensor(),
    #         normalize
    #     ]
    # else:
    #     # MoCo v1's aug: the same as InstDisc https://arxiv.org/abs/1805.01978
    #     augmentation = [
    #         transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
    #         transforms.RandomGrayscale(p=0.2),
    #         transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
    #         transforms.RandomHorizontalFlip(),
    #         transforms.ToTensor(),
    #         normalize
    #     ]

    # train_dataset = datasets.ImageFolder(
    #     traindir,
    #     moco.loader.TwoCropsTransform(transforms.Compose(augmentation)))
    train_dataset = get_dataset(args.dataset, 'train', False, args, 'moco')

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed
                and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.backbone_class,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                is_best=False,
                filename='checkpoint_{:04d}.pth.tar'.format(epoch))
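main_worker is written to be spawned once per GPU; a minimal launcher sketch in the usual MoCo style (the exact set of args fields used here is an assumption carried over from the function above):

import torch
import torch.multiprocessing as mp

def main(args):
    ngpus_per_node = torch.cuda.device_count()
    if args.multiprocessing_distributed:
        # one process per GPU; world_size becomes the total process count
        args.world_size = ngpus_per_node * args.world_size
        mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args))
    else:
        # single-process path: run directly on the selected GPU
        main_worker(args.gpu, ngpus_per_node, args)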
Example #11
def get_dataloader(args):
    num_device = torch.cuda.device_count()
    num_episodes = args.episodes_per_epoch * num_device if args.multi_gpu else args.episodes_per_epoch
    num_workers = args.num_workers * num_device if args.multi_gpu else args.num_workers
    if args.additional == 'Mixed':
        from model.dataloader.mix_dataset import MixedDatasetWrapper
        trainset = get_dataset(args.dataset,
                               'train',
                               True,
                               args,
                               augment=args.augment)
        # args.num_class = unsupervised_trainset.num_class
        unsupervised_loader = DataLoader(dataset=trainset,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         num_workers=num_workers,
                                         collate_fn=examplar_collate,
                                         pin_memory=True,
                                         drop_last=True)
        supervised_trainset = get_dataset(args.dataset,
                                          'train',
                                          False,
                                          args,
                                          augment=args.augment)
        args.num_classes = min(len(supervised_trainset.wnids),
                               args.num_classes)
        train_sampler = CategoriesSampler(supervised_trainset.label,
                                          num_episodes,
                                          max(args.way, args.num_classes),
                                          args.shot + args.query)

        supervised_loader = DataLoader(dataset=supervised_trainset,
                                       num_workers=num_workers,
                                       batch_sampler=train_sampler,
                                       pin_memory=True)
        dataset = MixedDatasetWrapper(supervised_loader, unsupervised_loader)
        train_loader = DataLoader(dataset=dataset,
                                  batch_size=1,
                                  shuffle=True,
                                  num_workers=num_workers,
                                  pin_memory=True)
    else:
        if args.finetune:
            split = 'train_%d_%d' % (args.finetune_ways,
                                     args.samples_per_class)
        else:
            split = 'train'
        trainset = get_dataset(args.dataset,
                               split,
                               args.unsupervised,
                               args,
                               augment=args.augment)
        args.num_classes = min(len(trainset.wnids), args.num_classes)
        if args.unsupervised:
            train_loader = DataLoader(dataset=trainset,
                                      batch_size=args.batch_size,
                                      shuffle=True,
                                      num_workers=num_workers,
                                      collate_fn=examplar_collate,
                                      pin_memory=True,
                                      drop_last=True)
        else:
            train_sampler = CategoriesSampler(trainset.label, num_episodes,
                                              max(args.way, args.num_classes),
                                              args.shot + args.query)

            train_loader = DataLoader(dataset=trainset,
                                      num_workers=num_workers,
                                      batch_sampler=train_sampler,
                                      pin_memory=True)
    if args.model_class == 'DummyProto':
        from model.dataloader.dummy_loader import DummyWrapper
        train_loader = DummyWrapper(args.dummy_samples, train_loader)
    # if args.multi_gpu and num_device > 1:
    # train_loader = MultiGPUDataloader(train_loader, num_device)
    # args.way = args.way * num_device

    valset = get_dataset(args.dataset, 'val', args.unsupervised, args)
    # val_sampler = CategoriesSampler(valset.label,
    #                                 args.num_eval_episodes,
    #                                 args.eval_way, args.eval_shot + args.eval_query)
    # val_loader = DataLoader(dataset=valset,
    #                         batch_sampler=val_sampler,
    #                         num_workers=args.num_workers,
    #                         pin_memory=True)
    #
    testsets = dict(((n, get_dataset(n, 'test', args.unsupervised, args))
                     for n in args.eval_dataset.split(',')))
    # testsets = TestDataset('test', args.unsupervised, args)
    # test_sampler = CategoriesSampler(testset.label,
    #                                  10000,  # args.num_eval_episodes,
    #                                  args.eval_way, args.eval_shot + args.eval_query)
    # test_loader = DataLoader(dataset=testset,
    #                          batch_sampler=test_sampler,
    #                          num_workers=args.num_workers,
    #                          pin_memory=True)
    args.image_shape = trainset.image_shape
    return train_loader, valset, testsets
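A hedged usage sketch for get_dataloader (the args namespace is assumed to carry the same fields the function already reads):

train_loader, valset, testsets = get_dataloader(args)
print('train batches per epoch:', len(train_loader))
for name, testset in testsets.items():
    print('eval dataset {}: {} samples'.format(name, len(testset)))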
Example #12
    parser.add_argument(
        '-f',
        default="RootDep,Q,PPDB,BoUg,BoBg,SVO,BoW-B,BoW-S,W2V,NegAlgn",
        type=str)
    parser.add_argument('-t', action='store_true')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-i', action='store_true')
    group.add_argument('-a', action='store_true')

    args = parser.parse_args()

    # When running original project, use LogitPredictor
    predictor = LogitPredictor
    # predictor = ShowDownPredictor

    train_data = get_dataset(
        'url-versions-2015-06-14-clean-with-body-train-with-body.csv')
    X, y = split_data(train_data)
    # Split the dataset in two equal parts
    # X_train, X_test, y_train, y_test = train_test_split(
    #     X, y, test_size=0.5, random_state=0)

    test_data = get_dataset(
        'url-versions-2015-06-14-clean-with-body-test-with-body.csv')

    transforms = {
        'CosSim': CosSimTransform,
        'BoW-B': BoWBTransform,
        'BoW-S': BoWSTransform,
        'BoW': BoWTransform,
        'BoUg': BoUgTransform,
        'BoBg': BoBgTransform,
Example #13
def calc_hungarian_alignment_score(s, t):
    """Calculate the alignment score between texts s and t using the
    implementation of the Hungarian (Munkres) assignment algorithm
    provided in https://pypi.python.org/pypi/munkres/."""
    s_toks = get_tokenized_lemmas(s)
    t_toks = get_tokenized_lemmas(t)

    df = pd.DataFrame(index=s_toks, columns=t_toks, data=0.)

    for c in s_toks:
        for a in t_toks:
            df.loc[c, a] = compute_paraphrase_score(c, a)

    matrix = df.values
    cost_matrix = make_cost_matrix(matrix, lambda cost: _max_ppdb_score - cost)

    indexes = _munk.compute(cost_matrix)
    total = 0.0
    for row, column in indexes:
        value = matrix[row][column]
        total += value
    return indexes, total / float(np.min(matrix.shape))


if __name__ == "__main__":
    df = get_dataset()
    data = {}

    for _, row in df.iterrows():
        data[(row.claimId, row.articleId)] = calc_hungarian_alignment_score(row.claimHeadline,
                                                                            row.articleHeadline)

    with open(os.path.join('..', 'data', 'pickled', 'hungarian-alignment-score.pickle'), 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
Example #14
        xml += '<t>{0:s}</t>\n'.format(row.articleHeadline)
        xml += '<h>{0:s}</h>\n'.format(row.claimHeadline)
        xml += '</pair>\n'
    xml += '</entailment-corpus>\n'
    return xml

_dataset_base_files = ['url-versions-2015-06-14-clean-test.csv',
                       'url-versions-2015-06-14-clean-train.csv']

_no_folds = 10

_dataset_files = list(_dataset_base_files)
for fold in range(1, _no_folds+1):
    _dataset_files.append('url-versions-2015-06-14-clean-train-fold-{0:d}.csv'.format(fold))
    _dataset_files.append('url-versions-2015-06-14-clean-test-fold-{0:d}.csv'.format(fold))

if __name__ == '__main__':
    for ds_base_filename in _dataset_base_files:
        df = get_dataset(ds_base_filename)
        df_fa = df.drop(df[df.articleHeadlineStance == 'observing'].index)
        output_filename = '{0:s}-rte-fa.xml'.format(ds_base_filename.split('.')[0])
        with open(os.path.join('..', 'data', 'emergent', output_filename), 'w') as f:
            f.write(_generate_xml(df_fa))

    for ds_filename in _dataset_files:
        df = get_dataset(ds_filename)
        output_filename = '{0:s}-rte.xml'.format(ds_filename.split('.')[0])
        with open(os.path.join('..', 'data', 'emergent', output_filename), 'w') as f:
            f.write(_generate_xml(df))

Example #15
    model_dict = model.state_dict()
    if args.init_weights is not None:
        pretrained_dict = torch.load(args.init_weights)['params']
        # remove weights for FC
        # pretrained_dict = {'encoder.'+k: v for k, v in pretrained_dict.items()}
        pretrained_dict = {
            k: v
            for k, v in pretrained_dict.items()
            if k in model_dict and 'fc' not in k
        }
        print(pretrained_dict.keys())
        model_dict.update(pretrained_dict)
    model.load_state_dict(model_dict)
    model.eval()

    trainset = get_dataset('MiniImageNet', 'train', False, args)

    class_mean = get_class_mean('MiniImageNet', args.backbone_class, trainset)
    # testsets = dict(((n, get_dataset(n, 'test', args.unsupervised, args)) for n in args.eval_dataset.split(',')))
    ensemble_result = []
    for n in args.datasets:
        print('----------- test on {} --------------'.format(n))
        valset = get_dataset(n, 'val', args.unsupervised, args)
        for i, (args.way, args.shot) in enumerate(valset.eval_setting):
            # train best gamma
            # valset = Dataset('val', args.unsupervised, args)
            valset = get_dataset(n, 'val', args.unsupervised, args)
            val_sampler = CategoriesSampler(valset.label, 500,
                                            min(args.way, valset.num_class),
                                            args.shot + args.query)
            val_loader = DataLoader(dataset=valset,  # assumed continuation of the truncated example
                                    batch_sampler=val_sampler,
                                    num_workers=args.num_workers,
                                    pin_memory=True)
Example #16
def get_snopes():
    test_data = get_dataset("my_claims_csv_cleaned.csv")
    X_test, y_test = split_data(test_data)
    X_test = p.pipeline.transform(X_test)
    
    return test_data, X_test
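A hedged usage sketch: the returned frame and feature matrix would feed a classifier trained elsewhere on the Emergent features (clf and the predicted-stance column are illustrative, not part of the original example).

test_data, X_test = get_snopes()
# predictions = clf.predict(X_test)            # clf: a previously trained model
# test_data['predictedStance'] = predictions
# print(test_data[['claimHeadline', 'articleHeadline', 'predictedStance']])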