def get_data():
    train_data = get_dataset('url-versions-2015-06-14-clean-train.csv')
    X, y = split_data(train_data)
    X = p.pipeline.fit_transform(X)

    train_data1 = train_data[:1489]
    train_data2 = train_data[1489:]
    X1 = X[:1489]
    X2 = X[1489:]

    test_data = get_dataset('url-versions-2015-06-14-clean-test.csv')
    X_test, y_test = split_data(test_data)
    X_test = p.pipeline.transform(X_test)

    # return train / validation / test
    return (train_data1, X1, train_data2, X2, test_data, X_test)
def run():
    parser = ArgumentParser()
    parser.add_argument("--dataset_path", type=str, default="",
                        help="Path or url of the dataset. If empty download from S3.")
    parser.add_argument("--dataset_cache", type=str, default='./dataset_cache',
                        help="Path or url of the dataset cache")
    parser.add_argument("--model", type=str, default="openai-gpt", help="Model type (openai-gpt or gpt2)",
                        choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    # parser.add_argument("--model", type=str, default="gpt2", help="Model type (openai-gpt or gpt2)",
    #                     choices=['openai-gpt', 'gpt2'])  # anything besides gpt2 will load openai-gpt
    parser.add_argument("--model_checkpoint", type=str,
                        default="drive/My Drive/GPT-2_Text_Generation/model_checkpoint",
                        help="Path, url or short name of the model")
    # parser.add_argument("--model_checkpoint", type=str, default="", help="Path, url or short name of the model")
    parser.add_argument("--max_history", type=int, default=2,
                        help="Number of previous utterances to keep in history")
    parser.add_argument("--device", type=str, default="cuda" if torch.cuda.is_available() else "cpu",
                        help="Device (cuda or cpu)")
    parser.add_argument("--no_sample", action='store_true',
                        help="Set to use greedy decoding instead of sampling")
    parser.add_argument("--max_length", type=int, default=20,
                        help="Maximum length of the output utterances")
    parser.add_argument("--min_length", type=int, default=1,
                        help="Minimum length of the output utterances")
    parser.add_argument("--seed", type=int, default=0, help="Seed")
    parser.add_argument("--temperature", type=float, default=0.7,
                        help="Sampling softmax temperature")
    parser.add_argument("--top_k", type=int, default=0,
                        help="Filter top-k tokens before sampling (<=0: no filtering)")
    parser.add_argument("--top_p", type=float, default=0.9,
                        help="Nucleus filtering (top-p) before sampling (<=0.0: no filtering)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__file__)
    logger.info(pformat(args))

    if args.model_checkpoint == "":
        if args.model == 'gpt2':
            raise ValueError("Interacting with GPT2 requires passing a finetuned model_checkpoint")
        else:
            args.model_checkpoint = download_pretrained_model()

    if args.seed != 0:
        random.seed(args.seed)
        torch.random.manual_seed(args.seed)
        torch.cuda.manual_seed(args.seed)

    logger.info("Get pretrained model and tokenizer")
    tokenizer_class, model_class = (GPT2Tokenizer, GPT2LMHeadModel) if args.model == 'gpt2' \
        else (OpenAIGPTTokenizer, OpenAIGPTLMHeadModel)
    tokenizer = tokenizer_class.from_pretrained(args.model_checkpoint)
    model = model_class.from_pretrained(args.model_checkpoint)
    model.to(args.device)
    add_special_tokens_(model, tokenizer)

    logger.info("Sample a personality")
    dataset = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)
    personalities = [dialog["personality"] for dataset in dataset.values() for dialog in dataset]
    personality = random.choice(personalities)
    personality_decoded = [tokenizer.decode(x) for x in personality]
    database.push_personality(personality_decoded)
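# The --top_k / --top_p arguments above control how the next-token distribution is
# truncated before sampling. A minimal, self-contained sketch of that filtering step
# for a 1D logit vector (a standard top-k / nucleus filter; not necessarily the exact
# helper this script imports):
import torch
import torch.nn.functional as F


def top_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Filter a 1D tensor of logits with top-k and/or nucleus (top-p) filtering."""
    assert logits.dim() == 1
    top_k = min(top_k, logits.size(-1))
    if top_k > 0:
        # mask every logit smaller than the k-th largest one
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        # shift right so the first token above the threshold is still kept
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits


# usage: apply temperature, filter, then sample one token id
logits = torch.randn(50257)  # e.g. a GPT-2 vocabulary-sized logit vector
probs = F.softmax(top_filtering(logits / 0.7, top_k=0, top_p=0.9), dim=-1)
next_token = torch.multinomial(probs, num_samples=1)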
def get_data_loaders(args, tokenizer):
    """ Prepare the dataset for training and evaluation """
    personachat = get_dataset(tokenizer, args.dataset_path, args.dataset_cache)

    logger.info("Build inputs and labels")
    datasets = {"train": defaultdict(list), "valid": defaultdict(list)}
    for dataset_name, dataset in personachat.items():
        num_candidates = len(dataset[0]["utterances"][0]["candidates"])
        if args.num_candidates > 0 and dataset_name == 'train':
            num_candidates = min(args.num_candidates, num_candidates)
        for dialog in dataset:
            persona = dialog["personality"].copy()
            for _ in range(args.personality_permutations):
                for utterance in dialog["utterances"]:
                    history = utterance["history"][-(2 * args.max_history + 1):]
                    for j, candidate in enumerate(utterance["candidates"][-num_candidates:]):
                        lm_labels = bool(j == num_candidates - 1)
                        instance = build_input_from_segments(persona, history, candidate, tokenizer, lm_labels)
                        for input_name, input_array in instance.items():
                            datasets[dataset_name][input_name].append(input_array)
                    datasets[dataset_name]["mc_labels"].append(num_candidates - 1)
                    datasets[dataset_name]["n_candidates"] = num_candidates
                persona = [persona[-1]] + persona[:-1]  # permuted personalities

    logger.info("Pad inputs and convert to Tensor")
    tensor_datasets = {"train": [], "valid": []}
    for dataset_name, dataset in datasets.items():
        dataset = pad_dataset(dataset, padding=tokenizer.convert_tokens_to_ids(SPECIAL_TOKENS[-1]))
        for input_name in MODEL_INPUTS:
            tensor = torch.tensor(dataset[input_name])
            if input_name != "mc_labels":
                tensor = tensor.view((-1, datasets[dataset_name]["n_candidates"]) + tensor.shape[1:])
            tensor_datasets[dataset_name].append(tensor)

    logger.info("Build train and validation dataloaders")
    train_dataset, valid_dataset = TensorDataset(*tensor_datasets["train"]), TensorDataset(*tensor_datasets["valid"])
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
    train_loader = DataLoader(train_dataset, sampler=train_sampler,
                              batch_size=args.train_batch_size, shuffle=(not args.distributed))
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler,
                              batch_size=args.valid_batch_size, shuffle=False)

    logger.info("Train dataset (Batch, Candidates, Seq length): {}".format(train_dataset.tensors[0].shape))
    logger.info("Valid dataset (Batch, Candidates, Seq length): {}".format(valid_dataset.tensors[0].shape))
    return train_loader, valid_loader, train_sampler, valid_sampler
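# pad_dataset is not shown above; presumably it right-pads every input list to a
# common length with the given padding id before the lists are turned into tensors
# (label lists are often padded with an ignore value such as -100 instead). A
# minimal sketch of the padding step under that assumption:
def pad_sequences(sequences, pad_value):
    """Right-pad each list in `sequences` to the length of the longest one."""
    max_len = max(len(s) for s in sequences)
    return [s + [pad_value] * (max_len - len(s)) for s in sequences]


# usage
pad_sequences([[5, 6], [7]], pad_value=0)   # [[5, 6], [7, 0]]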
import os

try:
    import cPickle as pickle
except:
    import pickle

from aligner import align
from model.utils import get_dataset, get_tokenized_lemmas


def _get_unaligned_tokens(tokens, alignment):
    aligned = [a - 1 for (a, _) in alignment]
    unaligned = [i for i in range(len(tokens)) if i not in aligned]
    return [tokens[i] for i in unaligned]


if __name__ == "__main__":
    df = get_dataset()
    data = {}
    for id, row in df.iterrows():
        article_hl_tok = get_tokenized_lemmas(row.articleHeadline)
        claim_hl_tok = get_tokenized_lemmas(row.claimHeadline)
        try:
            alignment = align(claim_hl_tok, article_hl_tok)
            data[(row.claimId, row.articleId)] = [(s - 1, t - 1) for (s, t) in alignment[0]]
        except:
            print('Unable to align', article_hl_tok, 'and', claim_hl_tok)
            print(row.articleId, row.claimId)

    with open(os.path.join('..', 'data', 'pickled', 'aligned-data.pickle'), 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
_dataset_base_files = [
    'url-versions-2015-06-14-clean-test.csv',
    'url-versions-2015-06-14-clean-train.csv'
]
_no_folds = 10

_dataset_files = list(_dataset_base_files)
for fold in range(1, _no_folds + 1):
    _dataset_files.append(
        'url-versions-2015-06-14-clean-train-fold-{0:d}.csv'.format(fold))
    _dataset_files.append(
        'url-versions-2015-06-14-clean-test-fold-{0:d}.csv'.format(fold))

if __name__ == '__main__':
    for ds_base_filename in _dataset_base_files:
        df = get_dataset(ds_base_filename)
        df_fa = df.drop(df[df.articleHeadlineStance == 'observing'].index)
        output_filename = '{0:s}-rte-fa.xml'.format(
            ds_base_filename.split('.')[0])
        with open(os.path.join('..', 'data', 'emergent', output_filename), 'w') as f:
            f.write(_generate_xml(df_fa))

    for ds_filename in _dataset_files:
        df = get_dataset(ds_filename)
        output_filename = '{0:s}-rte.xml'.format(ds_filename.split('.')[0])
        with open(os.path.join('..', 'data', 'emergent', output_filename), 'w') as f:
            f.write(_generate_xml(df))
import os

from model.utils import get_dataset, split_data
from model.cross_validation import ClaimKFold

if __name__ == '__main__':
    train_data = get_dataset('url-versions-2015-06-14-clean-train.csv')
    X, y = split_data(train_data)
    ckf = ClaimKFold(X)

    fold = 1
    for train_index, test_index in ckf:
        Z_test = X.iloc[test_index, :].copy()
        Z_test['articleHeadlineStance'] = y.iloc[test_index]
        Z_test.to_csv(os.path.join('..', 'data', 'emergent',
                                   'url-versions-2015-06-14-clean-test-fold-{0:d}.csv'.format(fold)))

        Z_train = X.iloc[train_index, :].copy()
        Z_train['articleHeadlineStance'] = y.iloc[train_index]
        Z_train.to_csv(os.path.join('..', 'data', 'emergent',
                                    'url-versions-2015-06-14-clean-train-fold-{0:d}.csv'.format(fold)))
        fold += 1
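# ClaimKFold above groups rows by claim, so articles about the same claim never end
# up in both the train and the test fold. A self-contained sketch of the same idea
# using scikit-learn's GroupKFold on toy data (ClaimKFold's exact behaviour, e.g.
# ordering, may differ):
import pandas as pd
from sklearn.model_selection import GroupKFold

toy = pd.DataFrame({
    'claimId': ['c1', 'c1', 'c2', 'c2', 'c3', 'c3'],
    'articleHeadline': ['a', 'b', 'c', 'd', 'e', 'f'],
    'articleHeadlineStance': ['for', 'against', 'for', 'observing', 'for', 'against'],
})
gkf = GroupKFold(n_splits=3)
for train_index, test_index in gkf.split(toy, groups=toy['claimId']):
    # every claimId lands entirely in either the train or the test split
    assert set(toy.claimId.iloc[train_index]).isdisjoint(set(toy.claimId.iloc[test_index]))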
model_dict = model.state_dict()
if args.init_weights is not None:
    pretrained_dict = torch.load(args.init_weights)['params']
    # remove weights for FC
    # pretrained_dict = {'encoder.'+k: v for k, v in pretrained_dict.items()}
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items() if k in model_dict and 'fc' not in k
    }
    print(pretrained_dict.keys())
    model_dict.update(pretrained_dict)
model.load_state_dict(model_dict)
model.eval()

trainset = get_dataset(args.dataset, 'train', args.unsupervised, args)
loader = DataLoader(dataset=trainset,
                    batch_size=args.batch_size,
                    shuffle=False,
                    num_workers=args.num_workers,
                    pin_memory=True)

embs = []
labels = []
with torch.no_grad():
    for i, batch in tqdm(enumerate(loader, 1), total=len(loader), desc='embedding'):
        if torch.cuda.is_available():
            data, label = batch[0].cuda(), batch[1]
        else:
            data, label = batch[0], batch[1]  # keep tensors on CPU
import os import operator as op import sys try: import cPickle as pickle except: import pickle sys.path.append(os.path.join('..', 'src')) import numpy as np from model.utils import get_dataset, get_w2v_model, convert_text_to_vec, W2VEC_SIZE if __name__ == "__main__": df = get_dataset('url-versions-2015-06-14-clean-with-body.csv') data = ({}, {}) add_data, mult_data = data model = get_w2v_model() for id, row in df.iterrows(): add_data[row.claimId] = convert_text_to_vec(model, row.claimHeadline) add_data[row.articleId] = convert_text_to_vec(model, row.articleHeadline) grp_mult = (np.ones(W2VEC_SIZE), op.mul) mult_data[row.claimId] = convert_text_to_vec(model, row.claimHeadline, grp_mult) mult_data[row.articleId] = convert_text_to_vec(model, row.articleHeadline, grp_mult)
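# convert_text_to_vec is not shown here; the (np.ones(W2VEC_SIZE), op.mul) pair above
# suggests it folds a headline's word2vec vectors with a given (identity, operator)
# pair, defaulting to element-wise addition. A self-contained sketch of that
# composition idea with toy embeddings (the real helper and W2VEC_SIZE are assumptions):
import operator as op
from functools import reduce

import numpy as np

TOY_VEC_SIZE = 4
toy_vectors = {
    'rain': np.array([0.1, 0.2, 0.3, 0.4]),
    'likely': np.array([0.4, 0.3, 0.2, 0.1]),
}


def compose_text_vec(vectors, tokens, identity, operator):
    """Fold the token vectors with `operator`, starting from `identity`."""
    present = [vectors[t] for t in tokens if t in vectors]
    return reduce(operator, present, identity)


additive = compose_text_vec(toy_vectors, ['rain', 'likely'], np.zeros(TOY_VEC_SIZE), op.add)
multiplicative = compose_text_vec(toy_vectors, ['rain', 'likely'], np.ones(TOY_VEC_SIZE), op.mul)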
import sys
import os

sys.path.append(os.path.join('..', 'src'))

from model.utils import get_dataset, split_data, run_test
from model.baseline.baseline_predictors import ProbabilityPredictor, ChancePredictor, \
    MajorityPredictor, WordOverlapBaselinePredictor

if __name__ == '__main__':
    train_data = get_dataset('url-versions-2015-06-14-clean-train.csv')
    X, y = split_data(train_data)
    test_data = get_dataset('url-versions-2015-06-14-clean-test.csv')

    print('\n>> Chance predictor <<\n')
    print(run_test(X, y, test_data, ChancePredictor()))

    print('\n>> Majority predictor <<\n')
    print(run_test(X, y, test_data, MajorityPredictor()))

    print('\n>> Probability predictor <<\n')
    print(run_test(X, y, test_data, ProbabilityPredictor()))

    print('\n>> Word overlap predictor <<\n')
    print(run_test(X, y, test_data, WordOverlapBaselinePredictor()))
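# The chance and majority baselines run above can be sketched in a few lines with
# scikit-learn's DummyClassifier (an illustration on toy labels, not the project's
# predictor classes, which are not shown here):
from sklearn.dummy import DummyClassifier

y_toy = ['for', 'for', 'against', 'observing']
X_toy = [[0]] * len(y_toy)                      # features are ignored by these baselines
majority = DummyClassifier(strategy='most_frequent').fit(X_toy, y_toy)
chance = DummyClassifier(strategy='stratified', random_state=0).fit(X_toy, y_toy)
majority.predict([[0], [0]])                    # always predicts the majority class, 'for'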
def main_worker(gpu, ngpus_per_node, args):
    args.gpu = gpu

    # suppress printing if not master
    if args.multiprocessing_distributed and args.gpu != 0:
        def print_pass(*args):
            pass
        builtins.print = print_pass

    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    print("=> creating model '{}'".format(args.backbone_class))
    if args.backbone_class == 'ConvNet':
        from model.networks.convnet import ConvNet as base_encoder
        hdim = 64
    elif args.backbone_class == 'Res12':
        hdim = 640
        from model.networks.res12 import ResNet as base_encoder
    elif args.backbone_class == 'Res18':
        hdim = 512
        from model.networks.res18 import ResNet as base_encoder
    else:
        hdim = 640
        from model.networks.WRN28 import Wide_ResNet as base_encoder

    model = moco.builder.MoCo(base_encoder, hdim, args.moco_k, args.moco_m, args.moco_t, args.mlp)
    print(model)

    if args.distributed:
        # For multiprocessing distributed, DistributedDataParallel constructor
        # should always set the single device scope, otherwise,
        # DistributedDataParallel will use all available devices.
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process and per
            # DistributedDataParallel, we need to divide the batch size
            # ourselves based on the total number of GPUs we have
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int(
                (args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(
                model, device_ids=[args.gpu])
        else:
            model.cuda()
            # DistributedDataParallel will divide and allocate batch_size to all
            # available GPUs if device_ids are not set
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
        # comment out the following line for debugging
        # raise NotImplementedError("Only DistributedDataParallel is supported.")
    else:
        # AllGather implementation (batch shuffle, queue update, etc.) in
        # this code only supports DistributedDataParallel.
        raise NotImplementedError("Only DistributedDataParallel is supported.")

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    # traindir = os.path.join(args.data, 'train')
    # normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
    #                                  std=[0.229, 0.224, 0.225])
    # if args.aug_plus:
    #     # MoCo v2's aug: similar to SimCLR https://arxiv.org/abs/2002.05709
    #     augmentation = [
    #         transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
    #         transforms.RandomApply([
    #             transforms.ColorJitter(0.4, 0.4, 0.4, 0.1)  # not strengthened
    #         ], p=0.8),
    #         transforms.RandomGrayscale(p=0.2),
    #         transforms.RandomApply([moco.loader.GaussianBlur([.1, 2.])], p=0.5),
    #         transforms.RandomHorizontalFlip(),
    #         transforms.ToTensor(),
    #         normalize
    #     ]
    # else:
    #     # MoCo v1's aug: the same as InstDisc https://arxiv.org/abs/1805.01978
    #     augmentation = [
    #         transforms.RandomResizedCrop(224, scale=(0.2, 1.)),
    #         transforms.RandomGrayscale(p=0.2),
    #         transforms.ColorJitter(0.4, 0.4, 0.4, 0.4),
    #         transforms.RandomHorizontalFlip(),
    #         transforms.ToTensor(),
    #         normalize
    #     ]
    # train_dataset = datasets.ImageFolder(
    #     traindir,
    #     moco.loader.TwoCropsTransform(transforms.Compose(augmentation)))
    train_dataset = get_dataset(args.dataset, 'train', False, args, 'moco')

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=args.batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=args.workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, args)

        if not args.multiprocessing_distributed or (
                args.multiprocessing_distributed and args.rank % ngpus_per_node == 0):
            save_checkpoint(
                {
                    'epoch': epoch + 1,
                    'arch': args.backbone_class,
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                },
                is_best=False,
                filename='checkpoint_{:04d}.pth.tar'.format(epoch))
def get_dataloader(args):
    num_device = torch.cuda.device_count()
    num_episodes = args.episodes_per_epoch * num_device if args.multi_gpu else args.episodes_per_epoch
    num_workers = args.num_workers * num_device if args.multi_gpu else args.num_workers

    if args.additional == 'Mixed':
        from model.dataloader.mix_dataset import MixedDatasetWrapper

        trainset = get_dataset(args.dataset, 'train', True, args, augment=args.augment)
        # args.num_class = unsupervised_trainset.num_class
        unsupervised_loader = DataLoader(dataset=trainset,
                                         batch_size=args.batch_size,
                                         shuffle=True,
                                         num_workers=num_workers,
                                         collate_fn=examplar_collate,
                                         pin_memory=True,
                                         drop_last=True)

        supervised_trainset = get_dataset(args.dataset, 'train', False, args, augment=args.augment)
        args.num_classes = min(len(supervised_trainset.wnids), args.num_classes)
        train_sampler = CategoriesSampler(supervised_trainset.label,
                                          num_episodes,
                                          max(args.way, args.num_classes),
                                          args.shot + args.query)
        supervised_loader = DataLoader(dataset=supervised_trainset,
                                       num_workers=num_workers,
                                       batch_sampler=train_sampler,
                                       pin_memory=True)

        dataset = MixedDatasetWrapper(supervised_loader, unsupervised_loader)
        train_loader = DataLoader(dataset=dataset, batch_size=1, shuffle=True,
                                  num_workers=num_workers, pin_memory=True)
    else:
        if args.finetune:
            split = 'train_%d_%d' % (args.finetune_ways, args.samples_per_class)
        else:
            split = 'train'
        trainset = get_dataset(args.dataset, split, args.unsupervised, args, augment=args.augment)
        args.num_classes = min(len(trainset.wnids), args.num_classes)
        if args.unsupervised:
            train_loader = DataLoader(dataset=trainset,
                                      batch_size=args.batch_size,
                                      shuffle=True,
                                      num_workers=num_workers,
                                      collate_fn=examplar_collate,
                                      pin_memory=True,
                                      drop_last=True)
        else:
            train_sampler = CategoriesSampler(trainset.label,
                                              num_episodes,
                                              max(args.way, args.num_classes),
                                              args.shot + args.query)
            train_loader = DataLoader(dataset=trainset,
                                      num_workers=num_workers,
                                      batch_sampler=train_sampler,
                                      pin_memory=True)

    if args.model_class == 'DummyProto':
        from model.dataloader.dummy_loader import DummyWrapper
        train_loader = DummyWrapper(args.dummy_samples, train_loader)

    # if args.multi_gpu and num_device > 1:
    #     train_loader = MultiGPUDataloader(train_loader, num_device)
    #     args.way = args.way * num_device

    valset = get_dataset(args.dataset, 'val', args.unsupervised, args)
    # val_sampler = CategoriesSampler(valset.label,
    #                                 args.num_eval_episodes,
    #                                 args.eval_way, args.eval_shot + args.eval_query)
    # val_loader = DataLoader(dataset=valset,
    #                         batch_sampler=val_sampler,
    #                         num_workers=args.num_workers,
    #                         pin_memory=True)

    # NOTE: `testsets` in the return statement below is only defined by one of the
    # commented-out lines that follow; it must be restored (or defined elsewhere)
    # for this function to run as written.
    # testsets = dict(((n, get_dataset(n, 'test', args.unsupervised, args)) for n in args.eval_dataset.split(',')))
    # testsets = TestDataset('test', args.unsupervised, args)
    # test_sampler = CategoriesSampler(testset.label,
    #                                  10000,  # args.num_eval_episodes,
    #                                  args.eval_way, args.eval_shot + args.eval_query)
    # test_loader = DataLoader(dataset=testset,
    #                          batch_sampler=test_sampler,
    #                          num_workers=args.num_workers,
    #                          pin_memory=True)

    args.image_shape = trainset.image_shape
    return train_loader, valset, testsets
parser.add_argument(
    '-f',
    default="RootDep,Q,PPDB,BoUg,BoBg,SVO,BoW-B,BoW-S,W2V,NegAlgn",
    type=str)
parser.add_argument('-t', action='store_true')
group = parser.add_mutually_exclusive_group()
group.add_argument('-i', action='store_true')
group.add_argument('-a', action='store_true')
args = parser.parse_args()

# When running original project, use LogitPredictor
predictor = LogitPredictor
# predictor = ShowDownPredictor

train_data = get_dataset(
    'url-versions-2015-06-14-clean-with-body-train-with-body.csv')
X, y = split_data(train_data)

# Split the dataset in two equal parts
# X_train, X_test, y_train, y_test = train_test_split(
#     X, y, test_size=0.5, random_state=0)

test_data = get_dataset(
    'url-versions-2015-06-14-clean-with-body-test-with-body.csv')

transforms = {
    'CosSim': CosSimTransform,
    'BoW-B': BoWBTransform,
    'BoW-S': BoWSTransform,
    'BoW': BoWTransform,
    'BoUg': BoUgTransform,
    'BoBg': BoBgTransform,
def calc_hungarian_alignment_score(s, t):
    """Calculate the alignment score between s and t using the Hungarian algorithm
    implementation provided in https://pypi.python.org/pypi/munkres/."""
    s_toks = get_tokenized_lemmas(s)
    t_toks = get_tokenized_lemmas(t)

    df = pd.DataFrame(index=s_toks, columns=t_toks, data=0.)
    for c in s_toks:
        for a in t_toks:
            df.loc[c, a] = compute_paraphrase_score(c, a)

    matrix = df.values
    cost_matrix = make_cost_matrix(matrix, lambda cost: _max_ppdb_score - cost)

    indexes = _munk.compute(cost_matrix)
    total = 0.0
    for row, column in indexes:
        value = matrix[row][column]
        total += value
    return indexes, total / float(np.min(matrix.shape))


if __name__ == "__main__":
    df = get_dataset()
    data = {}
    for _, row in df.iterrows():
        data[(row.claimId, row.articleId)] = calc_hungarian_alignment_score(row.claimHeadline,
                                                                            row.articleHeadline)

    with open(os.path.join('..', 'data', 'pickled', 'hungarian-alignment-score.pickle'), 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
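# The same maximum-score assignment can be computed without manually inverting the
# score matrix into a cost matrix: SciPy's linear_sum_assignment solves the Hungarian
# problem directly and supports maximisation. A self-contained sketch on a toy score
# matrix (not the project's compute_paraphrase_score values):
import numpy as np
from scipy.optimize import linear_sum_assignment

scores = np.array([[0.9, 0.1, 0.0],
                   [0.2, 0.8, 0.3]])              # rows: claim tokens, cols: article tokens
row_ind, col_ind = linear_sum_assignment(scores, maximize=True)
alignment = list(zip(row_ind, col_ind))           # [(0, 0), (1, 1)]
normalised = scores[row_ind, col_ind].sum() / min(scores.shape)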
        xml += '<t>{0:s}</t>\n'.format(row.articleHeadline)
        xml += '<h>{0:s}</h>\n'.format(row.claimHeadline)
        xml += '</pair>\n'
    xml += '</entailment-corpus>\n'
    return xml


_dataset_base_files = ['url-versions-2015-06-14-clean-test.csv',
                       'url-versions-2015-06-14-clean-train.csv']
_no_folds = 10

_dataset_files = list(_dataset_base_files)
for fold in range(1, _no_folds + 1):
    _dataset_files.append('url-versions-2015-06-14-clean-train-fold-{0:d}.csv'.format(fold))
    _dataset_files.append('url-versions-2015-06-14-clean-test-fold-{0:d}.csv'.format(fold))

if __name__ == '__main__':
    for ds_base_filename in _dataset_base_files:
        df = get_dataset(ds_base_filename)
        df_fa = df.drop(df[df.articleHeadlineStance == 'observing'].index)
        output_filename = '{0:s}-rte-fa.xml'.format(ds_base_filename.split('.')[0])
        with open(os.path.join('..', 'data', 'emergent', output_filename), 'w') as f:
            f.write(_generate_xml(df_fa))

    for ds_filename in _dataset_files:
        df = get_dataset(ds_filename)
        output_filename = '{0:s}-rte.xml'.format(ds_filename.split('.')[0])
        with open(os.path.join('..', 'data', 'emergent', output_filename), 'w') as f:
            f.write(_generate_xml(df))
model_dict = model.state_dict()
if args.init_weights is not None:
    pretrained_dict = torch.load(args.init_weights)['params']
    # remove weights for FC
    # pretrained_dict = {'encoder.'+k: v for k, v in pretrained_dict.items()}
    pretrained_dict = {
        k: v
        for k, v in pretrained_dict.items() if k in model_dict and 'fc' not in k
    }
    print(pretrained_dict.keys())
    model_dict.update(pretrained_dict)
model.load_state_dict(model_dict)
model.eval()

trainset = get_dataset('MiniImageNet', 'train', False, args)
class_mean = get_class_mean('MiniImageNet', args.backbone_class, trainset)
# testsets = dict(((n, get_dataset(n, 'test', args.unsupervised, args)) for n in args.eval_dataset.split(',')))

ensemble_result = []
for n in args.datasets:
    print('----------- test on {} --------------'.format(n))
    valset = get_dataset(n, 'val', args.unsupervised, args)
    for i, (args.way, args.shot) in enumerate(valset.eval_setting):
        # train best gamma
        # valset = Dataset('val', args.unsupervised, args)
        valset = get_dataset(n, 'val', args.unsupervised, args)
        val_sampler = CategoriesSampler(valset.label, 500,
                                        min(args.way, valset.num_class),
                                        args.shot + args.query)
        val_loader = DataLoader(dataset=valset,
def get_snopes():
    test_data = get_dataset("my_claims_csv_cleaned.csv")
    X_test, y_test = split_data(test_data)
    X_test = p.pipeline.transform(X_test)
    return (test_data, X_test)