def main(args):
    """Print Stanza annotations for the contexts of a few dataset samples.

    Loads the dataset at `args.path`, optionally shuffles it, and for at most
    ten of the first `args.samples` samples prints the context string, one
    row per word (text, lemma, POS, head index, dependency relation) grouped
    by sentence, and the entity mentions found in that context.

    Args:
        args: `argparse` object; reads `path`, `shuffle` and `samples`.
    """
    # Load dataset, and optionally shuffle.
    dataset = QADataset(args, args.path)
    samples = dataset.samples
    if args.shuffle:
        random.shuffle(samples)
    # At most ten samples are annotated, whatever `args.samples` asks for.
    vis_samples = samples[:args.samples][:10]

    print()
    print('-' * RULE_LENGTH)
    print()

    if not vis_samples:
        # Nothing to annotate: skip the model download entirely.
        return

    # Build the pipeline once. It does not depend on the sample, so
    # re-creating it per iteration only repeats the download check and
    # the model loading.
    stanza.download('en')
    en_nlp = stanza.Pipeline('en')

    word_row = "{:12s}\t{:12s}\t{:6s}\t{:d}\t{:12s}"

    # Visualize samples.
    for (_qid, context, _question, _answer_start, _answer_end) in vis_samples:
        cxt = _build_string(context)
        print(cxt)
        en_doc = en_nlp(cxt)

        # Per-sentence word table.
        for i, sent in enumerate(en_doc.sentences):
            print(f"[Sentence {i+1}]")
            for word in sent.words:
                print(word_row.format(
                    word.text, word.lemma, word.pos, word.head, word.deprel))
            print("")

        # Entity mentions with their character offsets in `cxt`.
        print("Mention text\tType\tStart-End")
        for ent in en_doc.ents:
            print("{}\t{}\t{}-{}".format(
                ent.text, ent.type, ent.start_char, ent.end_char))
def main(args):
    """Print a window of QA samples with the answer span highlighted.

    Args:
        args: `argparse` object; reads `path`, `shuffle`, `start` and
            `samples`.
    """
    # Load the samples and shuffle them in place when requested.
    samples = QADataset(args, args.path).samples
    if args.shuffle:
        random.shuffle(samples)

    # Window of `args.samples` entries beginning at `args.start`.
    first = args.start
    window = samples[first:first + args.samples]

    rule = '-' * RULE_LENGTH
    print()
    print(rule)
    print()

    for sample in window:
        qid, context, question, answer_start, answer_end = sample

        print('[METADATA]')
        print(f'path = \'{args.path}\'')
        print(f'question id = {qid}')
        print()

        print('[CONTEXT]')
        highlighted = _color_context(context, answer_start, answer_end)
        print(highlighted)
        print()

        print('[QUESTION]')
        question_text = _build_string(question)
        print(question_text)
        print()

        print('[ANSWER]')
        # The answer span is inclusive of `answer_end`.
        answer_text = _build_string(context[answer_start:(answer_end + 1)])
        print(answer_text)
        print()

        print(rule)
        print()
def main(args):
    """
    Main function for training, evaluating, and checkpointing.

    Sets up the distributed process group, builds datasets, vocabulary,
    tokenizer and model, wraps the model in DDP, optionally restores weights
    from `args.model_path`, then trains when `args.do_train` is set and
    writes predictions on rank 0 when `args.do_test` is set.

    Args:
        args: `argparse` object.
    """
    # Print arguments.
    print('\nusing arguments:')
    _print_arguments(args)
    print()

    # Check if GPU is available.
    if not args.use_gpu and torch.cuda.is_available():
        print('warning: GPU is available but args.use_gpu = False')
        print()

    local_rank = args.local_rank
    # world_size = torch.cuda.device_count() # assume all local GPUs

    # Set up distributed process group
    # The value returned here is used below both as the device for the
    # model and as the "is this the main process" check (`rank == 0`).
    rank = setup_dist(local_rank)

    # Set up datasets.
    train_dataset = QADataset(args, args.train_path)
    dev_dataset = QADataset(args, args.dev_path)

    # Create vocabulary and tokenizer.
    # The vocabulary is built from the training samples only; the same
    # tokenizer is then registered on both datasets.
    vocabulary = Vocabulary(train_dataset.samples, args.vocab_size)
    tokenizer = Tokenizer(vocabulary)
    for dataset in (train_dataset, dev_dataset):
        dataset.register_tokenizer(tokenizer)
    # Overwrite the requested vocab size with the actual one and expose the
    # pad id on `args` for later consumers.
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    print(f'vocab words = {len(vocabulary)}')

    # Print number of samples.
    print(f'train samples = {len(train_dataset)}')
    print(f'dev samples = {len(dev_dataset)}')
    print()

    # Select model.
    model = _select_model(args)
    #model = model.to(rank)
    #model = DDP(model, device_ids=[rank], output_device=rank)
    # Embeddings are loaded on the bare model, before the DDP wrapper is
    # applied further down.
    num_pretrained = model.load_pretrained_embeddings(
        vocabulary, args.embedding_path
    )
    pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(
        f'initialized {num_pretrained}/{len(vocabulary)} '
        f'embeddings ({pct_pretrained}%)'
    )
    print()

    # device = torch.device(f'cuda:{rank}')
    # Move the model to this process's device and wrap it for DDP.
    model = model.to(rank)
    model = DDP(model, device_ids=[rank], output_device=rank)
    # if args.use_gpu:
    #     model = cuda(args, model)

    if args.resume and args.model_path:
        # Remap tensors stored as "cuda:0" onto this process's device.
        map_location = {"cuda:0": "cuda:{}".format(rank)}
        model.load_state_dict(torch.load(args.model_path, map_location=map_location))

    # Count trainable parameters only.
    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'using model \'{args.model}\' ({params} params)')
    print(model)
    print()

    if args.do_train:
        # Track training statistics for checkpointing.
        eval_history = []
        best_eval_loss = float('inf')

        # Begin training.
        for epoch in range(1, args.epochs + 1):
            # Perform training and evaluation steps.
            try:
                train_loss = train(args, epoch, model, train_dataset)
            except RuntimeError:
                # NOTE(review): every RuntimeError raised by train() is
                # reported as an NCCL timeout and the original traceback is
                # discarded — confirm this is intended.
                print(f'NCCL Wait Timeout, rank: \'{args.local_rank}\' (exit)')
                exit(1)
            eval_loss = evaluate(args, epoch, model, dev_dataset)

            # If the model's evaluation loss yields a global improvement,
            # checkpoint the model.
            # NOTE(review): the extent of this `rank == 0` block was
            # reconstructed from whitespace-collapsed source — confirm the
            # nesting against the original file.
            if rank == 0:
                eval_history.append(eval_loss < best_eval_loss)
                if eval_loss < best_eval_loss:
                    best_eval_loss = eval_loss
                    # The state dict saved here is that of the DDP wrapper.
                    torch.save(model.state_dict(), args.model_path)

                print(
                    f'epoch = {epoch} | '
                    f'train loss = {train_loss:.6f} | '
                    f'eval loss = {eval_loss:.6f} | '
                    f"{'saving model!' if eval_history[-1] else ''}"
                )

                # If early stopping conditions are met, stop training.
                # NOTE(review): as nested here only rank 0 leaves the loop;
                # other ranks would keep calling train() — confirm.
                # NOTE(review): this is the only call to cleanup_dist() in
                # this function; a run that finishes all epochs never
                # reaches it — confirm.
                if _early_stop(args, eval_history):
                    suffix = 's' if args.early_stop > 1 else ''
                    print(
                        f'no improvement after {args.early_stop} epoch{suffix}. '
                        'early stopping...'
                    )
                    print()
                    cleanup_dist()
                    break

    if args.do_test and rank == 0:
        # Write predictions to the output file. Use the printed command
        # below to obtain official EM/F1 metrics.
        write_predictions(args, model, dev_dataset)
        eval_cmd = (
            'python3 evaluate.py '
            f'--dataset_path {args.dev_path} '
            f'--output_path {args.output_path}'
        )
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
def main(args):
    """
    Build data, vocabulary and model, then train and/or write predictions.

    Args:
        args: `argparse` object.
    """
    # Echo the configuration.
    print('\nusing arguments:')
    _print_arguments(args)

    # Warn when a GPU is present but was not requested.
    if not args.use_gpu:
        if torch.cuda.is_available():
            print('warning: GPU is available but args.use_gpu = False')
            print()

    # Datasets.
    train_dataset = QADataset(args, args.train_path, is_train=True)
    dev_dataset = QADataset(args, args.dev_path, is_train=False)

    print("Start creating vocabulary and tokenizer")

    # The vocabulary also covers the training samples that were culled.
    vocab_source = train_dataset.samples + train_dataset.culled_samples
    vocabulary = Vocabulary(vocab_source, args.vocab_size)
    tokenizer = Tokenizer(vocabulary)
    train_dataset.register_tokenizer(tokenizer)
    dev_dataset.register_tokenizer(tokenizer)

    # Publish the effective vocabulary size and pad id on `args`.
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    print(f'vocab words = {len(vocabulary)}')

    # Dataset sizes.
    print(f'train samples = {len(train_dataset)}')
    print(f'dev samples = {len(dev_dataset)}')
    print()

    # Model and pre-trained embeddings.
    model = _select_model(args)
    num_pretrained = model.load_pretrained_embeddings(
        vocabulary, args.embedding_path)
    vocab_len = len(vocabulary)
    pct_pretrained = round(num_pretrained / vocab_len * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(f'initialized {num_pretrained}/{vocab_len} '
          f'embeddings ({pct_pretrained}%)')
    print()

    if args.use_gpu:
        model = cuda(args, model)

    # Number of trainable parameters.
    params = 0
    for p in model.parameters():
        if p.requires_grad:
            params += p.numel()
    print(f'using model \'{args.model}\' ({params} params)')

    if args.do_train:
        # One boolean per epoch: did the eval loss reach a new global best?
        eval_history = []
        best_eval_loss = float('inf')

        epoch = 1
        while epoch <= args.epochs:
            train_loss = train(args, epoch, model, train_dataset)
            eval_loss = evaluate(args, epoch, model, dev_dataset)

            # Checkpoint whenever the eval loss improves on the best so far.
            improved = eval_loss < best_eval_loss
            eval_history.append(improved)
            if improved:
                best_eval_loss = eval_loss
                torch.save(model.state_dict(), args.model_path)

            status = 'saving model!' if improved else ''
            print(f'epoch = {epoch} | '
                  f'train loss = {train_loss:.6f} | '
                  f'eval loss = {eval_loss:.6f} | '
                  f"{status}")

            # Stop as soon as the early-stopping criterion is met.
            if _early_stop(args, eval_history):
                suffix = 's' if args.early_stop > 1 else ''
                print(f'no improvement after {args.early_stop} epoch{suffix}. '
                      'early stopping...')
                print()
                break

            epoch += 1

    if args.do_test:
        # Dump predictions and print the command that computes the official
        # EM/F1 metrics from them.
        write_predictions(args, model, dev_dataset)
        eval_cmd = ('python3 evaluate.py '
                    f'--dataset_path {args.dev_path} '
                    f'--output_path {args.output_path}')
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
help='learning rate for ensemble') parser.add_argument('--weight_decay', type=float, default=5e-3) parser.add_argument('--num_epochs', type=int, default=200) parser.add_argument('--batch_size', type=int, default=500) parser.add_argument('--root', type=str, default='../movie-data') args = parser.parse_args() seed = 1234734614 torch.manual_seed(seed) if args.use_cuda: torch.cuda.manual_seed(seed) # dataset dataset = QADataset(dataset='StackExchange', questionFile='QuestionFeatures.tsv', answerFile='AnswerFeatures.tsv', userFile='UserFeatures.tsv', rootFolder=args.root) print "Dataset read", len(dataset) PosClass = dataset.trainPairs_WFeatures[ dataset.trainPairs_WFeatures['Credible'] == '1'] NegClass = dataset.trainPairs_WFeatures[ dataset.trainPairs_WFeatures['Credible'] == '0'] print "Positive samples", len(PosClass) questions = dataset.trainPairs['QuestionId'].unique() if len(PosClass) > len(NegClass): NegClass_Sample = NegClass else: NegClass_Sample = NegClass.sample(n=len(PosClass))
def main(args):
    """
    Main function for training, evaluating, and checkpointing.

    Builds the datasets, the vocabulary (loaded from `args.vocab_path` when
    given, otherwise constructed from the training samples), the tokenizer
    and the model; optionally loads `args.init_model` as the starting
    weights; trains with tensorboard logging when `args.do_train` is set;
    and writes predictions when `args.do_test` is set.

    Args:
        args: `argparse` object.
    """
    # Print arguments.
    print('\nusing arguments:')
    _print_arguments(args)
    print()

    # Check if GPU is available.
    if not args.use_gpu and torch.cuda.is_available():
        print('warning: GPU is available but args.use_gpu = False')
        print()

    # Set up datasets.
    train_dataset = QADataset(args, args.train_path)
    dev_dataset = QADataset(args, args.dev_path)

    # Create vocabulary and tokenizer.
    if args.vocab_path is not None:
        print("loading vocabulary from file at {}".format(args.vocab_path))
        vocabulary = Vocabulary(train_dataset.samples, args.vocab_size,
                                load_from_file=True,
                                filepath=args.vocab_path)
    else:
        print("constructing the vocab from dataset examples")
        vocabulary = Vocabulary(train_dataset.samples, args.vocab_size)
    tokenizer = Tokenizer(vocabulary)
    for dataset in (train_dataset, dev_dataset):
        dataset.register_tokenizer(tokenizer)
    # Publish the effective sizes and the pad id on `args`.
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    args.char_vocab_size = vocabulary.numCharacters()
    print(f'vocab words = {len(vocabulary)}')
    print(f'num characters = {args.char_vocab_size}')

    # Print number of samples.
    num_train_samples = len(train_dataset)
    print(f'train samples = {num_train_samples}')
    print(f'dev samples = {len(dev_dataset)}')
    print()

    # Select model.
    model = _select_model(args)
    num_pretrained = model.load_pretrained_embeddings(vocabulary,
                                                      args.embedding_path)
    pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(f'initialized {num_pretrained}/{len(vocabulary)} '
          f'embeddings ({pct_pretrained}%)')
    print()

    if args.use_gpu:
        model = cuda(args, model)

    # load the model from previous checkpoint
    if args.finetune >= 1:
        print("preparing to load {} as base model".format(args.init_model))
        model.load_state_dict(torch.load(args.init_model, map_location='cpu'))

    # Count trainable parameters only.
    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'using model \'{args.model}\' ({params} params)')
    print(model)
    print()

    if args.do_train:
        # create tensorboard summary writer
        train_writer = tb.SummaryWriter(
            log_dir=os.path.join(args.logdir, args.run + "_train"))
        valid_writer = tb.SummaryWriter(
            log_dir=os.path.join(args.logdir, args.run + "_valid"))

        try:
            # Track training statistics for checkpointing.
            eval_history = []
            best_eval_loss = float('inf')

            # Begin training.
            for epoch in range(1, args.epochs + 1):
                # Perform training and evaluation steps.
                train_loss = train(args, epoch, model, train_dataset,
                                   train_writer, num_train_samples)
                eval_loss = evaluate(args, epoch, model, dev_dataset)

                # write the loss to tensorboard
                valid_writer.add_scalar("valid_loss", eval_loss,
                                        global_step=epoch)

                # If the model's evaluation loss yields a global
                # improvement, checkpoint the model.
                eval_history.append(eval_loss < best_eval_loss)
                if eval_loss < best_eval_loss:
                    best_eval_loss = eval_loss
                    torch.save(model.state_dict(), args.model_path)

                print(f'epoch = {epoch} | '
                      f'train loss = {train_loss:.6f} | '
                      f'eval loss = {eval_loss:.6f} | '
                      f"{'saving model!' if eval_history[-1] else ''}")

                # If early stopping conditions are met, stop training.
                if _early_stop(args, eval_history):
                    suffix = 's' if args.early_stop > 1 else ''
                    print(f'no improvement after {args.early_stop} '
                          f'epoch{suffix}. '
                          'early stopping...')
                    print()
                    break
        finally:
            # Close both writers on every exit path (normal completion,
            # early stop, or an exception) so they are not left open.
            train_writer.close()
            valid_writer.close()

    if args.do_test:
        # Write predictions to the output file. Use the printed command
        # below to obtain official EM/F1 metrics.
        write_predictions(args, model, dev_dataset)
        eval_cmd = ('python3 evaluate.py '
                    f'--dataset_path {args.dev_path} '
                    f'--output_path {args.output_path}')
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
# Command-line options for the credibility prediction script.
# NOTE: this fragment is Python 2 (it uses `print` statements).
parser = argparse.ArgumentParser(description='PyTorch Credibility Prediction Model')
parser.add_argument('--use_cuda', dest='use_cuda', default=False, action='store_true')
parser.add_argument('-lr', '--learning_rate', type=float, default=0.001,\
                    help='learning rate for FeedForward')
parser.add_argument('--weight_decay', type=float, default=5e-4)
parser.add_argument('--num_epochs', type=int, default=2)
parser.add_argument('--batch_size', type=int, default=500)
parser.add_argument('--root', type=str, default='/home/github/UserCredibility/movie-data')
# Hard-coded absolute path; not used in the statements visible here.
glove_path = '/Users/kanika/Documents/glove.6B/glove.6B.50d.txt'
args = parser.parse_args()

# Fixed seed for the CPU RNG, and for the CUDA RNG when --use_cuda is set.
seed = 1234734614
torch.manual_seed(seed)
if args.use_cuda:
    torch.cuda.manual_seed(seed)

# Load the question / answer / user feature files from `args.root`.
dataset = QADataset(dataset='StackExchange',questionFile = 'QuestionFeatures.tsv', answerFile = 'AnswerFeatures.tsv', userFile = 'UserFeatures.tsv', rootFolder= args.root)
print "Dataset read", len(dataset)

###SAMPLE THE DATASET
# Split the training pairs by the 'Credible' column, which is compared
# against the strings '1' and '0'.
PosClass = dataset.trainPairs_WFeatures[dataset.trainPairs_WFeatures['Credible'] == '1']
NegClass = dataset.trainPairs_WFeatures[dataset.trainPairs_WFeatures['Credible'] == '0']
print "Positive samples",len(PosClass)
questions = dataset.trainPairs['QuestionId'].unique()
# Keep every negative when positives outnumber them; otherwise sample as
# many negatives as there are positives.
if len(PosClass) > len(NegClass):
    NegClass_Sample = NegClass
else:
    NegClass_Sample = NegClass.sample(n=len(PosClass))
# NOTE: this fragment is Python 2 (it uses `print` statements).
args = parser.parse_args()

# Fixed seed for the CPU RNG, and for the CUDA RNG when --use_cuda is set.
seed = 1234734614
torch.manual_seed(seed)
if args.use_cuda:
    torch.cuda.manual_seed(seed)
if args.use_content:
    print("Using content embeddings")

# With --use_cuda the device index is hard-coded to 1 ("cuda:1").
device = torch.device("cuda:1" if args.use_cuda else "cpu")
print device

# dataset
# Feature files are read from the `<root>/<dataset>` folder.
dataset = QADataset(dataset=args.dataset,
                    questionFile='QuestionFeatures.tsv',
                    answerFile='AnswerFeatures.tsv',
                    userFile='UserFeatures.tsv',
                    rootFolder=os.path.join(args.root, args.dataset))
print "Dataset read", args.dataset, len(dataset)

# Split the training pairs by the 'Credible' column, which is compared
# against the strings '1' and '0'.
PosClass = dataset.trainPairs_WFeatures[
    dataset.trainPairs_WFeatures['Credible'] == '1']
NegClass = dataset.trainPairs_WFeatures[
    dataset.trainPairs_WFeatures['Credible'] == '0']
print "Positive samples", len(PosClass)
questions = dataset.trainPairs['QuestionId'].unique()
# Keep every negative when positives outnumber them; otherwise sample as
# many negatives as there are positives.
if len(PosClass) > len(NegClass):
    NegClass_Sample = NegClass
else:
    NegClass_Sample = NegClass.sample(n=len(PosClass))
def main(args):
    """
    Main function for training, evaluating, and checkpointing.

    Depending on the flags on `args`, the training data is either a single
    dataset split in half into train/dev (`args.bio`), or the regular
    train/dev files, optionally extended with the NewsQA train and dev
    elements (`args.domain_adaptive`). When `args.use_EDA_aug` is set, a
    freshly augmented copy of the training set is built for every epoch.

    Args:
        args: `argparse` object.
    """
    # Print arguments.
    print('\nusing arguments:')
    _print_arguments(args)
    print("args type: ", type(args))
    print()

    # Check if GPU is available.
    if not args.use_gpu and torch.cuda.is_available():
        print('warning: GPU is available but args.use_gpu = False')
        print()

    if args.bio:
        print("training on bio dataset")
        train_dataset = QADataset(args, args.train_path)
        dev_dataset = QADataset(args, args.dev_path)
        bio_len = len(train_dataset.elems)  # len == 1504
        print("bio data size: ", bio_len)
        random.shuffle(train_dataset.elems)
        # Take both halves from the full shuffled list *before* truncating
        # the training elements; slicing the second half out of the
        # already-truncated list always yields an empty dev split.
        half = bio_len // 2
        shuffled_elems = train_dataset.elems
        train_dataset.elems = shuffled_elems[:half]
        dev_dataset.elems = shuffled_elems[half:]
    else:
        # Set up datasets
        train_dataset = QADataset(
            args, args.train_path)  # len == 18885, vocab_size == 50004
        dev_dataset = QADataset(
            args, args.dev_path)  # len == 2067, vocab words == 24987
        # NOTE(review): nesting of this branch under `else` was
        # reconstructed from whitespace-collapsed source — confirm.
        if args.domain_adaptive:
            # NewsQA dataset
            print("domain adaptive training")
            news_train = QADataset(
                args, "datasets/newsqa_train.jsonl.gz"
            )  # len == 11428, vocab words == 24989
            news_dev = QADataset(
                args, "datasets/newsqa_dev.jsonl.gz"
            )  # len == 638, vocab words == 18713
            # NOTE(review): `bio` is loaded but never used in this
            # function; kept so that loading behaviour is unchanged —
            # confirm whether it should be merged or dropped.
            bio = QADataset(
                args, "datasets/bioasq.jsonl.gz"
            )  # len == 1504, vocab words == 18715
            train_dataset.elems = (train_dataset.elems + news_train.elems
                                   + news_dev.elems)
            print("total dataset size: ", len(train_dataset.elems))

    # Create vocabulary and tokenizer.
    # NOTE(review): the vocabulary is built from `train_dataset.samples`
    # while the branches above modify `train_dataset.elems`; whether
    # `samples` reflects those changes is not visible here — confirm.
    vocabulary = Vocabulary(train_dataset.samples, args.vocab_size)
    tokenizer = Tokenizer(vocabulary)
    for dataset in (train_dataset, dev_dataset):
        dataset.register_tokenizer(tokenizer)
    args.vocab_size = len(vocabulary)
    args.pad_token_id = tokenizer.pad_token_id
    print(f'vocab words = {len(vocabulary)}')

    # Print number of samples.
    print(f'train samples = {len(train_dataset)}')
    print(f'dev samples = {len(dev_dataset)}')
    print()

    # Select model.
    model = _select_model(args)
    num_pretrained = model.load_pretrained_embeddings(vocabulary,
                                                      args.embedding_path)
    pct_pretrained = round(num_pretrained / len(vocabulary) * 100., 2)
    print(f'using pre-trained embeddings from \'{args.embedding_path}\'')
    print(f'initialized {num_pretrained}/{len(vocabulary)} '
          f'embeddings ({pct_pretrained}%)')
    print()

    if args.use_gpu:
        model = cuda(args, model)

    # Count trainable parameters only.
    params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f'using model \'{args.model}\' ({params} params)')
    print(model)
    print()

    if args.do_train:
        # Track training statistics for checkpointing.
        eval_history = []
        best_eval_loss = float('inf')

        # Begin training.
        for epoch in range(1, args.epochs + 1):
            if args.use_EDA_aug:
                # randomly shuffle the data 1st
                print("shuffling dataset")
                random.shuffle(train_dataset.samples)

                # Perform augmentation on a deep copy of the training data,
                # leaving `train_dataset` itself untouched for the next
                # epoch.
                print("performing augmentation on dataset...")
                train_dataset_copy = deepcopy(train_dataset)
                print("prob for char aug is: ", args.char_aug)
                augmented_train_dataset = EDA(train_dataset_copy,
                                              sr_prob=0.33,
                                              rd_prob=0.05,
                                              rs_prob=0.10,
                                              ri_prob=0.10,
                                              r_shuffle_prob=0.10,
                                              r_backtrans_prob=0.0,
                                              char_aug=args.char_aug)

                # Train on the augmented set; evaluation below still uses
                # the unmodified dev set.
                print("training on augmented dataset")
                # Print one of the first four augmented examples as a
                # sanity check.
                a = random.randint(0, 3)
                print("random num gen: ", a)
                print("context ex of augmented data: " +
                      augmented_train_dataset.elems[a]['context'])
                print("sample ex of augmented data: " +
                      " ".join(augmented_train_dataset.samples[a][1]))
                assert augmented_train_dataset != train_dataset
                train_loss = train(args, epoch, model,
                                   augmented_train_dataset)
            else:
                print("no augmentation")
                print("training on normal dataset")
                train_loss = train(args, epoch, model, train_dataset)

            eval_loss = evaluate(args, epoch, model, dev_dataset)

            # If the model's evaluation loss yields a global improvement,
            # checkpoint the model.
            eval_history.append(eval_loss < best_eval_loss)
            if eval_loss < best_eval_loss:
                best_eval_loss = eval_loss
                torch.save(model.state_dict(), args.model_path)

            print(f'epoch = {epoch} | '
                  f'train loss = {train_loss:.6f} | '
                  f'eval loss = {eval_loss:.6f} | '
                  f"{'saving model!' if eval_history[-1] else ''}")

            # If early stopping conditions are met, stop training.
            if _early_stop(args, eval_history):
                suffix = 's' if args.early_stop > 1 else ''
                print(f'no improvement after {args.early_stop} epoch{suffix}. '
                      'early stopping...')
                print()
                break

    if args.do_test:
        # Write predictions to the output file. Use the printed command
        # below to obtain official EM/F1 metrics.
        write_predictions(args, model, dev_dataset)
        eval_cmd = ('python3 evaluate.py '
                    f'--dataset_path {args.dev_path} '
                    f'--output_path {args.output_path}')
        print()
        print(f'predictions written to \'{args.output_path}\'')
        print(f'compute EM/F1 with: \'{eval_cmd}\'')
        print()
en_tok_path = encparams["tokenizer_path"] en_tokenizer = BertTokenizerFast(os.path.join(en_tok_path, "vocab.txt")) de_tok_path = decparams["tokenizer_path"] de_tokenizer = BertTokenizerFast(os.path.join(de_tok_path, "vocab.txt")) # Init the dataset train_en_file = globalparams["train_en_file"] train_de_file = globalparams["train_de_file"] valid_en_file = globalparams["valid_en_file"] valid_de_file = globalparams["valid_de_file"] enc_maxlength = encparams["max_length"] dec_maxlength = decparams["max_length"] batch_size = modelparams["batch_size"] train_dataset = QADataset(train_en_file, train_de_file, en_tokenizer, de_tokenizer, enc_maxlength, dec_maxlength) train_dataloader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=False, \ drop_last=True, num_workers=1, collate_fn=train_dataset.collate_function) valid_dataset = QADataset(valid_en_file, valid_de_file, en_tokenizer, de_tokenizer, enc_maxlength, dec_maxlength) valid_dataloader = torch.utils.data.DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False, \ drop_last=True, num_workers=1, collate_fn=valid_dataset.collate_function) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') print("Using device:", device) print("Loading models ..") vocabsize = encparams["vocab_size"] max_length = encparams["max_length"] encoder_config = BertConfig(
default=False, action='store_true') parser.add_argument('-lr', '--learning_rate', type=float, default=0.001,\ help='learning rate for FeedForward') parser.add_argument('--num_epochs', type=int, default=100) parser.add_argument('--root', type=str, default='/home/knarang2/StackExchange') glove_path = '/Users/kanika/Documents/glove.6B/glove.6B.50d.txt' args = parser.parse_args() seed = 1234734614 torch.manual_seed(seed) if args.use_cuda: torch.cuda.manual_seed(seed) dataset = QADataset(dataset='StackExchange', questionFile='QuestionFeatures.tsv', answerFile='AnswerFeatures.tsv', userFile='UserFeatures.tsv', rootFolder=args.root) vectorizer = IndexVectorizer(min_frequency=10) ##CHANGED HERE textDataset = SubjObjDataset( os.path.join(args.root, "pairText_merge_processed.tsv"), vectorizer) word2idx = textDataset.vectorizer.word2idx embeddings = load_glove_embeddings(glove_path, word2idx) #print embeddings print "#WORDS in the Vocabulary", len(word2idx) print "Dataset read", len(dataset) ###SAMPLE THE DATASET
from load_embeddings import GloveVector, getWeightMatrix
from data import QADataset, get_dataloader
from model import ARC1
from loss import marginLoss
import numpy as np

# --- query/document corpus -------------------------------------------------
document_set = './datasets/DataSet_query_document/document_set.json'
query_set = './datasets/DataSet_query_document/query_set.json'
query2document_train = './datasets/DataSet_query_document/query2document_train.json'
query2document_test = './datasets/DataSet_query_document/query2document_test.json'

# --- tweet/news corpus -----------------------------------------------------
tweet_set = './datasets/tweet2new/id2twitter.json'
news_set = './datasets/tweet2new/id2new.json'
tw2ne_train = './datasets/tweet2new/tw2ne_train.json'
tw2ne_test = './datasets/tweet2new/tw2ne_test.json'

# Build the four datasets (same construction order as before), then record
# their sizes.
qa_dataset_train = QADataset(query2document_train, document_set, query_set)
tn_dataset_train = QADataset(tw2ne_train, news_set, tweet_set)
tn_dataset_test = QADataset(tw2ne_test, news_set, tweet_set)
qa_dataset_test = QADataset(query2document_test, document_set, query_set)

dataset_sizes_train = len(qa_dataset_train)
tn_dataset_sizes_train = len(tn_dataset_train)
tn_dataset_sizes_test = len(tn_dataset_test)
dataset_sizes_test = len(qa_dataset_test)

# Only the tweet/news datasets are wrapped in loaders here: 100 for the
# training loader, 1 for the test loader (second argument of
# get_dataloader).
train_dataloader = get_dataloader(tn_dataset_train, 100)
test_dataloader = get_dataloader(tn_dataset_test, 1)

# Prefer CUDA when it is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE: this fragment is Python 2 (it uses `print` statements).
BATCH_SIZE = args.batch_size
# Wall-clock time in whole seconds, as a string.
timestamp = str(int(time.time()))

# Fixed seed for the CPU RNG, and for the CUDA RNG when --use_cuda is set.
seed = 999
torch.manual_seed(seed)
if args.use_cuda:
    torch.cuda.manual_seed(seed)
    # NOTE(review): nesting of this assignment under `if args.use_cuda`
    # was reconstructed from whitespace-collapsed source — confirm.
    # NOTE(review): presumably this must run before CUDA is initialised to
    # have any effect — confirm nothing earlier in the file touches CUDA.
    os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu

# torch.device object used throughout this script
DEVICE = torch.device("cuda:0" if args.use_cuda else "cpu")

# dataset
# Feature files are read from the `<root>/<dataset>` folder.
dataset = QADataset(dataset=args.dataset,
                    questionFile='QuestionFeatures.tsv',
                    answerFile='AnswerFeatures.tsv',
                    userFile='UserFeatures.tsv',
                    rootFolder=os.path.join(args.root, args.dataset))
print "Dataset read", len(dataset), args.dataset

# Split the training pairs by the 'Credible' column, which is compared
# against the strings '1' and '0'.
PosClass = dataset.trainPairs_WFeatures[
    dataset.trainPairs_WFeatures['Credible'] == '1']
NegClass = dataset.trainPairs_WFeatures[
    dataset.trainPairs_WFeatures['Credible'] == '0']
print "Positive samples", len(PosClass)
questions = dataset.trainPairs['QuestionId'].unique()
# Keep every negative when positives outnumber them; otherwise sample as
# many negatives as there are positives.
if len(PosClass) > len(NegClass):
    NegClass_Sample = NegClass
else:
    NegClass_Sample = NegClass.sample(n=len(PosClass))