def convert_dictionary(self, dictionary, word_rank):
    rank_dictionary = data.Dictionary()
    rank_dictionary.idx2word = [''] * len(dictionary.idx2word)
    for idx, word in enumerate(dictionary.idx2word):
        rank = word_rank[idx]
        rank_dictionary.idx2word[rank] = word
        if word not in rank_dictionary.word2idx:
            rank_dictionary.word2idx[word] = rank
    return rank_dictionary
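# Minimal usage sketch (illustrative, not part of the original code): assuming
# word_rank[idx] is the frequency rank of the word at index idx in the old
# dictionary, the method returns a dictionary in which index == rank.
# `converter` is a stand-in for whatever object defines convert_dictionary.
old_dict = data.Dictionary()
for w in ['the', 'cat', 'sat']:
    old_dict.add_word(w)
word_rank = [2, 0, 1]  # hypothetical ranks for the three words above
rank_dict = converter.convert_dictionary(old_dict, word_rank)
# expected (if the dictionary starts empty): rank_dict.idx2word == ['cat', 'sat', 'the']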
def evaluate_zero_shot(args, model, tokenizer, path, src_query, trg_query):
    """
    Evaluate the model on a zero-shot classification task.
    Return the lists of predicted and true labels.
    """
    model.eval()
    pred_ls = []
    true_ls = []

    # Data
    test_src = [line.rstrip('\n') for line in open(path + "/test.src")]
    test_trg = [line.rstrip('\n') for line in open(path + "/test.trg")]

    # Shuffle in case of short eval
    src_shuf = []
    trg_shuf = []
    index_shuf = list(range(len(test_src)))
    shuffle(index_shuf)
    for i in index_shuf:
        src_shuf.append(test_src[i])
        trg_shuf.append(test_trg[i])
    test_src = src_shuf
    test_trg = trg_shuf

    # Targets dictionary
    dictionary = data.Dictionary()
    for l in test_trg:
        dictionary.add_word(l)

    n_samples = len(test_src)
    if args.max_batches is not None and args.max_batches < n_samples:
        n_samples = args.max_batches

    for i in trange(n_samples):
        src, trg = test_src[i], test_trg[i]
        src += src_query
        # Get context hidden states once to speed up eval
        context = torch.tensor([tokenizer.encode(src)])
        pred, past = model(context)
        mp, true_lbl = most_probable_label(model, trg, trg_query, dictionary,
                                           past, tokenizer)
        pred_ls.append(mp)
        true_ls.append(true_lbl)

    return pred_ls, true_ls
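# Illustrative follow-up (not in the original script): reduce the two lists
# returned by evaluate_zero_shot to a single accuracy figure. All arguments
# are assumed to be defined by the caller as in the function above.
preds, golds = evaluate_zero_shot(args, model, tokenizer, path, src_query, trg_query)
accuracy = sum(p == t for p, t in zip(preds, golds)) / max(len(golds), 1)
print('zero-shot accuracy = {:.2%}'.format(accuracy))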
###############################################################################
# Load data
###############################################################################

# load train and dev datasets
train_corpus = data.Corpus(args.tokenize, args.max_query_length,
                           args.max_doc_length)
train_corpus.parse(args.data + 'train.txt', max_example=args.max_example)
print('train set size = ', len(train_corpus))

dev_corpus = data.Corpus(args.tokenize, args.max_query_length,
                         args.max_doc_length)
dev_corpus.parse(args.data + 'dev.txt')
print('development set size = ', len(dev_corpus))

dictionary = data.Dictionary()
dictionary.build_dict(train_corpus, args.max_words)
# save the dictionary object to use during testing
helper.save_object(dictionary, args.save_path + 'dictionary.p')
print('vocabulary size = ', len(dictionary))

embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                               args.word_vectors_file,
                                               dictionary.word2idx)
print('number of OOV words = ', len(dictionary) - len(embeddings_index))

###############################################################################
# Build the model
###############################################################################
model = NSRF(dictionary, embeddings_index, args)
def main():
    ###########################################################################
    # Load data
    ###########################################################################
    dictionary = data.Dictionary()
    train_corpus = data.Corpus(dictionary)
    dev_corpus = data.Corpus(dictionary)
    test_corpus = data.Corpus(dictionary)

    task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task]
    for task in task_names:
        skip_first_line = (task == 'sick')
        train_corpus.parse(task, args.data, 'train.txt', args.tokenize,
                           num_examples=args.max_example,
                           skip_first_line=skip_first_line)
        if task == 'multinli':
            dev_corpus.parse(task, args.data, 'dev_matched.txt', args.tokenize)
            dev_corpus.parse(task, args.data, 'dev_mismatched.txt', args.tokenize)
            test_corpus.parse(task, args.data, 'test_matched.txt', args.tokenize,
                              is_test_corpus=False)
            test_corpus.parse(task, args.data, 'test_mismatched.txt', args.tokenize,
                              is_test_corpus=False)
        else:
            dev_corpus.parse(task, args.data, 'dev.txt', args.tokenize,
                             skip_first_line=skip_first_line)
            test_corpus.parse(task, args.data, 'test.txt', args.tokenize,
                              is_test_corpus=False,
                              skip_first_line=skip_first_line)

    print('train set size = ', len(train_corpus.data))
    print('development set size = ', len(dev_corpus.data))
    print('test set size = ', len(test_corpus.data))
    print('vocabulary size = ', len(dictionary))

    # save the dictionary object to use during testing
    helper.save_object(dictionary, args.save_path + args.task + '_dictionary.pkl')

    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                                   args.word_vectors_file,
                                                   dictionary.word2idx)
    print('number of OOV words = ', len(dictionary) - len(embeddings_index))

    ###########################################################################
    # Build the model
    ###########################################################################
    model = SentenceClassifier(dictionary, embeddings_index, args)
    optim_fn, optim_params = helper.get_optimizer(args.optimizer)
    optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()),
                         **optim_params)
    best_acc = 0

    if args.cuda:
        model = model.cuda()

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    ###########################################################################
    # Train the model
    ###########################################################################
    train = Train(model, optimizer, dictionary, embeddings_index, args, best_acc)
    bestmodel = train.train_epochs(train_corpus, dev_corpus, args.start_epoch,
                                   args.epochs)

    test_batches = helper.batchify(test_corpus.data, args.batch_size)
    if 'multinli' in task_names:
        print('Skipping evaluating best model. Evaluate using the test script.')
    else:
        test_accuracy, test_f1 = evaluate(bestmodel, test_batches, dictionary)
        print('accuracy: %.2f%%' % test_accuracy)
        print('f1: %.2f%%' % test_f1)
assert len(args.units_first) == len(args.colors_first), \
    "!!!---Number of colors_first is not equal to number of units_first (1st Layer)---!!!"

gate_names = ['Input', 'Forget', 'Cell', 'Output']

# Parse output dir and file names:
# os.makedirs(os.path.dirname(args.output), exist_ok=True)
dirname = os.path.dirname(args.output)
filename = os.path.basename(args.output)

# Load model
print('Loading models...')
print('\nmodel: ' + args.model + '\n')
model = torch.load(args.model, lambda storage, loc: storage)
model.rnn.flatten_parameters()
embeddings_in = model.encoder.weight.data.cpu().numpy()
embeddings_out = model.decoder.weight.data.cpu().numpy()
vocab = data.Dictionary(args.vocabulary)

# Read list of contrasted words (e.g., singular vs. plural verbs).
with open(args.input, 'r') as f:
    lines = f.readlines()
verbs_singular = [l.split('\t')[0].strip() for l in lines]
verbs_plural = [l.split('\t')[1].strip() for l in lines]
verbs_all = verbs_singular + verbs_plural
print('\nWords used (group 1):')
print(verbs_singular)
print('\nWords used (group 2):')
print(verbs_plural)

# Get index in the vocab for all words and extract embeddings
idx_verbs_singular = [vocab.word2idx[w] for w in verbs_singular]
idx_verbs_plural = [vocab.word2idx[w] for w in verbs_plural]
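# One possible follow-up, sketched here as an assumption rather than what the
# original script necessarily does: gather the input-embedding rows of both
# word groups and compute a normalized mean-difference axis between them.
import numpy as np

emb_group1 = embeddings_in[idx_verbs_singular]  # shape: (n_words, emb_dim)
emb_group2 = embeddings_in[idx_verbs_plural]
diff_axis = emb_group2.mean(axis=0) - emb_group1.mean(axis=0)
diff_axis /= np.linalg.norm(diff_axis)  # unit vector separating the two groups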
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda" if args.cuda else "cpu")

###############################################################################
# Load data
###############################################################################
dic_exists = os.path.isfile(os.path.join(args.data, 'action_dictionary.pkl'))
if dic_exists:
    with open(os.path.join(args.data, 'action_dictionary.pkl'), 'rb') as input:
        Corpus_Dic = pickle.load(input)
else:
    Corpus_Dic = data.Dictionary()

train_data_name = os.path.join(
    args.data, str(args.number_per_class) + '_labeled_train.csv')
test_data_name = os.path.join(args.data, 'test.csv')

train_data = data.Csv_DataSet(train_data_name)
test_data = data.Csv_DataSet(test_data_name)
train_data.load(dictionary=Corpus_Dic)
test_data.load(dictionary=Corpus_Dic, train_mode=False)

# save the dictionary
if not dic_exists:
    with open(os.path.join(args.data, 'action_dictionary.pkl'), 'wb') as output:
        pickle.dump(Corpus_Dic, output)
random.seed(args.seed)
torch.manual_seed(args.seed)
os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
if torch.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")

device = torch.device("cuda" if args.cuda else "cpu")

###############################################################################
# Load vocab
###############################################################################
vocab = data.Dictionary()
with open("../data/train.txt", 'r', encoding="utf8") as f:
    for line in f:
        words = line.split() + ['<eos>']
        for word in words:
            vocab.add_word(word)

###############################################################################
# Build the model
###############################################################################
ntokens = len(vocab)
forward_model = model.TransformerModel(ntokens, emsize, nhead, nhid, nlayers,
                                       dropout).to(device)
backward_model = model.TransformerModel(ntokens, emsize, nhead, nhid, nlayers,
                                        dropout).to(device)
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Translate using a pre-trained model')
    parser.add_argument('--model', help='a model previously trained with train.py')
    parser.add_argument('--batch_size', type=int, default=50,
                        help='the batch size (defaults to 50)')
    parser.add_argument('--beam_size', type=int, default=12,
                        help='the beam size (defaults to 12, 0 for greedy search)')
    parser.add_argument('--encoding', default='utf-8',
                        help='the character encoding for input/output (defaults to utf-8)')
    parser.add_argument('-i', '--input', default=sys.stdin.fileno(),
                        help='the input file (defaults to stdin)')
    parser.add_argument('-o', '--output', default=sys.stdout.fileno(),
                        help='the output file (defaults to stdout)')
    parser.add_argument('--noise', type=float, default=0.5)
    parser.add_argument('--pass_att', action='store_true', default=False)
    parser.add_argument('--src_embeddings', default=None,
                        help='common intersection source embeddings')
    parser.add_argument('--cutoff', type=int, default=None,
                        help='cutoff for source embeddings above')
    parser.add_argument('--cat_embedds', help='use torch.load to load src and trg')
    parser.add_argument('--ncontrol', type=int, default=0,
                        help='control number given while using the decoder')
    args = parser.parse_args()

    t = torch.load(args.model)

    # Translate sentences
    end = False
    fin = open(args.input, encoding=args.encoding, errors='surrogateescape')
    fout = open(args.output, mode='w', encoding=args.encoding,
                errors='surrogateescape')

    if args.src_embeddings is not None:
        encoder_embeddings, src_dictionary = data.read_embeddings(
            open(args.src_embeddings, 'r'), threshold=args.cutoff)
        encoder_embeddings = gpu(encoder_embeddings)
        t.decoder_embeddings = gpu(t.decoder_embeddings)
        t.generator = gpu(t.generator)
        t.encoder = gpu(t.encoder)
        t.decoder = gpu(t.decoder)
        translator_new = Translator(encoder_embeddings, t.decoder_embeddings,
                                    t.generator, src_dictionary,
                                    t.trg_dictionary, t.encoder, t.decoder,
                                    t.denoising, t.device)
    else:
        t.device = torch.device('cuda')
        t.encoder = gpu(t.encoder)
        t.decoder = gpu(t.decoder)
        t.encoder_embeddings = gpu(t.encoder_embeddings)
        t.decoder_embeddings = gpu(t.decoder_embeddings)
        t.generator = gpu(t.generator)
        t.src_dictionary = data.Dictionary(t.src_dictionary.id2word[1:])
        t.trg_dictionary = data.Dictionary(t.trg_dictionary.id2word[1:])
        translator_new = Translator(t.encoder_embeddings, t.decoder_embeddings,
                                    t.generator, t.src_dictionary,
                                    t.trg_dictionary, t.encoder, t.decoder,
                                    t.denoising, t.device)

    while not end:
        batch = []
        while len(batch) < args.batch_size and not end:
            line = fin.readline()
            if not line:
                end = True
            else:
                batch.append(line)
        if args.beam_size <= 0 and len(batch) > 0:
            for translation in translator_new.greedy(batch, train=False):
                print(translation, file=fout)
        elif len(batch) > 0:
            translations = translator_new.beam_search(
                batch, train=False, beam_size=args.beam_size, max_ratio=2,
                rnk=6, noiseratio=args.noise, pass_att=args.pass_att,
                ncontrol=args.ncontrol if args.ncontrol != 0 else None)
            if args.pass_att:
                for translation1, trans2 in translations:
                    print(translation1, trans2, file=fout)
            else:
                for translation in translations:
                    print(translation, file=fout)
        fout.flush()
    fin.close()
    fout.close()
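# Illustrative invocation (script and file names are hypothetical):
#   python translate.py --model checkpoint.pt --beam_size 12 -i test.src -o test.hyp
# With --beam_size 0 the loop above falls back to greedy decoding.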
# chapter = i
# if chapter < 10:
#     # input_data = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/Data/Chapters_Parsed/Test_all/Chapitre0{}.alt.txt'.format(chapter)
#     # output = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/patterns/fr/Chap0{}_activations.pkl'.format(chapter)
#     input_data = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/Data/en/Chapter0{}.txt'.format(chapter)
#     output = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/patterns/en/updated/Chap0{}_activations.pkl'.format(chapter)
# else:
#     # input_data = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/Data/Chapters_Parsed/Test_all/Chapitre{}.alt.txt'.format(chapter)
#     # output = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/patterns/fr/Chap{}_activations.pkl'.format(chapter)
#     input_data = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/Data/en/Chapter{}.txt'.format(chapter)
#     output = '/neurospin/unicog/protocols/IRMf/LePetitPrince_Pallier_2018/LSTM/patterns/en/updated/Chap{}_activations.pkl'.format(chapter)

vocab = data.Dictionary(vocabulary)

sentences = []
print(input_data)
for l in open(input_data, 'r'):
    if not l.find("\'") == -1:
        l = l.replace("\'", "\' ")
    sentence = l.rstrip().split(" ")
    sentence = [s.lower() for s in sentence]
    if l[0] != '\n':
        sentences.append(sentence)
# dtype=object keeps the ragged (variable-length) sentences as a 1-D array
sentences = np.array(sentences, dtype=object)
print(sentences)
# sentences = sentences[0:1000]

print('Loading models...')
def main():
    # if output directory doesn't exist, create it
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # set the random seed manually for reproducibility.
    numpy.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    print('\ncommand-line params : {0}\n'.format(sys.argv[1:]))
    print('{0}\n'.format(args))

    ###########################################################################
    # Load data
    ###########################################################################
    dictionary = data.Dictionary()
    tasks = []
    train_dict, dev_dict = {}, {}

    if 'quora' in args.task:
        print('**Task name : Quora**')
        # load quora dataset
        quora_train = data.Corpus(args.data, dictionary)
        quora_train.parse('quora/train.txt', 'quora', args.tokenize,
                          args.max_example)
        print('Found {} pairs of train sentences.'.format(len(quora_train.data)))
        quora_dev = data.Corpus(args.data, dictionary)
        quora_dev.parse('quora/dev.txt', 'quora', args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(quora_dev.data)))
        quora_test = data.Corpus(args.data, dictionary)
        quora_test.parse('quora/test.txt', 'quora', args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(quora_test.data)))
        tasks.append(('quora', 2))
        train_dict['quora'] = quora_train
        dev_dict['quora'] = quora_dev

    if 'snli' in args.task:
        print('**Task name : SNLI**')
        # load snli dataset
        snli_train = data.Corpus(args.data, dictionary)
        snli_train.parse('snli/train.txt', 'snli', args.tokenize, args.max_example)
        print('Found {} pairs of train sentences.'.format(len(snli_train.data)))
        snli_dev = data.Corpus(args.data, dictionary)
        snli_dev.parse('snli/dev.txt', 'snli', args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(snli_dev.data)))
        snli_test = data.Corpus(args.data, dictionary)
        snli_test.parse('snli/test.txt', 'snli', args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(snli_test.data)))
        tasks.append(('snli', 3))
        train_dict['snli'] = snli_train
        dev_dict['snli'] = snli_dev

    if 'multinli' in args.task:
        print('**Task name : Multi-NLI**')
        # load multinli dataset
        multinli_train = data.Corpus(args.data, dictionary)
        multinli_train.parse('multinli/train.txt', 'multinli', args.tokenize,
                             args.max_example)
        print('Found {} pairs of train sentences.'.format(len(multinli_train.data)))
        multinli_dev = data.Corpus(args.data, dictionary)
        multinli_dev.parse('multinli/dev_matched.txt', 'multinli', args.tokenize)
        multinli_dev.parse('multinli/dev_mismatched.txt', 'multinli', args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(multinli_dev.data)))
        multinli_test = data.Corpus(args.data, dictionary)
        multinli_test.parse('multinli/test_matched.txt', 'multinli', args.tokenize)
        multinli_test.parse('multinli/test_mismatched.txt', 'multinli', args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(multinli_test.data)))
        tasks.append(('multinli', 3))
        train_dict['multinli'] = multinli_train
        dev_dict['multinli'] = multinli_dev

    if 'allnli' in args.task:
        print('**Task name : AllNLI**')
        # load allnli dataset
        allnli_train = data.Corpus(args.data, dictionary)
        allnli_train.parse('snli/train.txt', 'snli', args.tokenize, args.max_example)
        allnli_train.parse('multinli/train.txt', 'multinli', args.tokenize,
                           args.max_example)
        print('Found {} pairs of train sentences.'.format(len(allnli_train.data)))
        allnli_dev = data.Corpus(args.data, dictionary)
        allnli_dev.parse('snli/dev.txt', 'snli', args.tokenize)
        allnli_dev.parse('multinli/dev_matched.txt', 'multinli', args.tokenize)
        allnli_dev.parse('multinli/dev_mismatched.txt', 'multinli', args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(allnli_dev.data)))
        allnli_test = data.Corpus(args.data, dictionary)
        allnli_test.parse('snli/test.txt', 'snli', args.tokenize)
        allnli_test.parse('multinli/test_matched.txt', 'multinli', args.tokenize)
        allnli_test.parse('multinli/test_mismatched.txt', 'multinli', args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(allnli_test.data)))
        tasks.append(('allnli', 3))
        train_dict['allnli'] = allnli_train
        dev_dict['allnli'] = allnli_dev

    print('\nvocabulary size = ', len(dictionary))
    # save the dictionary object to use during testing
    helper.save_object(dictionary, args.save_path + 'dictionary.p')

    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                                   args.word_vectors_file,
                                                   dictionary.word2idx)
    print('number of OOV words = ', len(dictionary) - len(embeddings_index))

    ###########################################################################
    # Build the model
    ###########################################################################
    if not tasks:
        return

    model = MultitaskDomainAdapter(dictionary, embeddings_index, args, tasks)
    print(model)
    optim_fn, optim_params = helper.get_optimizer(args.optimizer)
    optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()),
                         **optim_params)
    best_accuracy = 0

    # for training on multiple GPUs. use CUDA_VISIBLE_DEVICES=0,1 to specify
    # which GPUs to use
    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        cuda_visible_devices = [
            int(x) for x in os.environ['CUDA_VISIBLE_DEVICES'].split(',')
        ]
        if len(cuda_visible_devices) > 1:
            model = torch.nn.DataParallel(model, device_ids=cuda_visible_devices)
    if args.cuda:
        model = model.cuda()

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_accuracy = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict']['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    ###########################################################################
    # Train the model
    ###########################################################################
    train = Train(model, optimizer, dictionary, embeddings_index, args,
                  best_accuracy)
    train.set_train_dev_corpus(train_dict, dev_dict)
    train.train_epochs(args.start_epoch, args.epochs)