def add_statistics(self, histo, pkl_name=None, fit=None):
    print 'histo %f .. %f with %d bins' % (histo.GetBinLowEdge(1), histo.GetXaxis().GetBinUpEdge(histo.GetNbinsX()), histo.GetNbinsX())
    if pkl_name is None:
        pkl_name = self.name
    histos = {}
    histos['data'] = histo
    res = {}
    res['mean'] = histo.GetMean()
    res['mean_err'] = histo.GetMeanError()
    print 'Mean: %f' % res['mean']
    # 'stat' summary histogram: each bin stores one scalar summary value for this run
    histos['stat'] = ROOT.TH1F('%s_stat' % pkl_name, 'stat', histo.GetNbinsX(), 0., 1.)
    histos['stat'].SetBinContent(1, self.run_no)
    histos['stat'].SetBinError(1, 0)
    histos['stat'].SetBinContent(2, res['mean'])
    histos['stat'].SetBinError(2, res['mean_err'])
    histos['stat'].SetBinContent(3, histo.GetRMS())
    histos['stat'].SetBinError(3, histo.GetRMSError())
    histos['stat'].SetBinContent(4, histo.Integral())
    histos['stat'].SetBinContent(7, self.nstrips)
    # calibration and fluence values come from the run config (only for 'Dia' runs with a config section)
    if self.run_config_file != '' and self.run_config.has_section('%d' % self.run_no) and self.det_type == 'Dia':
        histos['stat'].SetBinContent(5, eval(self.run_config.get('%d' % self.run_no, 'calibration')))
        histos['stat'].SetBinError(5, eval(self.run_config.get('%d' % self.run_no, 'calibration_err')))
        histos['stat'].SetBinContent(6, eval(self.run_config.get('%d' % self.run_no, 'fluence')) / 1e15)
        histos['stat'].SetBinError(6, eval(self.run_config.get('%d' % self.run_no, 'fluence_err')) / 1e15)
    if fit is not None:
        # store the fit parameters in bins 10-12 (parameter 1 is the mean, parameter 2 the sigma)
        for i in [0, 1, 2]:
            histos['stat'].SetBinContent(i + 10, fit.GetParameter(i))
            histos['stat'].SetBinError(i + 10, fit.GetParError(i))
        res['sigma'] = fit.GetParameter(2)
        res['sigma_err'] = fit.GetParError(2)
        mean = (fit.GetParameter(1), fit.GetParError(1))
        sigma = (fit.GetParameter(2), fit.GetParError(2))
        print 'Mean : %s +- %s' % rn.get_roundedNumber(*mean)
        print 'Sigma: %s +- %s' % rn.get_roundedNumber(*sigma)
    helper.save_object(res, '%s%s_stat.pkl' % (self.output_path, pkl_name))
    return histos
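# A hedged usage sketch of add_statistics: fit a Gaussian to the histogram first and pass
# the fitted TF1 so its parameters end up in bins 10-12 of the 'stat' histogram. The names
# below (fit_fcn, 'signal') are illustrative, not from the original code:
#
#   fit_fcn = ROOT.TF1('fit_fcn', 'gaus', histo.GetXaxis().GetXmin(), histo.GetXaxis().GetXmax())
#   histo.Fit(fit_fcn, 'QR')
#   histos = self.add_statistics(histo, pkl_name='signal', fit=fit_fcn)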
def make_eventDisplays(self):
    pkl_path = '%sevent_displays.pkl' % self.path
    if os.path.exists(pkl_path):
        print '[status] loading %s..' % pkl_path
        histo = helper.load_object(pkl_path)
    else:
        histo = self.scan_events()
        helper.save_object(histo, pkl_path)
    self.draw_events(histo)
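# The caching pattern in make_eventDisplays relies on helper.save_object / helper.load_object.
# A minimal sketch of what such helpers could look like, assuming they are thin pickle
# wrappers (the actual helper module may differ):
import pickle

def save_object(obj, path):
    # serialize any picklable object to the given file path
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_object(path):
    # restore an object previously written by save_object
    with open(path, 'rb') as f:
        return pickle.load(f)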
###############################################################################
# Load train and dev datasets
###############################################################################
train_corpus = data.Corpus(args.tokenize, args.max_query_length, args.max_doc_length)
train_corpus.parse(args.data + 'train.txt', max_example=args.max_example)
print('train set size = ', len(train_corpus))

dev_corpus = data.Corpus(args.tokenize, args.max_query_length, args.max_doc_length)
dev_corpus.parse(args.data + 'dev.txt')
print('development set size = ', len(dev_corpus))

dictionary = data.Dictionary()
dictionary.build_dict(train_corpus, args.max_words)
# save the dictionary object to use during testing
helper.save_object(dictionary, args.save_path + 'dictionary.p')
print('vocabulary size = ', len(dictionary))

embeddings_index = helper.load_word_embeddings(args.word_vectors_directory, args.word_vectors_file,
                                               dictionary.word2idx)
print('number of OOV words = ', len(dictionary) - len(embeddings_index))

###############################################################################
# Build the model
###############################################################################
model = NSRF(dictionary, embeddings_index, args)
print(model)
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), args.lr)
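# helper.load_word_embeddings is assumed to keep only vectors for in-vocabulary words,
# which is why len(dictionary) - len(embeddings_index) counts the OOV words. A minimal
# sketch under that assumption, for a GloVe-style text file with one "word v1 ... vN"
# entry per line; the real helper may differ:
import os
import numpy as np

def load_word_embeddings(directory, filename, word2idx):
    embeddings_index = {}
    with open(os.path.join(directory, filename), encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            if parts[0] in word2idx:
                embeddings_index[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return embeddings_index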
def main():
    ###############################################################################
    # Load data
    ###############################################################################
    dictionary = data.Dictionary()
    train_corpus = data.Corpus(dictionary)
    dev_corpus = data.Corpus(dictionary)
    test_corpus = data.Corpus(dictionary)

    task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task]
    for task in task_names:
        skip_first_line = True if task == 'sick' else False
        train_corpus.parse(task, args.data, 'train.txt', args.tokenize,
                           num_examples=args.max_example, skip_first_line=skip_first_line)
        if task == 'multinli':
            dev_corpus.parse(task, args.data, 'dev_matched.txt', args.tokenize)
            dev_corpus.parse(task, args.data, 'dev_mismatched.txt', args.tokenize)
            test_corpus.parse(task, args.data, 'test_matched.txt', args.tokenize, is_test_corpus=False)
            test_corpus.parse(task, args.data, 'test_mismatched.txt', args.tokenize, is_test_corpus=False)
        else:
            dev_corpus.parse(task, args.data, 'dev.txt', args.tokenize, skip_first_line=skip_first_line)
            test_corpus.parse(task, args.data, 'test.txt', args.tokenize,
                              is_test_corpus=False, skip_first_line=skip_first_line)

    print('train set size = ', len(train_corpus.data))
    print('development set size = ', len(dev_corpus.data))
    print('test set size = ', len(test_corpus.data))
    print('vocabulary size = ', len(dictionary))

    # save the dictionary object to use during testing
    helper.save_object(dictionary, args.save_path + args.task + '_dictionary.pkl')

    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory, args.word_vectors_file,
                                                   dictionary.word2idx)
    print('number of OOV words = ', len(dictionary) - len(embeddings_index))

    ###############################################################################
    # Build the model
    ###############################################################################
    model = SentenceClassifier(dictionary, embeddings_index, args)
    optim_fn, optim_params = helper.get_optimizer(args.optimizer)
    optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()), **optim_params)
    best_acc = 0

    if args.cuda:
        model = model.cuda()

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    ###############################################################################
    # Train the model
    ###############################################################################
    train = Train(model, optimizer, dictionary, embeddings_index, args, best_acc)
    bestmodel = train.train_epochs(train_corpus, dev_corpus, args.start_epoch, args.epochs)
    test_batches = helper.batchify(test_corpus.data, args.batch_size)
    if 'multinli' in task_names:
        print('Skipping evaluating best model. Evaluate using the test script.')
    else:
        test_accuracy, test_f1 = evaluate(bestmodel, test_batches, dictionary)
        print('accuracy: %.2f%%' % test_accuracy)
        print('f1: %.2f%%' % test_f1)
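# The resume branch above expects a checkpoint dict with 'epoch', 'best_acc', 'state_dict'
# and 'optimizer' keys. A hedged sketch of how such a checkpoint could be written after a
# training epoch (hypothetical helper name; the actual Train class may save it differently):
import torch

def save_checkpoint_sketch(model, optimizer, epoch, best_acc, path):
    torch.save({
        'epoch': epoch,                       # epoch to resume from
        'best_acc': best_acc,                 # best dev accuracy seen so far
        'state_dict': model.state_dict(),     # model weights
        'optimizer': optimizer.state_dict(),  # optimizer state (lr, moments, ...)
    }, path)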
threshold_examples:mid_test + threshold_examples]
print('train set size = ', len(train_corpus.data))
print('development set size = ', len(dev_corpus.data))
print('test set size = ', len(test_corpus.data))

# save the dictionary object to use during testing
if os.path.exists(args.output_base_path + args.task + '/' + 'dictionary.p'):
    print('loading dictionary')
    dictionary = helper.load_object(args.output_base_path + args.task + '/' + 'dictionary.p')
else:
    dictionary = data.Dictionary()
    dictionary.build_dict(train_corpus.data, args.max_words)
    helper.save_object(dictionary, args.output_base_path + args.task + '/' + 'dictionary.p')
print('vocabulary size = ', len(dictionary))

###############################################################################
# train = train.Train(model, optimizer, selector, optimizer_selector, dictionary, args, best_acc)
# train.train_epochs(train_corpus, dev_corpus, test_corpus, args.start_epoch, args.epochs)

numpy.random.shuffle(train_corpus.data)  # helper.batchify(train_corpus.data, args.batch_size)
# num_batches = len(train_batches)

# save the model to disk
filename = args.task + '_model.pcl'
# load dictionary
dictionary = helper.load_object(args.data_path + args.dataset + '/dictionary.p')
print('vocabulary size = ', len(dictionary))

if not os.path.exists(args.data_path + args.dataset + '/test_dataset.p'):
    # build test dataset
    test_dataset = dataload.Dataset(args.max_query_len, args.max_doc_len,
                                    args.hist_session_num_limit, args.click_num_limit)
    test_dataset.parse(args.corpus_path + args.dataset + '/test.txt', dictionary, args.max_example)
    print('test set size = ', len(test_dataset))
    # save the test_dataset object
    helper.save_object(test_dataset, args.data_path + args.dataset + '/test_dataset.p')
else:
    # load test dataset
    test_dataset = helper.load_object(args.data_path + args.dataset + '/test_dataset.p')
    print('test set size = ', len(test_dataset))

# build pretrained weight
pretrained_weight = helper.init_embedding_weights(dictionary, args.emb_dim)
# do not test with the pretrained embedding weights; testing uses the loaded trained parameters
# build model
model = LostNet(vocab_size=len(dictionary),
                emb_dim=args.emb_dim,
                max_query_len=args.max_query_len,
                max_doc_len=args.max_doc_len,
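# helper.init_embedding_weights is assumed to return a (vocab_size, emb_dim) embedding
# matrix; since testing restores trained parameters anyway (see the comment above), a
# random placeholder would be enough. A minimal sketch of such a helper (hypothetical,
# the actual implementation may differ):
import numpy as np

def init_embedding_weights(dictionary, emb_dim):
    # one row of small random values per vocabulary entry
    return np.random.uniform(-0.05, 0.05, (len(dictionary), emb_dim)).astype('float32')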
        candid_next_queries = extract_next_queries
        if anchor_query not in anchor_candidates:
            anchor_candidates[anchor_query] = []
        random.shuffle(candid_next_queries)
        for candid_next_query in candid_next_queries:
            candid_sen = dataload.Sentence(tag=True)
            candid_sen.sen2seq(candid_next_query, dictionary, args.max_query_len)
            anchor_candidates[anchor_query].append(candid_sen)
    print('anchor queries size = ', len(anchor_candidates))  # number of anchor queries in the test set
    # save anchor candidates object
    helper.save_object(anchor_candidates, args.data_path + args.dataset + '/anchor_candidates.p')
else:
    # load anchor candidates
    anchor_candidates = helper.load_object(args.data_path + args.dataset + '/anchor_candidates.p')
    print('anchor queries size = ', len(anchor_candidates))

if not os.path.exists(args.data_path + args.dataset + '/test_dataset.p'):
    # build test dataset
    test_dataset = dataload.Dataset(args.max_query_len, args.max_doc_len,
                                    args.hist_session_num_limit, args.click_num_limit)
    test_dataset.parse(args.corpus_path + args.dataset + '/test.txt', dictionary, args.max_example)
    print('test set size = ', len(test_dataset))
def main():
    # if the output directory doesn't exist, create it
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # set the random seed manually for reproducibility
    numpy.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print("WARNING: You have a CUDA device, so you should probably run with --cuda")
        else:
            torch.cuda.manual_seed(args.seed)

    print('\ncommand-line params : {0}\n'.format(sys.argv[1:]))
    print('{0}\n'.format(args))

    ###############################################################################
    # Load data
    ###############################################################################
    dictionary = data.Dictionary()
    tasks = []
    train_dict, dev_dict = {}, {}

    if 'quora' in args.task:
        print('**Task name : Quora**')
        # load quora dataset
        quora_train = data.Corpus(args.data, dictionary)
        quora_train.parse('quora/train.txt', 'quora', args.tokenize, args.max_example)
        print('Found {} pairs of train sentences.'.format(len(quora_train.data)))
        quora_dev = data.Corpus(args.data, dictionary)
        quora_dev.parse('quora/dev.txt', 'quora', args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(quora_dev.data)))
        quora_test = data.Corpus(args.data, dictionary)
        quora_test.parse('quora/test.txt', 'quora', args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(quora_test.data)))
        tasks.append(('quora', 2))
        train_dict['quora'] = quora_train
        dev_dict['quora'] = quora_dev

    if 'snli' in args.task:
        print('**Task name : SNLI**')
        # load snli dataset
        snli_train = data.Corpus(args.data, dictionary)
        snli_train.parse('snli/train.txt', 'snli', args.tokenize, args.max_example)
        print('Found {} pairs of train sentences.'.format(len(snli_train.data)))
        snli_dev = data.Corpus(args.data, dictionary)
        snli_dev.parse('snli/dev.txt', 'snli', args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(snli_dev.data)))
        snli_test = data.Corpus(args.data, dictionary)
        snli_test.parse('snli/test.txt', 'snli', args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(snli_test.data)))
        tasks.append(('snli', 3))
        train_dict['snli'] = snli_train
        dev_dict['snli'] = snli_dev

    if 'multinli' in args.task:
        print('**Task name : Multi-NLI**')
        # load multinli dataset
        multinli_train = data.Corpus(args.data, dictionary)
        multinli_train.parse('multinli/train.txt', 'multinli', args.tokenize, args.max_example)
        print('Found {} pairs of train sentences.'.format(len(multinli_train.data)))
        multinli_dev = data.Corpus(args.data, dictionary)
        multinli_dev.parse('multinli/dev_matched.txt', 'multinli', args.tokenize)
        multinli_dev.parse('multinli/dev_mismatched.txt', 'multinli', args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(multinli_dev.data)))
        multinli_test = data.Corpus(args.data, dictionary)
        multinli_test.parse('multinli/test_matched.txt', 'multinli', args.tokenize)
        multinli_test.parse('multinli/test_mismatched.txt', 'multinli', args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(multinli_test.data)))
        tasks.append(('multinli', 3))
        train_dict['multinli'] = multinli_train
        dev_dict['multinli'] = multinli_dev

    if 'allnli' in args.task:
        print('**Task name : AllNLI**')
        # load allnli (SNLI + MultiNLI) dataset
        allnli_train = data.Corpus(args.data, dictionary)
        allnli_train.parse('snli/train.txt', 'snli', args.tokenize, args.max_example)
        allnli_train.parse('multinli/train.txt', 'multinli', args.tokenize, args.max_example)
        print('Found {} pairs of train sentences.'.format(len(allnli_train.data)))
        allnli_dev = data.Corpus(args.data, dictionary)
        allnli_dev.parse('snli/dev.txt', 'snli', args.tokenize)
        allnli_dev.parse('multinli/dev_matched.txt', 'multinli', args.tokenize)
        allnli_dev.parse('multinli/dev_mismatched.txt', 'multinli', args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(allnli_dev.data)))
        allnli_test = data.Corpus(args.data, dictionary)
        allnli_test.parse('snli/test.txt', 'snli', args.tokenize)
        allnli_test.parse('multinli/test_matched.txt', 'multinli', args.tokenize)
        allnli_test.parse('multinli/test_mismatched.txt', 'multinli', args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(allnli_test.data)))
        tasks.append(('allnli', 3))
        train_dict['allnli'] = allnli_train
        dev_dict['allnli'] = allnli_dev

    print('\nvocabulary size = ', len(dictionary))
    # save the dictionary object to use during testing
    helper.save_object(dictionary, args.save_path + 'dictionary.p')

    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory, args.word_vectors_file,
                                                   dictionary.word2idx)
    print('number of OOV words = ', len(dictionary) - len(embeddings_index))

    ###############################################################################
    # Build the model
    ###############################################################################
    if not tasks:
        return

    model = MultitaskDomainAdapter(dictionary, embeddings_index, args, tasks)
    print(model)
    optim_fn, optim_params = helper.get_optimizer(args.optimizer)
    optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()), **optim_params)
    best_accuracy = 0

    # for training on multiple GPUs, use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use
    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        cuda_visible_devices = [int(x) for x in os.environ['CUDA_VISIBLE_DEVICES'].split(',')]
        if len(cuda_visible_devices) > 1:
            model = torch.nn.DataParallel(model, device_ids=cuda_visible_devices)
    if args.cuda:
        model = model.cuda()

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_accuracy = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict']['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    ###############################################################################
    # Train the model
    ###############################################################################
    train = Train(model, optimizer, dictionary, embeddings_index, args, best_accuracy)
    train.set_train_dev_corpus(train_dict, dev_dict)
    train.train_epochs(args.start_epoch, args.epochs)
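# helper.get_optimizer is assumed to map an optimizer spec string (e.g. "adam" or
# "sgd,lr=0.1") to a torch.optim constructor plus its keyword arguments, matching the
# optim_fn(..., **optim_params) call above. A minimal sketch under that assumption;
# the real helper may parse more options:
import torch.optim as optim

def get_optimizer(spec):
    parts = spec.split(',')
    name, extras = parts[0].lower(), parts[1:]
    params = {}
    for item in extras:
        key, value = item.split('=')
        params[key] = float(value)
    optim_fns = {'adam': optim.Adam, 'sgd': optim.SGD, 'adagrad': optim.Adagrad, 'rmsprop': optim.RMSprop}
    return optim_fns[name], params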