def add_statistics(self, histo, pkl_name = None, fit = None) :
		print 'histo %f .. %f with %d bins' % (histo.GetBinLowEdge(1), histo.GetXaxis().GetBinUpEdge(histo.GetNbinsX()), histo.GetNbinsX())
		if pkl_name is None : pkl_name = self.name
		histos = {}
		histos['data'] = histo
		res = {}
		res['mean'    ] = histo.GetMean()
		res['mean_err'] = histo.GetMeanError()
		print 'Mean: %f' % res['mean']
		histos['stat'] = ROOT.TH1F('%s_stat' % pkl_name, 'stat', histo.GetNbinsX(), 0., 1.)
		histos['stat'].SetBinContent(1, self.run_no    )
		histos['stat'].SetBinError  (1, 0              )
		histos['stat'].SetBinContent(2, res['mean'    ])
		histos['stat'].SetBinError  (2, res['mean_err'])
		histos['stat'].SetBinContent(3, histo.GetRMS     ())
		histos['stat'].SetBinError  (3, histo.GetRMSError())
		histos['stat'].SetBinContent(4, histo.Integral())
		histos['stat'].SetBinContent(7, self.nstrips)
		if self.run_config_file != '' and self.run_config.has_section('%d' % self.run_no) and self.det_type == 'Dia' :
			histos['stat'].SetBinContent(5, eval(self.run_config.get('%d' % self.run_no, 'calibration'    )))
			histos['stat'].SetBinError  (5, eval(self.run_config.get('%d' % self.run_no, 'calibration_err')))
			histos['stat'].SetBinContent(6, eval(self.run_config.get('%d' % self.run_no, 'fluence'    ))/1e15)
			histos['stat'].SetBinError  (6, eval(self.run_config.get('%d' % self.run_no, 'fluence_err'))/1e15)
		if fit is not None :
			for i in [0, 1, 2] :
				histos['stat'].SetBinContent(i+10, fit.GetParameter(i))
				histos['stat'].SetBinError  (i+10, fit.GetParError(i))
			res['sigma'    ] = fit.GetParameter(2)
			res['sigma_err'] = fit.GetParError(2)
			mean  = (fit.GetParameter(1), fit.GetParError(1))
			sigma = (fit.GetParameter(2), fit.GetParError(2))
			print 'Mean : %s +- %s' % rn.get_roundedNumber(*mean )
			print 'Sigma: %s +- %s' % rn.get_roundedNumber(*sigma)
		helper.save_object(res, '%s%s_stat.pkl' % (self.output_path, pkl_name))
		return histos
	def make_eventDisplays(self) :
		pkl_path = '%sevent_displays.pkl' % self.path
		if os.path.exists(pkl_path) :
			print '[status] loading %s..' % pkl_path
			histo = helper.load_object(pkl_path)
		else :
			histo = self.scan_events()
			helper.save_object(histo, pkl_path)
		self.draw_events(histo)
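Every snippet in this listing relies on helper.save_object and helper.load_object, whose implementation is not shown. Below is a minimal sketch of what these helpers presumably look like, assuming they are thin pickle wrappers (this is an assumption, not code from the source):

import pickle

# Hypothetical stand-ins for helper.save_object / helper.load_object,
# assuming plain pickle serialization (not confirmed by the source).
def save_object(obj, path):
    with open(path, 'wb') as f:
        pickle.dump(obj, f)

def load_object(path):
    with open(path, 'rb') as f:
        return pickle.load(f)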
Example #3
###############################################################################

# load train and dev dataset
train_corpus = data.Corpus(args.tokenize, args.max_query_length,
                           args.max_doc_length)
train_corpus.parse(args.data + 'train.txt', max_example=args.max_example)
print('train set size = ', len(train_corpus))
dev_corpus = data.Corpus(args.tokenize, args.max_query_length,
                         args.max_doc_length)
dev_corpus.parse(args.data + 'dev.txt')
print('development set size = ', len(dev_corpus))

dictionary = data.Dictionary()
dictionary.build_dict(train_corpus, args.max_words)
# save the dictionary object to use during testing
helper.save_object(dictionary, args.save_path + 'dictionary.p')
print('vocabulary size = ', len(dictionary))

embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                               args.word_vectors_file,
                                               dictionary.word2idx)
print('number of OOV words = ', len(dictionary) - len(embeddings_index))

# ###############################################################################
# # Build the model
# ###############################################################################

model = NSRF(dictionary, embeddings_index, args)
print(model)
optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                       args.lr)
def main():
    ###############################################################################
    # Load data
    ###############################################################################

    dictionary = data.Dictionary()
    train_corpus = data.Corpus(dictionary)
    dev_corpus = data.Corpus(dictionary)
    test_corpus = data.Corpus(dictionary)

    task_names = ['snli', 'multinli'] if args.task == 'allnli' else [args.task]
    for task in task_names:
        skip_first_line = (task == 'sick')
        train_corpus.parse(task,
                           args.data,
                           'train.txt',
                           args.tokenize,
                           num_examples=args.max_example,
                           skip_first_line=skip_first_line)
        if task == 'multinli':
            dev_corpus.parse(task, args.data, 'dev_matched.txt', args.tokenize)
            dev_corpus.parse(task, args.data, 'dev_mismatched.txt',
                             args.tokenize)
            test_corpus.parse(task,
                              args.data,
                              'test_matched.txt',
                              args.tokenize,
                              is_test_corpus=False)
            test_corpus.parse(task,
                              args.data,
                              'test_mismatched.txt',
                              args.tokenize,
                              is_test_corpus=False)
        else:
            dev_corpus.parse(task,
                             args.data,
                             'dev.txt',
                             args.tokenize,
                             skip_first_line=skip_first_line)
            test_corpus.parse(task,
                              args.data,
                              'test.txt',
                              args.tokenize,
                              is_test_corpus=False,
                              skip_first_line=skip_first_line)

    print('train set size = ', len(train_corpus.data))
    print('development set size = ', len(dev_corpus.data))
    print('test set size = ', len(test_corpus.data))
    print('vocabulary size = ', len(dictionary))

    # save the dictionary object to use during testing
    helper.save_object(dictionary,
                       args.save_path + args.task + '_dictionary.pkl')

    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                                   args.word_vectors_file,
                                                   dictionary.word2idx)
    print('number of OOV words = ', len(dictionary) - len(embeddings_index))

    # ###############################################################################
    # # Build the model
    # ###############################################################################

    model = SentenceClassifier(dictionary, embeddings_index, args)
    optim_fn, optim_params = helper.get_optimizer(args.optimizer)
    optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()),
                         **optim_params)
    best_acc = 0

    if args.cuda:
        model = model.cuda()

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_acc = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # ###############################################################################
    # # Train the model
    # ###############################################################################

    train = Train(model, optimizer, dictionary, embeddings_index, args,
                  best_acc)
    bestmodel = train.train_epochs(train_corpus, dev_corpus, args.start_epoch,
                                   args.epochs)
    test_batches = helper.batchify(test_corpus.data, args.batch_size)
    if 'multinli' in task_names:
        print(
            'Skipping evaluating best model. Evaluate using the test script.')
    else:
        test_accuracy, test_f1 = evaluate(bestmodel, test_batches, dictionary)
        print('accuracy: %.2f%%' % test_accuracy)
        print('f1: %.2f%%' % test_f1)
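The resume logic in the snippet above reads the keys 'epoch', 'best_acc', 'state_dict' and 'optimizer' from the checkpoint. A minimal sketch of the matching save side follows; the file name and the variables in scope are assumptions, not taken from the source:

# Hypothetical checkpoint-saving counterpart to the resume block above.
# 'epoch', 'best_acc', 'model' and 'optimizer' are assumed to be in scope,
# and the file name is an assumption as well.
torch.save({
    'epoch': epoch + 1,
    'best_acc': best_acc,
    'state_dict': model.state_dict(),
    'optimizer': optimizer.state_dict(),
}, args.save_path + 'model_best.pth.tar')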

print('train set size = ', len(train_corpus.data))
print('development set size = ', len(dev_corpus.data))
print('test set size = ', len(test_corpus.data))

# save the dictionary object to use during testing
if os.path.exists(args.output_base_path + args.task + '/' + 'dictionary.p'):
    print('loading dictionary')
    dictionary = helper.load_object(args.output_base_path + args.task + '/' +
                                    'dictionary.p')
else:
    dictionary = data.Dictionary()
    dictionary.build_dict(train_corpus.data, args.max_words)
    helper.save_object(
        dictionary, args.output_base_path + args.task + '/' + 'dictionary.p')

print('vocabulary size = ', len(dictionary))

# ###############################################################################

# train = train.Train(model, optimizer, selector, optimizer_selector, dictionary, args, best_acc)
# train.train_epochs(train_corpus, dev_corpus, test_corpus, args.start_epoch, args.epochs)

numpy.random.shuffle(
    train_corpus.data)  #helper.batchify(train_corpus.data, args.batch_size)

# num_batches=len(train_batches)

# save the model to disk
filename = args.task + '_model.pcl'
Example #6
    #load dictionary
    dictionary = helper.load_object(args.data_path + args.dataset +
                                    '/dictionary.p')
    print('vocabulary size = ', len(dictionary))

    if not os.path.exists(args.data_path + args.dataset + '/test_dataset.p'):
        #build test dataset
        test_dataset = dataload.Dataset(args.max_query_len, args.max_doc_len,
                                        args.hist_session_num_limit,
                                        args.click_num_limit)
        test_dataset.parse(args.corpus_path + args.dataset + '/test.txt',
                           dictionary, args.max_example)
        print('test set size = ', len(test_dataset))
        #save the test_dataset object
        helper.save_object(test_dataset,
                           args.data_path + args.dataset + '/test_dataset.p')
    else:
        #load test dataset
        test_dataset = helper.load_object(args.data_path + args.dataset +
                                          '/test_dataset.p')
        print('test set size = ', len(test_dataset))

    #build pretrained weight
    pretrained_weight = helper.init_embedding_weights(
        dictionary, args.emb_dim)  # do not use pretrained embeddings for testing; use the loaded trained parameters instead

    #build model
    model = LostNet(vocab_size=len(dictionary),
                    emb_dim=args.emb_dim,
                    max_query_len=args.max_query_len,
                    max_doc_len=args.max_doc_len,
Example #7
                candid_next_queries = extract_next_queries

            if anchor_query not in anchor_candidates:
                anchor_candidates[anchor_query] = []
            random.shuffle(candid_next_queries)
            for candid_next_query in candid_next_queries:
                candid_sen = dataload.Sentence(tag=True)
                candid_sen.sen2seq(candid_next_query, dictionary,
                                   args.max_query_len)
                anchor_candidates[anchor_query].append(candid_sen)

        print('anchor queries size = ',
              len(anchor_candidates))  # print the number of anchor queries in the test set
        #save anchor candidates object
        helper.save_object(
            anchor_candidates,
            args.data_path + args.dataset + '/anchor_candidates.p')
    else:
        #load anchor candidates
        anchor_candidates = helper.load_object(args.data_path + args.dataset +
                                               '/anchor_candidates.p')
        print('anchor queries size = ', len(anchor_candidates))

    if not os.path.exists(args.data_path + args.dataset + '/test_dataset.p'):
        #build test dataset
        test_dataset = dataload.Dataset(args.max_query_len, args.max_doc_len,
                                        args.hist_session_num_limit,
                                        args.click_num_limit)
        test_dataset.parse(args.corpus_path + args.dataset + '/test.txt',
                           dictionary, args.max_example)
        print('test set size = ', len(test_dataset))
def main():
    # if output directory doesn't exist, create it
    if not os.path.exists(args.save_path):
        os.makedirs(args.save_path)

    # set the random seed manually for reproducibility.
    numpy.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if torch.cuda.is_available():
        if not args.cuda:
            print(
                "WARNING: You have a CUDA device, so you should probably run with --cuda"
            )
        else:
            torch.cuda.manual_seed(args.seed)

    print('\ncommand-line params : {0}\n'.format(sys.argv[1:]))
    print('{0}\n'.format(args))

    ###############################################################################
    # Load data
    ###############################################################################

    dictionary = data.Dictionary()
    tasks = []
    train_dict, dev_dict = {}, {}

    if 'quora' in args.task:
        print('**Task name : Quora**')
        # load quora dataset
        quora_train = data.Corpus(args.data, dictionary)
        quora_train.parse('quora/train.txt', 'quora', args.tokenize,
                          args.max_example)
        print('Found {} pairs of train sentences.'.format(len(
            quora_train.data)))

        quora_dev = data.Corpus(args.data, dictionary)
        quora_dev.parse('quora/dev.txt', 'quora', args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(quora_dev.data)))

        quora_test = data.Corpus(args.data, dictionary)
        quora_test.parse('quora/test.txt', 'quora', args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(quora_test.data)))

        tasks.append(('quora', 2))
        train_dict['quora'] = quora_train
        dev_dict['quora'] = quora_dev

    if 'snli' in args.task:
        print('**Task name : SNLI**')
        # load snli dataset
        snli_train = data.Corpus(args.data, dictionary)
        snli_train.parse('snli/train.txt', 'snli', args.tokenize,
                         args.max_example)
        print('Found {} pairs of train sentences.'.format(len(
            snli_train.data)))

        snli_dev = data.Corpus(args.data, dictionary)
        snli_dev.parse('snli/dev.txt', 'snli', args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(snli_dev.data)))

        snli_test = data.Corpus(args.data, dictionary)
        snli_test.parse('snli/test.txt', 'snli', args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(snli_test.data)))

        tasks.append(('snli', 3))
        train_dict['snli'] = snli_train
        dev_dict['snli'] = snli_dev

    if 'multinli' in args.task:
        print('**Task name : Multi-NLI**')
        # load multinli dataset
        multinli_train = data.Corpus(args.data, dictionary)
        multinli_train.parse('multinli/train.txt', 'multinli', args.tokenize,
                             args.max_example)
        print('Found {} pairs of train sentences.'.format(
            len(multinli_train.data)))

        multinli_dev = data.Corpus(args.data, dictionary)
        multinli_dev.parse('multinli/dev_matched.txt', 'multinli',
                           args.tokenize)
        multinli_dev.parse('multinli/dev_mismatched.txt', 'multinli',
                           args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(
            multinli_dev.data)))

        multinli_test = data.Corpus(args.data, dictionary)
        multinli_test.parse('multinli/test_matched.txt', 'multinli',
                            args.tokenize)
        multinli_test.parse('multinli/test_mismatched.txt', 'multinli',
                            args.tokenize)
        print('Found {} pairs of test sentences.'.format(
            len(multinli_test.data)))

        tasks.append(('multinli', 3))
        train_dict['multinli'] = multinli_train
        dev_dict['multinli'] = multinli_dev

    if 'allnli' in args.task:
        print('**Task name : AllNLI**')
        # load allnli dataset
        allnli_train = data.Corpus(args.data, dictionary)
        allnli_train.parse('snli/train.txt', 'snli', args.tokenize,
                           args.max_example)
        allnli_train.parse('multinli/train.txt', 'multinli', args.tokenize,
                           args.max_example)
        print('Found {} pairs of train sentences.'.format(
            len(allnli_train.data)))

        allnli_dev = data.Corpus(args.data, dictionary)
        allnli_dev.parse('snli/dev.txt', 'snli', args.tokenize)
        allnli_dev.parse('multinli/dev_matched.txt', 'multinli', args.tokenize)
        allnli_dev.parse('multinli/dev_mismatched.txt', 'multinli',
                         args.tokenize)
        print('Found {} pairs of dev sentences.'.format(len(allnli_dev.data)))

        allnli_test = data.Corpus(args.data, dictionary)
        allnli_test.parse('snli/test.txt', 'snli', args.tokenize)
        allnli_test.parse('multinli/test_matched.txt', 'multinli',
                          args.tokenize)
        allnli_test.parse('multinli/test_mismatched.txt', 'multinli',
                          args.tokenize)
        print('Found {} pairs of test sentences.'.format(len(
            allnli_test.data)))

        tasks.append(('allnli', 3))
        train_dict['allnli'] = allnli_train
        dev_dict['allnli'] = allnli_dev

    print('\nvocabulary size = ', len(dictionary))

    # save the dictionary object to use during testing
    helper.save_object(dictionary, args.save_path + 'dictionary.p')

    embeddings_index = helper.load_word_embeddings(args.word_vectors_directory,
                                                   args.word_vectors_file,
                                                   dictionary.word2idx)
    print('number of OOV words = ', len(dictionary) - len(embeddings_index))

    # ###############################################################################
    # # Build the model
    # ###############################################################################

    if not tasks:
        return

    model = MultitaskDomainAdapter(dictionary, embeddings_index, args, tasks)
    print(model)

    optim_fn, optim_params = helper.get_optimizer(args.optimizer)
    optimizer = optim_fn(filter(lambda p: p.requires_grad, model.parameters()),
                         **optim_params)
    best_accuracy = 0

    # For training on multiple GPUs, use CUDA_VISIBLE_DEVICES=0,1 to specify which GPUs to use (see the example command below).
    if 'CUDA_VISIBLE_DEVICES' in os.environ:
        cuda_visible_devices = [
            int(x) for x in os.environ['CUDA_VISIBLE_DEVICES'].split(',')
        ]
        if len(cuda_visible_devices) > 1:
            model = torch.nn.DataParallel(model,
                                          device_ids=cuda_visible_devices)
    if args.cuda:
        model = model.cuda()
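
    # A hypothetical launch command for the multi-GPU path above; the script
    # name and the exact flags are assumptions, not taken from the source:
    #   CUDA_VISIBLE_DEVICES=0,1 python main.py --task multinli --cuda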

    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_accuracy = checkpoint['best_acc']
            model.load_state_dict(checkpoint['state_dict']['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    # ###############################################################################
    # # Train the model
    # ###############################################################################

    train = Train(model, optimizer, dictionary, embeddings_index, args,
                  best_accuracy)
    train.set_train_dev_corpus(train_dict, dev_dict)
    train.train_epochs(args.start_epoch, args.epochs)