def _get_new_trained_model():
    """Create, cross-score, and fit a fresh linear-kernel SVM.

    Loads labelled samples from the configured positive/negative sample
    directories, reports a score via ``_score_model`` before fitting, then
    fits the classifier on the full training set.

    Returns:
        The fitted ``svm.SVC`` instance.
    """
    logger.info('Training new model')
    # Resolve the two sample directories relative to the install root.
    positive_dir = os.path.join(config.INSTALL_DIR, config.POSITIVE_SAMPLE_DIR)
    negative_dir = os.path.join(config.INSTALL_DIR, config.NEGATIVE_SAMPLE_DIR)
    features, labels = samples.get_samples(positive_dir, negative_dir)

    classifier = svm.SVC(kernel='linear')
    # Score first (e.g. cross-validation) on the unfitted estimator, then fit.
    _score_model(classifier, features, labels)
    logger.info('Fitting new model')
    classifier.fit(features, labels)
    return classifier
def get_chain(sample):
    """Build a ROOT ``TChain`` for the named sample.

    Looks the sample up in the dictionary returned by ``samples.get_samples``
    (keyed by the sample group — the sample name with its trailing
    ``_``-suffix removed), picks the tree name by reco/truth level, and
    attaches the sample's ROOT files.

    Args:
        sample: sample name, e.g. ``"<group>_<variant>"``.

    Returns:
        A ``TChain`` with the sample's file pattern added.
    """
    # BUG FIX: the original split the module-global ``SAMPLE`` here instead of
    # the ``sample`` argument, so every call keyed off the same global name.
    sample_dict = samples.get_samples("_".join(sample.split("_")[:-1]))
    s = sample_dict[sample]
    # Reco-level samples live in "superNt"; truth-level in "SuperTruth".
    if s.is_reco:
        chain = TChain("superNt")
    else:
        chain = TChain("SuperTruth")
    # NOTE(review): the original also assigned ``sr_dict = SRs_reco`` /
    # ``sr_dict = SRs`` in these branches but never used or returned it;
    # the dead local was dropped.
    chain.Add(s.root_file_pattern)
    return chain
    # Tail of the model-registry dict (its opening — presumably
    # ``MODELS = {`` — is outside this view): maps CLI names to model classes.
    'random_forest': RandomForest,
    'multilabel': MultiLabel,
    'nn_multilabel': NeuralMultiLabel,
    'neural': Neural,
    'rnn': RNN,
    'vote': VoteModel
}

if __name__ == '__main__':
    # Select which model class to train from the command line.
    parser = argparse.ArgumentParser(prog=__package__)
    parser.add_argument('--model', choices=MODELS.keys(), default='multilabel')
    args = parser.parse_args()
    print('preprocessing')
    # Build the training corpus: trees + vocabulary + feature matrix.
    trees, max_edus = load_trees(TRAINING_DIR)
    vocab, samples = Vocabulary(trees), get_samples(trees)
    x_train, y_train, sents_idx = get_features(trees, samples, vocab, max_edus)
    print('training')
    # Instantiate the chosen model; all classes appear to accept the same
    # keyword set (extra kwargs presumably ignored by simpler models —
    # TODO confirm against the model constructors).
    model = MODELS[args.model](trees=trees,
                               samples=samples,
                               sents_idx=sents_idx,
                               n_features=len(x_train[0]),
                               models=[SGD, MultiLabel, RandomForest],
                               num_classes=len(ACTIONS),
                               hidden_size=256,
                               batch_size=1024,
                               epochs=100,
                               lr=1e-4,
                               w_decay=1e-5)
    model.train(x_train, y_train)
def start_training(start_epoch, end_epoch, domain_size, batch_size, unet,
                   myoptimizer, warmStart=True, dtype=torch.cuda.FloatTensor):
    """Train ``unet`` against a convolutional physics loss ("0130_horizon" run).

    When ``warmStart`` is true, a short finite-difference flow solve
    (``solve_flow``) seeds the evaluation sample; otherwise a uniform
    ``u0=1.0`` sample is used. Saves periodic checkpoints and loss/RMSE
    history under ``dirName``.

    Args:
        start_epoch, end_epoch: inclusive epoch range to run.
        domain_size: side length of the square domain grid.
        batch_size: training-batch size passed to ``get_training_data``.
        unet: the network to train (trained in place).
        myoptimizer: optimizer already bound to ``unet``'s parameters.
        warmStart: seed evaluation data from a partial flow solve.
        dtype: tensor type; default requires CUDA.
    """
    # # Start training
    # S = torch.zeros(batch_size,3,domain_size,domain_size)
    # Initialize a U-net
    # unet = UNet(dtype, img_size=domain_size).type(dtype)
    # create a new closure of conv_loss object
    get_loss = conv_loss(domain_size=domain_size, dtype=dtype)
    # Specify optimizer
    # optimizer = optim.Adam(unet.parameters(), lr = 1e-4)
    optimizer = myoptimizer
    # Mark experiment
    dirName = "0130_horizon"
    iteration_number, epochs = 10, end_epoch
    # Getting samples and corresponding solutions
    # if not warmStart:
    # S_sample, s_solution = get_samples(domain_size, u0=1.0, warm_start=warmStart)
    # S_sample tuple of 4D torch Tensors with initialization
    # sample, solution = get_samples(domain_size, 1, 0.5, 1, 0, hollowgeometry)
    # Create directory for this exp.
    plotdir = dirName + "/plots"
    try:
        # Create target Directory
        os.mkdir(dirName)
        os.mkdir(plotdir)
        print("Directory ", dirName, " Created ")
    except FileExistsError:
        print("Directory ", dirName, " already exists")
    err_list = []
    loss_list = []
    # start training
    # NOTE(review): time.clock() was removed in Python 3.8 — confirm the
    # runtime, or migrate to time.perf_counter().
    start_time = time.clock()
    if warmStart:
        # Warm start: run a few steps of an incompressible-flow solver to
        # produce reference fields (u_s, v_s, p_s) for the eval sample.
        L = 1  # dimensionless LX / LX
        H = 1  # dimensionless LY / LX
        dx = L / (domain_size - 1)
        dy = H / (domain_size - 1)
        CFL = 0.04
        dt = CFL * min(dx, dy)
        RE = 20  # NOTE(review): assigned but unused here — verify intent.
        u = np.zeros((domain_size, domain_size))
        v = np.zeros((domain_size, domain_size))
        p = np.zeros((domain_size, domain_size))
        early_time_steps = 10
        # Inlet profile: constant 0.2 across the whole top row
        # (blank_part = domain_size + 1 makes the slice cover every column).
        blank_part = domain_size + 1
        u0_vector = np.zeros((1, domain_size))
        u0_vector[0, 0:blank_part - 1] = 0.2
        v0 = 0.2
        u_s, v_s, p_s = solve_flow(early_time_steps, domain_size, domain_size,
                                   u, v, dt, dx, dy, p, u0=u0_vector, v0=v0)
        S_sample, s_solution = get_samples(domain_size, u0=u0_vector, v0=v0,
                                           warm_start=warmStart,
                                           u_h=u_s, v_h=v_s, p_h=p_s)
    else:
        S_sample, s_solution = get_samples(domain_size, u0=1.0)
        u_s, v_s, p_s = None, None, None
    for epoch in range(start_epoch, epochs + 1):
        # an epoch starts
        epoch_loss = 0
        for k in range(iteration_number):
            # an iteration starts
            # for j in range(batch_size):
            T = get_training_data(batch_size, domain_size, warm_start=warmStart)
            # T.requires_grad_(True)
            img = T.requires_grad_(True).type(dtype)
            # img = T.type(dtype)
            output = unet(img)
            loss = get_loss(output)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # float() detaches the scalar so the graph can be freed below.
            epoch_loss += float(loss)
            del loss, T
            del output
        # Conv loss tracking
        epoch_loss = epoch_loss / iteration_number
        print('epoch{}/{}, loss = {:.6f}'.format(epoch, epochs, epoch_loss))
        # if (epoch == 1) or (epoch % 20 == 0):
        generations = unet(S_sample.type(dtype))
        if epoch % 10 == 0:
            show_samples(s_solution[0], s_solution[1], s_solution[2],
                         generations, epoch, plotdir)
        # RMSE error
        u_sol = s_solution[0]
        U_sol = torch.from_numpy(u_sol)
        error = RMSELoss(generations[0, 0, :, :], U_sol, dtype)  # ONLY PLOT U
        err_list.append(error)
        loss_list.append(epoch_loss)
        # Periodic full checkpoint (model + optimizer state).
        if epoch > 1 and epoch % 10000 == 0:
            # torch.save(unet.state_dict(), "{}/history_{}.pth".format(dirName, epoch))
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': unet.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': epoch_loss,
                }, '{}/history_{}.pth'.format(dirName, epoch))
        print("Prediction on sample with RMSE = {:.3f}".format(error))
        del error
        del epoch_loss
    elapsed = time.clock() - start_time
    average_time = elapsed / (epochs - start_epoch)
    # saveRMS(err_list)
    saveRMS(loss_list)
    print("Training ended with {} epochs, running {} seconds per epoch".format(
        epochs, average_time))
    torch.save(unet.state_dict(), '{}/LaplaceHist_{}.pth'.format(dirName, epoch))
def start_training(start_epoch, end_epoch, domain_size, batch_size, unet,
                   dtype=torch.cuda.FloatTensor):
    """Train ``unet`` on a hollow-geometry domain ("1123hollow" run).

    Builds a domain mask with a square hole cut out, gets one reference
    sample/solution pair, then optimizes ``unet`` against the ``conv_loss``
    objective, checkpointing every 200 epochs.

    Args:
        start_epoch, end_epoch: inclusive epoch range to run.
        domain_size: side length of the square domain grid.
        batch_size: training-batch size passed to ``get_training_data``.
        unet: the network to train (trained in place).
        dtype: tensor type; default requires CUDA.
    """
    # # Start training
    T = torch.zeros(batch_size,1,domain_size,domain_size)
    # Initialize a U-net
    # unet = UNet(dtype, img_size=domain_size).type(dtype)
    # create a new closure of conv_loss object
    get_loss = conv_loss(D = 5, dtype = dtype, domain_size = domain_size)
    # Specify optimizer
    optimizer = optim.Adam(unet.parameters(), lr = 2e-4)
    # Mark experiment
    dirName = "1123hollow"
    iteration_number, epochs = 200, end_epoch
    # Getting samples and corresponding solutions
    # Geometry mask: ones everywhere, zero inside a (2d x 2d) square hole
    # centered at (center, center) of the first batch element.
    hollowgeometry = torch.ones(batch_size, 1, domain_size, domain_size)
    center = 10
    d = 3
    hollowgeometry[0,0,center-d:center+d, center-d:center + d] = 0
    sample, solution = get_samples(domain_size, 1, 0.5, 1, 0, hollowgeometry)
    # Create directory for this exp.
    plotdir = dirName + "/plots"
    try:
        # Create target Directory
        os.mkdir(dirName)
        os.mkdir(plotdir)
        print("Directory " , dirName , " Created ")
    except FileExistsError:
        print("Directory " , dirName , " already exists")
    err_list = []
    # start training
    # NOTE(review): time.clock() was removed in Python 3.8 — confirm the
    # runtime, or migrate to time.perf_counter().
    start_time = time.clock()
    for epoch in range(start_epoch, epochs + 1):
        # an epoch starts
        epoch_loss = 0
        for k in range(iteration_number):
            # an iteration starts
            # for j in range(batch_size):
            T = get_training_data(batch_size, domain_size)
            # T.requires_grad_(True)
            img = T.requires_grad_(True).type(dtype)
            # img = T.type(dtype)
            output = unet(img)
            loss = get_loss(output)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # float() detaches the scalar so the graph can be freed below.
            epoch_loss += float(loss)
            del loss, T
            del output
        # Conv loss tracking
        epoch_loss = epoch_loss / iteration_number
        print('epoch{}/{}, loss = {:.3f}'.format(epoch, epochs, epoch_loss))
        # if (epoch == 1) or (epoch % 20 == 0):
        show_samples(solution, unet(sample.type(dtype)), epoch, plotdir,
                     hollowgeometry)
        # RMSE error
        # `solution` is converted from numpy here — presumably get_samples
        # returns a numpy array for the reference field (TODO confirm).
        sol = torch.from_numpy(solution)
        error = RMSELoss(unet(sample.type(dtype)), sol, dtype)
        err_list.append(error)
        # Periodic full checkpoint (model + optimizer state).
        if epoch % 200 == 0:
            # torch.save(unet.state_dict(),
            # "{}/history_{}.pth".format(dirName, epoch))
            torch.save({
                'epoch': epoch,
                'model_state_dict': unet.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': epoch_loss,
            }, '{}/history_{}.pth'.format(dirName, epoch))
        print("Prediction on sample with RMSE = {:.3f}".format(error))
        del error
        del epoch_loss
    elapsed = time.clock() - start_time
    average_time = elapsed / (epochs - start_epoch)
    saveRMS(err_list)
    print("Training ended with {} epochs, running {} seconds per epoch".format(
        epochs, average_time))
    torch.save(unet.state_dict(), '{}/LaplaceHist_{}.pth'.format(dirName, epoch))
# Python 2 script: print per-sample variant-filter statistics as a TSV table.
# Reads the three count dictionaries produced by the companion counting script
# (total_count, any_filter, counts) from a JSON file and emits one row per
# (sample, filter, variant type).
import json
import samples as sp

infile=open("/work/projects/melanomics/analysis/genome/variants2/filter/all.stats.json")
# JSON payload is a 3-element list:
#   total_count[sample][vartype], any_filter[sample][vartype],
#   counts[sample][filter][vartype]
total_count, any_filter, counts = json.load(infile)
#print total_count, any_filter, counts

samples = sp.get_samples()
vartypes=["variants", "snp", "ins", "del", "sub"]
filters = ["Homopolymer", "microsatellites", "repeat masker",
           "segmental duplication", "self-chained regions"]

# Header, then: all-variant counts, any-filter counts, then per-filter counts.
# .get(..., {}).get(..., 0) defaults missing entries to 0.
print "Sample\tFilter\tVariant type\tCount"
for sample in samples:
    for vartype in vartypes:
        print "%s\tAll variants\t%s\t%s" % (
            sample, vartype, total_count.get(sample, {}).get(vartype, 0))
    for vartype in vartypes:
        print "%s\tFiltered\t%s\t%s" % (
            sample, vartype, any_filter.get(sample, {}).get(vartype, 0))
    for filt in filters:
        for vartype in vartypes:
            print "%s\t%s\t%s\t%s" % (
                sample, filt, vartype,
                counts.get(sample, {}).get(filt, {}).get(vartype, 0))
def start_training(start_epoch, end_epoch, domain_size, batch_size, unet,
                   myoptimizer, warmStart=False, dtype=torch.cuda.FloatTensor):
    """Train ``unet`` against a convolutional physics loss ("1211_warm" run).

    Gets one evaluation sample/solution via ``get_samples`` (passing the
    ``warmStart`` flag through), then optimizes ``unet`` with the supplied
    optimizer, checkpointing every 500 epochs and tracking RMSE against the
    reference u-field.

    Args:
        start_epoch, end_epoch: inclusive epoch range to run.
        domain_size: side length of the square domain grid.
        batch_size: training-batch size passed to ``get_training_data``.
        unet: the network to train (trained in place).
        myoptimizer: optimizer already bound to ``unet``'s parameters.
        warmStart: forwarded to get_samples/get_training_data as warm_start.
        dtype: tensor type; default requires CUDA.
    """
    # # Start training
    # S = torch.zeros(batch_size,3,domain_size,domain_size)
    # Initialize a U-net
    # unet = UNet(dtype, img_size=domain_size).type(dtype)
    # create a new closure of conv_loss object
    get_loss = conv_loss(domain_size=domain_size, dtype=dtype)
    # Specify optimizer
    # optimizer = optim.Adam(unet.parameters(), lr = 1e-4)
    optimizer = myoptimizer
    # Mark experiment
    dirName = "1211_warm"
    iteration_number, epochs = 400, end_epoch
    # Getting samples and corresponding solutions
    S_sample, s_solution = get_samples(
        domain_size, u0=1.0, warm_start=warmStart
    )  # S_sample tuple of 4D torch Tensors with initialization
    # sample, solution = get_samples(domain_size, 1, 0.5, 1, 0, hollowgeometry)
    # Create directory for this exp.
    plotdir = dirName + "/plots"
    try:
        # Create target Directory
        os.mkdir(dirName)
        os.mkdir(plotdir)
        print("Directory ", dirName, " Created ")
    except FileExistsError:
        print("Directory ", dirName, " already exists")
    err_list = []
    # start training
    # NOTE(review): time.clock() was removed in Python 3.8 — confirm the
    # runtime, or migrate to time.perf_counter().
    start_time = time.clock()
    for epoch in range(start_epoch, epochs + 1):
        # an epoch starts
        epoch_loss = 0
        for k in range(iteration_number):
            # an iteration starts
            # for j in range(batch_size):
            T = get_training_data(batch_size, domain_size, warm_start=warmStart)
            # T.requires_grad_(True)
            img = T.requires_grad_(True).type(dtype)
            # img = T.type(dtype)
            output = unet(img)
            loss = get_loss(output)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            # float() detaches the scalar so the graph can be freed below.
            epoch_loss += float(loss)
            del loss, T
            del output
        # Conv loss tracking
        epoch_loss = epoch_loss / iteration_number
        print('epoch{}/{}, loss = {:.6f}'.format(epoch, epochs, epoch_loss))
        # if (epoch == 1) or (epoch % 20 == 0):
        generations = unet(S_sample.type(dtype))
        if epoch % 10 == 0:
            show_samples(s_solution[0], s_solution[1], s_solution[2],
                         generations, epoch, plotdir)
        # RMSE error
        u_sol = s_solution[0]
        U_sol = torch.from_numpy(u_sol)
        error = RMSELoss(generations[0, 0, :, :], U_sol, dtype)  # ONLY PLOT U
        err_list.append(error)
        # Periodic full checkpoint (model + optimizer state).
        if epoch > 1 and epoch % 500 == 0:
            # torch.save(unet.state_dict(), "{}/history_{}.pth".format(dirName, epoch))
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': unet.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': epoch_loss,
                }, '{}/history_{}.pth'.format(dirName, epoch))
        print("Prediction on sample with RMSE = {:.3f}".format(error))
        del error
        del epoch_loss
    elapsed = time.clock() - start_time
    average_time = elapsed / (epochs - start_epoch)
    saveRMS(err_list)
    print("Training ended with {} epochs, running {} seconds per epoch".format(
        epochs, average_time))
    torch.save(unet.state_dict(), '{}/LaplaceHist_{}.pth'.format(dirName, epoch))
def perform_pca(channel):
    """Run a 2-component PCA + SVM analysis for one channel (Python 2).

    Fits PCA on background training events only, trains an SVM on the
    PCA-transformed training set, then saves two figures:
    ``pca_<channel>.png`` (decision function + scatter) and
    ``pca_svm_score_<channel>.png`` (score histograms on the test sample).

    Args:
        channel: channel key, e.g. '01jet' or '2jet' — selects the branch
            (feature) list.
    """
    signals, backgrounds = samples.get_samples(channel, purpose='train')
    # Feature list depends on the channel.
    if channel == '01jet':
        branches = features.hh_01jet_vars
    else:
        branches = features.hh_2jet_vars
    X_train, X_test,\
    w_train, w_test,\
    y_train, y_test = samples.make_classification(
        *(samples.make_train_test(signals, backgrounds,
                                  branches=branches,
                                  train_fraction=.5,
                                  max_sig_train=2000,
                                  max_bkg_train=2000,
                                  max_sig_test=2000,
                                  max_bkg_test=2000,
                                  same_size_train=True,
                                  same_size_test=True,
                                  norm_sig_to_bkg_train=True,
                                  norm_sig_to_bkg_test=True)),
        standardize=True)
    print X_train
    print X_test
    print w_train
    print w_test
    print w_train.min(), w_train.max()
    pca = PCA(n_components=2)
    # fit only on background
    pca.fit(X_train[y_train == 0])
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    # Plot limits from the test set, padded by 10% on each side.
    xmin = X_test_pca[:, 0].min()
    xmax = X_test_pca[:, 0].max()
    ymin = X_test_pca[:, 1].min()
    ymax = X_test_pca[:, 1].max()
    width = xmax - xmin
    height = ymax - ymin
    xmin -= width*.1
    xmax += width*.1
    ymin -= height*.1
    ymax += height*.1
    # fit support vector machine on output of PCA
    # NOTE(review): scale_C was removed from modern scikit-learn SVC — this
    # code targets an old sklearn version; confirm before upgrading.
    clf = svm.SVC(C=100, gamma=.01, probability=True, scale_C=True)
    clf.fit(X_train_pca, y_train, sample_weight=w_train)
    # plot the decision function
    xx, yy = np.meshgrid(np.linspace(xmin, xmax, 500),
                         np.linspace(ymin, ymax, 500))
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    channel_name = samples.CHANNEL_NAMES[channel]
    target_names = ['%s Signal' % channel_name,
                    '%s Background' % channel_name]
    target_values = [1, 0]
    # Percentage of variance explained for each components
    print 'explained variance ratio (first two components):', \
        pca.explained_variance_ratio_
    # plot PCA and SVM output
    pl.figure()
    # plot support vector machine decision function
    pl.set_cmap(pl.cm.jet)
    pl.contourf(xx, yy, Z, alpha=0.75)
    # Scatter the test events, marker size proportional to event weight.
    for c, i, target_name in zip("rb", target_values, target_names):
        pl.scatter(X_test_pca[y_test == i, 0],
                   X_test_pca[y_test == i, 1],
                   c=c, label=target_name,
                   s=w_test[y_test == i]*10,
                   alpha=0.9)
    pl.xlim((xmin, xmax))
    pl.ylim((ymin, ymax))
    pl.legend()
    pl.xlabel('Principal Component [arb. units]')
    pl.ylabel('Secondary Component [arb. units]')
    pl.title('Principal Component Analysis\n'
             'and Support Vector Machine Decision Function')
    pl.savefig('pca_%s.png' % channel)
    # testing:
    signals, backgrounds = samples.get_samples(channel, purpose='test',
                                               mass=125)
    signal_train, signal_weight_train, \
    signal_test, signal_weight_test, \
    background_train, background_weight_train, \
    background_test, background_weight_test = samples.make_train_test(
        signals, backgrounds,
        branches=branches,
        train_fraction=.5,
        norm_sig_to_bkg_train=False,
        norm_sig_to_bkg_test=False)
    # Standardize signal+background together, then split back apart so both
    # share the same scaling.
    sample_test = np.concatenate((background_test, signal_test))
    sample_test = samples.std(sample_test)
    background_test, signal_test = sample_test[:len(background_test)], \
        sample_test[len(background_test):]
    signal_test = pca.transform(signal_test)
    background_test = pca.transform(background_test)
    # Histogram the SVM signal probability; signal scaled x10 for visibility.
    pl.figure()
    pl.hist(clf.predict_proba(background_test)[:,-1],
            weights=background_weight_test, bins=30, range=(0, 1),
            label='Background', color='b')
    pl.hist(clf.predict_proba(signal_test)[:,-1],
            weights=signal_weight_test*10, bins=30, range=(0, 1),
            label='Signal x 10', color='r')
    pl.legend()
    pl.ylabel('Events')
    pl.xlabel('Support Vector Machine Signal Probability')
    pl.savefig('pca_svm_score_%s.png' % channel)
            # Tail of a counting function whose definition starts outside this
            # view: tally any-filter hits per sample/vartype.  Indentation of
            # this fragment is reconstructed — TODO confirm nesting depth
            # against the full file.
            if is_filtered:
                if sample not in any_filter:
                    any_filter[sample] = {}
                if vartype not in any_filter[sample]:
                    any_filter[sample][vartype] = 0
                any_filter[sample][vartype] += 1
                any_filter[sample]["variants"] += 1
    return total_count, any_filter, counts


if __name__ == "__main__":
    infile = "/work/projects/melanomics/analysis/genome/variants2/filter/all.out"
    outfile = "/work/projects/melanomics/analysis/genome/variants2/filter/all.stats.json"
    # infile = "/work/projects/melanomics/analysis/genome/variants2/filter/hp_ms_rm_sr_sd/all.hp_ms_rm_sr_sd.out"
    # outfile = "/work/projects/melanomics/analysis/genome/variants2/filter/hp_ms_rm_sr_sd/all.hp_ms_rm_sr_sd.stats.json"
    filters = ["Homopolymer", "microsatellites", "repeat masker",
               "segmental duplication", "self-chained regions"]
    # NOTE(review): this rebinds the name `samples` (presumably the imported
    # module) to the list it returns — works once, but shadows the module.
    samples = samples.get_samples()
    # Initialize empty per-sample tables for the two per-vartype counters.
    total_count = {}
    any_filter = {}
    counts = {}
    for k in [total_count, any_filter]:
        for m in samples:
            k[m] = {}
    for sample in samples:
        total_count[sample]["variants"] = 0
        any_filter[sample]["variants"] = 0
    total = 0
    # Loop body continues beyond this view.
    for line in open(infile):
Linear Discriminant Analysis (LDA) tries to identify attributes that
account for the most variance between classes. In particular, LDA, in
constrast to PCA, is a supervised method, using known class labels.
"""
# Python 2 script (docstring head is outside this view; the text above is
# printed at runtime below, so it is kept verbatim).
print __doc__

import numpy as np
import pylab as pl
from matplotlib.ticker import NullFormatter
# NOTE(review): sklearn.lda was removed in modern scikit-learn
# (now sklearn.discriminant_analysis) — this targets an old version.
from sklearn.lda import LDA

import samples
import features

# Build a standardized, balanced train/test split for the 2-jet channel.
signals, backgrounds = samples.get_samples('2jet', purpose='train')
X_train, X_test,\
w_train, w_test,\
y_train, y_test = samples.make_classification(
    *(samples.make_train_test(signals, backgrounds,
                              branches=features.hh_2jet_vars,
                              train_fraction=.5,
                              same_size_train=True,
                              same_size_test=True)),
    standardize=True)
print X_train
print X_test
print w_train