Example #1
def _get_new_trained_model():
    logger.info('Training new model')
    training_samples, training_labels = samples.get_samples(
        os.path.join(config.INSTALL_DIR, config.POSITIVE_SAMPLE_DIR),
        os.path.join(config.INSTALL_DIR, config.NEGATIVE_SAMPLE_DIR))
    model = svm.SVC(kernel='linear')
    _score_model(model, training_samples, training_labels)
    logger.info('Fitting new model')
    model.fit(training_samples, training_labels)

    return model
Example #2
def get_chain(sample):
    sample_dict = samples.get_samples("_".join(sample.split("_")[:-1]))
    s = sample_dict[sample]
    if s.is_reco:
        chain = TChain("superNt")
        sr_dict = SRs_reco
    else:
        chain = TChain("SuperTruth")
        sr_dict = SRs
    chain.Add(s.root_file_pattern)
    return chain
Example #3
    'random_forest': RandomForest,
    'multilabel': MultiLabel,
    'nn_multilabel': NeuralMultiLabel,
    'neural': Neural,
    'rnn': RNN,
    'vote': VoteModel
}

if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog=__package__)
    parser.add_argument('--model', choices=MODELS.keys(), default='multilabel')
    args = parser.parse_args()

    print('preprocessing')
    trees, max_edus = load_trees(TRAINING_DIR)
    vocab, samples = Vocabulary(trees), get_samples(trees)
    x_train, y_train, sents_idx = get_features(trees, samples, vocab, max_edus)

    print('training')
    model = MODELS[args.model](trees=trees,
                               samples=samples,
                               sents_idx=sents_idx,
                               n_features=len(x_train[0]),
                               models=[SGD, MultiLabel, RandomForest],
                               num_classes=len(ACTIONS),
                               hidden_size=256,
                               batch_size=1024,
                               epochs=100,
                               lr=1e-4,
                               w_decay=1e-5)
    model.train(x_train, y_train)
Example #4
def start_training(start_epoch,
                   end_epoch,
                   domain_size,
                   batch_size,
                   unet,
                   myoptimizer,
                   warmStart=True,
                   dtype=torch.cuda.FloatTensor):
    # # Start training
    # S = torch.zeros(batch_size,3,domain_size,domain_size)
    # Initialize a U-net
    # unet = UNet(dtype, img_size=domain_size).type(dtype)

    # create a new closure of conv_loss object
    get_loss = conv_loss(domain_size=domain_size, dtype=dtype)
    # Specify optimizer
    # optimizer = optim.Adam(unet.parameters(), lr = 1e-4)
    optimizer = myoptimizer
    # Mark experiment
    dirName = "0130_horizon"
    iteration_number, epochs = 10, end_epoch
    # Getting samples and corresponding solutions
    # if not warmStart:
    #     S_sample, s_solution = get_samples(domain_size, u0=1.0, warm_start=warmStart) # S_sample tuple of 4D torch Tensors with initialization

    # sample, solution = get_samples(domain_size, 1, 0.5, 1, 0, hollowgeometry)
    # Create directory for this exp.
    plotdir = dirName + "/plots"
    try:
        # Create target Directory
        os.mkdir(dirName)
        os.mkdir(plotdir)
        print("Directory ", dirName, " Created ")
    except FileExistsError:
        print("Directory ", dirName, " already exists")

    err_list = []
    loss_list = []
    # start training
    start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    if warmStart:
        L = 1  # dimensionless LX / LX
        H = 1  # dimensionless LY / LX
        dx = L / (domain_size - 1)
        dy = H / (domain_size - 1)
        CFL = 0.04
        dt = CFL * min(dx, dy)
        RE = 20
        u = np.zeros((domain_size, domain_size))
        v = np.zeros((domain_size, domain_size))
        p = np.zeros((domain_size, domain_size))
        early_time_steps = 10
        blank_part = domain_size + 1
        u0_vector = np.zeros((1, domain_size))
        u0_vector[0, 0:blank_part - 1] = 0.2
        v0 = 0.2
        u_s, v_s, p_s = solve_flow(early_time_steps,
                                   domain_size,
                                   domain_size,
                                   u,
                                   v,
                                   dt,
                                   dx,
                                   dy,
                                   p,
                                   u0=u0_vector,
                                   v0=v0)
        S_sample, s_solution = get_samples(domain_size,
                                           u0=u0_vector,
                                           v0=v0,
                                           warm_start=warmStart,
                                           u_h=u_s,
                                           v_h=v_s,
                                           p_h=p_s)
    else:
        S_sample, s_solution = get_samples(domain_size, u0=1.0)
        u_s, v_s, p_s = None, None, None

    for epoch in range(start_epoch, epochs + 1):
        # an epoch starts
        epoch_loss = 0
        for k in range(iteration_number):
            # an iteration starts
            # for j in range(batch_size):
            T = get_training_data(batch_size,
                                  domain_size,
                                  warm_start=warmStart)
            # T.requires_grad_(True)
            img = T.requires_grad_(True).type(dtype)
            # img = T.type(dtype)
            output = unet(img)
            loss = get_loss(output)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += float(loss)
            del loss, T
            del output
        # Conv loss tracking
        epoch_loss = epoch_loss / iteration_number
        print('epoch{}/{}, loss = {:.6f}'.format(epoch, epochs, epoch_loss))
        # if (epoch == 1) or (epoch % 20 == 0):
        generations = unet(S_sample.type(dtype))
        if epoch % 10 == 0:
            show_samples(s_solution[0], s_solution[1], s_solution[2],
                         generations, epoch, plotdir)
        # RMSE error
        u_sol = s_solution[0]
        U_sol = torch.from_numpy(u_sol)
        error = RMSELoss(generations[0, 0, :, :], U_sol, dtype)  # ONLY PLOT U
        err_list.append(error)
        loss_list.append(epoch_loss)
        if epoch > 1 and epoch % 10000 == 0:
            # torch.save(unet.state_dict(), "{}/history_{}.pth".format(dirName, epoch))
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': unet.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': epoch_loss,
                }, '{}/history_{}.pth'.format(dirName, epoch))

            print("Prediction on sample with RMSE = {:.3f}".format(error))
        del error
        del epoch_loss
    elapsed = time.perf_counter() - start_time
    average_time = elapsed / (epochs - start_epoch)
    # saveRMS(err_list)
    saveRMS(loss_list)
    print("Training ended with {} epochs, running {} seconds per epoch".format(
        epochs, average_time))
    torch.save(unet.state_dict(),
               '{}/LaplaceHist_{}.pth'.format(dirName, epoch))
Example #5
def start_training(start_epoch,
                   end_epoch,
                   domain_size,
                   batch_size,
                   unet,
                   dtype=torch.cuda.FloatTensor):
    # # Start training
    T = torch.zeros(batch_size,1,domain_size,domain_size)
    # Initialize a U-net
    # unet = UNet(dtype, img_size=domain_size).type(dtype)
    # create a new closure of conv_loss object
    get_loss = conv_loss(D=5, dtype=dtype, domain_size=domain_size)
    # Specify optimizer
    optimizer = optim.Adam(unet.parameters(), lr=2e-4)
    # Mark experiment
    dirName = "1123hollow"
    iteration_number, epochs = 200, end_epoch
    # Getting samples and corresponding solutions
    hollowgeometry = torch.ones(batch_size, 1, domain_size, domain_size)
    center = 10
    d = 3
    hollowgeometry[0, 0, center - d:center + d, center - d:center + d] = 0
    sample, solution = get_samples(domain_size, 1, 0.5, 1, 0, hollowgeometry)
    # Create directory for this exp.
    plotdir = dirName + "/plots"
    try:
        # Create target Directory
        os.mkdir(dirName)
        os.mkdir(plotdir)
        print("Directory " , dirName ,  " Created ") 
    except FileExistsError:
        print("Directory " , dirName ,  " already exists")
            
    err_list = []
    # start training
    start_time = time.perf_counter()
    for epoch in range(start_epoch, epochs + 1):
        # an epoch starts    
        epoch_loss = 0
        for k in range(iteration_number):
            # an iteration starts
            # for j in range(batch_size):
            T = get_training_data(batch_size, domain_size)
            # T.requires_grad_(True)
            img = T.requires_grad_(True).type(dtype)
            # img = T.type(dtype)
            output = unet(img)
            loss = get_loss(output)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += float(loss)
            del loss, T
            del output
        # Conv loss tracking    
        epoch_loss = epoch_loss / iteration_number
        print('epoch{}/{}, loss = {:.3f}'.format(epoch, epochs, epoch_loss))
        # if (epoch == 1) or (epoch % 20 == 0):
        show_samples(solution, unet(sample.type(dtype)), epoch, plotdir, hollowgeometry)
        # RMSE error
        sol = torch.from_numpy(solution)
        error = RMSELoss(unet(sample.type(dtype)), sol, dtype)
        err_list.append(error)
        if epoch % 200 == 0:
            # torch.save(unet.state_dict(), "{}/history_{}.pth".format(dirName, epoch))
            torch.save({
                'epoch': epoch,
                'model_state_dict': unet.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': epoch_loss,
                }, '{}/history_{}.pth'.format(dirName, epoch))

            print("Prediction on sample with RMSE = {:.3f}".format(error))
        del error
        del epoch_loss
    elapsed = time.perf_counter() - start_time
    average_time = elapsed / (epochs - start_epoch)
    saveRMS(err_list)
    print("Training ended with {} epochs, running {} seconds per epoch".format(epochs, average_time))
    torch.save(unet.state_dict(), '{}/LaplaceHist_{}.pth'.format(dirName, epoch))
Example #6
import json
import samples as sp

infile=open("/work/projects/melanomics/analysis/genome/variants2/filter/all.stats.json")
total_count, any_filter, counts = json.load(infile)


#print total_count, any_filter, counts
samples = sp.get_samples()
vartypes=["variants", "snp", "ins", "del", "sub"]
filters = ["Homopolymer", "microsatellites", "repeat masker", "segmental duplication", "self-chained regions"]

print "Sample\tFilter\tVariant type\tCount"
for sample in samples:
    for vartype in vartypes:
        print "%s\tAll variants\t%s\t%s" % (sample, vartype, total_count.get(sample, {}).get(vartype, 0))
    for vartype in vartypes:
        print "%s\tFiltered\t%s\t%s" % (sample, vartype, any_filter.get(sample, {}).get(vartype, 0))
 
    for filt in filters:
        for vartype in vartypes:
            print "%s\t%s\t%s\t%s" % (sample, filt, vartype, counts.get(sample, {}).get(filt, {}).get(vartype, 0))

Example #7
def start_training(start_epoch,
                   end_epoch,
                   domain_size,
                   batch_size,
                   unet,
                   myoptimizer,
                   warmStart=False,
                   dtype=torch.cuda.FloatTensor):
    # # Start training
    # S = torch.zeros(batch_size,3,domain_size,domain_size)
    # Initialize a U-net
    # unet = UNet(dtype, img_size=domain_size).type(dtype)

    # create a new closure of conv_loss object
    get_loss = conv_loss(domain_size=domain_size, dtype=dtype)
    # Specify optimizer
    # optimizer = optim.Adam(unet.parameters(), lr = 1e-4)
    optimizer = myoptimizer
    # Mark experiment
    dirName = "1211_warm"
    iteration_number, epochs = 400, end_epoch
    # Getting samples and corresponding solutions
    S_sample, s_solution = get_samples(
        domain_size, u0=1.0, warm_start=warmStart
    )  # S_sample tuple of 4D torch Tensors with initialization

    # sample, solution = get_samples(domain_size, 1, 0.5, 1, 0, hollowgeometry)
    # Create directory for this exp.
    plotdir = dirName + "/plots"
    try:
        # Create target Directory
        os.mkdir(dirName)
        os.mkdir(plotdir)
        print("Directory ", dirName, " Created ")
    except FileExistsError:
        print("Directory ", dirName, " already exists")

    err_list = []
    # start training
    start_time = time.perf_counter()
    for epoch in range(start_epoch, epochs + 1):
        # an epoch starts
        epoch_loss = 0
        for k in range(iteration_number):
            # an iteration starts
            # for j in range(batch_size):
            T = get_training_data(batch_size,
                                  domain_size,
                                  warm_start=warmStart)
            # T.requires_grad_(True)
            img = T.requires_grad_(True).type(dtype)
            # img = T.type(dtype)
            output = unet(img)
            loss = get_loss(output)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += float(loss)
            del loss, T
            del output
        # Conv loss tracking
        epoch_loss = epoch_loss / iteration_number
        print('epoch{}/{}, loss = {:.6f}'.format(epoch, epochs, epoch_loss))
        # if (epoch == 1) or (epoch % 20 == 0):
        generations = unet(S_sample.type(dtype))
        if epoch % 10 == 0:
            show_samples(s_solution[0], s_solution[1], s_solution[2],
                         generations, epoch, plotdir)
        # RMSE error
        u_sol = s_solution[0]
        U_sol = torch.from_numpy(u_sol)
        error = RMSELoss(generations[0, 0, :, :], U_sol, dtype)  # ONLY PLOT U
        err_list.append(error)
        if epoch > 1 and epoch % 500 == 0:
            # torch.save(unet.state_dict(), "{}/history_{}.pth".format(dirName, epoch))
            torch.save(
                {
                    'epoch': epoch,
                    'model_state_dict': unet.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': epoch_loss,
                }, '{}/history_{}.pth'.format(dirName, epoch))

            print("Prediction on sample with RMSE = {:.3f}".format(error))
        del error
        del epoch_loss
    elapsed = time.perf_counter() - start_time
    average_time = elapsed / (epochs - start_epoch)
    saveRMS(err_list)
    print("Training ended with {} epochs, running {} seconds per epoch".format(
        epochs, average_time))
    torch.save(unet.state_dict(),
               '{}/LaplaceHist_{}.pth'.format(dirName, epoch))
Example #8
def perform_pca(channel):

    signals, backgrounds = samples.get_samples(channel, purpose='train')

    if channel == '01jet':
        branches = features.hh_01jet_vars
    else:
        branches = features.hh_2jet_vars

    X_train, X_test,\
    w_train, w_test,\
    y_train, y_test = samples.make_classification(
            *(samples.make_train_test(signals, backgrounds,
                branches=branches,
                train_fraction=.5,
                max_sig_train=2000,
                max_bkg_train=2000,
                max_sig_test=2000,
                max_bkg_test=2000,
                same_size_train=True,
                same_size_test=True,
                norm_sig_to_bkg_train=True,
                norm_sig_to_bkg_test=True)),
            standardize=True)

    print X_train
    print X_test

    print w_train
    print w_test

    print w_train.min(), w_train.max()

    pca = PCA(n_components=2)
    # fit only on background
    pca.fit(X_train[y_train == 0])
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)

    xmin = X_test_pca[:, 0].min()
    xmax = X_test_pca[:, 0].max()
    ymin = X_test_pca[:, 1].min()
    ymax = X_test_pca[:, 1].max()

    width = xmax - xmin
    height = ymax - ymin

    xmin -= width*.1
    xmax += width*.1
    ymin -= height*.1
    ymax += height*.1

    # fit support vector machine on output of PCA
    clf = svm.SVC(C=100, gamma=.01, probability=True, scale_C=True)
    clf.fit(X_train_pca, y_train, sample_weight=w_train)

    # plot the decision function
    xx, yy = np.meshgrid(np.linspace(xmin, xmax, 500), np.linspace(ymin, ymax, 500))

    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    channel_name = samples.CHANNEL_NAMES[channel]
    target_names = ['%s Signal' % channel_name,
                    '%s Background' % channel_name]
    target_values = [1, 0]

    # Percentage of variance explained for each component
    print 'explained variance ratio (first two components):', \
        pca.explained_variance_ratio_

    # plot PCA and SVM output
    pl.figure()
    # plot support vector machine decision function
    pl.set_cmap(pl.cm.jet)
    pl.contourf(xx, yy, Z, alpha=0.75)

    for c, i, target_name in zip("rb", target_values, target_names):
        pl.scatter(X_test_pca[y_test == i, 0], X_test_pca[y_test == i, 1],
                   c=c, label=target_name,
                   s=w_test[y_test == i]*10,
                   alpha=0.9)

    pl.xlim((xmin, xmax))
    pl.ylim((ymin, ymax))
    pl.legend()
    pl.xlabel('Principal Component [arb. units]')
    pl.ylabel('Secondary Component [arb. units]')
    pl.title('Principal Component Analysis\n'
             'and Support Vector Machine Decision Function')
    pl.savefig('pca_%s.png' % channel)


    # testing:
    signals, backgrounds = samples.get_samples(channel, purpose='test',
            mass=125)

    signal_train, signal_weight_train, \
    signal_test, signal_weight_test, \
    background_train, background_weight_train, \
    background_test, background_weight_test = samples.make_train_test(
                signals, backgrounds,
                branches=branches,
                train_fraction=.5,
                norm_sig_to_bkg_train=False,
                norm_sig_to_bkg_test=False)

    sample_test = np.concatenate((background_test, signal_test))
    sample_test = samples.std(sample_test)
    background_test, signal_test = sample_test[:len(background_test)], \
                                   sample_test[len(background_test):]

    signal_test = pca.transform(signal_test)
    background_test = pca.transform(background_test)

    pl.figure()
    pl.hist(clf.predict_proba(background_test)[:,-1],
            weights=background_weight_test, bins=30, range=(0, 1),
            label='Background', color='b')
    pl.hist(clf.predict_proba(signal_test)[:,-1],
            weights=signal_weight_test*10, bins=30, range=(0, 1),
            label='Signal x 10', color='r')
    pl.legend()
    pl.ylabel('Events')
    pl.xlabel('Support Vector Machine Signal Probability')
    pl.savefig('pca_svm_score_%s.png' % channel)
Example #9
    if is_filtered:
        if sample not in any_filter:
            any_filter[sample] = {}
        if vartype not in any_filter[sample]:
            any_filter[sample][vartype] = 0
        any_filter[sample][vartype] += 1
        any_filter[sample]["variants"] += 1
    return total_count, any_filter, counts 

if __name__ == "__main__":
    infile = "/work/projects/melanomics/analysis/genome/variants2/filter/all.out"
    outfile = "/work/projects/melanomics/analysis/genome/variants2/filter/all.stats.json"
    # infile = "/work/projects/melanomics/analysis/genome/variants2/filter/hp_ms_rm_sr_sd/all.hp_ms_rm_sr_sd.out"
    # outfile = "/work/projects/melanomics/analysis/genome/variants2/filter/hp_ms_rm_sr_sd/all.hp_ms_rm_sr_sd.stats.json" 
    filters = ["Homopolymer", "microsatellites", "repeat masker", "segmental duplication", "self-chained regions"]
    samples = samples.get_samples()
                   
    total_count = {}
    any_filter = {}
    counts = {}
    for k in [total_count, any_filter]:
        for m in samples:
            k[m] = {}
    for sample in samples:
        total_count[sample]["variants"] = 0
        any_filter[sample]["variants"] = 0



    total = 0
    for line in open(infile):
Example #10
Linear Discriminant Analysis (LDA) tries to identify attributes that account for
the most variance between classes. In particular, LDA, in contrast to PCA, is a
supervised method that uses known class labels.
"""
print __doc__

import numpy as np
import pylab as pl
from matplotlib.ticker import NullFormatter

from sklearn.lda import LDA

import samples
import features

signals, backgrounds = samples.get_samples('2jet', purpose='train')

X_train, X_test,\
w_train, w_test,\
y_train, y_test = samples.make_classification(
        *(samples.make_train_test(signals, backgrounds,
            branches=features.hh_2jet_vars,
            train_fraction=.5,
            same_size_train=True,
            same_size_test=True)),
        standardize=True)

print X_train
print X_test

print w_train
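The snippet above is cut off before LDA is actually applied. As a minimal sketch of the missing step (a hypothetical continuation, assuming the LDA class imported above and the X_train, y_train, X_test arrays returned by make_classification), the supervised projection described in the docstring would look like this:

# Hypothetical continuation: fit LDA on the training data.
# Unlike PCA, fit() takes the class labels y_train.
lda = LDA(n_components=1)  # two classes -> at most one discriminant axis
X_train_lda = lda.fit(X_train, y_train).transform(X_train)
X_test_lda = lda.transform(X_test)

Compare this with Example #8, where PCA is fitted with pca.fit(X_train[y_train == 0]) and never sees the labels.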