Example 1
def train(dataroot, classifier_name='cnn'):
    balance = get_balancing_technique()
    K = 10
    fold_prefix = ('{}bal_fold_{{}}.csv' if balance == 'explicit'
                   else '{}r_fold_{{}}.csv').format(K)

    class_weight = get_class_weights(dataroot)

    classifier_args, config = get_args(classifier_name, class_weight)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = join(pre_fingerprint + config, 'K_{}'.format(K))
    print(fingerprint)
    folds_data = load_folds(dataroot, fold_prefix, K)
    for test_index in range(K):
        print('-----------{}----------'.format(test_index))
        X_train = np.concatenate(
            [fold[0] for i, fold in enumerate(folds_data) if i != test_index],
            axis=0)
        y_train = np.concatenate(
            [fold[1] for i, fold in enumerate(folds_data) if i != test_index],
            axis=0)

        logdir = join(fingerprint, 'log', '{}'.format(test_index))
        ensure_dir(logdir)
        classifier_args['runs_dir'] = logdir
        clf = get_classifier(classifier_args)
        clf.fit(X_train, y_train)
        modelname = join(classifier_args['runs_dir'], 'model.pkl')
        with open(modelname, 'wb') as f:
            pickle.dump(clf, f)
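The leave-one-fold-out concatenation above can be reproduced end to end with synthetic data; in this sketch, a scikit-learn tree and random folds stand in for the project's get_classifier and load_folds helpers (that substitution is an assumption for illustration only):

# Minimal self-contained sketch of Example 1's leave-one-fold-out loop.
import numpy as np
from sklearn.tree import DecisionTreeClassifier

K = 10
rng = np.random.default_rng(0)
# Stand-in for load_folds(): K (X, y) pairs of synthetic data.
folds_data = [(rng.normal(size=(100, 8)), rng.integers(0, 3, size=100))
              for _ in range(K)]

for test_index in range(K):
    # Train on every fold except the held-out one.
    X_train = np.concatenate(
        [fold[0] for i, fold in enumerate(folds_data) if i != test_index], axis=0)
    y_train = np.concatenate(
        [fold[1] for i, fold in enumerate(folds_data) if i != test_index], axis=0)
    clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
    X_test, y_test = folds_data[test_index]
    print('fold {}: accuracy {:.2f}'.format(test_index, clf.score(X_test, y_test)))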
Example 2
def train(dataroot, classifier_name='cnn'):
    balance = get_balancing_technique()
    K = 10
    fold_prefix = '{}bal_fold_{}.csv' if balance == 'explicit' else '{}r_fold_{}.csv'

    class_weight = get_class_weights(dataroot)

    classifier_args, config = get_args(classifier_name, class_weight)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = join(pre_fingerprint + config, 'K_{}'.format(K))
    print(fingerprint)
    num_epochs = 40
    for test_index in range(K):
        print('-----------{}----------'.format(test_index))
        # leave out the test fold; the first remaining fold becomes the
        # validation set and the rest are used for training
        dev_indices = [i for i in range(K) if i != test_index]
        val_index = dev_indices[0]
        train_indices = dev_indices[1:]
        val_csv = join(dataroot, fold_prefix.format(K, val_index))
        list_of_train_csvs = [
            join(dataroot, fold_prefix.format(K, i)) for i in train_indices
        ]

        logdir = join(fingerprint, 'log', '{}'.format(test_index))
        ensure_dir(logdir)
        classifier_args['runs_dir'] = logdir
        clf = get_classifier(classifier_args)
        clf.fit(list_of_train_csvs, val_csv, num_epochs)
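Here the folds never leave disk: fit() receives CSV paths plus a validation file. A hedged sketch of how such a fit might stream batches; the chunk size, label column, and train_step hook are assumptions, not the project's actual CNN implementation:

# Hypothetical batch generator for a fit() that trains from CSV paths.
import pandas as pd

def iter_csv_batches(csv_paths, label_col='Label', chunksize=10**5):
    # Yield (X, y) one chunk at a time so no fold is fully resident in memory.
    for path in csv_paths:
        for chunk in pd.read_csv(path, chunksize=chunksize):
            y = chunk[label_col].values
            X = chunk.drop(columns=[label_col]).values
            yield X, y

# Possible use inside fit():
# for epoch in range(num_epochs):
#     for X, y in iter_csv_batches(list_of_train_csvs):
#         train_step(X, y)  # one optimizer step per chunk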
Example 3
def classify(dataroot, classifier_name):
    K = 5
    balance = get_balancing_technique()
    train_data = []
    # single fold: 29M records; 4 folds: 120M records
    # if 20M records require 5% RAM, then 120M require ~30%
    print("Reading the data...")
    tick = time.time()
    label_to_id, id_to_label, _ = get_ids18_mappers()
    num_train_records = 0
    print("Reading 4 folds")

    if balance in ('with_loss', 'no', 'with_loss_sub'):
        fname_format = 'r_fold_{}.csv'
    elif balance == 'explicit':
        fname_format = 'bal_fold_{}.csv'
    else:
        raise ValueError('unknown balancing technique: {}'.format(balance))

    for fold_index in tqdm(range(K)):
        if fold_index == 0:  # fold 0 is held out
            continue
        # 10**6 rows are read per chunk in ~9 min; a fold is 29*10**6 rows
        reader = pd.read_csv(join(dataroot, fname_format.format(fold_index)),
                             chunksize=10**6, usecols=get_cols4ml(),
                             dtype=get_dtype4normalized())
        for df in tqdm(reader):
            y_str = df.Label.values
            x = df.drop(columns=['Label']).values
            train_data.append((x, encode_label(y_str)))
            num_train_records += df.shape[0]
            # extrapolated memory footprint in GiB
            print(df.memory_usage(deep=True).sum() * 799902 / (1024 ** 3))
    tock = time.time()
    print("read data in {:.2f} sec".format(tock - tick))  # ~24 min

    classifier_args, config = get_args(classifier_name, num_class='dummy',
                                       class_weight=None)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))

    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)

    X_train = np.concatenate([fold[0] for fold in train_data], axis=0)
    y_train = np.concatenate([fold[1] for fold in train_data], axis=0)
    classifier_args['runs_dir'] = logdir

    print("Start training")
    tick = time.time()
    clf = get_classifier(classifier_args)
    print("classes")
    print(np.unique(y_train))
    clf.fit(X_train, y_train)
    fn = classifier_args['runs_dir'] + '.pkl'
    with open(fn, 'wb') as f:
        pickle.dump(clf, f)
    print("Done training {} flow records in {:.2f} sec".format(
        y_train.shape[0], time.time() - tick))
Example 4
def train_fold(X_train,y_train,args):
    classifier_name = args['classifier_name']
    runs_dir = args['runs_dir']

    clf = get_classifier(args) # runs_dir is a dir to put training log of the classifier   

    print('fitting the model')
    tick = time.time()
    clf.fit(X_train,y_train)
    tock = time.time()
    duration = tock-tick
    print("Trained data of size {} in {:.0f} min, {:.0f} sec ".format(X_train.shape,duration//60,duration%60))

    if classifier_name in ['tree', 'forest']:
        fn = runs_dir + '.pkl'
        print("Saving to", fn)
        with open(fn, 'wb') as f:
            pickle.dump(clf, f)

    return clf,duration
def classify(dataroot, classifier_name):
    K = 5
    fraction = 1

    # total_records = 6907705  # in fold fraction after removing small classes < K
    folds_df = []
    fold_root = join(dataroot, 'folds_fraction_{}'.format(fraction))
    print("Reading the data...")
    ds_list = []
    for fold_index in range(K):
        df = pd.read_csv(join(fold_root, 'fold_{}.csv'.format(fold_index)))
        folds_df.append(df)
        ds_list.append(df.Label)
    total_df = pd.concat(folds_df)
    total_label_df = pd.concat(ds_list)
    labels = total_label_df.sort_values().unique()
    total_records = total_label_df.shape[0]
    #labels,labels_d = get_labels(total_label_df.unique())
    label_to_id, id_to_label, _ = get_ids18_mappers()
    class_weight = get_class_weights(
        encode_label(total_label_df.values, label_to_id))

    balance = get_balancing_technique()
    # subtract 2: the Label and FlowID columns are removed from X
    input_dim = folds_df[0].shape[1] - 2
    gt_num_class = len(label_to_id)
    num_class = len(labels)
    assert gt_num_class == num_class, \
        'all classes should be observed: gt_classes != observed_classes {} != {}'.format(
            gt_num_class, num_class)

    classifier_args, config = get_args(classifier_name, total_records,
                                       gt_num_class, input_dim, class_weight,
                                       balance)
    pre_fingerprint = join(
        dataroot, 'r_{}_c_{}_k_{}'.format(fraction, classifier_name, K))

    fingerprint = pre_fingerprint + '_mem_constrained' + config
    logdir = join(pre_fingerprint + config, 'log')
    runs_dir = get_runs_dir(logdir)
    classifier_args['runs_dir'] = runs_dir
    clf = get_classifier(classifier_args)
    time_inference(classifier_name, clf, total_df, dataroot)
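get_class_weights is project-specific and not shown; a common inverse-frequency formulation (the same heuristic as scikit-learn's class_weight='balanced') is sketched here as an assumption, not the project's actual definition:

# Hypothetical inverse-frequency class weights: N / (num_classes * count_c).
import numpy as np

def inverse_frequency_weights(y):
    classes, counts = np.unique(y, return_counts=True)
    return {c: len(y) / (len(classes) * n) for c, n in zip(classes, counts)}

print(inverse_frequency_weights(np.array([0, 0, 0, 1, 1, 2])))
# ~{0: 0.67, 1: 1.0, 2: 2.0} -- rarer classes get larger weights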
Example 6
def classify(dataroot, classifier_name='cnn'):
    balance = get_balancing_technique()
    print('balancing technique ', balance)
    if balance == 'explicit':
        train_csv = join(dataroot, 'bal_train.csv')
        # bal_fold_1.csv is already shuffled, so it can serve as the
        # validation set directly
        val_csv = join(dataroot, 'bal_fold_1.csv')
    else:
        train_csv = join(dataroot, 'r_train.csv')
        val_csv = join(dataroot, 'r_fold_1.csv')

    # count records quickly with `wc -l` (Unix only)
    result_val = subprocess.run(['wc', '-l', val_csv], stdout=subprocess.PIPE)
    result_train = subprocess.run(['wc', '-l', train_csv],
                                  stdout=subprocess.PIPE)
    train_records = int(result_train.stdout.split()[0]) - 1  # minus the header row
    val_records = int(result_val.stdout.split()[0]) - 1
    print("Number of train and val records ({},{})".format(
        train_records, val_records))

    num_epochs = 40
    label_to_id, id_to_label, _ = get_ids18_mappers()
    #class_weight = None
    class_weight = get_class_weights(dataroot)
    if balance == 'with_loss_inverse':
        class_weight = 1. / class_weight

    num_class = len(label_to_id)  # we assume all the categories are observed

    classifier_args, config = get_args(classifier_name, num_class,
                                       class_weight)
    pre_fingerprint = join(dataroot, 'c_{}'.format(classifier_name))
    fingerprint = pre_fingerprint + config
    logdir = join(fingerprint, 'log')
    ensure_dir(logdir)
    classifier_args['runs_dir'] = logdir
    clf = get_classifier(classifier_args)
    clf.fit(train_csv, val_csv, num_epochs, train_records, val_records)
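Shelling out to wc -l is fast but Unix-only. A portable stand-in that counts newlines in binary chunks and subtracts the header row:

# Count data records in a CSV without external tools.
def count_records(csv_path, chunk_bytes=2**20):
    lines = 0
    with open(csv_path, 'rb') as f:
        while True:
            buf = f.read(chunk_bytes)
            if not buf:
                break
            lines += buf.count(b'\n')
    return lines - 1  # subtract the header row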
Example 7
def train_and_save_classifier(X_train,y_train,args):
    classifier_name = args['classifier_name']
    balance = args['balance']
    clf = get_classifier(args) # runs_dir is a dir to put training log of the classifier   
    
    if balance == 'explicit':
        tick = time.time()
        X_train, y_train = balance_data(X_train, y_train)
        tock = time.time()
        print("Balanced data in {:.2f} sec".format(tock - tick))

    tick = time.time()
    print("Shuffling data")
    X_train, y_train = shuffle(X_train, y_train)
    print('fitting model')
    clf.fit(X_train,y_train)
    
    if classifier_name in ['tree', 'forest']:
        with open(join(args['runs_dir'],'model.pkl'),'wb') as f:
            pickle.dump(clf,f)

    tock = time.time()
    duration = tock-tick
    print("Trained data of size {} in {:.0f} min, {:.0f} sec ".format(X_train.shape,duration//60,duration%60))
    return
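balance_data is not shown in the snippet; one plausible reading of 'explicit' balancing is oversampling every class to the majority count. A sketch with sklearn.utils.resample, offered as an assumption about the project's behavior:

# Hypothetical balance_data: oversample each class to the largest class size.
import numpy as np
from sklearn.utils import resample, shuffle

def balance_data(X, y, random_state=0):
    classes, counts = np.unique(y, return_counts=True)
    target = counts.max()
    X_parts, y_parts = [], []
    for c in classes:
        Xc, yc = resample(X[y == c], y[y == c], replace=True,
                          n_samples=target, random_state=random_state)
        X_parts.append(Xc)
        y_parts.append(yc)
    return shuffle(np.concatenate(X_parts), np.concatenate(y_parts),
                   random_state=random_state)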
Example 8
    targets = f(dict(images=inputs), signature="reconstructions",
                as_dict=True)["images"]

if settings.dataname in ('faces', 'faces2', 'planes', 'cars', 'chairs',
                         'dlib_cars3d', 'dlib_faces3d'):
    input_shape = [64, 64, 3]
elif settings.dataname == 'dlib_smallnorb':
    input_shape = [64, 64, 1]
else:
    input_shape = list(tr_data_loader.inputs[0].shape)
print('--- Created Dataset ---')

#####################################################
###################### Models #######################
#####################################################
if not dlib: simvae_model = get_classifier(settings, 'simvae')
gen_model = get_classifier(settings, 'gen')  # decoder for the second VAE
residual_enc_model = get_classifier(settings, 'residual_enc')  # encoder for the second VAE
if settings.add_encoder: enc_model = get_classifier(settings, 'enc')
if settings.add_mi_penalty: mi_disc_model = get_classifier(settings, 'mi_disc')
print('--- Created Models ---')

######################################################
############### Learning Rate and Optimizer ##########
######################################################
optimizer_gen = get_optimizers(settings, 'gen')
if settings.add_encoder: optimizer_enc = get_optimizers(settings, 'enc')
if settings.add_mi_penalty:
    optimizer_mi_disc = get_optimizers(settings, 'mi_disc')
if settings.add_infogan_penalty:
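The snippet cuts off inside the add_infogan_penalty branch. The dataname-to-shape chain near the top can also be written as a lookup table; a self-contained sketch, with a hypothetical fallback standing in for tr_data_loader's reported shape:

# Lookup-table form of the input_shape dispatch in Example 8.
def resolve_input_shape(dataname, fallback=lambda: [32, 32, 3]):
    rgb64 = [64, 64, 3]
    shapes = {name: rgb64 for name in
              ('faces', 'faces2', 'planes', 'cars', 'chairs',
               'dlib_cars3d', 'dlib_faces3d')}
    shapes['dlib_smallnorb'] = [64, 64, 1]
    return shapes.get(dataname, fallback())

print(resolve_input_shape('dlib_smallnorb'))  # [64, 64, 1]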