Example #1
def main(_):
    print("-" * 80)
    if not os.path.isdir(FLAGS.output_dir):
        print("Path {} does not exist. Creating.".format(FLAGS.output_dir))
        os.makedirs(FLAGS.output_dir)
    elif FLAGS.reset_output_dir:
        print("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
        shutil.rmtree(FLAGS.output_dir)
        os.makedirs(FLAGS.output_dir)

    print("-" * 80)
    log_file = os.path.join(FLAGS.output_dir, "stdout")
    print("Logging to {}".format(log_file))
    sys.stdout = Logger(log_file)

    utils.print_user_flags()
    train()
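
Several examples on this page assign sys.stdout = Logger(log_file) and later reassign sys.stdout.log (see Example #13), which suggests a tee-style logger. A minimal sketch under that assumption; the actual utils.Logger in these projects may differ:

import sys


class Logger(object):
    """Tee everything written to stdout into a log file as well (sketch)."""

    def __init__(self, log_path):
        self.terminal = sys.stdout      # keep a handle on the real stdout
        self.log = open(log_path, "a")  # mirror all output into this file

    def write(self, message):
        self.terminal.write(message)
        self.log.write(message)

    def flush(self):
        self.terminal.flush()
        self.log.flush()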
Example #2
def Eval_NN():
  print("-" * 80)
  if not os.path.isdir(FLAGS.output_dir):
    print("Path {} does not exist. Creating.".format(FLAGS.output_dir))
    os.makedirs(FLAGS.output_dir)
  elif FLAGS.reset_output_dir:
    print("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
    shutil.rmtree(FLAGS.output_dir)
    os.makedirs(FLAGS.output_dir)

  print("-" * 80)
  log_file = os.path.join(FLAGS.output_dir, "stdout")
  print("Logging to {}".format(log_file))
  sys.stdout = Logger(log_file)

  utils.print_user_flags()

  '''
  # below are for batch evaluation of all arcs defined in the structure_path
  if not FLAGS.structure_path:
    exit()
  with open(FLAGS.structure_path, 'r') as fp:
    lines = fp.readlines()
  lines = [eval(line.strip()) for line in lines]
  structures = []
  for line in lines:
    row = []
    for ele in line:
      row += ele
    structures.append(row) 
  n = len(lines)
  # eval the first structure
  Acc = []
  eva = Eval()
  eva.eval(structures[0])
  eva.eval(structures[1])
  acc = eva.eval(structures[0])
  print(acc)
  pdb.set_trace()
  '''
  eva = Eval()
  return eva
Example #3
def main(_):
    print("-" * 80)
    if not os.path.isdir(FLAGS.output_dir):
        print("Path {} does not exist. Creating.".format(FLAGS.output_dir))
        os.makedirs(FLAGS.output_dir)
    elif FLAGS.reset_output_dir:
        print("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
        shutil.rmtree(FLAGS.output_dir)
        os.makedirs(FLAGS.output_dir)

    print("-" * 80)
    log_file = os.path.join(FLAGS.output_dir, "stdout")
    print("Logging to {}".format(log_file))
    sys.stdout = Logger(log_file)

    utils.print_user_flags()
    model_file = os.path.join(FLAGS.output_dir, "models.csv")

    if FLAGS.child_fixed_arc is None:
        with open(model_file, 'a+') as f:
            headers = ['num_layers', 'accuracy', 'models_arc']
            writer = csv.DictWriter(f,
                                    headers,
                                    delimiter=',',
                                    lineterminator='\n')
            writer.writeheader()
            for i in range(FLAGS.search_from, FLAGS.child_num_layers + 1):
                tf.compat.v1.logging.info(
                    "Searching with constraint, num_layers: %d" % i)
                map_task = train(i)
                for k, v in map_task.items():
                    writer.writerow({
                        'num_layers': i,
                        'accuracy': k,
                        'models_arc': v
                    })
                f.flush()
    else:
        _ = train(FLAGS.child_num_layers)
Example #4
import os

from src import config
from src.utils import IndexDatabase, InvertedIndexBuilder, Logger

if __name__ == '__main__':
    logger = Logger().get_logger(__name__)
    doc_dir = os.getenv('DOCUMENTS_PATH', config.DOCUMENTS_PATH)

    try:
        index = InvertedIndexBuilder(doc_dir)
    except Exception:
        logger.exception('Error building index.')
    else:
        IndexDatabase().write_index(index.get(), index.total_docs_count)
        IndexDatabase().insert_api_keys(config.TOKENS)
Example #5
import abc
import tensorflow as tf
from src.utils import Logger, __fn__, mkdir, filter_params, pickle_dump, pickle_load
import numpy as np
import sys

logger = Logger(__fn__())


class BaseModel(object, metaclass=abc.ABCMeta):

    NAME = 'BaseModel'

    TENSORS = dict(loss='Loss/LOSS',
                   regularizer='Loss/REGL',
                   acc3='Evaluation/ACC3',
                   pred='Output/PRED',
                   alpha='Attention/ALPHA',
                   X='X',
                   asp='asp',
                   lx='lx',
                   y='y',
                   dropout_keep='dropout_keep')

    OPS = dict(train_op='TrainOp/TRAIN_OP')

    OPTIMIZERS = dict(adagrad=tf.train.AdagradOptimizer,
                      adam=tf.train.AdamOptimizer,
                      sgd=tf.train.GradientDescentOptimizer,
                      momentum=tf.train.MomentumOptimizer,
                      rmsprop=tf.train.RMSPropOptimizer)
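
A hedged sketch (not from the source project) of how the TENSORS and OPS name maps above might be consumed, assuming they exist so that named nodes can be looked back up from a restored TF1 graph:

def resolve_graph_nodes(graph):
    # Resolve each logical key to the ':0' output of the named op, and each
    # op key to the operation itself, from a restored tf.Graph.
    tensors = {key: graph.get_tensor_by_name(name + ':0')
               for key, name in BaseModel.TENSORS.items()}
    ops = {key: graph.get_operation_by_name(name)
           for key, name in BaseModel.OPS.items()}
    return tensors, ops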
Example #6
# Load transformer with Adam optimizer and MSE loss function
net = Transformer(d_input,
                  d_model,
                  d_output,
                  q,
                  v,
                  h,
                  N,
                  attention_size=attention_size,
                  dropout=dropout,
                  chunk_mode=chunk_mode,
                  pe=pe).to(device)
optimizer = optim.Adam(net.parameters(), lr=LR)
loss_function = OZELoss(alpha=0.3)

logger = Logger(f'logs/training.csv', params=['loss'])

with tqdm(total=EPOCHS) as pbar:
    # Fit model
    loss = fit(net,
               optimizer,
               loss_function,
               dataloader_train,
               dataloader_val,
               epochs=EPOCHS,
               pbar=pbar,
               device=device)

    # Log
    logger.log(loss=loss)
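
Examples #6 and #8 build Logger with a CSV path and a params list, then call logger.log(loss=loss). A minimal sketch, assuming the class simply appends one row of named values per call (the real implementation also accepts extras such as model_name):

import csv
import os


class Logger:
    """Append one row of named metric values per log() call to a CSV file (sketch)."""

    def __init__(self, csv_path, params=('loss',)):
        self.csv_path = csv_path
        self.params = list(params)
        os.makedirs(os.path.dirname(csv_path) or '.', exist_ok=True)
        with open(csv_path, 'w', newline='') as f:
            csv.writer(f).writerow(self.params)  # header row

    def log(self, **metrics):
        # Metrics are written in the declared column order; missing ones stay blank.
        with open(self.csv_path, 'a', newline='') as f:
            csv.writer(f).writerow([metrics.get(p, '') for p in self.params])

For instance, Logger('logs/training.csv', params=['loss']).log(loss=0.42) would append a single row.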
Example #7
dataloader_train = DataLoader(dataset_train,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=NUM_WORKERS)

dataloader_val = DataLoader(dataset_val,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            num_workers=NUM_WORKERS)

# Start search
n_steps = np.prod(
    [len(search_range) for search_range in search_params.values()])

logger = Logger('search_log.csv', search_params)

with tqdm(total=n_steps * EPOCHS) as pbar:
    for params in itertools.product(*search_params.values()):
        params = {
            key: params[idx]
            for idx, key in enumerate(search_params.keys())
        }
        pbar.set_postfix(params)

        # Load transformer with Adam optimizer and MSE loss function
        net = Transformer(d_input=d_input,
                          d_output=d_output,
                          dropout=dropout,
                          chunk_mode=chunk_mode,
                          pe=pe,
Example #8
                               reduction='none',
                               occupation=occupation),
    'r2_tint':
    lambda y_true, y_pred: np.array([
        r2_score(y_true[:, i, -1], y_pred[:, i, -1])
        for i in range(y_true.shape[1])
    ]),
    'r2_cold':
    lambda y_true, y_pred: np.array([
        r2_score(y_true[:, i, 0:-1], y_pred[:, i, 0:-1])
        for i in range(y_true.shape[1])
    ])
}

logger = Logger(
    f'logs/training.csv',
    model_name=net.name,
    params=[y for key in metrics.keys() for y in (key, key + '_std')])

# Fit model
with tqdm(total=EPOCHS) as pbar:
    loss = fit(net,
               optimizer,
               loss_function,
               dataloader_train,
               dataloader_val,
               epochs=EPOCHS,
               pbar=pbar,
               device=device)

# Switch to evaluation
_ = net.eval()
Example #9
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
]

staining_train_dataset = StainingDataset(dataset_dir=opt.dataroot,
                                         transform=transforms_,
                                         unaligned=True)
dataset_train_loader = DataLoader(staining_train_dataset,
                                  batch_size=opt.batchSize,
                                  shuffle=True,
                                  num_workers=opt.n_cpu)
#dataloader = DataLoader(ImageDataset(opt.dataroot, transforms_=transforms_, unaligned=True),
#                        batch_size=opt.batchSize, shuffle=True, num_workers=opt.n_cpu)
print('Train Model')
im_per_epoch = 10
# Loss plot
logger = Logger(opt.n_epochs, im_per_epoch)
###################################
#
###### Training ######

for epoch in range(opt.epoch, opt.n_epochs):

    for i, batch in enumerate(dataset_train_loader):
        # Set model input
        real_A = Variable(input_A.copy_(batch['HE_image']))
        real_B = Variable(input_B.copy_(batch['C4D_image']))

        ###### Generators A2B and B2A ######
        optimizer_G.zero_grad()

        # Identity loss
Example #10
d_input = 38  # From dataset
d_output = 8  # From dataset

# Config
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")

# Load dataset
ozeDataset = OzeDataset(DATASET_PATH)

# Load network
# Load transformer with Adam optimizer and MSE loss function
loss_function = OZELoss(alpha=0.3)

logger = Logger(f'logs/crossvalidation_log.csv', params=['loss'])

kfoldIterator = kfold(ozeDataset,
                      n_chunk=CHUNKS,
                      batch_size=BATCH_SIZE,
                      num_workers=NUM_WORKERS)

with tqdm(total=CHUNKS * EPOCHS) as pbar:
    for dataloader_train, dataloader_val in kfoldIterator:

        # Load transformer with Adam optimizer and MSE loss function
        # net = Transformer(d_input, d_model, d_output, q, v, h, N, attention_size=attention_size,
        #                   dropout=dropout, chunk_mode=chunk_mode, pe=pe).to(device)
        net = BiGRU(d_input,
                    d_model,
                    d_output,
Example #11
import os

import cv2

from src.utils import Logger, Visualizer

logger = Logger.get_logger('VideoProcessor')


class VideoProcessor(object):
    def __init__(self, path, score_fn, annotated_path):

        self.score_fn = score_fn
        self.annotated_path = annotated_path
        self.visualizer = Visualizer()

        if not os.path.exists(path):
            raise IOError('file {} does not exist'.format(path))
        self.capture = cv2.VideoCapture(path)
        if os.path.exists(annotated_path):
            os.remove(annotated_path)
        self.writer = cv2.VideoWriter(annotated_path,
                                      cv2.VideoWriter_fourcc(*'XVID'), 50.0,
                                      (640, 360))

        while not self.capture.isOpened():
            cv2.waitKey(1000)
            logger.debug('Wait for header')

    def start(self, max_frame_num=2 << 32, fps=1000):
        num_frames = min(int(self.capture.get(cv2.CAP_PROP_FRAME_COUNT)),
Example #12
# for testing purposes:
val_data = (val_data[0][:5000], val_data[1][:5000])
print('WARNING: only using 5000 points for validation')
# test_data = (test_data[0][:500], test_data[1][:500])

print('POLICY: ',args.policy)

# this is the policy by which one should choose acquisition functions
policy = policy_parser(args.policy, args)

# this is the reward that is calculated based on previous acc/val 
# and current acc/val 
reward_process = RewardProcess(args.reward)

# logger to record experiments
logger = Logger(experiment_name=args.policy, folder=args.folder)
logger.save_args(args)
print('Saving to ', logger.save_folder)

print('Starting Experiment')


"""
GET INITIAL ESTIMATE OF VALIDATION ACCURACY
"""
model = cnn(input_shape=x_train.shape[1:],
            output_classes=n_classes,
            bayesian= args.model == 'bayesian',
            train_size=x_train.shape[0],
            weight_constant=weight_constant)
Example #13
def main(_):
  # Prepare directory
  pdb.set_trace()
  print("-" * 80)
  if not os.path.isdir(FLAGS.output_dir):
    print("Path {} does not exist. Creating.".format(FLAGS.output_dir))
    os.makedirs(FLAGS.output_dir)
  elif FLAGS.reset_output_dir:
    print("Path {} exists. Remove and remake.".format(FLAGS.output_dir))
    shutil.rmtree(FLAGS.output_dir)
    os.makedirs(FLAGS.output_dir)

  # Redirect stdout1 --------------------------------------------------------------------------------------------
  print("-" * 80)
  log_file = os.path.join(FLAGS.output_dir, "stdout1")
  if not os.path.exists(log_file):
    os.mknod(log_file)

  print("Logging to {}".format(log_file))
  sys.stdout = Logger(log_file)

  utils.print_user_flags()

  print('Reserving gpu memory...')
  tf.Session()
  # Load pickles file
  print('Loading pickled file...')
  with open('/home/yuwei/projects/vincent/pickleRick/allCrops1.pkl') as p_crop:
      allCrops1 = cPickle.load(p_crop)
  with open('/home/yuwei/projects/vincent/pickleRick/allCrops2.pkl') as p_crop:
      allCrops2 = cPickle.load(p_crop)

  with open('/home/yuwei/projects/vincent/pickleRick/labels.pkl','r') as p_crop:
    labels1 = cPickle.load(p_crop)
    labels2 = cPickle.load(p_crop)
    labels3 = cPickle.load(p_crop)    
    labels1_Brio1 = cPickle.load(p_crop)
    labels1_Brio2 = cPickle.load(p_crop)
    labels2_Brio1 = cPickle.load(p_crop)
  
  # Prepare and divide data
  autoTrainNN = AutoTrain()
  combined_1 = zip(allCrops1 + allCrops2, np.concatenate((labels1,labels2)))
  autoTrainNN.addLabelledData(combined_1)
  # train(autoTrainNN)

  # Redirect stdout2 -------------------------------------------------------------------------------------------
  print("-" * 80)
  log_file = os.path.join(FLAGS.output_dir, "stdout2")
  if not os.path.exists(log_file):
    os.mknod(log_file)

  print("Logging to {}".format(log_file))
  sys.stdout.log = open(log_file, "a") # Change log file

  # Load pickles file
  print('Loading pickled file...')
  with open('/home/yuwei/projects/vincent/pickleRick/allCrops3.pkl') as p_crop:
      allCrops3 = cPickle.load(p_crop)    
  with open('/home/yuwei/projects/vincent/pickleRick/brio1/allCrops1.pkl') as p_crop:
      allCrops1_Brio1 = cPickle.load(p_crop)

  combined_2 = zip(allCrops3 + allCrops1_Brio1, np.concatenate((labels3,labels1_Brio1)))
  autoTrainNN.addLabelledData(combined_2)
  # train(autoTrainNN)

  # Redirect stdout3 --------------------------------------------------------------------------------------------
  print("-" * 80)
  log_file = os.path.join(FLAGS.output_dir, "stdout3")
  if not os.path.exists(log_file):
    os.mknod(log_file)

  print("Logging to {}".format(log_file))
  sys.stdout.log = open(log_file, "a") # Change log file

  utils.print_user_flags()

  # Load pickles file
  print('Loading pickled file...')
  with open('/home/yuwei/projects/vincent/pickleRick/brio2/allCrops1.pkl') as p_crop:
      allCrops1_Brio2 = cPickle.load(p_crop)
  with open('/home/yuwei/projects/vincent/pickleRick/brio1/allCrops2.pkl') as p_crop:
      allCrops2_Brio1 = cPickle.load(p_crop)

  combined_3 = zip(allCrops1_Brio2 + allCrops2_Brio1, np.concatenate((labels1_Brio2,labels2_Brio1)))
  autoTrainNN.addLabelledData(combined_3)
  train(autoTrainNN)
Example #14
def train(model,
          optimizer,
          criterion,
          train_loader,
          num_epoch,
          device,
          val_loader=None,
          scheduler=None,
          save_best=True,
          weights_path='',
          model_name='best_model.pt'):
    """
     Starts training process of the input model, using specified optimizer

    :param model: torch model
    :param optimizer: torch optimizer
    :param criterion: torch criterion
    :param train_loader: torch dataloader instance of training set
    :param val_loader: torch dataloader instance of validation set
    :param num_epoch: number of epochs to train
    :param device: device to train on
    """

    loss_logger = Logger()
    best_loss = float('inf')

    for epoch in range(num_epoch):
        model.train()
        loss_logger.reset()
        for sample in train_loader:
            X, Y_true = sample['X'], sample['Y']

            # transfer tensors to the current device
            X = X.to(device)
            Y_true = Y_true.to(device)

            # zero all gradients
            optimizer.zero_grad()

            # forward propagate
            Y_pred = model(X)
            loss = criterion(Y_pred, Y_true)
            loss_logger.update(loss.item())

            # backprop and update the params
            loss.backward()
            optimizer.step()

        print(f"Epoch: {epoch} | Train loss: {loss_logger.average} |", end=" ")

        # evaluation of model performance on validation set
        loss_logger.reset()
        model.eval()
        for sample in val_loader:
            X = sample['X'].to(device)
            Y_true = sample['Y'].to(device)

            with torch.no_grad():
                Y_pred = model(X)
                val_loss = criterion(Y_pred, Y_true)
            loss_logger.update(val_loss.item())

        print(f"Val loss: {loss_logger.average}")

        # scheduler
        if scheduler:
            scheduler.step(loss_logger.average)

        # save the best model
        if loss_logger.average < best_loss and save_best:
            save_model(model, os.path.join(weights_path, model_name))
            best_loss = loss_logger.average

        # save checkpoint
        save_model(model, os.path.join(weights_path, 'checkpoint.pt'))
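
The Logger used in train() above behaves like a running-average meter with reset(), update() and an average attribute. A minimal sketch under that assumption:

class Logger:
    """Running average of the values passed to update() (sketch)."""

    def __init__(self):
        self.reset()

    def reset(self):
        self.total = 0.0
        self.count = 0

    def update(self, value, n=1):
        self.total += value * n
        self.count += n

    @property
    def average(self):
        return self.total / self.count if self.count else 0.0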
Example #15
def main(args):
    if args.seed is not None:
        np.random.seed(args.seed)
        random.seed(args.seed)

    ## Options
    dataset = args.dataset
    cluster_option = args.cluster_option
    data_dir = osp.join(args.data_dir, dataset)
    output_path = data_dir
    if not osp.exists(data_dir):
        os.makedirs(data_dir)

    ## plotting options
    plot_option_clusters_vs_lambda = args.plot_option_clusters_vs_lambda
    plot_option_fairness_vs_clusterE = args.plot_option_fairness_vs_clusterE
    plot_option_balance_vs_clusterE = args.plot_option_balance_vs_clusterE
    plot_option_convergence = args.plot_option_convergence

    # ###  Data load
    savepath_compare = osp.join(data_dir, dataset + '.npz')
    if not os.path.exists(savepath_compare):
        X_org, demograph, K = read_dataset(dataset, data_dir)
        if X_org.shape[0] > 200000:
            np.savez_compressed(savepath_compare,
                                X_org=X_org,
                                demograph=demograph,
                                K=K)
        else:
            np.savez(savepath_compare, X_org=X_org, demograph=demograph, K=K)

    else:
        datas = np.load(savepath_compare)
        X_org = datas['X_org']
        demograph = datas['demograph']
        K = datas['K'].item()

    log_path = osp.join(data_dir, cluster_option + '_log.txt')
    sys.stdout = Logger(log_path)
    # Scale and Normalize Features
    X_org = scale(X_org, axis=0)
    X = normalizefea(X_org)

    N, D = X.shape
    print('Cluster number for dataset {} is {}'.format(dataset, K))
    V_list = [np.array(demograph == j) for j in np.unique(demograph)]
    V_sum = [x.sum() for x in V_list]
    print('Balance of the dataset {}'.format(min(V_sum) / max(V_sum)))

    print('Number of points in the dataset {}'.format(N))
    #    J = len(V_sum)

    # demographic probability for each V_j

    u_V = [x / N for x in V_sum]  #proportional
    print('Demographic-probabilites: {}'.format(u_V))
    print('Demographic-numbers per group: {}'.format(V_sum))

    #############################################################################

    ######################## Run Fair clustering #################################

    #############################################################################
    #
    fairness = True  # Setting False only runs unfair clustering

    elapsetimes = []
    avg_balance_set = []
    min_balance_set = []
    fairness_error_set = []
    E_cluster_set = []
    E_cluster_discrete_set = []
    bestacc = 1e10
    best_avg_balance = -1
    best_min_balance = -1

    if args.lmbda_tune:
        print('Lambda tune is true')
        lmbdas = np.arange(0, 10000, 100).tolist()
    else:
        lmbdas = [args.lmbda]

    length_lmbdas = len(lmbdas)

    l = None

    if 'A' not in locals() and cluster_option == 'ncut':
        alg_option = 'flann' if N > 50000 else 'None'
        affinity_path = osp.join(data_dir, dataset + '_affinity_ncut.npz')
        knn = 20
        if not osp.exists(affinity_path):
            A = utils.create_affinity(X,
                                      knn,
                                      savepath=affinity_path,
                                      alg=alg_option)
        else:
            A = utils.create_affinity(X, knn, W_path=affinity_path)

    init_C_path = osp.join(
        data_dir, '{}_init_{}_{}.npz'.format(dataset, cluster_option, K))
    if not osp.exists(init_C_path):
        print('Generating initial seeds')
        C_init, l_init = km_init(X, K, 'kmeans_plus')
        np.savez(init_C_path, C_init=C_init, l_init=l_init)

    else:
        temp = np.load(init_C_path)
        C_init = temp['C_init']  # Load initial seeds
        l_init = temp['l_init']

    for count, lmbda in enumerate(lmbdas):

        print('Inside Lambda ', lmbda)

        if cluster_option == 'ncut':

            C, l, elapsed, S, E = fair_clustering(X,
                                                  K,
                                                  u_V,
                                                  V_list,
                                                  lmbda,
                                                  fairness,
                                                  cluster_option,
                                                  C_init=C_init,
                                                  l_init=l_init,
                                                  A=A)

        else:

            C, l, elapsed, S, E = fair_clustering(X,
                                                  K,
                                                  u_V,
                                                  V_list,
                                                  lmbda,
                                                  fairness,
                                                  cluster_option,
                                                  C_init=C_init,
                                                  l_init=l_init)

        min_balance, avg_balance = get_fair_accuracy(u_V, V_list, l, N, K)
        fairness_error = get_fair_accuracy_proportional(u_V, V_list, l, N, K)

        print(
            'lambda = {}, \n fairness_error {: .2f} and \n avg_balance = {: .2f} \n min_balance = {: .2f}'
            .format(lmbda, fairness_error, avg_balance, min_balance))

        # Plot the figure with clusters

        if dataset in ['Synthetic', 'Synthetic-unequal'
                       ] and plot_option_clusters_vs_lambda == True:
            cluster_plot_location = osp.join(output_path, 'cluster_output')
            if not osp.exists(cluster_plot_location):
                os.makedirs(cluster_plot_location)

            filename = osp.join(
                cluster_plot_location,
                'cluster-plot_fair_{}-{}_lambda_{}.png'.format(
                    cluster_option, dataset, lmbda))
            plot_clusters_vs_lambda(X_org, l, filename, dataset, lmbda,
                                    fairness_error)
    #
        if avg_balance > best_avg_balance:
            best_avg_balance = avg_balance
            best_lambda_avg_balance = lmbda

        if min_balance > best_min_balance:
            best_min_balance = min_balance
            best_lambda_min_balance = lmbda

        if fairness_error < bestacc:
            bestacc = fairness_error
            best_lambda_acc = lmbda

        if plot_option_convergence == True and count == 0:

            filename = osp.join(
                output_path,
                'Fair_{}_convergence_{}.png'.format(cluster_option, dataset))
            E_fair = E['fair_cluster_E']
            plot_convergence(cluster_option, filename, E_fair)

        print('Best fairness_error %0.4f' % bestacc, '|Error lambda = ',
              best_lambda_acc)
        print('Best  Avg balance %0.4f' % best_avg_balance,
              '| Avg Balance lambda = ', best_lambda_avg_balance)
        print('Best  Min balance %0.4f' % best_min_balance,
              '| Min Balance lambda = ', best_lambda_min_balance)
        elapsetimes.append(elapsed)
        avg_balance_set.append(avg_balance)
        min_balance_set.append(min_balance)
        fairness_error_set.append(fairness_error)
        E_cluster_set.append(E['cluster_E'][-1])
        E_cluster_discrete_set.append(E['cluster_E_discrete'][-1])

    avgelapsed = sum(elapsetimes) / len(elapsetimes)
    print('avg elapsed ', avgelapsed)

    if plot_option_fairness_vs_clusterE == True and length_lmbdas > 1:

        savefile = osp.join(
            data_dir, 'Fair_{}_fairness_vs_clusterEdiscrete_{}.npz'.format(
                cluster_option, dataset))
        filename = osp.join(
            output_path, 'Fair_{}_fairness_vs_clusterEdiscrete_{}.png'.format(
                cluster_option, dataset))
        plot_fairness_vs_clusterE(cluster_option, savefile, filename, lmbdas,
                                  fairness_error_set, min_balance_set,
                                  avg_balance_set, E_cluster_discrete_set)

    if plot_option_balance_vs_clusterE == True and length_lmbdas > 1:

        savefile = osp.join(
            data_dir, 'Fair_{}_balance_vs_clusterEdiscrete_{}.npz'.format(
                cluster_option, dataset))
        filename = osp.join(
            output_path, 'Fair_{}_balance_vs_clusterEdiscrete_{}.png'.format(
                cluster_option, dataset))

        plot_balance_vs_clusterE(cluster_option, savefile, filename, lmbdas,
                                 fairness_error_set, min_balance_set,
                                 avg_balance_set, E_cluster_discrete_set)
Example #16
SSD_TO_RAW_CLASS_MAPPING = {
    7: 1,  # vehicle
    15: 2,  # pedestrian
    2: 3,  # cyclist
    # 21: 20, # traffic lights
}

RAW_TO_SSD_CLASS_MAPPING = {
    1: 7,  # vehicle
    2: 15,  # pedestrian
    3: 2,  # cyclist
    # 20: 21, # traffic lights
}

logger = Logger.get_logger('SSD')


class SSDModel(BaseModel):
    """ SSD Model """
    def __init__(self):
        BaseModel.__init__(self, ModelConstants.MODEL_NAME)

        self.session = None
        self.image_4d = None
        self.predictions = None
        self.localisations = None
        self.img_input = None  # tf placeholder
        self.bbox_img = None
        self.net_shape = (300, 300)
        self.ssd_anchors = None
Example #17
d_output = 8  # From dataset

# Config
sns.set()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")

# Load dataset
ozeDataset = OzeDataset(DATASET_PATH)

# Load network
# Load transformer with Adam optimizer and MSE loss function
loss_function = OZELoss(alpha=0.3)


logger = Logger('learningcurve_log.csv')

learningcurveIterator = leargnin_curve(ozeDataset,
                                       n_part=PARTS,
                                       validation_split=VALIDATION_SPLIT,
                                       batch_size=BATCH_SIZE,
                                       num_workers=NUM_WORKERS)

with tqdm(total=PARTS*EPOCHS) as pbar:
    for dataloader_train, dataloader_val in learningcurveIterator:

        # Load transformer with Adam optimizer and MSE loss function
        net = Transformer(d_input, d_model, d_output, q, v, h, N, attention_size=attention_size,
                          dropout=dropout, chunk_mode=chunk_mode, pe=pe).to(device)

        optimizer = optim.Adam(net.parameters(), lr=LR)

        # Fit model
        loss = fit(net, optimizer, loss_function, dataloader_train,
Example #18
import os

from src.utils import Config, Logger

logger = Logger.get_logger('BaseModel')


class BaseModel(object):
    def __init__(self, model_name):
        self.asset_dir = os.path.join(Config.get('models_dir'), model_name)
        os.system('mkdir -p {}'.format(self.asset_dir))
        self.asset_url_map = {}

        model_configs = Config.get('models')
        for conf in model_configs:
            if conf.get('name') == model_name:
                asset_urls = conf.get('asset_urls')
                for asset in asset_urls:
                    self.asset_url_map[asset['name']] = asset['url']

    def _download_asset(self, asset_name):

        logger.debug('Downloading asset: {}'.format(asset_name))
        full_asset_name = os.path.join(self.asset_dir, asset_name)

        if os.path.exists(full_asset_name):
            logger.debug('Skip downloading, use cached files instead.')
            return

        os.system('wget {} -O {}'.format(self.asset_url_map.get(asset_name),
                                         full_asset_name))
Example #19
import os
import ujson
from src.model import SSDModel
from src.utils import Config, Logger, VideoProcessor

logger = Logger.get_logger('ServeHandler')


class ServeHandler(object):
    model = None
    scores = []
    frame_cnt = 0
    use_precomputed = False

    @classmethod
    def handle(cls):

        if Config.get('model') == 'ssd':
            cls.model = SSDModel()

        logger.debug('Start serving ...')
        full_video_path = os.path.join(Config.get('videos_dir'),
                                       Config.get('serve').get('video'))

        url = None
        precomputed_labels = None
        full_annotated_path = None
        confs = Config.get('videos')
        for conf in confs:
            if conf.get('name') == Config.get('serve').get('video'):
                url = conf.get('url')
Example #20
    def __init__(self):
        self._empty_query_msg = 'Empty search query.'
        self._invalid_result_size_msg = 'Invalid result size.'

        self.__index = InvertedIndex()
        self.__logger = Logger().get_logger(__name__)
Example #21
from src.data import Processor
from src.utils import Config, Logger
import urllib

logger = Logger.get_logger('TrainHandler')


class TrainHandler(object):

    train_sets = Config.get('train').get('train_sets', [])
    test_sets = Config.get('train').get('test_sets', [])

    @classmethod
    def handle(cls):
        cls._download_data()
        cls._convert_data()
        cls._split_data()
        cls._train()

    @classmethod
    def _download_data(cls):
        logger.debug('Fetching data sets: ' + str(cls.train_sets))
        for name in cls.train_sets:
            Processor.download(name)
        for name in cls.test_sets:
            Processor.download(name)

    @classmethod
    def _convert_data(cls):
        pass
Example #22
class DataReader:
    """
    Class for loading and processing raw tweets.

    Attributes
    ----------
    df : pd.DataFrame
        Data frame with raw text and cleared tokens.
        Columns:
            Name: raw_tweets, dtype: str
            Name: tokens, dtype: List[str]
            Name: tokens_count, dtype: int
            Name: tag, dtype: int
    """
    def __init__(self,
                 text_file: str,
                 tags_file: str = None,
                 force_reload: bool = False) -> None:
        self._logger = Logger('io')
        self._preprocessor = Preprocessor()
        self.df = self._load_data(text_file, tags_file, force_reload)
        self._stats = None
        self.stats

    def _load_data(self,
                   tweets_path: str,
                   tags_path: str,
                   force_reload: bool = False) -> pd.DataFrame:
        """
        Load dataframe with cleared and tokenized tweets.

        First tries to load processed data from pickle.
        If pickle not found, or ``force_reload`` is True, reads raw data and run processing.

        Parameters
        ----------
        tweets_path : str
            Name of a file with raw texts.
        tags_path : str
            Name of a file with tags.
        force_reload : bool
            If true loads from raw data even if pickle found.

        Returns
        -------
        pd.DataFrame
            Data frame with raw text and cleared tokens.
        """
        pickle_path = tweets_path.replace('.txt',
                                          '.pkl').replace('raw', 'processed')
        pickle_folder, pickle_name = os.path.split(pickle_path)

        if pickle_name in os.listdir(pickle_folder) and not force_reload:
            self._logger.log('reading from pickle')
            with open(pickle_path, "rb") as f:
                df = pickle.load(f)
        else:
            self._logger.log('processing raw data')
            df = self._build_dataframe(tweets_path, tags_path)

        self._logger.log('data ready')
        return df

    def _build_dataframe(self, tweets_path: str,
                         tags_path: str) -> pd.DataFrame:
        """
        Clear and tokenize raw texts.
        Pickle processed data

        Parameters
        ----------
        tweets_path : str
            Name of a file with raw texts.
        tags_path : str
            Name of a file with tags.

        Returns
        -------
        pd.DataFrame
            Data frame with raw text and cleared tokens.
        """
        with open(tweets_path) as f:
            raw_tweets = f.readlines()

            df = pd.DataFrame(raw_tweets, columns=['raw_tweets'])
            df['tokens'] = self._preprocessor.transform(raw_tweets)
            df['tokens_count'] = df['tokens'].apply(len)

            if tags_path is not None:
                df['tag'] = pd.read_fwf(tags_path, header=None)[0]
            else:
                df['tag'] = np.nan

            pickle_path = tweets_path.replace('.txt', '.pkl').replace(
                'raw', 'processed')
            with open(pickle_path, "wb") as p:
                pickle.dump(df, p)

            return df

    @property
    def stats(self):
        self._stats = dict()
        self._stats['tweets count'] = self.df.shape[0]
        self._stats['tokens in tweet distribution'] = self.df[
            'tokens_count'].describe([.25, .5, .75, .95, .99])
        self._stats['unique tokens'] = len(
            {toc
             for tweet_toc in self.df['tokens'] for toc in tweet_toc})
        self._stats['tags count'] = self.df['tag'].value_counts().sort_index()

        print("-------- stats --------")
        for stat, value in self._stats.items():
            print(f"=======================\n{stat}:\n{value}")
Example #23
class Preprocessor(BaseEstimator):
    """
    Class for cleaning and tokenizing tweet's raw text

    Steps:
        1. remove ``@anonymized_account`` tag
        2. remove chars other than letters and spaces
        3. remove duplicate spaces
        4. apply lowercase
        5. lemmatize tokens with ``pl_spacy_model``
        6. convert Polish diacritics to Latin letters
        7. drop adjacent equal letters
        8. collapse words exploded with spaces
        9. remove zero/one letter tokens
    """
    def __init__(self, min_tok_len: int = 2):
        self._min_tok_len = min_tok_len
        self._logger = Logger('preproc')
        self._nlp = None

    def fit(self, tweets: Tweets, tags: Tags = None) -> 'Preprocessor':
        return self

    def transform_tweet(self, tweet: Tweet) -> Tokens:

        tweet: Tweet = self._base_cleanup(tweet)
        tokens: Tokens = self._tokenizer(tweet)
        tokens = [Preprocessor._latinize_diacritics(tok) for tok in tokens]
        tokens = [Preprocessor._drop_adjacent_equals(tok) for tok in tokens]
        tokens = [Preprocessor._collapse_exploded(tok) for tok in tokens]
        tokens = [tok for tok in tokens if len(tok) >= self._min_tok_len]

        return tokens

    def transform(self, tweets: Tweets, tags: Tags = None) -> List[Tokens]:
        tokens = [self.transform_tweet(tweet) for tweet in tweets]

        return tokens

    @staticmethod
    def _base_cleanup(tweet: Tweet) -> Tweet:
        """Keep only letters and spaces, apply to lower, remove ``@anonymized_account`` and extra spaces"""
        tweet = tweet.strip()
        tweet = re.sub(r'@anonymized_account', '', tweet)
        tweet = re.sub(r'[^\w\s]', '', tweet)
        tweet = re.sub(r'[0-9]', '', tweet)
        tweet = re.sub(r' +', ' ', tweet)
        tweet = tweet.lower()
        tweet = tweet.strip()

        return tweet

    def load_spacy_model(self) -> None:
        """Tokenize tweet"""
        if self._nlp is None:
            self._logger.log('loading spacy model')
            self._nlp = spacy.load('pl_spacy_model')

    def _tokenizer(self, tweet: Tweet) -> Tokens:
        """Tokenize tweet"""
        self.load_spacy_model()
        tokens = [tok.lemma_ for tok in self._nlp(tweet)]

        return tokens

    @staticmethod
    def _drop_adjacent_equals(tok: Token) -> Token:
        """
        Remove adjacent duplicate characters.

        Examples
        --------
        >>> _drop_adjacent_equals('kkk')
        'k'

        >>> _drop_adjacent_equals('lekkie pióórko')
        'lekie piórko'
        """
        return ''.join(c[0] for c in itertools.groupby(tok))

    @staticmethod
    def _collapse_exploded(tok: Token, separators: str = ' .-_') -> Token:
        """
        Collapse word expanded with ``separators``.

        Example
        --------
        >>> _collapse_exploded('jesteś b r z y d k i')
        'jesteś brzydki'
        """
        if len(tok) < 5:
            return tok

        remove = []
        for i, l in enumerate(tok[2:-1]):
            if l in separators:
                if (tok[i - 2] in separators) and (tok[i + 2] in separators):
                    if tok[i - 1].isalpha() and tok[i + 1].isalpha():
                        remove.append(i)
                        remove.append(i + 2)

        return ''.join([l for i, l in enumerate(tok) if i not in remove])

    @staticmethod
    def _latinize_diacritics(tok: Token) -> Token:
        """
        Convert Polish diacritics to Latin letters.

        Example
        --------
        >>> _latinize_diacritics('gęśl')
        'gesl'
        """
        letters_diac = 'ąćęłńóśżźĄĆĘŁŃÓŚŻŹ'
        letters_latin = 'acelnoszzACELNOSZZ'
        table = str.maketrans(letters_diac, letters_latin)
        return tok.translate(table)
Example #24
 def __init__(self, min_tok_len: int = 2):
     self._min_tok_len = min_tok_len
     self._logger = Logger('preproc')
     self._nlp = None
Example #25
from rest_framework.views import APIView
from custom_decoraters.request_body_validator import request_body_validator
from django.contrib.auth import authenticate, login
from src.utils import HttpStatus, HttpResponse, MethodNotAllowedException, Logger
from src.constants import LoginType
from apps.models import BlogsAuth
from custom_middlewares.validator import RequestBodyValidatorMiddleware
from custom_decoraters import request_body_validator
from apps.json_schema_validators import login_json_schema
from custom_auth_backend.jwt.token import Token

logger = Logger()


class Login(APIView):
    def get(self, request):
        return HttpResponse(http_status=HttpStatus.HTTP_200_OK,
                            data="Kept only for testing using drf view")

    @request_body_validator(login_json_schema)
    def post(self, request):
        self.sanitize_request_data(request.data)
        auth_obj = authenticate(request,
                                auth_id="",
                                password=request.data['password'])
        if isinstance(auth_obj, HttpResponse):
            return auth_obj
        elif auth_obj is not None:
            # login(request, auth_obj)
            # the same response helper used during registration is invoked here
            return self.login_response(auth_obj)
Example #26
d_input = 38  # From dataset
d_output = 8  # From dataset

# Config
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device {device}")

# Load dataset
ozeDataset = OzeDataset(DATASET_PATH)

# Load network
# Load transformer with Adam optimizer and MSE loss function
loss_function = OZELoss(alpha=0.3)

logger = Logger(f'crossvalidation_log_{attention_size}_{h}_{N}.csv')

kfoldIterator = kfold(ozeDataset,
                      n_chunk=CHUNKS,
                      batch_size=BATCH_SIZE,
                      num_workers=NUM_WORKERS)

with tqdm(total=CHUNKS * EPOCHS) as pbar:
    for dataloader_train, dataloader_val in kfoldIterator:

        # Load transformer with Adam optimizer and MSE loss function
        net = Transformer(d_input,
                          d_model,
                          d_output,
                          q,
                          v,
Example #27
dataloader_train = DataLoader(dataset_train,
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=NUM_WORKERS)

dataloader_val = DataLoader(dataset_val,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            num_workers=NUM_WORKERS)

# Start search
n_steps = np.prod(
    [len(search_range) for search_range in search_params.values()])

logger = Logger('logs/search_log.csv', list(search_params.keys()) + ['loss'])

with tqdm(total=n_steps * EPOCHS) as pbar:
    for params in itertools.product(*search_params.values()):
        params = {
            key: params[idx]
            for idx, key in enumerate(search_params.keys())
        }
        pbar.set_postfix(params)

        # Load transformer with Adam optimizer and MSE loss function
        net = Transformer(d_input=d_input,
                          d_output=d_output,
                          dropout=dropout,
                          chunk_mode=chunk_mode,
                          pe=pe,
Example #28
                              batch_size=BATCH_SIZE,
                              shuffle=True,
                              num_workers=NUM_WORKERS
                              )

dataloader_val = DataLoader(dataset_val,
                            batch_size=BATCH_SIZE,
                            shuffle=True,
                            num_workers=NUM_WORKERS
                            )

# Start search
n_steps = np.prod([len(search_range)
                   for search_range in search_params.values()])

logger = Logger('search_log.csv', list(search_params.keys()))

with tqdm(total=n_steps*EPOCHS) as pbar:
    for params in itertools.product(*search_params.values()):
        params = {key: params[idx]
                  for idx, key in enumerate(search_params.keys())}
        pbar.set_postfix(params)

        # Load transformer with Adam optimizer and MSE loss function
        net = Transformer(d_input=d_input,
                          d_output=d_output,
                          dropout=dropout,
                          chunk_mode=chunk_mode,
                          pe=pe,
                          **params).to(device)
        optimizer = optim.Adam(net.parameters(), lr=LR)
Example #29
else:
    # Download if not exist already
    if not os.path.isfile(X_train_path):
        urlretrieve(args.blob_path + "/X_train.npy", X_train_path)
    if not os.path.isfile(y_train_path):
        urlretrieve(args.blob_path + "/y_train.npy", y_train_path)
    if not os.path.isfile(X_valid_path):
        urlretrieve(args.blob_path + "/X_valid.npy", X_valid_path)
    if not os.path.isfile(y_valid_path):
        urlretrieve(args.blob_path + "/y_valid.npy", y_valid_path)

X_t = np.load(X_train_path)
y_t = np.load(y_train_path)
X_v = np.load(X_valid_path)
y_v = np.load(y_valid_path)

params = vars(args)

mnt_path = os.path.join(os.getenv('TEST_TMPDIR', '/tmp'),
                        'tensorflow')  # azurefile mount path
ts = int(round(time.time() * 1000))
params['model_dir'] = os.path.join(mnt_path, '{}_model'.format(ts))
params['log_dir'] = os.path.join(mnt_path, '{}_logs'.format(ts))

logger = Logger(None, 'katib')
logger.log(
    'model_id', ts
)  # This is a hack: the model id is stored as a metric in order to record it.

train(X_t, y_t, X_v, y_v, logger=logger, **params)
Example #30
                    help="Directory for loading model")

args, _ = parser.parse_known_args()

# Download and load data
mnist_path = os.path.join('data', 'mnist')
os.makedirs(mnist_path, exist_ok=True)
X_test_path = os.path.join(mnist_path, 'X_test.npy')
y_test_path = os.path.join(mnist_path, 'y_test.npy')

if not args.blob_path:
    raise ValueError("Data path should be provided")
else:
    # Download if not exist already
    if not os.path.isfile(X_test_path):
        urlretrieve(args.blob_path + "/X_test.npy", X_test_path)
    if not os.path.isfile(y_test_path):
        urlretrieve(args.blob_path + "/y_test.npy", y_test_path)

X_t = np.load(X_test_path)
y_t = np.load(y_test_path)

test_acc = test(
    os.path.join(args.model_dir, "mnist-tf.model.meta"),
    os.path.join(args.model_dir, "mnist-tf.model"),
    X_t,
    y_t,
    logger=Logger(logging, 'python'),
    verbose=False,
)