def bring_redbox_negatives(task, avoid_flags, add_num, data_dir, fn_train):
  neg_classification = ' 0' # see dump_to_files [('Default',0),(task,1)]
  notperf, total = [], []
  for fname in os.listdir(data_dir):
    if fname.endswith('.dat'):
      total.append(fname)

  print "Gathering vacant Redbox images without %s's flags..."%(task)
  count = 0
  with open(fn_train,'r') as f_already:
    c_already = f_already.readlines()
    c_already = [line.split(' ')[0] for line in c_already]
    for i in range(len(total)):
      content = open(oj(data_dir,total[i]),'r').readlines()
      content = [line.strip() for line in content]
      if all([len([flag for flag in content if flag in avoid_flags])==0,
              total[i] not in c_already]):
        notperf.append(oj(data_dir,total[i][:-4])+'.jpg'+neg_classification+'\n')
        count += 1
        if count > add_num: break

  random.shuffle(notperf)
  print "Gathering completed."

  print "Adding %i negatives to %s"%(add_num,fn_train)
  newcomers, notperf_left = notperf[:add_num], notperf[add_num:]

  with open(fn_train,'r') as f_train:
    c_train = f_train.readlines()
  c_train += newcomers
  random.shuffle(c_train)
  with open(fn_train,'w') as f_train:
    # print "writing:", c_train
    f_train.writelines(c_train)
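
A hedged usage sketch for the function above; the task name, flag list, and paths are hypothetical:

# hypothetical call: append up to 500 flag-free Redbox images as negatives to an existing train.txt
bring_redbox_negatives('missing_clamp', avoid_flags=['Missing clamp'], add_num=500,
                       data_dir='/data/redbox', fn_train='data_info/missing_clamp/train.txt')
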
def save_mislabs(d, data_info, PRETRAINED):
  mislab_dir = oj(data_info,'potential_mislabels_'+PRETRAINED.split('/')[-1])
  try: os.mkdir(mislab_dir)
  except OSError:
    shutil.rmtree(mislab_dir)
    os.mkdir(mislab_dir)
  for idx in d['pot_mislab']:
    shutil.copy(oj(data_info,'test',d['fname'][idx]), mislab_dir)
  print "saving potential mislabels to %s"%(mislab_dir)
def get_pretrained_model(classifier_dir):
  suggest = os.listdir(classifier_dir)
  suggest = [fname for fname in suggest
             if 'iter' in fname and 'solverstate' not in fname]
  if len(suggest) > 1:
    for elem in enumerate(suggest): print elem
    idx = int(raw_input("\nWhich model? "))
    return oj(classifier_dir,suggest[idx])
  elif len(suggest) == 1:
    return oj(classifier_dir,suggest[0])
  else:
    print "ERROR: no model found in", classifier_dir
    exit()
def all_labels(data_dir):
  Queries = {'perfect':[]}
  for dirname in os.listdir(data_dir):
    dirname = oj(data_dir,dirname)
    with open(oj(dirname,'inspection.txt')) as f:
      lines = f.readlines()
      lines = [line.strip() for line in lines]
      if lines == []:
        Queries['perfect'].append(dirname)
      for line in lines:
        if line not in Queries.keys():
          Queries[line] = []
        Queries[line].append(dirname)
  return Queries
def classify_data(classifier_dir, symlink_dir, data_info, PRETRAINED, redbox=False):
  N = 96
  classifier_name = classifier_dir.split('/')[-1]  
  if classifier_name.split('-fine')[0]+'_deploy.prototxt' not in os.listdir(classifier_dir):
    create_deploy_file(classifier_dir)
    
  MODEL_FILE = oj(classifier_dir, classifier_name.split('-fine')[0]+'_deploy.prototxt')
  MEAN_FILE = np.load(get_np_mean_fname(symlink_dir))
  print 'loading network...'
  net = caffe.Classifier(MODEL_FILE, PRETRAINED,
                         image_dims=(256, 256), input_scale=255,
                         mean=MEAN_FILE, channel_swap=(2,1,0))
  # flow of control:
  #   classifier::__init__(
  #   classifier::caffe.Net.__init__()
  print 'network loaded successfully'
  # set phase to test since we are doing testing
  net.set_phase_test()
  net.set_mode_gpu()
  d = {'fname': [],
       'pred': [],
       'time': [],
       'dude': [],
       'label': [],
       'pred_lab_thresh': [],
       'pred_lab_std': [],
       'pot_mislab': []}
  # load images
  if redbox:
    imgs, d = load_all_images_from_dir(d, oj(symlink_dir,'redbox'), redbox)
  else:
    imgs, d =  load_all_images_from_dir(d, oj(symlink_dir,'test'))

  # classify images
  num_imgs = len(d['fname'])
  print "computing preds..."
  d['pred'] = net.predict(imgs[:N])
  # print pred
  if num_imgs > N:
    for i in range(1,num_imgs/N):
      d['pred'] = np.append(d['pred'],net.predict(imgs[i*N:(i+1)*N]),axis=0)
    if num_imgs % N:  # only predict a remainder batch if one exists
      d['pred'] = np.append(d['pred'],net.predict(imgs[-(num_imgs%N):]),axis=0)
  print "preds computed."

  # save preds
  assert len(d['pred']) == num_imgs
  np.save(oj(data_info, PRETRAINED.split('/')[-1]+'_pred.npy'), d)
  return d
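
A hedged usage sketch tying the helpers above together; the directory names are hypothetical and assume a fine-tuned Caffe model in classifier_dir plus the symlinked image layout these functions expect:

PRETRAINED = get_pretrained_model('models/missing_clamp-finetune')   # hypothetical paths
d = classify_data('models/missing_clamp-finetune', 'symlinks/missing_clamp',
                  'data_info/missing_clamp', PRETRAINED, redbox=False)
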
Example #6
def get_caffe_errors(model_dir, typ, idx):
  data_files = []
  for fname in os.listdir(model_dir):
    if 'train_output' in fname and fname.endswith('.log.'+typ): data_files.append(fname)
  if len(data_files) != 1:
    print 'there is not exactly 1 filename of the form \'*train_output*.log.%s\' in the given directory'%(typ)
    sys.exit()
  content = open(oj(model_dir,data_files[0]),'r').readlines()
  legit_length = len(content[1])
  content = [' '.join(line.split()).split(' ') for line in content
             if not line.startswith('#')]
  print 'raw content looked like %s and %s'%(content[0], content[-1])

  for i in range(len(content)):
    if len(content[i]) <= idx:
      print 'line[%i] is messed up: %s'%(i,content[i])
      sys.exit()
  content = [(line[0],line[idx]) for line in content]
  # end = len(content)
  # while True:
  #   try:
  #     content = [(line[0],line[idx]) for line in content[:end]]
  #     break
  #   except:
  #     end -= 1
    
  print 'selected content looks like %s and %s'%(content[0], content[-1])
  return content
def LoadFeatureFileData( groupName, base_file_path, loadStatsCollection, debug=False):
    """This loads the feature files into a database... does not load the data IN Them however...
    i.e. it just indexes the *.seg.txt files """
    files_processed = newly_loaded = 0

    feature_file_list = glob.glob( oj(base_file_path,groupName,'TCGA-*.seg*.txt'))
    total_files = len(feature_file_list)  # total candidate files, used by the progress line below
    print "Analyzing ",groupName
    if debug:
        print "Processing %s which has %d files" % ( groupName, len(feature_file_list))
        
    for ff in feature_file_list:
        LinePrinter( "Processed %d files out of %d total files, and %d just now" % ( files_processed,total_files,newly_loaded ))
        FFI = {}  ###Feature File Info
        FFI['filename'] = ff
        fileLoaded = loadStatsCollection.find_one( {'filename':ff})
        if not fileLoaded:
            FFI['LoadedToMongo'] = False
            FFI['slideGroup'] = groupName
            FFI['full_feature_filename'] = os.path.basename(ff)
            FFI['slide_name_detail'] = os.path.basename(ff).split('.seg.')[0]
            FFI['slide_name'] = os.path.basename(ff).split('.')[0]
            FFI['MongoCollName'] = None

            FFI['tot_features'] = file_len(ff)  ## This is an expensive operation, so only do it here for new files
            loadStatsCollection.insert_one(FFI)
            newly_loaded +=1
        files_processed +=1
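
A minimal usage sketch, assuming a local MongoDB reachable through pymongo; the database/collection names and the base path are hypothetical, and file_len/LinePrinter are expected to be defined elsewhere in this module:

from pymongo import MongoClient

client = MongoClient('localhost', 27017)
loadStats = client['feature_db']['load_stats']   # hypothetical database and collection names
LoadFeatureFileData('GBM', '/data/features', loadStats, debug=True)
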
Example #8
def already_parsed(model_dir):
  fnames = []
  listdir = os.listdir(model_dir)
  for fname in listdir:
    if 'train' in fname and fname.endswith('.log'):
      fnames.append(oj(model_dir,fname))
  if len(fnames) == 0:
    print "ERROR: no file containing 'train' and ending in '.log' found in", model_dir
    sys.exit()
  elif len(fnames) > 1:
    for elem in enumerate(fnames): print elem
    fname = oj(model_dir,fnames[int(raw_input("\nChoose index number from above: "))])
  else: fname = oj(model_dir,fnames[0])
  if all([os.path.basename(fname)+'.train' in listdir,
          os.path.basename(fname)+'.test' in listdir]):
    return fname, 'Y' # found log and parsed
  else: return fname, 'N' # found log but not parsed
Example #9
def matplot(model_dir, train, val_acc, val_loss, start=-1, end=-1):
  
  if end == start == -1:
    start, end = 0, len(train)
    print 'plotting entire training data'
  
  elif start == -1:
    start = 0
    print 'plotting from iter %i to %i'%(start,end)
    
  elif end == -1:
    print 'plotting from iter %i to the end'%(start)
    end = len(train)

  else:
    print 'plotting from iter %i to %i'%(start,end)

  plt.ylim([0,1.2])
  x = np.array(range(len(train[start:end])))
  ytrain = np.array([float(el[1]) for el in train[start:end]])
  ytest_acc = np.array([float(el[1]) for el in val_acc[start:end]])
  ytest_loss = np.array([np.float(el[1]) for el in val_loss[start:end]])
  plt.plot(x, ytrain, label='training loss', color='0.55')
  # plt.plot(x, ytrain, label='training loss')
  if len(x) != len(ytest_acc):
    print 'len(x) %i != %i len(ytest_acc)'%(len(x),len(ytest_acc))
    sys.exit()
  plt.plot(x, ytest_acc, label='validation accuracy',color='g')
  plt.plot(x, ytest_loss, label='validation loss',color='r')
  plt.legend(loc='upper left')
  plt.xlabel('Iters')
  plt.ylabel('TrainingLoss')
  # plt.title('Go on choose one')
  plt.grid(True)
  plt.savefig(oj(model_dir,'plot_more_'+model_dir.split('/')[-3]+'_'+model_dir.split('/')[-1]+'.png'))
def get_flag_and_thresh(data_info):
  ''' flag_val is the number in data_info/[model]/read.txt which indexes
  the class corresponding to when flag is present. ''' 
  flag_val, thresh = 0, 0.5
  rl = open(oj(data_info,'read.txt'),'r').readlines()
  
  if len([l for l in rl if 'flag_val' in l]) == 0:
    # set up read.txt to contain flag val and threshold
    augment_read(data_info)
    rl = open(oj(data_info,'read.txt'),'r').readlines()
    
  rl = [l.split() for l in rl]
  for l in rl[2:]:
    if l == ['1','flag_val']: flag_val = 1
    elif l[1] == 'threshold': thresh = float(l[0])
  # if there was no threshold to return, read.txt still needs to be filled in
  return flag_val, thresh
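
For reference, a hypothetical read.txt that the parsing above accepts (the first two lines are skipped; after that, each line is a value followed by its name):

# data_info/<model>/read.txt -- hypothetical contents
#   <header line>
#   <header line>
#   1 flag_val
#   0.35 threshold
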
def sample_from_label(Queries, data_dir):
  for elem in enumerate(Queries.keys()): print elem
  lab, length = -1, -1
  while lab not in range(len(Queries.keys())):
    lab = int(raw_input("\nName 1 class number you wish to sample from: "))
  lab = Queries.keys()[lab]
  while length not in range(len(Queries[lab])):
    length = int(raw_input("\nSample how many? "))
  try:
    os.mkdir(lab)
  except OSError:
    if not raw_input('Sample for that class already exists, ok to overwrite? [Y]/N ') == 'N':
      shutil.rmtree(lab)
      os.mkdir(lab)
  for directory in Queries[lab][:length]:
    for f in os.listdir(oj(data_dir,directory)):
      if f.endswith('.jpg'): 
        shutil.copy(oj(data_dir,directory,f),oj(lab,f))
def get_(data_dir, fname, what):
  ret = []
  meta_name = fname.split('.')[0] + '.met'
  data_dir = REDBOX_DIR  # note: the .met metadata files always live in REDBOX_DIR, overriding the data_dir argument
  for line in open(oj(data_dir,meta_name),'r').readlines():
    for field in what:
      if line.startswith(field):
        ret.append(line.split(field+'=')[-1].split()[0][:10])
  return ret
Example #13
def push_tarball():
    local('tar czf tarball.tar.gz *')
    run('rm -rf ' + REMOTE_DIR)
    run('mkdir ' + REMOTE_DIR)
    put('tarball.tar.gz', oj(REMOTE_DIR, 'tarball.tar.gz'))
    local('rm tarball.tar.gz')
    with cd(REMOTE_DIR):
        run('tar xzf tarball.tar.gz')
        run('rm tarball.tar.gz')
Example #14
def dump_to_files(Keep, data_info, task, data_dir):
  ''' This function "trusts" you. It will overwrite data lookup 
  files. '''
  dump = []
  part = [0, 0.85, 1] # partition into train val test
  dump_fnames = ['train.txt','val.txt'] #,'test.txt']
  for i in xrange(len(dump_fnames)):
    dump.append([])
    for [key,num] in [('Default',0),(task,1)]:
      l = len(Keep[key])
      dump[i] += [[f,num] for f in
                  Keep[key][int(part[i]*l):int(part[i+1]*l)]]
    # this is the important shuffle actually
    random.shuffle(dump[i])
    if os.path.isfile(oj(data_info,dump_fnames[i])):
      print "WARNING: overwriting", oj(data_info,dump_fnames[i])
    with open(oj(data_info,dump_fnames[i]),'w') as dfile:
      dfile.writelines(["%s %i\n" % (oj(data_dir,f),num)
                        for (f,num) in dump[i]])
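
A hypothetical call; Keep maps each class name to a list of image filenames, and the 0/0.85/1 partition above splits them into train.txt and val.txt:

Keep = {'Default': ['img001.jpg', 'img002.jpg'],   # hypothetical filenames
        'missing_clamp': ['img003.jpg']}
dump_to_files(Keep, data_info='data_info/missing_clamp',
              task='missing_clamp', data_dir='/data/images')
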
def get_np_mean_fname(symlink_dir):
  proto_img_fname = ''
  for fname in os.listdir(symlink_dir):
    if fname.endswith('mean.binaryproto'):
      print 'found binaryproto: %s'%(fname)
      proto_img_fname = fname
      break
  if proto_img_fname == '':
    print 'ERROR: no *mean.binaryproto found in %s'%(symlink_dir)
    sys.exit()
  # convert the binaryproto mean image into a numpy array and save it alongside
  blob = caffe_pb2.BlobProto()
  data = open(oj(symlink_dir,proto_img_fname), "rb").read()
  blob.ParseFromString(data)
  nparray = caffe.io.blobproto_to_array(blob)[0]
  npy_mean_fname = (proto_img_fname.split('_mean.binaryproto')[0]).split('_fine')[0]+'_mean2.npy'
  npy_mean_file = file(oj(symlink_dir,npy_mean_fname),"wb")
  np.save(npy_mean_file, nparray)
  npy_mean_file.close()  
  return oj(symlink_dir, npy_mean_fname)
def bring_redbox_positives(task, flags, add_num, redbox_dir, fn_train):
  added = []
  listdir = os.listdir(redbox_dir)
  random.shuffle(listdir)
  for fl in listdir:
    if fl.endswith('.dat'):
      pres = False
      with open(oj(redbox_dir,fl), 'r') as f:
        for line in f:
          if line.strip() in flags:
            pres = True
            break
      if pres:
        added.append(fl)
        if len(added) >= add_num:
          break

  with open(fn_train, 'a') as f:
    for fl in added:
      fl = fl.replace('.dat','.jpg')  # swap only the extension, not any other 'dat' in the name
      f.write("\n"+oj(redbox_dir,fl)+ " 1")
def create_redbox_data_info_etc(symlink_dir, data_info):
  data_dir = REDBOX_DIR
  All = sa.get_label_dict(data_dir)
  total_num_images = All.pop('total_num_images')
  Keep = sa.classes_to_learn(All)
  Keep = sa.default_class(All, Keep)
  total_num_check = sum([len(Keep[key]) for key in Keep.keys()])
  if total_num_images != total_num_check:
    print "\nWARNING! started off with %i images, now have %i distinct training cases"%(total_num_images, total_num_check)
  if len(Keep.keys()) > 2:
    Keep,num_output = sa.merge_classes(Keep)
    Keep,num_output = sa.check_mutual_exclusion(Keep, num_output)
  dump = symlink_redbox_dataset(Keep,data_dir,oj(symlink_dir,'redbox'))
  dump_redbox_to_files(Keep, dump, data_info)
def plot_time(d, save_dir):
  if len(d['time']) != len(d['error']):
    print "len(d['time']) %i != %i len(d['error'])"%(len(d['time']),len(d['error']))
    sys.exit()
  # order by time
  data = np.array(zip(d['time'],d['error']), dtype=object)
  data = data[data[:,0].argsort()]  # sort rows by time, keeping each (time, error) pair together
  plt.ylim([0,1.2])
  x, y = data[:,0], data[:,1]
  plt.plot(x, y)
  plt.legend(loc='upper left')
  plt.xlabel('Inspected Time')
  plt.ylabel('Classification Error')
  # plt.title('Go on choose one')
  plt.grid(True)
  plt.savefig(oj(save_dir,'plot_redbox_'+save_dir.split('/')[-3]+'_'+save_dir.split('/')[-1]+'_time.png'))
Example #19
def get_label_dict_knowing(data_dir, task, pos_class):
  ''' get_label_dict() knowing exactly which flags to look for and 
  how to group them into classes. 
  task is the name of what we're learning to detect,
  pos_class is a list of the actual flag names to look for. '''
  d = {'Default': [], task: []}
  print 'generating specific dict of class:files from %s with pos class %s...'%(data_dir,pos_class)
  for filename in os.listdir(data_dir):
    if not filename.endswith('.dat'): continue
    with open(oj(data_dir, filename)) as f:
      content = [line.strip() for line in f.readlines()]
      if any([label==line for (label,line)
              in itertools.product(pos_class,content)]):
        d[task].append(filename.split('.')[0]+'.jpg')
      else:
        d['Default'].append(filename.split('.')[0]+'.jpg')
  return d
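
A hypothetical call, assuming each .dat sidecar file lists one inspection flag per line:

d = get_label_dict_knowing('/data/redbox', task='missing_clamp',
                           pos_class=['Missing clamp', 'MISSING CLAMP'])   # hypothetical flag spellings
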
Example #20
def matplot(model_dir, Ys, start, end):
  col = {'TrainLoss': '0.5',
         'ValLoss' : '#000066',
         'ValAcc_0': '#00CC00',
         'ValAcc_1': '#ff4d4d',
         'ValPCAcc': 'k',
         'ValAcc'  : 'y'}
  plt.ylim([0,1.2])
  x = np.array(range(start,end))
  plt.xlabel('Iters')
  for key in Ys.keys():
    Ys[key] = np.array([np.float(el) for el in Ys[key][start:end]])
    plt.plot(x, Ys[key], label=key, color=col[key])
  plt.legend(loc='upper left',ncol=len(Ys)/2,prop={'size':10})
  # plt.title('Go on choose one')
  plt.grid(True)
  plt.savefig(oj(model_dir,'plot_'+model_dir.split('/')[-2]+'.png'))
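
A hedged usage sketch; Ys maps curve names (matching the color table above) to equal-length lists of values, one entry per logged iteration:

Ys = {'TrainLoss': train_losses, 'ValLoss': val_losses, 'ValAcc': val_accs}   # hypothetical lists
matplot('models/missing_clamp/run1', Ys, start=0, end=len(train_losses))
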
def load_all_images_from_dir(d, test_dir, redbox=False):
  imgs = []
  d['fname'] = os.listdir(test_dir)
  print 'loading images from %s...'%(test_dir)
  # d_multJoints is a dict: fname -> joint_name
  d_multJoints = create_dict_jname(REDBOX_DIR)
  for fname in d['fname']:
    full_fname = oj(test_dir, fname)
    imgs.append(caffe.io.load_image(full_fname))
    if redbox:
      [dude,time] = get_(REDBOX_DIR,fname,['InspectedTime','InspectedBy'])
      l_time = time.split('/')
      time = l_time[2] + '-' + l_time[1] + '-' + l_time[0]
      d['time'].append(time)
      d['dude'].append(dude)
  print 'finished loading images.'
  return imgs, d
def plot_dudes(d, save_dir):
  # get a 2d array of dudes of freq pot mislab
  data = {}
  num_imgs = len(d['dude'])
  for s in set(d['dude']): data[s] = 0
  for idx in d['pot_mislab']:
    data[d['dude'][idx]] += 1
  data = [[key, float(data[key])/num_imgs] for key in data]  # dude -> fraction of images mis-classified
  data = np.array(data, dtype=object)
  fig = plt.figure()
  width = .35
  ind = np.arange(len(data))
  plt.bar(ind, data[:,1].astype(float))
  plt.xticks(ind + width / 2, data[:,0])
  fig.autofmt_xdate()
  plt.xlabel('Inspected By')
  plt.ylabel('% mis-classifications')
  plt.savefig(oj(save_dir,'plot_redbox_'+save_dir.split('/')[-3]+'_'+save_dir.split('/')[-1]+'_dude.png'))
Example #23
import os
from os.path import join as oj
import sys
sys.path.insert(1, oj(sys.path[0], '..'))  # insert parent path

import torch
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import DataLoader
from copy import deepcopy
import pickle as pkl
import pandas as pd


# generate mixture model
# means and sds should be lists of lists (sds just scale variances)
def generate_gaussian_data(N, means=[0, 1], sds=[1, 1], labs=[0, 1]):
    num_means = len(means)
    # deal with 1D
    if type(means[0]) == int or type(means[0]) == float:
        means = [[m] for m in means]
        sds = [[sd] for sd in sds]
        P = 1
    else:
        P = len(means[0])
    X = np.zeros((N, P), dtype=np.float32)
    y_plot = np.zeros((N, 1), dtype=np.float32)
    y_one_hot = np.zeros((N, 2), dtype=np.float32)
Example #24
def plot_dims_flexible(
        results,
        out_dir='figs',
        xlim=None,
        percent_to_explain=0.85,
        figname='explained',
        dim_types=['explained_var_dicts_pca', 'explained_var_dicts_rbf']):
    # params for plotting
    num_lays = len(results.iloc[0].weight_names) - 1
    #     print(results.iloc[0].weight_names)
    plt.figure(figsize=(num_lays * 3, 8), dpi=100)
    #     skips = [('adam', 0.1), ('adam', 0.01), ('adam', 0.001)]
    skips = []

    dim_dicts = {}
    R, C = 5, max(3, num_lays)
    for index, row in results.iterrows():
        # style for plotting
        #     style = '^' if row.optimizer == 'sgd' else '.'
        #     color = {0.1: 'red', 0.01: 'blue', 0.001: 'green'}[row.lr]
        color = 'orange' if row.optimizer == 'sgd' else 'deepskyblue'
        style = {1: '^', 0.1: '-', 0.01: '--', 0.001: '.'}[row.lr]
        alpha = {1.0: 0.3, 0.1: 0.8, 0.01: 0.8, 0.001: .3}[row.lr]
        if not (row.optimizer, row.lr) in skips:

            # accs
            try:
                plt.ylabel(row.dset[0])
            except:
                pass
            plt.subplot(R, C, 1)
            plt.plot(row.its,
                     row.losses_train,
                     style,
                     label=row.optimizer + ' ' + str(row.lr),
                     color=color,
                     alpha=alpha)
            plt.yscale('log')
            plt.title('train loss')

            plt.subplot(R, C, 2)
            plt.plot(row.its, row.losses_test, style, color=color, alpha=alpha)
            plt.yscale('log')
            plt.title('test loss')

            plt.subplot(R, C, 3)
            plt.plot(row.its, row.accs_test, style, color=color, alpha=alpha)
            plt.title('test acc')

            # dims
            for r in range(len(dim_types)):
                offset = C * (1 + r)
                dim_dicts = row[dim_types[r]]

                lays = row.weight_names
                if 'act' in dim_types[r]:
                    lays = [
                        lay[:lay.rfind('.')] for lay in lays
                    ]  # act uses forward_all dict which doesn't have any . or .weight

                lab = dim_types[r].replace('_var_dicts_', '')
                lab = lab.replace('explained', '')
                lab = lab.replace('act', 'act: ')
                for c in range(len(lays) - 1):
                    plt.subplot(R, C, offset + 1 + c)
                    plt.plot(row.its, [
                        frac_dims_to_explain_X_percent(
                            d[lays[c]], percent_to_explain) for d in dim_dicts
                    ],
                             style,
                             color=color,
                             alpha=alpha)
                    plt.ylim((0, 1))
                    if c == 0:
                        plt.ylabel(lab + ' ' + str(100 * percent_to_explain) +
                                   '% frac\ndims of ' +
                                   str(dim_dicts[0][lays[c]].size))

                    if r == 0:
                        plt.title(lays[c])

            if xlim is not None:
                for i in range(R * C):
                    plt.subplot(R, C, 1 + i)
                    plt.xlim((0, xlim))

    plt.subplot(R, C, 1)
    # remove duplicate labels
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())
    plt.savefig(oj(out_dir, 'dims_flexible_' + figname + '.png'),
                bbox_inches='tight')
    plt.show()
def compute_classification_stats(d, data_info, redbox=False):
  # this comes early because flag_val prompts user
  flag_val, threshold = get_flag_and_thresh(data_info)
  num_imgs = len(d['fname'])
  # get data_info test file
  if redbox:
    print 'opening redbox data info'
    lines = open(oj(data_info,'redbox.txt'),'r').readlines()
  else:
    lines = open(oj(data_info,'test.txt'),'r').readlines()
  _class = {}
  for line in lines: _class[line.split()[0]] = line.split()[1]
  # print "lines:", lines
  try:
    assert set(d['fname']) == set(_class.keys())
  except AssertionError:
    print "don't match: d['fname']", d['fname']
    print "and _class.keys()", _class.keys()
    sys.exit()
  # fill with true labels
  d['label'] = [int(_class[el]) for el in d['fname']]
  # fill in predicted labels and flag if potentially mislab
  # *_thresh is with classification boundary according to threshold
  # *_std is with classification boundary at 0.5
  false_pos_thresh, num_pos, false_neg_thresh, num_neg, false_neg_std, false_pos_std = 0, 0, 0, 0, 0, 0
  # print "\nd['label'] has types %s and flag_val of type %s \n"%(type(d['label'][0]),type(flag_val))
  for idx in range(num_imgs):
    if d['label'][idx] == flag_val:
      print "%s is a positive, with preds %s"%(d['fname'][idx])
      print "its preds are", d['pred'][idx]
      num_pos += 1
    else: num_neg += 1
    # assign predicted label wrt threshold
    if d['pred'][idx][flag_val] >= threshold:
      d['pred_lab_thresh'].append(flag_val) 
      # print "thresh thinks no clamp! appending", flag_val
    else:
      d['pred_lab_thresh'].append((flag_val+1)%2)
      print "thresh thinks clamp! appending", (flag_val+1)%2
    # assign predicted label in std way
    if d['pred'][idx][flag_val] >= 0.5:
      d['pred_lab_std'].append(flag_val) 
      # print "std thinks no clamp! appending", flag_val, "\n"
    else:
      d['pred_lab_std'].append((flag_val+1)%2)
      print "std thinks clamp! appending", (flag_val+1)%2, "\n"
    # correct thresh classification or not 
    if d['pred_lab_thresh'][idx] != d['label'][idx]:
      if d['label'][idx] == flag_val: false_neg_thresh += 1
      else: false_pos_thresh += 1
    # correct std classification or not 
    if d['pred_lab_std'][idx] != d['label'][idx]:
      d['pot_mislab'].append(idx)
      if d['label'][idx] == flag_val: false_neg_std += 1
      else: false_pos_std += 1

  print 'false_neg_thresh: %i, false_pos_thresh: %i'%(false_neg_thresh,false_pos_thresh)
  print 'false_neg_std: %i, false_pos_std: %i'%(false_neg_std,false_pos_std)
  print 'num_neg: %i, num_pos: %i'%(num_neg,num_pos)
  # compute accuracies
  d['accuracy']= {}
  d['accuracy']['total_thresh'] = 1-(false_neg_thresh+false_pos_thresh)/float(num_imgs)
  d['accuracy']['pos_thresh'] = 1-false_neg_thresh/float(num_pos)
  d['accuracy']['neg_thresh'] = 1-false_pos_thresh/float(num_neg)
  d['accuracy']['total_std'] = 1-(false_neg_std+false_pos_std)/float(num_imgs)
  d['accuracy']['pos_std'] = 1-false_neg_std/float(num_pos)
  d['accuracy']['neg_std'] = 1-false_pos_std/float(num_neg)
  print "d['accuracy']", d['accuracy']
  return d
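
A hedged follow-up to classify_data() above; d is the prediction dict it returns, and data_info is assumed to contain test.txt (or redbox.txt) plus read.txt:

d = compute_classification_stats(d, 'data_info/missing_clamp', redbox=False)
save_mislabs(d, 'data_info/missing_clamp', PRETRAINED)   # copy suspected mislabels aside for review
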
Example #26
def plot_weight_norms_and_margin(results, xlim=None, out_dir='figs'):
    # params for plotting
    skips = [('adam', 0.1)]
    #     skips = []
    dim_dicts = {}
    R, C = 4, 4
    plt.figure(figsize=(14, 14), dpi=100)
    for index, row in results.iterrows():
        # style for plotting
        color = 'orange' if row.optimizer == 'sgd' else 'deepskyblue'
        style = {1: '^', 0.1: '-', 0.01: '--', 0.001: '.'}[row.lr]
        alpha = {1.0: 0.3, 0.1: 0.8, 0.01: 0.8, 0.001: .3}[row.lr]

        if not (row.optimizer, row.lr) in skips:

            # dims
            wnorms = row.weight_norms

            if not 'weight_names' in list(
                    results):  # this is old, remove after some reruns
                lays = ['fc1.weight', 'fc2.weight', 'fc3.weight']
            else:
                lays = row.weight_names
#             lays = ['fc1.weight', 'fc2.weight', 'fc3.weight']
            keys = sorted(wnorms.keys())
            if row.optimizer == 'sgd':
                for j in range(min(3, len(lays))):
                    plt.subplot(R, C, 1 + j)
                    vals = [wnorms[key][lays[j] + '_fro'] for key in keys]
                    plt.plot(keys,
                             vals,
                             style,
                             color=color,
                             alpha=alpha,
                             label=row.optimizer + ' ' + str(row.lr))
                    plt.title(lays[j] + ' frobenius norm')
            else:
                #                 print('lays', lays, wnorms[0].keys(), keys)
                for j in range(min(3, len(lays))):
                    plt.subplot(R, C, 1 + C + j)
                    vals = [wnorms[key][lays[j] + '_fro'] for key in keys]
                    plt.plot(keys,
                             vals,
                             style,
                             color=color,
                             alpha=alpha,
                             label=row.optimizer + ' ' + str(row.lr))
                    plt.title(lays[j] + ' frobenius norm')

            plt.subplot(R, C, 1 + C * 2)
            #             norms_fro = [row.weight_norms[it][] in row.its
            #             print(row.weight_norms)
            plt.plot(row.its,
                     row.mean_margin_train_unnormalized,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('train margin unnormalized')

            plt.subplot(R, C, 2 + C * 2)
            plt.plot(row.its,
                     row.mean_margin_test_unnormalized,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('test margin unnormalized')

            plt.subplot(R, C, 3 + C * 2)

            norm_prods_fro = [1] * len(keys)
            for j in range(len(lays)):
                norm_prods_fro = [
                    norm_prods_fro[i] * wnorms[key][lays[j] + '_fro']
                    for i, key in enumerate(keys)
                ]
            plt.plot(row.its,
                     row.mean_margin_train_unnormalized / norm_prods_fro,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('train margin over frobenius norm')

            plt.subplot(R, C, 4 + C * 2)
            plt.plot(row.its,
                     row.mean_margin_test_unnormalized / norm_prods_fro,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('test margin over frobenius norm')

            plt.subplot(R, C, 1 + C * 3)
            plt.plot(row.its,
                     row.mean_margin_train,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('train softmax margin')

            plt.subplot(R, C, 2 + C * 3)
            plt.plot(row.its,
                     row.mean_margin_test,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('test softmax margin')

            norm_prods_spectral = [1] * len(keys)
            for j in range(len(lays)):
                norm_prods_spectral = [
                    norm_prods_spectral[i] * wnorms[key][lays[j] + '_spectral']
                    for i, key in enumerate(keys)
                ]
            plt.subplot(R, C, 3 + C * 3)
            plt.plot(row.its,
                     row.mean_margin_train_unnormalized / norm_prods_spectral,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('train margin over spectral norm')

            plt.subplot(R, C, 4 + C * 3)
            plt.plot(row.its,
                     row.mean_margin_test_unnormalized / norm_prods_spectral,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('test margin over spectral norm')

        if xlim is not None:
            for i in range(R * C):
                plt.subplot(R, C, 1 + i)
                plt.xlim((0, xlim))

    plt.subplot(R, C, 1)
    # remove duplicate labels
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())

    plt.subplot(R, C, 4)
    # remove duplicate labels
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())
    plt.savefig(oj(out_dir, 'weight_norms_and_margin.png'),
                bbox_inches='tight')
    plt.show()
Example #27
import numpy as np
import pandas as pd
from colorama import Fore
from matplotlib import pyplot as plt
from matplotlib_venn import venn2
from sklearn import metrics
from sklearn.utils.multiclass import unique_labels
import os
from os.path import join as oj
import seaborn as sns

DIR_FILE = os.path.dirname(os.path.realpath(__file__)) # directory of this file
DIR_FIGS = oj(DIR_FILE, '../reports/figs')


cb2 = '#66ccff'
cb = '#1f77b4'
cr = '#cc0000'
cp = '#cc3399'
cy = '#d8b365'
cg = '#5ab4ac'
cm = sns.diverging_palette(10, 240, n=1000, as_cmap=True)
cm_rev = sns.diverging_palette(240, 10, n=1000, as_cmap=True)
cmap_div = sns.diverging_palette(10, 220, as_cmap=True)

def rename(s):
    RENAMING = {
        'gcsscore': 'GCS Score',
        'initheartrate': 'Heart rate',
        'initsysbprange': 'Systolic BP',
        'abdtenderdegree': 'Abd. tenderness\ndegree',
Example #28
from matplotlib import pyplot as plt
import os
from os.path import join as oj
plt.style.use('dark_background')
import sys
sys.path.append('../src')
import data
from tqdm import tqdm
from src import train
# from src.viz import *
import config

if __name__ == '__main__':
    # some settings
    outcome_def = 'y_consec_thresh'
    out_dir = oj('/scratch/users/vision/chandan/abc', 'nov16')
    dset_key = 'clath_aux+gak_a7d2'
    dset = config.DSETS[dset_key]
    binarize = False  # True

    # get data
    df = data.get_data(dset=dset_key)
    df = df[df['valid']]  # exclude test cells, short/long tracks, hotspots
    feat_names = data.get_feature_names(df)
    feat_names = data.select_final_feats(feat_names, binarize=binarize)
    print('num feats', len(feat_names))
    print(feat_names)

    # run
    os.makedirs(out_dir, exist_ok=True)
    feature_selection_nums = [
Example #29
import os
import sys, time
from os.path import join as oj
sys.path.insert(1, oj(sys.path[0], '..'))  # insert parent path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pmlb
from sklearn import metrics
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from copy import deepcopy
import pickle as pkl

import data
import fit


def fill_in_default_results(results):
    '''add keys for things which weren't recorded at the time
    '''
    for key in ['H_trace']:
        if key not in results:
            results[key] = None
    if 'beta_norm' not in results:
        results['beta_norm'] = 1
Example #30
def get_test_interval(model_dir):
  test = open(oj(model_dir,'train_output.log.test'),'r').readlines()
  return int(test[2].split()[0])
 def setup(self,  metadata_file=oj(config.DIR_PROCESSED, 'metadata_clath_aux+gak_a7d2.pkl')):
     np.random.seed(13)
     self.m = pkl.load(open(metadata_file, 'rb'))
     '''
Example #32
def fit(p):
    print(p._str(p))

    # set random seed
    np.random.seed(p.seed)
    torch.manual_seed(p.seed)

    # generate data
    X, y_onehot, y_scalar = data.generate_gaussian_data(p.N,
                                                        means=p.means,
                                                        sds=p.sds,
                                                        labs=p.labs)
    dset = data.dset(X, y_scalar)
    # viz.plot_data()

    # make model
    if p.loss_func == 'cross_entropy':
        model = torch.nn.Sequential(
            torch.nn.Linear(p.d_in, p.hidden1),
            torch.nn.ReLU(),
            torch.nn.Linear(p.hidden1, p.d_out),
            # don't use softmax with crossentropy loss
        )
    else:
        model = torch.nn.Sequential(torch.nn.Linear(p.d_in, p.hidden1),
                                    torch.nn.ReLU(),
                                    torch.nn.Linear(p.hidden1, p.d_out),
                                    torch.nn.Softmax())

    # set up optimization
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=p.lr)  # plain SGD over all model parameters
    scheduler = StepLR(optimizer,
                       step_size=p.step_size_optimizer,
                       gamma=p.gamma_optimizer)
    if p.loss_func == 'cross_entropy':
        loss_fn = torch.nn.CrossEntropyLoss()
    else:
        loss_fn = torch.nn.MSELoss(size_average=False)
    dataloader = DataLoader(dset, batch_size=p.batch_size, shuffle=True)

    if p.init == 'data-driven':
        initialize_bs_as_neg_x_times_w(X, model)

    # to record
    weights = {}
    losses = np.zeros(p.num_iters)
    norms = np.zeros((p.num_iters, p.num_layers))
    accs = np.zeros(p.num_iters)

    X_torch = torch.from_numpy(X)
    if p.loss_func == 'cross_entropy':
        y_torch = Variable(torch.from_numpy(y_scalar.flatten()).long(),
                           requires_grad=False)
    else:
        y_torch = Variable(torch.from_numpy(y_onehot), requires_grad=False)

    # fit
    # batch gd
    for it in tqdm(range(p.num_iters)):
        y_pred = model(Variable(X_torch))  # predict
        loss = loss_fn(y_pred,
                       y_torch)  # long target is needed for crossentropy loss
        optimizer.zero_grad()  # zero the gradients
        loss.backward()  # backward pass
        optimizer.step()  # update weights
        scheduler.step()  # advance the learning-rate scheduler

        # output
        if it % 100 == 0 or it == p.num_iters - 1:
            weight_dict = {
                x[0]: x[1].data.numpy()
                for x in model.named_parameters()
            }
            weights[it] = deepcopy(weight_dict)
        losses[it] = loss.data  #.item()
        accs[it] = np.mean(
            np.argmax(y_pred.data.numpy(), axis=1) == y_scalar.flatten()) * 100
        norms[it, 0] = np.linalg.norm(weight_dict['0.weight'])**2 + np.sum(
            weight_dict['0.bias']**2)
        norms[it, 1] = np.linalg.norm(weight_dict['2.weight'])**2

    # save
    if not os.path.exists(p.out_dir):  # create the output directory if it doesn't exist yet
        os.makedirs(p.out_dir)
    params = p._dict(p)

    # predict things
    X_train = X
    y_train = y_scalar
    pred_train = model(Variable(torch.from_numpy(X_train),
                                requires_grad=True)).data.numpy()  # predict

    if p.d_in == 1:
        X_test = np.linspace(np.min(X), np.max(X), 1000, dtype=np.float32)
        X_test = X_test.reshape(X_test.shape[0], 1)
        pred_test = model(
            Variable(torch.from_numpy(X_test),
                     requires_grad=True)).data.numpy()
    else:
        X_test = None
        pred_test = None

    # calculate time to min loss
    min_loss = np.min(losses)
    t_min_loss_plus_5_perc = np.argmax(losses <= min_loss * 1.05)
    t_min_loss_plus_10_perc = np.argmax(losses <= min_loss * 1.10)
    t_min_loss_plus_20_perc = np.argmax(losses <= min_loss * 1.20)

    results = {
        'weights': weights,
        'losses': losses,
        'norms': norms,
        'accs': accs,
        'min_loss': min_loss,
        'max_acc': np.max(accs),
        'model': model,
        'X_train': X_train,
        'y_train': y_scalar,
        'pred_train': pred_train,
        'X_test': X_test,
        'pred_test': pred_test,
        't_min_loss_plus_5_perc': t_min_loss_plus_5_perc,
        't_min_loss_plus_10_perc': t_min_loss_plus_10_perc,
        't_min_loss_plus_20_perc': t_min_loss_plus_20_perc
    }
    results_combined = {**params, **results}
    pkl.dump(results_combined, open(oj(p.out_dir, p._str(p) + '.pkl'), 'wb'))
    return results_combined, model
Example #33
#! /usr/bin/python3

import pandas as pd
import numpy as np
import os
from os.path import join as oj
from os.path import dirname

if __name__ == '__main__':
    import sys
    sys.path.append(oj(os.path.dirname(__file__), '..', '..', 'raw', 'usafacts_infections'))
    from load import load_usafacts_infections
else:
    from ...raw.usafacts_infections.load import load_usafacts_infections


def clean_usafacts_infections(data_dir=oj('..', '..', 'raw', 'usafacts_infections'),
                      out_dir='.'):
    ''' Clean usafacts data
    
    Parameters
    ----------
    data_dir : str; path to the data directory to find raw csv
    
    out_dir : str; path to the data directory to write cleaned csv
    
    Returns
    -------
    writes out cleaned csv file and returns clean data frame
    '''
    
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    device = torch.device("cuda" if args.cuda else "cpu")

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=True, download=True,
                       transform=transforms.ToTensor()),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('data', train=False, transform=transforms.ToTensor()),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    out_dir = 'samples'
    os.makedirs(out_dir, exist_ok=True)
    model = VAE().to(device)
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    
    # actually do training
    for epoch in range(1, args.epochs + 1):
        train(epoch)
        test(epoch)
        with torch.no_grad():
            sample = torch.randn(64, 20).to(device)
            sample = model.decode(sample).cpu()
            save_image(sample.view(64, 1, 28, 28),
                       oj(out_dir, 'sample_' + str(epoch) + '.png'))
Example #35
def aggregate_results(results, group_idxs, out_dir):
    '''Takes in results and makes curves when varying n_train + aggregates over seeds
    '''
    r2 = results.groupby(group_idxs)
    ind = pd.MultiIndex.from_tuples(r2.indices, names=group_idxs)
    df = pd.DataFrame(index=ind)

    # keys to record
    keys = [
        'ratio', 'bias', 'var', 'wnorm', 'mse_train', 'mse_test',
        'num_nonzero', 'mse_noiseless', 'df1', 'df2', 'df3', 'n_train',
        'num_features'
    ]
    for key in keys:
        df[key] = None
    for name, gr in tqdm(r2):
        p = gr.iloc[0]
        dset = p.dset
        noise_std = p.noise_std
        dset_num = p.dset_num
        model_type = p.model_type
        reg_param = p.reg_param
        num_features = p.num_features
        curve = gr.groupby(['n_train'])  #.sort_index()
        row = {k: [] for k in keys}
        row['model_type'] = model_type
        row['reg_param'] = reg_param
        row['num_features'] = num_features
        row['noise_std'] = noise_std
        for curve_name, gr2 in curve:

            # calculate bias/var across repeats
            '''
            if dset == 'gaussian':
                dset_name = ''
                _, _, _, y_true, betastar = \
                    data.get_data_train_test(n_test=p.n_test, p=p.num_features, 
                                             noise_std=0, noise_distr=p.noise_distr, iid=p.iid, # parameters to be determined
                                             beta_type=p.beta_type, beta_norm=p.beta_norm, cov_param=p.cov_param)
                y_true = y_true.reshape(1, -1) # 1 x n_test
            elif dset == 'pmlb':
                dset_name = data.REGRESSION_DSETS_LARGE_NAMES_RECOGNIZABLE[dset_num] # note this was switched at some point
                X, y = pmlb.fetch_data(dset_name, return_X_y=True)
                fit.seed(703858704)
                _, _, _, y_true = train_test_split(X, y) # get test set

                
            preds = gr2.preds_test.values
            preds = np.stack(preds) # num_seeds x n_test
            preds_mean = preds.mean(axis=0).reshape(1, -1) # 1 x n_test
            y_true_rep = np.repeat(y_true, repeats=preds.shape[0], axis=0) # num_seeds x n_test
            preds_mu = np.mean(preds)
            bias = np.mean(preds_mu - y_true_rep.flatten())
            var = np.mean(np.square(preds.flatten() - preds_mu))
            mse_noiseless = metrics.mean_squared_error(preds.flatten(), y_true_rep.flatten())
            row['bias'].append(bias)
            row['var'].append(var)
            row['mse_noiseless'].append(mse_noiseless)
            '''

            # aggregate calculated stats
            row['ratio'].append(gr2.num_features.values[0] /
                                gr2.n_train.values[0])
            row['n_train'].append(gr2.n_train.values[0])
            row['wnorm'].append(gr2.wnorm.mean())
            row['mse_train'].append(gr2.train_mse.mean())
            row['mse_test'].append(gr2.test_mse.mean())

            for key in ['num_nonzero', 'df1', 'df2', 'df3']:
                row[key].append(gr2[key].mean())

        for k in keys:
            df.at[name, k] = np.array(row[k])
    # df['mse_zero'] = metrics.mean_squared_error(y_true, np.zeros(y_true.size).reshape(y_true.shape))
    df.to_pickle(oj(out_dir, 'processed.pkl'))  # save into out_dir

    return df
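
A hypothetical call; results is assumed to be a pandas DataFrame with one row per run and the columns read inside the loop above:

group_idxs = ['dset', 'model_type', 'reg_param', 'num_features', 'noise_std']   # hypothetical grouping columns
df_agg = aggregate_results(results, group_idxs, out_dir='processed')
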
def load_county(data_dir=".",
                cached_file="county_data.csv",
                cached_abridged_file="county_data_abridged.csv",
                cached=True,
                abridged=True,
                infections_data="usafacts",
                rm_na=True):
    '''  Load in merged county data set
    
    Parameters
    ----------
    data_dir : string; path to the data directory
    
    cached_file : string; name of cached county-level data
    
    cached_abridged_file : string; name of cached abridged county-level data
    
    cached : logical; whether or not to load in cached data (if possible)
    
    abridged : logical; whether or not to load in abridged data
    
    infections_data : string; source for daily cases/deaths counts from
                      COVID-19 infections; must be either 'usafacts' or 'nytimes'
                      
    rm_na : logical; whether or not to remove counties with NA cases or deaths
        
    Returns
    -------
    data frame with abridged or full county-level data set
    '''

    # error checking
    if infections_data not in ['usafacts', 'nytimes']:
        raise ValueError(
            "infections_data must be either 'usafacts' or 'nytimes'")

    # data directories
    orig_dir = os.getcwd()
    data_dir_raw = oj(data_dir, "county_level", "raw")
    data_dir_clean = oj(data_dir, "county_level", "processed")

    if cached == True:
        # read in cached data
        if abridged == True:
            if os.path.exists(oj(data_dir, cached_abridged_file)):
                cnty = pd.read_csv(oj(data_dir, cached_abridged_file))
            else:
                raise ValueError("Cached abridged file cannot be found. " +
                                 "Please set cached = False.")
        else:
            if os.path.exists(oj(data_dir, cached_file)):
                cnty = pd.read_csv(oj(data_dir, cached_file))
            else:
                raise ValueError("Cached file cannot be found. " +
                                 "Please set cached = False")
        cnty["countyFIPS"] = cnty["countyFIPS"].astype(str).str.zfill(5)
    else:
        ## ADD PUBLIC DATASETS HERE
        public_datasets = [
            "ahrf_health", "cdc_svi", "chrr_health", "dhdsp_heart",
            "dhdsp_stroke", "hpsa_shortage", "ihme_respiratory", "khn_icu",
            "medicare_chronic", "mit_voting", "nchs_mortality",
            "usdss_diabetes", "jhu_interventions"
        ]
        ## ADD PRIVATE DATASETS HERE
        private_datasets = ["unacast_mobility"]

        # load in and clean county-level datasets
        df_ls = []
        for dataset in public_datasets + private_datasets:
            # check if raw data files exist locally; if not, download raw data
            if dataset == "chrr_health":
                os.chdir(oj(data_dir_raw, dataset))
                if not os.path.exists("state_data"):
                    # download raw data
                    os.system("python download.py")
                    print("downloaded " + dataset + " successfully")
                elif len(os.listdir("state_data")) != 51:
                    # download raw data
                    os.system("python download.py")
                    print("downloaded " + dataset + " successfully")
                os.chdir(orig_dir)
            elif dataset in private_datasets:
                os.chdir(oj(data_dir_raw, dataset))
                if not os.path.exists("../../../../../covid-19-private-data"):
                    # skip loading and cleaning
                    os.chdir(orig_dir)
                    continue
                os.chdir(orig_dir)
            elif dataset != "jhu_interventions":
                if not any(fname.startswith(dataset) \
                           for fname in os.listdir(oj(data_dir_raw, dataset))):
                    # download raw data
                    os.chdir(oj(data_dir_raw, dataset))
                    os.system("python download.py")
                    print("downloaded " + dataset + " successfully")
                    os.chdir(orig_dir)

            # clean data
            os.chdir(oj(data_dir_clean, dataset))
            df_ls.append(eval("clean_" + dataset + "()"))
            print("loaded and cleaned " + dataset + " successfully")
            os.chdir(orig_dir)

        # merge county ids data
        cnty_fips = pd.read_csv(
            oj(data_dir_raw, "county_ids", "county_fips.csv"))
        cnty_fips["countyFIPS"] = cnty_fips["countyFIPS"].str.zfill(5)
        cnty_latlong = pd.read_csv(
            oj(data_dir_raw, "county_ids", "county_latlong.csv"))
        cnty_latlong = cnty_latlong[["countyFIPS", "State", "lat", "lon"]]
        cnty_latlong["countyFIPS"] = cnty_latlong["countyFIPS"].astype(
            str).str.zfill(5)
        cnty_popcenters = pd.read_csv(
            oj(data_dir_raw, "county_ids", "county_popcenters.csv"))
        cnty_popcenters = cnty_popcenters[[
            "STATEFP", "COUNTYFP", "LATITUDE", "LONGITUDE"
        ]]
        cnty_popcenters = cnty_popcenters.rename(columns={
            "LATITUDE": "POP_LATITUDE",
            "LONGITUDE": "POP_LONGITUDE"
        })
        cnty_popcenters["countyFIPS"] = cnty_popcenters["STATEFP"].astype(
            str).str.zfill(2) + cnty_popcenters["COUNTYFP"].astype(
                str).str.zfill(3)
        cnty = pd.merge(cnty_fips, cnty_latlong, on="countyFIPS", how="left")
        cnty = pd.merge(cnty, cnty_popcenters, on="countyFIPS", how="left")

        # merge county-level data with county ids
        for i in range(0, len(df_ls)):
            df_ls[i] = clean_id(
                df_ls[i])  # remove potentially duplicate ID columns
            cnty = pd.merge(cnty, df_ls[i], on='countyFIPS',
                            how="left")  # merge data

        # basic preprocessing
        cnty = cnty.loc[:, ~cnty.columns.duplicated()]
        cnty = cnty.infer_objects()

        # add new features
        cnty = add_features(cnty)

        if abridged == True:
            # get shortlist of important variables for abridged data set
            id_vars = [
                "countyFIPS", "STATEFP", "COUNTYFP", 'CountyName', 'StateName',
                'State', 'lat', 'lon', "POP_LATITUDE", "POP_LONGITUDE"
            ]
            important_vars = id_vars + important_keys(cnty)
            cnty = cnty[important_vars]
            cnty.to_csv(oj(data_dir, cached_abridged_file),
                        header=True,
                        index=False)
            print("saved " + cached_abridged_file + " successfully")
        else:
            # write full county data to file
            cnty.to_csv(oj(data_dir, cached_file), header=True, index=False)
            print("saved " + cached_file + " successfully")

    # get covid-19 infections data
    if infections_data == 'usafacts':
        covid = load_usafacts_infections(
            oj(data_dir_raw, "usafacts_infections"))
    elif infections_data == 'nytimes':
        raise ValueError('infections_data = "nytimes" not yet implemented')

    # merge county data with covid data
    if rm_na == True:
        df = pd.merge(cnty, covid, on='countyFIPS', how='right')
    else:
        df = pd.merge(cnty, covid, on='countyFIPS', how='left')

    return df
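
A minimal usage sketch, assuming the county_level/raw and county_level/processed layout described in the docstring:

df = load_county(data_dir="../data", cached=True, abridged=True,
                 infections_data="usafacts", rm_na=True)
print(df.shape)   # counties x (county features + daily case/death columns)
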
                [max(a[i][0] - a[i - 1][1], 0),
                 max(a[i][1] - a[i - 1][0], 0)])
        return tmp

    for i in range(df_county.shape[0]):
        df_county.loc[i, newname + '_interval'].extend(
            find_intervals(df_county.loc[i, name],
                           df_county.loc[i, var + 'Intervals']))
    return df_county


if __name__ == '__main__':
    print('loading data...')
    NUM_DAYS_LIST = [1, 2, 3, 4, 5, 6, 7]
    df_county = load_data.load_county_level(
        data_dir=oj(parentdir, 'data')).fillna(0)
    df_county = add_preds(
        df_county,
        NUM_DAYS_LIST=NUM_DAYS_LIST,
        cached_dir=oj(parentdir,
                      'data'))  # adds keys like "Predicted Deaths 1-day"

    ## organize predictions as arrays

    add_pre(df_county, 'Predicted Cases ', 'pred_cases', 'pred_new_cases')
    add_pre(df_county, 'Predicted Deaths ', 'pred_deaths', 'pred_new_deaths')

    ## add new cases/deaths to dataframe
    add_new(df_county)
    ## add new cases/deaths predictions and their intervals
    df_county = add_new_pre(df_county, 'Predicted Cases ', 'tot_cases',
Example #38
  print('Usage: python plot.py path/to/model [start-iter=..] [end-iter=..]')

  try: 
    os.environ['DISPLAY']
  except: 
    print 'ERROR: X11 forwarding not enabled, cannot run script'
    sys.exit()

  model_dir = os.path.abspath(sys.argv[1])

  # command = "./parselog.sh %s"%(oj(model_dir,'train_output.log'))
  # print os.path.isfile(oj(model_dir,'train_output.log'))
  # print 'command:', command
  # call(command)
  cmd = "./parselog.sh "+oj(model_dir,'train_output.log')
  subprocess.Popen(cmd, shell=True, stdout = subprocess.PIPE, stderr=subprocess.STDOUT)
    
  # test_interval = [int(arg.split('=')[-1]) for arg in sys.argv
  #                  if arg.startswith('test-inter=')]
  # if len(test_interval) != 1:
  #     print 'ERROR: test-inter not properly given'
  #     sys.exit()
  # else: test_interval = test_interval[0]
  
  start,end = -1,-1
  for arg in sys.argv:
    if arg.startswith("start-iter="):
      start = int(arg.split('=')[-1])
    if arg.startswith("end-iter="):
      end = int(arg.split('=')[-1])
Example #39
    ]
    remap = {1: 'Low', 2: 'Medium', 3: 'High'}
    for i in NUM_DAYS_LIST:
        ks.append(f'Severity {i}-day')
        ks.append(f'Predicted New Deaths Hospital {i}-day')
        ks.append(f'Severity Index {i}-day')
        df[f'Severity Index {i}-day'] = [
            remap[x] for x in df[f'Severity {i}-day']
        ]
    return df[ks]


if __name__ == '__main__':
    print('loading data...')
    NUM_DAYS_LIST = [1, 2, 3, 4, 5, 6, 7]
    df_county = load_data.load_county_level(data_dir=oj(parentdir, 'data'))
    df_hospital = load_data.load_hospital_level(
        data_dir=oj(parentdir, 'data_hospital_level'))
    df_county = add_preds(
        df_county,
        NUM_DAYS_LIST=NUM_DAYS_LIST,
        cached_dir=oj(parentdir,
                      'data'))  # adds keys like "Predicted Deaths 1-day"
    df = merge_data.merge_county_and_hosp(df_county, df_hospital)
    df = add_severity_index(df, NUM_DAYS_LIST)
    df = df.sort_values('Total Deaths Hospital', ascending=False)

    write_to_gsheets_and_api(df,
                             service_file=oj(parentdir, 'creds.json'),
                             api_file=oj(parentdir, 'ian_key.env'))
    print('successfully wrote to gsheets')
def generate_map(df):
    df = rename(df)
    df['POS'] = df['County'] + ', ' + df['StateName']
    maps = []
    with urlopen(
            'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
    ) as response:
        counties = json.load(response)
    for key in keys:
        fig = px.choropleth(
            df,
            geojson=counties,
            locations='countyFIPS',
            color=np.log(df[key] + 1),
            color_continuous_scale=[
                '#F7E8E4', '#F5C8BB', '#B96D67', '#A83C3B', '#8B2222',
                '#5B0D0D', '#5A2318'
            ],
            scope="usa",
            hover_data=[
                'State', 'County', 'Cumulative Cases', 'New Cases',
                'Cumulative Deaths', 'New Deaths', 'Deaths per 100k',
                'Cases per 100k', 'New Cases per 100k', 'New Deaths per 100k'
            ],
            title=key + ' on ' +
            (datetime.today() - timedelta(days=1)).strftime('%m-%d'))
        fig.update_layout(coloraxis_colorbar=dict(
            len=0.75,
            title=key,
            tickvals=[
                2.302585092994046, 4.605170185988092, 6.907755278982137,
                9.210340371976184, 11.512925464970229
            ],
            ticktext=['10', '100', '1k', '10k', '100k'],  # one label per tick value above
            x=1,
            y=0.5))
        ## update the hover information
        for c in ["countyFIPS=%{location}<br>", "<br>color=%{z}"]:
            fig['data'][0]['hovertemplate'] = fig['data'][0][
                'hovertemplate'].replace(c, "")
        fig['data'][0]['hovertemplate'] = fig['data'][0][
            'hovertemplate'].replace("=", ": ")
        fig.update_layout(margin={"r": 0, "t": 40, "l": 0, "b": 0})
        fig.update_layout(
            paper_bgcolor='rgb(0,0,0)',
            plot_bgcolor='rgb(0,0,0)',
            template='plotly_dark',
        )
        fig['layout'].update(width=900, height=450, autosize=True, title_x=0.3)
        if key == 'Cumulative Cases':
            fig.write_image(oj(parentdir, "results/search_map.svg"),
                            width=900,
                            height=450)
        maps.append(
            plotly.offline.plot(fig, include_plotlyjs=False,
                                output_type='div'))

        df_tab = df.sort_values(by=key, ascending=False)
        df_tab = df_tab.reset_index(drop=True)[['POS', key]].loc[:19, :]
        fig = go.Figure(data=[
            go.Table(header=dict(values=['', 'County', key],
                                 line_color='grey',
                                 fill_color='darkgrey',
                                 font_color='white',
                                 font_size=12,
                                 align='center'),
                     cells=dict(values=[[i + 1 for i in range(len(df_tab))],
                                        df_tab['POS'], df_tab[key]],
                                line_color='darkgrey',
                                fill_color='grey',
                                font_color='white',
                                font_size=11,
                                align='center'),
                     columnwidth=[20, 120, 80])
        ])
        fig['layout'].update(paper_bgcolor='rgb(0,0,0)',
                             plot_bgcolor='rgb(0,0,0)',
                             margin=dict(l=0, r=0, t=0, b=0),
                             width=200,
                             height=550,
                             autosize=True,
                             template='plotly_dark')
        fig.write_image(oj(parentdir, "results/" + key + ".svg"),
                        width=200,
                        height=550)
    print('successfully generated search map')
    return maps
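# Note on the colorbar ticks used in generate_map above: because the map colors
# np.log(df[key] + 1), a tick that should read "10**k" has to sit at ln(10**k).
# A minimal sketch (assuming numpy imported as np) of how those constants and
# their labels line up:
import numpy as np

log_tickvals = [float(np.log(10 ** k)) for k in range(1, 6)]
log_ticktext = ['10', '100', '1k', '10k', '100k']
# log_tickvals == [2.3025..., 4.6051..., 6.9077..., 9.2103..., 11.5129...]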
Example #41
    hist_dict['train_acc_history'] = train_acc_history

    hist_dict['val_loss_history'] = val_loss_history
    hist_dict['train_cd_history'] = train_cd_history
    model.load_state_dict(best_model_wts)
    return model, hist_dict  #TODO hist


params_to_update = model.parameters()
criterion = nn.CrossEntropyLoss(weight=weights.double().float())
optimizer_ft = optim.SGD(params_to_update, lr=args.lr, momentum=args.momentum)

#optimizer_ft = optim.Adam(params_to_update, weight_decay = 0.001)
model, hist_dict = train_model(model,
                               dataloaders,
                               criterion,
                               optimizer_ft,
                               num_epochs=num_epochs)
pid = ''.join(["%s" % randint(0, 9) for num in range(0, 20)])
torch.save(model.state_dict(), oj(model_path, pid + ".pt"))
import pickle as pkl
hist_dict['pid'] = pid
hist_dict['regularizer_rate'] = -1
hist_dict['seed'] = args.seed
hist_dict['batch_size'] = args.batch_size
hist_dict['momentum'] = args.momentum

hist_dict['learning_rate'] = args.lr

pkl.dump(hist_dict, open(os.path.join(model_path, pid + '.pkl'), 'wb'))
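# A minimal sketch (not part of the original script) of how a saved run could
# be reloaded later. model_path and pid are the values used above; build_model
# is a hypothetical callable that reconstructs the same architecture before the
# weights are loaded.
def load_run(model_path, pid, build_model):
    import os
    import pickle as pkl
    import torch
    model = build_model()
    model.load_state_dict(torch.load(os.path.join(model_path, pid + ".pt"),
                                     map_location="cpu"))
    with open(os.path.join(model_path, pid + ".pkl"), "rb") as f:
        hist = pkl.load(f)
    return model, hist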
Example #42
def plot_dims(results,
              out_dir='figs',
              xlim=None,
              percent_to_explain=0.85,
              figname='explained',
              dim_types=[
                  'explained_var_dicts_pca', 'explained_var_dicts_rbf',
                  'explained_var_dicts_lap', 'explained_var_dicts_cosine'
              ]):
    # params for plotting
    plt.figure(figsize=(10, 18), dpi=100)
    #     skips = [('adam', 0.1), ('adam', 0.01), ('adam', 0.001)]
    skips = []

    dim_dicts = {}
    R, C = 5, 3
    for index, row in results.iterrows():
        # style for plotting
        #     style = '^' if row.optimizer == 'sgd' else '.'
        #     color = {0.1: 'red', 0.01: 'blue', 0.001: 'green'}[row.lr]
        color = 'orange' if row.optimizer == 'sgd' else 'deepskyblue'
        style = {1: '^', 0.1: '-', 0.01: '--', 0.001: '.'}[row.lr]
        alpha = {1.0: 0.3, 0.1: 0.8, 0.01: 0.8, 0.001: .3}[row.lr]
        if not (row.optimizer, row.lr) in skips:

            # accs
            try:
                plt.ylabel(row.dset[0])
            except:
                pass
            plt.subplot(R, C, 1)
            plt.plot(row.its,
                     row.losses_train,
                     style,
                     label=row.optimizer + ' ' + str(row.lr),
                     color=color,
                     alpha=alpha)
            plt.yscale('log')
            plt.title('train loss')

            plt.subplot(R, C, 2)
            plt.plot(row.its, row.losses_test, style, color=color, alpha=alpha)
            plt.yscale('log')
            plt.title('test loss')

            plt.subplot(R, C, 3)
            plt.plot(row.its, row.accs_test, style, color=color, alpha=alpha)
            plt.title('test acc')

            # dims
            for j in range(4):
                offset = 3 * (1 + j)
                plt.subplot(R, C, offset + 1)
                dim_dicts = row[dim_types[j]]

                # pick keys
                if 'weight_names' not in list(
                        results):  # this is old, remove after some reruns
                    if 'explained' in dim_types[j]:
                        lays = ['fc1.weight', 'fc2.weight', 'fc3.weight']
                    elif 'act' in dim_types[j]:
                        #                 dim_dicts = dim_dicts[0]
                        #                 print(dim_dicts.keys())
                        lays = ['fc1', 'fc2', 'fc3']
                else:
                    lays = row.weight_names
                    if 'act' in dim_types[j]:
                        lays = [
                            lay[:lay.rfind('.')] for lay in lays
                        ]  # act uses forward_all dict which doesn't have any . or .weight


#                 print(lays, dim_dicts[0].keys())

                lab = dim_types[j].replace('_var_dicts_', '')
                lab = lab.replace('explained', '')
                lab = lab.replace('act', 'act: ')
                plt.plot(row.its, [
                    frac_dims_to_explain_X_percent(
                        d[lays[0]], percent_to_explain) for d in dim_dicts
                ],
                         style,
                         color=color,
                         alpha=alpha)
                plt.ylabel(lab + '\n' + str(100 * percent_to_explain) +
                           '% frac dims (of ' +
                           str(dim_dicts[0][lays[0]].size) + ')')
                plt.title(lays[0])

                plt.subplot(R, C, offset + 2)
                plt.plot(row.its, [
                    frac_dims_to_explain_X_percent(
                        d[lays[1]], percent_to_explain) for d in dim_dicts
                ],
                         style,
                         color=color,
                         alpha=alpha)
                plt.title(lays[1])
                plt.ylabel('out of ' + str(dim_dicts[0][lays[1]].size))

                if len(lays) > 2:
                    plt.subplot(R, C, offset + 3)
                    plt.plot(row.its, [
                        frac_dims_to_explain_X_percent(
                            d[lays[2]], percent_to_explain) for d in dim_dicts
                    ],
                             style,
                             color=color,
                             alpha=alpha)
                    plt.title(lays[2])
                    plt.ylabel('out of ' + str(dim_dicts[0][lays[2]].size))

            if xlim is not None:
                for i in range(R * C):
                    plt.subplot(R, C, 1 + i)
                    plt.xlim((0, xlim))

    plt.subplot(R, C, 1)
    # remove duplicate labels
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())
    plt.savefig(oj(out_dir, 'dims_' + figname + '.png'), bbox_inches='tight')
    plt.show()
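# frac_dims_to_explain_X_percent is defined elsewhere in the codebase; judging
# from how it is called above (on an array of per-dimension explained variances
# and a target fraction such as 0.85), a plausible minimal sketch is the one
# below -- an assumption, not the original implementation.
def frac_dims_to_explain_X_percent(explained_var, percent_to_explain):
    """Fraction of dimensions whose cumulative explained variance first
    reaches percent_to_explain (e.g. 0.85)."""
    import numpy as np
    ratios = np.asarray(explained_var, dtype=float)
    ratios = ratios / ratios.sum()  # normalize in case raw variances are passed
    cum = np.cumsum(ratios)
    n_needed = int(np.searchsorted(cum, percent_to_explain)) + 1
    return n_needed / float(ratios.size)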
Example #43
import os
from os.path import join as oj

DIR_REPO = os.path.dirname(
    os.path.realpath(__file__))  # directory of the config file

# running and saving #################################
# DIR_PROCESSED_MISC = oj(DIR_REPO, 'processed')
DIR_RESULTS = oj(DIR_REPO, 'src', 'results')
DIR_FIGS = oj(DIR_REPO, 'figs')

# data #################################
DIR_CELEBA = oj(DIR_REPO, 'data/celeba-hq')
DIR_IMS = oj(DIR_CELEBA, 'ims/')
DIR_PROCESSED = oj(DIR_REPO, 'data/processed/')
DIR_GEN = oj(DIR_PROCESSED, 'gen', 'generated_images_0.1')

# lib paths #################################
DIR_LIB = oj(DIR_REPO, 'lib')
DIR_STYLEGAN = oj(DIR_LIB, 'stylegan2')

# attrs in latent space
DIR_LINEAR_DIRECTIONS = oj(DIR_REPO, 'data/annotations_stylegan2/linear_models'
                           )  # linear models for attributes on latent space
ATTRS = ['age', 'facial-hair', 'skin-color', 'gender', 'hair-length', 'makeup']
ATTRS_MEASURED = 'HAGCBM'
ALL_ATTRS = 'HAGCBMSEW'
ATTR_LABELS = {
    'C': 'skin-color',
    'H': 'hair-length',
    'G': 'gender\n(perceived)',
Example #44
def savefig(s: str):
    plt.savefig(oj(DIR_FIGS, s + '.pdf'))
    plt.savefig(oj(DIR_FIGS, s + '.png'), dpi=300)
  for dr in dirs:
    cdr = baseDir + "/" + dr
    jpgs = filter(lambda x: "jpg" in x, os.listdir(cdr))
    for jpg in jpgs:
      name = jpg.split(".")[0]
      shutil.move(cdr+"/"+jpg, tdr+"/"+jpg) 
      shutil.copyfile(cdr+"/inspection.txt", tdr+"/"+name+".dat")
      shutil.copyfile(cdr+"/meta.txt", tdr+"/"+name+".met")
      getUnsuitableFlags(tdr, name)
    shutil.rmtree(cdr) 
 
def getUnsuitableFlags(tdr, name):
  with open(tdr+"/"+name+".met") as met_f:
    if any(["UnsuitablePhoto=True" in line for line in met_f.readlines()]):
      with open(tdr+"/"+name+".dat", 'a') as dat_f:
        dat_f.write("UnsuitablePhoto")

if __name__ == "__main__":
  baseDir = sys.argv[1]
  tdr = os.path.abspath(baseDir)
  print 'checking whether any joint dirs left...'
  if any([os.path.isdir(oj(tdr,fd)) for fd in os.listdir(tdr)]):
    print 'found some; reorganizing them'
    reorganize(tdr)
  print 'no more joint dirs left'
  jpgs = filter(lambda x: "jpg" in x, os.listdir(tdr))
  for jpg in jpgs:
    name = jpg.split(".")[0]
    getUnsuitableFlags(tdr, name)

Example #46
            symlink_dir = os.path.abspath(arg.split('=')[-1])
        elif "data-info=" in arg:
            data_info = os.path.abspath(arg.split('=')[-1])

    redbox = False
    if '--redbox' in sys.argv: redbox = True

    if check.check(symlink_dir, data_info) != [0, 0] and not redbox:
        print 'ERROR: mismatch between test files in data_dir and data_info'
        sys.exit()

    if redbox:
        flag_val = create_redbox_data_info_etc(symlink_dir, data_info)

    PRETRAINED = get_pretrained_model(classifier_dir)
    already_pred = oj(data_info, PRETRAINED.split('/')[-1] + '_pred.npy')
    if os.path.isfile(already_pred) and raw_input('found %s; use? ([Y]/N) ' %
                                                  (already_pred)) != 'N':
        d = (np.load(already_pred)).item()
    else:
        d = classify_data(classifier_dir, symlink_dir, data_info, PRETRAINED,
                          redbox)

    if redbox:
        d = arrange_preds_with_flag_val(d, flag_val)

    # this should go in main as well?
    # get true labels, assign predicted labels, get metrics
    d = compute_classification_stats(d, data_info, redbox)
    print_classification_stats(d)
Example #47
def plot_losses(results, out_dir='figs'):
    # params for plotting
    plt.figure(figsize=(12, 8), dpi=100, facecolor='w')
    percent_to_explain = 0.90
    dim_types = ['pca', 'rbf', 'lap', 'cosine']
    #     skips = [('adam', 0.1), ('sgd', 1.0)] #, ('sgd', 0.1)]
    skips = []

    dim_dicts = {}
    R, C = 2, 4
    for index, row in results.iterrows():

        color = 'orange' if row.optimizer == 'sgd' else 'deepskyblue'
        style = {1: '^', 0.5: '-', 0.1: '-', 0.01: '--', 0.001: '.'}[row.lr]
        alpha = {1.0: 0.3, 0.5: 0.5, 0.1: 0.8, 0.01: 0.8, 0.001: .3}[row.lr]
        xlim = None  #20 # None

        if not (row.optimizer, row.lr) in skips:
            # accs
            plt.subplot(R, C, 1)
            plt.ylabel('full model')
            plt.plot(row.its,
                     row.losses_train,
                     style,
                     label=row.optimizer + ' ' + str(row.lr),
                     color=color,
                     alpha=alpha)
            plt.yscale('log')
            plt.title('train loss')

            plt.subplot(R, C, 2)
            plt.plot(row.its, row.losses_test, style, color=color, alpha=alpha)
            plt.yscale('log')
            plt.title('test loss')

            plt.subplot(R, C, 3)
            plt.plot(row.its,
                     row.accs_train,
                     style,
                     label=row.optimizer + ' ' + str(row.lr),
                     color=color,
                     alpha=alpha)
            plt.title('train acc')

            plt.subplot(R, C, 4)
            plt.plot(row.its, row.accs_test, style, color=color, alpha=alpha)
            plt.title('test acc')

            plt.subplot(R, C, 5)
            plt.ylabel('reconstructed with 85% PCs')
            plt.plot(row.its,
                     row.losses_train_r,
                     style,
                     label=row.optimizer + ' ' + str(row.lr),
                     color=color,
                     alpha=alpha)
            plt.yscale('log')
            plt.title('train loss')

            plt.subplot(R, C, 6)
            plt.plot(row.its,
                     row.losses_test_r,
                     style,
                     color=color,
                     alpha=alpha)
            plt.yscale('log')
            plt.title('test loss')

            plt.subplot(R, C, 7)
            plt.plot(row.its,
                     row.accs_train_r,
                     style,
                     label=row.optimizer + ' ' + str(row.lr),
                     color=color,
                     alpha=alpha)
            plt.title('train acc')

            plt.subplot(R, C, 8)
            plt.plot(row.its, row.accs_test_r, style, color=color, alpha=alpha)
            plt.title('test acc')

    plt.subplot(R, C, 1)
    # remove duplicate labels
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())
    plt.savefig(oj(out_dir, 'losses' + '.png'), bbox_inches='tight')
    plt.show()
      symlink_dir = os.path.abspath(arg.split('=')[-1])
    elif "data-info=" in arg:
      data_info = os.path.abspath(arg.split('=')[-1])

  redbox = False
  if '--redbox' in sys.argv: redbox = True
  
  if check.check(symlink_dir, data_info) != [0,0] and not redbox:
    print 'ERROR: mismatch between test files in data_dir and data_info'
    sys.exit()

  if redbox:
    flag_val = create_redbox_data_info_etc(symlink_dir, data_info)

  PRETRAINED = get_pretrained_model(classifier_dir)
  already_pred = oj(data_info, PRETRAINED.split('/')[-1]+'_pred.npy')
  if os.path.isfile(already_pred) and raw_input('found %s; use? ([Y]/N) '%(already_pred)) != 'N':
    d = (np.load(already_pred)).item()
  else:
    d = classify_data(classifier_dir, symlink_dir, data_info, PRETRAINED, redbox)

  if redbox:
    d = arrange_preds_with_flag_val(d, flag_val)
  
  # this should go in main as well?
  # get true labels, assign predicted labels, get metrics
  d = compute_classification_stats(d, data_info, redbox)
  print_classification_stats(d)
  
  # potential mislabels
  if "--mislab" in sys.argv:
Example #49
def restart_apache():
    run(oj(up(REMOTE_DIR), 'apache2/bin/restart'))
    metavar='N',
    help='hyperparameter for CDEP weight - higher means more regularization')
args = parser.parse_args()

regularizer_rate = args.regularizer_rate

num_epochs = args.epochs

device = torch.device(0)

# load model
model = models.vgg16(pretrained=True)
model.classifier[-1] = nn.Linear(4096, 2)
model = model.classifier.to(device)

with open(oj(dataset_path, "cancer.npy"), 'rb') as f:
    cancer_featuress = np.load(f)
with open(oj(dataset_path, "not_cancer.npy"), 'rb') as f:
    not_cancer_featuress = np.load(f)

cancer_targets = np.ones((cancer_featuress.shape[0])).astype(np.int64)
not_cancer_targets = np.zeros((not_cancer_featuress.shape[0])).astype(np.int64)
with open(oj(dataset_path, "not_cancer_cd.npy"), 'rb') as f:
    not_cancer_cd = np.load(f)
not_cancer_dataset = TensorDataset(
    torch.from_numpy(not_cancer_featuress).float(),
    torch.from_numpy(not_cancer_targets),
    torch.from_numpy(not_cancer_cd).float())

cancer_dataset = TensorDataset(
    torch.from_numpy(cancer_featuress).float(),
def add_preds(
    df_county,
    NUM_DAYS_LIST=[1, 2, 3],
    verbose=False,
    cached_dir=None,
    outcomes=['Deaths', 'Cases'],
    discard=False,
    d=datetime.datetime.today(),
    add_predict_interval=True,
    interval_target_days=[],
):
    '''Adds predictions for the current best model
    Adds keys that look like 'Predicted Deaths 1-day', 'Predicted Deaths 2-day', ...
    '''

    # select the best model
    advanced_model = {'model_type': 'advanced_shared_model'}
    linear = {'model_type': 'linear'}
    BEST_MODEL = [advanced_model, linear]

    # load cached preds
    if cached_dir is not None:
        # getting current date and time
        if not discard:
            cached_fname = oj(cached_dir,
                              f'preds_{d.month}_{d.day}_cached.pkl')
        else:
            cached_fname = oj(
                cached_dir, f'preds_{d.month}_{d.day}_cached_discard1day.pkl')
        if os.path.exists(cached_fname):
            return pd.read_pickle(cached_fname)

    print('predictions not cached, now calculating (might take a while)')
    for outcome in outcomes:
        print(f'predicting {outcome}...')
        tmp = [0 for _ in range(df_county.shape[0])]
        for num_days_in_future in tqdm(NUM_DAYS_LIST):  # 1 is tomorrow
            output_key = f'Predicted {outcome} {num_days_in_future}-day'
            df_county = fit_and_predict_ensemble(df_county,
                                                 methods=BEST_MODEL,
                                                 outcome=outcome.lower(),
                                                 mode='predict_future',
                                                 target_day=np.array(
                                                     [num_days_in_future]),
                                                 output_key=output_key,
                                                 verbose=verbose)
            vals = df_county[output_key].values
            out = []
            for i in range(vals.shape[0]):
                if np.isnan(vals[i]):
                    out.append(0)
                else:
                    out.append(
                        max(vals[i][0],
                            list(df_county[outcome.lower()])[i][-1], tmp[i]))
            df_county[output_key] = out
            tmp = out

        output_key = f'Predicted {outcome} Intervals'
        if add_predict_interval:
            if not interval_target_days:
                interval_target_days = NUM_DAYS_LIST
            print('prediction intervals...')
            print(interval_target_days)
            df_county = add_prediction_intervals(
                df_county,
                target_day=np.array(interval_target_days),
                outcome=outcome.lower(),
                methods=BEST_MODEL,
                interval_type='local',
                output_key=output_key)

    # add 3-day lagged death preds
    output_key = 'Predicted Deaths 3-day Lagged'
    df_county = fit_and_predict_ensemble(df_county,
                                         methods=BEST_MODEL,
                                         outcome='deaths',
                                         mode='eval_mode',
                                         target_day=np.array([3]),
                                         output_key=output_key,
                                         verbose=verbose)
    df_county[output_key] = [v[0] for v in df_county[output_key].values]

    if cached_dir is not None:
        df_county.to_pickle(cached_fname)
    return df_county
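# The inner loop in add_preds clamps each county's forecast so it is never
# below the last observed cumulative count and never decreases with the
# forecast horizon (tmp carries the previous horizon's clamped value). A tiny
# self-contained illustration of that clamping rule on made-up numbers:
observed_last = 50                     # last observed cumulative deaths for one county
raw_preds_by_day = [48.0, 53.0, 52.0]  # raw 1-, 2-, 3-day-ahead model outputs

clamped, prev = [], 0
for raw in raw_preds_by_day:
    val = max(raw, observed_last, prev)  # never below observed, never decreasing
    clamped.append(val)
    prev = val
# clamped == [50, 53.0, 53.0]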
Example #52
import scipy as sp
import pandas as pd
from functions import merge_data
from sklearn.model_selection import RandomizedSearchCV
import load_data
import exponential_modeling
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from fit_and_predict import add_preds
from datetime import datetime, timedelta
import pygsheets

if __name__ == '__main__':
    NUM_DAYS_LIST = [1, 2, 3, 4, 5, 6, 7]
    df_county = load_data.load_county_level(data_dir=oj(parentdir, 'data'))
    df_county = add_preds(
        df_county,
        NUM_DAYS_LIST=NUM_DAYS_LIST,
        cached_dir=oj(parentdir,
                      'data'))  # adds keys like "Predicted Deaths 1-day"

    # county-level stuff (#ICU_beds is county-level)
    k_surge = 'Severity (Surge) Prediction'
    df_county[k_surge] = 2 * df_county['Predicted Deaths 3-day'] - df_county[
        '#ICU_beds'].fillna(0)

    # rewrite pred cols
    today = datetime.today().strftime("%B %d")
    days = [
        'Predicted Deaths by ' +
Example #53
    # hyperparams
    seed = 1
    out_dir = '/scratch/users/vision/chandan/rf_sims_real/use_rf_rerun' # sim_results_fix_cov_C=0.25''
    use_rf = True
    C = 1
    dset_num = 0
    random_state = 42 # for each train_test_split


    # dset_num sys argv
    if len(sys.argv) > 1:
        dset_num = int(sys.argv[1])
    print('dset num', dset_num)    
    dset_name = dset_names[dset_num]

    # generate data
    np.random.seed(seed)
    os.makedirs(out_dir, exist_ok=True)
    X, y, df, X_test, y_test, X_cond, y_cond = get_data(dset_name, data_dir, random_state)

    # fit model
    forest, test_mse = run_sim.fit_model(X, y, X_test, y_test)
    pkl.dump({'rf': forest, 'test_mse': test_mse}, open(oj(out_dir, f'model_{dset_num}.pkl'), 'wb'))

    # generate data for conditional
    if use_rf:
        y_cond = forest.predict(X_cond)  # overwrite conditional targets with the fitted forest's predictions

    # calc curves
    run_sim.calc_curves(X, y, df, X_cond, y_cond, forest, out_dir, dset_num, C)
    print('done!')
Example #54
def compute_classification_stats(d, data_info, redbox=False):
    # this comes early because flag_val prompts user
    flag_val, threshold = get_flag_and_thresh(data_info)
    num_imgs = len(d['fname'])
    # get data_info test file
    if redbox:
        print 'opening redbox data info'
        lines = open(oj(data_info, 'redbox.txt'), 'r').readlines()
    else:
        lines = open(oj(data_info, 'test.txt'), 'r').readlines()
    _class = {}
    for line in lines:
        _class[line.split()[0]] = line.split()[1]
    # print "lines:", lines
    try:
        assert set(d['fname']) == set(_class.keys())
    except AssertionError:
        print "don't match: d['fname']", d['fname']
        print "and _class.keys()", _class.keys()
        exit()
    # fill with true labels
    d['label'] = [int(_class[el]) for el in d['fname']]
    # fill in predicted labels and flag if potentially mislab
    # *_thresh is with classification boundary according to threshold
    # *_std is with classification boundary at 0.5
    false_pos_thresh, num_pos, false_neg_thresh, num_neg, false_neg_std, false_pos_std = 0, 0, 0, 0, 0, 0
    # print "\nd['label'] has types %s and flag_val of type %s \n"%(type(d['label'][0]),type(flag_val))
    for idx in range(num_imgs):
        if d['label'][idx] == flag_val:
            print "%s is a positive, with preds %s" % (d['fname'][idx])
            print "its preds are", d['pred'][idx]
            num_pos += 1
        else:
            num_neg += 1
        # assign predicted label wrt threshold
        if d['pred'][idx][flag_val] >= threshold:
            d['pred_lab_thresh'].append(flag_val)
            # print "thresh thinks no clamp! appending", flag_val
        else:
            d['pred_lab_thresh'].append((flag_val + 1) % 2)
            print "thresh thinks clamp! appending", (flag_val + 1) % 2
        # assign predicted label in std way
        if d['pred'][idx][flag_val] >= 0.5:
            d['pred_lab_std'].append(flag_val)
            # print "std thinks no clamp! appending", flag_val, "\n"
        else:
            d['pred_lab_std'].append((flag_val + 1) % 2)
            print "std thinks clamp! appending", (flag_val + 1) % 2, "\n"
        # correct thresh classification or not
        if d['pred_lab_thresh'][idx] != d['label'][idx]:
            if d['label'][idx] == flag_val: false_neg_thresh += 1
            else: false_pos_thresh += 1
        # correct std classification or not
        if d['pred_lab_std'][idx] != d['label'][idx]:
            d['pot_mislab'].append(idx)
            if d['label'][idx] == flag_val: false_neg_std += 1
            else: false_pos_std += 1

    print 'false_neg_thresh: %i, false_pos_thresh: %i' % (false_neg_thresh,
                                                          false_pos_thresh)
    print 'false_neg_std: %i, false_pos_std: %i' % (false_neg_std,
                                                    false_pos_std)
    print 'num_neg: %i, num_pos: %i' % (num_neg, num_pos)
    # compute accuracies
    d['accuracy'] = {}
    d['accuracy']['total_thresh'] = 1 - (false_neg_thresh +
                                         false_pos_thresh) / float(num_imgs)
    d['accuracy']['pos_thresh'] = 1 - false_neg_thresh / float(num_pos)
    d['accuracy']['neg_thresh'] = 1 - false_pos_thresh / float(num_neg)
    d['accuracy']['total_std'] = 1 - (false_neg_std +
                                      false_pos_std) / float(num_imgs)
    d['accuracy']['pos_std'] = 1 - false_neg_std / float(num_pos)
    d['accuracy']['neg_std'] = 1 - false_pos_std / float(num_neg)
    print "d['accuracy']", d['accuracy']
    return d
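# A quick sanity check on the accuracy bookkeeping above, on hypothetical
# counts (not taken from any real run): with 8 positives, 12 negatives,
# 1 false negative and 2 false positives at a given decision boundary:
num_pos, num_neg = 8, 12
false_neg, false_pos = 1, 2
num_imgs = num_pos + num_neg

total_acc = 1 - (false_neg + false_pos) / float(num_imgs)  # 0.85
pos_acc = 1 - false_neg / float(num_pos)                   # 0.875
neg_acc = 1 - false_pos / float(num_neg)                   # ~0.833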
Example #55
def vroom():
    
    zz = argparse.ArgumentParser(
        description="Python script to upload multiple (or single) files to 'transfer.sh'"
    )
    
    zz.add_argument(
        'path',
        help='Path containing files to upload'
    )
    
    zz.add_argument(
        '-c', '--use-curl',
        dest='curl',
        action='store_true',
        help='Uses curl to upload file(s)'
    )
    
    zz.add_argument(
        '-w', '--use-wget',
        dest='wget',
        action='store_true',
        help='Uses wget to upload file(s)'
    )
    
    args = zz.parse_args()
    
    if args.curl and args.wget:
        zz.error("Cannot use both 'wget' and 'curl'")
        sys.exit(1)

    elif not args.curl and not args.wget:
        zz.error("Please choose an upload method (-c OR -w)")
        sys.exit(1)

    args.path = os.path.abspath(args.path)
    
    file_list = []
    
    skipped = []
    
    
    # pid / piff / ld appear to be aliases for os.path.isdir / os.path.isfile /
    # os.listdir imported elsewhere in this script (an assumption based on usage).
    if pid(args.path):
        root_dir = args.path
        
        for sub_file in ld(root_dir):
            if sub_file.startswith('.'):
                continue
            
            if re.search(r"[$/\\&\s\[\]{}^%]", sub_file):
                skipped.append(sub_file)
                continue
            
            sub_path = oj(root_dir, sub_file)
            
            if piff(sub_path):
                
                file_list.append(sub_path)
    
        if args.curl:
            for f_count, file in enumerate(file_list):
                
                collected_length = len(file_list)
                
                print("File %d of %d" % (f_count, collected_length))
                
                this_dir = oj(os.path.dirname(sys.argv[0]))
                
                logger = oj(this_dir, 'log.txt')
                
                file_name = os.path.basename(file)
                
                command = """ curl --upload-file "%s" https://transfer.sh/%s >> "%s" """ % (file, file_name, logger)
                
                os.system(command)
        
        elif args.wget:
            for f_count, file in enumerate(file_list):
                collected_length = len(file_list)
            
                print("File %d of %d" % (f_count, collected_length))
            
                this_dir = oj(os.path.dirname(sys.argv[0]))
            
                logger = oj(this_dir, 'log.txt')
            
                file_name = os.path.basename(file)
            
                command = """ wget --method PUT --body-file="%s" https://transfer.sh/%s -O - -v >> "%s" """ % (file, file_name, logger)
            
                os.system(command)
    
    if skipped:
        print('files skipped')
        
        print('\n'.join(str(f) for f in skipped))
        
        print("check file names for illegal characters (spaces, $, (), [], {}) ")
    
    elif piff(args.path):
        
        print("File 1 of 1")

        file = args.path
        
        sub_file = os.path.basename(file)
        
        if re.search(r"[$/\\&\s\[\]{}^%]", sub_file):
            print('file skipped')
            print("check file names for illegal characters (spaces, $, (), [], {})")
            # print("use '-a' flag to auto-rename files")

        this_dir = oj(os.path.dirname(sys.argv[0]))  # directory of this script, needed for the log path

        logger = oj(this_dir, 'log.txt')

        file_name = os.path.basename(file)

        if args.curl:
            command = """ curl --upload-file "%s" https://transfer.sh/%s >> "%s" """ % (file, file_name, logger)
            os.system(command)

        elif args.wget:
            command = """ wget --method PUT --body-file="%s" https://transfer.sh/%s -O - -v >> "%s" """ % (file, file_name, logger)
            os.system(command)
        
        
    
    rdr = open(logger, 'r')
    
    lines = rdr.read()
    
    rdr.close()
    
    os.remove(logger)
    
    lines = lines.split('https')
    
    for l in lines:

        if not l:
            continue

        print('https%s' % l)
    
    pass
def savefig(s: str, png=False):
#     plt.tight_layout()
    plt.savefig(oj(DIR_FIGS, 'fig_' + s + '.pdf'), bbox_inches='tight')
    if png:
        plt.savefig(oj(DIR_FIGS, 'fig_' + s + '.png'), dpi=300, bbox_inches='tight')
Example #57
def classify_data(classifier_dir,
                  symlink_dir,
                  data_info,
                  PRETRAINED,
                  redbox=False):
    N = 96
    classifier_name = classifier_dir.split('/')[-1]
    if classifier_name.split(
            '-fine')[0] + '_deploy.prototxt' not in os.listdir(classifier_dir):
        create_deploy_file(classifier_dir)

    MODEL_FILE = oj(classifier_dir,
                    classifier_name.split('-fine')[0] + '_deploy.prototxt')
    MEAN_FILE = np.load(get_np_mean_fname(symlink_dir))
    print 'loading network...'
    net = caffe.Classifier(MODEL_FILE,
                           PRETRAINED,
                           image_dims=(256, 256),
                           input_scale=255,
                           mean=MEAN_FILE,
                           channel_swap=(2, 1, 0))
    # flow of control:
    #   classifier::__init__(
    #   classifier::caffe.Net.__init__()
    print 'network loaded successfully'
    # set phase to test since we are doing testing
    net.set_phase_test()
    net.set_mode_gpu()
    d = {
        'fname': [],
        'pred': [],
        'time': [],
        'dude': [],
        'label': [],
        'pred_lab_thresh': [],
        'pred_lab_std': [],
        'pot_mislab': []
    }
    # load images
    if redbox:
        imgs, d = load_all_images_from_dir(d, oj(symlink_dir, 'redbox'),
                                           redbox)
    else:
        imgs, d = load_all_images_from_dir(d, oj(symlink_dir, 'test'))

    # classify images
    num_imgs = len(d['fname'])
    print "computing preds..."
    d['pred'] = net.predict(imgs[:N])
    # print pred
    if num_imgs > N:
        for i in range(1, num_imgs / N):
            d['pred'] = np.append(d['pred'],
                                  net.predict(imgs[i * N:(i + 1) * N]),
                                  axis=0)
        d['pred'] = np.append(d['pred'],
                              net.predict(imgs[-(len(imgs) % N):]),
                              axis=0)
    print "preds computed."

    # save preds
    assert len(d['pred']) == num_imgs
    np.save(oj(data_info, PRETRAINED.split('/')[-1] + '_pred.npy'), d)
    return d
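# The batching in classify_data special-cases the final partial chunk, which
# misbehaves when the number of images is an exact multiple of N (the slice
# imgs[-0:] re-predicts every image). A hedged alternative sketch of the same
# chunked prediction that sidesteps that edge case:
def predict_in_chunks(net, imgs, N=96):
    """Run net.predict on consecutive N-sized chunks and stack the results."""
    import numpy as np
    preds = [net.predict(imgs[i:i + N]) for i in range(0, len(imgs), N)]
    return np.concatenate(preds, axis=0)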
Example #58
#! /usr/bin/python3

import pandas as pd
import numpy as np
from os.path import join as oj
import os

if __name__ == '__main__':
    import sys
    sys.path.append(
        oj(os.path.dirname(__file__), '..', '..', 'raw', 'mit_voting'))
    from load import load_mit_voting
else:
    from ...raw.mit_voting.load import load_mit_voting


def clean_mit_voting(
        data_dir=oj('..', '..', 'raw', 'mit_voting'), out_dir='.'):
    ''' Clean 2000-2016 County Presidential Data
    
    Parameters
    ----------
    data_dir : str; path to the data directory to find raw csv
    
    out_dir : str; path to the data directory to write cleaned csv
    
    Returns
    -------
    writes out cleaned csv file and returns clean data frame
    '''
Example #59
  
  try:
    os.environ['DISPLAY']
  except KeyError:
    raise Exception('ERROR: X11 forwarding not enabled, cannot run script')

  if len(sys.argv) < 2:
      print_help()
  else:
    
    model_dir = os.path.abspath(sys.argv[1])

    log_fname, parsed = already_parsed(model_dir)
    if parsed == 'N':
      parse_log(log_fname)
    lfname = oj(model_dir, log_fname)

    test_dict = get_test_dict(lfname+'.test')
    train_dict = get_train_dict(lfname+'.train')

    Ys = {}
    tr_te_fields = train_fields + test_fields
    for key in train_fields:
      Ys[key] = train_dict[key]
    for key in test_fields:
      try:
        Ys[key] = test_dict[key]
      except KeyError:
        print "WARNING: found no %s fields"%(key)
        tr_te_fields.remove(key)