def bring_redbox_negatives(task, avoid_flags, add_num, data_dir, fn_train):
    """Append up to ``add_num`` flag-free images to a training list as negatives.

    Every ``*.dat`` file in ``data_dir`` is read as a list of flags (one per
    line). A file qualifies when none of its flags is in ``avoid_flags`` and
    its image is not already listed in ``fn_train``. Qualifying images are
    written as ``<data_dir>/<stem>.jpg 0`` and the whole training list is
    reshuffled and rewritten in place.

    Args:
        task: name of the task, used only in the progress message.
        avoid_flags: flags that disqualify an image from being a negative.
        add_num: maximum number of negatives to add.
        data_dir: directory holding the ``*.dat`` flag files.
        fn_train: path of the training list to extend (rewritten in place).
    """
    neg_classification = ' 0'  # see dump_to_files [('Default',0),(task,1)]
    print("Gathering vacant Redbox images without %s's flags..." % (task))

    with open(fn_train, 'r') as f_train:
        c_train = f_train.readlines()
    # First token of each training line is the image path.
    already = set(line.split(' ')[0].strip() for line in c_train)

    notperf = []
    for fname in os.listdir(data_dir):
        if len(notperf) >= add_num:
            break
        if not fname.endswith('.dat'):
            continue
        with open(oj(data_dir, fname), 'r') as f_dat:
            flags = [line.strip() for line in f_dat]
        # Compare the path that would be written, not the '.dat' name:
        # the training list never contains '.dat' names, so the old check
        # could not detect an image that was already present.
        jpg_path = oj(data_dir, fname[:-4]) + '.jpg'
        if jpg_path in already:
            continue
        if any(flag in avoid_flags for flag in flags):
            continue
        notperf.append(jpg_path + neg_classification + '\n')
    random.shuffle(notperf)
    print("Gathering completed.")

    newcomers = notperf[:add_num]
    print("Adding %i negatives to %s" % (len(newcomers), fn_train))

    # Drop blank lines and make sure every entry is newline-terminated,
    # otherwise shuffling would glue an unterminated last line to its
    # neighbour.
    c_train = [line if line.endswith('\n') else line + '\n'
               for line in c_train if line.strip()]
    c_train += newcomers
    random.shuffle(c_train)
    with open(fn_train, 'w') as f_train:
        f_train.writelines(c_train)
def save_mislabs(d, data_info, PRETRAINED):
    """Copy the potentially mislabelled test images into a fresh directory.

    The directory is ``<data_info>/potential_mislabels_<model name>`` where
    the model name is the last ``/``-separated component of ``PRETRAINED``.
    Any previous content of that directory is discarded.

    Args:
        d: dict with ``'pot_mislab'`` (indices) and ``'fname'`` (file names).
        data_info: directory containing the ``test`` image directory.
        PRETRAINED: path of the model the predictions came from.
    """
    mislab_dir = oj(data_info,
                    'potential_mislabels_' + PRETRAINED.split('/')[-1])
    # Only an existing directory is removed; any other mkdir failure
    # (missing parent, permissions) now surfaces as-is instead of being
    # mistaken for "already exists".
    if os.path.isdir(mislab_dir):
        shutil.rmtree(mislab_dir)
    os.mkdir(mislab_dir)
    for idx in d['pot_mislab']:
        shutil.copy(oj(data_info, 'test', d['fname'][idx]), mislab_dir)
    print("saving potential mislabels to %s" % (mislab_dir))
def get_pretrained_model(classifier_dir):
    """Return the path of a trained model snapshot inside ``classifier_dir``.

    Candidates are files whose name contains ``'iter'`` but not
    ``'solverstate'``. With exactly one candidate it is returned directly;
    with several the user is asked to pick one by index; with none the
    process exits with a non-zero status.

    Args:
        classifier_dir: directory to search.

    Returns:
        ``classifier_dir`` joined with the chosen file name.
    """
    import sys

    # Sorted so the indices shown to the user are stable between runs.
    suggest = sorted(fname for fname in os.listdir(classifier_dir)
                     if 'iter' in fname and 'solverstate' not in fname)
    if not suggest:
        print("ERROR: no model found in %s" % (classifier_dir))
        sys.exit(1)
    if len(suggest) == 1:
        return oj(classifier_dir, suggest[0])
    for elem in enumerate(suggest):
        print(elem)
    idx = int(raw_input("\nWhich model? "))
    return oj(classifier_dir, suggest[idx])
def all_labels(data_dir):
    """Group the sub-directories of ``data_dir`` by their inspection labels.

    Each sub-directory must contain an ``inspection.txt`` with one label per
    line. A directory whose file is empty is filed under ``'perfect'``;
    otherwise it is appended to the list of every label it carries.

    Returns:
        dict mapping label -> list of directory paths (joined with
        ``data_dir``); the ``'perfect'`` key is always present.
    """
    Queries = {'perfect': []}
    for entry in os.listdir(data_dir):
        sample_dir = oj(data_dir, entry)
        with open(oj(sample_dir, 'inspection.txt')) as f:
            labels = [raw.strip() for raw in f.readlines()]
        if not labels:
            Queries['perfect'].append(sample_dir)
        for label in labels:
            Queries.setdefault(label, []).append(sample_dir)
    return Queries
def classify_data(classifier_dir, symlink_dir, data_info, PRETRAINED, redbox=False):
    """Classify every image of a data set with a trained Caffe model.

    Loads the deploy prototxt (creating it when missing) and the mean image,
    builds a ``caffe.Classifier``, predicts all images in batches and saves
    the resulting dict to ``<data_info>/<model name>_pred.npy``.

    Args:
        classifier_dir: directory of the classifier; its last path component
            (up to ``'-fine'``) names the deploy prototxt.
        symlink_dir: directory holding the mean file and the ``test`` /
            ``redbox`` image directories.
        data_info: directory the prediction file is written to.
        PRETRAINED: path of the trained model weights.
        redbox: when true, classify ``symlink_dir/redbox`` instead of
            ``symlink_dir/test``.

    Returns:
        dict with (among others) ``'fname'`` and ``'pred'`` filled in.
    """
    N = 96  # number of images passed to net.predict per call
    classifier_name = classifier_dir.split('/')[-1]
    deploy_fname = classifier_name.split('-fine')[0] + '_deploy.prototxt'
    if deploy_fname not in os.listdir(classifier_dir):
        create_deploy_file(classifier_dir)
    MODEL_FILE = oj(classifier_dir, deploy_fname)
    MEAN_FILE = np.load(get_np_mean_fname(symlink_dir))

    print('loading network...')
    net = caffe.Classifier(MODEL_FILE, PRETRAINED,
                           image_dims=(256, 256), input_scale=255,
                           mean=MEAN_FILE, channel_swap=(2, 1, 0))
    print('network loaded successfully')
    # set phase to test since we are doing testing
    net.set_phase_test()
    net.set_mode_gpu()

    d = {'fname': [],
         'pred': [],
         'time': [],
         'dude': [],
         'label': [],
         'pred_lab_thresh': [],
         'pred_lab_std': [],
         'pot_mislab': []}

    # load images
    if redbox:
        imgs, d = load_all_images_from_dir(d, oj(symlink_dir, 'redbox'), redbox)
    else:
        imgs, d = load_all_images_from_dir(d, oj(symlink_dir, 'test'))

    # classify images
    num_imgs = len(d['fname'])
    print("computing preds...")
    # Walk the images in strides of N. The previous head/middle/tail split
    # used imgs[-(len % N):] for the tail, which is the WHOLE list when the
    # count is an exact multiple of N and so predicted every image twice.
    batches = [net.predict(imgs[i:i + N]) for i in range(0, len(imgs), N)]
    d['pred'] = np.concatenate(batches, axis=0) if batches else np.array([])
    print("preds computed.")

    # save preds
    assert len(d['pred']) == num_imgs
    np.save(oj(data_info, PRETRAINED.split('/')[-1] + '_pred.npy'), d)
    return d
def get_caffe_errors(model_dir, typ, idx):
    """Read one column of a parsed Caffe training log.

    Looks in ``model_dir`` for the single file whose name contains
    ``'train_output'`` and ends in ``'.log.<typ>'``, skips comment lines
    (starting with ``#``) and blank lines, and returns the first column
    paired with column ``idx`` of every remaining line.

    Args:
        model_dir: directory containing the parsed log.
        typ: log suffix, e.g. ``'train'`` or ``'test'``.
        idx: index of the whitespace-separated column to extract.

    Returns:
        list of ``(first_column, idx_column)`` string tuples.

    Exits the process when the log file is not unique, holds no data line,
    or a line has too few columns.
    """
    data_files = [fname for fname in os.listdir(model_dir)
                  if 'train_output' in fname and fname.endswith('.log.' + typ)]
    if len(data_files) != 1:
        print('there is not exactly 1 filename otf \'*train_output*.log.%s\' in given directory' % (typ))
        sys.exit()

    with open(oj(model_dir, data_files[0]), 'r') as f:
        content = [line.split() for line in f
                   if line.strip() and not line.startswith('#')]
    if not content:
        print('no data lines found in %s' % (data_files[0]))
        sys.exit()
    print('raw content looked like %s and %s' % (content[0], content[-1]))

    for i, fields in enumerate(content):
        if len(fields) <= idx:
            print('line[%i] is messed up: %s' % (i, fields))
            sys.exit()

    content = [(fields[0], fields[idx]) for fields in content]
    print('selected content looks like %s and %s' % (content[0], content[-1]))
    return content
def LoadFeatureFileData(groupName, base_file_path, loadStatsCollection, debug=False):
    """Index the feature files of one slide group in a database collection.

    Only the ``TCGA-*.seg*.txt`` files under ``base_file_path/groupName`` are
    registered; the data inside them is not loaded. A file that already has
    a record in ``loadStatsCollection`` (looked up by ``'filename'``) is
    skipped.

    Args:
        groupName: sub-directory (slide group) to scan.
        base_file_path: directory containing the group directories.
        loadStatsCollection: collection object providing ``find_one`` and
            ``insert_one``.
        debug: when true, print the number of files found.
    """
    feature_file_list = glob.glob(
        oj(base_file_path, groupName, 'TCGA-*.seg*.txt'))
    # total_files used to stay at 0, so the progress line always read
    # "out of 0 total files".
    total_files = len(feature_file_list)
    files_processed = newly_loaded = 0

    print("Analyzing  %s" % (groupName))
    if debug:
        print("Processing %s which has %d files" % (groupName, total_files))

    for ff in feature_file_list:
        LinePrinter("Processed %d files out of %d total files, and %d just now"
                    % (files_processed, total_files, newly_loaded))
        fileLoaded = loadStatsCollection.find_one({'filename': ff})
        if not fileLoaded:
            base_name = os.path.basename(ff)
            FFI = {}  # Feature File Info
            FFI['filename'] = ff
            FFI['LoadedToMongo'] = False
            FFI['slideGroup'] = groupName
            FFI['full_feature_filename'] = base_name
            FFI['slide_name_detail'] = base_name.split('.seg.')[0]
            FFI['slide_name'] = base_name.split('.')[0]
            FFI['MongoCollName'] = None
            # Counting lines is expensive, so it is only done for new files.
            FFI['tot_features'] = file_len(ff)
            loadStatsCollection.insert_one(FFI)
            newly_loaded += 1
        files_processed += 1
def already_parsed(model_dir):
    """Locate the training log in ``model_dir`` and report whether it was parsed.

    A log is a file whose name contains ``'train'`` and ends in ``'.log'``.
    When several exist the user is asked to choose one. The log counts as
    parsed when both ``<log>.train`` and ``<log>.test`` sit next to it.

    Args:
        model_dir: directory to search.

    Returns:
        ``(log_path, 'Y')`` when the parsed files exist, ``(log_path, 'N')``
        otherwise; ``log_path`` is ``model_dir`` joined with the log name.

    Exits the process with a non-zero status when no log is found.
    """
    import sys

    listdir = os.listdir(model_dir)
    fnames = [oj(model_dir, fname) for fname in listdir
              if 'train' in fname and fname.endswith('.log')]
    if not fnames:
        print("ERROR: no file containing 'train' and ending in '.log' found in %s" % (model_dir))
        sys.exit(1)

    if len(fnames) > 1:
        for elem in enumerate(fnames):
            print(elem)
        # Entries already include model_dir; joining it again broke
        # relative directories.
        fname = fnames[int(raw_input("\nChoose index number from above: "))]
    else:
        fname = fnames[0]

    base = os.path.basename(fname)
    if base + '.train' in listdir and base + '.test' in listdir:
        return fname, 'Y'  # found log and parsed
    return fname, 'N'  # found log but not parsed
def matplot(model_dir, train, val_acc, val_loss, start=-1, end=-1):
    """Plot training loss, validation accuracy and validation loss.

    Each of ``train``, ``val_acc`` and ``val_loss`` is a sequence of pairs
    whose second element is the value to plot. The figure is saved inside
    ``model_dir`` as ``plot_more_<third-from-last dir>_<last dir>.png``.

    Args:
        model_dir: ``/``-separated output directory (at least three
            components, since two of them are used in the file name).
        train, val_acc, val_loss: sequences of ``(x, value)`` pairs.
        start: first index to plot; ``-1`` means from the beginning.
        end: index to stop at; ``-1`` means up to ``len(train)``.
    """
    if end == start == -1:
        start, end = 0, len(train)
        print('plotting entire training data')
    elif start == -1:
        start = 0
        print('plotting from iter %i to %i' % (start, end))
    elif end == -1:
        print('plotting from iter %i to the end' % (start))
        end = len(train)
    else:
        print('plotting from iter %i to %i' % (start, end))

    # Built-in float replaces the np.float alias, which newer NumPy
    # releases no longer provide.
    ytrain = np.array([float(el[1]) for el in train[start:end]])
    ytest_acc = np.array([float(el[1]) for el in val_acc[start:end]])
    ytest_loss = np.array([float(el[1]) for el in val_loss[start:end]])
    x = np.array(range(len(ytrain)))

    # Validate before drawing anything.
    if len(x) != len(ytest_acc):
        print('len(x) %i != %i len(ytest_acc)' % (len(x), len(ytest_acc)))
        sys.exit()

    plt.ylim([0, 1.2])
    plt.plot(x, ytrain, label='training loss', color='0.55')
    plt.plot(x, ytest_acc, label='validation accuracy', color='g')
    plt.plot(x, ytest_loss, label='validation loss', color='r')
    plt.legend(loc='upper left')
    plt.xlabel('Iters')
    plt.ylabel('TrainingLoss')
    plt.grid(True)
    plt.savefig(oj(model_dir,
                   'plot_more_' + model_dir.split('/')[-3] + '_' +
                   model_dir.split('/')[-1] + '.png'))
def get_flag_and_thresh(data_info):
    """Read the flag class index and decision threshold from ``read.txt``.

    ``flag_val`` is the number in ``data_info/read.txt`` which indexes the
    class corresponding to the flag being present. When the file does not
    mention ``flag_val`` yet, ``augment_read`` is called to add it and the
    file is read again. The first two lines are skipped.

    Args:
        data_info: directory containing ``read.txt``.

    Returns:
        ``(flag_val, thresh)``; defaults are ``(0, 0.5)`` when the file does
        not provide a value.
    """
    flag_val, thresh = 0, 0.5
    read_fname = oj(data_info, 'read.txt')

    with open(read_fname, 'r') as f:
        rl = f.readlines()
    if not any('flag_val' in l for l in rl):
        # set up read.txt to contain flag val and threshold
        augment_read(data_info)
        with open(read_fname, 'r') as f:
            rl = f.readlines()

    for tokens in [l.split() for l in rl[2:]]:
        if tokens == ['1', 'flag_val']:
            flag_val = 1
        # Length guard: blank or single-token lines have no tokens[1].
        elif len(tokens) > 1 and tokens[1] == 'threshold':
            thresh = float(tokens[0])
    # if got no thresh to return, means read.txt needs be filled in
    return flag_val, thresh
def sample_from_label(Queries, data_dir):
    """Interactively copy sample images of one class into a local directory.

    Lists the classes in ``Queries``, asks which one to sample and how many
    directories to take, then copies every ``.jpg`` of the first ``length``
    directories of that class into a directory named after the class
    (relative to the current working directory).

    Args:
        Queries: dict mapping class label -> list of directories.
        data_dir: directory the entries of ``Queries`` are joined with.
    """
    # Snapshot the keys once: the index shown to the user must refer to the
    # same ordering that is used for the lookup.
    labels = list(Queries.keys())
    for elem in enumerate(labels):
        print(elem)

    lab, length = -1, -1
    while lab not in range(len(labels)):
        lab = int(raw_input("\nName 1 class number you wish to sample from: "))
    lab = labels[lab]

    # Upper bound is inclusive so that the whole class can be sampled.
    while not 0 <= length <= len(Queries[lab]):
        length = int(raw_input("\nSample how many? "))

    try:
        os.mkdir(lab)
    except OSError:
        if not raw_input('Sample for that class already exists, ok to overwrite? [Y]/N ') == 'N':
            shutil.rmtree(lab)
            os.mkdir(lab)
        # NOTE(review): answering 'N' still copies into the existing
        # directory below, as before — confirm that this is intended.

    for directory in Queries[lab][:length]:
        for f in os.listdir(oj(data_dir, directory)):
            if f.endswith('.jpg'):
                shutil.copy(oj(data_dir, directory, f), oj(lab, f))
def get_(data_dir, fname, what):
    """Extract field values from the ``.met`` metadata file of an image.

    The metadata file is ``data_dir/<stem>.met`` where ``<stem>`` is
    ``fname`` up to its first dot. For every line starting with one of the
    requested fields, the text after ``<field>=`` is taken up to the first
    whitespace and truncated to 10 characters.

    Args:
        data_dir: directory containing the metadata file.
        fname: image file name.
        what: iterable of field names to look for.

    Returns:
        list of values in the order the matching lines occur in the file
        (not in the order of ``what``).
    """
    ret = []
    meta_name = fname.split('.')[0] + '.met'
    # data_dir is honoured as passed; it used to be silently replaced by
    # the module-level REDBOX_DIR.
    with open(oj(data_dir, meta_name), 'r') as meta:
        for line in meta:
            for field in what:
                if line.startswith(field):
                    ret.append(line.split(field + '=')[-1].split()[0][:10])
    return ret
def push_tarball():
    """Pack the current directory and unpack it into a fresh ``REMOTE_DIR``.

    Builds ``tarball.tar.gz`` locally, recreates ``REMOTE_DIR`` on the
    remote side, uploads the archive, deletes the local copy, then extracts
    and deletes the archive remotely.
    """
    local('tar czf tarball.tar.gz *')
    # Start from an empty remote directory.
    for command in ('rm -rf ' + REMOTE_DIR, 'mkdir ' + REMOTE_DIR):
        run(command)
    put('tarball.tar.gz', oj(REMOTE_DIR, 'tarball.tar.gz'))
    local('rm tarball.tar.gz')
    with cd(REMOTE_DIR):
        for command in ('tar xzf tarball.tar.gz', 'rm tarball.tar.gz'):
            run(command)
def dump_to_files(Keep, data_info, task, data_dir):
    """Write shuffled ``train.txt`` and ``val.txt`` lookup files.

    This function "trusts" you: existing lookup files are overwritten (a
    warning is printed). For each of the classes ``'Default'`` (label 0) and
    ``task`` (label 1), the first 85% of its files go to ``train.txt`` and
    the rest to ``val.txt``. Each line reads ``<data_dir>/<file> <label>``.

    Args:
        Keep: dict with the keys ``'Default'`` and ``task`` mapping to lists
            of file names.
        data_info: directory the lookup files are written to.
        task: name of the positive class.
        data_dir: directory prefix joined onto every file name.
    """
    part = [0, 0.85, 1]  # partition boundaries as fractions of each class
    dump_fnames = ['train.txt', 'val.txt']  # ,'test.txt']
    class_labels = [('Default', 0), (task, 1)]

    for i, dump_fname in enumerate(dump_fnames):
        entries = []
        for key, num in class_labels:
            n_files = len(Keep[key])
            lo, hi = int(part[i] * n_files), int(part[i + 1] * n_files)
            entries += [[f, num] for f in Keep[key][lo:hi]]
        # this is the important shuffle actually
        random.shuffle(entries)

        out_fname = oj(data_info, dump_fname)
        if os.path.isfile(out_fname):
            print("WARNING: overwriting %s" % (out_fname))
        with open(out_fname, 'w') as dfile:
            dfile.writelines(["%s %i\n" % (oj(data_dir, f), num)
                              for (f, num) in entries])
def get_np_mean_fname(symlink_dir):
    """Convert the ``*mean.binaryproto`` in ``symlink_dir`` to a ``.npy`` file.

    The first file whose name ends in ``mean.binaryproto`` is parsed as a
    Caffe ``BlobProto`` and its first array is saved next to it as
    ``<prefix>_mean2.npy``, where ``<prefix>`` is the proto file name up to
    ``'_mean.binaryproto'`` and then up to ``'_fine'``.

    Args:
        symlink_dir: directory to search and to write into.

    Returns:
        path of the written ``.npy`` file.

    Exits the process when no binaryproto file is found.
    """
    proto_img_fname = ''
    for fname in os.listdir(symlink_dir):
        if fname.endswith('mean.binaryproto'):
            print('found binaryproto: %s' % (fname))
            proto_img_fname = fname
            break
    if proto_img_fname == '':
        print('ERROR: no *mean.npy nor *mean.binaryproto found in %s' % (symlink_dir))
        sys.exit()

    blob = caffe_pb2.BlobProto()
    with open(oj(symlink_dir, proto_img_fname), "rb") as proto_file:
        blob.ParseFromString(proto_file.read())
    nparray = caffe.io.blobproto_to_array(blob)[0]

    prefix = proto_img_fname.split('_mean.binaryproto')[0].split('_fine')[0]
    npy_mean_fname = prefix + '_mean2.npy'
    # open() instead of the Python 2-only file() builtin.
    with open(oj(symlink_dir, npy_mean_fname), "wb") as npy_mean_file:
        np.save(npy_mean_file, nparray)
    return oj(symlink_dir, npy_mean_fname)
def bring_redbox_positives(task, flags, add_num, redbox_dir, fn_train):
    """Append up to ``add_num`` flagged Redbox images to a training list.

    The ``*.dat`` files of ``redbox_dir`` are visited in random order; a
    file qualifies when one of its lines (stripped) is in ``flags``. For
    each qualifying file a line ``<redbox_dir>/<stem>.jpg 1`` is appended to
    ``fn_train``, each preceded by a newline.

    Args:
        task: unused here; kept for a signature parallel to
            ``bring_redbox_negatives``.
        flags: flags that mark an image as positive.
        add_num: maximum number of positives to add.
        redbox_dir: directory holding the ``*.dat`` flag files.
        fn_train: path of the training list to append to.
    """
    added = []
    listdir = os.listdir(redbox_dir)
    random.shuffle(listdir)
    for fl in listdir:
        if len(added) >= add_num:
            break
        if not fl.endswith('.dat'):
            continue
        with open(oj(redbox_dir, fl), 'r') as f:
            if any(line.strip() in flags for line in f):
                added.append(fl)

    with open(fn_train, 'a') as f:
        for fl in added:
            # Swap only the extension: str.replace('dat', 'jpg') also
            # rewrote 'dat' inside the stem (e.g. 'data_1.dat').
            f.write("\n" + oj(redbox_dir, fl[:-len('.dat')] + '.jpg') + " 1")
def create_redbox_data_info_etc(symlink_dir, data_info):
    """Build the Redbox symlink data set and its lookup files.

    Reads the label dict of ``REDBOX_DIR`` through the ``sa`` helpers,
    selects the classes to learn plus a default class, merges classes when
    more than two remain, then symlinks the data set into
    ``symlink_dir/redbox`` and writes the lookup files into ``data_info``.
    """
    data_dir = REDBOX_DIR
    All = sa.get_label_dict(data_dir)
    total_num_images = All.pop('total_num_images')

    Keep = sa.classes_to_learn(All)
    Keep = sa.default_class(All, Keep)

    total_num_check = sum(len(Keep[key]) for key in Keep.keys())
    if total_num_check != total_num_images:
        print("\nWARNING! started off with %i images, now have %i distinct training cases" % (total_num_images, total_num_check))

    if len(Keep.keys()) > 2:
        Keep, num_output = sa.merge_classes(Keep)
        # NOTE(review): num_output only exists after merge_classes, so the
        # exclusion check is kept inside this branch — confirm against the
        # intended flow.
        Keep, num_output = sa.check_mutual_exclusion(Keep, num_output)

    redbox_links = oj(symlink_dir, 'redbox')
    dump = symlink_redbox_dataset(Keep, data_dir, redbox_links)
    dump_redbox_to_files(Keep, dump, data_info)
def plot_time(d, save_dir):
    """Plot classification error against inspection time.

    The ``(time, error)`` pairs are ordered by time and drawn as a line; the
    figure is saved inside ``save_dir`` as
    ``plot_redbox_<third-from-last dir>_<last dir>_time.png``.

    Args:
        d: dict with equally long ``'time'`` and ``'error'`` sequences.
        save_dir: ``/``-separated output directory (at least three
            components, since two of them are used in the file name).

    Exits the process when the two sequences differ in length.
    """
    if len(d['time']) != len(d['error']):
        print("len(d['time']) %i != %i len(d['error'])" % (len(d['time']), len(d['error'])))
        sys.exit()

    # order by time
    # Sort whole pairs: sorting a 2-column array along axis 0 orders each
    # column on its own and detaches every error from its timestamp.
    pairs = sorted(zip(d['time'], d['error']), key=lambda pair: pair[0])
    x = np.array([t for t, _ in pairs], dtype=object)
    y = np.array([err for _, err in pairs], dtype=object)

    plt.ylim([0, 1.2])
    plt.plot(x, y)
    plt.legend(loc='upper left')
    plt.xlabel('Inspected Time')
    plt.ylabel('Classification Error')
    plt.grid(True)
    plt.savefig(oj(save_dir,
                   'plot_redbox_' + save_dir.split('/')[-3] + '_' +
                   save_dir.split('/')[-1] + '_time.png'))
def get_label_dict_knowing(data_dir, task, pos_class):
    '''
    get_label_dict() knowing exactly which flags to look for and how to
    group them into classes.

    task is the name of what we're learning to detect, pos_class is a list
    of the actual flag names to look for. Every '.dat' file in data_dir is
    read as one flag per line; its image ('<stem>.jpg', stem being the file
    name up to the first dot) goes under task when any flag matches and
    under 'Default' otherwise.
    '''
    d = {'Default': [], task: []}
    print('generating specific dict of class:files from %s with pos class %s...' % (data_dir, pos_class))
    wanted = set(pos_class)
    for filename in os.listdir(data_dir):
        if filename.endswith('.dat'):
            with open(oj(data_dir, filename)) as f:
                content = [line.strip() for line in f.readlines()]
            image_name = filename.split('.')[0] + '.jpg'
            bucket = 'Default' if wanted.isdisjoint(content) else task
            d[bucket].append(image_name)
    return d
def matplot(model_dir, Ys, start, end):
    """Plot every series of ``Ys`` over the iteration range ``[start, end)``.

    Each key of ``Ys`` must be one of the known series names (see ``col``),
    which fixes its colour. The figure is saved inside ``model_dir`` as
    ``plot_<second-from-last dir>.png``.

    Args:
        model_dir: ``/``-separated output directory.
        Ys: dict mapping series name -> sequence of values; left unmodified.
        start: first index to plot.
        end: index to stop at (exclusive).
    """
    col = {'TrainLoss': '0.5',
           'ValLoss': '#000066',
           'ValAcc_0': '#00CC00',
           'ValAcc_1': '#ff4d4d',
           'ValPCAcc': 'k',
           'ValAcc': 'y'}
    plt.ylim([0, 1.2])
    x = np.array(range(start, end))
    plt.xlabel('Iters')
    for key in Ys.keys():
        # Use a local array: writing the slice back into Ys truncated the
        # caller's data. Built-in float replaces the np.float alias.
        series = np.array([float(el) for el in Ys[key][start:end]])
        plt.plot(x, series, label=key, color=col[key])
    # The column count must be a positive integer.
    plt.legend(loc='upper left', ncol=max(1, len(Ys) // 2), prop={'size': 10})
    plt.grid(True)
    plt.savefig(oj(model_dir, 'plot_' + model_dir.split('/')[-2] + '.png'))
def load_all_images_from_dir(d, test_dir, redbox=False):
    """Load every image of ``test_dir`` and record its file name in ``d``.

    With ``redbox`` set, the inspector and inspection date of each image are
    read from its metadata via ``get_`` and appended to ``d['dude']`` and
    ``d['time']``; the date is reordered from ``a/b/c`` to ``c-b-a``.

    Args:
        d: result dict; ``'fname'`` is overwritten, ``'time'`` and
            ``'dude'`` are appended to.
        test_dir: directory containing the images.
        redbox: whether to collect the Redbox metadata as well.

    Returns:
        ``(imgs, d)`` where ``imgs`` is the list of loaded images in the
        order of ``d['fname']``.
    """
    imgs = []
    d['fname'] = os.listdir(test_dir)
    print('loading images from %s...' % (test_dir))
    for fname in d['fname']:
        imgs.append(caffe.io.load_image(oj(test_dir, fname)))
        if redbox:
            # NOTE(review): get_ returns values in metadata-file order, not
            # in the order requested here; this unpacking presumably relies
            # on InspectedBy preceding InspectedTime in the file — confirm.
            [dude, inspected] = get_(REDBOX_DIR, fname,
                                     ['InspectedTime', 'InspectedBy'])
            parts = inspected.split('/')
            d['time'].append(parts[2] + '-' + parts[1] + '-' + parts[0])
            d['dude'].append(dude)
    print('finished loading images.')
    return imgs, d
def plot_dudes(d, save_dir):
    """Bar-plot, per inspector, the share of images flagged as mislabelled.

    For every distinct entry of ``d['dude']`` the number of indices in
    ``d['pot_mislab']`` attributed to it is divided by the total number of
    images. The figure is saved inside ``save_dir`` as
    ``plot_redbox_<third-from-last dir>_<last dir>_dude.png``.

    Args:
        d: dict with ``'dude'`` (one inspector per image) and
            ``'pot_mislab'`` (indices into ``'dude'``).
        save_dir: ``/``-separated output directory (at least three
            components, since two of them are used in the file name).
    """
    num_imgs = len(d['dude'])
    counts = dict((dude, 0) for dude in set(d['dude']))
    for idx in d['pot_mislab']:
        counts[d['dude'][idx]] += 1

    # The fractions come from the counts gathered above. The previous code
    # indexed the input dict by inspector name and then replaced the result
    # with an empty array.
    names = sorted(counts)
    fractions = np.array([float(counts[name]) / num_imgs for name in names])

    fig = plt.figure()
    width = .35
    ind = np.arange(len(names))
    plt.bar(ind, fractions)
    plt.xticks(ind + width / 2, names)
    fig.autofmt_xdate()
    plt.xlabel('Inspected By')
    plt.ylabel('% mis-classifications')
    plt.savefig(oj(save_dir,
                   'plot_redbox_' + save_dir.split('/')[-3] + '_' +
                   save_dir.split('/')[-1] + '_dude.png'))
import os from os.path import join as oj import sys sys.path.insert(1, oj(sys.path[0], '..')) # insert parent path import torch from torch.autograd import Variable import numpy as np import matplotlib.pyplot as plt from tqdm import tqdm from torch.optim.lr_scheduler import StepLR from torch.utils.data import DataLoader from copy import deepcopy import pickle as pkl import pandas as pd # generate mixture model # means and sds should be lists of lists (sds just scale variances) def generate_gaussian_data(N, means=[0, 1], sds=[1, 1], labs=[0, 1]): num_means = len(means) # deal with 1D if type(means[0]) == int or type(means[0]) == float: means = [[m] for m in means] sds = [[sd] for sd in sds] P = 1 else: P = len(means[0]) X = np.zeros((N, P), dtype=np.float32) y_plot = np.zeros((N, 1), dtype=np.float32) y_one_hot = np.zeros((N, 2), dtype=np.float32)
def plot_dims_flexible(
        results,
        out_dir='figs',
        xlim=None,
        percent_to_explain=0.85,
        figname='explained',
        dim_types=['explained_var_dicts_pca', 'explained_var_dicts_rbf']):
    """Plot loss/accuracy curves and per-layer explained-dimension fractions.

    One curve is drawn per row of ``results``. The first subplot row shows
    train loss, test loss and test accuracy over ``row.its``; each entry of
    ``dim_types`` gets a further subplot row with one panel per layer
    (all but the last entry of ``row.weight_names``), showing
    ``frac_dims_to_explain_X_percent`` over the iterations. The figure is
    saved as ``<out_dir>/dims_flexible_<figname>.png`` and then shown.

    Args:
        results: DataFrame; rows are read for ``optimizer``, ``lr``,
            ``its``, ``losses_train``, ``losses_test``, ``accs_test``,
            ``weight_names`` and the columns named in ``dim_types``.
        out_dir: directory the figure is saved to.
        xlim: when not None, upper x limit applied to every subplot.
        percent_to_explain: forwarded to ``frac_dims_to_explain_X_percent``.
        figname: suffix of the output file name.
        dim_types: column names of ``results`` to plot (never mutated here).
    """
    # params for plotting
    num_lays = len(results.iloc[0].weight_names) - 1
    # print(results.iloc[0].weight_names)
    plt.figure(figsize=(num_lays * 3, 8), dpi=100)
    # skips = [('adam', 0.1), ('adam', 0.01), ('adam', 0.001)]
    skips = []
    dim_dicts = {}
    R, C = 5, max(3, num_lays)  # subplot grid: rows, columns
    for index, row in results.iterrows():
        # style for plotting
        # style = '^' if row.optimizer == 'sgd' else '.'
        # color = {0.1: 'red', 0.01: 'blue', 0.001: 'green'}[row.lr]
        color = 'orange' if row.optimizer == 'sgd' else 'deepskyblue'
        # Only these learning rates are supported; any other raises KeyError.
        style = {1: '^', 0.1: '-', 0.01: '--', 0.001: '.'}[row.lr]
        alpha = {1.0: 0.3, 0.1: 0.8, 0.01: 0.8, 0.001: .3}[row.lr]
        if not (row.optimizer, row.lr) in skips:
            # accs
            # Best effort: a failure while labelling with row.dset[0] is ignored.
            try:
                plt.ylabel(row.dset[0])
            except:
                pass
            plt.subplot(R, C, 1)
            plt.plot(row.its,
                     row.losses_train,
                     style,
                     label=row.optimizer + ' ' + str(row.lr),
                     color=color,
                     alpha=alpha)
            plt.yscale('log')
            plt.title('train loss')
            plt.subplot(R, C, 2)
            plt.plot(row.its,
                     row.losses_test,
                     style,
                     color=color,
                     alpha=alpha)
            plt.yscale('log')
            plt.title('test loss')
            plt.subplot(R, C, 3)
            plt.plot(row.its,
                     row.accs_test,
                     style,
                     color=color,
                     alpha=alpha)
            plt.title('test acc')
            # dims
            for r in range(len(dim_types)):
                offset = C * (1 + r)  # index of the last panel of the previous row
                dim_dicts = row[dim_types[r]]
                lays = row.weight_names
                if 'act' in dim_types[r]:
                    # Drop everything from the last '.' on (e.g. '.weight'):
                    # per the original note, the activation dicts are keyed
                    # without that suffix.
                    lays = [
                        lay[:lay.rfind('.')] for lay in lays
                    ]
                lab = dim_types[r].replace('_var_dicts_', '')
                lab = lab.replace('explained', '')
                lab = lab.replace('act', 'act: ')
                for c in range(len(lays) - 1):
                    plt.subplot(R, C, offset + 1 + c)
                    plt.plot(row.its, [
                        frac_dims_to_explain_X_percent(
                            d[lays[c]], percent_to_explain) for d in dim_dicts
                    ],
                             style,
                             color=color,
                             alpha=alpha)
                    plt.ylim((0, 1))
                    if c == 0:
                        plt.ylabel(lab + ' ' + str(100 * percent_to_explain) +
                                   '% frac\ndimsof ' +
                                   str(dim_dicts[0][lays[c]].size))
                    if r == 0:
                        plt.title(lays[c])
    if not xlim is None:
        for i in range(R * C):
            plt.subplot(R, C, 1 + i)
            plt.xlim((0, xlim))
    plt.subplot(R, C, 1)
    # remove duplicate labels
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())
    plt.savefig(oj(out_dir, 'dims_flexible_' + figname + '.png'),
                bbox_inches='tight')
    plt.show()
def compute_classification_stats(d, data_info, redbox=False):
    """Derive predicted labels, potential mislabels and accuracies from preds.

    True labels are read from ``data_info/test.txt`` (or ``redbox.txt``),
    whose lines are ``<file name> <label>``. Each image gets two predicted
    labels: one using the threshold from ``read.txt`` and one using 0.5.
    Images misclassified under the 0.5 rule are recorded as potential
    mislabels.

    Args:
        d: dict with ``'fname'`` and ``'pred'``; the keys ``'label'``,
            ``'pred_lab_thresh'``, ``'pred_lab_std'``, ``'pot_mislab'`` and
            ``'accuracy'`` are (re)written.
        data_info: directory holding ``read.txt`` and the lookup file.
        redbox: use ``redbox.txt`` instead of ``test.txt``.

    Returns:
        ``d``. Per-class accuracies are ``nan`` when that class is absent.

    Exits the process when an image has no entry in the lookup file.
    """
    import sys

    def _accuracy(num_errors, total):
        # A class without members has no defined accuracy.
        return 1 - num_errors / float(total) if total else float('nan')

    # this comes early because flag_val prompts user
    flag_val, threshold = get_flag_and_thresh(data_info)
    other_val = (flag_val + 1) % 2
    num_imgs = len(d['fname'])

    # get data_info test file
    if redbox:
        print('opening redbox data info')
        lookup_fname = oj(data_info, 'redbox.txt')
    else:
        lookup_fname = oj(data_info, 'test.txt')
    with open(lookup_fname, 'r') as f:
        lines = f.readlines()

    _class = {}
    for line in lines:
        tokens = line.split()
        if len(tokens) >= 2:  # skip blank / malformed lines
            _class[tokens[0]] = tokens[1]

    if set(d['fname']) != set(_class.keys()):
        print("don't match: d['fname'] %s" % (d['fname'],))
        print("and _class.keys() %s" % (list(_class.keys()),))
        # Extra lookup entries are harmless; an image without a label
        # cannot be scored, so stop here with a clear status. (The old
        # bare `exit` expression did nothing.)
        if not set(d['fname']).issubset(_class):
            sys.exit(1)

    # fill with true labels
    d['label'] = [int(_class[el]) for el in d['fname']]

    # fill in predicted labels and flag if potentially mislab
    # *_thresh is with classification boundary according to threshold
    # *_std is with classification boundary at 0.5
    # Start from empty lists so the positions line up with idx even when d
    # already went through this function once.
    d['pred_lab_thresh'], d['pred_lab_std'], d['pot_mislab'] = [], [], []
    false_pos_thresh = false_neg_thresh = 0
    false_pos_std = false_neg_std = 0
    num_pos = num_neg = 0

    for idx in range(num_imgs):
        is_positive = d['label'][idx] == flag_val
        if is_positive:
            # Both placeholders now receive a value; the one-argument form
            # raised TypeError on the first positive.
            print("%s is a positive, with preds %s" % (d['fname'][idx], d['pred'][idx]))
            print("its preds are %s" % (d['pred'][idx],))
            num_pos += 1
        else:
            num_neg += 1

        # assign predicted label wrt threshold
        if d['pred'][idx][flag_val] >= threshold:
            d['pred_lab_thresh'].append(flag_val)
        else:
            d['pred_lab_thresh'].append(other_val)
            print("thresh thinks clamp! appending %s" % (other_val))

        # assign predicted label in std way
        if d['pred'][idx][flag_val] >= 0.5:
            d['pred_lab_std'].append(flag_val)
        else:
            d['pred_lab_std'].append(other_val)
            print("std thinks clamp! appending %s \n" % (other_val))

        # correct thresh classification or not
        if d['pred_lab_thresh'][idx] != d['label'][idx]:
            if is_positive:
                false_neg_thresh += 1
            else:
                false_pos_thresh += 1

        # correct std classification or not
        if d['pred_lab_std'][idx] != d['label'][idx]:
            d['pot_mislab'].append(idx)
            if is_positive:
                false_neg_std += 1
            else:
                false_pos_std += 1

    print('false_neg_thresh: %i, false_pos_thresh: %i' % (false_neg_thresh, false_pos_thresh))
    print('false_neg_std: %i, false_pos_std: %i' % (false_neg_std, false_pos_std))
    print('num_neg: %i, num_pos: %i' % (num_neg, num_pos))

    # compute accuracies
    d['accuracy'] = {}
    d['accuracy']['total_thresh'] = _accuracy(false_neg_thresh + false_pos_thresh, num_imgs)
    d['accuracy']['pos_thresh'] = _accuracy(false_neg_thresh, num_pos)
    d['accuracy']['neg_thresh'] = _accuracy(false_pos_thresh, num_neg)
    d['accuracy']['total_std'] = _accuracy(false_neg_std + false_pos_std, num_imgs)
    d['accuracy']['pos_std'] = _accuracy(false_neg_std, num_pos)
    d['accuracy']['neg_std'] = _accuracy(false_pos_std, num_neg)
    print("d['accuracy'] %s" % (d['accuracy'],))
    return d
def plot_weight_norms_and_margin(results, xlim=None, out_dir='figs'):
    """Plot per-layer weight norms and several margin curves on a 4x4 grid.

    One curve is drawn per row of ``results`` (rows whose
    ``(optimizer, lr)`` is in ``skips`` are left out):

    * grid row 1 / row 2: Frobenius norm of up to three layers, for
      ``'sgd'`` rows and for all other optimizers respectively;
    * grid row 3: unnormalized train/test margin, then the same divided by
      the product of the layers' Frobenius norms;
    * grid row 4: softmax train/test margin, then the unnormalized margins
      divided by the product of the layers' spectral norms.

    The figure is saved as ``<out_dir>/weight_norms_and_margin.png`` and
    then shown.

    Args:
        results: DataFrame; rows are read for ``optimizer``, ``lr``,
            ``its``, ``weight_norms``, ``weight_names`` (optional column)
            and the ``mean_margin_*`` fields.
        xlim: when not None, upper x limit applied to every subplot.
        out_dir: directory the figure is saved to.
    """
    # params for plotting
    skips = [('adam', 0.1)]
    # skips = []
    dim_dicts = {}
    R, C = 4, 4  # subplot grid: rows, columns
    plt.figure(figsize=(14, 14), dpi=100)
    for index, row in results.iterrows():
        # style for plotting
        color = 'orange' if row.optimizer == 'sgd' else 'deepskyblue'
        # Only these learning rates are supported; any other raises KeyError.
        style = {1: '^', 0.1: '-', 0.01: '--', 0.001: '.'}[row.lr]
        alpha = {1.0: 0.3, 0.1: 0.8, 0.01: 0.8, 0.001: .3}[row.lr]
        if not (row.optimizer, row.lr) in skips:
            # dims
            wnorms = row.weight_norms
            if not 'weight_names' in list(
                    results):  # this is old, remove after some reruns
                lays = ['fc1.weight', 'fc2.weight', 'fc3.weight']
            else:
                lays = row.weight_names
            # lays = ['fc1.weight', 'fc2.weight', 'fc3.weight']
            keys = sorted(wnorms.keys())  # x values for the norm curves
            if row.optimizer == 'sgd':
                # sgd norms go to the first grid row
                for j in range(min(3, len(lays))):
                    plt.subplot(R, C, 1 + j)
                    vals = [wnorms[key][lays[j] + '_fro'] for key in keys]
                    plt.plot(keys,
                             vals,
                             style,
                             color=color,
                             alpha=alpha,
                             label=row.optimizer + ' ' + str(row.lr))
                    plt.title(lays[j] + ' frobenius norm')
            else:
                # all other optimizers go to the second grid row
                # print('lays', lays, wnorms[0].keys(), keys)
                for j in range(min(3, len(lays))):
                    plt.subplot(R, C, 1 + C + j)
                    vals = [wnorms[key][lays[j] + '_fro'] for key in keys]
                    plt.plot(keys,
                             vals,
                             style,
                             color=color,
                             alpha=alpha,
                             label=row.optimizer + ' ' + str(row.lr))
                    plt.title(lays[j] + ' frobenius norm')
            plt.subplot(R, C, 1 + C * 2)
            # norms_fro = [row.weight_norms[it][] in row.its
            # print(row.weight_norms)
            plt.plot(row.its,
                     row.mean_margin_train_unnormalized,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('train margin unnormalized')
            plt.subplot(R, C, 2 + C * 2)
            plt.plot(row.its,
                     row.mean_margin_test_unnormalized,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('test margin unnormalized')
            plt.subplot(R, C, 3 + C * 2)
            # Per recorded key: product of the Frobenius norms of all layers.
            norm_prods_fro = [1] * len(keys)
            for j in range(len(lays)):
                norm_prods_fro = [
                    norm_prods_fro[i] * wnorms[key][lays[j] + '_fro']
                    for i, key in enumerate(keys)
                ]
            # NOTE(review): the division presumably requires row.its and keys
            # to have the same length — confirm against the recording code.
            plt.plot(row.its,
                     row.mean_margin_train_unnormalized / norm_prods_fro,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('train margin over frobenius norm')
            plt.subplot(R, C, 4 + C * 2)
            plt.plot(row.its,
                     row.mean_margin_test_unnormalized / norm_prods_fro,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('test margin over frobenius norm')
            plt.subplot(R, C, 1 + C * 3)
            plt.plot(row.its,
                     row.mean_margin_train,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('train softmax margin')
            plt.subplot(R, C, 2 + C * 3)
            plt.plot(row.its,
                     row.mean_margin_test,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('test softmax margin')
            # Per recorded key: product of the spectral norms of all layers.
            norm_prods_spectral = [1] * len(keys)
            for j in range(len(lays)):
                norm_prods_spectral = [
                    norm_prods_spectral[i] * wnorms[key][lays[j] + '_spectral']
                    for i, key in enumerate(keys)
                ]
            plt.subplot(R, C, 3 + C * 3)
            plt.plot(row.its,
                     row.mean_margin_train_unnormalized / norm_prods_spectral,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('train margin over spectral norm')
            plt.subplot(R, C, 4 + C * 3)
            plt.plot(row.its,
                     row.mean_margin_test_unnormalized / norm_prods_spectral,
                     style,
                     color=color,
                     alpha=alpha,
                     label=row.optimizer + ' ' + str(row.lr))
            plt.title('test margin over spectral norm')
    if not xlim is None:
        for i in range(R * C):
            plt.subplot(R, C, 1 + i)
            plt.xlim((0, xlim))
    plt.subplot(R, C, 1)
    # remove duplicate labels
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())
    plt.subplot(R, C, 4)
    # remove duplicate labels
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())
    plt.savefig(oj(out_dir, 'weight_norms_and_margin.png'),
                bbox_inches='tight')
    plt.show()
import numpy as np import pandas as pd from colorama import Fore from matplotlib import pyplot as plt from matplotlib_venn import venn2 from sklearn import metrics from sklearn.utils.multiclass import unique_labels import os from os.path import join as oj import seaborn as sns DIR_FILE = os.path.dirname(os.path.realpath(__file__)) # directory of this file DIR_FIGS = oj(DIR_FILE, '../reports/figs') cb2 = '#66ccff' cb = '#1f77b4' cr = '#cc0000' cp = '#cc3399' cy = '#d8b365' cg = '#5ab4ac' cm = sns.diverging_palette(10, 240, n=1000, as_cmap=True) cm_rev = sns.diverging_palette(240, 10, n=1000, as_cmap=True) cmap_div = sns.diverging_palette(10, 220, as_cmap=True) def rename(s): RENAMING = { 'gcsscore': 'GCS Score', 'initheartrate': 'Heart rate', 'initsysbprange': 'Systolic BP', 'abdtenderdegree': 'Abd. tenderness\ndegree',
from matplotlib import pyplot as plt import os from os.path import join as oj plt.style.use('dark_background') import sys sys.path.append('../src') import data from tqdm import tqdm from src import train # from src.viz import * import config if __name__ == '__main__': # some settings outcome_def = 'y_consec_thresh' out_dir = oj('/scratch/users/vision/chandan/abc', 'nov16') dset_key = 'clath_aux+gak_a7d2' dset = config.DSETS[dset_key] binarize = False # True # get data df = data.get_data(dset=dset_key) df = df[df['valid']] # exclude test cells, short/long tracks, hotspots feat_names = data.get_feature_names(df) feat_names = data.select_final_feats(feat_names, binarize=binarize) print('num feats', len(feat_names)) print(feat_names) # run os.makedirs(out_dir, exist_ok=True) feature_selection_nums = [
import pandas as pd import numpy as np from os.path import join as oj from tqdm import tqdm import data import os from os.path import join as oj import sys, time sys.path.insert(1, oj(sys.path[0], '..')) # insert parent path import seaborn as sns from sklearn.model_selection import train_test_split import numpy as np import matplotlib.pyplot as plt from sklearn import metrics import pmlb from tqdm import tqdm from copy import deepcopy import pickle as pkl import pandas as pd import data import fit def fill_in_default_results(results): '''add keys for things which weren't recorded at the time ''' for key in ['H_trace']: if key not in results: results[key] = None if 'beta_norm' not in results: results['beta_norm'] = 1
def get_test_interval(model_dir):
    """Return the first field of the third line of the parsed test log.

    Reads ``model_dir/train_output.log.test`` and converts the first
    whitespace-separated token of its third line to ``int``.

    Args:
        model_dir: directory containing ``train_output.log.test``.

    Returns:
        the integer found there.

    Raises:
        IndexError: when the file has fewer than three lines.
    """
    with open(oj(model_dir, 'train_output.log.test'), 'r') as f:
        test = f.readlines()
    return int(test[2].split()[0])
def setup(self, metadata_file=oj(config.DIR_PROCESSED, 'metadata_clath_aux+gak_a7d2.pkl')): np.random.seed(13) self.m = pkl.load(open(metadata_file, 'rb')) '''
def fit(p):
    """Train a one-hidden-layer network on Gaussian data and pickle the results.

    Data comes from ``data.generate_gaussian_data``; the model is
    ``Linear -> ReLU -> Linear`` (plus a ``Softmax`` unless the loss is
    cross entropy) trained with SGD on the full data set in every
    iteration. Recorded statistics, predictions and the parameters of ``p``
    are merged into one dict and dumped to
    ``<p.out_dir>/<p._str(p)>.pkl``.

    Args:
        p: parameter object; read for ``seed``, ``N``, ``means``, ``sds``,
            ``labs``, ``loss_func``, ``d_in``, ``hidden1``, ``d_out``,
            ``lr``, ``step_size_optimizer``, ``gamma_optimizer``,
            ``batch_size``, ``init``, ``num_iters``, ``num_layers``,
            ``out_dir`` and the helpers ``_str`` / ``_dict``.

    Returns:
        ``(results_combined, model)``.
    """
    print(p._str(p))
    # set random seed
    np.random.seed(p.seed)
    torch.manual_seed(p.seed)
    # generate data
    X, y_onehot, y_scalar = data.generate_gaussian_data(p.N,
                                                        means=p.means,
                                                        sds=p.sds,
                                                        labs=p.labs)
    dset = data.dset(X, y_scalar)
    # viz.plot_data()
    # make model
    if p.loss_func == 'cross_entropy':
        model = torch.nn.Sequential(
            torch.nn.Linear(p.d_in, p.hidden1),
            torch.nn.ReLU(),
            torch.nn.Linear(p.hidden1, p.d_out),
            # don't use softmax with crossentropy loss
        )
    else:
        model = torch.nn.Sequential(torch.nn.Linear(p.d_in, p.hidden1),
                                    torch.nn.ReLU(),
                                    torch.nn.Linear(p.hidden1, p.d_out),
                                    torch.nn.Softmax())
    # set up optimization
    optimizer = torch.optim.SGD(
        model.parameters(),
        lr=p.lr)  # all model parameters are optimized
    scheduler = StepLR(optimizer,
                       step_size=p.step_size_optimizer,
                       gamma=p.gamma_optimizer)
    if p.loss_func == 'cross_entropy':
        loss_fn = torch.nn.CrossEntropyLoss()
    else:
        loss_fn = torch.nn.MSELoss(size_average=False)
    # NOTE: the dataloader is built but the loop below never iterates it.
    dataloader = DataLoader(dset, batch_size=p.batch_size, shuffle=True)
    if p.init == 'data-driven':
        initialize_bs_as_neg_x_times_w(X, model)
    # to record
    weights = {}
    losses = np.zeros(p.num_iters)
    norms = np.zeros((p.num_iters, p.num_layers))
    accs = np.zeros(p.num_iters)
    X_torch = torch.from_numpy(X)
    if p.loss_func == 'cross_entropy':
        y_torch = Variable(torch.from_numpy(y_scalar.flatten()).long(),
                           requires_grad=False)
    else:
        y_torch = Variable(torch.from_numpy(y_onehot), requires_grad=False)
    # fit
    # batch gd
    for it in tqdm(range(p.num_iters)):
        y_pred = model(Variable(X_torch))  # predict
        loss = loss_fn(y_pred,
                       y_torch)  # long target is needed for crossentropy loss
        optimizer.zero_grad()  # zero the gradients
        loss.backward()  # backward pass
        optimizer.step()  # update weights
        scheduler.step()  # step for incrementing optimizer
        # output
        # NOTE(review): the nesting of the recording statements below was
        # reconstructed. If they run only at these checkpoint iterations,
        # every other entry of losses/accs/norms stays 0, and np.min(losses)
        # further down would see those zeros — confirm the intended layout.
        if it % 100 == 0 or it == p.num_iters - 1:
            weight_dict = {
                x[0]: x[1].data.numpy()
                for x in model.named_parameters()
            }
            weights[it] = deepcopy(weight_dict)
            losses[it] = loss.data  #.item()
            accs[it] = np.mean(
                np.argmax(y_pred.data.numpy(), axis=1) ==
                y_scalar.flatten()) * 100
            # squared norm of first-layer weights and biases
            norms[it, 0] = np.linalg.norm(weight_dict['0.weight'])**2 + np.sum(
                weight_dict['0.bias']**2)
            # squared norm of second-layer weights (bias not included)
            norms[it, 1] = np.linalg.norm(weight_dict['2.weight'])**2
    # save
    if not os.path.exists(
            p.out_dir):  # create the output directory when it is missing
        os.makedirs(p.out_dir)
    params = p._dict(p)
    # predict things
    X_train = X
    y_train = y_scalar
    pred_train = model(Variable(torch.from_numpy(X_train),
                                requires_grad=True)).data.numpy()  # predict
    if p.d_in == 1:
        # 1-D input: also predict on an evenly spaced grid over the data range
        X_test = np.linspace(np.min(X), np.max(X), 1000, dtype=np.float32)
        X_test = X_test.reshape(X_test.shape[0], 1)
        pred_test = model(
            Variable(torch.from_numpy(X_test),
                     requires_grad=True)).data.numpy()
    else:
        X_test = None
        pred_test = None
    # calculate time to min loss
    min_loss = np.min(losses)
    # np.argmax on a boolean array gives the first index where it is True
    t_min_loss_plus_5_perc = np.argmax(losses <= min_loss * 1.05)
    t_min_loss_plus_10_perc = np.argmax(losses <= min_loss * 1.10)
    t_min_loss_plus_20_perc = np.argmax(losses <= min_loss * 1.20)
    results = {
        'weights': weights,
        'losses': losses,
        'norms': norms,
        'accs': accs,
        'min_loss': min_loss,
        'max_acc': np.max(accs),
        'model': model,
        'X_train': X_train,
        'y_train': y_scalar,
        'pred_train': pred_train,
        'X_test': X_test,
        'pred_test': pred_test,
        't_min_loss_plus_5_perc': t_min_loss_plus_5_perc,
        't_min_loss_plus_10_perc': t_min_loss_plus_10_perc,
        't_min_loss_plus_20_perc': t_min_loss_plus_20_perc
    }
    # on key clashes the entries of results win over params
    results_combined = {**params, **results}
    pkl.dump(results_combined, open(oj(p.out_dir, p._str(p) + '.pkl'), 'wb'))
    return results_combined, model
#! /usr/bin/python3 import pandas as pd import numpy as np import os from os.path import join as oj from os.path import dirname if __name__ == '__main__': import sys sys.path.append(oj(os.path.dirname(__file__), '..', '..', 'raw', 'usafacts_infections')) from load import load_usafacts_infections else: from ...raw.usafacts_infections.load import load_usafacts_infections def clean_usafacts_infections(data_dir=oj('..', '..', 'raw', 'usafacts_infections'), out_dir='.'): ''' Clean usafacts data Parameters ---------- data_dir : str; path to the data directory to find raw csv out_dir : str; path to the data directory to write cleaned csv Returns ------- writes out cleaned csv file and returns clean data frame '''
# Parse the command line and decide whether CUDA is used: it must be both
# requested (no --no-cuda) and available.
args = parser.parse_args()
args.cuda = not args.no_cuda and torch.cuda.is_available()
torch.manual_seed(args.seed)
device = torch.device("cuda" if args.cuda else "cpu")
# Extra DataLoader options are only passed when running on CUDA.
kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
# MNIST train split, downloaded into ./data when missing.
train_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data',
                   train=True,
                   download=True,
                   transform=transforms.ToTensor()),
    batch_size=args.batch_size,
    shuffle=True,
    **kwargs)
# MNIST test split (not downloaded here; the train loader above fetches it).
test_loader = torch.utils.data.DataLoader(
    datasets.MNIST('data', train=False, transform=transforms.ToTensor()),
    batch_size=args.batch_size,
    shuffle=True,
    **kwargs)
# Directory the per-epoch sample images are written to.
out_dir = 'samples'
os.makedirs(out_dir, exist_ok=True)
model = VAE().to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# actually do training
for epoch in range(1, args.epochs + 1):
    train(epoch)
    test(epoch)
    # After each epoch, decode 64 random vectors and save them as one image.
    with torch.no_grad():
        # 20 is presumably the latent size of the VAE — TODO confirm
        sample = torch.randn(64, 20).to(device)
        sample = model.decode(sample).cpu()
        save_image(sample.view(64, 1, 28, 28),
                   oj(out_dir, 'sample_' + str(epoch) + '.png'))
def aggregate_results(results, group_idxs, out_dir):
    '''Aggregate per-run results into curves over n_train, averaging over seeds

    Rows of results are grouped by group_idxs. Inside each group the rows are
    grouped again by n_train and averaged, so every cell of the returned frame
    holds an array with one entry per distinct n_train value.

    Parameters
    ----------
    results : data frame with one row per run; reads the columns
              num_features, n_train, wnorm, train_mse, test_mse, num_nonzero,
              df1, df2, df3 and the columns named in group_idxs
    group_idxs : list of str; columns whose values identify one curve
    out_dir : str; directory in which 'processed.pkl' is written

    Returns
    -------
    data frame indexed by the group keys with one column per recorded key
    (the same frame is pickled to out_dir)
    '''
    grouped = results.groupby(group_idxs)
    # NOTE(review): from_tuples presumably needs tuple group keys, i.e. at
    # least two columns in group_idxs -- confirm against callers
    ind = pd.MultiIndex.from_tuples(grouped.indices, names=group_idxs)
    df = pd.DataFrame(index=ind)

    # keys to record
    keys = [
        'ratio', 'bias', 'var', 'wnorm', 'mse_train', 'mse_test',
        'num_nonzero', 'mse_noiseless', 'df1', 'df2', 'df3', 'n_train',
        'num_features'
    ]
    # recorded key -> column of results that is averaged over the repeats
    averaged = {
        'wnorm': 'wnorm',
        'mse_train': 'train_mse',
        'mse_test': 'test_mse',
        'num_nonzero': 'num_nonzero',
        'df1': 'df1',
        'df2': 'df2',
        'df3': 'df3',
    }
    for key in keys:
        df[key] = None  # create the column before cells are filled via .at

    for name, gr in tqdm(grouped):
        row = {k: [] for k in keys}
        # stored as a single value taken from the first run of the group,
        # not as a per-n_train list
        row['num_features'] = gr.iloc[0].num_features

        for _, gr2 in gr.groupby(['n_train']):
            # 'bias', 'var' and 'mse_noiseless' are not computed here (that
            # calculation was disabled), so they are saved as empty arrays

            # aggregate calculated stats
            n_train = gr2.n_train.values[0]
            row['ratio'].append(gr2.num_features.values[0] / n_train)
            row['n_train'].append(n_train)
            for key, column in averaged.items():
                row[key].append(gr2[column].mean())

        for k in keys:
            df.at[name, k] = np.array(row[k])

    df.to_pickle(oj(out_dir, 'processed.pkl'))  # save into out_dir
    return df
def load_county(data_dir=".",
                cached_file="county_data.csv",
                cached_abridged_file="county_data_abridged.csv",
                cached=True,
                abridged=True,
                infections_data="usafacts",
                rm_na=True):
    '''
    Load in merged county data set

    Parameters
    ----------
    data_dir : string; path to the data directory
    cached_file : string; name of cached county-level data
    cached_abridged_file : string; name of cached abridged county-level data
    cached : logical; whether or not to load in cached data (if possible)
    abridged : logical; whether or not to load in abridged data
    infections_data : string; source for daily cases/deaths counts from
                      COVID-19 infections; must be either 'usafacts' or 'nytimes'
    rm_na : logical; whether or not to remove counties with NA cases or deaths
            (implemented as a right merge onto the infections data instead of
            a left merge)

    Returns
    -------
    data frame with abridged or full county-level data set

    Raises
    ------
    ValueError if infections_data is not recognised, if the requested cached
    file does not exist, or if infections_data is 'nytimes' (not implemented)
    '''
    # error checking
    if infections_data not in ['usafacts', 'nytimes']:
        raise ValueError(
            "infections_data must be either 'usafacts' or 'nytimes'")

    # data directories
    data_dir_raw = oj(data_dir, "county_level", "raw")
    data_dir_clean = oj(data_dir, "county_level", "processed")

    if cached:
        # read in cached data
        if abridged:
            cached_path = oj(data_dir, cached_abridged_file)
            if not os.path.exists(cached_path):
                raise ValueError("Cached abridged file cannot be found. " +
                                 "Please set cached = False.")
        else:
            cached_path = oj(data_dir, cached_file)
            if not os.path.exists(cached_path):
                raise ValueError("Cached file cannot be found. " +
                                 "Please set cached = False")
        cnty = pd.read_csv(cached_path)
        cnty["countyFIPS"] = cnty["countyFIPS"].astype(str).str.zfill(5)
    else:
        cnty = _build_county(data_dir, data_dir_raw, data_dir_clean,
                             cached_file, cached_abridged_file, abridged)

    # get covid-19 infections data
    if infections_data == 'usafacts':
        covid = load_usafacts_infections(
            oj(data_dir_raw, "usafacts_infections"))
    elif infections_data == 'nytimes':
        raise ValueError('infections_data = "nytimes" not yet implemented')

    # merge county data with covid data
    how = 'right' if rm_na else 'left'
    return pd.merge(cnty, covid, on='countyFIPS', how=how)


class _WorkingDir:
    '''Switch the working directory for a with-block and always switch back,
    even when the body raises.'''

    def __init__(self, path):
        self._path = path
        self._previous = None

    def __enter__(self):
        self._previous = os.getcwd()
        os.chdir(self._path)
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        os.chdir(self._previous)
        return False  # never swallow the exception


def _fetch_raw_if_needed(dataset, raw_dir):
    '''Run the dataset's download.py unless its raw files are already there.'''
    if dataset == "jhu_interventions":
        return  # no download check is done for this dataset
    if dataset == "chrr_health":
        with _WorkingDir(raw_dir):
            # download when state_data is missing or does not hold exactly
            # 51 entries
            if (not os.path.exists("state_data")
                    or len(os.listdir("state_data")) != 51):
                os.system("python download.py")
                print("downloaded " + dataset + " successfully")
        return
    if not any(fname.startswith(dataset) for fname in os.listdir(raw_dir)):
        with _WorkingDir(raw_dir):
            os.system("python download.py")
            print("downloaded " + dataset + " successfully")


def _build_county(data_dir, data_dir_raw, data_dir_clean, cached_file,
                  cached_abridged_file, abridged):
    '''Clean and merge all county-level datasets and write the cache file.'''
    ## ADD PUBLIC DATASETS HERE
    public_datasets = [
        "ahrf_health", "cdc_svi", "chrr_health", "dhdsp_heart",
        "dhdsp_stroke", "hpsa_shortage", "ihme_respiratory", "khn_icu",
        "medicare_chronic", "mit_voting", "nchs_mortality",
        "usdss_diabetes", "jhu_interventions"
    ]
    ## ADD PRIVATE DATASETS HERE
    private_datasets = ["unacast_mobility"]

    # load in and clean county-level datasets
    df_ls = []
    for dataset in public_datasets + private_datasets:
        raw_dir = oj(data_dir_raw, dataset)
        if dataset in private_datasets:
            # the private repository is looked up relative to the raw dir
            with _WorkingDir(raw_dir):
                available = os.path.exists(
                    "../../../../../covid-19-private-data")
            if not available:
                continue  # skip loading and cleaning
        else:
            # check if raw data files exist locally; if not, download them
            _fetch_raw_if_needed(dataset, raw_dir)

        # clean data; the clean_<dataset> functions are called from inside
        # the dataset's processed directory
        with _WorkingDir(oj(data_dir_clean, dataset)):
            df_ls.append(globals()["clean_" + dataset]())
            print("loaded and cleaned " + dataset + " successfully")

    # merge county ids data
    cnty_fips = pd.read_csv(
        oj(data_dir_raw, "county_ids", "county_fips.csv"))
    cnty_fips["countyFIPS"] = cnty_fips["countyFIPS"].str.zfill(5)
    cnty_latlong = pd.read_csv(
        oj(data_dir_raw, "county_ids", "county_latlong.csv"))
    cnty_latlong = cnty_latlong[["countyFIPS", "State", "lat", "lon"]]
    cnty_latlong["countyFIPS"] = cnty_latlong["countyFIPS"].astype(
        str).str.zfill(5)
    cnty_popcenters = pd.read_csv(
        oj(data_dir_raw, "county_ids", "county_popcenters.csv"))
    cnty_popcenters = cnty_popcenters[[
        "STATEFP", "COUNTYFP", "LATITUDE", "LONGITUDE"
    ]]
    cnty_popcenters = cnty_popcenters.rename(columns={
        "LATITUDE": "POP_LATITUDE",
        "LONGITUDE": "POP_LONGITUDE"
    })
    cnty_popcenters["countyFIPS"] = cnty_popcenters["STATEFP"].astype(
        str).str.zfill(2) + cnty_popcenters["COUNTYFP"].astype(
            str).str.zfill(3)
    cnty = pd.merge(cnty_fips, cnty_latlong, on="countyFIPS", how="left")
    cnty = pd.merge(cnty, cnty_popcenters, on="countyFIPS", how="left")

    # merge county-level data with county ids
    for cleaned in df_ls:
        cleaned = clean_id(cleaned)  # remove potentially duplicate ID columns
        cnty = pd.merge(cnty, cleaned, on='countyFIPS', how="left")

    # basic preprocessing
    cnty = cnty.loc[:, ~cnty.columns.duplicated()]
    cnty = cnty.infer_objects()

    # add new features
    cnty = add_features(cnty)

    if abridged:
        # get shortlist of important variables for abridged data set
        id_vars = [
            "countyFIPS", "STATEFP", "COUNTYFP", 'CountyName', 'StateName',
            'State', 'lat', 'lon', "POP_LATITUDE", "POP_LONGITUDE"
        ]
        important_vars = id_vars + important_keys(cnty)
        cnty = cnty[important_vars]
        cnty.to_csv(oj(data_dir, cached_abridged_file),
                    header=True,
                    index=False)
        print("saved " + cached_abridged_file + " successfully")
    else:
        # write full county data to file
        cnty.to_csv(oj(data_dir, cached_file), header=True, index=False)
        print("saved " + cached_file + " successfully")
    return cnty
[max(a[i][0] - a[i - 1][1], 0), max(a[i][1] - a[i - 1][0], 0)]) return tmp for i in range(df_county.shape[0]): df_county.loc[i, newname + '_interval'].extend( find_intervals(df_county.loc[i, name], df_county.loc[i, var + 'Intervals'])) return df_county if __name__ == '__main__': print('loading data...') NUM_DAYS_LIST = [1, 2, 3, 4, 5, 6, 7] df_county = load_data.load_county_level( data_dir=oj(parentdir, 'data')).fillna(0) df_county = add_preds( df_county, NUM_DAYS_LIST=NUM_DAYS_LIST, cached_dir=oj(parentdir, 'data')) # adds keys like "Predicted Deaths 1-day" ## orgnize predicts as array add_pre(df_county, 'Predicted Cases ', 'pred_cases', 'pred_new_cases') add_pre(df_county, 'Predicted Deaths ', 'pred_deaths', 'pred_new_deaths') ## add new cases/death to dataframe add_new(df_county) ## Add new cases/deaths predictions and their intervals df_county = add_new_pre(df_county, 'Predicted Cases ', 'tot_cases',
print('Usage: python plot.py path/to/model [start-epoch=..] [end-epoch==..]') try: os.environ['DISPLAY'] except: print 'ERROR: X11 forwarding not enabled, cannot run script' sys.exit() model_dir = os.path.abspath(sys.argv[1]) # command = "./parselog.sh %s"%(oj(model_dir,'train_output.log')) # print os.path.isfile(oj(model_dir,'train_output.log')) # print 'command:', command # call(command) cmd = "./parselog.sh "+oj(model_dir,'train_output.log') subprocess.Popen(cmd, shell=True, stdout = subprocess.PIPE, stderr=subprocess.STDOUT) # test_interval = [int(arg.split('=')[-1]) for arg in sys.argv # if arg.startswith('test-inter=')] # if len(test_interval) != 1: # print 'ERROR: test-inter not properly given' # sys.exit() # else: test_interval = test_interval[0] start,end = -1,-1 for arg in sys.argv: if arg.startswith("start-iter="): start = int(arg.split('=')[-1]) if arg.startswith("end-iter="): end = int(arg.split('=')[-1])
] remap = {1: 'Low', 2: 'Medium', 3: 'High'} for i in NUM_DAYS_LIST: ks.append(f'Severity {i}-day') ks.append(f'Predicted New Deaths Hospital {i}-day') ks.append(f'Severity Index {i}-day') df[f'Severity Index {i}-day'] = [ remap[x] for x in df[f'Severity {i}-day'] ] return df[ks] if __name__ == '__main__': print('loading data...') NUM_DAYS_LIST = [1, 2, 3, 4, 5, 6, 7] df_county = load_data.load_county_level(data_dir=oj(parentdir, 'data')) df_hospital = load_data.load_hospital_level( data_dir=oj(parentdir, 'data_hospital_level')) df_county = add_preds( df_county, NUM_DAYS_LIST=NUM_DAYS_LIST, cached_dir=oj(parentdir, 'data')) # adds keys like "Predicted Deaths 1-day" df = merge_data.merge_county_and_hosp(df_county, df_hospital) df = add_severity_index(df, NUM_DAYS_LIST) df = df.sort_values('Total Deaths Hospital', ascending=False) write_to_gsheets_and_api(df, service_file=oj(parentdir, 'creds.json'), api_file=oj(parentdir, 'ian_key.env')) print('succesfully wrote to gsheets')
def generate_map(df):
    '''Build one county choropleth and one top-20 table per entry of ``keys``

    Parameters
    ----------
    df : data frame; it is first passed through ``rename`` and must then
         provide 'County', 'StateName', 'countyFIPS', the hover columns
         listed below and one column per entry of the module-level ``keys``

    Returns
    -------
    list with the output of plotly.offline.plot(..., output_type='div') for
    every choropleth, in the order of ``keys``

    Side effects: downloads the county geojson, writes results/search_map.svg
    for the 'Cumulative Cases' map and results/<key>.svg for every table.
    '''
    df = rename(df)
    df['POS'] = df['County'] + ', ' + df['StateName']
    maps = []
    with urlopen(
            'https://raw.githubusercontent.com/plotly/datasets/master/geojson-counties-fips.json'
    ) as response:
        counties = json.load(response)

    # settings shared by every map, built once instead of once per key
    color_scale = [
        '#F7E8E4', '#F5C8BB', '#B96D67', '#A83C3B', '#8B2222', '#5B0D0D',
        '#5A2318'
    ]
    hover_columns = [
        'State', 'County', 'Cumulative Cases', 'New Cases',
        'Cumulative Deaths', 'New Deaths', 'Deaths per 100k',
        'Cases per 100k', 'New Cases per 100k', 'New Deaths per 100k'
    ]
    # The colour is log(value + 1), so ticks sit at log(10), log(100), ...
    # Positions and labels live in one table so they cannot get out of step:
    # previously '1000k' had no matching position and was never drawn.
    ticks = [
        (2.302585092994046, '10'),
        (4.605170185988092, '100'),
        (6.907755278982137, '1k'),
        (9.210340371976184, '10k'),
        (11.512925464970229, '100k'),
        (13.815510557964274, '1000k'),
    ]
    tick_vals = [position for position, _ in ticks]
    tick_text = [label for _, label in ticks]
    yesterday = (datetime.today() - timedelta(days=1)).strftime('%m-%d')

    for key in keys:
        fig = px.choropleth(df,
                            geojson=counties,
                            locations='countyFIPS',
                            color=np.log(df[key] + 1),
                            color_continuous_scale=color_scale,
                            scope="usa",
                            hover_data=hover_columns,
                            title=key + ' on ' + yesterday)
        fig.update_layout(coloraxis_colorbar=dict(len=0.75,
                                                  title=key,
                                                  tickvals=tick_vals,
                                                  ticktext=tick_text,
                                                  x=1,
                                                  y=0.5))
        ## update the hover information
        # drop the FIPS code and the raw log colour from the hover box, then
        # show the remaining "name=value" entries as "name: value"
        for c in ["countyFIPS=%{location}<br>", "<br>color=%{z}"]:
            fig['data'][0]['hovertemplate'] = fig['data'][0][
                'hovertemplate'].replace(c, "")
        fig['data'][0]['hovertemplate'] = fig['data'][0][
            'hovertemplate'].replace("=", ": ")
        fig.update_layout(margin={"r": 0, "t": 40, "l": 0, "b": 0})
        fig.update_layout(
            paper_bgcolor='rgb(0,0,0)',
            plot_bgcolor='rgb(0,0,0)',
            template='plotly_dark',
        )
        fig['layout'].update(width=900, height=450, autosize=True, title_x=0.3)
        if key == 'Cumulative Cases':
            fig.write_image(oj(parentdir, "results/search_map.svg"),
                            width=900,
                            height=450)
        maps.append(
            plotly.offline.plot(fig,
                                include_plotlyjs=False,
                                output_type='div'))

        # table of the 20 counties with the largest value of this key
        df_tab = df.sort_values(by=key, ascending=False)
        df_tab = df_tab.reset_index(drop=True)[['POS', key]].loc[:19, :]
        fig = go.Figure(data=[
            go.Table(header=dict(values=['', 'County', key],
                                 line_color='grey',
                                 fill_color='darkgrey',
                                 font_color='white',
                                 font_size=12,
                                 align='center'),
                     cells=dict(values=[[i + 1 for i in range(len(df_tab))],
                                        df_tab['POS'], df_tab[key]],
                                line_color='darkgrey',
                                fill_color='grey',
                                font_color='white',
                                font_size=11,
                                align='center'),
                     columnwidth=[20, 120, 80])
        ])
        fig['layout'].update(paper_bgcolor='rgb(0,0,0)',
                             plot_bgcolor='rgb(0,0,0)',
                             margin=dict(l=0, r=0, t=0, b=0),
                             width=200,
                             height=550,
                             autosize=True,
                             template='plotly_dark')
        fig.write_image(oj(parentdir, "results/" + key + ".svg"),
                        width=200,
                        height=550)
    print('succesfully generated search map')
    return maps
hist_dict['train_acc_history'] = train_acc_history hist_dict['train_loss_history'] = val_loss_history hist_dict['train_cd_history'] = train_cd_history model.load_state_dict(best_model_wts) return model, hist_dict #TODO hist params_to_update = model.parameters() criterion = nn.CrossEntropyLoss(weight=weights.double().float()) optimizer_ft = optim.SGD(params_to_update, lr=args.lr, momentum=args.momentum) #optimizer_ft = optim.Adam(params_to_update, weight_decay = 0.001) model, hist_dict = train_model(model, dataloaders, criterion, optimizer_ft, num_epochs=num_epochs) pid = ''.join(["%s" % randint(0, 9) for num in range(0, 20)]) torch.save(model.state_dict(), oj(model_path, pid + ".pt")) import pickle as pkl hist_dict['pid'] = pid hist_dict['regularizer_rate'] = -1 hist_dict['seed'] = args.seed hist_dict['batch_size'] = args.batch_size hist_dict['momentum'] = args.momentum hist_dict['learning_rate'] = args.lr pkl.dump(hist_dict, open(os.path.join(model_path, pid + '.pkl'), 'wb'))
def plot_dims(results,
              out_dir='figs',
              xlim=None,
              percent_to_explain=0.85,
              figname='explained',
              dim_types=[
                  'explained_var_dicts_pca', 'explained_var_dicts_rbf',
                  'explained_var_dicts_lap', 'explained_var_dicts_cosine'
              ]):
    '''Plot losses, test accuracy and per-layer dimensionality for every run

    The figure is a 5 x 3 grid. The first row shows train loss, test loss and
    test accuracy; every following row belongs to one entry of dim_types and
    shows, for up to three layers, the value returned by
    frac_dims_to_explain_X_percent at each recorded iteration.

    Parameters
    ----------
    results : data frame with one run per row; reads optimizer, lr, its,
              losses_train, losses_test, accs_test, the columns named in
              dim_types and, when present, weight_names and dset
    out_dir : str; directory in which 'dims_<figname>.png' is saved
    xlim : upper x limit applied to all subplots, or None to leave them alone
    percent_to_explain : float passed to frac_dims_to_explain_X_percent
    figname : str; suffix of the saved file name
    dim_types : list of column names, one per row of the grid below the first
                (only the first four fit); the list is never modified
    '''
    # params for plotting
    plt.figure(figsize=(10, 18), dpi=100)
    # skips = [('adam', 0.1), ('adam', 0.01), ('adam', 0.001)]
    skips = []
    R, C = 5, 3
    has_weight_names = 'weight_names' in list(results)

    for index, row in results.iterrows():
        # style for plotting
        color = 'orange' if row.optimizer == 'sgd' else 'deepskyblue'
        style = {1: '^', 0.1: '-', 0.01: '--', 0.001: '.'}[row.lr]
        alpha = {1.0: 0.3, 0.1: 0.8, 0.01: 0.8, 0.001: .3}[row.lr]
        if (row.optimizer, row.lr) in skips:
            continue

        # accs
        plt.subplot(R, C, 1)
        # label the first panel itself: this used to run before subplot() and
        # therefore landed on whatever axes happened to be current
        try:
            plt.ylabel(row.dset[0])
        except (AttributeError, IndexError, KeyError, TypeError):
            pass  # runs without a usable dset entry simply get no label
        plt.plot(row.its,
                 row.losses_train,
                 style,
                 label=row.optimizer + ' ' + str(row.lr),
                 color=color,
                 alpha=alpha)
        plt.yscale('log')
        plt.title('train loss')

        plt.subplot(R, C, 2)
        plt.plot(row.its, row.losses_test, style, color=color, alpha=alpha)
        plt.yscale('log')
        plt.title('test loss')

        plt.subplot(R, C, 3)
        plt.plot(row.its, row.accs_test, style, color=color, alpha=alpha)
        plt.title('test acc')

        # dims: one grid row per dim type (R - 1 rows are available)
        for j, dim_type in enumerate(dim_types[:R - 1]):
            offset = 3 * (1 + j)
            dim_dicts = row[dim_type]

            # pick keys
            if not has_weight_names:  # this is old, remove after some reruns
                if 'explained' in dim_type:
                    lays = ['fc1.weight', 'fc2.weight', 'fc3.weight']
                elif 'act' in dim_type:
                    lays = ['fc1', 'fc2', 'fc3']
            else:
                lays = row.weight_names
                if 'act' in dim_type:
                    # act uses forward_all dict which doesn't have any .
                    # or .weight
                    lays = [lay[:lay.rfind('.')] for lay in lays]

            lab = dim_type.replace('_var_dicts_', '')
            lab = lab.replace('explained', '')
            lab = lab.replace('act', 'act: ')

            # one column per layer, at most C of them
            for col, lay in enumerate(lays[:C]):
                plt.subplot(R, C, offset + 1 + col)
                plt.plot(row.its, [
                    frac_dims_to_explain_X_percent(d[lay], percent_to_explain)
                    for d in dim_dicts
                ],
                         style,
                         color=color,
                         alpha=alpha)
                plt.title(lay)
                n_dims = str(dim_dicts[0][lay].size)
                if col == 0:
                    plt.ylabel(lab + '\n' + str(100 * percent_to_explain) +
                               '% frac dims (of ' + n_dims + ')')
                else:
                    plt.ylabel('out of ' + n_dims)

    if xlim is not None:
        for i in range(R * C):
            plt.subplot(R, C, 1 + i)
            plt.xlim((0, xlim))

    plt.subplot(R, C, 1)
    # remove duplicate labels
    handles, labels = plt.gca().get_legend_handles_labels()
    by_label = OrderedDict(zip(labels, handles))
    plt.legend(by_label.values(), by_label.keys())

    plt.savefig(oj(out_dir, 'dims_' + figname + '.png'), bbox_inches='tight')
    plt.show()
import os from os.path import join as oj DIR_REPO = os.path.dirname( os.path.realpath(__file__)) # directory of the config file # running and saving ################################# # DIR_PROCESSED_MISC = oj(DIR_REPO, 'processed') DIR_RESULTS = oj(DIR_REPO, 'src', 'results') DIR_FIGS = oj(DIR_REPO, 'figs') # data ################################# DIR_CELEBA = oj(DIR_REPO, 'data/celeba-hq') DIR_IMS = oj(DIR_CELEBA, 'ims/') DIR_PROCESSED = oj(DIR_REPO, 'data/processed/') DIR_GEN = oj(DIR_PROCESSED, 'gen', 'generated_images_0.1') # lib paths ################################# DIR_LIB = oj(DIR_REPO, 'lib') DIR_STYLEGAN = oj(DIR_LIB, 'stylegan2') # attrs in latent space DIR_LINEAR_DIRECTIONS = oj(DIR_REPO, 'data/annotations_stylegan2/linear_models' ) # linear models for attributes on latent space ATTRS = ['age', 'facial-hair', 'skin-color', 'gender', 'hair-length', 'makeup'] ATTRS_MEASURED = 'HAGCBM' ALL_ATTRS = 'HAGCBMSEW' # 'HAGCBMSEW' ATTR_LABELS = { 'C': 'skin-color', 'H': 'hair-length', 'G': 'gender\n(perceived)',
def savefig(s: str):
    '''Save the current matplotlib figure into DIR_FIGS as <s>.pdf and <s>.png

    The png is written with dpi=300; the pdf uses matplotlib's defaults.
    '''
    for extension, options in (('.pdf', {}), ('.png', {'dpi': 300})):
        target = oj(DIR_FIGS, s + extension)
        plt.savefig(target, **options)
for dr in dirs: cdr = baseDir + "/" + dr jpgs = filter(lambda x: "jpg" in x, os.listdir(cdr)) for jpg in jpgs: name = jpg.split(".")[0] shutil.move(cdr+"/"+jpg, tdr+"/"+jpg) shutil.copyfile(cdr+"/inspection.txt", tdr+"/"+name+".dat") shutil.copyfile(cdr+"/meta.txt", tdr+"/"+name+".met") getUnsuitableFlags(tdr, name) shutil.rmtree(cdr) def getUnsuitableFlags(tdr, name): with open(tdr+"/"+name+".met") as met_f: if any(["UnsuitablePhoto=True" in line for line in met_f.readlines()]): with open(tdr+"/"+name+".dat", 'a') as dat_f: dat_f.write("UnsuitablePhoto") if __name__ == "__main__": baseDir = sys.argv[1] tdr = os.path.abspath(baseDir) print 'checking whether any joint dirs left...' if any([os.path.isdir(oj(tdr,fd)) for fd in os.listdir(tdr)]): print 'found some; reorganizing them' reorganize(tdr) print 'no more joint dirs left' jpgs = filter(lambda x: "jpg" in x, os.listdir(tdr)) for jpg in jpgs: name = jpg.split(".")[0] getUnsuitableFlags(tdr, name)
symlink_dir = os.path.abspath(arg.split('=')[-1]) elif "data-info=" in arg: data_info = os.path.abspath(arg.split('=')[-1]) redbox = False if '--redbox' in sys.argv: redbox = True if check.check(symlink_dir, data_info) != [0, 0] and not redbox: print 'ERROR: mismatch between test files in data_dir and data_info' sys.exit() if redbox: flag_val = create_redbox_data_info_etc(symlink_dir, data_info) PRETRAINED = get_pretrained_model(classifier_dir) already_pred = oj(data_info, PRETRAINED.split('/')[-1] + '_pred.npy') if os.path.isfile(already_pred) and raw_input('found %s; use? ([Y]/N) ' % (already_pred)) != 'N': d = (np.load(already_pred)).item() else: d = classify_data(classifier_dir, symlink_dir, data_info, PRETRAINED, redbox) if redbox: d = arrange_preds_with_flag_val(d, flag_val) # this should go in main as well? # get true labels, assign predicted labels, get metrics d = compute_classification_stats(d, data_info, redbox) print_classification_stats(d)
def plot_losses(results, out_dir='figs'):
    '''Plot loss and accuracy curves of every run on a 2 x 4 grid

    Top row: train loss, test loss, train acc and test acc of the full model.
    Bottom row: the same four curves read from the *_r columns (panel label
    'reconstructed with 85% PCs'). The figure is saved as 'losses.png' in
    out_dir and then shown.

    results : data frame with one run per row; reads optimizer, lr, its and
              the eight losses_* / accs_* columns listed in panels below
    '''
    # params for plotting
    plt.figure(figsize=(12, 8), dpi=100, facecolor='w')
    # skips = [('adam', 0.1), ('sgd', 1.0)] #, ('sgd', 0.1)]
    skips = []
    n_rows, n_cols = 2, 4
    line_styles = {1: '^', 0.5: '-', 0.1: '-', 0.01: '--', 0.001: '.'}
    line_alphas = {1.0: 0.3, 0.5: 0.5, 0.1: 0.8, 0.01: 0.8, 0.001: .3}

    # (column to plot, title, log y axis?, gets a legend label?, y label)
    panels = [
        ('losses_train', 'train loss', True, True, 'full model'),
        ('losses_test', 'test loss', True, False, None),
        ('accs_train', 'train acc', False, True, None),
        ('accs_test', 'test acc', False, False, None),
        ('losses_train_r', 'train loss', True, True,
         'reconstructed with 85% PCs'),
        ('losses_test_r', 'test loss', True, False, None),
        ('accs_train_r', 'train acc', False, True, None),
        ('accs_test_r', 'test acc', False, False, None),
    ]

    for _, run in results.iterrows():
        if run.optimizer == 'sgd':
            color = 'orange'
        else:
            color = 'deepskyblue'
        style = line_styles[run.lr]
        alpha = line_alphas[run.lr]
        if (run.optimizer, run.lr) in skips:
            continue

        for position, panel in enumerate(panels, start=1):
            column, title, log_y, labelled, y_label = panel
            plt.subplot(n_rows, n_cols, position)
            if y_label is not None:
                plt.ylabel(y_label)
            options = {}
            if labelled:
                options['label'] = run.optimizer + ' ' + str(run.lr)
            options['color'] = color
            options['alpha'] = alpha
            plt.plot(run.its, getattr(run, column), style, **options)
            if log_y:
                plt.yscale('log')
            plt.title(title)

    # legend on the first panel, keeping one entry per distinct label
    plt.subplot(n_rows, n_cols, 1)
    legend_handles, legend_labels = plt.gca().get_legend_handles_labels()
    unique_entries = OrderedDict(zip(legend_labels, legend_handles))
    plt.legend(unique_entries.values(), unique_entries.keys())

    plt.savefig(oj(out_dir, 'losses' + '.png'), bbox_inches='tight')
    plt.show()
symlink_dir = os.path.abspath(arg.split('=')[-1]) elif "data-info=" in arg: data_info = os.path.abspath(arg.split('=')[-1]) redbox = False if '--redbox' in sys.argv: redbox = True if check.check(symlink_dir, data_info) != [0,0] and not redbox: print 'ERROR: mismatch between test files in data_dir and data_info' sys.exit() if redbox: flag_val = create_redbox_data_info_etc(symlink_dir, data_info) PRETRAINED = get_pretrained_model(classifier_dir) already_pred = oj(data_info, PRETRAINED.split('/')[-1]+'_pred.npy') if os.path.isfile(already_pred) and raw_input('found %s; use? ([Y]/N) '%(already_pred)) != 'N': d = (np.load(already_pred)).item() else: d = classify_data(classifier_dir, symlink_dir, data_info, PRETRAINED, redbox) if redbox: d = arrange_preds_with_flag_val(d, flag_val) # this should go in main as well? # get true labels, assign predicted labels, get metrics d = compute_classification_stats(d, data_info, redbox) print_classification_stats(d) # potential mislabels if "--mislab" in sys.argv:
def restart_apache():
    """Pass the apache2/bin/restart path under up(REMOTE_DIR) to run().

    NOTE(review): ``up`` presumably yields the parent directory and ``run``
    presumably executes the command on the deployment host -- confirm against
    this file's imports.
    """
    base_dir = up(REMOTE_DIR)
    restart_script = oj(base_dir, 'apache2/bin/restart')
    run(restart_script)
metavar='N', help='hyperparameter for CDEP weight - higher means more regularization') args = parser.parse_args() regularizer_rate = args.regularizer_rate num_epochs = args.epochs device = torch.device(0) # load model model = models.vgg16(pretrained=True) model.classifier[-1] = nn.Linear(4096, 2) model = model.classifier.to(device) with open(oj(dataset_path, "cancer.npy"), 'rb') as f: cancer_featuress = np.load(f) with open(oj(dataset_path, "not_cancer.npy"), 'rb') as f: not_cancer_featuress = np.load(f) cancer_targets = np.ones((cancer_featuress.shape[0])).astype(np.int64) not_cancer_targets = np.zeros((not_cancer_featuress.shape[0])).astype(np.int64) with open(oj(dataset_path, "not_cancer_cd.npy"), 'rb') as f: not_cancer_cd = np.load(f) not_cancer_dataset = TensorDataset( torch.from_numpy(not_cancer_featuress).float(), torch.from_numpy(not_cancer_targets), torch.from_numpy(not_cancer_cd).float()) cancer_dataset = TensorDataset( torch.from_numpy(cancer_featuress).float(),
def add_preds(
    df_county,
    NUM_DAYS_LIST=[1, 2, 3],
    verbose=False,
    cached_dir=None,
    outcomes=['Deaths', 'Cases'],
    discard=False,
    d=None,
    add_predict_interval=True,
    interval_target_days=[],
):
    '''Adds predictions for the current best model
    Adds keys that look like 'Predicted Deaths 1-day', 'Predicted Deaths 2-day', ...

    Parameters
    ----------
    df_county : data frame with one row per county; for every outcome it must
        hold a column named after the lower-cased outcome whose entries are
        indexable (the last element is used as the latest observed count)
    NUM_DAYS_LIST : days ahead to predict (1 is tomorrow); never modified
    verbose : passed through to fit_and_predict_ensemble
    cached_dir : directory of the prediction cache, or None to disable caching
    outcomes : outcome names used in the output keys; never modified
    discard : selects the '..._cached_discard1day.pkl' cache file name
    d : date used in the cache file name; None (default) means today, looked
        up when the function is called. The previous default was evaluated
        once at import time, so a long-running process kept using the cache
        file of the day it was started.
    add_predict_interval : whether to add 'Predicted <outcome> Intervals'
    interval_target_days : days for the intervals; empty means NUM_DAYS_LIST

    Returns
    -------
    df_county with the prediction columns added, or the cached data frame
    when a cache file for the given date exists
    '''
    if d is None:
        d = datetime.datetime.today()

    # select the best model
    advanced_model = {'model_type': 'advanced_shared_model'}
    linear = {'model_type': 'linear'}
    BEST_MODEL = [advanced_model, linear]

    # load cached preds
    if cached_dir is not None:
        suffix = 'cached_discard1day' if discard else 'cached'
        cached_fname = oj(cached_dir, f'preds_{d.month}_{d.day}_{suffix}.pkl')
        if os.path.exists(cached_fname):
            return pd.read_pickle(cached_fname)

    print('predictions not cached, now calculating (might take a while)')
    for outcome in outcomes:
        print(f'predicting {outcome}...')
        # previous horizon's predictions, so that they never decrease
        tmp = [0] * df_county.shape[0]
        for num_days_in_future in tqdm(NUM_DAYS_LIST):  # 1 is tomorrow
            output_key = f'Predicted {outcome} {num_days_in_future}-day'
            df_county = fit_and_predict_ensemble(
                df_county,
                methods=BEST_MODEL,
                outcome=outcome.lower(),
                mode='predict_future',
                target_day=np.array([num_days_in_future]),
                output_key=output_key,
                verbose=verbose)
            vals = df_county[output_key].values
            # converted once per horizon instead of once per county
            observed = list(df_county[outcome.lower()])
            out = []
            for val, history, previous in zip(vals, observed, tmp):
                if np.isnan(val):
                    out.append(0)
                else:
                    # at least the latest observed count and the prediction
                    # made for the previous horizon
                    out.append(max(val[0], history[-1], previous))
            df_county[output_key] = out
            tmp = out

        output_key = f'Predicted {outcome} Intervals'
        if add_predict_interval:
            if not interval_target_days:
                interval_target_days = NUM_DAYS_LIST
            print('prediction intervals...')
            print(interval_target_days)
            df_county = add_prediction_intervals(
                df_county,
                target_day=np.array(interval_target_days),
                outcome=outcome.lower(),
                methods=BEST_MODEL,
                interval_type='local',
                output_key=output_key)

    # add 3-day lagged death preds
    output_key = 'Predicted Deaths 3-day Lagged'
    df_county = fit_and_predict_ensemble(df_county,
                                         methods=BEST_MODEL,
                                         outcome='deaths',
                                         mode='eval_mode',
                                         target_day=np.array([3]),
                                         output_key=output_key,
                                         verbose=verbose)
    df_county[output_key] = [v[0] for v in df_county[output_key].values]

    if cached_dir is not None:
        df_county.to_pickle(cached_fname)
    return df_county
import scipy as sp import pandas as pd from functions import merge_data from sklearn.model_selection import RandomizedSearchCV import load_data import exponential_modeling from sklearn.linear_model import LinearRegression from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor from fit_and_predict import add_preds from datetime import datetime, timedelta import pygsheets if __name__ == '__main__': NUM_DAYS_LIST = [1, 2, 3, 4, 5, 6, 7] df_county = load_data.load_county_level(data_dir=oj(parentdir, 'data')) df_county = add_preds( df_county, NUM_DAYS_LIST=NUM_DAYS_LIST, cached_dir=oj(parentdir, 'data')) # adds keys like "Predicted Deaths 1-day" # county-level stuff (#ICU_beds is county-level) k_surge = 'Severity (Surge) Prediction' df_county[k_surge] = 2 * df_county['Predicted Deaths 3-day'] - df_county[ '#ICU_beds'].fillna(0) # rewrite pred cols today = datetime.today().strftime("%B %d") days = [ 'Predicted Deaths by ' +
# hyperparams seed = 1 out_dir = '/scratch/users/vision/chandan/rf_sims_real/use_rf_rerun' # sim_results_fix_cov_C=0.25'' use_rf = True C = 1 dset_num = 0 random_state = 42 # for each train_test_split # dset_num sys argv if len(sys.argv) > 1: dset_num = int(sys.argv[1]) print('dset num', dset_num) dset_name = dset_names[dset_num] # generate data np.random.seed(seed) os.makedirs(out_dir, exist_ok=True) X, y, df, X_test, y_test, X_cond, y_cond = get_data(dset_name, data_dir, random_state) # fit model forest, test_mse = run_sim.fit_model(X, y, X_test, y_test) pkl.dump({'rf': forest, 'test_mse': test_mse}, open(oj(out_dir, f'model_{dset_num}.pkl'), 'wb')) # generate data for conditional if use_rf: Y_cond = forest.predict(X_cond) # calc curves run_sim.calc_curves(X, y, df, X_cond, y_cond, forest, out_dir, dset_num, C) print('done!')
def compute_classification_stats(d, data_info, redbox=False):
    """Attach true labels, predicted labels and accuracies to d.

    Reads 'redbox.txt' (redbox=True) or 'test.txt' from data_info; the first
    two whitespace-separated fields of each line are used as file name and
    label. Predicted labels are derived twice from d['pred'][idx][flag_val]:
    once with the threshold returned by get_flag_and_thresh and once with 0.5.

    d -- dict; reads d['fname'] and d['pred'], appends to the existing lists
         d['pred_lab_thresh'], d['pred_lab_std'] and d['pot_mislab'], and
         sets d['label'] and d['accuracy']
    data_info -- directory holding the label file
    redbox -- selects 'redbox.txt' instead of 'test.txt'

    Returns d. An accuracy whose denominator is zero (for instance no
    positive images) is reported as nan instead of raising
    ZeroDivisionError. Exits with status 1 when a file in d['fname'] has no
    label; labels without a matching file are only reported.
    """
    # this comes early because flag_val prompts user
    flag_val, threshold = get_flag_and_thresh(data_info)
    num_imgs = len(d['fname'])

    # get data_info test file
    if redbox:
        print('opening redbox data info')
        label_file = oj(data_info, 'redbox.txt')
    else:
        label_file = oj(data_info, 'test.txt')
    _class = {}
    with open(label_file, 'r') as f_labels:
        for line in f_labels:
            fields = line.split()
            if len(fields) >= 2:  # blank or incomplete lines carry no label
                _class[fields[0]] = fields[1]

    if set(d['fname']) != set(_class.keys()):
        print("don't match: d['fname'] %s" % (d['fname'],))
        print("and _class.keys() %s" % (_class.keys(),))
        # The old code named `exit` without calling it, so a mismatch never
        # stopped the run. Unlabeled files cannot be scored, so stop here
        # with a clear message instead of a KeyError further down.
        unlabeled = [fname for fname in d['fname'] if fname not in _class]
        if unlabeled:
            print("ERROR: no label found for %s" % (unlabeled,))
            raise SystemExit(1)

    # fill with true labels
    d['label'] = [int(_class[el]) for el in d['fname']]

    # fill in predicted labels and flag if potentially mislab
    # *_thresh is with classification boundary according to threshold
    # *_std is with classification boundary at 0.5
    false_pos_thresh, false_neg_thresh = 0, 0
    false_pos_std, false_neg_std = 0, 0
    num_pos, num_neg = 0, 0
    other_val = (flag_val + 1) % 2  # the label that is not flag_val

    for idx in range(num_imgs):
        label = d['label'][idx]
        pred = d['pred'][idx]
        is_positive = label == flag_val
        if is_positive:
            # the format string used to get one argument for two %s
            print("%s is a positive, with preds %s" % (d['fname'][idx], pred))
            num_pos += 1
        else:
            num_neg += 1

        # assign predicted label wrt threshold
        if pred[flag_val] >= threshold:
            lab_thresh = flag_val
        else:
            lab_thresh = other_val
            print("thresh thinks clamp! appending %s" % other_val)
        d['pred_lab_thresh'].append(lab_thresh)

        # assign predicted label in std way
        if pred[flag_val] >= 0.5:
            lab_std = flag_val
        else:
            lab_std = other_val
            print("std thinks clamp! appending %s \n" % other_val)
        d['pred_lab_std'].append(lab_std)

        # correct thresh classification or not
        if lab_thresh != label:
            if is_positive:
                false_neg_thresh += 1
            else:
                false_pos_thresh += 1

        # correct std classification or not
        if lab_std != label:
            d['pot_mislab'].append(idx)
            if is_positive:
                false_neg_std += 1
            else:
                false_pos_std += 1

    print('false_neg_thresh: %i, false_pos_thresh: %i' %
          (false_neg_thresh, false_pos_thresh))
    print('false_neg_std: %i, false_pos_std: %i' %
          (false_neg_std, false_pos_std))
    print('num_neg: %i, num_pos: %i' % (num_neg, num_pos))

    # compute accuracies
    d['accuracy'] = {
        'total_thresh': _accuracy(false_neg_thresh + false_pos_thresh,
                                  num_imgs),
        'pos_thresh': _accuracy(false_neg_thresh, num_pos),
        'neg_thresh': _accuracy(false_pos_thresh, num_neg),
        'total_std': _accuracy(false_neg_std + false_pos_std, num_imgs),
        'pos_std': _accuracy(false_neg_std, num_pos),
        'neg_std': _accuracy(false_pos_std, num_neg),
    }
    print("d['accuracy'] %s" % (d['accuracy'],))
    return d


def _accuracy(num_errors, num_total):
    """Return 1 - num_errors / num_total, or nan when num_total is 0."""
    if not num_total:
        return float('nan')
    return 1 - num_errors / float(num_total)
def _transfer_upload(file_path, use_curl, log_path):
    """Upload one file to transfer.sh, appending the tool's stdout to the log.

    The command is passed as an argument list (no shell), so characters in
    ``file_path`` cannot be interpreted as shell syntax.
    """
    import subprocess  # local import: keeps the block self-contained

    url = 'https://transfer.sh/%s' % os.path.basename(file_path)
    if use_curl:
        command = ['curl', '--upload-file', file_path, url]
    else:
        command = ['wget', '--method', 'PUT', '--body-file=' + file_path,
                   url, '-O', '-', '-v']
    with open(log_path, 'ab') as log_file:
        try:
            subprocess.call(command, stdout=log_file)
        except OSError as err:
            # the upload tool is not installed / not executable
            print('could not run %s: %s' % (command[0], err))


def vroom():
    """Command-line entry point: upload file(s) to transfer.sh.

    Parses ``path`` plus exactly one of ``-c/--use-curl`` or
    ``-w/--use-wget`` from the command line. When ``path`` passes ``pid``
    every non-hidden entry of it that passes ``piff`` is uploaded; when
    ``path`` itself passes ``piff`` only that file is uploaded
    (``pid``/``ld``/``piff`` are presumably the os.path.isdir / os.listdir /
    os.path.isfile aliases of this file -- confirm). Names containing
    characters matched by the illegal-character pattern are skipped and
    reported.

    Upload output is collected in ``log.txt`` next to the running script;
    afterwards the log is read, removed, and every ``https...`` chunk found
    in it is printed.
    """
    zz = argparse.ArgumentParser(
        description="Python script to upload multiple (or single) files to 'transfer.sh'"
    )
    zz.add_argument('path', help='Path containing files to upload')
    zz.add_argument('-c', '--use-curl', dest='curl', action='store_true',
                    help='Uses curl to upload file(s)')
    zz.add_argument('-w', '--use-wget', dest='wget', action='store_true',
                    help='Uses wget to upload file(s)')
    args = zz.parse_args()

    # ArgumentParser.error() prints the message and exits by itself
    if args.curl and args.wget:
        zz.error("Cannot use both 'wget' and 'curl'")
    elif not args.curl and not args.wget:
        zz.error("Please choose an upload method (-c OR -w)")

    args.path = os.path.abspath(args.path)
    illegal_chars = r"[$/\\&\s\[\]{}^%]"
    # defined up front so every branch below (and the final read) can use it
    logger = oj(os.path.dirname(sys.argv[0]), 'log.txt')

    file_list = []
    skipped = []
    if pid(args.path):
        root_dir = args.path
        for sub_file in ld(root_dir):
            if sub_file.startswith('.'):
                continue
            if re.search(illegal_chars, sub_file):
                skipped.append(sub_file)
                continue
            sub_path = oj(root_dir, sub_file)
            if piff(sub_path):
                file_list.append(sub_path)
    elif piff(args.path):
        if re.search(illegal_chars, os.path.basename(args.path)):
            # a skipped file must not be uploaded
            print('file skipped')
            print("check file names for illegal characters (spaces, $, (), [], {})")
        else:
            file_list.append(args.path)
    else:
        zz.error("'%s' is not a file or directory" % args.path)

    collected_length = len(file_list)
    for f_count, file_path in enumerate(file_list, 1):  # 1-based progress
        print("File %d of %d" % (f_count, collected_length))
        _transfer_upload(file_path, args.curl, logger)

    if skipped:
        print('files skipped')
        print('\n'.join(str(f) for f in skipped))
        print("check file names for illegal characters (spaces, $, (), [], {}) ")

    # nothing was uploaded (or the tool never ran): there is no log to report
    if not os.path.exists(logger):
        return
    with open(logger, 'r') as rdr:
        lines = rdr.read()
    os.remove(logger)
    for l in lines.split('https'):
        if not l:
            continue
        print('https%s' % l)
def savefig(s: str, png=False):
    """Write the figure via ``plt.savefig`` to ``DIR_FIGS`` as ``fig_<s>.pdf``.

    Parameters
    ----------
    s : str
        Suffix placed after ``fig_`` in the output file name.
    png : bool
        When true, a 300-dpi ``fig_<s>.png`` is written as well.
    """
    stem = 'fig_' + s
    # plt.tight_layout()
    pdf_path = oj(DIR_FIGS, stem + '.pdf')
    plt.savefig(pdf_path, bbox_inches='tight')
    if not png:
        return
    png_path = oj(DIR_FIGS, stem + '.png')
    plt.savefig(png_path, dpi=300, bbox_inches='tight')
def classify_data(classifier_dir, symlink_dir, data_info, PRETRAINED,
                  redbox=False):
    """Run the classifier over a directory of images and save the predictions.

    A ``caffe.Classifier`` is built from the ``*_deploy.prototxt`` found in
    ``classifier_dir`` (created with ``create_deploy_file`` when absent) and
    from ``PRETRAINED``. Images are loaded from ``symlink_dir/redbox`` when
    ``redbox`` is true, otherwise from ``symlink_dir/test``, and predicted in
    batches of ``N`` images.

    Parameters
    ----------
    classifier_dir : str
        Directory of the classifier; its last ``/``-separated component
        (up to ``-fine``) names the deploy file.
    symlink_dir : str
        Directory holding the image sub-directories and the mean file
        located by ``get_np_mean_fname``.
    data_info : str
        Directory in which ``<basename of PRETRAINED>_pred.npy`` is saved.
    PRETRAINED : str
        Passed to ``caffe.Classifier`` as the pretrained model; presumably
        the path of the weights file.
    redbox : bool
        Classify the ``redbox`` images instead of the ``test`` images.

    Returns
    -------
    dict
        ``d`` as filled by ``load_all_images_from_dir`` with ``d['pred']``
        holding one prediction row per image.
    """
    N = 96  # batch size for net.predict
    classifier_name = classifier_dir.split('/')[-1]
    deploy_fname = classifier_name.split('-fine')[0] + '_deploy.prototxt'
    if deploy_fname not in os.listdir(classifier_dir):
        create_deploy_file(classifier_dir)
    MODEL_FILE = oj(classifier_dir, deploy_fname)
    MEAN_FILE = np.load(get_np_mean_fname(symlink_dir))

    print('loading network...')
    net = caffe.Classifier(MODEL_FILE, PRETRAINED,
                           image_dims=(256, 256), input_scale=255,
                           mean=MEAN_FILE, channel_swap=(2, 1, 0))
    # flow of control:
    #   classifier::__init__(
    #     classifier::caffe.Net.__init__()
    print('network loaded successfully')

    # set phase to test since we are doing testing
    net.set_phase_test()
    net.set_mode_gpu()

    d = {
        'fname': [],
        'pred': [],
        'time': [],
        'dude': [],
        'label': [],
        'pred_lab_thresh': [],
        'pred_lab_std': [],
        'pot_mislab': []
    }

    # load images
    if redbox:
        imgs, d = load_all_images_from_dir(d, oj(symlink_dir, 'redbox'), redbox)
    else:
        imgs, d = load_all_images_from_dir(d, oj(symlink_dir, 'test'))

    # classify images
    num_imgs = len(d['fname'])
    print("computing preds...")
    preds = net.predict(imgs[:N])
    # Walk the remaining images in steps of N. The last slice is simply
    # shorter; the old tail slice imgs[-(len(imgs) % N):] became imgs[-0:]
    # (i.e. every image) whenever the count was an exact multiple of N.
    for start in range(N, num_imgs, N):
        preds = np.append(preds, net.predict(imgs[start:start + N]), axis=0)
    d['pred'] = preds
    print("preds computed.")

    # save preds
    assert len(d['pred']) == num_imgs
    np.save(oj(data_info, PRETRAINED.split('/')[-1] + '_pred.npy'), d)
    return d
#! /usr/bin/python3 import pandas as pd import numpy as np from os.path import join as oj import os if __name__ == '__main__': import sys sys.path.append( oj(os.path.dirname(__file__), '..', '..', 'raw', 'mit_voting')) from load import load_mit_voting else: from ...raw.mit_voting.load import load_mit_voting def clean_mit_voting( data_dir=oj('..', '..', 'raw', 'mit_voting'), out_dir='.'): ''' Clean 2000-2016 County Presidential Data Parameters ---------- data_dir : str; path to the data directory to find raw csv out_dir : str; path to the data directory to write cleaned csv Returns ------- writes out cleaned csv file and returns clean data frame '''
# The script refuses to run without a DISPLAY environment variable, which
# the error message attributes to missing X11 forwarding.
if 'DISPLAY' not in os.environ:
    raise Exception('ERROR: X11 forwarding not enabled, cannot run script')

if len(sys.argv) < 2:
    print_help()
else:
    model_dir = os.path.abspath(sys.argv[1])
    log_fname, parsed = already_parsed(model_dir)
    if parsed == 'N':
        parse_log(log_fname)
    lfname = oj(model_dir, log_fname)
    test_dict = get_test_dict(lfname + '.test')
    train_dict = get_train_dict(lfname + '.train')

    # Collect every series to use, keyed by field name. Train fields are
    # mandatory; a test field missing from the parsed log is dropped from
    # the combined field list with a warning.
    Ys = {}
    tr_te_fields = train_fields + test_fields
    for key in train_fields:
        Ys[key] = train_dict[key]
    for key in test_fields:
        try:
            Ys[key] = test_dict[key]
        except KeyError:
            # only a missing field is expected here; anything else
            # (including Ctrl-C) should propagate
            print("WARNING: found no %s fields" % (key,))
            tr_te_fields.remove(key)