if not os.path.exists(os.path.join(outputFolder,e)): os.mkdir(os.path.join(outputFolder,e)) #similar to above, make subfolder if doesn't exist n = 'rooktemp' w = ['0193'] b = 'Mirror' theta = 0.7 alphaSpatial = '[0.3-0.0]' alphaSpatiotemp = '0.3-0.4]' alphaNonSpatial = '[0.0-0.0]' c_0 = cref[e] eventClass = e dataset = d print eventClass,dataset headers,matches = SOLARGenImageList.image_event_matches(dataset=d,waves = w) print 'matches calculated' treefileSpatial = "results/"+"_".join([str(e),str(n),str(d),"-".join(w),'c_0='+str(c_0),'balance='+str(b),'alpha='+str(alphaSpatial)])+".tree" treefileNonSpatial = "results/"+"_".join([str(e),str(n),str(d),"-".join(w),'c_0='+str(c_0),'balance='+str(b),'alpha='+str(alphaNonSpatial)])+".tree" treeSpatial = TreeNode('dummy') treeNonSpatial = TreeNode('dummy') with open(treefileSpatial) as f: treeSpatial.load(f) with open(treefileNonSpatial) as f: treeNonSpatial.load(f) S_train,S_test, = read_data(e,n,d,w,b) # read the data set cells_train, adj = S_train cells_test, adj_test = S_test counter = 0 for x in sorted(matches.keys()): # for each image of the data set paramsFilename = x[0]
import itertools
import os
import random
import shutil

import SOLARGenImageList

random.seed(42)

# This script copies the thumbnail images of the specified datasets from
# their stored location on CBSIR to the local 'images' folder.
# events = ['SS','FL','CH','AR','SG','FI']
events = ['AR']
datasets = ['1WEEK']
radius = 1625

# The destination folder must exist before shutil.copyfile can write into it.
# (check-then-create is used because os.makedirs has no exist_ok in Python 2)
if not os.path.isdir('images'):
    os.makedirs('images')

for e, d in itertools.product(events, datasets):
    eventClass = e
    dataset = d
    # Single-argument parenthesised print emits the same text under
    # Python 2 and Python 3.
    print('%s %s' % (eventClass, dataset))
    headers, matches = SOLARGenImageList.image_event_matches(
        [eventClass], dataset=dataset, waves=['0171'])
    print('matches calculated')
    for x in matches:
        # x[0] is used as the path of the per-image parameter file; the
        # thumbnail path is derived by replacing its last four characters
        # (presumably the extension -- confirm) with '_th.png'.
        paramsFilename = x[0]
        imageFilename = paramsFilename[:-4] + '_th.png'
        # Prefix the copy with the event class so different event classes
        # get distinct file names in the shared folder.
        shutil.copyfile(
            imageFilename,
            os.path.join('images', e + '_' + os.path.basename(imageFilename)))

# Legacy draft kept for reference (never executed):
# for imageKey in matches[0]: # for every image
#     eventBinMat = np.zeros([64,64],dtype = bool)
#     for E in matches[imageKey]: # for each event in that image
#         cc = pt.parseChainCode(E,headers)
#         if cc == "NA":
#             cc = pt.parseBoundingBox(E,headers)
#         eventBinMat = np.logical_or(eventBinMat, stuff.find_grid_cells(cc)) # combine all the events into one binary matrix for the image
_GRID_SIZE = 64  # every image is described on a _GRID_SIZE x _GRID_SIZE cell grid
_TRAIN_MIN_ROW = 32  # rows >= this value form the train split, rows below it the test split


def _experiment_neighbor_ids(neighborhood, cell_id, imagefile_tracker):
    """Return the candidate neighbor ids of ``cell_id`` for ``neighborhood``.

    Ids are ``(image_number, row, col, image_file)`` tuples.  Candidates may
    point outside the grid or at cells that were filtered out; the caller is
    expected to discard ids it does not know.

    Raises:
        Exception: if ``neighborhood`` is not one of 'rook', 'rooktemp',
            'rooktemplong' or 'queen'.
    """
    im_num, row, col, im_file = cell_id
    rook = [
        (im_num, row - 1, col, im_file),
        (im_num, row, col - 1, im_file),
        (im_num, row, col + 1, im_file),
        (im_num, row + 1, col, im_file),
    ]
    if neighborhood == 'rook':
        return rook
    if neighborhood in ('rooktemp', 'rooktemplong'):
        # Same cell position in the following/preceding images:
        # one step each way for 'rooktemp', three for 'rooktemplong'.
        reach = 1 if neighborhood == 'rooktemp' else 3
        temporal = []
        for step in range(1, reach + 1):
            for other in (im_num + step, im_num - step):
                temporal.append(
                    (other, row, col,
                     get_image_file(imagefile_tracker, other)))
        return rook + temporal
    if neighborhood == 'queen':
        return [
            (im_num, row + d_row, col + d_col, im_file)
            for d_row in (-1, 0, 1)
            for d_col in (-1, 0, 1)
            if (d_row, d_col) != (0, 0)
        ]
    raise Exception('neighborhood option not supported')


def _generate_cell_data(circle_mask, event_class, dataset, waves,
                        balance_option):
    """Build the labelled, balance-filtered cell list from the raw images.

    Returns a list of dicts with keys 'id', 'class' and 'P1'..'P10'.  Cells
    with row < _TRAIN_MIN_ROW are always kept; the remaining cells are kept
    only where the balancing filter selects them.
    """
    headers, matches = SOLARGenImageList.image_event_matches(
        [event_class], dataset=dataset, waves=waves)
    print('matches calculated %s %s %s' % (event_class, dataset, waves))

    grid_shape = [_GRID_SIZE, _GRID_SIZE]
    if balance_option == 'Random':
        # pool of (row, col) positions to draw negative examples from
        all_grid_cells = list(itertools.product(range(_GRID_SIZE), repeat=2))

    # balancing_filter[i, r, c] is True when cell (r, c) of image i is kept
    balancing_filter = np.zeros([len(matches)] + grid_shape, dtype=bool)
    all_pixels = []

    for image_counter, imagekey in enumerate(sorted(matches.keys())):
        event_binmat = np.zeros(grid_shape, dtype=bool)
        image_filter = np.zeros(grid_shape, dtype=bool)

        for event in matches[imagekey]:
            cc = parseChainCode(event, headers)
            if cc == "NA":
                # no usable chain code: fall back to the bounding box
                cc = parseBoundingBox(event, headers)
            event_cells = find_grid_cells(cc)
            # combine all the events into one binary matrix for the image
            event_binmat = np.logical_or(event_binmat, event_cells)
            buffered = buffer_binmat(event_cells)
            # add the positive examples to the training set
            image_filter = np.logical_or(image_filter, buffered)
            # add some negative examples to the training set
            if balance_option == 'Mirror':
                image_filter = np.logical_or(
                    image_filter, mirror_event(buffered))
            if balance_option == 'Duplication':
                image_filter = np.logical_or(
                    image_filter, reposition_event(buffered))

        if balance_option == 'Random':
            # randomly undersample the negative class: as many negatives as
            # there are event cells in this image
            num_pos = int(np.sum(event_binmat))
            negatives = [rc for rc in all_grid_cells
                         if not event_binmat[rc[0], rc[1]]]
            # random.sample raises ValueError when asked for more items than
            # the population holds (events covering more than half the grid)
            sample_size = min(num_pos, len(negatives))
            for rc in random.sample(negatives, sample_size):
                image_filter[rc] = True

        if balance_option == 'None':
            # 'None' means keep everything; previously the all-True filter
            # was overwritten by the event-only filter built above.
            balancing_filter[image_counter, :, :] = True
        else:
            balancing_filter[image_counter, :, :] = image_filter

        with open(imagekey[0]) as f:  # read the parameter data for this image
            rows = list(csv.reader(f, dialect='excel-tab'))
        for cell in rows:
            # the parameter files use 1-based row/column indices
            row = int(cell[0]) - 1
            col = int(cell[1]) - 1
            if not circle_mask[row, col]:
                continue  # cell is off the disk
            s = {
                'id': (image_counter, row, col, imagekey),
                'class': event_class if event_binmat[row, col] else 'null',
            }
            # columns 2..11 hold the ten parameters P1..P10
            for k in range(10):
                s['P%d' % (k + 1)] = float(cell[k + 2])
            all_pixels.append(s)

    return [
        p for p in all_pixels
        if balancing_filter[p['id'][0], p['id'][1], p['id'][2]]
        or p['id'][1] < _TRAIN_MIN_ROW
    ]


def write_experiment_data(circle_mask, event_class, neighborhood, dataset,
                          waves, balance_option):
    """Generate and pickle the train/test cell sets for one experiment.

    The output files are
    ``data/<event>_<neighborhood>_<dataset>_<waves>_balance=<option>.train``
    and the matching ``.test`` file.  If both already exist nothing is
    recomputed.  Otherwise the cell data is taken from ``read_cell_data`` when
    that succeeds (anything other than -1), or generated from the raw images
    and stored with ``write_cell_data``.  Each file holds a pickled tuple
    ``(cells, homogeneous_adjacency, inhomogeneous_adjacency)`` where the
    adjacency dicts map a cell id to the list of neighbor cells in the same
    split with the same / a different class.

    Args:
        circle_mask: 2-D structure indexed ``[row, col]``; falsy entries mark
            cells off the disk, which are dropped.
        event_class: event label, also used as the positive class name.
        neighborhood: 'rook', 'rooktemp', 'rooktemplong' or 'queen'.
        dataset: dataset name passed to the image/event matcher.
        waves: list of wavelength strings.
        balance_option: 'None', 'Mirror', 'Duplication' or 'Random'.

    Raises:
        Exception: for an unsupported ``neighborhood`` (raised while
            processing the first cell).
        ValueError: for an unknown ``balance_option``.
    """
    stem = "data/" + "_".join(
        [event_class, neighborhood, dataset, "-".join(waves),
         'balance=' + balance_option])
    output_file_train = stem + ".train"
    output_file_test = stem + ".test"

    if os.path.exists(output_file_train) and os.path.exists(output_file_test):
        # files we would write to already exist
        print('already generated %s %s %s'
              % (event_class, neighborhood, dataset))
        return

    print('%s %s %s %s %s'
          % (event_class, neighborhood, dataset, waves, balance_option))

    cached = read_cell_data(event_class, dataset, waves, balance_option)
    if cached != -1:  # read success
        all_pixels = cached
        print('read data successfully')
    else:  # read failure: rebuild from the images and cache the result
        all_pixels = _generate_cell_data(
            circle_mask, event_class, dataset, waves, balance_option)
        write_cell_data(event_class, dataset, waves, balance_option,
                        all_pixels)

    cell_tracker = {p['id']: p for p in all_pixels}
    imagefile_tracker = {p['id'][0]: p['id'][3] for p in all_pixels}

    # Neighbors are assigned only after every cell is known, because a
    # neighbor must exist in cell_tracker to be linked.
    fullHAdj = dict()
    fullIAdj = dict()
    for pix in all_pixels:
        same_class = []   # homogeneous neighbors (cell dicts, not ids)
        other_class = []  # inhomogeneous neighbors (cell dicts, not ids)
        for n_id in _experiment_neighbor_ids(
                neighborhood, pix['id'], imagefile_tracker):
            # ids missing from the tracker were removed by the mask/filter
            # or lie outside the grid
            neighbor = cell_tracker.get(n_id)
            if neighbor is None:
                continue
            if neighbor['class'] == pix['class']:
                same_class.append(neighbor)
            else:
                other_class.append(neighbor)
        fullHAdj[pix['id']] = same_class
        fullIAdj[pix['id']] = other_class

    # Kept from the original: shuffles the order of the cells in the output
    # lists (and advances the random state).
    random.shuffle(all_pixels)

    # Split by row.  NOTE(review): the original comments called rows >= 32
    # the "top half" and rows < 32 the "bottom half" -- confirm orientation.
    train_pixels = [p for p in all_pixels if p['id'][1] >= _TRAIN_MIN_ROW]
    test_pixels = [p for p in all_pixels if p['id'][1] < _TRAIN_MIN_ROW]
    train_ids = set(p['id'] for p in train_pixels)
    test_ids = set(p['id'] for p in test_pixels)

    train_adj_h = dict()
    train_adj_i = dict()
    test_adj_h = dict()
    test_adj_i = dict()
    for key in fullHAdj:  # both adjacency dicts have the same keys
        if key in train_ids:
            members, adj_h, adj_i = train_ids, train_adj_h, train_adj_i
        else:
            members, adj_h, adj_i = test_ids, test_adj_h, test_adj_i
        # drop links that would cross the train/test boundary
        adj_h[key] = [n for n in fullHAdj[key] if n['id'] in members]
        adj_i[key] = [n for n in fullIAdj[key] if n['id'] in members]

    s_train = train_pixels, train_adj_h, train_adj_i
    s_test = test_pixels, test_adj_h, test_adj_i
    with open(output_file_train, 'wb') as f:
        pickle.dump(s_train, f)
    with open(output_file_test, 'wb') as f:
        pickle.dump(s_test, f)