def checkDataOrientation(experiments):
    # This code reads through every produced training file and prints the number of positive and negative cells.
    # It's old, from back when I was still getting orientation wrong.
    # Its purpose is to check indexing across my data and the original data.
    # If the specified data hasn't been built, errors will happen.
    for experiment in experiments:
        print experiment
        S_train, S_test = read_data(*experiment)
        cells, adj = S_train
        maxImageNum = max([int(x['id'][0]) for x in cells])  # highest image counter in the set (unused below)
        # check balance
        pos = len([x for x in cells if x['class'] != 'null'])
        neg = len([x for x in cells if x['class'] == 'null'])
        print 'pos:', pos
        print 'neg:', neg
        # pick an arbitrary image
        arbImageNum = cells[0]['id'][0]
        arbImageCells = [x for x in cells if x['id'][0] == arbImageNum]
        dataFile = arbImageCells[0]['id'][3][0]  # grab the original data file this image's cells come from
        # id structure is (imageCounter, row, col, (dataFileName, timeStamp))
        print dataFile
        with open(dataFile) as f2:  # read the parameter data for this image
            c = csv.reader(f2, dialect='excel-tab')
            Juancells = [x for x in c]
        for cell in arbImageCells:
            myx = cell['id'][1]  # these should be zero-based
            myy = cell['id'][2]
            juanx = myx + 1  # Juan's indexing is 1-based
            juany = myy + 1
            # this lookup raises IndexError if the 1-based indexing doesn't line up
            juanCell = [x for x in Juancells if x[0] == str(juanx) and x[1] == str(juany)][0]
        print
def checkDatasetNeighbors(experiments):
    # This code checks the number of temporal neighbors each cell has and prints a summary of the distribution.
    # If the specified data hasn't been built, errors will happen.
    for experiment in experiments:
        print experiment
        S_train, S_test = read_data(*experiment)
        cells, adj = S_train
        allNeighbors = [len(adj[cell['id']]) for cell in cells]  # neighbor count for every cell
        c = Counter(allNeighbors)
        print c
        print
def checkDatasetSizes(experiments):
    # This function checks specified dataset structures and prints out the number of training cells
    # and the number of entries in the adjacency matrix. These numbers should be the same.
    # If the specified data hasn't been built, errors will happen.
    for experiment in experiments:
        print experiment
        S_train, S_test = read_data(*experiment)
        cells, adj = S_train
        print len(cells)
        print len(adj.keys())
        print
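
# The check above is by eye; an equivalent hard assertion of the stated
# invariant (the name assertDatasetSizes is illustrative, not from the repo):
def assertDatasetSizes(S_train):
    cells, adj = S_train
    assert len(cells) == len(adj), "cell count and adjacency entry count differ"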
def stuff2():
    print 'looking at the value distributions of the different parameters'
    S_train, S_test = read_data('AR', neighborhood='rook', dataset='1DAY', waves=['0094'], balanceOption='Mirror')
    G = NeighborGraph.NeighborGraph(S_train)
    for p in G.F:  # for each parameter
        l = [pix[p] for pix in G.cells]  # grab the pixels
        print p
        print 'Max:', max(l)
        print 'Min:', min(l)
        print 'Avg:', float(sum(l)) / len(l)  # float() avoids Python 2 integer division
        print 'Most Common:', Counter(l).most_common(10)
        print
def checkDatasetSplits(experiments):
    # This function checks specified dataset structures and prints out the number of _different_values_
    # for each parameter. This helps see how large the decision space for split selection is.
    # If the specified data hasn't been built, errors will happen.
    for experiment in experiments:
        print experiment
        summ = 0
        S_train, S_test = read_data(*experiment)
        cells, adj = S_train
        print 'numcells: ', len(cells)
        for p in ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10']:
            s = set()  # a set eliminates duplicates, so its size is the number of distinct values
            for cell in cells:
                s.add(cell[p])
            print p, ': ', len(s)
            summ += len(s)
        print 'total:', summ
        print
from readData import read_data

e = 'AR'
n = 'rook'
d = '1DAY'
w = ['0171']
b = 'Mirror'

# This script takes a single experiment's set of data and produces an ARFF file for use with WEKA.
# Extending it to handle multiple experiments at a time (producing multiple ARFF files) would be a good idea.

S_train, S_test = read_data(e, neighborhood=n, dataset=d, waves=w, balanceOption=b)


def make_arff():
    filename = '_'.join([e, d, '-'.join(w), b, 'train']) + '.arff'
    relation_name = 'derp'
    feature_names = ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10']
    classes = [e, 'null']
    cells, adj = S_train
    cells2, adj2 = S_test
    with open(filename, 'w') as f:
        f.write('@relation ' + str(relation_name) + "\n\n")
        for name in feature_names:  # renamed from n to avoid shadowing the module-level neighborhood
            f.write("@attribute " + str(name) + " numeric\n")
        s = ','.join(classes)
        f.write("@attribute label {" + s + "}\n\n")
        f.write('@data\n')
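        # The original ends with only the header written; a minimal sketch of the
        # data rows that presumably follow, assuming each cell's 'class' is either
        # the event code or 'null' (matching the declared classes):
        for cell in cells:
            row = [str(cell[p]) for p in feature_names] + [cell['class']]
            f.write(','.join(row) + '\n')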
def testing():
    # events = ['FL','SG','AR','CH']
    events = ['SG']
    # events = ['FL','SG','FI','CH','AR','SS']
    # neighborhoods = ['rook','queen','rooktemp']
    neighborhoods = ['rook', 'rooktemp', 'rooktemplong']
    datasets = ['3DAYDEMO']
    thetas = [0.7]  # theta is the classification parameter
    # grid = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
    # alphas = [x for x in itertools.product(grid,grid) if sum(x) <= 1]  # alpha weights the entropy and neighbor coherence heuristics
    alphas = [(0.3, 0.3), (0.6, 0.3), (0.3, 0.6)]
    NeighborFunctions = ["spatial", "temporal"]
    splitRes = 10000000  # splitRes affects how many splits are considered among each parameter at each tree node
    c_0vals = [100]  # c_0val is the _proportion_ of total values set as the minimum leaf size
    waves = [['0193'], ['0171'], ['0094']]
    # balances = ['Mirror','Duplication','Random']
    balances = ['Mirror']

    X = [x for x in itertools.product(events, neighborhoods, datasets, waves, balances, c_0vals)]
    Y = [x for x in itertools.product(alphas, thetas)]
    results = []
    for e, n, d, w, b, cv in X:
        S_train, S_test = read_data(e, neighborhood=n, dataset=d, waves=w, balanceOption=b)
        cells, adj = S_train
        cells2, adj2 = S_test
        c_0 = max(len(cells) // cv, 1)
        for alpha, theta in Y:
            t1 = time.time()
            if sum(alpha) > 1:
                raise Exception('invalid alpha')
            treefile = "temp.tree"
            TP, FP, TN, FN = SDT.sdt_learn(S_train, S_test, alpha, c_0, theta, splitRes, NeighborFunctions)
            # use floats so Python 2 integer division doesn't truncate the metrics
            ACC = 100.0 * (TP + TN) / (TP + TN + FP + FN)
            PREC = 100.0 * TP / (TP + FP) if TP + FP != 0 else 0
            REC = 100.0 * TP / (TP + FN) if TP + FN != 0 else 0
            F1 = 0 if PREC + REC == 0 else (2 * PREC * REC) / (PREC + REC)
            summ = 0
            for p in ['P1', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'P10']:  # for each parameter
                s = set()           # build a set of all of the values of that parameter;
                for cell in cells:  # the set eliminates duplicates, so its length is the
                    s.add(cell[p])  # number of distinct values for that parameter
                summ += len(s)
            splits = summ  # add these together to get the number of potential splits in the dataset
            t2 = time.time()
            runTime = t2 - t1
            strr = "".join([
                'event:', str(e), '\n',
                'neighborhood:', str(n), '\n',
                'numcells: ', str(len(cells)), '\n',
                'wave:', str(w), '\n',
                'dataset:', str(d), '\n',
                'alpha:', str(alpha), '\n',
                'c_0:', str(c_0), '\n',
                'theta:', str(theta), '\n',
                'balance:', str(b), '\n',
                'splits:', str(splits), '\n',
                'Runtime:', str(runTime), '\n',
                'F1:', str(F1), '\n',
                ' ', '\n', ' ', '\n', ' ', '\n'])
            logging.debug(strr)
            parameters = (n, w, alpha)
            results.append((parameters, F1, runTime))
    with open('TestTimeNeighborhoodSize.results', 'wb') as f:
        cPickle.dump(results, f)
def testing():
    timeout = 0  # if timeout is zero, no timeout will be triggered; otherwise the code will cut off
                 # the runtime of the experiment at this many seconds
    skipTOs = False  # if this is True, the script will skip runs that previously timed out;
                     # otherwise they will be re-run (and potentially time out again)
    pooling = False  # if this is True, the runs will be parallelized with Python's Pool tool; it's faster,
                     # especially for large run sets, but debugging errors is easier when it's off

    # --------------------
    # setting parameters
    # --------------------
    # events = ['FL','SG','AR','CH']
    events = ['AR']
    # events = ['SG']
    # events = ['AR','SG']
    # events = ['FL','SG','FI','CH','AR','SS']
    # neighborhoods = ['rook','queen','rooktemp']
    neighborhoods = ['rook']
    # datasets = ['1DAY','3DAYDEMO']
    datasets = ['1DAY']
    # waves = [['0193'],['0171'],['0094']]
    waves = [['0193']]
    # balances = ['Mirror','Duplication','Random']
    balances = ['Mirror']
    c_0vals = [100]  # c_0val is the _proportion_ of total values set as the minimum leaf size;
    # a value of 100 means that the minimum leaf size is roughly 1/100 of the total number of cells
    # in the dataset. The actual training parameter needs to be calculated from this value and the dataset.
    alphas = [(0.6,)]
    NeighborFunctions = ["none"]
    # grid = [0.0,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
    # alphas = [x for x in itertools.product(grid,grid) if sum(x) <= 1]  # alpha weights the entropy and neighbor coherence heuristics
    # NeighborFunctions = ["spatial","temporal"]
    # alphas = [(x,) for x in grid]  # alpha weights the entropy and neighbor coherence heuristics
    # NeighborFunctions = ["none"]
    thetas = [0.7]  # theta is the classification parameter
    splitRes = 50  # splitRes specifies how many splits are considered among each parameter at each tree node.
    # These splits are selected uniformly according to the value distribution (e.g. with 100 splits you get the
    # 0th percentile value, 1st percentile value, 2nd percentile value, etc.); see the sketch after this function.
    # A very high value (e.g. 10,000,000) will cause the training to evaluate every potential split; once every
    # potential split is being evaluated, increasing splitRes won't affect the result or runtime of the algorithm.
    experiment_data = [x for x in itertools.product(events, neighborhoods, datasets, waves, balances)]
    experiment_data_and_c0_vals = [x for x in itertools.product(experiment_data, c_0vals)]
    trainingParamSets = [x for x in itertools.product(alphas, thetas)]
    # trainingParamSets.append(((0.0,0.0),1.1))  # add baseline experiment
    # trainingParamSets.append(((0.0,),1.1))  # add baseline experiment
    # this value of alpha/theta will result in a pure entropy-based classification tree
    # -----------------------
    # end parameter setup
    # -----------------------
    for experiment, cv in experiment_data_and_c0_vals:
        S_train, S_test = read_data(*experiment)
        cells = S_train[0]
        cells2 = S_test[0]
        c_0 = max(len(cells) // cv, 1)
        # if not os.path.exists(treefile) or treefile == 'temp.tree':
        #     g = NeighborGraph(s_train)
        #     tree = sdt_train(g, alpha, c_0, split_res, neighbor_functions)
        #     with open(treefile, 'w') as f:
        #         tree.save(f)
        # else:
        #     print 'tree exists'
        #     tree = TreeNode('dummy')
        #     with open(treefile) as f:
        #         tree.load(f)
        e, n, d, w, b = experiment
        fileBase = "results/" + "_".join([str(e), str(n), str(d), "-".join(w), 'c_0=' + str(c_0), 'balance=' + str(b)])
        constants = [(experiment, cv, S_train, S_test, c_0, splitRes, NeighborFunctions, fileBase, skipTOs, timeout)]
        runs = [x for x in itertools.product(constants, trainingParamSets)]
        incompleteRuns = []
        for run in runs:
            constants, trainingParams = run
            alpha, theta = trainingParams
            strAlpha = str(alpha).replace('(', '[').replace(')', ']').replace(', ', '-')
            treefile = "_".join([fileBase, 'alpha=' + strAlpha]) + ".tree"
            resultsfile = "_".join([fileBase, 'alpha=' + strAlpha, 'theta=' + str(theta)]) + ".results"
            if os.path.exists(resultsfile) or (skipTOs and os.path.exists(resultsfile + 'TO')):
                print 'already ran this', resultsfile
            else:
                incompleteRuns.append(run)
        if pooling:
            p = Pool(4)  # TODO: fix magic number
            p.map(parallelized_component, incompleteRuns)
        else:
            for run in incompleteRuns:
                parallelized_component(run)
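
# A minimal sketch of the percentile-style split selection described in the
# splitRes comment above; the name candidate_splits is illustrative, not the
# repo's actual implementation:
def candidate_splits(values, splitRes):
    # pick up to splitRes thresholds spread uniformly through the sorted distinct
    # values; once splitRes exceeds the number of distinct values, every value is
    # a candidate and raising splitRes further changes nothing
    distinct = sorted(set(values))
    if splitRes >= len(distinct):
        return distinct
    step = len(distinct) / float(splitRes)
    return [distinct[int(i * step)] for i in range(splitRes)]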
import socket
import pynmea2
import time
import datetime
import readData

UDP_IP = "127.0.0.1"
UDP_PORT = 10111

sock = socket.socket(socket.AF_INET,     # Internet
                     socket.SOCK_DGRAM)  # UDP
sock.bind((UDP_IP, UDP_PORT))

print("Loading data")
aisdata = readData.read_data("oceansofdata/ais-exploratorium-edu/feed.ais.txt")
print("Data Loaded")

start_time = time.time()

# our data sources are old and don't change, so the simulation clock is pinned to a fixed start time
sim_start_time = 1417005700
sim_end_time = sim_start_time + 60
sim_real_diff = start_time - sim_start_time

# used to store a list of collisions and near misses
collisions = list()
near_misses = list()

position_log = dict()
position_log[0] = list()
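
# The receive loop is not shown above; a hypothetical sketch of its general
# shape under two assumptions: one NMEA sentence arrives per UDP datagram, and
# pynmea2.parse is enough to frame it (full AIS payload decoding would need a
# dedicated AIS decoder):
while time.time() - start_time < (sim_end_time - sim_start_time):
    data, addr = sock.recvfrom(1024)  # block until one datagram arrives
    try:
        msg = pynmea2.parse(data.decode('ascii', 'ignore').strip())
    except pynmea2.ParseError:
        continue  # skip malformed sentences
    # ...update position_log and check for collisions / near misses here...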
# Define parameters of the decomposition
max_iter = 4
opt_tol = 1  # %
ns = 5  # Number of scenarios solved per Forward/Backward Pass per process
# NOTE: ns should be between 1 and len(n_stage[n_stages])/NumProcesses
z_alpha_2 = 1.96  # 95% confidence level

# Parallel parameters
NumProcesses = 1

# ######################################################################################################################

# create scenarios and input data
nodes, n_stage, parent_node, children_node, prob, sc_nodes = create_scenario_tree(stages, scenarios, single_prob)
readData.read_data(filepath, curPath, stages, n_stage, t_per_stage)
sc_headers = list(sc_nodes.keys())

# operating scenarios
operating_scenarios = list(range(0, len(readData.L_by_scenario)))
prob_op = 1 / len(readData.L_by_scenario)
# print(operating_scenarios)

# separate nodes by processes
scenarios_by_processid = {}
for i in range(NumProcesses):
    start = int(len(sc_nodes) * i / float(NumProcesses))
    stop = int(len(sc_nodes) * (i + 1) / float(NumProcesses))
    scenarios_by_processid[i] = sc_headers[start:stop]
# print(scenarios_by_processid)
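
# z_alpha_2 above is the usual 95% normal quantile; a sketch of how such a value
# is typically used to build a confidence interval around the sampled
# forward-pass cost in this kind of decomposition (cost_fw is a hypothetical
# list of per-scenario forward costs, not a variable defined above):
import math

def forward_cost_ci(cost_fw, z_alpha_2=1.96):
    mean = sum(cost_fw) / len(cost_fw)
    var = sum((c - mean) ** 2 for c in cost_fw) / max(len(cost_fw) - 1, 1)
    half_width = z_alpha_2 * math.sqrt(var / len(cost_fw))
    return mean - half_width, mean + half_width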
single_prob = {'L': 1 / 3, 'R': 1 / 3, 'H': 1 / 3}

# Define parameters of the decomposition
max_iter = 10
opt_tol = 2  # %
ns = 15  # Number of scenarios solved per Forward/Backward Pass
# NOTE: ns should be between 1 and len(n_stage[time_periods])
z_alpha_2 = 1.96  # 95% confidence level

# ######################################################################################################################

# create scenarios and input data
nodes, n_stage, parent_node, children_node, prob, sc_nodes = create_scenario_tree(
    stages, scenarios, single_prob)
sc_headers = list(sc_nodes.keys())
readData.read_data(filepath, stages, n_stage)

# create blocks
m = b.create_model(time_periods, max_iter, n_stage, nodes, prob)
print('finished generating the blocks, started counting solution time')
start_time = time.time()

# Decomposition Parameters
m.ngo_rn_par = Param(m.rn_r, m.n_stage, default=0, initialize=0, mutable=True)
m.ngo_th_par = Param(m.th_r, m.n_stage, default=0, initialize=0, mutable=True)
m.ngo_rn_par_k = Param(m.rn_r, m.n_stage, m.iter, default=0, initialize=0, mutable=True)
def __init__(self):
    self.read_data = read_data()  # cache the loaded dataset on the instance
def main():
    documents = readData.read_data()  # this variable holds the edited sentences in a list
alphaNonSpatial = '[0.0-0.0]'
c_0 = cref[e]
eventClass = e
dataset = d
print eventClass, dataset
headers, matches = SOLARGenImageList.image_event_matches(dataset=d, waves=w)
print 'matches calculated'
treefileSpatial = "results/" + "_".join([str(e), str(n), str(d), "-".join(w), 'c_0=' + str(c_0), 'balance=' + str(b), 'alpha=' + str(alphaSpatial)]) + ".tree"
treefileNonSpatial = "results/" + "_".join([str(e), str(n), str(d), "-".join(w), 'c_0=' + str(c_0), 'balance=' + str(b), 'alpha=' + str(alphaNonSpatial)]) + ".tree"
treeSpatial = TreeNode('dummy')
treeNonSpatial = TreeNode('dummy')
with open(treefileSpatial) as f:
    treeSpatial.load(f)
with open(treefileNonSpatial) as f:
    treeNonSpatial.load(f)
S_train, S_test = read_data(e, n, d, w, b)  # read the data set
cells_train, adj = S_train
cells_test, adj_test = S_test
counter = 0
for match in sorted(matches.keys()):  # for each image of the data set (renamed from x to avoid shadowing)
    paramsFilename = match[0]
    imageFilename = paramsFilename[:-4] + '_th.png'
    ISpatial = m.imread(imageFilename)  # read the image
    INonSpatial = ISpatial.copy()
    outputname = os.path.join(outputFolder, e, os.path.basename(imageFilename)[:-4] + '_' + e + '_' + d + '.png')
    cells_testWeWant = [c for c in cells_test if c['id'][3][0] == paramsFilename]  # get the test cells tied to this image
    for which in range(2):  # draw once with the spatial tree, once with the non-spatial tree
        if which == 0:
            I = ISpatial
            tree = treeSpatial
        else:
            I = INonSpatial
            tree = treeNonSpatial
from mxnet import autograd, gluon, nd
from tqdm import tqdm

import readData
from net import Net  # assumption: the custom Net class lives in a local module

# batchSize and epochs are module-level hyperparameters defined elsewhere in the original file


def dataIter(batch_size, trainX, trainY):
    dataset = gluon.data.ArrayDataset(trainX, trainY)
    train_data_iter = gluon.data.DataLoader(dataset, batch_size, shuffle=True)
    return train_data_iter


def train(trainX, trainY):
    train_data_iter = dataIter(batchSize, trainX, trainY)
    lenTrainY = len(trainY)
    net = Net()
    lr = 0.0001
    sigmoidBCEloss = gluon.loss.SigmoidBinaryCrossEntropyLoss(from_sigmoid=True)
    for e in range(epochs):
        total_loss = 0
        for x, y in tqdm(train_data_iter):
            with autograd.record():
                y_hat = net.net(x)
                loss = sigmoidBCEloss(y_hat, y)
            loss.backward()
            net.SGD(lr)
            total_loss += nd.sum(loss).asscalar()
        print("Epoch %d, average loss:%f" % (e, total_loss / lenTrainY))
    return net


if __name__ == '__main__':
    trainX, trainY, testX, testY = readData.read_data()
    net = train(trainX, trainY)
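
# testX and testY are loaded above but never used in this snippet; a minimal
# hypothetical accuracy check, assuming net.net returns sigmoid probabilities
# and the labels are 0/1:
def evaluate(net, testX, testY):
    preds = net.net(nd.array(testX)) > 0.5          # threshold probabilities at 0.5
    labels = nd.array(testY).reshape(preds.shape)
    return (preds == labels).mean().asscalar()      # fraction of correct predictions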
# events = ['AR','CH','FL','FI','SG','SS']
events = ['AR']
# datasets = ['1WEEK']
datasets = ['3DAYDEMO']
for e, d in itertools.product(events, datasets):
    if not os.path.exists(os.path.join(outputFolder, e)):
        os.mkdir(os.path.join(outputFolder, e))  # similar to above, make the subfolder if it doesn't exist
    n = 'rook'
    w = ['0171']
    b = 'Mirror'
    eventClass = e
    dataset = d
    print eventClass, dataset
    headers, matches = SOLARGenImageList.image_event_matches(dataset=dataset, waves=['0171'])
    S_train, S_test = read_data(e, n, d, w, b)
    cells, adj = S_train
    cells2, adj2 = S_test
    print 'matches calculated'
    counter = 0
    total = len(matches.keys())
    for match in sorted(matches.keys()):  # for each image (renamed from x to avoid shadowing)
        paramsFilename = match[0]
        imageFilename = paramsFilename[:-4] + '_th.png'
        I = m.imread(imageFilename)
        outputname = os.path.join(outputFolder, e, os.path.basename(imageFilename)[:-4] + '_' + e + '.png')
        cellsWeWant = [c for c in cells if c['id'][3][0] == paramsFilename]
        for cell in cellsWeWant:
            cellr = cell['id'][1]
            cellc = cell['id'][2]
            I = drawSquare(I, cellr, cellc, colordict[e] if cell['class'] != 'null' else black)
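
# drawSquare is defined elsewhere in the repo; a hypothetical sketch of the
# general idea, assuming I is an RGB image array divided into a fixed grid of
# cellSize x cellSize pixel cells and (row, col) are 0-based grid coordinates
# (the cellSize default here is illustrative):
def drawSquare(I, row, col, color, cellSize=64):
    r0, c0 = row * cellSize, col * cellSize
    I[r0, c0:c0 + cellSize] = color                 # top edge
    I[r0 + cellSize - 1, c0:c0 + cellSize] = color  # bottom edge
    I[r0:r0 + cellSize, c0] = color                 # left edge
    I[r0:r0 + cellSize, c0 + cellSize - 1] = color  # right edge
    return I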