def tests(args):
    """Check whether trained thresholds reach the requested precision/recall.

    Args:
        args: sequence unpacked as ``(xi, yi, zi1, zi2, data, xvals, yvals,
            xstable, ystable, precis, recall)``.

            * ``xi``, ``yi``, ``zi1``, ``zi2`` are indexed as ``[a][b]`` on a
              common 2-D grid. ``zi1`` is compared with ``precis`` and ``zi2``
              with ``recall``; the matching ``xi``/``yi`` entries are passed
              to ``generic_tools.get_sigcut`` as the sigma values.
            * ``data`` rows provide the two fitted parameters in columns 0
              and 1.
            * ``xvals``/``yvals`` are counted as the transient sources and
              ``xstable``/``ystable`` as the stable sources.

    Returns:
        ``[precis, recall, results1, results2]`` where the last two entries
        are whatever ``generic_tools.precision_and_recall(tp, fp, fn)``
        returns for the trained thresholds.

    Raises:
        ValueError: if no grid point has ``zi1 >= precis``.
    """
    (xi, yi, zi1, zi2, data, xvals, yvals,
     xstable, ystable, precis, recall) = args

    # Candidate grid points: only those meeting the requested precision.
    combinations = [
        [xi[a][b], yi[a][b], zi1[a][b], zi2[a][b]]
        for a in range(len(zi1))
        for b in range(len(zi1[0]))
        if zi1[a][b] >= precis
    ]
    if not combinations:
        # argmin() on an empty array also raises ValueError, but with a
        # message that does not point at the real cause.
        raise ValueError(
            'no sigma combination reaches the requested precision %s'
            % (precis,))

    # Pick the candidate closest (squared Euclidean distance) to the
    # requested (precision, recall) pair.
    distances = np.array([
        (combo[2] - precis) ** 2. + (combo[3] - recall) ** 2.
        for combo in combinations
    ])
    above_thresh_sigma = combinations[distances.argmin()]

    # Convert the chosen sigmas into thresholds (the original comment says
    # get_sigcut does this by fitting the data with a Gaussian model).
    sigcutx, _paramx, _range_x = generic_tools.get_sigcut(
        [float(row[0]) for row in data], above_thresh_sigma[0])
    sigcuty, _paramy, _range_y = generic_tools.get_sigcut(
        [float(row[1]) for row in data], above_thresh_sigma[1])

    # A source is flagged when BOTH parameters reach their threshold.  ">="
    # makes "flagged" the exact complement of the "<"/"or" test used for
    # false negatives, so a source sitting exactly on a threshold is counted
    # once instead of being dropped from both tp and fn.
    fp = sum(1 for x, y in zip(xstable, ystable)
             if x >= sigcutx and y >= sigcuty)  # False Positive
    tp = sum(1 for x, y in zip(xvals, yvals)
             if x >= sigcutx and y >= sigcuty)  # True Positive
    fn = sum(1 for x, y in zip(xvals, yvals)
             if x < sigcutx or y < sigcuty)  # False Negative

    # If the test is successful, the outputs should meet or exceed the
    # input parameters.
    results1, results2 = generic_tools.precision_and_recall(tp, fp, fn)
    return [precis, recall, results1, results2]
def tests(args):
    """Test whether the trained thresholds meet the input precision/recall.

    Args:
        args: sequence unpacked as ``(xi, yi, zi1, zi2, data, xvals, yvals,
            xstable, ystable, precis, recall)``.  ``xi``/``yi``/``zi1``/
            ``zi2`` share a 2-D ``[a][b]`` grid; ``zi1`` is compared against
            ``precis`` and ``zi2`` against ``recall``, and the ``xi``/``yi``
            entries of the selected grid point are handed to
            ``generic_tools.get_sigcut`` as sigma values.  Columns 0 and 1 of
            each ``data`` row are the values being fitted.  ``xvals``/
            ``yvals`` are counted as transients, ``xstable``/``ystable`` as
            stable sources.

    Returns:
        ``[precis, recall, results1, results2]``; the last two values come
        from ``generic_tools.precision_and_recall(tp, fp, fn)``.

    Raises:
        ValueError: if no grid point has ``zi1 >= precis``.
    """
    (xi, yi, zi1, zi2, data, xvals, yvals,
     xstable, ystable, precis, recall) = args

    # Find the combination of x and y which is closest to the two
    # thresholds, considering only points that meet the precision request.
    combinations = []
    for a in range(len(zi1)):
        for b in range(len(zi1[0])):
            if zi1[a][b] >= precis:
                combinations.append(
                    [xi[a][b], yi[a][b], zi1[a][b], zi2[a][b]])
    if not combinations:
        # Fail with an explicit message rather than numpy's generic
        # "argmin of an empty sequence" error.
        raise ValueError(
            'no sigma combination reaches the requested precision %s'
            % (precis,))

    offsets = np.array([
        (c[2] - precis) ** 2. + (c[3] - recall) ** 2. for c in combinations
    ])
    above_thresh_sigma = combinations[offsets.argmin()]

    # Find the thresholds for these sigmas (per the original comment, by
    # fitting the observed data with a Gaussian model).
    sigcutx, _paramx, _range_x = generic_tools.get_sigcut(
        [float(row[0]) for row in data], above_thresh_sigma[0])
    sigcuty, _paramy, _range_y = generic_tools.get_sigcut(
        [float(row[1]) for row in data], above_thresh_sigma[1])

    def flagged(x, y):
        # ">=" so that this is the exact complement of the false-negative
        # test below; with ">" a source lying exactly on a threshold was
        # counted as neither a true positive nor a false negative.
        return x >= sigcutx and y >= sigcuty

    # Count up the different numbers of tp, fp, fn.
    fp = sum(1 for x, y in zip(xstable, ystable) if flagged(x, y))
    tp = sum(1 for x, y in zip(xvals, yvals) if flagged(x, y))
    fn = sum(1 for x, y in zip(xvals, yvals)
             if x < sigcutx or y < sigcuty)

    # If the test is successful, the outputs should meet or exceed the
    # input parameters.
    results1, results2 = generic_tools.precision_and_recall(tp, fp, fn)
    return [precis, recall, results1, results2]
def trial_data(args):
    """Find the precision and recall for a given pair of sigma thresholds.

    Args:
        args: sequence unpacked as ``(data, sigma1, sigma2)``.  Each row of
            ``data`` holds the two parameters in columns 0 and 1 and a label
            in the last column; a label equal to ``0.`` marks a stable
            source, anything else a transient.

    Returns:
        ``[sigma1, sigma2, precision, recall]`` where precision and recall
        come from ``generic_tools.precision_and_recall(tp, fp, fn)``.
    """
    data, sigma1, sigma2 = args

    # Sort data into transient and non-transient in a single pass.
    transient = []
    stable = []
    for row in data:
        point = (float(row[0]), float(row[1]))
        if float(row[-1]) == 0.:
            stable.append(point)
        else:
            transient.append(point)

    # Find the thresholds for a given sigma from the stable sources only
    # (the original comment says get_sigcut fits a Gaussian model).  Fresh
    # lists are passed so the helper cannot disturb the (x, y) pairing.
    sigcutx, _paramx, _range_x = generic_tools.get_sigcut(
        [x for x, _ in stable], sigma1)
    sigcuty, _paramy, _range_y = generic_tools.get_sigcut(
        [y for _, y in stable], sigma2)

    # A source is flagged when both parameters reach their threshold.  ">="
    # keeps this the exact complement of the "<"/"or" false-negative test,
    # so sources lying exactly on a threshold are no longer lost from both
    # counts.
    fp = sum(1 for x, y in stable
             if x >= sigcutx and y >= sigcuty)  # False Positive
    tp = sum(1 for x, y in transient
             if x >= sigcutx and y >= sigcuty)  # True Positive
    fn = sum(1 for x, y in transient
             if x < sigcutx or y < sigcuty)  # False Negative

    # Use these values to calculate the precision and recall values.
    precision, recall = generic_tools.precision_and_recall(tp, fp, fn)
    return [sigma1, sigma2, precision, recall]
def trial_data(args):
    """Find (and print) the precision and recall for a pair of sigmas.

    Args:
        args: sequence unpacked as ``(data, sigma1, sigma2)``.  Each row of
            ``data`` holds the two parameters in columns 0 and 1 and a label
            in the last column; a label equal to ``0.`` marks a stable
            source, anything else a transient.

    Returns:
        ``[sigma1, sigma2, precision, recall]`` where precision and recall
        come from ``generic_tools.precision_and_recall(tp, fp, fn)``.  The
        same four values are also printed, space separated, on one line.
    """
    data, sigma1, sigma2 = args

    # Sort data into transient and non-transient in a single pass.
    transient = []
    stable = []
    for row in data:
        point = (float(row[0]), float(row[1]))
        if float(row[-1]) == 0.:
            stable.append(point)
        else:
            transient.append(point)

    # Find the thresholds for a given sigma from the stable sources only
    # (the original comment says get_sigcut fits a Gaussian model).  Fresh
    # lists are passed so the helper cannot disturb the (x, y) pairing.
    sigcutx, _paramx, _range_x = generic_tools.get_sigcut(
        [x for x, _ in stable], sigma1)
    sigcuty, _paramy, _range_y = generic_tools.get_sigcut(
        [y for _, y in stable], sigma2)

    # ">=" keeps "flagged" the exact complement of the "<"/"or" test used
    # for false negatives, so threshold-equal sources are counted once.
    fp = sum(1 for x, y in stable
             if x >= sigcutx and y >= sigcuty)  # False Positive
    tp = sum(1 for x, y in transient
             if x >= sigcutx and y >= sigcuty)  # True Positive
    fn = sum(1 for x, y in transient
             if x < sigcutx or y < sigcuty)  # False Negative

    # Use these values to calculate the precision and recall values.
    precision, recall = generic_tools.precision_and_recall(tp, fp, fn)

    # Progress output: same text as "print a, b, c, d", written so that it
    # parses under both the print statement and the print function.
    print(' '.join(str(v) for v in (sigma1, sigma2, precision, recall)))
    return [sigma1, sigma2, precision, recall]
def random_test(stable, variable, train, valid, precis_thresh, recall_thresh,
                path):
    """Train thresholds on ``train`` and report training/validation error.

    Args:
        stable: rows for stable sources; column 0 is the source ID and
            columns 1 and 2 are the two parameters (linear space).
        variable: rows for variable sources, same layout as ``stable``.
        train: training rows; column 0 is the ID, columns 1 and 2 the two
            parameters, column 5 is passed through unchanged and the last
            column is the label (``0.`` selects the rows used for the fit).
        valid: validation rows; only column 0 (ID) and the row count are
            used here.
        precis_thresh: precision target forwarded to ``find_best_sigmas``.
        recall_thresh: recall target forwarded to ``find_best_sigmas``.
        path: forwarded to ``find_best_sigmas``.

    Returns:
        ``(trainErr, validErr)`` as computed by ``check_error(fp, fn, n)``.
    """
    # Rows with a non-positive parameter cannot be moved to log space.
    usable = [row for row in train
              if float(row[1]) > 0 and float(row[2]) > 0]

    # NOTE(review): the trial file is written to the working directory and
    # is not cleared first, whereas learning_curve uses
    # path + "temp_sigma_data.txt" and truncates it - confirm intended.
    multiple_trials(
        [[np.log10(float(row[1])), np.log10(float(row[2])), float(row[-1])]
         for row in usable],
        "temp_sigma_data.txt")
    data2 = np.genfromtxt('temp_sigma_data.txt', delimiter=' ')
    data = [[np.log10(float(row[1])), np.log10(float(row[2])),
             row[5], float(row[-1])]
            for row in usable]
    best_sigma1, best_sigma2 = find_best_sigmas(
        precis_thresh, recall_thresh, data2, False, data, False, path)

    # Find the thresholds for a given sigma (in log space), fitting only the
    # rows labelled 0.
    sigcutx, _paramx, _range_x = generic_tools.get_sigcut(
        [a[0] for a in data if a[3] == 0.], best_sigma1)
    sigcuty, _paramy, _range_y = generic_tools.get_sigcut(
        [a[1] for a in data if a[3] == 0.], best_sigma2)

    # stable/variable hold linear values, so move the cuts out of log space.
    eta_cut = 10. ** sigcutx
    v_cut = 10. ** sigcuty

    def count_errors(ids):
        """Return (fp, fn) restricted to the sources whose ID is in ids."""
        fp = sum(1 for z in stable
                 if float(z[1]) >= eta_cut and float(z[2]) >= v_cut
                 and z[0] in ids)  # False Positive
        fn = sum(1 for z in variable
                 if (float(z[1]) < eta_cut or float(z[2]) < v_cut)
                 and z[0] in ids)  # False Negative
        return fp, fn

    # Sets give O(1) membership tests; IDs are compared as strings.
    trainIDs = set(str(int(x[0])) for x in train)
    validIDs = set(str(int(x[0])) for x in valid)

    # Calculate the training error
    fp, fn = count_errors(trainIDs)
    trainErr = check_error(fp, fn, len(train))

    # Calculate the validation error
    fp, fn = count_errors(validIDs)
    validErr = check_error(fp, fn, len(valid))

    return trainErr, validErr
def learning_curve(stable, variable, train, valid, precis_thresh,
                   recall_thresh, rangeNums, path):
    """Compute training and validation error for growing training sets.

    For every positive ``num`` in ``rangeNums`` the thresholds are trained
    on the first ``num`` rows of ``train`` and the resulting errors are
    appended to the returned lists.

    Args:
        stable: rows for stable sources; column 0 is the source ID and
            columns 1 and 2 are the two parameters (linear space).
        variable: rows for variable sources, same layout as ``stable``.
        train: 2-D array (sliced as ``train[:num, :]``); column 0 is the ID,
            columns 1 and 2 the two parameters, column 5 is passed through
            unchanged and the last column is the label.
        valid: validation rows; only column 0 (ID) and the row count are
            used here.
        precis_thresh: precision target forwarded to ``find_best_sigmas``.
        recall_thresh: recall target forwarded to ``find_best_sigmas``.
        rangeNums: iterable of training-set sizes; values ``<= 0`` are
            skipped.
        path: prefix for ``temp_sigma_data.txt`` and ``sigma_train.txt``;
            also forwarded to ``find_best_sigmas``.

    Returns:
        ``(trainErr, validErr)``: two lists with one ``check_error`` result
        per processed training-set size.
    """
    trainErr = []
    validErr = []

    # Normalise validation IDs to int, exactly as the training IDs are, so
    # the int(z[0]) membership tests below can match regardless of how the
    # ID column is stored.
    valid_ids = set(int(x[0]) for x in valid)
    temp_file = path + "temp_sigma_data.txt"

    def count_errors(ids, eta_cut, v_cut):
        """Return (fp, fn) restricted to the sources whose ID is in ids."""
        fp = sum(1 for z in stable
                 if float(z[1]) >= eta_cut and float(z[2]) >= v_cut
                 and int(z[0]) in ids)  # False Positive
        fn = sum(1 for z in variable
                 if (float(z[1]) < eta_cut or float(z[2]) < v_cut)
                 and int(z[0]) in ids)  # False Negative
        return fp, fn

    # "with" guarantees the log is closed even if a trial raises.
    with open(path + 'sigma_train.txt', 'w') as output:
        for num in rangeNums:
            if num <= 0:
                continue

            # Start every trial from an empty trial-data file.
            with open(temp_file, "w") as handle:
                handle.write('')

            trainTMP = train[:num, :]
            train_ids = set(int(x[0]) for x in trainTMP)

            # Iterate over the rows actually present: the slice is shorter
            # than num when num exceeds len(train), and indexing with
            # range(num) would then raise IndexError.
            usable = [row for row in trainTMP
                      if float(row[1]) > 0 and float(row[2]) > 0]

            multiple_trials(
                [[np.log10(float(row[1])), np.log10(float(row[2])),
                  float(row[-1])] for row in usable],
                temp_file)
            data2 = np.genfromtxt(temp_file, delimiter=' ')
            data = [[np.log10(float(row[1])), np.log10(float(row[2])),
                     row[5], float(row[-1])]
                    for row in usable]
            best_sigma1, best_sigma2 = find_best_sigmas(
                precis_thresh, recall_thresh, data2, False, data, False,
                path)

            # Find the thresholds for a given sigma (in log space), fitting
            # only the rows labelled 0.
            sigcutx, _paramx, _range_x = generic_tools.get_sigcut(
                [a[0] for a in data if a[3] == 0.], best_sigma1)
            sigcuty, _paramy, _range_y = generic_tools.get_sigcut(
                [a[1] for a in data if a[3] == 0.], best_sigma2)

            # stable/variable hold linear values: leave log space once.
            eta_cut = 10. ** sigcutx
            v_cut = 10. ** sigcuty

            # Calculate the training error
            fp, fn = count_errors(train_ids, eta_cut, v_cut)
            trainErr.append(check_error(fp, fn, len(trainTMP)))

            # Calculate the validation error
            fp, fn = count_errors(valid_ids, eta_cut, v_cut)
            validErr.append(check_error(fp, fn, len(valid)))

            # NOTE(review): this writes the full cumulative lists with no
            # line separator on every iteration; format kept unchanged in
            # case something parses it - confirm whether a newline and only
            # the latest values were intended.
            output.write(
                str(num) + ',' + str(trainErr) + ',' + str(validErr))

    return trainErr, validErr
if anomaly:
    # ---------------- ANOMALY DETECTION ----------------
    # Train the anomaly detection algorithm by conducting multiple trials.
    # NOTE(review): block nesting was reconstructed from whitespace-collapsed
    # source; training is assumed to run only when sigma_data.txt does not
    # exist yet - confirm.
    if not os.path.exists('sigma_data.txt'):
        filename = open("sigma_data.txt", "w")
        filename.write('')
        filename.close()
        train_anomaly_detect.multiple_trials([
            [np.log10(float(x[1])), np.log10(float(x[2])), float(x[-1])]
            for x in variables
            if float(x[1]) > 0 and float(x[2]) > 0
        ])
    data2 = np.genfromtxt('sigma_data.txt', delimiter=' ')

    # Training rows: [log10(col 1), log10(col 2), col 5, label]; rows with a
    # non-positive value in column 1 or 2 cannot be moved to log space.
    data = [
        [np.log10(float(row[1])), np.log10(float(row[2])), row[5],
         float(row[-1])]
        for row in variables
        if float(row[1]) > 0 and float(row[2]) > 0
    ]
    best_sigma1, best_sigma2 = train_anomaly_detect.find_best_sigmas(
        precis_thresh, recall_thresh, data2, tests, data)
    print('sigma_(eta_nu)=' + str(best_sigma1)
          + ', sigma_(V_nu)=' + str(best_sigma2))

    # Find the thresholds for a given sigma (in log space), using only the
    # rows whose label is 0.
    sigcutx, paramx, range_x = generic_tools.get_sigcut(
        [a[0] for a in data if a[3] == 0.], best_sigma1)
    sigcuty, paramy, range_y = generic_tools.get_sigcut(
        [a[1] for a in data if a[3] == 0.], best_sigma2)

    # Report param[0] in linear space together with the offsets obtained by
    # adding / subtracting param[1] in log space.
    print(r'Gaussian Fit $\eta$: '
          + str(round(10.**paramx[0], 2))
          + '(+'
          + str(round((10.**(paramx[0] + paramx[1]) - 10.**paramx[0]), 2))
          + ' '
          + str(round((10.**(paramx[0] - paramx[1]) - 10.**paramx[0]), 2))
          + ')')
    print(r'Gaussian Fit $V$: '
          + str(round(10.**paramy[0], 2))
          + '(+'
          + str(round((10.**(paramy[0] + paramy[1]) - 10.**paramy[0]), 2))
          + ' '
          + str(round((10.**(paramy[0] - paramy[1]) - 10.**paramy[0]), 2))
          + ')')
    print('Eta_nu threshold=' + str(10.**sigcutx)
          + ', V_nu threshold=' + str(10.**sigcuty))

    # Rebuild data with the identifier (column 0) in front for plotting.
    data = [
        [row[0], np.log10(float(row[1])), np.log10(float(row[2])), row[5],
         float(row[-1])]
        for row in variables
        if float(row[1]) > 0 and float(row[2]) > 0
    ]

    # Get the different frequencies in the dataset
    frequencies = generic_tools.get_frequencies(data)

    # Create the scatter_hist plot
    plotting_tools.create_scatter_hist(
        data, 0, 0, paramx, paramy, range_x, range_y, '', frequencies)

    # Make second array for the diagnostic plot:
    # [eta_nu, V_nu, maxflx_nu, flxrat_nu, nu]
    data2 = [
        [row[0], float(row[1]), float(row[2]), float(row[3]), float(row[4]),
         row[5]]
        for row in variables
        if float(row[1]) > 0 and float(row[2]) > 0
    ]
# Get TraP data from the database and sort it into the required array, which
# is then loaded.  The database export is skipped when the file already
# exists.
trans_file = 'ds' + str(dataset_id) + '_trans_data.txt'
if not os.path.isfile(trans_file):
    format_TraP_data.format_data(database, dataset_id, release, host, port,
                                 username, password, lightcurves)
trans_data = generic_tools.extract_data(trans_file)

# Make first array for the scatter_hist plot:
# [col 0, log10(eta_nu), log10(V_nu), col 5, last column].
# Only rows whose 4th-from-last field is '2' and whose eta/V values are
# positive (so log10 is defined) are kept.
data = [
    [row[0], np.log10(float(row[1])), np.log10(float(row[2])), row[5],
     row[-1]]
    for row in trans_data
    if float(row[1]) > 0 and float(row[2]) > 0 and row[-4] == '2'
]

# Print out the transients that TraP automatically found
print('Identified Transient Candidates (no margin)')
print(np.sort(list(set(
    int(x[0]) for x in trans_data
    if x[-4] != '2'
    and float(x[-2]) >= float(x[-1])
    and float(x[-3]) < float(x[-1])
))))
print('Identified Transients (no margin)')
print(np.sort(list(set(
    int(x[0]) for x in trans_data
    if x[-4] != '2' and float(x[-3]) >= float(x[-1])
))))

# Find the thresholds for a given sigma (in log space)
sigcutx, paramx, range_x = generic_tools.get_sigcut(
    [a[1] for a in data], sigma1)
sigcuty, paramy, range_y = generic_tools.get_sigcut(
    [a[2] for a in data], sigma2)
# A sigma of 0 overrides the fitted cut with 0 (i.e. 10**0 in linear space).
if sigma1 == 0:
    sigcutx = 0
if sigma2 == 0:
    sigcuty = 0

# Report param[0] in linear space together with the offsets obtained by
# adding / subtracting param[1] in log space.
print(r'Gaussian Fit $\eta$: '
      + str(round(10.**paramx[0], 2))
      + '(+'
      + str(round((10.**(paramx[0] + paramx[1]) - 10.**paramx[0]), 2))
      + ' '
      + str(round((10.**(paramx[0] - paramx[1]) - 10.**paramx[0]), 2))
      + ')')
print(r'Gaussian Fit $V$: '
      + str(round(10.**paramy[0], 2))
      + '(+'
      + str(round((10.**(paramy[0] + paramy[1]) - 10.**paramy[0]), 2))
      + ' '
      + str(round((10.**(paramy[0] - paramy[1]) - 10.**paramy[0]), 2))
      + ')')
print('Eta_nu threshold=' + str(10.**sigcutx)
      + ', V_nu threshold=' + str(10.**sigcuty))

# Get the different frequencies in the dataset
frequencies = generic_tools.get_frequencies(data)

# Create the scatter_hist plot
IdTrans = plotting_tools.create_scatter_hist(
    data, sigcutx, sigcuty, paramx, paramy, range_x, range_y, dataset_id,
    frequencies)