def parallel_density_sampling(filenames, outputfilename, feature_order, truth_idx,
                              density_threshold, p, num_bins, num_parallel_jobs=8,
                              testMode=False):
    """
    parallel_density_sampling: grid-based density sampling. The state space is
    divided into a regular grid and points are picked probabilistically from
    cells, relative to the number of points in each cell.

    In more detail, the function processes the files that are passed in. First the
    minimum and maximum of each feature are computed per file, and the results are
    then combined. The combined ranges define a hyper-dimensional grid; each file is
    segmented into these cells and the counts of data points falling into each cell
    are computed. Finally the cells and their counts are merged across files. This is
    similar to kernel density estimation with a non-overlapping kernel; full kernel
    density estimation is not feasible in this high-dimensional setting.

    filenames         : list of data sets to be used
    outputfilename    : output filename including path
    feature_order     : features to be used
    truth_idx         : index of the data labels in the data set
    density_threshold : minimum number of points a cell must contain for its points
                        to be eligible for output
    p                 : probability of keeping each point from a qualifying cell
    num_bins          : number of bins per feature dimension
    num_parallel_jobs : number of parallel jobs, the default is 8
    """
    folderstoDelete, files = dO.createSplitFolder(filenames, testMode=testMode)
    filenames = files

    # Instrumentation code
    log = ins.instrumento()
    log.act("start density sampling")
    log.params(filenames, outputfilename, feature_order, truth_idx,
               density_threshold, p, num_bins)

    if p < 0 or p > 1:
        print("Invalid starting probability")
        sys.stdout.flush()

    # Parallel computation of per-file feature minima and maxima
    print("Calculating mins and maxs")
    job_args = []
    for filename in filenames:
        print("working on file %s" % (filename))
        features, truth = load_tsv.load_tsv_features_truth(filename, feature_order, truth_idx)
        job_args.append(features)
    # This section of the code extracts minimums and maximums over the features
    # on small pieces of data.
    job_results = parallel_jobs.parallel_jobs(compute_state_space, job_args, num_parallel_jobs)

    # Merge the mins and maxs of each feature and compute the step size
    minimum = numpy.zeros(len(job_results[0][0]))
    maximum = numpy.zeros(len(job_results[0][1]))
    for i in range(len(minimum)):
        minimum[i] = job_results[0][0][i]
        maximum[i] = job_results[0][1][i]
    print("printing mins")
    sys.stdout.flush()
    for result in job_results:
        for i in range(len(minimum)):
            if result[0][i] < minimum[i]:
                minimum[i] = result[0][i]
            if result[1][i] > maximum[i]:
                maximum[i] = result[1][i]
    step_size = (maximum - minimum) / num_bins

    # Put the samples into the grid cells and collect all of the per-file grids
    print("starting sampling")
    job_args = []
    for filename in filenames:
        features, truth = load_tsv.load_tsv_features_truth(filename, feature_order, truth_idx)
        job_args.append((features, minimum, maximum, step_size, num_bins, filename))
        # grid_sampling((features, minimum, maximum, step_size, num_bins, filename))
    job_results2 = parallel_jobs.parallel_jobs(grid_sampling, job_args, num_parallel_jobs)

    print("merging output")
    sys.stdout.flush()
    # Merge the per-file grids, then sample and write the results
    grid = job_results2[0][0]
    text_lines = job_results2[0][1]
    for i in range(1, len(job_results2)):
        for key in job_results2[i][0]:
            try:
                # Cell already seen: merge the two counts and the two line lists
                grid[key] = grid[key] + job_results2[i][0][key]
                text_lines[key] = text_lines[key] + job_results2[i][1][key]
            except KeyError:
                grid[key] = job_results2[i][0][key]
                text_lines[key] = job_results2[i][1][key]

    print("sampling")
    sys.stdout.flush()
    # Randomly sample, proportional to the number of points in each grid cell
    count = 0
    outputfile = open(outputfilename, "w")
    for key in grid:
        if grid[key] >= density_threshold:
            for line in text_lines[key]:
                if random.random() < p:
                    count = count + 1
                    outputfile.write(line)
                    outputfile.flush()
    print("completed output")
    sys.stdout.flush()
    outputfile.close()

    for i in folderstoDelete:
        shutil.rmtree(i)

    log.act("end parallel density sampling")
    return [outputfilename, count]
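
# --- Illustrative sketch (not part of the pipeline above) ----------------------
# A minimal, single-process sketch of the binning-and-thresholding scheme that
# parallel_density_sampling distributes across its workers: compute per-feature
# mins/maxs, map every point to a grid cell, count points per cell, then keep
# lines from sufficiently dense cells with probability p. The name
# density_sample_single, the in-memory features/lines inputs, and the zero-width
# step guard are assumptions for illustration only; this is not the project's
# compute_state_space or grid_sampling helper.
def density_sample_single(features, lines, density_threshold, p, num_bins):
    import numpy as np
    import random
    minimum = features.min(axis=0)
    maximum = features.max(axis=0)
    step_size = (maximum - minimum) / num_bins
    # Guard against constant features, which would otherwise give a zero step.
    step_size = np.where(step_size == 0, 1.0, step_size)

    grid = {}        # cell key -> number of points in the cell
    text_lines = {}  # cell key -> the original lines that fell into the cell
    for row, line in zip(features, lines):
        # Clip so points sitting exactly on the upper boundary land in the last bin.
        idx = np.clip(((row - minimum) / step_size).astype(int), 0, num_bins - 1)
        key = tuple(idx)
        grid[key] = grid.get(key, 0) + 1
        text_lines.setdefault(key, []).append(line)

    kept = []
    for key, cell_count in grid.items():
        if cell_count >= density_threshold:
            kept.extend(l for l in text_lines[key] if random.random() < p)
    return kept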
def parallel_random_sampling(p, filenames, outputFilename, num_parallel_jobs=8):
    """
    parallel_random_sampling: performs random sampling on the given files and
    produces a combined random sample. The per-file random sample is stored in
    the same folder where the data is, with the prefix "-rand".

    p                 : probability of each sample from each file being chosen
    filenames         : list of files (paths as strings) to open and sample from
    outputFilename    : output filename for the combined random sample
    num_parallel_jobs : number of parallel jobs, the default is 8

    Note: we are not guaranteeing that p*filesize samples will be chosen.
    """
    log = ins.instrumento()
    log.act("start random sampling")
    log.params(p, filenames)

    if p <= 0.0 or p > 1.0:
        print("Invalid probability, must be in (0.0, 1.0]")

    files = []
    path = ""
    folderstoDelete = []
    # for i in filenames:
    #     idx = np.random.randint(0, 10, 5)
    #     idx = [str(s) for s in idx]
    #     idx = "".join(idx)
    #     path = sU.extractPath(i) + "/temp_" + idx
    #     folderstoDelete.append(path)
    #     if not(os.path.isdir(path)):
    #         os.mkdir(path)
    #     files += dO.splitter(i, path, prefix="split")
    folderstoDelete, files = dO.createSplitFolder(filenames)
    # Currently, because of this line, the temporary output folder is derived from
    # the first filename only, so it only works when all inputs share a single path.
    path = dO.createFolder(filenames[0])
    folderstoDelete.append(path)

    job_args = []
    for filename in files:
        job_args.append((p, filename, path))
    job_results = parallel_jobs.parallel_jobs(random_sampling, job_args, num_parallel_jobs)
    randFiles = [i[0] for i in job_results]
    # print(job_results)
    dO.combiner(randFiles, outputFilename)

    log.act("end random sampling")
    # print(folderstoDelete)
    for i in folderstoDelete:
        shutil.rmtree(i)
        # print("to delete %s" % (i))
    # TODO:
    # Consider erasing all of the files in path once done
    # also erase all of the random files
    return job_results
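
# --- Illustrative sketch (not part of the pipeline above) ----------------------
# The per-file worker random_sampling is defined elsewhere in the project; the
# sketch below only illustrates the contract parallel_random_sampling relies on:
# each job receives a (p, filename, path) tuple, writes the surviving lines into
# `path`, and returns the output file path first so that
# randFiles = [i[0] for i in job_results] works. The name random_sampling_sketch
# and the output file naming are assumptions for illustration.
def random_sampling_sketch(args):
    import os
    import random
    p, filename, path = args
    out_name = os.path.join(path, "rand_" + os.path.basename(filename))
    kept = 0
    with open(filename) as src, open(out_name, "w") as dst:
        for line in src:
            # Keep each line independently with probability p, so roughly
            # p * filesize lines survive (no exact count is guaranteed).
            if random.random() < p:
                dst.write(line)
                kept += 1
    return out_name, kept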