コード例 #1
0
def parallel_density_sampling(filenames, outputfilename, feature_order, truth_idx, density_threshold, p, num_bins, num_parallel_jobs=8,testMode=False):
    """
    parallel_density_sampling:
    Grid based density sampling. The feature space is divided into a grid and
    lines are picked probabilistically from the cells that hold enough points.

    Steps, as performed below:
      1. The input files are split through dO.createSplitFolder and every
         resulting file is loaded with load_tsv.load_tsv_features_truth.
      2. compute_state_space is run in parallel on the features of each file;
         its per-file minimums and maximums are merged into one global
         minimum/maximum per feature.
      3. step_size = (maximum - minimum) / num_bins is computed and
         grid_sampling is run in parallel on every file.
      4. The per-file cell counts and per-cell text lines are merged.
      5. For every cell whose merged count is >= density_threshold, each of its
         lines is written to the output file with probability p.

    In a way this is similar to kernel based density estimation where the
    kernels do not overlap; a regular kernel density estimation is not feasible
    in such a high dimensional space.

    filenames : list of data sets to be used (handed to dO.createSplitFolder)
    outputfilename : output filename including path, opened for writing
    feature_order : features to be used, forwarded to the TSV loader
    truth_idx : index of the data labels in the data set, forwarded to the
                TSV loader
    density_threshold : minimum merged count a grid cell must reach (>=) for
                        its lines to be eligible for sampling
    p : probability with which each line of an eligible cell is written; a
        warning is printed (nothing is raised) when it is outside [0, 1]
    num_bins : divisor used to derive the per-feature step size; also
               forwarded to grid_sampling
    num_parallel_jobs : number of parallel jobs, the default is 8
    testMode : forwarded unchanged to dO.createSplitFolder

    Returns [outputfilename, count] where count is the number of lines written.
    """

    folderstoDelete, files = dO.createSplitFolder(filenames, testMode=testMode)
    filenames = files

    count = 0
    try:
        # Instrumentation code
        log = ins.instrumento()
        log.act("start density sampling")
        log.params(filenames, outputfilename, feature_order, truth_idx, density_threshold, p, num_bins)

        # The check does not depend on the file being processed, so it is done
        # once up front. It only warns; processing continues as before.
        if p < 0 or p > 1:
            print("Invalid starting probability")
            sys.stdout.flush()

        # parallel get mins and maxs
        print("Calculating mins and maxs")
        job_args = []
        for filename in filenames:
            print("working on file %s" % (filename))
            features, _ = load_tsv.load_tsv_features_truth(filename, feature_order, truth_idx)
            # Each job argument is the bare features object (not a tuple).
            job_args.append(features)

        # This section of the code extracts minimums and maximums over the
        # features on small pieces of data. Each result is indexed below as
        # result[0] -> minimums and result[1] -> maximums.
        job_results = parallel_jobs.parallel_jobs(compute_state_space, job_args, num_parallel_jobs)

        # merge mins and maxs of each feature and compute the stepsize
        minimum = numpy.array(job_results[0][0], dtype=float)
        maximum = numpy.array(job_results[0][1], dtype=float)

        print("printing mins")
        sys.stdout.flush()
        for result in job_results:
            for i in range(len(minimum)):
                if result[0][i] < minimum[i]:
                    minimum[i] = result[0][i]
                if result[1][i] > maximum[i]:
                    maximum[i] = result[1][i]
        step_size = (maximum - minimum) / num_bins

        # put the samples in the grids and collect all the grids
        print("starting sampling")
        job_args = []
        for filename in filenames:
            features, _ = load_tsv.load_tsv_features_truth(filename, feature_order, truth_idx)
            job_args.append((features, minimum, maximum, step_size, num_bins, filename))
        job_results2 = parallel_jobs.parallel_jobs(grid_sampling, job_args, num_parallel_jobs)

        print("merging output")
        sys.stdout.flush()
        # merge the grids: element [0] of each result maps cell key -> count,
        # element [1] maps cell key -> text lines of that cell. The first
        # result is used as the accumulator.
        grid = job_results2[0][0]
        text_lines = job_results2[0][1]
        for cell_counts, cell_lines in job_results2[1:]:
            for key in cell_counts:
                try:
                    # cell already known: add the counts and join the lines
                    grid[key] = grid[key] + cell_counts[key]
                    text_lines[key] = text_lines[key] + cell_lines[key]
                except KeyError:
                    # first time this cell is seen
                    grid[key] = cell_counts[key]
                    text_lines[key] = cell_lines[key]

        print("sampling")
        sys.stdout.flush()
        # randomly sample lines from the cells that are dense enough; the
        # context manager closes the file even if writing fails
        with open(outputfilename, "w") as outputfile:
            for key in grid:
                if grid[key] >= density_threshold:
                    for line in text_lines[key]:
                        if random.random() < p:
                            count = count + 1
                            outputfile.write(line)

        print("completed output")
        sys.stdout.flush()
    finally:
        # remove the temporary split folders on success and on failure
        for folder in folderstoDelete:
            shutil.rmtree(folder)

    log.act("end parallel density sampling")
    return [outputfilename, count]
コード例 #2
0
def parallel_random_sampling(p, filenames,outputFilename, num_parallel_jobs=8):
    """
    parallel_random_sampling
    This function performs random sampling on the given files. The input is
    split with dO.createSplitFolder, random_sampling is run in parallel on
    every split, and the per-split sample files are merged into outputFilename
    with dO.combiner. The temporary folders created along the way are removed
    before returning, also when an error occurs.

    p                 : probability of each sample from each file being chosen;
                        expected in (0.0, 1.0]. An out-of-range value only
                        prints a warning, processing continues.
    filenames         : list of files (strings) to open and sample from
    outputFilename    : file the combined random sample is written to
    num_parallel_jobs : number of parallel jobs, the default is 8

    Note, we are not guaranteeing that p*filesize samples will be chosen.

    Returns the list of results produced by the parallel random_sampling jobs;
    the first element of each result is the sample file of that job.
    """
    log = ins.instrumento()
    log.act("start random sampling")
    log.params(p, filenames)
    # Matches the interval stated in the message: 1.0 is valid, 0.0 is not.
    if not (0.0 < p <= 1.0):
        print("Invalid probability, must be (0.0, 1.0]")

    folderstoDelete, files = dO.createSplitFolder(filenames)

    try:
        # The working folder is derived from the first input file only, so
        # this presumably works as intended only for a single input location
        # (the original comment was truncated) -- TODO confirm.
        path = dO.createFolder(filenames[0])
        folderstoDelete.append(path)

        job_args = [(p, filename, path) for filename in files]
        job_results = parallel_jobs.parallel_jobs(random_sampling, job_args, num_parallel_jobs)

        # first element of every job result is the file holding its sample
        randFiles = [result[0] for result in job_results]

        dO.combiner(randFiles, outputFilename)
        log.act("end random sampling")
    finally:
        # remove the split folders and the working folder on success and on
        # failure
        for folder in folderstoDelete:
            shutil.rmtree(folder)

    return job_results