def parallel_maxent_uncertainty_sampling(model, filenames, 
                                         feature_order, truth_idx, 
                                         p, number_parallel_jobs=8):
    
    log=ins.instrumento()
    log.act("start maxent_uncertainty sampling")
    log.params(model, filenames,feature_order, truth_idx, p)
    if p > 1.0 or p < 0.0:
        raise ValueError("Invalid probability, must be in [0.0, 1.0]")
        
    job_args = []
    for filename in filenames: 
        features, truth = load_tsv.load_tsv_features_truth(filename, 
                                                           feature_order, 
                                                           truth_idx)
        job_args.append((model, features, int(p*len(features)),filename))

    job_results = parallel_jobs.parallel_jobs(class_maxentunc_sampling, 
                                              job_args, number_parallel_jobs)
    log.act("end maxent_uncertainty sampling")
    return job_results
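
#Example usage (a sketch, not part of the module's tested code; assumes the data
#files below exist and that any classifier exposing predict_proba, such as
#sklearn's LogisticRegression, is acceptable as the maxent model here):
#from sklearn import linear_model
#features, truth = load_tsv.load_tsv_features_truth("data/part1.tsv", [0,1,2], 3)
#clf = linear_model.LogisticRegression()
#clf.fit(features, truth)
#vals = parallel_maxent_uncertainty_sampling(clf,
#                                            ["data/part1.tsv", "data/part2.tsv"],
#                                            [0,1,2], 3, 0.1,
#                                            number_parallel_jobs=8)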


#features, truth = load_tsv.load_tsv_features_truth("data/part1.tsv", [0,1,2], 3)
#linreg = linear_model.LinearRegression()
#linreg.fit(features, truth)
#vals = parallel_reg_uncertainty_sampling(linreg,
#                                         ["data/part1.tsv", "data/part2.tsv"],
#                                         [0,1,2], 3,
#                                         1.0, 1.0,
#                                         number_parallel_jobs=8)
    
    
if __name__=="__main__":
    
    #In progress
    
    import load_tsv
    import numpy
    from sklearn import linear_model
    import uncertainty_sampling
    
    features, truth = load_tsv.load_tsv_features_truth("data/part1.tsv",[0,1,2],3)
    linreg = linear_model.LinearRegression()
    linreg.fit(features, truth)
    vals = uncertainty_sampling.parallel_reg_uncertainty_sampling(linreg,
                                                                  ["data/part1.tsv", "data/part2.tsv"],
                                                                  [0,1,2], 3, 0.1, 1)
    print(vals)
    
def parallel_density_sampling(filenames, outputfilename, feature_order, truth_idx, density_threshold, p, num_bins, num_parallel_jobs=8, testMode=False):
    """
    parallel_density_sampling:
    Grid-based density sampling: divide the feature space into a grid of cells and pick points
    probabilistically in proportion to the number of points in each sufficiently dense cell.
    In more detail, the function processes the files passed in. First the per-feature minimum
    and maximum are computed for each file, then the results are combined. The hyper-dimensional
    space of each file is segmented into cells and the number of data points falling into each
    cell is counted. Finally the cells and their counts are merged across files. This is similar
    to kernel density estimation with a non-overlapping kernel; full kernel density estimation
    is not feasible in this high-dimensional setting.

    filenames : list of data set files to be used
    outputfilename : output filename, including path
    feature_order : indexes of the features to be used
    truth_idx : index of the data labels in the data set
    density_threshold : minimum number of points a grid cell must contain for its points to be sampled
    p : probability with which each line from an eligible cell is written to the output file
    num_bins : number of bins per feature dimension used to build the grid
    num_parallel_jobs : number of parallel jobs, the default is 8
    testMode : passed through to dO.createSplitFolder when splitting the input files
    """
    
    folderstoDelete, files = dO.createSplitFolder(filenames, testMode=testMode)

    filenames = files
    #Instrumentation code
    log = ins.instrumento()
    log.act("start density sampling")
    log.params(filenames, outputfilename, feature_order, truth_idx, density_threshold, p, num_bins)
    
    #validate the sampling probability once, before any work is done
    if p < 0.0 or p > 1.0:
        raise ValueError("Invalid starting probability, must be in [0.0, 1.0]")

    #parallel get mins and maxs
    print("Calculating mins and maxs")
    job_args = []
    for filename in filenames:
        print("working on file %s" % (filename))
        features, truth = load_tsv.load_tsv_features_truth(filename, feature_order, truth_idx)
        job_args.append((features))
        
    #This section of the code extracts minimums and maximums over the features on 
    #small pieces of data
    job_results = parallel_jobs.parallel_jobs(compute_state_space,job_args,num_parallel_jobs)
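    #Each entry of job_results is expected to be a (mins, maxs) pair of per-feature
    #vectors for one file's data; that is how the merge below consumes it.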
    
    #merge mins and maxs of each feature and compute the stepsize
    minimum = numpy.zeros(len(job_results[0][0]))
    maximum = numpy.zeros(len(job_results[0][1]))
    for i in range(len(minimum)):
        minimum[i] = job_results[0][0][i]
        maximum[i] = job_results[0][1][i]
        
    print "printing mins"
    sys.stdout.flush()
    for result in job_results:
        for i in range(len(minimum)):
            if result[0][i] < minimum[i]:
                minimum[i] = result[0][i]
            if result[1][i] > maximum[i]:
                maximum[i] = result[1][i]
    step_size = (maximum-minimum)/num_bins
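    #step_size[j] is the width of a grid cell along feature j; grid_sampling is
    #assumed to bin each point x with something like int((x[j]-minimum[j])/step_size[j]).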
    
    #put the samples in the grids and collect all the grids
    print("starting sampling")
    job_args = []
    for filename in filenames:
        features, truth = load_tsv.load_tsv_features_truth(filename, feature_order, truth_idx)
        job_args.append((features, minimum, maximum, step_size, num_bins, filename))
        #grid_sampling((features, minimum, maximum, step_size, num_bins, filename))
    job_results2 = parallel_jobs.parallel_jobs(grid_sampling,job_args,num_parallel_jobs)
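    #Each entry of job_results2 is expected to be a pair of dicts keyed by grid cell:
    #result[0] maps cell -> point count and result[1] maps cell -> list of raw text
    #lines, as consumed by the merge below.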
    
    print "merging output"
    sys.stdout.flush()
    #merge the grids, sample, and print the results
    #outputfile = open(outputfilename,"w")
    grid = job_results2[0][0]
    text_lines = job_results2[0][1]
    for i in range(1,len(job_results2)):
        for key in job_results2[i][0]:
#             if key not in grid:
            try:
                grid[key] = grid[key] + job_results2[i][0][key]
                text_lines[key] = text_lines[key]+job_results2[i][1][key]
            except KeyError:
#             else: #must merge two counts and two lists
                grid[key] = job_results2[i][0][key]
                text_lines[key] = job_results2[i][1][key] 
                
    
    print "sampling"
    sys.stdout.flush()
    #randomly sample proportional to the set in the grid cell
    count = 0
    outputfile = open(outputfilename,"w")
    
    
    for key in grid:
        if grid[key] >= density_threshold:
            for line in text_lines[key]:
                if random.random() < p:
                    count = count+1
                    outputfile.write(line)
                    outputfile.flush()
                    
    print "completed output"
    sys.stdout.flush()
    outputfile.close()
    
    for i in folderstoDelete:
        shutil.rmtree(i)
    
    log.act("end parallel density sampling")
    return [outputfilename, count]
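
#Example usage (a sketch; the output path and the threshold/bin values below are
#illustrative, not defaults taken from this module):
#result = parallel_density_sampling(["data/part1.tsv", "data/part2.tsv"],
#                                   "data/density_sample.tsv",
#                                   [0,1,2], 3,
#                                   density_threshold=5, p=0.1, num_bins=10,
#                                   num_parallel_jobs=8)
#result is [outputfilename, count], where count is the number of lines written.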