Example 1
# Project-local helper modules (instrumento, load_tsv, parallel_jobs) are
# assumed to be importable, as in the later examples.
import instrumento as ins
import load_tsv
import parallel_jobs

def parallel_inbox_sampling(bottom_left, top_right, p, filenames, num_parallel_jobs=8):
    log=ins.instrumento()
    log.act("start inbox sampling")
    log.params(bottom_left, top_right, p, filenames)
    if p > 1.0 or p <= 0.0:
        raise ValueError("Invalid probability, must be in (0.0, 1.0]")

    job_args = []
    for filename in filenames:
        job_args.append((bottom_left, top_right, p, filename))
            
    job_results = parallel_jobs.parallel_jobs(inbox_sampling, job_args, num_parallel_jobs)
    log.act("end inbox sampling")
    return job_results
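# Hypothetical usage of parallel_inbox_sampling, in the same commented-call
# style used elsewhere in these examples; the bounding-box corners, sampling
# probability, and file names below are placeholders, not from the source.
# results = parallel_inbox_sampling((0.0, 0.0), (10.0, 10.0), 0.5,
#                                   ["part_00.tsv", "part_01.tsv"])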
def parallel_maxent_uncertainty_sampling(model, filenames, 
                                         feature_order, truth_idx, 
                                         p, number_parallel_jobs=8):
    
    log=ins.instrumento()
    log.act("start maxent_uncertainty sampling")
    log.params(model, filenames,feature_order, truth_idx, p)
    if p >= 1.0 or p <= 0.0:
        raise ValueError("Invalid probability, must be in (0.0, 1.0)")
        
    job_args = []
    for filename in filenames: 
        features, truth = load_tsv.load_tsv_features_truth(filename, 
                                                           feature_order, 
                                                           truth_idx)
        job_args.append((model, features, int(p*len(features)),filename))

    job_results = parallel_jobs.parallel_jobs(class_maxentunc_sampling, 
                                              job_args, number_parallel_jobs)
    log.act("end maxent_uncertainty sampling")
    return job_results
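A hypothetical usage sketch, assuming a trained scikit-learn classifier and TSV files whose first three columns are features and whose fourth column is the label; the model, file list, and column layout are placeholders, not from the source:

# Hypothetical usage; all values below are illustrative assumptions.
if __name__ == "__main__":
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression()
    # ...fit the model on an initial labeled set before sampling...
    files = ["part_00.tsv", "part_01.tsv"]
    results = parallel_maxent_uncertainty_sampling(model, files,
                                                   feature_order=[0, 1, 2],
                                                   truth_idx=3, p=0.1)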
Example 3
#Test random sampling
import sys
sys.path.append("/Users/ingenia/git/instrumento/")
sys.path.append("/Users/ingenia/git/utilityFuncs/")
import random_sampling
import instrumento as ins
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from scipy.stats.mstats import mode
from sklearn.metrics import confusion_matrix as cm

log=ins.instrumento(filename="./pilotTest3.txt",printout=True)


log.act("start")


path="/Users/ingenia/git/data/data_sampling/"
dataFilename=path+"user_bot_data.tsv"
randFilename=path+"rand_sample.tsv"
botsFilename=path+"bots_sample.tsv"
fullRandFilename=path+"full_rand_sample.tsv"

# log.act("initial data exploration")
# data=np.genfromtxt(dataFilename,delimiter="\t")
# log.sum(data.shape,"data set size")
# log.sum(np.sum(data[:,1]),"number of bots")
# log.sum(data.shape[0]-np.sum(data[:,1]),"number of people")
# 
# ids=np.argwhere(np.isnan(data))
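A runnable version of the commented exploration above, under the same assumptions carried over from the comments (tab-separated data with the bot label in column 1):

# Sketch of the commented exploration above; assumes tab-separated data with
# the bot label in column 1, as in the comments.
log.act("initial data exploration")
data = np.genfromtxt(dataFilename, delimiter="\t")
log.sum(data.shape, "data set size")
log.sum(np.sum(data[:, 1]), "number of bots")
log.sum(data.shape[0] - np.sum(data[:, 1]), "number of people")

# Indices of NaN entries, e.g. to inspect or drop before modeling
ids = np.argwhere(np.isnan(data))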
Example 4
def parallel_density_sampling(filenames, outputfilename, feature_order, truth_idx,
                              density_threshold, p, num_bins, num_parallel_jobs=8,
                              testMode=False):
    """
    parallel_density_sampling:
    Grid-based sampling: divide the state space into a grid and pick points
    probabilistically relative to the number of points in each cell. The
    function processes the files passed to it in three stages. First, the
    minimum and maximum of each feature are computed per file and then
    combined. Next, the hyper-dimensional space is segmented into cells and
    the number of data points falling into each cell is counted per file.
    Finally, the per-file cells and counts are merged. This is similar to
    kernel density estimation with a non-overlapping kernel; actual kernel
    density estimation is not feasible in a space of this dimensionality.

    filenames : list of data sets to be used
    outputfilename : output filename, including path
    feature_order : features to be used
    truth_idx : index of the data labels in the data set
    density_threshold : minimum count a cell must reach for its points to be
        eligible for sampling
    p : probability with which each line of an eligible cell is written out
    num_bins : number of bins per feature dimension
    num_parallel_jobs : number of parallel jobs; the default is 8
    testMode : passed through to dO.createSplitFolder
    """
    
    folderstoDelete,files=dO.createSplitFolder(filenames,testMode=testMode)
    
    filenames=files
    #Instrumentation code
    log=ins.instrumento()
    log.act("start density sampling")
    log.params(filenames,outputfilename,feature_order,truth_idx,density_threshold,p,num_bins)
    
    #parallel get mins and maxs
    print("Calculating mins and maxs")
    if p < 0 or p > 1:
        raise ValueError("Invalid starting probability, must be in [0, 1]")
    job_args = []
    for filename in filenames:
        print("working on file %s"%(filename))
        features, truth = load_tsv.load_tsv_features_truth(filename, feature_order, truth_idx)
        #compute_state_space takes the feature array directly
        job_args.append(features)
        
    #This section of the code extracts minimums and maximums over the features on 
    #small pieces of data
    job_results = parallel_jobs.parallel_jobs(compute_state_space,job_args,num_parallel_jobs)
    
    #merge mins and maxs of each feature and compute the step size
    minimum = numpy.array(job_results[0][0], dtype=float)
    maximum = numpy.array(job_results[0][1], dtype=float)
        
    print "printing mins"
    sys.stdout.flush()
    for result in job_results:
        for i in range(len(minimum)):
            if result[0][i] < minimum[i]:
                minimum[i] = result[0][i]
            if result[1][i] > maximum[i]:
                maximum[i] = result[1][i]
    step_size = (maximum-minimum)/num_bins
    
    #put the samples in the grids and collect all the grids
    print("starting sampling")
    job_args = []
    for filename in filenames:
        features, truth = load_tsv.load_tsv_features_truth(filename, feature_order, truth_idx)
        job_args.append((features, minimum, maximum, step_size, num_bins, filename))
        #grid_sampling((features, minimum, maximum, step_size, num_bins, filename))
    job_results2 = parallel_jobs.parallel_jobs(grid_sampling,job_args,num_parallel_jobs)
    
    print "merging output"
    sys.stdout.flush()
    #merge the grids, sample, and print the results
    #merge the grids, sample, and print the results
    grid = job_results2[0][0]
    text_lines = job_results2[0][1]
    for i in range(1, len(job_results2)):
        for key in job_results2[i][0]:
            try:
                #key already seen: merge the two counts and the two line lists
                grid[key] = grid[key] + job_results2[i][0][key]
                text_lines[key] = text_lines[key] + job_results2[i][1][key]
            except KeyError:
                grid[key] = job_results2[i][0][key]
                text_lines[key] = job_results2[i][1][key]
                
    
    print "sampling"
    sys.stdout.flush()
    #randomly sample proportional to the set in the grid cell
    count = 0
    outputfile = open(outputfilename,"w")
    for key in grid:
        if grid[key] >= density_threshold:
            for line in text_lines[key]:
                if random.random() < p:
                    count = count+1
                    outputfile.write(line)
                    outputfile.flush()
                    
    print "completed output"
    sys.stdout.flush()
    outputfile.close()
    
    for i in folderstoDelete:
        shutil.rmtree(i)
    
    log.act("end parallel density sampling")
    return [outputfilename, count]
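For reference, a minimal, self-contained sketch of the grid-binning idea used above; this is independent of the project's compute_state_space and grid_sampling workers, which are not shown in this example:

# Standalone illustration of grid binning; not the project's implementation.
import numpy as np
from collections import defaultdict

def grid_counts(features, num_bins):
    minimum = features.min(axis=0)
    maximum = features.max(axis=0)
    step = (maximum - minimum) / num_bins
    step[step == 0] = 1.0  # guard against constant features
    grid = defaultdict(int)
    for row in features:
        # per-dimension cell index, clipped so maxima fall in the last bin
        idx = np.minimum(((row - minimum) / step).astype(int), num_bins - 1)
        grid[tuple(idx)] += 1
    return grid

counts = grid_counts(np.random.RandomState(0).normal(size=(1000, 3)), 10)
print("%d occupied cells out of %d" % (len(counts), 10 ** 3))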
Example 5
if __name__=="__main__":
    print("Testing density sampling")
    import load_tsv
    import numpy
    from sklearn.svm import SVC
    import density_sampling
    import instrumento as ins
    import os

    runningAt="old"

#     path="/home/julian/data/"
#     path="/Users/ingenia/git/data/data_sampling/"
    path="/home/julian/Dropbox/data/"

    log=ins.instrumento(path=path,logname="log.txt")
    originalFile=["%s/user_bot_data.tsv"%(path)]
    outputFile="%sdata_density_sample.tsv"%(path)
    
    from os.path import expanduser
    home = expanduser("~")
Example 6
import sklearn
import density_sampling as ds
import query_by_committee_sampling as qbc
import random_sampling as rs
import load_tsv
import splitter as spi
import pandas
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import pickle
import os
import csv
import dataGraphs as dG
import instrumento as ins

log=ins.instrumento(filename="./pilotTest1.txt")
log.act("start")
dataFilename="/Users/ingenia/git/data/data_sampling/user_bot_data.tsv"
randomFilename="/Users/ingenia/git/data/data_sampling/random_sample_p2.tsv"
botsFilename="/Users/ingenia/git/data/data_sampling/botsData.tsv"
ranplusbotFilename="/Users/ingenia/git/data/data_sampling/ranplusbot.tsv"

# file=open(dataFilename,'r')
# count=0
# data=[]
# for i in file:
#     data.append( [float(i2) for i2 in i.split("\t")])
#     count+=1
#     if count>=5000:
#         break
# file.close()
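The commented block above reads the first 5000 rows by hand; since pandas is already imported, the same peek can be sketched as follows (dataFilename and the 5000-row cap are taken from the comments):

# Equivalent of the commented manual read above, using pandas.
data = pandas.read_csv(dataFilename, sep="\t", header=None, nrows=5000)
print(data.shape)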
Example 7
def parallel_random_sampling(p, filenames, outputFilename, num_parallel_jobs=8):
    """
    parallel_random_sampling
    This function performs random sampling on a set of files and produces
    a random sample. The random sample file is stored in the same
    folder as the data, with the prefix -rand.

    p              : probability of each sample from each file being chosen
    filenames      : list of paths of the files to open and sample from
    outputFilename : path of the combined output file

    Note: there is no guarantee that exactly p*filesize samples will be chosen.
    """
    log=ins.instrumento()
    log.act("start random sampling")
    log.params(p, filenames)
    if p > 1.0 or p <= 0.0:
        raise ValueError("Invalid probability, must be in (0.0, 1.0]")
    
    folderstoDelete,files=dO.createSplitFolder(filenames)

    #Because this uses only filenames[0], the function currently only works
    #when all input files live in a single folder
    path=dO.createFolder(filenames[0])
    folderstoDelete.append(path)

    job_args = []
    for filename in files:
        job_args.append((p,filename,path))

    job_results = parallel_jobs.parallel_jobs(random_sampling,job_args,num_parallel_jobs)

    randFiles=[ i[0] for i in job_results]

    dO.combiner(randFiles,outputFilename)
    log.act("end random sampling")

    for i in folderstoDelete:
        shutil.rmtree(i)

    #TODO: consider erasing all of the files in path once done,
    #and also all of the intermediate random-sample files

    return job_results
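A hypothetical call, reusing paths from the other examples as placeholders:

# Hypothetical usage; the paths below are placeholders from the other examples.
if __name__ == "__main__":
    inputs = ["/Users/ingenia/git/data/data_sampling/user_bot_data.tsv"]
    output = "/Users/ingenia/git/data/data_sampling/random_sample.tsv"
    results = parallel_random_sampling(0.2, inputs, output, num_parallel_jobs=4)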
Example 8
import sklearn
import density_sampling as ds
import query_by_committee_sampling as qbc
import random_sampling as rs
import load_tsv
import splitter as spi
import pandas
import matplotlib.pyplot as plt
from sklearn import svm
import os
import instrumento as ins


path="/Users/ingenia/git/sampling"
dataFilename="/Users/ingenia/git/data/data_sampling/user_bot_data.tsv"
log=ins.instrumento(filename=path+"/pilotTest1.txt")
splitPath="/Users/ingenia/git/data/data_sampling/splits"
random_sample_filename="/Users/ingenia/git/data/data_sampling/random_sample.tsv"

prefix='split'
lines=10000

log.act("start")

# log.act("loading data")
# features,labels=load_tsv.load_tsv_features_truth(dataFilename,range(2,29),1)
# print(features.shape)


# df=pandas.DataFrame(features[1:1000,:])
# log.act("getting description of the data")