def parallel_inbox_sampling(bottom_left, top_right, p, filenames, num_parallel_jobs=8):
    """
    parallel_inbox_sampling
    Runs inbox_sampling over every file in filenames using parallel jobs.

    bottom_left       : forwarded unchanged to inbox_sampling (presumably the
                        lower corner of the sampling box -- confirm against
                        inbox_sampling)
    top_right         : forwarded unchanged to inbox_sampling (presumably the
                        upper corner of the sampling box -- confirm against
                        inbox_sampling)
    p                 : sampling probability, expected to lie in (0.0, 1.0]
    filenames         : list of files, one job is created per file
    num_parallel_jobs : Number of parallel jobs, the default is 8

    Returns whatever parallel_jobs.parallel_jobs returns for the submitted jobs.

    Note: an out-of-range p only produces a printed warning; the jobs are
    still submitted, as they were before.
    """
    log = ins.instrumento()
    log.act("start inbox sampling")
    log.params(bottom_left, top_right, p, filenames)

    # The message documents the accepted range as (0.0, 1.0]; the previous
    # check warned on p == 1.0 and stayed silent on p == 0.0.
    if not (0.0 < p <= 1.0):
        print("Invalid probability, must be (0.0, 1.0]")

    job_args = [(bottom_left, top_right, p, filename) for filename in filenames]
    job_results = parallel_jobs.parallel_jobs(inbox_sampling, job_args, num_parallel_jobs)

    log.act("end inbox sampling")
    return job_results
def parallel_maxent_uncertainty_sampling(model, filenames, feature_order, truth_idx, p, number_parallel_jobs=8):
    """
    parallel_maxent_uncertainty_sampling
    Runs class_maxentunc_sampling over every file in filenames using parallel
    jobs.

    model                : forwarded unchanged to class_maxentunc_sampling
    filenames            : list of data files, one job is created per file
    feature_order        : features to load, passed to
                           load_tsv.load_tsv_features_truth
    truth_idx            : index of the labels, passed to
                           load_tsv.load_tsv_features_truth
    p                    : fraction of each file to request; the per-file
                           sample size is int(p * number_of_rows)
    number_parallel_jobs : Number of parallel jobs, the default is 8

    Returns whatever parallel_jobs.parallel_jobs returns for the submitted jobs.

    Note: an out-of-range p only produces a printed warning; the jobs are
    still submitted, as they were before.
    """
    log = ins.instrumento()
    log.act("start maxent_uncertainty sampling")
    log.params(model, filenames, feature_order, truth_idx, p)

    # Both endpoints are accepted by this check, so the message now says so
    # (it previously claimed the open interval (0.0,1.0)).
    if p > 1.0 or p < 0.0:
        print("Invalid probability, must be [0.0, 1.0]")

    job_args = []
    for filename in filenames:
        # Only the feature matrix is needed here; the labels are discarded.
        features, _ = load_tsv.load_tsv_features_truth(filename, feature_order, truth_idx)
        sample_size = int(p * len(features))
        job_args.append((model, features, sample_size, filename))

    job_results = parallel_jobs.parallel_jobs(class_maxentunc_sampling, job_args, number_parallel_jobs)

    log.act("end maxent_uncertainty sampling")
    return job_results
# Test random sampling
import sys

# These directories are added to the module search path before the imports
# below run (presumably instrumento and random_sampling live there -- confirm).
sys.path.extend([
    "/Users/ingenia/git/instrumento/",
    "/Users/ingenia/git/utilityFuncs/",
])

import random_sampling
import instrumento as ins
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from scipy.stats.mstats import mode
from sklearn.metrics import confusion_matrix as cm

# Instrumentation log for this pilot run.
log = ins.instrumento(filename="./pilotTest3.txt", printout=True)
log.act("start")

# Input data set and the sample files derived from it.
path = "/Users/ingenia/git/data/data_sampling/"
dataFilename = path + "user_bot_data.tsv"
randFilename = path + "rand_sample.tsv"
botsFilename = path + "bots_sample.tsv"
fullRandFilename = path + "full_rand_sample.tsv"

# Disabled initial data exploration, kept for reference:
# log.act("initial data exploration")
# data=np.genfromtxt(dataFilename,delimiter="\t")
# log.sum(data.shape,"data set size")
# log.sum(np.sum(data[:,1]),"number of bots")
# log.sum(data.shape[0]-np.sum(data[:,1]),"number of people")
#
# ids=np.argwhere(np.isnan(data))
def _merge_feature_bounds(bounds_per_file):
    """Combine per-file (minimums, maximums) pairs into global per-feature bounds."""
    minimum = numpy.array(bounds_per_file[0][0], dtype=float)
    maximum = numpy.array(bounds_per_file[0][1], dtype=float)
    for bounds in bounds_per_file[1:]:
        lows = numpy.asarray(bounds[0], dtype=float)
        highs = numpy.asarray(bounds[1], dtype=float)
        # Replace an entry only when the candidate is strictly smaller/larger,
        # exactly like an element-by-element comparison loop would.
        minimum = numpy.where(lows < minimum, lows, minimum)
        maximum = numpy.where(highs > maximum, highs, maximum)
    return minimum, maximum


def _merge_grids(grid_results):
    """Fold per-file (cell counts, cell lines) dictionaries into the first pair."""
    grid = grid_results[0][0]
    text_lines = grid_results[0][1]
    for partial in grid_results[1:]:
        partial_grid = partial[0]
        partial_lines = partial[1]
        for key in partial_grid:
            if key in grid:
                # Cell already known: add the counts and concatenate the lines.
                grid[key] = grid[key] + partial_grid[key]
                text_lines[key] = text_lines[key] + partial_lines[key]
            else:
                grid[key] = partial_grid[key]
                text_lines[key] = partial_lines[key]
    return grid, text_lines


def parallel_density_sampling(filenames, outputfilename, feature_order, truth_idx,
                              density_threshold, p, num_bins,
                              num_parallel_jobs=8, testMode=False):
    """
    parallel_density_sampling:
    Grid based density sampling. The state space is divided into a grid, the
    data points falling into each cell are counted, and lines are then sampled
    from the cells that are dense enough.

    In more detail the function takes the files passed and processes them.
    First the minimum and maximum of each feature is computed for each file
    and the results are combined. After this is done the hyper-dimensional
    space of each file is segmented into cells and the counts of the data
    points falling into each cell are computed. Finally the cells and their
    counts are merged, and every line belonging to a cell whose count is at
    least density_threshold is written to the output with probability p.

    In a way this is similar to kernel based density estimation where the
    kernel has no overlap; since this is in high dimensional space kernel
    density estimation is not feasible.

    filenames         : list of data sets to be used
    outputfilename    : output filename including path
    feature_order     : features to be used
    truth_idx         : Index of the data labels in the data set
    density_threshold : minimum merged count a grid cell must reach for its
                        lines to be eligible for sampling
    p                 : probability with which each line of an eligible cell
                        is written to the output, expected in [0, 1]
    num_bins          : number of bins per feature; the cell width of a
                        feature is (maximum - minimum) / num_bins
    num_parallel_jobs : Number of parallel jobs, the default is 8
    testMode          : forwarded to dO.createSplitFolder

    Returns [outputfilename, count] where count is the number of lines written.

    Note: an out-of-range p only produces a printed warning, as before.
    """
    # The inputs are replaced by the files returned from dO.createSplitFolder
    # (presumably smaller splits of the originals -- confirm); the folders it
    # reports are removed again once sampling is over.
    folderstoDelete, files = dO.createSplitFolder(filenames, testMode=testMode)
    filenames = files

    try:
        # Instrumentation code
        log = ins.instrumento()
        log.act("start density sampling")
        log.params(filenames, outputfilename, feature_order, truth_idx,
                   density_threshold, p, num_bins)

        # Validate once up front instead of once per file.
        if p < 0 or p > 1:
            print("Invalid starting probability")
            sys.stdout.flush()

        # parallel get mins and maxs
        print("Calculating mins and maxs")
        job_args = []
        for filename in filenames:
            print("working on file %s"%(filename))
            features, _ = load_tsv.load_tsv_features_truth(filename, feature_order, truth_idx)
            # NOTE: the features are passed bare, not wrapped in a tuple like
            # the other job argument lists in this function.
            job_args.append(features)

        # This section of the code extracts minimums and maximums over the
        # features on small pieces of data
        job_results = parallel_jobs.parallel_jobs(compute_state_space, job_args, num_parallel_jobs)

        # merge mins and maxs of each feature and compute the stepsize
        minimum, maximum = _merge_feature_bounds(job_results)
        print("printing mins")
        sys.stdout.flush()
        step_size = (maximum - minimum) / num_bins

        # put the samples in the grids and collect all the grids
        print("starting sampling")
        job_args = []
        for filename in filenames:
            # The file is loaded a second time so that the feature matrices of
            # the first pass do not all have to stay alive.
            features, _ = load_tsv.load_tsv_features_truth(filename, feature_order, truth_idx)
            job_args.append((features, minimum, maximum, step_size, num_bins, filename))
        job_results2 = parallel_jobs.parallel_jobs(grid_sampling, job_args, num_parallel_jobs)

        print("merging output")
        sys.stdout.flush()
        # merge the grids, sample, and print the results
        grid, text_lines = _merge_grids(job_results2)

        print("sampling")
        sys.stdout.flush()
        # keep each line of a sufficiently dense cell with probability p
        count = 0
        with open(outputfilename, "w") as outputfile:
            for key in grid:
                if grid[key] >= density_threshold:
                    for line in text_lines[key]:
                        if random.random() < p:
                            count = count + 1
                            outputfile.write(line)

        print("completed output")
        sys.stdout.flush()
    finally:
        # Remove the temporary folders even when a step above failed.
        for folder in folderstoDelete:
            shutil.rmtree(folder)

    log.act("end parallel density sampling")
    return [outputfilename, count]
if __name__ == "__main__":
    print("Testing density sampling")
    import load_tsv
    import numpy
    from sklearn.svm import SVC
    import density_sampling

    # NOTE(review): this block contained unresolved git merge-conflict markers
    # (HEAD vs 13fc38f062c685252ce5139518e2b9eb1ed183f7), which are a syntax
    # error. The statements of BOTH sides are kept below; confirm which
    # configuration is actually wanted.

    # --- incoming side (13fc38f) ---
    import os
    import numpy as np

    # --- HEAD side ---
    runningAt = "old"
    # path="/home/julian/data/"
    # path="/Users/ingenia/git/data/data_sampling/"
    path = "/home/julian/Dropbox/data/"
    # Rebinds the name `ins` from the instrumentation module to a logger
    # instance, exactly as the original statement did.
    ins = ins.instrumento(path=path, logname="log.txt")
    # features, truth = load_tsv.load_tsv_features_truth("/Users/ingenia/git/data/data_sampling/user_bot_data.tsv",[0,1,2],3)
    # originalFile=["/Users/ingenia/git/data/data_sampling/user_bot_data.tsv"]
    originalFile = ["%s/user_bot_data.tsv"%(path)]
    # originalFile=["/home/julian/data/user_bot_data.tsv"]
    # outputFile="/home/julian/data/user_bot_data_density_sample.tsv"
    # outputFile="/Users/ingenia/git/data/data_sampling/data_density_sample.tsv"
    outputFile = "%sdata_density_sample.tsv"%(path)

    # --- common to both sides ---
    from os.path import expanduser
    home = expanduser("~")
import sklearn
import density_sampling as ds
import query_by_committee_sampling as qbc
import random_sampling as rs
import load_tsv
import splitter as spi
import pandas
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import pickle
import os
import csv
import matplotlib.pyplot as plt  # repeated import, kept as in the original
import dataGraphs as dG

# Instrumentation log for this pilot run.
log = ins.instrumento(filename="./pilotTest1.txt")
log.act("start")

# Full data set and the sample files used by this script.
dataFilename = "/Users/ingenia/git/data/data_sampling/user_bot_data.tsv"
randomFilename = "/Users/ingenia/git/data/data_sampling/random_sample_p2.tsv"
botsFilename = "/Users/ingenia/git/data/data_sampling/botsData.tsv"
ranplusbotFilename = "/Users/ingenia/git/data/data_sampling/ranplusbot.tsv"

# Disabled manual loader (first 5000 rows of the data file), kept for reference:
# file=open(dataFilename,'r')
# count=0
# data=[]
# for i in file:
#     data.append( [float(i2) for i2 in i.split("\t")])
#     count+=1
#     if count>=5000:
#         break
# file.close()
def parallel_random_sampling(p, filenames, outputFilename, num_parallel_jobs=8):
    """
    parallel_random_sampling
    This function performs random sampling on the given files and produces a
    single random sample file. The per-piece samples produced by the
    random_sampling jobs are merged into outputFilename with dO.combiner.

    p                 : is the probability of each sample from each file being
                        chosen, expected to lie in (0.0, 1.0]
    filenames         : List of files to be used for the random sampling;
                        filenames are strings of files to open and sample from
    outputFilename    : file the combined random sample is written to
    num_parallel_jobs : Number of parallel jobs, the default is 8

    Returns the list of job results; the first element of each result is used
    as the path of a partial sample file.

    Note, we are not guaranteeing that p*filesize samples will be chosen.
    Note: an out-of-range p only produces a printed warning, as before.
    """
    log = ins.instrumento()
    log.act("start random sampling")
    log.params(p, filenames)

    # The message documents the accepted range as (0.0, 1.0]; the previous
    # check warned on p == 1.0 and stayed silent on p == 0.0.
    if not (0.0 < p <= 1.0):
        print("Invalid probability, must be (0.0, 1.0]")

    # The jobs work on the files returned by dO.createSplitFolder (presumably
    # splits of the inputs -- confirm); the folders it reports are temporary.
    folderstoDelete, files = dO.createSplitFolder(filenames)
    try:
        # Currently because of this line of code it will only work with a
        # single input location: the working folder handed to every job is
        # derived from filenames[0] only.
        path = dO.createFolder(filenames[0])
        folderstoDelete.append(path)

        job_args = [(p, filename, path) for filename in files]
        job_results = parallel_jobs.parallel_jobs(random_sampling, job_args, num_parallel_jobs)

        randFiles = [result[0] for result in job_results]
        dO.combiner(randFiles, outputFilename)
        log.act("end random sampling")
    finally:
        # Remove the temporary folders even when a job or the combiner failed.
        for folder in folderstoDelete:
            shutil.rmtree(folder)

    # TODO
    # Consider erasing all of the files in path once done
    # also erase all of the random files
    return job_results
import sklearn
import density_sampling as ds
import query_by_committee_sampling as qbc
import random_sampling as rs
import load_tsv
import splitter as spi
import pandas
import matplotlib.pyplot as plt
from sklearn import svm
import os
from __builtin__ import file

# Working directory of the sampling project and the full data set.
path = "/Users/ingenia/git/sampling"
dataFilename = "/Users/ingenia/git/data/data_sampling/user_bot_data.tsv"

# Instrumentation log for this pilot run.
log = ins.instrumento(filename=path + "/pilotTest1.txt")

# Settings for the split files and the random sample output
# (presumably consumed by the splitter further down -- not visible here).
splitPath = "/Users/ingenia/git/data/data_sampling/splits"
random_sample_filename = "/Users/ingenia/git/data/data_sampling/random_sample.tsv"
prefix = 'split'
lines = 10000

log.act("start")

# Disabled data loading / description step, kept for reference:
# log.act("loading data")
# features,labels=load_tsv.load_tsv_features_truth(dataFilename,range(2,29),1)
# print(features.shape)
# df=pandas.DataFrame(features[1:1000,:])
# log.act("getting description of the data")