def measure(arg, commandline, delay, maxtime,
            outFile=None, errFile=None, inFile=None, logger=None, affinitymask=None):

    r, w = os.pipe()
    forkedPid = os.fork()

    if forkedPid:  # read pickled measurements from the pipe
        os.close(w)
        rPipe = os.fdopen(r)
        r = cPickle.Unpickler(rPipe)
        measurements = r.load()
        rPipe.close()
        os.waitpid(forkedPid, 0)
        return measurements

    else:
        # Sample thread will be destroyed when the forked process _exits
        class Sample(threading.Thread):

            def __init__(self, program):
                threading.Thread.__init__(self)
                self.setDaemon(1)
                self.timedout = False
                self.p = program
                self.maxMem = 0
                self.childpids = None
                self.start()

            def run(self):
                try:
                    remaining = maxtime
                    while remaining > 0:
                        time.sleep(delay)
                        remaining -= delay
                    else:
                        self.timedout = True
                        os.kill(self.p, signal.SIGKILL)
                except OSError, (e, err):
                    if logger:
                        logger.error('%s %s', e, err)

        try:
            m = Record(arg)

            # only write pickles to the pipe
            os.close(r)
            wPipe = os.fdopen(w, 'w')
            w = cPickle.Pickler(wPipe)

            start = time.time()

            # spawn the program in a separate process
            p = Popen(commandline, stdout=outFile, stderr=errFile, stdin=inFile)

            # start a thread to sample the program's resident memory use
            t = Sample(program=p.pid)

            # wait for program exit status and resource usage
            rusage = os.wait3(0)
            elapsed = time.time() - start

            # summarize measurements
            if t.timedout:
                m.setTimedout()
            elif rusage[1] == os.EX_OK:
                m.setOkay()
            else:
                m.setError()

            m.userSysTime = rusage[2][0] + rusage[2][1]
            m.maxMem = t.maxMem
            m.cpuLoad = "%"  # placeholder: this variant does no per-CPU load sampling
            m.elapsed = elapsed

        except KeyboardInterrupt:
            os.kill(p.pid, signal.SIGKILL)

        except ZeroDivisionError, (e, err):
            if logger:
                logger.warn('%s %s', err, 'too fast to measure?')
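# The fork/pipe/pickle round trip above is easy to miss inside the full
# function. A minimal standalone sketch of the same pattern (Python 2; the
# child must use os._exit so it never falls back into the parent's code path):
import os, cPickle

def roundtrip(payload):
    r, w = os.pipe()
    pid = os.fork()
    if pid:                        # parent: read one pickled object back
        os.close(w)
        rPipe = os.fdopen(r)
        result = cPickle.Unpickler(rPipe).load()
        rPipe.close()
        os.waitpid(pid, 0)
        return result
    else:                          # child: write the pickle, then _exit
        os.close(r)
        wPipe = os.fdopen(w, 'w')
        cPickle.Pickler(wPipe).dump(payload)
        wPipe.close()
        os._exit(0)

print roundtrip({'ok': True})      # prints {'ok': True}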
            trainTargets[count] = ecfps[CID]
            count += 1

    model.fit(trainImages, trainTargets, batch_size=batch_size, nb_epoch=1)

    shuffle(testFs)
    count = 0
    for x in testFs[:chunkSize / 10]:
        if x.find(".png") > -1:
            CID = x[:x.find(".png")]
            image = io.imread(direct + x, as_grey=True)[10:-10, 10:-10]
            image = np.where(image > 0.1, 1.0, 0.0)
            testImages[count, 0, :, :] = image
            testTargets[count] = ecfps[CID]
            count += 1

    preds = model.predict(testImages)
    RMSE = np.sqrt(mean_squared_error(testTargets, preds))
    print "RMSE of epoch: ", RMSE

    if DUMP_WEIGHTS:
        dumpWeights(model)

    with open(folder + "wholeModel.pickle", 'wb') as f:
        cp = cPickle.Pickler(f)
        cp.dump(model)
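# Counterpart sketch (an assumption about how the file above is consumed):
# reloading the whole pickled model for inference. This only works where the
# same Keras version and model classes are importable.
with open(folder + "wholeModel.pickle", 'rb') as f:
    model = cPickle.Unpickler(f).load()
preds = model.predict(testImages)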
def dump(obj, file, protocol=None):
    # NB: the protocol argument is ignored; the module-level PROTOCOL is used.
    pickler = pickle.Pickler(file, protocol=PROTOCOL)
    pickler.persistent_id = _function_pickling_handler
    pickler.dump(obj)
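# The load side has to mirror the persistent_id hook above. A hedged sketch,
# assuming a _function_unpickling_handler (hypothetical name) that inverts
# the ids produced by _function_pickling_handler:
def load(file):
    unpickler = pickle.Unpickler(file)
    unpickler.persistent_load = _function_unpickling_handler
    return unpickler.load()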
        pass
    else:
        assert 0, "internal error - pickling should fail"
    try:
        p.dump(m)
    except pickle.PicklingError:
        pass
    else:
        assert 0, "internal error - pickling should fail"
    del c, m, fp, p

if 1 and cPickle:
    c = Curl()
    m = CurlMulti()
    fp = StringIO()
    p = cPickle.Pickler(fp, 1)
    try:
        p.dump(c)
    except cPickle.PicklingError:
        pass
    else:
        assert 0, "internal error - pickling should fail"
    try:
        p.dump(m)
    except cPickle.PicklingError:
        pass
    else:
        assert 0, "internal error - pickling should fail"
    del c, m, fp, p
def dumps(self, arg, proto=0):
    # cPickle-only behaviour: constructing a Pickler with just a protocol
    # (no file) pickles to an internal buffer, read back with getvalue().
    p = cPickle.Pickler(proto)
    p.dump(arg)
    return p.getvalue()
def __setitem__(self, key, value):
    f = StringIO.StringIO()
    p = pickle.Pickler(f)
    p.dump(value)
    gimp.set_data(key, f.getvalue())
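# Hedged counterpart sketch: the matching __getitem__, assuming gimp.get_data
# is the getter paired with set_data (errors for unknown keys propagate).
def __getitem__(self, key):
    s = gimp.get_data(key)
    f = StringIO.StringIO(s)
    return pickle.Unpickler(f).load()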
def get_requests(files, start=None, end=None,
                 statsfname=None, writestats=None, readstats=None):
    finished = []
    unfinished = {}

    if readstats:
        fp = open(statsfname, 'r')
        u = cPickle.Unpickler(fp)
        requests = u.load()
        fp.close()
        del u
        del fp
    else:
        while 1:
            tup = get_earliest_file_data(files)
            if tup is None:
                break
            code, pid, id, fromepoch, desc = tup
            if start is not None and fromepoch < start:
                continue
            if end is not None and fromepoch > end:
                break
            if code == 'U':
                # restart
                for upid, uid in list(unfinished.keys()):
                    if upid == pid:
                        val = unfinished[(upid, uid)]
                        finished.append(val)
                        del unfinished[(upid, uid)]
                request = StartupRequest()
                request.url = desc
                request.start = fromepoch
                finished.append(request)
                continue
            request = unfinished.get((pid, id))
            if request is None:
                if code != "B":
                    continue  # garbage at beginning of file
                request = Request()
                for pending_req in unfinished.values():
                    pending_req.active = pending_req.active + 1
                unfinished[(pid, id)] = request
            try:
                request.put(code, fromepoch, desc)
            except:
                print "Unable to handle entry: %s %s %s" % (code, fromepoch, desc)
            if request.isfinished():
                del unfinished[(pid, id)]
                finished.append(request)

        finished.extend(unfinished.values())
        requests = finished

    if writestats:
        fp = open(statsfname, 'w')
        p = cPickle.Pickler(fp)
        p.dump(requests)
        fp.close()
        del p
        del fp

    return requests
    # continue with previous protein
    else:
        sequence += line.strip()
p.close()

# pop out first dummy sequence
sequences.pop(0)

# matrix representation of position along a sequence
# where selected k-mers are found, up to m mismatches
hit_matrix = compile_hit_matrix(sequences, kmers, m)

# save compiled data
f = open(data_path + virus_family + '_hitmatrix_collapsed_%d_%d.pkl' % (k, m), 'w')
cPickle.Pickler(f, protocol=2).dump(hit_matrix)
cPickle.Pickler(f, protocol=2).dump(viruses)
cPickle.Pickler(f, protocol=2).dump(classes)
f.close()

# group viruses with similar hosts together
sort_indices = hit_matrix[:, 0].argsort()
sort_virus_id = [viruses[i] for i in sort_indices]
sort_viruses = [classes[v][0] for v in sort_virus_id]

# plot and save the visualization
figure = plot_hit_matrix(hit_matrix[sort_indices, :], k, m, kmer_list)
filename = '%s/fig/%s_protein_kmer_visualization_collapsed_%d_%d.eps' % (
    project_path, virus_family, k, m)
figure.savefig(filename, dpi=500, format='eps')  # was savefig(fname, ...): fname is undefined
def pickle(self):
    f = open(self.fname, "w")
    p = cPickle.Pickler(f)
    obj = (self.dbinterfaces, self.dbroutes, self.dbroutelist)
    p.dump(obj)
    f.close()
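# Hedged counterpart sketch: restoring the same three tables. Packing them
# into one tuple (as above) means a single load() round-trips all of them.
def unpickle(self):
    f = open(self.fname, "r")
    (self.dbinterfaces, self.dbroutes, self.dbroutelist) = cPickle.Unpickler(f).load()
    f.close()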
def adaboost(X, Y, x, y, predicted_labels, test_indices, params, kmer_dict,
             model='stump', virus_family='picorna'):
    """
    Input:
        X : DxN array (Train data)
        Y : KxN array (Train labels)
        x : Dxn array (Test data)
        y : Kxn array (Test labels)
        predicted_labels :
        test_indices :
        params : tuple (fold index, kmer length, mismatch, num of boosting rounds)
        kmer_dict : a dictionary mapping row id to kmers
        model : string, can be "tree" or "stump"
        virus_family : 'picorna' / 'rhabdo'
    """
    X = X.astype('float')
    Y = Y.astype('float')
    (D, N) = X.shape
    K = Y.shape[0]
    n = x.shape[1]
    test_indices.sort()

    f = params[0]
    k = params[1]
    m = params[2]
    T = params[3]

    # creating output files:
    #   onfname - test/train errors and the selected feature at each round
    #   tnfname - the decision tree after T rounds of boosting
    #   dfname  - a general dump of the test/train predictions for all T rounds
    filetag = model + '_%d_%d_%d' % (k, m, f)
    onfname = project_path + 'cache/%s_temp/%s_error%s.txt' % (
        virus_family, virus_family, filetag)
    tnfname = project_path + 'cache/%s_temp/%s_decision%s.pkl' % (
        virus_family, virus_family, filetag)
    dfname = project_path + 'cache/%s_temp/%s_dump%s.pkl' % (
        virus_family, virus_family, filetag)

    # Initializing weight over examples - uniform distribution
    w = np.ones(Y.shape, dtype='float') / (N * K)

    # Data structures to store output from boosting at each round:
    #   dectree     - a list of all nodes (and their attributes) in the decision tree
    #   Tpred/tpred - the output of the decision tree at each round (train/test samples)
    Phidict = dict()
    phidict = dict()
    dectree = dict()
    order = []
    Tpred = np.zeros((K, N, T + 1), dtype='float32')
    tpred = np.zeros((K, n, T + 1), dtype='float32')
    rocacc = np.zeros((T + 1, 5), dtype='float32')

    starttime = time.time()

    # root decision function/prediction node;
    # the root decision function always outputs 1
    v = ((w * Y).sum(1) > 0) * 2. - 1.
    v = v.reshape(K, 1)

    # compute cumulative weights
    Yv = Y * v
    Wp = (w * (Yv > 0)).sum()
    Wm = (w * (Yv < 0)).sum()

    # a = coefficient of weak rule
    a = 0.5 * np.log((Wp + EPS) / (Wm + EPS))
    if a < 0:
        a = np.abs(a)
        v = -1 * v

    # update decision tree and prediction list
    Phi = np.ones((1, N), dtype='float32')
    phi = np.ones((1, n), dtype='float32')
    Hweakrule = v * Phi
    hweakrule = v * phi

    # Phidict keys   = feature ids
    # Phidict values = [\phi(x), feature wt, >/< decision, weak rule's output]
    Phidict[-1] = [[Phi, a, Hweakrule]]
    phidict[-1] = [[phi, a, hweakrule]]
    dectree[-1] = [-1, [a, [], v]]

    # compute the prediction output by the decision
    # tree for all train/test samples
    train_pred = np.zeros((K, N), dtype='float32')
    test_pred = np.zeros((K, n), dtype='float32')
    for kidx in Phidict.keys():
        for aidx in range(len(Phidict[kidx])):
            train_pred = train_pred + Phidict[kidx][aidx][1] * Phidict[kidx][aidx][2]
            test_pred = test_pred + phidict[kidx][aidx][1] * phidict[kidx][aidx][2]

    # store the real-valued prediction
    Tpred[:, :, 0] = train_pred
    tpred[:, :, 0] = test_pred

    # compute classification error at round 0
    rocacc[0, 1], rocacc[0, 3] = classification_error(train_pred, test_pred, Y, y, 0.)
    duration = time.time() - starttime

    # write some output to file
    # file format: boosting round, k-mer selected,
    # train roc, train error, test roc, test error, time elapsed
    owrite = open(onfname, 'w')
    to_write = [-1, 'root', 'None', 0.5, rocacc[0, 1], 0.5, rocacc[0, 3], duration]
    owrite.write('\t'.join(map(str, to_write)) + '\n')
    owrite.close()
    print to_write

    # update weights
    wnew = w * np.exp(-a * Hweakrule * Y)
    wnew = wnew / wnew.sum()
    w = wnew.copy()

    # starting boosting rounds
    for t in range(T):
        starttime = time.time()

        # choose the appropriate (leaf + weak rule) for the next prediction function
        #pstar, cstar, pastar, cvalue, Z = py_get_weak_rule(X, Y, Phidict, w, model)
        pstar, cstar, pastar, cvalue, Z = weave_get_weak_rule(
            X, Y, Phidict, w, model)
        PX = (X[cstar:cstar + 1, :] < cvalue) * 1
        px = (x[cstar:cstar + 1, :] < cvalue) * 1
        order.append(t)

        # updating tree and prediction dictionary
        Phidict[t] = []
        phidict[t] = []
        dectree[t] = [[kmer_dict[cstar], cvalue]]
        dectree[pstar][pastar + 1][1].append(t)

        Hweakrule = np.zeros((K, N), dtype='float')
        hweakrule = np.zeros((K, n), dtype='float')
        ans = [0, 1]
        for aidx in ans:
            # compute output of decision function
            Phi = Phidict[pstar][pastar][0] * (aidx + ((-1.) ** aidx) * PX)
            phi = phidict[pstar][pastar][0] * (aidx + ((-1.) ** aidx) * px)

            # calculate optimal value of alpha for that decision
            wYP = w * Y * Phi
            vstar = ((wYP.sum(1) > 0) * 2. - 1.).reshape(K, 1)
            YvP = Y * vstar * Phi
            Wp = (w * (YvP == 1)).sum()
            Wm = (w * (YvP == -1)).sum()
            a = 0.5 * np.log((Wp + EPS) / (Wm + EPS))
            if a < 0:
                a = np.abs(a)
                vstar = -1 * vstar  # was `v = -1 * v`, which flipped the stale root vector

            # compute f(x) = \alpha * \phi(x) * v for each decision node
            Hweakrule += a * vstar * Phi

            # update tree and prediction dictionary
            Phidict[t].append([Phi, a, vstar * Phi])
            phidict[t].append([phi, a, vstar * phi])
            dectree[t].append([a, [], vstar])

        # update example weights
        wnew = w * np.exp(-1. * Hweakrule * Y)
        wnew = wnew / wnew.sum()
        w = wnew.copy()

        # calculate train and test predictions and errors
        train_pred = np.zeros((K, N), dtype='float32')
        test_pred = np.zeros((K, n), dtype='float32')
        for kidx in Phidict.keys():
            for aidx in range(len(Phidict[kidx])):
                train_pred = train_pred + Phidict[kidx][aidx][1] * Phidict[kidx][aidx][2]
                test_pred = test_pred + phidict[kidx][aidx][1] * phidict[kidx][aidx][2]
        Tpred[:, :, t + 1] = train_pred
        tpred[:, :, t + 1] = test_pred

        rocacc[t + 1, 0], rocacc[t + 1, 2], rocacc[t + 1, 4] = roc_auc(
            train_pred, test_pred, Y, y)
        rocacc[t + 1, 1], rocacc[t + 1, 3] = classification_error(
            train_pred, test_pred, Y, y, rocacc[t + 1, 4])
        predicted_labels[test_indices, t] = test_pred.argmax(0)

        duration = time.time() - starttime

        # output data
        owrite = open(onfname, 'a')
        to_write = [t, kmer_dict[cstar], cvalue, rocacc[t + 1, 0],
                    rocacc[t + 1, 1], rocacc[t + 1, 2], rocacc[t + 1, 3], duration]
        owrite.write('\t'.join(map(str, to_write)) + '\n')
        owrite.close()
        print to_write

        # output decision tree
        twrite = open(tnfname, 'w')
        cPickle.Pickler(twrite, protocol=2).dump(dectree)
        cPickle.Pickler(twrite, protocol=2).dump(order)
        twrite.close()

        # dump predictions for more analysis
        dwrite = open(dfname, 'w')
        cPickle.Pickler(dwrite, protocol=2).dump(Tpred)
        cPickle.Pickler(dwrite, protocol=2).dump(tpred)
        cPickle.Pickler(dwrite, protocol=2).dump(rocacc)
        dwrite.close()

    return predicted_labels
def __init__(self):
    self.file = tempfile.TemporaryFile(suffix=".log")
    self.pickler = cPickle.Pickler(self.file, 1)
    # fast mode skips the memo: repeated dump() calls don't accumulate state,
    # but self-referential objects would recurse forever.
    self.pickler.fast = 1
    self.stores = 0
    self.read = 0
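# Reading such a log back: a minimal sketch, assuming each store was written
# with a single pickler.dump() call; successive load() calls walk the stream
# until cPickle signals EOF.
def replay(log):
    log.file.seek(0)
    u = cPickle.Unpickler(log.file)
    while True:
        try:
            yield u.load()
        except EOFError:
            break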
def loadDataMV(self, filename, verbose=True, replace_missing=True):
    '''Get the data from a text file in one of 3 formats:
    matrix, sparse, sparse_binary.'''
    if verbose:
        print("========= Reading " + filename)
    ntime = nnum = ncat = nmvc = 0
    start = time.time()

    # find the type of features for the data set
    dictfeats = self.feat_type
    usetime = np.array(np.where(self.feat_type == 'Time'))[0]
    usenum = np.array(np.where(self.feat_type == 'Numerical'))[0]
    usecat = np.array(np.where(self.feat_type == 'Categorical'))[0]
    usemulticat = np.array(np.where(self.feat_type == 'Multi-value'))[0]
    if verbose:
        print("=== Detected %d Numerical Features" % len(usenum))
        print("=== Detected %d Categorical Features" % len(usecat))
        print("=== Detected %d Multi-valued Categorical Features" % len(usemulticat))
        print("=== Detected %d Time Features" % len(usetime))

    # artificial headers for features
    for i in range(len(dictfeats)):
        dictfeats[i] = str(i)

    # read the first column to identify the total number of samples
    df = pd.read_csv(filename, header=None, names=dictfeats,
                     delim_whitespace=True, usecols=[0],
                     parse_dates=True, na_values='NaN')
    n_samples = len(df.index)
    if verbose:
        print("=== %d Samples will be loaded " % n_samples)
    concadat = np.zeros((n_samples, 1))
    del df

    # check the available types of features
    if verbose:
        print("========================")

    # (time features currently disabled)
    """
    if len(usetime) > 0:
        if verbose:
            print("=== Processing %d Time features " % len(usetime))
        try:
            dftime = pd.read_csv(filename, header=None,
                                 names=self.feat_type[usetime], usecols=usetime,
                                 delim_whitespace=True, parse_dates=True,
                                 na_values='NaN')
            ddt = np.array(dftime)
            ntime = ddt.shape[1]
            concadat = np.concatenate((concadat, ddt), axis=1)
            del dftime
            del ddt
        except:
            print("Failed to load time variables")
    """

    if len(usenum) > 0:  # was `usenum != []`: a numpy array compared to a list
        if verbose:
            print("=== Processing %d Numerical features " % len(usenum))
        try:
            dfnum = pd.read_csv(filename, header=None,
                                names=self.feat_type[usenum], usecols=usenum,
                                delim_whitespace=True, na_values='NaN')
            dd = np.array(dfnum)
            nnum = dd.shape[1]
            concadat = np.concatenate((concadat, dd), axis=1)
            del dfnum
            del dd
        except:
            print("Failed to load numerical variables")

    if len(usecat) > 0:
        # categorical features are loaded as objects for efficiency
        if verbose:
            print("=== Processing %d Categorical features " % len(usecat))
        try:
            dfcat = pd.read_csv(filename, header=None,
                                names=self.feat_type[usecat], usecols=usecat,
                                dtype=object, delim_whitespace=True,
                                na_values='NaN')
            ncat = dfcat.shape[1]
            CAT = dfcat
            # Treat categorical variables as integers or perform hash encoding
            # (one-hot encoding is far more expensive), e.g.:
            # enca = OrdinalEncoder().fit(dfcat)
            # catnumeric_dataset = np.array(enca.transform(dfcat))
            # ncat = catnumeric_dataset.shape[1]
            # concadat = np.concatenate((concadat, catnumeric_dataset), axis=1)
            del dfcat
        except:
            print("Failed to load Categorical variables")
            CAT = []
    else:
        CAT = []

    # (multi-valued categorical features currently disabled)
    """
    if len(usemulticat) > 0:
        if verbose:
            print("=== Processing %d Multi Valued Categorical features " % len(usemulticat))
        try:
            dfmvc = pd.read_csv(filename, header=None,
                                names=self.feat_type[usemulticat],
                                usecols=usemulticat, dtype=object,
                                delim_whitespace=True, na_values='NaN')
            nmvc = dfmvc.shape[1]
            MV = dfmvc
            del dfmvc
        except:
            print("Failed to load Multi-Valued Categorical variables")
            MV = []
    else:
        MV = []
    """
    ntime = 0
    nmvc = 0
    MV = []
    concadat = np.delete(concadat, 0, 1)  # drop the placeholder first column
    self.info['loaded_feat_types'] = [ntime, nnum, ncat, nmvc]

    pickle_path = os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")
    if self.use_pickle and os.path.exists(pickle_path):
        with open(pickle_path, "rb") as pickle_file:  # was "r": must match the "wb" writer
            vprint(verbose, "Loading pickle file : " + pickle_path)
            return pickle.load(pickle_file)

    if 'format' not in self.info.keys():
        self.getFormatData(filename)
    if 'feat_num' not in self.info.keys():
        self.getNbrFeatures(filename)

    dataX = concadat.astype(np.float64).copy(order='C')

    # IMPORTANT: when we replace missing values we double the number of variables
    if self.info['format'] == 'dense' and replace_missing and np.any(np.isnan(dataX)):
        vprint(verbose, "Replace missing values by 0 (slow, sorry)")
        dataX = data_converter.replace_missing(dataX)

    if self.use_pickle:
        with open(pickle_path, "wb") as pickle_file:
            vprint(verbose, "Saving pickle file : " + pickle_path)
            p = pickle.Pickler(pickle_file)
            p.fast = True
            p.dump(dataX)

    end = time.time()
    if verbose:
        print("Loaded %d Samples and %d Features" % (dataX.shape[0], dataX.shape[1]))
        print("[+] Success in %5.2f sec" % (end - start))

    data = {}
    data['numerical'] = dataX
    data['MV'] = MV
    data['CAT'] = CAT
    return data
class People(object):
    """A class to pickle."""

    def __init__(self, imie, nazwisko):
        self.imie = imie
        self.nazwisko = nazwisko

    def __str__(self):
        return "%s %s" % (self.imie, self.nazwisko)


p1 = People('Adam', 'Zazol')
print "Before pickling: %s" % p1

f = open("dane.dat", "wb")    # binary mode, for Windows
p = cPickle.Pickler(f, 2)     # pickle with binary protocol 2
p.dump(p1)                    # dump the object to the file
f.close()                     # close the file

del p1                        # just to prove it is really gone...

f = open("dane.dat", "rb")    # again binary, for Windows
u = cPickle.Unpickler(f)      # the Unpickler detects the protocol itself
p1 = u.load()                 # load the object back
print "Restored from pickle: %s" % p1
f.close()                     # good manners: close your files
def write_metrics(metrics, filename=None, filetype='json'):
    """
    Write metrics to file after running self.run_metrics()

    Input:
    ------
    metrics : dictionary
        Omnical_Metrics.run_metrics() output

    filename : str, default=None
        filename to write out, will use filename by default

    filetype : str, default='json', option=['json', 'pkl']
        specify file format of output metrics file
    """
    # get pols
    pols = list(metrics.keys())

    if filename is None:
        filename = os.path.join(metrics[pols[0]]['filedir'],
                                metrics[pols[0]]['filestem'] + '.omni_metrics')

    # write to file
    if filetype == 'json':
        if filename.split('.')[-1] != 'json':
            filename += '.json'

        # change ndarrays to lists
        metrics_out = copy.deepcopy(metrics)
        # loop over pols
        for h, pol in enumerate(metrics_out.keys()):
            # loop over keys
            for i, k in enumerate(metrics_out[pol].keys()):
                if isinstance(metrics_out[pol][k], np.ndarray):
                    metrics_out[pol][k] = metrics[pol][k].tolist()
                elif isinstance(metrics_out[pol][k], (dict, odict)):
                    if list(metrics_out[pol][k].values())[0].dtype == np.complex:
                        metrics_out[pol][k] = odict([
                            (j, metrics_out[pol][k][j].astype(np.str))
                            for j in metrics_out[pol][k]
                        ])
                    metrics_out[pol][k] = odict([
                        (str(j), metrics_out[pol][k][j].tolist())
                        for j in metrics_out[pol][k]
                    ])
                elif isinstance(metrics_out[pol][k], (np.bool, np.bool_)):
                    metrics_out[pol][k] = bool(metrics_out[pol][k])
                elif isinstance(metrics_out[pol][k], np.float):
                    metrics_out[pol][k] = float(metrics_out[pol][k])
                elif isinstance(metrics_out[pol][k], np.integer):
                    metrics_out[pol][k] = int(metrics_out[pol][k])

        with open(filename, 'w') as f:
            json.dump(metrics_out, f, indent=4)

    elif filetype == 'pkl':
        if filename.split('.')[-1] != 'pkl':
            filename += '.pkl'
        with open(filename, 'wb') as f:
            outp = pkl.Pickler(f)
            outp.dump(metrics)
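# Hedged counterpart sketch: reading the metrics back, dispatching on the
# extension the writer above appends (assumes the same json/pkl imports).
def load_metrics(filename):
    if filename.endswith('.json'):
        with open(filename, 'r') as f:
            return json.load(f)
    elif filename.endswith('.pkl'):
        with open(filename, 'rb') as f:
            return pkl.Unpickler(f).load()
    raise ValueError("unrecognized metrics file: %s" % filename)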
def measure(arg, commandline, delay, maxtime,
            outFile=None, errFile=None, inFile=None, logger=None, affinitymask=None):

    r, w = os.pipe()
    forkedPid = os.fork()

    if forkedPid:  # read pickled measurements from the pipe
        os.close(w)
        rPipe = os.fdopen(r)
        r = cPickle.Unpickler(rPipe)
        measurements = r.load()
        rPipe.close()
        os.waitpid(forkedPid, 0)
        return measurements

    else:
        # Sample thread will be destroyed when the forked process _exits
        class Sample(threading.Thread):

            def __init__(self, program):
                threading.Thread.__init__(self)
                self.setDaemon(1)
                self.timedout = False
                self.p = program
                self.maxMem = 0
                self.childpids = None
                self.start()

            def run(self):
                try:
                    remaining = maxtime
                    while remaining > 0:
                        mem = gtop.proc_mem(self.p).resident
                        time.sleep(delay)
                        remaining -= delay
                        # race condition - will child processes have been created yet?
                        self.maxMem = max((mem + self.childmem()) / 1024, self.maxMem)
                    else:
                        self.timedout = True
                        os.kill(self.p, signal.SIGKILL)
                except OSError, (e, err):
                    if logger:
                        logger.error('%s %s', e, err)

            def childmem(self):
                if self.childpids == None:
                    self.childpids = set()
                    for each in gtop.proclist():
                        if gtop.proc_uid(each).ppid == self.p:
                            self.childpids.add(each)
                mem = 0
                for each in self.childpids:
                    mem += gtop.proc_mem(each).resident
                return mem

        try:
            m = Record(arg)

            # only write pickles to the pipe
            os.close(r)
            wPipe = os.fdopen(w, 'w')
            w = cPickle.Pickler(wPipe)

            # gtop cpu is since machine boot, so we need a before measurement
            cpus0 = gtop.cpu().cpus
            start = time.time()

            # spawn the program in a separate process
            p = Popen(commandline, stdout=outFile, stderr=errFile, stdin=inFile)

            # start a thread to sample the program's resident memory use
            t = Sample(program=p.pid)

            # wait for program exit status and resource usage
            rusage = os.wait3(0)

            # gtop cpu is since machine boot, so we need an after measurement
            elapsed = time.time() - start
            cpus1 = gtop.cpu().cpus

            # summarize measurements
            if t.timedout:
                m.setTimedout()
            elif rusage[1] == os.EX_OK:
                m.setOkay()
            else:
                m.setError()

            m.userSysTime = rusage[2][0] + rusage[2][1]
            m.maxMem = t.maxMem
            load = map(
                lambda t0, t1: int(round(
                    100.0 * (1.0 - float(t1.idle - t0.idle) / (t1.total - t0.total)))),
                cpus0, cpus1)
            #load.sort(reverse=1)    # maybe more obvious unsorted
            m.cpuLoad = ("% ".join([str(i) for i in load])) + "%"
            m.elapsed = elapsed

        except KeyboardInterrupt:
            os.kill(p.pid, signal.SIGKILL)

        except ZeroDivisionError, (e, err):
            if logger:
                logger.warn('%s %s', err, 'too fast to measure?')
pipeliner = pipeline.Pipeliner(extractor, preprocessor, dictTrainer, encoder, pooler)

if args.encoder is None:
    mpiutils.rootprint('training...')
    pipeliner.train(cifar, 400000)

mpiutils.rootprint('Processing data...')

# save the labels and the pipeliner first
if mpiutils.rank == 0:
    print 'tr label size: {}'.format(cifar.label_tr.shape)
    print 'te label size: {}'.format(cifar.label_te.shape)
    io.savemat(outputfolder + '/tr_label.mat', {'label': cifar.label_tr}, oned_as='row')
    io.savemat(outputfolder + '/te_label.mat', {'label': cifar.label_te}, oned_as='row')
    cPickle.Pickler(open(outputfolder + '/extractor.dat', 'w')).dump(extractor)
    cPickle.Pickler(open(outputfolder + '/preprocessor.dat', 'w')).dump(preprocessor)
    cPickle.Pickler(open(outputfolder + '/dictTrainer.dat', 'w')).dump(dictTrainer)
    cPickle.Pickler(open(outputfolder + '/encoder.dat', 'w')).dump(encoder)
    cPickle.Pickler(open(outputfolder + '/pooler.dat', 'w')).dump(pooler)

# process the data
pipeliner.batch_process_dataset(cifar, 1000, outputfolder + '/cifar_tr_{}_{}.mat')
pipeliner.batch_process_dataset(cifar, 1000, outputfolder + '/cifar_te_{}_{}.mat',
                                fromTraining=False)
mpiutils.safebarrier()
def goDoIt(self, inputSeq, commonData, targetFunction, binplace=True,
           alternateSource=None, recursive=False, quiet=False, label="", email=""):
    """
    Executes a function on the Golem cluster indicated by the settings for
    this object.

    Parameters:

    inputSeq - inputs to the function to run. This will be desequenced and
        run in a batch of method calls, one call per item. Objects in the
        input must be possible and meaningful to pickle, and unpickle on a
        different machine.

    commonData - input that should be provided to every invocation of the
        function. Must be possible and meaningful to pickle, then unpickle
        on a different machine.

    targetFunction - Function to execute. Must have prototype
        func(item, item), by any name, where the first item is something off
        the inputSeq sequence and the second is the commonData. It must
        return its result, as only these return values will be pickled and
        sent back; changes to global variables are not captured or returned.

    binPlace - Should the script containing the target function be copied to
        the path that the input data is being copied to? If so, that one
        file must be the ONLY non-library file required for the function to
        work properly, since files around it won't be known about. This
        permits development out of a directory that isn't world-readable. If
        this is set to something that evaluates to false, the file will be
        used in place, and must be visible to the workers by the same path
        it is visible to the client. Default: True.

    alternateSource - Path to the file to use as the source for the script
        containing the targetFunction. This should be blank in almost all
        instances, since this function will determine this data reflectively
        in that case and that is much more likely to be correct. Use this
        if, for some reason, the file detected by inspect.getAbsFilePath is
        wrong, or if a different path is simply required to access the data
        on the workers. If the binPlace flag is set, this is the file that
        is copied, if it is not None (if it is, then the detected file is
        copied). Default: None.

    recursive - Deprecated.

    quiet - Suppress server responses from being printed to stdout while
        waiting for results. Default: False, for backwards compatibility.
        "True" is more likely to be desirable.

    label - Alternate identifier for locating the job in the log later.
        Optional.

    email - Informational field to identify the person running the job in
        case they need to be contacted. Optional.
    """
    if len(sys.argv) > 1 and sys.argv[1] == "--golemtask":
        # uh-oh
        raise InfiniteRecursionError(
            "goDoIt called from something that was already a Golem task, "
            + "without the 'recursive' flag indicating that this is intentional. "
            + "Make sure to test for __name__ == '__main__' in your main program, "
            + "or it will try to execute in its entirety when Golemizer tries to import."
        )

    restoreThisCwdOrPeopleWillHateMePassionately = os.getcwd()
    loud = not quiet
    try:
        outName = str(uuid.uuid1())
        os.chdir(self.pickleInputShare)
        os.mkdir(outName)  # insecure: mode 0777
        os.chdir(outName)
        picklePath = os.getcwd()
        pickleCount = 0
        nextList = []
        n = 0
        localLimit = self.taskSize
        for parameter in inputSeq:
            nextList.append(parameter)
            n += 1
            if n >= localLimit:
                self._spill(nextList, pickleCount)
                nextList = []
                n = 0
                pickleCount += 1
        if nextList:
            self._spill(nextList, pickleCount)
            pickleCount += 1

        if not alternateSource:
            # restore original path or getabsfile doesn't work correctly as of 2.7
            os.chdir(restoreThisCwdOrPeopleWillHateMePassionately)
            target = inspect.getabsfile(targetFunction)
            os.chdir(self.pickleInputShare)
            os.chdir(outName)
            # print "===> Original file:", target
        else:
            target = alternateSource

        if binplace:
            # print "===> Original file:", target
            newTarget = os.path.join(picklePath, os.path.basename(target))
            # print "===> New file:", newTarget
            shutil.copy2(target, newTarget)
            target = newTarget
            time.sleep(2)

        commonFile = open("common.pkl", "wb")
        commonObjectPickler = cPickle.Pickler(commonFile, 2)
        commonObjectPickler.dump(commonData)
        commonObjectPickler.dump(targetFunction)
        commonFile.flush()
        commonFile.close()

        runlist = [
            {
                "Count": 1,
                "Args": [
                    self.pyPath,
                    self.thisLibraryPath,
                    "--golemtask",
                    os.path.join(picklePath, "common.pkl"),
                    # we are making certain filename assumptions on the client side
                    os.path.join(picklePath, str(n) + ".pkl"),
                    self.jobOutputPath,
                    target
                ]
            }
            for n in range(0, pickleCount)
        ]

        response, content = golem.runBatch(runlist, self.serverPass,
                                           self.masterPath, loud, label, email)
        jobId = golemBlocking.jobIdFromResponse(content)
        finalStatus = golemBlocking.stall(jobId, self.masterPath, loud)
        if loud and (finalStatus["Status"] != "SUCCESS"):
            print "Uh-oh- job status is", finalStatus["Status"], \
                "and we're probably going to crash soon"

        # Note: We're choosing to ignore stdout/stderr. We can revisit this
        # design decision later and decide to do something instead, if we
        # really desperately want to
        #resultPathGenerator = (os.path.abspath(
        #    os.path.join(
        #        self.golemOutPath, "golem_" + x + os.sep, self.jobOutputPath,
        #    )
        #) for x in self.golemIds)
        golemDirPattern = re.compile("golem_\\d+")
        resultPathGenerator = (os.path.abspath(os.path.join(self.golemOutPath, foo))
                               for foo in os.listdir(self.golemOutPath)
                               if golemDirPattern.match(foo))
        resultFilesNumbered = []
        filenamePattern = re.compile("^{0}_(\\d+)\\.out\\.pkl$".format(jobId))

        # because we're already performing the match,
        # decorate-sort-undecorate is the best sort strategy here
        for resultPath in resultPathGenerator:
            # print "==>", resultPath
            for file in os.listdir(resultPath):
                match = filenamePattern.match(file)
                if match:
                    # print "====>", file
                    resultFilesNumbered.append(
                        (int(match.group(1)), os.path.join(resultPath, file)))

        if len(resultFilesNumbered) != pickleCount:
            raise ExecutionFailure(
                "Unknown error prevented {0} of {1} task bundles from completing."
                .format(pickleCount - len(resultFilesNumbered), pickleCount))

        resultFilesNumbered.sort()
        return _unpickleSequence((pair[1] for pair in resultFilesNumbered))
    finally:
        os.chdir(restoreThisCwdOrPeopleWillHateMePassionately)
# just pull out ten features from X to make sure the whole thing works
X = X[:10, :]
v = z.values()
split_indices = splitdata.cv_multiclass_fold(Y, 10)
labels = np.argmax(Y, axis=0)
labelled_featuresets = [(dict(zip(v, data)), y)
                        for (data, y) in zip(X.T, labels)]

test_labels = []
true_labels = []
for i, train_indices in enumerate(split_indices):
    test_indices = list(set(range(Y.shape[1])).difference(train_indices))

    # train (NB: the comprehension variable i shadows the fold counter,
    # which is harmless here since the counter is never used afterwards)
    train_features = [labelled_featuresets[i] for i in train_indices]
    model = nb.NaiveBayesClassifier.train(train_features)

    # test
    test_features = [labelled_featuresets[i] for i in test_indices]
    label = [model.classify(featureset[0]) for featureset in test_features]

    # collect
    true_labels.append(Y[:, test_indices])
    test_labels.append(label)

# save
fh = open('/proj/ar2384/picorna/labels.pkl', 'w')
labels = {"true": true_labels, "test": test_labels}
cPickle.Pickler(fh, protocol=2).dump(labels)
def __init__(self, storage,
             pool_size=7,
             pool_timeout=1 << 31,
             cache_size=400,
             cache_size_bytes=0,
             historical_pool_size=3,
             historical_cache_size=1000,
             historical_cache_size_bytes=0,
             historical_timeout=300,
             database_name='unnamed',
             databases=None,
             xrefs=True,
             large_record_size=1 << 24,
             **storage_args):
    """Create an object database.

    :Parameters:
      - `storage`: the storage used by the database, e.g. FileStorage
      - `pool_size`: expected maximum number of open connections
      - `cache_size`: target size of Connection object cache
      - `cache_size_bytes`: target size measured in total estimated size
        of objects in the Connection object cache. "0" means unlimited.
      - `historical_pool_size`: expected maximum number of total
        historical connections
      - `historical_cache_size`: target size of Connection object cache
        for historical (`at` or `before`) connections
      - `historical_cache_size_bytes` -- similar to `cache_size_bytes`
        for the historical connection.
      - `historical_timeout`: minimum number of seconds that an unused
        historical connection will be kept, or None.
      - `xrefs` - Boolean flag indicating whether implicit cross-database
        references are allowed
    """
    if isinstance(storage, basestring):
        from ZODB import FileStorage
        storage = ZODB.FileStorage.FileStorage(storage, **storage_args)
    elif storage is None:
        from ZODB import MappingStorage
        storage = ZODB.MappingStorage.MappingStorage(**storage_args)

    # Allocate lock.
    x = threading.RLock()
    self._a = x.acquire
    self._r = x.release

    # pools and cache sizes
    self.pool = ConnectionPool(pool_size, pool_timeout)
    self.historical_pool = KeyedConnectionPool(historical_pool_size,
                                               historical_timeout)
    self._cache_size = cache_size
    self._cache_size_bytes = cache_size_bytes
    self._historical_cache_size = historical_cache_size
    self._historical_cache_size_bytes = historical_cache_size_bytes

    # Setup storage
    self.storage = storage
    self.references = ZODB.serialize.referencesf
    try:
        storage.registerDB(self)
    except TypeError:
        storage.registerDB(self, None)  # Backward compat

    if (not hasattr(storage, 'tpc_vote')) and not storage.isReadOnly():
        warnings.warn(
            "Storage doesn't have a tpc_vote and this violates "
            "the storage API. Violently monkeypatching in a do-nothing "
            "tpc_vote.",
            DeprecationWarning, 2)
        storage.tpc_vote = lambda *args: None

    if IMVCCStorage.providedBy(storage):
        temp_storage = storage.new_instance()
    else:
        temp_storage = storage
    try:
        try:
            temp_storage.load(z64, '')
        except KeyError:
            # Create the database's root in the storage if it doesn't exist
            from persistent.mapping import PersistentMapping
            root = PersistentMapping()
            # Manually create a pickle for the root to put in the storage.
            # The pickle must be in the special ZODB format.
            file = cStringIO.StringIO()
            p = cPickle.Pickler(file, 1)
            p.dump((root.__class__, None))
            p.dump(root.__getstate__())
            t = transaction.Transaction()
            t.description = 'initial database creation'
            temp_storage.tpc_begin(t)
            temp_storage.store(z64, None, file.getvalue(), '', t)
            temp_storage.tpc_vote(t)
            temp_storage.tpc_finish(t)
    finally:
        if IMVCCStorage.providedBy(temp_storage):
            temp_storage.release()

    # Multi-database setup.
    if databases is None:
        databases = {}
    self.databases = databases
    self.database_name = database_name
    if database_name in databases:
        raise ValueError("database_name %r already in databases" % database_name)
    databases[database_name] = self
    self.xrefs = xrefs

    self.large_record_size = large_record_size
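# What the "special ZODB format" above amounts to: each stored record is two
# back-to-back pickles, class info first and instance state second. A minimal
# sketch of reading one back (`data` is assumed to hold the bytes produced by
# file.getvalue() above):
f = cStringIO.StringIO(data)
u = cPickle.Unpickler(f)
class_info = u.load()   # (PersistentMapping, None)
state = u.load()        # the mapping's __getstate__() dict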
dataPath = sys.argv[1]

if dataPath.endswith(".pickle"):
    ## load from pickle
    crags = cragsFromPickle(dataPath)
    for crag in crags:
        print("{name}".format(**crag))
        for route in crag['route']:
            print("  {name} ({grade}): {soft} soft, {fair} fair, {hard} hard"
                  " / {total}\n    Fairness: {fairness}\n".format(**route))
elif dataPath.endswith(".json"):
    ## load from json
    crags = cragsFromJson(dataPath)

    # pickle it, now that we've got it
    pickleName = 'pickles/{}.pickle'.format(os.path.basename(dataPath).split('.')[0])
    with open(pickleName, 'w') as f:
        print_err("Pickling to {}...".format(pickleName))
        pickler = cPickle.Pickler(f)
        pickler.dump(crags)

    jsonName = 'json/{}.json'.format(os.path.basename(dataPath).split('.')[0])
    with open(jsonName, 'w') as f:
        f.write(json.dumps(crags))
else:
    print_err("Unknown filetype for " + dataPath)
    sys.exit(1)
    dictTagIndex[t] = []

for sample in tmp_reader:
    sid = sample[0]
    title = sample[1].split()
    body = sample[2].split()
    tags = [t for t in sample[3].split() if t in tagList]
    if len(tags) == 0:
        continue
    dictMessages[sid] = {
        'tags': tags,
        'title': Counter(title),
        'body': Counter(body)
    }
    dictTagCounts.update(tags)
    for t in tags:
        dictTagIndex[t].append(sid)
fd.close()

# Write dictionaries out in pickle file
outstream = open(args.dictFile, 'wb')
writer = pickle.Pickler(outstream, pickle.HIGHEST_PROTOCOL)
writer.dump(dictMessages)
writer.dump(dictTagCounts)
writer.dump(dictTagIndex)
outstream.close()
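# A single Pickler can write several objects to one stream, but the reader
# must load them back in exactly the order they were dumped. Sketch:
instream = open(args.dictFile, 'rb')
reader = pickle.Unpickler(instream)
dictMessages = reader.load()
dictTagCounts = reader.load()
dictTagIndex = reader.load()
instream.close()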
sys.path.insert(0, parentdir)

import cPickle
import copy
import record
import json
from collections import defaultdict

assert len(sys.argv) == 4, 'Usage: %s records.pickle photos.json photos.pickle'
_, in_pickle, photos_json, out_pickle = sys.argv

rs = record.AllRecords(in_pickle)
expansions = json.load(file(photos_json))

f = file(out_pickle, "w")
p = cPickle.Pickler(f, 2)

skipped = 0
num_images, num_photos = 0, 0
for idx, r in enumerate(rs):
    digital_id = r.photo_id()
    image_file = '%s.jpg' % digital_id
    if image_file not in expansions:
        # XXX: why skip any images?
        skipped += 1
        continue
    num_images += 1
    if len(expansions[image_file]) == 0:
def dumps(self, arg, proto=0):
    f = StringIO()
    p = cPickle.Pickler(f, proto)
    p.dump(arg)
    f.seek(0)
    return f.read()
def _hash(s, o):
    f = StringIO.StringIO()
    p = pickle.Pickler(f, -1)
    p.persistent_id = s._filter
    p.dump(o)
    f.seek(0)
    return f.read()
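# Hedged usage sketch (an assumption about intent: the name suggests the
# canonical pickle bytes are fed to a real digest, with s._filter keeping
# externally-referenced objects out of the byte stream). `store` and `obj`
# are hypothetical names.
import hashlib
fingerprint = hashlib.sha1(_hash(store, obj)).hexdigest()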
#########################################################
# File:        build_location_dict.py                   #
# Created:     June 09, 2013                            #
# Modified:    June 09, 2013                            #
# Author:      Bogdan State                             #
# Description: Constructs dictionary mapping            #
#              user to location                         #
#########################################################

import csv
import cPickle as pickle

FILE_LOCATION = "/media/bogdan/61ec6432-da13-415d-9afd-fd46e933f48b/twitter/"
TXT_FILE_NAME = "location_iso2c.txt"
PKL_FILE_NAME = "location_iso2c.pkl"

location_dict = {}
with open(FILE_LOCATION + TXT_FILE_NAME, 'rb') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='"')
    for row in reader:
        location_dict[row[0]] = row[1]

# NB: the inline-opened output file is never explicitly closed; CPython's
# refcounting flushes it at exit, but an explicit close would be safer.
output = pickle.Pickler(open(FILE_LOCATION + PKL_FILE_NAME, 'wb'))
output.fast = True
output.dump(location_dict)
def __init__(self, file, protocol=0):
    pickler = pickle.Pickler(file, protocol)
    pickler.persistent_id = self.persistent_id
    self.dump = pickler.dump
    self.clear_memo = pickler.clear_memo
def accept(self):
    self.log_info("checking connection protocol", 2)

    if not hasattr(self, 'unpickler'):
        stream = ScarabBufferedFile(self.stream)
        # allow exceptions to bubble up
        # FIXME should mark connection as invalid
        # FIXME could block on stream having less than 100 bytes
        head = stream.head(4)
        if head[0:4] == ("\x89" + "CBF"):
            import LDOBinary
            self.unpickler = LDOBinary.LDOBinaryUnmarshaler(stream)
            self.pickler = LDOBinary.LDOBinaryMarshaler(stream)
            self.protocol = 'ldobinary'
        elif head[0:2] == '<?':
            head = stream.head(100)
            if string.find(head, 'urn:schemas-xmlsoap-org:soap.v1') != -1:
                import SOAP
                self.unpickler = SOAP.SOAPUnmarshaler(stream)
                self.pickler = SOAP.SOAPMarshaler(stream)
                self.protocol = 'soap'
            else:
                import LDOXML
                self.unpickler = LDOXML.LDOXMLUnmarshaler(stream)
                self.pickler = LDOXML.LDOXMLMarshaler(stream)
                self.protocol = "ldoxml"
        elif head[0:1] != ' ' and head[0:1] != "\t" and head[0:1] != '<':
            import cPickle
            self.unpickler = cPickle.Unpickler(stream)
            self.pickler = cPickle.Pickler(stream)
            self.protocol = "pickle"
        else:
            self.log_info("accept: unrecognized serialization", 2)
            """FIXME LDO-XML or other XML format"""
            raise ScarabConnectionError
        self.log_info("accepted " + self.protocol + " connection")

    error = None
    try:
        request = self.unpickler.load()
        if self.protocol == "soap":
            # FIXME check for invalid data
            method = request['SOAP:Envelope']['SOAP:Body'].keys()[0]
            request = {
                'object': 'root',
                'method': method,
                'args': [request['SOAP:Envelope']['SOAP:Body'][method]]
            }
        if self.debug >= 2:
            self.log_info("request: " + str(request), 3)
    except EOFError:
        self.log_info("EOF on connection", 2)
        self.is_valid = 0
        # FIXME close pickler (which'll close stream?)
        # FIXME notify conn manager
        return
    except:
        exc_type, value, traceback = sys.exc_info()
        # since we load picklers dynamically, we need to check their
        # errors in a dynamic way
        if str(exc_type) == "cPickle.UnpicklingError":
            self.log_info("EOF on connection", 2)
            self.is_valid = 0
            # FIXME close pickler (which'll close stream?)
            # FIXME notify conn manager
            return
        error = "error unmarshaling, closing connection: " \
                + string.join(format_exception(exc_type, value, traceback))
        self.is_valid = 0
        # we'll still try to send a failure response, ya never know
    else:
        t = type(request)
        if t != types.DictType:
            error = "expected dictionary for request, got %s" % `t.__name__`
        else:
            if (not (request.has_key('object')
                     and request.has_key('method')
                     and request.has_key('args'))):
                error = "missing object id, method name, or args in request"
            else:
                object = request['object']
                method_name = request['method']
                if not self.globals.has_key(object):
                    error = "no such object registered: %s" % `object`
                else:
                    try:
                        method = getattr(self.globals[object], method_name)
                    except:
                        error = "no such method %s for object %s" \
                                % (method_name, `object`)

    if error != None:
        self.log_info(error)
        result = {'error': error}
    else:
        call_str = object + "." + method_name
        self.log_info("calling `" + call_str + "'")
        try:
            result = apply(method, tuple(request['args']))
        except:
            exc_type, value, traceback = sys.exc_info()
            exc_str = string.join(format_exception(exc_type, value, traceback))
            self.log_info("exception raised in `" + call_str + "': " + exc_str)
            result = {'error': exc_str}
        else:
            self.log_info("`" + call_str + "' returned successfully", 2)
            result = {'result': [result]}

    try:
        if self.debug >= 2:
            self.log_info("response: " + str(result), 2)
        if self.protocol == "soap":
            if result.has_key('error'):
                self.pickler.encode_fault(100, result['error'], 1)
            else:
                self.pickler.encode_response(method_name, result['result'][0])
        else:
            self.pickler.dump(result)
        try:
            self.pickler.flush()
        except AttributeError:
            self.stream.flush()
    except:
        exc_type, value, traceback = sys.exc_info()
        self.log_info("error sending response "
                      + string.join(format_exception(exc_type, value, traceback)))
        if self.is_valid:
            """FIXME ignore, log, or reraise?"""
        # else: ignore

    # FIXME if not self.is_valid: pickler.close


def process_string(self, message):
    import StringIO
    self.stream = StringIO.StringIO(message)
    self.accept()
    # was delattr(self.stream) etc.: delattr takes the object and the name
    delattr(self, 'stream')
    delattr(self, 'pickler')
    delattr(self, 'unpickler')


def run_loop(self):
    # FIXME this is socket specific, see FIXMEs in Scarab.py
    while self.is_valid:
        self.log_info("awaiting connection", 2)
        socket, addr = self.socket.accept()
        self.caller = str(addr)
        stream = socket.makefile('r+')
        server = ScarabConnection(self)
        if self.debug >= 4:
            server.stream = ScarabDebugFile(stream, self)
        else:
            server.stream = stream
        while server.is_valid:
            server.accept()


def log_info(self, message, level=1):
    if self.debug >= level:
        if hasattr(self, 'caller'):
            print self.caller + ": " + message
        else:
            print message
directory = "use it as argument" try: directory = sys.argv[1] except KeyError: pass d = {} #file_list = [f for f in glob.glob("%s/*.pisi" % directory) if not f.endswith(".delta.pisi")] #Arrangements for new repository structure file_list = [] for dirpath, subdirs, files in os.walk(directory): for x in files: if x.endswith(".pisi") and not x.endswith("delta.pisi"): file_list.append(os.path.join(dirpath, x)) for p in file_list: print "Processing %s.." % p for f in filter(lambda x: x.type == "executable", pisi.package.Package(p).get_files().list): fpath = os.path.join("/", f.path) if os.access(fpath, os.X_OK): d[fpath] = pisi.util.split_package_filename( os.path.basename(p))[0] o = open("../data/packages.db", "wb") cPickle.Pickler(o, protocol=2) cPickle.dump(d, o, protocol=2) o.close()
def dumps(obj, protocol=None):
    file = IOtype()
    # NB: as in dump() above, the protocol argument is ignored in favor of
    # the module-level PROTOCOL.
    pickler = pickle.Pickler(file, protocol=PROTOCOL)
    pickler.persistent_id = _function_pickling_handler
    pickler.dump(obj)
    return file.getvalue()
def dumps(self, arg, bin=0):
    # same cPickle-only trick as above: no file argument, so the pickle
    # accumulates in an internal buffer returned by getvalue()
    p = cPickle.Pickler(bin)
    p.dump(arg)
    return p.getvalue()