def refresh(self, ind_val=None): # set attributes self.ind_list = self.Data.getList(self.ind_attr, False) self.dep_list = self.Data.getList(self.dep_attr, True) self.dep = {} self.ind = list(set(self.ind_list)) self.listToContours = [self.ind.index(i) for i in self.ind_list] self.DU = DistanceUncertainty(self) self.indQuantities = [self.ind_list.count(i) for i in self.ind] # Variables self.std = np.std(self.ind_list) self.weightFactor = 1.0 / (2 * ((float(self.weight_std_ratio) * self.std) ** 2)) # The points ordered by their dependent attribute orderedPoints = zip(self.dep_list, self.ind_list) orderedPoints.sort() self.sorted_dep_list, self.sorted_ind_list = zip(*orderedPoints) if not ind_val is None: new_indices = [i for i in range(len(orderedPoints)) if self.sorted_ind_list[i] == ind_val] return new_indices
class ObservedDistribution: def __init__(self, parser, ind_attr, contours , dep_attr, weight_std_ratio=None, retrain=False, prefix=None, save=True): #If a None path is provided, make one based on the ind_attr if prefix is None: prefix = "ods/"+sanitise(ind_attr)+"/" if not os.path.exists(prefix): os.makedirs(prefix) # If no weight is provided then expect to find one in a filename if weight_std_ratio is None: pattern = re.compile(sanitise(ind_attr)+'_[0-9]_'+sanitise(dep_attr)) for f in os.listdir(prefix): fn,ext = os.path.splitext(f) if ext == ".od" and pattern.match(sanitise(fn)) is not None: weight_std_ratio = float(fn.rsplit("_")[-1]) print "Found",f,"and setting weight ratio to",weight_std_ratio break if weight_std_ratio is None: print "No suitable weight factor found in provided OD files, using 0.15." weight_std_ratio = 0.15 self.weight_std_ratio = weight_std_ratio self.contours = contours self.prefix = prefix self.filename = getFileName(ind_attr, contours, dep_attr, weight_std_ratio) self.path = os.path.join(prefix,self.filename) # if this od has already been computed read in the file it was saved to and copy the attributes from that version if os.path.isfile(self.path) and not retrain: od = readObject(self.path) attributes = inspect.getmembers(od) for a in attributes: if not type(a[1]) is types.MethodType: setattr(self, a[0], a[1]) print "read in",self.path # otherwise build the od as normal else: self.retrain(parser, ind_attr, dep_attr, save=save) def __repr__(self): return self.filename def refresh(self, ind_val=None): # set attributes self.ind_list = self.Data.getList(self.ind_attr, False) self.dep_list = self.Data.getList(self.dep_attr, True) self.dep = {} self.ind = list(set(self.ind_list)) self.listToContours = [self.ind.index(i) for i in self.ind_list] self.DU = DistanceUncertainty(self) self.indQuantities = [self.ind_list.count(i) for i in self.ind] # Variables self.std = np.std(self.ind_list) self.weightFactor = 1.0 / (2 * ((float(self.weight_std_ratio) * self.std) ** 2)) # The points ordered by their dependent attribute orderedPoints = zip(self.dep_list, self.ind_list) orderedPoints.sort() self.sorted_dep_list, self.sorted_ind_list = zip(*orderedPoints) if not ind_val is None: new_indices = [i for i in range(len(orderedPoints)) if self.sorted_ind_list[i] == ind_val] return new_indices def retrain(self, parser, ind_attr, dep_attr, save=True): self.bins = [0.5] # Expand the bins to the number of contours selected (zero contours = just predict the median) for i in xrange(self.contours): self.bins = np.concatenate([[self.bins[0]/2],self.bins,[1-self.bins[0]/2]]) # convert self.bins back into a list if not type(self.bins) == list: self.bins = self.bins.tolist() self.bins.sort() # set parser self.Data = parser # set attributes self.ind_attr = ind_attr self.dep_attr = dep_attr self.refresh() # Parallel computation: results, self.weights = zip(*Parallel(n_jobs=-1)(delayed(findBins)(value, self.sorted_ind_list, self.sorted_dep_list, weightFunction, self.weightFactor, self.bins, findResults) for value in self.ind)) self.finishTraining(results) if save and not os.path.isfile(self.path): self.saveObject(self.path) def finishTraining(self, results): self.weights = list(self.weights) # Get Training Dots # Make a list of the input values at each bin boundary for b in self.bins: self.dep[b] = [] for r in results: for pair in r: self.dep[pair[0]].append(pair[1]) # set self.ind and self.dep (these are what ED will train on) self.ind = np.array(self.ind).T for b in self.bins: self.dep[b] = np.array(self.dep[b]) #This section avoids a crash that occurs when all of a self.ind[b] have the same value. if np.allclose(self.dep[b],self.dep[b][0]): self.dep[b][0] *= 1.01 print "Ran into a Y-axis contour (",b,") that has no variance in any of the point weights." print "This probably means the weighting factors are inappropriately large." print "Avoiding a crash in the SVR by falsely editing the first datapoint by 1% and then proceeding." # function to find the weights for each point around a single point on the independent axis def weightFunction(self, value, weightFactor=None): wf = self.weightFactor if weightFactor is not None: wf = 1.0 / (2 * ((weightFactor * self.std) ** 2)) return np.exp(wf*(-(abs(np.array(self.ind_list)-value) ** 2))) # getter functions def distanceUncertainty(self, values): return self.DU.distanceUncertainty(values) def indAttrName(self, san=False): if san: return sanitise(self.ind_attr) return self.ind_attr def indAttr(self): return self.ind def indAttrList(self): return self.ind_list def depAttrName(self, san=False): if san: return sanitise(self.dep_attr) return self.dep_attr def observedContours(self): return self.dep def scaledDepAttr(self): return self.Data.getList(self.dep_attr, True) def unscaledDepAttr(self): return self.Data.getList(self.dep_attr, False) def limits(self): return [[min(self.ind),max(self.ind)],[min(self.unscaledDepAttr()),max(self.unscaledDepAttr())]] # scale and unscale points trained on the scaled dots def unscalePoints(self, vals): return vals*np.array(self.Data.getScale(self.dep_attr))+np.array(self.Data.getTranslate(self.dep_attr)) def scalePoints(self,vals): return vals/np.array(self.Data.getScale(self.dep_attr))-np.array(self.Data.getTranslate(self.dep_attr)) def plotArtefacts(self,stroke=None,fill='black',plot=None,alpha=1): if plot is None: plot = pl.figure().add_subplot(1,1,1) plot.scatter(self.ind_list, self.unscalePoints(self.dep_list), edgecolor=stroke,facecolor=fill,s=2,lw=0.25,alpha=alpha) return plot def plotArtefact(self,x=None,y=None,plot=None,alpha=1,ED=None): if plot is None: plot = pl.figure().add_subplot(1,1,1) if x is None: minx = min(self.ind_list) maxx = max(self.ind_list) x = minx+np.random.random()*np.ptp([minx, maxx]) if y is None: y_pred = ED.getExpectationsAt(x,False) error_scale = np.mean(y_pred[self.bins[-1]]-y_pred[self.bins[0]]) y = ED.getExpectationsAt(x,False,medianOnly=True)+((np.random.random()*error_scale)-(0.5*error_scale)) text_pos = pl.ylim()[0]+np.ptp(pl.ylim())*0.025 surprise,raw_surprise = ED.surpriseCalc(x,y,None,False) plot.axvline(x) plot.scatter(x,y,s=500,c='r',marker='*') text = "".join([" Hypothetical phone: S=",str(round(surprise,3)),' (raw: ',str(round(abs(raw_surprise),3)),')']) plot.annotate(text,[x,text_pos],color='b') return plot # plot the results def plotObservedContours(self, title="", plot=None, alpha=1): if plot is None: plot = pl.figure().add_subplot(1,1,1) # median dot size (for Kaz) med_S = 2 # regular dot size reg_S = .5 # data size data_S = 5 centralIndex = self.bins.index(0.5) for i,b in enumerate(self.bins): dist_from_med = float(abs(i-centralIndex))/(len(self.bins) *.5) color = (1-dist_from_med, dist_from_med, 0) if b == 0.5: S = med_S else: S = reg_S zipped = zip(self.ind, self.unscalePoints(self.dep[b])) zipped.sort() x, y = zip(*zipped) list(x) list(y) #if b == 0.5: # print x # print y # sys.exit() plot.plot(x, y, color=color, lw=S, alpha=alpha) if self.ind_attr is None: plot.set_xlabel('$Year$') else: plot.set_xlabel(self.ind_attr) plot.set_ylabel(self.dep_attr) if len(title) > 0: plot.set_title(title) return plot def plotWeights(self, value, plot=None, alpha=1): if plot is None: plot = pl.figure().add_subplot(1,1,1) weights = self.weightFunction(value) weights *= 10000.0/sum(weights) weights = np.sqrt(weights) plot.scatter(self.ind_list, self.unscalePoints(self.dep_list), color='r', s=weights, alpha=alpha) return plot #Show the current plot(s). def show(self): pl.show() #Save the current plot to a given filename. def saveFig(self,filename): #if os.path.isfile(self.prefix+'/'+filename): # version = 1 # while os.path.isfile(self.prefix+'/'+str(version)+'_'+filename): # version += 1 # filename = str(version)+'_'+filename pl.savefig(filename) print 'Saved',filename def saveObject(self, filename): with open(filename, 'wb') as output: pickle.dump(self, output, pickle.HIGHEST_PROTOCOL) print "Saved,",filename