def goldenSectionSearch(self, obj, a, b, n):
    # Narrow the bracket [a, b] with golden-section steps until the region
    # is smaller than 0.01 or n iterations have been spent.
    l = a + 0.382 * (b - a)
    h = a + 0.618 * (b - a)
    region = b - a
    num = 1
    while region > 0.01 and num <= n:
        fl = self.lossFunction(obj, l)
        fh = self.lossFunction(obj, h)
        #print("iter{0} fl={1:.4f} fh{2:.4f}".format(num, fl, fh))
        if fl > fh:   # minimum lies in [l, b]
            a = l
            l = h
            h = a + 0.618 * (b - a)
        else:         # minimum lies in [a, h]
            b = h
            h = l
            l = a + 0.382 * (b - a)
        num += 1
        region = abs(b - a)
        utils.updateProgress(self.__scene)  # ~0.6 sec per call
    moveVal = (a + b) / 2
    return moveVal
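
# A minimal, self-contained sketch of the same golden-section idea, stripped
# of the class and progress machinery; the quadratic loss, bracket, and
# tolerance below are illustrative assumptions, not part of the original.
def golden_section(f, a, b, tol=0.01, max_iter=100):
    l = a + 0.382 * (b - a)
    h = a + 0.618 * (b - a)
    for _ in range(max_iter):
        if abs(b - a) <= tol:
            break
        if f(l) > f(h):
            a, l = l, h
            h = a + 0.618 * (b - a)
        else:
            b, h = h, l
            l = a + 0.382 * (b - a)
    return (a + b) / 2.0

# golden_section(lambda x: (x - 3.0) ** 2, 0.0, 10.0)  -> approximately 3.0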
def bruteForceSearch(self, obj, valList):
    # Evaluate the loss at every candidate value and return the best one.
    errList = []
    for val in valList:
        err = self.lossFunction(obj, val)
        errList.append(err)
        utils.updateProgress(self.__scene)
    minIdx = errList.index(min(errList))
    moveVal = valList[minIdx]
    return moveVal
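
# The same exhaustive scan can be written with min() and a key function; a
# hedged sketch, with an illustrative loss and candidate grid.
def brute_force(f, candidates):
    return min(candidates, key=f)

# brute_force(lambda x: (x - 3.0) ** 2, [i * 0.5 for i in range(20)])  -> 3.0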
def download(title, chapters):
    dir = DOWNLOAD + title + os.sep
    utils.mkdir_p(dir)
    print "Downloading " + title + " (", len(chapters), ")..."
    for chap in chapters:
        utils.mkdir_p(dir + str(chap.number))
        images = PLUGIN.getImages(chap.link)
        for i, img in enumerate(images):
            utils.putUrlContent(img.link,
                                dir + str(chap.number) + os.sep +
                                img.title + utils.getExtension(img.link))
            # guard against a single-image chapter (len(images) - 1 == 0)
            utils.updateProgress(chap.title,
                                 int(i * 100.0 / max(len(images) - 1, 1)))
        print ""
        # drop an empty '.completed' marker file for this chapter
        marker = open(dir + str(chap.number) + os.sep + '.completed', 'w+')
        marker.close()
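
# Worked example of the progress arithmetic above: with 5 images, index i
# maps to int(i * 100.0 / 4), i.e. 0, 25, 50, 75, 100.
for i in range(5):
    print i, int(i * 100.0 / (5 - 1))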
def getTransactions(fileSrc, products):
    # main variables to be returned
    transactions = []
    reverseItemLookup = {}
    for key in products:
        reverseItemLookup[key[0]] = set()
    dataFile = codecs.open(fileSrc, 'rb', 'utf-8')  # specify utf-8 encoding
    print "Loading Transactions..."
    lines = dataFile.readlines()  # read all lines
    if settings.PROGRESS_BAR:
        util.updateProgress(0)  # create a progress bar
    # test every line and extract its relevant information
    for idx, line in enumerate(lines):  # test each line
        if settings.PROGRESS_BAR:
            util.updateProgress(float(idx) / float(len(lines)))
        lineList = line.split(", ")
        # remove the first item in the list (the row label)
        lineList.pop(0)
        lineSet = set()
        for idx2, item in enumerate(lineList):
            if float(item) != 0:
                # record that product idx2 was bought in transaction idx
                reverseItemLookup[products[idx2][0]].add(idx)
                lineSet.add(idx2)
        # append this transaction's index set
        transactions.append(lineSet)
    if settings.PROGRESS_BAR:
        util.updateProgress(1)
    print "\n"
    # return the transactions together with the reverse item lookup
    return [transactions, reverseItemLookup]
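
# A minimal sketch of the core line -> index-set transformation, without the
# file handling and progress bar; the sample row is an assumption about the
# expected "label, v1, v2, ..." format.
row = "t1, 0, 1.5, 0, 2".split(", ")[1:]   # drop the leading label
basket = set(i for i, v in enumerate(row) if float(v) != 0)
print basket                               # set([1, 3])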
def getProducts(fileSrc):
    # main variables to be returned
    products = []
    dataFile = codecs.open(fileSrc, 'rb', 'utf-8')  # specify utf-8 encoding
    print "Retrieving products..."
    lines = dataFile.readlines()  # read all lines
    if settings.PROGRESS_BAR:
        util.updateProgress(0)  # create a progress bar
    # test every line and extract its relevant information
    for idx, line in enumerate(lines):  # test each line
        if settings.PROGRESS_BAR:
            util.updateProgress(float(idx) / float(len(lines)))
        lineList = line.split(", ")
        lineList[1] = float(lineList[1])
        products.append(lineList)
    if settings.PROGRESS_BAR:
        util.updateProgress(1)
    print "\n"
    # return our list of products
    return products
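
# Sketch of the per-line parse above, assuming rows of the form "name, price":
entry = "widget, 1.25".split(", ")
entry[1] = float(entry[1])
print entry                                # ['widget', 1.25]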
def __init__(self, trainingData, attributes):
    print "Training Bayesian Classifier with " + str(len(trainingData)) + " data entries.\n"

    # COUNT VARIABLES
    print "Counting all variables:"
    if settings.PROGRESS_BAR:
        util.updateProgress(0)
    # Sort the training data into bins based on classifier value, meanwhile
    # recording the counts for each variable
    numOfEntries = float(len(trainingData))
    categoricalCounts = {}    # Holds counts of each category
    self.classifierBins = {}  # Holds the data points for each classifier
    self.probability = {}
    self.numericBins = {}
    count = 0.0
    for entry in trainingData:  # for every data row...
        count += 1.0
        if settings.PROGRESS_BAR:
            util.updateProgress(count / numOfEntries)
        for attr in entry:  # for each attribute...
            if not util.isNumber(entry[attr]):  # for categorical attributes
                if entry[attr] in categoricalCounts:  # if we have already created a key for this
                    categoricalCounts[entry[attr]] += 1.0  # increment the key
                else:  # otherwise we create a new key and set it to 1
                    categoricalCounts[entry[attr]] = 1.0
                if attr == settings.CLASSIFIER_NAME:  # if we are on the classifier, in this case "class"
                    if entry[attr] in self.classifierBins:  # add the row to the classifier bins
                        self.classifierBins[entry[attr]].append(entry)
                    else:
                        self.classifierBins[entry[attr]] = [entry]
            else:  # for numeric attributes
                key = attr + ' given ' + entry[settings.CLASSIFIER_NAME]  # declare a key
                if key in self.numericBins:  # if the key is already in our numeric bins
                    bisect.insort(self.numericBins[key], entry[attr])  # insert the numeric attribute in a sorted location
                else:
                    self.numericBins[key] = [entry[attr]]  # if it doesn't exist, create a list for it

    # DEAL WITH CONTINUOUS VARIABLES
    initialKeys = self.numericBins.keys()
    for key in initialKeys:
        self.numericBins[key + " mean"] = np.mean(self.numericBins[key])   # store mean of each continuous var
        self.numericBins[key + " stdev"] = np.std(self.numericBins[key])   # store std deviation of each continuous var
    for attr in attributes:
        # if we have not stored values for certain attributes, we do so now, using smoothing techniques
        if attr[1] != 'real':
            for attrType in attr[1]:
                if attrType not in self.probability:
                    self.probability[attrType] = .5 / numOfEntries
                    for name in self.classifierBins:
                        self.probability[attrType + " given " + name] = .5 / len(self.classifierBins[name])

    # ASSIGN PROBABILITIES
    print "\n\nAssigning probabilities:"
    # Now we have our bins, each holding the rows for one classifier value,
    # plus counts of all our variables
    if settings.PROGRESS_BAR:
        util.updateProgress(0)
    for key in categoricalCounts.keys():  # assign categorical probabilities
        self.probability[key] = self.getProbability(categoricalCounts[key], numOfEntries)
    attrs = categoricalCounts.keys()  # get the attrs we will iterate through
    count = 0.0  # count used to drive the status bar
    for key in self.classifierBins.keys():  # for each classifier type...
        count += 1
        if settings.PROGRESS_BAR:
            util.updateProgress(count / float(len(self.classifierBins.keys())))  # update progress bar
        for row in self.classifierBins[key]:  # for each row in the classifierBins...
            for rowKey in row:  # for each key in the row...
                if not util.isNumber(row[rowKey]):  # if we're dealing with a categorical variable...
                    newKey = row[rowKey] + " given " + key  # create a key variable
                    if newKey in categoricalCounts:  # count number of items included in that section
                        categoricalCounts[newKey] += 1.0
                    else:
                        categoricalCounts[newKey] = 1.0
        for attrValue in attrs:  # for every attrValue...
            countKey = attrValue + " given " + key  # create a key
            if countKey in categoricalCounts:  # assign conditional probabilities from the counts
                self.probability[countKey] = self.getProbability(categoricalCounts[countKey], len(self.classifierBins[key]))
            else:
                self.probability[countKey] = self.getProbability(0, len(self.classifierBins[key]))
    if settings.PROGRESS_BAR:
        util.updateProgress(1)
    print "\nModel creation complete\n"
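
# Hedged sketch: with numericBins holding "<attr> given <class>" lists plus
# their "... mean" / "... stdev" entries, a Gaussian likelihood for a numeric
# value is typically computed as below; gaussian() is an illustrative helper,
# not a method of this class.
import math

def gaussian(x, mean, stdev):
    if stdev == 0:
        stdev = 1e-6  # guard degenerate (constant) attributes
    expo = math.exp(-((x - mean) ** 2) / (2 * stdev ** 2))
    return expo / (math.sqrt(2 * math.pi) * stdev)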
def readArff(fileSrc):
    # main variables to be returned
    relation = ""       # relation
    attributes = []     # attribute list
    rawData = []        # main data storage
    reverseLookup = {}  # store by value for reverse lookup
    continuousVariables = {}
    categoricalVariables = {}
    dataFile = codecs.open(fileSrc, 'rb', 'utf-8')  # specify utf-8 encoding
    print "Reading file..."
    lines = dataFile.readlines()  # read all lines
    if settings.PROGRESS_BAR:
        util.updateProgress(0)  # create a progress bar
    # test every line and extract its relevant information
    for idx, line in enumerate(lines):  # test each line
        if settings.PROGRESS_BAR:
            util.updateProgress(float(idx) / float(len(lines)))
        if not line.strip():  # skip blank lines
            continue
        if line[0] == '%':  # ignore comments
            continue
        elif line[0] == '@':  # metadata
            if '@relation' in line:  # relation
                arrayLine = line.split(" ")
                relation = arrayLine[1]
            elif "@attribute" in line:  # attribute
                arrayLine = line.split(" ")
                attributes.append([arrayLine[1]])
                if "real" not in arrayLine[2]:  # attribute is categorical, not real
                    attrs = re.search('\{(.*?)\}', line).group()  # select text between brackets
                    attrs = re.sub('[\{\}]', "", attrs)           # remove brackets
                    newAttrs = attrs.split(", ")
                    options = []
                    for attr in newAttrs:
                        options.append(attr)
                    attributes[len(attributes) - 1].append(options)
                else:  # attribute is real
                    attributes[len(attributes) - 1].append('real')
        elif line[0] == " ":
            continue
        else:  # data row
            line = line.replace(" ", "")
            line = line.replace("\n", "")
            line = line.split(",")
            newDataEntry = {}  # create a new object to store our row data
            for col, value in enumerate(line):  # for every column of data ('col' avoids shadowing the outer idx)
                attribute = attributes[col]
                if util.isNumber(value):  # convert string to float if it's a number
                    value = float(value)
                # add value to our reverse lookup under the key "attributeName attributeValue"
                rlKey = attribute[0] + " " + str(value)
                if rlKey in reverseLookup:
                    reverseLookup[rlKey].append(len(rawData))  # append index of our current row for quick lookup later
                else:
                    reverseLookup[rlKey] = [len(rawData)]  # create a new index list if one does not already exist
                # fill our new data entry
                newDataEntry[attribute[0]] = value  # store the value under its proper key
                # add variables to our bins
                if attribute[1] == 'real':  # real attributes go into a continuous bin
                    if attribute[0] in continuousVariables:
                        continuousVariables[attribute[0]].add(value, line[len(line) - 1])
                    else:
                        continuousVariables[attribute[0]] = util.continuousBin(attribute[0])
                        continuousVariables[attribute[0]].add(value, line[len(line) - 1])
                else:  # categorical attributes go into a categorical bin
                    if attribute[0] in categoricalVariables:
                        categoricalVariables[attribute[0]].add(value, line[len(line) - 1])
                    else:
                        categoricalVariables[attribute[0]] = util.categoricalBin(attribute[1])
                        categoricalVariables[attribute[0]].add(value, line[len(line) - 1])
            rawData.append(newDataEntry)  # append data entry to all of our data
    # END OF FOR LOOP
    results = {}
    results['data'] = rawData
    results['attributes'] = attributes
    results['relation'] = relation
    results['lookup'] = reverseLookup
    results['continuousVariables'] = continuousVariables
    results['categoricalVariables'] = categoricalVariables
    if settings.PROGRESS_BAR:
        util.updateProgress(1)
    print "\nFile read complete \n"
    return results
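
# Hedged usage sketch; 'weather.arff' is an assumed file name, and the
# fragment below shows the shape of input this reader expects ('@data' and
# any other unrecognized '@' lines are simply skipped):
#
#   @relation weather
#   @attribute outlook {sunny, overcast, rainy}
#   @attribute temperature real
#   @attribute class {yes, no}
#   @data
#   sunny, 85, no
#
# results = readArff('weather.arff')
# results['attributes'] -> [['outlook', ['sunny', 'overcast', 'rainy']],
#                           ['temperature', 'real'], ['class', ['yes', 'no']]]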
def optimize(self, X, Y):
    """ Optimizes the number of trees (estimators) and max features used
        (features) and returns the best values, according to the OOB criteria

        The results are shown in a diagnostic plot

        To avoid including many trees to produce tiny improvements, increments
        of OOB error below 0.01 are considered irrelevant
    """
    RANDOM_STATE = 1226
    errors = {}
    features = ['sqrt', 'log2', 'none']

    if self.quantitative:
        tclf = {'sqrt': RandomForestRegressor(warm_start=False, oob_score=True,
                                              max_features="sqrt", random_state=RANDOM_STATE),
                'log2': RandomForestRegressor(warm_start=False, oob_score=True,
                                              max_features="log2", random_state=RANDOM_STATE),
                'none': RandomForestRegressor(warm_start=False, oob_score=True,
                                              max_features=None, random_state=RANDOM_STATE)}
    else:
        tclf = {'sqrt': RandomForestClassifier(warm_start=False, oob_score=True,
                                               max_features="sqrt", random_state=RANDOM_STATE,
                                               class_weight=self.class_weight),
                'log2': RandomForestClassifier(warm_start=False, oob_score=True,
                                               max_features="log2", random_state=RANDOM_STATE,
                                               class_weight=self.class_weight),
                'none': RandomForestClassifier(warm_start=False, oob_score=True,
                                               max_features=None, random_state=RANDOM_STATE,
                                               class_weight=self.class_weight)}

    # Range of `n_estimators` values to explore.
    min_estimators = 15
    max_estimators = 700
    stp_estimators = 100
    num_steps = int((max_estimators - min_estimators) / stp_estimators)

    print 'optimizing RF....'
    updateProgress(0.0)

    optValue = 1.0e10
    j = 0
    for fi in features:
        errors[fi] = []
        count = 0
        for i in range(min_estimators, max_estimators + 1, stp_estimators):
            clf = tclf[fi]
            clf.set_params(n_estimators=i)
            clf.fit(X, Y)
            oob_error = 1 - clf.oob_score_
            errors[fi].append((i, oob_error))
            if oob_error < optValue:
                if np.abs(oob_error - optValue) > 0.01:
                    optValue = oob_error
                    optEstimators = i
                    optFeatures = fi
            updateProgress(float(count + (j * num_steps)) / float(len(features) * num_steps))
            count = count + 1
        j = j + 1

    for ie in errors:
        xs, ys = zip(*errors[ie])
        plt.plot(xs, ys, label=ie)
    plt.xlim(min_estimators, max_estimators)
    plt.xlabel("n_estimators (Trees)")
    plt.ylabel("OOB error rate")
    plt.legend(loc="upper right")
    # save before show(); otherwise the saved figures can come out blank
    plt.savefig(self.vpath + "/rf-OOB-parameter-tuning.png")
    plt.savefig("./rf-OOB-parameter-tuning.png")
    plt.show()

    print 'optimum features:', optFeatures, 'optimum estimators:', optEstimators, 'best OOB:', optValue

    return (optEstimators, optFeatures)
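
# A possible refinement, not part of the original code: with warm_start=True
# a forest can be grown incrementally across the n_estimators grid instead of
# being refit from scratch at every step; a sketch under that assumption.
from sklearn.ensemble import RandomForestClassifier

def oob_curve(X, Y, random_state=1226):
    clf = RandomForestClassifier(warm_start=True, oob_score=True,
                                 max_features="sqrt", random_state=random_state)
    curve = []
    for n in range(15, 701, 100):
        clf.set_params(n_estimators=n)
        clf.fit(X, Y)                    # adds trees to the existing forest
        curve.append((n, 1 - clf.oob_score_))
    return curve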
def varSelectionFFD(self, X, Y, A, autoscale=False, gui=True):
    # TODO: set dummyStep and ratio as tunable parameters
    dummyStep = 4.0
    ratio = 2.0

    # TODO: check the number of X variables. FFD is not suitable for very large X matrices

    # build a reduced matrix Xb, dropping (near-)constant variables
    nobj, nvarx = np.shape(X)
    nvarxOri = nvarx
    index = np.ones(nvarx, dtype=np.int)
    st = np.std(X, axis=0, ddof=1)
    for i in range(nvarx):
        if st[i] < 1e-10:
            index[i] = 0  # set to 0 to allow creation of reduced matrices
    nvarxb = np.sum(index)
    #print index
    Xb = np.empty((nobj, nvarxb), dtype=np.float64)
    k = 0
    for i in range(nvarx):
        if index[i] > 0:
            Xb[:, k] = X[:, i]
            k += 1
    nobj, nvarx = np.shape(Xb)

    ndummy = int(np.floor(nvarx / dummyStep))  # number of dummy variables
    nvarxm = nvarx + ndummy                    # length of expanded vector
    ncomb, design = generateDesignFFD(nvarxm, ratio)
    # ncomb is the number of reduced models to be generated
    # design is the matrix that designates whether every x variable
    # is in/out of the design matrix
    # print nvarx, ndummy, nvarxm, ncomb

    # obtain first estimation of Y std error
    SSY0 = 0.0
    for i in range(nobj):
        SSY0 += np.square(Y[i] - np.mean(Y))
    SDEP0 = np.sqrt(SSY0 / float(nobj))
    SDEP0x10 = 10.0 * SDEP0

    # initialize effects
    effect = np.zeros(nvarxm, dtype=np.float64)
    xdesign = np.zeros(nvarx, dtype=np.int)

    # set common model stuff
    self.autoscale = autoscale
    self.Y = Y.copy()

    if gui:
        updateProgress(0.0)

    for i in range(ncomb):
        # extract x design line (not considering dummies)
        k = 0
        for j in range(nvarxm):
            if j % (dummyStep + 1):  # non-dummy var
                xdesign[k] = design[i][j]
                k += 1
        nvarxr = int(np.sum(xdesign > 0))
        # if this design line contains few x vars, skip the model validation
        if nvarxr <= (A + 1):
            continue
        # build a X reduced matrix Xr
        Xr = np.empty((nobj, nvarxr), dtype=np.float64)
        k = 0
        for j in range(nvarx):
            if xdesign[j] > 0:
                Xr[:, k] = Xb[:, j]
                k += 1
        # set the reduced matrix as model matrix and validate
        self.X = Xr.copy()
        self.validateLOO(A)
        # accumulate the min SDEP to an effect vector for every variable (including dummies)
        minSDEP = 2.0e10
        for a in self.SDEP:
            if a < minSDEP:
                minSDEP = a
        if minSDEP > SDEP0x10:
            minSDEP = SDEP0
        effect += design[i] * minSDEP
        if gui:
            updateProgress(float(i) / float(ncomb))

    # calculate effects
    effect /= (ncomb / 2)

    # compute dummy effects
    dummyEffect = 0.00
    dummyMean = 0.00
    k = 0
    for i in range(nvarxm):
        if not (i % (dummyStep + 1)):  # dummy var
            dummyMean += effect[i]
    dummyMean /= ndummy
    for i in range(nvarxm):
        if not (i % (dummyStep + 1)):  # dummy var
            dummyEffect += np.square(effect[i] - dummyMean)
            ##dummyEffect += np.square(effect[i])  ## old version: assuming mean of zero (?)
            ##td += 1
        else:
            effect[k] = effect[i]
            k += 1
    if dummyEffect > 1e-6:
        dummySD = np.sqrt(dummyEffect / ndummy)
    else:
        dummySD = 0.001

    # compare with critical T values (two tail, 95%)
    t = stats.t.ppf(0.9725, ndummy - 1)
    effectCutoff = t * dummySD

    res = np.ones(nvarx, dtype=np.int)  # fixed (default)
    for i in range(nvarx):
        if np.abs(effect[i]) < effectCutoff:  # uncertain
            res[i] = 2
        elif effect[i] > 0:
            res[i] = 0  # excluded
    #print res

    # map the result onto a vector representing the full, original X
    resExp = np.ones(nvarxOri, dtype=np.int)
    k = 0
    for i in range(nvarxOri):
        if index[i] == 0:
            resExp[i] = 0  # these were already excluded or are inactive variables
        else:
            resExp[i] = res[k]
            k += 1

    return resExp, np.sum(res == 0)
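
# Hedged illustration of the dummy-effect cutoff used above: the dummy
# variables estimate the noise level of the effects, and any effect inside
# the two-tailed t band is flagged "uncertain" (res == 2). The values below
# are invented; note that stats.t.ppf(0.975, df) is the exact two-tailed 95%
# quantile (the code above uses 0.9725).
import numpy as np
from scipy import stats

dummy_effects = np.array([0.02, -0.01, 0.03])
sd = np.sqrt(np.sum(np.square(dummy_effects - dummy_effects.mean())) / len(dummy_effects))
cutoff = stats.t.ppf(0.975, len(dummy_effects) - 1) * sd
print 'effects below', cutoff, 'are uncertain'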
def validateLOO(self, A, gui=False):
    """ Validates A dimensions of an already built PLS model, using
        Leave-One-Out cross-validation

        Returns the matrix of observed and predicted Y values; the results
        of the cv (SSY, SDEP and Q2) are stored internally
    """
    if self.X is None or self.Y is None:  # 'is None' avoids elementwise comparison on arrays
        return
    X = self.X
    Y = self.Y
    nobj, nvarx = np.shape(X)
    SSY0 = 0.0
    for i in range(nobj):
        SSY0 += np.square(Y[i] - np.mean(Y))
    SSY = np.zeros(A, dtype=np.float64)
    YP = np.zeros((nobj, A + 1), dtype=np.float64)
    if gui:
        updateProgress(0.0)
    for i in range(nobj):
        # build reduced X and Y matrices removing i object
        Xr = np.delete(X, i, axis=0)
        Yr = np.delete(Y, i)
        Xr, muxr = center(Xr)
        Xr, wgxr = scale(Xr, self.autoscale)
        Yr, muyr = center(Yr)
        xp = np.copy(X[i, :])
        xp -= muxr
        xp *= wgxr
        # predict y for the i object, using A LV
        yp = self.getLOO(Xr, Yr, xp, A)
        yp += muyr
        # update SSY with the object i errors
        YP[i, 0] = Y[i]
        for a in range(A):
            SSY[a] += np.square(yp[a] - Y[i])
            YP[i, a + 1] = yp[a]
        if gui:
            updateProgress(float(i) / float(nobj))
    if gui:
        print
    self.SSY = SSY
    self.SDEP = [np.sqrt(i / nobj) for i in SSY]
    self.Q2 = [1.00 - (i / SSY0) for i in SSY]
    self.Av = A
    return (YP)
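
# For reference, the statistics stored above follow the usual PLS
# cross-validation definitions, where SSY_a accumulates the squared LOO
# prediction error for each number of latent variables a:
#   SDEP_a = sqrt(SSY_a / nobj)
#   Q2_a   = 1 - SSY_a / SSY0,   with SSY0 = sum_i (y_i - mean(y))^2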