def convertContinuousToCategorical(self, attrName, splits):
    """Replace a continuous attribute with categorical bins built from split points.

    Each split point is the inclusive upper bound of one bin, so ``splits``
    of ``[a, b]`` yields the labels ``"<=a"`` and ``"a< x <=b"``. There is no
    open-ended ``">last"`` bin: values above the last split point are placed
    in the last bin.

    The method rewrites, in place:
      * the matching entry of ``self.attributes`` (its type becomes the list
        of bin labels),
      * ``self.data[userId][attrName]`` for every affected row,
      * ``self.lookup`` (per-value keys are replaced by per-bin keys),
      * ``self.continuousVariables`` / ``self.categoricalVariables`` (the
        attribute moves from the former to the latter).

    Args:
        attrName: name of the continuous attribute to convert; must be a key
            of ``self.continuousVariables``.
        splits: non-empty sequence of split points. Assumed to be in
            ascending order -- the labels and the bin search both rely on it.

    Raises:
        IndexError: if ``splits`` is empty.
        KeyError: if ``attrName`` is not in ``self.continuousVariables``.
    """
    from bisect import bisect_left

    # Fail before touching any state, so a bad call leaves the object intact.
    if len(splits) == 0:
        raise IndexError("splits must contain at least one split point")
    continuousBin = self.continuousVariables[attrName]

    # One label per split point; the first bin has no lower bound.
    splitTypes = ["<=" + str(splits[0])]
    for lower, upper in zip(splits, splits[1:]):
        splitTypes.append(str(lower) + "< x <=" + str(upper))
    lastBin = len(splitTypes) - 1

    # The attribute's declared type becomes the list of bin labels.
    for attr in self.attributes:
        if attr[0] == attrName:
            attr[1] = splitTypes

    newBin = util.categoricalBin(splitTypes)

    for contVar in continuousBin.getValues():
        rlKey = attrName + " " + str(float(contVar))
        if rlKey not in self.lookup:
            # Either no row carries this value, or an earlier occurrence of
            # the same value has already been migrated and popped.
            continue

        # First bin whose upper bound is >= the value. Computed per value
        # (instead of stepping a cursor by one) so a value that jumps past
        # several split points still lands in the right bin, regardless of
        # the order in which getValues() yields the values.
        splitKey = splitTypes[min(bisect_left(splits, contVar), lastBin)]
        newrlKey = attrName + " " + splitKey

        for userId in self.lookup[rlKey]:
            self.data[userId][attrName] = splitKey
            self.lookup.setdefault(newrlKey, []).append(userId)
            newBin.add(splitKey, self.data[userId][settings.CLASSIFIER_NAME])

        # The per-value key is obsolete once its rows live under the bin key.
        self.lookup.pop(rlKey)

    self.continuousVariables.pop(attrName)
    self.categoricalVariables[attrName] = newBin
def convertContinuousToCategorical(self, attrName, splits):
    """Replace a continuous attribute with categorical bins built from split points.

    Each split point is the inclusive upper bound of one bin, so ``splits``
    of ``[a, b]`` yields the labels ``"<=a"`` and ``"a< x <=b"``. There is no
    open-ended ``">last"`` bin: values above the last split point are placed
    in the last bin.

    The method rewrites, in place:
      * the matching entry of ``self.attributes`` (its type becomes the list
        of bin labels),
      * ``self.data[userId][attrName]`` for every affected row,
      * ``self.lookup`` (per-value keys are replaced by per-bin keys),
      * ``self.continuousVariables`` / ``self.categoricalVariables`` (the
        attribute moves from the former to the latter).

    Args:
        attrName: name of the continuous attribute to convert; must be a key
            of ``self.continuousVariables``.
        splits: non-empty sequence of split points. Assumed to be in
            ascending order -- the labels and the bin search both rely on it.

    Raises:
        IndexError: if ``splits`` is empty.
        KeyError: if ``attrName`` is not in ``self.continuousVariables``.
    """
    from bisect import bisect_left

    # Fail before touching any state, so a bad call leaves the object intact.
    if len(splits) == 0:
        raise IndexError("splits must contain at least one split point")
    continuousBin = self.continuousVariables[attrName]

    # One label per split point; the first bin has no lower bound.
    splitTypes = ["<=" + str(splits[0])]
    for lower, upper in zip(splits, splits[1:]):
        splitTypes.append(str(lower) + "< x <=" + str(upper))
    lastBin = len(splitTypes) - 1

    # The attribute's declared type becomes the list of bin labels.
    for attr in self.attributes:
        if attr[0] == attrName:
            attr[1] = splitTypes

    newBin = util.categoricalBin(splitTypes)

    for contVar in continuousBin.getValues():
        rlKey = attrName + " " + str(float(contVar))
        if rlKey not in self.lookup:
            # Either no row carries this value, or an earlier occurrence of
            # the same value has already been migrated and popped.
            continue

        # First bin whose upper bound is >= the value. Computed per value
        # (instead of stepping a cursor by one) so a value that jumps past
        # several split points still lands in the right bin, regardless of
        # the order in which getValues() yields the values.
        splitKey = splitTypes[min(bisect_left(splits, contVar), lastBin)]
        newrlKey = attrName + " " + splitKey

        for userId in self.lookup[rlKey]:
            self.data[userId][attrName] = splitKey
            self.lookup.setdefault(newrlKey, []).append(userId)
            newBin.add(splitKey, self.data[userId][settings.CLASSIFIER_NAME])

        # The per-value key is obsolete once its rows live under the bin key.
        self.lookup.pop(rlKey)

    self.continuousVariables.pop(attrName)
    self.categoricalVariables[attrName] = newBin
def readArff(fileSrc):
    """Parse an ARFF file into rows, attribute metadata and lookup structures.

    Only two attribute kinds are handled: those whose type token contains
    ``real`` (stored as continuous) and nominal ``{a, b, c}`` lists (stored
    as categorical). Any other ``@attribute`` declaration fails on the brace
    search. Comment lines (``%``), blank lines and lines starting with a
    space are skipped; ``@`` lines other than ``@relation``/``@attribute``
    are ignored.

    Args:
        fileSrc: path of the ARFF file; it is decoded as UTF-8.

    Returns:
        A dict with the keys:
          * ``'data'``: list with one dict per data row, mapping attribute
            name to value (a float when ``util.isNumber`` accepts the text,
            otherwise the string with spaces removed).
          * ``'attributes'``: list of ``[name, type]`` where ``type`` is
            ``'real'`` or the list of nominal option strings.
          * ``'relation'``: second space-separated token of the
            ``@relation`` line, exactly as read (any trailing line ending is
            kept).
          * ``'lookup'``: dict mapping ``"<name> <value>"`` to the list of
            row indices holding that value.
          * ``'continuousVariables'`` / ``'categoricalVariables'``: dicts of
            ``util.continuousBin`` / ``util.categoricalBin`` per attribute,
            fed with ``(value, last column of the row)``.
    """
    relation = ""
    attributes = []
    rawData = []
    reverseLookup = {}  # "<name> <value>" -> row indices
    continuousVariables = {}
    categoricalVariables = {}

    # The context manager guarantees the handle is closed even if decoding
    # fails part-way through the read.
    with codecs.open(fileSrc, 'rb', 'utf-8') as dataFile:
        print("Reading file...")
        lines = dataFile.readlines()

    # Same test as before: only a value equal to True enables the bar.
    showProgress = settings.PROGRESS_BAR == True
    if showProgress:
        util.updateProgress(0)

    lineCount = len(lines)
    for lineNo, line in enumerate(lines):
        if showProgress:
            util.updateProgress(float(lineNo) / float(lineCount))

        # Blank lines are common in ARFF files; without this guard they
        # would be parsed as a one-column data row.
        if not line.strip():
            continue
        if line[0] == '%':  # comment
            continue

        if line[0] == '@':  # metadata
            if '@relation' in line:
                relation = line.split(" ")[1]
            elif "@attribute" in line:
                arrayLine = line.split(" ")
                if "real" in arrayLine[2]:
                    attributes.append([arrayLine[1], 'real'])
                else:
                    # Nominal attribute: take the text between the braces
                    # and split it into its options.
                    braced = re.search(r'\{(.*?)\}', line).group()
                    options = re.sub(r'[\{\}]', "", braced).split(", ")
                    attributes.append([arrayLine[1], options])
            continue

        if line[0] == " ":
            continue

        # Data row. "\r" is removed as well so that files with CRLF line
        # endings do not carry it into the last column (the class label).
        fields = (line.replace(" ", "")
                      .replace("\n", "")
                      .replace("\r", "")
                      .split(","))
        classLabel = fields[-1]
        rowIndex = len(rawData)
        newDataEntry = {}

        for column, value in enumerate(fields):
            attribute = attributes[column]
            name = attribute[0]
            if util.isNumber(value):
                value = float(value)

            reverseLookup.setdefault(name + " " + str(value), []).append(rowIndex)
            newDataEntry[name] = value

            if attribute[1] == 'real':
                if name not in continuousVariables:
                    continuousVariables[name] = util.continuousBin(name)
                continuousVariables[name].add(value, classLabel)
            else:
                if name not in categoricalVariables:
                    categoricalVariables[name] = util.categoricalBin(attribute[1])
                categoricalVariables[name].add(value, classLabel)

        rawData.append(newDataEntry)

    results = {
        'data': rawData,
        'attributes': attributes,
        'relation': relation,
        'lookup': reverseLookup,
        'continuousVariables': continuousVariables,
        'categoricalVariables': categoricalVariables,
    }

    if showProgress:
        util.updateProgress(1)
    print("\nFile read complete \n")
    return results
def readArff(fileSrc):
    """Parse an ARFF file into rows, attribute metadata and lookup structures.

    Only two attribute kinds are handled: those whose type token contains
    ``real`` (stored as continuous) and nominal ``{a, b, c}`` lists (stored
    as categorical). Any other ``@attribute`` declaration fails on the brace
    search. Comment lines (``%``), blank lines and lines starting with a
    space are skipped; ``@`` lines other than ``@relation``/``@attribute``
    are ignored.

    Args:
        fileSrc: path of the ARFF file; it is decoded as UTF-8.

    Returns:
        A dict with the keys:
          * ``'data'``: list with one dict per data row, mapping attribute
            name to value (a float when ``util.isNumber`` accepts the text,
            otherwise the string with spaces removed).
          * ``'attributes'``: list of ``[name, type]`` where ``type`` is
            ``'real'`` or the list of nominal option strings.
          * ``'relation'``: second space-separated token of the
            ``@relation`` line, exactly as read (any trailing line ending is
            kept).
          * ``'lookup'``: dict mapping ``"<name> <value>"`` to the list of
            row indices holding that value.
          * ``'continuousVariables'`` / ``'categoricalVariables'``: dicts of
            ``util.continuousBin`` / ``util.categoricalBin`` per attribute,
            fed with ``(value, last column of the row)``.
    """
    relation = ""
    attributes = []
    rawData = []
    reverseLookup = {}  # "<name> <value>" -> row indices
    continuousVariables = {}
    categoricalVariables = {}

    # The context manager guarantees the handle is closed even if decoding
    # fails part-way through the read.
    with codecs.open(fileSrc, 'rb', 'utf-8') as dataFile:
        print("Reading file...")
        lines = dataFile.readlines()

    # Same test as before: only a value equal to True enables the bar.
    showProgress = settings.PROGRESS_BAR == True
    if showProgress:
        util.updateProgress(0)

    lineCount = len(lines)
    for lineNo, line in enumerate(lines):
        if showProgress:
            util.updateProgress(float(lineNo) / float(lineCount))

        # Blank lines are common in ARFF files; without this guard they
        # would be parsed as a one-column data row.
        if not line.strip():
            continue
        if line[0] == '%':  # comment
            continue

        if line[0] == '@':  # metadata
            if '@relation' in line:
                relation = line.split(" ")[1]
            elif "@attribute" in line:
                arrayLine = line.split(" ")
                if "real" in arrayLine[2]:
                    attributes.append([arrayLine[1], 'real'])
                else:
                    # Nominal attribute: take the text between the braces
                    # and split it into its options.
                    braced = re.search(r'\{(.*?)\}', line).group()
                    options = re.sub(r'[\{\}]', "", braced).split(", ")
                    attributes.append([arrayLine[1], options])
            continue

        if line[0] == " ":
            continue

        # Data row. "\r" is removed as well so that files with CRLF line
        # endings do not carry it into the last column (the class label).
        fields = (line.replace(" ", "")
                      .replace("\n", "")
                      .replace("\r", "")
                      .split(","))
        classLabel = fields[-1]
        rowIndex = len(rawData)
        newDataEntry = {}

        for column, value in enumerate(fields):
            attribute = attributes[column]
            name = attribute[0]
            if util.isNumber(value):
                value = float(value)

            reverseLookup.setdefault(name + " " + str(value), []).append(rowIndex)
            newDataEntry[name] = value

            if attribute[1] == 'real':
                if name not in continuousVariables:
                    continuousVariables[name] = util.continuousBin(name)
                continuousVariables[name].add(value, classLabel)
            else:
                if name not in categoricalVariables:
                    categoricalVariables[name] = util.categoricalBin(attribute[1])
                categoricalVariables[name].add(value, classLabel)

        rawData.append(newDataEntry)

    results = {
        'data': rawData,
        'attributes': attributes,
        'relation': relation,
        'lookup': reverseLookup,
        'continuousVariables': continuousVariables,
        'categoricalVariables': categoricalVariables,
    }

    if showProgress:
        util.updateProgress(1)
    print("\nFile read complete \n")
    return results