Beispiel #1
0
 def convertContinuousToCategorical(self, attrName, splits):
     """Replace a continuous attribute with a binned (categorical) one.

     One label is built per interval delimited by ``splits``:
     ``"<=s0"``, ``"s0< x <=s1"``, ..., ``">sLast"``. Every row that holds
     a value of ``attrName`` is rewritten to carry its interval label, the
     reverse lookup is re-keyed from ``"attrName value"`` to
     ``"attrName label"``, and the attribute is moved from
     ``self.continuousVariables`` to ``self.categoricalVariables``.

     Args:
         attrName: name of the continuous attribute to convert.
         splits: non-empty sequence of split points in ascending order
             (the labels and the bin search both rely on that order).

     Raises:
         IndexError: if ``splits`` is empty.
         KeyError: if ``attrName`` is not in ``self.continuousVariables``.
     """
     from bisect import bisect_left

     # CREATE A LIST OF SPLIT TYPES (one label per interval)
     splitTypes = ["<=" + str(splits[0])]
     for idx in range(1, len(splits)):
         splitTypes.append(str(splits[idx - 1]) + "< x <=" + str(splits[idx]))
     splitTypes.append(">" + str(splits[-1]))

     # UPDATE ATTRIBUTES TO REFLECT CHANGES
     for attr in self.attributes:
         if attr[0] == attrName:
             attr[1] = splitTypes

     newBin = util.categoricalBin(splitTypes)  # bin that stores the new labels
     for contVar in self.continuousVariables[attrName].getValues():
         # UPDATE REVERSE LOOKUP
         rlKey = attrName + " " + str(float(contVar))
         if rlKey not in self.lookup:
             # No rows under this key (e.g. a repeated value whose key was
             # already migrated and popped below).
             continue
         # bisect_left returns the first index i with contVar <= splits[i],
         # or len(splits) when contVar exceeds every split; that is exactly
         # the position of the matching label in splitTypes. The previous
         # code compared contVar against the [split, index] list itself and
         # advanced at most one interval per value, so the ">" bin was never
         # selected and values could land one or more bins too low.
         splitKey = splitTypes[bisect_left(splits, contVar)]
         newrlKey = attrName + " " + splitKey
         for userId in self.lookup[rlKey]:  # every row holding this value
             self.data[userId][attrName] = splitKey
             self.lookup.setdefault(newrlKey, []).append(userId)
             newBin.add(splitKey,
                        self.data[userId][settings.CLASSIFIER_NAME])
         self.lookup.pop(rlKey)

     self.continuousVariables.pop(attrName)
     self.categoricalVariables[attrName] = newBin
Beispiel #2
0
	def convertContinuousToCategorical(self, attrName, splits):
		"""Replace a continuous attribute with a binned (categorical) one.

		One label is built per interval delimited by ``splits``:
		``"<=s0"``, ``"s0< x <=s1"``, ..., ``">sLast"``. Every row that holds
		a value of ``attrName`` is rewritten to carry its interval label, the
		reverse lookup is re-keyed from ``"attrName value"`` to
		``"attrName label"``, and the attribute is moved from
		``self.continuousVariables`` to ``self.categoricalVariables``.

		Args:
			attrName: name of the continuous attribute to convert.
			splits: non-empty sequence of split points in ascending order
				(the labels and the bin search both rely on that order).

		Raises:
			IndexError: if ``splits`` is empty.
			KeyError: if ``attrName`` is not in ``self.continuousVariables``.
		"""
		from bisect import bisect_left

		# CREATE A LIST OF SPLIT TYPES (one label per interval)
		splitTypes = ["<=" + str(splits[0])]
		for idx in range(1, len(splits)):
			splitTypes.append(str(splits[idx - 1]) + "< x <=" + str(splits[idx]))
		splitTypes.append(">" + str(splits[-1]))

		# UPDATE ATTRIBUTES TO REFLECT CHANGES
		for attr in self.attributes:
			if attr[0] == attrName:
				attr[1] = splitTypes

		newBin = util.categoricalBin(splitTypes)	# bin that stores the new labels
		for contVar in self.continuousVariables[attrName].getValues():
			# UPDATE REVERSE LOOKUP
			rlKey = attrName + " " + str(float(contVar))
			if rlKey not in self.lookup:
				# No rows under this key (e.g. a repeated value whose key was
				# already migrated and popped below).
				continue
			# bisect_left returns the first index i with contVar <= splits[i],
			# or len(splits) when contVar exceeds every split; that is exactly
			# the position of the matching label in splitTypes. The previous
			# code compared contVar against the [split, index] list itself and
			# advanced at most one interval per value, so the ">" bin was never
			# selected and values could land one or more bins too low.
			splitKey = splitTypes[bisect_left(splits, contVar)]
			newrlKey = attrName + " " + splitKey
			for userId in self.lookup[rlKey]:	# every row holding this value
				self.data[userId][attrName] = splitKey
				self.lookup.setdefault(newrlKey, []).append(userId)
				newBin.add(splitKey, self.data[userId][settings.CLASSIFIER_NAME])
			self.lookup.pop(rlKey)

		self.continuousVariables.pop(attrName)
		self.categoricalVariables[attrName] = newBin
def readArff(fileSrc):
	"""Read an ARFF file and index its contents.

	Lines starting with '%' or a space are skipped, as are blank lines.
	'@relation' and '@attribute' lines populate the metadata; any other
	'@' line is ignored. Every remaining line is parsed as one
	comma-separated data row whose last column is passed to the bins as
	the row's label.

	Args:
		fileSrc: path of the ARFF file; it is decoded as UTF-8.

	Returns:
		A dict with the keys:
		'data' -- list of rows, each a dict of attribute name -> value
			(numeric strings are converted to float);
		'attributes' -- list of [name, 'real'] or [name, [option, ...]];
		'relation' -- second space-separated token of the '@relation'
			line, exactly as split (it keeps a trailing line terminator
			when it is the last token on the line);
		'lookup' -- dict of "name value" -> list of row indices;
		'continuousVariables' / 'categoricalVariables' -- dicts of
			attribute name -> util.continuousBin / util.categoricalBin.

	Raises:
		AttributeError: if a non-'real' attribute declaration has no
			'{...}' option list.
		IndexError: if a data row has more columns than declared attributes.
	"""
	# main variables to be returned
	relation = ""
	attributes = []				# attribute list
	rawData = []				# main data storage
	reverseLookup = {}			# store by value for reverse lookup
	continuousVariables = {}
	categoricalVariables = {}

	# The with-block guarantees the handle is closed, even if reading fails.
	with codecs.open(fileSrc, 'rb', 'utf-8') as dataFile:
		print("Reading file...")
		lines = dataFile.readlines()

	showProgress = settings.PROGRESS_BAR == True
	if showProgress:
		util.updateProgress(0)	# create a progress bar
	totalLines = float(len(lines))

	# test every line and extract its relevant information
	for lineNo, line in enumerate(lines):
		if showProgress:
			util.updateProgress(float(lineNo) / totalLines)
		if not line.strip():
			# Blank lines used to be parsed as a one-column row with an
			# empty value; they carry no data, so skip them.
			continue
		marker = line[0]
		if marker == '%' or marker == " ":	# comment or indented line
			continue
		if marker == '@':					# metadata
			if '@relation' in line:
				relation = line.split(" ")[1]
			elif "@attribute" in line:
				arrayLine = line.split(" ")
				attributes.append([arrayLine[1]])
				if "real" not in arrayLine[2]:	# categorical attribute
					attrs = re.search(r'\{(.*?)\}', line).group()	# text between braces
					attrs = re.sub(r'[\{\}]', "", attrs)			# remove braces
					attributes[-1].append(attrs.split(", "))
				else:
					attributes[-1].append('real')
			continue

		# Data row. '\r' is removed as well so that files with CRLF line
		# endings do not leave it glued to the last column (the label).
		fields = line.replace(" ", "").replace("\n", "").replace("\r", "").split(",")
		label = fields[-1]
		rowIndex = len(rawData)		# index this row will get in rawData
		newDataEntry = {}
		for column, value in enumerate(fields):
			attribute = attributes[column]
			name = attribute[0]
			if util.isNumber(value):	# convert string to float if it's a number
				value = float(value)
			# reverse lookup is keyed by "attributeName attributeValue"
			reverseLookup.setdefault(name + " " + str(value), []).append(rowIndex)
			newDataEntry[name] = value
			# add the value to its bin, creating the bin on first use
			if attribute[1] == 'real':
				if name not in continuousVariables:
					continuousVariables[name] = util.continuousBin(name)
				continuousVariables[name].add(value, label)
			else:
				if name not in categoricalVariables:
					categoricalVariables[name] = util.categoricalBin(attribute[1])
				categoricalVariables[name].add(value, label)
		rawData.append(newDataEntry)

	results = {}
	results['data'] = rawData
	results['attributes'] = attributes
	results['relation'] = relation
	results['lookup'] = reverseLookup
	results['continuousVariables'] = continuousVariables
	results['categoricalVariables'] = categoricalVariables
	if showProgress:
		util.updateProgress(1)
	print("\nFile read complete \n")
	return results
Beispiel #4
0
def readArff(fileSrc):
    """Read an ARFF file and index its contents.

    Lines starting with '%' or a space are skipped, as are blank lines.
    '@relation' and '@attribute' lines populate the metadata; any other
    '@' line is ignored. Every remaining line is parsed as one
    comma-separated data row whose last column is passed to the bins as
    the row's label.

    Args:
        fileSrc: path of the ARFF file; it is decoded as UTF-8.

    Returns:
        A dict with the keys:
        'data' -- list of rows, each a dict of attribute name -> value
            (numeric strings are converted to float);
        'attributes' -- list of [name, 'real'] or [name, [option, ...]];
        'relation' -- second space-separated token of the '@relation'
            line, exactly as split (it keeps a trailing line terminator
            when it is the last token on the line);
        'lookup' -- dict of "name value" -> list of row indices;
        'continuousVariables' / 'categoricalVariables' -- dicts of
            attribute name -> util.continuousBin / util.categoricalBin.

    Raises:
        AttributeError: if a non-'real' attribute declaration has no
            '{...}' option list.
        IndexError: if a data row has more columns than declared attributes.
    """
    # main variables to be returned
    relation = ""
    attributes = []  # attribute list
    rawData = []  # main data storage
    reverseLookup = {}  # store by value for reverse lookup
    continuousVariables = {}
    categoricalVariables = {}

    # The with-block guarantees the handle is closed, even if reading fails.
    with codecs.open(fileSrc, 'rb', 'utf-8') as dataFile:
        print("Reading file...")
        lines = dataFile.readlines()

    showProgress = settings.PROGRESS_BAR == True
    if showProgress:
        util.updateProgress(0)  # create a progress bar
    totalLines = float(len(lines))

    # test every line and extract its relevant information
    for lineNo, line in enumerate(lines):
        if showProgress:
            util.updateProgress(float(lineNo) / totalLines)
        if not line.strip():
            # Blank lines used to be parsed as a one-column row with an
            # empty value; they carry no data, so skip them.
            continue
        marker = line[0]
        if marker == '%' or marker == " ":  # comment or indented line
            continue
        if marker == '@':  # metadata
            if '@relation' in line:
                relation = line.split(" ")[1]
            elif "@attribute" in line:
                arrayLine = line.split(" ")
                attributes.append([arrayLine[1]])
                if "real" not in arrayLine[2]:  # categorical attribute
                    # select the text between braces, then drop the braces
                    attrs = re.search(r'\{(.*?)\}', line).group()
                    attrs = re.sub(r'[\{\}]', "", attrs)
                    attributes[-1].append(attrs.split(", "))
                else:
                    attributes[-1].append('real')
            continue

        # Data row. '\r' is removed as well so that files with CRLF line
        # endings do not leave it glued to the last column (the label).
        fields = (line.replace(" ", "").replace("\n", "")
                  .replace("\r", "").split(","))
        label = fields[-1]
        rowIndex = len(rawData)  # index this row will get in rawData
        newDataEntry = {}
        for column, value in enumerate(fields):
            attribute = attributes[column]
            name = attribute[0]
            if util.isNumber(value):  # convert string to float if numeric
                value = float(value)
            # reverse lookup is keyed by "attributeName attributeValue"
            reverseLookup.setdefault(name + " " + str(value),
                                     []).append(rowIndex)
            newDataEntry[name] = value
            # add the value to its bin, creating the bin on first use
            if attribute[1] == 'real':
                if name not in continuousVariables:
                    continuousVariables[name] = util.continuousBin(name)
                continuousVariables[name].add(value, label)
            else:
                if name not in categoricalVariables:
                    categoricalVariables[name] = util.categoricalBin(
                        attribute[1])
                categoricalVariables[name].add(value, label)
        rawData.append(newDataEntry)

    results = {}
    results['data'] = rawData
    results['attributes'] = attributes
    results['relation'] = relation
    results['lookup'] = reverseLookup
    results['continuousVariables'] = continuousVariables
    results['categoricalVariables'] = categoricalVariables
    if showProgress:
        util.updateProgress(1)
    print("\nFile read complete \n")
    return results