def loadFeature(self, name, file):
    """Load one pickled feature table and attach its values to the raw edges.

    The pickle at ``file`` is expected to hold a mapping from docpair to a
    feature value. Entries whose value is ``None`` are counted but not
    stored. For every other entry the value is recorded under ``name`` on
    the matching edge in ``self.docPairsToRawEdges``; an ``Edge`` is created
    and registered first when the docpair has none yet.

    Args:
        name: feature name under which each value is stored on the edge.
        file: path of the pickle file to read.
    """
    # Single-argument parenthesised prints behave the same on Python 2 and 3.
    print("* loading feature: " + str(file))
    sys.stdout.flush()

    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only load feature files that come from a trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(file, "rb") as fileObj:
        tmp = pickle.load(fileObj)

    rawEdges = self.docPairsToRawEdges
    numNone = 0
    numTotal = 0
    for docpair, val in tmp.items():
        numTotal += 1
        if val is None:
            numNone += 1
            continue

        # Dict membership is a hash lookup; the old `in d.keys()` form built
        # and scanned a list on every iteration under Python 2.
        if docpair in rawEdges:
            rawEdges[docpair].addFeature(name, val)
        else:
            e = Edge(docpair)
            e.addFeature(name, val)
            rawEdges[docpair] = e

    print("total: " + str(numTotal))
    print("none: " + str(numNone))
    sys.stdout.flush()
def normalizeFeatures(self):
    """Min-max normalize every feature and fill in missing values.

    For each name in ``self.featureNames``:

    * collect the raw values present on the edges in
      ``self.docPairsToRawEdges`` and print min / max / avg and the
      number of edges missing the feature;
    * for every raw edge, store ``(value - min) / (max - min)`` on the
      edge for the same docpair in ``self.docPairsToNormEdges`` (creating
      that edge if needed);
    * when a raw edge lacks the feature, pick one of the observed raw
      values at random, store it on the raw edge, and normalize that.

    Edge cases:

    * if every observed value is identical (max == min) the normalized
      value is 0.0 instead of dividing by zero;
    * if no edge carries the feature at all there is nothing to sample
      from, so the feature is reported and skipped.
    """
    rawEdges = self.docPairsToRawEdges
    normEdges = self.docPairsToNormEdges

    for feature in self.featureNames:
        print("----------\nfeature :" + str(feature))
        sys.stdout.flush()

        # Observed raw values; missing entries are later filled by sampling
        # uniformly from this list.
        values = [e.features[feature] for e in rawEdges.values()
                  if feature in e.features]

        # hadValue + missingValue equals the size of self.docPairsToRawEdges
        hadValue = len(values)
        missingValue = len(rawEdges) - hadValue

        if not values:
            # No statistics can be computed and there is nothing to sample.
            print("no values found for this feature; skipping normalization")
            sys.stdout.flush()
            continue

        # Real extrema of the data; fixed sentinels (+/-99999) gave wrong
        # results whenever all values lay outside that range.
        lo = min(values)
        hi = max(values)
        avg = float(sum(values)) / float(hadValue)
        print("min: " + str(lo))
        print("max: " + str(hi))
        print("avg: " + str(avg))
        print(str(missingValue) + " missing values, out of "
              + str(len(rawEdges)) + " total ("
              + str(float(missingValue) / float(len(rawEdges))) + ")")
        sys.stdout.flush()

        denom = hi - lo

        # Second pass: normalize every value and fill in the missing ones
        # (both the raw and the normalized edge receive the filled value).
        for docpair, e in rawEdges.items():
            if feature in e.features:
                rawVal = e.features[feature]
            else:
                # this docpair edge doesn't have the feature; sample one
                rawVal = values[randint(0, len(values) - 1)]
                e.addFeature(feature, rawVal)

            # A constant feature has zero range; map it to 0.0.
            if denom:
                normVal = float(rawVal - lo) / float(denom)
            else:
                normVal = 0.0

            if docpair in normEdges:
                normEdges[docpair].addFeature(feature, normVal)
            else:
                enorm = Edge(docpair)
                enorm.addFeature(feature, normVal)
                normEdges[docpair] = enorm

    print("done: ")
    print("size of rawedges: " + str(len(rawEdges)))
    print("size of normedges: " + str(len(normEdges)))
    sys.stdout.flush()