def readData(args): """ Read data file and print out some statistics Return a training set, test set, labelId to text map, and docId to categories map. Return format: trainingData = [ ["fox eats carrots", [0], docId], ["fox eats peppers", [0], docId], ["carrots are healthy", [1], docId], ["peppers is healthy", [1], docId], ] """ # Read data dataDict = readCSV(args.dataPath, 1) labelRefs, dataDict = mapLabelRefs(dataDict) categoriesInOrderOfInterest = [8, 9, 10, 5, 6, 11, 13][0 : args.numLabels] # Select data based on categories of interest. Shift category indices down # so we go from 0 to numLabels-1 trainingData = [] counts = numpy.zeros(len(labelRefs)) for document in dataDict.itervalues(): docId = document[2] oldCategoryIndex = document[1][0] if oldCategoryIndex in categoriesInOrderOfInterest: newIndex = categoriesInOrderOfInterest.index(oldCategoryIndex) trainingData.append([document[0], [newIndex], docId]) counts[newIndex] += 1 # For each document, figure out which categories it belongs to # Include the shifted category index documentCategoryMap = {} for doc in dataDict.iteritems(): docId = doc[1][2] oldCategoryIndex = doc[1][1][0] if oldCategoryIndex in categoriesInOrderOfInterest: newIndex = categoriesInOrderOfInterest.index(oldCategoryIndex) v = documentCategoryMap.get(docId, []) v.append(newIndex) documentCategoryMap[docId] = v labelRefs = [labelRefs[i] for i in categoriesInOrderOfInterest] print "Total number of unique documents", len(documentCategoryMap) print "Category counts: ", counts print "Categories in training/test data:", labelRefs return trainingData, trainingData, labelRefs, documentCategoryMap
def setupData(self, preprocess=False): """ Get the data from CSV and preprocess if specified. The call to readCSV() assumes a specific CSV format, detailed in its docstring. @param preprocess (bool) Whether or not to preprocess the data when reading in samples. """ self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses) if self.experimentType == "incremental": # stop now if the data won't work for the specified experiment if (not isinstance(self.trainSizes, list) or not all([0 <= size <= len(self.dataDict) for size in self.trainSizes])): raise ValueError("Invalid size(s) for training set(s).") self.labelRefs, self.dataDict = mapLabelRefs(self.dataDict) self.samples = self.model.prepData(self.dataDict, preprocess) if self.verbosity > 1: for i, s in self.samples.iteritems(): print i, s
def setupData(self, preprocess=False): """ Get the data from CSV and preprocess if specified. The call to readCSV() assumes a specific CSV format, detailed in its docstring. @param preprocess (bool) Whether or not to preprocess the data when reading in samples. """ self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses) if self.experimentType == "incremental": # stop now if the data won't work for the specified experiment if (not isinstance(self.trainSizes, list) or not all( [0 <= size <= len(self.dataDict) for size in self.trainSizes])): raise ValueError("Invalid size(s) for training set(s).") self.labelRefs, self.dataDict = mapLabelRefs(self.dataDict) self.samples = self.model.prepData(self.dataDict, preprocess) if self.verbosity > 1: for i, s in self.samples.iteritems(): print i, s