def readData(args):
    """
  Read data file and print out some statistics
  Return a training set, test set, labelId to text map, and docId to categories
  map.

  Return format:
      trainingData = [
        ["fox eats carrots", [0], docId],
        ["fox eats peppers", [0], docId],
        ["carrots are healthy", [1], docId],
        ["peppers is healthy", [1], docId],
      ]
  """
    # Read data
    dataDict = readCSV(args.dataPath, 1)
    labelRefs, dataDict = mapLabelRefs(dataDict)
    categoriesInOrderOfInterest = [8, 9, 10, 5, 6, 11, 13][0 : args.numLabels]

    # Select data based on categories of interest. Shift category indices down
    # so we go from 0 to numLabels-1
    trainingData = []
    counts = numpy.zeros(len(labelRefs))
    for document in dataDict.itervalues():
        docId = document[2]
        oldCategoryIndex = document[1][0]
        if oldCategoryIndex in categoriesInOrderOfInterest:
            newIndex = categoriesInOrderOfInterest.index(oldCategoryIndex)
            trainingData.append([document[0], [newIndex], docId])
            counts[newIndex] += 1

    # For each document, figure out which categories it belongs to
    # Include the shifted category index
    documentCategoryMap = {}
    for doc in dataDict.iteritems():
        docId = doc[1][2]
        oldCategoryIndex = doc[1][1][0]
        if oldCategoryIndex in categoriesInOrderOfInterest:
            newIndex = categoriesInOrderOfInterest.index(oldCategoryIndex)
            v = documentCategoryMap.get(docId, [])
            v.append(newIndex)
            documentCategoryMap[docId] = v

    labelRefs = [labelRefs[i] for i in categoriesInOrderOfInterest]
    print "Total number of unique documents", len(documentCategoryMap)
    print "Category counts: ", counts
    print "Categories in training/test data:", labelRefs

    return trainingData, trainingData, labelRefs, documentCategoryMap
  def setupData(self, preprocess=False):
    """
    Get the data from CSV and preprocess if specified. The call to readCSV()
    assumes a specific CSV format, detailed in its docstring.

    @param preprocess   (bool)    Whether or not to preprocess the data when
                                  reading in samples.
    """
    self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses)

    if self.experimentType == "incremental":
      # stop now if the data won't work for the specified experiment
      if (not isinstance(self.trainSizes, list) or not
          all([0 <= size <= len(self.dataDict) for size in self.trainSizes])):
        raise ValueError("Invalid size(s) for training set(s).")

    self.labelRefs, self.dataDict = mapLabelRefs(self.dataDict)

    self.samples = self.model.prepData(self.dataDict, preprocess)

    if self.verbosity > 1:
      for i, s in self.samples.iteritems():
        print i, s
Beispiel #3
0
    def setupData(self, preprocess=False):
        """
    Get the data from CSV and preprocess if specified. The call to readCSV()
    assumes a specific CSV format, detailed in its docstring.

    @param preprocess   (bool)    Whether or not to preprocess the data when
                                  reading in samples.
    """
        self.dataDict = readCSV(self.dataPath, numLabels=self.numClasses)

        if self.experimentType == "incremental":
            # stop now if the data won't work for the specified experiment
            if (not isinstance(self.trainSizes, list) or not all(
                [0 <= size <= len(self.dataDict)
                 for size in self.trainSizes])):
                raise ValueError("Invalid size(s) for training set(s).")

        self.labelRefs, self.dataDict = mapLabelRefs(self.dataDict)

        self.samples = self.model.prepData(self.dataDict, preprocess)

        if self.verbosity > 1:
            for i, s in self.samples.iteritems():
                print i, s