def setupData(self, preprocess=False, sampleIdx=2, **kwargs):
    """
    Generate the data in network API format if necessary. self.dataFiles is
    populated with the paths of network data files, one for each trial

    Look at runner.py (setupData) and network_data_generator.py (split) for the
    parameters

    @param preprocess (bool)  Forwarded to NetworkDataGenerator.split();
                              presumably controls token preprocessing — see
                              network_data_generator.py for the semantics.
    @param sampleIdx  (int)   Forwarded to NetworkDataGenerator.split().
    """
    if self.generateData:
      # Split the raw input file into network-format records.
      ndg = NetworkDataGenerator()
      ndg.split(self.dataPath, sampleIdx, self.numClasses, preprocess,
        **kwargs)

      filename, ext = os.path.splitext(self.dataPath)
      self.classificationFile = "{}-classifications.json".format(filename)
      # Write one data file per trial; reshuffle the sequences for each trial
      # unless an ordered split was requested.
      for i in xrange(len(self.trainSize)):
        if not self.orderedSplit:
          ndg.randomizeData()
        dataFile = "{}-{}{}".format(filename, i, ext)
        ndg.saveData(dataFile, self.classificationFile)
        self.dataFiles.append(dataFile)

      if self.verbosity > 0:
        print "{} file(s) generated at {}".format(len(self.dataFiles),
          self.dataFiles)
        print "Classification json is at: {}".format(self.classificationFile)
    else:
      # Does an orderedSplit
      self.dataFiles = [self.dataPath] * len(self.trainSize)

    # Per-trial classification labels, consumed by _mapLabelRefs() below.
    self.actualLabels = [self._getClassifications(size, i)
      for i, size in enumerate(self.trainSize)]

    self._mapLabelRefs()
    def testSplitPreprocess(self):
        """Splitting with preprocessing should yield the cleaned records."""
        generator = NetworkDataGenerator()
        dataPath = os.path.join(self.dirName, "test_data/multi_sample.csv")

        firstSequence = [{
            "_token": "gohbkchoo",
            "_categories": "0 1",
            "_sequenceID": 0,
            "ID": "1",
            "_reset": 1
        }]
        secondSequence = [{
            "_token": "o",
            "_categories": "2",
            "_sequenceID": 1,
            "ID": "2",
            "_reset": 1
        }, {
            "_token": "ca",
            "_categories": "2",
            "_sequenceID": 1,
            "ID": "2",
            "_reset": 0
        }]

        generator.split(dataPath, 3, True, ignoreCommon=100, correctSpell=True)
        self.assertRecordsEqual(generator.records,
                                [firstSequence, secondSequence])
    def testRandomize(self):
        """randomizeData should leave sequence IDs out of their original order."""
        ndg = NetworkDataGenerator()
        filename = (
            self.dirName +
            "/../../../data/sample_reviews_multi/sample_reviews_data_training.csv"
        )
        ndg.split(filename, 2, 3, False)

        # Seed so the shuffle is deterministic for this test.
        random.seed(1)
        ndg.randomizeData()

        dataOutputFile = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_split.csv")
        categoriesOutputFile = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_categories.json"
        )
        # The return value is not needed here; we only need the CSV on disk.
        ndg.saveData(dataOutputFile, categoriesOutputFile)

        # Collect each distinct sequence ID in file order, skipping the header
        # rows (whose _sequenceID values are not digits).
        randomizedIDs = []
        dataTable = pandas.read_csv(dataOutputFile)
        for _, values in dataTable.iterrows():
            record = values.to_dict()
            idx = record["_sequenceID"]
            if idx.isdigit() and (not randomizedIDs
                                  or randomizedIDs[-1] != idx):
                randomizedIDs.append(idx)

        self.assertNotEqual(randomizedIDs, range(len(randomizedIDs)))
    def testSplitNoPreprocess(self):
        """Splitting without preprocessing should match the raw expectation."""
        generator = NetworkDataGenerator()
        dataPath = (self.dirName +
                    "/../../../data/network_data_generator/multi_sample.csv")
        generator.split(dataPath, 2, 3, False)
        self.assertRecordsEqual(generator.records, self.expected)
    def testSaveData(self):
        """saveData should write type/special header rows, then the records."""
        generator = NetworkDataGenerator()
        dataPath = (self.dirName +
                    "/../../../data/network_data_generator/multi_sample.csv")
        generator.split(dataPath, 2, 3, False)
        dataOutputFile = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_split.csv")
        categoriesOutputFile = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_categories.json"
        )
        self.assertTrue(
            generator.saveData(dataOutputFile, categoriesOutputFile))

        dataTable = pandas.read_csv(dataOutputFile).fillna("")

        # The first two CSV rows are the field types and the special markers.
        fields = ["_category0", "_category1", "_category2", "token",
                  "_sequenceID", "_reset"]
        types = dict(zip(fields,
                         ["int", "int", "int", "string", "int", "int"]))
        specials = dict(zip(fields, ["C", "C", "C", "", "S", "R"]))

        expectedRecords = [types, specials]
        for sequence in self.expected:
            expectedRecords.extend(sequence)

        for rowIdx, values in dataTable.iterrows():
            record = values.to_dict()
            # An empty cell means the optional category column was absent.
            for optional in ("_category1", "_category2"):
                if record[optional] == "":
                    del record[optional]
            self.assertDictEqual(record, expectedRecords[rowIdx])

        with open(categoriesOutputFile) as f:
            categories = json.load(f)

        self.assertDictEqual(
            categories,
            {"kitchen": 0, "environment": 1, "not helpful": 2})
  def testSaveDataIncorrectType(self):
    """saveData must raise TypeError for a non-JSON categories file."""
    generator = NetworkDataGenerator()
    sourcePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    dataOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_split.csv")
    categoriesOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_categories.csv")
    generator.split(sourcePath, 3, False)

    with self.assertRaises(TypeError):
      generator.saveData(dataOutputFile, categoriesOutputFile)
    def testSaveDataIncorrectType(self):
        """saveData must raise TypeError for a non-JSON categories file."""
        generator = NetworkDataGenerator()
        sourcePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.csv")
        generator.split(sourcePath, 3, False)

        with self.assertRaises(TypeError):
            generator.saveData(dataOutputFile, categoriesOutputFile)
    def testFileRecordStreamReadData(self):
        """saveData output must be readable by FileRecordStream."""
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.json")
        ndg.saveData(dataOutputFile, categoriesOutputFile)

        # If no error is raised, then the data is in the correct format
        # (the stream object itself is not needed, so it is not bound).
        FileRecordStream(dataOutputFile)
  def testFileRecordStreamReadData(self):
    """saveData output must be readable by FileRecordStream."""
    ndg = NetworkDataGenerator()
    filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
    ndg.split(filename, 3, False)
    dataOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_split.csv")
    categoriesOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    ndg.saveData(dataOutputFile, categoriesOutputFile)

    # If no error is raised, then the data is in the correct format
    # (the stream object itself is not needed, so it is not bound).
    FileRecordStream(dataOutputFile)
    def testSaveData(self):
        """saveData should write type/special header rows, then the records."""
        generator = NetworkDataGenerator()
        sourcePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
        generator.split(sourcePath, 3, False)
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.json")
        self.assertTrue(
            generator.saveData(dataOutputFile, categoriesOutputFile))

        dataTable = pandas.read_csv(dataOutputFile).fillna("")

        # The first two CSV rows are the field types and the special markers.
        fields = ["_categories", "_token", "_sequenceID", "_reset", "ID"]
        types = dict(zip(fields,
                         ["list", "string", "int", "int", "string"]))
        specials = dict(zip(fields, ["C", "", "S", "R", ""]))

        expectedRecords = [types, specials]
        for sequence in self.expected:
            expectedRecords.extend(sequence)

        for rowIdx, values in dataTable.iterrows():
            record = values.to_dict()
            if rowIdx > 1:
                # csv values are strings, so cast the ints
                record["_sequenceID"] = int(record["_sequenceID"])
                record["_reset"] = int(record["_reset"])
            self.assertDictEqual(record, expectedRecords[rowIdx])

        with open(categoriesOutputFile) as f:
            categories = json.load(f)

        self.assertDictEqual(
            categories,
            {"kitchen": 0, "environment": 1, "not helpful": 2})
    def testSplitPreprocess(self):
        """Splitting with preprocessing should yield the cleaned records."""
        generator = NetworkDataGenerator()
        dataPath = (self.dirName +
                    "/../../../data/network_data_generator/multi_sample.csv")

        # First sequence: two categories, five cleaned tokens; only the first
        # record of a sequence carries _reset = "1".
        firstTokens = ["get", "rid", "trouble", "kitchen", "odor"]
        firstSequence = [{
            "_category0": "0",
            "_category1": "1",
            "_sequenceID": "0",
            "token": token,
            "_reset": "1" if i == 0 else "0"
        } for i, token in enumerate(firstTokens)]

        # Second sequence: one category, two cleaned tokens.
        secondTokens = ["don", "care"]
        secondSequence = [{
            "_category0": "2",
            "_sequenceID": "1",
            "token": token,
            "_reset": "1" if i == 0 else "0"
        } for i, token in enumerate(secondTokens)]

        generator.split(dataPath, 2, 3, True, ignoreCommon=100,
                        correctSpell=True)
        self.assertRecordsEqual(generator.records,
                                [firstSequence, secondSequence])
    def testSaveDataIncorrectType(self):
        """saveData must raise TypeError for a non-JSON categories file."""
        generator = NetworkDataGenerator()
        sourcePath = (self.dirName +
                      "/../../../data/network_data_generator/multi_sample.csv")
        dataOutputFile = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_split.csv")
        categoriesOutputFile = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_categories.csv"
        )
        generator.split(sourcePath, 2, 3, False)

        with self.assertRaises(TypeError):
            generator.saveData(dataOutputFile, categoriesOutputFile)
  def testSaveData(self):
    """saveData should write type/special header rows, then the records."""
    generator = NetworkDataGenerator()
    sourcePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    generator.split(sourcePath, 3, False)
    dataOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_split.csv")
    categoriesOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    self.assertTrue(generator.saveData(dataOutputFile, categoriesOutputFile))

    dataTable = pandas.read_csv(dataOutputFile).fillna("")

    # The first two CSV rows are the field types and the special markers.
    fields = ["_categories", "_token", "_sequenceID", "_reset", "ID"]
    types = dict(zip(fields, ["list", "string", "int", "int", "string"]))
    specials = dict(zip(fields, ["C", "", "S", "R", ""]))

    expectedRecords = [types, specials]
    for sequence in self.expected:
      expectedRecords.extend(sequence)

    for rowIdx, values in dataTable.iterrows():
      record = values.to_dict()
      if rowIdx > 1:
        # csv values are strings, so cast the ints
        record["_sequenceID"] = int(record["_sequenceID"])
        record["_reset"] = int(record["_reset"])
      self.assertDictEqual(record, expectedRecords[rowIdx])

    with open(categoriesOutputFile) as f:
      categories = json.load(f)

    self.assertDictEqual(categories,
                         {"kitchen": 0, "environment": 1, "not helpful": 2})
# Beispiel #14
# 0
    def setupData(self, preprocess=False, sampleIdx=2, **kwargs):
        """
    Generate the data in network API format if necessary. self.dataFiles is
    populated with the paths of network data files, one for each trial

    Look at runner.py (setupData) and network_data_generator.py (split) for the
    parameters
    """
        if self.generateData:
            # Split the raw input file into network-format records.
            ndg = NetworkDataGenerator()
            ndg.split(self.dataPath, sampleIdx, self.numClasses, preprocess,
                      **kwargs)

            filename, ext = os.path.splitext(self.dataPath)
            self.classificationFile = "{}-classifications.json".format(
                filename)
            # Write one data file per trial; reshuffle the sequences for each
            # trial unless an ordered split was requested.
            for i in xrange(len(self.trainSize)):
                if not self.orderedSplit:
                    ndg.randomizeData()
                dataFile = "{}-{}{}".format(filename, i, ext)
                ndg.saveData(dataFile, self.classificationFile)
                self.dataFiles.append(dataFile)

            if self.verbosity > 0:
                print "{} file(s) generated at {}".format(
                    len(self.dataFiles), self.dataFiles)
                print "Classification json is at: {}".format(
                    self.classificationFile)
        else:
            # Does an orderedSplit
            self.dataFiles = [self.dataPath] * len(self.trainSize)

        # Per-trial classification labels, consumed by _mapLabelRefs() below.
        self.actualLabels = [
            self._getClassifications(size, i)
            for i, size in enumerate(self.trainSize)
        ]

        self._mapLabelRefs()
  def testRandomize(self):
    """randomizeData should leave sequence IDs out of their original order."""
    ndg = NetworkDataGenerator()
    filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
    ndg.split(filename, 3, False)

    # Seed so the shuffle is deterministic for this test.
    random.seed(1)
    ndg.randomizeData()

    dataOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_split.csv")
    categoriesOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    # The return value is not needed here; we only need the CSV on disk.
    ndg.saveData(dataOutputFile, categoriesOutputFile)

    # Collect each distinct sequence ID in file order, skipping the header
    # rows (whose _sequenceID values are not digits).
    randomizedIDs = []
    dataTable = pandas.read_csv(dataOutputFile)
    for _, values in dataTable.iterrows():
      record = values.to_dict()
      idx = record["_sequenceID"]
      if idx.isdigit() and (not randomizedIDs or randomizedIDs[-1] != idx):
        randomizedIDs.append(idx)

    self.assertNotEqual(randomizedIDs, range(len(randomizedIDs)))
  def testSplitPreprocess(self):
    """Splitting with preprocessing should yield the cleaned records."""
    generator = NetworkDataGenerator()
    dataPath = os.path.join(self.dirName, "test_data/multi_sample.csv")

    firstSequence = [{"_token": "gohbkchoo",
                      "_categories": "0 1",
                      "_sequenceID": 0,
                      "ID": "1",
                      "_reset": 1}]
    secondSequence = [{"_token": "o",
                       "_categories": "2",
                       "_sequenceID": 1,
                       "ID": "2",
                       "_reset": 1},
                      {"_token": "ca",
                       "_categories": "2",
                       "_sequenceID": 1,
                       "ID": "2",
                       "_reset": 0}]

    generator.split(dataPath, 3, True, ignoreCommon=100, correctSpell=True)
    self.assertRecordsEqual(generator.records,
                            [firstSequence, secondSequence])
# Beispiel #17
# 0
  def setupNetData(self, preprocess=False, generateData=False, **kwargs):
    """
    Generate the data in network API format if necessary. self.dataFiles is
    populated with the paths of network data files, one for each trial

    Look at runner.py (setupData) and network_data_generator.py (split) for the
    parameters.
    """
    if generateData:
      # Split the raw input file into network-format records.
      # TODO: use model.prepData()?
      ndg = NetworkDataGenerator()
      ndg.split(self.dataPath, self.numClasses, preprocess, **kwargs)

      filename, ext = os.path.splitext(self.dataPath)
      self.classificationFile = "{}_categories.json".format(filename)

      # Write one data file per trial; reshuffle the sequences for each trial
      # unless an ordered split was requested.
      for i in xrange(len(self.trainSizes)):
        if not self.orderedSplit:
          ndg.randomizeData()
        dataFile = "{}_network_{}{}".format(filename, i, ext)
        ndg.saveData(dataFile, self.classificationFile)
        self.dataFiles.append(dataFile)

      if self.verbosity > 0:
        print "{} file(s) generated at {}".format(len(self.dataFiles),
          self.dataFiles)
        print "Classification JSON is at: {}".format(self.classificationFile)
    else:
      # Use the input file for each trial; maintains the order of samples.
      self.dataFiles = [self.dataPath] * len(self.trainSizes)

    if self.numClasses > 0:
      # Setup labels data objects
      self.actualLabels = [self._getClassifications(size, i)
        for i, size in enumerate(self.trainSizes)]
      self._mapLabelRefs()
  def testSplitNoPreprocess(self):
    """Splitting without preprocessing should match the raw expectation."""
    generator = NetworkDataGenerator()
    dataPath = os.path.join(self.dirName, "test_data/multi_sample.csv")

    generator.split(dataPath, 3, False)
    self.assertRecordsEqual(generator.records, self.expected)
    def testSplitNoPreprocess(self):
        """Splitting without preprocessing should match the raw expectation."""
        generator = NetworkDataGenerator()
        dataPath = os.path.join(self.dirName, "test_data/multi_sample.csv")

        generator.split(dataPath, 3, False)
        self.assertRecordsEqual(generator.records, self.expected)