def testSplitPreprocess(self):
        """Split the multi-sample CSV with preprocessing and check the records.

        split() is called with ignoreCommon=100 and correctSpell=True; the
        generator's records must match the two sequences written out below.
        """
        samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
        generator = NetworkDataGenerator()

        # Sequence 0: a single record that opens the sequence (_reset == 1).
        firstSequence = [{
            "_token": "gohbkchoo",
            "_categories": "0 1",
            "_sequenceID": 0,
            "ID": "1",
            "_reset": 1
        }]

        # Sequence 1: two records; only the first one carries the reset flag.
        secondSequence = [{
            "_token": "o",
            "_categories": "2",
            "_sequenceID": 1,
            "ID": "2",
            "_reset": 1
        }, {
            "_token": "ca",
            "_categories": "2",
            "_sequenceID": 1,
            "ID": "2",
            "_reset": 0
        }]

        generator.split(samplePath, 3, True, ignoreCommon=100,
                        correctSpell=True)
        self.assertRecordsEqual(generator.records,
                                [firstSequence, secondSequence])
    def testRandomize(self):
        """Check that randomizeData() changes the order of the sequences.

        The sample reviews are split, shuffled with a fixed seed and saved.
        The saved CSV is then read back to recover the order in which the
        sequence IDs were written, and that order must differ from
        0, 1, 2, ...
        """
        ndg = NetworkDataGenerator()
        filename = (
            self.dirName +
            "/../../../data/sample_reviews_multi/sample_reviews_data_training.csv"
        )
        ndg.split(filename, 2, 3, False)

        # Fixed seed so the shuffle is reproducible from run to run.
        random.seed(1)
        ndg.randomizeData()

        dataOutputFile = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_split.csv")
        categoriesOutputFile = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_categories.json"
        )
        ndg.saveData(dataOutputFile, categoriesOutputFile)

        randomizedIDs = []
        dataTable = pandas.read_csv(dataOutputFile)
        for _, values in dataTable.iterrows():
            idx = values.to_dict()["_sequenceID"]
            # Rows whose _sequenceID is not numeric are skipped (presumably
            # the type/special header rows of the saved file -- confirm).
            if not idx.isdigit():
                continue
            # The CSV values are strings; cast them so the comparison with
            # the integer range below is meaningful. Comparing strings with
            # ints made the final assertion pass no matter the order.
            sequenceID = int(idx)
            # Consecutive rows of one sequence share an ID; record it once.
            if not randomizedIDs or randomizedIDs[-1] != sequenceID:
                randomizedIDs.append(sequenceID)

        # list(...) keeps the comparison list-vs-list on Python 2 and 3.
        self.assertNotEqual(randomizedIDs, list(range(len(randomizedIDs))))
    def testSplitNoPreprocess(self):
        """Split the multi-sample CSV without preprocessing.

        The generator's records are compared against self.expected.
        """
        samplePath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample.csv")
        generator = NetworkDataGenerator()

        generator.split(samplePath, 2, 3, False)
        self.assertRecordsEqual(generator.records, self.expected)
    def testSaveData(self):
        """Save split data and verify the written CSV and categories JSON.

        Each row of the saved CSV (read back with pandas) must equal the
        matching entry of: the types row, the specials row, then every record
        of self.expected flattened in order. The categories JSON must equal
        the expected name-to-index mapping.
        """
        generator = NetworkDataGenerator()
        samplePath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample.csv")
        generator.split(samplePath, 2, 3, False)

        csvPath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_split.csv")
        jsonPath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_categories.json"
        )
        self.assertTrue(generator.saveData(csvPath, jsonPath))

        # Missing cells are read as NaN; normalise them to empty strings.
        savedTable = pandas.read_csv(csvPath).fillna("")

        # The first two rows of the saved file: field types, then specials.
        typesRow = {
            "_category0": "int",
            "_category1": "int",
            "_category2": "int",
            "token": "string",
            "_sequenceID": "int",
            "_reset": "int"
        }
        specialsRow = {
            "_category0": "C",
            "_category1": "C",
            "_category2": "C",
            "token": "",
            "_sequenceID": "S",
            "_reset": "R"
        }

        expectedRows = [typesRow, specialsRow]
        for sequence in self.expected:
            expectedRows.extend(sequence)

        for rowIdx, row in savedTable.iterrows():
            actual = row.to_dict()
            # Drop the optional category columns when the row left them empty.
            for optionalKey in ("_category1", "_category2"):
                if actual[optionalKey] == "":
                    del actual[optionalKey]

            self.assertDictEqual(actual, expectedRows[rowIdx])

        with open(jsonPath) as categoriesFile:
            savedCategories = json.load(categoriesFile)

        self.assertDictEqual(savedCategories, {
            "kitchen": 0,
            "environment": 1,
            "not helpful": 2
        })
    def testSaveDataIncorrectType(self):
        """Expect saveData() to raise TypeError for a .csv categories path."""
        testDataDir = self.dirName
        samplePath = os.path.join(testDataDir, "test_data/multi_sample.csv")
        csvPath = os.path.join(testDataDir,
                               "test_data/multi_sample_split.csv")
        # Note the ".csv" extension on the categories output file.
        badCategoriesPath = os.path.join(
            testDataDir, "test_data/multi_sample_categories.csv")

        generator = NetworkDataGenerator()
        generator.split(samplePath, 3, False)

        with self.assertRaises(TypeError):
            generator.saveData(csvPath, badCategoriesPath)
    def testFileRecordStreamReadData(self):
        """Check that the saved network data can be opened by FileRecordStream.

        The multi-sample CSV is split and saved, then the saved data file is
        handed to FileRecordStream. The test passes when construction does
        not raise.
        """
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.json")
        ndg.saveData(dataOutputFile, categoriesOutputFile)

        # If no error is raised, then the data is in the correct format.
        # Close the stream straight away so the test does not leave the
        # underlying file handle open.
        FileRecordStream(dataOutputFile).close()
    def testSaveData(self):
        """Save split data and verify the written CSV and categories JSON.

        Each row of the saved CSV (read back with pandas) must equal the
        matching entry of: the types row, the specials row, then every record
        of self.expected flattened in order. The categories JSON must equal
        the expected name-to-index mapping.
        """
        generator = NetworkDataGenerator()
        samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
        generator.split(samplePath, 3, False)

        csvPath = os.path.join(self.dirName,
                               "test_data/multi_sample_split.csv")
        jsonPath = os.path.join(self.dirName,
                                "test_data/multi_sample_categories.json")
        self.assertTrue(generator.saveData(csvPath, jsonPath))

        # Missing cells are read as NaN; normalise them to empty strings.
        savedTable = pandas.read_csv(csvPath).fillna("")

        # The first two rows of the saved file: field types, then specials.
        typesRow = {
            "_categories": "list",
            "_token": "string",
            "_sequenceID": "int",
            "_reset": "int",
            "ID": "string"
        }
        specialsRow = {
            "_categories": "C",
            "_token": "",
            "_sequenceID": "S",
            "_reset": "R",
            "ID": ""
        }

        expectedRows = [typesRow, specialsRow]
        for sequence in self.expected:
            expectedRows.extend(sequence)

        for rowIdx, row in savedTable.iterrows():
            actual = row.to_dict()
            if rowIdx > 1:
                # csv values are strings, so cast the ints (data rows only;
                # rows 0 and 1 hold the types and specials).
                for intField in ("_sequenceID", "_reset"):
                    actual[intField] = int(actual[intField])
            self.assertDictEqual(actual, expectedRows[rowIdx])

        with open(jsonPath) as categoriesFile:
            savedCategories = json.load(categoriesFile)

        self.assertDictEqual(savedCategories, {
            "kitchen": 0,
            "environment": 1,
            "not helpful": 2
        })
Beispiel #8
0
    def prepData(self, dataPath, ordered=False, **kwargs):
        """
        Generate the data in network API format.

        @param dataPath          (str)  Path to input data file; format as
                                        expected by NetworkDataGenerator.
        @param ordered           (bool) Forwarded unchanged to
                                        NetworkDataGenerator.setupData().
        @param kwargs                   Extra keyword arguments, also
                                        forwarded to setupData().
        @return networkDataPath  (str)  Path to data formatted for network
                                        API.
        @return ndg              (NetworkDataGenerator)
        """
        generator = NetworkDataGenerator()
        # A fresh generator is built on every call and handed back to the
        # caller together with the value setupData() returned.
        return (generator.setupData(dataPath, self.numLabels, ordered,
                                    **kwargs),
                generator)
    def testSplitPreprocess(self):
        """Split the multi-sample CSV with preprocessing and check the records.

        split() is called with ignoreCommon=100 and correctSpell=True. The
        expected result is two sequences, one record per token, where only
        the first record of each sequence has "_reset" set to "1".
        """
        samplePath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample.csv")
        generator = NetworkDataGenerator()

        # Sequence 0 carries two category labels on every record.
        firstTokens = ["get", "rid", "trouble", "kitchen", "odor"]
        firstSequence = [{
            "_category0": "0",
            "_category1": "1",
            "_sequenceID": "0",
            "token": word,
            "_reset": "1" if position == 0 else "0"
        } for position, word in enumerate(firstTokens)]

        # Sequence 1 carries a single category label.
        secondTokens = ["don", "care"]
        secondSequence = [{
            "_category0": "2",
            "_sequenceID": "1",
            "token": word,
            "_reset": "1" if position == 0 else "0"
        } for position, word in enumerate(secondTokens)]

        generator.split(samplePath, 2, 3, True, ignoreCommon=100,
                        correctSpell=True)
        self.assertRecordsEqual(generator.records,
                                [firstSequence, secondSequence])
    def testSaveDataIncorrectType(self):
        """Expect saveData() to raise TypeError for a .csv categories path."""
        samplePath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample.csv")
        csvPath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_split.csv")
        # Note the ".csv" extension on the categories output file.
        badCategoriesPath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_categories.csv"
        )

        generator = NetworkDataGenerator()
        generator.split(samplePath, 2, 3, False)

        with self.assertRaises(TypeError):
            generator.saveData(csvPath, badCategoriesPath)
Beispiel #11
0
    def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
        """
        Generate the data in network API format.

        @param dataPath          (str)  Path to input data file; format as
                                        expected by NetworkDataGenerator.
        @param ordered           (bool) Keep order of data, or randomize.
        @param stripCats         (bool) Remove the categories and replace
                                        them with the sequence_Id.
        @param kwargs                   Extra keyword arguments, forwarded to
                                        NetworkDataGenerator.setupData().
        @return networkDataPath  (str)  Path to data formatted for network
                                        API.
        @return ndg              (NetworkDataGenerator)
        """
        generator = NetworkDataGenerator()
        # A fresh generator is built on every call and handed back to the
        # caller together with the value setupData() returned.
        return (generator.setupData(dataPath, self.numLabels, ordered,
                                    stripCats, **kwargs),
                generator)
Beispiel #12
0
    def setupData(self, preprocess=False, sampleIdx=2, **kwargs):
        """
        Generate the data in network API format if necessary. self.dataFiles
        is populated with the paths of network data files, one for each trial.

        When self.generateData is set, self.dataPath is split once and one
        data file per entry of self.trainSize is saved next to it (named
        "<path>-<trial><ext>"), together with a single
        "<path>-classifications.json" file. Otherwise self.dataPath itself is
        reused for every trial.

        In both cases self.actualLabels is rebuilt through
        self._getClassifications() and self._mapLabelRefs() is called.

        Look at runner.py (setupData) and network_data_generator.py (split)
        for the parameters.

        @param preprocess  (bool) Forwarded to NetworkDataGenerator.split().
        @param sampleIdx   (int)  Forwarded to NetworkDataGenerator.split().
        @param kwargs             Extra keyword arguments for split().
        """
        if self.generateData:
            ndg = NetworkDataGenerator()
            ndg.split(self.dataPath, sampleIdx, self.numClasses, preprocess,
                      **kwargs)

            filename, ext = os.path.splitext(self.dataPath)
            self.classificationFile = "{}-classifications.json".format(
                filename)
            # enumerate() instead of xrange(len(...)): valid on Python 2 and 3.
            for i, _ in enumerate(self.trainSize):
                if not self.orderedSplit:
                    # Reshuffle before each save so every trial file gets a
                    # new ordering of the same split data.
                    ndg.randomizeData()
                dataFile = "{}-{}{}".format(filename, i, ext)
                ndg.saveData(dataFile, self.classificationFile)
                self.dataFiles.append(dataFile)

            if self.verbosity > 0:
                # Single-argument print(...) gives the same output as the
                # Python 2 print statement and also parses on Python 3.
                print("{} file(s) generated at {}".format(
                    len(self.dataFiles), self.dataFiles))
                print("Classification json is at: {}".format(
                    self.classificationFile))
        else:
            # Does an orderedSplit
            self.dataFiles = [self.dataPath] * len(self.trainSize)

        self.actualLabels = [
            self._getClassifications(size, i)
            for i, size in enumerate(self.trainSize)
        ]

        self._mapLabelRefs()
Beispiel #13
0
  def setupNetData(self, preprocess=False, generateData=False, **kwargs):
    """
    Generate the data in network API format if necessary. self.dataFiles is
    populated with the paths of network data files, one for each trial.

    When generateData is set, self.dataPath is split once and one data file
    per entry of self.trainSizes is saved next to it (named
    "<path>_network_<trial><ext>"), together with a single
    "<path>_categories.json" file. Otherwise self.dataPath itself is reused
    for every trial.

    The label data (self.actualLabels and the label references) is only set
    up when self.numClasses is greater than zero.

    Look at runner.py (setupData) and network_data_generator.py (split) for the
    parameters.

    @param preprocess    (bool) Forwarded to NetworkDataGenerator.split().
    @param generateData  (bool) Whether to generate new network data files.
    @param kwargs               Extra keyword arguments for split().
    """
    if generateData:
      # TODO: use model.prepData()?
      ndg = NetworkDataGenerator()
      ndg.split(self.dataPath, self.numClasses, preprocess, **kwargs)

      filename, ext = os.path.splitext(self.dataPath)
      self.classificationFile = "{}_categories.json".format(filename)

      # enumerate() instead of xrange(len(...)): valid on Python 2 and 3.
      for i, _ in enumerate(self.trainSizes):
        if not self.orderedSplit:
          # Reshuffle before each save so every trial file gets a new
          # ordering of the same split data.
          ndg.randomizeData()
        dataFile = "{}_network_{}{}".format(filename, i, ext)
        ndg.saveData(dataFile, self.classificationFile)
        self.dataFiles.append(dataFile)

      if self.verbosity > 0:
        # Single-argument print(...) gives the same output as the Python 2
        # print statement and also parses on Python 3.
        print("{} file(s) generated at {}".format(len(self.dataFiles),
                                                  self.dataFiles))
        print("Classification JSON is at: {}".format(self.classificationFile))
    else:
      # Use the input file for each trial; maintains the order of samples.
      self.dataFiles = [self.dataPath] * len(self.trainSizes)

    if self.numClasses > 0:
      # Setup labels data objects
      self.actualLabels = [self._getClassifications(size, i)
                           for i, size in enumerate(self.trainSizes)]
      self._mapLabelRefs()
    def testSplitNoPreprocess(self):
        """Split the multi-sample CSV without preprocessing.

        The generator's records are compared against self.expected.
        """
        samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
        generator = NetworkDataGenerator()

        generator.split(samplePath, 3, False)
        self.assertRecordsEqual(generator.records, self.expected)