def testSplitPreprocess(self):
        """
        Split the multi-sample CSV with ignoreCommon=100 and correctSpell=True
        and compare the generated records with hard-coded expectations.
        """
        generator = NetworkDataGenerator()
        samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")

        # Records that share a _sequenceID are grouped into one inner list.
        firstSequence = [
            {
                "_token": "gohbkchoo",
                "_categories": "0 1",
                "_sequenceID": 0,
                "ID": "1",
                "_reset": 1,
            },
        ]
        secondSequence = [
            {
                "_token": "o",
                "_categories": "2",
                "_sequenceID": 1,
                "ID": "2",
                "_reset": 1,
            },
            {
                "_token": "ca",
                "_categories": "2",
                "_sequenceID": 1,
                "ID": "2",
                "_reset": 0,
            },
        ]

        generator.split(
            samplePath, 3, True, ignoreCommon=100, correctSpell=True)
        self.assertRecordsEqual(
            generator.records, [firstSequence, secondSequence])
    def testSplitNoPreprocess(self):
        """
        Split the multi-sample CSV (last positional argument False) and
        compare the records with self.expected.
        """
        generator = NetworkDataGenerator()
        samplePath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample.csv")

        generator.split(samplePath, 2, 3, False)

        self.assertRecordsEqual(generator.records, self.expected)
    def testSaveData(self):
        """
        Save the split data, then verify the rows of the written CSV and the
        contents of the categories JSON file.
        """
        generator = NetworkDataGenerator()
        samplePath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample.csv")
        generator.split(samplePath, 2, 3, False)

        splitPath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_split.csv")
        categoriesPath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_categories.json"
        )
        self.assertTrue(generator.saveData(splitPath, categoriesPath))

        savedTable = pandas.read_csv(splitPath).fillna("")

        # The first two data rows are expected to hold the field types and
        # the special flags; the flattened records follow.
        typeRow = {
            "_category0": "int",
            "_category1": "int",
            "_category2": "int",
            "token": "string",
            "_sequenceID": "int",
            "_reset": "int",
        }
        specialRow = {
            "_category0": "C",
            "_category1": "C",
            "_category2": "C",
            "token": "",
            "_sequenceID": "S",
            "_reset": "R",
        }
        expectedRows = [typeRow, specialRow]
        for sequence in self.expected:
            expectedRows.extend(sequence)

        for rowIndex, row in savedTable.iterrows():
            actual = row.to_dict()
            # Blank (filled-NaN) optional category columns are dropped before
            # comparing.
            for optionalKey in ("_category1", "_category2"):
                if actual[optionalKey] == "":
                    del actual[optionalKey]

            self.assertDictEqual(actual, expectedRows[rowIndex])

        with open(categoriesPath) as categoriesFile:
            savedCategories = json.load(categoriesFile)

        self.assertDictEqual(
            savedCategories,
            {"kitchen": 0, "environment": 1, "not helpful": 2})
  def testSaveDataIncorrectType(self):
    """
    saveData is expected to raise TypeError when the categories output path
    ends in .csv.
    """
    generator = NetworkDataGenerator()
    samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    splitPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
    badCategoriesPath = os.path.join(
        self.dirName, "test_data/multi_sample_categories.csv")
    generator.split(samplePath, 3, False)

    self.assertRaises(
        TypeError, generator.saveData, splitPath, badCategoriesPath)
  def testFileRecordStreamReadData(self):
    """
    Check that the data file written by saveData can be opened by
    FileRecordStream without raising.
    """
    ndg = NetworkDataGenerator()
    filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
    ndg.split(filename, 3, False)
    dataOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_split.csv")
    categoriesOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    ndg.saveData(dataOutputFile, categoriesOutputFile)

    # If no error is raised, then the data is in the correct format
    stream = FileRecordStream(dataOutputFile)
    # Close the stream so the test does not leak the open file handle.
    stream.close()
Ejemplo n.º 6
0
  def prepData(self, dataPath, ordered=False, **kwargs):
    """
    Build the network API formatted data file for the given input.

    @param dataPath          (str)  Path to input data file; format as expected
                                    by NetworkDataGenerator.
    @param ordered           (bool) Forwarded to NetworkDataGenerator.setupData.
    @param kwargs                   Extra keyword arguments forwarded to
                                    NetworkDataGenerator.setupData.
    @return networkDataPath  (str)  Path to data formatted for network API.
    @return ndg              (NetworkDataGenerator)
    """
    generator = NetworkDataGenerator()
    formattedPath = generator.setupData(
      dataPath, self.numLabels, ordered, **kwargs)
    return formattedPath, generator
    def testSaveData(self):
        """
        Save the split data, then verify the rows of the written CSV and the
        contents of the categories JSON file.
        """
        generator = NetworkDataGenerator()
        samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
        generator.split(samplePath, 3, False)

        splitPath = os.path.join(
            self.dirName, "test_data/multi_sample_split.csv")
        categoriesPath = os.path.join(
            self.dirName, "test_data/multi_sample_categories.json")
        self.assertTrue(generator.saveData(splitPath, categoriesPath))

        savedTable = pandas.read_csv(splitPath).fillna("")

        # The first two data rows are expected to hold the field types and
        # the special flags; the flattened records follow.
        typeRow = {
            "_categories": "list",
            "_token": "string",
            "_sequenceID": "int",
            "_reset": "int",
            "ID": "string",
        }
        specialRow = {
            "_categories": "C",
            "_token": "",
            "_sequenceID": "S",
            "_reset": "R",
            "ID": "",
        }
        expectedRows = [typeRow, specialRow]
        for sequence in self.expected:
            expectedRows.extend(sequence)

        for rowIndex, row in savedTable.iterrows():
            actual = row.to_dict()
            if rowIndex >= 2:
                # csv values are strings, so cast the ints
                for intKey in ("_sequenceID", "_reset"):
                    actual[intKey] = int(actual[intKey])
            self.assertDictEqual(actual, expectedRows[rowIndex])

        with open(categoriesPath) as categoriesFile:
            savedCategories = json.load(categoriesFile)

        self.assertDictEqual(
            savedCategories,
            {"kitchen": 0, "environment": 1, "not helpful": 2})
Ejemplo n.º 8
0
    def prepData(self, dataPath, ordered=False, **kwargs):
        """
        Build the network API formatted data file for the given input.

        @param dataPath          (str)  Path to input data file; format as
                                        expected by NetworkDataGenerator.
        @param ordered           (bool) Forwarded to
                                        NetworkDataGenerator.setupData.
        @param kwargs                   Extra keyword arguments forwarded to
                                        NetworkDataGenerator.setupData.
        @return networkDataPath  (str)  Path to data formatted for network API.
        @return ndg              (NetworkDataGenerator)
        """
        generator = NetworkDataGenerator()
        formattedPath = generator.setupData(
            dataPath, self.numLabels, ordered, **kwargs)
        return formattedPath, generator
    def testSplitPreprocess(self):
        """
        Split the multi-sample CSV with ignoreCommon=100 and correctSpell=True
        and compare the generated token records with hard-coded expectations.
        """
        generator = NetworkDataGenerator()
        samplePath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample.csv")

        # Every record in a sequence shares its categories and sequence ID;
        # only the first token of a sequence carries _reset "1".
        firstSequence = [
            {
                "_category0": "0",
                "_category1": "1",
                "_sequenceID": "0",
                "token": word,
                "_reset": "1" if position == 0 else "0",
            }
            for position, word in enumerate(
                ["get", "rid", "trouble", "kitchen", "odor"])
        ]
        secondSequence = [
            {
                "_category0": "2",
                "_sequenceID": "1",
                "token": word,
                "_reset": "1" if position == 0 else "0",
            }
            for position, word in enumerate(["don", "care"])
        ]

        generator.split(
            samplePath, 2, 3, True, ignoreCommon=100, correctSpell=True)
        self.assertRecordsEqual(
            generator.records, [firstSequence, secondSequence])
Ejemplo n.º 10
0
  def _testing(self, trial):
    """
    Run the model over the test partition of one trial and record the winning
    predictions alongside the actual labels.

    @param trial      (int)       trial count
    """
    trainCounts = self.partitions[trial][0]
    testCounts = self.partitions[trial][1]

    if self.verbosity > 0:
      # Each test sequence starts where the previous one ended; the first one
      # starts after all training tokens.
      startIndices = []
      offset = sum(trainCounts)
      for tokenCount in testCounts:
        startIndices.append(offset)
        offset += tokenCount
      print ("\tRunner selects to test on sequences starting at indices "
             "{}".format(startIndices))

    # TODO: switch to standard (expected, actual) format
    winners = []
    actuals = []
    for sampleIdx, tokenCount in enumerate(testCounts):
      # One model call per token of the sample.
      tokenPredictions = [self.model.testModel() for _ in xrange(tokenCount)]
      winners.append(self._selectWinners(tokenPredictions))
      actuals.append(self.actualLabels[trial][sampleIdx])

    # Prepare data for writeOutClassifications
    numTrain = len(trainCounts)
    numTest = len(testCounts)
    self.partitions[trial] = (range(numTrain),
                              range(numTrain, numTrain + numTest))
    self.samples = NetworkDataGenerator.getSamples(self.dataFiles[trial])

    self.results.append((winners, actuals))
Ejemplo n.º 11
0
    def testing(self, trial):
        """
        Run the model over the test partition of one trial and record the
        winning predictions alongside the actual labels.

        @param trial      (int)       trial count
        """
        trainCounts = self.partitions[trial][0]
        testCounts = self.partitions[trial][1]

        if self.verbosity > 0:
            # Each test sample starts where the previous one ended; the first
            # one starts after all training tokens.
            startIndices = []
            offset = sum(trainCounts)
            for tokenCount in testCounts:
                startIndices.append(offset)
                offset += tokenCount
            print("\tRunner selects to test on sample(s) {}".format(
                startIndices))

        winners = []
        actuals = []
        for sampleIdx, tokenCount in enumerate(testCounts):
            # One model call per token of the sample.
            tokenPredictions = [
                self.model.testModel() for _ in xrange(tokenCount)
            ]
            winners.append(self._selectWinners(tokenPredictions))
            actuals.append(self.actualLabels[trial][sampleIdx])

        # Prepare data for writeOutClassifications
        numTrain = len(trainCounts)
        numTest = len(testCounts)
        self.partitions[trial] = (range(numTrain),
                                  range(numTrain, numTrain + numTest))
        self.samples = NetworkDataGenerator.getSamples(self.dataFiles[trial])

        self.results.append((winners, actuals))
Ejemplo n.º 12
0
    def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
        """
        Build the network API formatted data file for the given input.

        @param dataPath          (str)  Path to input data file; format as
                                        expected by NetworkDataGenerator.
        @param ordered           (bool) Keep order of data, or randomize.
        @param stripCats         (bool) Remove the categories and replace them
                                        with the sequence_Id.
        @param kwargs                   Extra keyword arguments forwarded to
                                        NetworkDataGenerator.setupData.
        @return networkDataPath  (str)  Path to data formatted for network API.
        @return ndg              (NetworkDataGenerator)
        """
        generator = NetworkDataGenerator()
        formattedPath = generator.setupData(
            dataPath, self.numLabels, ordered, stripCats, **kwargs)
        return formattedPath, generator
Ejemplo n.º 13
0
 def partitionIndices(self, split, trial):
   """
   Split the per-sample token counts of a trial's data file into a training
   part and a test part (ordered split).

   @param split      (int)       Number of leading samples used for training
   @param trial      (int)       Index into self.dataFiles
   @return           (tuple)     (token counts before split, token counts
                                 from split onwards)
   """
   tokenCounts = NetworkDataGenerator.getNumberOfTokens(self.dataFiles[trial])
   trainCounts = tokenCounts[:split]
   testCounts = tokenCounts[split:]
   return (trainCounts, testCounts)
Ejemplo n.º 14
0
  def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
    """
    Build the network API formatted data file for the given input.

    @param dataPath          (str)  Path to input data file; format as expected
                                    by NetworkDataGenerator.
    @param ordered           (bool) Keep order of data, or randomize.
    @param stripCats         (bool) Remove the categories and replace them with
                                    the sequence_Id.
    @param kwargs                   Extra keyword arguments forwarded to
                                    NetworkDataGenerator.setupData.
    @return networkDataPath  (str)  Path to data formatted for network API.
    @return ndg              (NetworkDataGenerator)
    """
    generator = NetworkDataGenerator()
    formattedPath = generator.setupData(dataPath,
                                        self.numLabels,
                                        ordered,
                                        stripCats,
                                        **kwargs)
    return formattedPath, generator
Ejemplo n.º 15
0
 def partitionIndices(self, split, trial):
     """
     Split the per-sample token counts of a trial's data file into a training
     part and a test part (ordered split).

     @param split      (int)       Number of leading samples used for training
     @param trial      (int)       Index into self.dataFiles
     @return           (tuple)     (token counts before split, token counts
                                   from split onwards)
     """
     tokenCounts = NetworkDataGenerator.getNumberOfTokens(
         self.dataFiles[trial])
     trainCounts = tokenCounts[:split]
     testCounts = tokenCounts[split:]
     return (trainCounts, testCounts)
    def testRandomize(self):
        """
        Randomize the split records, save them, and check that the sequence
        IDs in the saved file are no longer in ascending 0..n-1 order.
        """
        ndg = NetworkDataGenerator()
        filename = (
            self.dirName +
            "/../../../data/sample_reviews_multi/sample_reviews_data_training.csv"
        )
        ndg.split(filename, 2, 3, False)

        random.seed(1)
        ndg.randomizeData()

        dataOutputFile = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_split.csv")
        categoriesOutputFile = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_categories.json"
        )
        success = ndg.saveData(dataOutputFile, categoriesOutputFile)
        self.assertTrue(success)

        randomizedIDs = []
        dataTable = pandas.read_csv(dataOutputFile)
        for _, values in dataTable.iterrows():
            rawID = values.to_dict()["_sequenceID"]
            if not rawID.isdigit():
                # Non-numeric cells are not sequence IDs; skip them.
                continue
            # The cells are strings; convert so the comparison with the
            # integer range below is meaningful. Comparing str IDs against
            # ints could never be equal, which made the assertion vacuous.
            sequenceID = int(rawID)
            # Collapse consecutive rows belonging to the same sequence.
            if not randomizedIDs or randomizedIDs[-1] != sequenceID:
                randomizedIDs.append(sequenceID)

        self.assertNotEqual(randomizedIDs, list(range(len(randomizedIDs))))
Ejemplo n.º 17
0
  def setupData(self, preprocess=False, sampleIdx=2, **kwargs):
    """
    Populate self.dataFiles with one network data file path per trial,
    generating the files first when self.generateData is set, then build the
    per-trial label lists.

    Look at runner.py (setupData) and network_data_generator.py (split) for the
    parameters
    """
    if not self.generateData:
      # Does an orderedSplit
      self.dataFiles = [self.dataPath] * len(self.trainSize)
    else:
      generator = NetworkDataGenerator()
      generator.split(
        self.dataPath, sampleIdx, self.numClasses, preprocess, **kwargs)

      basePath, extension = os.path.splitext(self.dataPath)
      self.classificationFile = "{}-classifications.json".format(basePath)
      for trial in xrange(len(self.trainSize)):
        if not self.orderedSplit:
          generator.randomizeData()
        trialFile = "{}-{}{}".format(basePath, trial, extension)
        generator.saveData(trialFile, self.classificationFile)
        self.dataFiles.append(trialFile)

      if self.verbosity > 0:
        print("{} file(s) generated at {}".format(len(self.dataFiles),
          self.dataFiles))
        print("Classification json is at: {}".format(self.classificationFile))

    trialLabels = []
    for trial, size in enumerate(self.trainSize):
      trialLabels.append(self._getClassifications(size, trial))
    self.actualLabels = trialLabels

    self._mapLabelRefs()
Ejemplo n.º 18
0
 def partitionIndices(self):
   """
   Append to self.partitions, for every trial, the per-sample token counts of
   the training and test sets (when doing an ordered split).
   """
   for trial, split in enumerate(self.trainSizes):
     tokenCounts = NetworkDataGenerator.getNumberOfTokens(
       self.dataFiles[trial])
     trainCounts = tokenCounts[:split]
     testCounts = tokenCounts[split:]
     self.partitions.append((trainCounts, testCounts))
  def testSaveData(self):
    """
    Save the split data, then verify the rows of the written CSV and the
    contents of the categories JSON file.
    """
    generator = NetworkDataGenerator()
    samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    generator.split(samplePath, 3, False)

    splitPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
    categoriesPath = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    self.assertTrue(generator.saveData(splitPath, categoriesPath))

    savedTable = pandas.read_csv(splitPath).fillna("")

    # The first two data rows are expected to hold the field types and the
    # special flags; the flattened records follow.
    typeRow = {
        "_categories": "list",
        "_token": "string",
        "_sequenceID": "int",
        "_reset": "int",
        "ID": "string",
    }
    specialRow = {
        "_categories": "C",
        "_token": "",
        "_sequenceID": "S",
        "_reset": "R",
        "ID": "",
    }
    expectedRows = [typeRow, specialRow]
    for sequence in self.expected:
      expectedRows.extend(sequence)

    for rowIndex, row in savedTable.iterrows():
      actual = row.to_dict()
      if rowIndex >= 2:
        # csv values are strings, so cast the ints
        for intKey in ("_sequenceID", "_reset"):
          actual[intKey] = int(actual[intKey])
      self.assertDictEqual(actual, expectedRows[rowIndex])

    with open(categoriesPath) as categoriesFile:
      savedCategories = json.load(categoriesFile)

    self.assertDictEqual(
        savedCategories,
        {"kitchen": 0, "environment": 1, "not helpful": 2})
Ejemplo n.º 20
0
 def _getClassifications(self, split, trial):
   """
   Parse the classification strings of a trial's data file and return those
   of the testing samples.

   @param split      (int)       Size of training set
   @param trial      (int)       trial count
   @return           (list)      List of list of ids of classifications for a
                                 sample
   """
   rawLabels = NetworkDataGenerator.getClassifications(self.dataFiles[trial])

   # Every entry is parsed (space-separated ints) before the training samples
   # are dropped.
   parsedLabels = []
   for labelString in rawLabels:
     parsedLabels.append([int(label)
                          for label in labelString.strip().split(" ")])
   return parsedLabels[split:]
Ejemplo n.º 21
0
 def _getClassifications(self, split, trial):
     """
     Parse the classification strings of a trial's data file and return those
     of the testing samples.

     @param split      (int)       Size of training set
     @param trial      (int)       trial count
     @return           (list)      List of list of ids of classifications for
                                   a sample
     """
     rawLabels = NetworkDataGenerator.getClassifications(
         self.dataFiles[trial])

     # Every entry is parsed (space-separated ints) before the training
     # samples are dropped.
     parsedLabels = []
     for labelString in rawLabels:
         parsedLabels.append(
             [int(label) for label in labelString.strip().split(" ")])
     return parsedLabels[split:]
  def testSplitPreprocess(self):
    """
    Split the multi-sample CSV with ignoreCommon=100 and correctSpell=True
    and compare the generated records with hard-coded expectations.
    """
    generator = NetworkDataGenerator()
    samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")

    # Records that share a _sequenceID are grouped into one inner list.
    firstSequence = [
        {
            "_token": "gohbkchoo",
            "_categories": "0 1",
            "_sequenceID": 0,
            "ID": "1",
            "_reset": 1,
        },
    ]
    secondSequence = [
        {
            "_token": "o",
            "_categories": "2",
            "_sequenceID": 1,
            "ID": "2",
            "_reset": 1,
        },
        {
            "_token": "ca",
            "_categories": "2",
            "_sequenceID": 1,
            "ID": "2",
            "_reset": 0,
        },
    ]

    generator.split(samplePath, 3, True, ignoreCommon=100, correctSpell=True)
    self.assertRecordsEqual(
        generator.records, [firstSequence, secondSequence])
    def testSaveDataIncorrectType(self):
        """
        saveData is expected to raise TypeError when the categories output
        path ends in .csv.
        """
        generator = NetworkDataGenerator()
        samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
        splitPath = os.path.join(
            self.dirName, "test_data/multi_sample_split.csv")
        badCategoriesPath = os.path.join(
            self.dirName, "test_data/multi_sample_categories.csv")
        generator.split(samplePath, 3, False)

        self.assertRaises(
            TypeError, generator.saveData, splitPath, badCategoriesPath)
    def testFileRecordStreamReadData(self):
        """
        Check that the data file written by saveData can be opened by
        FileRecordStream without raising.
        """
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)
        dataOutputFile = os.path.join(self.dirName,
                                      "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(
            self.dirName, "test_data/multi_sample_categories.json")
        ndg.saveData(dataOutputFile, categoriesOutputFile)

        # If no error is raised, then the data is in the correct format
        stream = FileRecordStream(dataOutputFile)
        # Close the stream so the test does not leak the open file handle.
        stream.close()
    def testSaveDataIncorrectType(self):
        """
        saveData is expected to raise TypeError when the categories output
        path ends in .csv.
        """
        generator = NetworkDataGenerator()
        samplePath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample.csv")
        splitPath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_split.csv")
        badCategoriesPath = (
            self.dirName +
            "/../../../data/network_data_generator/multi_sample_categories.csv"
        )
        generator.split(samplePath, 2, 3, False)

        self.assertRaises(
            TypeError, generator.saveData, splitPath, badCategoriesPath)
Ejemplo n.º 26
0
    def setupData(self, preprocess=False, sampleIdx=2, **kwargs):
        """
        Populate self.dataFiles with one network data file path per trial,
        generating the files first when self.generateData is set, then build
        the per-trial label lists.

        Look at runner.py (setupData) and network_data_generator.py (split)
        for the parameters
        """
        if not self.generateData:
            # Does an orderedSplit
            self.dataFiles = [self.dataPath] * len(self.trainSize)
        else:
            generator = NetworkDataGenerator()
            generator.split(
                self.dataPath, sampleIdx, self.numClasses, preprocess,
                **kwargs)

            basePath, extension = os.path.splitext(self.dataPath)
            self.classificationFile = "{}-classifications.json".format(
                basePath)
            for trial in xrange(len(self.trainSize)):
                if not self.orderedSplit:
                    generator.randomizeData()
                trialFile = "{}-{}{}".format(basePath, trial, extension)
                generator.saveData(trialFile, self.classificationFile)
                self.dataFiles.append(trialFile)

            if self.verbosity > 0:
                print("{} file(s) generated at {}".format(
                    len(self.dataFiles), self.dataFiles))
                print("Classification json is at: {}".format(
                    self.classificationFile))

        trialLabels = []
        for trial, size in enumerate(self.trainSize):
            trialLabels.append(self._getClassifications(size, trial))
        self.actualLabels = trialLabels

        self._mapLabelRefs()
Ejemplo n.º 27
0
  def setupNetData(self, preprocess=False, generateData=False, **kwargs):
    """
    Populate self.dataFiles with one network data file path per trial,
    generating the files first when generateData is True, then build the
    label data when self.numClasses is positive.

    Look at runner.py (setupData) and network_data_generator.py (split) for the
    parameters.
    """
    if not generateData:
      # Use the input file for each trial; maintains the order of samples.
      self.dataFiles = [self.dataPath] * len(self.trainSizes)
    else:
      # TODO: use model.prepData()?
      generator = NetworkDataGenerator()
      generator.split(self.dataPath, self.numClasses, preprocess, **kwargs)

      basePath, extension = os.path.splitext(self.dataPath)
      self.classificationFile = "{}_categories.json".format(basePath)

      for trial in xrange(len(self.trainSizes)):
        if not self.orderedSplit:
          generator.randomizeData()
        trialFile = "{}_network_{}{}".format(basePath, trial, extension)
        generator.saveData(trialFile, self.classificationFile)
        self.dataFiles.append(trialFile)

      if self.verbosity > 0:
        print("{} file(s) generated at {}".format(len(self.dataFiles),
          self.dataFiles))
        print("Classification JSON is at: {}".format(self.classificationFile))

    if self.numClasses > 0:
      # Setup labels data objects
      trialLabels = []
      for trial, size in enumerate(self.trainSizes):
        trialLabels.append(self._getClassifications(size, trial))
      self.actualLabels = trialLabels
      self._mapLabelRefs()
  def testRandomize(self):
    """
    Randomize the split records, save them, and check that the sequence IDs in
    the saved file are no longer in ascending 0..n-1 order.
    """
    ndg = NetworkDataGenerator()
    filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
    ndg.split(filename, 3, False)

    random.seed(1)
    ndg.randomizeData()

    dataOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_split.csv")
    categoriesOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    success = ndg.saveData(dataOutputFile, categoriesOutputFile)
    self.assertTrue(success)

    randomizedIDs = []
    dataTable = pandas.read_csv(dataOutputFile)
    for _, values in dataTable.iterrows():
      rawID = values.to_dict()["_sequenceID"]
      if not rawID.isdigit():
        # Non-numeric cells are not sequence IDs; skip them.
        continue
      # The cells are strings; convert so the comparison with the integer
      # range below is meaningful. Comparing str IDs against ints could never
      # be equal, which made the assertion vacuous.
      sequenceID = int(rawID)
      # Collapse consecutive rows belonging to the same sequence.
      if not randomizedIDs or randomizedIDs[-1] != sequenceID:
        randomizedIDs.append(sequenceID)

    self.assertNotEqual(randomizedIDs, list(range(len(randomizedIDs))))
  def testSplitNoPreprocess(self):
    """
    Split the multi-sample CSV (last positional argument False) and compare
    the records with self.expected.
    """
    generator = NetworkDataGenerator()
    samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    generator.split(samplePath, 3, False)

    self.assertRecordsEqual(generator.records, self.expected)
    def testSplitNoPreprocess(self):
        """
        Split the multi-sample CSV (last positional argument False) and
        compare the records with self.expected.
        """
        generator = NetworkDataGenerator()
        samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
        generator.split(samplePath, 3, False)

        self.assertRecordsEqual(generator.records, self.expected)