def generateNetworkDataFiles(self, splits, seed, preprocess, **kwargs):
    # TODO: use model.prepData()?
    ndg = NetworkDataGenerator()
    self.dataDict = ndg.split(
      filePath=self.dataPath, numLabels=self.numClasses, textPreprocess=preprocess, **kwargs)

    filename, ext = os.path.splitext(self.dataPath)
    self.classificationFile = "{}_categories.json".format(filename)

    # Generate one data file for each experiment iteration.
    if self.experimentType == "k-folds" and not self.orderedSplit:
      # only randomize the data order once for k-folds cross validation
      ndg.randomizeData(seed)
    for i in xrange(splits):
      if self.experimentType != "k-folds" and not self.orderedSplit:
        ndg.randomizeData(seed)
        seed += 1
      # e.g., ext == ".csv"
      dataFile = "{}_network_{}{}".format(filename, i, ext)
      ndg.saveData(dataFile, self.classificationFile)
      self.dataFiles.append(dataFile)

    if self.verbosity > 0:
      print "{} file(s) generated at {}".format(len(self.dataFiles),
        self.dataFiles)
      print "Classification JSON is at: {}".format(self.classificationFile)
    def testSaveDataIncorrectType(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        dataOutputFile = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(self.dirName, "test_data/multi_sample_categories.csv")
        ndg.split(filename, 3, False)

        with self.assertRaises(TypeError):
            ndg.saveData(dataOutputFile, categoriesOutputFile)
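Note that this is the only test passing a .csv categories path; saveData() writes the categories as JSON, so the mismatched extension should be what raises the TypeError (compare the .json paths in the tests below).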
    def testFileRecordStreamReadData(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)
        dataOutputFile = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(self.dirName, "test_data/multi_sample_categories.json")
        ndg.saveData(dataOutputFile, categoriesOutputFile)

        # If no error is raised, then the data is in the correct format
        frs = FileRecordStream(dataOutputFile)
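FileRecordStream is nupic's reader for this CSV format; constructing it parses the three header rows (field names, types, specials), so instantiating frs is itself the format check.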
    def testSaveData(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)
        dataOutputFile = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(self.dirName, "test_data/multi_sample_categories.json")
        success = ndg.saveData(dataOutputFile, categoriesOutputFile)
        self.assertTrue(success)

        dataTable = pandas.read_csv(dataOutputFile).fillna("")

        types = {"_category": "list", "_token": "string", "_sequenceId": "int", "_reset": "int", "ID": "string"}
        specials = {"_category": "C", "_token": "", "_sequenceId": "S", "_reset": "R", "ID": ""}

        expected_records = [record for data in self.expected for record in data]
        expected_records.insert(0, specials)
        expected_records.insert(0, types)

        for idx, values in dataTable.iterrows():
            record = values.to_dict()
            if idx > 1:
                # csv values are strings, so cast the ints
                record["_sequenceId"] = int(record["_sequenceId"])
                record["_reset"] = int(record["_reset"])
            self.assertDictEqual(record, expected_records[idx])

        with open(categoriesOutputFile) as f:
            categories = json.load(f)

        expected_categories = {"kitchen": 0, "environment": 1, "not helpful": 2}
        self.assertDictEqual(categories, expected_categories)
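For reference, the saved file follows nupic's FileRecordStream layout: a field-name row, then a types row, then a specials row, then the data records. Illustratively (column order arbitrary, data rows elided):

    _category,_token,_sequenceId,_reset,ID
    list,string,int,int,string
    C,,S,R,
    ...

pandas consumes the name row as the column header, which is why rows 0 and 1 of dataTable hold the types and specials dicts, and the int casts apply only for idx > 1.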
    def testRandomize(self):
        ndg = NetworkDataGenerator()
        filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
        ndg.split(filename, 3, False)

        random.seed(1)
        ndg.randomizeData()

        dataOutputFile = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
        categoriesOutputFile = os.path.join(self.dirName, "test_data/multi_sample_categories.json")
        success = ndg.saveData(dataOutputFile, categoriesOutputFile)

        randomizedIDs = []
        dataTable = pandas.read_csv(dataOutputFile)
        for _, values in dataTable.iterrows():
            record = values.to_dict()
            idx = record["_sequenceId"]
            # The types/specials header rows read back as strings ("int", "S"),
            # so isdigit() skips them; only record a sequence ID when it
            # changes from the previous data row.
            if idx.isdigit() and (
                    not randomizedIDs or randomizedIDs[-1] != int(idx)):
                # Cast to int so the order comparison below is meaningful;
                # a list of strings would never equal range()'s list of ints.
                randomizedIDs.append(int(idx))

        self.assertNotEqual(randomizedIDs, range(len(randomizedIDs)))
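Because random.seed(1) fixes the shuffle, the assertion is deterministic across runs; it checks only that the sequence order changed from 0, 1, 2, ..., not what it changed to.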
  def setupNetData(
    self, generateData=True, seed=42, preprocess=False, **kwargs):
    """
    Resulting network data files created:
      - One for each bucket
      - One for each training rep, where samples are not repeated in a given
      file. Each sample is given its own category (_category = _sequenceId).

    The classification json is saved when generating the final training file.
    """
    if generateData:
      ndg = NetworkDataGenerator()
      self.dataDict = ndg.split(
        filePath=self.dataPath, numLabels=1, textPreprocess=preprocess,
        **kwargs)

      filename, ext = os.path.splitext(self.dataPath)
      self.classificationFile = "{}_categories.json".format(filename)

      # Generate test data files: one network data file for each bucket.
      bucketFilePaths = bucketCSVs(self.dataPath)
      for bucketFile in bucketFilePaths:
        ndg.reset()
        ndg.split(
          filePath=bucketFile, numLabels=1, textPreprocess=preprocess, **kwargs)
        bucketFileName, ext = os.path.splitext(bucketFile)
        if not self.orderedSplit:
          # the sequences will be written to the file in random order
          ndg.randomizeData(seed)
        dataFile = "{}_network{}".format(bucketFileName, ext)
        # The classification file saved here is (correctly) overwritten
        # later, when the training data files are generated.
        ndg.saveData(dataFile, self.classificationFile)
        self.bucketFiles.append(dataFile)

      # Generate training data file(s).
      self.trainingDicts = []
      uniqueDataDict = OrderedDict()
      included = []
      seqID = 0
      for dataEntry in self.dataDict.values():
        uniqueID = dataEntry[2]
        if uniqueID not in included:
          # skip over the samples that are repeated in multiple buckets
          uniqueDataDict[seqID] = dataEntry
          included.append(uniqueID)
          seqID += 1
      self.trainingDicts.append(uniqueDataDict)

      ndg.reset()
      ndg.split(
        dataDict=uniqueDataDict, numLabels=1, textPreprocess=preprocess,
        **kwargs)
      for rep in xrange(self.trainingReps):
        # use a different file for each training rep
        if not self.orderedSplit:
          ndg.randomizeData(seed)
        ndg.stripCategories()  # replace the categories with the sequence IDs
        dataFile = "{}_network_training_{}{}".format(filename, rep, ext)
        ndg.saveData(dataFile, self.classificationFile)
        self.dataFiles.append(dataFile)

      # TODO: maybe add a method (and arg) for removing all these data files

    else:
      # TODO (only if needed)
      raise NotImplementedError("Must generate data.")

    # Map label references to match the classification JSON.
    self.mapLabelRefs()
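A rough sketch of the artifacts this leaves behind (paths hypothetical, with self.dataPath = "data/sample.csv" and trainingReps = 2; bucket file names depend on what bucketCSVs() returns):

    data/sample_categories.json           # classification JSON; the final training write wins
    <bucket>_network.csv                  # one test file per bucket
    data/sample_network_training_0.csv    # one training file per rep
    data/sample_network_training_1.csv

self.bucketFiles collects the per-bucket test files and self.dataFiles the training files, so callers can feed them to the network separately.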