def testSplitPreprocess(self):
    # Split the sample CSV with preprocessing enabled and compare the records
    # held by the generator against the hand-written expectation below.
    csvPath = os.path.join(self.dirName, "test_data/multi_sample.csv")

    firstSequence = [
        {"_token": "gohbkchoo", "_categories": "0 1", "_sequenceID": 0,
         "ID": "1", "_reset": 1},
    ]
    secondSequence = [
        {"_token": "o", "_categories": "2", "_sequenceID": 1,
         "ID": "2", "_reset": 1},
        {"_token": "ca", "_categories": "2", "_sequenceID": 1,
         "ID": "2", "_reset": 0},
    ]

    generator = NetworkDataGenerator()
    # NOTE(review): positional args are presumably (numLabels, preprocess);
    # confirm against NetworkDataGenerator.split().
    generator.split(csvPath, 3, True, ignoreCommon=100, correctSpell=True)

    self.assertRecordsEqual(generator.records,
                            [firstSequence, secondSequence])
def testSplitNoPreprocess(self):
    # Splitting without preprocessing should yield exactly the records kept
    # in self.expected.
    samplePath = (
        self.dirName
        + "/../../../data/network_data_generator/multi_sample.csv")

    generator = NetworkDataGenerator()
    generator.split(samplePath, 2, 3, False)

    self.assertRecordsEqual(generator.records, self.expected)
def testSaveData(self):
    # Save the split data, then read both output files back and compare them
    # with the expected header rows, records and category mapping.
    generator = NetworkDataGenerator()
    sourcePath = (
        self.dirName
        + "/../../../data/network_data_generator/multi_sample.csv")
    generator.split(sourcePath, 2, 3, False)

    splitPath = (
        self.dirName
        + "/../../../data/network_data_generator/multi_sample_split.csv")
    categoriesPath = (
        self.dirName
        + "/../../../data/network_data_generator/multi_sample_categories.json")
    self.assertTrue(generator.saveData(splitPath, categoriesPath))

    table = pandas.read_csv(splitPath).fillna("")

    # The two rows that precede the data records in the saved CSV.
    typesRow = {
        "_category0": "int",
        "_category1": "int",
        "_category2": "int",
        "token": "string",
        "_sequenceID": "int",
        "_reset": "int",
    }
    specialsRow = {
        "_category0": "C",
        "_category1": "C",
        "_category2": "C",
        "token": "",
        "_sequenceID": "S",
        "_reset": "R",
    }

    expectedRows = [typesRow, specialsRow]
    for sequence in self.expected:
        expectedRows.extend(sequence)

    for rowIdx, row in table.iterrows():
        actual = row.to_dict()
        # Empty optional category cells (filled with "" above) are dropped
        # before the comparison.
        for optionalKey in ("_category1", "_category2"):
            if actual[optionalKey] == "":
                del actual[optionalKey]
        self.assertDictEqual(actual, expectedRows[rowIdx])

    with open(categoriesPath) as categoriesFile:
        savedCategories = json.load(categoriesFile)

    self.assertDictEqual(
        savedCategories,
        {"kitchen": 0, "environment": 1, "not helpful": 2})
def testSaveDataIncorrectType(self):
    # The categories output path ends in ".csv" here; saveData is expected to
    # raise a TypeError for it.
    generator = NetworkDataGenerator()
    sourcePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    splitPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
    categoriesPath = os.path.join(
        self.dirName, "test_data/multi_sample_categories.csv")

    generator.split(sourcePath, 3, False)

    self.assertRaises(
        TypeError, generator.saveData, splitPath, categoriesPath)
def testFileRecordStreamReadData(self):
    # Save the split data and check that FileRecordStream can open the
    # resulting file.
    ndg = NetworkDataGenerator()
    filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
    ndg.split(filename, 3, False)

    dataOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_split.csv")
    categoriesOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    ndg.saveData(dataOutputFile, categoriesOutputFile)

    # If no error is raised, then the data is in the correct format
    frs = FileRecordStream(dataOutputFile)
    # Close the stream so the test does not leave the output file open.
    frs.close()
def prepData(self, dataPath, ordered=False, **kwargs):
    """
    Build the network API data for the given input file.

    @param dataPath   (str)   Path to input data file; format as expected by
                              NetworkDataGenerator.
    @param ordered    (bool)  Forwarded to NetworkDataGenerator.setupData().
    @param kwargs             Extra keyword arguments, forwarded unchanged to
                              setupData().

    @return networkDataPath   (str)   Value returned by setupData(): path to
                                      the data formatted for the network API.
    @return ndg (NetworkDataGenerator) The generator that produced it.
    """
    generator = NetworkDataGenerator()
    return (
        generator.setupData(dataPath, self.numLabels, ordered, **kwargs),
        generator,
    )
def testSaveData(self):
    # Save the split data, then read both output files back and compare them
    # with the expected header rows, records and category mapping.
    generator = NetworkDataGenerator()
    sourcePath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    generator.split(sourcePath, 3, False)

    splitPath = os.path.join(self.dirName, "test_data/multi_sample_split.csv")
    categoriesPath = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    self.assertTrue(generator.saveData(splitPath, categoriesPath))

    table = pandas.read_csv(splitPath).fillna("")

    # The two rows that precede the data records in the saved CSV.
    typesRow = {
        "_categories": "list",
        "_token": "string",
        "_sequenceID": "int",
        "_reset": "int",
        "ID": "string",
    }
    specialsRow = {
        "_categories": "C",
        "_token": "",
        "_sequenceID": "S",
        "_reset": "R",
        "ID": "",
    }

    expectedRows = [typesRow, specialsRow]
    for sequence in self.expected:
        expectedRows.extend(sequence)

    for rowIdx, row in table.iterrows():
        actual = row.to_dict()
        if rowIdx > 1:
            # csv values are strings, so cast the ints (data rows only; rows
            # 0 and 1 are the types/specials rows).
            for intKey in ("_sequenceID", "_reset"):
                actual[intKey] = int(actual[intKey])
        self.assertDictEqual(actual, expectedRows[rowIdx])

    with open(categoriesPath) as categoriesFile:
        savedCategories = json.load(categoriesFile)

    self.assertDictEqual(
        savedCategories,
        {"kitchen": 0, "environment": 1, "not helpful": 2})
def testSplitPreprocess(self):
    # Split the sample CSV with preprocessing enabled and compare the records
    # with the expected per-token dictionaries for the two sequences.
    samplePath = (
        self.dirName
        + "/../../../data/network_data_generator/multi_sample.csv")

    # In each sequence only the first token carries "_reset" == "1".
    firstTokens = ["get", "rid", "trouble", "kitchen", "odor"]
    firstSequence = [
        {"_category0": "0",
         "_category1": "1",
         "_sequenceID": "0",
         "token": word,
         "_reset": "1" if position == 0 else "0"}
        for position, word in enumerate(firstTokens)
    ]

    secondTokens = ["don", "care"]
    secondSequence = [
        {"_category0": "2",
         "_sequenceID": "1",
         "token": word,
         "_reset": "1" if position == 0 else "0"}
        for position, word in enumerate(secondTokens)
    ]

    generator = NetworkDataGenerator()
    generator.split(samplePath, 2, 3, True, ignoreCommon=100,
                    correctSpell=True)

    self.assertRecordsEqual(generator.records,
                            [firstSequence, secondSequence])
def _testing(self, trial):
    """
    Run the model over the test portion of one trial and record the results.

    For every test sequence the model is queried once per token, the winning
    predictions are chosen with self._selectWinners(), and they are stored
    next to the actual labels. Afterwards self.partitions[trial] is replaced
    by (train indices, test indices) and self.samples is reloaded from the
    trial's data file.

    @param trial (int) trial count
    """
    partition = self.partitions[trial]
    trainCounts = partition[0]
    testCounts = partition[1]

    if self.verbosity > 0:
        # The first test sequence starts right after all training tokens.
        cursor = sum(trainCounts)
        startIndices = []
        for tokenCount in testCounts:
            startIndices.append(cursor)
            cursor += tokenCount
        print("\tRunner selects to test on sequences starting at indices "
              "{}".format(startIndices))

    # TODO: switch to standard (expected, actual) format
    winners = []
    actuals = []
    results = (winners, actuals)
    for seqIdx, tokenCount in enumerate(testCounts):
        predictions = [self.model.testModel() for _ in xrange(tokenCount)]
        winners.append(self._selectWinners(predictions))
        actuals.append(self.actualLabels[trial][seqIdx])

    # Prepare data for writeOutClassifications
    numTrain = len(trainCounts)
    numTest = len(testCounts)
    self.partitions[trial] = (range(numTrain),
                              range(numTrain, numTrain + numTest))
    self.samples = NetworkDataGenerator.getSamples(self.dataFiles[trial])

    self.results.append(results)
def testing(self, trial):
    """
    Run the model over the test portion of one trial and record the results.

    For every test sample the model is queried once per token, the winning
    predictions are chosen with self._selectWinners(), and they are stored
    next to the actual labels. Afterwards self.partitions[trial] is replaced
    by (train indices, test indices) and self.samples is reloaded from the
    trial's data file.

    @param trial (int) trial count
    """
    partition = self.partitions[trial]
    trainCounts = partition[0]
    testCounts = partition[1]

    if self.verbosity > 0:
        # The first test sample starts right after all training tokens.
        cursor = sum(trainCounts)
        startIndices = []
        for tokenCount in testCounts:
            startIndices.append(cursor)
            cursor += tokenCount
        print("\tRunner selects to test on sample(s) {}".format(startIndices))

    winners = []
    actuals = []
    results = (winners, actuals)
    for sampleIdx, tokenCount in enumerate(testCounts):
        predictions = [self.model.testModel() for _ in xrange(tokenCount)]
        winners.append(self._selectWinners(predictions))
        actuals.append(self.actualLabels[trial][sampleIdx])

    # Prepare data for writeOutClassifications
    numTrain = len(trainCounts)
    numTest = len(testCounts)
    self.partitions[trial] = (range(numTrain),
                              range(numTrain, numTrain + numTest))
    self.samples = NetworkDataGenerator.getSamples(self.dataFiles[trial])

    self.results.append(results)
def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
    """
    Build the network API data for the given input file.

    @param dataPath   (str)   Path to input data file; format as expected by
                              NetworkDataGenerator.
    @param ordered    (bool)  Keep order of data, or randomize.
    @param stripCats  (bool)  Remove the categories and replace them with
                              the sequence_Id.
    @param kwargs             Extra keyword arguments, forwarded unchanged to
                              NetworkDataGenerator.setupData().

    @return networkDataPath   (str)   Value returned by setupData(): path to
                                      the data formatted for the network API.
    @return ndg (NetworkDataGenerator) The generator that produced it.
    """
    generator = NetworkDataGenerator()
    return (
        generator.setupData(
            dataPath, self.numLabels, ordered, stripCats, **kwargs),
        generator,
    )
def partitionIndices(self, split, trial):
    """
    Return the per-sample token counts of the training and test sets when
    doing an ordered split.

    @param split (int) Position at which the token counts are cut: entries
                       before it go to training, the rest to testing.
    @param trial (int) Index into self.dataFiles selecting the data file.

    @return (tuple) (training token counts, test token counts)
    """
    tokenCounts = NetworkDataGenerator.getNumberOfTokens(
        self.dataFiles[trial])
    trainCounts = tokenCounts[:split]
    testCounts = tokenCounts[split:]
    return (trainCounts, testCounts)
def prepData(self, dataPath, ordered=False, stripCats=False, **kwargs):
    """
    Generate the network API version of the input data.

    @param dataPath   (str)   Path to input data file; format as expected by
                              NetworkDataGenerator.
    @param ordered    (bool)  Keep order of data, or randomize.
    @param stripCats  (bool)  Remove the categories and replace them with
                              the sequence_Id.
    @param kwargs             Extra keyword arguments, passed straight through
                              to NetworkDataGenerator.setupData().

    @return networkDataPath   (str)   Path to data formatted for network API
                                      (the value returned by setupData()).
    @return ndg (NetworkDataGenerator) The generator instance that was used.
    """
    dataGenerator = NetworkDataGenerator()
    formattedPath = dataGenerator.setupData(
        dataPath, self.numLabels, ordered, stripCats, **kwargs)
    result = (formattedPath, dataGenerator)
    return result
def testRandomize(self):
    # Shuffle the split records with a fixed seed, save them, and check that
    # the sequence IDs in the saved file are no longer in 0..N-1 order.
    ndg = NetworkDataGenerator()
    filename = (
        self.dirName +
        "/../../../data/sample_reviews_multi/sample_reviews_data_training.csv")
    ndg.split(filename, 2, 3, False)

    random.seed(1)
    ndg.randomizeData()

    dataOutputFile = (
        self.dirName +
        "/../../../data/network_data_generator/multi_sample_split.csv")
    categoriesOutputFile = (
        self.dirName +
        "/../../../data/network_data_generator/multi_sample_categories.json")
    success = ndg.saveData(dataOutputFile, categoriesOutputFile)
    # As in testSaveData: a failed save must not go unnoticed.
    self.assertTrue(success)

    randomizedIDs = []
    dataTable = pandas.read_csv(dataOutputFile)
    for _, values in dataTable.iterrows():
        record = values.to_dict()
        idx = record["_sequenceID"]
        # Rows whose _sequenceID is not numeric (e.g. the type/flag rows)
        # are skipped.
        if not idx.isdigit():
            continue
        sequenceID = int(idx)
        # Record each run of identical IDs once.
        if not randomizedIDs or randomizedIDs[-1] != sequenceID:
            randomizedIDs.append(sequenceID)

    # Compare ints with ints: the IDs read from the csv are strings, so
    # comparing them directly with range() could never be equal and the
    # assertion would pass even for unshuffled data.
    self.assertNotEqual(randomizedIDs, list(range(len(randomizedIDs))))
def setupData(self, preprocess=False, sampleIdx=2, **kwargs):
    """
    Generate the data in network API format if necessary, then load labels.

    self.dataFiles is populated with the paths of network data files, one
    for each trial. When self.generateData is set, the input file is split
    with NetworkDataGenerator and saved once per entry of self.trainSize
    (shuffled first unless self.orderedSplit is set); otherwise
    self.dataPath is reused for every trial. Finally self.actualLabels is
    rebuilt and self._mapLabelRefs() is called.

    Look at runner.py (setupData) and network_data_generator.py (split) for
    the parameters.

    @param preprocess (bool) Forwarded to NetworkDataGenerator.split().
    @param sampleIdx  (int)  Forwarded to NetworkDataGenerator.split().
    @param kwargs            Extra keyword arguments forwarded to split().
    """
    if not self.generateData:
        # Does an orderedSplit
        self.dataFiles = [self.dataPath] * len(self.trainSize)
    else:
        generator = NetworkDataGenerator()
        generator.split(
            self.dataPath, sampleIdx, self.numClasses, preprocess, **kwargs)

        root, extension = os.path.splitext(self.dataPath)
        self.classificationFile = "{}-classifications.json".format(root)

        for trial in xrange(len(self.trainSize)):
            if not self.orderedSplit:
                generator.randomizeData()
            trialFile = "{}-{}{}".format(root, trial, extension)
            generator.saveData(trialFile, self.classificationFile)
            self.dataFiles.append(trialFile)

        if self.verbosity > 0:
            print("{} file(s) generated at {}".format(
                len(self.dataFiles), self.dataFiles))
            print("Classification json is at: {}".format(
                self.classificationFile))

    labels = []
    for trial, size in enumerate(self.trainSize):
        labels.append(self._getClassifications(size, trial))
    self.actualLabels = labels

    self._mapLabelRefs()
def partitionIndices(self):
    """
    Append to self.partitions, for every trial, the per-sample token counts
    of the training and test sets (when doing an ordered split).

    Each entry of self.trainSizes is the position at which that trial's
    token counts are cut into (training counts, test counts).
    """
    for trialIdx, cut in enumerate(self.trainSizes):
        tokenCounts = NetworkDataGenerator.getNumberOfTokens(
            self.dataFiles[trialIdx])
        trainCounts = tokenCounts[:cut]
        testCounts = tokenCounts[cut:]
        self.partitions.append((trainCounts, testCounts))
def testSaveData(self):
    # Write the split data to disk and verify the saved CSV rows and the
    # saved category mapping against the expected values.
    outputDataPath = os.path.join(
        self.dirName, "test_data/multi_sample_split.csv")
    outputCategoriesPath = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")

    generator = NetworkDataGenerator()
    generator.split(
        os.path.join(self.dirName, "test_data/multi_sample.csv"), 3, False)
    saved = generator.saveData(outputDataPath, outputCategoriesPath)
    self.assertTrue(saved)

    savedTable = pandas.read_csv(outputDataPath).fillna("")

    # Rows 0 and 1 of the table are the field types and the special flags.
    headerRows = [
        {"_categories": "list", "_token": "string", "_sequenceID": "int",
         "_reset": "int", "ID": "string"},
        {"_categories": "C", "_token": "", "_sequenceID": "S",
         "_reset": "R", "ID": ""},
    ]
    dataRows = [row for sequence in self.expected for row in sequence]
    allExpected = headerRows + dataRows

    for position, series in savedTable.iterrows():
        saved_record = series.to_dict()
        if position > 1:
            # csv values are strings, so cast the ints
            saved_record["_sequenceID"] = int(saved_record["_sequenceID"])
            saved_record["_reset"] = int(saved_record["_reset"])
        self.assertDictEqual(saved_record, allExpected[position])

    with open(outputCategoriesPath) as handle:
        loaded = json.load(handle)
    wanted = {"kitchen": 0, "environment": 1, "not helpful": 2}
    self.assertDictEqual(loaded, wanted)
def _getClassifications(self, split, trial):
    """
    Gets the classifications for testing samples for a particular trial.

    Each classification string read from the trial's data file is stripped,
    split on single spaces and converted to ints; the first `split` samples
    (the training set) are then dropped.

    @param split      (int)       Size of training set
    @param trial      (int)       trial count

    @return           (list)      List of list of ids of classifications for
                                  a sample
    """
    rawLabels = NetworkDataGenerator.getClassifications(
        self.dataFiles[trial])

    parsed = []
    for labelString in rawLabels:
        parsed.append(
            [int(label) for label in labelString.strip().split(" ")])

    return parsed[split:]
def testSplitPreprocess(self):
    # With preprocessing enabled, the split should produce exactly the
    # records listed below (one list of token records per sequence).
    def makeRecord(token, categories, sequenceID, sampleID, reset):
        # Build one expected record; keys mirror those in generator.records.
        return {"_token": token,
                "_categories": categories,
                "_sequenceID": sequenceID,
                "ID": sampleID,
                "_reset": reset}

    wanted = [
        [makeRecord("gohbkchoo", "0 1", 0, "1", 1)],
        [makeRecord("o", "2", 1, "2", 1),
         makeRecord("ca", "2", 1, "2", 0)],
    ]

    dataGenerator = NetworkDataGenerator()
    inputPath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    dataGenerator.split(inputPath, 3, True, ignoreCommon=100,
                        correctSpell=True)

    self.assertRecordsEqual(dataGenerator.records, wanted)
def testSaveDataIncorrectType(self):
    # Saving with a categories path that ends in ".csv" is expected to raise
    # a TypeError.
    inputPath = os.path.join(self.dirName, "test_data/multi_sample.csv")
    outputDataPath = os.path.join(
        self.dirName, "test_data/multi_sample_split.csv")
    outputCategoriesPath = os.path.join(
        self.dirName, "test_data/multi_sample_categories.csv")

    dataGenerator = NetworkDataGenerator()
    dataGenerator.split(inputPath, 3, False)

    self.assertRaises(
        TypeError,
        dataGenerator.saveData, outputDataPath, outputCategoriesPath)
def testFileRecordStreamReadData(self):
    # Save the split data and check that FileRecordStream can open the
    # resulting file.
    ndg = NetworkDataGenerator()
    filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
    ndg.split(filename, 3, False)

    dataOutputFile = os.path.join(self.dirName,
                                  "test_data/multi_sample_split.csv")
    categoriesOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    ndg.saveData(dataOutputFile, categoriesOutputFile)

    # If no error is raised, then the data is in the correct format
    frs = FileRecordStream(dataOutputFile)
    # Close the stream so the test does not leave the output file open.
    frs.close()
def testSaveDataIncorrectType(self):
    # The categories output path ends in ".csv" here; saveData is expected to
    # raise a TypeError for it.
    generator = NetworkDataGenerator()
    sourcePath = (
        self.dirName
        + "/../../../data/network_data_generator/multi_sample.csv")
    splitPath = (
        self.dirName
        + "/../../../data/network_data_generator/multi_sample_split.csv")
    categoriesPath = (
        self.dirName
        + "/../../../data/network_data_generator/multi_sample_categories.csv")

    generator.split(sourcePath, 2, 3, False)

    self.assertRaises(
        TypeError, generator.saveData, splitPath, categoriesPath)
def setupData(self, preprocess=False, sampleIdx=2, **kwargs):
    """
    Generate the data in network API format if necessary.

    self.dataFiles is populated with the paths of network data files, one
    for each trial; self.actualLabels is then rebuilt from those files and
    self._mapLabelRefs() is called.

    Look at runner.py (setupData) and network_data_generator.py (split) for
    the parameters.

    @param preprocess (bool) Forwarded to NetworkDataGenerator.split().
    @param sampleIdx  (int)  Forwarded to NetworkDataGenerator.split().
    @param kwargs            Extra keyword arguments forwarded to split().
    """
    if self.generateData:
        dataGenerator = NetworkDataGenerator()
        dataGenerator.split(self.dataPath, sampleIdx, self.numClasses,
                            preprocess, **kwargs)

        pathParts = os.path.splitext(self.dataPath)
        stem = pathParts[0]
        suffix = pathParts[1]
        self.classificationFile = "{}-classifications.json".format(stem)

        for trialNumber in xrange(len(self.trainSize)):
            # A fresh shuffle per trial, unless the order must be kept.
            if not self.orderedSplit:
                dataGenerator.randomizeData()
            outputPath = "{}-{}{}".format(stem, trialNumber, suffix)
            dataGenerator.saveData(outputPath, self.classificationFile)
            self.dataFiles.append(outputPath)

        if self.verbosity > 0:
            generatedMessage = "{} file(s) generated at {}".format(
                len(self.dataFiles), self.dataFiles)
            print(generatedMessage)
            classificationMessage = "Classification json is at: {}".format(
                self.classificationFile)
            print(classificationMessage)
    else:
        # Does an orderedSplit
        self.dataFiles = [self.dataPath] * len(self.trainSize)

    perTrialLabels = []
    for trialNumber, trainingSize in enumerate(self.trainSize):
        perTrialLabels.append(
            self._getClassifications(trainingSize, trialNumber))
    self.actualLabels = perTrialLabels

    self._mapLabelRefs()
def setupNetData(self, preprocess=False, generateData=False, **kwargs):
    """
    Generate the data in network API format if necessary.

    self.dataFiles is populated with the paths of network data files, one
    for each trial. When self.numClasses is positive, self.actualLabels is
    rebuilt from those files and self._mapLabelRefs() is called.

    Look at runner.py (setupData) and network_data_generator.py (split) for
    the parameters.

    @param preprocess   (bool) Forwarded to NetworkDataGenerator.split().
    @param generateData (bool) When true, split the input and write one data
                               file per trial; otherwise reuse
                               self.dataPath for every trial.
    @param kwargs              Extra keyword arguments forwarded to split().
    """
    if not generateData:
        # Use the input file for each trial; maintains the order of samples.
        self.dataFiles = [self.dataPath] * len(self.trainSizes)
    else:
        # TODO: use model.prepData()?
        generator = NetworkDataGenerator()
        generator.split(self.dataPath, self.numClasses, preprocess, **kwargs)

        root, extension = os.path.splitext(self.dataPath)
        self.classificationFile = "{}_categories.json".format(root)

        for trial in xrange(len(self.trainSizes)):
            if not self.orderedSplit:
                generator.randomizeData()
            trialFile = "{}_network_{}{}".format(root, trial, extension)
            generator.saveData(trialFile, self.classificationFile)
            self.dataFiles.append(trialFile)

        if self.verbosity > 0:
            print("{} file(s) generated at {}".format(
                len(self.dataFiles), self.dataFiles))
            print("Classification JSON is at: {}".format(
                self.classificationFile))

    if self.numClasses > 0:
        # Setup labels data objects
        labels = []
        for trial, size in enumerate(self.trainSizes):
            labels.append(self._getClassifications(size, trial))
        self.actualLabels = labels
        self._mapLabelRefs()
def testRandomize(self):
    # Shuffle the split records with a fixed seed, save them, and check that
    # the sequence IDs in the saved file are no longer in 0..N-1 order.
    ndg = NetworkDataGenerator()
    filename = os.path.join(self.dirName, "test_data/multi_sample.csv")
    ndg.split(filename, 3, False)

    random.seed(1)
    ndg.randomizeData()

    dataOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_split.csv")
    categoriesOutputFile = os.path.join(
        self.dirName, "test_data/multi_sample_categories.json")
    success = ndg.saveData(dataOutputFile, categoriesOutputFile)
    # As in testSaveData: a failed save must not go unnoticed.
    self.assertTrue(success)

    randomizedIDs = []
    dataTable = pandas.read_csv(dataOutputFile)
    for _, values in dataTable.iterrows():
        record = values.to_dict()
        idx = record["_sequenceID"]
        # Rows whose _sequenceID is not numeric (e.g. the type/flag rows)
        # are skipped.
        if not idx.isdigit():
            continue
        sequenceID = int(idx)
        # Record each run of identical IDs once.
        if not randomizedIDs or randomizedIDs[-1] != sequenceID:
            randomizedIDs.append(sequenceID)

    # Compare ints with ints: the IDs read from the csv are strings, so
    # comparing them directly with range() could never be equal and the
    # assertion would pass even for unshuffled data.
    self.assertNotEqual(randomizedIDs, list(range(len(randomizedIDs))))
def testSplitNoPreprocess(self):
    # Splitting without preprocessing should yield exactly the records kept
    # in self.expected.
    samplePath = os.path.join(self.dirName, "test_data/multi_sample.csv")

    generator = NetworkDataGenerator()
    generator.split(samplePath, 3, False)

    self.assertRecordsEqual(generator.records, self.expected)