def test_word_crossover_word_pairs(self):
    """Check that word-crossover selection yields only the expected groups.

    Nine sentences are supplied under a single key. Every group returned
    for that key must match, as a set, one of three expected groupings.
    """
    groupedPredictor = GroupedPredictions()
    sentences = [
        'the the the the big cat',
        'the big big big big dog',
        'big big big big apples are green',
        'people like the the the the cars',
        'people like big big big big boats',
        'the the the the apples are red',
        'the big mouse',
        'people like cat',
        'tomatoes are red',
    ]
    # Each example pairs the raw sentence with its whitespace-split tokens.
    testData = {
        'word1': [
            {'sent': sentence, 'tokens': sentence.split(' ')}
            for sentence in sentences
        ],
    }
    correctGroups = [
        {
            'the the the the big cat',
            'the the the the apples are red',
            'people like the the the the cars',
        },
        {
            'people like cat',
            'the big mouse',
            'tomatoes are red',
        },
        {
            'people like big big big big boats',
            'the big big big big dog',
            'big big big big apples are green',
        },
    ]

    # NOTE(review): the third argument presumably switches on the
    # "word pairs" variant named in the test title -- confirm against
    # wordCrossoverSelection.
    results = groupedPredictor.wordCrossoverSelection(testData, 3, True)

    for predictedGroup in results['word1']:
        self.assertTrue(set(predictedGroup) in correctGroups)
def test_grouped_caluculate_accuracy_4_by_4(self):
    """Check calculateAccuracy on four groups of four examples.

    The expected score is 1 only when every predicted group is correct;
    the partially correct and fully incorrect predictions both expect 0.
    """
    groupedPredictor = GroupedPredictions()
    # Examples a1..a4, b1..b4, c1..c4, d1..d4, each with a single token.
    labels = [
        letter + str(index)
        for letter in 'abcd'
        for index in range(1, 5)
    ]
    testData = {
        'word1': [{'sent': label, 'tokens': [label]} for label in labels],
    }

    cases = [
        # all groups correct
        ([['d2', 'd1', 'd4', 'd3'], ['a4', 'a2', 'a3', 'a1'],
          ['b1', 'b4', 'b2', 'b3'], ['c2', 'c1', 'c4', 'c3']], 1),
        # 2 groups correct
        ([['d2', 'd1', 'd4', 'd3'], ['a4', 'b2', 'a3', 'a1'],
          ['b1', 'b4', 'a2', 'b3'], ['c2', 'c1', 'c4', 'c3']], 0),
        # 0 groups correct
        ([['d2', 'd1', 'd4', 'a3'], ['a4', 'a2', 'b3', 'a1'],
          ['b1', 'b4', 'b2', 'c3'], ['c2', 'c1', 'c4', 'd3']], 0),
    ]
    for predictedGroups, expected in cases:
        results = {'word1': predictedGroups}
        accuracy = groupedPredictor.calculateAccuracy(results, testData)
        self.assertEqual(accuracy, expected)
def test_creation_of_all_possible_groups_of_3_by_3_and_4_by_4(self):
    """Check how many groupings createAllGroupsOfSize3 builds for nine items.

    Nine items can be split into three unordered groups of three in
    9! / (3!^3 * 3!) = 280 ways, which is the count asserted here.
    """
    groupedPredictor = GroupedPredictions()
    nineItems = list('abcdefghi')
    allGroups = groupedPredictor.createAllGroupsOfSize3(nineItems)
    self.assertEqual(len(allGroups), 280)

    # Takes a long time to run
    # NOTE(review): only the 3-by-3 case is exercised in this view; the
    # 4-by-4 part named in the test title is not visible here.
def test_calculate_group_score(self):
    """Check calculateGroupScore against a hand-computed sum.

    The group ['a', 'b', 'c'] maps to ids 0, 1 and 2, so the expected score
    is the sum of all nine entries in rows 0-2 / columns 0-2 of the
    similarity values, diagonal included.
    """
    groupedPredictor = GroupedPredictions()
    group = ['a', 'b', 'c']
    letterToIDMap = {'a': 0, 'b': 1, 'c': 2}
    simValues = self.get_sim_values()

    # Add the cells strictly left to right, row by row, so the expected
    # value is accumulated in a fixed, explicit order.
    cells = [simValues[row][col] for row in range(3) for col in range(3)]
    groupScoreManual = cells[0]
    for cell in cells[1:]:
        groupScoreManual = groupScoreManual + cell

    groupScore = groupedPredictor.calculateGroupScore(
        group, simValues, letterToIDMap)
    self.assertEqual(groupScore, groupScoreManual)
def runGroupedTest(data, method, model, accuracyMeasure):
    """
    Runs a grouped evaluation problem prediction on the given data and
    returns the accuracy using the selected accuracy measure.

    Args:
    data: The data to perform the prediction on.
    method: The selection of the prediction method to be used, valid
    arguments are 'random', 'wordCrossover' or 'word2vec'
    model: A trained word2vec model if method is 'word2vec' else None.
    accuracyMeasure: The measure by which the accuracy will be measured
    either 'total' or 'pairs'.

    Returns:
    The accuracy as a float of using the selected prediction method on
    the given data using the selected accuracy measure.

    Raises:
    ValueError: If method or accuracyMeasure is not one of the valid
    arguments listed above.
    """
    validMethods = ('random', 'wordCrossover', 'word2vec')
    validMeasures = ('total', 'pairs')

    # Validate before doing any work: previously an unknown method failed
    # later with an unbound 'selections' variable, and an unknown accuracy
    # measure silently returned None.
    if method not in validMethods:
        raise ValueError(
            'Unknown method {0!r}; expected one of: {1}'.format(
                method, ', '.join(validMethods)))
    if accuracyMeasure not in validMeasures:
        raise ValueError(
            'Unknown accuracyMeasure {0!r}; expected one of: {1}'.format(
                accuracyMeasure, ', '.join(validMeasures)))

    dataTest = GroupedPredictions()
    groupTestData = ds.createGroupedTestData(data)
    # sl.saveGroupedData('oxfordGroupedTest', groupTestData)

    if method == 'random':
        selections = dataTest.randomSelection(groupTestData, 3)
    elif method == 'wordCrossover':
        selections = dataTest.wordCrossoverSelection(groupTestData, 3)
    else:
        # Only 'word2vec' remains after validation.
        selections = dataTest.word2VecSimilaritySelection(
            groupTestData, 3, model)

    if accuracyMeasure == 'total':
        return dataTest.calculateAccuracy(selections, groupTestData)
    return dataTest.calculateAccuracyPairs(selections, groupTestData)
def test_grouped_calculate_accuracy_3_by_3(self):
    """Check calculateAccuracy on three groups of three examples.

    The expected score is 1 only when every predicted group is correct;
    predictions with one or zero correct groups both expect 0.
    """
    groupedPredictor = GroupedPredictions()
    testData = {'word1': ['a1', 'a2', 'a3', 'b1', 'b2', 'b3',
                          'c1', 'c2', 'c3']}
    testData = self.formatTestData(testData)

    cases = [
        # all groups correct
        ([['b3', 'b1', 'b2'], ['c2', 'c1', 'c3'], ['a1', 'a3', 'a2']], 1),
        # 1 group correct
        ([['b3', 'b1', 'b2'], ['c2', 'c1', 'a3'], ['a1', 'c3', 'a2']], 0),
        # 0 groups correct
        # The last group previously listed 'b3' a second time and left out
        # 'b1', so the prediction did not cover each example exactly once.
        ([['b3', 'a3', 'c2'], ['c3', 'a1', 'a2'], ['c1', 'b1', 'b2']], 0),
    ]
    for predictedGroups, expected in cases:
        results = {'word1': predictedGroups}
        accuracy = groupedPredictor.calculateAccuracy(results, testData)
        self.assertEqual(accuracy, expected)
def test_group_by_similarity_brute_force(self):
    """Check brute-force similarity grouping against a known partition.

    The grouping runs twice over the same nine examples: first with
    get_sim_values() and a final argument of False, then with
    get_sim_values_inverse() and a final argument of True. Every group
    returned by either run must be one of the three expected groupings.
    """
    groupedPredictor = GroupedPredictions()
    examples = [
        {'sent': letter, 'tokens': [letter]} for letter in 'abcdefghi'
    ]
    correctGroupings = [
        {'a', 'b', 'd'},
        {'c', 'e', 'h'},
        {'f', 'g', 'i'},
    ]

    # NOTE(review): the boolean presumably tells groupBySimilarityBF how to
    # interpret the values (plain vs. inverse) -- confirm against its
    # definition.
    runs = (
        (self.get_sim_values, False),
        (self.get_sim_values_inverse, True),
    )
    for loadSimValues, flag in runs:
        simValues = loadSimValues()
        results = groupedPredictor.groupBySimilarityBF(
            examples, simValues, 3, flag)
        for group in results:
            self.assertTrue(set(group) in correctGroupings)
def test_grouped_random_selection_4_by_4(self):
    """Check that random selection splits 16 examples into 4 groups of 4.

    For every predicted key the test verifies that the key exists in the
    input, that there are four groups of four items, that no item is
    used twice, and that all input letters are used.
    """
    groupedPredictor = GroupedPredictions()
    groupSize = 4
    letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
               'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p']
    # Use the 'tokens' key like every other fixture in this file; this
    # fixture previously spelled it 'token'.
    examples = [{'sent': letter, 'tokens': word_tokenize(letter)}
                for letter in letters]
    testData = {'word1': examples}

    predictions = groupedPredictor.randomSelection(testData, groupSize)

    for key in predictions:
        self.assertTrue(key in testData)
        results = predictions[key]
        self.assertEqual(len(results), 4)
        used = []
        for group in results:
            self.assertEqual(len(group), 4)
            for item in group:
                self.assertTrue(item not in used)
                used.append(item)
        # Every example must appear in exactly one group.
        self.assertEqual(len(examples), len(used))
        self.assertEqual(set(letters), set(used))
def test_grouped_random_selection_3_by_3(self):
    """Check that random selection splits 9 examples into 3 groups of 3.

    Two keys are supplied. For each predicted key the test verifies that
    the key exists in the input, that there are three groups of three
    items, and that no item appears twice within that key's groups.
    """
    groupedPredictor = GroupedPredictions()
    groupSize = 3
    rawData = {
        'word1': list('abcdefghi'),
        'word2': list('jklmnoprs'),
    }
    # Wrap every raw string as an example with its tokenized form.
    testData = {
        key: [{'sent': text, 'tokens': word_tokenize(text)}
              for text in texts]
        for key, texts in rawData.items()
    }

    predictions = groupedPredictor.randomSelection(testData, groupSize)

    for key in predictions:
        self.assertTrue(key in testData)
        groups = predictions[key]
        self.assertEqual(len(groups), 3)
        seen = []
        for group in groups:
            self.assertEqual(len(group), 3)
            for item in group:
                self.assertTrue(item not in seen)
                seen.append(item)
def test_grouped_calculate_accuracy_pairs_4_by_4(self):
    """Check calculateAccuracyPairs on four groups of four examples.

    Four groups of four contain 4 * 6 = 24 within-group pairs, so each
    expected score is the number of correctly paired items over 24.
    """
    groupedPredictor = GroupedPredictions()
    testData = self.formatTestData(
        {'word1': ['a1', 'a2', 'a3', 'a4', 'b1', 'b2', 'b3', 'b4',
                   'c1', 'c2', 'c3', 'c4', 'd1', 'd2', 'd3', 'd4']})
    totalPairs = 24

    # (predicted groups, number of correct pairs)
    cases = [
        # all pairs correct
        ([['d2', 'd1', 'd4', 'd3'], ['a4', 'a2', 'a3', 'a1'],
          ['b1', 'b4', 'b2', 'b3'], ['c2', 'c1', 'c4', 'c3']], 24),
        # 12 pairs correct
        ([['d2', 'd1', 'd4', 'd3'], ['a4', 'c2', 'a3', 'a1'],
          ['c3', 'b4', 'c4', 'b3'], ['b2', 'b1', 'c1', 'a2']], 12),
        # 6 pairs correct
        ([['b3', 'c1', 'b2', 'b4'], ['c3', 'a2', 'c2', 'd1'],
          ['b1', 'a3', 'a1', 'd3'], ['d4', 'a4', 'c4', 'd2']], 6),
        # 0 pairs correct
        ([['a1', 'b1', 'c1', 'd1'], ['a2', 'b2', 'c2', 'd2'],
          ['a3', 'b3', 'c3', 'd3'], ['a4', 'b4', 'c4', 'd4']], 0),
    ]
    for predictedGroups, correctPairs in cases:
        results = {'word1': predictedGroups}
        accuracy = groupedPredictor.calculateAccuracyPairs(results, testData)
        self.assertEqual(accuracy, correctPairs / float(totalPairs))
def runGroupedTest(data, method, model, accuracyMeasure):
    """
    Runs a grouped evaluation problem prediction on the given data and
    returns the accuracy using the selected accuracy measure.

    Args:
    data: The data to perform the prediction on.
    method: The selection of the prediction method to be used, valid
    arguments are 'random', 'wordCrossover' or 'word2vec'
    model: A trained word2vec model if method is 'word2vec' else None.
    accuracyMeasure: The measure by which the accuracy will be measured
    either 'total' or 'pairs'.

    Returns:
    The accuracy as a float of using the selected prediction method on
    the given data using the selected accuracy measure.

    Raises:
    ValueError: If method or accuracyMeasure is not one of the valid
    arguments listed above.
    """
    validMethods = ('random', 'wordCrossover', 'word2vec')
    validMeasures = ('total', 'pairs')

    # Reject bad arguments up front. Without these checks an unknown
    # method left 'selections' unassigned and an unknown accuracy measure
    # made the function return None without any error.
    if method not in validMethods:
        raise ValueError(
            'Unknown method {0!r}; expected one of: {1}'.format(
                method, ', '.join(validMethods)))
    if accuracyMeasure not in validMeasures:
        raise ValueError(
            'Unknown accuracyMeasure {0!r}; expected one of: {1}'.format(
                accuracyMeasure, ', '.join(validMeasures)))

    dataTest = GroupedPredictions()
    groupTestData = ds.createGroupedTestData(data)
    #sl.saveGroupedData('oxfordGroupedTest', groupTestData)

    if method == 'random':
        selections = dataTest.randomSelection(groupTestData, 3)
    elif method == 'wordCrossover':
        selections = dataTest.wordCrossoverSelection(groupTestData, 3)
    else:
        # Validation above guarantees this is the 'word2vec' method.
        selections = dataTest.word2VecSimilaritySelection(
            groupTestData, 3, model)

    if accuracyMeasure == 'total':
        return dataTest.calculateAccuracy(selections, groupTestData)
    return dataTest.calculateAccuracyPairs(selections, groupTestData)
def test_grouped_word2vec_selection_3_by_3(self):
    """Check the shape of word2vec-based grouping for two keys of 9 words.

    A Word2Vec model is loaded from self.brownFilePath. For each predicted
    key there must be three groups of three words, no word may be used
    twice across all keys, and together the groups must cover every
    input word.
    """
    model = Word2Vec.load(self.brownFilePath)
    groupedPredictor = GroupedPredictions()
    groupSize = 3

    words1 = ['cat', 'dog', 'horse', 'apple', 'orange', 'lemon',
              'England', 'France', 'Spain']
    words2 = ['boy', 'girl', 'man', 'bus', 'car', 'boat',
              'pencil', 'pen', 'rubber']
    allExamples = words1 + words2

    testData = {
        'word1': [{'sent': word, 'tokens': word_tokenize(word)}
                  for word in words1],
        'word2': [{'sent': word, 'tokens': word_tokenize(word)}
                  for word in words2],
    }

    predictions = groupedPredictor.word2VecSimilaritySelection(
        testData, groupSize, model)

    # 'seen' is shared across keys, so a word may not repeat anywhere.
    seen = []
    for key in predictions:
        self.assertTrue(key in testData)
        groups = predictions[key]
        self.assertEqual(len(groups), 3)
        for group in groups:
            self.assertEqual(len(group), 3)
            for item in group:
                self.assertTrue(item not in seen)
                seen.append(item)

    self.assertEqual(len(allExamples), len(seen))
    self.assertEqual(set(allExamples), set(seen))
def test_grouped_calculate_accuracy_pairs_3_by_3(self):
    """Check calculateAccuracyPairs on three groups of three examples.

    Three groups of three contain 3 * 3 = 9 within-group pairs, so each
    expected score is the number of correctly paired items over 9.
    """
    groupedPredictor = GroupedPredictions()
    testData = self.formatTestData(
        {'word1': ['a1', 'a2', 'a3', 'b1', 'b2', 'b3', 'c1', 'c2', 'c3']})
    totalPairs = 9

    # (predicted groups, number of correct pairs)
    cases = [
        # all pairs correct
        ([['b3', 'b1', 'b2'], ['c2', 'c1', 'c3'], ['a1', 'a3', 'a2']], 9),
        # 5 pairs correct
        ([['b3', 'c1', 'c2'], ['a3', 'a2', 'a1'], ['b1', 'c3', 'b2']], 5),
        # 3 pairs correct
        ([['b3', 'c1', 'b2'], ['c3', 'a2', 'c2'], ['b1', 'a3', 'a1']], 3),
        # 0 pairs correct
        ([['a2', 'c1', 'b3'], ['b2', 'a3', 'c2'], ['b1', 'c3', 'a1']], 0),
    ]
    for predictedGroups, correctPairs in cases:
        results = {'word1': predictedGroups}
        accuracy = groupedPredictor.calculateAccuracyPairs(results, testData)
        self.assertEqual(accuracy, correctPairs / float(totalPairs))