def test_sense_selection_is_random_not_dominant():
    oxfordData = sl.loadDataFromFile('../dictionaryData/oxfordExtra')
    oxfordNoun = ds.selectPoS(oxfordData, 'Noun')
    oxfordNoun4SenseMin = ds.removeWordsWithTooFewSenses(oxfordNoun, 4, 4)
    totalBaseProb = 0
    totalSelected = 0
    for i in range(100):
        # The dominant sense of a word is the sense with the most examples.
        dominantSense = {}
        for key in oxfordNoun4SenseMin:
            values = oxfordNoun4SenseMin[key]
            values = sorted(values, key=lambda x: len(x['examples']), reverse=True)
            dominantSense[key] = {'def': values[0]['def'], 'numSense': len(values)}
        oxfordSelected = ds.selectExamplesAndSenses(oxfordNoun4SenseMin, 4, 2)
        selectedCount = 0
        baseProbability = 0
        count = 0
        for key in oxfordSelected:
            if oxfordSelected[key][0]['def'] == dominantSense[key]['def']:
                selectedCount += 1
            baseProbability += 1 / float(dominantSense[key]['numSense'])
            count += 1
        totalSelected += selectedCount / float(count)
        totalBaseProb += baseProbability / count
    print('Number of times dominant sense selected: {}'.format(totalSelected / 100))
    print('Probability of dominant sense being selected: {}'.format(totalBaseProb / 100))
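# A hedged aside: the check above only prints the observed rate of picking the
# dominant sense next to the chance rate, leaving the comparison to the reader.
# If the test should fail automatically instead, the two averaged rates could be
# compared with a tolerance; the 0.05 below is an illustrative threshold, not
# something taken from the original test.
#
#     assert abs(totalSelected / 100 - totalBaseProb / 100) < 0.05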
def test_save_and_load_dataset_to_text_file(self):
    files = ['oxfordExtra', 'collinsExtra', 'semcorExtra']
    for file in files:
        dataset = sl.loadDataFromFile('../dictionaryData/' + file)
        sl.saveFullDatasetToFileAsText('tempSaveFile.txt', dataset)
        loadedData = sl.loadDataFromTextFile('tempSaveFile.txt')
        self.assertEqual(len(dataset), len(loadedData))
        for key in dataset:
            self.assertTrue(key in loadedData)
            originalWordData = dataset[key]
            loadedWordData = loadedData[key]
            self.assertEqual(len(originalWordData), len(loadedWordData))
            for originalSense, loadedSense in zip(originalWordData, loadedWordData):
                self.assertEqual(originalSense['def'], loadedSense['def'])
                self.assertEqual(originalSense['pos'], loadedSense['pos'])
                self.assertEqual(originalSense['inWordNet'], loadedSense['inWordNet'])
                self.assertEqual(originalSense['inSemcor'], loadedSense['inSemcor'])
                self.assertEqual(originalSense['semcorWordFreq'], loadedSense['semcorWordFreq'])
                self.assertEqual(originalSense['senseCount'], loadedSense['senseCount'])
                self.assertEqual(originalSense['inCoca5000WordFreq'], loadedSense['inCoca5000WordFreq'])
                self.assertEqual(originalSense['coca5000WordFreq'], loadedSense['coca5000WordFreq'])
                self.assertEqual(len(originalSense['examples']), len(loadedSense['examples']))
                for originalExample, loadedExample in zip(originalSense['examples'],
                                                          loadedSense['examples']):
                    self.assertEqual(originalExample, loadedExample)
def test_load_data_from_file(self):
    dictToSave = self.getLargerDict()
    sl.saveDataToFile('testSaveFile', dictToSave)
    loadedDict = sl.loadDataFromFile('testSaveFile')
    self.assertEqual(len(dictToSave), len(loadedDict))
    for key in loadedDict:
        loadedSenses = loadedDict[key]
        originalSenses = dictToSave[key]
        for i in range(len(loadedSenses)):
            self.assertEqual(loadedSenses[i]['pos'], originalSenses[i]['pos'])
            self.assertEqual(loadedSenses[i]['def'], originalSenses[i]['def'])
            self.assertEqual(loadedSenses[i]['examples'], originalSenses[i]['examples'])
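# A minimal sketch of the fixture implied by the assertions above. The real
# getLargerDict() helper is defined elsewhere in the test class; the shape
# below (each word keyed to a list of sense dicts with 'pos', 'def' and
# 'examples') is inferred from the keys the test compares, and the values are
# illustrative only.
#
#     def getLargerDict(self):
#         return {
#             'bank': [
#                 {'pos': 'Noun', 'def': 'a financial institution',
#                  'examples': ['she deposited the cheque at the bank']},
#                 {'pos': 'Noun', 'def': 'the land alongside a river',
#                  'examples': ['they picnicked on the river bank']},
#             ],
#         }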
def test_save_and_load_dataset_to_text_file_no_metadata(self):
    files = ['oxfordExtra', 'collinsExtra', 'semcorExtra']
    for file in files:
        dataset = sl.loadDataFromFile('../dictionaryData/' + file)
        sl.saveFullDatasetToFileAsText('tempSaveFile.txt', dataset, False)
        loadedData = sl.loadDataFromTextFile('tempSaveFile.txt', False)
        self.assertEqual(len(dataset), len(loadedData))
        for key in dataset:
            self.assertTrue(key in loadedData)
            originalWordData = dataset[key]
            loadedWordData = loadedData[key]
            self.assertEqual(len(originalWordData), len(loadedWordData))
            for originalSense, loadedSense in zip(originalWordData, loadedWordData):
                # Without metadata only 'def', 'pos' and 'examples' survive the round trip.
                self.assertEqual(len(loadedSense.keys()), 3)
                self.assertEqual(originalSense['def'], loadedSense['def'])
                self.assertEqual(originalSense['pos'], loadedSense['pos'])
                self.assertEqual(len(originalSense['examples']), len(loadedSense['examples']))
                for originalExample, loadedExample in zip(originalSense['examples'],
                                                          loadedSense['examples']):
                    self.assertEqual(originalExample, loadedExample)
def main(argv):
    """
    Runs evaluation of a prediction technique on a selected evaluation problem
    from a selected dataset. Runs the evaluation multiple times and prints
    stats to output. Takes as an argument the file path to a configuration
    file that is used to set the parameters of the evaluation.
    """
    startTime = time.time()
    parser = SafeConfigParser()
    parser.read(argv[0])
    validConfig = validateConfigFile(parser)
    if not validConfig:
        print('[ERROR] - Config not valid!')
        exit()
    seed(parser.getint('evaluation_params', 'seedNo'))
    # print('Remove stop words: {} Remove punctuation: {} Lemmatize: {}'.format(rmStopwords, rmPunct, lemmatize))
    dictionaryDataPath = parser.get('evaluation_params', 'dictionary')
    try:
        evaluationData = sl.loadDataFromFile('dictionaryData/' + dictionaryDataPath)
    except IOError as err:
        print(dictionaryDataPath + ' cannot be found in the dictionaryData directory.')
        exit()
    evaluationData = ds.selectPoS(evaluationData, parser.get('evaluation_params', 'pos'))
    evaluationData = ds.removeWordsWithTooFewSenses(
        evaluationData,
        parser.getint('evaluation_params', 'numOfSenses'),
        parser.getint('evaluation_params', 'numOfExamp'))
    evaluationData = ds.examplesToLowerCase(evaluationData)
    evaluationData = ds.tokenizeAndLemmatizeExamples(
        evaluationData, parser.getboolean('evaluation_params', 'lemmatize'))
    evaluationData = ds.removeStopwordsAndPunct(
        evaluationData,
        parser.getboolean('evaluation_params', 'rmStopwords'),
        parser.getboolean('evaluation_params', 'rmPunct'))

    # Bucket example sentences by length (space count as a proxy for word
    # count) and write each bucket to its own tab-separated file.
    num_examples_lt_5 = 0
    num_examples_lt_10 = 0
    num_examples_lt_20 = 0
    num_examples_all = 0
    base_path = '/Users/thomas/DevSandbox/EpicDataShelf/tag-lab/sense_alloc'
    pos = parser.get('evaluation_params', 'pos')
    out_dir = os.path.join(base_path, dictionaryDataPath.split('.')[0])
    with open(os.path.join(out_dir, '{}_lt_5.txt'.format(pos)), 'w', encoding='utf-8') as lt_5_file, \
            open(os.path.join(out_dir, '{}_lt_10.txt'.format(pos)), 'w', encoding='utf-8') as lt_10_file, \
            open(os.path.join(out_dir, '{}_lt_20.txt'.format(pos)), 'w', encoding='utf-8') as lt_20_file, \
            open(os.path.join(out_dir, '{}_all.txt'.format(pos)), 'w', encoding='utf-8') as all_file:
        for key, vals in list(evaluationData.items()):
            print('### ITEM: {} ###'.format(key))
            # print('\t{}\n'.format(json.dumps(vals, indent=4)))
            for val in vals:
                ex_lt_5 = []
                ex_lt_10 = []
                ex_lt_20 = []
                ex_all = []
                for ex in val['examples']:
                    if ex['sent'].count(' ') < 5:
                        num_examples_lt_5 += 1
                        ex_lt_5.append((ex['sent'], val['def']))
                    if ex['sent'].count(' ') < 10:
                        num_examples_lt_10 += 1
                        ex_lt_10.append((ex['sent'], val['def']))
                    if ex['sent'].count(' ') < 20:
                        # print('{}: {}'.format(ex['sent'].count(' '), ex['sent']))
                        num_examples_lt_20 += 1
                        ex_lt_20.append((ex['sent'], val['def']))
                    num_examples_all += 1
                    ex_all.append((ex['sent'], val['def']))
                # Only write out buckets that hold more than two examples.
                for l, f in [(ex_lt_5, lt_5_file), (ex_lt_10, lt_10_file),
                             (ex_lt_20, lt_20_file), (ex_all, all_file)]:
                    if len(l) > 2:
                        for ex, syn in l:
                            f.write('{}\t{}\t{}\n'.format(key, ex, syn))
    print('---------------------------------------------------')
    print('<=5: {}; <=10: {}; <=20: {}'.format(num_examples_lt_5,
                                               num_examples_lt_10,
                                               num_examples_lt_20))
    """
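# For reference, an illustrative config file covering every key main() reads
# from the 'evaluation_params' section above. Only the section and key names
# come from the parser.get* calls; the values (and the file name passed as
# argv[0]) are made-up examples.
#
#     [evaluation_params]
#     seedNo = 42
#     dictionary = oxfordExtra
#     pos = Noun
#     numOfSenses = 4
#     numOfExamp = 4
#     lemmatize = True
#     rmStopwords = True
#     rmPunct = True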