def test_sense_selection_is_random_not_dominant():
	# Over 100 runs, compare how often ds.selectExamplesAndSenses picks the
	# dominant sense (the sense with the most examples) against the rate
	# expected if senses were chosen uniformly at random.
	oxfordData = sl.loadDataFromFile('../dictionaryData/oxfordExtra')
	oxfordNoun = ds.selectPoS(oxfordData, 'Noun')
	oxfordNoun4SenseMin = ds.removeWordsWithTooFewSenses(oxfordNoun, 4, 4)
	totalBaseProb = 0
	totalSelected = 0
	for i in range(100):
		dominantSense = {}
		for key in oxfordNoun4SenseMin:
			values = oxfordNoun4SenseMin[key]
			values = sorted(values, key=lambda x:len(x['examples']), reverse=True)
			dominantSense[key] = {'def':values[0]['def'], 'numSense':len(values)}
		oxfordSelected = ds.selectExamplesAndSenses(oxfordNoun4SenseMin, 4, 2)
		selectedCount = 0
		baseProbability = 0
		count = 0
		for key in oxfordSelected:
			if oxfordSelected[key][0]['def'] == dominantSense[key]['def']:
				selectedCount += 1
			baseProbability += 1/float(dominantSense[key]['numSense'])
			count += 1
		totalSelected += selectedCount/float(count)
		totalBaseProb += baseProbability/count
	print('Rate at which dominant sense was selected: {}'.format(totalSelected/100))
	print('Expected rate under uniform random selection: {}'.format(totalBaseProb/100))
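If selection really is uniform at random, the two rates printed above should roughly agree across the 100 runs. A minimal assertion-based variant of that check, as a sketch only: the helper name and the 0.05 tolerance are assumptions, not part of the original suite.

def assert_selection_close_to_chance(totalSelected, totalBaseProb, runs=100):
	# Hypothetical helper: fail when the dominant sense is picked notably
	# more often than the uniform base probability predicts.
	selectionRate = totalSelected / float(runs)
	baseRate = totalBaseProb / float(runs)
	assert abs(selectionRate - baseRate) < 0.05, \
		'dominant sense rate {} vs chance rate {}'.format(selectionRate, baseRate)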
 def test_save_and_load_dataset_to_text_file(self):
     # Round-trips each dataset through the text format and checks that
     # every word, sense field, and example is preserved.
     files = ['oxfordExtra', 'collinsExtra', 'semcorExtra']
     for file in files:
         dataset = sl.loadDataFromFile('../dictionaryData/' + file)
         sl.saveFullDatasetToFileAsText('tempSaveFile.txt', dataset)
         loadedData = sl.loadDataFromTextFile('tempSaveFile.txt')
         self.assertEqual(len(dataset), len(loadedData))
         for key in dataset:
             self.assertTrue(key in loadedData)
             originalWordData = dataset[key]
             loadedWordData = loadedData[key]
             self.assertEqual(len(originalWordData), len(loadedWordData))
             for originalSense, loadedSense in zip(originalWordData,
                                                   loadedWordData):
                 self.assertEqual(originalSense['def'], loadedSense['def'])
                 self.assertEqual(originalSense['pos'], loadedSense['pos'])
                 self.assertEqual(originalSense['inWordNet'],
                                  loadedSense['inWordNet'])
                 self.assertEqual(originalSense['inSemcor'],
                                  loadedSense['inSemcor'])
                 self.assertEqual(originalSense['semcorWordFreq'],
                                  loadedSense['semcorWordFreq'])
                 self.assertEqual(originalSense['senseCount'],
                                  loadedSense['senseCount'])
                 self.assertEqual(originalSense['inCoca5000WordFreq'],
                                  loadedSense['inCoca5000WordFreq'])
                 self.assertEqual(originalSense['coca5000WordFreq'],
                                  loadedSense['coca5000WordFreq'])
                 self.assertEqual(len(originalSense['examples']),
                                  len(loadedSense['examples']))
                 for originalExample, loadedExample in zip(
                         originalSense['examples'],
                         loadedSense['examples']):
                     self.assertEqual(originalExample, loadedExample)
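The assertions above pin down the per-sense record shape that the text round trip must preserve. An illustrative record, with made-up values, assuming the {'sent': ...} example form used in main() at the end of this file:

sampleSense = {
    'def': 'a large natural stream of water',
    'pos': 'Noun',
    'inWordNet': True,
    'inSemcor': True,
    'semcorWordFreq': 12,
    'senseCount': 3,
    'inCoca5000WordFreq': True,
    'coca5000WordFreq': 1742,
    'examples': [{'sent': 'the river flooded last spring'}],
}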
	def test_load_data_from_file(self):
		# Saves a dictionary to disk, reloads it, and checks that pos, def,
		# and examples survive for every sense of every word.
		dictToSave = self.getLargerDict()
		sl.saveDataToFile('testSaveFile', dictToSave)
		loadedDict = sl.loadDataFromFile('testSaveFile')
		self.assertEqual(len(dictToSave), len(loadedDict))
		for key in loadedDict:
			loadedSenses = loadedDict[key]
			originalSenses = dictToSave[key]
			for i in range(len(loadedSenses)):
				self.assertEqual(loadedSenses[i]['pos'], originalSenses[i]['pos'])
				self.assertEqual(loadedSenses[i]['def'], originalSenses[i]['def'])
				self.assertEqual(loadedSenses[i]['examples'], originalSenses[i]['examples'])
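The test depends on a getLargerDict helper defined elsewhere in the suite. A hypothetical stand-in, consistent only with the fields the test compares (pos, def, examples), might look like this:

	def getLargerDict(self):
		# Hypothetical stand-in for the suite's real helper: maps words to
		# lists of sense records carrying the fields the test compares.
		return {
			'bank': [
				{'pos': 'Noun', 'def': 'the land alongside a river',
				 'examples': [{'sent': 'they sat on the bank'}]},
				{'pos': 'Noun', 'def': 'a financial institution',
				 'examples': [{'sent': 'the bank was closed'}]},
			],
		}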
	def test_save_and_load_dataset_to_text_file_no_metadata(self):
		# With metadata disabled, only 'def', 'pos', and 'examples' should
		# survive the text round trip (hence the three-key check below).
		files = ['oxfordExtra', 'collinsExtra', 'semcorExtra']
		for file in files:
			dataset = sl.loadDataFromFile('../dictionaryData/'+file)
			sl.saveFullDatasetToFileAsText('tempSaveFile.txt', dataset, False)
			loadedData = sl.loadDataFromTextFile('tempSaveFile.txt', False)
			self.assertEqual(len(dataset), len(loadedData))
			for key in dataset:
				self.assertTrue(key in loadedData)
				originalWordData = dataset[key]
				loadedWordData = loadedData[key]
				self.assertEqual(len(originalWordData), len(loadedWordData))
				for originalSense, loadedSense in zip(originalWordData, loadedWordData):
					self.assertEqual(len(loadedSense.keys()), 3)
					self.assertEqual(originalSense['def'], loadedSense['def'])
					self.assertEqual(originalSense['pos'], loadedSense['pos'])
					self.assertEqual(len(originalSense['examples']), len(loadedSense['examples']))
					for originalExample, loadedExample in zip(originalSense['examples'], loadedSense['examples']):
						self.assertEqual(originalExample, loadedExample)
def main(argv):
    """
	Runs evaluation of a prediction technique on a selected evaluation problem
	from a selected dataset. Runs the evaluation multiple times and prints stats
	to output. Takes as an argument the file path to a configeration file that
	is used to set the parameters of the evaluation.
	"""
    startTime = time.time()
    parser = SafeConfigParser()
    parser.read(argv[0])

    validConfig = validateConfigFile(parser)
    if not validConfig:
        print('[ERROR] - Config not valid!')
        exit()

    seed(parser.getint('evaluation_params', 'seedNo'))
    # print('Remove stop words: {} Remove punctuation: {} Lemmatize: {}'.format(rmStopwords, rmPunct, lemmatize))
    dictionaryDataPath = parser.get('evaluation_params', 'dictionary')
    try:
        evaluationData = sl.loadDataFromFile('dictionaryData/' +
                                             dictionaryDataPath)
    except IOError:
        print(dictionaryDataPath +
              ' cannot be found in the dictionaryData directory.')
        exit()

    evaluationData = ds.selectPoS(evaluationData,
                                  parser.get('evaluation_params', 'pos'))
    evaluationData = ds.removeWordsWithTooFewSenses(
        evaluationData, parser.getint('evaluation_params', 'numOfSenses'),
        parser.getint('evaluation_params', 'numOfExamp'))
    evaluationData = ds.examplesToLowerCase(evaluationData)
    evaluationData = ds.tokenizeAndLemmatizeExamples(
        evaluationData, parser.getboolean('evaluation_params', 'lemmatize'))
    evaluationData = ds.removeStopwordsAndPunct(
        evaluationData, parser.getboolean('evaluation_params', 'rmStopwords'),
        parser.getboolean('evaluation_params', 'rmPunct'))

    num_examples_lt_5 = 0
    num_examples_lt_10 = 0
    num_examples_lt_20 = 0
    num_examples_all = 0
    base_path = '/Users/thomas/DevSandbox/EpicDataShelf/tag-lab/sense_alloc'
    pos = parser.get('evaluation_params', 'pos')
    out_dir = os.path.join(base_path, dictionaryDataPath.split('.')[0])
    with open(os.path.join(out_dir, '{}_lt_5.txt'.format(pos)), 'w', encoding='utf-8') as lt_5_file, \
         open(os.path.join(out_dir, '{}_lt_10.txt'.format(pos)), 'w', encoding='utf-8') as lt_10_file, \
         open(os.path.join(out_dir, '{}_lt_20.txt'.format(pos)), 'w', encoding='utf-8') as lt_20_file, \
         open(os.path.join(out_dir, '{}_all.txt'.format(pos)), 'w', encoding='utf-8') as all_file:
        for key, vals in evaluationData.items():
            print('### ITEM: {} ###'.format(key))
            # print('\t{}\n'.format(json.dumps(vals, indent=4)))
            for val in vals:
                ex_lt_5 = []
                ex_lt_10 = []
                ex_lt_20 = []
                ex_all = []
                for ex in val['examples']:
                    # Bucket examples by approximate length, measured as the
                    # number of spaces in the sentence.
                    if ex['sent'].count(' ') < 5:
                        num_examples_lt_5 += 1
                        ex_lt_5.append((ex['sent'], val['def']))
                    if ex['sent'].count(' ') < 10:
                        num_examples_lt_10 += 1
                        ex_lt_10.append((ex['sent'], val['def']))
                    if ex['sent'].count(' ') < 20:
                        # print('{}: {}'.format(ex['sent'].count(' '), ex['sent']))
                        num_examples_lt_20 += 1
                        ex_lt_20.append((ex['sent'], val['def']))
                    num_examples_all += 1
                    ex_all.append((ex['sent'], val['def']))

                for l, f in [(ex_lt_5, lt_5_file), (ex_lt_10, lt_10_file),
                             (ex_lt_20, lt_20_file), (ex_all, all_file)]:
                    if len(l) > 2:
                        for ex, syn in l:
                            f.write('{}\t{}\t{}\n'.format(key, ex, syn))

            print('---------------------------------------------------')
    print('<=5: {}; <=10: {}; <=20: {}'.format(
        num_examples_lt_5, num_examples_lt_10, num_examples_lt_20))
    """