Example #1
def main():
	print("Script to compare annotations")
	tag_isf = read_data(TAG_ISF)
	tag_human = read_data(TAG_HUMAN)
	tag_match = []
	tag_missing = []
	tag_missing_by_lemma = []
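	# For each human annotation, look for an ISF annotation that matches on every
	# field except the last two (synset ID and lemma) and has an equivalent synset ID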
	for tagh in tag_human:
		found = None
		for tagi in tag_isf:
			if tagi[:-2] == tagh[:-2] and same_synsetid(tagi[-2], tagh[-2]):
				tag_match.append(tagh)
				found = tagi
				break
		if found is None:
			tag_missing.append(tagh)
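	# Tally the missing annotations by lemma (the last field); Counter here is the
	# project's tally helper with count()/summarise(), not collections.Counter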
	c = Counter()
	for missing in tag_missing:
		c.count(missing[-1])
	c.summarise()
	for k in c.count_map:
		tag_missing_by_lemma.append((k, c[k]))
	tag_missing_by_lemma.sort(key=itemgetter(1))
	print("Match: %s" % (len(tag_match),))
	writelines([ '\t'.join(x) for x in tag_match ], TAG_MATCH)
	writelines([ '\t'.join(x) for x in tag_missing ], TAG_MISSING)
	writelines([ '\t'.join([str(c) for c in x]) for x in reversed(tag_missing_by_lemma) ], TAG_MISSING_BY_LEMMA)
	print("Done!")
	pass
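
The snippet above shows only main(); the module-level names it relies on (read_data, writelines, same_synsetid, the TAG_* paths and a custom Counter) are defined elsewhere in the project. The sketch below is a minimal, assumed reconstruction of those helpers based purely on how they are called above; the file paths and the synset-ID comparison rule are placeholders, not the project's actual definitions.

from collections import defaultdict
from operator import itemgetter  # needed for tag_missing_by_lemma.sort(key=itemgetter(1))

# Placeholder file paths (assumptions)
TAG_ISF = 'data/tag_isf.txt'
TAG_HUMAN = 'data/tag_human.txt'
TAG_MATCH = 'data/tag_match.txt'
TAG_MISSING = 'data/tag_missing.txt'
TAG_MISSING_BY_LEMMA = 'data/tag_missing_by_lemma.txt'

def read_data(path):
	# Read a tab-separated annotation file into a list of field tuples
	with open(path, 'r') as infile:
		return [tuple(line.rstrip('\n').split('\t')) for line in infile if line.strip()]

def writelines(lines, path):
	# Write one string per line
	with open(path, 'w') as outfile:
		outfile.write('\n'.join(lines))

def same_synsetid(sid1, sid2):
	# Placeholder equivalence check: ignore surrounding whitespace and leading zeros
	return sid1.strip().lstrip('0') == sid2.strip().lstrip('0')

class Counter:
	# Simple tally helper mimicking the count()/summarise()/count_map interface used above
	def __init__(self):
		self.count_map = defaultdict(int)
	def count(self, key):
		self.count_map[key] += 1
	def __getitem__(self, key):
		return self.count_map[key]
	def summarise(self):
		for key, value in sorted(self.count_map.items()):
			print('%s: %s' % (key, value))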
Example #2
def main():
	print("Script to convert Viet SentiWordnet to Open Multilingual Wordnet format")

	c = Counter()

	# Fix VSW format
	with open(VSW_DATA, 'r') as vsw_input:
		with open(VSW_FIXED, 'w') as vsw_fixed:
			for line in vsw_input.readlines():
				if line.startswith('#'):
					vsw_fixed.write(line)
					if line.startswith('# Web: https://sourceforge.net/projects/vietsentiwordne/'):
						vsw_fixed.write('#\n# Some bugs fixed by Le Tuan Anh <*****@*****.**>\n')
						vsw_fixed.write('# Latest version is available at: https://github.com/letuananh/omwtk\n#\n')
				else:
					c.count('processed')
					sense = Sense(*line.split('\t'))
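					# A well-formed gloss has the shape: definition; "example(s)".
					# When the ';' separator is missing, try to re-insert it just
					# before the quoted example below.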
					if sense.Gloss.find(';') < 0:
						c.count("error")
						#print(sense.Gloss.strip())
						fixedline = line
						if line.find(', "') > 0:
							fixedline = line.replace(', "', '; "', 1)
						elif line.find('"') < 0:
							#print(line)
							c.count("No example")
						elif line.find(',"') > 0:
							fixedline = line.replace(',"', '; "', 1)
						elif line.find('như: "') > 0:
							fixedline = line.replace('như: "', '; "', 1)
						vsw_fixed.write(fixedline)
					else:
						c.count("ok")
						vsw_fixed.write(line)

	c.summarise()
	#exit()
	
	# Read file
	with open(VSW_FIXED, 'r') as vsw_input:
		lines = [ x for x in vsw_input.readlines() if not x.startswith('#') ]
	senses = [ Sense(*line.split('\t')) for line in lines ]
	
	# Write file
	with open(OMW_DATA, 'w') as omw_output:
		omw_output.write('# Prepared by Le Tuan Anh <*****@*****.**>\n')
		omw_output.write('# Based on Viet SentiWordnet 1.0\n')
		omw_output.write('# Latest version is available at: https://github.com/letuananh/omwtk\n')
		
		all_examples = []
		all_definitions = []
		for sense in senses:
			# 001937986-a    vie:lemma    giỏ
			# 001937986-a    vie:def    có trình độ cao, đáng được khâm phục, khen ngợi
			# 001937986-a    vie:exe    giáo viên dạy giỏi
			synset_id = '%s-%s' % (sense.SenseID, sense.POS)
			lemma = sense.SynsetTerms.split('#')[0]
			# Some lemmas are wrong
			#if len(sense.SynsetTerms.split('#')) > 2:
			#	print(sense.SynsetTerms)
			definition = ''
			example = ''
			if sense.Gloss.find(';') > 0:
				definition = sense.Gloss[:sense.Gloss.find(';')].strip()
				example = sense.Gloss[sense.Gloss.find(';')+1:].strip()
			omw_output.write('%s\tvie:lemma\t%s\n' % (synset_id, lemma))
			if definition:
				omw_output.write('%s\tvie:def\t%s\n' % (synset_id, definition))
				all_definitions.append(definition)
			if example:
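				# The example part may contain several quoted examples; split on the quotes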
				examples = [ x.strip() for x in example.split('"') if len(x.strip()) > 1 ]
				all_examples += examples
				#print(examples)
				for i, val in enumerate(examples):
					omw_output.write('%s\tvie:exe\t%s\t%s\n' % (synset_id, i, val))
		all_examples.sort(key=len)
		for example in all_examples:
			print(example)
		writelines(all_examples, 'data/examples.txt')
		writelines(sorted(all_definitions,key=len), 'data/defs.txt')
	pass
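
Like the first example, this one shows only main(). The Sense record and the VSW_DATA/VSW_FIXED/OMW_DATA paths are defined elsewhere in the project, and Counter and writelines can be read as the same helpers sketched under Example #1. The snippet below is an assumed reconstruction of the remaining pieces: the field names come from the attribute accesses above, while the column order and the score columns are guesses based on the usual SentiWordNet-style layout, not the project's actual definition.

from collections import namedtuple

# Placeholder file paths (assumptions)
VSW_DATA = 'data/VietSentiWordnet.txt'
VSW_FIXED = 'data/VietSentiWordnet_fixed.txt'
OMW_DATA = 'data/wn-data-vie.tab'

# Assumed column layout of the tab-separated VSW file
Sense = namedtuple('Sense', ['POS', 'SenseID', 'PosScore', 'NegScore', 'SynsetTerms', 'Gloss'])

# With this layout, Sense(*line.split('\t')) maps each column to a named field,
# so sense.Gloss is the last column (definition plus quoted examples) and
# '%s-%s' % (sense.SenseID, sense.POS) yields synset IDs such as 001937986-a.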