Example #1
	def __buildDics__(self, filename):
		misc = Miscelaneous()
		txtfile = misc.openFile(filename, 'r')

		record_phrase = False
		for line in txtfile:
			line = re.sub('\n', '', line)
			if '(ROOT' in line:
				record_phrase = True
			elif line == '':
				record_phrase = False
			elif record_phrase:
				elements = line.split('(')
				for values in elements:
					if ')' in values:
						term = values.split(')')[0]
						print term


		"""

				self.dic_t[id_t] = {'word':word, 'lemma':lemma, 'pos':pos, 'morph':morph, 'sem':sem, 'extra':extra, 'headof':''}

				self.dic_nt[id_nt] = {'cat':cat, 'edge':array_edges}
			
					self.dic_t[idref]['headof'] = id_nt
					self.dic_nt[id_nt]['head'] = idref

				self.dic_nt[id_nt]['edge'] = array_edges
		"""
		txtfile.close()
	def writeDicAN(self, filename):
		misc = Miscelaneous()
		output_an = misc.openFile(filename+'.txt', 'w')
		if self.mountANRelations:
			self.__extractRelations__('AN')
			self.mountANRelations = False
		for id_an in self.dic_an:
			output_an.write(id_an+'#'+str(self.dic_an[id_an])+'\n')
		output_an.close()
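
The `__buildDics__` method in this example pulls terminals out of a bracketed Stanford parse dump: everything between a line containing `(ROOT` and the next blank line is scanned, and each extracted term is the text between an opening `(` and the first `)`, i.e. a "POS token" pair. A minimal standalone sketch of that split logic, using an invented parse line:

line = '(NP (DT the) (JJ syntactic) (NN context))'  # invented sample line

terms = []
for chunk in line.split('('):
    if ')' in chunk:
        # keep the text up to the first closing bracket: "POS token"
        terms.append(chunk.split(')')[0])

print(terms)  # ['DT the', 'JJ syntactic', 'NN context']
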
Example #3
    def __init__(self, fileinput):
        self.list_seeds = []
        misc = Miscelaneous()
        file_seeds = misc.openFile(fileinput, 'r')

        for line in file_seeds:
            if line != '':
                line = line.replace('\n', '')
                self.list_seeds.append(line.lower())
        file_seeds.close()
Example #4
    def __init__(self, fileinput):
        self.list_seeds = []
        misc = Miscelaneous()
        file_seeds = misc.openFile(fileinput, "r")

        for line in file_seeds:
            if line != "":
                line = line.replace("\n", "")
                self.list_seeds.append(line.lower())
        file_seeds.close()
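
Both `Seeds` constructors above (Examples #3 and #4) expect a plain text file with one seed term per line; each line is stripped of its newline and lowercased before being stored. A small self-contained sketch of that loading step, writing a throwaway seeds file first (file name and contents are invented; blank lines are skipped before the append here):

seeds_path = 'seeds_demo.txt'  # throwaway file, created only for this sketch
with open(seeds_path, 'w') as f:
    f.write('Protein\nGene expression\n\ncell\n')

list_seeds = []
with open(seeds_path, 'r') as f:
    for line in f:
        line = line.replace('\n', '')
        if line != '':
            list_seeds.append(line.lower())

print(list_seeds)  # ['protein', 'gene expression', 'cell']
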
Example #5
	def __init__(self, temp_folder, file_input, seedfile, mi_precision):
		self.window_size = file_input[1:-23]
		self.temp_folder = temp_folder
		self.misc = Miscelaneous()
		seeds_file = Seeds(seedfile)
		self.list_seeds = seeds_file.getSeeds()
		self.first_line = ''
		self.dic_tuplas = defaultdict(dict)
		self.dic_terms = OrderedDict()
		self.__buildMI__(file_input, mi_precision)
Example #6
    def __init__(self, temp_folder):
        self.temp_folder = temp_folder
        self.misc = Miscelaneous()
        self.dic_an = {}
        self.dic_sv = {}
        self.dic_vo = {}
        self.matrix_relations = ['AN', 'SV', 'VO']

        for type_relation in self.matrix_relations:
            self.__loadTerms__(type_relation)
        self.__writeDic__()
Example #7
	def __init__(self, seedfile, temp_folder, sim_measure):
		self.misc = Miscelaneous()
		seeds_file = Seeds(seedfile)
		self.temp_folder = temp_folder
		self.dic_nouns = {}
		self.dic_seeds = defaultdict(dict)
		#self.dic_seeds_freqObj = {}
		#self.dic_seeds_Obj = {}
		self.list_seeds = seeds_file.getSeeds()
		self.dic_measure = defaultdict(dict)
		self.dic_Obj2 = defaultdict(dict)
		self.dic_freqObj = {}
		self.dic_Obj = {}

		self.__buildHashs__(sim_measure)
	def __init__(self, input_folder, temp_folder, min_word_size, window_size):
		try:
			self.root, self.dirs, self.files = os.walk(input_folder).next()[:3]
		except IOError:
			print bcolors.FAIL+'ERROR: It was not possible to open the '+input_folder+' folder'+bcolors.ENDC
			sys.exit(2)

		self.min_word_size = int(min_word_size)
		self.window_size = int(window_size)
		self.temp_folder = temp_folder
		self.qty_documents = len(self.files)
		self.misc = Miscelaneous()

		if os.path.exists(self.temp_folder+'Statistical_corpus.txt'):
			os.system('rm '+self.temp_folder+'Statistical_corpus.txt')
		self.temp_file = self.misc.openFile(self.temp_folder+'Statistical_corpus.txt', 'a')
Example #9
    def __init__(self, input_file, temp_folder, svd_dimension):
        self.misc = Miscelaneous()
        self.temp_folder = temp_folder
        self.svd_dimension = svd_dimension
        self.dic_column = OrderedDict()
        self.dic_column_index = {}
        self.dic_row = OrderedDict()
        self.dic_row_index = {}
        self.array_row = []
        self.array_col = []
        self.array_data = []
        self.dic_matrix = {}

        string_files_matrix = ''

        self.buildMatrixFromFile(input_file)
        self.applySvd()
        self.writeSvd()
Example #10
	def __buildDics__(self, filename):
		misc = Miscelaneous()
		xmlfile = misc.openFile(filename, 'r')

		for line in xmlfile:
			if '<t ' in line:
				id_t = (line.split('id=\"')[1]).split('\"')[0]
				word = (line.split('word=\"')[1]).split('\"')[0].lower()
				lemma = ((line.split('lemma=\"')[1]).split('\"')[0]).lower()
				morph = (line.split('morph=\"')[1]).split('\"')[0]
				sem = (line.split('sem=\"')[1]).split('\"')[0]
				extra = (line.split('extra=\"')[1]).split('\"')[0]
				
				if re.search('%|&amp;', lemma):
					pos = '--' 
				else:
					pos = (line.split('pos=\"')[1]).split('\"')[0]

				self.dic_t[id_t] = {'word':word, 'lemma':lemma, 'pos':pos, 'morph':morph, 'sem':sem, 'extra':extra, 'headof':''}
				
			elif '<nt ' in line:
				id_nt = (line.split('id=\"')[1]).split('\"')[0]
				id_nt_number = id_nt.split('_')[1]
				cat = (line.split('cat=\"')[1]).split('\"')[0]
				array_edges = []
				self.dic_nt[id_nt] = {'cat':cat, 'edge':array_edges}
			
			elif '<edge ' in line:
				
				idref = (line.split('idref=\"')[1]).split('\"')[0]
				idref_number = idref.split('_')[1]
				label = (line.split('label=\"')[1]).split('\"')[0]
				if int(idref_number) < 500 or int(idref_number) > int(id_nt_number):
					array_edges.append([idref, label])
				
				if label == 'H':
					self.dic_t[idref]['headof'] = id_nt
					self.dic_nt[id_nt]['head'] = idref

			elif '</nt>' in line:
				self.dic_nt[id_nt]['edge'] = array_edges
		
		xmlfile.close()
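
This `__buildDics__` reads what looks like a TIGER-style treebank XML export (`<t ...>`, `<nt ...>`, `<edge ...>`) with plain string splitting instead of an XML parser: each attribute value is recovered as the text between `attr="` and the following quote. A minimal sketch of that extraction pattern on an invented `<t ...>` line:

# Invented sample terminal line; real ones come from the treebank XML export.
line = '<t id="s1_3" word="Contexts" lemma="context" pos="NN" morph="--" sem="--" extra="--"/>'

def attr(line, name):
    # value between name=" and the next double quote
    return (line.split(name + '="')[1]).split('"')[0]

print(attr(line, 'id'))             # s1_3
print(attr(line, 'lemma').lower())  # context
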
Example #11
	def __init__(self, ctx_freq_file, seedfile):
		self.misc = Miscelaneous()
		seeds_file = Seeds(seedfile)
		self.list_seeds = seeds_file.getSeeds()
		self.dic_baseline = defaultdict(dict)
		self.dic_diceBin = defaultdict(dict)
		self.dic_diceMin = defaultdict(dict)
		self.dic_jaccard = defaultdict(dict)
		self.dic_cosineBin = defaultdict(dict)
		self.dic_cosine = defaultdict(dict)
		self.dic_city = defaultdict(dict)
		self.dic_euclidean = defaultdict(dict)
		self.dic_js = defaultdict(dict)
		self.dic_lin = defaultdict(dict)
		self.dic_jaccardMax = defaultdict(dict)
		self.dic_ctx = defaultdict(dict)
		self.dic_sum_freq_noun = {}
		self.dic_qty_noun = {}
		self.__buildHashs__(ctx_freq_file, seedfile)
Example #12
	def __init__(self, temp_folder):
		self.temp_folder = temp_folder
		self.misc = Miscelaneous()
		self.dic_an = {}
		self.dic_sv = {}
		self.dic_vo = {}
		self.matrix_relations = ['AN', 'SV', 'VO']

		for type_relation in self.matrix_relations:
			self.__loadTerms__(type_relation)
		self.__writeDic__()
	def __init__(self, input_folder, temp_folder, stoplist_file, min_word_size, record_intermediate):	
		try:
			self.root, self.dirs, self.files = os.walk(input_folder).next()[:3]
		except IOError:
			print bcolors.FAIL+'ERROR: It was not possible to open the '+input_folder+' folder'+bcolors.ENDC
			sys.exit(2)

		self.min_word_size = int(min_word_size)
		self.temp_folder = temp_folder
		self.qty_documents = len(self.files)
		self.misc = Miscelaneous()
		self.stoplist = self.misc.getStoplist(stoplist_file)
		
		self.matrix_relations = ['AN', 'SV', 'VO']
		self.dic_an = {}
		self.dic_sv = {}
		self.dic_vo = {}

		command = 'rm -Rf '+self.temp_folder+'; mkdir '+self.temp_folder+' '
		if record_intermediate:
			command += self.temp_folder+'AN/'+' '+self.temp_folder+'AN/2Order/ '+self.temp_folder+'AN/3Order/ '
			command += self.temp_folder+'SV/'+' '+self.temp_folder+'SV/2Order/ '+self.temp_folder+'SV/3Order/ '
			command += self.temp_folder+'VO/'+' '+self.temp_folder+'VO/2Order/ '+self.temp_folder+'VO/3Order/ '
		os.system(command)

		i = 0
		for corpus_file in self.files:
			self.dic_t = {}
			self.dic_nt = {}
			self.dic_an_doc = {}
			self.dic_sv_doc = {}
			self.dic_vo_doc = {}
			#print corpus_file
			i += 1
			if re.match('.*xml$', corpus_file):
				corpus_filename = corpus_file.split('.')[0]
				xml = ParseStanfordXml(self.root+''+corpus_file)
				self.dic_t = xml.getDicTerms()
				self.dic_nt = xml.getDicNonTerminals()
				self.__extractRelations__()
				if record_intermediate:
					self.__writeDicRelations__(corpus_filename)

			self.misc.progress_bar(i, self.qty_documents, 100)

		self.__writeDic__()
Example #14
    def __init__(self, input_file, temp_folder, svd_dimension):
        self.misc = Miscelaneous()
        self.temp_folder = temp_folder
        self.svd_dimension = svd_dimension
        self.dic_column = OrderedDict()
        self.dic_column_index = {}
        self.dic_row = OrderedDict()
        self.dic_row_index = {}
        self.array_row = []
        self.array_col = []
        self.array_data = []
        self.dic_matrix = {}

        string_files_matrix = ""

        self.buildMatrixFromFile(input_file)
        self.applySvd()
        self.writeSvd()
Example #15
	def __init__(self, temp_folder, svd_dimension, record_intermediate):
		self.misc = Miscelaneous()
		self.temp_folder = temp_folder
		self.svd_dimension = svd_dimension
		self.dic_noun = OrderedDict()
		self.dic_noun_index = {}
		self.dic_modifier = OrderedDict()
		self.dic_modifier_index = {}
		self.row = []
		self.col = []
		self.data = []
		self.dic_matrix = {}
		#self.line_data = ''
		list_relations = ['AN', 'SV', 'VO']

		string_files_matrix = ''
		for relation in list_relations:
			self.type_relation = relation
			#self.buildMatrixFromFile()
			#self.applySvd()
			#if record_intermediate:
			#	logfile.writeLogfile('- Recording SVD matrix to '+relation+' in a file...')
			#	self.writeSvd()
			#self.buildRelationsSvd()

			string_files_matrix += self.temp_folder+''+relation+'/Matrix_row.txt '+self.temp_folder+''+relation+'/Matrix_column.txt '
			file_matrix = self.misc.openFile(self.temp_folder+''+relation+'/Matrix_row.txt', 'r')
			for line in file_matrix:
				self.__loadDicMatrix__(line, relation)
			file_matrix.close()

		file_doc_matrix = self.misc.openFile(self.temp_folder+'/Matrix_nouns.txt', 'w')
		number_document = 0
		for noun in self.dic_matrix:
			file_doc_matrix.write(str(number_document)+' : '+noun+'\n')
			command = 'cat'+self.dic_matrix[noun]+' > '+self.temp_folder+'Matrix/'+str(number_document)+'.txt'
			os.system(command)
			if not record_intermediate:
				command = 'rm -Rf'+self.dic_matrix[noun]+' '+string_files_matrix
				os.system(command)
			number_document += 1
		file_doc_matrix.close()
Example #16
class Contexts:

	def __init__(self, temp_folder):
		self.temp_folder = temp_folder
		self.misc = Miscelaneous()
		self.dic_an = {}
		self.dic_sv = {}
		self.dic_vo = {}
		self.matrix_relations = ['AN', 'SV', 'VO']

		for type_relation in self.matrix_relations:
			self.__loadTerms__(type_relation)
		self.__writeDic__()

	def __del__(self):
		pass

	def __loadTerms__(self, type_relation):
		try:
			root, dirs, files = os.walk(self.temp_folder+''+type_relation+'/2Order/').next()[:3]
		except IOError:
			print bcolors.FAIL+'ERROR: It was not possible to open the '+self.temp_folder+' folder'+bcolors.ENDC
			sys.exit(2)

		qty_documents = len(files)

		i = 0
		for corpus_file in files:
			i += 1
			if re.match('.*txt$', corpus_file):
				relation_file = self.misc.openFile(root+''+corpus_file, 'r')
				for line in relation_file:
					line = re.sub('\n', '', line)
					relation, noun, frequency = line.split('#')
					if type_relation == 'AN':
						self.__addElementDicAN__(relation+'#'+noun, frequency)
					elif type_relation == 'SV':
						self.__addElementDicSV__(relation+'#'+noun, frequency)
					elif type_relation == 'VO':
						self.__addElementDicVO__(relation+'#'+noun, frequency)
			self.misc.progress_bar(i, qty_documents, 100)

	def __addElementDicAN__(self, relation, frequency):
		if self.dic_an.has_key(relation):
			self.dic_an[relation] += int(frequency)
		else:
			self.dic_an[relation] = int(frequency)

	def __addElementDicSV__(self, relation, frequency):
		if self.dic_sv.has_key(relation):
			self.dic_sv[relation] += int(frequency)
		else:
			self.dic_sv[relation] = int(frequency)

	def __addElementDicVO__(self, relation, frequency):
		if self.dic_vo.has_key(relation):
			self.dic_vo[relation] += int(frequency)
		else:
			self.dic_vo[relation] = int(frequency)

	def __writeDic__(self):
		for type_relation in self.matrix_relations:
			file_relation = self.misc.openFile(self.temp_folder+''+type_relation+'/Relations.txt', 'w')
			dic_relation = self.getDic(type_relation)
			for id_relation in dic_relation:
				file_relation.write(id_relation+'#'+str(dic_relation[id_relation])+'\n')
			file_relation.close()

	""" Get and Print methods """

	def getDic(self, type_relation):
		if type_relation == 'AN': return self.dic_an
		elif type_relation == 'SV': return self.dic_sv
		elif type_relation == 'VO': return self.dic_vo

	def printDic (self, type_relation):
		dic_relation = self.getDic(type_relation)
		for id_relation in dic_relation:
			print id_relation+' = '+str(dic_relation[id_relation])
class StanfordSyntacticContexts:

	def __init__(self, input_folder, temp_folder, stoplist_file, min_word_size, record_intermediate):	
		try:
			self.root, self.dirs, self.files = os.walk(input_folder).next()[:3]
		except IOError:
			print bcolors.FAIL+'ERROR: It was not possible to open the '+input_folder+' folder'+bcolors.ENDC
			sys.exit(2)

		self.min_word_size = int(min_word_size)
		self.temp_folder = temp_folder
		self.qty_documents = len(self.files)
		self.misc = Miscelaneous()
		self.stoplist = self.misc.getStoplist(stoplist_file)
		
		self.matrix_relations = ['AN', 'SV', 'VO']
		self.dic_an = {}
		self.dic_sv = {}
		self.dic_vo = {}

		command = 'rm -Rf '+self.temp_folder+'; mkdir '+self.temp_folder+' '
		if record_intermediate:
			command += self.temp_folder+'AN/'+' '+self.temp_folder+'AN/2Order/ '+self.temp_folder+'AN/3Order/ '
			command += self.temp_folder+'SV/'+' '+self.temp_folder+'SV/2Order/ '+self.temp_folder+'SV/3Order/ '
			command += self.temp_folder+'VO/'+' '+self.temp_folder+'VO/2Order/ '+self.temp_folder+'VO/3Order/ '
		os.system(command)

		i = 0
		for corpus_file in self.files:
			self.dic_t = {}
			self.dic_nt = {}
			self.dic_an_doc = {}
			self.dic_sv_doc = {}
			self.dic_vo_doc = {}
			#print corpus_file
			i += 1
			if re.match('.*xml$', corpus_file):
				corpus_filename = corpus_file.split('.')[0]
				xml = ParseStanfordXml(self.root+''+corpus_file)
				self.dic_t = xml.getDicTerms()
				self.dic_nt = xml.getDicNonTerminals()
				self.__extractRelations__()
				if record_intermediate:
					self.__writeDicRelations__(corpus_filename)

			self.misc.progress_bar(i, self.qty_documents, 100)

		self.__writeDic__()

	def __del__(self):
		pass

	def __extractRelations__(self):
		i = 0
		#print self.dic_t
		for id_nt in self.dic_nt:
			if self.dic_nt[id_nt]['cat'] == 'nn':
				noun = self.dic_t[self.dic_nt[id_nt]['gov']]['lemma'].lower()
				#noun = re.sub('-', '_', noun)
				context = self.dic_t[self.dic_nt[id_nt]['dep']]['lemma'].lower()
				context = re.sub('-', '_', context)
				if len(noun) >= self.min_word_size and len(context) >= self.min_word_size:
					self.__addElementDicAN__(context+'#'+noun)
					self.__addElementDicAN__(noun+'#'+context)
					self.__addElementDicDocAN__(context+'#'+noun)
					self.__addElementDicDocAN__(noun+'#'+context)

			elif self.dic_nt[id_nt]['cat'] == 'amod':
				noun = self.dic_t[self.dic_nt[id_nt]['gov']]['lemma'].lower()
				#noun = re.sub('-', '_', noun)
				context = self.dic_t[self.dic_nt[id_nt]['dep']]['lemma'].lower()
				context = re.sub('-', '_', context)
				if len(noun) >= self.min_word_size and len(context) >= self.min_word_size and context not in self.stoplist:
					self.__addElementDicAN__(context+'#'+noun)
					self.__addElementDicDocAN__(context+'#'+noun)
				
			if re.match('prep_', self.dic_nt[id_nt]['cat']) \
				and re.match('^NN', self.dic_t[self.dic_nt[id_nt]['dep']]['pos'])  \
				and re.match('^NN', self.dic_t[self.dic_nt[id_nt]['gov']]['pos']):
				noun = self.dic_t[self.dic_nt[id_nt]['gov']]['lemma'].lower()
				#noun = re.sub('-', '_', noun)
				context = self.dic_t[self.dic_nt[id_nt]['dep']]['lemma'].lower()
				context = re.sub('-', '_', context)
				prep = self.dic_nt[id_nt]['cat'].split('_')[1]
				if len(noun) >= self.min_word_size and len(context) >= self.min_word_size:
					self.__addElementDicAN__(prep+'_'+context+'#'+noun)
					self.__addElementDicAN__(prep+'_'+noun+'#'+context)
					self.__addElementDicDocAN__(prep+'_'+context+'#'+noun)
					self.__addElementDicDocAN__(prep+'_'+noun+'#'+context)

			elif re.match('^(nsubjpass|nsubj|xsubj|agent)$', self.dic_nt[id_nt]['cat']): #gov = verb
				if re.match('V', self.dic_t[self.dic_nt[id_nt]['gov']]['pos']):
					verb = self.dic_t[self.dic_nt[id_nt]['gov']]['lemma'].lower()
					#verb = re.sub('-', '_', verb)
					contexts = self.dic_t[self.dic_nt[id_nt]['dep']]['nps']
					for context in contexts:
						if len(verb) >= self.min_word_size and len(context) >= self.min_word_size:
							self.__addElementDicSV__('sub_'+verb+'#'+context)
							self.__addElementDicDocSV__('sub_'+verb+'#'+context)
			
			elif re.match('^(dobj|iobj)$', self.dic_nt[id_nt]['cat']):
				if re.match('V', self.dic_t[self.dic_nt[id_nt]['gov']]['pos']):
					verb = self.dic_t[self.dic_nt[id_nt]['gov']]['lemma'].lower()
					#verb = re.sub('-', '_', verb)
					contexts = self.dic_t[self.dic_nt[id_nt]['dep']]['nps']
					for context in contexts:
						if len(verb) >= self.min_word_size and len(context) >= self.min_word_size:
							self.__addElementDicVO__('obj_'+verb+'#'+context)
							self.__addElementDicDocVO__('obj_'+verb+'#'+context)

	def __addElementDicAN__(self, relation):
		if self.dic_an.has_key(relation):
			self.dic_an[relation] += 1
		else:
			self.dic_an[relation] = 1

	def __addElementDicDocAN__(self, relation):
		if self.dic_an_doc.has_key(relation):
			self.dic_an_doc[relation] += 1
		else:
			self.dic_an_doc[relation] = 1

	def __addElementDicSV__(self, relation):
		if self.dic_sv.has_key(relation):
			self.dic_sv[relation] += 1
		else:
			self.dic_sv[relation] = 1

	def __addElementDicDocSV__(self, relation):
		if self.dic_sv_doc.has_key(relation):
			self.dic_sv_doc[relation] += 1
		else:
			self.dic_sv_doc[relation] = 1

	def __addElementDicVO__(self, relation):
		if self.dic_vo.has_key(relation):
			self.dic_vo[relation] += 1
		else:
			self.dic_vo[relation] = 1

	def __addElementDicDocVO__(self, relation):
		if self.dic_vo_doc.has_key(relation):
			self.dic_vo_doc[relation] += 1
		else:
			self.dic_vo_doc[relation] = 1

	def __writeDicRelations__(self, corpus_filename):
		file_relation_an = self.misc.openFile(self.temp_folder+'AN/2Order/AN_'+corpus_filename+'.txt', 'w')
		for id_relation in self.dic_an_doc:
			file_relation_an.write(id_relation+'#'+str(self.dic_an_doc[id_relation])+'\n')
		file_relation_an.close()

		file_relation_sv = self.misc.openFile(self.temp_folder+'SV/2Order/SV_'+corpus_filename+'.txt', 'w')
		for id_relation in self.dic_sv_doc:
			file_relation_sv.write(id_relation+'#'+str(self.dic_sv_doc[id_relation])+'\n')
		file_relation_sv.close()

		file_relation_vo = self.misc.openFile(self.temp_folder+'VO/2Order/VO_'+corpus_filename+'.txt', 'w')
		for id_relation in self.dic_vo_doc:
			file_relation_vo.write(id_relation+'#'+str(self.dic_vo_doc[id_relation])+'\n')
		file_relation_vo.close()

	def __writeDic__(self):
		for type_relation in self.matrix_relations:
			file_relation = self.misc.openFile(self.temp_folder+''+type_relation+'/2Order/Relations.txt', 'w')
			dic_relation = self.getDic(type_relation)
			for id_relation in dic_relation:
				file_relation.write(id_relation+'#'+str(dic_relation[id_relation])+'\n')
			file_relation.close()

	""" Get and Print methods """

	def getDic(self, type_relation):
		if type_relation == 'AN': return self.dic_an
		elif type_relation == 'SV': return self.dic_sv
		elif type_relation == 'VO': return self.dic_vo

	def printDic (self, type_relation):
		dic_relation = self.getDic(type_relation)
		for id_relation in dic_relation:
			print id_relation+' = '+str(dic_relation[id_relation])
class StatisticalCorpus:
	def __init__(self, input_folder, temp_folder, min_word_size, window_size):
		try:
			self.root, self.dirs, self.files = os.walk(input_folder).next()[:3]
		except IOError:
			print bcolors.FAIL+'ERROR: It was not possible to open the '+input_folder+' folder'+bcolors.ENDC
			sys.exit(2)

		self.min_word_size = int(min_word_size)
		self.window_size = int(window_size)
		self.temp_folder = temp_folder
		self.qty_documents = len(self.files)
		self.misc = Miscelaneous()

		if os.path.exists(self.temp_folder+'Statistical_corpus.txt'):
			os.system('rm '+self.temp_folder+'Statistical_corpus.txt')
		self.temp_file = self.misc.openFile(self.temp_folder+'Statistical_corpus.txt', 'a')

	def __del__(self):
		pass

	def buildCorpus_pt(self):
		i = 0
		for corpus_file in self.files:
			i += 1
			if re.match('.*xml$', corpus_file):
				corpus_filename = corpus_file.split('.')[0]
				xmlfile = ParsePalavrasXml(self.root+''+corpus_file)
				dic_terms = xmlfile.getDicTerms()
				dic_nouns = xmlfile.getNouns()
				#dic_verbs = xmlfile.getVerbs()

				id_sentence = 1
				id_word = 1
				id_t = 's'+str(id_sentence)+'_'+str(id_word)

				string_corpus = ''
				while dic_terms.has_key(id_t):
					while dic_terms.has_key(id_t):
						lemma = re.sub('(--|/|,|;|\(|\)|\$|\+|\')', '', dic_terms[id_t]['lemma'])
						lemma = re.sub('-', '_', lemma)
						lemma = re.sub('_$', '', lemma)

						if not re.match('^(pu|num|conj|art|prp|spec)', dic_terms[id_t]['pos']) and (len(lemma) >= self.min_word_size):
							if dic_nouns.has_key(id_t):
								string_corpus += lemma+'__N '
							#elif dic_verbs.has_key(id_t):
							#	string_corpus += lemma+'__V '
							else:
								string_corpus += lemma+'__O '
						id_word += 1
						id_t = 's'+str(id_sentence)+'_'+str(id_word)
					id_word = 1
					id_sentence += 1
					id_t = 's'+str(id_sentence)+'_'+str(id_word)
					#print string_corpus
				#print string_corpus
				self.temp_file.write(string_corpus)
				self.misc.progress_bar(i, self.qty_documents, 100)

		self.temp_file.close()

	def buildCorpus_en(self):
		i = 0
		for corpus_file in self.files:
			i += 1
			if re.match('.*xml$', corpus_file):
				corpus_filename = corpus_file.split('.')[0]
				xmlfile = ParseStanfordXml(self.root+''+corpus_file)
				dic_terms = xmlfile.getDicTerms()
				self.__getRelationsInAWindow__(dic_terms, self.window_size)
				self.misc.progress_bar(i, self.qty_documents, 100)
		self.temp_file.close()
				
	""" GET RELATIONS IN A WINDOW """
	def __getRelationsInAWindow__(self, dic_terms, window_size):
		i = 0
		id_sentence = 1
		id_word = 1
		id_t = 's'+str(id_sentence)+'_'+str(id_word)
		string_corpus = ''
		while dic_terms.has_key(id_t):
			while dic_terms.has_key(id_t):
				lemma = re.sub('(--|/|,|;|\(|\)|\$|\+|\'|[.])', '', dic_terms[id_t]['lemma']).lower()
				lemma = re.sub('-', '_', lemma)
				lemma = re.sub('_$', '', lemma)
				
				if len(lemma) >= self.min_word_size:
					if re.match('^NN', dic_terms[id_t]['pos']):
						string_corpus += lemma+'__N '
					elif re.match('^(AMOD|JJ|VB|MD|RB|RP)', dic_terms[id_t]['pos']):
						string_corpus += lemma+'__O '
				id_word += 1
				id_t = 's'+str(id_sentence)+'_'+str(id_word)
			id_word = 1
			id_sentence += 1
			id_t = 's'+str(id_sentence)+'_'+str(id_word)
		self.temp_file.write(string_corpus)

	def buildSTRelations(self, file_input, seeds_file):
		seeds = Seeds(seeds_file)
		list_seeds = seeds.getSeeds()
		dic_tuplas = {}
		file_bigrams = self.misc.openFile(self.temp_folder+''+file_input, 'r')
		first_line = ''

		for line in file_bigrams:
			if first_line != '':
				part = line.split('<>')
				term_type1 = part[0]
				term_type2 = part[1]
				term1, type1 = term_type1.split('__')
				term2, type2 = term_type2.split('__')

				freq_tupla = part[2].split(' ')[0]
				freq_term1 = part[2].split(' ')[1]
				freq_term2 = part[2].split(' ')[2]
				
				if type1 == 'N' and term1 != term2:
					if dic_tuplas.has_key(term2+'#'+term1+'#'):
						dic_tuplas[term2+'#'+term1+'#'] += int(freq_tupla)
					else:
						dic_tuplas[term2+'#'+term1+'#'] = int(freq_tupla)
				if type2 == 'N' and term1 != term2:
					if dic_tuplas.has_key(term1+'#'+term2+'#'):
						dic_tuplas[term1+'#'+term2+'#'] += int(freq_tupla)
					else:
						dic_tuplas[term1+'#'+term2+'#'] = int(freq_tupla)
			else:
				first_line = line
		file_bigrams.close()

		file_relations = self.misc.openFile(self.temp_folder+'W'+str(self.window_size)+'_Relations.txt', 'w')
		for tupla in dic_tuplas:
			file_relations.write(tupla+''+str(dic_tuplas[tupla])+'\n')
		file_relations.close()
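
The bigram file read by `buildSTRelations` above appears to follow the Ngram Statistics Package `count.pl` layout: the first line holds a total count (skipped here via `first_line`), and every other line is `term1__TYPE<>term2__TYPE<>pair_freq term1_freq term2_freq`, where the `__N`/`__O` type suffixes were attached by the corpus builder earlier in this class. A small sketch of parsing one such line (the counts are invented):

line = 'protein__N<>expression__O<>12 57 34'  # invented count.pl-style line

part = line.split('<>')
term1, type1 = part[0].split('__')
term2, type2 = part[1].split('__')
freq_tupla, freq_term1, freq_term2 = part[2].split(' ')[:3]

print([term1, type1, term2, type2, int(freq_tupla), int(freq_term1), int(freq_term2)])
# ['protein', 'N', 'expression', 'O', 12, 57, 34]
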
Example #19
	def __buildDics__(self, filename):
		misc = Miscelaneous()
		xmlfile = misc.openFile(filename, 'r')
		self.stoplist = misc.getStoplist('../misc/stoplist.txt')

		record_dependencies = False
		record_collapsed = False
		for line in xmlfile:
			line = re.sub('\n', '', line)
			if '<sentence ' in line:
				id_s = (line.split('id=\"')[1]).split('\"')[0]	
				array_rec_dep = []
			elif '<token ' in line:
				id_t = 's'+id_s+'_'+(line.split('id=\"')[1]).split('\"')[0]
			elif '<word>' in line:
				word = (line.split('<word>')[1]).split('</word>')[0]
			elif '<lemma>' in line:
				lemma = (line.split('<lemma>')[1]).split('</lemma>')[0]
			elif '<POS>' in line:
				pos = (line.split('<POS>')[1]).split('</POS>')[0]
			elif '<NER>' in line:
				ner = (line.split('<NER>')[1]).split('</NER>')[0]
			elif '</token>' in line:
				self.dic_t[id_t] = {'word':word, 'lemma':lemma, 'pos':pos, 'ner':ner, 'nps':[]}
				if re.match('NN|NNP', pos):
					array_nps = [lemma.lower()]
					self.dic_t[id_t]['nps'] = array_nps

			elif '<basic-dependencies>' in line:
				record_dependencies = True
				index_nt = 500
			elif '</basic-dependencies>' in line:
				record_dependencies = False
			elif '<collapsed-ccprocessed-dependencies>' in line:
				record_collapsed = True
			elif '</collapsed-ccprocessed-dependencies>' in line:
				record_collapsed = False

			if record_dependencies or record_collapsed:
				if '<dep type=' in line:
					cat = (line.split('type=\"')[1]).split('\"')[0]
				elif '<governor ' in line:
					idx_gov = (line.split('idx=\"')[1]).split('\"')[0]
					id_t_gov = 's'+id_s+'_'+idx_gov
				elif '<dependent ' in line:
					idx_dep = (line.split('idx=\"')[1]).split('\"')[0]
					id_t_dep = 's'+id_s+'_'+idx_dep
				elif (record_dependencies or record_collapsed) and '</dep>' in line:
					if cat+'#'+id_t_gov+'#'+id_t_dep not in array_rec_dep:
						array_rec_dep.append(cat+'#'+id_t_gov+'#'+id_t_dep)	
						self.dic_nt['s'+id_s+'_'+str(index_nt)] = {'cat':cat, 'gov':id_t_gov, 'dep':id_t_dep}
						index_nt += 1
		xmlfile.close()

		for id_nt in self.dic_nt:
			if re.match("(nn|amod)", self.dic_nt[id_nt]['cat']):
				array_nps = self.dic_t[self.dic_nt[id_nt]['gov']]['nps']
				string = ''
				id_gov = self.dic_nt[id_nt]['gov'].split('_')[1]
				id_dep = self.dic_nt[id_nt]['dep'].split('_')[1]
				id_s = self.dic_nt[id_nt]['dep'].split('_')[0]
				for i in range(int(id_dep), int(id_gov)):
					id_next = id_s+'_'+str(i)
					if re.match("(NN|JJ)", self.dic_t[id_next]['pos']) and self.dic_t[id_next]['lemma'] not in self.stoplist:
						string += self.dic_t[id_next]['lemma']+'_'
				string += self.dic_t[id_s+'_'+id_gov]['lemma']
				if len(string.split('_')) > 1 and string.lower() not in array_nps:
					array_nps.append(string.lower())
				self.dic_t[self.dic_nt[id_nt]['gov']]['nps'] = array_nps

			elif re.match("prep_of", self.dic_nt[id_nt]['cat']):
				id_gov = self.dic_nt[id_nt]['gov']
				id_dep = self.dic_nt[id_nt]['dep']
				
				if re.match("NN", self.dic_t[id_dep]['pos']) and re.match("NN", self.dic_t[id_gov]['pos']):
					array_nps = self.dic_t[id_dep]['nps']
					string = self.dic_t[id_gov]['lemma']+'_of_'+self.dic_t[id_dep]['lemma']
					array_nps.append(string.lower())
					self.dic_t[self.dic_nt[id_nt]['dep']]['nps'] = array_nps

					array_nps = self.dic_t[id_gov]['nps']
					string = self.dic_t[id_gov]['lemma']+'_of_'+self.dic_t[id_dep]['lemma']
					array_nps.append(string.lower())
					self.dic_t[self.dic_nt[id_nt]['gov']]['nps'] = array_nps				
Example #20
class Similarities:
	def __init__(self, seedfile, temp_folder, sim_measure):
		self.misc = Miscelaneous()
		seeds_file = Seeds(seedfile)
		self.temp_folder = temp_folder
		self.dic_nouns = {}
		self.dic_seeds = defaultdict(dict)
		#self.dic_seeds_freqObj = {}
		#self.dic_seeds_Obj = {}
		self.list_seeds = seeds_file.getSeeds()
		self.dic_measure = defaultdict(dict)
		self.dic_Obj2 = defaultdict(dict)
		self.dic_freqObj = {}
		self.dic_Obj = {}

		self.__buildHashs__(sim_measure)

	def __del__(self):
		pass

	def __buildHashs__(self, sim_measure):
		file_nouns = self.misc.openFile(self.temp_folder+'Matrix_nouns.txt', 'r')
		for line in file_nouns:
			line = re.sub('\n', '', line)
			doc, noun = line.split(' : ')
			self.dic_nouns[doc] = noun
			if noun in self.list_seeds:
				file_doc_seed = self.misc.openFile(self.temp_folder+'Matrix/'+doc+'.txt', 'r')
				self.dic_freqObj[doc] = 0
				self.dic_Obj[doc] = 0
				for line in file_doc_seed:
					line = re.sub('\n', '', line)
					modifier, noun, freq = line.split('#')
					self.dic_seeds[doc][modifier] = float(freq)
					#self.dic_seeds_freqObj[doc] += float(freq)
					#self.dic_seeds_Obj[doc] += 1 
				file_doc_seed.close()
		
		for doc_noun in self.dic_nouns:
			file_doc_nouns = self.misc.openFile(self.temp_folder+'Matrix/'+doc_noun+'.txt', 'r')
			for line in file_doc_nouns:
				line = re.sub('\n', '', line)
				modifier, noun, freq = line.split('#')
				self.dic_Obj2[doc_noun][modifier] = float(freq)
				#self.dic_freqObj[doc] += float(freq)
				#self.dic_Obj[doc] += 1
			file_doc_nouns.close()
		# Limit the array to n values so it does not consume too much memory, OR print the list to a file
			for doc_seed in self.dic_seeds:
				if doc_noun != doc_seed:
					if sim_measure == 'jaccardMax':
						self.dic_measure[doc_seed][doc_noun] = self.getJaccardMaxMeasure(doc_seed, doc_noun)
					elif sim_measure == 'cosine':
						self.dic_measure[doc_seed][doc_noun] = self.getCosineMeasure(doc_seed, doc_noun)
			del self.dic_Obj2[doc_noun]
		# Delete the Obj2 hash that was created


	def getTopNOrderedDic(self, n):
		dic_measure_ordered = self.__sortTopNFromAllDic__(self.dic_measure, n)
		return dic_measure_ordered

	def getJaccardMaxMeasure(self, doc_seed, doc_noun):
		minimum = 0
		maximum = 0
		for attr in self.dic_seeds[doc_seed]:
			if self.dic_Obj2[doc_noun].has_key(attr):
				assoc1 = self.dic_seeds[doc_seed][attr]
				assoc2 = self.dic_Obj2[doc_noun][attr]
				minimum += min(assoc1, assoc2)
				maximum += max(assoc1, assoc2)
			elif self.dic_seeds[doc_seed].has_key(attr):
				maximum += self.dic_seeds[doc_seed][attr]

		for attr2 in self.dic_Obj2[doc_noun]:
			if not self.dic_seeds[doc_seed].has_key(attr2):
				maximum += self.dic_Obj2[doc_noun][attr2]

		if maximum > 0:
			return minimum/maximum
		else:
			return -1

	def getCosineMeasure(self, doc_seed, doc_noun):
		intersection = 0
		o1 = 0
		o2 = 0
		for attr in self.dic_seeds[doc_seed]:
			if self.dic_Obj2[doc_noun].has_key(attr):
				assoc1 = self.dic_seeds[doc_seed][attr]
				assoc2 = self.dic_Obj2[doc_noun][attr]
				intersection += assoc1 * assoc2
				o1 += assoc1**2
				o2 += assoc2**2
			elif self.dic_seeds[doc_seed].has_key(attr):
				o1 += self.dic_seeds[doc_seed][attr]**2

		for attr2 in self.dic_Obj2[doc_noun]:
			if not self.dic_seeds[doc_seed].has_key(attr2):
				o2 += self.dic_Obj2[doc_noun][attr2]**2

		if o1 > 0 and o2 > 0:
			return intersection/math.sqrt(float(o1 * o2))
		else:
			return -1

	def __sortTopNFromAllDic__(self, dic, n):
		dic_terms = OrderedDict()
		self.dic_seeds = sorted(self.dic_seeds)
		for doc in self.dic_seeds:
			if self.__existKeyInDic__(doc, dic):
				seed = self.dic_nouns[doc]
				dic_terms[seed] = {'terms': []}
				dic_related = {}
				for related_term in dic[doc]:
					dic_related[related_term] = dic[doc][related_term]
				if n == 0: n = None
				dic_ordered = sorted(dic_related.items(), key=itemgetter(1), reverse=True)[0:n]
				for list_ordered in dic_ordered:
					dic_terms[seed]['terms'].append({self.dic_nouns[list_ordered[0]]:str(list_ordered[1])})
		return dic_terms

	def __existKeyInDic__(self, key, dic):
		if dic.has_key(key):
			return dic
		else:
			print bcolors.WARNING+'WARNING: System cannot find the term "'+key+'" in the corpus'+bcolors.ENDC
			print ''
			return False

	def __printDic__(self, dic_terms):
		for seed in dic_terms:
			print 'Seed: '+seed
			for index_related_term in dic_terms[seed]['terms']:
					similarity = index_related_term[index_related_term.keys()[0]]
					term = index_related_term.keys()[0]
					print 'Related term: '+term+'\nSimilarity  : '+similarity
			print ''
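
`getCosineMeasure` and `getJaccardMaxMeasure` above both treat a noun as a sparse vector of modifier weights stored in a dict. A self-contained sketch of the same two measures over plain dicts (the two vectors are invented):

import math

seed  = {'red': 3.0, 'fast': 1.0, 'old': 2.0}  # invented modifier weights
other = {'red': 2.0, 'old': 4.0, 'new': 1.0}

def cosine(a, b):
    # dot product over shared attributes, normalised by both full norms
    dot = sum(a[k] * b[k] for k in a if k in b)
    na = math.sqrt(sum(v * v for v in a.values()))
    nb = math.sqrt(sum(v * v for v in b.values()))
    return dot / (na * nb) if na > 0 and nb > 0 else -1

def jaccard_max(a, b):
    # sum of minima over the union divided by sum of maxima over the union
    keys = set(a) | set(b)
    minimum = sum(min(a.get(k, 0.0), b.get(k, 0.0)) for k in keys)
    maximum = sum(max(a.get(k, 0.0), b.get(k, 0.0)) for k in keys)
    return minimum / maximum if maximum > 0 else -1

print(cosine(seed, other))       # ~0.816
print(jaccard_max(seed, other))  # 4.0 / 9.0 = ~0.444
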
Example #21
class Contexts:
    def __init__(self, temp_folder):
        self.temp_folder = temp_folder
        self.misc = Miscelaneous()
        self.dic_an = {}
        self.dic_sv = {}
        self.dic_vo = {}
        self.matrix_relations = ['AN', 'SV', 'VO']

        for type_relation in self.matrix_relations:
            self.__loadTerms__(type_relation)
        self.__writeDic__()

    def __del__(self):
        pass

    def __loadTerms__(self, type_relation):
        try:
            root, dirs, files = os.walk(self.temp_folder + '' + type_relation +
                                        '/2Order/').next()[:3]
        except IOError:
            print bcolors.FAIL + 'ERROR: It was not possible to open the ' + self.temp_folder + ' folder' + bcolors.ENDC
            sys.exit(2)

        qty_documents = len(files)

        i = 0
        for corpus_file in files:
            i += 1
            if re.match('.*txt$', corpus_file):
                relation_file = self.misc.openFile(root + '' + corpus_file,
                                                   'r')
                for line in relation_file:
                    line = re.sub('\n', '', line)
                    relation, noun, frequency = line.split('#')
                    if type_relation == 'AN':
                        self.__addElementDicAN__(relation + '#' + noun,
                                                 frequency)
                    elif type_relation == 'SV':
                        self.__addElementDicSV__(relation + '#' + noun,
                                                 frequency)
                    elif type_relation == 'VO':
                        self.__addElementDicVO__(relation + '#' + noun,
                                                 frequency)
            self.misc.progress_bar(i, qty_documents, 100)

    def __addElementDicAN__(self, relation, frequency):
        if self.dic_an.has_key(relation):
            self.dic_an[relation] += int(frequency)
        else:
            self.dic_an[relation] = int(frequency)

    def __addElementDicSV__(self, relation, frequency):
        if self.dic_sv.has_key(relation):
            self.dic_sv[relation] += int(frequency)
        else:
            self.dic_sv[relation] = int(frequency)

    def __addElementDicVO__(self, relation, frequency):
        if self.dic_vo.has_key(relation):
            self.dic_vo[relation] += int(frequency)
        else:
            self.dic_vo[relation] = int(frequency)

    def __writeDic__(self):
        for type_relation in self.matrix_relations:
            file_relation = self.misc.openFile(
                self.temp_folder + '' + type_relation + '/Relations.txt', 'w')
            dic_relation = self.getDic(type_relation)
            for id_relation in dic_relation:
                file_relation.write(id_relation + '#' +
                                    str(dic_relation[id_relation]) + '\n')
            file_relation.close()

    """ Get and Print methods """

    def getDic(self, type_relation):
        if type_relation == 'AN': return self.dic_an
        elif type_relation == 'SV': return self.dic_sv
        elif type_relation == 'VO': return self.dic_vo

    def printDic(self, type_relation):
        dic_relation = self.getDic(type_relation)
        for id_relation in dic_relation:
            print id_relation + ' = ' + str(dic_relation[id_relation])
Example #22
    def __buildDics__(self, filename):
        misc = Miscelaneous()
        xmlfile = misc.openFile(filename, "r")
        self.stoplist = misc.getStoplist("../misc/stoplist.txt")

        record_dependencies = False
        record_collapsed = False
        for line in xmlfile:
            line = re.sub("\n", "", line)
            if "<sentence " in line:
                id_s = (line.split('id="')[1]).split('"')[0]
                array_rec_dep = []
            elif "<token " in line:
                id_t = "s" + id_s + "_" + (line.split('id="')[1]).split('"')[0]
            elif "<word>" in line:
                word = (line.split("<word>")[1]).split("</word>")[0]
            elif "<lemma>" in line:
                lemma = (line.split("<lemma>")[1]).split("</lemma>")[0]
            elif "<POS>" in line:
                pos = (line.split("<POS>")[1]).split("</POS>")[0]
            elif "<NER>" in line:
                ner = (line.split("<NER>")[1]).split("</NER>")[0]
            elif "</token>" in line:
                self.dic_t[id_t] = {"word": word, "lemma": lemma, "pos": pos, "ner": ner, "nps": []}
                if re.match("NN|NNP", pos):
                    array_nps = [lemma.lower()]
                    self.dic_t[id_t]["nps"] = array_nps

            elif "<basic-dependencies>" in line:
                record_dependencies = True
                index_nt = 500
            elif "</basic-dependencies>" in line:
                record_dependencies = False
            elif "<collapsed-ccprocessed-dependencies>" in line:
                record_collapsed = True
            elif "</collapsed-ccprocessed-dependencies>" in line:
                record_collapsed = False

            if record_dependencies or record_collapsed:
                if "<dep type=" in line:
                    cat = (line.split('type="')[1]).split('"')[0]
                elif "<governor " in line:
                    idx_gov = (line.split('idx="')[1]).split('"')[0]
                    id_t_gov = "s" + id_s + "_" + idx_gov
                elif "<dependent " in line:
                    idx_dep = (line.split('idx="')[1]).split('"')[0]
                    id_t_dep = "s" + id_s + "_" + idx_dep
                elif (record_dependencies or record_collapsed) and "</dep>" in line:
                    if cat + "#" + id_t_gov + "#" + id_t_dep not in array_rec_dep:
                        array_rec_dep.append(cat + "#" + id_t_gov + "#" + id_t_dep)
                        self.dic_nt["s" + id_s + "_" + str(index_nt)] = {"cat": cat, "gov": id_t_gov, "dep": id_t_dep}
                        index_nt += 1
        xmlfile.close()

        for id_nt in self.dic_nt:
            if re.match("(nn|amod)", self.dic_nt[id_nt]["cat"]):
                array_nps = self.dic_t[self.dic_nt[id_nt]["gov"]]["nps"]
                string = ""
                id_gov = self.dic_nt[id_nt]["gov"].split("_")[1]
                id_dep = self.dic_nt[id_nt]["dep"].split("_")[1]
                id_s = self.dic_nt[id_nt]["dep"].split("_")[0]
                for i in range(int(id_dep), int(id_gov)):
                    id_next = id_s + "_" + str(i)
                    if (
                        re.match("(NN|JJ)", self.dic_t[id_next]["pos"])
                        and self.dic_t[id_next]["lemma"] not in self.stoplist
                    ):
                        string += self.dic_t[id_next]["lemma"] + "_"
                string += self.dic_t[id_s + "_" + id_gov]["lemma"]
                if len(string.split("_")) > 1 and string.lower() not in array_nps:
                    array_nps.append(string.lower())
                self.dic_t[self.dic_nt[id_nt]["gov"]]["nps"] = array_nps

            elif re.match("prep_of", self.dic_nt[id_nt]["cat"]):
                id_gov = self.dic_nt[id_nt]["gov"]
                id_dep = self.dic_nt[id_nt]["dep"]

                if re.match("NN", self.dic_t[id_dep]["pos"]) and re.match("NN", self.dic_t[id_gov]["pos"]):
                    array_nps = self.dic_t[id_dep]["nps"]
                    string = self.dic_t[id_gov]["lemma"] + "_of_" + self.dic_t[id_dep]["lemma"]
                    array_nps.append(string.lower())
                    self.dic_t[self.dic_nt[id_nt]["dep"]]["nps"] = array_nps

                    array_nps = self.dic_t[id_gov]["nps"]
                    string = self.dic_t[id_gov]["lemma"] + "_of_" + self.dic_t[id_dep]["lemma"]
                    array_nps.append(string.lower())
                    self.dic_t[self.dic_nt[id_nt]["gov"]]["nps"] = array_nps
Example #23
    def __init__(self, output_file, max_qty_terms):
        self.output_file = output_file
        self.max_qty_terms = max_qty_terms
        misc = Miscelaneous()
        self.thesaurus_file = misc.openFile(output_file, 'w')
Example #24
class MutualInformation:
	def __init__(self, temp_folder, file_input, seedfile, mi_precision):
		self.window_size = file_input[1:-23]
		self.temp_folder = temp_folder
		self.misc = Miscelaneous()
		seeds_file = Seeds(seedfile)
		self.list_seeds = seeds_file.getSeeds()
		self.first_line = ''
		self.dic_tuplas = defaultdict(dict)
		self.dic_terms = OrderedDict()
		self.__buildMI__(file_input, mi_precision)

	def __del__(self):
		pass

	def __buildMI__(self, file_input, mi_precision):
		filename_input = file_input[:-4]
		file_bigrams = self.misc.openFile(self.temp_folder+''+file_input, 'r')
		for line in file_bigrams:
			if self.first_line != '':
				part = line.split('<>')
				term_type1 = part[0]
				term_type2 = part[1]
				term1, type1 = term_type1.split('__')
				term2, type2 = term_type2.split('__')

				freq_tupla = part[2].split(' ')[0]
				freq_term1 = part[2].split(' ')[1]
				freq_term2 = part[2].split(' ')[2]

				if type1 == 'N' and type2 == 'N' and term1 != term2:
					if term1 in self.list_seeds:				
						self.dic_tuplas[term1+'<>'+term2+'<>']['freq_tupla'] = int(freq_tupla)
						self.dic_tuplas[term1+'<>'+term2+'<>']['freq_term1'] = int(freq_term1)
						self.dic_tuplas[term1+'<>'+term2+'<>']['freq_term2'] = int(freq_term2)
					elif term2 in self.list_seeds:
						if self.dic_tuplas.has_key(term2+'<>'+term1+'<>'):
							self.dic_tuplas[term2+'<>'+term1+'<>']['freq_tupla'] += int(freq_tupla)
						else:
							self.dic_tuplas[term2+'<>'+term1+'<>']['freq_tupla'] = int(freq_tupla)
							self.dic_tuplas[term2+'<>'+term1+'<>']['freq_term1'] = int(freq_term2)
							self.dic_tuplas[term2+'<>'+term1+'<>']['freq_term2'] = int(freq_term1)

			else:
				self.first_line = line
		file_bigrams.close()

		file_relations = self.misc.openFile(self.temp_folder+''+filename_input+'_to_MI.txt', 'w')
		file_relations.write(self.first_line)
		for tupla in self.dic_tuplas:
			file_relations.write(tupla+''+str(self.dic_tuplas[tupla]['freq_tupla'])+' '+str(self.dic_tuplas[tupla]['freq_term1'])+' '+str(self.dic_tuplas[tupla]['freq_term2'])+'\n')
		file_relations.close()

		command = "statistic.pl tmi.pm -precision "+mi_precision+' '+self.temp_folder+'IM'+self.window_size+'_SecondOrder.txt '+self.temp_folder+''+filename_input+'_to_MI.txt'
		os.system(command)

	def getDicMI(self):
		file_mi = self.misc.openFile(self.temp_folder+'IM'+self.window_size+'_SecondOrder.txt', 'r')

		first_line = False
		list_used_seeds = []
		for line in file_mi:
			if first_line:
				seed, none, term, none, rank, true_mi, freq_1, freq_2, freq_3 = re.split(r'[ |<>]', line)
				if seed in self.list_seeds and seed not in list_used_seeds:
					list_used_seeds.append(seed)
					self.dic_terms[seed] = {'terms': []}
				if seed in self.list_seeds: 
					self.dic_terms[seed]['terms'].append({term:true_mi})	
			else:
				first_line = True
		return self.dic_terms

	def getDicBigrams(self):
		return self.dic_tuplas

	def printDicBigrams(self):
		print self.first_line,
		for tupla in self.dic_tuplas:
			print tupla,self.dic_tuplas[tupla]['freq_tupla'],self.dic_tuplas[tupla]['freq_term1'],self.dic_tuplas[tupla]['freq_term2']
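
`__buildMI__` above hands the actual scoring to the Ngram Statistics Package (`statistic.pl` with `tmi.pm`). Purely as an illustration of the kind of association score being ranked, here is a plain pointwise mutual information computed from the same three counts plus a total bigram count; this is not the formula `tmi.pm` implements, just a sketch with invented numbers:

import math

def pmi(freq_tupla, freq_term1, freq_term2, n_bigrams):
    # log2( P(pair) / (P(term1) * P(term2)) ), probabilities estimated
    # from raw counts over n_bigrams
    p_pair = float(freq_tupla) / n_bigrams
    p1 = float(freq_term1) / n_bigrams
    p2 = float(freq_term2) / n_bigrams
    return math.log(p_pair / (p1 * p2), 2)

# Invented counts: pair seen 12 times, terms 57 and 34 times, 10000 bigrams.
print(round(pmi(12, 57, 34, 10000), 2))  # ~5.95
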
Example #25
class Matrix:
	def __init__(self, temp_folder, svd_dimension, record_intermediate):
		self.misc = Miscelaneous()
		self.temp_folder = temp_folder
		self.svd_dimension = svd_dimension
		self.dic_noun = OrderedDict()
		self.dic_noun_index = {}
		self.dic_modifier = OrderedDict()
		self.dic_modifier_index = {}
		self.row = []
		self.col = []
		self.data = []
		self.dic_matrix = {}
		#self.line_data = ''
		list_relations = ['AN', 'SV', 'VO']

		string_files_matrix = ''
		for relation in list_relations:
			self.type_relation = relation
			#self.buildMatrixFromFile()
			#self.applySvd()
			#if record_intermediate:
			#	logfile.writeLogfile('- Recording SVD matrix to '+relation+' in a file...')
			#	self.writeSvd()
			#self.buildRelationsSvd()

			string_files_matrix += self.temp_folder+''+relation+'/Matrix_row.txt '+self.temp_folder+''+relation+'/Matrix_column.txt '
			file_matrix = self.misc.openFile(self.temp_folder+''+relation+'/Matrix_row.txt', 'r')
			for line in file_matrix:
				self.__loadDicMatrix__(line, relation)
			file_matrix.close()

		file_doc_matrix = self.misc.openFile(self.temp_folder+'/Matrix_nouns.txt', 'w')
		number_document = 0
		for noun in self.dic_matrix:
			file_doc_matrix.write(str(number_document)+' : '+noun+'\n')
			command = 'cat'+self.dic_matrix[noun]+' > '+self.temp_folder+'Matrix/'+str(number_document)+'.txt'
			os.system(command)
			if not record_intermediate:
				command = 'rm -Rf'+self.dic_matrix[noun]+' '+string_files_matrix
				os.system(command)
			number_document += 1
		file_doc_matrix.close()

	def __del__(self):
		pass

	def __loadDicMatrix__(self, line, relation):
		line = re.sub('\n', '', line)
		row, noun = line.split(' : ')
		if self.dic_matrix.has_key(noun):
			self.dic_matrix[noun] = self.dic_matrix[noun]+' '+self.temp_folder+''+relation+'/3Order/'+row+'.txt'
		else:
			self.dic_matrix[noun] = ' '+self.temp_folder+''+relation+'/3Order/'+row+'.txt'

	def buildMatrixFromFile(self):
		index_modifier = 0
		index_noun = 0
		line_row = ''
		line_column = ''

		file_relations = self.misc.openFile(self.temp_folder+''+self.type_relation+'/Relations.txt', 'r')
		file_row = self.misc.openFile(self.temp_folder+''+self.type_relation+'/Matrix_row.txt', 'w')
		file_column = self.misc.openFile(self.temp_folder+''+self.type_relation+'/Matrix_column.txt', 'w')
		#file_data = self.misc.openFile(self.temp_folder+''+self.type_relation+'/Matrix_data.txt', 'w')

		for line in file_relations:
			line = re.sub('\n', '', line)
			modifier, noun, frequency = line.split('#')
		
			if self.dic_modifier.has_key(modifier):
				index_m = self.dic_modifier[modifier]
			else:
				self.dic_modifier[modifier] = index_modifier
				self.dic_modifier_index[index_modifier] = modifier
				index_m = index_modifier
				index_modifier = index_modifier + 1

			if self.dic_noun.has_key(noun):
				index_n = self.dic_noun[noun]
			else:
				self.dic_noun[noun] = index_noun
				self.dic_noun_index[index_noun] = noun
				index_n = index_noun
				index_noun = index_noun + 1

			self.row.append(int(index_n))
			self.col.append(int(index_m))
			self.data.append(math.log(float(frequency)+1, e))

			line_row += str(index_n)+' '
			line_column += str(index_m)+' '
			#self.line_data += str(frequency)+' '
		
		file_relations.close()

		for modifier in self.dic_modifier:
			file_column.write(str(self.dic_modifier[modifier])+' : '+modifier+'\n')
		for noun in self.dic_noun:
			file_row.write(str(self.dic_noun[noun])+' : '+noun+'\n')

		#file_data.write('<row>\n')
		#file_data.write(line_row[0:-1]+'\n')
		#file_data.write('<column>\n')
		#file_data.write(line_column[0:-1]+'\n')
		#file_data.write('<data>\n')
		#file_data.write(self.line_data[0:-1]+'\n')

		file_row.close()
		file_column.close()
		#file_data.close()

	def applySvd(self):
		len_row = max(self.row)+1
		len_col = max(self.col)+1
		print 'Applying SVD with ROW: '+str(len_row)+' and COL: '+str(len_col)
		sparse_matrix = scipy.sparse.csc_matrix( (self.data,(self.row,self.col)), shape=(len_row,len_col) )
		print 'sparsed matrix'
		Ut, Sigma, Vt = sparsesvd(sparse_matrix, self.svd_dimension)
		print 'Ut Sigma Vt done!'
		sparse_matrix = array(0)
		print 'Mounting Matrix SVD'
		self.svd_matrix = numpy.dot(Ut.T, numpy.dot(numpy.diag(Sigma), Vt))
		print 'Done!'
		print Ut
		print '\n'
		print Sigma
		print '\n'
		print Vt
		Ut = None
		Sigma = None
		Vt = None
		#Ut = array(0)
		#Sigma = array(0)
		#Vt = array(0)

	def buildRelationsSvd(self):
		index_noun = 0
		for row_data in self.svd_matrix:
			index_modifier = 0
			file_relations_svd = self.misc.openFile(self.temp_folder+''+self.type_relation+'/3Order/'+str(index_noun)+'.txt', 'w')
			for value in row_data:
				file_relations_svd.write(self.dic_modifier_index[index_modifier]+'#'+self.dic_noun_index[index_noun]+'#'+str(value)+'\n')
				index_modifier += 1
			index_noun += 1
			file_relations_svd.close()
		self.svd_matrix = array(0)

	def writeSvd(self):
		file_matrix_svd = self.misc.openFile(self.temp_folder+''+self.type_relation+'/MatrixDataSvd.txt', 'w')
		for row_data in self.svd_matrix:
			for value in row_data:
				file_matrix_svd.write(str(value)+' ')
			file_matrix_svd.write('\n');
		file_matrix_svd.close()
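
`applySvd` above fills a sparse log-frequency matrix and reconstructs it from a truncated SVD via `sparsesvd`. A self-contained sketch of the same rank-k reconstruction, with `scipy.sparse.linalg.svds` standing in for `sparsesvd` and a tiny invented noun-by-modifier matrix (note that `svds` returns `u` already transposed relative to `sparsesvd`'s `Ut`):

import numpy
import scipy.sparse
from scipy.sparse.linalg import svds

# Tiny invented noun-by-modifier matrix, with log(freq + 1) weighting as in
# buildMatrixFromFile above.
row = [0, 0, 1, 2, 2, 3]
col = [0, 2, 1, 0, 3, 2]
data = [numpy.log(f + 1.0) for f in [4, 1, 3, 2, 5, 2]]
sparse_matrix = scipy.sparse.csc_matrix((data, (row, col)), shape=(4, 4))

k = 2  # retained dimensions (svd_dimension in the class above)
u, sigma, vt = svds(sparse_matrix, k=k)
svd_matrix = numpy.dot(u, numpy.dot(numpy.diag(sigma), vt))

print(svd_matrix.shape)  # (4, 4): same shape, rank-k approximation
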
Example #26
	def __init__(self, output_file, max_qty_terms):
		self.output_file = output_file
		self.max_qty_terms = max_qty_terms
		misc = Miscelaneous()
		self.thesaurus_file = misc.openFile(output_file, 'w')
Example #27
class Parameters:

	def __init__(self, type_atc, argv):
		self.input_folder = '../Data/Corpus/'
		self.output_folder = '../Data/Output/'
		self.temp_folder = '../Data/Temp/'
		self.seeds_file = '../misc/seeds.txt'
		self.stoplist_file = '../misc/stoplist.txt'
		self.misc = Miscelaneous()
		file_parameters = self.misc.openFile('../misc/parameters.cfg', 'r')

		for line in file_parameters:
			if re.match('contexts', line):
				contexts = line.split('=')[1].replace('\n','')
				if contexts == 'On': self.contexts = True
				else: self.contexts = False
			if re.match('language', line):
				self.language = line.split('=')[1].replace('\n','')
			if re.match('max_qty_terms', line):
				self.max_qty_terms = line.split('=')[1].replace('\n','')
			if re.match('mi_precision', line):
				self.mi_precision = line.split('=')[1].replace('\n','')
			if re.match('min_word_size', line):
				self.min_word_size = line.split('=')[1].replace('\n','')
			if re.match('sim_measure', line):
				self.sim_measure = line.split('=')[1].replace('\n','')
			if re.match('svd_dimension', line):
				self.svd_dimension = line.split('=')[1].replace('\n','')
			if re.match('window_size', line):
				self.window_size = line.split('=')[1].replace('\n','')
			if re.match('record_log', line):
				record_log = line.split('=')[1].replace('\n','')
				if record_log == 'On': self.record_log = True
				else: self.record_log = False
			if re.match('record_intermediate', line):
				record_intermediate = line.split('=')[1].replace('\n','')
				if record_intermediate == 'On': self.record_intermediate = True
				else: self.record_intermediate = False
		file_parameters.close()

		try:
			opts, args = getopt.getopt(argv,\
				"h:c:i:o:m:M:p:w:d:t:l:L:r:R:s:S:", \
				["help", "contexts=", "input=", "output=", "min_size=", "max_terms=", "mi_precision=", "window_size=", "svd_dimension=", "temp=", "language=", "record_log=", "record_intermediate=", "seeds=", "sim_measure=", "stoplist="])
		except getopt.GetoptError:
			self.usage(type_atc)
			sys.exit(2)
		for opt, arg in opts:
			if opt in ("-h", "--help"):
				self.help()
				sys.exit(0)
			elif opt in ("-c", "--contexts"):
				if arg == 'On': self.contexts = True
				elif arg == 'Off': self.contexts = False
			elif opt in ("-i", "--input"):
				if os.path.isdir(arg): self.input_folder = arg 
				else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a folder, setting '+self.input_folder+' as input folder'+bcolors.ENDC
			elif opt in ("-o", "--output"):
				if os.path.isdir(arg):  self.output_folder = arg
				else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a folder, setting '+self.output_folder+' as output folder'+bcolors.ENDC
			elif opt in ("-t", "--temp"): 
				if os.path.isdir(arg): self.temp_folder = arg 
				else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a folder, setting '+self.temp_folder+' as temporary folder'+bcolors.ENDC 
			elif opt in ("-m", "--min_size"):
				self.min_word_size = arg
			elif opt in ("-M", "--max_terms"):
				self.max_qty_terms = arg
			elif opt in ("-l", "--language"):
				if arg == 'en' or arg == 'pt': self.language = arg
				else: print bcolors.WARNING+'WARNING: "'+str(arg)+'" is not a supported language, setting to "'+self.language+'" as language'+bcolors.ENDC 
			elif opt in ("-r", "--record_log"):
				if arg == 'On': self.record_log = True
				elif arg == 'Off': self.record_log = False
			elif opt in ("-R", "--record_intermediate"):
				if arg == 'On': self.record_intermediate = True
				elif arg == 'Off': self.record_intermediate = False
				else: print bcolors.WARNING+'WARNING: "'+str(arg)+'" is not a supported option to intermediate file recording, setting to "'+record_intermediate+'" as default option'+bcolors.ENDC
			elif opt in ("-s", "--seeds"):
				if os.path.isfile(arg): self.seeds_file = arg 
				else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a file, setting '+self.seeds_file+' as seeds file'+bcolors.ENDC
			elif opt in ("-S", "--sim_measure"):
				if arg == 'mutual_information' \
					or arg == 'baseline' \
					or arg == 'dicebin' \
					or arg == 'dicemin' \
					or arg == 'jaccard' \
					or arg == 'cosinebin' \
					or arg == 'cosine' \
					or arg == 'city' \
					or arg == 'euclidean' \
					or arg == 'js' \
					or arg == 'lin' \
					or arg == 'jaccardmax': 
					self.sim_measure = arg
				else: 
					print bcolors.WARNING+'WARNING: "'+str(arg)+'" is not a supported similarity measure, setting to "'+self.sim_measure+'" as default similarity measure.\nSimilarity measures supported by the system:\n - mutual_information [used only in First Order construction]\n - baseline\n - dicebin\n - dicemin\n - jaccard\n - cosinebin\n - cosine\n - city\n - euclidean\n - js\n - lin\n - jaccardmax'+bcolors.ENDC
			elif opt in ("-L", "--stoplist"):
				if os.path.isfile(arg): self.stoplist_file = arg 
				else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a file, setting '+self.stoplist_file+' as stoplist file'+bcolors.ENDC

			if type_atc == 'FirstOrder':
				if opt in ("-p", "--mi_precision"):
					self.mi_precision = arg
				elif opt in ("-w", "--window_size"):
					self.window_size = arg

			elif type_atc == 'HigherOrder':
				if opt in ("-d", "--svd_dimension"):
					self.svd_dimension = arg

	def __del__(self):
		pass

	def getContexts(self):
		return self.contexts

	def getInputFolder(self):
		return self.input_folder

	def getLanguage(self):
		return self.language

	def getMinWordSize(self):
		return self.min_word_size

	def getMaxQtyTerms(self):
		return self.max_qty_terms

	def getMIPrecision(self):
		return self.mi_precision

	def getOutputFolder(self):
		return self.output_folder

	def getRecordLog(self):
		return self.record_log

	def getRecordIntermediate(self):
		return self.record_intermediate

	def getSeedsFile(self):
		return self.seeds_file

	def getSimilarityMeasure(self):
		return self.sim_measure

	def getStoplistFile(self):
		return self.stoplist_file

	def getSvdDimension(self):
		return self.svd_dimension

	def getWindowSize(self):
		return self.window_size

	def getTempFolder(self):
		return self.temp_folder

	def usage(self, type_atc):
		if type_atc == 'FirstOrder':
			usage = """
   Usage: python main_FirstOrder.py [OPTION] [FOLDER]... [OPTION] [PARAMETER]...\n
   -c  --contexts=            Input folder containing the syntactic context files
   -i  --input=               Input folder containing the corpus
   -l  --language=            Language of the corpus data
   -L  --stoplist=            File containing a list of stopwords
   -m  --min_size=            Minimum size of a word to be computed
   -M  --max_terms=           Max number of similar terms recorded in the XML file
   -o  --output=              Output folder to receive the data
   -p  --mi_precision=        Precision of the Mutual Information result
   -r  --record_log=          Enable/Disable log file recording
   -R  --record_intermediate= Enable/Disable intermediate files recording
   -s  --seeds=               File containing seeds to the thesaurus
   -S  --sim_measure=         Metric to compute the similarity between seed and related terms
   -w  --window_size=         Size of the window to compute the correlation analysis
   -t  --temp=                Temp folder to receive temporary data
   -h  --help                 Display this help and exit
   """
		elif type_atc == 'HigherOrder':
			usage = """
   Usage: python main_HigherOrder.py [OPTION] [FOLDER]... [OPTION] [PARAMETER]...\n
   -c  --contexts=            Input folder containing the syntactic context files
   -d  --svd_dimension=       Number of dimensions to reduce the SVD
   -i  --input=               Input folder containing the corpus
   -l  --language=            Language of the corpus data
   -L  --stoplist=            File containing a list of stopwords
   -m  --min_size=            Minimum size of a word to be computed
   -M  --max_terms=           Max number of similar terms recorded in the XML file
   -o  --output=              Output folder to receive the corpus
   -r  --record_log=          Enable/Disable log file recording
   -R  --record_intermediate= Enable/Disable intermediate files recording
   -s  --seeds=               File containing seeds to the thesaurus
   -S  --sim_measure=         Metric to compute the similarity between seed and related terms
   -t  --temp=                Temp folder to receive temporary data
   -h  --help                 Display this help and exit
   """
		else:
			usage = """
   Usage: python main_SecondOrder.py [OPTION] [FOLDER]... [OPTION] [PARAMETER]...\n
   -c  --contexts=            Input folder containing the syntactic context files
   -i  --input=               Input folder containing the corpus
   -l  --language=            Language of the corpus data
   -L  --stoplist=            File containing a list of stopwords
   -m  --min_size=            Minimum size of a word to be computed
   -M  --max_terms=           Max number of similar terms recorded in the XML file
   -o  --output=              Output folder to receive the corpus
   -r  --record_log=          Enable/Disable log file recording
   -R  --record_intermediate= Enable/Disable intermediate files recording
   -s  --seeds=               File containing seeds to the thesaurus
   -S  --sim_measure=         Metric to compute the similarity between seed and related terms
   -t  --temp=                Temp folder to receive temporary data
   -h  --help                 Display this help and exit
   """
		print usage

	def help(self):
		help = """
   HELP FILE:
   -----------------------------------------------------------------------------------------------\n
   [COMMAND] $python ['main' program].py [OPTION] [FOLDER]... [OPTION] [PARAMETER]...\n
   [OPTION] [FOLDER] ... [OPTION] [PARAMETER]
   -c  --contexts=            Input folder containing the syntactic context files
                              Default option: 'Off' [The system loads the corpus folder]
                              Supported options: 'On' and 'Off'\n
   -d  --svd_dimension=       Number of dimensions to reduce the SVD [Used only in main_HigherOrder.py]\n
   -i  --input=               Input folder containing the corpus
                              Default folder: '../Data/Corpus/'\n
   -l  --language=            Language of the corpus data
                              Default language: 'en'
                              Supported languages: 'en' [English] and 'pt' [Portuguese]\n
   -L  --stoplist=            File containing a list of stopwords
                              Default file: '../misc/stoplist.txt'\n
   -m  --min_size=            Minimum size of a word to be computed
                              Default size: '3' letters\n
   -M  --max_terms=           Max number of similar terms recorded in the XML file
                              Default max: '10' related terms\n
   -o  --output=              Output folder to receive the data
                              Default output: '../Data/Output/'\n
   -p  --mi_precision=        Precision of the Mutual Information result [Used only in main_FirstOrder.py with --sim_measure=mutual_information]
                              Default precision: '10'\n
   -r  --record_log=          Enable/Disable log file recording
                              Default option: 'Off' [Log file is recorded in ../misc/application.log]
                              Supported options: 'On' and 'Off'
   -R  --record_intermediate= Enable/Disable intermediate files recording
                              Default option: 'Off'
                              Supported options: 'On' and 'Off' [Intermediate files are recorded in '../Temp/AN/', '../Temp/SV/', and '../Temp/VO/']
   -s  --seeds=               File containing seeds to the thesaurus
                              Default file: '../misc/seeds.txt'\n
   -S  --sim_measure=         Metric to compute the similarity between seed and related terms
                              Default measure: 'jaccardmax'
                              Supported measures: 'mutual_information', 'baseline', 'dicebin'
                                                  'dicemin', 'jaccard', 'cosinebin', 'cosine'
                                                  'city', 'euclidean', 'js', 'lin', 'jaccardmax'\n
   -w  --window_size=         Size of the window to compute the correlation analysis [Used only in main_FirstOrder.py]
                              Default size: '20'\n
   -t  --temp=                Temp folder to receive temporary data
                              Default folder: '../Data/Temp/'\n
   -h  --help                 Display this help and exit\n
   """
		print help
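The getopt loop and the usage/help strings above follow a validate-or-fall-back pattern for every option. Below is a minimal, self-contained sketch of that pattern, reduced to two options; the function name parse_args, the shortened warning text, and the defaults (taken from the help text above) are illustrative assumptions, not part of the original code.

# Minimal sketch of the option-validation pattern used above (assumptions: only
# -S/--sim_measure and -w/--window_size are kept; defaults come from the help text).
import getopt

SUPPORTED_MEASURES = ('mutual_information', 'baseline', 'dicebin', 'dicemin', 'jaccard',
	'cosinebin', 'cosine', 'city', 'euclidean', 'js', 'lin', 'jaccardmax')

def parse_args(argv):
	sim_measure = 'jaccardmax'   # default similarity measure
	window_size = '20'           # default window size
	opts, args = getopt.getopt(argv, "S:w:", ["sim_measure=", "window_size="])
	for opt, arg in opts:
		if opt in ("-S", "--sim_measure"):
			if arg in SUPPORTED_MEASURES: sim_measure = arg
			else: print 'WARNING: "'+arg+'" is not a supported similarity measure, keeping '+sim_measure
		elif opt in ("-w", "--window_size"):
			window_size = arg
	return sim_measure, window_size

# Example: parse_args(['-S', 'cosine', '--window_size=10']) returns ('cosine', '10')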
Exemple #28
0
class Matrix:
    def __init__(self, input_file, temp_folder, svd_dimension):
        self.misc = Miscelaneous()
        self.temp_folder = temp_folder
        self.svd_dimension = svd_dimension
        self.dic_column = OrderedDict()
        self.dic_column_index = {}
        self.dic_row = OrderedDict()
        self.dic_row_index = {}
        self.array_row = []
        self.array_col = []
        self.array_data = []
        self.dic_matrix = {}

        string_files_matrix = ""

        self.buildMatrixFromFile(input_file)
        self.applySvd()
        self.writeSvd()

    def __del__(self):
        pass

    def buildMatrixFromFile(self, input_file):
        index_row = 0
        index_column = 0
        line_row = ""
        line_column = ""
        line_data = ""

        file_input = self.misc.openFile(input_file, "r")
        file_row = self.misc.openFile(self.temp_folder + "Matrix_row.txt", "w")
        file_column = self.misc.openFile(self.temp_folder + "Matrix_column.txt", "w")
        file_data = self.misc.openFile(self.temp_folder + "Matrix_data.txt", "w")

        for line in file_input:
            line = re.sub("\n", "", line)
            row, column, frequency = line.split("#")

            if self.dic_row.has_key(row):
                index_m = self.dic_row[row]
            else:
                self.dic_row[row] = index_row
                self.dic_row_index[index_row] = row
                index_m = index_row
                index_row = index_row + 1

            if self.dic_column.has_key(column):
                index_n = self.dic_column[column]
            else:
                self.dic_column[column] = index_column
                self.dic_column_index[index_column] = column
                index_n = index_column
                index_column = index_column + 1

            self.array_row.append(int(index_n))
            self.array_col.append(int(index_m))
            # Note: the log-smoothed value is computed but not used below;
            # the raw frequency is what actually goes into the matrix.
            log_frequency = math.log(float(frequency) + 1, e)
            self.array_data.append(float(frequency))

            line_row += str(index_n) + " "
            line_column += str(index_m) + " "
            line_data += str(frequency) + " "

        file_input.close()

        for row in self.dic_row:
            file_row.write(str(self.dic_row[row]) + " : " + row + "\n")
        for column in self.dic_column:
            file_column.write(str(self.dic_column[column]) + " : " + column + "\n")

        file_data.write("<row>\n")
        file_data.write(line_row[0:-1] + "\n")
        file_data.write("<column>\n")
        file_data.write(line_column[0:-1] + "\n")
        file_data.write("<data>\n")
        file_data.write(line_data[0:-1] + "\n")

        file_row.close()
        file_column.close()
        file_data.close()

    def applySvd(self):
        len_row = max(self.array_row) + 1
        len_col = max(self.array_col) + 1
        print "Applying SVD with ROW: " + str(len_row) + " and COL: " + str(len_col)
        sparse_matrix = scipy.sparse.csc_matrix(
            (self.array_data, (self.array_row, self.array_col)), shape=(len_row, len_col)
        )
        print "sparsed matrix"
        Ut, Sigma, Vt = sparsesvd(sparse_matrix, self.svd_dimension)
        print "U Sigma Vt done!"
        sparse_matrix = array(0)
        print "Mounting Matrix SVD"
        self.svd_matrix = numpy.dot(Ut.T, numpy.dot(numpy.diag(Sigma), Vt))
        print "Done!"
        print Ut.T
        print "\n"
        print Sigma
        print "\n"
        print Vt
        print "\n"
        print self.svd_matrix.T
        print "\n"
        Ut = None
        Sigma = None
        Vt = None
        # Ut = array(0)
        # Sigma = array(0)
        # Vt = array(0)

    def writeSvd(self):
        file_matrix_svd = self.misc.openFile(self.temp_folder + "/MatrixDataSvd.txt", "w")
        row_number = 0
        for row_data in self.svd_matrix.T:
            column_number = 0
            for value in row_data:
                file_matrix_svd.write(
                    self.dic_row_index[row_number]
                    + "#"
                    + self.dic_column_index[column_number]
                    + "#"
                    + str(value)
                    + "\n"
                )
                column_number += 1
            row_number += 1
        file_matrix_svd.close()
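buildMatrixFromFile above expects one 'row#column#frequency' record per line, and applySvd then keeps only svd_dimension singular values before reconstructing the matrix. The short sketch below reproduces that rank-k reconstruction with numpy alone; using numpy.linalg.svd in place of sparsesvd, and the toy matrix values, are assumptions made purely for illustration.

# Rank-k reconstruction as in applySvd, on a dense toy matrix (illustrative only).
import numpy

def truncated_svd(matrix, k):
    # Full SVD, then keep only the k largest singular values/vectors.
    U, sigma, Vt = numpy.linalg.svd(matrix, full_matrices=False)
    return numpy.dot(U[:, :k], numpy.dot(numpy.diag(sigma[:k]), Vt[:k, :]))

toy = numpy.array([[3.0, 0.0, 1.0],
                   [0.0, 2.0, 0.0],
                   [1.0, 0.0, 4.0]])
print truncated_svd(toy, 2)   # rank-2 approximation of the toy matrix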
Exemple #29
0
	def __init__(self, type_atc, argv):
		self.input_folder = '../Data/Corpus/'
		self.output_folder = '../Data/Output/'
		self.temp_folder = '../Data/Temp/'
		self.seeds_file = '../misc/seeds.txt'
		self.stoplist_file = '../misc/stoplist.txt'
		self.misc = Miscelaneous()
		file_parameters = self.misc.openFile('../misc/parameters.cfg', 'r')

		for line in file_parameters:
			if re.match('contexts', line):
				contexts = line.split('=')[1].replace('\n','')
				if contexts == 'On': self.contexts = True
				else: self.contexts = False
			if re.match('language', line):
				self.language = line.split('=')[1].replace('\n','')
			if re.match('max_qty_terms', line):
				self.max_qty_terms = line.split('=')[1].replace('\n','')
			if re.match('mi_precision', line):
				self.mi_precision = line.split('=')[1].replace('\n','')
			if re.match('min_word_size', line):
				self.min_word_size = line.split('=')[1].replace('\n','')
			if re.match('sim_measure', line):
				self.sim_measure = line.split('=')[1].replace('\n','')
			if re.match('svd_dimension', line):
				self.svd_dimension = line.split('=')[1].replace('\n','')
			if re.match('window_size', line):
				self.window_size = line.split('=')[1].replace('\n','')
			if re.match('record_log', line):
				record_log = line.split('=')[1].replace('\n','')
				if record_log == 'On': self.record_log = True
				else: self.record_log = False
			if re.match('record_intermediate', line):
				record_intermediate = line.split('=')[1].replace('\n','')
				if record_intermediate == 'On': self.record_intermediate = True
				else: self.record_intermediate = False
		file_parameters.close()

		try:
			opts, args = getopt.getopt(argv,\
				"h:c:i:o:m:M:p:w:d:t:l:L:r:R:s:S:", \
				["help", "contexts=", "input=", "output=", "min_size=", "max_terms=", "mi_precision=", "window_size=", "svd_dimension=", "temp=", "language=", "record_log=", "record_intermediate=", "seeds=", "sim_measure=", "stoplist="])
		except getopt.GetoptError:
			self.usage(type_atc)
			sys.exit(2)
		for opt, arg in opts:
			if opt in ("-h", "--help"):
				self.help()
				sys.exit(0)
			elif opt in ("-c", "--contexts"):
				if arg == 'On': self.contexts = True
				elif arg == 'Off': self.contexts = False
			elif opt in ("-i", "--input"):
				if os.path.isdir(arg): self.input_folder = arg 
				else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a folder, setting '+self.input_folder+' as input folder'+bcolors.ENDC
			elif opt in ("-o", "--output"):
				if os.path.isdir(arg):  self.output_folder = arg
				else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a folder, setting '+self.output_folder+' as output folder'+bcolors.ENDC
			elif opt in ("-t", "--temp"): 
				if os.path.isdir(arg): self.temp_folder = arg 
				else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a folder, setting '+self.temp_folder+' as temporary folder'+bcolors.ENDC 
			elif opt in ("-m", "--min_size"):
				self.min_word_size = arg
			elif opt in ("-M", "--max_terms"):
				self.max_qty_terms = arg
			elif opt in ("-l", "--language"):
				if arg == 'en' or arg == 'pt': self.language = arg
				else: print bcolors.WARNING+'WARNING: "'+str(arg)+'" is not a supported language, setting to "'+self.language+'" as language'+bcolors.ENDC 
			elif opt in ("-r", "--record_log"):
				if arg == 'On': self.record_log = True
				elif arg == 'Off': self.record_log = False
			elif opt in ("-R", "--record_intermediate"):
				if arg == 'On': self.record_intermediate = True
				elif arg == 'Off': self.record_intermediate = False
				else: print bcolors.WARNING+'WARNING: "'+str(arg)+'" is not a supported option to log recording, setting to "'+record_log+'" as default option'+bcolors.ENDC 
			elif opt in ("-s", "--seeds"):
				if os.path.isfile(arg): self.seeds_file = arg 
				else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a file, setting '+self.seeds_file+' as seeds file'+bcolors.ENDC
			elif opt in ("-S", "--sim_measure"):
				if arg in ('mutual_information', 'baseline', 'dicebin', 'dicemin', 'jaccard', 'cosinebin',
						'cosine', 'city', 'euclidean', 'js', 'lin', 'jaccardmax'):
					self.sim_measure = arg
				else: 
					print bcolors.WARNING+'WARNING: "'+str(arg)+'" is not a supported similarity measure, setting to "'+self.sim_measure+'" as default similarity measure. 						\nSimilarity measures supported by the system:\n - mutual_information [used only in First Order construction]\n - baseline\n - dicebin\n - dicemin\n - jaccard\n - cosinebin\n - cosine\n - city\n - euclidean\n - js\n - lin\n - jaccardmax'+bcolors.ENDC
			elif opt in ("-L", "--stoplist"):
				if os.path.isfile(arg): self.stoplist_file = arg 
				else: print bcolors.WARNING+'WARNING: '+str(arg)+' is not a file, setting '+self.stoplist_file+' as stoplist file'+bcolors.ENDC

			if type_atc == 'FirstOrder':
				if opt in ("-p", "--mi_precision"):
					self.mi_precision = arg
				elif opt in ("-w", "--window_size"):
					self.window_size = arg

			elif type_atc == 'HigherOrder':
				if opt in ("-d", "--svd_dimension"):
					self.svd_dimension = arg
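The constructor above first reads defaults from '../misc/parameters.cfg', matching each line by key and taking everything after '='. A hypothetical configuration file consistent with that parsing is shown below; the values are the defaults quoted in the help text where available, while svd_dimension=100 is purely illustrative.

contexts=Off
language=en
max_qty_terms=10
mi_precision=10
min_word_size=3
sim_measure=jaccardmax
svd_dimension=100
window_size=20
record_log=Off
record_intermediate=Off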
Exemple #30
0
class Matrix:
    def __init__(self, input_file, temp_folder, svd_dimension):
        self.misc = Miscelaneous()
        self.temp_folder = temp_folder
        self.svd_dimension = svd_dimension
        self.dic_column = OrderedDict()
        self.dic_column_index = {}
        self.dic_row = OrderedDict()
        self.dic_row_index = {}
        self.array_row = []
        self.array_col = []
        self.array_data = []
        self.dic_matrix = {}

        string_files_matrix = ''

        self.buildMatrixFromFile(input_file)
        self.applySvd()
        self.writeSvd()

    def __del__(self):
        pass

    def buildMatrixFromFile(self, input_file):
        index_row = 0
        index_column = 0
        line_row = ''
        line_column = ''
        line_data = ''

        file_input = self.misc.openFile(input_file, 'r')
        file_row = self.misc.openFile(self.temp_folder + 'Matrix_row.txt', 'w')
        file_column = self.misc.openFile(
            self.temp_folder + 'Matrix_column.txt', 'w')
        file_data = self.misc.openFile(self.temp_folder + 'Matrix_data.txt',
                                       'w')

        for line in file_input:
            line = re.sub('\n', '', line)
            row, column, frequency = line.split('#')

            if self.dic_row.has_key(row):
                index_m = self.dic_row[row]
            else:
                self.dic_row[row] = index_row
                self.dic_row_index[index_row] = row
                index_m = index_row
                index_row = index_row + 1

            if self.dic_column.has_key(column):
                index_n = self.dic_column[column]
            else:
                self.dic_column[column] = index_column
                self.dic_column_index[index_column] = column
                index_n = index_column
                index_column = index_column + 1

            self.array_row.append(int(index_n))
            self.array_col.append(int(index_m))
            # Note: the log-smoothed value is computed but not used below;
            # the raw frequency is what actually goes into the matrix.
            log_frequency = math.log(float(frequency) + 1, e)
            self.array_data.append(float(frequency))

            line_row += str(index_n) + ' '
            line_column += str(index_m) + ' '
            line_data += str(frequency) + ' '

        file_input.close()

        for row in self.dic_row:
            file_row.write(str(self.dic_row[row]) + ' : ' + row + '\n')
        for column in self.dic_column:
            file_column.write(
                str(self.dic_column[column]) + ' : ' + column + '\n')

        file_data.write('<row>\n')
        file_data.write(line_row[0:-1] + '\n')
        file_data.write('<column>\n')
        file_data.write(line_column[0:-1] + '\n')
        file_data.write('<data>\n')
        file_data.write(line_data[0:-1] + '\n')

        file_row.close()
        file_column.close()
        file_data.close()

    def applySvd(self):
        len_row = max(self.array_row) + 1
        len_col = max(self.array_col) + 1
        print 'Applying SVD with ROW: ' + str(len_row) + ' and COL: ' + str(
            len_col)
        sparse_matrix = scipy.sparse.csc_matrix(
            (self.array_data, (self.array_row, self.array_col)),
            shape=(len_row, len_col))
        print 'sparsed matrix'
        Ut, Sigma, Vt = sparsesvd(sparse_matrix, self.svd_dimension)
        print 'U Sigma Vt done!'
        sparse_matrix = array(0)
        print 'Mounting Matrix SVD'
        self.svd_matrix = numpy.dot(Ut.T, numpy.dot(numpy.diag(Sigma), Vt))
        print 'Done!'
        print Ut.T
        print '\n'
        print Sigma
        print '\n'
        print Vt
        print '\n'
        print self.svd_matrix.T
        print '\n'
        Ut = None
        Sigma = None
        Vt = None
        #Ut = array(0)
        #Sigma = array(0)
        #Vt = array(0)

    def writeSvd(self):
        file_matrix_svd = self.misc.openFile(
            self.temp_folder + '/MatrixDataSvd.txt', 'w')
        row_number = 0
        for row_data in self.svd_matrix.T:
            column_number = 0
            for value in row_data:
                file_matrix_svd.write(self.dic_row_index[row_number] + '#' +
                                      self.dic_column_index[column_number] +
                                      '#' + str(value) + '\n')
                column_number += 1
            row_number += 1
        file_matrix_svd.close()
Exemple #31
0
class Measures:
	def __init__(self, ctx_freq_file, seedfile):
		self.misc = Miscelaneous()
		seeds_file = Seeds(seedfile)
		self.list_seeds = seeds_file.getSeeds()
		self.dic_baseline = defaultdict(dict)
		self.dic_diceBin = defaultdict(dict)
		self.dic_diceMin = defaultdict(dict)
		self.dic_jaccard = defaultdict(dict)
		self.dic_cosineBin = defaultdict(dict)
		self.dic_cosine = defaultdict(dict)
		self.dic_city = defaultdict(dict)
		self.dic_euclidean = defaultdict(dict)
		self.dic_js = defaultdict(dict)
		self.dic_lin = defaultdict(dict)
		self.dic_jaccardMax = defaultdict(dict)
		self.dic_ctx = defaultdict(dict)
		self.dic_sum_freq_noun = {}
		self.dic_qty_noun = {}
		self.__buildHashs__(ctx_freq_file, seedfile)

	def __del__(self):
		pass

	def __buildHashs__(self, ctx_freq_file, seedfile):
		list_nouns = []
		ctxfreqfile = self.misc.openFile(ctx_freq_file, 'r')
		
		for line in ctxfreqfile:
			modifier, noun, freq = line.split('#')
			list_nouns.append(noun)
			freq = freq.replace('\n', '')
			self.dic_ctx[noun][modifier] = float(freq)
			if self.dic_sum_freq_noun.has_key(noun):
				self.dic_sum_freq_noun[noun] += float(freq)
			else:
				self.dic_sum_freq_noun[noun] = float(freq)
			if self.dic_qty_noun.has_key(noun):
				self.dic_qty_noun[noun] += 1
			else:
				self.dic_qty_noun[noun] = 1

		for seed in self.list_seeds:
			print 'Seed: '+seed
			i = 0
			qty_related = len(list_nouns)
			for related in list_nouns:
				if seed != related:
					i += 1
					baseline = 0
					diceBin = 0
					diceMin = 0
					jaccard = 0
					cosineBin = 0
					cosine = 0
					city = 0
					euclidean = 0
					js = 0
					lin = 0
					jaccardMax = 0

					sun_min = 0
					sun_max = 0
					sum_intersection = 0
					intersection = 0
					square_freq_seed = 0
					square_freq_related = 0
					d_seed = 0
					d_related = 0

					for modifier in self.dic_ctx[seed]:
						freq_seed_modifier = 0
						freq_related_modifier = 0
						if self.dic_ctx[related].has_key(modifier):
							baseline += 1
							freq_seed_modifier = self.dic_ctx[seed][modifier]
							freq_related_modifier = self.dic_ctx[related][modifier]
							sun_min += min(freq_seed_modifier, freq_related_modifier)
							sun_max += max(freq_seed_modifier, freq_related_modifier)
							city += abs(freq_seed_modifier - freq_related_modifier)
							euclidean += (freq_seed_modifier - freq_related_modifier)**2

							# Initialise both relative frequencies so the tests below never hit
							# an undefined or stale value when the related noun has zero total frequency.
							relative_freq_related = 0.0
							relative_freq_seed_related = 0.0
							relative_freq_seed = float(freq_seed_modifier) / self.dic_sum_freq_noun[seed]
							if self.dic_sum_freq_noun[related] == 0:
								print bcolors.FAIL+'ERROR: Frequency of '+related+' is zero.'+bcolors.ENDC
							else:
								relative_freq_related = float(freq_related_modifier) / self.dic_sum_freq_noun[related]
								relative_freq_seed_related = float(relative_freq_seed + relative_freq_related) / 2

							if relative_freq_seed > 0.0 and relative_freq_seed_related > 0.0:
								d_seed += relative_freq_seed * math.log(float(relative_freq_seed / relative_freq_seed_related))
							if relative_freq_related > 0.0 and relative_freq_seed_related > 0.0:
								d_related += relative_freq_related * math.log(float(relative_freq_related / relative_freq_seed_related))
							intersection += freq_seed_modifier * freq_related_modifier
							sum_intersection += freq_seed_modifier + freq_related_modifier
							square_freq_seed += freq_seed_modifier**2
							square_freq_related += freq_related_modifier**2
				
						elif self.dic_ctx[seed].has_key(modifier):
							freq_seed_modifier = self.dic_ctx[seed][modifier]
							sun_max += freq_seed_modifier
							city += freq_seed_modifier
							euclidean += freq_seed_modifier**2
							square_freq_seed += freq_seed_modifier**2
				
					for modifier in self.dic_ctx[related]:
						if not self.dic_ctx[seed].has_key(modifier):
							freq_related_modifier = self.dic_ctx[related][modifier]
							sun_max += freq_related_modifier
							city += freq_related_modifier
							euclidean += freq_related_modifier**2
							square_freq_related +=  freq_related_modifier**2

					if sun_max > 0:
						jaccardMax = float(sun_min) / sun_max

					if self.dic_qty_noun.has_key(seed) and self.dic_qty_noun.has_key(related):
						diceBin = float(2*baseline) / (self.dic_qty_noun[seed] + self.dic_qty_noun[related])
						cosineBin = baseline / math.sqrt(float(self.dic_qty_noun[seed] * self.dic_qty_noun[related]))
						jaccard = float(baseline) / (self.dic_qty_noun[seed] + self.dic_qty_noun[related] - baseline)
			
					if self.dic_sum_freq_noun.has_key(seed) and self.dic_sum_freq_noun.has_key(related):
						diceMin = float((2*sun_min)) / (self.dic_sum_freq_noun[seed] + self.dic_sum_freq_noun[related])
						lin = float(sum_intersection) / (self.dic_sum_freq_noun[seed] + self.dic_sum_freq_noun[related])

					if square_freq_seed > 0 and square_freq_related > 0:
						cosine = intersection / (math.sqrt(float(square_freq_seed * square_freq_related)))
					euclidean = math.sqrt(float(euclidean))
					js = float(d_seed + d_related) / 2

					if  baseline >= 1:
						self.dic_baseline[seed][related] = baseline
						self.dic_diceBin[seed][related] = diceBin
						self.dic_diceMin[seed][related] = diceMin
						self.dic_jaccard[seed][related] = jaccard
						self.dic_cosineBin[seed][related] = cosineBin
						self.dic_cosine[seed][related] = cosine
						self.dic_city[seed][related] = city
						self.dic_euclidean[seed][related] = euclidean
						self.dic_js[seed][related] = js
						self.dic_lin[seed][related] = lin
						self.dic_jaccardMax[seed][related] = jaccardMax

				self.misc.progress_bar(i, qty_related, 100)
			print ''
				
	""" Methods to get the entire dictionaries """
	def getDic(self, sim_measure):
		dic_measure = self.__verifyMeasure__(sim_measure)
		return self.__sortTopNFromAllDic__(dic_measure, 0)

	""" Methods to get the DICs to a specific seed """
	def getDicToSeed(self, sim_measure, seed):
		dic_measure = self.__verifyMeasure__(sim_measure)
		return self.__sortTopNFromDic__(dic_measure, seed, 0)

	""" Methods to get the TOP N to a specific seed """
	def getTopNToSeed(self, sim_measure, seed, n):
		dic_measure = self.__verifyMeasure__(sim_measure)
		return self.__sortTopNFromDic__(dic_measure, seed, n)

	""" Methods to get the TOP N to ALL seeds """
	def getTopNToAllSeeds(self, sim_measure, n):
		dic_measure = self.__verifyMeasure__(sim_measure)
		return self.__sortTopNFromAllDic__(dic_measure, n)

	""" Methods to print the TOP N to a specific seed """
	def printTopNToSeed(self, sim_measure, seed, n):
		dic_terms =  self.getTopNToSeed(sim_measure, seed, n)
		self.__printDic__(dic_terms)

	""" Methods to print the TOP N to ALL seeds """
	def printTopNToAllSeeds(self, sim_measure, n):
		dic_terms = self.getTopNToAllSeeds(sim_measure, n)
		self.__printDic__(dic_terms)

	""" Internal methods """
	def __verifyMeasure__(self, sim_measure):
		if sim_measure == 'baseline': dic_measure = self.dic_baseline
		elif sim_measure == 'dicebin': dic_measure = self.dic_diceBin
		elif sim_measure == 'dicemin': dic_measure = self.dic_diceMin
		elif sim_measure == 'jaccard': dic_measure = self.dic_jaccard
		elif sim_measure == 'cosinebin': dic_measure = self.dic_cosineBin
		elif sim_measure == 'cosine': dic_measure = self.dic_cosine
		elif sim_measure == 'city': dic_measure = self.dic_city
		elif sim_measure == 'euclidean': dic_measure = self.dic_euclidean
		elif sim_measure == 'js': dic_measure = self.dic_js
		elif sim_measure == 'lin': dic_measure = self.dic_lin
		elif sim_measure == 'jaccardmax': dic_measure = self.dic_jaccardMax
		return dic_measure

	def __sortTopNFromDic__(self, dic, seed, n):
		dic_terms = OrderedDict()
		if self.__existKeyInDic__(seed, dic):
			dic_related = {}
			dic_terms[seed] = {'terms': []}
			for related_term in dic[seed]:
				dic_related[related_term] = dic[seed][related_term]
			if n == 0: n = None
			dic_ordered = sorted(dic_related.items(), key=itemgetter(1), reverse=True)[0:n]
			for list_ordered in dic_ordered:
				dic_terms[seed]['terms'].append({list_ordered[0]:str(list_ordered[1])})
		return dic_terms

	def __sortTopNFromAllDic__(self, dic, n):
		dic_terms = OrderedDict()
	
		for seed in self.list_seeds:
			if self.__existKeyInDic__(seed, dic):
				dic_terms[seed] = {'terms': []}
				dic_related = {}
				for related_term in dic[seed]:
					dic_related[related_term] = dic[seed][related_term]
				if n == 0: n = None
				dic_ordered = sorted(dic_related.items(), key=itemgetter(1), reverse=True)[0:n]
				for list_ordered in dic_ordered:
					dic_terms[seed]['terms'].append({list_ordered[0]:str(list_ordered[1])})
		return dic_terms

	def __existKeyInDic__(self, key, dic):
		if dic.has_key(key):
			return dic
		else:
			print bcolors.WARNING+'WARNING: System could not find the term "'+key+'" in corpus'+bcolors.ENDC
			print ''
			return False

	def __printDic__(self, dic_terms):
		for seed in dic_terms:
			print 'Seed: '+seed
			for index_related_term in dic_terms[seed]['terms']:
				term = index_related_term.keys()[0]
				similarity = index_related_term[term]
				print 'Related term: '+term+'\nSimilarity  : '+similarity
			print ''
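As a quick sanity check on the formulas interleaved in __buildHashs__ above, the toy computation below reproduces two of the measures (cosine and jaccardMax) for a pair of invented context dictionaries; the modifier frequencies are made up, only the arithmetic mirrors the code.

# Toy check of two measures from __buildHashs__ (invented frequencies, same arithmetic).
import math

seed_ctx = {'red': 2.0, 'big': 1.0}        # modifier -> frequency for the seed noun
related_ctx = {'red': 1.0, 'small': 3.0}   # modifier -> frequency for a candidate noun

shared = set(seed_ctx) & set(related_ctx)
intersection = sum(seed_ctx[m] * related_ctx[m] for m in shared)
square_seed = sum(f ** 2 for f in seed_ctx.values())
square_related = sum(f ** 2 for f in related_ctx.values())
cosine = intersection / math.sqrt(square_seed * square_related)

sum_min = sum(min(seed_ctx[m], related_ctx[m]) for m in shared)
sum_max = sum(max(seed_ctx.get(m, 0.0), related_ctx.get(m, 0.0)) for m in set(seed_ctx) | set(related_ctx))
jaccard_max = sum_min / sum_max

print 'cosine     : ' + str(cosine)        # ~0.283 for these toy vectors
print 'jaccardMax : ' + str(jaccard_max)   # 1/6 for these toy vectors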