Example #1
0
File: Clean.py Project: Unaah/bilbo
	def _buildWords(self, dicWords):
		"""
		Make 'Word' objects with words in dicWords
		dicWord : dictionary of words returned from _extract_tags
		"""		
		words = []
		for word in dicWords:
			instanceWord = Word(word["nom"], word["balise"], word["caracteristique"])
			if "noLabel" in word["balise"]:
				instanceWord.ignoreWord = 1
			if "relateditem" in word["balise"]:
				instanceWord.item = 1
			
			words.append(instanceWord)
		return words
Example #2
0
	def _sepFrontSpePunc(self, word):
		"""
		Separate special punctuation marks at the front of the word
		Special punctuation marks are non-English marks, which cannot be processed by regular expression
		Check out 'self.special'
		"""
		
		frontWords = []
		input_str = word.nom
		tagNames = word.listNomTag()
		featNames = word.listNomFeature()
		new_str = input_str
		
		for key in self.special.keys() :
			if new_str.find(key) == 0 :
				new_str = new_str[len(key):]
				tmpWord = Word(key, tagNames, featNames)
				tmpWord.addTag("c")
				frontWords.append(tmpWord)
				
		return frontWords, new_str
Example #3
0
	def _sepFrontPunc(self, word):
		"""
		Separate punctuation marks at the front of the word
		"""
		frontWords = []
		input_str = word.nom
		tagNames = word.listNomTag()
		featNames = word.listNomFeature()
		tmp_str = input_str
		i=0
		allPunc = '.,():;{}[]!?#$%\*+<=>@^_|~"' #exclude /
		if self.options.u : allPunc = allPunc[:-1]
		while (i < len(input_str)) :
			c = input_str[i]
			if c in allPunc :
				tmpWord = Word(c, tagNames, featNames)
				tmpWord.addTag("c")
				frontWords.append(tmpWord) #create word for a punctuation mark
				tmp_str = input_str[i+1:]
				i += 1
			else : i = len(input_str) #exit
				
		return frontWords, tmp_str