def _buildWords(self, dicWords): """ Make 'Word' objects with words in dicWords dicWord : dictionary of words returned from _extract_tags """ words = [] for word in dicWords: instanceWord = Word(word["nom"], word["balise"], word["caracteristique"]) if "noLabel" in word["balise"]: instanceWord.ignoreWord = 1 if "relateditem" in word["balise"]: instanceWord.item = 1 words.append(instanceWord) return words
def _sepFrontSpePunc(self, word): """ Separate special punctuation marks at the front of the word Special punctuation marks are non-English marks, which cannot be processed by regular expression Check out 'self.special' """ frontWords = [] input_str = word.nom tagNames = word.listNomTag() featNames = word.listNomFeature() new_str = input_str for key in self.special.keys() : if new_str.find(key) == 0 : new_str = new_str[len(key):] tmpWord = Word(key, tagNames, featNames) tmpWord.addTag("c") frontWords.append(tmpWord) return frontWords, new_str
def _sepFrontPunc(self, word): """ Separate punctuation marks at the front of the word """ frontWords = [] input_str = word.nom tagNames = word.listNomTag() featNames = word.listNomFeature() tmp_str = input_str i=0 allPunc = '.,():;{}[]!?#$%\*+<=>@^_|~"' #exclude / if self.options.u : allPunc = allPunc[:-1] while (i < len(input_str)) : c = input_str[i] if c in allPunc : tmpWord = Word(c, tagNames, featNames) tmpWord.addTag("c") frontWords.append(tmpWord) #create word for a punctuation mark tmp_str = input_str[i+1:] i += 1 else : i = len(input_str) #exit return frontWords, tmp_str