Esempio n. 1
0
  def clean_text(self,minWordLen):
    """Rewrite every entry of ``self.inputContent`` in place and count words.

    Each entry is stripped of trailing newlines and split on tabs. Columns
    before ``self.columnStart`` are kept as they are. The remaining columns
    are joined with single spaces and passed, in order, through
    ``Filters.filter_url``, ``Filters.filter_accents`` (after a UTF-8 decode
    that ignores undecodable bytes), ``Filters.filter_punct`` and
    ``Filters.filter_charRepetition``; the result is split on whitespace.

    A token is kept only if it contains no ``@``, is not made of digits only
    and is strictly longer than ``minWordLen``. Every kept token is counted
    in ``self.histogram`` under its filtered spelling. Tokens whose first
    character is uppercase are written back unchanged; all others are
    replaced by ``getWord(token, False, False)`` from a
    ``Dictionary.Dictionary2`` instance.

    The rebuilt entry is the kept columns joined by tabs, a tab, and the
    resulting words separated by spaces, with surrounding whitespace
    stripped.

    NOTE(review): calling ``.decode`` on the joined text implies the entries
    are byte strings (Python 2 style) -- confirm before porting.

    Args:
      minWordLen: length threshold; only tokens longer than this survive.
    """
    wordSource = Dictionary.Dictionary2()

    for index, rawEntry in enumerate(self.inputContent):
      columns = rawEntry.rstrip('\n').split('\t')

      # Free-text part: everything from columnStart onwards as one string.
      body = " ".join(columns[self.columnStart:]).strip()
      body = Filters.filter_url( body )
      # Decoding happens after URL filtering and before accent filtering.
      body = Filters.filter_accents(body.decode('utf8', 'ignore'))
      body = Filters.filter_punct( body )
      tokens = Filters.filter_charRepetition( body ).split()

      kept = [
          token for token in tokens
          if '@' not in token
          and not token.isdigit()
          and len(token) > minWordLen
      ]

      # Leading columns are preserved verbatim, followed by a tab separator.
      rebuilt = "\t".join( columns[:self.columnStart] ) + "\t"
      for token in kept:
        # Tokens starting with an uppercase letter skip the dictionary lookup.
        if token[0].isupper():
          replacement = token
        else:
          replacement = wordSource.getWord(token, False, False)

        # The histogram is keyed by the filtered token, not its replacement.
        self.histogram[token] = self.histogram.get(token, 0) + 1
        rebuilt += replacement + " "

      # strip() drops the trailing space (or the lone tab when nothing is kept).
      self.inputContent[index] = rebuilt.strip()