## Inicio do Treinamento
# Load the training corpus: plain-text .txt files whose category is the
# directory they live in (negativo/positivo/neutro).
catho_treinamento = LazyCorpusLoader(
    'catho_treinamento', CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt',
    cat_pattern=r'(negativo|positivo|neutro)/.*')

# Single-argument print() behaves identically on Python 2 and 3, and
# matches the print() calls used elsewhere in this file.
print("Preparando documentos para treinamento...")
sys.stdout.flush()

# Pair each document's token list with its category label:
# [(words, category), ...] over every file of every category.
documents_treinamento = [(list(catho_treinamento.words(fileid)), category)
                         for category in catho_treinamento.categories()
                         for fileid in catho_treinamento.fileids(category)]

print("fim da preparacao dos documentos de treinamento.")
sys.stdout.flush()

## Pre-processamento
# Lowercase every corpus token, dropping tokens that are a single
# punctuation character.
# NOTE(review): `w not in string.punctuation` only filters one-character
# tokens; multi-character tokens such as "..." pass through — confirm
# whether that is intended.
corpus_words = [w.lower() for w in catho_treinamento.words()
                if w not in string.punctuation]

# Frequency distribution over the cleaned corpus vocabulary.
all_words = nltk.FreqDist(corpus_words)
cat_pattern=r'(\w+)/*', encoding='utf8', ) # returns all non-stop words from corpus def get_top_words(): all_words = nltk.FreqDist( w.lower() for w in decisions.words() if not w.lower() in stopwords.words('english') ) word_features = list(all_words)[:500] nltk.FreqDist.pprint(all_words, 500) return word_features print(decisions.categories()) documents = [(list(decisions.words(fileid)), category) for category in decisions.categories() for fileid in decisions.fileids(category)] #random.shuffle(documents) print (documents) pos_features = [(features.tweet_to_words(d), c) for (d, c) in documents if c == 'pos'] neg_features = [(features.tweet_to_words(d), c) for (d, c) in documents if c == 'neg'] random.shuffle(pos_features) random.shuffle(neg_features) chosen_features_200 = pos_features[:100] + neg_features[:100] random.shuffle(chosen_features_200)
# NLTK corpus plumbing plus a project-local POS tagger.
import nltk
from nltk.corpus import LazyCorpusLoader, CategorizedPlaintextCorpusReader
from DataCleaning.POSTagger import tagger
# NOTE(review): `sentiwordnet` is imported but never used in this fragment.
from nltk.corpus.reader import sentiwordnet

# Lazily load a custom movie-review corpus: ASCII .txt files whose
# category (neg/pos/neutral) is the directory they live in.
movie_reviews123 = LazyCorpusLoader('movie_reviews123',
                                    CategorizedPlaintextCorpusReader,
                                    r'(?!\.).*\.txt',
                                    cat_pattern=r'(neg|pos|neutral)/.*',
                                    encoding='ascii')

print(movie_reviews123.categories())
# NOTE(review): nltk.tag._pos_tag is a private helper that requires
# arguments (tokens, tagset, tagger, ...); calling it with none will
# almost certainly raise TypeError at runtime. Confirm the intended
# call — probably nltk.pos_tag(<tokens>).
print(nltk.tag._pos_tag())
# NOTE(review): the original fragment began mid-function (its body ends
# in `return features`, which is invalid outside a def). The header
# below was reconstructed in the conventional NLTK-example form; confirm
# the original name and signature against the full file.
def document_features(document):
    """Map a document (list of tokens) to a bag-of-words feature dict.

    For each word in the global `word_features` that is neither a
    stopword nor single-character punctuation, emits
    'contains(word)' -> whether the word occurs in the document.
    """
    document_words = set(document)
    features = {}
    for word in word_features:
        if word not in stopwords and word not in string.punctuation:
            features['contains(%s)' % word] = (word in document_words)
    return features


## Inicio do Treinamento
# Training corpus: categorized plain-text files (negativo/positivo/neutro).
catho = LazyCorpusLoader(
    'catho_treinamentoV2', CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt',
    cat_pattern=r'(negativo|positivo|neutro)/.*')

# Single-argument print() behaves identically on Python 2 and 3, and
# matches the print() calls used elsewhere in this file.
print("Preparando documentos para treinamento...")
sys.stdout.flush()

# [(words, category), ...] over every file of every category.
documents = [(list(catho.words(fileid)), category)
             for category in catho.categories()
             for fileid in catho.fileids(category)]

print("fim da preparacao dos documentos de treinamento.")
sys.stdout.flush()

## Pre-processamento
# Lowercase tokens, dropping tokens that are one punctuation character.
corpus_words = [w.lower() for w in catho.words() if w not in string.punctuation]
random.shuffle(documents)

all_words = nltk.FreqDist(corpus_words)
# Fix: `all_words.keys()[:850]` raises TypeError on Python 3 / NLTK 3
# (dict views are not sliceable). most_common(850) reproduces the old
# NLTK-2 semantics, where FreqDist.keys() was sorted by decreasing
# frequency — i.e. the 850 most frequent words.
word_features = [w for (w, _count) in all_words.most_common(850)]