#     if len(word) > 0 and (not dic.check(word)):
#         sugestoes = dic.suggest(word)
#         if len(sugestoes) > 0:
#             output = sugestoes[0]
#     return output


## Start of training
# Lazily bind the 'catho_treinamento' corpus. Every .txt file whose id does not
# start with a dot is a document; cat_pattern captures its category (negativo,
# positivo or neutro) from the beginning of the file id.
catho_treinamento = LazyCorpusLoader(
    'catho_treinamento', CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt', cat_pattern=r'(negativo|positivo|neutro)/.*')

print "Preparando documentos para treinamento..."
sys.stdout.flush()  # emit the progress message before the corpus is read below
# One (token list, category) pair per training file, grouped by category.
documents_treinamento = [(list(catho_treinamento.words(fileid)), category)
                         for category in catho_treinamento.categories()
                         for fileid in catho_treinamento.fileids(category)]
print "fim da preparacao dos documentos de treinamento."
sys.stdout.flush()
## Pre-processing

# Lower-cased tokens of the whole corpus, minus punctuation.
# NOTE(review): `w not in string.punctuation` is a substring test against the
# punctuation string (assuming `string` is the stdlib module), so a
# multi-character token such as '...' is NOT filtered out -- confirm intent.
corpus_words = [w.lower()
                for w in catho_treinamento.words()
                if w not in string.punctuation]
                #if w not in string.punctuation and
                #w not in stopwords]


#random.shuffle(documents)

# Frequency distribution over the filtered corpus tokens.
all_words = nltk.FreqDist(corpus_words)
    cat_pattern=r'(\w+)/*',
    encoding='utf8',
)

# returns the most frequent non-stop words from the corpus
def get_top_words():
    """Return up to 500 of the most frequent non-stopword tokens in the corpus.

    Tokens are read from the module-level ``decisions`` corpus, lower-cased,
    and dropped when they appear in NLTK's English stopword list. The
    resulting frequency distribution is also pretty-printed (first 500
    samples) as a side effect.

    Returns:
        list: the words ranked by descending frequency, at most 500 entries.
    """
    # Build the stop list once as a set: the original re-evaluated
    # stopwords.words('english') and scanned the list for every single token.
    english_stopwords = set(stopwords.words('english'))

    all_words = nltk.FreqDist(
        token
        for token in (w.lower() for w in decisions.words())
        if token not in english_stopwords
    )

    # Iterating a FreqDist does not yield samples in frequency order under
    # NLTK 3 (it behaves like collections.Counter), so rank explicitly instead
    # of slicing list(all_words).
    word_features = [word for word, _count in all_words.most_common(500)]

    all_words.pprint(500)
    return word_features


# Show which category labels the corpus exposes.
print(decisions.categories())

# One (token list, label) pair per corpus file, grouped by category.
documents = [
    (list(decisions.words(file_id)), label)
    for label in decisions.categories()
    for file_id in decisions.fileids(label)
]

#random.shuffle(documents)
print(documents)

# Featurise each polarity separately so both can be sampled evenly below.
pos_features = [
    (features.tweet_to_words(tokens), label)
    for tokens, label in documents
    if label == 'pos'
]
neg_features = [
    (features.tweet_to_words(tokens), label)
    for tokens, label in documents
    if label == 'neg'
]

random.shuffle(pos_features)
random.shuffle(neg_features)

# Take at most 100 examples of each polarity, then mix them together.
chosen_features_200 = pos_features[:100]
chosen_features_200.extend(neg_features[:100])
random.shuffle(chosen_features_200)
Exemple #3
0
import nltk
from nltk.corpus import LazyCorpusLoader, CategorizedPlaintextCorpusReader
from DataCleaning.POSTagger import tagger
from nltk.corpus.reader import sentiwordnet
# Lazily bind the 'movie_reviews123' corpus: .txt files whose id does not start
# with a dot, read as ASCII, with the category (neg, pos or neutral) captured
# by cat_pattern from the beginning of each file id.
movie_reviews123 = LazyCorpusLoader('movie_reviews123',
                                    CategorizedPlaintextCorpusReader,
                                    r'(?!\.).*\.txt',
                                    cat_pattern=r'(neg|pos|neutral)/.*',
                                    encoding='ascii')

# List the categories found in the corpus.
print(movie_reviews123.categories())

# NOTE(review): `_pos_tag` is an underscore-prefixed (private) name in nltk.tag
# and is called here with no arguments. This looks like leftover debugging and
# may raise at runtime -- confirm the intended call (perhaps nltk.pos_tag on a
# token list).
print(nltk.tag._pos_tag())
Exemple #4
0
    document_words = set(document)
    features = {}
    for word in word_features:
        if word not in stopwords and word not in string.punctuation:
            features['contains(%s)' % word] = (word in document_words)
    return features

## Start of training
# Lazily bind the 'catho_treinamentoV2' corpus. Every .txt file whose id does
# not start with a dot is a document; cat_pattern captures its category
# (negativo, positivo or neutro) from the beginning of the file id.
catho = LazyCorpusLoader(
    'catho_treinamentoV2', CategorizedPlaintextCorpusReader,
    r'(?!\.).*\.txt', cat_pattern=r'(negativo|positivo|neutro)/.*')

print "Preparando documentos para treinamento..."
sys.stdout.flush()  # emit the progress message before the corpus is read below
# One (token list, category) pair per training file, grouped by category.
documents = [(list(catho.words(fileid)), category)
             for category in catho.categories()
             for fileid in catho.fileids(category)]
print "fim da preparacao dos documentos de treinamento."
sys.stdout.flush()

## Pre-processing
# Lower-cased tokens of the whole corpus, minus punctuation.
# NOTE(review): `w not in string.punctuation` is a substring test against the
# punctuation string (assuming `string` is the stdlib module), so a
# multi-character token such as '...' is NOT filtered out -- confirm intent.
corpus_words = [w.lower()
                for w in catho.words()
                if w not in string.punctuation]

# Randomise document order in place.
random.shuffle(documents)

# Frequency distribution over the filtered corpus tokens.
all_words = nltk.FreqDist(corpus_words)

# Keep the first 850 keys of the distribution as the feature vocabulary.
# NOTE(review): slicing keys() needs it to return a list (Python 2), and whether
# these are the 850 most frequent words depends on the installed NLTK version's
# FreqDist ordering -- verify.
word_features = all_words.keys()[:850]