Code example #1
    def init_documents(f_re, cat_re):
        logging.debug("Reading corpus")
        reports = CategorizedPlaintextCorpusReader(corpus_dir,
                                                   f_re,
                                                   cat_pattern=cat_re,
                                                   encoding='utf8')
        logging.debug("Found {} fileids".format(len(reports.fileids())))
        logging.debug("Found categories: {}".format(reports.categories()))
        logging.debug("Building docs")

        documents = [
            (tokenize(reports.words(i)), reports.categories(i)[0])
              for i in reports.fileids()]
        return documents
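A rough driver sketch for the function above (the surrounding module is not shown, so corpus_dir, tokenize() and the regexes below are placeholders, and init_documents is assumed to be callable as a plain function):

import logging
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

corpus_dir = '/path/to/reports'            # assumed corpus location

def tokenize(words):
    # placeholder tokenizer: keep lowercased alphabetic tokens only
    return [w.lower() for w in words if w.isalpha()]

logging.basicConfig(level=logging.DEBUG)
docs = init_documents(r'.*\.txt', r'(\w+)/.*')
logging.debug("Built {} labelled documents".format(len(docs)))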
Code example #2
def display_features(num_features=1000,
                     show_features=200,
                     filepath='classifiers/nltk_nb.pkl',
                     verbose=True):
    '''
    Displays informative features from NHLCorpus
    '''
    stop_words = set(stopwords.words('english'))
    nhl = CategorizedPlaintextCorpusReader(root='data/NHLcorpus/',
                                           fileids=r'.*\.txt',
                                           cat_pattern=r'(\w+)/*')
    documents = []
    for category in nhl.categories():
        for fileid in nhl.fileids(category):
            documents.append(([
                re.sub(r'\W+', '', w.lower()) for w in nhl.words(fileid)
                if w.lower() not in stop_words
            ], category))
    all_words = nltk.FreqDist(
        re.sub(r'\W+', '', w.lower()) for w in nhl.words()
        if w.lower() not in stop_words)
    word_features = [w[0] for w in all_words.most_common(num_features)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains({})'.format(word)] = word in document_words
        return features

    featuresets = [(document_features(d), c) for (d, c) in documents]
    nb_clf = nltk.NaiveBayesClassifier.train(featuresets)
    if verbose:
        nb_clf.show_most_informative_features(show_features)
        print('Accuracy on training data: {}'.format(
            nltk.classify.accuracy(nb_clf, featuresets)))

    save_classifier = open(filepath, 'wb')
    pickle.dump(nb_clf, save_classifier)
    save_classifier.close()
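A follow-up sketch (not part of the original function): the NaiveBayesClassifier pickled to classifiers/nltk_nb.pkl above can be loaded back and inspected without retraining.

import pickle

with open('classifiers/nltk_nb.pkl', 'rb') as f:    # default filepath used above
    nb_clf = pickle.load(f)
nb_clf.show_most_informative_features(20)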
Code example #3
File: core.py Project: Christoph/newsComp-server
    def __create_corpus(self, language, chars):
        """Create a categorized nltk.corpus from data/* where the subfolders are the different categories.

        :chars: List of chars which will be additionally to stopwords  removed before the statistical analysis
        :language: The newspaper language as string
        :returns: nltk.corpus, list(all normalized words)

        """

        # Create corpus from data directory
        news_corpus = CategorizedPlaintextCorpusReader('data/', r'.*\.txt', cat_pattern=r'(\w+)/*')

        # Get all stopwords for the given language plus the additional chars to remove
        g_stop = stopwords.words(language)
        g_stop.extend(chars)

        # Stemmer
        snow = nlp.stem.SnowballStemmer(language, ignore_stopwords=True)

        # Dict of all words/category
        cat = news_corpus.categories()
        total_words = {}

        for news in cat:
            #Get the words
            words = news_corpus.words(categories=news)

            # Lowercase the tokens and remove stopwords (stopword lists are lowercase)
            words = [w.lower() for w in words if w.lower() not in g_stop]

            # Stem all tokens
            words = [snow.stem(w) for w in words]

            total_words.update({news: words})

        return news_corpus, total_words
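Inside the same class, the return values might be consumed like this (a sketch; the 'german' argument, the char list, and a FreqDist import are assumptions):

        # Sketch of a sibling method using the private helper above
        # (assumes: from nltk import FreqDist).
        news_corpus, total_words = self.__create_corpus('german', ['"', ',', '.'])
        for category, words in total_words.items():
            fdist = FreqDist(words)
            print(category, fdist.most_common(10))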
Code example #4
#http://www.cs.cornell.edu/people/pabo/movie%2Dreview%2Ddata/

from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

reader = CategorizedPlaintextCorpusReader(
    r'mix20_rand700_tokens_cleaned/tokens', r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]

print(fileN)
print(fileP)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
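Comparing each token to '.' only approximates sentence breaks; the reader can also split sentences itself. A short alternative sketch for the same file:

# Let the corpus reader do the sentence splitting instead of testing tokens against '.'.
for sent in reader.sents(fileP):
    print(' '.join(sent))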
Code example #5
from nltk.corpus import CategorizedPlaintextCorpusReader

corpus_root = "./files/"
cat_root = "../categories/"

# Hacky way to specify path for cat.txt. A better way would be to rewrite regex '.*\.txt'...
corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_file=cat_root + 'cat.txt',
                                          cat_delimiter='+')

# get all categories
cats = corpus.categories()
print(cats)

# access corpus
raw = corpus.raw()

# access words, normal and for a category
words = corpus.words()
words_pop = corpus.words(categories="POP")
words_rock = corpus.words(categories="ROCK")

# access sents, normal and for a category
sents = corpus.sents()
sents_pop = corpus.sents(categories="POP")
sents_rock = corpus.sents(categories="ROCK")

# make lists
word_list = list(words)
sents_list = list(sents)

pop_word_list = list(words_pop)
pop_sents_list = list(sents_pop)
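For reference, NLTK's cat_file mapping expects one line per file id, with the file id and its categories separated by cat_delimiter, and the path is resolved relative to corpus_root (hence the ../categories/ workaround above). A hypothetical cat.txt, with invented file names:

# Hypothetical ../categories/cat.txt:
#
#   song_001.txt+POP
#   song_002.txt+ROCK
#   song_003.txt+POP+ROCK
#
# i.e. <fileid><delimiter><category>[<delimiter><category>...] per line.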
Code example #6
class CorpusUtil(object):
    """Documentar
    """
    def __init__(self, raiz_corpus):
        """Cria um objeto do tipo 'CategorizedPlaintextCorpusReader',
        utilizando o diretório raiz do corpus, onde os documentos
        estão localizados, dispostos em seus respectivos subdiretórios,
        de acordo com sua categoria, sejam eles/elas quais for
        
        -->     raiz_corpus/{pos,neg,neu,...}.
        """
        reload(sys)
        sys.setdefaultencoding("utf-8")
        
        self._raiz_corpus = raiz_corpus
        self._corpus = CategorizedPlaintextCorpusReader(raiz_corpus, r'.+\.txt', cat_pattern=r'(\w+)/*',
                                                        encoding='utf-8')
        self._documentos = None
        self._palavras_frequentes = None
        self._todas_palavras = None
        self._featuresets = None
        self._train_set = None
        self._test_set = None

    def get_documentos(self):
        """Construimos uma lista de documentos, rotulados com as
        categorias apropriadas. Cada documento é representado por
        uma tupla na estrutura abaixo:
        
        (conteudo_do_documento, categoria)
        
        Retorna essa lista com todos os documentos do corpus.
        """
        """
        documentos = [(self.corpus.words(fileid), categoria)
                       for categoria in self.corpus.categories()
                       for fileid in self.corpus.fileids(categoria)]
        """
        print "-- Recuperando documentos do corpus."

        if self._documentos is None:            
            self._documentos = [Documento(" ".join(self._corpus.words(fileid)), categoria, self, fileid)
                                for categoria in self._corpus.categories()
                                for fileid in self._corpus.fileids(categoria)]

        # Shuffle the documents
        for i in range(0, 10):
            shuffle(self._documentos)

        return self._documentos

    def get_palavras_frequentes(self):
        """Documentar.
        """
        if self._palavras_frequentes is None:

            print "-- Verificando as palavras mais frequentes do corpus."

            # Test - keep only the 2000 most frequent words of the corpus
            todas_palavras = [word.lower() for word in self._corpus.words()]
            freq_dist_palavras = FreqDist(todas_palavras)
            frequencia_palavras = freq_dist_palavras.most_common(2000)  # 2000 most frequent words
            
            self._palavras_frequentes = [palavra for palavra, frequencia in frequencia_palavras]
            
            # all_words = FreqDist(word.lower() for word in self.corpus.words())
            # self.word_features = list(all_words)[:2000]
        return self._palavras_frequentes

    def get_todas_palavras(self):
        if self._todas_palavras is None:
            print "-- Recuperando todas as palavras do corpus."
            self._todas_palavras = [word.lower() for word in self._corpus.words()]
            self._todas_palavras = set(self._todas_palavras)

        return self._todas_palavras

    def get_featuresets(self):
        """Configura os featuresets que são construídos na
        seguinte estrutura:
            (features_do_documento, categoria)
        
        Retorna uma lista de featuresets
        """
        if self._featuresets is None:
            
            if self._documentos is None:
                self.get_documentos()

            print "-- Recuperando featuresets."

            self._featuresets = apply_features(Documento.get_features, self._documentos)
        
        return self._featuresets

    def get_train_set(self):
        """Documentar
        """
        if self._featuresets is None:
            self.get_featuresets()

        print "-- Recuperando train_set."

        # To avoid using up all the RAM,
        # do not keep every document in memory at once.
        # self._train_set = apply_features(Documento.get_features, self._documentos[100:])
        self._train_set = apply_features(Documento.get_features, self._documentos)

        return self._train_set

    def get_test_set(self):
        if self._featuresets is None:
            self.get_featuresets()

        print "-- Recuperando test_set."

        # self._test_set = apply_features(Documento.get_features, self._documentos[:100])

        return self._test_set

    def gravar_palavras_frequentes(self):
        diretorio_destino = "/home/lucas/Documents/mineracao_opiniao/palavras_frequentes_corpus"
        molde_nome_arquivo = "palavras_frequentes_%s.pickle"

        tempo_agora = str(datetime.now())
        # Replace ':' and whitespace with '.'
        tempo_agora = re.sub(ur':|\s', '.', tempo_agora)
        nome_arquivo = molde_nome_arquivo % tempo_agora

        if self._palavras_frequentes is None:
            self.get_palavras_frequentes()

        f = open(diretorio_destino + "/" + nome_arquivo, 'wb')
        pickle.dump(self._palavras_frequentes, f)
        f.close()

        return True

    @staticmethod
    def abrir_arquivo_palavras_frequentes(arquivo_path):
        f = open(arquivo_path, 'rb')
        palavras_frequentes = pickle.load(f)
        f.close()

        return palavras_frequentes
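A rough usage sketch for CorpusUtil (hedged: the Documento class and the corpus layout come from the surrounding project, which is not shown, and the path below is a placeholder):

import nltk

util = CorpusUtil("/path/to/corpus")         # pos/neg/... subfolders assumed
featuresets = util.get_featuresets()         # lazy (features, category) pairs

train_set = featuresets[100:]
test_set = featuresets[:100]

classifier = nltk.NaiveBayesClassifier.train(train_set)
accuracy = nltk.classify.accuracy(classifier, test_set)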
Code example #7
File: aula1.py Project: rodrigorenie/datascience
from nltk.corpus import CategorizedPlaintextCorpusReader

# Open the documents under the given path
# Arguments
# 1. Absolute path to the documents
# 2. type / extension of the documents (*.txt)
# 3. pattern of the folders that will form the categories
# all arguments are regular expressions

leitor = CategorizedPlaintextCorpusReader(
    '../Dados/mix20_rand700_tokens_cleaned/tokens/',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

# Check what was loaded
print(leitor.categories())
print(leitor.fileids())

# Split the corpus according to its categories
posFiles = leitor.fileids(categories='pos')
negFiles = leitor.fileids(categories='neg')
print('pos files:', posFiles)
print('neg files:', negFiles)

# Load the first files of each category
arqP = posFiles[0]
arqN = negFiles[1]

print("ArqP: ", arqP)
print("ArqN: ", arqN)
Code example #8
File: classify.py Project: breuckelen/verse
pos_file.close()
neg_file.close()


# Words for all emotions
lexicon = {}
for emotion in base_emotions:
    f = open('./opinion-lexicon-English/%s-words.txt' % emotion, 'rU')
    words = [word.strip() for word in f.readlines()]
    lexicon[emotion] = words
    f.close()

# Make a classifier based on the feature sets of the poems
poem_corpus = CategorizedPlaintextCorpusReader('./data', 'poems.*',
        cat_file='cats.txt')

poem_set = [(fileid, category) for fileid in poem_corpus.fileids() \
        for category in poem_corpus.categories(fileid)]
random.shuffle(poem_set)

feature_set = [(poem_features(poem_corpus.words(fileids=[fileid])),
        category) for (fileid, category) in poem_set]

train_set, test_set = feature_set[2000:], feature_set[:2000]

# Initialize the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# For improving the algorithm
classifier.show_most_informative_features(20)
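The held-out slice is not used in this excerpt; a one-line evaluation sketch with the standard NLTK helper:

# Score the classifier on the held-out poems.
print(nltk.classify.accuracy(classifier, test_set))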
Code example #9
# Build corpus for specific problem set
problem = 'problemA'
problem_root = nltk.data.find('corpora/AAAC/%s' % (problem))
problem_files = PlaintextCorpusReader(problem_root, '.*\.txt')


# Categorize corpus by author
auth_map = {}
for filename in problem_files.fileids():
	a_n =  filename[:3]
	auth_map[filename] =  [a_n]

# By the entire corpus
problem_cat = CategorizedPlaintextCorpusReader(problem_root, '.*\.txt', cat_map=auth_map)
documents = [(list(problem_cat.words(fileid)), category) 
				for category in problem_cat.categories() 
				for fileid in problem_cat.fileids(category)]
random.shuffle(documents)


# Word Frequency featureset
# Word freq accross corpus
all_words = nltk.FreqDist(words.lower() for words in problem_cat.words())
key_words = [w for w, _ in all_words.most_common(2000)]


# Compares whether a word from the keywords is in a document
def doc_features(doc):
    doc_words = set(doc)
    features = {}
    for word in key_words:
        features['contains({})'.format(word)] = word in doc_words
    return features
Code example #10
def errors_em(poem_set):
    errors = []
    for (fileid, category) in poem_set:
        poem = corpus_of_poems.words(fileids=[fileid])
        features = features_of_poem(poem)
        guess = classifier.classify(features)

        if guess != category:
            errors.append((category, guess, poem, features['emotions']))

    return errors


poem_set = []
for fileid in corpus_of_poems.fileids():
    for category in corpus_of_poems.categories(fileid):
        poem_set.append((fileid, category))

print(poem_set)

random.shuffle(poem_set)

feature_set = []
for (fileid, category) in poem_set:
    feature_cal = (features_of_poem(corpus_of_poems.words(fileids=[fileid])), category)
    feature_set.append(feature_cal)

train_set = feature_set[25:]

test_set = feature_set[:25]
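A sketch of tying the pieces above together (classifier is assumed to be the global referenced inside errors_em):

# Train on the larger slice, score the held-out poems, then inspect mistakes.
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

for category, guess, poem, emotions in errors_em(poem_set[:25]):
    print(category, guess, emotions)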
Code example #11
# Provide path to the custom corpora

mydir = '/Users/vasilis/Desktop/Lennon/lyrics_custom_corpus'

# Read data from our custom corpora

mr = CategorizedPlaintextCorpusReader(mydir,
                                      r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*')

# Clean lyrics from the English stop words.
stop = stopwords.words('english')

documents = [(list(mr.words(fileid)), category)
             for category in mr.categories()
             for fileid in mr.fileids(category)]

classifiers_dir = '/Users/vasilis/vxm773/Lennon/pickled_classifiers'

if os.path.exists(classifiers_dir):
    shutil.rmtree(classifiers_dir)
os.makedirs(classifiers_dir)

save_documents = open("pickled_classifiers/documents.pickle", "wb")
pickle.dump(documents, save_documents)
save_documents.close()

# Shuffle lyrics in order to avoid training only towards pos/neg lyrics.

random.shuffle(documents)
Code example #12
j = 0
for i in range(10):
    dataset = str(i + 1)
    #mydir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Classified News/Training'
    train_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Train+val'
    test_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Testing'
    #test_dir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/NA is negative old/Classified News/Criteria '+dataset+''

    preprocessed = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/data_2.p'

    train_Corpus = CategorizedPlaintextCorpusReader(train_dir,
                                                    r'(?!\.).*\.txt',
                                                    cat_pattern=r'(\w+)/*')

    train_documents = [(list(train_Corpus.words(fileid)), category)
                       for category in train_Corpus.categories()
                       for fileid in train_Corpus.fileids(category)]

    only_docs = [' '.join(doc[:1000]) for (doc, category) in train_documents]
    only_docs = [
        ' '.join(normalize_text(document, lemmatize=True, remove_stop=None))
        for document in only_docs
    ]

    #######################################################################################
    train_labels = [category for (doc, category) in train_documents]
    train_binary_labels = [1 if label == 'pos' else 0 for label in train_labels]

    #train_data, test_data, train_labels, test_labels = train_test_split(only_docs, binary_labels,test_size=.15)
    train_data = only_docs
    train_labels = train_binary_labels
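The excerpt stops before the model itself is fitted. A standalone sketch of one possible continuation, assuming scikit-learn is available (hinted at by the commented-out train_test_split); this is an illustration, not the original project's pipeline:

# Hypothetical continuation for one criterion's train_data / train_labels.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_data)
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, train_labels)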
Code example #13
from pprint import pprint
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader, stopwords
import logging

CORPUS_ROOT = "/Users/derek/Data/RADCAT/corpus"

if __name__ == "__main__":
    # For reports with category in the f/n abc_def+3.txt
    reports = CategorizedPlaintextCorpusReader(CORPUS_ROOT,
                                               '.*',
                                               cat_pattern=r'.*\+(.+)\.txt')

    logging.basicConfig(level=logging.DEBUG)
    logging.debug(reports.categories())

    toks = [
        w.lower() for w in reports.words()
        if w.isalpha() and w not in stopwords.words('english')
    ]

    all_text = nltk.Text(toks)
    all_text.concordance('hemodynamically')

    # Create your bi-grams and n-grams
    # bgs = nltk.bigrams(toks)
    tgs = nltk.ngrams(toks, 3)

    fdist = nltk.FreqDist(tgs)
    pprint(fdist.most_common(20))
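An optional extra step on the same filtered tokens (not in the original script): bigram collocations ranked by PMI.

    # Bigram collocations over the same token list (hedged addition).
    bcf = nltk.collocations.BigramCollocationFinder.from_words(toks)
    pprint(bcf.nbest(nltk.collocations.BigramAssocMeasures.pmi, 20))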