Code example #1
1
File: poems.py Project: breuckelen/verse
def create_vocabularies():
    poem_corpus = CategorizedPlaintextCorpusReader('./data', 'poems.*',
        cat_file='cats.txt')

    for emotion in base_emotions:
        words = poem_corpus.words(categories=[emotion])
        words = [w.lower() for w in words if w.isalpha() and w not in stopwords.words('english')]
        fdist = nltk.FreqDist(words)
        vocabulary = [word for word, _ in fdist.most_common(200)]

        vocab_file = open('./opinion-lexicon-English/%s-words.txt' % emotion, 'w')
        vocab_file.write('\n'.join(vocabulary))
        vocab_file.close()
Code example #2
0
File: nltk_classifier.py Project: RomRuben/TFM
def nltk_classifier():
    #### FOR TRAINING DATA ####
    stop = stopwords.words('spanish')

    # Reads the training data.
    traindir = '/Users/ruben/Desktop/Formularios_clasificados/training'
    mr = CategorizedPlaintextCorpusReader(traindir, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*', encoding='utf-8')

    # Converts training data into tuples of [(words,label), ...]
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w not in string.punctuation], i.split('/')[0]) for i
                 in mr.fileids()]
    # Extract training features.
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = [w for w, _ in word_features.most_common(100)]
    # Assuming that you're using full data set
    # since your test set is different.
    train_set = [({i: (i in tokens) for i in word_features}, tag) for tokens, tag in documents]

    #### TRAINS THE TAGGER ####
    # Train the tagger
    classifier = NaiveBayesClassifier.train(train_set)

    #### FOR TESTING DATA ####
    # Now do the same reading and processing for the testing data.
    testdir = '/Users/ruben/Desktop/Formularios_clasificados/testing'
    mr_test = CategorizedPlaintextCorpusReader(testdir, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*', encoding='utf-8')
    # Converts testing data into tuples of [(words,label), ...]
    test_documents = [
        ([w for w in mr_test.words(i) if w.lower() not in stop and w not in string.punctuation], i.split('/')[0]) for i in
        mr_test.fileids()]
    # Reads test data into features:
    test_set = [({i: (i in tokens) for i in word_features}, tag) for tokens, tag in test_documents]

    correct = 0
    wrong = 0
    #### Evaluate the classifier ####
    for doc, gold_label in test_set:
        tagged_label = classifier.classify(doc)
        if tagged_label == gold_label:
            correct += 1
        else:
            wrong += 1

    print(correct, wrong, correct / (correct + wrong))
Code example #3
0
def construct_model(corpusPath, modelPath):
    mr = CategorizedPlaintextCorpusReader(corpusPath, r'(?!\.).*\.txt',
                                           cat_pattern=r'(\w+)/.*', encoding='iso-8859-1')
    stop = stopwords.words('french')
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation],
                   i.split('/')[0]) for i in mr.fileids()]
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = list(word_features.keys())
    numtrain = int(len(documents) * 100 / 100)
    train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens, tag in documents[:numtrain]]
    """test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens, tag  in documents[numtrain:]]"""
    classifier = nbc.train(train_set)
    mrtest = CategorizedPlaintextCorpusReader(os.path.abspath("corpus_test"), r'(?!\.).*\.txt', cat_pattern=r'(\w+)/.*', encoding='iso-8859-1')
    documentsTest = [([w for w in mrtest.words(i) if w.lower() not in stop and w.lower() 
                   not in string.punctuation],
                   i.split('/')[0]) for i in mrtest.fileids()]
    word_features_test = FreqDist(chain(*[i for i, j in documentsTest]))
    word_features_test = list(word_features_test.keys())
    numtrain_test = int(len(documentsTest) * 100 / 100)
    test_set = [({i:(i in tokens) for i in word_features_test}, tag) for tokens, tag  in documentsTest[:numtrain_test]]
    save_classifier(classifier, modelPath)
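The save_classifier helper called above is not part of this snippet; a minimal sketch, assuming it simply pickles the trained model to modelPath (the same approach code example #4 takes), could look like this:

import pickle

def save_classifier(classifier, modelPath):
    # Hypothetical helper: serialize the trained NLTK classifier to disk.
    with open(modelPath, 'wb') as f:
        pickle.dump(classifier, f)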
Code example #4
0
def display_features(num_features=1000,
                     show_features=200,
                     filepath='classifiers/nltk_nb.pkl',
                     verbose=True):
    '''
    Displays informative features from NHLCorpus
    '''
    stop_words = set(stopwords.words('english'))
    nhl = CategorizedPlaintextCorpusReader(root='data/NHLcorpus/',
                                           fileids=r'.*\.txt',
                                           cat_pattern='(\w+)/*')
    documents = []
    for category in nhl.categories():
        for fileid in nhl.fileids(category):
            documents.append(([
                re.sub(r'\W+', '', w.lower()) for w in nhl.words(fileid)
                if w.lower() not in stop_words
            ], category))
    all_words = nltk.FreqDist(
        re.sub(r'\W+', '', w.lower()) for w in nhl.words()
        if w.lower() not in stop_words)
    word_features = [w[0] for w in all_words.most_common(num_features)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains({})'.format(word)] = word in document_words
        return features

    featuresets = [(document_features(d), c) for (d, c) in documents]
    nb_clf = nltk.NaiveBayesClassifier.train(featuresets)
    if verbose:
        nb_clf.show_most_informative_features(show_features)
        print('Accuracy on training data: {}'.format(
            nltk.classify.accuracy(nb_clf, featuresets)))

    save_classifier = open(filepath, 'wb')
    pickle.dump(nb_clf, save_classifier)
    save_classifier.close()
Code example #5
0
    def init_documents(f_re, cat_re):
        logging.debug("Reading corpus")
        reports = CategorizedPlaintextCorpusReader(corpus_dir,
                                                   f_re,
                                                   cat_pattern=cat_re,
                                                   encoding='utf8')
        logging.debug("Found {} fileids".format(len(reports.fileids())))
        logging.debug("Found categories: {}".format(reports.categories()))
        logging.debug("Building docs")

        documents = [
            (tokenize(reports.words(i)), reports.categories(i)[0])
              for i in reports.fileids()]
        return documents
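The tokenize helper used above is not shown in this snippet; a minimal sketch, assuming it lowercases the corpus words and drops non-alphabetic tokens and English stopwords (as the other examples in this section do), might be:

from nltk.corpus import stopwords

def tokenize(words):
    # Hypothetical helper: lowercase, keep alphabetic tokens, drop English stopwords.
    stop = set(stopwords.words('english'))
    return [w.lower() for w in words if w.isalpha() and w.lower() not in stop]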
Code example #6
0
File: paypal_testi.py Project: fabiopuddu77/Paypal
    def __init__(self, dir, doc):
        self.doc = doc
        self.dir = dir
        self.eng_stopw = stopwords.words('english')


        text_corpus = CategorizedPlaintextCorpusReader(
            './%s/' % self.dir,
            r'.*\.csv',  # read only the files that end with .csv
            cat_pattern=r'(\w+)/*',  # take everything that comes after the directory
            encoding='latin-1'
        )

        self.text = nltk.Text(text_corpus.words(self.doc))
Code example #7
0
File: core.py Project: Christoph/newsComp-server
    def __create_corpus(self, language, chars):
        """Create a categorized nltk.corpus from data/* where the subfolders are the different categories.

        :chars: List of chars which will be additionally to stopwords  removed before the statistical analysis
        :language: The newspaper language as string
        :returns: nltk.corpus, list(all normalized words)

        """

        # Create corpus from data directory
        news_corpus = CategorizedPlaintextCorpusReader('data/', r'.*\.txt', cat_pattern=r'(\w+)/*')

        # Get the stopwords for the language plus the additional chars for removal
        g_stop = stopwords.words(language)
        g_stop.extend(chars)

        # Stemmer
        snow = nlp.stem.SnowballStemmer(language, ignore_stopwords=True)

        # Dict of all words/category
        cat = news_corpus.categories()
        total_words = {}

        for news in cat:
            #Get the words
            words = news_corpus.words(categories=news)

            # Lowercase and remove stopwords
            words = [w.lower() for w in words if w.lower() not in g_stop]

            # Stem all tokens
            words = [snow.stem(w) for w in words]

            total_words.update({news: words})

        return news_corpus, total_words
Code example #8
0
def classify_emails():
    stop_words = set(stopwords.words("english"))

    lemmatizer = WordNetLemmatizer()

    mydir = '/home/ubuntu/nltk_data/corpora/gmail'

    all_words = []
    filtered_words = []
    removedPuncuations_words = []
    lematized_words = []
    test_filter = []

    mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt', cat_pattern=r'(hotel|flight|other)/.*', encoding='latin-1')
    stop = stopwords.words('english')
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

    word_features = FreqDist(chain(*[i for i,j in documents]))
    word_features = [w for w, _ in word_features.most_common(100)]

    def word_feats(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)

        return dict(features)

    negids = mr.fileids('hotel')
    posids = mr.fileids('flight')
    neutralids = mr.fileids('other')

    negfeats = [(word_feats(mr.words(fileids=[f])), 'hotel') for f in negids]
    posfeats = [(word_feats(mr.words(fileids=[f])), 'flight') for f in posids]
    neutralfeats = [(word_feats(mr.words(fileids=[f])), 'other') for f in neutralids]

    negcutoff = len(negfeats)*3//4
    poscutoff = len(posfeats)*3//4
    neutralcutoff = len(neutralfeats)*3//4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] + neutralfeats[:neutralcutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] + neutralfeats[neutralcutoff:]

    classifier = nltk.NaiveBayesClassifier.train(trainfeats)
    print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testfeats))*100)

    print ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats)*100)


    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)

    test_sent_features = {word.lower(): (word in tokens) for word in mr.words()}
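    # Hypothetical usage sketch: apply the trained classifier to this email,
    # reusing word_feats above so the feature keys match the training features.
    predicted_label = classifier.classify(word_feats(tokens))
    print("Predicted category:", predicted_label)  # 'hotel', 'flight' or 'other'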

    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)
    tri_tokens = trigrams(tokens)

    cities = []
    matchedIndex = []
    tokenized = []
    addresses = []
    district = ['Akarawita','Angamuwa','Avissawella','Batawala','Battaramulla','Batugampola','Bope','Boralesgamuwa','Borella','Dedigamuwa','Dehiwala','Deltara','Habarakada','Handapangoda','Hanwella','Hewainna','Hiripitya','Hokandara','Homagama','Horagala','Kaduwela','Kahawala','Kalatuwawa','Madapatha','Maharagama','Malabe','Meegoda','Padukka','Pannipitiya','Piliyandala','Pitipana','Homagama','Polgasowita','Puwakpitiya','Ranala','Siddamulla','Slave Island','Sri Jayawardenapura','Talawatugoda','Tummodara','Waga','Watareka','Dickwella']

    for i in tokens:
        tokenized.append(i)

    pattern = re.compile("\d+")
    for i in tokenized:
        if pattern.match(i):
            matchedIndex.append(tokenized.index(i))
            print ("match"+i)
            print (tokenized.index(i))

        else:
            print ("not match")

    for t in tokenized:
        for i in district:
            if t.lower()==i.lower():
                cities.append(tokenized.index(t))

    distance= 200
    start = 0
    end = 0

    for t in cities:
        for i in matchedIndex:
            dis = t - i
            if (dis<=distance and dis>0):
                distance=dis
                start=t
                end=i
            else:
                print ("higher")

    address = ""

    for token in range(end,start+1):
        address+=tokenized[(token)]
        print (address)
        addresses.append(address)

    for address in addresses:
        try:
            search = geocoder.get(address)
        except ValueError:
            continue
        first_result = search[0]

    output =  [first_result.geometry.location.lat,first_result.geometry.location.lng]


    stri = ','.join(map(str, output))
    return stri
Code example #9
0
'''
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

# working dir: UN/
mydir = 'corpus/meeting_records_final_categorized'

mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt', cat_pattern=r'(intervention|soft_action)/.*', encoding='utf-8')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i,j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag in documents[:numtrain]]
test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag  in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))  # .87 - ?!?!?!
classifier.show_most_informative_features(20)

# for word_features.keys()[:100]
'''
Most Informative Features
Code example #10
0
from pprint import pprint
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader, stopwords
import logging

CORPUS_ROOT = "/Users/derek/Data/RADCAT/corpus"

if __name__ == "__main__":
    # For reports with category in the f/n abc_def+3.txt
    reports = CategorizedPlaintextCorpusReader(CORPUS_ROOT,
                                               '.*',
                                               cat_pattern=r'.*\+(.+)\.txt')

    logging.basicConfig(level=logging.DEBUG)
    logging.debug(reports.categories())

    toks = [
        w.lower() for w in reports.words()
        if w.isalpha() and w not in stopwords.words('english')
    ]

    all_text = nltk.Text(toks)
    all_text.concordance('hemodynamically')

    # Create your bi-grams and n-grams
    # bgs = nltk.bigrams(toks)
    tgs = nltk.ngrams(toks, 3)

    fdist = nltk.FreqDist(tgs)
    pprint(fdist.most_common(20))
Code example #11
0
File: classify.py Project: breuckelen/verse
pos_file.close()
neg_file.close()


# Words for all emotions
lexicon = {}
for emotion in base_emotions:
    f = open('./opinion-lexicon-English/%s-words.txt' % emotion, 'r')
    words = [word.strip() for word in f.readlines()]
    lexicon[emotion] = words
    f.close()

# Make a classifier based on the feature sets of the poems
poem_corpus = CategorizedPlaintextCorpusReader('./data', 'poems.*',
        cat_file='cats.txt')

poem_set = [(fileid, category) for fileid in poem_corpus.fileids() \
        for category in poem_corpus.categories(fileid)]
random.shuffle(poem_set)

feature_set = [(poem_features(poem_corpus.words(fileids=[fileid])),
        category) for (fileid, category) in poem_set]

train_set, test_set = feature_set[2000:], feature_set[:2000]

# Initialize the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# For improving the algorithm
classifier.show_most_informative_features(20)
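The poem_features function comes from elsewhere in classify.py; a minimal sketch, assuming it counts how many words from each emotion lexicon (loaded into lexicon above) occur in a poem, could be:

def poem_features(words):
    # Hypothetical feature extractor: one overlap count per base emotion,
    # using the opinion-lexicon word lists loaded above.
    words = set(w.lower() for w in words)
    features = {}
    for emotion, emotion_words in lexicon.items():
        features['count(%s)' % emotion] = len(words & set(emotion_words))
    return features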
Code example #12
0
# GET RAW TEXT COMMENT given fileid
# corpus.raw([fileid])  #  my_corpus.raw(my_corpus.fileids()[2])) # prints raw text of file index 2 of whole corpus#

# GET list of TOKENIZED SENTS for a COMMENT via index or fileid:
# sents = corpus.sents(corpus.fileids()[index])
# sents = corpus.sents([fileid])
"""
GET TOKENIZED PARAGRAPHS
para = corpus.paras([fileid])
comment
"""
"""
GET TOKENIZED COMMENT
para = corpus.paras([fileid])
comment
"""

# ITERATE OVER FILEIDS
for fileid in corpus.fileids()[22:23]:
    print(fileid)
    print(type(fileid))
    print(len(corpus.raw(fileid)))
    print(corpus.raw(fileid))

    #sents = get_raw_sentences(fileid)
    sents = get_raw_paragraph(fileid)
    # print("SENT:  " + "\nSENT:  ".join(sents))
    words = corpus.words(fileid)
    print(words)
Code example #13
0
File: Oving4TK.py Project: vhellem/Plab
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'

mr = CategorizedPlaintextCorpusReader(mydir,
                                      r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*',
                                      encoding='ascii')
stop = stopwords.words('english')
documents = [([
    w for w in mr.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i, j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens)
               for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens)
              for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]

classifier = nbc.train(train_set)
Code example #14
0
# Rodrigo Renie de Braga Pinto
# TEXT ANALYSIS(Apostila)Parte 1.docx
# Exercise 1
# Do what is asked.
# Print the words of the documents neg/cv002_tok-3321.txt and
# pos/cv003_tok-8338.txt

from nltk.corpus import CategorizedPlaintextCorpusReader

corpus_reader = CategorizedPlaintextCorpusReader(
    'dados/mix20_rand700_tokens_cleaned/tokens/',
    '.*.txt',
    cat_pattern=r'(\w+)/*')

words = {'neg/cv002_tok-3321.txt': [], 'pos/cv003_tok-8338.txt': []}

for file in words:
    words[file] = corpus_reader.words(fileids=file)
    print('Palavras no arquivo {}: {}'.format(file, words[file]))
Code example #15
0
corpus_root = "./files/"
cat_root = "../categories/"

# Hacky way to specify path for cat.txt. A better way would be to rewrite regex '.*\.txt'...
corpus = CategorizedPlaintextCorpusReader(corpus_root, '.*\.txt', cat_file=cat_root+'cat.txt', cat_delimiter='+')

# get all categories
cats = corpus.categories()
print(cats)

# access corpus
raw = corpus.raw()

# access words, normal and for a category
words = corpus.words()
words_pop = corpus.words(categories="POP")
words_rock = corpus.words(categories="ROCK")

# access sents, normal and for a category
sents = corpus.sents()
sents_pop = corpus.sents(categories="POP")
sents_rock = corpus.sents(categories="ROCK")

# make lists
word_list = list(words)
sents_list = list(sents)

pop_word_list = list(words_pop)
pop_sents_list = list(sents_pop)
Code example #16
0
def loadCorpus(category = None) :

    corpus_root = "../corpus/lyric_corpus/files/"
    cat_root = "../categories/"

    if not os.name == 'posix':
        corpus_root = "..\\corpus\\lyric_corpus\\files\\"
    # load the corpus

    # corpus = PlaintextCorpusReader(corpus_root, '.*\.txt')
    corpus = CategorizedPlaintextCorpusReader(corpus_root, '.*\.txt', cat_file=cat_root+'cat.txt', cat_delimiter='+')
    # print files in corpus
    # for file in corpus.fileids():
    # print(file)
    # access corpus

    raw = corpus.raw()
    words = corpus.words()
    # print (category)
    if category is None:
        sents = corpus.sents()
    else:
        sents = corpus.sents(categories=category)
    # sents_pop = corpus.sents(categories="POP")
    # sents_rock = corpus.sents(categories="ROCK")

    shuffledSents = shuffleSent(sents)


    numberSents = len(shuffledSents)
    trainSize = math.floor(numberSents*0.8)
    testSize = len(shuffledSents) - trainSize
    # testSize = math.floor(numberSents*0.1)
    # devSize = len(shuffledSents)-trainSize - testSize

    trainCorpus = []
    testCorpus = []
    # devCorpus = []
    wholeCorpus = []
    testSents = []

    for i in range(numberSents):
        if(i < trainSize):
            for word in shuffledSents[i]:
                trainCorpus.append(word)
                wholeCorpus.append(word)
        # elif(i < (trainSize + testSize)):
        #     for word in shuffledSents[i]:
        #         testCorpus.append(word)
        #         wholeCorpus.append(word)
        else:
            testSents.append(shuffledSents[i])
            for word in shuffledSents[i]:
                testCorpus.append(word)
                wholeCorpus.append(word)



    # testCorpus = []
    # trainCorpus = list(words)
    # for i in range(testSize):
    #     seed = random.randrange(0,numberSents - i)
    #     testCorpus.append(trainCorpus.pop(seed))

    return wholeCorpus, trainCorpus, testCorpus, testSents
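shuffleSent is not defined in this snippet; a minimal sketch, assuming it simply returns a shuffled copy of the sentence list, might be:

import random

def shuffleSent(sents):
    # Hypothetical helper: shuffle a copy so the corpus reader's list stays untouched.
    shuffled = list(sents)
    random.shuffle(shuffled)
    return shuffled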
Code example #17
0
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk import bigrams
from nltk import trigrams
from nltk.collocations import *
import nltk

corpus_root = "../corpus/lyric_corpus/files/"
cat_root = "../categories/"

# Hacky way to specify path for cat.txt. A better way would be to rewrite regex '.*\.txt'...
corpus = CategorizedPlaintextCorpusReader(corpus_root, '.*\.txt', cat_file=cat_root+'cat.txt', cat_delimiter='+')

# word lists
word_list_pop = list(corpus.words(categories="POP"))
word_list_rock = list(corpus.words(categories="ROCK"))

# bigram lists
bigram_list_pop = list(bigrams(word_list_pop))
bigram_list_rock = list(bigrams(word_list_rock))

# trigram lists
trigram_list_pop = list(trigrams(word_list_pop))
trigram_list_rock = list(trigrams(word_list_rock))

# measures
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# finders default window size is 2
bi_finder_pop = BigramCollocationFinder.from_words(word_list_pop)
bi_finder_rock = BigramCollocationFinder.from_words(word_list_rock)
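The finders above are created but not queried in this excerpt; as a short usage sketch, the standard NLTK collocation API can rank the strongest bigrams per genre, for example:

# Drop rare pairs, then rank bigrams by PMI for each genre.
bi_finder_pop.apply_freq_filter(3)
bi_finder_rock.apply_freq_filter(3)
print(bi_finder_pop.nbest(bigram_measures.pmi, 10))
print(bi_finder_rock.nbest(bigram_measures.pmi, 10))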
Code example #18
0

# Provide path to the custom corpora

mydir = '/Users/vasilis/Desktop/Lennon/lyrics_custom_corpus'

# Read data from our custom corpora

mr = CategorizedPlaintextCorpusReader(mydir,
                                      r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*')

# Clean lyrics from the English stop words.
stop = stopwords.words('english')

documents = [(list(mr.words(fileid)), category)
             for category in mr.categories()
             for fileid in mr.fileids(category)]

classifiers_dir = '/Users/vasilis/vxm773/Lennon/pickled_classifiers'

if os.path.exists(classifiers_dir):
    shutil.rmtree(classifiers_dir)
os.makedirs(classifiers_dir)

save_documents = open("pickled_classifiers/documents.pickle", "wb")
pickle.dump(documents, save_documents)
save_documents.close()

# Shuffle lyrics in order to avoid training only towards pos/neg lyrics.
Code example #19
0
j = 0
for i in range(10):
    dataset = str(i + 1)
    #mydir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Classified News/Training'
    train_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Train+val'
    test_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Testing'
    #test_dir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/NA is negative old/Classified News/Criteria '+dataset+''

    preprocessed = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/data_2.p'

    train_Corpus = CategorizedPlaintextCorpusReader(train_dir,
                                                    r'(?!\.).*\.txt',
                                                    cat_pattern=r'(\w+)/*')

    train_documents = [(list(train_Corpus.words(fileid)), category)
                       for category in train_Corpus.categories()
                       for fileid in train_Corpus.fileids(category)]

    only_docs = [' '.join(doc[:1000]) for (doc, category) in train_documents]
    only_docs = [
        ' '.join(normalize_text(document, lemmatize=True, remove_stop=None))
        for document in only_docs
    ]

    #######################################################################################
    train_labels = [category for (doc, category) in train_documents]
    train_binary_labels = [1 if i == 'pos' else 0 for i in train_labels]

    #train_data, test_data, train_labels, test_labels = train_test_split(only_docs, binary_labels,test_size=.15)
    train_data = only_docs
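normalize_text is not shown here; a rough sketch, assuming it tokenizes, lowercases, optionally lemmatizes, and optionally removes English stopwords, returning a token list (the caller re-joins it with ' '.join):

from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

_lemmatizer = WordNetLemmatizer()

def normalize_text(document, lemmatize=True, remove_stop=None):
    # Hypothetical normalizer: tokenize, lowercase, keep alphabetic tokens,
    # optionally drop English stopwords, optionally lemmatize.
    tokens = [t.lower() for t in word_tokenize(document) if t.isalpha()]
    if remove_stop:
        stop = set(stopwords.words('english'))
        tokens = [t for t in tokens if t not in stop]
    if lemmatize:
        tokens = [_lemmatizer.lemmatize(t) for t in tokens]
    return tokens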
Code example #20
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 11 08:43:52 2021

@author: paulogamero
"""
# ACTIVITY: EXERCISE 1 - PART 01
# AUTHOR: Paulo Gamero

from nltk.corpus import CategorizedPlaintextCorpusReader

d = CategorizedPlaintextCorpusReader(
    r'C:\Users\Usuario\Dropbox\Pos\Pós DataScience\4 - Análise de textos com R e Python\Dados\mix20_rand700_tokens_cleaned\tokens',
    r'.*.txt', cat_pattern=r'(\w+)/*', encoding='iso8859-1')

for p in d.words('pos/cv003_tok-8338.txt'):
    print(p + ' ', end = '')

for n in d.words('neg/cv002_tok-3321.txt'):
    print(n + ' ', end = '')
Code example #21
0
File: aula1.py Project: rodrigorenie/datascience
# Split the corpus according to the categories
posFiles = leitor.fileids(categories='pos')
negFiles = leitor.fileids(categories='neg')
print('Arquivos pos:', posFiles)
print('Arquivos neg:', negFiles)

# Load the first files from the categories
arqP = posFiles[0]
arqN = negFiles[1]

print("ArqP: ", arqP)
print("ArqN: ", arqN)

# Print the words of the files
print('Palavras nos arquivos selecionados')
for p in leitor.words(arqP):
    print(p + ' ', end='')

print('---')

for p in leitor.words(arqN):
    print(p + ' ', end='')

#
#
#
#

print(brown.categories())

# Freely select three categories
Code example #22
0
class CorpusUtil(object):
    """Documentar
    """
    def __init__(self, raiz_corpus):
        """Cria um objeto do tipo 'CategorizedPlaintextCorpusReader',
        utilizando o diretório raiz do corpus, onde os documentos
        estão localizados, dispostos em seus respectivos subdiretórios,
        de acordo com sua categoria, sejam eles/elas quais for
        
        -->     raiz_corpus/{pos,neg,neu,...}.
        """
        reload(sys)
        sys.setdefaultencoding("utf-8")
        
        self._raiz_corpus = raiz_corpus
        self._corpus = CategorizedPlaintextCorpusReader(raiz_corpus, r'.+\.txt', cat_pattern=r'(\w+)/*',
                                                        encoding='utf-8')
        self._documentos = None
        self._palavras_frequentes = None
        self._todas_palavras = None
        self._featuresets = None
        self._train_set = None
        self._test_set = None

    def get_documentos(self):
        """Construimos uma lista de documentos, rotulados com as
        categorias apropriadas. Cada documento é representado por
        uma tupla na estrutura abaixo:
        
        (conteudo_do_documento, categoria)
        
        Retorna essa lista com todos os documentos do corpus.
        """
        """
        documentos = [(self.corpus.words(fileid), categoria)
                       for categoria in self.corpus.categories()
                       for fileid in self.corpus.fileids(categoria)]
        """
        print "-- Recuperando documentos do corpus."

        if self._documentos is None:            
            self._documentos = [Documento(" ".join(self._corpus.words(fileid)), categoria, self, fileid)
                                for categoria in self._corpus.categories()
                                for fileid in self._corpus.fileids(categoria)]

        # Shuffle the documents
        for i in range(0, 10):
            shuffle(self._documentos)

        return self._documentos

    def get_palavras_frequentes(self):
        """Documentar.
        """
        if self._palavras_frequentes is None:

            print "-- Verificando as palavras mais frequentes do corpus."

            # Test - return only the 2000 most frequent words in the corpus
            todas_palavras = [word.lower() for word in self._corpus.words()]
            freq_dist_palavras = FreqDist(todas_palavras)
            frequencia_palavras = freq_dist_palavras.most_common(2000)  # 2000 most frequent words
            
            self._palavras_frequentes = [palavra for palavra, frequencia in frequencia_palavras]
            
            # all_words = FreqDist(word.lower() for word in self.corpus.words())
            # self.word_features = list(all_words)[:2000]
        return self._palavras_frequentes

    def get_todas_palavras(self):
        if self._todas_palavras is None:
            print "-- Recuperando todas as palavras do corpus."
            self._todas_palavras = [word.lower() for word in self._corpus.words()]
            self._todas_palavras = set(self._todas_palavras)

        return self._todas_palavras

    def get_featuresets(self):
        """Configura os featuresets que são construídos na
        seguinte estrutura:
            (features_do_documento, categoria)
        
        Retorna uma lista de featuresets
        """
        if self._featuresets is None:
            
            if self._documentos is None:
                self.get_documentos()

            print "-- Recuperando featuresets."

            self._featuresets = apply_features(Documento.get_features, self._documentos)
        
        return self._featuresets

    def get_train_set(self):
        """Documentar
        """
        if self._featuresets is None:
            self.get_featuresets()

        print "-- Recuperando train_set."

        # To avoid taking up all the RAM,
        # do not store all the documents in memory at once.
        # self._train_set = apply_features(Documento.get_features, self._documentos[100:])
        self._train_set = apply_features(Documento.get_features, self._documentos)

        return self._train_set

    def get_test_set(self):
        if self._featuresets is None:
            self.get_featuresets()

        print "-- Recuperando test_set."

        # self._test_set = apply_features(Documento.get_features, self._documentos[:100])

        return self._test_set

    def gravar_palavras_frequentes(self):
        diretorio_destino = "/home/lucas/Documents/mineracao_opiniao/palavras_frequentes_corpus"
        molde_nome_arquivo = "palavras_frequentes_%s.pickle"

        tempo_agora = str(datetime.now())
        # Replace ':' and whitespace with '.'
        tempo_agora = re.sub(ur':|\s', '.', tempo_agora)
        nome_arquivo = molde_nome_arquivo % tempo_agora

        if self._palavras_frequentes is None:
            self.get_palavras_frequentes()

        f = open(diretorio_destino + "/" + nome_arquivo, 'wb')
        pickle.dump(self._palavras_frequentes, f)
        f.close()

        return True

    @staticmethod
    def abrir_arquivo_palavras_frequentes(arquivo_path):
        f = open(arquivo_path, 'rb')
        palavras_frequentes = pickle.load(f)
        f.close()

        return palavras_frequentes
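The Documento class lives in another module; a minimal sketch of the interface CorpusUtil relies on (the constructor arguments used in get_documentos and the get_features hook passed to apply_features), assuming the features are simple containment flags over the most frequent corpus words:

class Documento(object):
    """Hypothetical sketch of the Documento interface used by CorpusUtil."""

    def __init__(self, conteudo, categoria, corpus_util, fileid):
        self.conteudo = conteudo
        self.categoria = categoria
        self.corpus_util = corpus_util
        self.fileid = fileid

    @staticmethod
    def get_features(documento):
        # apply_features calls this for each document; returning a
        # (featureset, label) pair is what NaiveBayesClassifier.train expects.
        palavras_doc = set(documento.conteudo.lower().split())
        features = {}
        for palavra in documento.corpus_util.get_palavras_frequentes():
            features['contem(%s)' % palavra] = palavra in palavras_doc
        return features, documento.categoria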
Code example #23
0
# Build corpus for specific problem set
problem = 'problemA'
problem_root = nltk.data.find('corpora/AAAC/%s' % (problem))
problem_files = PlaintextCorpusReader(problem_root, '.*\.txt')


# Categorize corpus by author
auth_map = {}
for filename in problem_files.fileids():
	a_n =  filename[:3]
	auth_map[filename] =  [a_n]

# By the entire corpus
problem_cat = CategorizedPlaintextCorpusReader(problem_root, '.*\.txt', cat_map=auth_map)
documents = [(list(problem_cat.words(fileid)), category) 
				for category in problem_cat.categories() 
				for fileid in problem_cat.fileids(category)]
random.shuffle(documents)


# Word Frequency featureset
# Word freq accross corpus
all_words = nltk.FreqDist(words.lower() for words in problem_cat.words())
key_words = [w for w, _ in all_words.most_common(2000)]


# Compares whether a word from the keywords is in a document
def doc_features(doc):
	doc_words = set(doc)
	features = {}
Code example #24
0
File: Oving4TK.py Project: vhellem/Plab
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'

mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*', encoding='ascii')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i,j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag in documents[:numtrain]]
test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag  in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
Code example #25
0
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
from pylab import *

import plotly.plotly as py
import plotly.graph_objs as go

corpus_root = "../corpus/lyric_corpus/files/"
cat_root = "../categories/"

corpus = CategorizedPlaintextCorpusReader(corpus_root, '.*\.txt', cat_file=cat_root+'cat.txt', cat_delimiter='+')
words = corpus.words()
#frequency distribution

popWords = corpus.words(categories="POP")
rockWords = corpus.words(categories="ROCK")

#print("-----All words-----")
fd = nltk.FreqDist(words)
ALL_FrequentWords = fd.most_common(104)
ALL_FrequentWords_50_100 = []
for i in range(54,104):
	ALL_FrequentWords_50_100.append(ALL_FrequentWords[i])
#print(ALL_FrequentWords)


#print("-----All POP words-----")
fd_POP = nltk.FreqDist(popWords)
POP_FrequentWords = fd_POP.most_common(60)
#print(fd1.most_common(60))
Code example #26
0
    print('Number of words: ' + str(len(crp.words())))


#corpus_Stats(data_m)
#print('\n'+'First file: '+ data_fileids[0])
#print('Last file: '+ data_fileids[-1])

#%%
num_para_py = defaultdict(int)
num_word_py = defaultdict(int)

for y in range(1983, 2013):
    files = data_m.fileids(str(y))
    files_size = len(files)
    num_para_py[y] += sum([len(data_m.paras(f)) for f in files]) / files_size
    num_word_py[y] += sum([len(data_m.words(f)) for f in files]) / files_size

para_words = pd.DataFrame(
    [num_para_py, num_word_py],
    index=['Average number of paragraphs', 'Average number of words']).T

# workaround for grouped bar charts
trace0 = go.Bar(x=para_words.index,
                y=para_words['Average number of paragraphs'],
                name='Average number of paragraphs ')
trace1 = go.Bar(x=para_words.index, y=[0], showlegend=False, hoverinfo='none')
trace2 = go.Bar(x=para_words.index,
                y=[0],
                yaxis='y2',
                showlegend=False,
                hoverinfo='none')
Code example #27
0
# NLTK Brown selection
word_list_brown = brown.words()
sents_list_brown = brown.sents()
vocabulary_brown = set(word_list_brown)
brown_len_words = len(word_list_brown)
brown_len_sents = len(sents_list_brown)
brown_len_vocab = len(vocabulary_brown)
brown_richness = lexical_diversity(word_list_brown)

# Lyric corpus
cats = corpus.categories()
print(len(cats))
print(cats)

num_files = len(corpus.fileids())
word_list = list(corpus.words())
sents_list = list(corpus.sents())
vocabulary = set(word_list)
total_len_words = len(word_list)
total_len_sents = len(sents_list)
total_len_vocab = len(vocabulary)
total_richness = lexical_diversity(word_list)

# POP
word_list_pop = list(corpus.words(categories="POP"))
sents_list_pop = list(corpus.sents(categories="POP"))
vocabulary_pop = set(word_list_pop)
pop_len_words = len(word_list_pop)
pop_len_sents = len(sents_list_pop)
pop_len_vocab = len(vocabulary_pop)
pop_richness = lexical_diversity(word_list_pop)
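lexical_diversity is not defined in this excerpt; the usual NLTK-book definition is the ratio of vocabulary size to total token count:

def lexical_diversity(text):
    # Type/token ratio: distinct words divided by total words.
    return len(set(text)) / len(text)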
Code example #28
0
#http://www.cs.cornell.edu/people/pabo/movie%2Dreview%2Ddata/

from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

reader = CategorizedPlaintextCorpusReader(
    r'mix20_rand700_tokens_cleaned/tokens', r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]

print(fileN)
print(fileP)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
Code example #29
0
mr_train = CategorizedPlaintextCorpusReader(
    mydir_train,
    r'(?!\.).*\.txt',
    cat_pattern=
    r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')
mr_test = CategorizedPlaintextCorpusReader(
    mydir_test,
    r'(?!\.).*\.txt',
    cat_pattern=
    r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')

stop = stopwords.words('english')

with open('.\\stopwords.txt') as f:
    stop = f.read().splitlines()

documents_train = [([
    w for w in mr_train.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr_train.fileids()
                   if os.path.getsize(os.path.join(mydir_train, i)) > 0]
documents_test = [([
    w for w in mr_test.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr_test.fileids()
                  if os.path.getsize(os.path.join(mydir_test, i)) > 0]

word_features_train = FreqDist(chain(*[i for i, j in documents_train]))
word_features_train = list(word_features_train.keys())[:1000]

word_features_test = FreqDist(chain(*[i for i, j in documents_test]))
word_features_test = list(word_features_test.keys())[:1000]