def create_vocabularies():
    poem_corpus = CategorizedPlaintextCorpusReader('./data', 'poems.*', cat_file='cats.txt')
    english_stopwords = set(stopwords.words('english'))
    for emotion in base_emotions:
        words = poem_corpus.words(categories=[emotion])
        words = [w.lower() for w in words if w.isalpha() and w not in english_stopwords]
        fdist = nltk.FreqDist(words)
        # FreqDist.keys() is not sliceable in Python 3; take the 200 most frequent words instead.
        vocabulary = [word for word, _ in fdist.most_common(200)]
        with open('./opinion-lexicon-English/%s-words.txt' % emotion, 'w') as vocab_file:
            vocab_file.write('\n'.join(vocabulary))
def nltk():  # NOTE: this name shadows the imported nltk module; consider renaming.
    #### FOR TRAINING DATA ####
    stop = stopwords.words('spanish')

    # Reads the training data.
    traindir = '/Users/ruben/Desktop/Formularios_clasificados/training'
    mr = CategorizedPlaintextCorpusReader(traindir, r'(?!\.).*\.txt',
                                          cat_pattern=r'(neg|pos)/.*', encoding='utf-8')

    # Converts training data into tuples of [(words, label), ...]
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w not in string.punctuation],
                  i.split('/')[0]) for i in mr.fileids()]

    # Extract training features: the 100 most frequent words across the corpus.
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = [w for w, _ in word_features.most_common(100)]

    # Assuming that you're using the full data set, since your test set is different.
    train_set = [({i: (i in tokens) for i in word_features}, tag) for tokens, tag in documents]

    #### TRAIN THE CLASSIFIER ####
    classifier = NaiveBayesClassifier.train(train_set)

    #### FOR TESTING DATA ####
    # Now do the same reading and processing for the testing data.
    testdir = '/Users/ruben/Desktop/Formularios_clasificados/testing'
    mr_test = CategorizedPlaintextCorpusReader(testdir, r'(?!\.).*\.txt',
                                               cat_pattern=r'(neg|pos)/.*', encoding='utf-8')

    # Converts testing data into tuples of [(words, label), ...]
    test_documents = [([w for w in mr_test.words(i) if w.lower() not in stop and w not in string.punctuation],
                       i.split('/')[0]) for i in mr_test.fileids()]

    # Turn test data into feature dicts (using the training vocabulary).
    test_set = [({i: (i in tokens) for i in word_features}, tag) for tokens, tag in test_documents]

    correct = 0
    wrong = 0

    #### Evaluate the classifier ####
    for doc, gold_label in test_set:
        tagged_label = classifier.classify(doc)
        if tagged_label == gold_label:
            correct += 1
        else:
            wrong += 1
    print(correct, wrong, float(correct) / (correct + wrong))
def construct_model(corpusPath, modelPath):
    # NOTE: the original cat_pattern r'*/.*' is not a valid regex; the subdirectory name is the category.
    mr = CategorizedPlaintextCorpusReader(corpusPath, r'(?!\.).*\.txt',
                                          cat_pattern=r'(\w+)/.*', encoding='iso-8859-1')
    stop = stopwords.words('french')
    documents = [([w for w in mr.words(i)
                   if w.lower() not in stop and w.lower() not in string.punctuation],
                  i.split('/')[0]) for i in mr.fileids()]
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = list(word_features.keys())
    numtrain = len(documents)  # train on the whole corpus
    train_set = [({i: (i in tokens) for i in word_features}, tag)
                 for tokens, tag in documents[:numtrain]]
    # test_set = [({i: (i in tokens) for i in word_features}, tag)
    #             for tokens, tag in documents[numtrain:]]
    classifier = nbc.train(train_set)

    mrtest = CategorizedPlaintextCorpusReader(os.path.abspath("corpus_test"), r'(?!\.).*\.txt',
                                              cat_pattern=r'(\w+)/.*', encoding='iso-8859-1')
    documentsTest = [([w for w in mrtest.words(i)
                       if w.lower() not in stop and w.lower() not in string.punctuation],
                      i.split('/')[0]) for i in mrtest.fileids()]
    word_features_test = FreqDist(chain(*[i for i, j in documentsTest]))
    word_features_test = list(word_features_test.keys())
    numtrain_test = len(documentsTest)
    test_set = [({i: (i in tokens) for i in word_features_test}, tag)
                for tokens, tag in documentsTest[:numtrain_test]]
    save_classifier(classifier, modelPath)
def display_features(num_features=1000, show_features=200,
                     filepath='classifiers/nltk_nb.pkl', verbose=True):
    '''Displays informative features from NHLCorpus'''
    stop_words = set(stopwords.words('english'))
    nhl = CategorizedPlaintextCorpusReader(root='data/NHLcorpus/',
                                           fileids=r'.*\.txt',
                                           cat_pattern=r'(\w+)/*')
    documents = []
    for category in nhl.categories():
        for fileid in nhl.fileids(category):
            documents.append(([re.sub(r'\W+', '', w.lower())
                               for w in nhl.words(fileid)
                               if w.lower() not in stop_words], category))

    all_words = nltk.FreqDist(re.sub(r'\W+', '', w.lower())
                              for w in nhl.words()
                              if w.lower() not in stop_words)
    word_features = [w[0] for w in all_words.most_common(num_features)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains({})'.format(word)] = word in document_words
        return features

    featuresets = [(document_features(d), c) for (d, c) in documents]
    nb_clf = nltk.NaiveBayesClassifier.train(featuresets)

    if verbose:
        nb_clf.show_most_informative_features(show_features)
        print('Accuracy on training data: {}'.format(
            nltk.classify.accuracy(nb_clf, featuresets)))

    with open(filepath, 'wb') as save_classifier:
        pickle.dump(nb_clf, save_classifier)
def init_documents(f_re, cat_re):
    logging.debug("Reading corpus")
    reports = CategorizedPlaintextCorpusReader(corpus_dir, f_re, cat_pattern=cat_re, encoding='utf8')
    logging.debug("Found {} fileids".format(len(reports.fileids())))
    logging.debug("Found categories: {}".format(reports.categories()))
    logging.debug("Building docs")
    documents = [(tokenize(reports.words(i)), reports.categories(i)[0])
                 for i in reports.fileids()]
    return documents
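# A hypothetical usage sketch (not from the original project): assumes corpus_dir
# points at a directory whose subfolders name the categories and that tokenize()
# is defined elsewhere; the regexes below are illustrative only.
docs = init_documents(f_re=r'(?!\.).*\.txt', cat_re=r'(\w+)/.*')
print(len(docs), docs[0][1])  # document count and the label of the first document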
def __init__(self, dir, doc):
    self.doc = doc
    self.dir = dir
    self.eng_stopw = stopwords.words('english')
    text_corpus = CategorizedPlaintextCorpusReader(
        './%s/' % self.dir,
        r'.*\.csv',              # only read files ending in .csv
        cat_pattern=r'(\w+)/*',  # take everything that comes after the directory
        encoding='latin-1'
    )
    self.text = nltk.Text(text_corpus.words(self.doc))
def __create_corpus(self, language, chars):
    """Create a categorized nltk.corpus from data/* where the subfolders
    are the different categories.

    :chars: List of chars which will be removed, in addition to stopwords,
            before the statistical analysis
    :language: The newspaper language as string
    :returns: nltk.corpus, dict of all normalized words per category
    """
    # Create corpus from data directory
    news_corpus = CategorizedPlaintextCorpusReader('data/', r'.*\.txt',
                                                   cat_pattern=r'(\w+)/*')
    # Get the stopwords for the language plus additional chars for removal
    g_stop = stopwords.words(language)
    g_stop.extend(chars)

    # Stemmer
    snow = nlp.stem.SnowballStemmer(language, ignore_stopwords=True)

    # Dict of all words per category
    cat = news_corpus.categories()
    total_words = {}
    for news in cat:
        # Get the words
        words = news_corpus.words(categories=news)
        # Remove stopwords and lowercase
        words = [w.lower() for w in words if w not in g_stop]
        # Stem all tokens
        words = [snow.stem(w) for w in words]
        total_words.update({news: words})
    return news_corpus, total_words
def classify_emails():
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    mydir = '/home/ubuntu/nltk_data/corpora/gmail'
    all_words = []
    filtered_words = []
    removedPuncuations_words = []
    lematized_words = []
    test_filter = []

    mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                          cat_pattern=r'(hotel|flight|other)/.*',
                                          encoding='latin-1')
    stop = stopwords.words('english')
    documents = [([w for w in mr.words(i)
                   if w.lower() not in stop and w.lower() not in string.punctuation],
                  i.split('/')[0]) for i in mr.fileids()]

    # Use the 100 most frequent words in the corpus as features.
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = [w for w, _ in word_features.most_common(100)]

    def word_feats(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return dict(features)

    negids = mr.fileids('hotel')
    posids = mr.fileids('flight')
    neutralids = mr.fileids('other')

    negfeats = [(word_feats(mr.words(fileids=[f])), 'hotel') for f in negids]
    posfeats = [(word_feats(mr.words(fileids=[f])), 'flight') for f in posids]
    neutralfeats = [(word_feats(mr.words(fileids=[f])), 'other') for f in neutralids]

    # Use three quarters of each category for training, the rest for testing.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    neutralcutoff = len(neutralfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] + neutralfeats[:neutralcutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] + neutralfeats[neutralcutoff:]

    classifier = nltk.NaiveBayesClassifier.train(trainfeats)
    print("Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testfeats)) * 100)
    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats) * 100)

    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)
    test_sent_features = {word.lower(): (word in tokens) for word in mr.words()}
    tri_tokens = trigrams(tokens)

    cities = []
    matchedIndex = []
    tokenized = []
    addresses = []
    district = ['Akarawita', 'Angamuwa', 'Avissawella', 'Batawala', 'Battaramulla',
                'Batugampola', 'Bope', 'Boralesgamuwa', 'Borella', 'Dedigamuwa',
                'Dehiwala', 'Deltara', 'Habarakada', 'Handapangoda', 'Hanwella',
                'Hewainna', 'Hiripitya', 'Hokandara', 'Homagama', 'Horagala',
                'Kaduwela', 'Kahawala', 'Kalatuwawa', 'Madapatha', 'Maharagama',
                'Malabe', 'Meegoda', 'Padukka', 'Pannipitiya', 'Piliyandala',
                'Pitipana', 'Homagama', 'Polgasowita', 'Puwakpitiya', 'Ranala',
                'Siddamulla', 'Slave Island', 'Sri Jayawardenapura', 'Talawatugoda',
                'Tummodara', 'Waga', 'Watareka', 'Dickwella']

    for i in tokens:
        tokenized.append(i)

    # Indices of numeric tokens (possible house numbers).
    pattern = re.compile(r"\d+")
    for i in tokenized:
        if pattern.match(i):
            matchedIndex.append(tokenized.index(i))
            print("match" + i)
            print(tokenized.index(i))
        else:
            print("not match")

    # Indices of known district names.
    for t in tokenized:
        for i in district:
            if t.lower() == i.lower():
                cities.append(tokenized.index(t))

    # Pick the number/district pair that lies closest together (within 200 tokens).
    distance = 200
    start = 0
    end = 0
    for t in cities:
        for i in matchedIndex:
            dis = t - i
            if dis <= distance and dis > 0:
                distance = dis
                start = t
                end = i
            else:
                print("higher")

    address = ""
    for token in range(end, start + 1):
        address += tokenized[token]
    print(address)
    addresses.append(address)

    for address in addresses:
        try:
            search = geocoder.get(address)
        except ValueError:
            continue
        first_result = search[0]
        output = [first_result.geometry.location.lat, first_result.geometry.location.lng]
        stri = ','.join(map(str, output))
        return stri
'''
import string
from itertools import chain
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

# working dir: UN/
mydir = 'corpus/meeting_records_final_categorized'
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                      cat_pattern=r'(intervention|soft_action)/.*',
                                      encoding='utf-8')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0]) for i in mr.fileids()]

# Use the 100 most frequent words as features.
word_features = FreqDist(chain(*[i for i, j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens) for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens) for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))  # .87 - ?!?!?!
classifier.show_most_informative_features(20)  # for the 100 most frequent word features
'''
Most Informative Features
from pprint import pprint
import logging

import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader, stopwords

CORPUS_ROOT = "/Users/derek/Data/RADCAT/corpus"

if __name__ == "__main__":

    # For reports with the category in the file name: abc_def+3.txt
    reports = CategorizedPlaintextCorpusReader(CORPUS_ROOT, '.*',
                                               cat_pattern=r'.*\+(.+)\.txt')

    logging.basicConfig(level=logging.DEBUG)
    logging.debug(reports.categories())

    english_stopwords = set(stopwords.words('english'))
    toks = [w.lower() for w in reports.words()
            if w.isalpha() and w not in english_stopwords]

    text = nltk.Text(toks)  # renamed from `all` to avoid shadowing the builtin
    text.concordance('hemodynamically')

    # Create your bi-grams and n-grams
    # bgs = nltk.bigrams(toks)
    tgs = nltk.ngrams(toks, 3)
    fdist = nltk.FreqDist(tgs)
    pprint(fdist.most_common(20))
pos_file.close()
neg_file.close()

# Words for all emotions
lexicon = {}
for emotion in base_emotions:
    with open('./opinion-lexicon-English/%s-words.txt' % emotion, 'r') as f:
        words = [word.strip() for word in f.readlines()]
    lexicon[emotion] = words

# Make a classifier based on the feature sets of the poems
poem_corpus = CategorizedPlaintextCorpusReader('./data', 'poems.*', cat_file='cats.txt')
poem_set = [(fileid, category) for fileid in poem_corpus.fileids()
            for category in poem_corpus.categories(fileid)]
random.shuffle(poem_set)

feature_set = [(poem_features(poem_corpus.words(fileids=[fileid])), category)
               for (fileid, category) in poem_set]
train_set, test_set = feature_set[2000:], feature_set[:2000]

# Initialize the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# For improving the algorithm
classifier.show_most_informative_features(20)
# GET RAW TEXT of a COMMENT given fileid
# corpus.raw([fileid])
# my_corpus.raw(my_corpus.fileids()[2])  # prints raw text of file index 2 of whole corpus

# GET list of TOKENIZED SENTS for a COMMENT via index or fileid:
# sents = corpus.sents(corpus.fileids()[index])
# sents = corpus.sents([fileid])

""" GET TOKENIZED PARAGRAPHS of a comment
para = corpus.paras([fileid])
"""

""" GET TOKENIZED COMMENT
para = corpus.paras([fileid])
"""

# ITERATE OVER FILEIDS
for fileid in corpus.fileids()[22:23]:
    print(fileid)
    print(type(fileid))
    print(len(corpus.raw(fileid)))
    print(corpus.raw(fileid))
    # sents = get_raw_sentences(fileid)
    sents = get_raw_paragraph(fileid)
    # print("SENT: " + "\nSENT: ".join(sents))
    words = corpus.words(fileid)
    print(words)
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*', encoding='ascii')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0]) for i in mr.fileids()]

# Use the 100 most frequent words as features.
word_features = FreqDist(chain(*[i for i, j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens) for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens) for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]

classifier = nbc.train(train_set)
# Rodrigo Renie de Braga Pinto
# TEXT ANALYSIS(Apostila)Parte 1.docx
# Exercise 1
# Do what is asked:
# print the words of the documents neg/cv002_tok-3321.txt and
# pos/cv003_tok-8338.txt

from nltk.corpus import CategorizedPlaintextCorpusReader

corpus_reader = CategorizedPlaintextCorpusReader(
    'dados/mix20_rand700_tokens_cleaned/tokens/', r'.*\.txt',
    cat_pattern=r'(\w+)/*')

words = {'neg/cv002_tok-3321.txt': [], 'pos/cv003_tok-8338.txt': []}
for file in words:
    words[file] = corpus_reader.words(fileids=file)
    print('Words in file {}: {}'.format(file, words[file]))
corpus_root = "./files/"
cat_root = "../categories/"

# Hacky way to specify the path for cat.txt. A better way would be to rewrite the regex '.*\.txt'...
corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_file=cat_root + 'cat.txt',
                                          cat_delimiter='+')

# get all categories
cats = corpus.categories()
print(cats)

# access corpus
raw = corpus.raw()

# access words, overall and for a category
words = corpus.words()
words_pop = corpus.words(categories="POP")
words_rock = corpus.words(categories="ROCK")

# access sents, overall and for a category
sents = corpus.sents()
sents_pop = corpus.sents(categories="POP")
sents_rock = corpus.sents(categories="ROCK")

# make lists
word_list = list(words)
sents_list = list(sents)
pop_word_list = list(words_pop)
pop_sents_list = list(sents_pop)
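# For reference, a minimal sketch of the cat.txt mapping file assumed above: with
# cat_delimiter='+', each line holds a fileid followed by its category, separated
# by the delimiter. The file names below are illustrative assumptions only.
#
#   song_001.txt+POP
#   song_002.txt+ROCK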
def loadCorpus(category=None):
    corpus_root = "../corpus/lyric_corpus/files/"
    cat_root = "../categories/"
    if not os.name == 'posix':
        corpus_root = "..\\corpus\\lyric_corpus\\files\\"

    # load the corpus
    # corpus = PlaintextCorpusReader(corpus_root, '.*\.txt')
    corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                              cat_file=cat_root + 'cat.txt',
                                              cat_delimiter='+')
    # print files in corpus
    # for file in corpus.fileids():
    #     print(file)

    # access corpus
    raw = corpus.raw()
    words = corpus.words()
    # print(category)
    if category is None:
        sents = corpus.sents()
    else:
        sents = corpus.sents(categories=category)
    # sents_pop = corpus.sents(categories="POP")
    # sents_rock = corpus.sents(categories="ROCK")

    shuffledSents = shuffleSent(sents)
    numberSents = len(shuffledSents)
    trainSize = math.floor(numberSents * 0.8)
    testSize = len(shuffledSents) - trainSize
    # testSize = math.floor(numberSents*0.1)
    # devSize = len(shuffledSents) - trainSize - testSize

    trainCorpus = []
    testCorpus = []
    # devCorpus = []
    wholeCorpus = []
    testSents = []
    for i in range(numberSents):
        if i < trainSize:
            for word in shuffledSents[i]:
                trainCorpus.append(word)
                wholeCorpus.append(word)
        # elif i < (trainSize + testSize):
        #     for word in shuffledSents[i]:
        #         testCorpus.append(word)
        #         wholeCorpus.append(word)
        else:
            testSents.append(shuffledSents[i])
            for word in shuffledSents[i]:
                testCorpus.append(word)
                wholeCorpus.append(word)

    # testCorpus = []
    # trainCorpus = list(words)
    # for i in range(testSize):
    #     seed = random.randrange(0, numberSents - i)
    #     testCorpus.append(trainCorpus.pop(seed))

    return wholeCorpus, trainCorpus, testCorpus, testSents
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk import bigrams
from nltk import trigrams
from nltk.collocations import *
import nltk

corpus_root = "../corpus/lyric_corpus/files/"
cat_root = "../categories/"

# Hacky way to specify the path for cat.txt. A better way would be to rewrite the regex '.*\.txt'...
corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_file=cat_root + 'cat.txt',
                                          cat_delimiter='+')

# word lists
word_list_pop = list(corpus.words(categories="POP"))
word_list_rock = list(corpus.words(categories="ROCK"))

# bigram lists
bigram_list_pop = list(bigrams(word_list_pop))
bigram_list_rock = list(bigrams(word_list_rock))

# trigram lists
trigram_list_pop = list(trigrams(word_list_pop))
trigram_list_rock = list(trigrams(word_list_rock))

# association measures
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()

# finders (default window size is 2)
bi_finder_pop = BigramCollocationFinder.from_words(word_list_pop)
bi_finder_rock = BigramCollocationFinder.from_words(word_list_rock)
# Provide path to the custom corpus
mydir = '/Users/vasilis/Desktop/Lennon/lyrics_custom_corpus'

# Read data from our custom corpus
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*')

# Clean lyrics of English stop words.
stop = stopwords.words('english')

documents = [(list(mr.words(fileid)), category)
             for category in mr.categories()
             for fileid in mr.fileids(category)]

classifiers_dir = '/Users/vasilis/vxm773/Lennon/pickled_classifiers'
if os.path.exists(classifiers_dir):
    shutil.rmtree(classifiers_dir)
os.makedirs(classifiers_dir)

save_documents = open("pickled_classifiers/documents.pickle", "wb")
pickle.dump(documents, save_documents)
save_documents.close()

# Shuffle lyrics in order to avoid training only towards pos/neg lyrics.
j = 0
for i in range(10):
    dataset = str(i + 1)
    # mydir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Classified News/Training'
    train_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Train+val'
    test_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Testing'
    # test_dir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/NA is negative old/Classified News/Criteria '+dataset+''
    preprocessed = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/data_2.p'

    train_Corpus = CategorizedPlaintextCorpusReader(train_dir, r'(?!\.).*\.txt', cat_pattern=r'(\w+)/*')

    train_documents = [(list(train_Corpus.words(fileid)), category)
                       for category in train_Corpus.categories()
                       for fileid in train_Corpus.fileids(category)]

    only_docs = [' '.join(doc[:1000]) for (doc, category) in train_documents]
    only_docs = [' '.join(normalize_text(document, lemmatize=True, remove_stop=None))
                 for document in only_docs]

    #######################################################################################
    train_labels = [category for (doc, category) in train_documents]
    train_binary_labels = [1 if i == 'pos' else 0 for i in train_labels]
    # train_data, test_data, train_labels, test_labels = train_test_split(only_docs, binary_labels, test_size=.15)
    train_data = only_docs
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 11 08:43:52 2021

@author: paulogamero
"""

# ASSIGNMENT: EXERCISE 1 - PART 01
# AUTHOR: Paulo Gamero

from nltk.corpus import CategorizedPlaintextCorpusReader

d = CategorizedPlaintextCorpusReader(
    r'C:\Users\Usuario\Dropbox\Pos\Pós DataScience\4 - Análise de textos com R e Python\Dados\mix20_rand700_tokens_cleaned\tokens',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*',
    encoding='iso8859-1')

for p in d.words('pos/cv003_tok-8338.txt'):
    print(p + ' ', end='')

for n in d.words('neg/cv002_tok-3321.txt'):
    print(n + ' ', end='')
# Split the corpus according to the categories
posFiles = leitor.fileids(categories='pos')
negFiles = leitor.fileids(categories='neg')
print('pos files:', posFiles)
print('neg files:', negFiles)

# Load the first files of each category
arqP = posFiles[0]
arqN = negFiles[1]
print("ArqP: ", arqP)
print("ArqN: ", arqN)

# Print the words of the files
print('Words in the selected files')
for p in leitor.words(arqP):
    print(p + ' ', end='')
print('---')
for p in leitor.words(arqN):
    print(p + ' ', end='')

# print(brown.categories())
# Freely select three categories
class CorpusUtil(object):
    """TODO: document."""

    def __init__(self, raiz_corpus):
        """Creates a 'CategorizedPlaintextCorpusReader' object from the corpus root
        directory, where the documents live in subdirectories named after their
        categories, whatever those may be --> raiz_corpus/{pos,neg,neu,...}.
        """
        # reload(sys); sys.setdefaultencoding("utf-8")  # Python 2 encoding hack, not needed in Python 3
        self._raiz_corpus = raiz_corpus
        self._corpus = CategorizedPlaintextCorpusReader(raiz_corpus, r'.+\.txt',
                                                        cat_pattern=r'(\w+)/*',
                                                        encoding='utf-8')
        self._documentos = None
        self._palavras_frequentes = None
        self._todas_palavras = None
        self._featuresets = None
        self._train_set = None
        self._test_set = None

    def get_documentos(self):
        """Builds a list of documents labelled with the appropriate categories.
        Each document is represented by a tuple with the structure below:

            (document_content, category)

        Returns that list with all the documents in the corpus.
        """
        # documentos = [(self.corpus.words(fileid), categoria)
        #               for categoria in self.corpus.categories()
        #               for fileid in self.corpus.fileids(categoria)]
        print("-- Retrieving documents from the corpus.")
        if self._documentos is None:
            self._documentos = [Documento(" ".join(self._corpus.words(fileid)), categoria, self, fileid)
                                for categoria in self._corpus.categories()
                                for fileid in self._corpus.fileids(categoria)]
            # Shuffle the documents
            for i in range(0, 10):
                shuffle(self._documentos)
        return self._documentos

    def get_palavras_frequentes(self):
        """TODO: document."""
        if self._palavras_frequentes is None:
            print("-- Finding the most frequent words in the corpus.")
            # Test - return only the 2000 most frequent words of the corpus
            todas_palavras = [word.lower() for word in self._corpus.words()]
            freq_dist_palavras = FreqDist(todas_palavras)
            frequencia_palavras = freq_dist_palavras.most_common(2000)  # 2000 most frequent words
            self._palavras_frequentes = [palavra for palavra, frequencia in frequencia_palavras]
            # all_words = FreqDist(word.lower() for word in self.corpus.words())
            # self.word_features = list(all_words)[:2000]
        return self._palavras_frequentes

    def get_todas_palavras(self):
        if self._todas_palavras is None:
            print("-- Retrieving all the words in the corpus.")
            self._todas_palavras = [word.lower() for word in self._corpus.words()]
            self._todas_palavras = set(self._todas_palavras)
        return self._todas_palavras

    def get_featuresets(self):
        """Sets up the featuresets, which are built with the following structure:

            (document_features, category)

        Returns a list of featuresets.
        """
        if self._featuresets is None:
            if self._documentos is None:
                self.get_documentos()
            print("-- Retrieving featuresets.")
            self._featuresets = apply_features(Documento.get_features, self._documentos)
        return self._featuresets

    def get_train_set(self):
        """TODO: document."""
        if self._featuresets is None:
            self.get_featuresets()
        print("-- Retrieving train_set.")
        # To avoid filling up the RAM, apply_features does not
        # hold all the documents in memory at once.
        # self._train_set = apply_features(Documento.get_features, self._documentos[100:])
        self._train_set = apply_features(Documento.get_features, self._documentos)
        return self._train_set

    def get_test_set(self):
        if self._featuresets is None:
            self.get_featuresets()
        print("-- Retrieving test_set.")
        # self._test_set = apply_features(Documento.get_features, self._documentos[:100])
        return self._test_set

    def gravar_palavras_frequentes(self):
        diretorio_destino = "/home/lucas/Documents/mineracao_opiniao/palavras_frequentes_corpus"
        molde_nome_arquivo = "palavras_frequentes_%s.pickle"
        tempo_agora = str(datetime.now())
        # Replace ':' and whitespace with '.'
        tempo_agora = re.sub(r':|\s', '.', tempo_agora)
        nome_arquivo = molde_nome_arquivo % tempo_agora

        if self._palavras_frequentes is None:
            self.get_palavras_frequentes()

        f = open(diretorio_destino + "/" + nome_arquivo, 'wb')
        pickle.dump(self._palavras_frequentes, f)
        f.close()
        return True

    @staticmethod
    def abrir_arquivo_palavras_frequentes(arquivo_path):
        f = open(arquivo_path, 'rb')
        palavras_frequentes = pickle.load(f)
        f.close()
        return palavras_frequentes
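# A hypothetical usage sketch (not part of the original module): assumes the corpus
# root has pos/, neg/, ... subfolders and that the project's Documento class and the
# imports used above (FreqDist, apply_features, shuffle, pickle, re, datetime) are available.
if __name__ == "__main__":
    util = CorpusUtil("/home/lucas/Documents/mineracao_opiniao/corpus")  # hypothetical path
    documentos = util.get_documentos()
    train_set = util.get_train_set()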
# Build corpus for specific problem set
problem = 'problemA'
problem_root = nltk.data.find('corpora/AAAC/%s' % (problem))
problem_files = PlaintextCorpusReader(problem_root, r'.*\.txt')

# Categorize corpus by author
auth_map = {}
for filename in problem_files.fileids():
    a_n = filename[:3]
    auth_map[filename] = [a_n]

# Read the entire corpus with the author categories attached
problem_cat = CategorizedPlaintextCorpusReader(problem_root, r'.*\.txt', cat_map=auth_map)
documents = [(list(problem_cat.words(fileid)), category)
             for category in problem_cat.categories()
             for fileid in problem_cat.fileids(category)]
random.shuffle(documents)

# Word-frequency featureset
# Word frequency across the corpus; keep the 2000 most common words as keywords.
all_words = nltk.FreqDist(words.lower() for words in problem_cat.words())
key_words = [w for w, _ in all_words.most_common(2000)]

# Compares whether a word from the keywords is in a document
def doc_features(doc):
    doc_words = set(doc)
    features = {}
import string
from itertools import chain
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*', encoding='ascii')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0]) for i in mr.fileids()]

# Use the 100 most frequent words as features.
word_features = FreqDist(chain(*[i for i, j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens) for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens) for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
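# A hypothetical follow-up (not in the original script): classify a new, unseen text
# with the same bag-of-words feature template used for training. Assumes the NLTK
# 'punkt' tokenizer data is available for word_tokenize; the sentence is illustrative.
new_text = "The movie was dull and far too long"
new_tokens = [w for w in nltk.word_tokenize(new_text)
              if w.lower() not in stop and w.lower() not in string.punctuation]
new_features = {i: (i in new_tokens) for i in word_features}
print(classifier.classify(new_features))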
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
from pylab import *
import plotly.plotly as py
import plotly.graph_objs as go

corpus_root = "../corpus/lyric_corpus/files/"
cat_root = "../categories/"
corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_file=cat_root + 'cat.txt',
                                          cat_delimiter='+')
words = corpus.words()

# frequency distribution
popWords = corpus.words(categories="POP")
rockWords = corpus.words(categories="ROCK")

# print("-----All words-----")
fd = nltk.FreqDist(words)
ALL_FrequentWords = fd.most_common(104)
ALL_FrequentWords_50_100 = []
for i in range(54, 104):
    ALL_FrequentWords_50_100.append(ALL_FrequentWords[i])
# print(ALL_FrequentWords)

# print("-----All POP words-----")
fd_POP = nltk.FreqDist(popWords)
POP_FrequentWords = fd_POP.most_common(60)
# print(fd1.most_common(60))
print('Number of words: ' + str(len(crp.words())))
# corpus_Stats(data_m)
# print('\n' + 'First file: ' + data_fileids[0])
# print('Last file: ' + data_fileids[-1])

#%%
num_para_py = defaultdict(int)
num_word_py = defaultdict(int)
for y in range(1983, 2013):
    files = data_m.fileids(str(y))
    files_size = len(files)
    num_para_py[y] += sum([len(data_m.paras(f)) for f in files]) / files_size
    num_word_py[y] += sum([len(data_m.words(f)) for f in files]) / files_size

para_words = pd.DataFrame(
    [num_para_py, num_word_py],
    index=['Average number of paragraphs', 'Average number of words']).T

# workaround for grouped bar charts
trace0 = go.Bar(x=para_words.index,
                y=para_words['Average number of paragraphs'],
                name='Average number of paragraphs ')
trace1 = go.Bar(x=para_words.index, y=[0], showlegend=False, hoverinfo='none')
trace2 = go.Bar(x=para_words.index, y=[0], yaxis='y2', showlegend=False, hoverinfo='none')
# NLTK Brown corpus selection
word_list_brown = brown.words()
sents_list_brown = brown.sents()
vocabulary_brown = set(word_list_brown)
brown_len_words = len(word_list_brown)
brown_len_sents = len(sents_list_brown)
brown_len_vocab = len(vocabulary_brown)
brown_richness = lexical_diversity(word_list_brown)

# Lyric corpus
cats = corpus.categories()
print(len(cats))
print(cats)
num_files = len(corpus.fileids())
word_list = list(corpus.words())
sents_list = list(corpus.sents())
vocabulary = set(word_list)
total_len_words = len(word_list)
total_len_sents = len(sents_list)
total_len_vocab = len(vocabulary)
total_richness = lexical_diversity(word_list)

# POP
word_list_pop = list(corpus.words(categories="POP"))
sents_list_pop = list(corpus.sents(categories="POP"))
vocabulary_pop = set(word_list_pop)
pop_len_words = len(word_list_pop)
pop_len_sents = len(sents_list_pop)
pop_len_vocab = len(vocabulary_pop)
pop_richness = lexical_diversity(word_list_pop)
# http://www.cs.cornell.edu/people/pabo/movie%2Dreview%2Ddata/
from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

reader = CategorizedPlaintextCorpusReader(
    r'mix20_rand700_tokens_cleaned/tokens', r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileN)
print(fileP)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':  # equality check instead of identity ('is')
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
mr_train = CategorizedPlaintextCorpusReader(
    mydir_train,
    r'(?!\.).*\.txt',
    cat_pattern=r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')
mr_test = CategorizedPlaintextCorpusReader(
    mydir_test,
    r'(?!\.).*\.txt',
    cat_pattern=r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')

stop = stopwords.words('english')
with open('.\\stopwords.txt') as f:
    stop = f.read().splitlines()

documents_train = [([w for w in mr_train.words(i)
                     if w.lower() not in stop and w.lower() not in string.punctuation],
                    i.split('/')[0])
                   for i in mr_train.fileids()
                   if os.path.getsize(os.path.join(mydir_train, i)) > 0]
documents_test = [([w for w in mr_test.words(i)
                    if w.lower() not in stop and w.lower() not in string.punctuation],
                   i.split('/')[0])
                  for i in mr_test.fileids()
                  if os.path.getsize(os.path.join(mydir_test, i)) > 0]

word_features_train = FreqDist(chain(*[i for i, j in documents_train]))
word_features_train = list(word_features_train.keys())[:1000]
word_features_test = FreqDist(chain(*[i for i, j in documents_test]))
word_features_test = list(word_features_test.keys())[:1000]