def init_documents(f_re, cat_re):
    logging.debug("Reading corpus")
    reports = CategorizedPlaintextCorpusReader(corpus_dir, f_re,
                                               cat_pattern=cat_re,
                                               encoding='utf8')
    logging.debug("Found {} fileids".format(len(reports.fileids())))
    logging.debug("Found categories: {}".format(reports.categories()))
    logging.debug("Building docs")
    documents = [(tokenize(reports.words(i)), reports.categories(i)[0])
                 for i in reports.fileids()]
    return documents
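
# `corpus_dir` and `tokenize` are assumed to be defined elsewhere in this
# module. A minimal, hypothetical sketch of the kind of helper
# init_documents expects:
def tokenize(words):
    # Keep lowercase alphabetic tokens only.
    return [w.lower() for w in words if w.isalpha()]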
def display_features(num_features=1000, show_features=200,
                     filepath='classifiers/nltk_nb.pkl', verbose=True):
    '''Displays informative features from NHLCorpus'''
    stop_words = set(stopwords.words('english'))
    nhl = CategorizedPlaintextCorpusReader(root='data/NHLcorpus/',
                                           fileids=r'.*\.txt',
                                           cat_pattern=r'(\w+)/*')
    documents = []
    for category in nhl.categories():
        for fileid in nhl.fileids(category):
            documents.append(([re.sub(r'\W+', '', w.lower())
                               for w in nhl.words(fileid)
                               if w.lower() not in stop_words], category))
    all_words = nltk.FreqDist(re.sub(r'\W+', '', w.lower())
                              for w in nhl.words()
                              if w.lower() not in stop_words)
    word_features = [w[0] for w in all_words.most_common(num_features)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains({})'.format(word)] = word in document_words
        return features

    featuresets = [(document_features(d), c) for (d, c) in documents]
    nb_clf = nltk.NaiveBayesClassifier.train(featuresets)
    if verbose:
        nb_clf.show_most_informative_features(show_features)
        print('Accuracy on training data: {}'.format(
            nltk.classify.accuracy(nb_clf, featuresets)))
    save_classifier = open(filepath, 'wb')
    pickle.dump(nb_clf, save_classifier)
    save_classifier.close()
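
# Loading the pickled classifier back later (a sketch, assuming the default
# filepath used above):
with open('classifiers/nltk_nb.pkl', 'rb') as f:
    nb_clf = pickle.load(f)
nb_clf.show_most_informative_features(10)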
def __create_corpus(self, language, chars):
    """Create a categorized nltk.corpus from data/*, where the subfolders
    are the different categories.

    :language: The newspaper language as a string
    :chars: List of chars which will be removed, in addition to stopwords,
            before the statistical analysis
    :returns: nltk.corpus, dict mapping category -> list of normalized words
    """
    # Create corpus from the data directory
    news_corpus = CategorizedPlaintextCorpusReader('data/', r'.*\.txt',
                                                   cat_pattern=r'(\w+)/*')
    # Get all stopwords for the language, plus additional chars for removal
    g_stop = stopwords.words(language)
    g_stop.extend(chars)
    # Stemmer (`nlp` here is the nltk module, imported elsewhere,
    # e.g. `import nltk as nlp`)
    snow = nlp.stem.SnowballStemmer(language, ignore_stopwords=True)
    # Dict of all words per category
    cat = news_corpus.categories()
    total_words = {}
    for news in cat:
        # Get the words
        words = news_corpus.words(categories=news)
        # Lowercase and remove stopwords (lowercase before the membership
        # test, so capitalized stopwords are caught too)
        words = [w.lower() for w in words if w.lower() not in g_stop]
        # Stem all tokens
        words = [snow.stem(w) for w in words]
        total_words.update({news: words})
    return news_corpus, total_words
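
# A possible follow-up (sketch): per-category frequency counts over the
# dict returned by __create_corpus, assuming `total_words` is in scope:
from nltk import FreqDist
for category, words in total_words.items():
    print(category, FreqDist(words).most_common(10))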
# http://www.cs.cornell.edu/people/pabo/movie%2Dreview%2Ddata/
from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

reader = CategorizedPlaintextCorpusReader(
    r'mix20_rand700_tokens_cleaned/tokens', r'.*\.txt',
    cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileN)
print(fileP)

# Use `==` for string comparison; `is` tests object identity.
for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()
for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
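
# random.choice is an equivalent, slightly simpler way to sample a fileid
# (a sketch):
from random import choice
fileP = choice(posFiles)
fileN = choice(negFiles)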
from nltk.corpus import CategorizedPlaintextCorpusReader

corpus_root = "./files/"
# Hacky way to specify the path for cat.txt. A better way would be to
# rewrite the fileid regex r'.*\.txt'...
cat_root = "../categories/"

corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_file=cat_root + 'cat.txt',
                                          cat_delimiter='+')

# Get all categories
cats = corpus.categories()
print(cats)

# Access the raw corpus
raw = corpus.raw()

# Access words, overall and per category
words = corpus.words()
words_pop = corpus.words(categories="POP")
words_rock = corpus.words(categories="ROCK")

# Access sents, overall and per category
sents = corpus.sents()
sents_pop = corpus.sents(categories="POP")
sents_rock = corpus.sents(categories="ROCK")

# Make lists
word_list = list(words)
sents_list = list(sents)
pop_word_list = list(words_pop)
pop_sents_list = list(sents_pop)
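
# The cat.txt file maps each fileid to its categories, joined by the
# delimiter passed as cat_delimiter. A hypothetical example for the corpus
# above:
#
#   song_01.txt+POP
#   song_02.txt+ROCK
#   song_03.txt+POP+ROCK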
class CorpusUtil(object):
    """TODO: document."""

    def __init__(self, raiz_corpus):
        """Creates a 'CategorizedPlaintextCorpusReader' object from the
        corpus root directory, where the documents are laid out in
        subdirectories according to their category, whatever those
        categories may be --> raiz_corpus/{pos,neg,neu,...}.
        """
        self._raiz_corpus = raiz_corpus
        self._corpus = CategorizedPlaintextCorpusReader(raiz_corpus,
                                                        r'.+\.txt',
                                                        cat_pattern=r'(\w+)/*',
                                                        encoding='utf-8')
        self._documentos = None
        self._palavras_frequentes = None
        self._todas_palavras = None
        self._featuresets = None
        self._train_set = None
        self._test_set = None

    def get_documentos(self):
        """Builds a list of documents labeled with the appropriate
        categories. Each document is represented by a tuple with the
        structure below:

            (document_content, category)

        Returns that list with every document in the corpus.
        """
        # documentos = [(self.corpus.words(fileid), categoria)
        #               for categoria in self.corpus.categories()
        #               for fileid in self.corpus.fileids(categoria)]
        print("-- Retrieving documents from the corpus.")
        if self._documentos is None:
            self._documentos = [Documento(" ".join(self._corpus.words(fileid)),
                                          categoria, self, fileid)
                                for categoria in self._corpus.categories()
                                for fileid in self._corpus.fileids(categoria)]
            # Shuffle the documents
            for i in range(0, 10):
                shuffle(self._documentos)
        return self._documentos

    def get_palavras_frequentes(self):
        """TODO: document."""
        if self._palavras_frequentes is None:
            print("-- Finding the most frequent words in the corpus.")
            # Test: keep only the 2000 most frequent words in the corpus
            todas_palavras = [word.lower() for word in self._corpus.words()]
            freq_dist_palavras = FreqDist(todas_palavras)
            frequencia_palavras = freq_dist_palavras.most_common(2000)
            self._palavras_frequentes = [palavra for palavra, frequencia
                                         in frequencia_palavras]
            # all_words = FreqDist(word.lower() for word in self.corpus.words())
            # self.word_features = list(all_words)[:2000]
        return self._palavras_frequentes

    def get_todas_palavras(self):
        if self._todas_palavras is None:
            print("-- Retrieving all words from the corpus.")
            self._todas_palavras = [word.lower()
                                    for word in self._corpus.words()]
            self._todas_palavras = set(self._todas_palavras)
        return self._todas_palavras

    def get_featuresets(self):
        """Builds the featuresets, which have the following structure:

            (document_features, category)

        Returns a list of featuresets.
        """
        if self._featuresets is None:
            if self._documentos is None:
                self.get_documentos()
            print("-- Retrieving featuresets.")
            self._featuresets = apply_features(Documento.get_features,
                                               self._documentos)
        return self._featuresets

    def get_train_set(self):
        """TODO: document."""
        if self._featuresets is None:
            self.get_featuresets()
        print("-- Retrieving train_set.")
        # apply_features avoids holding every document's features
        # in RAM at once.
        # self._train_set = apply_features(Documento.get_features, self._documentos[100:])
        self._train_set = apply_features(Documento.get_features,
                                         self._documentos)
        return self._train_set

    def get_test_set(self):
        if self._featuresets is None:
            self.get_featuresets()
        print("-- Retrieving test_set.")
        self._test_set = apply_features(Documento.get_features,
                                        self._documentos[:100])
        return self._test_set

    def gravar_palavras_frequentes(self):
        diretorio_destino = "/home/lucas/Documents/mineracao_opiniao/palavras_frequentes_corpus"
        molde_nome_arquivo = "palavras_frequentes_%s.pickle"
        tempo_agora = str(datetime.now())
        # Replace ':' and whitespace with '.'
        tempo_agora = re.sub(r':|\s', '.', tempo_agora)
        nome_arquivo = molde_nome_arquivo % tempo_agora
        if self._palavras_frequentes is None:
            self.get_palavras_frequentes()
        f = open(diretorio_destino + "/" + nome_arquivo, 'wb')
        pickle.dump(self._palavras_frequentes, f)
        f.close()
        return True

    @staticmethod
    def abrir_arquivo_palavras_frequentes(arquivo_path):
        f = open(arquivo_path, 'rb')
        palavras_frequentes = pickle.load(f)
        f.close()
        return palavras_frequentes
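
# A minimal usage sketch of CorpusUtil (hypothetical path; the Documento
# class is assumed to be defined elsewhere in the project):
util = CorpusUtil("/path/to/corpus")
train_set = util.get_train_set()
test_set = util.get_test_set()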
from nltk.corpus import CategorizedPlaintextCorpusReader

# Open the documents under the given path.
# Arguments:
# 1. Absolute path to the documents
# 2. Type/extension of the documents (*.txt)
# 3. Pattern for the folders that define the categories
# All arguments are regular expressions.
leitor = CategorizedPlaintextCorpusReader(
    '../Dados/mix20_rand700_tokens_cleaned/tokens/', r'.*\.txt',
    cat_pattern=r'(\w+)/*')

# Check what was loaded
print(leitor.categories())
print(leitor.fileids())

# Split the corpus by category
posFiles = leitor.fileids(categories='pos')
negFiles = leitor.fileids(categories='neg')
print('pos files:', posFiles)
print('neg files:', negFiles)

# Pick one file from each category
arqP = posFiles[0]
arqN = negFiles[1]
print("ArqP: ", arqP)
print("ArqN: ", arqN)
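
# Possible next step (a sketch): inspect the selected files.
print(leitor.raw(arqP)[:500])
print(len(leitor.words(arqN)), 'tokens')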
pos_file.close()
neg_file.close()

# Words for all emotions
lexicon = {}
for emotion in base_emotions:
    f = open('./opinion-lexicon-English/%s-words.txt' % emotion, 'r')
    words = [word.strip() for word in f.readlines()]
    lexicon[emotion] = words
    f.close()

# Make a classifier based on the feature sets of the poems
poem_corpus = CategorizedPlaintextCorpusReader('./data', 'poems.*',
                                               cat_file='cats.txt')
poem_set = [(fileid, category)
            for fileid in poem_corpus.fileids()
            for category in poem_corpus.categories(fileid)]
random.shuffle(poem_set)
feature_set = [(poem_features(poem_corpus.words(fileids=[fileid])), category)
               for (fileid, category) in poem_set]
train_set, test_set = feature_set[2000:], feature_set[:2000]

# Initialize the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# For improving the algorithm
classifier.show_most_informative_features(20)
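
# Evaluating on the held-out poems (a sketch; uses test_set from above):
print(nltk.classify.accuracy(classifier, test_set))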
# Build corpus for a specific problem set
problem = 'problemA'
problem_root = nltk.data.find('corpora/AAAC/%s' % (problem))
problem_files = PlaintextCorpusReader(problem_root, r'.*\.txt')

# Categorize the corpus by author
auth_map = {}
for filename in problem_files.fileids():
    a_n = filename[:3]
    auth_map[filename] = [a_n]

# Over the entire corpus
problem_cat = CategorizedPlaintextCorpusReader(problem_root, r'.*\.txt',
                                               cat_map=auth_map)
documents = [(list(problem_cat.words(fileid)), category)
             for category in problem_cat.categories()
             for fileid in problem_cat.fileids(category)]
random.shuffle(documents)

# Word-frequency featureset
# Word frequency across the corpus (FreqDist.keys() is not sliceable in
# Python 3, so take the 2000 most common words explicitly)
all_words = nltk.FreqDist(word.lower() for word in problem_cat.words())
key_words = [w for (w, _) in all_words.most_common(2000)]

# Check whether each keyword occurs in a document
def doc_features(doc):
    doc_words = set(doc)
    features = {}
    for word in key_words:
        features['contains({})'.format(word)] = word in doc_words
    return features
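
# A possible continuation (a sketch): build featuresets with doc_features
# and train a Naive Bayes model, as the other snippets here do:
featuresets = [(doc_features(d), c) for (d, c) in documents]
classifier = nltk.NaiveBayesClassifier.train(featuresets[100:])
print(nltk.classify.accuracy(classifier, featuresets[:100]))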
def errors_em(poem_set):
    errors = []
    for (fileid, category) in poem_set:
        poem = corpus_of_poems.words(fileids=[fileid])
        emotion_correct = features_of_poem(poem)
        guess = classifier.classify(features_of_poem(poem))
        if guess != category:
            errors.append((category, guess, poem,
                           emotion_correct['emotions']))
    return errors

poem_set = []
for fileid in corpus_of_poems.fileids():
    for category in corpus_of_poems.categories(fileid):
        poem_set.append((fileid, category))
print(poem_set)
random.shuffle(poem_set)

feature_set = []
for (fileid, category) in poem_set:
    # Pass the poem's words, as errors_em does, rather than the bare fileid.
    feature_cal = (features_of_poem(corpus_of_poems.words(fileids=[fileid])),
                   category)
    feature_set.append(feature_cal)
train_set = feature_set[25:]
test_set = feature_set[:25]
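
# Training and inspecting misclassifications (a sketch; `classifier` must be
# bound before errors_em runs, and errors_em expects (fileid, category)
# pairs, so it gets the raw poem_set slice rather than test_set):
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
for (category, guess, poem, emotions) in errors_em(poem_set[:25])[:5]:
    print(category, guess, emotions)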
# Path to the custom corpus
mydir = '/Users/vasilis/Desktop/Lennon/lyrics_custom_corpus'

# Read data from our custom corpus
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*')

# English stop words (used later to clean the lyrics).
stop = stopwords.words('english')

documents = [(list(mr.words(fileid)), category)
             for category in mr.categories()
             for fileid in mr.fileids(category)]

classifiers_dir = '/Users/vasilis/vxm773/Lennon/pickled_classifiers'
if os.path.exists(classifiers_dir):
    shutil.rmtree(classifiers_dir)
os.makedirs(classifiers_dir)

save_documents = open("pickled_classifiers/documents.pickle", "wb")
pickle.dump(documents, save_documents)
save_documents.close()

# Shuffle the lyrics to avoid training on a pos/neg-ordered list.
random.shuffle(documents)
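
# Restoring the pickled documents in a later run (a sketch):
with open("pickled_classifiers/documents.pickle", "rb") as f:
    documents = pickle.load(f)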
j = 0
for i in range(10):
    dataset = str(i + 1)
    # mydir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Classified News/Training'
    train_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Train+val'
    test_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Testing'
    # test_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/NA is negative old/Classified News/Criteria ' + dataset
    preprocessed = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/data_2.p'

    train_Corpus = CategorizedPlaintextCorpusReader(train_dir,
                                                    r'(?!\.).*\.txt',
                                                    cat_pattern=r'(\w+)/*')
    train_documents = [(list(train_Corpus.words(fileid)), category)
                       for category in train_Corpus.categories()
                       for fileid in train_Corpus.fileids(category)]

    # Keep the first 1000 tokens of each document, then normalize
    only_docs = [' '.join(doc[:1000]) for (doc, category) in train_documents]
    only_docs = [' '.join(normalize_text(document, lemmatize=True,
                                         remove_stop=None))
                 for document in only_docs]
    #######################################################################################
    train_labels = [category for (doc, category) in train_documents]
    train_binary_labels = [1 if i == 'pos' else 0 for i in train_labels]
    # train_data, test_data, train_labels, test_labels = train_test_split(only_docs, binary_labels, test_size=.15)
    train_data = only_docs
    train_labels = train_binary_labels
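
    # The test side would mirror the same setup (a sketch, continuing inside
    # the loop; same reader and normalization applied to test_dir):
    test_Corpus = CategorizedPlaintextCorpusReader(test_dir,
                                                   r'(?!\.).*\.txt',
                                                   cat_pattern=r'(\w+)/*')
    test_documents = [(list(test_Corpus.words(fileid)), category)
                      for category in test_Corpus.categories()
                      for fileid in test_Corpus.fileids(category)]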
from pprint import pprint
import logging

import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader, stopwords

CORPUS_ROOT = "/Users/derek/Data/RADCAT/corpus"

if __name__ == "__main__":
    # For reports with the category in the filename, e.g. abc_def+3.txt
    reports = CategorizedPlaintextCorpusReader(CORPUS_ROOT, '.*',
                                               cat_pattern=r'.*\+(.+)\.txt')

    logging.basicConfig(level=logging.DEBUG)
    logging.debug(reports.categories())

    # Hoist the stopword list into a set so the membership test is O(1).
    stop_words = set(stopwords.words('english'))
    toks = [w.lower() for w in reports.words()
            if w.isalpha() and w not in stop_words]
    all = nltk.Text(toks)
    # Text.concordance prints its matches directly.
    all.concordance('hemodynamically')

    # Create your bi-grams and n-grams
    # bgs = nltk.bigrams(toks)
    tgs = nltk.ngrams(toks, 3)
    fdist = nltk.FreqDist(tgs)
    pprint(fdist.most_common(20))
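
    # A possible extension (a sketch): score bigram collocations over the
    # same tokens with nltk's collocation finders:
    from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder
    bigram_measures = BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(toks)
    pprint(finder.nbest(bigram_measures.pmi, 20))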