def init_documents(f_re, cat_re):
    logging.debug("Reading corpus")
    reports = CategorizedPlaintextCorpusReader(corpus_dir, f_re,
                                               cat_pattern=cat_re,
                                               encoding='utf8')
    logging.debug("Found {} fileids".format(len(reports.fileids())))
    logging.debug("Found categories: {}".format(reports.categories()))

    logging.debug("Building docs")
    documents = [(tokenize(reports.words(i)), reports.categories(i)[0])
                 for i in reports.fileids()]
    return documents
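# The function above relies on a module-level `corpus_dir` and a `tokenize`
# helper that are not shown in this excerpt. A minimal sketch of what they
# might look like (the lowercasing and stopword filtering here are
# assumptions, not the original implementation):
import logging
import string
from nltk.corpus import CategorizedPlaintextCorpusReader, stopwords

corpus_dir = 'corpus'  # hypothetical corpus root directory
_STOPWORDS = set(stopwords.words('english'))

def tokenize(words):
    """Lowercase the words and drop stopwords and punctuation."""
    return [w.lower() for w in words
            if w.lower() not in _STOPWORDS and w not in string.punctuation]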
def nltk():
    #### FOR TRAINING DATA ####
    stop = stopwords.words('spanish')

    # Read the training data.
    traindir = '/Users/ruben/Desktop/Formularios_clasificados/training'
    mr = CategorizedPlaintextCorpusReader(traindir, r'(?!\.).*\.txt',
                                          cat_pattern=r'(neg|pos)/.*',
                                          encoding='utf-8')

    # Convert training data into tuples of [(words, label), ...]
    documents = [([w for w in mr.words(i)
                   if w.lower() not in stop and w not in string.punctuation],
                  i.split('/')[0])
                 for i in mr.fileids()]

    # Extract training features (the 100 most frequent words).
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = [w for w, _ in word_features.most_common(100)]

    # Assuming that you're using the full data set, since your test set is different.
    train_set = [({i: (i in tokens) for i in word_features}, tag)
                 for tokens, tag in documents]

    #### TRAIN THE TAGGER ####
    classifier = NaiveBayesClassifier.train(train_set)

    #### FOR TESTING DATA ####
    # Now do the same reading and processing for the testing data.
    testdir = '/Users/ruben/Desktop/Formularios_clasificados/testing'
    mr_test = CategorizedPlaintextCorpusReader(testdir, r'(?!\.).*\.txt',
                                               cat_pattern=r'(neg|pos)/.*',
                                               encoding='utf-8')

    # Convert testing data into tuples of [(words, label), ...]
    test_documents = [([w for w in mr_test.words(i)
                        if w.lower() not in stop and w not in string.punctuation],
                       i.split('/')[0])
                      for i in mr_test.fileids()]

    # Read test data into features:
    test_set = [({i: (i in tokens) for i in word_features}, tag)
                for tokens, tag in test_documents]

    correct = 0
    wrong = 0

    #### Evaluate the classifier ####
    for doc, gold_label in test_set:
        tagged_label = classifier.classify(doc)
        if tagged_label == gold_label:
            correct += 1
        else:
            wrong += 1

    # Accuracy is correct / total (the original divided by `wrong` only).
    print(correct, wrong, float(correct) / (correct + wrong))
def construct_model(corpusPath, modelPath):
    # Note: the original cat_pattern r'*/.*' is not a valid regular expression;
    # the category directory is captured with r'(\w+)/.*' instead.
    mr = CategorizedPlaintextCorpusReader(corpusPath, r'(?!\.).*\.txt',
                                          cat_pattern=r'(\w+)/.*',
                                          encoding='iso-8859-1')
    stop = stopwords.words('french')
    documents = [([w for w in mr.words(i)
                   if w.lower() not in stop and w.lower() not in string.punctuation],
                  i.split('/')[0])
                 for i in mr.fileids()]

    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = list(word_features.keys())

    # Uses the whole corpus for training (100 %).
    numtrain = int(len(documents) * 100 / 100)
    train_set = [({i: (i in tokens) for i in word_features}, tag)
                 for tokens, tag in documents[:numtrain]]
    """test_set = [({i:(i in tokens) for i in word_features}, tag)
                   for tokens, tag in documents[numtrain:]]"""
    classifier = nbc.train(train_set)

    mrtest = CategorizedPlaintextCorpusReader(os.path.abspath("corpus_test"),
                                              r'(?!\.).*\.txt',
                                              cat_pattern=r'(\w+)/.*',
                                              encoding='iso-8859-1')
    documentsTest = [([w for w in mrtest.words(i)
                       if w.lower() not in stop and w.lower() not in string.punctuation],
                      i.split('/')[0])
                     for i in mrtest.fileids()]

    word_features_test = FreqDist(chain(*[i for i, j in documentsTest]))
    word_features_test = list(word_features_test.keys())
    numtrain_test = int(len(documentsTest) * 100 / 100)
    test_set = [({i: (i in tokens) for i in word_features_test}, tag)
                for tokens, tag in documentsTest[:numtrain_test]]

    save_classifier(classifier, modelPath)
def display_features(num_features=1000, show_features=200,
                     filepath='classifiers/nltk_nb.pkl', verbose=True):
    '''Displays informative features from NHLCorpus'''
    stop_words = set(stopwords.words('english'))
    nhl = CategorizedPlaintextCorpusReader(root='data/NHLcorpus/',
                                           fileids=r'.*\.txt',
                                           cat_pattern=r'(\w+)/*')
    documents = []
    for category in nhl.categories():
        for fileid in nhl.fileids(category):
            documents.append(([re.sub(r'\W+', '', w.lower())
                               for w in nhl.words(fileid)
                               if w.lower() not in stop_words], category))

    all_words = nltk.FreqDist(re.sub(r'\W+', '', w.lower())
                              for w in nhl.words()
                              if w.lower() not in stop_words)
    word_features = [w[0] for w in all_words.most_common(num_features)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains({})'.format(word)] = word in document_words
        return features

    featuresets = [(document_features(d), c) for (d, c) in documents]
    nb_clf = nltk.NaiveBayesClassifier.train(featuresets)

    if verbose:
        nb_clf.show_most_informative_features(show_features)
        print('Accuracy on training data: {}'.format(
            nltk.classify.accuracy(nb_clf, featuresets)))

    save_classifier = open(filepath, 'wb')
    pickle.dump(nb_clf, save_classifier)
    save_classifier.close()
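# A short usage sketch (assumed, not part of the original): reload the pickled
# classifier later and score a new document with the same 'contains(word)'
# feature shape. `word_features` itself would have to be persisted alongside
# the model for real use.
import pickle

def load_classifier(filepath='classifiers/nltk_nb.pkl'):
    with open(filepath, 'rb') as f:
        return pickle.load(f)

# clf = load_classifier()
# clf.classify({'contains(goal)': True, 'contains(penalty)': False})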
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk, string, numpy

reader = CategorizedPlaintextCorpusReader(
    r'\Users\JoeDi\Desktop\MSC\Idioms Corpera',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

print(reader.categories())
print(reader.fileids())

from random import randint

File = reader.fileids()
fileP = File[randint(0, len(File) - 1)]
print(fileP)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':  # compare strings with ==, not 'is'
        print()

# https://sites.temple.edu/tudsc/2017/03/30/measuring-similarity-between-texts-in-python/
from sklearn.feature_extraction.text import CountVectorizer
import nltk, string, numpy

sss = "Because there is no easy way to decide how two words, two documents are related. All we have is sequence of letters " \
      "or strings if you prefer. So how to find a relationship between two words? If you want to decide how two documents related, " \
      "how to figure that out? It cant be done without having any other data."
from nltk.corpus import CategorizedPlaintextCorpusReader
import ProcessText

d1 = "judge people by what they say"
d1_processed = ProcessText.ProcessText.process(d1)
documents = [d1]

# Read documents
reader = CategorizedPlaintextCorpusReader(
    r'\Users\JoeDi\Desktop\MSC\Idioms Corpera',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

for w in reader.fileids():
    wd = reader.raw(w)
    documents.append(w + " " + wd)

print("Documents in the collection are: ")
print(documents)
print("\n")

from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF matrix for each description
tfidf = TfidfVectorizer().fit_transform(documents)
print("Tf-idf weightings are: ")
print(tfidf)
print("\n")
pos_file.close()
neg_file.close()

# Words for all emotions
lexicon = {}
for emotion in base_emotions:
    f = open('./opinion-lexicon-English/%s-words.txt' % emotion)  # 'rU' mode was removed in Python 3.11+
    words = [word.strip() for word in f.readlines()]
    lexicon[emotion] = words
    f.close()

# Make a classifier based on the feature sets of the poems
poem_corpus = CategorizedPlaintextCorpusReader('./data', 'poems.*',
                                               cat_file='cats.txt')
poem_set = [(fileid, category)
            for fileid in poem_corpus.fileids()
            for category in poem_corpus.categories(fileid)]
random.shuffle(poem_set)

feature_set = [(poem_features(poem_corpus.words(fileids=[fileid])), category)
               for (fileid, category) in poem_set]
train_set, test_set = feature_set[2000:], feature_set[:2000]

# Initialize the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# For improving the algorithm
classifier.show_most_informative_features(20)
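# `poem_features` is used above but not defined in this excerpt; it would have
# to exist before `feature_set` is built. A plausible sketch, assuming it simply
# counts lexicon hits per base emotion (a guess at the intent, not the original
# implementation):
def poem_features(words):
    words_lower = [w.lower() for w in words]
    features = {}
    for emotion, emotion_words in lexicon.items():
        vocab = set(emotion_words)
        features['count(%s)' % emotion] = sum(1 for w in words_lower if w in vocab)
    return features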
import time
start = time.time()

import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import CategorizedPlaintextCorpusReader
from sklearn.cluster import KMeans
import numpy as np
import copy
import math
import re

corpus_root = 'C:\\MyData\\PythonPractice\\IMDB\\train'  # Path of IMDB train data
reader = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_pattern=r'(\w+)/*')
r_pos = reader.fileids(categories=['pos'])
r_neg = reader.fileids(categories=['neg'])

global_shortlisted = []
TRAIN_GS_POS = []
for i in range(0, 12500):
    doc = reader.raw(r_pos[i:i + 1])  # doc contains the movie review
    sentences = nltk.sent_tokenize(doc)
    senlen = len(sentences)


def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
problem = 'problemA'
problem_root = nltk.data.find('corpora/AAAC/%s' % (problem))
problem_files = PlaintextCorpusReader(problem_root, r'.*\.txt')

# Categorize corpus by author
auth_map = {}
for filename in problem_files.fileids():
    a_n = filename[:3]
    auth_map[filename] = [a_n]

# By the entire corpus
problem_cat = CategorizedPlaintextCorpusReader(problem_root, r'.*\.txt',
                                               cat_map=auth_map)
documents = [(list(problem_cat.words(fileid)), category)
             for category in problem_cat.categories()
             for fileid in problem_cat.fileids(category)]
random.shuffle(documents)

# Word frequency featureset
# Word freq across corpus
all_words = nltk.FreqDist(words.lower() for words in problem_cat.words())
# FreqDist.keys() is not frequency-ordered in NLTK 3; take the 2000 most common words.
key_words = [w for w, _ in all_words.most_common(2000)]


# Checks whether a word from the keywords is in a document
def doc_features(doc):
    doc_words = set(doc)
    features = {}
    for word in key_words:
        features['contains(%s)' % word] = (word in doc_words)
# Open the documents under the given path
# Arguments:
# 1. Absolute path to the documents
# 2. Type/extension of the documents (*.txt)
# 3. Pattern for the folders that form the categories
# All arguments are regular expressions
leitor = CategorizedPlaintextCorpusReader(
    '../Dados/mix20_rand700_tokens_cleaned/tokens/',
    '.*.txt',
    cat_pattern=r'(\w+)/*')

# Check what was loaded
print(leitor.categories())
print(leitor.fileids())

# Split the corpus according to the categories
posFiles = leitor.fileids(categories='pos')
negFiles = leitor.fileids(categories='neg')
print('Arquivos pos:', posFiles)
print('Arquivos neg:', negFiles)

# Load the first files of each category
arqP = posFiles[0]
arqN = negFiles[1]
print("ArqP: ", arqP)
print("ArqN: ", arqN)

# Print the sentences of the files
import string
from itertools import chain
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*',
                                      encoding='ascii')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0])
             for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i, j in documents]))
# FreqDist.keys() is not frequency-ordered in NLTK 3; take the 100 most common words.
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens) for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens) for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
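# A small follow-up sketch (not in the original): classify one unseen review
# with the trained model by building the same boolean feature dict over
# `word_features`.
def classify_review(tokens):
    token_set = set(tokens)
    feats = {w: (w in token_set) for w in word_features}
    return classifier.classify(feats)

# Example (hypothetical input):
# classify_review(['terrible', 'plot', 'but', 'great', 'acting'])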
# (These four lines look like the tail of the saveFile helper called below;
# its 'def' line is cut off in this excerpt.)
os.chdir(directory)
file = open(fname, 'w')
file.write(text)
file.close()


doc_start = {}
doc_start[0] = "Staff Review of the Economic Situation"
doc_start[1] = re.compile('The information (reviewed|received|provided)')
doc_start[2] = "The Committee then turned to a discussion of the economic outlook"
doc_start[3] = re.compile('The information (reviewed|received|provided)')

doc_end = {}
doc_end[0] = re.compile('(At the conclusion of) (this|the) (discussion|meeting)')
doc_end[1] = re.compile('(?i)The Committee voted to authorize')
doc_end[2] = re.compile('(?i)The vote encompassed approval of')

if __name__ == '__main__':
    corpus_root = '/Users/aaroncgw/Google Drive/fednlp/data/minutes/'
    data_m = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                              cat_pattern=r'(\w+)/*')
    data_fileids = data_m.fileids()
    for f in data_fileids:
        year, fname = f.split('/')
        cropped_text = crop_text(data_m.raw(f), doc_start, doc_end)
        saveFile(fname, year, cropped_text)
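# `crop_text` is called above but not defined in this excerpt (only the tail
# of `saveFile` appears at the top). A rough sketch of how crop_text might
# apply the start/end markers (hypothetical; the real implementation may differ):
import re

def _find(marker, text):
    """Return the match position for a plain string or compiled regex, or -1."""
    if isinstance(marker, str):
        return text.find(marker)
    m = marker.search(text)
    return m.start() if m else -1

def crop_text(text, doc_start, doc_end):
    starts = [p for p in (_find(m, text) for m in doc_start.values()) if p != -1]
    ends = [p for p in (_find(m, text) for m in doc_end.values()) if p != -1]
    start = min(starts) if starts else 0
    end = max(ends) if ends else len(text)
    return text[start:end]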
def transform(corpus: CategorizedPlaintextCorpusReader, target_root_dir):
    if not os.path.exists(target_root_dir):
        os.makedirs(target_root_dir)
    # Write the corpus meta information and close the handle explicitly.
    with open(target_root_dir + "\\meta.info", 'w') as meta:
        meta.write("tagged\nmarks.txt")
    for fileid in corpus.fileids():
        yield process(corpus, target_root_dir, fileid)
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'/home/smadyastha/Projects/PythonCheck/Dataset/Reviews/tokens',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]  # index into negFiles, not posFiles
print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':  # compare strings with ==, not 'is'
        print()

# /home/smadyastha/Projects/PythonCheck/Dataset/Reviews
start = time.time()

import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import CategorizedPlaintextCorpusReader
from sklearn.cluster import KMeans
import numpy as np
import copy
import math
import re

corpus_root = 'C:\\MyData\\PythonPractice\\IMDB\\test'  # Path of IMDB test data
reader = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_pattern=r'(\w+)/*')
r_neg = reader.fileids(categories=['neg'])
r_pos = reader.fileids(categories=['pos'])

global_shortlisted = []
TEST_GS_POS = []
for i in range(0, 12500):
    doc = reader.raw(r_pos[i:i + 1])  # doc contains the movie review
    sentences = nltk.sent_tokenize(doc)
    senlen = len(sentences)


def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
# Provide the path to the custom corpus
mydir = '/Users/vasilis/Desktop/Lennon/lyrics_custom_corpus'

# Read data from our custom corpus
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*')

# Remove English stop words from the lyrics.
stop = stopwords.words('english')

documents = [(list(mr.words(fileid)), category)
             for category in mr.categories()
             for fileid in mr.fileids(category)]

classifiers_dir = '/Users/vasilis/vxm773/Lennon/pickled_classifiers'
if os.path.exists(classifiers_dir):
    shutil.rmtree(classifiers_dir)
os.makedirs(classifiers_dir)

save_documents = open("pickled_classifiers/documents.pickle", "wb")
pickle.dump(documents, save_documents)
save_documents.close()

# Shuffle lyrics in order to avoid training only towards pos/neg lyrics.
random.shuffle(documents)
for i in range(10):
    dataset = str(i + 1)
    #mydir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Classified News/Training'
    train_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Train+val'
    test_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Testing'
    #test_dir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/NA is negative old/Classified News/Criteria '+dataset+''
    preprocessed = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/data_2.p'

    train_Corpus = CategorizedPlaintextCorpusReader(train_dir, r'(?!\.).*\.txt',
                                                    cat_pattern=r'(\w+)/*')
    train_documents = [(list(train_Corpus.words(fileid)), category)
                       for category in train_Corpus.categories()
                       for fileid in train_Corpus.fileids(category)]
    only_docs = [' '.join(doc[:1000]) for (doc, category) in train_documents]
    only_docs = [' '.join(normalize_text(document, lemmatize=True, remove_stop=None))
                 for document in only_docs]
    #######################################################################################
    train_labels = [category for (doc, category) in train_documents]
    train_binary_labels = [1 if i == 'pos' else 0 for i in train_labels]
    #train_data, test_data, train_labels, test_labels = train_test_split(only_docs, binary_labels, test_size=.15)
    train_data = only_docs
    train_labels = train_binary_labels
# NLTK Brown selection
word_list_brown = brown.words()
sents_list_brown = brown.sents()
vocabulary_brown = set(word_list_brown)
brown_len_words = len(word_list_brown)
brown_len_sents = len(sents_list_brown)
brown_len_vocab = len(vocabulary_brown)
brown_richness = lexical_diversity(word_list_brown)

# Lyric corpus
cats = corpus.categories()
print(len(cats))
print(cats)
num_files = len(corpus.fileids())
word_list = list(corpus.words())
sents_list = list(corpus.sents())
vocabulary = set(word_list)
total_len_words = len(word_list)
total_len_sents = len(sents_list)
total_len_vocab = len(vocabulary)
total_richness = lexical_diversity(word_list)

# POP
word_list_pop = list(corpus.words(categories="POP"))
sents_list_pop = list(corpus.sents(categories="POP"))
vocabulary_pop = set(word_list_pop)
pop_len_words = len(word_list_pop)
pop_len_sents = len(sents_list_pop)
pop_len_vocab = len(vocabulary_pop)
'''
import string
from itertools import chain
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

# working dir: UN/
mydir = 'corpus/meeting_records_final_categorized'
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                      cat_pattern=r'(intervention|soft_action)/.*',
                                      encoding='utf-8')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0])
             for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i, j in documents]))
word_features = word_features.keys()[:100]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens) for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens) for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]

classifier = nbc.train(train_set)
print nltk.classify.accuracy(classifier, test_set)  # .87 - ?!?!?!
classifier.show_most_informative_features(20)  # for word_features.keys()[:100]
'''
Most Informative Features
positive_consolidated_list = list(pos_list) + positive_greg
negative_consolidated_list = list(neg_list) + negative_greg
print(positive_consolidated_list)
print(negative_consolidated_list)

init_notebook_mode(connected=True)
cf.set_config_file(offline=True, world_readable=True, theme='ggplot')

#%%
corpus_root = "/Users/LENOVO USER/Desktop/FedTranscript1"
data_m = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin1')
data_fileids = data_m.fileids()

#%%
def corpus_Stats(crp):
    print('Total number of files: ' + str(len(crp.fileids())))
    print('Number of paragraphs: ' + str(len(crp.paras())))
    print('Number of sentences: ' + str(len(crp.sents())))
    print('Number of words: ' + str(len(crp.words())))

#corpus_Stats(data_m)
#print('\n'+'First file: '+ data_fileids[0])
#print('Last file: '+ data_fileids[-1])
#%%
def classify_emails():
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    mydir = '/home/ubuntu/nltk_data/corpora/gmail'
    all_words = []
    filtered_words = []
    removedPuncuations_words = []
    lematized_words = []
    test_filter = []

    mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                          cat_pattern=r'(hotel|flight|other)/.*',
                                          encoding='latin-1')
    stop = stopwords.words('english')
    documents = [([w for w in mr.words(i)
                   if w.lower() not in stop and w.lower() not in string.punctuation],
                  i.split('/')[0])
                 for i in mr.fileids()]

    word_features = FreqDist(chain(*[i for i, j in documents]))
    # FreqDist.keys() is not frequency-ordered in NLTK 3; take the 100 most common words.
    word_features = [w for w, _ in word_features.most_common(100)]

    def word_feats(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return dict(features)

    negids = mr.fileids('hotel')
    posids = mr.fileids('flight')
    neutralids = mr.fileids('other')

    negfeats = [(word_feats(mr.words(fileids=[f])), 'hotel') for f in negids]
    posfeats = [(word_feats(mr.words(fileids=[f])), 'flight') for f in posids]
    neutralfeats = [(word_feats(mr.words(fileids=[f])), 'other') for f in neutralids]

    # Use integer division so the cutoffs can be used as slice indices.
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    neutralcutoff = len(neutralfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] + neutralfeats[:neutralcutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] + neutralfeats[neutralcutoff:]

    classifier = nltk.NaiveBayesClassifier.train(trainfeats)
    print("Classifier accuracy percent:", (nltk.classify.accuracy(classifier, testfeats)) * 100)
    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats) * 100)

    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)
    test_sent_features = {word.lower(): (word in tokens) for word in mr.words()}

    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)
    tri_tokens = trigrams(tokens)

    cities = []
    matchedIndex = []
    tokenized = []
    addresses = []
    district = ['Akarawita', 'Angamuwa', 'Avissawella', 'Batawala', 'Battaramulla',
                'Batugampola', 'Bope', 'Boralesgamuwa', 'Borella', 'Dedigamuwa',
                'Dehiwala', 'Deltara', 'Habarakada', 'Handapangoda', 'Hanwella',
                'Hewainna', 'Hiripitya', 'Hokandara', 'Homagama', 'Horagala',
                'Kaduwela', 'Kahawala', 'Kalatuwawa', 'Madapatha', 'Maharagama',
                'Malabe', 'Meegoda', 'Padukka', 'Pannipitiya', 'Piliyandala',
                'Pitipana', 'Homagama', 'Polgasowita', 'Puwakpitiya', 'Ranala',
                'Siddamulla', 'Slave Island', 'Sri Jayawardenapura', 'Talawatugoda',
                'Tummodara', 'Waga', 'Watareka', 'Dickwella']

    for i in tokens:
        tokenized.append(i)

    pattern = re.compile(r"\d+")
    for i in tokenized:
        if pattern.match(i):
            matchedIndex.append(tokenized.index(i))
            print("match" + i)
            print(tokenized.index(i))
        else:
            print("not match")

    for t in tokenized:
        for i in district:
            if t.lower() == i.lower():
                cities.append(tokenized.index(t))

    distance = 200
    start = 0
    end = 0
    for t in cities:
        for i in matchedIndex:
            dis = t - i
            if dis <= distance and dis > 0:
                distance = dis
                start = t
                end = i
            else:
                print("higher")

    address = ""
    for token in range(end, start + 1):
        address += tokenized[token]
    print(address)
    addresses.append(address)

    for address in addresses:
        try:
            search = geocoder.get(address)
        except ValueError:
            continue
        first_result = search[0]
        output = [first_result.geometry.location.lat, first_result.geometry.location.lng]
        stri = ','.join(map(str, output))
        return stri
# Imports
from nltk.corpus import CategorizedPlaintextCorpusReader

# Read the corpus
reader = CategorizedPlaintextCorpusReader(
    r'C:\Users\hyery\Python-NLP\chaper01\Reviews\tokens',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

# Build lists containing the samples of each category
posFiles = reader.fileids(categories='pos')  # fileids() takes the category name as an argument
negFiles = reader.fileids(categories='neg')

# Pick a file at random from each list
from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

# Access the selected files and print their sentences
for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':  # compare strings with ==, not 'is'
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
class CorpusUtil(object):
    """To be documented."""

    def __init__(self, raiz_corpus):
        """Creates a 'CategorizedPlaintextCorpusReader' object from the corpus
        root directory, where the documents are laid out in subdirectories
        named after their categories, whatever those may be
        --> raiz_corpus/{pos,neg,neu,...}.
        """
        reload(sys)
        sys.setdefaultencoding("utf-8")

        self._raiz_corpus = raiz_corpus
        self._corpus = CategorizedPlaintextCorpusReader(raiz_corpus, r'.+\.txt',
                                                        cat_pattern=r'(\w+)/*',
                                                        encoding='utf-8')
        self._documentos = None
        self._palavras_frequentes = None
        self._todas_palavras = None
        self._featuresets = None
        self._train_set = None
        self._test_set = None

    def get_documentos(self):
        """Builds a list of documents labelled with the appropriate categories.
        Each document is represented by a tuple with the structure below:

            (document_content, category)

        Returns that list with all the documents of the corpus.
        """
        """
        documentos = [(self.corpus.words(fileid), categoria)
                      for categoria in self.corpus.categories()
                      for fileid in self.corpus.fileids(categoria)]
        """
        print "-- Retrieving documents from the corpus."
        if self._documentos is None:
            self._documentos = [Documento(" ".join(self._corpus.words(fileid)), categoria, self, fileid)
                                for categoria in self._corpus.categories()
                                for fileid in self._corpus.fileids(categoria)]
            # Shuffle the documents
            for i in range(0, 10):
                shuffle(self._documentos)
        return self._documentos

    def get_palavras_frequentes(self):
        """To be documented."""
        if self._palavras_frequentes is None:
            print "-- Finding the most frequent words of the corpus."
            # Test - return only the 2000 most frequent words of the corpus
            todas_palavras = [word.lower() for word in self._corpus.words()]
            freq_dist_palavras = FreqDist(todas_palavras)
            frequencia_palavras = freq_dist_palavras.most_common(2000)  # 2000 most frequent words
            self._palavras_frequentes = [palavra for palavra, frequencia in frequencia_palavras]
            # all_words = FreqDist(word.lower() for word in self.corpus.words())
            # self.word_features = list(all_words)[:2000]
        return self._palavras_frequentes

    def get_todas_palavras(self):
        if self._todas_palavras is None:
            print "-- Retrieving all the words of the corpus."
            self._todas_palavras = [word.lower() for word in self._corpus.words()]
            self._todas_palavras = set(self._todas_palavras)
        return self._todas_palavras

    def get_featuresets(self):
        """Sets up the featuresets, which are built with the following structure:

            (document_features, category)

        Returns a list of featuresets.
        """
        if self._featuresets is None:
            if self._documentos is None:
                self.get_documentos()
            print "-- Retrieving featuresets."
            self._featuresets = apply_features(Documento.get_features, self._documentos)
        return self._featuresets

    def get_train_set(self):
        """To be documented."""
        if self._featuresets is None:
            self.get_featuresets()
        print "-- Retrieving train_set."
        # To avoid filling up RAM, do not keep all the documents
        # in memory at once.
        # self._train_set = apply_features(Documento.get_features, self._documentos[100:])
        self._train_set = apply_features(Documento.get_features, self._documentos)
        return self._train_set

    def get_test_set(self):
        if self._featuresets is None:
            self.get_featuresets()
        print "-- Retrieving test_set."
        # self._test_set = apply_features(Documento.get_features, self._documentos[:100])
        return self._test_set

    def gravar_palavras_frequentes(self):
        diretorio_destino = "/home/lucas/Documents/mineracao_opiniao/palavras_frequentes_corpus"
        molde_nome_arquivo = "palavras_frequentes_%s.pickle"

        tempo_agora = str(datetime.now())
        # Replace ':' and whitespace with '.'
        tempo_agora = re.sub(ur':|\s', '.', tempo_agora)
        nome_arquivo = molde_nome_arquivo % tempo_agora

        if self._palavras_frequentes is None:
            self.get_palavras_frequentes()

        f = open(diretorio_destino + "/" + nome_arquivo, 'wb')
        pickle.dump(self._palavras_frequentes, f)
        f.close()
        return True

    @staticmethod
    def abrir_arquivo_palavras_frequentes(arquivo_path):
        f = open(arquivo_path, 'rb')
        palavras_frequentes = pickle.load(f)
        f.close()
        return palavras_frequentes
# Return errors in order to improve the algorithm
def errors_em(poem_set):
    errors = []
    for (fileid, category) in poem_set:
        poem = corpus_of_poems.words(fileids=[fileid])
        emotion_correct = features_of_poem(poem)
        guess = classifier.classify(features_of_poem(poem))
        if guess != category:
            errors.append((category, guess, poem, emotion_correct['emotions']))
    return errors


poem_set = []
for fileid in corpus_of_poems.fileids():
    for category in corpus_of_poems.categories(fileid):
        poem_set.append((fileid, category))
print(poem_set)
random.shuffle(poem_set)

feature_set = []
for (fileid, category) in poem_set:
    feature_cal = (features_of_poem(fileid), category)
    feature_set.append(feature_cal)

train_set = feature_set[25:]
test_set = feature_set[:25]
import string
from itertools import chain
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*',
                                      encoding='ascii')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0])
             for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i, j in documents]))
# FreqDist.keys() is not frequency-ordered in NLTK 3; take the 100 most common words.
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens) for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens) for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
# Returns a list of raw sentences
def get_raw_sentences(fileid):  # works
    data = corpus.raw(fileid)
    return tokenizer.tokenize(data)


def get_raw_paragraph(fileid):
    # TODO: test if this works with the Yahoo! corpus as well (encoding might differ)
    data = corpus.raw(fileid)
    return data.split(u"\r\n \r\n")


# ACCESS all FILEIDS:
# corpus.fileids([category])  # category is optional
print(corpus.fileids())

# GET ABSOLUTE PATH TO a FILEID
# corpus.abspath('not_aggressive/0__10.txt')

# GET RAW CORPUS
# corpus.raw() or corpus.raw()[:10] to get the first 10 chars of the raw text

# GET RAW TEXT COMMENT given fileid
# corpus.raw([fileid])
# my_corpus.raw(my_corpus.fileids()[2])  # prints raw text of file index 2 of the whole corpus

# GET list of TOKENIZED SENTS for a COMMENT via index or fileid:
# sents = corpus.sents(corpus.fileids()[index])
# sents = corpus.sents([fileid])

"""
GET TOKENIZED PARAGRAPHS
# http://www.cs.cornell.edu/people/pabo/movie%2Dreview%2Ddata/
from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

reader = CategorizedPlaintextCorpusReader(
    r'mix20_rand700_tokens_cleaned/tokens',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileN)
print(fileP)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':  # compare strings with ==, not 'is'
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
    r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')

mr_test = CategorizedPlaintextCorpusReader(
    mydir_test,
    r'(?!\.).*\.txt',
    cat_pattern=r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')

stop = stopwords.words('english')
with open('.\\stopwords.txt') as f:
    stop = f.read().splitlines()

documents_train = [([w for w in mr_train.words(i)
                     if w.lower() not in stop and w.lower() not in string.punctuation],
                    i.split('/')[0])
                   for i in mr_train.fileids()
                   if os.path.getsize(os.path.join(mydir_train, i)) > 0]

documents_test = [([w for w in mr_test.words(i)
                    if w.lower() not in stop and w.lower() not in string.punctuation],
                   i.split('/')[0])
                  for i in mr_test.fileids()
                  if os.path.getsize(os.path.join(mydir_test, i)) > 0]

word_features_train = FreqDist(chain(*[i for i, j in documents_train]))
word_features_train = list(word_features_train.keys())[:1000]

word_features_test = FreqDist(chain(*[i for i, j in documents_test]))
word_features_test = list(word_features_test.keys())[:1000]

for w in word_features_train:
    if (w in stop):