def init_documents(f_re, cat_re):
    logging.debug("Reading corpus")
    reports = CategorizedPlaintextCorpusReader(corpus_dir, f_re,
                                               cat_pattern=cat_re,
                                               encoding='utf8')
    logging.debug("Found {} fileids".format(len(reports.fileids())))
    logging.debug("Found categories: {}".format(reports.categories()))
    logging.debug("Building docs")
    documents = [(tokenize(reports.words(i)), reports.categories(i)[0])
                 for i in reports.fileids()]
    return documents
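A hedged usage sketch for init_documents: `corpus_dir` and `tokenize()` are assumed to be defined elsewhere in the module, and the regex values below are illustrative only.

# Hypothetical usage; corpus_dir and tokenize() normally come from elsewhere in the module
corpus_dir = '/path/to/reports'                  # assumption for illustration
docs = init_documents(r'.*\.txt', r'(\w+)/.*')   # fileid regex, category regex
print(len(docs))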
def __init__(self, dir, doc):
    self.doc = doc
    self.dir = dir
    self.eng_stopw = stopwords.words('english')
    text_corpus = CategorizedPlaintextCorpusReader(
        './%s/' % self.dir,
        r'.*\.csv',              # read only the files that end in .csv
        cat_pattern=r'(\w+)/*',  # take everything after the directory as the category
        encoding='latin-1'
    )
    self.text = nltk.Text(text_corpus.words(self.doc))
def __init__(self, raiz_corpus):
    """Create a 'CategorizedPlaintextCorpusReader' object from the corpus
    root directory, where the documents live in subdirectories named after
    their category, whatever those categories may be --> raiz_corpus/{pos,neg,neu,...}.
    """
    reload(sys)  # Python 2 only: needed before sys.setdefaultencoding
    sys.setdefaultencoding("utf-8")

    self._raiz_corpus = raiz_corpus
    self._corpus = CategorizedPlaintextCorpusReader(raiz_corpus, r'.+\.txt',
                                                    cat_pattern=r'(\w+)/*',
                                                    encoding='utf-8')
    self._documentos = None
    self._palavras_frequentes = None
    self._todas_palavras = None
    self._featuresets = None
    self._train_set = None
    self._test_set = None
def display_features(num_features=1000, show_features=200,
                     filepath='classifiers/nltk_nb.pkl', verbose=True):
    '''Displays informative features from NHLCorpus'''
    stop_words = set(stopwords.words('english'))
    nhl = CategorizedPlaintextCorpusReader(root='data/NHLcorpus/',
                                           fileids=r'.*\.txt',
                                           cat_pattern=r'(\w+)/*')
    documents = []
    for category in nhl.categories():
        for fileid in nhl.fileids(category):
            documents.append(([re.sub(r'\W+', '', w.lower())
                               for w in nhl.words(fileid)
                               if w.lower() not in stop_words], category))

    all_words = nltk.FreqDist(re.sub(r'\W+', '', w.lower())
                              for w in nhl.words()
                              if w.lower() not in stop_words)
    word_features = [w[0] for w in all_words.most_common(num_features)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains({})'.format(word)] = word in document_words
        return features

    featuresets = [(document_features(d), c) for (d, c) in documents]
    nb_clf = nltk.NaiveBayesClassifier.train(featuresets)

    if verbose:
        nb_clf.show_most_informative_features(show_features)
        print('Accuracy on training data: {}'.format(
            nltk.classify.accuracy(nb_clf, featuresets)))

    save_classifier = open(filepath, 'wb')
    pickle.dump(nb_clf, save_classifier)
    save_classifier.close()
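A brief usage sketch: the function both trains and pickles the model, so it can be reloaded later from the same path (the parameter values below are illustrative; the filepath is the function's default).

# Hypothetical usage: train/inspect, then reload the pickled classifier
display_features(num_features=500, show_features=50, verbose=True)

import pickle
with open('classifiers/nltk_nb.pkl', 'rb') as f:   # the function's default filepath
    nb_clf = pickle.load(f)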
This code uses the meeting records (inputs) corpus.
'''
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

# working dir: UN/
mydir = 'corpus/meeting_records_final_categorized'
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                      cat_pattern=r'(intervention|soft_action)/.*',
                                      encoding='utf-8')

stop = stopwords.words('english')
documents = [([w for w in mr.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0])
             for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i, j in documents]))
# FreqDist.keys() is not frequency-ordered in recent NLTK; use most_common
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens) for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens) for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))  # .87 - ?!?!?!
classifier.show_most_informative_features(20)  # for the 100 most common word features
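To classify an unseen document with this model, the same boolean bag-of-words encoding has to be applied first; a minimal sketch (the sample sentence is made up):

# Encode a new document with the same word_features vocabulary, then classify
new_tokens = [w.lower() for w in nltk.word_tokenize(
    "The delegation proposed a concrete intervention.")]   # made-up sample text
new_feats = {i: (i in new_tokens) for i in word_features}
print(classifier.classify(new_feats))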
        if k in emotion_of_poems and v == max_value:
            emotion = k
            print(emotion)
            emotion_correct = {"emotion": emotion}
            return emotion_correct
    return emotion_correct


def classify(poem_text):
    return classifier.classify(features_of_poem(poem_text))


corpus_of_poems = CategorizedPlaintextCorpusReader('poems/', 'poems.*',
                                                   cat_file='cats.txt')


# Code for generating errors.
# Return misclassified poems in order to improve the algorithm.
def errors_em(poem_set):
    errors = []
    for (fileid, category) in poem_set:
        poem = corpus_of_poems.words(fileids=[fileid])
        emotion_correct = features_of_poem(poem)
        guess = classifier.classify(features_of_poem(poem))
        if guess != category:
            errors.append((category, guess, poem, emotion_correct['emotion']))
    return errors
from nltk.corpus import CategorizedPlaintextCorpusReader
import ProcessText

d1 = "judge people by what they say"
d1_processed = ProcessText.ProcessText.process(d1)
documents = [d1]

# Read documents
reader = CategorizedPlaintextCorpusReader(
    r'\Users\JoeDi\Desktop\MSC\Idioms Corpera', r'.*\.txt',
    cat_pattern=r'(\w+)/*')
for w in reader.fileids():
    wd = reader.raw(w)
    documents.append(w + " " + wd)

print("Documents in the collection are: ")
print(documents)
print("\n")

from sklearn.feature_extraction.text import TfidfVectorizer

# Build a TF-IDF matrix for each description
tfidf = TfidfVectorizer().fit_transform(documents)
print("Tf-idf weightings are: ")
print(tfidf)
print("\n")
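A possible next step (a sketch, not from the original): rank the corpus documents by cosine similarity to the query document d1, which is row 0 of the TF-IDF matrix. Since TfidfVectorizer L2-normalizes rows by default, a linear kernel gives cosine similarity directly.

from sklearn.metrics.pairwise import linear_kernel

# Cosine similarity of d1 (row 0) against every document, highest first
cosine_similarities = linear_kernel(tfidf[0:1], tfidf).flatten()
related_docs_indices = cosine_similarities.argsort()[::-1][1:6]   # skip d1 itself
print("Most similar documents:", related_docs_indices)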
print(positive_greg)
print(negative_greg)

positive_consolidated_list = list(pos_list) + positive_greg
negative_consolidated_list = list(neg_list) + negative_greg
print(positive_consolidated_list)
print(negative_consolidated_list)

init_notebook_mode(connected=True)
cf.set_config_file(offline=True, world_readable=True, theme='ggplot')

#%%
corpus_root = "/Users/LENOVO USER/Desktop/FedTranscript1"
data_m = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin1')
data_fileids = data_m.fileids()

#%%
def corpus_Stats(crp):
    print('Total number of files: ' + str(len(crp.fileids())))
    print('Number of paragraphs: ' + str(len(crp.paras())))
    print('Number of sentences: ' + str(len(crp.sents())))
    print('Number of words: ' + str(len(crp.words())))

#corpus_Stats(data_m)
#print('\n' + 'First file: ' + data_fileids[0])
#print('Last file: ' + data_fileids[-1])
sjar = '/Users/nischikata/PycharmProjects/JabRef-2.11.1.jar'

from nltk.corpus import stopwords
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk import word_tokenize
from nltk import TreebankWordTokenizer
import nltk.data

# PLAINTEXT CORPUS READER
# http://www.nltk.org/_modules/nltk/corpus/reader/plaintext.html#CategorizedPlaintextCorpusReader
# Important: the TreebankWordTokenizer separates words like "don't" into "do" and "n't",
# so the main verb is correctly identified. For the Naive Bayes it may be better,
# though, to use WordPunctTokenizer - it is the default, so just omit the
# word_tokenizer param.
corpus = CategorizedPlaintextCorpusReader(
    '.', r'(?!\.).*\.txt',
    word_tokenizer=TreebankWordTokenizer(),
    cat_pattern=r'(aggressive|not_aggressive)/.*',
    encoding='utf8')

# Getting RAW SENTENCES from a RAW comment, see: http://stackoverflow.com/a/4576110/4866678
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


# returns a list of raw sentences
def get_raw_sentences(fileid):  # works
    data = corpus.raw(fileid)
    return tokenizer.tokenize(data)


def get_raw_paragraph( fileid
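A quick demonstration of the tokenizer difference described in the comments above (this is not from the original file, and the sample sentence is made up):

from nltk.tokenize import TreebankWordTokenizer, WordPunctTokenizer

sample = "I don't like this."   # made-up sample sentence
print(TreebankWordTokenizer().tokenize(sample))   # ['I', 'do', "n't", 'like', 'this', '.']
print(WordPunctTokenizer().tokenize(sample))      # ['I', 'don', "'", 't', 'like', 'this', '.']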
import collections
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.classify import DecisionTreeClassifier
from nltk.corpus import CategorizedPlaintextCorpusReader
from sklearn import svm
from sklearn.svm import LinearSVC
import string
from tabulate import tabulate

corpus_root1 = '/Users/tianhan/Dropbox/Advanced_big_data_Project/aclImdb/train'
train = CategorizedPlaintextCorpusReader(corpus_root1, r'(pos|neg)/.*\.txt',
                                         cat_pattern=r'(pos|neg)/.*\.txt')
corpus_root2 = '/Users/tianhan/Dropbox/Advanced_big_data_Project/aclImdb/test'
test = CategorizedPlaintextCorpusReader(corpus_root2, r'(pos|neg)/.*\.txt',
                                        cat_pattern=r'(pos|neg)/.*\.txt')

def evaluate_classifier_Naive(featx):
    train_negids = train.fileids('neg')
    train_posids = train.fileids('pos')
    test_negids = test.fileids('neg')
    test_posids = test.fileids('pos')

    train_negfeats = [(featx(train.words(fileids=[f])), 'neg') for f in train_negids]
    train_posfeats = [(featx(train.words(fileids=[f])), 'pos') for f in train_posids]
    test_negfeats = [(featx(test.words(fileids=[f])), 'neg') for f in test_negids]
    test_posfeats = [(featx(test.words(fileids=[f])), 'pos') for f in test_posids]

    trainfeats = train_negfeats + train_posfeats
    testfeats = test_negfeats + test_posfeats

    Naive_classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets_Naive = collections.defaultdict(set)
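    # Sketch of the usual continuation (assumed, not part of the original snippet):
    # fill the reference/test sets and report precision and recall per class.
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        observed = Naive_classifier.classify(feats)
        testsets_Naive[observed].add(i)

    print('pos precision:', nltk.metrics.precision(refsets['pos'], testsets_Naive['pos']))
    print('pos recall:', nltk.metrics.recall(refsets['pos'], testsets_Naive['pos']))
    print('neg precision:', nltk.metrics.precision(refsets['neg'], testsets_Naive['neg']))
    print('neg recall:', nltk.metrics.recall(refsets['neg'], testsets_Naive['neg']))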
# Chapter 1: Corpora and WordNet - downloading, loading, and accessing an external corpus
from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

# Read the corpus
reader = CategorizedPlaintextCorpusReader(r'/workspace/NLP_python/tokens',
                                          r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

# Print sample documents
# Lists of samples for the pos and neg categories
posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

# Pick a random file from each of the pos and neg categories
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

# Print the randomly chosen files one sentence per line
for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
def create_corpus():
    poem_corpus = CategorizedPlaintextCorpusReader('../poems/', 'poems_.*',
                                                   cat_file='cats.txt')
    return poem_corpus
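With the cat_file variant used here, NLTK reads the file-to-category mapping from cats.txt inside the corpus root, one whitespace-delimited "<fileid> <category> ..." line per file; a hypothetical sketch:

# cats.txt (illustrative contents, not from the source):
#   poems_001.txt joy
#   poems_002.txt sadness
corpus = create_corpus()
print(corpus.categories())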
    os.chdir(directory)
    with open(fname, 'w') as f:
        f.write(text)

doc_start = {}
doc_start[0] = "Staff Review of the Economic Situation"
doc_start[1] = re.compile('The information (reviewed|received|provided)')
doc_start[2] = "The Committee then turned to a discussion of the economic outlook"
doc_start[3] = re.compile('The information (reviewed|received|provided)')

doc_end = {}
doc_end[0] = re.compile('(At the conclusion of) (this|the) (discussion|meeting)')
doc_end[1] = re.compile('(?i)The Committee voted to authorize')
doc_end[2] = re.compile('(?i)The vote encompassed approval of')

if __name__ == '__main__':
    corpus_root = '/Users/aaroncgw/Google Drive/fednlp/data/minutes/'
    data_m = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                              cat_pattern=r'(\w+)/*')
    data_fileids = data_m.fileids()
    for f in data_fileids:
        year, fname = f.split('/')
        cropped_text = crop_text(data_m.raw(f), doc_start, doc_end)
        saveFile(fname, year, cropped_text)
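The crop_text helper is not shown in this snippet; a minimal sketch of what such a helper might do, assuming it keeps the text between the first matching start marker and the first end marker found after it (purely illustrative, not the author's implementation):

def crop_text_sketch(text, doc_start, doc_end):
    # Find the earliest start marker (markers are strings or compiled regexes)
    start_idx = 0
    for marker in doc_start.values():
        if isinstance(marker, str):
            pos = text.find(marker)
            if pos >= 0:
                start_idx = pos
                break
        else:
            m = marker.search(text)
            if m:
                start_idx = m.start()
                break
    # Find the first end marker after the start
    end_idx = len(text)
    for marker in doc_end.values():
        m = marker.search(text, start_idx)
        if m:
            end_idx = m.start()
            break
    return text[start_idx:end_idx]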
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'D:\LEARNING\MISC\DataSet\movieCorpus\review_polarity\txt_sentoken',
    r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'/home/smadyastha/Projects/PythonCheck/Dataset/Reviews/tokens',
    r'.*\.txt', cat_pattern=r'(\w+)/*')

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

# /home/smadyastha/Projects/PythonCheck/Dataset/Reviews
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))

# Uni
machinename = 'maj27'
j = 0
for i in range(10):
    dataset = str(i + 1)
    #mydir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Classified News/Training'
    train_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Train+val'
    test_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Testing'
    #test_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/NA is negative old/Classified News/Criteria ' + dataset + ''
    preprocessed = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/data_2.p'

    train_Corpus = CategorizedPlaintextCorpusReader(train_dir, r'(?!\.).*\.txt',
                                                    cat_pattern=r'(\w+)/*')
    train_documents = [(list(train_Corpus.words(fileid)), category)
                       for category in train_Corpus.categories()
                       for fileid in train_Corpus.fileids(category)]
    only_docs = [' '.join(doc[:1000]) for (doc, category) in train_documents]
    only_docs = [' '.join(normalize_text(document, lemmatize=True, remove_stop=None))
                 for document in only_docs]
    #######################################################################
    train_labels = [category for (doc, category) in train_documents]
    train_binary_labels = [1 if i == 'pos' else 0 for i in train_labels]
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'/Users/dechamoungsri/NLP_Learning/NLP_tutotial/mix20_rand700_tokens_cleaned/tokens/',
    r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
# Downloading an external corpus, loading it, and accessing it
from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint  # random

# The first line is where you read the corpus by calling
# the CategorizedPlaintextCorpusReader constructor.
# The three arguments, from left to right, are the absolute path
# to the folder containing the corpus on your computer, all sample
# document names from the txt_sentoken folder, and the categories
# in the given corpus (in our case, 'pos' and 'neg').
reader = CategorizedPlaintextCorpusReader(r'\Users\JoeDi\Desktop\python projs\tokens',
                                          r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

# Now that we've made sure that the corpus is loaded correctly, let's
# get on with accessing any one of the sample documents from both categories.
# For that, let's first create two lists, containing samples of the
# categories 'pos' and 'neg', respectively.
# Add the following two lines of code:
posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

# The next two lines select a random file, each from the set of positive
# and negative category reviews. The last two lines just print the filenames.
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)
def classify_emails():
    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()
    mydir = '/home/ubuntu/nltk_data/corpora/gmail'
    all_words = []
    filtered_words = []
    removedPuncuations_words = []
    lematized_words = []
    test_filter = []

    mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                          cat_pattern=r'(hotel|flight|other)/.*',
                                          encoding='latin-1')
    stop = stopwords.words('english')
    documents = [([w for w in mr.words(i)
                   if w.lower() not in stop and w.lower() not in string.punctuation],
                  i.split('/')[0])
                 for i in mr.fileids()]

    word_features = FreqDist(chain(*[i for i, j in documents]))
    # keys() is not frequency-ordered in recent NLTK; take the 100 most common
    word_features = [w for w, _ in word_features.most_common(100)]

    def word_feats(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)
        return dict(features)

    negids = mr.fileids('hotel')
    posids = mr.fileids('flight')
    neutralids = mr.fileids('other')

    negfeats = [(word_feats(mr.words(fileids=[f])), 'hotel') for f in negids]
    posfeats = [(word_feats(mr.words(fileids=[f])), 'flight') for f in posids]
    neutralfeats = [(word_feats(mr.words(fileids=[f])), 'other') for f in neutralids]

    # Integer division so the cutoffs can be used as slice indices
    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    neutralcutoff = len(neutralfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] + neutralfeats[:neutralcutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] + neutralfeats[neutralcutoff:]

    classifier = nltk.NaiveBayesClassifier.train(trainfeats)
    print("Classifier accuracy percent:", nltk.classify.accuracy(classifier, testfeats) * 100)
    print('accuracy:', nltk.classify.util.accuracy(classifier, testfeats) * 100)

    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)
    test_sent_features = {word.lower(): (word in tokens) for word in mr.words()}

    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)
    tri_tokens = trigrams(tokens)

    cities = []
    matchedIndex = []
    tokenized = []
    addresses = []
    district = ['Akarawita', 'Angamuwa', 'Avissawella', 'Batawala', 'Battaramulla',
                'Batugampola', 'Bope', 'Boralesgamuwa', 'Borella', 'Dedigamuwa',
                'Dehiwala', 'Deltara', 'Habarakada', 'Handapangoda', 'Hanwella',
                'Hewainna', 'Hiripitya', 'Hokandara', 'Homagama', 'Horagala',
                'Kaduwela', 'Kahawala', 'Kalatuwawa', 'Madapatha', 'Maharagama',
                'Malabe', 'Meegoda', 'Padukka', 'Pannipitiya', 'Piliyandala',
                'Pitipana', 'Homagama', 'Polgasowita', 'Puwakpitiya', 'Ranala',
                'Siddamulla', 'Slave Island', 'Sri Jayawardenapura', 'Talawatugoda',
                'Tummodara', 'Waga', 'Watareka', 'Dickwella']

    for i in tokens:
        tokenized.append(i)

    pattern = re.compile(r"\d+")
    for i in tokenized:
        if pattern.match(i):
            matchedIndex.append(tokenized.index(i))
            print("match" + i)
            print(tokenized.index(i))
        else:
            print("not match")

    for t in tokenized:
        for i in district:
            if t.lower() == i.lower():
                cities.append(tokenized.index(t))

    distance = 200
    start = 0
    end = 0
    for t in cities:
        for i in matchedIndex:
            dis = t - i
            if dis <= distance and dis > 0:
                distance = dis
                start = t
                end = i
            else:
                print("higher")

    address = ""
    for token in range(end, start + 1):
        address += tokenized[token]
    print(address)
    addresses.append(address)

    for address in addresses:
        try:
            search = geocoder.get(address)
        except ValueError:
            continue
        first_result = search[0]
        output = [first_result.geometry.location.lat, first_result.geometry.location.lng]
        stri = ','.join(map(str, output))
        return stri
def read_corpus(root_dir):
    return CategorizedPlaintextCorpusReader(root_dir, FILE_PATTERN,
                                            cat_pattern=CAT_PATTERN)
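A hedged usage sketch; FILE_PATTERN and CAT_PATTERN are module-level constants not shown here, so the values below are assumptions:

# Illustrative values for the constants (assumptions, not from the source)
FILE_PATTERN = r'.*\.txt'
CAT_PATTERN = r'(\w+)/.*'

corpus = read_corpus('/path/to/corpus')   # hypothetical root directory
print(corpus.categories())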
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.corpus import brown

# Open the documents under the given path.
# Arguments:
# 1. absolute path to the documents
# 2. document type / extension (*.txt)
# 3. pattern for the folders that define the categories
# All arguments are regular expressions.
leitor = CategorizedPlaintextCorpusReader(
    '../Dados/mix20_rand700_tokens_cleaned/tokens/',
    '.*.txt',
    cat_pattern=r'(\w+)/*')

# Check what was loaded
print(leitor.categories())
print(leitor.fileids())

# Split the corpus by category
posFiles = leitor.fileids(categories='pos')
negFiles = leitor.fileids(categories='neg')
print('Arquivos pos:', posFiles)
print('Arquivos neg:', negFiles)

# Load the first files of each category
arqP = posFiles[0]
arqN = negFiles[1]
print("ArqP: ", arqP)
# Imports
from nltk.corpus import CategorizedPlaintextCorpusReader

# Read the corpus
reader = CategorizedPlaintextCorpusReader(r'C:\Users\hyery\Python-NLP\chaper01\Reviews\tokens',
                                          r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

# Build lists containing the samples of each category
posFiles = reader.fileids(categories='pos')  # fileids() takes a category name as an argument
negFiles = reader.fileids(categories='neg')

# Pick a random file from each list
from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

# Access the chosen files and print them one sentence per line
for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
Example of reading a report corpus and generating a concordance and bi-grams

Create an NLTK plaintext corpus using `examples/nltk_create_report_corpus.py`
"""
from pprint import pprint
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader, stopwords
import logging

CORPUS_ROOT = "/Users/derek/Data/RADCAT/corpus"

if __name__ == "__main__":
    # For reports with the category in the filename, e.g. abc_def+3.txt
    reports = CategorizedPlaintextCorpusReader(CORPUS_ROOT, '.*',
                                               cat_pattern=r'.*\+(.+)\.txt')

    logging.basicConfig(level=logging.DEBUG)
    logging.debug(reports.categories())

    toks = [w.lower() for w in reports.words()
            if w.isalpha() and w not in stopwords.words('english')]

    all = nltk.Text(toks)
    all.concordance('hemodynamically')  # concordance() prints its own output

    # Create your bi-grams and n-grams
    # bgs = nltk.bigrams(toks)
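    # A minimal sketch of the commented-out bi-gram step above (not from the
    # original file): count the bi-grams and show the most frequent ones.
    bgs = nltk.bigrams(toks)
    bg_freq = nltk.FreqDist(bgs)
    pprint(bg_freq.most_common(20))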
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 11 08:43:52 2021

@author: paulogamero
"""
# ASSIGNMENT: EXERCISE 1 - PART 01
# AUTHOR: Paulo Gamero

from nltk.corpus import CategorizedPlaintextCorpusReader

d = CategorizedPlaintextCorpusReader(
    r'C:\Users\Usuario\Dropbox\Pos\Pós DataScience\4 - Análise de textos com R e Python\Dados\mix20_rand700_tokens_cleaned\tokens',
    r'.*.txt',
    cat_pattern=r'(\w+)/*',
    encoding='iso8859-1')

for p in d.words('pos/cv003_tok-8338.txt'):
    print(p + ' ', end='')

for n in d.words('neg/cv002_tok-3321.txt'):
    print(n + ' ', end='')
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'
mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*',
                                      encoding='ascii')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i)
               if w.lower() not in stop and w.lower() not in string.punctuation],
              i.split('/')[0])
             for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i, j in documents]))
# FreqDist.keys() is not frequency-ordered in recent NLTK; use most_common
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens) for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens) for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]
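A natural next step (a sketch, mirroring the fuller meeting-records example earlier in this section): train and score the classifier on the held-out split.

# Train on the first 90% and evaluate on the remaining 10%
classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))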
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk
import sys
import os

mydir_train = '.\\Docs-txt\\train'
mydir_test = '.\\Docs-txt\\test'
featureVector_train = []
featureVector_test = []

mr_train = CategorizedPlaintextCorpusReader(
    mydir_train, r'(?!\.).*\.txt',
    cat_pattern=r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')
mr_test = CategorizedPlaintextCorpusReader(
    mydir_test, r'(?!\.).*\.txt',
    cat_pattern=r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')

stop = stopwords.words('english')
with open('.\\stopwords.txt') as f:
    stop = f.read().splitlines()

documents_train = [([w for w in mr_train.words(i)
# http://www.cs.cornell.edu/people/pabo/movie%2Dreview%2Ddata/
from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

reader = CategorizedPlaintextCorpusReader(
    r'mix20_rand700_tokens_cleaned/tokens', r'.*\.txt',
    cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileN)
print(fileP)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
def load_headline_corpus(with_dates=True, force_get=False, verbose=False):
    # set up paths
    if with_dates:
        zip_file_name = DATE_CORPUS_FILENAME
    else:
        zip_file_name = CAT_CORPUS_FILENAME

    # github download url
    url = 'https://github.com/tacticsiege/TacticCorpora/raw/master/headlines/archive/' + zip_file_name

    env_dir = get_env_dir()
    # archive paths
    archive_dir = env_dir + 'corpus\\archive\\'
    archive_file_name = archive_dir + zip_file_name

    # extracted corpus paths
    corpus_root = 'dated' if with_dates else 'categorized'
    saved_dir = env_dir + 'corpus\\' + corpus_root

    # check if the data is downloaded
    downloaded = os.path.exists(archive_file_name)

    # download the data from github
    if not downloaded:
        pathlib.Path(archive_dir).mkdir(parents=True, exist_ok=True)
        if verbose:
            print('Downloading:', url, '...')
        with req.urlopen(url) as d, open(archive_file_name, 'wb') as tmpFile:
            data = d.read()
            tmpFile.write(data)
        if verbose:
            print('Complete, saved to:', archive_file_name)

    # extract the data if the root directory doesn't exist
    extracted = os.path.exists(saved_dir)
    if not extracted:
        pathlib.Path(saved_dir).mkdir(parents=True, exist_ok=True)
        if verbose:
            print('Opening:', archive_file_name)
        archive = zipfile.ZipFile(archive_file_name)
        archive.extractall(saved_dir)
        archive.close()
        if verbose:
            print('Extracted to:', saved_dir)

    file_pattern = r'.*_corpus\.txt'
    if with_dates:
        cat_pattern = r'(.*)/'
        # HACK: fix this in archive later
        saved_dir = saved_dir + '\\2017_08_22\\corpus'
        if verbose:
            print('Loading corpus from:', saved_dir)
        corpus = CategorizedDatedCorpusReader(saved_dir,
                                              file_pattern=file_pattern,
                                              cat_pattern=cat_pattern)
    else:
        cat_pattern = ".*_(.*)_corpus.txt"
        if verbose:
            print('Loading corpus from:', saved_dir)
        # CategorizedPlaintextCorpusReader takes the fileid regex as its second
        # positional argument (named `fileids`); it has no `file_pattern` kwarg
        corpus = CategorizedPlaintextCorpusReader(saved_dir, file_pattern,
                                                  cat_pattern=cat_pattern)

    if corpus is not None and verbose:
        print('Corpus loaded.')
    return corpus
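A hedged usage sketch for the loader; the constants and helpers it relies on (DATE_CORPUS_FILENAME, CAT_CORPUS_FILENAME, get_env_dir, CategorizedDatedCorpusReader) are defined elsewhere in the module.

# Hypothetical usage of the categorized (undated) corpus
corpus = load_headline_corpus(with_dates=False, verbose=True)
print(corpus.categories())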