def _extract_meta_data(self, corpus_path):
    self.db = CategorizedPlaintextCorpusReader(corpus_path, r'.*\.txt',
                                               cat_pattern=r'(\w+)/*')
    new_corpus = Corpus(self.db, corpus_path, self.limit_memory, self.verbose)
    return new_corpus.get_meta_features()
def load_documents(self, path):
    docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
    # Initialise per-category frequency tables before streaming the documents.
    for cat in docs.categories():
        self.cat_gram_freq[cat] = {}
        self.cat_word_freq[cat] = {}
    # Lazily yield one (category, word list) pair per file.
    return ((category, list(docs.words(fileid)))
            for category in docs.categories()
            for fileid in docs.fileids(category))
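# A minimal call sketch for the generator version of load_documents() above.
# The driver object and the 'corpus/' path are assumptions; only the two
# frequency dicts the method initialises come from the snippet itself.
from types import SimpleNamespace

state = SimpleNamespace(cat_gram_freq={}, cat_word_freq={})  # hypothetical owner object
for category, words in load_documents(state, 'corpus/'):     # placeholder corpus path
    print(category, len(words))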
def predict(self, test_path):
    documents, self.y_test = self._read_corpus(
        CategorizedPlaintextCorpusReader(test_path, r'.*\.txt', cat_pattern=r'(\w+)/*'),
        test_path)
    self.X_test = self.representation.transform(documents)
    return self.automl.predict(self.X_test)
def load_documents(self, path):
    docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
    print(docs.categories())
    # Pair every document's word list with its category, then shuffle.
    documents = [(list(docs.words(fileid)), category)
                 for category in docs.categories()
                 for fileid in docs.fileids(category)]
    random.shuffle(documents)
    return documents
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

DOC_PATTERN = r'[\w_\s]+/[\w\s\d\-]+\.TXT'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('ENGLISH', DOC_PATTERN, cat_pattern=CAT_PATTERN)
print(corpus.categories())
print(corpus.fileids()[100:110])
print(corpus.words())
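# A short follow-up sketch: the same reader can be restricted to a single
# category. 'fiction' is a hypothetical label; substitute any value returned
# by corpus.categories() above.
print(corpus.fileids(categories=['fiction']))
print(corpus.words(categories=['fiction'])[:20])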
parser = argparse.ArgumentParser()
parser.add_argument('directory', help="the bill directory")
parser.add_argument('--bigrams', action='store_true', dest='bigrams', default=False,
                    help='use bigrams')
args = parser.parse_args()

if args.bigrams:
    featurizer = bigram_feats
else:
    featurizer = word_feats

corpus = CategorizedPlaintextCorpusReader(root=args.directory, fileids=r".*/.*\.txt",
                                          cat_pattern=r'(dem|rep)/')
best_words = most_informative_words(corpus)

dem_ids = corpus.fileids(categories=['dem'])
rep_ids = corpus.fileids(categories=['rep'])
dem_feats = [(featurizer(corpus.words(fileids=[f])), 'dem') for f in dem_ids]
rep_feats = [(featurizer(corpus.words(fileids=[f])), 'rep') for f in rep_ids]

# Use integer division so the cutoffs can serve as list indices.
dem_cutoff = len(dem_feats) * 5 // 6
rep_cutoff = len(rep_feats) * 5 // 6
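# A hedged sketch of how the cutoffs above would typically feed an NLTK
# Naive Bayes classifier. The 5/6 train split and nltk.NaiveBayesClassifier
# are assumptions; the original code continues below with
# show_most_informative_features().
import nltk

train_feats = dem_feats[:dem_cutoff] + rep_feats[:rep_cutoff]   # assumed training split
test_feats = dem_feats[dem_cutoff:] + rep_feats[rep_cutoff:]    # assumed held-out split

classifier = nltk.NaiveBayesClassifier.train(train_feats)
print('accuracy:', nltk.classify.accuracy(classifier, test_feats))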
classifier.show_most_informative_features(5)

# Document Classification
# Load Libraries
import os
import random
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# Read the dataset into the categorized corpus
# Directory of the corpus
corpusdir = 'corpus/'
review_corpus = CategorizedPlaintextCorpusReader(corpusdir, r'.*\.txt',
                                                 cat_pattern=r'\d+_(\w+)\.txt')

# List of documents (fileid) and category (pos/neg)
documents = [(list(review_corpus.words(fileid)), category)
             for category in review_corpus.categories()
             for fileid in review_corpus.fileids(category)]
random.shuffle(documents)

for category in review_corpus.categories():
    print(category)

type(review_corpus)
len(documents)

# Compute word frequency
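# A minimal sketch of the word-frequency step announced by the comment above,
# assuming nltk.FreqDist over every token in review_corpus; the 2000-word
# vocabulary size is an arbitrary choice, not part of the original snippet.
import nltk

all_words = nltk.FreqDist(w.lower() for w in review_corpus.words())
word_features = [w for w, _ in all_words.most_common(2000)]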
# fileids_ = corpus_dir + '/rt-polarity*'
corpus_dir = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/data/rt-polaritydata'
cat_map_ = {'rt-polarity.pos': ['pos'],
            'rt-polarity.neg': ['neg']}

corpus_treatment(corpus_dir)
encoded_corpus_dir = os.path.join(corpus_dir, 'encoded_data')
fileids_ = '^rt-polarity.*'

categorized_plaintext_corpusreader = CategorizedPlaintextCorpusReader(
    root=encoded_corpus_dir,
    cat_map=cat_map_,
    fileids=fileids_,
)

pos_words = categorized_plaintext_corpusreader.words(categories=['pos'])
pos_sents = categorized_plaintext_corpusreader.sents(categories=['pos'])
pos_paras = categorized_plaintext_corpusreader.paras(categories=['pos'])
neg_words = categorized_plaintext_corpusreader.words(categories=['neg'])
neg_sents = categorized_plaintext_corpusreader.sents(categories=['neg'])
neg_paras = categorized_plaintext_corpusreader.paras(categories=['neg'])

# NOTE: para views are not working, to be looked into later
# classification
train = pos_words
                    label=target_name)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title('PCA of BULATS dataset')
    plt.show()
    return model


if __name__ == "__main__":
    PATH = "model.pickle"

    # Loading speech features
    speech = pd.read_csv("/ExamplePath.csv")

    if not os.path.exists(PATH):
        nli = CategorizedPlaintextCorpusReader(CORPUS, DOC_PATTERN, cat_pattern=CAT_PATTERN)

        # since `nli` already has all the information (text and ids)
        # you don't need to iterate over it multiple times so
        # construct `X` and `y` in one go.
        X = []
        y = []
        for fileid in nli.fileids():
            X.append({
                'text': nli.raw(fileid),
                'id': fileid.split('/')[-1].split('.')[0]
            })
            y.append(nli.categories(fileid)[0])

        clf = PCA(n_components=2)
        model = build_and_evaluate(X, y, clf, speech)
import os
import re
import csv
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

srm_data_dir = '/home/mayank/work/projects/SRM/Data'
training_file_path = os.path.join(srm_data_dir, 'TrainingData.csv')
training_file = open(training_file_path, 'r')
root_dir = os.path.join(srm_data_dir, 'sub_data')

# normal_reader = PlaintextCorpusReader(root=root_dir,
#                                       fileids=['Financial.csv'])

# cat_map maps each fileid to its list of category labels.
cat_map_ = {
    'Compliance.csv': ['Compliance'],
    'Financial.csv': ['Financial'],
    'Operational.csv': ['Operational'],
    'Strategic.csv': ['Strategic'],
}

cat_reader = CategorizedPlaintextCorpusReader(root=root_dir,
                                              fileids=r'.*\.csv',
                                              cat_map=cat_map_)
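# A short hedged follow-up showing how the reader above would be queried per
# category; these are standard CategorizedPlaintextCorpusReader calls and use
# only the names defined above.
print(cat_reader.categories())                                # ['Compliance', 'Financial', ...]
financial_words = cat_reader.words(categories=['Financial'])
print(financial_words[:20])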
import os
import re
import string

from sklearn import metrics
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
from poslemma import LemmatizationWithPOSTagger
from ordinal_classification import OrdinalSVC
from sklearn.model_selection import StratifiedKFold

base_dir = '/Users/ja/Documents/www'
corpus_name = 'corpus'
min_topics = 10
max_topics = 60

corpus = CategorizedPlaintextCorpusReader(
    os.path.join(base_dir, corpus_name),
    fileids=r'(?!\.).*\.txt',
    cat_pattern=r'(\w+)/*')


def clean(doc):
    lemma = LemmatizationWithPOSTagger()
    stop = set(stopwords.words('english') + stopwords.words('numbers'))
    exclude = set(string.punctuation)
    wordchars = set(string.ascii_letters)
    wordchars |= set(string.digits)

    def contains_any(str, set):
        """Check whether 'str' contains ANY of the chars in 'set'"""
        return 1 in [c in str for c in set]

    def is_number(s):
        try:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader


def create_corpus(directory):
    word_tokenize = RegexpTokenizer(r'\w+(?:-\w+)*(?:[?!.,:])*')
    sent_tokenize = nltk.data.load('tokenizers/punkt/french.pickle')
    translation = str.maketrans("", "", ",.?!:")
    corpus = CategorizedPlaintextCorpusReader(directory, r"^[^.]*$",
                                              cat_file='cats.txt',
                                              encoding="iso-8859-1",
                                              word_tokenizer=word_tokenize,
                                              sent_tokenizer=sent_tokenize)
    return corpus
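# Hypothetical usage of create_corpus(); 'allocine/' is a placeholder directory
# that must contain the dot-free document files and the 'cats.txt' category
# file the function expects.
corpus = create_corpus('allocine/')
print(corpus.categories())
print(corpus.fileids()[:5])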
        if type(i) == Tree:
            current_chunk.append(" ".join([token for token, pos in i.leaves()]))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue
    return continuous_chunk


# create a corpus from the txt files given, with a file of categories to apply to the texts
corpus = CategorizedPlaintextCorpusReader(
    'corpus/', r'.*\.txt', cat_file="../textcats.prn")

"""
fileid="nytimes-2017.txt"
raw = corpus.raw(fileid)
raw = raw.replace("N.H.S.", "NHS")
words = word_tokenize(raw)
words = corpus.words(fileid)
clean0 = [word for word in words if word not in stoplist]
"""

bloblist = corpus.fileids()
# bloblist = corpus.fileids(categories='2016')
M = len(bloblist)

# Look at the categories
corpus.categories()
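# The bloblist/M pair above is the usual setup for a per-document tf-idf pass.
# A hedged sketch of that computation using only the reader built above; the
# log-based idf formula and the example term "hospital" are assumptions, not
# part of the original snippet.
from math import log

def tfidf(term, fileid):
    # Term frequency within one document.
    words = [w.lower() for w in corpus.words(fileid)]
    tf = words.count(term.lower()) / len(words)
    # Document frequency across the M documents in bloblist.
    df = sum(1 for f in bloblist
             if term.lower() in (w.lower() for w in corpus.words(f)))
    return tf * log(M / (1 + df))

print(tfidf("hospital", bloblist[0]))  # arbitrary example term and document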
import os
import nltk
import re

from math import log
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.text import Text
from nltk import ConditionalFreqDist, FreqDist
from nltk.stem.snowball import FrenchStemmer

stemmer = FrenchStemmer()
stopwordsdir = "C:/Projects/Allocine/stopwords/used"
stopwords = []
root = "C:/Projects/Allocine/corpus2/"
cats = ['cine', 'autre', 'critique', 'critique_a']

reader = CategorizedPlaintextCorpusReader(root, r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin-1')

# Build one Text view over the whole corpus and one per category.
text_all = Text(reader.words())
text_cine = Text(reader.words(categories='cine'))
text_autre = Text(reader.words(categories='autre'))
text_critique = Text(reader.words(categories='critique'))
text_critique_a = Text(reader.words(categories='critique_a'))
texts_list = [text_cine, text_autre, text_critique, text_critique_a]


def remove_accents(text):
    text = re.sub("[àâäÄÂÀ]", "a", text)
    text = re.sub("[éèêëÈÊËÉ]", "e", text)
    text = re.sub("[ïîìÏÎÌ]", "i", text)
    text = re.sub("[öôòÖÔÒ]", "o", text)