def __init__(self, root, **kwargs): """ Initialize a PLoS reader with a specific corpus. Corpus information is contained in 'root/corpus_info.json' file. The @type root: string @param root: The directory path to the corpus directory. """ self._root = root fp = open( '%s/corpus_info.json' % (root), 'r' ) self._corpus_info = info = json.load(fp) fp.close() # doc_part is specific to PLoS and research article in general. # 'abstract' and 'body' are currently supported. # The corpus contains seperate text for each, but the # reader is initialized to readi only one. if 'doc_part' in kwargs: self._doc_part = doc_part = kwargs['doc_part'] del kwargs['doc_part'] else: self._doc_part = doc_part = 'body' if 'fileids' not in kwargs: fileids = [ doi2fn(d, doc_part) for d in info['d2c'].keys() ] else: fileids = kwargs['fileids'] # cat_map f -> [ c1, c2, ...] # The fileids depend on what the doc_part is ('body', 'abstract') cat_map = {} for d,cat in info['d2c'].iteritems(): cat_map[doi2fn(d, doc_part)] = cat kwargs['cat_map'] = cat_map # Subclass of Categorized Plaintext Corpus Reader CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
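# A minimal, self-contained sketch of the cat_map mechanism the reader above
# relies on: an explicit {fileid: [categories]} dict handed straight to
# CategorizedPlaintextCorpusReader. The directory and file names below are
# made up for illustration.
import os
import tempfile
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

demo_root = tempfile.mkdtemp()
for name, text in [('a1_body.txt', 'Cells divide.'),
                   ('a2_body.txt', 'Stars form.')]:
    with open(os.path.join(demo_root, name), 'w') as f:
        f.write(text)

demo_map = {'a1_body.txt': ['biology'], 'a2_body.txt': ['astronomy']}
demo_reader = CategorizedPlaintextCorpusReader(demo_root, r'.*\.txt',
                                               cat_map=demo_map)
print(demo_reader.categories())                     # ['astronomy', 'biology']
print(demo_reader.fileids(categories=['biology']))  # ['a1_body.txt']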
def _extract_meta_data(self, corpus_path):
    self.db = CategorizedPlaintextCorpusReader(corpus_path, r'.*\.txt',
                                               cat_pattern=r'(\w+)/*')
    new_corpus = Corpus(self.db, corpus_path, self.limit_memory, self.verbose)
    return new_corpus.get_meta_features()
def __init__(self, input_folder_name, doc_pattern, categ_pattern, encoding='utf-8'):
    CategorizedPlaintextCorpusReader.__init__(self, input_folder_name, doc_pattern,
                                              cat_pattern=categ_pattern)
    self.input_folder_name = input_folder_name
    self.encoding = encoding
    # Reader restricted to files that sit directly in the corpus root
    # (no '/' in the fileid); doc_pattern[-3:] is the file extension.
    self.root_reader = PlaintextCorpusReader(input_folder_name,
                                             fileids=r'[^\/]*\.' + doc_pattern[-3:])
    # self.root_ids = [os.path.join(input_folder_name, item)
    #                  for item in self.root_reader.fileids()]
    self.root_ids = list(self.root_reader.fileids())
def predict(self, test_path):
    documents, self.y_test = self._read_corpus(
        CategorizedPlaintextCorpusReader(test_path, r'.*\.txt',
                                         cat_pattern=r'(\w+)/*'),
        test_path)
    self.X_test = self.representation.transform(documents)
    return self.automl.predict(self.X_test)
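# The helper _read_corpus is not shown in this snippet. A plausible sketch
# of it (an assumption, not the original implementation): collect one
# raw-text document and one label per fileid.
def _read_corpus(self, reader, path):
    documents, labels = [], []
    for fileid in reader.fileids():
        documents.append(reader.raw(fileid))
        labels.append(reader.categories(fileid)[0])
    return documents, labels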
def __init__(self, root, **kwargs):
    """
    Initialize a PLoS reader with a specific corpus.
    Corpus information is contained in the 'root/corpus_info.json' file.

    @type root: string
    @param root: The directory path to the corpus.
    """
    self._root = root

    # corpus_type is specific to Plos_builder:
    # full     - all documents that were built.
    # partial  - documents excluding training.
    # training - documents intended for training.
    if 'corpus_type' in kwargs:
        self._corpus_type = kwargs.pop('corpus_type')
    else:
        self._corpus_type = 'full'

    fn = '{d}/{t}_corpus_info.json'.format(d=root, t=self._corpus_type)
    with open(fn, 'r') as fp:
        self._corpus_info = info = json.load(fp)

    # doc_part is specific to PLoS and research articles.
    # 'abstract' and 'body' are currently supported.
    # The corpus contains separate text for each, but the
    # reader is initialized to read only one.
    if 'doc_part' in kwargs:
        self._doc_part = doc_part = kwargs.pop('doc_part')
    else:
        self._doc_part = doc_part = 'body'

    if 'fileids' not in kwargs:
        fileids = [doi2fn(d, doc_part) for d in self.dois()]
    else:
        fileids = kwargs['fileids']

    # cat_map: fileid -> [c1, c2, ...]
    # The fileids depend on what the doc_part is ('body', 'abstract').
    kwargs['cat_map'] = {doi2fn(d, doc_part): cat
                         for d, cat in info['dois_to_categories'].items()}

    # This class is a subclass of CategorizedPlaintextCorpusReader.
    CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
def load_documents(self, path):
    docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
    print(docs.categories())
    documents = [(list(docs.words(fileid)), category)
                 for category in docs.categories()
                 for fileid in docs.fileids(category)]
    random.shuffle(documents)
    return documents
def load_documents(self, path):
    docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
    for cat in docs.categories():
        self.cat_gram_freq[cat] = {}
        self.cat_word_freq[cat] = {}
    # Return a lazy generator of (category, words) pairs.
    return ((category, list(docs.words(fileid)))
            for category in docs.categories()
            for fileid in docs.fileids(category))
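# A sketch of how the (category, words) pairs yielded above might be
# consumed to fill cat_word_freq (assumed to map category -> {word: count}):
def count_words(self, path):
    for category, words in self.load_documents(path):
        freq = self.cat_word_freq[category]
        for w in words:
            freq[w] = freq.get(w, 0) + 1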
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('directory', help="the bill directory")
parser.add_argument('--bigrams', action='store_true', dest='bigrams',
                    default=False, help='use bigrams')
args = parser.parse_args()

if args.bigrams:
    featurizer = bigram_feats
else:
    featurizer = word_feats

corpus = CategorizedPlaintextCorpusReader(root=args.directory,
                                          fileids=r'.*/.*\.txt',
                                          cat_pattern=r'(dem|rep)/')
best_words = most_informative_words(corpus)

dem_ids = corpus.fileids(categories=['dem'])
rep_ids = corpus.fileids(categories=['rep'])
dem_feats = [(featurizer(corpus.words(fileids=[f])), 'dem') for f in dem_ids]
rep_feats = [(featurizer(corpus.words(fileids=[f])), 'rep') for f in rep_ids]

# Integer cutoffs: reserve the last sixth of each class for testing.
dem_cutoff = len(dem_feats) * 5 // 6
rep_cutoff = len(rep_feats) * 5 // 6
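# A sketch of how the 5/6 cutoffs above are typically used: hold out the
# last sixth of each class and train an NLTK Naive Bayes classifier
# (the split and classifier choice are assumptions; the snippet stops short).
import nltk

train_feats = dem_feats[:dem_cutoff] + rep_feats[:rep_cutoff]
test_feats = dem_feats[dem_cutoff:] + rep_feats[rep_cutoff:]
classifier = nltk.NaiveBayesClassifier.train(train_feats)
print(nltk.classify.accuracy(classifier, test_feats))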
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

DOC_PATTERN = r'[\w_\s]+/[\w\s\d\-]+\.TXT'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('ENGLISH', DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)
print(corpus.categories())
print(corpus.fileids()[100:110])
print(corpus.words())
classifier.show_most_informative_features(5)

# Document classification

# Load libraries
import os
import random
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# Read the dataset into a categorized corpus.
# Directory of the corpus:
corpusdir = 'corpus/'
review_corpus = CategorizedPlaintextCorpusReader(corpusdir, r'.*\.txt',
                                                 cat_pattern=r'\d+_(\w+)\.txt')

# List of (document words, category) pairs; the category is pos/neg.
documents = [(list(review_corpus.words(fileid)), category)
             for category in review_corpus.categories()
             for fileid in review_corpus.fileids(category)]
random.shuffle(documents)

for category in review_corpus.categories():
    print(category)
type(review_corpus)
len(documents)

# Compute word frequency
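# A sketch of the word-frequency step announced above, following the classic
# NLTK-book pattern (the 2000-word vocabulary size is an assumption):
import nltk

all_words = nltk.FreqDist(w.lower() for w in review_corpus.words())
word_features = [w for w, _ in all_words.most_common(2000)]

def document_features(document):
    words = set(document)
    return {'contains(%s)' % w: (w in words) for w in word_features}

featuresets = [(document_features(d), c) for (d, c) in documents]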
import os
import nltk
import re
from math import log
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.text import Text
from nltk import ConditionalFreqDist, FreqDist
from nltk.stem.snowball import FrenchStemmer

stemmer = FrenchStemmer()

stopwordsdir = "C:/Projects/Allocine/stopwords/used"
stopwords = []

root = "C:/Projects/Allocine/corpus2/"
cats = ['cine', 'autre', 'critique', 'critique_a']
reader = CategorizedPlaintextCorpusReader(root, r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin-1')

text_all = Text(reader.words())
text_cine = Text(reader.words(categories='cine'))
text_autre = Text(reader.words(categories='autre'))
text_critique = Text(reader.words(categories='critique'))
text_critique_a = Text(reader.words(categories='critique_a'))
texts_list = [text_cine, text_autre, text_critique, text_critique_a]

def remove_accents(text):
    text = re.sub("[àâäÄÂÀ]", "a", text)
    text = re.sub("[éèêëÈÊËÉ]", "e", text)
    text = re.sub("[ïîìÏÎÌ]", "i", text)
    text = re.sub("[öôòÖÔÒ]", "o", text)
    # Assumed completion: the original snippet is truncated here.
    text = re.sub("[üûùÜÛÙ]", "u", text)
    return text
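# A sketch of what the stopwordsdir / stopwords declarations above are
# presumably for (an assumption, not shown in the original): load one
# stopword per line from every file in that directory.
for fname in os.listdir(stopwordsdir):
    with open(os.path.join(stopwordsdir, fname), encoding='latin-1') as f:
        stopwords.extend(line.strip() for line in f if line.strip())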
# fileids_ = corpus_dir + '/rt-polarity*'
corpus_dir = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/data/rt-polaritydata'
cat_map_ = {'rt-polarity.pos': ['pos'],
            'rt-polarity.neg': ['neg']}

corpus_treatment(corpus_dir)
encoded_corpus_dir = os.path.join(corpus_dir, 'encoded_data')
fileids_ = '^rt-polarity.*'
categorized_plaintext_corpusreader = CategorizedPlaintextCorpusReader(
    root=encoded_corpus_dir,
    cat_map=cat_map_,
    fileids=fileids_,
)

pos_words = categorized_plaintext_corpusreader.words(categories=['pos'])
pos_sents = categorized_plaintext_corpusreader.sents(categories=['pos'])
pos_paras = categorized_plaintext_corpusreader.paras(categories=['pos'])

neg_words = categorized_plaintext_corpusreader.words(categories=['neg'])
neg_sents = categorized_plaintext_corpusreader.sents(categories=['neg'])
neg_paras = categorized_plaintext_corpusreader.paras(categories=['neg'])
# NOTE: para views are not working; to be looked into later.

# classification
train = pos_words
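# A sketch of one way the classification step above might continue
# (assumptions: bag-of-words features per sentence, an 80/20 split,
# NLTK Naive Bayes):
import random
import nltk

def bow_feats(words):
    return {w: True for w in words}

labeled = ([(bow_feats(s), 'pos') for s in pos_sents] +
           [(bow_feats(s), 'neg') for s in neg_sents])
random.shuffle(labeled)
cutoff = int(len(labeled) * 0.8)
classifier = nltk.NaiveBayesClassifier.train(labeled[:cutoff])
print(nltk.classify.accuracy(classifier, labeled[cutoff:]))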
import sys
import pickle
from itertools import chain
# from nltk import trigrams, word_tokenize, sent_tokenize, FreqDist
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.util import ngrams

n = 3
train_path = "data/task1_train"
print("Loading categorized corpus in", train_path, "...")
cr = CategorizedPlaintextCorpusReader(train_path, ".*", cat_pattern=r"(\w*)")

# Get categories
print("%d categories: %s" % (len(cr.categories()), ", ".join(cr.categories())))

for c in [cr.categories()[0]]:
    print(c + "...")
    sys.stdout.flush()
    ngram_counts = {}
    for i in range(n, 0, -1):
        print(str(i) + "-grams...")
        ngram_counts[i] = {}
        # Pad the start with empty strings so the first words get
        # full-length contexts, then count i-grams for each order i.
        prefix = ("",) * (i - 1)
        for ngram in ngrams(chain(prefix, cr.words(categories=[c])), i):
            if ngram not in ngram_counts[i]:
                ngram_counts[i][ngram] = 0
            ngram_counts[i][ngram] += 1  # assumed completion of the truncated count
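# A sketch of what the pickle import above is presumably for: caching the
# per-category n-gram counts to disk (the file name is an assumption).
with open('ngram_counts_%s.pickle' % c, 'wb') as f:
    pickle.dump(ngram_counts, f)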
if type(i) == Tree:
    current_chunk.append(" ".join([token for token, pos in i.leaves()]))
elif current_chunk:
    named_entity = " ".join(current_chunk)
    if named_entity not in continuous_chunk:
        continuous_chunk.append(named_entity)
    current_chunk = []
else:
    continue
return continuous_chunk

# Create a corpus from the given .txt files, with a file of categories
# to apply to the texts.
corpus = CategorizedPlaintextCorpusReader('corpus/', r'.*\.txt',
                                          cat_file="../textcats.prn")

"""
fileid = "nytimes-2017.txt"
raw = corpus.raw(fileid)
raw = raw.replace("N.H.S.", "NHS")
words = word_tokenize(raw)
words = corpus.words(fileid)
clean0 = [word for word in words if word not in stoplist]
"""

bloblist = corpus.fileids()
# bloblist = corpus.fileids(categories='2016')
M = len(bloblist)

# Look at the categories
corpus.categories()
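# The loop at the top of this snippet is the body of the widely used
# "continuous chunk" named-entity helper. A self-contained reconstruction
# of the enclosing function (the surrounding lines are not in the original):
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    current_chunk, continuous_chunk = [], []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join(token for token, pos in i.leaves()))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue
    return continuous_chunk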
import sys
from time import time
from itertools import chain
import pickle
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.probability import ConditionalProbDist, ConditionalFreqDist, MLEProbDist
from nltk.util import ngrams

print('Loading corpus...', end=' ')
t = time()
train_path = 'data/task1_train'
cr = CategorizedPlaintextCorpusReader(train_path, '.*', cat_pattern=r'(\w*)')
t = time() - t
print(str(t) + 's')

# Test generation of CFD
print('Creating CFD...', end=' ')
sys.stdout.flush()
t = time()
cat = cr.categories()[0]
n = 3
cfd = ConditionalFreqDist()
prefix = ('',) * (n - 1)
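# A sketch of how the CFD construction set up above might proceed
# (assumption: condition on the (n-1)-word context, count the next word):
for ngram in ngrams(chain(prefix, cr.words(categories=[cat])), n):
    context, word = tuple(ngram[:-1]), ngram[-1]
    cfd[context][word] += 1
t = time() - t
print(str(t) + 's')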
import os
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

path = os.path.join(os.getcwd(), "debates")
DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'
corpus = CategorizedPlaintextCorpusReader(path, DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)

def tag_corpus(corpus):
    return [nltk.pos_tag(sent) for sent in corpus.sents()]

tagged_corpus = tag_corpus(corpus)

import spacy
nlp = spacy.load('en_core_web_sm')  # the bare 'en' shortcut no longer exists in spaCy 3

def spacy_ner(tokenized_sent):
    doc = nlp(' '.join(tokenized_sent))
    # Collect every entity, not just the first one.
    return [(ent.text, ent.label_) for ent in doc.ents]
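# Usage sketch for the NER helper above: run it over a few sentences
# from the categorized corpus.
for sent in corpus.sents()[:3]:
    print(spacy_ner(sent))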
def create_corpus(directory):
    word_tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*(?:[?!.,:])*')
    sent_tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
    translation = str.maketrans("", "", ",.?!:")
    corpus = CategorizedPlaintextCorpusReader(directory, r"^[^.]*$",
                                              cat_file='cats.txt',
                                              encoding="iso-8859-1",
                                              word_tokenizer=word_tokenizer,
                                              sent_tokenizer=sent_tokenizer)
    return corpus
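# For reference: cat_file='cats.txt' expects one "<fileid> <category> ..."
# pair per line inside the corpus directory, e.g. (hypothetical contents):
#
#   review_001 positif
#   review_002 negatif
#
# Usage sketch (the directory name is an assumption):
corpus = create_corpus('corpus_fr/')
print(corpus.categories())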
                label=target_name)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title('PCA of BULATS dataset')
    plt.show()
    return model

if __name__ == "__main__":
    PATH = "model.pickle"

    # Loading speech features
    speech = pd.read_csv("/ExamplePath.csv")

    if not os.path.exists(PATH):
        nli = CategorizedPlaintextCorpusReader(CORPUS, DOC_PATTERN,
                                               cat_pattern=CAT_PATTERN)
        # Since `nli` already has all the information (text and ids),
        # there is no need to iterate over it multiple times, so
        # construct `X` and `y` in one go.
        X = []
        y = []
        for fileid in nli.fileids():
            X.append({
                'text': nli.raw(fileid),
                'id': fileid.split('/')[-1].split('.')[0]
            })
            y.append(nli.categories(fileid)[0])

        clf = PCA(n_components=2)
        model = build_and_evaluate(X, y, clf, speech)
import os
import re
import csv
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

srm_data_dir = '/home/mayank/work/projects/SRM/Data'
training_file_path = os.path.join(srm_data_dir, 'TrainingData.csv')
training_file = open(training_file_path, 'r')
root_dir = os.path.join(srm_data_dir, 'sub_data')

# normal_reader = PlaintextCorpusReader(root=root_dir,
#                                       fileids=['Financial.csv'])

# cat_map maps each fileid to its list of categories.
cat_map_ = {
    'Compliance.csv': ['Compliance'],
    'Financial.csv': ['Financial'],
    'Operational.csv': ['Operational'],
    'Strategic.csv': ['Strategic'],
}
cat_reader = CategorizedPlaintextCorpusReader(root=root_dir,
                                              fileids=r'.*\.csv',
                                              cat_map=cat_map_)
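# Usage sketch for the categorized reader above: each .csv file acts as one
# document in its mapped category.
print(cat_reader.categories())                         # ['Compliance', 'Financial', ...]
print(cat_reader.words(categories=['Financial'])[:10])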
import os
import re
import string
from sklearn import metrics
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
from poslemma import LemmatizationWithPOSTagger
from ordinal_classification import OrdinalSVC
from sklearn.model_selection import StratifiedKFold

base_dir = '/Users/ja/Documents/www'
corpus_name = 'corpus'
min_topics = 10
max_topics = 60

corpus = CategorizedPlaintextCorpusReader(
    os.path.join(base_dir, corpus_name),
    fileids=r'(?!\.).*\.txt',
    cat_pattern=r'(\w+)/*')

def clean(doc):
    lemma = LemmatizationWithPOSTagger()
    stop = set(stopwords.words('english') + stopwords.words('numbers'))
    exclude = set(string.punctuation)
    wordchars = set(string.ascii_letters)
    wordchars |= set(string.digits)

    def contains_any(str, set):
        """Check whether 'str' contains ANY of the chars in 'set'."""
        return any(c in str for c in set)

    def is_number(s):
        try:
            # Assumed completion: the original snippet is truncated here.
            float(s)
            return True
        except ValueError:
            return False
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('corpus/text', DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)
print(corpus.categories())
print(corpus.fileids('2019'))