def __init__(self, root, **kwargs): """ Initialize a PLoS reader with a specific corpus. Corpus information is contained in 'root/corpus_info.json' file. The @type root: string @param root: The directory path to the corpus directory. """ self._root = root fp = open( '%s/corpus_info.json' % (root), 'r' ) self._corpus_info = info = json.load(fp) fp.close() # doc_part is specific to PLoS and research article in general. # 'abstract' and 'body' are currently supported. # The corpus contains seperate text for each, but the # reader is initialized to readi only one. if 'doc_part' in kwargs: self._doc_part = doc_part = kwargs['doc_part'] del kwargs['doc_part'] else: self._doc_part = doc_part = 'body' if 'fileids' not in kwargs: fileids = [ doi2fn(d, doc_part) for d in info['d2c'].keys() ] else: fileids = kwargs['fileids'] # cat_map f -> [ c1, c2, ...] # The fileids depend on what the doc_part is ('body', 'abstract') cat_map = {} for d,cat in info['d2c'].iteritems(): cat_map[doi2fn(d, doc_part)] = cat kwargs['cat_map'] = cat_map # Subclass of Categorized Plaintext Corpus Reader CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
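# A minimal, self-contained sketch of the cat_map mechanism the reader above
# relies on: an explicit {fileid: [categories]} dict handed straight to
# CategorizedPlaintextCorpusReader. The directory and file names below are
# made up for illustration.
import os
import tempfile
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

demo_root = tempfile.mkdtemp()
for name, text in [('a1_body.txt', 'Cells divide.'),
                   ('a2_body.txt', 'Stars form.')]:
    with open(os.path.join(demo_root, name), 'w') as f:
        f.write(text)

demo_map = {'a1_body.txt': ['biology'], 'a2_body.txt': ['astronomy']}
demo_reader = CategorizedPlaintextCorpusReader(demo_root, r'.*\.txt',
                                               cat_map=demo_map)
print(demo_reader.categories())                     # ['astronomy', 'biology']
print(demo_reader.fileids(categories=['biology']))  # ['a1_body.txt']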
def _extract_meta_data(self, corpus_path):
    self.db = CategorizedPlaintextCorpusReader(corpus_path, r'.*\.txt',
                                               cat_pattern=r'(\w+)/*')
    new_corpus = Corpus(self.db, corpus_path, self.limit_memory, self.verbose)
    return new_corpus.get_meta_features()
def __init__(self, input_folder_name, doc_pattern, categ_pattern, encoding='utf-8'):
    CategorizedPlaintextCorpusReader.__init__(self, input_folder_name, doc_pattern,
                                              cat_pattern=categ_pattern)
    self.input_folder_name = input_folder_name
    self.encoding = encoding
    # Reader restricted to files that sit directly in the corpus root
    # (no '/' in the fileid); doc_pattern[-3:] is the file extension.
    self.root_reader = PlaintextCorpusReader(input_folder_name,
                                             fileids=r'[^\/]*\.' + doc_pattern[-3:])
    # self.root_ids = [os.path.join(input_folder_name, item)
    #                  for item in self.root_reader.fileids()]
    self.root_ids = list(self.root_reader.fileids())
def predict(self, test_path):
    documents, self.y_test = self._read_corpus(
        CategorizedPlaintextCorpusReader(test_path, r'.*\.txt',
                                         cat_pattern=r'(\w+)/*'),
        test_path)
    self.X_test = self.representation.transform(documents)
    return self.automl.predict(self.X_test)
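# The helper _read_corpus is not shown in this snippet. A plausible sketch
# of it (an assumption, not the original implementation): collect one
# raw-text document and one label per fileid.
def _read_corpus(self, reader, path):
    documents, labels = [], []
    for fileid in reader.fileids():
        documents.append(reader.raw(fileid))
        labels.append(reader.categories(fileid)[0])
    return documents, labels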
def __init__(self, root, **kwargs):
    """
    Initialize a PLoS reader with a specific corpus.
    Corpus information is contained in the 'root/corpus_info.json' file.

    @type root: string
    @param root: The directory path to the corpus.
    """
    self._root = root

    # corpus_type is specific to Plos_builder:
    # full     - all documents that were built.
    # partial  - documents excluding training.
    # training - documents intended for training.
    if 'corpus_type' in kwargs:
        self._corpus_type = kwargs.pop('corpus_type')
    else:
        self._corpus_type = 'full'

    fn = '{d}/{t}_corpus_info.json'.format(d=root, t=self._corpus_type)
    with open(fn, 'r') as fp:
        self._corpus_info = info = json.load(fp)

    # doc_part is specific to PLoS and research articles.
    # 'abstract' and 'body' are currently supported.
    # The corpus contains separate text for each, but the
    # reader is initialized to read only one.
    if 'doc_part' in kwargs:
        self._doc_part = doc_part = kwargs.pop('doc_part')
    else:
        self._doc_part = doc_part = 'body'

    if 'fileids' not in kwargs:
        fileids = [doi2fn(d, doc_part) for d in self.dois()]
    else:
        fileids = kwargs['fileids']

    # cat_map: fileid -> [c1, c2, ...]
    # The fileids depend on what the doc_part is ('body', 'abstract').
    kwargs['cat_map'] = {doi2fn(d, doc_part): cat
                         for d, cat in info['dois_to_categories'].items()}

    # This class is a subclass of CategorizedPlaintextCorpusReader.
    CategorizedPlaintextCorpusReader.__init__(self, root, fileids, **kwargs)
def load_documents(self, path):
    docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
    print(docs.categories())
    documents = [(list(docs.words(fileid)), category)
                 for category in docs.categories()
                 for fileid in docs.fileids(category)]
    random.shuffle(documents)
    return documents
def load_documents(self, path):
    docs = CategorizedPlaintextCorpusReader(path, r'.*/.*', cat_pattern=r'(.*)/.*')
    for cat in docs.categories():
        self.cat_gram_freq[cat] = {}
        self.cat_word_freq[cat] = {}
    # Return a lazy generator of (category, words) pairs.
    return ((category, list(docs.words(fileid)))
            for category in docs.categories()
            for fileid in docs.fileids(category))
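# A sketch of how the (category, words) pairs yielded above might be
# consumed to fill cat_word_freq (assumed to map category -> {word: count}):
def count_words(self, path):
    for category, words in self.load_documents(path):
        freq = self.cat_word_freq[category]
        for w in words:
            freq[w] = freq.get(w, 0) + 1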
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('directory', help="the bill directory")
parser.add_argument('--bigrams', action='store_true', dest='bigrams',
                    default=False, help='use bigrams')
args = parser.parse_args()

if args.bigrams:
    featurizer = bigram_feats
else:
    featurizer = word_feats

corpus = CategorizedPlaintextCorpusReader(root=args.directory,
                                          fileids=r'.*/.*\.txt',
                                          cat_pattern=r'(dem|rep)/')
best_words = most_informative_words(corpus)

dem_ids = corpus.fileids(categories=['dem'])
rep_ids = corpus.fileids(categories=['rep'])
dem_feats = [(featurizer(corpus.words(fileids=[f])), 'dem') for f in dem_ids]
rep_feats = [(featurizer(corpus.words(fileids=[f])), 'rep') for f in rep_ids]

# Integer cutoffs: reserve the last sixth of each class for testing.
dem_cutoff = len(dem_feats) * 5 // 6
rep_cutoff = len(rep_feats) * 5 // 6
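# A sketch of how the 5/6 cutoffs above are typically used: hold out the
# last sixth of each class and train an NLTK Naive Bayes classifier
# (the split and classifier choice are assumptions; the snippet stops short).
import nltk

train_feats = dem_feats[:dem_cutoff] + rep_feats[:rep_cutoff]
test_feats = dem_feats[dem_cutoff:] + rep_feats[rep_cutoff:]
classifier = nltk.NaiveBayesClassifier.train(train_feats)
print(nltk.classify.accuracy(classifier, test_feats))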
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

DOC_PATTERN = r'[\w_\s]+/[\w\s\d\-]+\.TXT'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('ENGLISH', DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)
print(corpus.categories())
print(corpus.fileids()[100:110])
print(corpus.words())
classifier.show_most_informative_features(5)

# Document classification

# Load libraries
import os
import random
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

# Read the dataset into a categorized corpus.
# Directory of the corpus:
corpusdir = 'corpus/'
review_corpus = CategorizedPlaintextCorpusReader(corpusdir, r'.*\.txt',
                                                 cat_pattern=r'\d+_(\w+)\.txt')

# List of (document words, category) pairs; the category is pos/neg.
documents = [(list(review_corpus.words(fileid)), category)
             for category in review_corpus.categories()
             for fileid in review_corpus.fileids(category)]
random.shuffle(documents)

for category in review_corpus.categories():
    print(category)
type(review_corpus)
len(documents)

# Compute word frequency
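# A sketch of the word-frequency step announced above, following the classic
# NLTK-book pattern (the 2000-word vocabulary size is an assumption):
import nltk

all_words = nltk.FreqDist(w.lower() for w in review_corpus.words())
word_features = [w for w, _ in all_words.most_common(2000)]

def document_features(document):
    words = set(document)
    return {'contains(%s)' % w: (w in words) for w in word_features}

featuresets = [(document_features(d), c) for (d, c) in documents]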
import os
import nltk
import re
from math import log
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.text import Text
from nltk import ConditionalFreqDist, FreqDist
from nltk.stem.snowball import FrenchStemmer

stemmer = FrenchStemmer()

stopwordsdir = "C:/Projects/Allocine/stopwords/used"
stopwords = []

root = "C:/Projects/Allocine/corpus2/"
cats = ['cine', 'autre', 'critique', 'critique_a']
reader = CategorizedPlaintextCorpusReader(root, r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin-1')

text_all = Text(reader.words())
text_cine = Text(reader.words(categories='cine'))
text_autre = Text(reader.words(categories='autre'))
text_critique = Text(reader.words(categories='critique'))
text_critique_a = Text(reader.words(categories='critique_a'))
texts_list = [text_cine, text_autre, text_critique, text_critique_a]

def remove_accents(text):
    text = re.sub("[àâäÄÂÀ]", "a", text)
    text = re.sub("[éèêëÈÊËÉ]", "e", text)
    text = re.sub("[ïîìÏÎÌ]", "i", text)
    text = re.sub("[öôòÖÔÒ]", "o", text)
    # Assumed completion: the original snippet is truncated here.
    text = re.sub("[üûùÜÛÙ]", "u", text)
    return text
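# A sketch of what the stopwordsdir / stopwords declarations above are
# presumably for (an assumption, not shown in the original): load one
# stopword per line from every file in that directory.
for fname in os.listdir(stopwordsdir):
    with open(os.path.join(stopwordsdir, fname), encoding='latin-1') as f:
        stopwords.extend(line.strip() for line in f if line.strip())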
# fileids_ = corpus_dir + '/rt-polarity*'
corpus_dir = '/home/mayank/IdeaProjects/Lab_Machine_Learning/src/Text_Analytics/data/rt-polaritydata'
cat_map_ = {'rt-polarity.pos': ['pos'],
            'rt-polarity.neg': ['neg']}

corpus_treatment(corpus_dir)
encoded_corpus_dir = os.path.join(corpus_dir, 'encoded_data')
fileids_ = '^rt-polarity.*'
categorized_plaintext_corpusreader = CategorizedPlaintextCorpusReader(
    root=encoded_corpus_dir,
    cat_map=cat_map_,
    fileids=fileids_,
)

pos_words = categorized_plaintext_corpusreader.words(categories=['pos'])
pos_sents = categorized_plaintext_corpusreader.sents(categories=['pos'])
pos_paras = categorized_plaintext_corpusreader.paras(categories=['pos'])

neg_words = categorized_plaintext_corpusreader.words(categories=['neg'])
neg_sents = categorized_plaintext_corpusreader.sents(categories=['neg'])
neg_paras = categorized_plaintext_corpusreader.paras(categories=['neg'])
# NOTE: para views are not working; to be looked into later.

# classification
train = pos_words
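# A sketch of one way the classification step above might continue
# (assumptions: bag-of-words features per sentence, an 80/20 split,
# NLTK Naive Bayes):
import random
import nltk

def bow_feats(words):
    return {w: True for w in words}

labeled = ([(bow_feats(s), 'pos') for s in pos_sents] +
           [(bow_feats(s), 'neg') for s in neg_sents])
random.shuffle(labeled)
cutoff = int(len(labeled) * 0.8)
classifier = nltk.NaiveBayesClassifier.train(labeled[:cutoff])
print(nltk.classify.accuracy(classifier, labeled[cutoff:]))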
import sys
import pickle
from itertools import chain
# from nltk import trigrams, word_tokenize, sent_tokenize, FreqDist
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.util import ngrams

n = 3
train_path = "data/task1_train"
print("Loading categorized corpus in", train_path, "...")
cr = CategorizedPlaintextCorpusReader(train_path, ".*", cat_pattern=r"(\w*)")

# Get categories
print("%d categories: %s" % (len(cr.categories()), ", ".join(cr.categories())))

for c in [cr.categories()[0]]:
    print(c + "...")
    sys.stdout.flush()
    ngram_counts = {}
    for i in range(n, 0, -1):
        print(str(i) + "-grams...")
        ngram_counts[i] = {}
        # Pad the start with empty strings so the first words get
        # full-length contexts, then count i-grams for each order i.
        prefix = ("",) * (i - 1)
        for ngram in ngrams(chain(prefix, cr.words(categories=[c])), i):
            if ngram not in ngram_counts[i]:
                ngram_counts[i][ngram] = 0
            ngram_counts[i][ngram] += 1  # assumed completion of the truncated count
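# A sketch of what the pickle import above is presumably for: caching the
# per-category n-gram counts to disk (the file name is an assumption).
with open('ngram_counts_%s.pickle' % c, 'wb') as f:
    pickle.dump(ngram_counts, f)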
if type(i) == Tree:
    current_chunk.append(" ".join([token for token, pos in i.leaves()]))
elif current_chunk:
    named_entity = " ".join(current_chunk)
    if named_entity not in continuous_chunk:
        continuous_chunk.append(named_entity)
    current_chunk = []
else:
    continue
return continuous_chunk

# Create a corpus from the given .txt files, with a file of categories
# to apply to the texts.
corpus = CategorizedPlaintextCorpusReader('corpus/', r'.*\.txt',
                                          cat_file="../textcats.prn")

"""
fileid = "nytimes-2017.txt"
raw = corpus.raw(fileid)
raw = raw.replace("N.H.S.", "NHS")
words = word_tokenize(raw)
words = corpus.words(fileid)
clean0 = [word for word in words if word not in stoplist]
"""

bloblist = corpus.fileids()
# bloblist = corpus.fileids(categories='2016')
M = len(bloblist)

# Look at the categories
corpus.categories()
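# The loop at the top of this snippet is the body of the widely used
# "continuous chunk" named-entity helper. A self-contained reconstruction
# of the enclosing function (the surrounding lines are not in the original):
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree

def get_continuous_chunks(text):
    chunked = ne_chunk(pos_tag(word_tokenize(text)))
    current_chunk, continuous_chunk = [], []
    for i in chunked:
        if type(i) == Tree:
            current_chunk.append(" ".join(token for token, pos in i.leaves()))
        elif current_chunk:
            named_entity = " ".join(current_chunk)
            if named_entity not in continuous_chunk:
                continuous_chunk.append(named_entity)
            current_chunk = []
        else:
            continue
    return continuous_chunk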
import sys
from time import time
from itertools import chain
import pickle
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.probability import ConditionalProbDist, ConditionalFreqDist, MLEProbDist
from nltk.util import ngrams

print('Loading corpus...', end=' ')
t = time()
train_path = 'data/task1_train'
cr = CategorizedPlaintextCorpusReader(train_path, '.*', cat_pattern=r'(\w*)')
t = time() - t
print(str(t) + 's')

# Test generation of CFD
print('Creating CFD...', end=' ')
sys.stdout.flush()
t = time()
cat = cr.categories()[0]
n = 3
cfd = ConditionalFreqDist()
prefix = ('',) * (n - 1)
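# A sketch of how the CFD construction set up above might proceed
# (assumption: condition on the (n-1)-word context, count the next word):
for ngram in ngrams(chain(prefix, cr.words(categories=[cat])), n):
    context, word = tuple(ngram[:-1]), ngram[-1]
    cfd[context][word] += 1
t = time() - t
print(str(t) + 's')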
import os
import nltk
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

path = os.path.join(os.getcwd(), "debates")
DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'
corpus = CategorizedPlaintextCorpusReader(path, DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)

def tag_corpus(corpus):
    return [nltk.pos_tag(sent) for sent in corpus.sents()]

tagged_corpus = tag_corpus(corpus)

import spacy
nlp = spacy.load('en_core_web_sm')  # the bare 'en' shortcut no longer exists in spaCy 3

def spacy_ner(tokenized_sent):
    doc = nlp(' '.join(tokenized_sent))
    # Collect every entity, not just the first one.
    return [(ent.text, ent.label_) for ent in doc.ents]
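# Usage sketch for the NER helper above: run it over a few sentences
# from the categorized corpus.
for sent in corpus.sents()[:3]:
    print(spacy_ner(sent))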
def create_corpus(directory):
    word_tokenizer = RegexpTokenizer(r'\w+(?:-\w+)*(?:[?!.,:])*')
    sent_tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')
    translation = str.maketrans("", "", ",.?!:")
    corpus = CategorizedPlaintextCorpusReader(directory, r"^[^.]*$",
                                              cat_file='cats.txt',
                                              encoding="iso-8859-1",
                                              word_tokenizer=word_tokenizer,
                                              sent_tokenizer=sent_tokenizer)
    return corpus
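# For reference: cat_file='cats.txt' expects one "<fileid> <category> ..."
# pair per line inside the corpus directory, e.g. (hypothetical contents):
#
#   review_001 positif
#   review_002 negatif
#
# Usage sketch (the directory name is an assumption):
corpus = create_corpus('corpus_fr/')
print(corpus.categories())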
                label=target_name)
    plt.legend(loc='best', shadow=False, scatterpoints=1)
    plt.title('PCA of BULATS dataset')
    plt.show()
    return model

if __name__ == "__main__":
    PATH = "model.pickle"

    # Loading speech features
    speech = pd.read_csv("/ExamplePath.csv")

    if not os.path.exists(PATH):
        nli = CategorizedPlaintextCorpusReader(CORPUS, DOC_PATTERN,
                                               cat_pattern=CAT_PATTERN)
        # Since `nli` already has all the information (text and ids),
        # there is no need to iterate over it multiple times, so
        # construct `X` and `y` in one go.
        X = []
        y = []
        for fileid in nli.fileids():
            X.append({
                'text': nli.raw(fileid),
                'id': fileid.split('/')[-1].split('.')[0]
            })
            y.append(nli.categories(fileid)[0])

        clf = PCA(n_components=2)
        model = build_and_evaluate(X, y, clf, speech)
import os
import re
import csv
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

srm_data_dir = '/home/mayank/work/projects/SRM/Data'
training_file_path = os.path.join(srm_data_dir, 'TrainingData.csv')
training_file = open(training_file_path, 'r')
root_dir = os.path.join(srm_data_dir, 'sub_data')

# normal_reader = PlaintextCorpusReader(root=root_dir,
#                                       fileids=['Financial.csv'])

# cat_map maps each fileid to its list of categories.
cat_map_ = {
    'Compliance.csv': ['Compliance'],
    'Financial.csv': ['Financial'],
    'Operational.csv': ['Operational'],
    'Strategic.csv': ['Strategic'],
}
cat_reader = CategorizedPlaintextCorpusReader(root=root_dir,
                                              fileids=r'.*\.csv',
                                              cat_map=cat_map_)
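# Usage sketch for the categorized reader above: each .csv file acts as one
# document in its mapped category.
print(cat_reader.categories())                         # ['Compliance', 'Financial', ...]
print(cat_reader.words(categories=['Financial'])[:10])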
import os
import re
import string
from sklearn import metrics
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader
from nltk.corpus import stopwords
from poslemma import LemmatizationWithPOSTagger
from ordinal_classification import OrdinalSVC
from sklearn.model_selection import StratifiedKFold

base_dir = '/Users/ja/Documents/www'
corpus_name = 'corpus'
min_topics = 10
max_topics = 60

corpus = CategorizedPlaintextCorpusReader(
    os.path.join(base_dir, corpus_name),
    fileids=r'(?!\.).*\.txt',
    cat_pattern=r'(\w+)/*')

def clean(doc):
    lemma = LemmatizationWithPOSTagger()
    stop = set(stopwords.words('english') + stopwords.words('numbers'))
    exclude = set(string.punctuation)
    wordchars = set(string.ascii_letters)
    wordchars |= set(string.digits)

    def contains_any(str, set):
        """Check whether 'str' contains ANY of the chars in 'set'."""
        return any(c in str for c in set)

    def is_number(s):
        try:
            # Assumed completion: the original snippet is truncated here.
            float(s)
            return True
        except ValueError:
            return False
from nltk.corpus.reader.plaintext import CategorizedPlaintextCorpusReader

DOC_PATTERN = r'(?!\.)[\w_\s]+/[\w\s\d\-]+\.txt'
CAT_PATTERN = r'([\w_\s]+)/.*'

corpus = CategorizedPlaintextCorpusReader('corpus/text', DOC_PATTERN,
                                          cat_pattern=CAT_PATTERN)
print(corpus.categories())
print(corpus.fileids('2019'))