Example #1
    def init_documents(f_re, cat_re):
        logging.debug("Reading corpus")
        reports = CategorizedPlaintextCorpusReader(corpus_dir,
                                                   f_re,
                                                   cat_pattern=cat_re,
                                                   encoding='utf8')
        logging.debug("Found {} fileids".format(len(reports.fileids())))
        logging.debug("Found categories: {}".format(reports.categories()))
        logging.debug("Building docs")

        documents = [
            (tokenize(reports.words(i)), reports.categories(i)[0])
              for i in reports.fileids()]
        return documents
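    # A minimal usage sketch (assumes `corpus_dir` and the `tokenize` helper used
    # above are defined elsewhere in this module; the patterns are hypothetical):
    docs = init_documents(r'.*\.txt', r'(\w+)/')
    logging.debug("Built {} (tokens, category) pairs".format(len(docs)))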
Example #2
    def __init__(self, dir, doc):
        self.doc = doc
        self.dir = dir
        self.eng_stopw = stopwords.words('english')


        text_corpus = CategorizedPlaintextCorpusReader(
            './%s/' % self.dir,
            r'.*\.csv',  # read only the files that end in .csv
            cat_pattern=r'(\w+)/*',  # take everything that comes after the directory as the category
            encoding='latin-1'
        )

        self.text = nltk.Text(text_corpus.words(self.doc))
Example #3
 def __init__(self, raiz_corpus):
     """Cria um objeto do tipo 'CategorizedPlaintextCorpusReader',
     utilizando o diretório raiz do corpus, onde os documentos
     estão localizados, dispostos em seus respectivos subdiretórios,
     de acordo com sua categoria, sejam eles/elas quais for
     
     -->     raiz_corpus/{pos,neg,neu,...}.
     """
     reload(sys)
     sys.setdefaultencoding("utf-8")
     
     self._raiz_corpus = raiz_corpus
     self._corpus = CategorizedPlaintextCorpusReader(raiz_corpus, r'.+\.txt', cat_pattern=r'(\w+)/*',
                                                     encoding='utf-8')
     self._documentos = None
     self._palavras_frequentes = None
     self._todas_palavras = None
     self._featuresets = None
     self._train_set = None
     self._test_set = None
Example #4
def display_features(num_features=1000,
                     show_features=200,
                     filepath='classifiers/nltk_nb.pkl',
                     verbose=True):
    '''
    Displays informative features from NHLCorpus
    '''
    stop_words = set(stopwords.words('english'))
    nhl = CategorizedPlaintextCorpusReader(root='data/NHLcorpus/',
                                           fileids=r'.*\.txt',
                                           cat_pattern='(\w+)/*')
    documents = []
    for category in nhl.categories():
        for fileid in nhl.fileids(category):
            documents.append(([
                re.sub(r'\W+', '', w.lower()) for w in nhl.words(fileid)
                if w.lower() not in stop_words
            ], category))
    all_words = nltk.FreqDist(
        re.sub(r'\W+', '', w.lower()) for w in nhl.words()
        if w.lower() not in stop_words)
    word_features = [w[0] for w in all_words.most_common(num_features)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains({})'.format(word)] = word in document_words
        return features

    featuresets = [(document_features(d), c) for (d, c) in documents]
    nb_clf = nltk.NaiveBayesClassifier.train(featuresets)
    if verbose:
        nb_clf.show_most_informative_features(show_features)
        print('Accuracy on training data: {}'.format(
            nltk.classify.accuracy(nb_clf, featuresets)))

    save_classifier = open(filepath, 'wb')
    pickle.dump(nb_clf, save_classifier)
    save_classifier.close()
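# A minimal sketch of loading the pickled classifier back (same filepath as the
# default argument above):
import pickle

with open('classifiers/nltk_nb.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)
loaded_clf.show_most_informative_features(10)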
Example #5
This code uses the meeting records (inputs) corpus.
'''
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

# working dir: UN/
mydir = 'corpus/meeting_records_final_categorized'

mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt', cat_pattern=r'(intervention|soft_action)/.*', encoding='utf-8')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i,j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]  # dict views are not sliceable in Python 3

numtrain = int(len(documents) * 90 / 100)
train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag in documents[:numtrain]]
test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag  in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set)) # .87 - ?!?!?!
classifier.show_most_informative_features(20)

# for the 100 most frequent word features
Example #6
        if k in emotion_of_poems and v == max_value:
            emotion = k
            print(emotion)
            emotion_correct = {"emotion": emotion}
            return emotion_correct

    return emotion_correct


def classify(poem_text):
    return classifier.classify(features_of_poem(poem_text))


corpus_of_poems = CategorizedPlaintextCorpusReader('poems/',
                                                   'poems.*',
                                                   cat_file='cats.txt')


# Code for generating errors
# Return errors in order to improve the algorithm
def errors_em(poem_set):
    errors = []
    for (fileid, category) in poem_set:
        poem = corpus_of_poems.words(fileids=[fileid])
        emotion_correct = features_of_poem(poem)
        guess = classifier.classify(features_of_poem(poem))

        if guess != category:
            errors.append((category, guess, poem, emotion_correct['emotion']))
Example #7
from nltk.corpus import CategorizedPlaintextCorpusReader
import ProcessText

d1 = "judge people by what they say"

d1_processed = ProcessText.ProcessText.process(d1)

documents = [d1]

#Read documents
reader = CategorizedPlaintextCorpusReader(
    r'\Users\JoeDi\Desktop\MSC\Idioms Corpera',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

for w in reader.fileids():
    wd = reader.raw(w)
    documents.append(w + " " + wd)

print("Documents in the collection are: ")
print(documents)
print("\n")

from sklearn.feature_extraction.text import TfidfVectorizer

#build a TF/IDF matrix for each description
tfidf = TfidfVectorizer().fit_transform(documents)

print("Tf-idf weightings are:  ")
print(tfidf)
print("\n")
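# A hedged follow-up sketch: how the tf-idf rows are typically compared, using
# scikit-learn's cosine_similarity (row 0 is d1, the query sentence above).
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(tfidf[0:1], tfidf).flatten()
print("Cosine similarity of d1 to each document:", similarities)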
Example #8
print(positive_greg)
print(negative_greg)

positive_consolidated_list = list(pos_list) + positive_greg
negative_consolidated_list = list(neg_list) + negative_greg
print(positive_consolidated_list)
print(negative_consolidated_list)

init_notebook_mode(connected=True)
cf.set_config_file(offline=True, world_readable=True, theme='ggplot')

#%%

corpus_root = "/Users/LENOVO USER/Desktop/FedTranscript1"
data_m = CategorizedPlaintextCorpusReader(corpus_root,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin1')
data_fileids = data_m.fileids()


#%%
def corpus_Stats(crp):
    print('Total number of files: ' + str(len(crp.fileids())))
    print('Number of paragraphs: ' + str(len(crp.paras())))
    print('Number of sentences: ' + str(len(crp.sents())))
    print('Number of words: ' + str(len(crp.words())))


#corpus_Stats(data_m)
#print('\n'+'First file: '+ data_fileids[0])
#print('Last file: '+ data_fileids[-1])
Example #9
sjar = '/Users/nischikata/PycharmProjects/JabRef-2.11.1.jar'

from nltk.corpus import stopwords
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk import word_tokenize
from nltk import TreebankWordTokenizer
import nltk.data

# PLAINTEXT CORPUS READER
# http://www.nltk.org/_modules/nltk/corpus/reader/plaintext.html#CategorizedPlaintextCorpusReader
# important: The TreebankWordTokenizer separates words like "don't" into "do" and "n't", so the main verb is identified correctly.
# For the Naive Bayes classifier it may be better to use WordPunctTokenizer - it is the default, so just omit the word_tokenizer param.
corpus = CategorizedPlaintextCorpusReader(
    '.',
    r'(?!\.).*\.txt',
    word_tokenizer=TreebankWordTokenizer(),
    cat_pattern=r'(aggressive|not_aggressive)/.*',
    encoding='utf8')
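# A minimal sketch of the alternative mentioned above: omit word_tokenizer to
# fall back to the default WordPunctTokenizer.
corpus_wordpunct = CategorizedPlaintextCorpusReader(
    '.',
    r'(?!\.).*\.txt',
    cat_pattern=r'(aggressive|not_aggressive)/.*',
    encoding='utf8')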

# Getting RAW SENTENCES from a RAW comment, see: http://stackoverflow.com/a/4576110/4866678
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')


# returns a list of raw sentences
def get_raw_sentences(fileid):  # works
    data = corpus.raw(fileid)
    return tokenizer.tokenize(data)


def get_raw_paragraph(
    fileid
Example #10
import collections
import nltk.classify.util, nltk.metrics
from nltk.classify import NaiveBayesClassifier
from nltk.classify import DecisionTreeClassifier
from nltk.corpus import CategorizedPlaintextCorpusReader
from sklearn import svm
from sklearn.svm import LinearSVC
import string
from tabulate import tabulate

corpus_root1 = '/Users/tianhan/Dropbox/Advanced_big_data_Project/aclImdb/train'
train = CategorizedPlaintextCorpusReader(corpus_root1, r'(pos|neg)/.*\.txt',
                                         cat_pattern=r'(pos|neg)/.*\.txt')
corpus_root2 = '/Users/tianhan/Dropbox/Advanced_big_data_Project/aclImdb/test'
test = CategorizedPlaintextCorpusReader(corpus_root2, r'(pos|neg)/.*\.txt',
                                        cat_pattern=r'(pos|neg)/.*\.txt')

def evaluate_classifier_Naive(featx):
    
    train_negids = train.fileids('neg')
    train_posids = train.fileids('pos')
    test_negids = test.fileids('neg')
    test_posids = test.fileids('pos')
    train_negfeats = [(featx(train.words(fileids=[f])), 'neg') for f in train_negids]
    train_posfeats = [(featx(train.words(fileids=[f])), 'pos') for f in train_posids]
    test_negfeats = [(featx(test.words(fileids=[f])), 'neg') for f in test_negids]
    test_posfeats = [(featx(test.words(fileids=[f])), 'pos') for f in test_posids]
    trainfeats = train_negfeats + train_posfeats
    testfeats = test_negfeats + test_posfeats

    Naive_classifier = NaiveBayesClassifier.train(trainfeats)
    refsets = collections.defaultdict(set)
    testsets_Naive = collections.defaultdict(set)
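    # A minimal sketch of how these ref/test sets are typically filled and scored
    # (assumed continuation, not shown in the snippet above):
    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        testsets_Naive[Naive_classifier.classify(feats)].add(i)
    print('pos precision:', nltk.metrics.precision(refsets['pos'], testsets_Naive['pos']))
    print('pos recall:', nltk.metrics.recall(refsets['pos'], testsets_Naive['pos']))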
Example #11
# Chapter 1: Corpora and WordNet - downloading, loading, and accessing an external corpus
from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

# Read the corpus
reader = CategorizedPlaintextCorpusReader(r'/workspace/NLP_python/tokens', r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

# Print sample documents
# Lists of samples for the pos and neg categories
posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

# Pick a random file from each of the pos and neg categories
fileP = posFiles[randint(0, len(posFiles)-1)]
fileN = negFiles[randint(0, len(negFiles)-1)]
print(fileP)
print(fileN)

# Print the randomly selected files, sentence by sentence
for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()
for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
Example #12
def create_corpus():
    poem_corpus = CategorizedPlaintextCorpusReader('../poems/',
                                                   'poems_.*',
                                                   cat_file='cats.txt')
Example #13
        os.chdir(directory)
        file = open(fname, 'w')
        file.write(text)
        file.close()


doc_start = {}
doc_start[0] = "Staff Review of the Economic Situation"
doc_start[1] = re.compile('The information (reviewed|received|provided)')
doc_start[2] = "The Committee then turned to a discussion of the economic outlook"
doc_start[3] = re.compile('The information  (reviewed|received|provided)')

doc_end = {}
doc_end[0] = re.compile(
    '(At the conclusion of) (this|the) (discussion|meeting)')
doc_end[1] = re.compile('(?i)The Committee voted to authorize')
doc_end[2] = re.compile('(?i)The vote encompassed approval of')

if __name__ == '__main__':
    corpus_root = '/Users/aaroncgw/Google Drive/fednlp/data/minutes/'
    data_m = CategorizedPlaintextCorpusReader(corpus_root,
                                              r'.*\.txt',
                                              cat_pattern=r'(\w+)/*')
    data_fileids = data_m.fileids()

    for f in data_fileids:
        year, fname = f.split('/')
        cropped_text = crop_text(data_m.raw(f), doc_start, doc_end)
        saveFile(fname, year, cropped_text)
Example #14
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'D:\LEARNING\MISC\DataSet\movieCorpus\review_polarity\txt_sentoken',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
Example #15
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'/home/smadyastha/Projects/PythonCheck/Dataset/Reviews/tokens',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

# /home/smadyastha/Projects/PythonCheck/Dataset/Reviews
Example #16
print(strftime("%Y-%m-%d %H:%M:%S", gmtime()))
# Uni
machinename = 'maj27'

j = 0
for i in range(10):
    dataset = str(i + 1)
    #mydir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Classified News/Training'
    train_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Train+val'
    test_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Testing'
    #test_dir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/NA is negative old/Classified News/Criteria '+dataset+''

    preprocessed = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/data_2.p'

    train_Corpus = CategorizedPlaintextCorpusReader(train_dir,
                                                    r'(?!\.).*\.txt',
                                                    cat_pattern=r'(\w+)/*')

    train_documents = [(list(train_Corpus.words(fileid)), category)
                       for category in train_Corpus.categories()
                       for fileid in train_Corpus.fileids(category)]

    only_docs = [' '.join(doc[:1000]) for (doc, category) in train_documents]
    only_docs = [
        ' '.join(normalize_text(document, lemmatize=True, remove_stop=None))
        for document in only_docs
    ]

    #######################################################################################
    train_labels = [category for (doc, category) in train_documents]
    train_binary_labels = [1 if i == 'pos' else 0 for i in train_labels]
Example #17
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'/Users/dechamoungsri/NLP_Learning/NLP_tutotial/mix20_rand700_tokens_cleaned/tokens/',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]

print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()
for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
Example #18
# Download an external corpus, load it, and access it

from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint #random

# The first line is where you read the corpus by calling
# the CategorizedPlaintextCorpusReader constructor.
# The three arguments, from left to right, are: the absolute path
# to the folder containing the corpus on your computer, all sample
# document names from the txt_sentoken folder, and the categories
# in the given corpus (in our case, 'pos' and 'neg').

reader = CategorizedPlaintextCorpusReader(r'\Users\JoeDi\Desktop\python projs\tokens', r'.*\.txt', cat_pattern=r'(\w+)/*')


print(reader.categories())
print(reader.fileids())

# Now that we've made sure the corpus is loaded correctly, let's
# access one of the sample documents from each of the categories.
# For that, let's first create two lists, one containing the samples of each
# category, 'pos' and 'neg', respectively.
# Add the following two lines of code:

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

# The next two lines select a random file, each from the set of positive
# and negative category reviews. The last two lines just print the filenames.

fileP = posFiles[randint(0,len(posFiles)-1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
Example #19
def classify_emails():
    stop_words = set(stopwords.words("english"))

    lemmatizer = WordNetLemmatizer()

    mydir = '/home/ubuntu/nltk_data/corpora/gmail'

    all_words = []
    filtered_words = []
    removedPuncuations_words = []
    lematized_words = []
    test_filter = []

    mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt', cat_pattern=r'(hotel|flight|other)/.*', encoding='latin-1')
    stop = stopwords.words('english')
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

    word_features = FreqDist(chain(*[i for i,j in documents]))
    word_features = [w for w, _ in word_features.most_common(100)]

    def word_feats(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)

        return dict(features)

    negids = mr.fileids('hotel')
    posids = mr.fileids('flight')
    neutralids = mr.fileids('other')

    negfeats = [(word_feats(mr.words(fileids=[f])), 'hotel') for f in negids]
    posfeats = [(word_feats(mr.words(fileids=[f])), 'flight') for f in posids]
    neutralfeats = [(word_feats(mr.words(fileids=[f])), 'other') for f in neutralids]

    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    neutralcutoff = len(neutralfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] + neutralfeats[:neutralcutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] + neutralfeats[neutralcutoff:]

    classifier = nltk.NaiveBayesClassifier.train(trainfeats)
    print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testfeats))*100)

    print ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats)*100)


    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)

    test_sent_features = {word.lower(): (word in tokens) for word in mr.words()}

    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)
    tri_tokens = trigrams(tokens)

    cities = []
    matchedIndex = []
    tokenized = []
    addresses = []
    district = ['Akarawita','Angamuwa','Avissawella','Batawala','Battaramulla','Batugampola','Bope','Boralesgamuwa','Borella','Dedigamuwa','Dehiwala','Deltara','Habarakada','Handapangoda','Hanwella','Hewainna','Hiripitya','Hokandara','Homagama','Horagala','Kaduwela','Kahawala','Kalatuwawa','Madapatha','Maharagama','Malabe','Meegoda','Padukka','Pannipitiya','Piliyandala','Pitipana','Homagama','Polgasowita','Puwakpitiya','Ranala','Siddamulla','Slave Island','Sri Jayawardenapura','Talawatugoda','Tummodara','Waga','Watareka','Dickwella']

    for i in tokens:
        tokenized.append(i)

    pattern = re.compile("\d+")
    for i in tokenized:
        if pattern.match(i):
            matchedIndex.append(tokenized.index(i))
            print ("match"+i)
            print (tokenized.index(i))

        else:
            print ("not match")

    for t in tokenized:
        for i in district:
            if t.lower()==i.lower():
                cities.append(tokenized.index(t))

    distance= 200
    start = 0
    end = 0

    for t in cities:
        for i in matchedIndex:
            dis = t-i;
            if (dis<=distance and dis>0):
                distance=dis
                start=t
                end=i
            else:
                print ("higher")

    address = ""

    for token in range(end,start+1):
        address+=tokenized[(token)]
        print (address)
        addresses.append(address)

    for address in addresses:
        try:
            search = geocoder.get(address)
        except ValueError:
            continue
        first_result = search[0]

    output =  [first_result.geometry.location.lat,first_result.geometry.location.lng]


    stri = ','.join(map(str, output))
    return stri
Example #20
def read_corpus(root_dir):
    return CategorizedPlaintextCorpusReader(root_dir,
                                            FILE_PATTERN,
                                            cat_pattern=CAT_PATTERN)
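# FILE_PATTERN and CAT_PATTERN are module-level constants defined elsewhere;
# a minimal usage sketch with hypothetical values:
FILE_PATTERN = r'.*\.txt'   # hypothetical: every .txt file in the tree
CAT_PATTERN = r'(\w+)/*'    # hypothetical: category = top-level folder name

corpus = read_corpus('/path/to/corpus')
print(corpus.categories())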
Example #21
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader
from nltk.corpus import brown

# Open the documents inside the given path
# Arguments
# 1. Absolute path to the documents
# 2. type / extension of the documents (*.txt)
# 3. indicator of the folders that will form the categories
# all arguments are regular expressions

leitor = CategorizedPlaintextCorpusReader(
    '../Dados/mix20_rand700_tokens_cleaned/tokens/',
    '.*.txt',
    cat_pattern=r'(\w+)/*')

# Check what was loaded
print(leitor.categories())
print(leitor.fileids())

# Split the corpus according to its categories
posFiles = leitor.fileids(categories='pos')
negFiles = leitor.fileids(categories='neg')
print('pos files:', posFiles)
print('neg files:', negFiles)

# Load the first files of each category
arqP = posFiles[0]
arqN = negFiles[1]

print("ArqP: ", arqP)
Example #22
# Imports
from nltk.corpus import CategorizedPlaintextCorpusReader

# Read in the corpus
reader = CategorizedPlaintextCorpusReader(r'C:\Users\hyery\Python-NLP\chaper01\Reviews\tokens',
                                          r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

# Build lists containing the samples for each category
posFiles = reader.fileids(categories='pos')  # fileids() takes a category name as its argument
negFiles = reader.fileids(categories='neg')

# Pick a file at random from each list
from random import randint

fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

# Access the selected files and print their sentences
for w in reader.words(fileP):
    print(w+' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w+' ', end='')
    if w == '.':
        print()
Example #23
Example of reading a report corpus and generating a concordance and bi-grams

Create an NLTK plaintext corpus using `examples/nltk_create_report_corpus.py`
"""

from pprint import pprint
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader, stopwords
import logging

CORPUS_ROOT = "/Users/derek/Data/RADCAT/corpus"

if __name__ == "__main__":
    # For reports with the category embedded in the filename, e.g. abc_def+3.txt
    reports = CategorizedPlaintextCorpusReader(CORPUS_ROOT,
                                               '.*',
                                               cat_pattern=r'.*\+(.+)\.txt')

    logging.basicConfig(level=logging.DEBUG)
    logging.debug(reports.categories())

    toks = [
        w.lower() for w in reports.words()
        if w.isalpha() and w not in stopwords.words('english')
    ]

    all = nltk.Text(toks)
    all.concordance('hemodynamically')

    # Create your bi-grams and n-grams
    # bgs = nltk.bigrams(toks)
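    # A minimal sketch of that bi-gram step: frequency-rank the bi-grams with FreqDist.
    bigram_freq = nltk.FreqDist(nltk.bigrams(toks))
    for pair, count in bigram_freq.most_common(10):
        print(pair, count)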
Example #24
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 11 08:43:52 2021

@author: paulogamero
"""
# ACTIVITY: EXERCISE 1 - PART 01
# AUTHOR: Paulo Gamero

from nltk.corpus import CategorizedPlaintextCorpusReader

d = CategorizedPlaintextCorpusReader(
    r'C:\Users\Usuario\Dropbox\Pos\Pós DataScience\4 - Análise de textos com R e Python\Dados\mix20_rand700_tokens_cleaned\tokens',
    r'.*.txt', cat_pattern=r'(\w+)/*', encoding='iso8859-1')

for p in d.words('pos/cv003_tok-8338.txt'):
    print(p + ' ', end = '')

for n in d.words('neg/cv002_tok-3321.txt'):
    print(n + ' ', end = '')
Example #25
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'

mr = CategorizedPlaintextCorpusReader(mydir,
                                      r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*',
                                      encoding='ascii')
stop = stopwords.words('english')
documents = [([
    w for w in mr.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i, j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens)
               for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens)
              for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]
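# A minimal follow-up sketch, mirroring the pattern in Example #5: train the
# Naive Bayes classifier on the split and report accuracy.
classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(20)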
Example #26
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk
import sys
import os

mydir_train = '.\\Docs-txt\\train'
mydir_test = '.\\Docs-txt\\test'
featureVector_train = []
featureVector_test = []

mr_train = CategorizedPlaintextCorpusReader(
    mydir_train,
    r'(?!\.).*\.txt',
    cat_pattern=r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')
mr_test = CategorizedPlaintextCorpusReader(
    mydir_test,
    r'(?!\.).*\.txt',
    cat_pattern=r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')

stop = stopwords.words('english')

with open('.\\stopwords.txt') as f:
    stop = f.read().splitlines()

documents_train = [([
    w for w in mr_train.words(i)
Example #27
#http://www.cs.cornell.edu/people/pabo/movie%2Dreview%2Ddata/

from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

reader = CategorizedPlaintextCorpusReader(
    r'mix20_rand700_tokens_cleaned/tokens', r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]

print(fileN)
print(fileP)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
Example #28
def load_headline_corpus(with_dates=True, force_get=False, verbose=False):
    # set up paths
    if with_dates:
        zip_file_name = DATE_CORPUS_FILENAME        
    else:
        zip_file_name = CAT_CORPUS_FILENAME
    
    # github download url
    url = 'https://github.com/tacticsiege/TacticCorpora/raw/master/headlines/archive/' + zip_file_name

    
    env_dir = get_env_dir()
    # archive paths
    archive_dir = env_dir + 'corpus\\archive\\'
    archive_file_name = archive_dir + zip_file_name
    # extracted corpus paths
    corpus_root = 'dated' if with_dates else 'categorized'
    saved_dir = env_dir + 'corpus\\' + corpus_root
    
    # check if the data is downloaded
    downloaded = os.path.exists(archive_file_name)

    # download the data from github
    if not downloaded:
        pathlib.Path(archive_dir).mkdir(parents=True, exist_ok=True)
        if verbose:
            print ('Downloading:', url, '...')
        with req.urlopen(url) as d, open(archive_file_name, 'wb') as tmpFile:
            data = d.read()
            tmpFile.write(data)
        if verbose:
            print ('Complete, saved to:', archive_file_name)

    # extract the data if the root directory doesn't exist
    extracted = os.path.exists(saved_dir)
    if not extracted:
        pathlib.Path(saved_dir).mkdir(parents=True, exist_ok=True)
        if verbose:
            print ('Opening:', archive_file_name)
        archive = zipfile.ZipFile(archive_file_name)        
        archive.extractall(saved_dir)
        archive.close()
        if verbose:
            print ('Extracted to:', saved_dir)
    
    file_pattern = r'.*_corpus\.txt'
    
    if with_dates:
        cat_pattern = r'(.*)/'
        # HACK: fix this in archive later
        saved_dir = saved_dir + '\\2017_08_22\\corpus'
        if verbose:
            print ('Loading corpus from:', saved_dir)
        corpus = CategorizedDatedCorpusReader(saved_dir, file_pattern=file_pattern, cat_pattern=cat_pattern)
    else:
        cat_pattern = ".*_(.*)_corpus.txt"
        if verbose:
            print ('Loading corpus from:', saved_dir)
        # The reader's second positional argument is the fileids pattern.
        corpus = CategorizedPlaintextCorpusReader(saved_dir, file_pattern, cat_pattern=cat_pattern)

    if corpus is not None and verbose:
        print ('Corpus loaded.')

    return corpus
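# A minimal usage sketch: load the categorized (undated) corpus and list its
# categories (assumes the filename constants and environment paths used above
# are configured).
corpus = load_headline_corpus(with_dates=False, verbose=True)
print(corpus.categories())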