Example #1
    def init_documents(f_re, cat_re):
        logging.debug("Reading corpus")
        reports = CategorizedPlaintextCorpusReader(corpus_dir,
                                                   f_re,
                                                   cat_pattern=cat_re,
                                                   encoding='utf8')
        logging.debug("Found {} fileids".format(len(reports.fileids())))
        logging.debug("Found categories: {}".format(reports.categories()))
        logging.debug("Building docs")

        documents = [
            (tokenize(reports.words(i)), reports.categories(i)[0])
              for i in reports.fileids()]
        return documents
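Example #1 relies on a module-level corpus_dir, a configured logging module, and a tokenize helper that are not shown. A minimal sketch of what those assumptions might look like (names and values here are illustrative only, not from the original):

import logging

from nltk.corpus import CategorizedPlaintextCorpusReader

logging.basicConfig(level=logging.DEBUG)

# Hypothetical stand-ins -- the original defines these elsewhere.
corpus_dir = 'corpus/reports'

def tokenize(words):
    # Minimal placeholder: keep lowercase alphabetic tokens only.
    return [w.lower() for w in words if w.isalpha()]

# Example call (assuming init_documents is in scope as a plain function):
# documents = init_documents(r'.*\.txt', r'(\w+)/.*')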
Example #2
def nltk():
    #### FOR TRAINING DATA ####
    stop = stopwords.words('spanish')

    # Reads the training data.
    traindir = '/Users/ruben/Desktop/Formularios_clasificados/training'
    mr = CategorizedPlaintextCorpusReader(traindir, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*', encoding='utf-8')

    # Converts training data into tuples of [(words,label), ...]
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w not in string.punctuation], i.split('/')[0]) for i
                 in mr.fileids()]
    # Extract training features.
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = [w for w, _ in word_features.most_common(100)]  # 100 most frequent words
    # Assuming that you're using full data set
    # since your test set is different.
    train_set = [({i: (i in tokens) for i in word_features}, tag) for tokens, tag in documents]

    #### TRAIN THE CLASSIFIER ####
    # Train the Naive Bayes classifier
    classifier = NaiveBayesClassifier.train(train_set)

    #### FOR TESTING DATA ####
    # Now do the same reading and processing for the testing data.
    testdir = '/Users/ruben/Desktop/Formularios_clasificados/testing'
    mr_test = CategorizedPlaintextCorpusReader(testdir, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*', encoding='utf-8')
    # Converts testing data into tuples of [(words,label), ...]
    test_documents = [
        ([w for w in mr_test.words(i) if w.lower() not in stop and w not in string.punctuation], i.split('/')[0]) for i in
        mr_test.fileids()]
    # Reads test data into features:
    test_set = [({i: (i in tokens) for i in word_features}, tag) for tokens, tag in test_documents]

    correct = 0
    wrong = 0
    #### Evaluate the classifier ####
    for doc, gold_label in test_set:
        tagged_label = classifier.classify(doc)
        if tagged_label == gold_label:
            correct += 1
        else:
            wrong += 1

    print(correct, wrong, correct / (correct + wrong))  # accuracy = correct / total
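The hand-rolled accuracy count above can also be cross-checked with NLTK's built-in helper. A minimal sketch, assuming the classifier and test_set built in this example:

import nltk

# Should match the manual correct / (correct + wrong) figure printed above.
print(nltk.classify.accuracy(classifier, test_set))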
Example #3
def construct_model(corpusPath, modelPath):
    mr = CategorizedPlaintextCorpusReader(corpusPath, r'(?!\.).*\.txt',
                                          cat_pattern=r'(\w+)/.*', encoding='iso-8859-1')
    stop = stopwords.words('french')
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation],
                   i.split('/')[0]) for i in mr.fileids()]
    word_features = FreqDist(chain(*[i for i, j in documents]))
    word_features = list(word_features.keys())
    numtrain = int(len(documents) * 100 / 100)
    train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens, tag in documents[:numtrain]]
    """test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens, tag  in documents[numtrain:]]"""
    classifier = nbc.train(train_set)
    mrtest = CategorizedPlaintextCorpusReader(os.path.abspath("corpus_test"), r'(?!\.).*\.txt', cat_pattern=r'(\w+)/.*', encoding='iso-8859-1')
    documentsTest = [([w for w in mrtest.words(i) if w.lower() not in stop and w.lower() 
                   not in string.punctuation],
                   i.split('/')[0]) for i in mrtest.fileids()]
    word_features_test = FreqDist(chain(*[i for i, j in documentsTest]))
    word_features_test = list(word_features_test.keys())
    numtrain_test = int(len(documentsTest) * 100 / 100)
    test_set = [({i:(i in tokens) for i in word_features_test}, tag) for tokens, tag  in documentsTest[:numtrain_test]]
    save_classifier(classifier, modelPath)
Example #4
def display_features(num_features=1000,
                     show_features=200,
                     filepath='classifiers/nltk_nb.pkl',
                     verbose=True):
    '''
    Displays informative features from NHLCorpus
    '''
    stop_words = set(stopwords.words('english'))
    nhl = CategorizedPlaintextCorpusReader(root='data/NHLcorpus/',
                                           fileids=r'.*\.txt',
                                           cat_pattern='(\w+)/*')
    documents = []
    for category in nhl.categories():
        for fileid in nhl.fileids(category):
            documents.append(([
                re.sub(r'\W+', '', w.lower()) for w in nhl.words(fileid)
                if w.lower() not in stop_words
            ], category))
    all_words = nltk.FreqDist(
        re.sub(r'\W+', '', w.lower()) for w in nhl.words()
        if w.lower() not in stop_words)
    word_features = [w[0] for w in all_words.most_common(num_features)]

    def document_features(document):
        document_words = set(document)
        features = {}
        for word in word_features:
            features['contains({})'.format(word)] = word in document_words
        return features

    featuresets = [(document_features(d), c) for (d, c) in documents]
    nb_clf = nltk.NaiveBayesClassifier.train(featuresets)
    if verbose:
        nb_clf.show_most_informative_features(show_features)
        print('Accuracy on training data: {}'.format(
            nltk.classify.accuracy(nb_clf, featuresets)))

    save_classifier = open(filepath, 'wb')
    pickle.dump(nb_clf, save_classifier)
    save_classifier.close()
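The classifier pickled by display_features can later be restored and reused. A minimal sketch, assuming the default filepath used above:

import pickle

# Reload the Naive Bayes classifier saved by display_features().
with open('classifiers/nltk_nb.pkl', 'rb') as f:
    loaded_clf = pickle.load(f)

loaded_clf.show_most_informative_features(10)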
Example #5
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk, string, numpy

reader = CategorizedPlaintextCorpusReader(
    r'\Users\JoeDi\Desktop\MSC\Idioms Corpera',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

print(reader.categories())
print(reader.fileids())

from random import randint

File = reader.fileids()

fileP = File[randint(0, len(File) - 1)]
print(fileP)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

#https://sites.temple.edu/tudsc/2017/03/30/measuring-similarity-between-texts-in-python/

from sklearn.feature_extraction.text import CountVectorizer
import nltk, string, numpy

sss = "Because there is no easy way to decide how two words, two documents are related. All we have is sequence of letters " \
      "or strings if you prefer. So how to find a relationship between two words? If you want to decide how two documents related, " \
      "how to figure that out? It cant be done without having any other data."
Example #6
from nltk.corpus import CategorizedPlaintextCorpusReader
import ProcessText

d1 = "judge people by what they say"

d1_processed = ProcessText.ProcessText.process(d1)

documents = [d1]

#Read documents
reader = CategorizedPlaintextCorpusReader(
    r'\Users\JoeDi\Desktop\MSC\Idioms Corpera',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

for w in reader.fileids():
    wd = reader.raw(w)
    documents.append(w + " " + wd)

print("Documents in the collection are: ")
print(documents)
print("\n")

from sklearn.feature_extraction.text import TfidfVectorizer

#build a TF/IDF matrix for each description
tfidf = TfidfVectorizer().fit_transform(documents)

print("Tf-idf weightings are:  ")
print(tfidf)
print("\n")
Example #7
pos_file.close()
neg_file.close()


# Words for all emotions
lexicon = {}
for emotion in base_emotions:
    f = open('./opinion-lexicon-English/%s-words.txt' % emotion, 'r')
    words = [word.strip() for word in f.readlines()]
    lexicon[emotion] = words
    f.close()

# Make a classifier based on the feature sets of the poems
poem_corpus = CategorizedPlaintextCorpusReader('./data', 'poems.*',
        cat_file='cats.txt')

poem_set = [(fileid, category) for fileid in poem_corpus.fileids() \
        for category in poem_corpus.categories(fileid)]
random.shuffle(poem_set)

feature_set = [(poem_features(poem_corpus.words(fileids=[fileid])),
        category) for (fileid, category) in poem_set]

train_set, test_set = feature_set[2000:], feature_set[:2000]

# Initialize the classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

# For improving the algorithm
classifier.show_most_informative_features(20)
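poem_features is not shown in this snippet. A purely illustrative sketch of what such a feature extractor might look like, counting hits against the lexicon built above (the original implementation may differ):

def poem_features(words):
    # Hypothetical extractor: count how many poem words appear in each
    # emotion's lexicon (case-insensitive).
    features = {}
    for emotion, emotion_words in lexicon.items():
        emotion_set = set(emotion_words)
        features['count(%s)' % emotion] = sum(1 for w in words if w.lower() in emotion_set)
    return features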
Example #8
import time
start=time.time()
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import CategorizedPlaintextCorpusReader
from sklearn.cluster import KMeans
import numpy as np
import copy
import math
import re

corpus_root = 'C:\\MyData\\PythonPractice\\IMDB\\train' #Path of IMDB Train Data
reader=CategorizedPlaintextCorpusReader(corpus_root,r'.*\.txt',cat_pattern=r'(\w+)/*')

r_pos=reader.fileids(categories=['pos'])
r_neg=reader.fileids(categories=['neg'])

global_shortlisted=[]
TRAIN_GS_POS=[]

for i in range(0,12500):
    
    doc=reader.raw(r_pos[i:i+1])   #doc contains the movie review
    sentences = nltk.sent_tokenize(doc)
    senlen=len(sentences)
    
    def decontracted(phrase):
        # specific
        phrase = re.sub(r"won't", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)
    
Example #9
problem = 'problemA'
problem_root = nltk.data.find('corpora/AAAC/%s' % (problem))
problem_files = PlaintextCorpusReader(problem_root, '.*\.txt')


# Categorize corpus by author
auth_map = {}
for filename in problem_files.fileids():
	a_n =  filename[:3]
	auth_map[filename] =  [a_n]

# By the entire corpus
problem_cat = CategorizedPlaintextCorpusReader(problem_root, '.*\.txt', cat_map=auth_map)
documents = [(list(problem_cat.words(fileid)), category) 
				for category in problem_cat.categories() 
				for fileid in problem_cat.fileids(category)]
random.shuffle(documents)


# Word Frequency featureset
# Word freq accross corpus
all_words = nltk.FreqDist(words.lower() for words in problem_cat.words())
key_words = [w for w, _ in all_words.most_common(2000)]


# Compares whether a word from the keywords is in a document
def doc_features(doc):
	doc_words = set(doc)
	features = {}
	for word in key_words:
		features['contains(%s)' % word] = (word in doc_words)
Example #10
# Open the documents under the given path
# Arguments:
# 1. Absolute path to the documents
# 2. Type / extension of the documents (*.txt)
# 3. Pattern for the folders that will form the categories
# All arguments are regular expressions

leitor = CategorizedPlaintextCorpusReader(
    '../Dados/mix20_rand700_tokens_cleaned/tokens/',
    '.*.txt',
    cat_pattern=r'(\w+)/*')

# Check what was loaded
print(leitor.categories())
print(leitor.fileids())

# Split the corpus according to the categories
posFiles = leitor.fileids(categories='pos')
negFiles = leitor.fileids(categories='neg')
print('Pos files:', posFiles)
print('Neg files:', negFiles)

# Load the first files from each category
arqP = posFiles[0]
arqN = negFiles[1]

print("ArqP: ", arqP)
print("ArqN: ", arqN)

# Print the sentences from the files
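The snippet ends before the printing step. A minimal sketch of how it might continue, using the leitor reader and the files selected above:

# Print the first three sentences of each selected file.
for sent in leitor.sents(fileids=[arqP])[:3]:
    print(' '.join(sent))

for sent in leitor.sents(fileids=[arqN])[:3]:
    print(' '.join(sent))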
Example #11
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'

mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt', cat_pattern=r'(neg|pos)/.*', encoding='ascii')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i,j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag in documents[:numtrain]]
test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag  in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
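With the model above trained, an unseen review can be encoded the same way the training documents were and then classified. A minimal sketch, assuming stop, word_features, and classifier from this example and an illustrative input string:

from nltk import word_tokenize

# Illustrative input text.
new_text = "This movie was a complete waste of time."
tokens = [w for w in word_tokenize(new_text)
          if w.lower() not in stop and w.lower() not in string.punctuation]
features = {w: (w in tokens) for w in word_features}
print(classifier.classify(features))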
Example #12
        os.chdir(directory)
        file = open(fname, 'w')
        file.write(text)
        file.close()


doc_start = {}
doc_start[0] = "Staff Review of the Economic Situation"
doc_start[1] = re.compile('The information (reviewed|received|provided)')
doc_start[2] = "The Committee then turned to a discussion of the economic outlook"
doc_start[3] = re.compile('The information  (reviewed|received|provided)')

doc_end = {}
doc_end[0] = re.compile(
    '(At the conclusion of) (this|the) (discussion|meeting)')
doc_end[1] = re.compile('(?i)The Committee voted to authorize')
doc_end[2] = re.compile('(?i)The vote encompassed approval of')

if __name__ == '__main__':
    corpus_root = '/Users/aaroncgw/Google Drive/fednlp/data/minutes/'
    data_m = CategorizedPlaintextCorpusReader(corpus_root,
                                              r'.*\.txt',
                                              cat_pattern=r'(\w+)/*')
    data_fileids = data_m.fileids()

    for f in data_fileids:
        year, fname = f.split('/')
        cropped_text = crop_text(data_m.raw(f), doc_start, doc_end)
        saveFile(fname, year, cropped_text)
Example #13
def transform(corpus: CategorizedPlaintextCorpusReader, target_root_dir):
    if not os.path.exists(target_root_dir):
        os.makedirs(target_root_dir)
    open(target_root_dir + "\\meta.info", 'w').write("tagged\nmarks.txt")
    for fileid in corpus.fileids():
        yield process(corpus, target_root_dir, fileid)
Example #14
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader

reader = CategorizedPlaintextCorpusReader(
    r'/home/smadyastha/Projects/PythonCheck/Dataset/Reviews/tokens',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

from random import randint
fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

# /home/smadyastha/Projects/PythonCheck/Dataset/Reviews
Example #15
import time

start = time.time()
import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import CategorizedPlaintextCorpusReader
from sklearn.cluster import KMeans
import numpy as np
import copy
import math
import re

corpus_root = 'C:\\MyData\\PythonPractice\\IMDB\\test'  #Path of IMDB Test Data
reader = CategorizedPlaintextCorpusReader(corpus_root,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*')

r_neg = reader.fileids(categories=['neg'])
r_pos = reader.fileids(categories=['pos'])

global_shortlisted = []
TEST_GS_POS = []

for i in range(0, 12500):

    doc = reader.raw(r_pos[i:i + 1])  #doc contains the movie review
    sentences = nltk.sent_tokenize(doc)
    senlen = len(sentences)

    def decontracted(phrase):
        # specific
        phrase = re.sub(r"won't", "will not", phrase)
        phrase = re.sub(r"can\'t", "can not", phrase)
Example #16
# Provide path to the custom corpora

mydir = '/Users/vasilis/Desktop/Lennon/lyrics_custom_corpus'

# Read data from our custom corpora

mr = CategorizedPlaintextCorpusReader(mydir,
                                      r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*')

# Clean lyrics from the English stop words.
stop = stopwords.words('english')

documents = [(list(mr.words(fileid)), category)
             for category in mr.categories()
             for fileid in mr.fileids(category)]

classifiers_dir = '/Users/vasilis/vxm773/Lennon/pickled_classifiers'

if os.path.exists(classifiers_dir):
    shutil.rmtree(classifiers_dir)
os.makedirs(classifiers_dir)

save_documents = open("pickled_classifiers/documents.pickle", "wb")
pickle.dump(documents, save_documents)
save_documents.close()

# Shuffle lyrics in order to avoid training only towards pos/neg lyrics.

random.shuffle(documents)
Example #17
for i in range(10):
    dataset = str(i + 1)
    #mydir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Classified News/Training'
    train_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Train+val'
    test_dir = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/Testing'
    #test_dir = 'C:/Users/'+machinename+'/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/NA is negative old/Classified News/Criteria '+dataset+''

    preprocessed = 'C:/Users/' + machinename + '/New folder/Dropbox/PhD Brighton/Dataset/healthnewsreview_org/Well done 5 and 10 inverted/Classified Story/Criteria ' + dataset + '/data_2.p'

    train_Corpus = CategorizedPlaintextCorpusReader(train_dir,
                                                    r'(?!\.).*\.txt',
                                                    cat_pattern=r'(\w+)/*')

    train_documents = [(list(train_Corpus.words(fileid)), category)
                       for category in train_Corpus.categories()
                       for fileid in train_Corpus.fileids(category)]

    only_docs = [' '.join(doc[:1000]) for (doc, category) in train_documents]
    only_docs = [
        ' '.join(normalize_text(document, lemmatize=True, remove_stop=None))
        for document in only_docs
    ]

    #######################################################################################
    train_labels = [category for (doc, category) in train_documents]
    train_binary_labels = [1 if i == 'pos' else 0 for i in train_labels]

    #train_data, test_data, train_labels, test_labels = train_test_split(only_docs, binary_labels,test_size=.15)
    train_data = only_docs
    train_labels = train_binary_labels
Example #18
# NLTK Brown corpus selection
word_list_brown = brown.words()
sents_list_brown = brown.sents()
vocabulary_brown = set(word_list_brown)
brown_len_words = len(word_list_brown)
brown_len_sents = len(sents_list_brown)
brown_len_vocab = len(vocabulary_brown)
brown_richness = lexical_diversity(word_list_brown)

# Lyric corpus
cats = corpus.categories()
print(len(cats))
print(cats)

num_files = len(corpus.fileids())
word_list = list(corpus.words())
sents_list = list(corpus.sents())
vocabulary = set(word_list)
total_len_words = len(word_list)
total_len_sents = len(sents_list)
total_len_vocab = len(vocabulary)
total_richness = lexical_diversity(word_list)

# POP
word_list_pop = list(corpus.words(categories="POP"))
sents_list_pop = list(corpus.sents(categories="POP"))
vocabulary_pop = set(word_list_pop)
pop_len_words = len(word_list_pop)
pop_len_sents = len(sents_list_pop)
pop_len_vocab = len(vocabulary_pop)
Example #19
'''
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

# working dir: UN/
mydir = 'corpus/meeting_records_final_categorized'

mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt', cat_pattern=r'(intervention|soft_action)/.*', encoding='utf-8')
stop = stopwords.words('english')
documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i,j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag in documents[:numtrain]]
test_set = [({i:(i in tokens) for i in word_features}, tag) for tokens,tag  in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))  # .87 - ?!?!?!
classifier.show_most_informative_features(20)

# for word_features.keys()[:100]
'''
Most Informative Features
Example #20
positive_consolidated_list = list(pos_list) + positive_greg
negative_consolidated_list = list(neg_list) + negative_greg
print(positive_consolidated_list)
print(negative_consolidated_list)

init_notebook_mode(connected=True)
cf.set_config_file(offline=True, world_readable=True, theme='ggplot')

#%%

corpus_root = "/Users/LENOVO USER/Desktop/FedTranscript1"
data_m = CategorizedPlaintextCorpusReader(corpus_root,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin1')
data_fileids = data_m.fileids()


#%%
def corpus_Stats(crp):
    print('Total number of files: ' + str(len(crp.fileids())))
    print('Number of paragraphs: ' + str(len(crp.paras())))
    print('Number of sentences: ' + str(len(crp.sents())))
    print('Number of words: ' + str(len(crp.words())))


#corpus_Stats(data_m)
#print('\n'+'First file: '+ data_fileids[0])
#print('Last file: '+ data_fileids[-1])

#%%
Example #21
def classify_emails():
    stop_words = set(stopwords.words("english"))

    lemmatizer = WordNetLemmatizer()

    mydir = '/home/ubuntu/nltk_data/corpora/gmail'

    all_words = []
    filtered_words = []
    removedPuncuations_words = []
    lematized_words = []
    test_filter = []

    mr = CategorizedPlaintextCorpusReader(mydir, r'(?!\.).*\.txt', cat_pattern=r'(hotel|flight|other)/.*', encoding='latin-1')
    stop = stopwords.words('english')
    documents = [([w for w in mr.words(i) if w.lower() not in stop and w.lower() not in string.punctuation], i.split('/')[0]) for i in mr.fileids()]

    word_features = FreqDist(chain(*[i for i,j in documents]))
    word_features = [w for w, _ in word_features.most_common(100)]

    def word_feats(document):
        words = set(document)
        features = {}
        for w in word_features:
            features[w] = (w in words)

        return dict(features)

    negids = mr.fileids('hotel')
    posids = mr.fileids('flight')
    neutralids = mr.fileids('other')

    negfeats = [(word_feats(mr.words(fileids=[f])), 'hotel') for f in negids]
    posfeats = [(word_feats(mr.words(fileids=[f])), 'flight') for f in posids]
    neutralfeats = [(word_feats(mr.words(fileids=[f])), 'other') for f in neutralids]

    negcutoff = len(negfeats) * 3 // 4
    poscutoff = len(posfeats) * 3 // 4
    neutralcutoff = len(neutralfeats) * 3 // 4

    trainfeats = negfeats[:negcutoff] + posfeats[:poscutoff] + neutralfeats[:neutralcutoff]
    testfeats = negfeats[negcutoff:] + posfeats[poscutoff:] + neutralfeats[neutralcutoff:]

    classifier = nltk.NaiveBayesClassifier.train(trainfeats)
    print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testfeats))*100)

    print ('accuracy:', nltk.classify.util.accuracy(classifier, testfeats)*100)


    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)

    test_sent_features = {word.lower(): (word in tokens) for word in mr.words()}

    file_content = open("/home/ubuntu/nltk_data/corpora/gmail/hotel/h12.txt").read()
    tokens = nltk.word_tokenize(file_content)
    tri_tokens = trigrams(tokens)

    cities = []
    matchedIndex = []
    tokenized = []
    addresses = []
    district = ['Akarawita','Angamuwa','Avissawella','Batawala','Battaramulla','Batugampola','Bope','Boralesgamuwa','Borella','Dedigamuwa','Dehiwala','Deltara','Habarakada','Handapangoda','Hanwella','Hewainna','Hiripitya','Hokandara','Homagama','Horagala','Kaduwela','Kahawala','Kalatuwawa','Madapatha','Maharagama','Malabe','Meegoda','Padukka','Pannipitiya','Piliyandala','Pitipana','Homagama','Polgasowita','Puwakpitiya','Ranala','Siddamulla','Slave Island','Sri Jayawardenapura','Talawatugoda','Tummodara','Waga','Watareka','Dickwella']

    for i in tokens:
        tokenized.append(i)

    pattern = re.compile("\d+")
    for i in tokenized:
        if pattern.match(i):
            matchedIndex.append(tokenized.index(i))
            print ("match"+i)
            print (tokenized.index(i))

        else:
            print ("not match")

    for t in tokenized:
        for i in district:
            if t.lower()==i.lower():
                cities.append(tokenized.index(t))

    distance= 200
    start = 0
    end = 0

    for t in cities:
        for i in matchedIndex:
            dis = t-i;
            if (dis<=distance and dis>0):
                distance=dis
                start=t
                end=i
            else:
                print ("higher")

    address = ""

    for token in range(end,start+1):
        address+=tokenized[(token)]
        print (address)
        addresses.append(address)

    for address in addresses:
        try:
            search = geocoder.get(address)
        except ValueError:
            continue
        first_result = search[0]

    output =  [first_result.geometry.location.lat,first_result.geometry.location.lng]


    stri = ','.join(map(str, output))
    return stri
Example #22
# Imports
from nltk.corpus import CategorizedPlaintextCorpusReader

# Read in the corpus
reader = CategorizedPlaintextCorpusReader(r'C:\Users\hyery\Python-NLP\chaper01\Reviews\tokens',
                                             r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

# Build lists containing the samples for each category
posFiles = reader.fileids(categories='pos')  # fileids() takes the category name as an argument
negFiles = reader.fileids(categories='neg')

# Pick a file at random from each list
from random import randint

fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]
print(fileP)
print(fileN)

# Access the selected files and print their sentences
for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
Example #23
class CorpusUtil(object):
    """Documentar
    """
    def __init__(self, raiz_corpus):
        """Cria um objeto do tipo 'CategorizedPlaintextCorpusReader',
        utilizando o diretório raiz do corpus, onde os documentos
        estão localizados, dispostos em seus respectivos subdiretórios,
        de acordo com sua categoria, sejam eles/elas quais for
        
        -->     raiz_corpus/{pos,neg,neu,...}.
        """
        reload(sys)
        sys.setdefaultencoding("utf-8")
        
        self._raiz_corpus = raiz_corpus
        self._corpus = CategorizedPlaintextCorpusReader(raiz_corpus, r'.+\.txt', cat_pattern=r'(\w+)/*',
                                                        encoding='utf-8')
        self._documentos = None
        self._palavras_frequentes = None
        self._todas_palavras = None
        self._featuresets = None
        self._train_set = None
        self._test_set = None

    def get_documentos(self):
        """Construimos uma lista de documentos, rotulados com as
        categorias apropriadas. Cada documento é representado por
        uma tupla na estrutura abaixo:
        
        (conteudo_do_documento, categoria)
        
        Retorna essa lista com todos os documentos do corpus.
        """
        """
        documentos = [(self.corpus.words(fileid), categoria)
                       for categoria in self.corpus.categories()
                       for fileid in self.corpus.fileids(categoria)]
        """
        print "-- Recuperando documentos do corpus."

        if self._documentos is None:            
            self._documentos = [Documento(" ".join(self._corpus.words(fileid)), categoria, self, fileid)
                                for categoria in self._corpus.categories()
                                for fileid in self._corpus.fileids(categoria)]

        # Shuffle the documents
        for i in range(0, 10):
            shuffle(self._documentos)

        return self._documentos

    def get_palavras_frequentes(self):
        """Documentar.
        """
        if self._palavras_frequentes is None:

            print "-- Verificando as palavras mais frequentes do corpus."

            # Test - return only the 2000 most frequent words in the corpus
            todas_palavras = [word.lower() for word in self._corpus.words()]
            freq_dist_palavras = FreqDist(todas_palavras)
            frequencia_palavras = freq_dist_palavras.most_common(2000)  # 2000 most frequent words
            
            self._palavras_frequentes = [palavra for palavra, frequencia in frequencia_palavras]
            
            # all_words = FreqDist(word.lower() for word in self.corpus.words())
            # self.word_features = list(all_words)[:2000]
        return self._palavras_frequentes

    def get_todas_palavras(self):
        if self._todas_palavras is None:
            print "-- Recuperando todas as palavras do corpus."
            self._todas_palavras = [word.lower() for word in self._corpus.words()]
            self._todas_palavras = set(self._todas_palavras)

        return self._todas_palavras

    def get_featuresets(self):
        """Configura os featuresets que são construídos na
        seguinte estrutura:
            (features_do_documento, categoria)
        
        Retorna uma lista de featuresets
        """
        if self._featuresets is None:
            
            if self._documentos is None:
                self.get_documentos()

            print "-- Recuperando featuresets."

            self._featuresets = apply_features(Documento.get_features, self._documentos)
        
        return self._featuresets

    def get_train_set(self):
        """Documentar
        """
        if self._featuresets is None:
            self.get_featuresets()

        print "-- Recuperando train_set."

        # To avoid using up all the RAM,
        # do not store every document in it at once.
        # self._train_set = apply_features(Documento.get_features, self._documentos[100:])
        self._train_set = apply_features(Documento.get_features, self._documentos)

        return self._train_set

    def get_test_set(self):
        if self._featuresets is None:
            self.get_featuresets()

        print "-- Recuperando test_set."

        # self._test_set = apply_features(Documento.get_features, self._documentos[:100])

        return self._test_set

    def gravar_palavras_frequentes(self):
        diretorio_destino = "/home/lucas/Documents/mineracao_opiniao/palavras_frequentes_corpus"
        molde_nome_arquivo = "palavras_frequentes_%s.pickle"

        tempo_agora = str(datetime.now())
        # Replace ':' and whitespace with '.'
        tempo_agora = re.sub(ur':|\s', '.', tempo_agora)
        nome_arquivo = molde_nome_arquivo % tempo_agora

        if self._palavras_frequentes is None:
            self.get_palavras_frequentes()

        f = open(diretorio_destino + "/" + nome_arquivo, 'wb')
        pickle.dump(self._palavras_frequentes, f)
        f.close()

        return True

    @staticmethod
    def abrir_arquivo_palavras_frequentes(arquivo_path):
        f = open(arquivo_path, 'rb')
        palavras_frequentes = pickle.load(f)
        f.close()

        return palavras_frequentes
Example #24
# Return errors in order to improve algorithm
def errors_em(poem_set):
    errors = []
    for (fileid, category) in poem_set:
        poem = corpus_of_poems.words(fileids=[fileid])
        emotion_correct = features_of_poem(poem)
        guess = classifier.classify(features_of_poem(poem))

        if guess != category:
            errors.append((category, guess, poem, emotion_correct['emotions']))

    return errors


poem_set = []
for fileid in corpus_of_poems.fileids():
    for category in corpus_of_poems.categories(fileid):
        poem_set.append((fileid, category))

print(poem_set)

random.shuffle(poem_set)

feature_set = []
for (fileid, category) in poem_set:
    feature_cal = (features_of_poem(fileid), category)
    feature_set.append(feature_cal)

train_set = feature_set[25:]

test_set = feature_set[:25]
Example #25
import string
from itertools import chain

from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.classify import NaiveBayesClassifier as nbc
from nltk.corpus import CategorizedPlaintextCorpusReader
import nltk

mydir = 'Documents/Plab/Project4/subset/test/neg'

mr = CategorizedPlaintextCorpusReader(mydir,
                                      r'(?!\.).*\.txt',
                                      cat_pattern=r'(neg|pos)/.*',
                                      encoding='ascii')
stop = stopwords.words('english')
documents = [([
    w for w in mr.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr.fileids()]

word_features = FreqDist(chain(*[i for i, j in documents]))
word_features = [w for w, _ in word_features.most_common(100)]

numtrain = int(len(documents) * 90 / 100)
train_set = [({i: (i in tokens)
               for i in word_features}, tag)
             for tokens, tag in documents[:numtrain]]
test_set = [({i: (i in tokens)
              for i in word_features}, tag)
            for tokens, tag in documents[numtrain:]]

classifier = nbc.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))
classifier.show_most_informative_features(5)
Example #26
# returns a list of raw sentences
def get_raw_sentences(fileid):  # works
    data = corpus.raw(fileid)
    return tokenizer.tokenize(data)


def get_raw_paragraph(
    fileid
):  #TODO test if this works with yahoo! corpus as well (encoding might differ)
    data = corpus.raw(fileid)
    return data.split(u"\r\n \r\n")


# ACCESS all FILEIDS:
# corpus.fileids([category])  # category is optional
print(corpus.fileids())

# GET ABSOLUTE PATH TO a FILEID
# corpus.abspath('not_aggressive/0__10.txt')

# GET RAW CORPUS
# corpus.raw()  or corpus.raw()[:10]  to get the first 10 chars of the raw text

# GET RAW TEXT COMMENT given fileid
# corpus.raw([fileid])  #  my_corpus.raw(my_corpus.fileids()[2])) # prints raw text of file index 2 of whole corpus#

# GET list of TOKENIZED SENTS for a COMMENT via index or fileid:
# sents = corpus.sents(corpus.fileids()[index])
# sents = corpus.sents([fileid])
"""
GET TOKENIZED PARAGRAPHS
Example #27
#http://www.cs.cornell.edu/people/pabo/movie%2Dreview%2Ddata/

from nltk.corpus import CategorizedPlaintextCorpusReader
from random import randint

reader = CategorizedPlaintextCorpusReader(
    r'mix20_rand700_tokens_cleaned/tokens', r'.*\.txt', cat_pattern=r'(\w+)/*')
print(reader.categories())
print(reader.fileids())

posFiles = reader.fileids(categories='pos')
negFiles = reader.fileids(categories='neg')

fileP = posFiles[randint(0, len(posFiles) - 1)]
fileN = negFiles[randint(0, len(negFiles) - 1)]

print(fileN)
print(fileP)

for w in reader.words(fileP):
    print(w + ' ', end='')
    if w == '.':
        print()

for w in reader.words(fileN):
    print(w + ' ', end='')
    if w == '.':
        print()
Example #28
    r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')
mr_test = CategorizedPlaintextCorpusReader(
    mydir_test,
    r'(?!\.).*\.txt',
    cat_pattern=
    r'(Analyst Report|Case Study|Datasheets|Technical Brief|Whitepaper)/.*')

stop = stopwords.words('english')

with open('.\\stopwords.txt') as f:
    stop = f.read().splitlines()

documents_train = [([
    w for w in mr_train.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr_train.fileids()
                   if os.path.getsize(os.path.join(mydir_train, i)) > 0]
documents_test = [([
    w for w in mr_test.words(i)
    if w.lower() not in stop and w.lower() not in string.punctuation
], i.split('/')[0]) for i in mr_test.fileids()
                  if os.path.getsize(os.path.join(mydir_test, i)) > 0]

word_features_train = FreqDist(chain(*[i for i, j in documents_train]))
word_features_train = list(word_features_train.keys())[:1000]

word_features_test = FreqDist(chain(*[i for i, j in documents_test]))
word_features_test = list(word_features_test.keys())[:1000]

for w in word_features_train:
    if (w in stop):