import nltk
from nltk.corpus import abc


def Automated_Readability_Index40(section):
    # ARI = 4.71 * (chars per word) + 0.5 * (words per sentence) - 21.43
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(section)
    sents = len(sent_tokenizer.tokenize(text))
    words = len(abc.words(section))
    letters = len(" ".join(abc.words(section)))  # character count, including the joining spaces
    uw = letters / float(words)   # average characters per word
    us = words / float(sents)     # average words per sentence
    ari = (4.71 * uw) + (0.5 * us) - 21.43
    return ari
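As a quick sanity check of the formula, assumed averages of 4.8 characters per word and 20 words per sentence give a score of about 11.2:

# Worked example with assumed averages (not computed from any corpus).
uw, us = 4.8, 20.0                       # chars per word, words per sentence
print((4.71 * uw) + (0.5 * us) - 21.43)  # 22.608 + 10.0 - 21.43 = 11.178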
Example #3
def calcARI(file):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(file)
    sents = sent_tokenizer.tokenize(text)
    avg_words = 0
    avg_letters = 0
    for sentence in sents:
        avg_words += len(sentence.split())  # words per sentence, not characters
    avg_words = avg_words / len(sents)
    for word in abc.words(file):
        avg_letters += len(word)
    avg_letters = avg_letters / len(abc.words(file))
    return (4.71 * avg_letters) + (0.5 * avg_words) - 21.43
Example #4
def ari(fileid):
    """Compute the Automated Readability Index for the given abc corpus fileid."""
    print(fileid)
    num_chars = len(abc.raw(fileid))
    num_words = len(abc.words(fileid))
    num_sents = len(abc.sents(fileid))

    avg_word_len = num_chars / num_words
    avg_sent_len = num_words / num_sents

    return avg_word_len * 4.71 + avg_sent_len * 0.5 - 21.43
def Automated_Readability_Index40(section):
    char_count = 0
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_text = abc.raw(section)
    sent = len(sent_tokenizer.tokenize(raw_text))
    words = len(abc.words(section))

    # Count alphabetic characters only, ignoring punctuation and whitespace.
    for ch in raw_text:
        if ch.isalpha():
            char_count = char_count + 1

    uw = char_count / float(words)
    us = words / float(sent)
    ARI = (4.71 * uw) + (0.5 * us) - 21.43
    return ARI
Example #6
import os
import pickle
import string

from nltk.corpus import abc, stopwords


def pre_process():
    """Remove stop words and punctuation marks from the corpus."""
    if 'cleaned_corpus.pkl' not in os.listdir(
            os.curdir) or 'cleaned_sentences.pkl' not in os.listdir(os.curdir):
        print('Pre-processing...')
        words = abc.words()
        words = [w for w in words]

        sentences = abc.sents()
        sentences = [s for s in sentences]

        stop_words = stopwords.words('english')
        punctuation = list(string.punctuation)
        for i in range(len(sentences)):
            # Iterate over a copy so that removing tokens does not skip elements.
            for j in list(sentences[i]):
                prev = len(sentences[i])
                if set(j) - set(punctuation) == set() or j.lower() in stop_words:
                    if j in words:
                        words.remove(j)
                    sentences[i].remove(j)
                    assert prev == len(sentences[i]) + 1

        # Drop sentences that are empty or reduced to a single token after cleaning.
        sentences = [s for s in sentences if len(s) > 1]

        pickle.dump(words, open('cleaned_corpus.pkl', 'wb'))
        pickle.dump(sentences, open('cleaned_sentences.pkl', 'wb'))

    else:
        print('Pre processed data already present..')
        words = pickle.load(open('cleaned_corpus.pkl', 'rb'))
        sentences = pickle.load(open('cleaned_sentences.pkl', 'rb'))

    return words, sentences
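A minimal usage sketch, assuming the abc and stopwords corpora have already been fetched with nltk.download:

words, sentences = pre_process()
print(len(words), len(sentences))   # sizes of the cleaned corpus
print(sentences[0][:10])            # first tokens of the first cleaned sentence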
# Part b - Parse data using BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())
print(soup.get_text())

# Question-5 - Tokenize text parsed from the above url using nltk.
# Find all phone numbers and email addresses from this text using regular expressions.
import re
# All the emails from the above text
email = re.findall(r'\S+@\S+', final_doc)
print(email)
# All the phone numbers, assuming a (XXX)-XXX-XXXX format
phone = re.findall(r'\(\d{3}\)-\d{3}-\d{4}', final_doc)
print(phone)
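A self-contained check of the two patterns on a made-up string (the string and the phone format are assumptions, not data from the page above):

sample = "Call (123)-456-7890 or write to user@example.com for details."
print(re.findall(r'\S+@\S+', sample))                # ['user@example.com']
print(re.findall(r'\(\d{3}\)-\d{3}-\d{4}', sample))  # ['(123)-456-7890']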

# Question-6 - Use the Porter Stemmer to normalize some tokenized text, calling the stemmer on each word.
# Do the same thing with the Lancaster Stemmer and see if you observe any differences
import nltk
from nltk.corpus import abc
text = abc.words()

porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

for w in text:
    print(w)
    # Word after implementing Porter Stemmer
    print(porter.stem(w))
    # Word after implementing Lancaster Stemmer
    print(lancaster.stem(w))
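A smaller, self-contained comparison on a few sample words (the word list is arbitrary), which avoids printing the entire corpus:

for w in ['running', 'maximum', 'presumably', 'crying']:
    print(w, porter.stem(w), lancaster.stem(w))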
Example #8
# File Name			: corpus.py
# Description		: This creates a collection of words with their frequency
# Author			: Ajay
# Date				: 2016-11-19
#==================================================

import os, sys, pickle
from nltk.corpus import brown, movie_reviews, reuters, gutenberg, abc
from collections import Counter

w1 = gutenberg.words()
w2 = brown.words()
w3 = movie_reviews.words()
w4 = reuters.words()
w5 = abc.words()
ww = w1 + w2 + w3 + w4 + w5


WORDS = Counter(ww)

# print(len(Counter(w5)))
# print(len(WORDS))
os.chdir("/Users/chaser/Projects/Dictionary")
with open("corpus", 'wb') as corpora:  # a with-block closes the file even if the dump fails
    pickle.dump(WORDS, corpora)
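A brief follow-up sketch showing how the pickled Counter can be read back and queried (same hard-coded path as above):

with open("corpus", 'rb') as corpora:
    WORDS = pickle.load(corpora)
print(WORDS.most_common(5))   # the five most frequent tokens
print(WORDS["the"])           # raw count of a single word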
from collections import Counter
import random
random.seed(44)
import math
import numpy as np
punctuation = '!"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~,.'
import nltk
from nltk.corpus import abc
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
"""**Getting** dataset"""

words = abc.words()

threshold = 0.000001
window_size = 3
"""*Supporting* functions"""


def preprocess(words):
    words = [i.lower() for i in words]
    words = [i for i in words if i not in punctuation]

    return words


def get_preprocess_and_sub_sampled_data(words):
    corpus = preprocess(words)
    # ... (the sub-sampling logic and the start of the training loop are missing
    # from this excerpt; the indented lines below belong to the training loop)
            temp_x = Variable(torch.LongTensor([w2i[x]]))
            temp_y = Variable(torch.LongTensor([w2i[y]]))
            model.zero_grad()
            log_probs = model(temp_x, temp_y)
            compare = Variable(torch.Tensor([1]))
            loss = loss_fn(log_probs[0], compare)
            loss.backward()
            optimizer.step()
            total_loss += loss.data.item()
        print(epoch, total_loss)
        plot_tsne_skip(skipgram_train[:1000], model, epoch)
    return model


nltk.download('abc')
text = list(abc.words())
vocab = set(text)
vocab_size = len(vocab)
embd_size = 50
lr = 0.1
epochs = 50
hidden_size = 100
w2i, i2w = {}, {}
pt = 0
for word in vocab:
    w2i[word] = pt
    i2w[pt] = word
    pt += 1
subset = text[:5000]
cbow_train = cbow_dataset(subset)
skipgram_train = skipgram_dataset(subset)
loss_fn = nn.NLLLoss()
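The cbow_dataset and skipgram_dataset helpers are not included in this excerpt; below is a minimal sketch of what a skip-gram pair builder might look like, reusing the window_size defined above (an assumption about the original code, not its actual implementation):

def skipgram_pairs(tokens, window=window_size):
    # Hypothetical stand-in: emit (center, context) word pairs within a window.
    pairs = []
    for i, center in enumerate(tokens):
        lo, hi = max(0, i - window), min(len(tokens), i + window + 1)
        for j in range(lo, hi):
            if j != i:
                pairs.append((center, tokens[j]))
    return pairs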
Example #11
from nltk.corpus import abc
from nltk.probability import LidstoneProbDist
# NgramModel was removed in NLTK 3; in older releases it lived in nltk.model.


def trainModel():
    totalwords = abc.words()  #+ genesis.words() + gutenberg.words() + webtext.words()
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    BigramModel = NgramModel(2, totalwords, estimator=estimator)
    UnigramModel = NgramModel(1, totalwords, estimator=estimator)
    return (UnigramModel, BigramModel)
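Since NgramModel is gone from current NLTK, a roughly equivalent setup with the nltk.lm module is sketched below; the API names reflect my understanding of nltk.lm and are worth verifying against the installed version:

from nltk.corpus import abc
from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline

sents = abc.sents()[:1000]                  # small slice for a quick check
train_data, vocab = padded_everygram_pipeline(2, sents)
bigram_model = Lidstone(0.2, 2)             # gamma=0.2, bigram order
bigram_model.fit(train_data, vocab)
print(bigram_model.score('the'))            # unigram probability of 'the'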
import nltk
nltk.download('abc')
from nltk.corpus import abc
import sys
from keras.models import Model
from keras.layers import Input, Dense, Reshape, dot
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams, make_sampling_table
from collections import Counter
import numpy as np
from keras.callbacks import Callback
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm

vocab = [word.lower() for word in abc.words()]
vocab = [word for word in vocab if word.isalnum()]
freq_count = Counter(vocab).most_common()
unique_words_count = len(freq_count)

word_indices = {}
words_dictionary = {}
for word, count in freq_count:
    word_indices[word] = len(word_indices)
    words_dictionary[len(words_dictionary)] = word

# Every word in vocab already has an index, so a direct lookup is enough.
data = [word_indices[word] for word in vocab]
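As a possible next step, the skipgrams and make_sampling_table helpers already imported above can turn the index list into (target, context) training pairs; the window size here is an assumption:

sampling_table = make_sampling_table(unique_words_count)
couples, labels = skipgrams(data, unique_words_count,
                            window_size=4, sampling_table=sampling_table)
word_target, word_context = zip(*couples)
print(len(couples), labels[:5])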
#!/usr/bin/env python3

import nltk

nltk.download('abc')
nltk.download('smultron')

# from nltk import smultron
from nltk.corpus import abc

print(abc)
print(dir(abc))
print(abc.words())

# `sv` is never defined (the smultron import above is commented out), so these
# calls would raise a NameError as written; they are kept here for reference.
# print(sv)
# print(dir(sv))
# print(sv.words())
# sv.demo()

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

sentence = 'Ibland hoppar Jonas upp ur sängen som en gasell, redo att tackla världen med hela sin makt.'
tokens = nltk.word_tokenize(sentence)
print(tokens)
tagged = nltk.pos_tag(tokens)
print(tagged)
    plt.figure(figsize=(16, 9))
    x = embeddings[:,0]
    y = embeddings[:,1]
    plt.scatter(x, y, c="red", alpha=a, label=label)
    for i, word in enumerate(words):
        plt.annotate(word, alpha=0.3, xy=(x[i], y[i]), xytext=(5, 2), 
                     textcoords='offset points', ha='right', va='bottom', size=10)
    plt.legend(loc=4)
    plt.grid(True)
    # Save after adding the legend and grid so they appear in the written file.
    plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.show()




if __name__ == '__main__':
    corpus = list(abc.words())  # the abc corpus of nltk
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)
    tokenizer.fit_on_texts(corpus)  # tokenizing the corpus
    tokenized_corpus = tokenizer.texts_to_sequences(corpus)
    tokens = []  # drop the empty tokens ("")
    for i in tokenized_corpus:
        if i != []:
            tokens.append(i[0])
    vocab = tokenizer.word_index  # vocabulary with all word indexes
    vocab_size = len(vocab)  # size of the vocabulary
    samples = training_samples(tokens, 2, vocab_size)  # create training samples
    word2vec(samples, 10, vocab, vocab_size, 10, 0.05)
    # 10 epochs with learning rate 0.05 and 10 neurons in the hidden layer
Example #16
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
# plt.plot( 'x', 'y', data=df, linestyle='', marker='o', markersize=.713)

    plt.savefig('plots/epoch' + str(epoch))


# Inputs
learning_rate = 0.1
embdg_size = 10
epochs = 50
wordlen = 10000

corpus = (abc.words()[:wordlen])

window = 4
# main
corpus = corpus_cleaning(corpus)
vocab = set(corpus)

vocab_size = len(vocab)
mapped = mapping(vocab)
word_to_id = mapped["wti"]
id_to_word = mapped["itw"]
x_train, y_train = create_traindata(vocab_size, corpus, window, word_to_id)
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
print(vocab_size)
print(x_train.shape)
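corpus_cleaning, mapping, and create_traindata are not shown in this excerpt; a plausible minimal version of mapping, matching the "wti"/"itw" keys used above (purely illustrative, not the original helper):

def mapping(vocab):
    # Hypothetical helper: word-to-id and id-to-word lookups over the vocabulary.
    wti = {w: i for i, w in enumerate(sorted(vocab))}
    itw = {i: w for w, i in wti.items()}
    return {"wti": wti, "itw": itw}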
Example #17
import os
import re
import sqlite3
from collections import defaultdict
from nltk.corpus import brown, treebank, words as words_list, abc, movie_reviews, genesis

conn = sqlite3.connect(os.path.join(os.path.dirname(os.path.realpath(__file__)), "wofkov_db.sqlite"))
c = conn.cursor()

with open('wofkov_db_schema.sql', 'r') as sql:
    commands = sql.read().split(';')
    for command in commands:
        c.execute(command)
    
print "Building clean words list..."
words = [w.lower() for w in brown.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]
words.extend([w.lower() for w in treebank.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in words_list.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

print "Building clean sentences list"
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in genesis.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time       : 2020/6/28 23:28
# @Author     : 代登辉
# @Email      : [email protected]
# @File       : abc.py
# @Software   : PyCharm
# @Description: Access the abc corpus

from nltk.corpus import abc

files = abc.fileids()
print(files)

wordsRural = abc.words(['rural.txt'])
print(wordsRural)

word20 = abc.words(['rural.txt'])[:20]
print(word20)

# abcGenres = abc.categories()
# print(abcGenres)

for w in abc.words(['science.txt']):
    print(w + ' ', end=' ')
    if w == '.':  # start a new line at the end of each sentence
        print()

# Example of comparison of reading difficulty score (ARI) for two NLTK corpora.

from nltk.corpus import abc


def avg(lst):
    lentotal = 0.0
    for word in lst:
        lentotal = lentotal + len(word)
    return lentotal / len(lst)


def ari(corpus_words, corpus_sents):
    avgchar = avg(corpus_words)
    avgsent = avg(corpus_sents)
    ari = 4.71 * avgchar + 0.5 * avgsent - 21.43
    return ari

print(ari(abc.words('rural.txt'), abc.sents('rural.txt')))
print(ari(abc.words('science.txt'), abc.sents('science.txt')))