import nltk
from nltk.corpus import abc


def Automated_Readability_Index40(section):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(section)
    sents = len(sent_tokenizer.tokenize(text))
    words = len(abc.words(section))
    # Join the words with single spaces and count characters (spaces included).
    text = " ".join(abc.words(section))
    letters = len(text)
    uw = letters / float(words)   # average characters per word
    us = words / float(sents)     # average words per sentence
    ari = (4.71 * uw) + (0.5 * us) - 21.43
    return ari
def calcARI(file):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(file)
    sents = sent_tokenizer.tokenize(text)
    avg_words = 0
    avg_letters = 0
    for sentence in sents:
        # Count words per sentence (len(sentence) alone would count characters).
        avg_words += len(sentence.split())
    avg_words = avg_words / len(sents)
    for word in abc.words(file):
        avg_letters += len(word)
    avg_letters = avg_letters / len(abc.words(file))
    return (4.71 * avg_letters) + (0.5 * avg_words) - 21.43
def ari(fileid):
    """Compute the Automated Readability Index for a single corpus file."""
    print(fileid)
    num_chars = len(abc.raw(fileid))
    num_words = len(abc.words(fileid))
    num_sents = len(abc.sents(fileid))
    avg_word_len = num_chars / num_words
    avg_sent_len = num_words / num_sents
    return avg_word_len * 4.71 + avg_sent_len * 0.5 - 21.43
def Automated_Readability_Index40(section):
    char_count = 0
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_text = abc.raw(section)
    sent = len(sent_tokenizer.tokenize(raw_text))
    words = len(abc.words(section))
    for ch in raw_text:
        if ch.isalpha():
            char_count = char_count + 1
    uw = char_count / float(words)
    us = words / float(sent)
    ARI = (4.71 * uw) + (0.5 * us) - 21.43
    return ARI
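# A minimal usage sketch for the ARI helpers above; nltk and the abc corpus
# are assumed to be imported as above, and the file ids come from
# abc.fileids(), i.e. 'rural.txt' and 'science.txt'.
nltk.download('abc')
nltk.download('punkt')
for fileid in abc.fileids():
    print(fileid, ari(fileid))  # ari() as defined above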
import os
import pickle
import string

from nltk.corpus import abc, stopwords


def pre_process():
    """Remove stop words and punctuation marks from the corpus."""
    if ('cleaned_corpus.pkl' not in os.listdir(os.curdir)
            or 'cleaned_sentences.pkl' not in os.listdir(os.curdir)):
        print('Pre-processing...')
        words = [w for w in abc.words()]
        sentences = [s for s in abc.sents()]
        stop_words = stopwords.words('english')
        punctuation = list(string.punctuation)
        for i in range(len(sentences)):
            print(i)
            # Iterate over a copy so removals do not skip the following token.
            for j in list(sentences[i]):
                prev = len(sentences[i])
                if set(j) - set(punctuation) == set() or j.lower() in stop_words:
                    print(j, 'removed')
                    if j in words:
                        words.remove(j)
                    sentences[i].remove(j)
                    assert prev == len(sentences[i]) + 1
        # Drop sentences that are now empty or only one token long.
        sentences = [s for s in sentences if len(s) > 1]
        pickle.dump(words, open('cleaned_corpus.pkl', 'wb'))
        pickle.dump(sentences, open('cleaned_sentences.pkl', 'wb'))
    else:
        print('Pre-processed data already present...')
        words = pickle.load(open('cleaned_corpus.pkl', 'rb'))
        sentences = pickle.load(open('cleaned_sentences.pkl', 'rb'))
    return words, sentences
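# Hypothetical usage of pre_process(): the cleaned word and sentence lists are
# cached as pickle files in the working directory, so a second call simply
# reloads them instead of re-filtering the corpus.
cleaned_words, cleaned_sentences = pre_process()
print(len(cleaned_words), len(cleaned_sentences))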
# Part b - Parse data using BeautifulSoup
# (html_doc and final_doc are built earlier in the assignment, not shown here.)
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())
print(soup.get_text())

# Question-5 - Tokenize text parsed from the above url using nltk.
# Find all phone numbers and email addresses from this text using regular expressions.
import re

# All the emails from the above text
email = re.findall(r'\S+@\S+', final_doc)
print(email)

# All the phone numbers (use {n} for repetition; (3) would match a literal "3")
phone = re.findall(r'\([0-9]{3}\)-[0-9]{3}-[0-9]{4}', final_doc)
print(phone)

# Question-6 - Use the Porter Stemmer to normalize some tokenized text, calling the stemmer on each word.
# Do the same thing with the Lancaster Stemmer and see if you observe any differences.
import nltk
from nltk.corpus import abc

text = abc.words()
porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()
for w in text:
    print(w)
    # Word after applying the Porter stemmer
    print(porter.stem(w))
    # Word after applying the Lancaster stemmer
    print(lancaster.stem(w))
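# A quick sanity check of the two patterns above on a made-up string (the
# '(123)-456-7890' layout is an assumption about how numbers appear in the
# scraped text, not something taken from the page itself).
sample = "Call (123)-456-7890 or write to [email protected]"
print(re.findall(r'\S+@\S+', sample))                         # ['[email protected]']
print(re.findall(r'\([0-9]{3}\)-[0-9]{3}-[0-9]{4}', sample))  # ['(123)-456-7890']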
# File Name   : corpus.py
# Description : This creates a collection of words with their frequency
# Author      : Ajay
# Date        : 2016-11-19
# ==================================================
import os, sys, pickle
from nltk.corpus import brown, movie_reviews, reuters, gutenberg, abc
from collections import Counter

w1 = gutenberg.words()
w2 = brown.words()
w3 = movie_reviews.words()
w4 = reuters.words()
w5 = abc.words()
ww = w1 + w2 + w3 + w4 + w5
WORDS = Counter(ww)
# print(len(Counter(w5)))
# print(len(WORDS))

os.chdir("/Users/chaser/Projects/Dictionary")
# Use a context manager: if opening the file fails, the file is not overwritten.
with open("corpus", 'wb') as corpora:
    pickle.dump(WORDS, corpora)
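# Hypothetical check of the pickled frequency table produced above: reload it
# and print the five most common tokens across the combined corpora.
with open("corpus", 'rb') as corpora:
    WORDS = pickle.load(corpora)
print(WORDS.most_common(5))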
import math
import random

import matplotlib.pyplot as plt
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from collections import Counter
from sklearn.manifold import TSNE
from torch.autograd import Variable

import nltk
from nltk.corpus import abc

random.seed(44)
punctuation = '!"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~,.'

# Getting the dataset
nltk.download('abc')
words = abc.words()
threshold = 0.000001
window_size = 3

# Supporting functions
def preprocess(words):
    words = [i.lower() for i in words]
    words = [i for i in words if i not in punctuation]
    return words


def get_preprocess_and_sub_sampled_data(words):
    corpus = preprocess(words)
    # (remainder of this function is not shown in this excerpt)
            # Inner step of the skip-gram training loop (the enclosing epoch
            # and training-pair loops are not shown in this excerpt).
            temp_x = Variable(torch.LongTensor([w2i[x]]))
            temp_y = Variable(torch.LongTensor([w2i[y]]))
            model.zero_grad()
            log_probs = model(temp_x, temp_y)
            compare = Variable(torch.Tensor([1]))
            loss = loss_fn(log_probs[0], compare)
            loss.backward()
            optimizer.step()
            total += loss.data.item()
        print(epoch, total)  # accumulated loss for this epoch
        plot_tsne_skip(skipgram_train[:1000], model, epoch)
    return model


nltk.download('abc')
text = list(abc.words())
vocab = set(text)
vocab_size = len(vocab)
embd_size = 50
lr = 0.1
epochs = 50
hidden_size = 100

# Build word <-> index mappings over the vocabulary.
w2i = {}
i2w = {}
pt = 0
for word in vocab:
    w2i[word] = pt
    i2w[pt] = word
    pt += 1

subset = text[:5000]
cbow_train = cbow_dataset(subset)
skipgram_train = skipgram_dataset(subset)
loss_fn = nn.NLLLoss()
# NgramModel and LidstoneProbDist come from the old NLTK 2.x API
# (nltk.model / nltk.probability); NgramModel was removed in NLTK 3.
from nltk.corpus import abc
from nltk.model import NgramModel  # NLTK 2.x only
from nltk.probability import LidstoneProbDist


def trainModel():
    totalwords = abc.words()  # + genesis.words() + gutenberg.words() + webtext.words()
    # Lidstone smoothing with gamma = 0.2, passed to both models.
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    BigramModel = NgramModel(2, totalwords, estimator=estimator)
    UnigramModel = NgramModel(1, totalwords, estimator=estimator)
    return (UnigramModel, BigramModel)
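# A minimal sketch of the same idea with the current nltk.lm API
# (train_lm_sketch and its parameters are illustrative assumptions,
# not part of the original trainModel above).
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline


def train_lm_sketch(order=2, num_sents=1000):
    # Build padded n-gram training data from (a slice of) the abc sentences.
    sents = [[w.lower() for w in s] for s in abc.sents()[:num_sents]]
    train_data, vocab = padded_everygram_pipeline(order, sents)
    lm = MLE(order)            # maximum-likelihood model; swap in
    lm.fit(train_data, vocab)  # nltk.lm.Lidstone(0.2, order) for smoothing
    return lm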
import nltk
nltk.download('abc')
from nltk.corpus import abc
import sys
from keras.models import Model
from keras.layers import Input, Dense, Reshape, dot
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams, make_sampling_table
from collections import Counter
import numpy as np
from keras.callbacks import Callback
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm

# Lowercase the corpus and keep only alphanumeric tokens.
vocab = [word.lower() for word in abc.words()]
vocab = [word for word in vocab if word.isalnum()]
freq_count = Counter(vocab).most_common()
unique_words_count = len(freq_count)

# word -> index and index -> word lookup tables, ordered by frequency.
word_indices = {}
words_dictionary = {}
for word in freq_count:
    word_indices[word[0]] = len(word_indices)
    words_dictionary[len(words_dictionary)] = word[0]

# Encode the corpus as a sequence of word indices.
data = []
for word in vocab:
    if word in words_dictionary.values():
        data.append(word_indices[word])
        # data.append(list(words_dictionary.keys())[list(words_dictionary.values()).index(word)])
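# A possible next step (an assumption, not shown in the original excerpt):
# generate (target, context) training pairs with the skipgrams helper that is
# already imported above. Note that Keras' skipgrams treats index 0 as padding.
sampling_table = make_sampling_table(unique_words_count)
pairs, labels = skipgrams(data, vocabulary_size=unique_words_count,
                          window_size=3, sampling_table=sampling_table)
print(len(pairs), len(labels))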
#!/usr/bin/env python3
import nltk
nltk.download('abc')
nltk.download('smultron')
# from nltk import smultron
from nltk.corpus import abc

print(abc)
print(dir(abc))
print(abc.words())

# The Swedish 'smultron' corpus handle (sv) is never defined in this script,
# so these calls would raise a NameError as written.
# print(sv)
# print(dir(sv))
# print(sv.words())
# sv.demo()

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

# "Sometimes Jonas jumps out of bed like a gazelle, ready to take on the
# world with all his might."
sentence = 'Ibland hoppar Jonas upp ur sängen som en gasell, redo att tackla världen med hela sin makt.'
tokens = nltk.word_tokenize(sentence)
print(tokens)
tagged = nltk.pos_tag(tokens)
print(tagged)
# Body of the t-SNE plotting helper (the function signature is not shown in
# this excerpt); embeddings, words, a, label and filename come from the caller.
plt.figure(figsize=(16, 9))
x = embeddings[:, 0]
y = embeddings[:, 1]
plt.scatter(x, y, c="red", alpha=a, label=label)
for i, word in enumerate(words):
    plt.annotate(word, alpha=0.3, xy=(x[i], y[i]), xytext=(5, 2),
                 textcoords='offset points', ha='right', va='bottom', size=10)
plt.legend(loc=4)
plt.grid(True)
# Save after adding the legend and grid so they appear in the PNG.
plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
plt.show()


if __name__ == '__main__':
    corpus = list(abc.words())  # the abc corpus of nltk
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)
    tokenizer.fit_on_texts(corpus)  # tokenizing the corpus
    tokenized_corpus = tokenizer.texts_to_sequences(corpus)

    # Drop the empty tokens ([]) produced for filtered-out strings.
    tokens = []
    for i in tokenized_corpus:
        if i != []:
            tokens.append(i[0])

    vocab = tokenizer.word_index  # vocabulary with all word indexes
    vocab_size = len(vocab)       # size of vocabulary
    samples = training_samples(tokens, 2, vocab_size)  # create training samples
    # 10 epochs, 10 hidden neurons, learning rate 0.05
    word2vec(samples, 10, vocab, vocab_size, 10, 0.05)
    # Tail of a plt.annotate(...) call; the opening of the call and the
    # enclosing plotting function are not shown in this excerpt.
                 xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
    # plt.plot('x', 'y', data=df, linestyle='', marker='o', markersize=.713)
    plt.savefig('plots/epoch' + str(epoch))


# Inputs
learning_rate = 0.1
embdg_size = 10
epochs = 50
wordlen = 10000
corpus = abc.words()[:wordlen]
window = 4

# main
corpus = corpus_cleaning(corpus)
vocab = set(corpus)
vocab_size = len(vocab)
mapped = mapping(vocab)
word_to_id = mapped["wti"]
id_to_word = mapped["itw"]
x_train, y_train = create_traindata(vocab_size, corpus, window, word_to_id)
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
print(vocab_size)
print(x_train.shape)
import os
import re
import sqlite3
from collections import defaultdict
from nltk.corpus import brown, treebank, words as words_list, abc, movie_reviews, genesis

conn = sqlite3.connect(os.path.join(os.path.dirname(os.path.realpath(__file__)), "wofkov_db.sqlite"))
c = conn.cursor()
with open('wofkov_db_schema.sql', 'r') as sql:
    commands = sql.read().split(';')
    for command in commands:
        c.execute(command)


def is_clean(w):
    """Keep alphabetic tokens (apostrophes and hyphens allowed), drop stray quote tokens."""
    return re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")


print("Building clean words list...")
words = [w.lower() for w in brown.words() if is_clean(w)]
words.extend([w.lower() for w in treebank.words() if is_clean(w)])
words.extend([w.lower() for w in words_list.words() if is_clean(w)])
words.extend([w.lower() for w in abc.words() if is_clean(w)])
words.extend([w.lower() for w in movie_reviews.words() if is_clean(w)])
words.extend([w.lower() for w in genesis.words() if is_clean(w)])

print("Building clean sentences list")
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s if is_clean(w)))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s if is_clean(w)))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s if is_clean(w)))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s if is_clean(w)))
for s in genesis.sents():
    sentences.append(' '.join(w.lower() for w in s if is_clean(w)))
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time       : 2020/6/28 23:28
# @Author     : 代登辉
# @Email      : [email protected]
# @File       : abc.py
# @Software   : PyCharm
# @Description: Access the ABC corpus

from nltk.corpus import abc

files = abc.fileids()
print(files)

wordsRural = abc.words(['rural.txt'])
print(wordsRural)
word20 = abc.words(['rural.txt'])[:20]
print(word20)

# abcGenres = abc.categories()
# print(abcGenres)

for w in abc.words(['science.txt']):
    # Print words on one line, starting a new line at each full stop;
    # compare strings with == rather than identity ('is').
    print(w + ' ', end=' ')
    if w == '.':
        print()
# Example comparing the reading difficulty score (ARI) of two files in the NLTK ABC corpus.
from nltk.corpus import abc


def avg(lst):
    """Average length of the items in lst (characters per word, or words per sentence)."""
    lentotal = 0.0
    for item in lst:
        lentotal = lentotal + len(item)
    return lentotal / len(lst)


def ari(corpus_words, corpus_sents):
    avgchar = avg(corpus_words)
    avgsent = avg(corpus_sents)
    ari = 4.71 * avgchar + 0.5 * avgsent - 21.43
    return ari


print(ari(abc.words('rural.txt'), abc.sents('rural.txt')))
print(ari(abc.words('science.txt'), abc.sents('science.txt')))