from nltk.corpus import abc


def collect_data(vocabulary_size=10000):
    # build a joint vocabulary from the two ABC genres
    v1 = abc.raw("rural.txt").split()
    v2 = abc.raw("science.txt").split()
    vocabulary = v1 + v2
    data, count, dictionary, reverse_dictionary = build_dataset(
        vocabulary, vocabulary_size)
    del vocabulary
    return data, count, dictionary, reverse_dictionary
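# The snippet above relies on a build_dataset() helper it does not define. A minimal
# sketch of a typical word2vec-tutorial-style implementation (an assumption, not the
# original author's code):
import collections

def build_dataset(words, vocabulary_size):
    # keep the (vocabulary_size - 1) most common words; everything else maps to 'UNK'
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = {word: i for i, (word, _) in enumerate(count)}
    data, unk_count = [], 0
    for word in words:
        index = dictionary.get(word, 0)  # 0 is the 'UNK' index
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary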
from nltk.corpus import abc


def ari(fileid):
    """Compute the Automated Readability Index for an ABC corpus file."""
    print(fileid)
    num_chars = len(abc.raw(fileid))
    num_words = len(abc.words(fileid))
    num_sents = len(abc.sents(fileid))
    avg_word_len = num_chars / num_words
    avg_sent_len = num_words / num_sents
    return avg_word_len * 4.71 + avg_sent_len * 0.5 - 21.43
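# Example usage (not part of the original snippet): score every ABC genre file.
for fileid in abc.fileids():
    print(ari(fileid))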
def Automated_Readability_Index40(section):
    sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(section)
    sents = len(sent_tokenize.tokenize(text))
    words = len(abc.words(section))
    text = " ".join(abc.words(section))
    letters = len(text)
    uw = letters / float(words)
    us = words / float(sents)
    ari = (4.71 * uw) + (0.5 * us) - 21.43
    return ari
def calcARI(file):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(file)
    sents = sent_tokenizer.tokenize(text)
    avg_words = 0
    avg_letters = 0
    for sentence in sents:
        # count words per sentence (len(sentence) alone would count characters)
        avg_words += len(sentence.split())
    avg_words = avg_words / len(sents)
    for word in abc.words(file):
        avg_letters += len(word)
    avg_letters = avg_letters / len(abc.words(file))
    return (4.71 * avg_letters) + (0.5 * avg_words) - 21.43
def Automated_Readability_Index40(section):
    char_count = 0
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_text = abc.raw(section)
    sent = len(sent_tokenizer.tokenize(raw_text))
    words = len(abc.words(section))
    for ch in raw_text:
        if ch.isalpha():
            char_count = char_count + 1
    uw = char_count / float(words)
    us = words / float(sent)
    ARI = (4.71 * uw) + (0.5 * us) - 21.43
    return ARI
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.stem import PorterStemmer
from nltk.corpus import abc, stopwords
from collections import Counter


def practice():
    stemmed_tokens = []
    train_tokens = word_tokenize(abc.raw("rural.txt").lower())
    bigrams = list(ngrams(train_tokens, 3))  # note: n=3 actually yields trigrams
    POS_tag = nltk.pos_tag(train_tokens)
    print(POS_tag)
    #custom_tokenizer = PunktSentenceTokenizer(train_tokens)
    #word_token = custom_tokenizer.tokenize(sample_tokens)
    ps = PorterStemmer()
    for token in train_tokens:
        stemmed_value = ps.stem(token)
        stemmed_tokens.append(stemmed_value)
    frequencies = Counter(stemmed_tokens)
    stop_words = stopwords.words('english')
    for word, count in frequencies.most_common(50):
        if word not in stop_words and len(word) > 2:
            #continue
            print(word, count)
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from nltk.corpus import abc


def ari(raw):
    # tokenize raw text and get words
    tokens = nltk.wordpunct_tokenize(raw)
    words = [word.lower() for word in tokens if word.isalpha()]
    # instantiate punctuation parameters
    punkt_params = PunktParameters()
    # specify abbreviations to be ignored in sentence separation
    punkt_params.abbrev_types = set(['dr', 'inc', 'mr', 'mrs', 'ms', 'prof', 'etc'])
    # separate into sentences using a PunktSentenceTokenizer
    sentences = PunktSentenceTokenizer(punkt_params).tokenize(raw)
    chars = 0
    for word in words:
        chars += len(word)
    return (4.71 * (chars / len(words))
            + 0.5 * (len(words) / len(sentences))
            - 21.43)


for fileid in abc.fileids():
    print('%*s %9f' % (max(len(f) for f in abc.fileids()),
                       fileid, ari(abc.raw(fileids=fileid))))
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import abc, stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import wordnet
from collections import Counter

cor = abc.raw("rural.txt").lower()
cor_abc = abc.raw("rural.txt").lower()

cor_word_tokens = word_tokenize(cor)
#print(cor_word_tokens)
cor_sent_tokens = sent_tokenize(cor)
#print(cor_sent_tokens)

# stop words
stp = stopwords.words("english")
#print(stp)
filtered_sentence = [i for i in cor_word_tokens if i not in stp and len(i) > 2]
# for i in cor_word_tokens:
#     if i not in stp:
#         filtered_sentence.append(i)
#print(filtered_sentence)

# stemming
def filteredstem(input):
    ps = PorterStemmer()
    for w in input:
        print(ps.stem(w))
from nltk.corpus import gutenberg, abc, reuters, brown, movie_reviews
from topia.termextract import extract

extractor = extract.TermExtractor()

with open('./corpus/all3.txt', 'r') as f:
    with open('./data/terms.txt', 'w') as o:
        o.write("Term\tOccurrences\tStrength\n")
        for term in extractor(f.read() + gutenberg.raw() + abc.raw()
                              + reuters.raw() + brown.raw() + movie_reviews.raw()):
            o.write("\t".join(map(str, term)) + "\n")
def get_corpus():
    science = abc.raw('science.txt')
    rural = abc.raw('rural.txt')
    concat = science + '\n' + rural
    return concat
import nltk
nltk.download('abc')
nltk.download('punkt')

"""#### The skip-gram model is used to build word embeddings."""

from nltk.corpus import abc
from nltk.tokenize import RegexpTokenizer
import torch
from tqdm import tqdm

'''
The size of the corpus is : 663964
The Vocabulary size is : 11557
'''

cut_indx = 70000
corp = abc.raw()
wds1 = corp.split()[:cut_indx]
print(len(wds1))

t = 1e-5  # subsampling threshold frequency

# count how often each word occurs
d = dict()
for i in wds1:
    d[i] = 0
for i in wds1:
    d[i] += 1

# keep only words that occur at least 5 times
wds = list()
for j in wds1:
    if (d[j] >= 5):
        wds.append(j)
bible = genesis.raw('english-kjv.txt')
blake = gutenberg.raw('blake-poems.txt')
bryant = gutenberg.raw('bryant-stories.txt')
burgess = gutenberg.raw('burgess-busterbrown.txt')
carroll = gutenberg.raw('carroll-alice.txt')
ch_ball = gutenberg.raw('chesterton-ball.txt')
ch_brown = gutenberg.raw('chesterton-brown.txt')
ch_thurs = gutenberg.raw('chesterton-thursday.txt')
edge = gutenberg.raw('edgeworth-parents.txt')
mel = gutenberg.raw('melville-moby_dick.txt')
mil = gutenberg.raw('milton-paradise.txt')
caesar = gutenberg.raw('shakespeare-caesar.txt')
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')
whit = gutenberg.raw('whitman-leaves.txt')
rural = abc.raw('rural.txt')
science = abc.raw('science.txt')
plots = subjectivity.raw('plot.tok.gt9.5000')
quotes = subjectivity.raw('quote.tok.gt9.5000')

austen = sense + emma + persuasion
shakespeare = caesar + hamlet + macbeth
facts = rural + science
opinions = plots + quotes
gute = bryant + burgess + carroll + edge + mel + mil + whit
chester = ch_ball + ch_brown + ch_thurs
total = austen + shakespeare + facts + opinions + gute + chester + b

spaces = {}
wordlist = []
with open('words.json', 'r') as f:
import re
from collections import Counter
import random, math
import itertools
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd
from nltk.corpus import abc

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# creating corpus
corpus = []
for text_id in abc.fileids():
    text = abc.raw(text_id)
    text = text.lower()
    text = text.replace('\n', ' ')
    text = re.sub('[^a-zA-Z1-9]+', ' ', text)
    text = re.sub(' +', ' ', text)
    corpus.append([w for w in text.split() if w != ''])
n_docs = len(corpus)

# subsample frequent words
filtered_corpus = []
word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
total_words = np.sum(list(word_counts.values()))
freq = {word: word_counts[word] / float(total_words) for word in word_counts}
threshold = 1e-5
for doc in corpus:
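    # A minimal sketch of the loop body (an assumption, not the original author's
    # continuation): drop frequent words with probability 1 - sqrt(threshold / freq[w]),
    # i.e. keep each occurrence with probability sqrt(threshold / freq[w]).
    kept = [w for w in doc if random.random() < math.sqrt(threshold / freq[w])]
    filtered_corpus.append(kept)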
# reference: https://towardsdatascience.com/google-news-and-leo-tolstoy-visualizing-word2vec-word-embeddings-with-t-sne-11558d8bd4d
import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from nltk.corpus import abc


def tsne_plot(label, embedding):
    print('Plotting...')
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, 1))
    x = embedding[:, 0]
    y = embedding[:, 1]
    plt.scatter(x, y, c=colors, alpha=0.2, label=label)
    plt.legend(loc=4)  # call legend after the labelled scatter so the handle exists
    plt.savefig(label + '.png')
    # plt.show()


t = 1e-5
x1 = abc.raw()
x1 = re.findall(r"[\w']+", x1)

vocab_to_int = dict()
int_to_vocab = dict()
x2 = set(x1)
x2 = list(x2)
for i in range(len(x2)):
    vocab_to_int[x2[i]] = i
    int_to_vocab[i] = x2[i]
# vocab_to_int, int_to_vocab = utils.create_lookup_tables(x1)

int_words = [vocab_to_int[word] for word in x1]
y = dict()
from nltk.corpus import abc
import string
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from datetime import datetime
import pickle as pkl
from sklearn.manifold import TSNE
# %matplotlib inline
import matplotlib.pyplot as plt

torch.manual_seed(1)

CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right

text = abc.raw().lower().split()

# strip punctuation characters from every token and drop empty results
text2 = []
for i in text:
    word = ''
    for j in i:
        if j not in string.punctuation:
            word += j
    if word != '':
        text2.append(word)
# text = [''.join(c for c in s if c not in string.punctuation) for s in text]
# text = [s for s in text if s]
text = text2

vocab = set(text)
vocab_size = len(vocab)
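# A minimal sketch (an assumption, not the original continuation) of turning the
# CONTEXT_SIZE window above into CBOW (context, target) training pairs; word_to_ix
# and data are illustrative names, not from the original code.
word_to_ix = {w: i for i, w in enumerate(vocab)}
data = []
for i in range(CONTEXT_SIZE, len(text) - CONTEXT_SIZE):
    context = text[i - CONTEXT_SIZE:i] + text[i + 1:i + CONTEXT_SIZE + 1]
    target = text[i]
    data.append(([word_to_ix[w] for w in context], word_to_ix[target]))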
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 20 21:17:25 2018

@author: vpapg
"""

# Obtain raw texts from two or more genres and compute their respective reading
# difficulty scores as in the earlier exercise on reading difficulty. E.g. compare
# ABC Rural News and ABC Science News (nltk.corpus.abc). Use Punkt to perform
# sentence segmentation.

from nltk.corpus import abc
from nltk import word_tokenize, sent_tokenize

abc_rural = abc.raw("rural.txt")
abc_science = abc.raw("science.txt")


def ARI(raw):
    words = word_tokenize(raw)
    sents = sent_tokenize(raw)  # I used a different method for sentence segmentation
    mw = sum(len(w) for w in words) / len(words)
    # ARI uses average words per sentence, not average characters per sentence
    ms = len(words) / len(sents)
    return 4.71 * mw + 0.5 * ms - 21.43


print(ARI(abc_rural))
print(ARI(abc_science))