from nltk.corpus import abc


def collect_data(vocabulary_size=10000):
    # build a joint vocabulary from the two ABC genres
    v1 = abc.raw("rural.txt").split()
    v2 = abc.raw("science.txt").split()
    vocabulary = v1 + v2
    data, count, dictionary, reverse_dictionary = build_dataset(
        vocabulary, vocabulary_size)
    del vocabulary
    return data, count, dictionary, reverse_dictionary
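# The snippet above relies on a build_dataset() helper it does not define. A minimal
# sketch of a typical word2vec-tutorial-style implementation (an assumption, not the
# original author's code):
import collections

def build_dataset(words, vocabulary_size):
    # keep the (vocabulary_size - 1) most common words; everything else maps to 'UNK'
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(vocabulary_size - 1))
    dictionary = {word: i for i, (word, _) in enumerate(count)}
    data, unk_count = [], 0
    for word in words:
        index = dictionary.get(word, 0)  # 0 is the 'UNK' index
        if index == 0:
            unk_count += 1
        data.append(index)
    count[0][1] = unk_count
    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))
    return data, count, dictionary, reverse_dictionary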
from nltk.corpus import abc


def ari(fileid):
    """Compute the Automated Readability Index for an ABC corpus file."""
    print(fileid)
    num_chars = len(abc.raw(fileid))
    num_words = len(abc.words(fileid))
    num_sents = len(abc.sents(fileid))
    avg_word_len = num_chars / num_words
    avg_sent_len = num_words / num_sents
    return avg_word_len * 4.71 + avg_sent_len * 0.5 - 21.43
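# Example usage (not part of the original snippet): score every ABC genre file.
for fileid in abc.fileids():
    print(ari(fileid))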
def Automated_Readability_Index40(section):
    sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(section)
    sents = len(sent_tokenize.tokenize(text))
    words = len(abc.words(section))
    text = " ".join(abc.words(section))
    letters = len(text)
    uw = letters / float(words)
    us = words / float(sents)
    ari = (4.71 * uw) + (0.5 * us) - 21.43
    return ari
def calcARI(file):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(file)
    sents = sent_tokenizer.tokenize(text)
    avg_words = 0
    avg_letters = 0
    for sentence in sents:
        # count words per sentence (len(sentence) alone would count characters)
        avg_words += len(sentence.split())
    avg_words = avg_words / len(sents)
    for word in abc.words(file):
        avg_letters += len(word)
    avg_letters = avg_letters / len(abc.words(file))
    return (4.71 * avg_letters) + (0.5 * avg_words) - 21.43
def Automated_Readability_Index40(section):
    char_count = 0
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_text = abc.raw(section)
    sent = len(sent_tokenizer.tokenize(raw_text))
    words = len(abc.words(section))
    for ch in raw_text:
        if ch.isalpha():
            char_count = char_count + 1
    uw = char_count / float(words)
    us = words / float(sent)
    ARI = (4.71 * uw) + (0.5 * us) - 21.43
    return ARI
import nltk
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.stem import PorterStemmer
from nltk.corpus import abc, stopwords
from collections import Counter


def practice():
    stemmed_tokens = []
    train_tokens = word_tokenize(abc.raw("rural.txt").lower())
    bigrams = list(ngrams(train_tokens, 3))  # note: n=3 actually yields trigrams
    POS_tag = nltk.pos_tag(train_tokens)
    print(POS_tag)
    #custom_tokenizer = PunktSentenceTokenizer(train_tokens)
    #word_token = custom_tokenizer.tokenize(sample_tokens)
    ps = PorterStemmer()
    for token in train_tokens:
        stemmed_value = ps.stem(token)
        stemmed_tokens.append(stemmed_value)
    frequencies = Counter(stemmed_tokens)
    stop_words = stopwords.words('english')
    for word, count in frequencies.most_common(50):
        if word not in stop_words and len(word) > 2:
            #continue
            print(word, count)
import nltk
from nltk.tokenize.punkt import PunktSentenceTokenizer, PunktParameters
from nltk.corpus import abc


def ari(raw):
    # tokenize raw text and get words
    tokens = nltk.wordpunct_tokenize(raw)
    words = [word.lower() for word in tokens if word.isalpha()]
    # instantiate punctuation parameters
    punkt_params = PunktParameters()
    # specify abbreviations to be ignored in sentence separation
    punkt_params.abbrev_types = set(['dr', 'inc', 'mr', 'mrs', 'ms', 'prof', 'etc'])
    # separate into sentences using a PunktSentenceTokenizer
    sentences = PunktSentenceTokenizer(punkt_params).tokenize(raw)
    chars = 0
    for word in words:
        chars += len(word)
    return (4.71 * (chars / len(words))
            + 0.5 * (len(words) / len(sentences))
            - 21.43)


for fileid in abc.fileids():
    print('%*s %9f' % (max(len(f) for f in abc.fileids()),
                       fileid, ari(abc.raw(fileids=fileid))))
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import abc, stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import PunktSentenceTokenizer
from nltk.corpus import wordnet
from collections import Counter

cor = abc.raw("rural.txt").lower()
cor_abc = abc.raw("rural.txt").lower()

cor_word_tokens = word_tokenize(cor)
#print(cor_word_tokens)
cor_sent_tokens = sent_tokenize(cor)
#print(cor_sent_tokens)

# stop words
stp = stopwords.words("english")
#print(stp)
filtered_sentence = [i for i in cor_word_tokens if i not in stp and len(i) > 2]
# for i in cor_word_tokens:
#     if i not in stp:
#         filtered_sentence.append(i)
#print(filtered_sentence)

# stemming
def filteredstem(input):
    ps = PorterStemmer()
    for w in input:
        print(ps.stem(w))
from nltk.corpus import gutenberg, abc, reuters, brown, movie_reviews
from topia.termextract import extract

extractor = extract.TermExtractor()

with open('./corpus/all3.txt', 'r') as f:
    with open('./data/terms.txt', 'w') as o:
        o.write("Term\tOccurrences\tStrength\n")
        for term in extractor(f.read() + gutenberg.raw() + abc.raw()
                              + reuters.raw() + brown.raw() + movie_reviews.raw()):
            o.write("\t".join(map(str, term)) + "\n")
def get_corpus():
    science = abc.raw('science.txt')
    rural = abc.raw('rural.txt')
    concat = science + '\n' + rural
    return concat
import nltk
nltk.download('abc')
nltk.download('punkt')

"""#### The skip-gram model is used to build word embeddings."""

from nltk.corpus import abc
from nltk.tokenize import RegexpTokenizer
import torch
from tqdm import tqdm

'''
The size of the corpus is : 663964
The Vocabulary size is : 11557
'''

cut_indx = 70000
corp = abc.raw()
wds1 = corp.split()[:cut_indx]
print(len(wds1))

t = 1e-5  # subsampling threshold frequency

# count how often each word occurs
d = dict()
for i in wds1:
    d[i] = 0
for i in wds1:
    d[i] += 1

# keep only words that occur at least 5 times
wds = list()
for j in wds1:
    if (d[j] >= 5):
        wds.append(j)
bible = genesis.raw('english-kjv.txt')
blake = gutenberg.raw('blake-poems.txt')
bryant = gutenberg.raw('bryant-stories.txt')
burgess = gutenberg.raw('burgess-busterbrown.txt')
carroll = gutenberg.raw('carroll-alice.txt')
ch_ball = gutenberg.raw('chesterton-ball.txt')
ch_brown = gutenberg.raw('chesterton-brown.txt')
ch_thurs = gutenberg.raw('chesterton-thursday.txt')
edge = gutenberg.raw('edgeworth-parents.txt')
mel = gutenberg.raw('melville-moby_dick.txt')
mil = gutenberg.raw('milton-paradise.txt')
caesar = gutenberg.raw('shakespeare-caesar.txt')
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
macbeth = gutenberg.raw('shakespeare-macbeth.txt')
whit = gutenberg.raw('whitman-leaves.txt')
rural = abc.raw('rural.txt')
science = abc.raw('science.txt')
plots = subjectivity.raw('plot.tok.gt9.5000')
quotes = subjectivity.raw('quote.tok.gt9.5000')

austen = sense + emma + persuasion
shakespeare = caesar + hamlet + macbeth
facts = rural + science
opinions = plots + quotes
gute = bryant + burgess + carroll + edge + mel + mil + whit
chester = ch_ball + ch_brown + ch_thurs
total = austen + shakespeare + facts + opinions + gute + chester + b

spaces = {}
wordlist = []
with open('words.json', 'r') as f:
import re
from collections import Counter
import random, math
import itertools
import numpy as np
import torch
from torch import nn, optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import pandas as pd
from nltk.corpus import abc

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# creating corpus
corpus = []
for text_id in abc.fileids():
    text = abc.raw(text_id)
    text = text.lower()
    text = text.replace('\n', ' ')
    text = re.sub('[^a-zA-Z1-9]+', ' ', text)
    text = re.sub(' +', ' ', text)
    corpus.append([w for w in text.split() if w != ''])
n_docs = len(corpus)

# subsample frequent words
filtered_corpus = []
word_counts = dict(Counter(list(itertools.chain.from_iterable(corpus))))
total_words = np.sum(list(word_counts.values()))
freq = {word: word_counts[word] / float(total_words) for word in word_counts}
threshold = 1e-5
for doc in corpus:
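    # A minimal sketch of the loop body (an assumption, not the original author's
    # continuation): drop frequent words with probability 1 - sqrt(threshold / freq[w]),
    # i.e. keep each occurrence with probability sqrt(threshold / freq[w]).
    kept = [w for w in doc if random.random() < math.sqrt(threshold / freq[w])]
    filtered_corpus.append(kept)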
# reference: https://towardsdatascience.com/google-news-and-leo-tolstoy-visualizing-word2vec-word-embeddings-with-t-sne-11558d8bd4d
import re
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from nltk.corpus import abc


def tsne_plot(label, embedding):
    print('Plotting...')
    plt.figure(figsize=(16, 9))
    colors = cm.rainbow(np.linspace(0, 1, 1))
    x = embedding[:, 0]
    y = embedding[:, 1]
    plt.scatter(x, y, c=colors, alpha=0.2, label=label)
    plt.legend(loc=4)  # call legend after the labelled scatter so the handle exists
    plt.savefig(label + '.png')
    # plt.show()


t = 1e-5
x1 = abc.raw()
x1 = re.findall(r"[\w']+", x1)

vocab_to_int = dict()
int_to_vocab = dict()
x2 = set(x1)
x2 = list(x2)
for i in range(len(x2)):
    vocab_to_int[x2[i]] = i
    int_to_vocab[i] = x2[i]
# vocab_to_int, int_to_vocab = utils.create_lookup_tables(x1)

int_words = [vocab_to_int[word] for word in x1]
y = dict()
from nltk.corpus import abc
import string
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from datetime import datetime
import pickle as pkl
from sklearn.manifold import TSNE
# %matplotlib inline
import matplotlib.pyplot as plt

torch.manual_seed(1)

CONTEXT_SIZE = 2  # 2 words to the left, 2 to the right

text = abc.raw().lower().split()

# strip punctuation characters from every token and drop empty results
text2 = []
for i in text:
    word = ''
    for j in i:
        if j not in string.punctuation:
            word += j
    if word != '':
        text2.append(word)
# text = [''.join(c for c in s if c not in string.punctuation) for s in text]
# text = [s for s in text if s]
text = text2

vocab = set(text)
vocab_size = len(vocab)
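# A minimal sketch (an assumption, not the original continuation) of turning the
# CONTEXT_SIZE window above into CBOW (context, target) training pairs; word_to_ix
# and data are illustrative names, not from the original code.
word_to_ix = {w: i for i, w in enumerate(vocab)}
data = []
for i in range(CONTEXT_SIZE, len(text) - CONTEXT_SIZE):
    context = text[i - CONTEXT_SIZE:i] + text[i + 1:i + CONTEXT_SIZE + 1]
    target = text[i]
    data.append(([word_to_ix[w] for w in context], word_to_ix[target]))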
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Sep 20 21:17:25 2018

@author: vpapg
"""

# Obtain raw texts from two or more genres and compute their respective reading
# difficulty scores as in the earlier exercise on reading difficulty. E.g. compare
# ABC Rural News and ABC Science News (nltk.corpus.abc). Use Punkt to perform
# sentence segmentation.

from nltk.corpus import abc
from nltk import word_tokenize, sent_tokenize

abc_rural = abc.raw("rural.txt")
abc_science = abc.raw("science.txt")


def ARI(raw):
    words = word_tokenize(raw)
    sents = sent_tokenize(raw)  # I used a different method for sentence segmentation
    mw = sum(len(w) for w in words) / len(words)
    # ARI uses average words per sentence, not average characters per sentence
    ms = len(words) / len(sents)
    return 4.71 * mw + 0.5 * ms - 21.43


print(ARI(abc_rural))
print(ARI(abc_science))