Example #1
import nltk
from nltk.corpus import abc

def Automated_Readability_Index40(section):
    sent_tokenize = nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(section)
    sents = len(sent_tokenize.tokenize(text))
    words = len(abc.words(section))
    text = " ".join(abc.words(section))
    letters = len(text)  # note: the joining spaces are counted as characters
    uw = letters / float(words)   # average word length
    us = words / float(sents)     # average sentence length
    ari = (4.71 * uw) + (0.5 * us) - 21.43
    return ari
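# All of the ARI examples on this page compute the same formula:
# ARI = 4.71 * (characters / words) + 0.5 * (words / sentences) - 21.43.
# A minimal usage sketch, assuming nltk.download('abc') and nltk.download('punkt')
# have been run; 'rural.txt' is one of the abc corpus fileids:
print(Automated_Readability_Index40('rural.txt'))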
Example #3
import nltk
from nltk.corpus import abc

def calcARI(file):
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    text = abc.raw(file)
    sents = sent_tokenizer.tokenize(text)
    avg_words = 0
    avg_letters = 0
    for sentence in sents:
        avg_words += len(sentence.split())  # words per sentence (len(sentence) would count characters)
    avg_words = avg_words / len(sents)
    for word in abc.words(file):
        avg_letters += len(word)
    avg_letters = avg_letters / len(abc.words(file))
    return (4.71 * avg_letters) + (0.5 * avg_words) - 21.43
Example #4
import nltk
from nltk.corpus import abc

def ari(fileid):
    """Compute the Automated Readability Index for an abc corpus fileid."""
    print(fileid)
    num_chars = len(abc.raw(fileid))
    num_words = len(abc.words(fileid))
    num_sents = len(abc.sents(fileid))

    avg_word_len = num_chars / num_words
    avg_sent_len = num_words / num_sents

    return avg_word_len * 4.71 + avg_sent_len * 0.5 - 21.43
def Automated_Readability_Index40(section):
    char_count = 0
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    raw_text = abc.raw(section)
    sent = len(sent_tokenizer.tokenize(raw_text))
    words = len(abc.words(section))

    for ch in raw_text:
        if ch.isalpha():
            char_count = char_count + 1

    uw = char_count / float(words)
    us = words / float(sent)
    ARI = (4.71 * uw) + (0.5 * us) - 21.43
    return ARI
Example #6
import os
import pickle
import string

from nltk.corpus import abc, stopwords


def pre_process():
    """Remove stop words and punctuation marks from the abc corpus."""
    if 'cleaned_corpus.pkl' not in os.listdir(
            os.curdir) or 'cleaned_sentences.pkl' not in os.listdir(os.curdir):
        print('Pre-processing...')
        words = list(abc.words())
        sentences = [list(s) for s in abc.sents()]

        stop_words = stopwords.words('english')
        punctuation = list(string.punctuation)
        for i in range(len(sentences)):
            print(i)
            # iterate over a copy so that removing tokens does not skip elements
            for j in list(sentences[i]):
                prev = len(sentences[i])
                if set(j) - set(punctuation) == set() or j.lower() in stop_words:
                    print(j)
                    print('removed')
                    if j in words:
                        words.remove(j)  # removes only the first occurrence
                    sentences[i].remove(j)
                    assert prev == len(sentences[i]) + 1

        # again iterate over a copy while dropping near-empty sentences
        for s in list(sentences):
            if len(s) <= 1:
                print(s)
                sentences.remove(s)

        pickle.dump(words, open('cleaned_corpus.pkl', 'wb'))
        pickle.dump(sentences, open('cleaned_sentences.pkl', 'wb'))

    else:
        print('Pre-processed data already present.')
        words = pickle.load(open('cleaned_corpus.pkl', 'rb'))
        sentences = pickle.load(open('cleaned_sentences.pkl', 'rb'))

    return words, sentences
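# A minimal usage sketch, assuming nltk.download('abc') and nltk.download('stopwords')
# have already been run:
cleaned_words, cleaned_sentences = pre_process()
print(len(cleaned_words), len(cleaned_sentences))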
# Part b - Parse data using BeautifulSoup
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'html.parser')  # html_doc comes from the earlier part of the exercise
print(soup.prettify())
print(soup.get_text())

# Question-5 - Tokenize text parsed from the above url using nltk.
# Find all phone numbers and email addresses from this text using regular expressions.
import re
# All the emails from the above text
email = re.findall(r'\S+@\S+', final_doc)
print(email)
# All the phone numbers, assuming the (ddd)-ddd-dddd format
phone = re.findall(r'\([0-9]{3}\)-[0-9]{3}-[0-9]{4}', final_doc)
print(phone)
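# A quick sanity check of the two patterns on a hypothetical sample string
# (final_doc itself comes from the page parsed above):
sample = "Contact john@example.com or call (123)-456-7890."
print(re.findall(r'\S+@\S+', sample))                         # ['john@example.com']
print(re.findall(r'\([0-9]{3}\)-[0-9]{3}-[0-9]{4}', sample))  # ['(123)-456-7890']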

# Question-6 - Use the Porter Stemmer to normalize some tokenized text, calling the stemmer on each word.
# Do the same thing with the Lancaster Stemmer and see if you observe any differences
import nltk
from nltk.corpus import abc
text = abc.words()

porter = nltk.PorterStemmer()
lancaster = nltk.LancasterStemmer()

for w in text:
    print(w)
    # Word after implementing Porter Stemmer
    print(porter.stem(w))
    # Word after implementing Lancaster Stemmer
    print(lancaster.stem(w))
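# To see where the two stemmers disagree without printing the whole corpus,
# a short sketch over a handful of illustrative words (not taken from the corpus):
for sample_word in ['maximum', 'running', 'presumably', 'crying', 'owed']:
    p, l = porter.stem(sample_word), lancaster.stem(sample_word)
    if p != l:
        print(sample_word, '-> Porter:', p, '| Lancaster:', l)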
Example #8
# File Name			: corpus.py
# Description		: This creates a collection of words with their frequency
# Author			: Ajay
# Date				: 2016-11-19
#==================================================

import os, sys, pickle
from nltk.corpus import brown, movie_reviews, reuters, gutenberg, abc
from collections import Counter

w1 = gutenberg.words()
w2 = brown.words()
w3 = movie_reviews.words()
w4 = reuters.words()
w5 = abc.words()
ww = w1 + w2 + w3 + w4 + w5


WORDS = Counter(ww)

# print(len(Counter(w5)))
# print(len(WORDS))
os.chdir("/Users/chaser/Projects/Dictionary")
with open("corpus", 'wb') as corpora:  # the context manager guarantees the file is closed even if pickling fails
    pickle.dump(WORDS, corpora)
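# A short sketch of reading the pickled Counter back in later
# (the file name matches the one written above):
import pickle

with open("corpus", 'rb') as corpora:
    WORDS = pickle.load(corpora)
print(WORDS.most_common(10))  # the ten most frequent tokens across the five corpora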
Example #9
from collections import Counter
import math
import random

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from nltk.corpus import abc

random.seed(44)
punctuation = '!"#$%&\'()*+-/:;<=>?@[\\]^_`{|}~,.'

"""Getting the dataset"""

words = abc.words()

threshold = 0.000001  # sub-sampling threshold
window_size = 3

"""Supporting functions"""


def preprocess(words):
    words = [i.lower() for i in words]
    words = [i for i in words if i not in punctuation]

    return words


def get_preprocess_and_sub_sampled_data(words):
    corpus = preprocess(words)
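    # NOTE: the example is truncated here. A hedged sketch of how the
    # sub-sampling step might continue, using the word2vec keep probability
    # sqrt(threshold / frequency); the names below are assumptions.
    counts = Counter(corpus)
    total = len(corpus)
    keep_prob = {w: math.sqrt(threshold / (c / total)) for w, c in counts.items()}
    sampled_corpus = [w for w in corpus if random.random() < keep_prob[w]]
    return sampled_corpus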
Example #10
            temp_x = Variable(torch.LongTensor([w2i[x]]))
            temp_y = Variable(torch.LongTensor([w2i[y]]))
            model.zero_grad()
            log_probs = model(temp_x, temp_y)
            compare = Variable(torch.Tensor([1]))
            loss = loss_fn(log_probs[0], compare)
            loss.backward()
            optimizer.step()
            total += loss.data.item()
        print(epoch, total)  # 'total' is the loss accumulated over this epoch
        plot_tsne_skip(skipgram_train[:1000], model, epoch)
    return model


import nltk
from nltk.corpus import abc

nltk.download('abc')
text = list(abc.words())
vocab = set(text)
vocab_size = len(vocab)
embd_size = 50
lr = 0.1
epochs = 50
hidden_size = 100

# build word <-> index lookup tables
w2i, i2w = {}, {}
pt = 0
for word in vocab:
    w2i[word] = pt
    i2w[pt] = word
    pt += 1
subset = text[:5000]
cbow_train = cbow_dataset(subset)
skipgram_train = skipgram_dataset(subset)
loss_fn = nn.NLLLoss()
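# skipgram_dataset (and cbow_dataset) are defined elsewhere in the project.
# A minimal sketch of what a skip-gram pair builder might look like, assuming
# (center, context) word pairs within a fixed window; the helper below is an
# assumption, not the original code:
def skipgram_pairs(tokens, window=3):
    pairs = []
    for i, center in enumerate(tokens):
        for j in range(max(0, i - window), min(len(tokens), i + window + 1)):
            if j != i:
                pairs.append((center, tokens[j]))
    return pairs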
Example #11
File: test.py Project: ryandiaz/pa6
from nltk.corpus import abc
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel  # NLTK 2.x API; NgramModel was removed in NLTK 3

def trainModel():
    totalwords = abc.words()  # + genesis.words() + gutenberg.words() + webtext.words()
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # pass the Lidstone estimator so it is actually used by the models
    BigramModel = NgramModel(2, totalwords, estimator=estimator)
    UnigramModel = NgramModel(1, totalwords, estimator=estimator)
    return (UnigramModel, BigramModel)
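# NgramModel no longer exists in NLTK 3; a hedged sketch of a rough equivalent
# with the modern nltk.lm API (the gamma of 0.2 mirrors the Lidstone estimator above):
from nltk.lm import Lidstone
from nltk.lm.preprocessing import padded_everygram_pipeline
from nltk.corpus import abc

train_data, vocab = padded_everygram_pipeline(2, abc.sents())
bigram_model = Lidstone(0.2, 2)
bigram_model.fit(train_data, vocab)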
import nltk
nltk.download('abc')
from nltk.corpus import abc
import sys
from keras.models import Model
from keras.layers import Input, Dense, Reshape, dot
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams, make_sampling_table
from collections import Counter
import numpy as np
from keras.callbacks import Callback
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import matplotlib.cm as cm

vocab = [word.lower() for word in abc.words()]
vocab = [word for word in vocab if word.isalnum()]
freq_count = Counter(vocab).most_common()
unique_words_count = len(freq_count)

word_indices = {}
words_dictionary = {}
for word in freq_count:
    word_indices[word[0]] = len(word_indices)
    words_dictionary[len(words_dictionary)] = word[0]

data = []
for word in vocab:
    if word in words_dictionary.values():
        data.append(word_indices[word])
        # data.append(list(words_dictionary.keys())[list(words_dictionary.values()).index(word)])
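# The example stops after building the index sequence; a hedged sketch of the
# next step with the skipgrams helper imported above (window size 4 and the
# default negative-sampling rate are assumptions):
sampling_table = make_sampling_table(unique_words_count)
pairs, labels = skipgrams(data, vocabulary_size=unique_words_count,
                          window_size=4, sampling_table=sampling_table)
print(len(pairs), len(labels))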
#!/usr/bin/env python3

import nltk

nltk.download('abc')
nltk.download('smultron')  # 'smultron' is not in the standard NLTK data index, so this download is expected to fail

# from nltk import smultron
from nltk.corpus import abc

print(abc)
print(dir(abc))
print(abc.words())

# `sv` (the Smultron corpus) is never imported above, so these calls are disabled:
# print(sv)
# print(dir(sv))
# print(sv.words())
# sv.demo()

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')

sentence = 'Ibland hoppar Jonas upp ur sängen som en gasell, redo att tackla världen med hela sin makt.'
tokens = nltk.word_tokenize(sentence)
print(tokens)
tagged = nltk.pos_tag(tokens)
print(tagged)
Example #15
    plt.figure(figsize=(16, 9))
    x = embeddings[:,0]
    y = embeddings[:,1]
    plt.scatter(x, y, c="red", alpha=a, label=label)
    for i, word in enumerate(words):
        plt.annotate(word, alpha=0.3, xy=(x[i], y[i]), xytext=(5, 2), 
                     textcoords='offset points', ha='right', va='bottom', size=10)
    plt.savefig(filename, format='png', dpi=150, bbox_inches='tight')
    plt.legend(loc=4)
    plt.grid(True)
    plt.show()




if __name__ == '__main__':
    corpus = list(abc.words())  # the abc corpus of nltk
    tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', lower=True)
    tokenizer.fit_on_texts(corpus)  # tokenizing the corpus
    tokenized_corpus = tokenizer.texts_to_sequences(corpus)
    tokens = []  # removing the empty tokens ("")
    for i in tokenized_corpus:
        if i != []:
            tokens.append(i[0])
    vocab = tokenizer.word_index  # vocabulary with all word indexes
    vocab_size = len(vocab)  # size of vocabulary
    # avoid shadowing the training_samples() function with its own result
    samples = training_samples(tokens, 2, vocab_size)  # create training samples
    word2vec(samples, 10, vocab, vocab_size, 10, 0.05)
    # 10 epochs with learning rate=0.05 with 10 neurons in the NN
Example #16
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
# plt.plot( 'x', 'y', data=df, linestyle='', marker='o', markersize=.713)

    plt.savefig('plots/epoch' + str(epoch))


# Inputs
learning_rate = 0.1
embdg_size = 10
epochs = 50
wordlen = 10000

corpus = (abc.words()[:wordlen])

window = 4
# main
corpus = corpus_cleaning(corpus)
vocab = set(corpus)

vocab_size = len(vocab)
mapped = mapping(vocab)
word_to_id = mapped["wti"]
id_to_word = mapped["itw"]
x_train, y_train = create_traindata(vocab_size, corpus, window, word_to_id)
x_train = np.asarray(x_train)
y_train = np.asarray(y_train)
print(vocab_size)
print(x_train.shape)
Example #17
import os
import re
import sqlite3
from collections import defaultdict
from nltk.corpus import brown, treebank, words as words_list, abc, movie_reviews, genesis

conn = sqlite3.connect(os.path.join(os.path.dirname(os.path.realpath(__file__)), "wofkov_db.sqlite"))
c = conn.cursor()

with open('wofkov_db_schema.sql', 'r') as sql:
    commands = sql.read().split(';')
    for command in commands:
        c.execute(command)
    
print("Building clean words list...")
words = [w.lower() for w in brown.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]
words.extend([w.lower() for w in treebank.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in words_list.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in abc.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in movie_reviews.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])
words.extend([w.lower() for w in genesis.words() if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")])

print("Building clean sentences list")
sentences = []
for s in brown.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in treebank.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in abc.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in movie_reviews.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
for s in genesis.sents():
    sentences.append(' '.join(w.lower() for w in s if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")))
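# The same filter expression is repeated for every corpus above; a small
# refactoring sketch (the helper name is an assumption) that keeps the
# behaviour identical:
def clean_tokens(token_iter):
    return [w.lower() for w in token_iter
            if re.match("[a-zA-Z'-]+", w.strip()) and w.strip() not in ("''", "'")]

# e.g.: words = clean_tokens(brown.words()); words.extend(clean_tokens(abc.words()))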
#!/usr/bin/python3.6
# -*- coding: utf-8 -*-
# @Time       : 2020/6/28 23:28
# @Author     : 代登辉
# @Email      : [email protected]
# @File       : abc.py
# @Software   : PyCharm
# @Description: Access the abc corpus

from nltk.corpus import abc

files = abc.fileids()
print(files)
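# for the abc corpus this is typically ['rural.txt', 'science.txt']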

wordsRural = abc.words(['rural.txt'])
print(wordsRural)

word20 = abc.words(['rural.txt'])[:20]
print(word20)

# abcGenres = abc.categories()
# print(abcGenres)

for w in abc.words(['science.txt']):
    print(w + ' ', end=' ')
    if w == '.':  # 'is' checks identity; '==' compares the string value
        print()

Example #19
# Example of comparison of reading difficulty score (ARI) for two NLTK corpora.

from nltk.corpus import abc


def avg(lst):
    lentotal = 0.0
    for word in lst:
        lentotal = lentotal + len(word)
    return lentotal / len(lst)


def ari(corpus_words, corpus_sents):
    avgchar = avg(corpus_words)
    avgsent = avg(corpus_sents)
    ari = 4.71 * avgchar + 0.5 * avgsent - 21.43
    return ari

print(ari(abc.words('rural.txt'), abc.sents('rural.txt')))
print(ari(abc.words('science.txt'), abc.sents('science.txt')))