Code example #1
File: nextWord.py Project: arrgee23/ml
from nltk.corpus import webtext, reuters
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline


def makeModel():
    #sentences = webtext.raw()+brown.raw()+reuters.raw()
    sentences = webtext.raw() + reuters.raw()
    # Tokenize the sentences
    try:  # Use the default NLTK tokenizer.
        from nltk import word_tokenize, sent_tokenize
        # Testing whether it works.
        # Sometimes it doesn't work on some machines because of setup issues.
        word_tokenize(
            sent_tokenize("This is a foobar sentence. Yes it is.")[0])

    except (ImportError, LookupError):  # Fall back to a naive sentence tokenizer and toktok.
        import re
        from nltk.tokenize import ToktokTokenizer
        # See https://stackoverflow.com/a/25736515/610569
        sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
        # Use the toktok tokenizer that requires no dependencies.
        toktok = ToktokTokenizer()
        word_tokenize = toktok.tokenize

    tokenized_text = [
        list(map(str.lower, word_tokenize(sent)))
        for sent in sent_tokenize(sentences)
    ]

    # Prepare padded everygrams and vocabulary for an n-gram model
    n = 5
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

    model = MLE(n)  # Train a 5-gram MLE model (n is set above)

    model.fit(train_data, padded_sents)
    #print(model.vocab)

    return model
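A minimal usage sketch for the returned model (a sketch only: the seed word and random seed below are arbitrary, and it assumes the nltk.lm imports shown above):

model = makeModel()

# Probability of a unigram, and of a word given a one-word context.
print(model.score('the'))
print(model.score('is', ['this']))

# Generate 10 words from the 5-gram model, seeded for reproducibility.
print(model.generate(10, text_seed=['the'], random_seed=42))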
Code example #2
File: wordcloud.py Project: David-Byrne/SADD
def calculate_normal_word_freq():
    normal_word_freq = Counter()
    # Each line of overheard.txt is one remark; keep only the text after the
    # speaker label and first colon, then update the counter.
    for text in webtext.raw("overheard.txt").split("\n"):
        normal_word_freq.update(
            WordCloud.parse_tweet(text.split(":", maxsplit=1)[-1]))
    return normal_word_freq
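A possible usage sketch for the counter above; WordCloud.parse_tweet is specific to this project, so the sketch only assumes calculate_normal_word_freq() returns a collections.Counter:

# Hypothetical usage: inspect the most frequent "normal" words.
normal_freq = calculate_normal_word_freq()
for word, count in normal_freq.most_common(10):
    print(word, count)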
Code example #3
File: examples.py Project: necronet/nlp-sink
def tokenize_example():
    singles = webtext.raw('singles.txt')
    singles_no_8 = singles.split('\n')[8]
    print('[singles] Line:8 - {}'.format(singles_no_8))

    print('\n-----\n'.join(sent_tokenize(singles_no_8)))

    print('Word tokenizer')
    for i, sent in enumerate(sent_tokenize(singles_no_8)):
        print('{}: {}'.format(i, word_tokenize(sent)))
Code example #4
File: examples.py Project: necronet/nlp-sink
def stopwords_example():
    singles = webtext.raw('singles.txt')
    singles_no_8 = singles.split('\n')[8]
    single_no8_tokenized_lowered = list(
        map(str.lower, word_tokenize(singles_no_8)))
    stopwords_en = set(stopwords.words('english'))
    stopwords_en_withpunct = stopwords_en.union(set(punctuation))

    print([
        word for word in single_no8_tokenized_lowered
        if word not in stopwords_en_withpunct
    ])
Code example #5
    def tagMessage(self, print_tag=0):
        text = webtext.raw('overheard.txt')
        temp = PunktSentenceTokenizer(text)
        message = temp.tokenize(self.message)

        # Accumulate POS tags for every sentence in the message.
        tagged = []
        for w in message:
            words = nltk.word_tokenize(w)
            tagged += nltk.pos_tag(words)

        if print_tag:
            print("Message is: ", self.message)
            print("Tagged message is: ", tagged)

        return tagged
Code example #6
File: w7p1.py Project: Pepper0694/Informatics
def tokenize(corpus, fileID):
    '''
    Tokenizes the text in the given corpus file, casting all words to lower case and
    stripping out punctuation marks, spaces, and words not made of one or more
    alphanumeric characters.
    
    Parameters
    ----------
    corpus: An NLTK corpus
    fileID: A string
    
    Returns
    -------
    words: a list of strings
    '''
    # Use a regex to replace punctuation marks with spaces, then tokenize and
    # lower-case the remaining words.
    pattern = re.compile(r'[^\w\s]')

    text = corpus.raw(fileID)
    words = [word.lower() for word in nltk.word_tokenize(re.sub(pattern, ' ', text))]
    return words
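A small, hypothetical usage sketch for this helper, assuming nltk, re, and the webtext corpus are importable as in the surrounding examples:

import nltk
from nltk.corpus import webtext

# Most frequent cleaned tokens in the wine reviews corpus file.
words = tokenize(webtext, 'wine.txt')
print(nltk.FreqDist(words).most_common(10))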
Code example #7
def webtext():
    from nltk.corpus import webtext as webtext
    from nltk.corpus import nps_chat

    # list comprehension version
    file_ids = [fileid for fileid in webtext.fileids()]
    chat_file_ids = [fileid for fileid in nps_chat.fileids()]

    pirates = webtext.raw('pirates.txt')
    pirates_words = len(webtext.words('pirates.txt'))
    pirates_sents = len(webtext.sents('pirates.txt'))
    uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))

    lexical_diversity = lexical_div(uniqs, pirates_words)

    # import nltk.book as book
    # text1 = book.text1
    # pirates = webtext.raw('pirates.txt')

    return render_template('webtext.html',
                           file_ids=file_ids,
                           chat_file_ids=chat_file_ids,
                           pirates=pirates)
Code example #8
File: ch2.py Project: juri-220/Python-NLP
import nltk
from nltk.corpus import gutenberg
gutenberg.fileids()
emma=gutenberg.words('austen-emma.txt')

#loop over the text to get information
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print (int(num_chars/num_words),int(num_words/num_sents),int(num_words/num_vocab),fileid)

#webtext in nltk.corpus
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print (fileid, webtext.raw(fileid)[:2])

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

#brown corpus
from nltk.corpus import brown
brown.categories()
brown.words(categories='editorial')
brown.words(fileids=['cp12'])
brown.sents(categories=['news','editorial'])

edi_text = brown.words(categories='fiction')
fdist=nltk.FreqDist([w.lower() for w in edi_text])
modals=['what','who','where','when','why']
Code example #9
File: NLP.py Project: Toma-L/NLP
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print (int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab)) #avg word length, avg sentence length, and lexical diversity

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
macbeth_sentences #load sentences of Macbeth
macbeth_sentences[1037]
longest_len = max([len(s) for s in macbeth_sentences])
[s for s in macbeth_sentences if len(s) == longest_len] #find longest sentence

from nltk.corpus import webtext
for fileid in webtext.fileids():
    print (fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

from nltk.corpus import brown
brown.categories()
brown.words(categories = 'news') 
brown.words(fileids = ['cg22'])

from nltk.corpus import brown
news_text = brown.words(categories = 'news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m, fdist[m])
Code example #10
File: ch02.py Project: gree2/hobby
def fun04():
    """fun04"""
    for fileid in webtext.fileids():
        print fileid, webtext.raw(fileid)[:50]
Code example #11
File: c02_text_corpora.py Project: AkiraKane/Python
def webtext():
    # Bind the corpus locally; the function name shadows the module-level import.
    from nltk.corpus import webtext

    for fileid in webtext.fileids():
        print fileid, webtext.raw(fileid)[:65], '...'
Code example #13
File: nlp_w2.py Project: KaninManoch/Nlp
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat

#emma = gutenberg.words('austen-emma.txt')
#print(len(set(w.lower() for w in emma)))

for file in webtext.fileids():
    print(file, webtext.raw(file)[:65])

chatroom = nps_chat.posts()
Code example #14
File: nltk2.py Project: STIMALiU/TextMiningCourse
    nSents = len(gutenberg.sents(fileid))
    nVocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print int(nChars/nWords), int(nWords/nSents), int(nWords/nVocab), fileid
    
macbethRaw = gutenberg.raw('shakespeare-macbeth.txt')
macbethWords = gutenberg.words('shakespeare-macbeth.txt')
macbethSents = gutenberg.sents('shakespeare-macbeth.txt')

longestLen = max([len(s) for s in macbethSents])
longestSents = [s for s in macbethSents if len(s) == longestLen]


from nltk.corpus import webtext
webtext.fileids()
for fileid in webtext.fileids():
    print fileid, webtext.raw(fileid)[:65], '...'
    
webtext.raw('pirates.txt').lower().count('jack')   
pirates = nltk.Text(webtext.words('pirates.txt'))


from nltk.corpus import brown
brown.categories()
brown.words(categories = 'news')
brown.words(fileids = ['cg22'])
brown.words(fileids = ['cg22','ca16']) # Concatenates the two corpora into one.

from nltk.corpus import brown
newsText = brown.words(categories = 'news')
fdist = nltk.FreqDist([w.lower() for w in newsText])
modals = ['can','could','may','might','must','will']
Code example #15
 def __init__(self):
     self.number_id = 28
     self.source_id = "webtext"
     self.titles = [name for name in webtext.fileids()]
     self.data = [webtext.raw(name) for name in self.titles]
Code example #16
    nVocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print int(nChars / nWords), int(nWords / nSents), int(nWords / nVocab), fileid

macbethRaw = gutenberg.raw('shakespeare-macbeth.txt')
macbethWords = gutenberg.words('shakespeare-macbeth.txt')
macbethSents = gutenberg.sents('shakespeare-macbeth.txt')

longestLen = max([len(s) for s in macbethSents])
longestSents = [s for s in macbethSents if len(s) == longestLen]

from nltk.corpus import webtext

webtext.fileids()
for fileid in webtext.fileids():
    print fileid, webtext.raw(fileid)[:65], '...'

webtext.raw('pirates.txt').lower().count('jack')
pirates = nltk.Text(webtext.words('pirates.txt'))

from nltk.corpus import brown

brown.categories()
brown.words(categories='news')
brown.words(fileids=['cg22'])
brown.words(fileids=['cg22', 'ca16'])  # Concatenates the two corpora into one.

from nltk.corpus import brown

newsText = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in newsText])
Code example #17
File: 14_nlp_class.py Project: AaronRanAn/DAT3
'''
Tokenization

What:  Separate text into units such as sentences or words
Why:   Gives structure to previously unstructured text
Notes: Relatively easy with English language text, not easy with some languages
'''

# "corpus" = collection of documents
# "corpora" = plural form of corpus
import nltk
from nltk.corpus import webtext
webtext.fileids()

# wine reviews corpus
text = webtext.raw('wine.txt')
text[:500]

# tokenize into sentences
sentences = [sent for sent in nltk.sent_tokenize(text)]
sentences[:10]

# tokenize into words
tokens = [word for word in nltk.word_tokenize(text)]
tokens[:100]

# only keep tokens that start with a letter (using regular expressions)
import re
clean_tokens = [token for token in tokens if re.search(r'^[a-zA-Z]+', token)]
clean_tokens[:100]
Code example #18
File: nltk04.py Project: rtoddf/python_practice
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import brown 

# for fileid in webtext.fileids():
# 	print fileid, webtext.raw(fileid)[:65]

# for fileId in nps_chat.fileids():
# 	print fileId

pirates = webtext.raw('pirates.txt')
pirates_char = len(webtext.raw('pirates.txt'))
pirates_words = len(webtext.words('pirates.txt'))
pirates_sents = len(webtext.sents('pirates.txt'))
print 'pirates_char: ', pirates_char, 'pirates_words: ', pirates_words, 'pirates_sents: ', pirates_sents, 'avg char per word: ', int(pirates_char/pirates_words), 'avg words per sentence: ', int(pirates_words/pirates_sents)

uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))

def lexical_div(un, total):
	return total/un

print 'lexical diversity: ', lexical_div(uniqs, pirates_words)

# brown_categories = brown.categories()
# for genre in brown_categories:
# 	print genre

news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
# modal verbs
Code example #19
File: 03.py Project: kouheiszk/nltk
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import brown
from nltk.corpus import webtext

brown.raw(fileids=["cm02"])
webtext.raw("firefox.txt")
Code example #20
)  # output: ['Hello World.', "It's good to see you.", 'Thanks for buying this book.']

#### WORD TOKENIZE ######
sent = 'Hello World.'
print(word_tokenize(sent))  # output: ['Hello', 'World', '.']

#### ALTERNATIVE WORD TOKENIZER ######
para_1 = "Can't is a contraction."
tokenizer = WordPunctTokenizer()
print(tokenizer.tokenize(
    para_1))  # output: ['Can', "'", 't', 'is', 'a', 'contraction', '.']

#### REGULAR EXPRESSION TOKENIZER ######
regex = "Can't is a contraction."
tokenizer = RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize(regex))  # output: ["Can't", 'is', 'a', 'contraction']

#### TRAINING A SENTENCE TOKENIZER ######
text = webtext.raw('overheard.txt')  # Read text example
sent_tokenizer = PunktSentenceTokenizer(text)  # Train tokenizer on text
sents_tokenizer_1 = sent_tokenizer.tokenize(text)  # Use new tokenizer
sents_tokenizer_2 = sent_tokenize(text)  # Old tokenizer

#### FILTERING STOPWORDS ######

english_stops = set(
    stopwords.words('english'))  # set English language and load stopwords
words = ["Can't", 'is', 'a', 'contraction']
print([word for word in words
       if word not in english_stops])  # output: ["Can't", 'contraction']
Code example #21
from functools import reduce
import operator
import string
from nltk.corpus import gutenberg

num_of_words_to_plot = 20
num_of_words_compare = 50
file_path = "/home/helena/Documents/NLP/data/study_in_scarlet.txt"
moby_file_name = 'melville-moby_dick.txt'

# Read file
with open(file_path, 'r') as f:
    raw_text = f.read()

# Word and sentence tokenization
tokenized_sentences = sent_tokenize(raw_text)
#tokenized_words = reduce(operator.concat, [word_tokenize(s) for s in tokenized_sentences])

tokenizer = RegexpTokenizer(r'\w+')

stop = stopwords.words('english') + list(string.punctuation)
raw_tokens = tokenizer.tokenize(raw_text.lower())
tokens = [i for i in raw_tokens if i not in stop]

# Convert to nltk text
text = Text(tokens)

# Freq dist
fdist = FreqDist(text)
fdist.plot(num_of_words_to_plot, cumulative = False)
scarlet_commons = [word for word, counts in fdist.most_common(num_of_words_compare)]
Code example #22
        'freq_tokens_top15': [],
        'freq_bigrams': None,
        'freq_bigrams_top15': [],
        'freq_quadrigrams_life': []
    }
}

# Build the stopword list and add custom punctuation/noise tokens
stopwords = stopwords.words('english') + [
    "[", "]", ".", ",", "?", "*", ":", "...", "!", "'", "'s", "#", "(", ")",
    "'m", "-", "'ve", "ft.", "n't", "y.o", "&", "..", "n/s", "s/d", "n/d",
    "s/s", "s/e", "''"
]

for file in data:
    text = webtext.raw(file)

    # Generate and filter the tokens of each file
    data[file]['tokens'] = tokenize.word_tokenize(text)
    data[file]['tokens'] = [
        t.lower() for t in data[file]['tokens'] if t.lower() not in stopwords
    ]

    # Compute token frequency data
    data[file]['freq_tokens'] = nltk.FreqDist(data[file]['tokens'])

    # Get the 15 most frequent tokens
    top15 = data[file]['freq_tokens'].most_common(15)
    data[file]['freq_tokens_top15'] = top15

    # Compute bigram frequency data
Code example #23
        word_sim = {}
        for i in range(self.unique_word):
            vet_w2 = self.w1[i]
            # Cosine similarity: dot product divided by the product of the norms.
            theta_sum = np.dot(vet_w1, vet_w2)
            theta_den = np.linalg.norm(vet_w1) * np.linalg.norm(vet_w2)
            theta = theta_sum / theta_den

            word = self.index_word[i]
            word_sim[word] = theta
        sort_word = sorted(word_sim.items(),
                           key=lambda kv: kv[1],
                           reverse=True)

        for word, sim in sort_word[:top_n]:
            print(word, sim)


fx = webtext.raw(webtext.fileids()[0])
corpus = fx[:1000]
print(corpus)
settings = {"train": {"window": 2, "epoch": 3000, "lr": 0.01}}
w2 = word2vec(settings)
pre_pr = w2.pre_process(corpus, ispara=False)
# print(corpus)
training_Data = w2.gen_training_data(pre_pr)
w2.train(training_Data)
t_word = "phoenix"
print(w2.word_vec(t_word))
w2.vec_sim(t_word, 5)
# print(training_Data.size*training_Data.itemsize)
Code example #24
File: webchat.py Project: xiaohan2012/nltk-book
from nltk.corpus import webtext
for file_id in webtext.fileids():
	print file_id
	print webtext.raw(file_id)[:100]
	print
Code example #25
File: webtext.py Project: fierlion/naturalLanguage
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat

print('WEBTEXT___')
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:20])

print('NPS_CHAT___')
for post in nps_chat.posts():
    print(post)
Code example #26
from nltk.corpus import webtext
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer

article = """Girl: But you already have a Big Mac...
Hobo: Oh, this is all theatrical.
Girl: Hola amigo... 
Hobo: his is all theatrical.
我说: "U.S.A 你好啊".
U.S.A is the abbreviation of United States. To use statistical parameters such as mean and standard deviation reliably, you need to have a good estimator for them. The maximum likelihood estimates (MLEs) provide one such estimator. However, an MLE might be biased, which means that its expected value of the parameter might not equal the parameter being estimated."""

sentences = sent_tokenize(article)

for sentence in sentences:
    tokens = word_tokenize(sentence)
    #print(sentence)

text = webtext.raw('overheard.txt')

print(text)
sent_tokenizer = PunktSentenceTokenizer(text)
sents1 = sent_tokenizer.tokenize(text)
sents2 = sent_tokenize(text)

sents1_article = sent_tokenizer.tokenize(article)
sents2_article = sent_tokenize(article)

print(sents1[0])
print(sents2[0])
print()
print(sents1[677])
print(sents2[677])
print()
Code example #27
args = parser.parse_args()

if args.top_n_remove < 0:
    parser.error("--top_n_remove Must Be 0 Or Greater")
else:
    n = args.top_n_remove

print("Number Of Most Frequent Words To Remove: " + str(args.top_n_remove))

# Download WebText Corpus If Not Already Downloaded
nltk.download("webtext")
from nltk.corpus import webtext

# Break Raw Data Into Individual Reviews
wine_reviews_raw = webtext.raw("wine.txt").split("\n")

# Used To Strip Punctuation From The Reviews
translator = str.maketrans('', '', string.punctuation)

################
# Bag Of Words #
################

cleaned_review_data = []
review_labels = []
good_review_freq = nltk.FreqDist()
bad_review_freq = nltk.FreqDist()

for review in wine_reviews_raw:
Code example #28
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize

from nltk.corpus import webtext

text = webtext.raw('overheard.txt')
sent_tokenizer = PunktSentenceTokenizer(text)
sents1 = sent_tokenizer.tokenize(text)
print(sents1[678])

sents2= sent_tokenize(text)
print(sents2[678])

with open(r'D:\Python\Data\overheard.txt', encoding='ISO-8859-2') as f:
    text2 = f.read()

sent_tokenizer3 = PunktSentenceTokenizer(text2)
sents3 = sent_tokenizer3.tokenize(text2)
print(sents3[0])
Code example #29
    "vpon",
    "our",
    "Battlements",
]]

longest_sentence = " ".join(longest_sentence[0])
print(longest_sentence)

# Doubtfull it stood , As two spent Swimmers , that doe cling together , And choake their Art : The mercilesse Macdonwald ( Worthie to be a Rebell , for to that The multiplying Villanies of Nature Doe swarme vpon him ) from the Westerne Isles Of Kernes and Gallowgrosses is supply ' d , And Fortune on his damned Quarry smiling , Shew ' d like a Rebells W***e : but all ' s too weake : For braue Macbeth ( well hee deserues that Name ) Disdayning Fortune , with his brandisht Steele , Which smoak ' d with bloody execution ( Like Valours Minion ) caru ' d out his passage , Till hee fac ' d the Slaue : Which neu ' r shooke hands , nor bad farwell to him , Till he vnseam ' d him from the Naue toth ' Chops , And fix ' d his Head vpon our Battlements

# 1.2 Web and Chat Text

from nltk.corpus import webtext

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], "...")
"""
output 
firefox.txt Cookie Manager: "Don't allow sites that set removed cookies to se ...
grail.txt SCENE 1: [wind] [clop clop clop] 
KING ARTHUR: Whoa there!  [clop ...
overheard.txt White guy: So, do you have any plans for this evening?
Asian girl ...
pirates.txt PIRATES OF THE CARRIBEAN: DEAD MAN'S CHEST, by Ted Elliott & Terr ...
singles.txt 25 SEXY MALE, seeks attrac older single lady, for discreet encoun ...
wine.txt Lovely delicate, fragrant Rhone wine. Polished leather and strawb ...

"""
from nltk.corpus import nps_chat

chatroom = nps_chat.posts("10-19-20s_706posts.xml")
Code example #30
File: NLP_3.py Project: Reggielang/NLP_skill
    print(int(num_chars/num_words),int(num_words/num_sents),int(num_words/num_vocab),fileid)
    
#%%
sentences = gutenberg.sents('shakespeare-macbeth.txt')
sentences
sentences[1037]
# longest sentence
longest_len = max([len(s) for s in sentences])
[s for s in sentences if len(s) == longest_len]

#%%
# Web and chat text
from nltk.corpus import webtext
webtext.fileids()
for fileid in webtext.fileids():
    print(fileid,webtext.raw(fileid)[:60])

#%%
# Brown corpus
from nltk.corpus import brown
brown.categories()
brown.words(categories='news')

news_words = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_words])
modals = ['can','could','may','might','must','will']

for m in modals:
    print(m,fdist[m]) 
         
#%%
Code example #31
File: NLTK_lesson_chp_2.py Project: a1309820/Tw_1
emma = gutenberg.words('austen-emma.txt')
#%%
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(round(num_chars/num_words), round(num_words/num_sents), round(num_words/num_vocab), fileid)
    
#%%
#Web and Chat Text

from nltk.corpus import webtext

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:100], '...')

#%%
from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = FreqDist(w.lower() for w in news_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m], end='  ')

#%%
#conditional frequencies

cdf = ConditionalFreqDist(
           (genre, word)
Code example #32
File: 14_nlp_class.py Project: zehndec/DAT3
nltk.download()
'''
Tokenization

What:  Separate text into units such as sentences or words
Why:   Gives structure to previously unstructured text
Notes: Relatively easy with English language text, not easy with some languages
'''

# "corpus" = collection of documents
# "corpora" = plural form of corpus
from nltk.corpus import webtext
webtext.fileids()

# wine reviews corpus
text = webtext.raw('wine.txt')
text[:500]

# tokenize into sentences
sentences = [sent for sent in nltk.sent_tokenize(text)]
sentences[:10]

# tokenize into words
tokens = [word for word in nltk.word_tokenize(text)]
tokens[:100]

# only keep tokens that start with a letter (using regular expressions)
import re
clean_tokens = [token for token in tokens if re.search(r'^[a-zA-Z]+', token)]
clean_tokens[:100]
Code example #33
import nltk
print(nltk.corpus.gutenberg.fileids())
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
print(emma.concordance("surprize"))

from nltk.corpus import gutenberg
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print(int(num_chars / num_words), int(num_words / num_sents),
          int(num_words / num_vocab), fileid)

# Web and chat text
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom[123])
Code example #34
# print(brown.words())
# print(brown.fileids())
# print(brown.raw('cr08').strip()[:1000])

from nltk.corpus import webtext
import re
# print(webtext.fileids())

# Each line is one advertisement.
# for i, line in enumerate(webtext.raw('singles.txt').split('\n')):
#     if i > 10: # Lets take a look at the first 10 ads.
#         break
#     print(str(i) + ':\t' + line)

import pandas as pd
single_no8 = webtext.raw('singles.txt').split('\n')[8]
# print(single_no8)
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# Sentence Tokenization
from nltk import sent_tokenize, word_tokenize
# print(sent_tokenize(single_no8))

# for sent in sent_tokenize(single_no8):
#     print(word_tokenize(sent))

# for sent in sent_tokenize(single_no8):
#     It's a little inefficient to loop through each word,
#     but sometimes it helps to get better tokens.
#     print([word.lower() for word in word_tokenize(sent)])
Code example #35
File: part2_1.py Project: laoji168/study
def fun3():
    from nltk.corpus import webtext
    for fileid in webtext.fileids():
        print fileid, webtext.raw(fileid)[:65]
Code example #36
                                                 fileid))

macbeth_sentences = gutenberg.sents("shakespeare-macbeth.txt")
print("macbeth_sentences= ", macbeth_sentences)
print("macbeth_sentences[1037]= ", macbeth_sentences[1037])

longest_len = max([len(s) for s in macbeth_sentences])
longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]
print("longest_sent= ", longest_sent)

# 2.1.2. Web text and chat text
# Web text
from nltk.corpus import webtext

for field in webtext.fileids():
    print(field, webtext.raw(field)[:65], '...')

# Chat text
from nltk.corpus import nps_chat

for field in nps_chat.fileids():
    print(field, nps_chat.posts(field)[:12])

chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print("chatroom[123]= ", chatroom[123])

# 1.3. The Brown corpus: used to study systematic differences between genres (also called stylistics)
from nltk.corpus import brown

show_subtitle("Use categories to distinguish texts")
print("brown.categories() =", brown.categories())
Code example #37
File: web_chats.py Project: anderscui/nlpy
from nltk.corpus import webtext

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])

# IM chat sessions
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom[123])
Code example #38
File: 07_Web_Chat.py Project: amir-jafari/NLP
from nltk.corpus import webtext
from nltk.corpus import nps_chat

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])

text = webtext.raw('firefox.txt')
print([i for i in range(len(text)) if text.startswith('a', i)])

chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom[123])

text2 = nps_chat.raw('11-09-teens_706posts.xml')
Code example #39
File: chapter2.py Project: RandyViG/SelectedTopics
#Return the max len of sentences 
longest_len = max([len(s) for s in macbeth_sentences])

#Save the sentences biggest
longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]

#********************************************************************************************************
#                                        Web and Chat Text
#********************************************************************************************************

'''
Web Texts
'''
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print fileid, webtext.raw(fileid)[:65]
'''
Chats
'''
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]


#********************************************************************************************************
#                                        Brown Corpus
#********************************************************************************************************

from nltk.corpus import brown

# Display the categories it has.
Code example #40
File: fakeSinglesGen.py Project: uklineale/general
import nltk
from nltk.corpus import webtext
#Remember, this is for funz, but later make a wine recommender - either pick new wine or learn what you like ( I like these wines -> you have this taste preference
#generate a mix of presidential addresses and genesis - corpus.state_union && corpus.genesis
srcLen = len(webtext.raw('singles.txt'))

print 'Length of sezzy shingles waiting for you: {0}'.format(srcLen)

Code example #41
File: toturial.py Project: Paul-Lin/misc
def print_private():
    from nltk.corpus import webtext
    for fileid in webtext.fileids():
        print fileid, webtext.raw(fileid)[:65]
Code example #42
macbeth_sentences[1116]


# In[64]:


longest_len = max(len(s) for s in macbeth_sentences)
[s for s in macbeth_sentences if len(s) == longest_len]


# In[65]:


from nltk.corpus import webtext
for f in webtext.fileids():
    print(f, webtext.raw(f)[:65], "....")


# In[66]:


from nltk.corpus import brown
brown.categories()


# In[67]:


gov_text = brown.words(categories = 'government')

Code example #43
import re
from typing import Iterable

from nltk.corpus import webtext


def wine_reviews() -> Iterable[str]:
    reviews = webtext.raw('wine.txt')
    for match in re.finditer(r'(.*)\n', reviews):
        if match.group(0).strip():
            yield match.group(0)
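A brief usage sketch for the generator above (itertools.islice is used here only for illustration):

from itertools import islice

# Print the first three wine reviews yielded by the generator.
for review in islice(wine_reviews(), 3):
    print(review.strip())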
Code example #44
File: Chapter2_1.py Project: uzihs/PyNLP
import nltk
import pprint

print "****** gutenberg"
from nltk.corpus import gutenberg
print gutenberg.fileids()
print "raw: ", len(gutenberg.raw())
print "words: ", len(gutenberg.words())
print "sents: ", len(gutenberg.sents())

print "****** webtext"
from nltk.corpus import webtext
print len(webtext.raw('firefox.txt'))

print "****** nps_chat"
from nltk.corpus import nps_chat
print nps_chat.fileids()
cr=nps_chat.posts('10-19-20s_706posts.xml')
print cr

print "****** brown"
from nltk.corpus import brown
nt=brown.words(categories='news')
print nt

from nltk.corpus import reuters
from nltk.corpus import inaugural

print [w for w in nltk.corpus.udhr.fileids() if 'heb' in w.lower()]
 
print nltk.corpus.brown.readme()