Code example #1
File: nextWord.py Project: arrgee23/ml
from nltk.corpus import webtext, reuters
from nltk.lm import MLE
from nltk.lm.preprocessing import padded_everygram_pipeline


def makeModel():
    #sentences = webtext.raw()+brown.raw()+reuters.raw()
    sentences = webtext.raw() + reuters.raw()
    # Tokenize the sentences
    try:  # Use the default NLTK tokenizer.
        from nltk import word_tokenize, sent_tokenize
        # Testing whether it works.
        # Sometimes it doesn't work on some machines because of setup issues.
        word_tokenize(
            sent_tokenize("This is a foobar sentence. Yes it is.")[0])

    except (ImportError, LookupError):  # Fall back to a naive sentence tokenizer and toktok.
        import re
        from nltk.tokenize import ToktokTokenizer
        # See https://stackoverflow.com/a/25736515/610569
        sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
        # Use the toktok tokenizer that requires no dependencies.
        toktok = ToktokTokenizer()
        word_tokenize = toktok.tokenize

    tokenized_text = [
        list(map(str.lower, word_tokenize(sent)))
        for sent in sent_tokenize(sentences)
    ]

    # Prepare padded everygrams and vocabulary for an n-gram model
    n = 5
    train_data, padded_sents = padded_everygram_pipeline(n, tokenized_text)

    model = MLE(n)  # Train a 5-gram MLE model (n is set above)

    model.fit(train_data, padded_sents)
    #print(model.vocab)

    return model
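A minimal usage sketch for the returned model (a sketch only: the seed word and random seed below are arbitrary, and it assumes the nltk.lm imports shown above):

model = makeModel()

# Probability of a unigram, and of a word given a one-word context.
print(model.score('the'))
print(model.score('is', ['this']))

# Generate 10 words from the 5-gram model, seeded for reproducibility.
print(model.generate(10, text_seed=['the'], random_seed=42))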
Code example #2
File: wordcloud.py Project: David-Byrne/SADD
def calculate_normal_word_freq():
    normal_word_freq = Counter()
    # Each line of overheard.txt is one remark; keep only the text after the
    # speaker label and first colon, then update the counter.
    for text in webtext.raw("overheard.txt").split("\n"):
        normal_word_freq.update(
            WordCloud.parse_tweet(text.split(":", maxsplit=1)[-1]))
    return normal_word_freq
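A possible usage sketch for the counter above; WordCloud.parse_tweet is specific to this project, so the sketch only assumes calculate_normal_word_freq() returns a collections.Counter:

# Hypothetical usage: inspect the most frequent "normal" words.
normal_freq = calculate_normal_word_freq()
for word, count in normal_freq.most_common(10):
    print(word, count)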
Code example #3
File: examples.py Project: necronet/nlp-sink
def tokenize_example():
    singles = webtext.raw('singles.txt')
    singles_no_8 = singles.split('\n')[8]
    print('[singles] Line:8 - {}'.format(singles_no_8))

    print('\n-----\n'.join(sent_tokenize(singles_no_8)))

    print('Word tokenizer')
    for i, sent in enumerate(sent_tokenize(singles_no_8)):
        print('{}: {}'.format(i, word_tokenize(sent)))
Code example #4
File: examples.py Project: necronet/nlp-sink
def stopwords_example():
    singles = webtext.raw('singles.txt')
    singles_no_8 = singles.split('\n')[8]
    single_no8_tokenized_lowered = list(
        map(str.lower, word_tokenize(singles_no_8)))
    stopwords_en = set(stopwords.words('english'))
    stopwords_en_withpunct = stopwords_en.union(set(punctuation))

    print([
        word for word in single_no8_tokenized_lowered
        if word not in stopwords_en_withpunct
    ])
Code example #5
    def tagMessage(self, print_tag=0):
        text = webtext.raw('overheard.txt')
        temp = PunktSentenceTokenizer(text)
        message = temp.tokenize(self.message)

        # Accumulate POS tags for every sentence in the message.
        tagged = []
        for w in message:
            words = nltk.word_tokenize(w)
            tagged += nltk.pos_tag(words)

        if print_tag:
            print("Message is: ", self.message)
            print("Tagged message is: ", tagged)

        return tagged
Code example #6
File: w7p1.py Project: Pepper0694/Informatics
def tokenize(corpus, fileID):
    '''
    Tokenizes the text in the given corpus file, casting all words to lower case and
    stripping out punctuation marks, spaces, and words not made of one or more
    alphanumeric characters.
    
    Parameters
    ----------
    corpus: An NLTK corpus
    fileID: A string
    
    Returns
    -------
    words: a list of strings
    '''
    # Use a regex to replace punctuation marks with spaces, then tokenize and
    # lower-case the remaining words.
    pattern = re.compile(r'[^\w\s]')

    text = corpus.raw(fileID)
    words = [word.lower() for word in nltk.word_tokenize(re.sub(pattern, ' ', text))]
    return words
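A small, hypothetical usage sketch for this helper, assuming nltk, re, and the webtext corpus are importable as in the surrounding examples:

import nltk
from nltk.corpus import webtext

# Most frequent cleaned tokens in the wine reviews corpus file.
words = tokenize(webtext, 'wine.txt')
print(nltk.FreqDist(words).most_common(10))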
Code example #7
def webtext():
    from nltk.corpus import webtext as webtext
    from nltk.corpus import nps_chat

    # list comprehension version
    file_ids = [fileid for fileid in webtext.fileids()]
    chat_file_ids = [fileid for fileid in nps_chat.fileids()]

    pirates = webtext.raw('pirates.txt')
    pirates_words = len(webtext.words('pirates.txt'))
    pirates_sents = len(webtext.sents('pirates.txt'))
    uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))

    lexical_diversity = lexical_div(uniqs, pirates_words)

    # import nltk.book as book
    # text1 = book.text1
    # pirates = webtext.raw('pirates.txt')

    return render_template('webtext.html',
                           file_ids=file_ids,
                           chat_file_ids=chat_file_ids,
                           pirates=pirates)
Code example #8
File: ch2.py Project: juri-220/Python-NLP
import nltk
from nltk.corpus import gutenberg
gutenberg.fileids()
emma=gutenberg.words('austen-emma.txt')

#loop over the text to get information
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print (int(num_chars/num_words),int(num_words/num_sents),int(num_words/num_vocab),fileid)

#webtext in nltk.corpus
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print (fileid, webtext.raw(fileid)[:2])

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

#brown corpus
from nltk.corpus import brown
brown.categories()
brown.words(categories='editorial')
brown.words(fileids=['cp12'])
brown.sents(categories=['news','editorial'])

edi_text = brown.words(categories='fiction')
fdist=nltk.FreqDist([w.lower() for w in edi_text])
modals=['what','who','where','when','why']
Code example #9
File: NLP.py Project: Toma-L/NLP
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print (int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab)) #avg word length, avg sentence length, and lexical diversity

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
macbeth_sentences #load sentences of Macbeth
macbeth_sentences[1037]
longest_len = max([len(s) for s in macbeth_sentences])
[s for s in macbeth_sentences if len(s) == longest_len] #find longest sentence

from nltk.corpus import webtext
for fileid in webtext.fileids():
    print (fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

from nltk.corpus import brown
brown.categories()
brown.words(categories = 'news') 
brown.words(fileids = ['cg22'])

from nltk.corpus import brown
news_text = brown.words(categories = 'news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m, fdist[m])
Code example #10
File: ch02.py Project: gree2/hobby
def fun04():
    """fun04"""
    for fileid in webtext.fileids():
        print fileid, webtext.raw(fileid)[:50]
Code example #11
File: c02_text_corpora.py Project: AkiraKane/Python
def webtext():
    # Bind the corpus locally; the function name shadows the module-level import.
    from nltk.corpus import webtext

    for fileid in webtext.fileids():
        print fileid, webtext.raw(fileid)[:65], '...'
Code example #13
File: nlp_w2.py Project: KaninManoch/Nlp
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat

#emma = gutenberg.words('austen-emma.txt')
#print(len(set(w.lower() for w in emma)))

for file in webtext.fileids():
    print(file, webtext.raw(file)[:65])

chatroom = nps_chat.posts()
Code example #14
File: nltk2.py Project: STIMALiU/TextMiningCourse
    nSents = len(gutenberg.sents(fileid))
    nVocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print int(nChars/nWords), int(nWords/nSents), int(nWords/nVocab), fileid
    
macbethRaw = gutenberg.raw('shakespeare-macbeth.txt')
macbethWords = gutenberg.words('shakespeare-macbeth.txt')
macbethSents = gutenberg.sents('shakespeare-macbeth.txt')

longestLen = max([len(s) for s in macbethSents])
longestSents = [s for s in macbethSents if len(s) == longestLen]


from nltk.corpus import webtext
webtext.fileids()
for fileid in webtext.fileids():
    print fileid, webtext.raw(fileid)[:65], '...'
    
webtext.raw('pirates.txt').lower().count('jack')   
pirates = nltk.Text(webtext.words('pirates.txt'))


from nltk.corpus import brown
brown.categories()
brown.words(categories = 'news')
brown.words(fileids = ['cg22'])
brown.words(fileids = ['cg22','ca16']) # Concatenates the two corpora into one.

from nltk.corpus import brown
newsText = brown.words(categories = 'news')
fdist = nltk.FreqDist([w.lower() for w in newsText])
modals = ['can','could','may','might','must','will']
Code example #15
 def __init__(self):
     self.number_id = 28
     self.source_id = "webtext"
     self.titles = [name for name in webtext.fileids()]
     self.data = [webtext.raw(name) for name in self.titles]
Code example #16
    nVocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print int(nChars / nWords), int(nWords / nSents), int(nWords / nVocab), fileid

macbethRaw = gutenberg.raw('shakespeare-macbeth.txt')
macbethWords = gutenberg.words('shakespeare-macbeth.txt')
macbethSents = gutenberg.sents('shakespeare-macbeth.txt')

longestLen = max([len(s) for s in macbethSents])
longestSents = [s for s in macbethSents if len(s) == longestLen]

from nltk.corpus import webtext

webtext.fileids()
for fileid in webtext.fileids():
    print fileid, webtext.raw(fileid)[:65], '...'

webtext.raw('pirates.txt').lower().count('jack')
pirates = nltk.Text(webtext.words('pirates.txt'))

from nltk.corpus import brown

brown.categories()
brown.words(categories='news')
brown.words(fileids=['cg22'])
brown.words(fileids=['cg22', 'ca16'])  # Concatenates the two corpora into one.

from nltk.corpus import brown

newsText = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in newsText])
Code example #17
File: 14_nlp_class.py Project: AaronRanAn/DAT3
'''
Tokenization

What:  Separate text into units such as sentences or words
Why:   Gives structure to previously unstructured text
Notes: Relatively easy with English language text, not easy with some languages
'''

# "corpus" = collection of documents
# "corpora" = plural form of corpus
import nltk
from nltk.corpus import webtext
webtext.fileids()

# wine reviews corpus
text = webtext.raw('wine.txt')
text[:500]

# tokenize into sentences
sentences = [sent for sent in nltk.sent_tokenize(text)]
sentences[:10]

# tokenize into words
tokens = [word for word in nltk.word_tokenize(text)]
tokens[:100]

# only keep tokens that start with a letter (using regular expressions)
import re
clean_tokens = [token for token in tokens if re.search(r'^[a-zA-Z]+', token)]
clean_tokens[:100]
Code example #18
File: nltk04.py Project: rtoddf/python_practice
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import brown 

# for fileid in webtext.fileids():
# 	print fileid, webtext.raw(fileid)[:65]

# for fileId in nps_chat.fileids():
# 	print fileId

pirates = webtext.raw('pirates.txt')
pirates_char = len(webtext.raw('pirates.txt'))
pirates_words = len(webtext.words('pirates.txt'))
pirates_sents = len(webtext.sents('pirates.txt'))
print 'pirates_char: ', pirates_char, 'pirates_words: ', pirates_words, 'pirates_sents: ', pirates_sents, 'avg char per word: ', int(pirates_char/pirates_words), 'avg words per sentence: ', int(pirates_words/pirates_sents)

uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))

def lexical_div(un, total):
	return total/un

print 'lexical diversity: ', lexical_div(uniqs, pirates_words)

# brown_categories = brown.categories()
# for genre in brown_categories:
# 	print genre

news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
# modal verbs
Code example #19
File: 03.py Project: kouheiszk/nltk
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import nltk
from nltk.corpus import brown
from nltk.corpus import webtext

brown.raw(fileids=["cm02"])
webtext.raw("firefox.txt")
Code example #20
)  # output: ['Hello World.', "It's good to see you.", 'Thanks for buying this book.']

#### WORD TOKENIZE ######
sent = 'Hello World.'
print(word_tokenize(sent))  # output: ['Hello', 'World', '.']

#### ALTERNATIVE WORD TOKENIZER ######
para_1 = "Can't is a contraction."
tokenizer = WordPunctTokenizer()
print(tokenizer.tokenize(
    para_1))  # output: ['Can', "'", 't', 'is', 'a', 'contraction', '.']

#### REGULAR EXPRESSION TOKENIZER ######
regex = "Can't is a contraction."
tokenizer = RegexpTokenizer(r"[\w']+")
print(tokenizer.tokenize(regex))  # output: ["Can't", 'is', 'a', 'contraction']

#### TRAINING A SENTENCE TOKENIZER ######
text = webtext.raw('overheard.txt')  # Read text example
sent_tokenizer = PunktSentenceTokenizer(text)  # Train tokenizer on text
sents_tokenizer_1 = sent_tokenizer.tokenize(text)  # Use new tokenizer
sents_tokenizer_2 = sent_tokenize(text)  # Old tokenizer

#### FILTERING STOPWORDS ######

english_stops = set(
    stopwords.words('english'))  # set English language and load stopwords
words = ["Can't", 'is', 'a', 'contraction']
print([word for word in words
       if word not in english_stops])  # output: ["Can't", 'contraction']
Code example #21
from functools import reduce
import operator
import string
from nltk.corpus import gutenberg

num_of_words_to_plot = 20
num_of_words_compare = 50
file_path = "/home/helena/Documents/NLP/data/study_in_scarlet.txt"
moby_file_name = 'melville-moby_dick.txt'

# Read file
with open(file_path, 'r') as f:
    raw_text = f.read()

# Word and sentence tokenization
tokenized_sentences = sent_tokenize(raw_text)
#tokenized_words = reduce(operator.concat, [word_tokenize(s) for s in tokenized_sentences])

tokenizer = RegexpTokenizer(r'\w+')

stop = stopwords.words('english') + list(string.punctuation)
raw_tokens = tokenizer.tokenize(raw_text.lower())
tokens = [i for i in raw_tokens if i not in stop]

# Convert to nltk text
text = Text(tokens)

# Freq dist
fdist = FreqDist(text)
fdist.plot(num_of_words_to_plot, cumulative = False)
scarlet_commons = [word for word, counts in fdist.most_common(num_of_words_compare)]
Code example #22
        'freq_tokens_top15': [],
        'freq_bigrams': None,
        'freq_bigrams_top15': [],
        'freq_quadrigrams_life': []
    }
}

# Build the stopword list and add custom punctuation/noise tokens
stopwords = stopwords.words('english') + [
    "[", "]", ".", ",", "?", "*", ":", "...", "!", "'", "'s", "#", "(", ")",
    "'m", "-", "'ve", "ft.", "n't", "y.o", "&", "..", "n/s", "s/d", "n/d",
    "s/s", "s/e", "''"
]

for file in data:
    text = webtext.raw(file)

    # Generate and filter the tokens of each file
    data[file]['tokens'] = tokenize.word_tokenize(text)
    data[file]['tokens'] = [
        t.lower() for t in data[file]['tokens'] if t.lower() not in stopwords
    ]

    # Compute token frequency data
    data[file]['freq_tokens'] = nltk.FreqDist(data[file]['tokens'])

    # Get the 15 most frequent tokens
    top15 = data[file]['freq_tokens'].most_common(15)
    data[file]['freq_tokens_top15'] = top15

    # Compute bigram frequency data
Code example #23
        word_sim = {}
        for i in range(self.unique_word):
            vet_w2 = self.w1[i]
            # Cosine similarity: dot product divided by the product of the norms.
            theta_sum = np.dot(vet_w1, vet_w2)
            theta_den = np.linalg.norm(vet_w1) * np.linalg.norm(vet_w2)
            theta = theta_sum / theta_den

            word = self.index_word[i]
            word_sim[word] = theta
        sort_word = sorted(word_sim.items(),
                           key=lambda kv: kv[1],
                           reverse=True)

        for word, sim in sort_word[:top_n]:
            print(word, sim)


fx = webtext.raw(webtext.fileids()[0])
corpus = fx[:1000]
print(corpus)
settings = {"train": {"window": 2, "epoch": 3000, "lr": 0.01}}
w2 = word2vec(settings)
pre_pr = w2.pre_process(corpus, ispara=False)
# print(corpus)
training_Data = w2.gen_training_data(pre_pr)
w2.train(training_Data)
t_word = "phoenix"
print(w2.word_vec(t_word))
w2.vec_sim(t_word, 5)
# print(training_Data.size*training_Data.itemsize)
Code example #24
File: webchat.py Project: xiaohan2012/nltk-book
from nltk.corpus import webtext
for file_id in webtext.fileids():
	print file_id
	print webtext.raw(file_id)[:100]
	print
Code example #25
File: webtext.py Project: fierlion/naturalLanguage
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat

print('WEBTEXT___')
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:20])

print('NPS_CHAT___')
for post in nps_chat.posts():
    print(post)
Code example #26
from nltk.corpus import webtext
from nltk.tokenize import sent_tokenize, word_tokenize, PunktSentenceTokenizer

article = """Girl: But you already have a Big Mac...
Hobo: Oh, this is all theatrical.
Girl: Hola amigo... 
Hobo: his is all theatrical.
我说: "U.S.A 你好啊".
U.S.A is the abbreviation of United States. To use statistical parameters such as mean and standard deviation reliably, you need to have a good estimator for them. The maximum likelihood estimates (MLEs) provide one such estimator. However, an MLE might be biased, which means that its expected value of the parameter might not equal the parameter being estimated."""

sentences = sent_tokenize(article)

for sentence in sentences:
    tokens = word_tokenize(sentence)
    #print(sentence)

text = webtext.raw('overheard.txt')

print(text)
sent_tokenizer = PunktSentenceTokenizer(text)
sents1 = sent_tokenizer.tokenize(text)
sents2 = sent_tokenize(text)

sents1_article = sent_tokenizer.tokenize(article)
sents2_article = sent_tokenize(article)

print(sents1[0])
print(sents2[0])
print()
print(sents1[677])
print(sents2[677])
print()
Code example #27
args = parser.parse_args()

if args.top_n_remove < 0:
    parser.error("--top_n_remove Must Be 0 Or Greater")
else:
    n = args.top_n_remove

print("Number Of Most Frequent Words To Remove: " + str(args.top_n_remove))

# Download WebText Corpus If Not Already Downloaded
nltk.download("webtext")
from nltk.corpus import webtext

# Break Raw Data Into Individual Reviews
wine_reviews_raw = webtext.raw("wine.txt").split("\n")

# Used To Strip Punctuation From The Reviews
translator = str.maketrans('', '', string.punctuation)

################
# Bag Of Words #
################

cleaned_review_data = []
review_labels = []
good_review_freq = nltk.FreqDist()
bad_review_freq = nltk.FreqDist()

for review in wine_reviews_raw:
Code example #28
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize

from nltk.corpus import webtext

text = webtext.raw('overheard.txt')
sent_tokenizer = PunktSentenceTokenizer(text)
sents1 = sent_tokenizer.tokenize(text)
print(sents1[678])

sents2= sent_tokenize(text)
print(sents2[678])

with open(r'D:\Python\Data\overheard.txt', encoding='ISO-8859-2') as f:
    text2 = f.read()

sent_tokenizer3 = PunktSentenceTokenizer(text2)
sents3 = sent_tokenizer3.tokenize(text2)
print(sents3[0])
Code example #29
    "vpon",
    "our",
    "Battlements",
]]

longest_sentence = " ".join(longest_sentence[0])
print(longest_sentence)

# Doubtfull it stood , As two spent Swimmers , that doe cling together , And choake their Art : The mercilesse Macdonwald ( Worthie to be a Rebell , for to that The multiplying Villanies of Nature Doe swarme vpon him ) from the Westerne Isles Of Kernes and Gallowgrosses is supply ' d , And Fortune on his damned Quarry smiling , Shew ' d like a Rebells W***e : but all ' s too weake : For braue Macbeth ( well hee deserues that Name ) Disdayning Fortune , with his brandisht Steele , Which smoak ' d with bloody execution ( Like Valours Minion ) caru ' d out his passage , Till hee fac ' d the Slaue : Which neu ' r shooke hands , nor bad farwell to him , Till he vnseam ' d him from the Naue toth ' Chops , And fix ' d his Head vpon our Battlements

# 1.2 Web and Chat Text

from nltk.corpus import webtext

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], "...")
"""
output 
firefox.txt Cookie Manager: "Don't allow sites that set removed cookies to se ...
grail.txt SCENE 1: [wind] [clop clop clop] 
KING ARTHUR: Whoa there!  [clop ...
overheard.txt White guy: So, do you have any plans for this evening?
Asian girl ...
pirates.txt PIRATES OF THE CARRIBEAN: DEAD MAN'S CHEST, by Ted Elliott & Terr ...
singles.txt 25 SEXY MALE, seeks attrac older single lady, for discreet encoun ...
wine.txt Lovely delicate, fragrant Rhone wine. Polished leather and strawb ...

"""
from nltk.corpus import nps_chat

chatroom = nps_chat.posts("10-19-20s_706posts.xml")
Code example #30
File: NLP_3.py Project: Reggielang/NLP_skill
    print(int(num_chars/num_words),int(num_words/num_sents),int(num_words/num_vocab),fileid)
    
#%%
sentences = gutenberg.sents('shakespeare-macbeth.txt')
sentences
sentences[1037]
# longest sentence
longest_len = max([len(s) for s in sentences])
[s for s in sentences if len(s) == longest_len]

#%%
# Web and chat text
from nltk.corpus import webtext
webtext.fileids()
for fileid in webtext.fileids():
    print(fileid,webtext.raw(fileid)[:60])

#%%
# Brown corpus
from nltk.corpus import brown
brown.categories()
brown.words(categories='news')

news_words = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_words])
modals = ['can','could','may','might','must','will']

for m in modals:
    print(m,fdist[m]) 
         
#%%
Code example #31
File: NLTK_lesson_chp_2.py Project: a1309820/Tw_1
emma = gutenberg.words('austen-emma.txt')
#%%
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    print(round(num_chars/num_words), round(num_words/num_sents), round(num_words/num_vocab), fileid)
    
#%%
#Web and Chat Text

from nltk.corpus import webtext

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:100], '...')

#%%
from nltk.corpus import brown
news_text = brown.words(categories='news')
fdist = FreqDist(w.lower() for w in news_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m], end='  ')

#%%
#conditional frequencies

cdf = ConditionalFreqDist(
           (genre, word)
Code example #32
File: 14_nlp_class.py Project: zehndec/DAT3
nltk.download()
'''
Tokenization

What:  Separate text into units such as sentences or words
Why:   Gives structure to previously unstructured text
Notes: Relatively easy with English language text, not easy with some languages
'''

# "corpus" = collection of documents
# "corpora" = plural form of corpus
from nltk.corpus import webtext
webtext.fileids()

# wine reviews corpus
text = webtext.raw('wine.txt')
text[:500]

# tokenize into sentences
sentences = [sent for sent in nltk.sent_tokenize(text)]
sentences[:10]

# tokenize into words
tokens = [word for word in nltk.word_tokenize(text)]
tokens[:100]

# only keep tokens that start with a letter (using regular expressions)
import re
clean_tokens = [token for token in tokens if re.search(r'^[a-zA-Z]+', token)]
clean_tokens[:100]
Code example #33
import nltk
print(nltk.corpus.gutenberg.fileids())
emma = nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
print(emma.concordance("surprize"))

from nltk.corpus import gutenberg
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print(int(num_chars / num_words), int(num_words / num_sents),
          int(num_words / num_vocab), fileid)

# Web and chat text
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom[123])
Code example #34
# print(brown.words())
# print(brown.fileids())
# print(brown.raw('cr08').strip()[:1000])

from nltk.corpus import webtext
import re
# print(webtext.fileids())

# Each line is one advertisement.
# for i, line in enumerate(webtext.raw('singles.txt').split('\n')):
#     if i > 10: # Lets take a look at the first 10 ads.
#         break
#     print(str(i) + ':\t' + line)

import pandas as pd
single_no8 = webtext.raw('singles.txt').split('\n')[8]
# print(single_no8)
# import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# Sentence Tokenization
from nltk import sent_tokenize, word_tokenize
# print(sent_tokenize(single_no8))

# for sent in sent_tokenize(single_no8):
#     print(word_tokenize(sent))

# for sent in sent_tokenize(single_no8):
#     It's a little inefficient to loop through each word,
#     but sometimes it helps to get better tokens.
#     print([word.lower() for word in word_tokenize(sent)])
Code example #35
File: part2_1.py Project: laoji168/study
def fun3():
    from nltk.corpus import webtext
    for fileid in webtext.fileids():
        print fileid, webtext.raw(fileid)[:65]
Code example #36
                                                 fileid))

macbeth_sentences = gutenberg.sents("shakespeare-macbeth.txt")
print("macbeth_sentences= ", macbeth_sentences)
print("macbeth_sentences[1037]= ", macbeth_sentences[1037])

longest_len = max([len(s) for s in macbeth_sentences])
longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]
print("longest_sent= ", longest_sent)

# 2.1.2. Web text and chat text
# Web text
from nltk.corpus import webtext

for field in webtext.fileids():
    print(field, webtext.raw(field)[:65], '...')

# Chat text
from nltk.corpus import nps_chat

for field in nps_chat.fileids():
    print(field, nps_chat.posts(field)[:12])

chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print("chatroom[123]= ", chatroom[123])

# 1.3. The Brown corpus: used to study systematic differences between genres (also called stylistics)
from nltk.corpus import brown

show_subtitle("Use categories to distinguish texts")
print("brown.categories() =", brown.categories())
Code example #37
File: web_chats.py Project: anderscui/nlpy
from nltk.corpus import webtext

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])

# IM chat sessions
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom[123])
Code example #38
File: 07_Web_Chat.py Project: amir-jafari/NLP
from nltk.corpus import webtext
from nltk.corpus import nps_chat

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])

text = webtext.raw('firefox.txt')
print([i for i in range(len(text)) if text.startswith('a', i)])

chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom[123])

text2 = nps_chat.raw('11-09-teens_706posts.xml')
Code example #39
File: chapter2.py Project: RandyViG/SelectedTopics
#Return the max len of sentences 
longest_len = max([len(s) for s in macbeth_sentences])

#Save the sentences biggest
longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]

#********************************************************************************************************
#                                        Web and Chat Text
#********************************************************************************************************

'''
Web Texts
'''
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print fileid, webtext.raw(fileid)[:65]
'''
Chats
'''
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]


#********************************************************************************************************
#                                        Brown Corpus
#********************************************************************************************************

from nltk.corpus import brown

# Display the categories it has.
Code example #40
File: fakeSinglesGen.py Project: uklineale/general
import nltk
from nltk.corpus import webtext
#Remember, this is for funz, but later make a wine recommender - either pick new wine or learn what you like ( I like these wines -> you have this taste preference
#generate a mix of presidential addresses and genesis - corpus.state_union && corpus.genesis
srcLen = len(webtext.raw('singles.txt'))

print 'Length of sezzy shingles waiting for you: {0}'.format(srcLen)

Code example #41
File: toturial.py Project: Paul-Lin/misc
def print_private():
    from nltk.corpus import webtext
    for fileid in webtext.fileids():
        print fileid, webtext.raw(fileid)[:65]
Code example #42
macbeth_sentences[1116]


# In[64]:


longest_len = max(len(s) for s in macbeth_sentences)
[s for s in macbeth_sentences if len(s) == longest_len]


# In[65]:


from nltk.corpus import webtext
for f in webtext.fileids():
    print(f, webtext.raw(f)[:65], "....")


# In[66]:


from nltk.corpus import brown
brown.categories()


# In[67]:


gov_text = brown.words(categories = 'government')

Code example #43
import re
from typing import Iterable

from nltk.corpus import webtext


def wine_reviews() -> Iterable[str]:
    reviews = webtext.raw('wine.txt')
    for match in re.finditer(r'(.*)\n', reviews):
        if match.group(0).strip():
            yield match.group(0)
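A brief usage sketch for the generator above (itertools.islice is used here only for illustration):

from itertools import islice

# Print the first three wine reviews yielded by the generator.
for review in islice(wine_reviews(), 3):
    print(review.strip())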
Code example #44
File: Chapter2_1.py Project: uzihs/PyNLP
import nltk
import pprint

print "****** gutenberg"
from nltk.corpus import gutenberg
print gutenberg.fileids()
print "raw: ", len(gutenberg.raw())
print "words: ", len(gutenberg.words())
print "sents: ", len(gutenberg.sents())

print "****** webtext"
from nltk.corpus import webtext
print len(webtext.raw('firefox.txt'))

print "****** nps_chat"
from nltk.corpus import nps_chat
print nps_chat.fileids()
cr=nps_chat.posts('10-19-20s_706posts.xml')
print cr

print "****** brown"
from nltk.corpus import brown
nt=brown.words(categories='news')
print nt

from nltk.corpus import reuters
from nltk.corpus import inaugural

print [w for w in nltk.corpus.udhr.fileids() if 'heb' in w.lower()]
 
print nltk.corpus.brown.readme()