Example no. 1
def word_freq(category, lowercase = False):
	word_counter = defaultdict(int)
	total_word_count = 0

	user_pattern = re.compile(r'U\d')

	file_list = young_and_old[category]

	for file_name in file_list:
		chat_log = nps_chat.posts(file_name)
		for post in chat_log:
			if post == ['JOIN'] or post == ['PART'] or post[0] == ':':
				continue
			if post[0] == '.':
				if post[1] == 'ACTION':
					word_counter['.action'] += 1
					post = post[2:]
				else:
					continue
			for word in post:
				if re.match(user_pattern, word):
					continue
				if lowercase:
					word = word.lower()
				# if word == '#14-19teens':
				# 	print post
				word_counter[word] += 1
				total_word_count += 1

	word_freq = {}

	for key, value in word_counter.items():
		word_freq[key] = 1.0*value / total_word_count
	return (word_freq, total_word_count)
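The function above relies on a module-level young_and_old dict that maps a category label to a list of nps_chat fileids (not shown in this example). A minimal usage sketch, assuming the age tag in each filename is used to build that mapping; the grouping and the 'lol' lookup below are only illustrations:

import re
from collections import defaultdict
from nltk.corpus import nps_chat

# Hypothetical grouping of the chat logs by the age tag in the filename.
young_and_old = {
    'young': [f for f in nps_chat.fileids() if 'teens' in f],
    'old': [f for f in nps_chat.fileids() if 'adults' in f],
}

freqs, total = word_freq('young', lowercase=True)
print(total, freqs.get('lol', 0.0))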
def getData(corpus="brown", categories=""):
    if corpus == "brown":
        if categories != "":
            return brown.tagged_sents(tagset='universal',
                                      categories=categories)

        return brown.tagged_sents(tagset='universal')
    elif corpus == "treebank":
        return treebank.tagged_sents(tagset='universal')
    elif corpus == "nps_chat":
        #Dialogue dataset
        data = []
        posts = nps_chat.posts()
        words = nps_chat.tagged_words(tagset='universal')

        index = 0
        for sent in posts:
            data.append(words[index:index + len(sent)])
            index += len(sent)
        return data

    elif corpus == "conll2000":
        return conll2000.tagged_sents(tagset='universal')

    return brown.tagged_sents(tagset='universal')
Example no. 3
def most_common_precedents(target_word, category, num_precedents = 3):
	precedents = Counter()

	file_list = young_and_old[category]
	user_pattern = re.compile(r'u\d')

	for file_name in file_list:
		chat_log = nps_chat.posts(file_name)
		for post in chat_log:
			if post == ['JOIN'] or post == ['PART'] or post[0] == ':':
				continue
			if post[0] == '.':
				if post[1] == 'ACTION':
					previous_word = '.action'
					post = post[2:]
				else:
					continue
			else:
				previous_word = '<s>'
			for word in post:
				word = word.lower()
				if word == target_word:
					if re.match(user_pattern, previous_word):
						precedents['user_name'] += 1
					else:
						precedents[previous_word] += 1
				previous_word = word
	return precedents.most_common(num_precedents)
Example no. 4
def fun4():
    import nltk
    from nltk.corpus import nps_chat
    nltk.download('nps_chat')
    chatroom = nps_chat.posts('10-19-20s_706posts.xml')
    print(chatroom[123])
    for i in nps_chat.fileids():
        print(i)
Example no. 5
def word_freq(category, lowercase=False):
    word_counter = defaultdict(int)
    total_word_count = 0

    user_pattern = re.compile(r'U\d')

    file_list = young_and_old[category]

    for file_name in file_list:
        chat_log = nps_chat.posts(file_name)
        for post in chat_log:
            if post == ['JOIN'] or post == ['PART'] or post[0] == ':':
                continue
            if post[0] == '.':
                if post[1] == 'ACTION':
                    word_counter['.action'] += 1
                    post = post[2:]
                else:
                    continue
            for word in post:
                if re.match(user_pattern, word):
                    continue
                if lowercase:
                    word = word.lower()
                # if word == '#14-19teens':
                # 	print post
                word_counter[word] += 1
                total_word_count += 1

    word_freq = {}

    for key, value in word_counter.items():
        word_freq[key] = 1.0 * value / total_word_count
    return (word_freq, total_word_count)
Example no. 6
def tag_it(train, test, regex_pattern, print_errors=False):
    """
    Use the tagger hierarchy (backoff) approach shown in the lecture.
    I tried some variations and different orders, e.g. the regex tagger first,
    but the order below gave the best results.
    :param train:
    :param test:
    :param regex_pattern:
    :param print_errors:
    :return:
    """

    default_tagger = nltk.DefaultTagger('NOUN')
    regex_tagger = nltk.tag.RegexpTagger(regex_pattern, backoff=default_tagger)
    unigram_tagger = nltk.UnigramTagger(train, backoff=regex_tagger)
    bigram_tagger = nltk.BigramTagger(train, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(train, backoff=bigram_tagger)

    print(trigram_tagger.evaluate(test))

    # print wrongly classified values
    if print_errors:
        sents = nps_chat.posts()
        untagged = trigram_tagger.tag_sents(sents[((len(sents) * 9) // 10):])
        cfd = nltk.ConditionalFreqDist((word, tag)
                                       for idx1, sent in enumerate(test)
                                       for idx2, (word, tag) in enumerate(sent)
                                       if tag != untagged[idx1][idx2][1])

        for k, v in cfd.items():
            for key, item in v.items():
                print(k, key, item)
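A minimal usage sketch for tag_it, assuming training and test data come from nps_chat.tagged_posts with the universal tagset (the same 90/10 split the print_errors branch expects); the regex suffix rules below are only an illustration:

import nltk
from nltk.corpus import nps_chat

# Hypothetical suffix rules for the universal tagset; the catch-all NOUN case
# is already covered by the DefaultTagger backoff inside tag_it.
pattern = [
    (r'.*ing$', 'VERB'),                # gerunds
    (r'.*ed$', 'VERB'),                 # simple past
    (r'.*ly$', 'ADV'),                  # adverbs
    (r'^-?[0-9]+(\.[0-9]+)?$', 'NUM'),  # numbers
]

tagged_posts = nps_chat.tagged_posts(tagset='universal')
cut = (len(tagged_posts) * 9) // 10
tag_it(tagged_posts[:cut], tagged_posts[cut:], pattern)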
Example no. 7
def most_common_precedents(target_word, category, num_precedents=3):
    precedents = Counter()

    file_list = young_and_old[category]
    user_pattern = re.compile(r'u\d')

    for file_name in file_list:
        chat_log = nps_chat.posts(file_name)
        for post in chat_log:
            if post == ['JOIN'] or post == ['PART'] or post[0] == ':':
                continue
            if post[0] == '.':
                if post[1] == 'ACTION':
                    previous_word = '.action'
                    post = post[2:]
                else:
                    continue
            else:
                previous_word = '<s>'
            for word in post:
                word = word.lower()
                if word == target_word:
                    if re.match(user_pattern, previous_word):
                        precedents['user_name'] += 1
                    else:
                        precedents[previous_word] += 1
                previous_word = word
    return precedents.most_common(num_precedents)
Example no. 8
def readNPSChat(filePathOut):
    """
    Reads from the corpus of text developed by the US government
    for online monitoring, available through the nltk library
    Parameters:
        filePathOut - the location to write output
    """
    from nltk.corpus import nps_chat
    chatroom = nps_chat.posts()
    wordList = []
    for l in chatroom[1:4256]:
        wordList += l
    for l in chatroom[4259:]:
        wordList += l
    # entries 4257 and 4258 contain invalid characters
    messages = [" ".join(wordList)]
    parseEtymologies(messages, filePathOut)
Example no. 9
from nltk.corpus import nps_chat as nps

# The full NPS_CHAT corpus can be found at https://catalog.ldc.upenn.edu/LDC2010T05,
# but it is a paid service.

caminho = 'C:\\Users\\theone\\Documents\\FATEC\\PROJETO TG1\\PJ_FINAL\\'
i = 0
for fid in nps.fileids():
    print('CREATING FILE: ' + 'arqNPS_CHAT-' + str(i) + '--' + fid[:-4] + '.txt\n')
    arq = open(caminho + 'arqNPS_CHAT-' + str(i) + '--' + fid[:-4] + '.txt', 'w')
    for post in nps.posts(fid):
        line = ' '.join(post).rstrip()
        if 'ACTION' in line or 'JOIN' in line or 'PART' in line:
            continue
        arq.write(line + '\n')
    arq.close()
    i+=1
    
Example no. 10
def get_dataset(low_it=False):
    '''
    Removes most non-natural messages from the nps_chat dataset and returns it.
    Also replaces user IDs with names.
    '''
    nameList = list(names.words())

    posts = list(nps_chat.posts())

    #removing the START, JOIN, PART messages
    while 1:
        try:
            posts.remove(['PART'])

        except Exception as e:
            pass

        try:
            posts.remove(['JOIN'])

        except Exception as e:
            pass

        try:
            posts.remove(['START'])

        except Exception as e:
            pass

        if ['PART'] not in posts and ['START'] not in posts and ['JOIN'] not in posts:
            break

    #normalization

    re_pat = re.compile('^[.][ ]ACTION[ ]')
    re_pat2 = re.compile('^[.][ ]wz')
    #re_pat3 = re.compile('^\d+ [/] [a-m] [a-zA-Z!@#$%^&*()_\\/\'";:<>,.?`~]+')
    re_pat4 = re.compile('^[.] [3-9] |^[1-2][0-9] [/] [a-m]')
    re_pat5 = re.compile(
        r'^[!] \w+|^UnScramble|^U\d+ [(] U\d+|^[:] U\d+|^[<]empty[>]')
    re_pat6 = re.compile('^[.] Question |^[.] Scorpio |^[.] Rooster ')
    re_pat7 = re.compile(r'U\d+')

    for index, i in enumerate(posts):
        to_search = ' '.join(i)
        result = re_pat.search(to_search)

        if result != None:
            temp = i[2:-1]
            temp.insert(0, '*')
            temp.append('*')
            posts[index] = temp
            #print (posts[index])

    print('beep')

    for index, i in enumerate(posts):
        to_search = ' '.join(i)

        if re_pat2.search(to_search) != None:
            #print (to_search)
            del posts[index]
            del posts[index + 1]
            del posts[index + 2]

    print('bop')

    for index, i in enumerate(posts):
        to_search = ' '.join(i)

        if re_pat4.search(to_search) != None:
            #print (to_search)
            del posts[index]

    print('boop')

    for index, i in enumerate(posts):
        to_search = ' '.join(i)

        if re_pat5.search(to_search) != None:
            #print (to_search)
            #print (' '.join(posts[index+1]))
            del posts[index]

    for index, i in enumerate(posts):
        to_search = ' '.join(i)

        if re_pat6.search(to_search) != None:
            #print (to_search)
            #print (' '.join(posts[index+1]))
            posts[index] = i[3:]
            #print (posts[index])

    for index, i in enumerate(posts):
        to_search = ' '.join(i)

        if re_pat7.search(to_search) != None:
            for index2, j in enumerate(i):
                if re_pat7.search(j) != None:
                    posts[index][index2] = 'Human'  #random.choice(nameList)

    out = untokenize(posts)
    re_pat8 = re.compile(
        '(?<=^[hH][Ii])[ ,.a-zA-Z0-9]+|(?<=^[Hh][eE][lL][lL][oO])[ a-zA-Z0-9]+'
    )

    for index, i in enumerate(out):
        out[index] = re_pat8.sub('', i)

    nick_pat = re.compile('^NICK[:] Human')

    for index, i in enumerate(out):
        if nick_pat.search(i) != None:
            del out[index]

    if low_it:
        for index, i in enumerate(out):
            out[index] = i.lower()

    print('done')

    return out
Example no. 11
print(macbeth_sentences)
print(macbeth_sentences[1116])
longest_len = max(len(s) for s in macbeth_sentences)
longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]
print(' '.join(longest_sent[0]))
print(longest_sent)

# 1.2. Web text and chat text
from nltk.corpus import webtext

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat

chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom)
print(' '.join(chatroom[123]))

for fileid in nps_chat.fileids():
    print(fileid, ' '.join(nps_chat.posts(fileid)[123]))

# 1.3. The Brown corpus: used to study systematic differences between genres (also known as stylistics)
from nltk.corpus import brown

print(brown.categories())
brown_news_words = brown.words(categories='news')
print(brown_news_words)
brown_cg22_words = brown.words(fileids='cg22')
brown_sents = brown.sents(categories=['news', 'editorial', 'reviews'])
print(brown_sents)
Example no. 12
def fun05():
    """fun05"""
    chatroom = nps_chat.posts("10-19-20s_706posts.xml")
    print(chatroom[123])
Example no. 13
import nltk
import numpy as np

# nltk.download('nps_chat')
from nltk import bigrams
from nltk.corpus import webtext
fx = webtext.raw(webtext.fileids()[0])
from nltk.corpus import nps_chat
chat = nps_chat.posts(nps_chat.fileids()[0])
print(len(chat))

fx = fx.replace("\r", "")
fxline = fx.splitlines()
fxcorpus = []
for line in fxline:
    fxcorpus.append(line.split(" "))


def generate_co_occurrence_matrix(corpus):
    vocab = set(corpus)
    vocab = list(vocab)
    vocab_index = {word: i for i, word in enumerate(vocab)}

    # Create bigrams from all words in corpus
    bi_grams = list(bigrams(corpus))

    # Frequency distribution of bigrams ((word1, word2), num_occurrences)
    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))

    # Initialise co-occurrence matrix
    # co_occurrence_matrix[current][previous]
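    # (Hypothetical continuation; the original snippet is truncated here. A
    # typical finish fills a square numpy matrix from the bigram counts and
    # returns it together with the word-to-index mapping.)
    co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))
    for (previous, current), count in bigram_freq:
        co_occurrence_matrix[vocab_index[current]][vocab_index[previous]] = count
    return co_occurrence_matrix, vocab_index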
Example no. 14
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat

print('WEBTEXT___')
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:20])

print('NPS_CHAT___')
for post in nps_chat.posts():
    print(post)
Example no. 15
#********************************************************************************************************
#                                        Web and Chat Text
#********************************************************************************************************

'''
Web Texts
'''
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65])
'''
Chats
'''
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]


#********************************************************************************************************
#                                        Brown Corpus
#********************************************************************************************************

from nltk.corpus import brown

# Display the categories it has.
brown.categories()

# We can choose a specific genre and extract a list of words.
brown.words(categories='news')
brown.words(fileids=['cg22'])
]
corp_words_untagged = [
    brown.words(),
    nps_chat.words(),
    conll2000.words(),
    treebank.words()
]
corp_sents_tagged = [
    brown.tagged_sents(tagset=CONST_tagset),
    nps_chat.tagged_posts(tagset=CONST_tagset),
    conll2000.tagged_sents(tagset=CONST_tagset),
    treebank.tagged_sents(tagset=CONST_tagset)
]
corp_sents_untagged = [
    brown.sents(),
    nps_chat.posts(),
    conll2000.sents(),
    treebank.sents()
]

# language tool spell checker
lt_check = language_check.LanguageTool('en-US')

# pyenchant spell checker
# pe_check = enchant.Dict('en_US')

universal_tagset = [
    '.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT',
    'VERB', 'X'
]
Example no. 17
def index(request):
    posts = nps_chat.posts()
    return render(request,'index.html', {
        'posts': posts,
    })
Example no. 18
def nps_chat_example():
    from nltk.corpus import nps_chat
    chatroom = nps_chat.posts('10-19-20s_706posts.xml')
    print(chatroom[123])
#
#
#

import nltk
import random
from nltk.corpus import nps_chat
from nltk.corpus import stopwords
import pickle

stop_words = set(stopwords.words('english'))
stop_words.remove('no')
stop_words.add('...')

xml_posts_0 = nps_chat.xml_posts()
posts_0 = nps_chat.posts()

categorized_posts = []
index = 0

# Categorize 'Accept' and 'Non-accept' posts
for el in xml_posts_0:
    if el.attrib.get('class') == 'yAnswer':
        categorized_posts.append((posts_0[index], 'Yes'))
    elif el.attrib.get('class') == 'nAnswer':
        categorized_posts.append((posts_0[index], 'No'))
    index += 1

all_words = []
for (post, category) in categorized_posts:
    for word in post:
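        # (Hypothetical continuation; the original snippet is truncated here.
        # A typical next step keeps the non-stopword tokens and trains a
        # Naive Bayes yes/no classifier on bag-of-words features.)
        word = word.lower()
        if word not in stop_words:
            all_words.append(word)

word_features = [w for w, _ in nltk.FreqDist(all_words).most_common(2000)]

def post_features(post):
    words = set(w.lower() for w in post)
    return {'contains({})'.format(w): (w in words) for w in word_features}

featuresets = [(post_features(post), category)
               for (post, category) in categorized_posts]
random.shuffle(featuresets)
cut = len(featuresets) // 2
train_set, test_set = featuresets[cut:], featuresets[:cut]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))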
from nltk.corpus import brown, nps_chat
import nltk

# Initialize all training and test data
tokens_brown = brown.sents()
tokens_nps_chat = nps_chat.posts()
tagged_sents_brown = brown.tagged_sents()
tagged_posts_nps_chat = nps_chat.tagged_posts()

size_brown_09 = int(len(tagged_sents_brown) * 0.9)
size_brown_05 = int(len(tagged_sents_brown) * 0.5)
size_nps_chat_09 = int(len(tagged_posts_nps_chat) * 0.9)
size_nps_chat_05 = int(len(tagged_posts_nps_chat) * 0.5)
train_sents_brown_09 = tagged_sents_brown[:size_brown_09]
test_sents_brown_09 = tagged_sents_brown[size_brown_09:]
train_sents_brown_05 = tagged_sents_brown[:size_brown_05]
test_sents_brown_05 = tagged_sents_brown[size_brown_05:]

train_posts_nps_chat_09 = tagged_posts_nps_chat[:size_nps_chat_09]
test_posts_nps_chat_09 = tagged_posts_nps_chat[size_nps_chat_09:]
train_posts_nps_chat_05 = tagged_posts_nps_chat[:size_nps_chat_05]
test_posts_nps_chat_05 = tagged_posts_nps_chat[size_nps_chat_05:]

# Task a)
print("Task a)")
tags_brown = [tag for word, tag in brown.tagged_words()]
tags_nps_chat = [tag for word, tag in nps_chat.tagged_words()]

# Find most common tags
max_brown = nltk.FreqDist(tags_brown).max()  # NN
max_nps_chat = nltk.FreqDist(tags_nps_chat).max()  # UH
Example no. 21
def chat():
    chatroom = nps_chat.posts('10-19-20s_706posts.xml')
    chatroom[123]
Example no. 22
    print(fileid, webtext.raw(fileid)[:65], "...")
"""
output 
firefox.txt Cookie Manager: "Don't allow sites that set removed cookies to se ...
grail.txt SCENE 1: [wind] [clop clop clop] 
KING ARTHUR: Whoa there!  [clop ...
overheard.txt White guy: So, do you have any plans for this evening?
Asian girl ...
pirates.txt PIRATES OF THE CARRIBEAN: DEAD MAN'S CHEST, by Ted Elliott & Terr ...
singles.txt 25 SEXY MALE, seeks attrac older single lady, for discreet encoun ...
wine.txt Lovely delicate, fragrant Rhone wine. Polished leather and strawb ...

"""
from nltk.corpus import nps_chat

chatroom = nps_chat.posts("10-19-20s_706posts.xml")
print(chatroom[123])

output = [
    "i",
    "do",
    "n't",
    "want",
    "hot",
    "pics",
    "of",
    "a",
    "female",
    ",",
    "I",
    "can",
Example no. 23
File: nlp1.py Project: cony56/TIL
textId = webtext.fileids()
textId
text = webtext.raw('pirates.txt')
print(text[:4000])

word = webtext.words('pirates.txt')

# General data from the internet - chat

from nltk.corpus import nps_chat

textId = nps_chat.fileids()
print(textId)

text = nps_chat.raw(textId[0])
chatroom = nps_chat.posts(textId[0])
len(nps_chat.posts(textId[1]))

# Brown corpus - electronic documents created at Brown University
from nltk.corpus import brown
textId = brown.fileids()
print(textId)
cat = brown.categories()
cat
news = brown.raw(categories='news')
len(news)

brown.words(fileids=['cg22'])

# We can examine the word distribution per genre
cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()
Example no. 24
[s for s in shakes_macbeth if len(s) == longest_len]

# Web and Chat Text
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')
'''
Instant chat analysis
There is also a corpus of instant messaging chat sessions, 
originally collected by the Naval Postgraduate School 
for research on automatic detection of Internet predators. 
The corpus contains over 10,000 posts, 
anonymized by replacing usernames with generic names of the form "UserNNN", 
and manually edited to remove any other identifying information. 
The corpus is organized into 15 files, 
where each file contains several hundred posts collected on a given date, 
for an age-specific chatroom (teens, 20s, 30s, 40s, plus a generic adults chatroom). 
The filename contains the date, chatroom, and number of posts; 
e.g., 10-19-20s_706posts.xml contains 706 posts gathered from the 20s chat room on 10/19/2006.

'''
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

chatroom2 = nps_chat.posts('11-09-40s_706posts.xml')
chatroom2[123]

chatroom_adult = nps_chat.posts('11-09-adults_706posts.xml')
chatroom_adult[123]
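The filename format described in the comment block above (date, age-specific chatroom, post count) can be read straight off the fileids. A small sketch, assuming every fileid follows the mm-dd-group_NNNposts.xml convention:

from nltk.corpus import nps_chat

for fileid in nps_chat.fileids():
    month, day, rest = fileid.split('-', 2)    # e.g. '10', '19', '20s_706posts.xml'
    room, count = rest.split('_', 1)           # e.g. '20s', '706posts.xml'
    n_posts = int(count.replace('posts.xml', ''))
    print(month, day, room, n_posts, len(nps_chat.posts(fileid)))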
Example no. 25
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat

#emma = gutenberg.words('austen-emma.txt')
#print(len(set(w.lower() for w in emma)))

for file in webtext.fileids():
    print(file, webtext.raw(file)[:65])

chatroom = nps_chat.posts()
Example no. 26
import nltk
from nltk.corpus import nps_chat
nps_chat.fileids()
chatroom1 = nps_chat.fileids()[1]
chatroom1 = nps_chat.posts(chatroom1)
chatwords_list = []
for w in chatroom1:
    chatwords_list.append(' '.join(w))
chatwords = ' '.join(chatwords_list)
#tokenization
chat_token = nltk.word_tokenize(chatwords)
print(chat_token)
#lower & alpha
lower_chat = [w.lower() for w in chat_token]
#alpha_chat = [w for w in lower_chat if w.isalpha()]
#stop words
stopwords = nltk.corpus.stopwords.words('english')
stopped_chat = [m for m in lower_chat if m not in stopwords]
#Frequency Table
from nltk import FreqDist
fdist = FreqDist(lower_chat)
print("Top 50 words in NPS-chat corpus [1]:")
topkeys = fdist.most_common(50)
for p in topkeys:
    print(p)
#Bigram Frequency
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(lower_chat)
scored = finder.score_ngrams(bigram_measures.raw_freq)
print("Top 50 biagram in NPS-chat corpus [1]:")
print('gutenberg:\n', gutenberg.fileids())
print('webtext:\n', webtext.fileids())
print('nps_chat:\n', nps_chat.fileids())
print('brown:\n', brown.fileids())
print('reuters:\n', reuters.fileids())

# Printing the categories of each module
# NOTE: gutenberg, webtext and nps_chat do not have "categories"
print('Printing the categories for each module, if available...\n')
print('brown:\n', brown.categories())
print('reuters:\n', reuters.categories())

# Accessing the corpora
# NOTE: TXT files can be accessed through "raw" to get the full files
print('Accessing the sample files...')
print('gutenberg:\n', gutenberg.raw("austen-emma.txt"))

# Accessing sentences of a sample file
print('Getting a list of sentences...')
print('List of sentences from austen-emma.txt:\n', gutenberg.sents("austen-emma.txt"))
print('List of sentences from a chat:\n', nps_chat.posts("10-19-20s_706posts.xml"))

# Example going through each post from a chat
posts = nps_chat.posts("10-19-20s_706posts.xml")

for post in posts:
    print('- ', post)

# As we can see, 'posts' is a list of all the posts in the file 10-19-20s_706posts.xml.
# Each entry from that list is also a list of words for each post.
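A short illustration of that nested structure, using the same file:

from nltk.corpus import nps_chat

posts = nps_chat.posts("10-19-20s_706posts.xml")
print(len(posts))                   # number of posts in the file
print(posts[123])                   # one post: a list of word tokens
print(' '.join(posts[123]))         # the same post joined back into a string
print(max(len(p) for p in posts))   # token length of the longest post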
Example no. 28
File: ch02.py Project: gree2/hobby
def fun05():
    """fun05"""
    chatroom = nps_chat.posts("10-19-20s_706posts.xml")
    print(chatroom[123])
Example no. 29
longest_len = max([len(s) for s in macbeth_sentences])
longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]
print("longest_sent= ", longest_sent)

# 2.1.2. Web text and chat text
# Web text
from nltk.corpus import webtext

for field in webtext.fileids():
    print(field, webtext.raw(field)[:65], '...')

# Chat text
from nltk.corpus import nps_chat

for field in nps_chat.fileids():
    print(field, nps_chat.posts(field)[:12])

chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print("chatroom[123]= ", chatroom[123])

# 1.3. The Brown corpus: used to study systematic differences between genres (also known as stylistics)
from nltk.corpus import brown

show_subtitle("Use categories to distinguish the texts")
print("brown.categories() =", brown.categories())
print("brown.words(categories='news')= ", brown.words(categories='news'))
print("brown.words(categories=['news', 'editorial', 'reviews'])= ",
      brown.words(categories=['news', 'editorial', 'reviews']))
print("brown.sents(categories=['news', 'editorial', 'reviews'])= ",
      brown.sents(categories=['news', 'editorial', 'reviews']))
Example no. 30
gutenberg.fileids()

for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print(int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid)

# Sentence segmentation
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
longest_len = max([len(s) for s in macbeth_sentences])
# Web chat corpus
from nltk.corpus import webtext
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]
from nltk.corpus import brown
brown.categories()
brown.sents(categories=['news', 'editorial', 'reviews'])
news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m])


cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
# Richer linguistic content is available from some corpora, 
# such as part-of-speech tags, dialogue tags, syntactic trees, and so forth

macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
macbeth_sentences
macbeth_sentences[1116]
longest_len = max(len(s) for s in macbeth_sentences)
[s for s in macbeth_sentences if len(s) == longest_len]

# NLTK's small collection of web text
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')
    
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')  # @UndefinedVariable
chatroom[123]

# Brown Corpus: the sources have been categorized by genre, such as news, editorial, and so on.
# a complete list, see http://icame.uib.no/brown/bcm-los.html
from nltk.corpus import brown
brown.categories()
# access the corpus as a list of words
brown.words(categories='news')
brown.words(fileids=['cg22'])
# OR a list of sentences(where each sentence is itself just a list of words)
brown.sents(categories=['news', 'editorial', 'reviews'])
# The Brown Corpus is a convenient resource for studying systematic differences between genres, a kind of linguistic inquiry known as stylistics. 
# compare genres in their usage of modal verbs
# 1. produce the counts for a particular genre
import nltk
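The comparison described in the last two comments can be completed with a conditional frequency distribution. A minimal sketch of the modal-verb-by-genre tabulation (the genre names are taken from brown.categories()):

import nltk
from nltk.corpus import brown

modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd = nltk.ConditionalFreqDist(
    (genre, word.lower())
    for genre in brown.categories()
    for word in brown.words(categories=genre))
# tabulate the modal counts for a few genres side by side
cfd.tabulate(conditions=['news', 'religion', 'hobbies', 'science_fiction'], samples=modals)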
Example no. 32
def print_chatroom():
    from nltk.corpus import nps_chat
    chatroom = nps_chat.posts('10-19-20s_706posts.xml')
    print(chatroom[123])
Example no. 33
print("\n112th Macbeth sentence: ", macbeth_sentences[111])
print("\nNumber of sentences in Macbeth: ", len(macbeth_sentences))
longest_length = max(len(s) for s in macbeth_sentences)
longest_sentence = [
    sentence for sentence in macbeth_sentences
    if len(sentence) == longest_length
]
print("\nLength of longest sentence in Macbeth: ", longest_length)
print("\nLongest sentence in Macbeth: ", longest_sentence)

from nltk.corpus import webtext  # Notes: Web and Chat Corpus. Gutenberg contains formal literature; it is important to consider less formal language as well.
for fileids in webtext.fileids():  # Notes: From the web and chat corpus, print each fileid with its first 65 characters
    print(fileids, webtext.raw(fileids)[:65], "\n")
from nltk.corpus import nps_chat  # Notes: Instant messaging chat session corpus. Contains over 10,000 'posts'
chatroom = nps_chat.posts('10-19-20s_706posts.xml')  # Notes: format = mm-dd-age_numberofposts.xml
print(chatroom[123])

# Notes: Brown corpus and its categories
# Notes: Reuters corpus
# Notes: Inaugural Address Corpus
# Notes: Annotated Text Corpora
# Notes: Corpora in other languages
# Notes: Loading your own corpus

print(
    "\n--- 2.2 Conditional Frequency Distributions ---\n"
)  # Notes: A conditional frequency distribution lets us maintain a separate frequency distribution for each category. This can be used to study systematic differences between the categories, e.g. the female vs. male trend in the 'names' corpus.
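A minimal sketch of the names-corpus example mentioned in that note, counting the last letter of each name per gender file (assuming the standard 'male.txt' / 'female.txt' fileids):

import nltk
from nltk.corpus import names

cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1].lower())
    for fileid in names.fileids()   # 'female.txt', 'male.txt'
    for name in names.words(fileid))
cfd.tabulate()  # cfd.plot() draws the same comparison as a chart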

text = [
    'The', 'Fulton', 'County', 'Grand', 'Jury', 'said'