Example #1
import nltk
from nltk.corpus import nps_chat as nps  # `nps` is assumed to be the NPS Chat corpus

def conditional_freq_distribution():
    # Condition on the target word; events are the first 10 characters of the file ID.
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:10])
        for fileid in nps.fileids()
        for word in nps.words(fileid)
        for target in ['sexy', 'guy']
        if word.lower().startswith(target))
    cfd.plot()
Example #2
def fun4():
    import nltk
    from nltk.corpus import nps_chat
    nltk.download('nps_chat')
    chatroom = nps_chat.posts('10-19-20s_706posts.xml')
    print(chatroom[123])
    for i in nps_chat.fileids():
        print(i)
Example #3
def calculate_confidence_index():
    # Count yes/no-question posts ('ynQuestion' dialogue-act class) per chat file.
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:10])
        for fileid in nps.fileids()
        for post in nps.xml_posts(fileid)
        for target in ['ynQuestion']
        if post.get('class') == 'ynQuestion')
    cfd.plot()

    # TODO: only compute the index when both flagCount and timeElapsed are non-zero;
    # otherwise fall back to a default value.
    print("Printing confidence index as a function "
          "of flagCount and timeElapsed")
Example #4
def calculate_flags(flagList):
    # TODO: using a list of flags to be determined,
    # iterate through posts to find instances of any flags
    tokens = nltk.word_tokenize(flagList)
    cfd = nltk.ConditionalFreqDist(
        (target, fileid[:10])
        for fileid in nps.fileids()
        for word in nps.words(fileid)
        for target in tokens
        if word.lower().startswith(target))
    print("printing flagList " + str(tokens))
    print("cfd values: " + str(cfd.keys()))

    # .tabulate() raises "max() arg is an empty sequence" when nothing matched,
    # so only tabulate if at least one flag was found.
    if len(cfd) > 0:
        cfd.tabulate(cumulative=True)
Example #5
def webtext():
    from nltk.corpus import webtext
    from nltk.corpus import nps_chat

    # list comprehension version
    file_ids = [fileid for fileid in webtext.fileids()]
    chat_file_ids = [fileid for fileid in nps_chat.fileids()]

    pirates = webtext.raw('pirates.txt')
    pirates_words = len(webtext.words('pirates.txt'))
    pirates_sents = len(webtext.sents('pirates.txt'))
    uniqs = len(set([w.lower() for w in webtext.words('pirates.txt')]))

    lexical_diversity = lexical_div(uniqs, pirates_words)  # lexical_div is presumably defined elsewhere in this project

    # import nltk.book as book
    # text1 = book.text1
    # pirates = webtext.raw('pirates.txt')

    return render_template('webtext.html',
                           file_ids=file_ids,
                           chat_file_ids=chat_file_ids,
                           pirates=pirates)
Example #6
import nltk
from nltk.corpus import nps_chat
nps_chat.fileids()
chatroom1 = nps_chat.fileids()[1]
chatroom1 = nps_chat.posts(chatroom1)
chatwords_list = []
for w in chatroom1:
    chatwords_list.append(' '.join(w))
chatwords = ' '.join(chatwords_list)
#tokenization
chat_token = nltk.word_tokenize(chatwords)
print(chat_token)
#lower & alpha
lower_chat = [w.lower() for w in chat_token]
#alpha_chat = [w for w in lower_chat if w.isalpha()]
#stop words
stopwords = nltk.corpus.stopwords.words('english')
stopped_chat = [m for m in lower_chat if m not in stopwords]
#Frequency Table (note: built from lower_chat, before stop-word removal)
from nltk import FreqDist
fdist = FreqDist(lower_chat)
print("Top 50 words in NPS-chat corpus [1]:")
topkeys = fdist.most_common(50)
for p in topkeys:
    print(p)
#Bigram Frequency
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(lower_chat)
scored = finder.score_ngrams(bigram_measures.raw_freq)
print("Top 50 biagram in NPS-chat corpus [1]:")
Example #7
	return precedents.most_common(num_precedents)

def show_most_common_context(freq_diff_counter, first_category, second_category):
	print()
	print("The following words are used more by " + first_category + " than by " + second_category)
	for item in freq_diff_counter:
		word = item[0]
		count = item[1]
		first_precedents = most_common_precedents(word, first_category)
		second_precedents = most_common_precedents(word, second_category)
		print()
		print(first_category + " uses the word '" + word + "' " + str(count) + " more than " + second_category)
		print(first_category + " uses the word with the following words most " + str(first_precedents))
		print(second_category + " uses the word with the following words most " + str(second_precedents))

fileids_list = nps_chat.fileids()
fid = {"20s":[], "30s":[], "40s":[],"adu":[],"tee":[]}

for f_id in fileids_list:
	tag = f_id[6:9]
	fid[tag].append(f_id)

young_and_old = {"young": fid["tee"]+fid["20s"] , "old": fid["30s"]+fid["40s"]+fid["adu"]}


(young_word_freq, young_words) = word_freq("young", True)
(old_word_freq, old_words) = word_freq("old", True)

old_more_than_young = freq_diff(old_word_freq, young_word_freq)

show_most_common_context(old_more_than_young, 'old', 'young')
Example #8
import nltk
import numpy as np

# nltk.download('nps_chat')
from nltk import bigrams
from nltk.corpus import webtext
fx = webtext.raw(webtext.fileids()[0])
from nltk.corpus import nps_chat
chat = nps_chat.posts(nps_chat.fileids()[0])
print(len(chat))

fx = fx.replace("\r", "")
fxline = fx.splitlines()
fxcorpus = []
for line in fxline:
    fxcorpus.append(line.split(" "))


def generate_co_occurrence_matrix(corpus):
    vocab = set(corpus)
    vocab = list(vocab)
    vocab_index = {word: i for i, word in enumerate(vocab)}

    # Create bigrams from all words in corpus
    bi_grams = list(bigrams(corpus))

    # Frequency distribution of bigrams ((word1, word2), num_occurrences)
    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))

    # Initialise co-occurrence matrix
    # co_occurrence_matrix[current][previous]
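    # The original example is cut off here; a minimal completion, assuming the matrix
    # is meant to hold bigram counts with rows indexed by the current word and columns
    # by the previous word, as the comment above suggests.
    co_occurrence_matrix = np.zeros((len(vocab), len(vocab)))
    for (previous, current), count in bigram_freq:
        pos_current = vocab_index[current]
        pos_previous = vocab_index[previous]
        co_occurrence_matrix[pos_current][pos_previous] = count
    return co_occurrence_matrix, vocab_index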
Example #9
def exercise_nps_chat():
    # Print the chat-room file names; each name encodes the date, the age group,
    # and the number of posts it contains.
    # e.g. 10-19-20s_706posts.xml holds 706 posts collected on Oct 19 from the 20s chat room.
    for file_id in nps_chat.fileids():
        print(file_id)
Example #10
    print "The following words are used more by " + first_category + " than by " + second_category
    for item in freq_diff_counter:
        word = item[0]
        count = item[1]
        first_precedents = most_common_precedents(word, first_category)
        second_precedents = most_common_precedents(word, second_category)
        print
        print first_category + " uses the word '" + word + "' " + str(
            count) + " more than " + second_category
        print first_category + " uses the word with the following words most " + str(
            first_precedents)
        print second_category + " uses the word with the following words most " + str(
            second_precedents)


fileids_list = nps_chat.fileids()
fid = {"20s": [], "30s": [], "40s": [], "adu": [], "tee": []}

for f_id in fileids_list:
    tag = f_id[6:9]
    fid[tag].append(f_id)

young_and_old = {
    "young": fid["tee"] + fid["20s"],
    "old": fid["30s"] + fid["40s"] + fid["adu"]
}

(young_word_freq, young_words) = word_freq("young", True)
(old_word_freq, old_words) = word_freq("old", True)

old_more_than_young = freq_diff(old_word_freq, young_word_freq)
Example #11
#!/usr/bin/python3
# coding: utf-8
from nltk.corpus import nps_chat
##################################################################
## Quick look
print(type(
    nps_chat))  # <class 'nltk.corpus.reader.nps_chat.NPSChatCorpusReader'>
print(len(nps_chat.fileids()))  # 15
print(
    nps_chat.fileids()
)  # ['10-19-20s_706posts.xml', '10-19-30s_705posts.xml', '10-19-40s_686posts.xml', '10-19-adults_706posts.xml', '10-24-40s_706posts.xml', '10-26-teens_706posts.xml', '11-06-adults_706posts.xml', '11-08-20s_705posts.xml', '11-08-40s_706posts.xml', '11-08-adults_705posts.xml', '11-08-teens_706posts.xml', '11-09-20s_706posts.xml', '11-09-40s_706posts.xml', '11-09-adults_706posts.xml', '11-09-teens_706posts.xml']
print(len(nps_chat.words('10-19-20s_706posts.xml')))  # 2829
print(
    nps_chat.words('10-19-20s_706posts.xml')[:10]
)  # ['now', 'im', 'left', 'with', 'this', 'gay', 'name', ':P', 'PART', 'hey']
##################################################################
## posts()
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(
    chatroom[123]
)  # ['i', 'do', "n't", 'want', 'hot', 'pics', 'of', 'a', 'female', ',', 'I', 'can', 'look', 'in', 'a', 'mirror', '.']
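##################################################################
## xml_posts() -- a small addition beyond the original snippet: each XML post
## carries its dialogue-act label in the 'class' attribute (the same attribute
## that Example #3 above filters on), along with the post text.
posts = nps_chat.xml_posts('10-19-20s_706posts.xml')
print(posts[123].get('class'))  # dialogue-act class of post 123, e.g. 'Statement'
print(posts[123].text)  # the post's raw text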
Example #12
import nltk
from nltk.corpus import webtext

# Read the document word by word.
word = webtext.words('firefox.txt')
print(word)
print("number of words = ", len(word))

# Read the document sentence by sentence.
sentence = webtext.sents('firefox.txt')
for i in range(5):
    print(sentence[i])
print("number of sentences = ", len(sentence))

# NPS Chat data
# http://faculty.nps.edu/cmartell/npschat.htm
from nltk.corpus import nps_chat
nltk.download('nps_chat')

# List the file IDs in the chat corpus.
textId = nps_chat.fileids()
print(textId)

# Retrieve the raw text document of a specific chat session.
text = nps_chat.raw('10-19-20s_706posts.xml')
print(text[:2000])
print("number of characters = ", len(text))

# Read the post data from the XML.
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
for chat in chatroom[:20]:
    print(chat)

Example #13
def exercise_nps_chat():
    # Print the chat-room file names; each name encodes the date, the age group,
    # and the number of posts it contains.
    # e.g. 10-19-20s_706posts.xml holds 706 posts collected on Oct 19 from the 20s chat room.
    for file_id in nps_chat.fileids():
        print(file_id)
Example #14
import pprint
import nltk

print("****** gutenberg")
from nltk.corpus import gutenberg
print(gutenberg.fileids())
print("raw: ", len(gutenberg.raw()))
print("words: ", len(gutenberg.words()))
print("sents: ", len(gutenberg.sents()))

print("****** webtext")
from nltk.corpus import webtext
print(len(webtext.raw('firefox.txt')))

print("****** nps_chat")
from nltk.corpus import nps_chat
print(nps_chat.fileids())
cr = nps_chat.posts('10-19-20s_706posts.xml')
print(cr)

print("****** brown")
from nltk.corpus import brown
nt = brown.words(categories='news')
print(nt)

from nltk.corpus import reuters
from nltk.corpus import inaugural

print([w for w in nltk.corpus.udhr.fileids() if 'heb' in w.lower()])

print(nltk.corpus.brown.readme())
print(nltk.corpus.brown.words()[1:10])
Example #15
    def __init__(self):
        self.number_id = 29
        self.source_id = "nps_chat"
        self.titles = [name for name in nps_chat.fileids()]
        self.data = [nps_chat.raw(name) for name in self.titles]
Example #16
print(' '.join(longest_sent[0]))
print(longest_sent)

# 1.2. Web text and chat text
from nltk.corpus import webtext

for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat

chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom)
print(' '.join(chatroom[123]))

for fileid in nps_chat.fileids():
    print(fileid, ' '.join(nps_chat.posts(fileid)[123]))

# 1.3. The Brown corpus: used to study systematic differences between genres (also called stylistics)
from nltk.corpus import brown

print(brown.categories())
brown_news_words = brown.words(categories='news')
print(brown_news_words)
brown_cg22_words = brown.words(fileids='cg22')
brown_sents = brown.sents(categories=['news', 'editorial', 'reviews'])
print(brown_sents)

fdist = nltk.FreqDist([w.lower() for w in brown_news_words])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
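    # The original snippet is cut off here; a minimal completion printing each
    # modal's frequency, following the usual NLTK book pattern.
    print(m + ':', fdist[m], end=' ')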
Example #17
10-19-40s_686posts.xml
10-19-adults_706posts.xml
10-24-40s_706posts.xml
10-26-teens_706posts.xml
11-06-adults_706posts.xml
11-08-20s_705posts.xml
11-08-40s_706posts.xml
11-08-adults_705posts.xml
11-08-teens_706posts.xml
11-09-20s_706posts.xml
11-09-40s_706posts.xml
11-09-adults_706posts.xml
11-09-teens_706posts.xml
'''


# putting all tagged posts from the nps_chat corpus into one list
nps_chat_tagged = list()

for fileid in nps_chat.fileids():
    print(fileid)
    for post in nps_chat.tagged_posts(fileid):
        nps_chat_tagged.append(post)
    print(len(nps_chat_tagged))


print(nps_chat_tagged[0])
# tags can be retrieved in the same way as the Brown corpus

    
Example #18
from nltk.corpus import nps_chat as nps

# NPS_CHAT can be found in: https://catalog.ldc.upenn.edu/LDC2010T05
# but it is a paid service, boo hoo

caminho = 'C:\\Users\\theone\\Documents\\FATEC\\PROJETO TG1\\PJ_FINAL\\'
i = 0
for fid in nps.fileids():
    print('CREATING FILE: ' + 'arqNPS_CHAT-' + str(i) + '--' + fid[:-4] + '.txt\n')
    # open in write mode so the file starts empty on each run
    arq = open(caminho + 'arqNPS_CHAT-' + str(i) + '--' + fid[:-4] + '.txt', 'w')
    for post in nps.posts(fid):
        line = ' '.join(post).rstrip()
        if 'ACTION' in line or 'JOIN' in line or 'PART' in line:
            continue
        arq.write(line + '\n')
    arq.close()
    i+=1
    
#---------------------------------
##CHAPTER 2:Accessing Text Corpora
import nltk
#print(nltk.corpus.gutenberg.fileids()) #prints filenames for nltk.gutenberg
emma = nltk.corpus.gutenberg.words('austen-emma.txt') #select text
#print(len(emma))
emma = nltk.Text(emma)  #to use previous functions as with nltk.book txts
emma.concordance('surprise')  # concordance() prints its matches and returns None
print(' '.join(emma[20:50])) #LIST to STRING - comes out as text

#examples of corpus available in nltk
from nltk.corpus import webtext  #less formal text
print(webtext.fileids())  #filenames

from nltk.corpus import nps_chat #predators
print(nps_chat.fileids()) 

from nltk.corpus import brown #brown uni various texts
print(brown.fileids())

from nltk.corpus import reuters
print(reuters.fileids())

from nltk.corpus import inaugural
print(inaugural.fileids())
#page 72 for a variety of corpus functionality commands



##SPACY SECTION - DataCamp course code collection, starting with 'Feature Engineering for NLP'
import spacy 
# Importing modules with datasets within nltk.corpus
from nltk.corpus import gutenberg
from nltk.corpus import webtext
from nltk.corpus import nps_chat
from nltk.corpus import brown
from nltk.corpus import reuters

# Printing the list of all dataset names in each module
print('Printing the file IDs for each module...\n')
print('gutenberg:\n', gutenberg.fileids())
print('webtext:\n', webtext.fileids())
print('nps_chat:\n', nps_chat.fileids())
print('brown:\n', brown.fileids())
print('reuters:\n', reuters.fileids())

# Printing the categories of each module
# NOTE: gutenberg, webtext and nps_chat do not have "categories"
print('Printing the categories for each module, if available...\n')
print('brown:\n', brown.categories())
print('reuters:\n', reuters.categories())

# Accessing the corpora
# NOTE: TXT files can be accessed through "raw" to get the full files
print('Accessing the sample files...')
print('gutenberg:\n', gutenberg.raw("austen-emma.txt"))

# Accessing sentences of a sample file
print('Getting a list of sentences...')
print('List of sentences from austen-emma.txt:\n', gutenberg.sents("austen-emma.txt"))
print('List of posts from a chat session:\n', nps_chat.posts("10-19-20s_706posts.xml"))

def make_sessions():
    fileids = nps_chat.fileids()  # @UndefinedVariable
    for fileid in fileids:
        yield nps_data.fromxml(nps.get_session(fileid), fileid)