def word_freq(category, lowercase = False):
    """Compute relative word frequencies for one chat-room category.

    Every post of every file listed under ``young_and_old[category]`` is
    read through ``nps_chat.posts``. Skipped entirely: empty posts,
    ``['JOIN']`` / ``['PART']`` notices, posts whose first token is ':' and
    posts whose first token is '.' unless the second token is 'ACTION'.
    For ``. ACTION`` posts the two leading tokens are dropped and the
    pseudo token ``'.action'`` is counted once instead.

    :param category: key used to look up the file list in ``young_and_old``.
    :param lowercase: when true, lower-case each token before counting.
    :return: ``(frequencies, total_word_count)``; ``frequencies`` maps each
        counted token to ``count / total_word_count``. ``'.action'`` is
        counted but does not contribute to ``total_word_count``. When no
        regular word was counted the result is ``({}, 0)``.
    """
    word_counter = defaultdict(int)
    total_word_count = 0
    # Tokens that start with "U" followed by a digit are user names.
    user_pattern = re.compile(r'U\d')
    for file_name in young_and_old[category]:
        for post in nps_chat.posts(file_name):
            if not post:
                # Guard: the index checks below would fail on an empty post.
                continue
            if post == ['JOIN'] or post == ['PART'] or post[0] == ':':
                continue
            if post[0] == '.':
                # A lone '.' has no second token; treat it like any other
                # non-ACTION '.' post and skip it.
                if len(post) < 2 or post[1] != 'ACTION':
                    continue
                word_counter['.action'] += 1
                post = post[2:]
            for word in post:
                if user_pattern.match(word):
                    continue
                if lowercase:
                    word = word.lower()
                word_counter[word] += 1
                total_word_count += 1
    if total_word_count == 0:
        # Only '.action' markers (or nothing) were seen: no ratio exists.
        return ({}, 0)
    # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
    frequencies = dict(
        (key, 1.0 * value / total_word_count)
        for key, value in word_counter.items())
    return (frequencies, total_word_count)
def getData(corpus="brown", categories=""):
    """Return universal-tagset tagged sentences for the requested corpus.

    ``corpus`` may be "brown", "treebank", "nps_chat" or "conll2000"; any
    other value falls back to the full Brown corpus. ``categories`` is only
    honoured for "brown" and only when it is not the empty string.

    For "nps_chat" the result is a list with one slice of the tagged word
    stream per post, cut at the post lengths reported by ``nps_chat.posts()``.
    """
    if corpus == "treebank":
        return treebank.tagged_sents(tagset='universal')

    if corpus == "conll2000":
        return conll2000.tagged_sents(tagset='universal')

    if corpus == "nps_chat":
        # Dialogue dataset: rebuild per-post groups from the flat word list.
        all_posts = nps_chat.posts()
        tagged_stream = nps_chat.tagged_words(tagset='universal')
        grouped = []
        start = 0
        for one_post in all_posts:
            stop = start + len(one_post)
            grouped.append(tagged_stream[start:stop])
            start = stop
        return grouped

    if corpus == "brown" and categories != "":
        return brown.tagged_sents(tagset='universal', categories=categories)

    # "brown" without categories, and every unrecognised corpus name.
    return brown.tagged_sents(tagset='universal')
def most_common_precedents(target_word, category, num_precedents = 3):
    """Return the tokens that most often directly precede ``target_word``.

    Posts are read from every file listed under ``young_and_old[category]``.
    Empty posts, ``['JOIN']`` / ``['PART']`` notices, posts starting with
    ':' and '.' posts that are not ``. ACTION`` posts are skipped. All
    tokens are lower-cased before comparison, so ``target_word`` is
    lower-cased as well.

    The token "before" the first word of a post is ``'<s>'``, or
    ``'.action'`` for an ACTION post. Preceding tokens that start with
    ``u<digit>`` are pooled under the key ``'user_name'``.

    :param target_word: word whose predecessors are counted (case-insensitive).
    :param category: key used to look up the file list in ``young_and_old``.
    :param num_precedents: how many of the most common predecessors to return.
    :return: list of ``(preceding_token, count)`` pairs from
        ``Counter.most_common``.
    """
    # Every corpus token is lower-cased below; without this a mixed-case
    # target could never match anything.
    target_word = target_word.lower()
    precedents = Counter()
    user_pattern = re.compile(r'u\d')
    for file_name in young_and_old[category]:
        for post in nps_chat.posts(file_name):
            if not post:
                # Guard: the index checks below would fail on an empty post.
                continue
            if post == ['JOIN'] or post == ['PART'] or post[0] == ':':
                continue
            if post[0] == '.':
                # A lone '.' has no second token; skip it like any other
                # non-ACTION '.' post.
                if len(post) < 2 or post[1] != 'ACTION':
                    continue
                previous_word = '.action'
                post = post[2:]
            else:
                previous_word = '<s>'
            for word in post:
                word = word.lower()
                if word == target_word:
                    if user_pattern.match(previous_word):
                        precedents['user_name'] += 1
                    else:
                        precedents[previous_word] += 1
                previous_word = word
    return precedents.most_common(num_precedents)
def fun4():
    """Download the NPS Chat corpus, print one sample post, then list all file ids."""
    from nltk.corpus import nps_chat
    nltk.download('nps_chat')
    chatroom = nps_chat.posts('10-19-20s_706posts.xml')
    # print(...) with a single argument behaves the same on Python 2 and 3;
    # the former print statement was a SyntaxError on Python 3.
    print(chatroom[123])
    for i in nps_chat.fileids():
        print(i)
def word_freq(category, lowercase=False):
    """Compute relative word frequencies for one chat-room category.

    Reads every post of every file listed under ``young_and_old[category]``
    via ``nps_chat.posts``. Empty posts, ``['JOIN']`` / ``['PART']`` notices,
    posts starting with ':' and '.' posts whose second token is not
    'ACTION' are ignored. ``. ACTION`` posts lose their two leading tokens
    and add one count to the pseudo token ``'.action'``.

    :param category: key used to look up the file list in ``young_and_old``.
    :param lowercase: when true, lower-case each token before counting.
    :return: ``(frequencies, total_word_count)``; ``frequencies`` maps each
        counted token to ``count / total_word_count``. ``'.action'`` is
        counted but not included in ``total_word_count``. ``({}, 0)`` is
        returned when no regular word was counted.
    """
    word_counter = defaultdict(int)
    total_word_count = 0
    # Tokens that start with "U" followed by a digit are user names.
    user_pattern = re.compile(r'U\d')

    for file_name in young_and_old[category]:
        for post in nps_chat.posts(file_name):
            # An empty post has no first token to inspect.
            if not post:
                continue
            if post == ['JOIN'] or post == ['PART'] or post[0] == ':':
                continue
            if post[0] == '.':
                is_action = len(post) > 1 and post[1] == 'ACTION'
                if not is_action:
                    continue
                word_counter['.action'] += 1
                post = post[2:]
            for word in post:
                if user_pattern.match(word):
                    continue
                if lowercase:
                    word = word.lower()
                word_counter[word] += 1
                total_word_count += 1

    # Avoid dividing by zero when only '.action' markers were counted.
    if total_word_count == 0:
        return ({}, 0)

    frequencies = {}
    # items() is available on Python 2 and 3; iteritems() is Python 2 only.
    for key, value in word_counter.items():
        frequencies[key] = 1.0 * value / total_word_count
    return (frequencies, total_word_count)
def tag_it(train, test, regex_pattern, print_errors=False):
    """
    Train a backoff tagger chain and print its score on ``test``.

    The chain is trigram -> bigram -> unigram -> regexp -> default ('NOUN');
    each n-gram tagger is trained on ``train`` and falls back to the next
    tagger in the chain.

    :param train: tagged sentences used to train the n-gram taggers
    :param test: tagged sentences used for evaluation and the error report
    :param regex_pattern: list of (regex, tag) pairs for the RegexpTagger
    :param print_errors: when true, print one line per (word, gold tag)
        pair that was tagged differently, with its number of occurrences
    :return: None; all results are printed
    """
    default_tagger = nltk.DefaultTagger('NOUN')
    regex_tagger = nltk.tag.RegexpTagger(regex_pattern, backoff=default_tagger)
    unigram_tagger = nltk.UnigramTagger(train, backoff=regex_tagger)
    bigram_tagger = nltk.BigramTagger(train, backoff=unigram_tagger)
    trigram_tagger = nltk.TrigramTagger(train, backoff=bigram_tagger)
    print(trigram_tagger.evaluate(test))

    # print wrongly classified values
    if print_errors:
        # Re-tag exactly the sentences in `test` (tags stripped) so the
        # predictions always line up with the gold tags, whatever corpus or
        # split the caller used.
        sents = [[word for word, _ in sent] for sent in test]
        predicted = trigram_tagger.tag_sents(sents)
        cfd = nltk.ConditionalFreqDist(
            (word, tag)
            for gold_sent, predicted_sent in zip(test, predicted)
            for (word, tag), (_, predicted_tag) in zip(gold_sent, predicted_sent)
            if tag != predicted_tag)
        for k, v in cfd.items():
            for key, item in v.items():
                print(k, key, item)
def most_common_precedents(target_word, category, num_precedents=3):
    """Return the tokens that most often directly precede ``target_word``.

    Posts come from every file listed under ``young_and_old[category]``.
    Empty posts, ``['JOIN']`` / ``['PART']`` notices, posts starting with
    ':' and '.' posts that are not ``. ACTION`` posts are skipped. Tokens
    are compared in lower case, and ``target_word`` is lower-cased to match.

    The predecessor of the first word in a post is ``'<s>'`` (``'.action'``
    for an ACTION post). Predecessors that start with ``u<digit>`` are
    pooled under ``'user_name'``.

    :param target_word: word whose predecessors are counted (case-insensitive).
    :param category: key used to look up the file list in ``young_and_old``.
    :param num_precedents: number of most common predecessors to return.
    :return: list of ``(preceding_token, count)`` pairs from
        ``Counter.most_common``.
    """
    # Corpus tokens are lower-cased below, so the target must be too;
    # otherwise a capitalised target silently yields no matches.
    wanted = target_word.lower()
    precedents = Counter()
    user_pattern = re.compile(r'u\d')

    for file_name in young_and_old[category]:
        for post in nps_chat.posts(file_name):
            # An empty post has no first token to inspect.
            if not post:
                continue
            if post == ['JOIN'] or post == ['PART'] or post[0] == ':':
                continue
            if post[0] == '.':
                is_action = len(post) > 1 and post[1] == 'ACTION'
                if not is_action:
                    continue
                previous_word = '.action'
                post = post[2:]
            else:
                previous_word = '<s>'
            for word in post:
                word = word.lower()
                if word == wanted:
                    if user_pattern.match(previous_word):
                        precedents['user_name'] += 1
                    else:
                        precedents[previous_word] += 1
                previous_word = word

    return precedents.most_common(num_precedents)
def readNPSChat(filePathOut):
    """
    Flattens the NPS Chat corpus (available through the nltk library) into
    one space-joined message and hands it to parseEtymologies.

    Parameters:
        filePathOut - the location to write output; passed on unchanged
    """
    from nltk.corpus import nps_chat

    chatroom = nps_chat.posts()
    # Posts 0 and 4256-4258 are not included. The original author's note:
    # members 4257 and 4258 contain invalid characters.
    kept_spans = (chatroom[1:4256], chatroom[4259:])
    wordList = [token
                for span in kept_spans
                for post in span
                for token in post]
    parseEtymologies([" ".join(wordList)], filePathOut)
from nltk.corpus import nps_chat as nps

# NPS_CHAT can be found in: https://catalog.ldc.upenn.edu/LDC2010T05
# but is a charged service - buaa buaa buaa

# Output directory; one text file per corpus file id is written there.
caminho = 'C:\\Users\\theone\\Documents\\FATEC\\PROJETO TG1\\PJ_FINAL\\'

i = 0
for fid in nps.fileids():
    # fid[:-4] drops the 4-character extension from the file id.
    nome_arquivo = 'arqNPS_CHAT-' + str(i) + '--' + fid[:-4] + '.txt'
    print('CREATING FILE: ' + nome_arquivo + '\n')
    # Open with 'w' so every run starts from an empty file. Opening with 'a'
    # and calling truncate() cuts at the current position, which on Python 3
    # is the end of the file, so reruns appended duplicate lines.
    # The with-block also closes the file if a write fails.
    # NOTE(review): no explicit encoding is passed; the platform default is used.
    with open(caminho + nome_arquivo, 'w') as arq:
        for post in nps.posts(fid):
            line = ' '.join(post).rstrip()
            # Drop every line containing one of these upper-case markers.
            if 'ACTION' in line or 'JOIN' in line or 'PART' in line:
                continue
            arq.write(line + '\n')
    i += 1
def get_dataset(low_it=False):
    '''
    Return the nps_chat posts as cleaned-up strings.

    Steps, in order:
      1. drop the ['PART'], ['JOIN'] and ['START'] service posts;
      2. rewrite ". ACTION ..." posts as "* ... *" (the last token of the
         original post is dropped as well);
      3. delete posts around ". wz" matches (see the review note below);
      4. drop every post matching re_pat4 or re_pat5;
      5. strip the first three tokens of posts matching re_pat6;
      6. replace every token containing a user id (U<digits>) with 'Human';
      7. untokenize, remove the text following a leading "hi"/"hello",
         and drop strings starting with "NICK: Human".

    :param low_it: when true, lower-case every returned string.
    :return: list of strings produced by ``untokenize`` after clean-up.
    '''
    # One filtering pass replaces the repeated list.remove() loop, which was
    # quadratic and hid every exception behind `except Exception: pass`.
    service_posts = (['PART'], ['JOIN'], ['START'])
    posts = [post for post in nps_chat.posts() if post not in service_posts]

    # normalization (raw strings keep the same patterns without relying on
    # unrecognised escape sequences such as '\d' in a plain string)
    re_pat = re.compile(r'^[.][ ]ACTION[ ]')
    re_pat2 = re.compile(r'^[.][ ]wz')
    re_pat4 = re.compile(r'^[.] [3-9] |^[1-2][0-9] [/] [a-m]')
    re_pat5 = re.compile(
        r'^[!] \w+|^UnScramble|^U\d+ [(] U\d+|^[:] U\d+|^[<]empty[>]')
    re_pat6 = re.compile(r'^[.] Question |^[.] Scorpio |^[.] Rooster ')
    re_pat7 = re.compile(r'U\d+')

    for index, post in enumerate(posts):
        if re_pat.search(' '.join(post)) is not None:
            # Drop ". ACTION" and the final token, wrap the rest in '*'.
            posts[index] = ['*'] + post[2:-1] + ['*']
    print('beep')

    # NOTE(review): this loop is kept exactly as written. It deletes while
    # enumerating, so after the first `del` the following indices have
    # shifted (the posts removed are the match and the ones originally two
    # and four places after it), the post after the match is never examined,
    # and `del` can raise IndexError near the end of the list. The intended
    # set of posts to drop is not clear from the code - confirm before
    # changing.
    for index, post in enumerate(posts):
        if re_pat2.search(' '.join(post)) is not None:
            del posts[index]
            del posts[index + 1]
            del posts[index + 2]
    print('bop')

    # Filtering into a new list removes *every* match; deleting inside
    # enumerate() skipped the post following each deleted one.
    posts = [post for post in posts
             if re_pat4.search(' '.join(post)) is None]
    print('boop')
    posts = [post for post in posts
             if re_pat5.search(' '.join(post)) is None]

    for index, post in enumerate(posts):
        if re_pat6.search(' '.join(post)) is not None:
            posts[index] = post[3:]

    for post in posts:
        for position, token in enumerate(post):
            if re_pat7.search(token) is not None:
                post[position] = 'Human'

    out = untokenize(posts)

    # Removes the run of characters that follows a leading "hi" or "hello".
    re_pat8 = re.compile(
        r'(?<=^[hH][Ii])[ ,.a-zA-Z0-9]+|(?<=^[Hh][eE][lL][lL][oO])[ a-zA-Z0-9]+'
    )
    nick_pat = re.compile(r'^NICK[:] Human')
    out = [re_pat8.sub('', text) for text in out]
    # Same skip-on-delete fix as above: drop every nick-change line.
    out = [text for text in out if nick_pat.search(text) is None]

    if low_it:
        out = [text.lower() for text in out]
    print('done')
    return out
# Show the sentence list, one sample sentence, and the longest sentence(s).
print(macbeth_sentences)
print(macbeth_sentences[1116])
longest_len = max(len(s) for s in macbeth_sentences)
longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]
print(' '.join(longest_sent[0]))
print(longest_sent)

# 1.2. Web text and chat text
from nltk.corpus import webtext
# Print each file id followed by the first 65 characters of its raw text.
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print(chatroom)
print(' '.join(chatroom[123]))
# Print post number 123 of every chat file.
for fileid in nps_chat.fileids():
    print(fileid, ' '.join(nps_chat.posts(fileid)[123]))

# 1.3. Brown corpus: used to study systematic differences between genres
# (also known as stylistics)
from nltk.corpus import brown
print(brown.categories())
# Words can be selected by category ...
brown_news_words = brown.words(categories='news')
print(brown_news_words)
# ... or by file id.
brown_cg22_words = brown.words(fileids='cg22')
brown_sents = brown.sents(categories=['news', 'editorial', 'reviews'])
print(brown_sents)
def fun05():
    """Print post 123 of the '10-19-20s_706posts.xml' NPS Chat file."""
    chatroom = nps_chat.posts("10-19-20s_706posts.xml")
    # print(...) with one argument gives the same output on Python 2 and 3;
    # the former print statement was a SyntaxError on Python 3.
    print(chatroom[123])
import nltk import numpy as np # nltk.download('nps_chat') from nltk import bigrams from nltk.corpus import webtext fx = webtext.raw(webtext.fileids()[0]) from nltk.corpus import nps_chat chat = nps_chat.posts(nps_chat.fileids()[0]) print(len(chat)) fx = fx.replace("\r", "") fxline = fx.splitlines() fxcorpus = [] for line in fxline: fxcorpus.append(line.split(" ")) def generate_co_occurrence_matrix(corpus): vocab = set(corpus) vocab = list(vocab) vocab_index = {word: i for i, word in enumerate(vocab)} # Create bigrams from all words in corpus bi_grams = list(bigrams(corpus)) # Frequency distribution of bigrams ((word1, word2), num_occurrences) bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams)) # Initialise co-occurrence matrix # co_occurrence_matrix[current][previous]
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat

# Web text: print each file id with the first 20 characters of its raw text.
print('WEBTEXT___')
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:20])

# Chat text: print every post of the whole corpus, one post per line.
print('NPS_CHAT___')
for post in nps_chat.posts():
    print(post)
#********************************************************************************************************
# Web and Chat Text
#********************************************************************************************************

''' Web Texts '''
from nltk.corpus import webtext
for fileid in webtext.fileids():
    # Formatting into one string prints "<fileid> <text>" on both Python 2
    # and Python 3 (the old print statement was Python 2 only).
    print('%s %s' % (fileid, webtext.raw(fileid)[:65]))

''' Chats '''
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

#********************************************************************************************************
# Brown Corpus
#********************************************************************************************************
from nltk.corpus import brown

# Display the categories that it has.
brown.categories()

# We can choose a specific genre and extract a list of words
brown.words(categories='news')
# 'cg22' is a file id, not a category, so it is selected with fileids=.
brown.words(fileids='cg22')
] corp_words_untagged = [ brown.words(), nps_chat.words(), conll2000.words(), treebank.words() ] corp_sents_tagged = [ brown.tagged_sents(tagset=CONST_tagset), nps_chat.tagged_posts(tagset=CONST_tagset), conll2000.tagged_sents(tagset=CONST_tagset), treebank.tagged_sents(tagset=CONST_tagset) ] corp_sents_untagged = [ brown.sents(), nps_chat.posts(), conll2000.sents(), treebank.sents() ] # language tool spell checker lt_check = language_check.LanguageTool('en-US') # pyenchant spell checker # pe_check = enchant.Dict('en_US') universal_tagset = [ '.', 'ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRON', 'PRT', 'VERB', 'X' ]
def index(request):
    """Render 'index.html' with all NPS Chat posts available as ``posts``."""
    context = {'posts': nps_chat.posts()}
    return render(request, 'index.html', context)
def nps_chat_example():
    """Print post 123 of the '10-19-20s_706posts.xml' NPS Chat file."""
    from nltk.corpus import nps_chat

    sample_posts = nps_chat.posts('10-19-20s_706posts.xml')
    sample = sample_posts[123]
    print(sample)
# # # import nltk import random from nltk.corpus import nps_chat from nltk.corpus import stopwords import pickle stop_words = set(stopwords.words('english')) stop_words.remove('no') stop_words.add('...') xml_posts_0 = nps_chat.xml_posts() posts_0 = nps_chat.posts() categorized_posts = [] index = 0 # Categorize 'Accept' and 'Non-accept' posts for el in xml_posts_0: if el.attrib.get('class') == 'yAnswer': categorized_posts.append((posts_0[index], 'Yes')) elif el.attrib.get('class') == 'nAnswer': categorized_posts.append((posts_0[index], 'No')) index += 1 all_words = [] for (post, category) in categorized_posts: for word in post:
from nltk.corpus import brown, nps_chat
import nltk

# Initialize all training and test data
tokens_brown = brown.sents()
tokens_nps_chat = nps_chat.posts()
tagged_sents_brown = brown.tagged_sents()
tagged_posts_nps_chat = nps_chat.tagged_posts()

# Split points at 90% and 50% of each tagged corpus.
size_brown_09 = int(len(tagged_sents_brown) * 0.9)
size_brown_05 = int(len(tagged_sents_brown) * 0.5)
size_nps_chat_09 = int(len(tagged_posts_nps_chat) * 0.9)
size_nps_chat_05 = int(len(tagged_posts_nps_chat) * 0.5)

# Everything before a split point is training data, the rest is test data.
train_sents_brown_09 = tagged_sents_brown[:size_brown_09]
test_sents_brown_09 = tagged_sents_brown[size_brown_09:]
train_sents_brown_05 = tagged_sents_brown[:size_brown_05]
test_sents_brown_05 = tagged_sents_brown[size_brown_05:]
train_posts_nps_chat_09 = tagged_posts_nps_chat[:size_nps_chat_09]
test_posts_nps_chat_09 = tagged_posts_nps_chat[size_nps_chat_09:]
train_posts_nps_chat_05 = tagged_posts_nps_chat[:size_nps_chat_05]
test_posts_nps_chat_05 = tagged_posts_nps_chat[size_nps_chat_05:]

# Task a)
print("Task a)")
# Keep only the tag of every (word, tag) pair.
tags_brown = [tag for word, tag in brown.tagged_words()]
tags_nps_chat = [tag for word, tag in nps_chat.tagged_words()]

# Find most common tags
max_brown = nltk.FreqDist(tags_brown).max()  # NN
max_nps_chat = nltk.FreqDist(tags_nps_chat).max()  # UH
def chat():
    """Return post 123 of the '10-19-20s_706posts.xml' NPS Chat file.

    The post was previously looked up and then discarded, so the function
    always returned None; callers that ignore the result are unaffected.
    """
    chatroom = nps_chat.posts('10-19-20s_706posts.xml')
    return chatroom[123]
print(fileid, webtext.raw(fileid)[:65], "...") """ output firefox.txt Cookie Manager: "Don't allow sites that set removed cookies to se ... grail.txt SCENE 1: [wind] [clop clop clop] KING ARTHUR: Whoa there! [clop ... overheard.txt White guy: So, do you have any plans for this evening? Asian girl ... pirates.txt PIRATES OF THE CARRIBEAN: DEAD MAN'S CHEST, by Ted Elliott & Terr ... singles.txt 25 SEXY MALE, seeks attrac older single lady, for discreet encoun ... wine.txt Lovely delicate, fragrant Rhone wine. Polished leather and strawb ... """ from nltk.corpus import nps_chat chatroom = nps_chat.posts("10-19-20s_706posts.xml") print(chatroom[123]) output = [ "i", "do", "n't", "want", "hot", "pics", "of", "a", "female", ",", "I", "can",
textId = webtext.fileids() textId text = webtext.raw('pirates.txt') print(text[:4000]) word = webtext.words('pirates.txt') #인터넷의 일반 데이터-채팅 from nltk.corpus import nps_chat textId = nps_chat.fileids() print(textId) text = nps_chat.raw(textId[0]) chatroom = nps_chat.posts(textId[0]) len(nps_chat.posts(textId[1])) # 브라운 코퍼스 - 브라운 대학교에서 만든 전자문서 from nltk.corpus import brown textId = brown.fileids() print(textId) cat = brown.categories() cat news = brown.raw(categories='news') len(news) brown.words(fileids=['cg22']) #장르의 단어분포를 확인할 수 있음 cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories()
# Longest sentence(s) of Macbeth.
[s for s in shakes_macbeth if len(s) == longest_len]

# Web and Chat Text
from nltk.corpus import webtext
for fileid in webtext.fileids():
    print(fileid, webtext.raw(fileid)[:65], '...')

''' Instant chat analysis
There is also a corpus of instant messaging chat sessions, originally collected
by the Naval Postgraduate School for research on automatic detection of Internet
predators. The corpus contains over 10,000 posts, anonymized by replacing
usernames with generic names of the form "UserNNN", and manually edited to remove
any other identifying information. The corpus is organized into 15 files, where
each file contains several hundred posts collected on a given date, for an
age-specific chatroom (teens, 20s, 30s, 40s, plus a generic adults chatroom).
The filename contains the date, chatroom, and number of posts; e.g.,
10-19-20s_706posts.xml contains 706 posts gathered from the 20s chat room on
10/19/2006. '''
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]
chatroom2 = nps_chat.posts('11-09-40s_706posts.xml')
# Look at the room that was just loaded; this line used to index `chatroom`
# again (copy-paste slip), so the 40s room was never inspected.
chatroom2[123]
chatroom_adult = nps_chat.posts('11-09-adults_706posts.xml')
chatroom_adult[123]
import nltk
from nltk.corpus import webtext
from nltk.corpus import nps_chat

#emma = gutenberg.words('austen-emma.txt')
#print(len(set(w.lower() for w in emma)))

# Print each web-text file id with the first 65 characters of its content.
for file in webtext.fileids():
    # The slice belongs on the raw text. `webtext.raw(file[:65])` sliced the
    # file id instead, so the complete text of every file was printed.
    print(file, webtext.raw(file)[:65])

chatroom = nps_chat.posts()
import nltk
from nltk.corpus import nps_chat

nps_chat.fileids()
# Take the second corpus file and load its posts (the name is reused:
# first the file id, then the posts of that file).
chatroom1 = nps_chat.fileids()[1]
chatroom1 = nps_chat.posts(chatroom1)

# Join every post into a string, then all posts into one text.
chatwords_list = []
for w in chatroom1:
    chatwords_list.append(' '.join(w))
chatwords = ' '.join(chatwords_list)

#tokenization
chat_token = nltk.word_tokenize(chatwords)
print(chat_token)

#lower & alpha
lower_chat = [w.lower() for w in chat_token]
#alpha_chat = [w for w in lower_chat if w.isalpha()]

#stop words
stopwords = nltk.corpus.stopwords.words('english')
# NOTE(review): stopped_chat is built here but the frequency table and the
# bigram finder below both use lower_chat - confirm which one is intended.
stopped_chat = [m for m in lower_chat if m not in stopwords]

#Frequency Table
from nltk import FreqDist
fdist = FreqDist(lower_chat)
print("Top 50 words in NPS-chat corpus [1]:")
topkeys = fdist.most_common(50)
for p in topkeys:
    print(p)

#Bigram Frequency
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(lower_chat)
# Score every bigram by its raw frequency.
scored = finder.score_ngrams(bigram_measures.raw_freq)
print("Top 50 biagram in NPS-chat corpus [1]:")
# Printing the file ids of each corpus module
print('gutenberg:\n', gutenberg.fileids())
print('webtext:\n', webtext.fileids())
print('nps_chat:\n', nps_chat.fileids())
print('brown:\n', brown.fileids())
print('reuters:\n', reuters.fileids())

# Printing the categories of each module
# NOTE: gutenberg, webtext and nps_chat do not have "categories"
print('Printing the categories for each module, if available...\n')
print('brown:\n', brown.categories())
print('reuters:\n', reuters.categories())

# Accessing the corpora
# NOTE: TXT files can be accessed through "raw" to get the full files
print('Accessing the sample files...')
print('gutenberg:\n', gutenberg.raw("austen-emma.txt"))

# Accessing sentences of a sample file
print('Getting a list of sentences...')
print('List of sentences from austen-emma.txt:\n', gutenberg.sents("austen-emma.txt"))
print('List of sentences from a chat:\n', nps_chat.posts("10-19-20s_706posts.xml"))

# Example going through each post from a chat
posts = nps_chat.posts("10-19-20s_706posts.xml")
for post in posts:
    print('- ', post)

# As we can see, 'posts' is a list of all the posts in the file 10-19-20s_706posts.xml.
# Each entry from that list is also a list of words for each post.
# Longest sentence(s) of Macbeth, by number of tokens.
longest_len = max([len(s) for s in macbeth_sentences])
longest_sent = [s for s in macbeth_sentences if len(s) == longest_len]
print("longest_sent= ", longest_sent)

# 2.1.2. Web text and chat text
# Web text
from nltk.corpus import webtext
# Print each file id with the first 65 characters of its raw text.
for field in webtext.fileids():
    print(field, webtext.raw(field)[:65], '...')

# Chat text
from nltk.corpus import nps_chat
# Print each file id with its first 12 posts.
for field in nps_chat.fileids():
    print(field, nps_chat.posts(field)[:12])
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print("chatroom[123]= ", chatroom[123])

# 1.3. Brown corpus: used to study systematic differences between genres
# (also known as stylistics)
from nltk.corpus import brown
show_subtitle("使用 categories 区分文本")
print("brown.categories() =", brown.categories())
print("brown.words(categories='news')= ", brown.words(categories='news'))
print("brown.words(categories=['news', 'editorial', 'reviews'])= ", brown.words(categories=['news', 'editorial', 'reviews']))
print("brown.sents(categories=['news', 'editorial', 'reviews'])= ", brown.sents(categories=['news', 'editorial', 'reviews']))
gutenberg.fileids()
# Per file: average word length, average sentence length and how often each
# vocabulary item is used on average.
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    # Floor division plus one formatted string gives the same output on
    # Python 2 and 3 (the print statement used before was Python 2 only).
    print('%d %d %d %s' % (num_chars // num_words,
                           num_words // num_sents,
                           num_words // num_vocab,
                           fileid))

# Sentence segmentation
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
longest_len = max([len(s) for s in macbeth_sentences])

# Web chat corpus
from nltk.corpus import webtext
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

from nltk.corpus import brown
brown.categories()
brown.sents(categories=['news', 'editorial', 'reviews'])

# Count how often each modal verb occurs in the news category.
news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print('%s: %s' % (m, fdist[m]))

# One frequency distribution of words per genre.
cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
# Richer linguistic content is available from some corpora, # such as part-of-speech tags, dialogue tags, syntactic trees, and so forth macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt') macbeth_sentences macbeth_sentences[1116] longest_len = max(len(s) for s in macbeth_sentences) [s for s in macbeth_sentences if len(s) == longest_len] # NLTK's small collection of web text from nltk.corpus import webtext for fileid in webtext.fileids(): print(fileid, webtext.raw(fileid)[:65], '...') from nltk.corpus import nps_chat chatroom = nps_chat.posts('10-19-20s_706posts.xml') # @UndefinedVariable chatroom[123] # Brown Corpus: the sources have been categorized by genre, such as news, editorial, and so on. # a complete list, see http://icame.uib.no/brown/bcm-los.html from nltk.corpus import brown brown.categories() # access the corpus as a list of words brown.words(categories='news') brown.words(fileids=['cg22']) # OR a list of sentences(where each sentence is itself just a list of words) brown.sents(categories=['news', 'editorial', 'reviews']) # The Brown Corpus is a convenient resource for studying systematic differences between genres, a kind of linguistic inquiry known as stylistics. # compare genres in their usage of modal verbs # 1. produce the counts for a particular genre import nltk
def print_chatroom():
    """Print post 123 of the '10-19-20s_706posts.xml' NPS Chat file."""
    from nltk.corpus import nps_chat
    chatroom = nps_chat.posts('10-19-20s_706posts.xml')
    # print(...) with one argument gives the same output on Python 2 and 3;
    # the former print statement was a SyntaxError on Python 3.
    print(chatroom[123])
print("\n112th Macbeth sentence: ", macbeth_sentences[111]) print("\nNumber of sentences in Macbeth: ", len(macbeth_sentences)) longest_length = max(len(s) for s in macbeth_sentences) longest_sentence = [ sentence for sentence in macbeth_sentences if len(sentence) == longest_length ] print("\nLength of longest sentence in Macbeth: ", longest_length) print("\nLongest sentence in Macbeth: ", longest_sentence) from nltk.corpus import webtext # Notes: Web and Chat Corpus. Gutenberg contains formal literature, it is important to consider less formal language as well. for fileids in webtext.fileids( ): # Notes: From the web and chat corpus print the fileids with first 65 characters print(fileids, webtext.raw(fileids)[:65], "\n") from nltk.corpus import nps_chat # Notes: Instant messaging chat session corpus. Contains over 10,000 'posts' chatroom = nps_chat.posts( '10-19-20s_706posts.xml') # Notes: format = dd_mm-age_numberofposts.xml print(chatroom[123]) # Notes: Borwn corpus and its categories # Notes: Reuters corpus # Notes: Insuagral Address Corpus # Notes: Annotated Text Corpora # Notes: Corpora in other languages # Notes: Loading your own corpus print( "\n--- 2.2 Conditional Frequency Distributions ---\n" ) # Notes: Conditional Frequency distribution helps us maintain different frequency distributions for each category. This can be used to study systematic diferences between the categories. Example, female and male trend in 'names' corpus. text = [ 'The', 'Fulton', 'County', 'Grand', 'Jury', 'said'