def createNames():
    """Return every name in the NLTK names corpus, lower-cased.

    The words of all corpus files are concatenated, sorted while still in
    their original casing, and only then lower-cased, so the order of the
    result is the sort order of the original-cased strings.  No
    de-duplication is done: a name listed in several corpus files appears
    once per file.

    :return: list of lower-cased name strings
    """
    from nltk.corpus import names as nm

    collected = []
    for fileid in nm.fileids():
        collected.extend(nm.words(fileid))
    collected.sort()
    return [entry.lower() for entry in collected]
def load_data():
    """
    Load the data.

    The two file ids returned by ``names.fileids()`` are unpacked as
    (female, male) in that order.  Female names are labelled 0 and male
    names 1; every name is lower-cased.  The combined list is shuffled in
    place and its first ten entries are printed before it is returned.

    :return: list[(name, 0), (name, 1)...]
    """
    female_file, male_file = names.fileids()
    labelled_female = [(entry.lower(), 0) for entry in names.words(female_file)]
    labelled_male = [(entry.lower(), 1) for entry in names.words(male_file)]
    data_set = labelled_female + labelled_male
    random.shuffle(data_set)
    print('10 names:', data_set[:10])
    return data_set
content = [w for w in text if w.lower() not in stopwords_list] return len(content) / len(text) * 100 content_fraction(reuters.words()) #solving word puzzle puzzle_letters = nltk.FreqDist('egivrvonl') obligatory = 'r' wordlist = words.words() result = [ w for w in wordlist if len(w) >= 6 and obligatory in w and nltk.FreqDist(w) <= puzzle_letters ] print(result) #find names common to both genders print(names.fileids()) male_names = names.words('male.txt') female_names = names.words('female.txt') common_names = [w for w in male_names if w in female_names] print(common_names) #cfd against last letters for all names to check well known fact that names ending in letter a are almost always female cfd = nltk.ConditionalFreqDist((fileid, name[-1]) for fileid in names.fileids() for name in names.words(fileid)) cfd.plot() #pronouncing dictionary for speech synthesizers - corpus cmu pronoucing dictionary entries = cmudict.entries() print(len(entries)) #for entry in entries: #can also use word,pronoun format # print(entry) for word, pron in entries:
import nltk from nltk.corpus import names last_letter_cfd = nltk.ConditionalFreqDist( (fileid, name[-1]) for fileid in names.fileids() for name in names.words(fileid) ) vowels = 'aeiouy' consonants = 'bcdfghjklmnpqrstvwxz' singleProns = ['ph', 'th'] def find_letters(index, text, letters): i = index while i < len(text) and text[i] in letters: i = i + 1 return i def combindSeq(index, seq): if index >= len(seq): return [] res = '' if seq[index][0] in consonants: res = seq[index] index = index + 1 if index >= len(seq): return [res] res = res + seq[index]
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

# Word-list corpus read from a local directory, followed by the bundled
# names corpus.
wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())
print(names.fileids())
print(len(names.words('male.txt')))

# Tagged corpus reader over every file matching the ``.pos`` pattern,
# tokenized on spaces.
reader = TaggedCorpusReader(
    "C:/nltk_data/corpora/treebank/tagged",
    r'.*\.pos',
    word_tokenizer=SpaceTokenizer(),
    tagset='en-brown',
)

# Print each view of the same sample file, from plain words to tagged
# paragraphs.
sample = 'wsj_0001.pos'
for view in (reader.words, reader.tagged_words, reader.tagged_sents, reader.tagged_paras):
    print(view(sample))
print(reader.fileids())
print("\n")

# Same tagged words, requested with the 'universal' tagset.
print(reader.tagged_words(sample, tagset='universal'))
print(treebank.tagged_words())
import nltk
from nltk.corpus import names

# Pair every name's first letter with the corpus file it comes from, then
# plot the per-file letter counts.
initial_pairs = (
    (fileid, name[0])
    for fileid in names.fileids()
    for name in names.words(fileid)
)
cfd = nltk.ConditionalFreqDist(initial_pairs)
cfd.plot()
Created on Mon Jan 15 21:26:30 2018 @author: Mohnish_Devadiga """ import nltk import matplotlib import matplotlib.pyplot as plt import random from nltk.corpus import names from PIL import Image #print(names.fileids()) matplotlib.style.use("ggplot") names_cfd = nltk.ConditionalFreqDist((fileid,name[-2:])for fileid in names.fileids() for name in names.words(fileid)) ''' plt.figure(figsize=(50,10)) image = names_cfd.plot() ''' def name_features(name): return{"pair" : name[-2:]} print(name_features("katy")) name_list = ([(name, 'male') for name in names.words('male.txt')] + [(name, "female") for name in names.words('female.txt')]) print(name_list[:10]) print(name_list[-10:])
# ☼ Save some text into a file corpus.txt. Define a function load(f) that reads from the file named in its sole argument, and returns a string containing the text of the file.
# Use nltk.regexp_tokenize() to create a tokenizer that tokenizes the various kinds of punctuation in this text. Use one multi-line regular expression, with inline comments, using the verbose flag (?x).
# Use nltk.regexp_tokenize() to create a tokenizer that tokenizes the following kinds of expression: monetary amounts; dates; names of people and organizations.
# to do: monetary amounts
import nltk
from nltk.corpus import names

# loads a list full of names: one word list per corpus file
options = names.fileids()
name_options = [names.words(f) for f in options]
# flattens the list of per-file lists into a single list of names
name_options = [item for sublist in name_options for item in sublist]


def load(f):
    """Return the full text of the file at path ``f`` as a string.

    The file is opened in text mode with the platform default encoding and
    is closed before returning, even if reading fails.

    :param f: path of the file to read
    :return: the file's contents
    """
    with open(f) as handle:
        return handle.read()


def tokenize_punctuation(t):
    """Tokenize the punctuation in a text ``t``.

    The pattern is a single ``\\W``, i.e. any one non-word character, so
    whitespace characters are returned as tokens too.

    :param t: text to tokenize
    :return: list of single-character tokens matched by the pattern
    """
    # The pattern must stay multi-line: in verbose mode a '#' comment runs to
    # the end of the line and would otherwise swallow the \W.
    pattern = r'''(?x)  # set to be verbose
        \W              # searches for non-alphanumeric characters.
    '''
    return nltk.regexp_tokenize(t, pattern)
def cfd_generator():
    """Yield ``(fileid, last_letter)`` for every name in the names corpus.

    The pairs have the (condition, sample) shape that
    ``nltk.ConditionalFreqDist`` accepts.  The previous version built each
    tuple as a bare expression and discarded it, so the function produced
    nothing and returned ``None``.

    :return: generator of ``(fileid, name[-1])`` tuples
    """
    for fileid in names.fileids():
        for name in names.words(fileid):
            yield (fileid, name[-1])
#Load packages
from nltk.corpus import names
import random
from nltk import NaiveBayesClassifier
from nltk.classify import accuracy


#Feature extractor
def gender_features(word):
    """Return the feature dict for ``word``: its final character."""
    return {'last_letter': word[-1]}


# The bare expressions below produce no output when run as a script; they
# only display a value in an interactive session.
gender_features('Maria')

#Exploring female names
names.fileids()
names.words('female.txt')[:5]

#Building the classifier
# Female names first, then male names, each paired with its label.
labeled_names = []
for corpus_file, label in (('female.txt', 'female'), ('male.txt', 'male')):
    labeled_names.extend((name, label) for name in names.words(corpus_file))
labeled_names[:5]

random.shuffle(labeled_names)
labeled_names[:5]

featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]
featuresets[:5]

#Split data into training (80%) and test (20%) set
train_set_size = round(len(featuresets) * .8)
from nltk.corpus import names
from nltk import ConditionalFreqDist as CondFreqDist

# (corpus file, first letter) for every name; the second distribution uses
# the same pairs with condition and sample swapped.
gender_initial_pairs = [
    (gender, name[0])
    for gender in names.fileids()
    for name in names.words(gender)
]

g2n = CondFreqDist(gender_initial_pairs)
n2g = CondFreqDist([(initial, gender) for gender, initial in gender_initial_pairs])

g2n.plot()
n2g.plot()
# force floating point division
from __future__ import division
import nltk
import collections
from nltk.corpus import names # for exercise 8
from nltk.corpus import brown # for exercise 15
from nltk.corpus import reuters
from nltk.corpus import stopwords

## 1 - EXERCISES ##
print("----QUESTION 1----")

# 8 #
# First letter of every name, conditioned on the corpus file it comes from.
initial_pairs = (
    (fileid, name[0])
    for fileid in names.fileids()
    for name in names.words(fileid)
)
initials = nltk.ConditionalFreqDist(initial_pairs)

# male initials
male_common = initials["male.txt"].most_common()
print("Most common male first initials: " + str(male_common))

# female initials
female_common = initials["female.txt"].most_common()
print("\nMost common female first initials: " + str(female_common))

# 15 #
# I left this commented out because it takes a while to generate a
# frequency distribution for the entire Brown Corpus
##brown_freq = nltk.FreqDist(w.lower() for w in brown.words())
##brownlist = []
##for word in brown_freq:
##    if brown_freq[word] >= 3:
##        brownlist.append(word)
##print ("Number of words in Brown Corpus that occur at least three times: "
##       + str(len(brownlist)))

# 17 #
# Find the names that appear in both the female and the male list of
# nltk.corpus.names.
import nltk
from nltk.corpus import names

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(names.fileids())
male_name = names.words('male.txt')
female_name = names.words('female.txt')

# A set gives constant-time membership tests; the output keeps the order of
# the male list, exactly as the list-based scan did.
female_lookup = set(female_name)
print([w for w in male_name if w in female_lookup])

# Plot how the last letter of a name relates to the corpus file (gender) it
# comes from.  Author's observation: names ending in a, e or i are usually
# female; names ending in k, o, r, s or t are usually male.
cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1])
    for fileid in names.fileids()
    for name in names.words(fileid))
cfd.plot()
from nltk.corpus import names

# List the corpus files, then print how many names each gender file holds.
print(names.fileids())
for corpus_file in ('female.txt', 'male.txt'):
    print(len(names.words(corpus_file)))
def is_name(word):
    """Return True if ``word`` equals an entry in any names corpus file.

    The comparison is a plain ``==`` against each corpus entry, so it is
    case-sensitive.  Every call scans the corpus files in order and stops at
    the first match.

    :param word: candidate string
    :return: True on a match, otherwise False
    """
    return any(
        candidate == word
        for fileid in names.fileids()
        for candidate in names.words(fileid)
    )
for mer in syn.substance_meronyms(): print("Synset '{2}':\n\t{0}\n\nsubstance meronym '{1}':\n\t{3} ".format(syn.definition(), mer.lemma_names()[0],syn.lemma_names()[0],mer.definition())) for mer in syn.member_holonyms(): print("Synset '{2}':\n\t{0}\n\nmember holonym '{1}':\n\t{3} ".format(syn.definition(), mer.lemma_names()[0],syn.lemma_names()[0],mer.definition())) for mer in syn.part_holonyms(): print("Synset '{2}':\n\t{0}\n\npart holonym '{1}':\n\t{3} ".format(syn.definition(), mer.lemma_names()[0],syn.lemma_names()[0],mer.definition())) for mer in syn.substance_holonyms(): print("Synset '{2}':\n\t{0}\n\nsubstance holonym '{1}':\n\t{3} ".format(syn.definition(), mer.lemma_names()[0],syn.lemma_names()[0],mer.definition())) #8 from nltk.corpus import names names.fileids() tble=[(gender, first_letter) for gender in names.fileids() for first_letter in [w[0] for w in names.words(gender)]] cfd=nltk.ConditionalFreqDist(tble) cfd.plot() #12 from nltk.corpus import cmudict words=[a for a,b in cmudict.entries()] len(set(words))-len(words) i=0 m_words=words.copy() for word in list(set(words)): print(i) i=i+1 m_words.remove(word)
def gender_initials_plot():
    """Plot the distribution of first-name initials per names corpus file.

    Each name contributes one ``(fileid, name[:1])`` pair to a conditional
    frequency distribution, which is then plotted.

    :return: whatever ``ConditionalFreqDist.plot()`` returns
    """
    initial_pairs = (
        (gender, name[:1])
        for gender in names.fileids()
        for name in names.words(gender)
    )
    distribution = nltk.ConditionalFreqDist(initial_pairs)
    return distribution.plot()
# The code below prepares data for predicting gender from a list of names
# using Naive Bayes.
# The feature set is built from the last two letters of the given name.
# nltk provides lists of male and female names to train a model.
# P(A|B) = P(B|A) * P(A) / P(B)
# For example, for a female name ending in one of aeiouy the Naive Bayes
# formula looks as below:
# P(female|'[aeiouy]') = p('[aeiouy]'|female) P(female) / p('[aeiouy]')
import nltk
import random  # used for random selection in our model
from nltk.corpus import names  # importing male and female list from nltk
# `matplotlib` itself must be imported because matplotlib.style is used
# below; the pyplot import was previously misspelled.
import matplotlib
import matplotlib.pyplot as plt

matplotlib.style.use('ggplot')

a = names.fileids()
print(a)

# conditional frequency distribution of the last two letters of each name,
# conditioned on the corpus file id (the class is ConditionalFreqDist, with
# a capital C)
name_cfd = nltk.ConditionalFreqDist(
    (fileid, name[-2:])
    for fileid in names.fileids()
    for name in names.words(fileid))

# now plotting the graph
plt.figure(figsize=(50, 10))
name_cfd.plot()


# build a function to get last two letters
def name_features(name):
    """Return the feature dict for ``name``: its last two characters."""
    return {'pair': name[-2:]}

# calling function
# Find the names that appear in both the female and the male list of
# nltk.corpus.names.
import nltk
from nltk.corpus import names

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(names.fileids())
male_name = names.words('male.txt')
female_name = names.words('female.txt')

# Build the lookup set once so each membership test is constant-time; the
# printed list still follows the order of the male list.
female_lookup = set(female_name)
print([w for w in male_name if w in female_lookup])

# Plot how the last letter of a name relates to the corpus file (gender) it
# comes from.  Author's observation: names ending in a, e or i are usually
# female; names ending in k, o, r, s or t are usually male.
cfd = nltk.ConditionalFreqDist(
    (fileid, name[-1])
    for fileid in names.fileids()
    for name in names.words(fileid))
cfd.plot()
# Use the names corpus
names = nltk.corpus.names
print(names.fileids())
male_names = names.words('male.txt')
print(male_names)
female_names = names.words('female.txt')
print(female_names)
# Set lookup replaces a list scan per male name; result and order are the same.
female_lookup = set(female_names)
print([w for w in male_names if w in female_lookup])
print(len(male_names))
print(len(female_names))
print(len(set(male_names).difference(female_names)))
print(set([1, 2, 3, 4]).difference(set([4, 5, 6, 7])))

from nltk.corpus import names

print(names.fileids())
male_names = names.words('male.txt')
female_names = names.words('female.txt')
female_lookup = set(female_names)
print([w for w in male_names if w in female_lookup])

# Print every (corpus file, last two letters) pair, one per line.
name_ends = ((fileid, name[-2:]) for fileid in names.fileids() for name in names.words(fileid))
for name_end in name_ends:
    print(name_end)

cfd = nltk.ConditionalFreqDist(
    (fileid, name[-2:])
    for fileid in names.fileids()
    for name in names.words(fileid))
cfd.tabulate()
cfd.plot()  # Figure 2-7: shows the ending letters of male and female names

# 4.2. Pronouncing dictionary
import nltk
from nltk.corpus import names

# Count the first letter of every name, conditioned on the corpus file it
# comes from, and plot the result.  (A stray bare `nltk.corpus` expression
# statement, which had no effect, was removed.)
cfd = nltk.ConditionalFreqDist(
    (genre, name[0])
    for genre in names.fileids()
    for name in names.words(genre))
cfd.plot()
# -*- coding: utf-8 -*-
import matplotlib
# Select the Tk backend before anything draws a plot.
matplotlib.use('TkAgg')
import nltk
'''
◑ Define a conditional frequency distribution over the Names corpus that allows you to see which initial letters are more frequent for males vs. females (cf. 4.4).
'''
from nltk import ConditionalFreqDist
from nltk.corpus import names

# (corpus file, first letter) for every name in the corpus.
pair = [(gender, word[0]) for gender in names.fileids() for word in names.words(gender)]
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(pair)
cfd = ConditionalFreqDist(pair)
cfd.plot()
def get_data():
    """Return the names corpus as a list of ``(lower-cased name, label)``.

    The two file ids returned by ``names.fileids()`` are unpacked as
    (female, male) in that order.  Female names come first with label 0,
    followed by male names with label 1.

    :return: list of ``(name, 0)`` and ``(name, 1)`` tuples
    """
    female_file, male_file = names.fileids()
    dataset = []
    for fileid, label in ((female_file, 0), (male_file, 1)):
        dataset.extend((entry.lower(), label) for entry in names.words(fileid))
    return dataset
#!/usr/bin/env python
import sys
from zipfile import ZipFile

import nltk

# NAMES corpus: every name from every corpus file, de-duplicated.
from nltk.corpus import names

NAME_SET = set()
for f in names.fileids():
    NAME_SET.update(names.words(f))

# wget http://download.geonames.org/export/dump/cities15000.zip
GEONAMES_FILE = 'cities15000.zip'

# Prepare geonames: collect the name, ASCII name and alternate names of
# every city with a population of at least 100000, each stored as a tuple
# of whitespace-separated tokens.
CITIES = set()
with ZipFile(GEONAMES_FILE) as zip_file:
    for filename in zip_file.namelist():
        # Close each archive member once it has been read.
        with zip_file.open(filename) as contents:
            # NOTE(review): under Python 3, ZipFile.open() yields bytes, so
            # the str-based split below would raise TypeError there; decode
            # each line first if this script must run on Python 3.
            for line in contents:
                geonameid, name, asciiname, alternatenames, other = line.split('\t', 4)
                other = other.split('\t')
                # Population is read from the fifth field from the end.
                population = int(other[-5])
                if population < 100000:
                    continue
                CITIES.add(tuple(name.split()))
                CITIES.add(tuple(asciiname.split()))
                for alternate in alternatenames.split(','):
                    CITIES.add(tuple(alternate.split()))
# Define a conditional frequency distribution over the Names Corpus that allows you to see which initial letters are
# more frequent for males versus females
import nltk
from nltk.corpus import names

name_fileids = names.fileids()
print(name_fileids)
# Words of the second corpus file in the listing.
print(names.words(name_fileids[1]))

# First letter of every name, conditioned on its corpus file.
initial_pairs = (
    (fileid, word[0])
    for fileid in names.fileids()
    for word in names.words(fileid)
)
cfd = nltk.ConditionalFreqDist(initial_pairs)
cfd.plot()
cfd.tabulate()
########## WORDLIST CORPUS READER ###############
#Basic Corpus Reader
from nltk.corpus.reader import WordListCorpusReader
#List of a few thousand names organized by gender
from nltk.corpus import names
#List of english words
from nltk.corpus import words

nltkDir="C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
#nltkFile="mywords.txt"
#source=nltkDir+nltkFile

### One File WordListCorpusReader
reader=WordListCorpusReader(nltkDir,['wordlist.txt'])
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(reader.words())
print(reader.fileids())

### MultiFile WordListCorpusReader
#To get the names of the files in the corpus use the "fileids" command
# (a bare call shows its value only in an interactive session)
names.fileids()
print(len(names.words('female.txt')))
# The second count previously repeated 'female.txt'; the male list is the
# other file in this corpus.
print(len(names.words('male.txt')))

words.fileids()
print(len(words.words('en-basic')))
print(len(words.words('en')))

###Chunked Corpus Reader