# NLTK Book ch.1 notes: sorting, counting and lexical diversity.
# NOTE(review): this is an interactive-session transcript — `myText`, `mobyTokens`
# and `text1` are assumed to be defined earlier in the session; confirm upstream.

# Actually, any Python list object can be sorted:
sorted(myText)

# Python is nice because return arguments can be directly subsetted:
sorted(mobyTokens)[900:910]

# Lexical richness (#words/#tokens) [Don't forget from __future__ import division]
len(text1) / len(set(text1))

# How many times does a word appear in the text?
text1.count("do")

# Percentage of the text occupied by a word, see E28 below for a better function.
from nltk.book import text5  # Chat conversations
100 * text5.count("mildly") / len(text5)


# Define a function that computes lexical diversity
def lexical_diversity(text):
    """Return the ratio of total tokens to distinct tokens in *text*."""
    return len(text) / len(set(text))


# Note that our new function can be used on any text, even your own:
lexical_diversity(myText)

# You can combine two lists with text (the addition operator concatenates
# strings and lists):
myText1 = ["This", "is", "my", "text", "and"]
myText2 = ["there", "is", "nothing", "you", "can", "do", "about", "it", "!"]
# How many tokes among the first 100 words? len(set(text1[0:99])) # Python is nice because return arguments can be directly indexed: sorted(set(text1))[900:910] # Lexical richness (#words/#distinct tokens) [Don’t forget from __future__ import division, otherwise 1/2=0 !] from __future__ import division len(text1)/len(set(text1)) # How many times does a word appear in the text? text1.count("do") # Percentage of the text occupied by a word, see E28 below for a better function. from nltk.book import text5 # Chat conversations 100*text5.count("call")/len(text5) 100*text5.count("whatever")/len(text5) # Frequency distribution from nltk import FreqDist fdist1 = FreqDist(text1) vocabulary = fdist1.keys() frequencies = fdist1.values() fdist1['whale'] # Define a function that computes lexical diversity def lexical_diversity(text):
# How many tokes among the first 100 words? len(set(text1[0:99])) # Python is nice because return arguments can be directly subsetted: sorted(set(text1))[900:910] # Lexical richness (#words/#distinct tokens) [Don’t forget from __future__ import division, otherwise 1/2=0 !] from __future__ import division len(text1)/len(set(text1)) # How many times does a word appear in the text? text1.count("do") # Percentage of the text occupied by a word, see E28 below for a better function. from nltk.book import text5 # Chat conversations 100*text5.count("call")/len(text5) 100*text5.count("whatever")/len(text5) # Define a function that computes lexical diversity def lexical_diversity(text): return len(text)/len(set(text)) #Note that our new function can be used on any text, even your own: lexical_diversity(myText) # You can combine two lists with text (the addition operator concatenates strings and lists): myText1 = ["This", "is", "my","text","and"] myText2 = ["there","is","nothing","you","can","do","about","it","!"] myText1 + myText2
# Token/type counts and word frequencies for NLTK Book corpora
# (text3 = Genesis, text4 = Inaugural Address Corpus, text5 = Chat Corpus).
from nltk.book import text3, text4, text5

from common.utils import lexical_diversity, percentage

## counting tokens (words and punctuations) - 44764
# print(len(text3))

## counting unique tokens - 2789
# NOTE(review): despite the name, dict3 is a sorted *list* of distinct tokens,
# and the [1:50] slice deliberately(?) skips the first entry — confirm intent.
dict3 = sorted(set(text3))
print(dict3[1:50])
print(len(dict3))

## lexical diversity
print("tokens: {0}, types: {1}, lexical diversity: {2}".format(
    len(text3), len(dict3), lexical_diversity(text3)))

## word frequency
print("text3 has {0} word '{1}'".format(text3.count('smote'), 'smote'))
print("percentage of 'a' in text4 is {0}".format(
    percentage(text4.count('a'), len(text4))))
print("text5 (Chat Corpus) has {0} word '{1}'".format(text5.count('lol'), 'lol'))
print("percentage of 'lol' in text5 is {0}".format(
    percentage(text5.count('lol'), len(text5))))
# NLTK Book ch.1 notes: sorting, counting, lexical diversity and concatenation.
# NOTE(review): transcript-style notes — `myText`, `mobyTokens` and `text1` are
# assumed to be defined earlier in the session; confirm upstream.

# Actually, any Python list object can be sorted:
sorted(myText)

# Python is nice because return arguments can be directly subsetted:
sorted(mobyTokens)[900:910]

# Lexical richness (#words/#tokens) [Don't forget from __future__ import division]
len(text1) / len(set(text1))

# How many times does a word appear in the text?
text1.count("do")

# Percentage of the text occupied by a word, see E28 below for a better function.
from nltk.book import text5  # Chat conversations
100 * text5.count("mildly") / len(text5)


# Define a function that computes lexical diversity
def lexical_diversity(text):
    """Return the ratio of total tokens to distinct tokens in *text*."""
    return len(text) / len(set(text))


# Note that our new function can be used on any text, even your own:
lexical_diversity(myText)

# You can combine two lists with text (the addition operator concatenates
# strings and lists):
myText1 = ["This", "is", "my", "text", "and"]
myText2 = ["there", "is", "nothing", "you", "can", "do", "about", "it", "!"]
myText1 + myText2
# Token/type counts and word frequencies for NLTK Book corpora
# (text3 = Genesis, text4 = Inaugural Address Corpus, text5 = Chat Corpus).
from nltk.book import text3, text4, text5

from common.utils import lexical_diversity, percentage

## counting tokens (words and punctuations) - 44764
# print(len(text3))

## counting unique tokens - 2789
# NOTE(review): despite the name, dict3 is a sorted *list* of distinct tokens,
# and the [1:50] slice deliberately(?) skips the first entry — confirm intent.
dict3 = sorted(set(text3))
print(dict3[1:50])
print(len(dict3))

## lexical diversity
print("tokens: {0}, types: {1}, lexical diversity: {2}".format(len(text3), len(dict3), lexical_diversity(text3)))

## word frequency
print("text3 has {0} word '{1}'".format(text3.count('smote'), 'smote'))
print("percentage of 'a' in text4 is {0}".format(percentage(text4.count('a'), len(text4))))
print("text5 (Chat Corpus) has {0} word '{1}'".format(text5.count('lol'), 'lol'))
print("percentage of 'lol' in text5 is {0}".format(percentage(text5.count('lol'), len(text5))))