# ---- Example #1 ----
# Actually, any Python list object can be sorted:
# NOTE(review): `myText`, `mobyTokens` and `text1` are not defined in this
# file — they come from an earlier interactive session; confirm before running.
sorted(myText)

# Python is nice because return arguments can be directly subsetted:
# (slicing the list returned by sorted() without naming it first)
sorted(mobyTokens)[900:910]

# Lexical richness (#words/#tokens) [Don't forget from __future__ import division]
# Average number of occurrences per distinct token (token/type ratio).
len(text1) / len(set(text1))

# How many times does a word appear in the text?
text1.count("do")

# Percentage of the text occupied by a word, see E28 below for a better function.
from nltk.book import text5  # Chat conversations
100 * text5.count("mildly") / len(text5)


# Lexical diversity: how many times, on average, each distinct token is used.
def lexical_diversity(text):
    """Return the token/type ratio of *text* (a sequence of tokens)."""
    distinct_tokens = set(text)
    return len(text) / len(distinct_tokens)


# Note that our new function can be used on any text, even your own:
# NOTE(review): `myText` is not defined in this file — it must exist in the session.
lexical_diversity(myText)

# You can combine two lists with text (the addition operator concatenates strings and lists):
myText1 = ["This", "is", "my", "text", "and"]

myText2 = ["there", "is", "nothing", "you", "can", "do", "about", "it", "!"]

# Actually perform the combination the comment promises — the parallel copies
# of this example later in the file end with this expression too.
myText1 + myText2
# ---- Example #2 ----
# How many distinct tokens among the first 100 words?
# (fixed typo "tokes" and an off-by-one: [0:99] takes only 99 tokens)
len(set(text1[0:100]))

# Python is nice because return arguments can be directly indexed:
# (slice the sorted vocabulary without binding it to a name first)
sorted(set(text1))[900:910]

# Lexical richness (#words/#distinct tokens) [Don't forget from __future__ import division, otherwise 1/2=0 !]
# NOTE(review): in a real module `from __future__ import division` must be the
# first statement of the file; mid-file it only works in a REPL transcript.
from __future__ import division
len(text1)/len(set(text1))

# How many times does a word appear in the text?
text1.count("do")

# Percentage of the text occupied by a word, see E28 below for a better function.
from nltk.book import text5 # Chat conversations
100*text5.count("call")/len(text5)
100*text5.count("whatever")/len(text5)

# Frequency distribution
# FreqDist maps each token of text1 to its number of occurrences.
from nltk import FreqDist

fdist1 = FreqDist(text1)

# The distinct tokens seen in text1.
vocabulary = fdist1.keys()

# The corresponding occurrence counts.
frequencies = fdist1.values()

# Look up the count for a single word.
fdist1['whale']

# Define a function that computes lexical diversity
def lexical_diversity(text):
    """Return the token/type ratio (#tokens / #distinct tokens) of *text*.

    The transcript left this ``def`` without an indented body — a syntax
    error; the body below matches the other definitions in this file.
    """
    return len(text) / len(set(text))
# How many distinct tokens among the first 100 words?
# (fixed typo "tokes" and an off-by-one: [0:99] takes only 99 tokens)
len(set(text1[0:100]))

# Python is nice because return arguments can be directly subsetted:
sorted(set(text1))[900:910]

# Lexical richness (#words/#distinct tokens) [Don't forget from __future__ import division, otherwise 1/2=0 !]
# NOTE(review): a real module requires this import as its first statement;
# mid-file it only works in an interactive transcript like this one.
from __future__ import division
len(text1)/len(set(text1))

# How many times does a word appear in the text?
text1.count("do")

# Percentage of the text occupied by a word, see E28 below for a better function.
from nltk.book import text5 # Chat conversations
100*text5.count("call")/len(text5)
100*text5.count("whatever")/len(text5)

# Define a function that computes lexical diversity
def lexical_diversity(text):
    """Return the token/type ratio (#tokens / #distinct tokens) of *text*."""
    # 4-space indent for consistency with the earlier definition (PEP 8).
    return len(text) / len(set(text))

# Note that our new function can be used on any text, even your own:
# NOTE(review): `myText` is not defined in this file — it must exist in the session.
lexical_diversity(myText)

# You can combine two lists with text (the addition operator concatenates strings and lists):
myText1 = [
    "This",
    "is",
    "my",
    "text",
    "and",
]

myText2 = [
    "there",
    "is",
    "nothing",
    "you",
    "can",
    "do",
    "about",
    "it",
    "!",
]

# The + operator builds a new list holding both sequences in order.
myText1 + myText2
# ---- Example #4 ---- (NOTE: no "Example #3" header appears in this scrape;
# its content seems to have been merged into the block above)
from nltk.book import text3, text4, text5

from common.utils import lexical_diversity, percentage

## counting tokens (words and punctuations) - 44764
# print(len(text3))

## counting unique tokens - 2789

# Sorted vocabulary (distinct tokens) of text3.
dict3 = sorted(set(text3))
print(dict3[1:50])
print(len(dict3))

## lexical diversity
diversity_msg = "tokens: {0}, types: {1}, lexical diversity: {2}".format(
    len(text3), len(dict3), lexical_diversity(text3))
print(diversity_msg)

## word frequency
smote_count = text3.count('smote')
print("text3 has {0} word '{1}'".format(smote_count, 'smote'))
a_pct = percentage(text4.count('a'), len(text4))
print("percentage of 'a' in text4 is {0}".format(a_pct))

lol_count = text5.count('lol')
print("text5 (Chat Corpus) has {0} word '{1}'".format(lol_count, 'lol'))
lol_pct = percentage(lol_count, len(text5))
print("percentage of 'lol' in text5 is {0}".format(lol_pct))
# ---- Example #5 ----
# Actually, any Python list object can be sorted:
# NOTE(review): `myText`, `mobyTokens` and `text1` are not defined in this
# file — they come from an earlier interactive session; confirm before running.
sorted(myText)

# Python is nice because return arguments can be directly subsetted:
sorted(mobyTokens)[900:910]

# Lexical richness (#words/#tokens) [Don't forget from __future__ import division]
# Average number of occurrences per distinct token (token/type ratio).
len(text1)/len(set(text1))

# How many times does a word appear in the text?
text1.count("do")

# Percentage of the text occupied by a word, see E28 below for a better function.
from nltk.book import text5 # Chat conversations
100*text5.count("mildly")/len(text5)

# Lexical diversity: average number of uses per distinct token.
def lexical_diversity(text):
    """Return the token/type ratio of *text* (a sequence of tokens)."""
    types = set(text)
    return len(text) / len(types)

# Note that our new function can be used on any text, even your own:
# NOTE(review): `myText` is not defined in this file — it must exist in the session.
lexical_diversity(myText)

# You can combine two lists with text (the addition operator concatenates strings and lists):
myText1 = ["This", "is", "my", "text", "and"]

myText2 = ["there", "is", "nothing", "you", "can", "do", "about", "it", "!"]

# List concatenation with + yields one combined list.
myText1 + myText2
# ---- Example #6 ----
from nltk.book import text3, text4, text5

from common.utils import lexical_diversity, percentage


## counting tokens (words and punctuations) - 44764
# print(len(text3))

## counting unique tokens - 2789

dict3 = sorted(set(text3))
print(dict3[1:50])
print(len(dict3))

## lexical diversity
print("tokens: {0}, types: {1}, lexical diversity: {2}".format(len(text3), len(dict3), lexical_diversity(text3)))

## word frequency
print("text3 has {0} word '{1}'".format(text3.count('smote'), 'smote'))
print("percentage of 'a' in text4 is {0}".format(percentage(text4.count('a'), len(text4))))


print("text5 (Chat Corpus) has {0} word '{1}'".format(text5.count('lol'), 'lol'))
print("percentage of 'lol' in text5 is {0}".format(percentage(text5.count('lol'), len(text5))))