def get_training_test_sentences(self):
        # use every Brown category for both the raw words and the tagged sentences
        self.news_text = brown.words(categories=brown.categories())
        self.news_tagged_sentences = brown.tagged_sents(categories=brown.categories())

        size = int(len(self.news_tagged_sentences) * .9)
        brown_train = self.news_tagged_sentences[:size]
        brown_test = self.news_tagged_sentences[size:]

        self.train_sents = brown_train
        self.test_sents  = brown_test
Exemple #2
0
def print_brown():
    import nltk
    from nltk.corpus import brown
    print brown.categories()
    print brown.words(categories='news')
    print brown.words(fileids=['cg22'])
    print brown.sents(categories=['news','reviews'])
    news_text=brown.words(categories='news')
    fdist=nltk.FreqDist([w.lower() for w in news_text])
    modals=['can','could','may','might','must','will']
    for m in modals:
        print m+':',fdist[m]
def build_all_brown(subset_size=None):
    documents = []
    categories = []

    all_categories = set()

    try:
        fileids = brown.fileids()

        for fileid in fileids:
            if subset_size:
                if len(all_categories) > subset_size:
                    break
            category = brown.categories(fileid)[0]
            words = [x.lower() for x in brown.words(fileid)]

            documents.append(words)
            categories.append(category)

            all_categories.add(category)

        if subset_size != len(brown.categories()):
            # exclude the final item, since it's the sole member of the next group
            documents = documents[:-1]
            categories = categories[:-1]

        documents = [" ".join(d) for d in documents]

    except LookupError:
        """ we don't have the Brown corpus via nltk on this machine """
        try:
            with open("brown_docs_cats.pickle", "rb") as f:  # pickles are binary files
                documents, categories = pickle.load(f)
        except IOError:
            raise Exception("can't load Brown Corpus via NLTK or file")

    # documents = [' '.join(d) for d in documents]

    """
    # let's NOT get tempted to hide away the encoding
    # we'll probably need to access, e.g., the vectorizer, to do reverse
    # transformations once we want to interpret/evaluate the model

    doc_vectorizer = CountVectorizer()
    doc_vec = doc_vectorizer.fit_transform(documents)
    """

    return documents, categories
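# Hedged sketch (not part of the original example): per the comment above, the
# vectorization is kept outside build_all_brown() so the fitted vectorizer stays
# accessible for reverse transformations later. Assumes scikit-learn is installed
# and the imports used by build_all_brown (nltk's brown, pickle) are in place;
# subset_size=5 is an arbitrary illustrative value.
from sklearn.feature_extraction.text import CountVectorizer

documents, categories = build_all_brown(subset_size=5)
doc_vectorizer = CountVectorizer()
doc_vec = doc_vectorizer.fit_transform(documents)
# e.g. doc_vectorizer.inverse_transform(doc_vec[:1]) maps the first row back to its tokens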
Exemple #4
0
def import_brown_pos(ds, simplify_tags=False, silent=False, log=sys.stdout):
    """
    Import the brown corpus into `ds`. E.g.
    
    >>> from nathan.core import Dataspace
    >>> ds = Dataspace()
    >>> %time brown.import_brown(ds, silent=True)
    CPU times: user 12min 28s, sys: 536 ms, total: 12min 29s
    Wall time: 12min 29s
    """
    if not silent:
        total = len(brown.sents())
        counter = 0
    for category in brown.categories():
        cat_handle = ds.insert("#%s" % category)
        for sent in brown.tagged_sents(categories=category):
            if simplify_tags:
                norm = (simplify_tag(t) for t in sent)
            else:
                norm = sent  # avoid a NameError when simplify_tags is False
            norm = [nltk.tuple2str(t) for t in norm]
            sen_handle = ds.insert(norm)
            ds.link(cat_handle, sen_handle)
            if not silent:
                counter += 1
                if (counter % 100 == 0):
                    print("importing %s of %s sentences..." % (counter, total), 
                        file=log)
Exemple #5
0
def fun08():
    """fun08"""
    cfd = nltk.ConditionalFreqDist((genre, word) \
        for genre in brown.categories() \
        for word in brown.words(categories=genre))
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    cfd.tabulate(conditions=genres, samples=modals)
def makeWordSet(args=None):
    '''Use the Brown corpus to see how many words used'''
    word_set = set()
    for cat in brown.categories():
        word_set = word_set.union(set(brown.words(categories=cat)))
    for cat in reuters.categories():
        word_set = word_set.union(set(reuters.words(categories=cat)))
    return word_set
Exemple #7
0
def brown_diversity():
	"""calculate and display lexical diversity score (token/token_type) for each brown corpus category"""
	cfd = nltk.ConditionalFreqDist((category, word)
		for category in brown.categories()
		for word in brown.words(categories=category))
	print "{0:15s} {1:10s}".format("CATEGORY", "DIVERSITY")
	for category in cfd.conditions():
		print "{0:15s} {1:10f}".format(category, (cfd[category].N() * 1.0 / cfd[category].B()))
Exemple #8
0
def ex11():
  from nltk.corpus import brown
  modals = set(["can", "could", "may", "might", "shall", "should", "will", "would", "must", "ought"])
  cfd = nltk.ConditionalFreqDist(
    (genre, modal)
    for genre in brown.categories()
    for modal in [w.lower() for w in brown.words(categories=genre) if w.lower() in modals])
  cfd.plot()
Exemple #9
0
def exercise_brown2():
    """Conditional frequency distribution (modal verbs by genre)."""
    cfd = nltk.ConditionalFreqDist(
        (genre, word) for genre in brown.categories() for word in brown.words(categories=genre)
    )

    genres = ["news", "religion", "hobbies", "science_fiction", "romance", "humor"]
    modals = ["can", "could", "may", "might", "must", "will"]
    cfd.tabulate(conditions=genres, samples=modals)
Exemple #10
0
def ex16():
  from nltk.corpus import brown
  lex_div = {}
  for category in brown.categories():
    words = brown.words(categories=category)
    ld = len(words) / len(set(words))
    print category, ld
    lex_div[category] = ld
  print sorted(lex_div.iteritems(), key=operator.itemgetter(1))
Exemple #11
0
def print_modals():
    from nltk.corpus import brown
    cfd=nltk.ConditionalFreqDist(
        (genre,word)
        for genre in brown.categories()
        for word in brown.words(categories=genre)
    )
    genres=['news','religion','hobbies','science_fiction','romance','humor']
    modals=['can','could','may','might','must','will']
    cfd.tabulate(conditions=genres,samples=modals)
Exemple #12
0
def exercise_brown():
    # print the categories in the Brown corpus
    print brown.categories()
    # print the words of the 'news' category
    print brown.words(categories="news")
    # print the text 'cg22'
    print brown.words(fileids=["cg22"])
    # print sentences
    print brown.sents(categories=["news", "reviews"])

    """Compare the use of modal verbs across genres"""
    # get the text
    news_text = brown.words(categories="news")
    # word frequency distribution
    fdist = nltk.FreqDist([w.lower() for w in news_text])
    # define the list of modal verbs
    modals = ["can", "could", "may", "might", "must", "will"]
    for m in modals:
        print m + ":", fdist[m]
Exemple #13
0
def training_sentences(use=1.0, categories=[]):
	"""returns a training sentence set: [[(word, tag), ..], [(word, tag), ..], ..]"""
	if len(categories) == 0:
		categories = brown.categories() # use all of the brown categories
	sents = []
	for category in categories:
		total = len(brown.tagged_sents(categories=category))
		max = int((1-TEST_PROPORTION) * use * total) - 1 # use the first n sentences for training
		sents += brown.tagged_sents(categories=category, simplify_tags=True)[0:max]
	return sents
Exemple #14
0
def init_corpus():
    print 'init corpus.. ', 
    global categories, category_sentences
    categories = brown.categories()
    half_cat = int(len(categories) * 0.5)
    categories = categories[:half_cat]
    for category in categories:
        sents = brown.tagged_sents(categories = category)
        category_sentences[category] = sents
    print 'done'
Exemple #15
0
def test_sentences(categories=[]):
	"""returns a test sentence set: [[(word, tag), ..], [(word, tag), ..], ..]"""
	if len(categories) == 0:
		categories = brown.categories() # use all of the brown categories
	sents = []
	for category in categories:
		total = len(brown.tagged_sents(categories=category))
		start = int(TEST_PROPORTION * total) # use the last k sentences for test
		sents += brown.tagged_sents(categories=category, simplify_tags=True)[-start:-1]
	return sents
Exemple #16
0
def ch03_29_reading_difficulty():
  sent_tokenizer = nltk.data.load("tokenizers/punkt/english.pickle")
  from nltk.corpus import brown
  for category in brown.categories():
    raw = brown.raw(categories=category)
    words = len(brown.words(categories=category))
    sentences = len(sent_tokenizer.tokenize(raw))
    letters_per_word = (len(raw) - words) / words # raw chars - words space chars
    words_per_sentence = words / sentences
    reading_level = (4.71 * letters_per_word) + (0.5 * words_per_sentence) - 21.43  # ARI subtracts 21.43
    print category, reading_level
Exemple #17
0
def brown_demo():  # renamed: 'brown' would shadow the nltk.corpus.brown module used below

    brown.categories()
    brown.words(categories='news')
    brown.words(fileids=['cg22'])
    brown.sents(categories=['news', 'editorial', 'reviews'])

    news_text = brown.words(categories='news')
    fdist = nltk.FreqDist([w.lower() for w in news_text])
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    for m in modals:
        print m + ':', fdist[m],


    cfd = nltk.ConditionalFreqDist(
            (genre, word)
            for genre in brown.categories()
            for word in brown.words(categories=genre))
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    cfd.tabulate(conditions=genres, samples=modals)
Exemple #18
0
def main():
	# Assumption: the word "heart" instead of "fun"
	words = [u'money', u'duty', u'love', u'heart']
	categories = [u'science_fiction', u'romance', u'government', u'humor', u'religion']
	# Your code
	cfd = nltk.ConditionalFreqDist((genre, word)
			for genre in brown.categories()
			for word in brown.words(categories=genre))

	cfd.tabulate(conditions=categories, samples=words)

	print_min_max_for_all(cfd, words, categories)
Exemple #19
0
def ex10():
  from nltk.corpus import brown
  from nltk.corpus import stopwords
  stopwords = stopwords.words("english")
  for genre in brown.categories():
    print genre
    words = map(lambda x : x.lower(), brown.words(categories=genre))
    fd = nltk.FreqDist([w for w in words if w.isalpha() and not(w in stopwords)])
    vocab_size = len(set(words))
    sum = 0
    for word in fd.keys():
      freq = fd[word]
      print "... %s (%f)" % (word, (freq * 100 / vocab_size))
      sum = sum + freq
      if (sum > (vocab_size / 3)):
        break
def exercise11():
    print
    print "Exercise 11"
    cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories() for word in brown.words(categories=genre))
    genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
    modal_verb = ['shall', 'should', 'may', 'had', 'have']
    print "Tabulation data for closed class of words in english"
    print "For modal verbs:"
    cfd.tabulate(conditions = genres, samples = modal_verb)
    print
    print "For Prepositions:"
    prepositions = ['for', 'from', 'onto', 'to', 'with']
    cfd.tabulate(conditions = genres, samples = prepositions)
    print
    print "For Pronoun:"
    pronoun = ['me', 'she', 'her', 'I', 'we']
    cfd.tabulate(conditions = genres, samples = pronoun)
    print
Exemple #21
0
def tabulate(cfdist, words, categories):

    print '%-16s' % 'Category',
    for word in words:                                  # column headings
        print '%6s' % word,
    print
    for category in categories:
        print '%-16s' % category,                       # row heading
        for word in words:                              # for each word
            print '%6d' % cfdist[category][word],       # print table cell
        print                                           # end the row

# module-level demo: build the CFD and tabulate it (dedented so tabulate() does not call itself)
cfd = nltk.ConditionalFreqDist(
        (genre, word)
        for genre in brown.categories()
        for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
tabulate(cfd, modals, genres)
Exemple #22
0
	def get_brown_data(self):
		count = {}
		self.len_list = []
		for cats in brown.categories():
			for sent in brown.sents(categories=cats):
				l = len(sent)
				#if l < 3:
					#continue
				self.len_list.append(l)
				if l in count:
					count[l] += 1
				else:
					count[l] = 1
		total = len(self.len_list)
		for i in range(100):
			if i in count.keys():
				self.probs.append(count[i]/(total+0.0))
			else:
				self.probs.append(0)
Exemple #23
0
def countWords():

    """counting words by genre"""

    cfd = nltk.ConditionalFreqDist(
            (genre, word)
            for genre in brown.categories()
            for word in brown.words(categories=genre))

    genre_word = [(genre, word)
            for genre in ['news', 'romance']
            for word in brown.words(categories=genre)] 

    print len(genre_word)
    print genre_word[:4]
    print genre_word[-4:]

    cfd = nltk.ConditionalFreqDist(genre_word)
    print cfd.conditions()

    cfd['news']
    cfd['romance']
    list(cfd['romance'])
    cfd['romance']['could']
Exemple #24
0
'''
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
cfd = nltk.ConditionalFreqDist(
          ('news', word)
          for word in brown.words(categories='news'))
cfd.tabulate(samples=days)
'''

#Checking basic FreqDist
fdist1 = FreqDist(text1)
vocabulary1 = fdist1.keys()
vocabulary1[:50]
fdist1['whale']

#Checking basic ConditionalFreqDist
cfd = nltk.ConditionalFreqDist((genre, word) for genre in brown.categories() for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor'] 
modals = ['can', 'could', 'may', 'might', 'must', 'will'] 
cfd.tabulate(conditions=genres, samples=modals)

#The Lookup Tagger
fd = nltk.FreqDist(brown.words(categories='news'))
#One word can have multiple tags. brown.tagged_words() returns object of class ConcatenatedCorpusView.
#We are creating the baseline tagger by tagging each word from the brown news corpus 
brown_sents = brown.sents(categories='news')
#brown_tagged_sents is of class ConcatenatedCorpusView
brown_tagged_sents = brown.tagged_sents(categories='news')
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
#cfd['of'] is a FreqDist with 3 samples and 2849 outcomes. The entries are {('IN', 2716), ('IN-TL', 128), ('IN-HL', 5)}.
most_freq_words = [w for w, _ in fd.most_common(100)]  # the 100 most frequent news words
#word comes from most_freq_words, which comes from fd, which comes from freq distn of brown.words in news corpus.
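# Hedged sketch (an assumption following the NLTK book's lookup-tagger recipe; not
# part of the original snippet, and it relies on the nltk import used above): map
# each frequent news word to its most likely tag and use that table as the model
# of a UnigramTagger. No backoff tagger is attached here.
likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
print(baseline_tagger.tag(brown_sents[3]))          # tag one untagged news sentence
print(baseline_tagger.evaluate(brown_tagged_sents)) # rough accuracy on the news gold tags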
Exemple #25
0
import nltk
from nltk.corpus import brown

whwords = ['what', 'which', 'why', 'when', 'where', 'who']

genre_news = brown.words(categories='news')
fdist = nltk.FreqDist(genre_news)
count = 0

for wh in whwords:
    count = count + fdist[wh]

print("\n", count)

#q2
categories = list(brown.categories())
modals = ['can', 'could', 'may', 'might', 'must', 'will', 'would']

for c in categories:
    print(c)
    words = brown.words(categories=c)
    fdist = nltk.FreqDist([w.lower() for w in words])
    for m in modals:
        print(m + ':', fdist[m])
    print()

#q3
import nltk
from nltk.corpus import inaugural
print(inaugural.fileids())
year = [fileid[:4] for fileid in inaugural.fileids()]
Exemple #26
0
    for category in categories:
        if not 'stopwords' in kwargs:
            tokens = [w for w in brown.words(categories=category)]
        else:
            if kwargs['stopwords'] == 'english':
                tokens = [w for w in brown.words(categories=category) if w not in FileOps.get_stopwords('stopwords.txt')]
        token_count = len(tokens)
        type_count = len(set(tokens))
        diversity = "%.3f" % (type_count/token_count)
        tmp = category.split('_')
        category = ' '.join(tmp)
        category_info = (category, token_count, type_count, diversity)
        print(category_info)
        corpus_info.append(category_info)
        
    return corpus_info
    
if __name__ == "__main__":
    
    plurals_info = get_plurals_info(brown.categories())
    FileOps.write_plurals_table(plurals_info)

    corpus_info = get_corpus_info(brown.categories())
    stopwords_info = get_corpus_info(brown.categories(), stopwords='english')
    FileOps.write_table(corpus_info, stopwords_info)
    print_corpus_info(brown.categories(), FileOps.get_stopwords('stopwords.txt'))  
Exemple #27
0
print(tmp_Con.conditions())  # ['实惠', '快', '也好']
print(tmp_Con['快'].most_common())  # [('1', 1), ('-1', 1)]
print(tmp_Con['快'].keys())  # dict_keys(['1', '-1'])
print(len(tmp_Con['快'].keys()))  # 2; shows how many tags each word has
print(len(tmp_Con['也好'].keys()))  # 1; duplicates have already been collapsed (set-like)
print([
    condition for condition in tmp_Con.conditions()
    if len(tmp_Con[condition].keys()) > 1
])  # ['快']
tmp_Con.plot()
tmp_Con_1 = ConditionalFreqDist(zip(anls, word))
print(tmp_Con_1.conditions())  # ['实惠', '快', '也好']
##################################################################
## Brown corpus: word categorization analysis
print(
    brown.categories()
)  # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
cfd = nltk.ConditionalFreqDist(
    (genre, word) for genre in brown.categories()
    for word in brown.words(categories=genre))  # categories=genre must not be omitted here
genres = [
    'news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor'
]  # taken from brown.categories()
modals = ['can', 'could', 'may', 'might', 'must', 'will']  # a few arbitrarily chosen words
print(
    cfd.tabulate(conditions=genres, samples=modals)
)  # Observe that the most frequent modal in the news genre is will, while the most frequent modal in the romance genre is could
#                  can could  may might must will  # counts of each word per category
#            news   93   86   66   38   50  389
#        religion   82   59   78   12   54   71
#         hobbies  268   58  131   22   83  264
# -*- coding: utf-8 -*-
"""

"""

# BROWN CORPUS DEMO
from nltk.corpus import brown
import nltk

print 'Total Categories:', len(brown.categories())

print brown.categories()

# tokenized sentences
brown.sents(categories='mystery')

# POS tagged sentences
brown.tagged_sents(categories='mystery')

# get sentences in natural form
sentences = brown.sents(categories='mystery')

# get tagged words
tagged_words = brown.tagged_words(categories='mystery')

# get nouns from tagged words
nouns = [(word, tag) for word, tag in tagged_words
         if any(noun_tag in tag for noun_tag in ['NP', 'NN'])]

print nouns[0:10]  # prints the first 10 nouns
Exemple #29
0
    soundex = fc + soundex[1:]
    soundex = soundex.replace('0', '')
    return (soundex + size * '0')[:size]


print("1")
print(SOUNDEX('abcdefghijklmnopqrstuvwxyz'))
print(SOUNDEX('Tim'))
print(SOUNDEX('Trump'))
print(SOUNDEX('Einstein'))

# 2.1
# nltk.download('brown')
# use the following categories
id = 3018216005
categories = brown.categories()[id % len(brown.categories())]
brown_tagged_sents = brown.tagged_sents(categories=categories)
brown_sents = brown.sents(categories=categories)
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
print("2.1")
print('categories:', categories)
print('size:', size)
print('train size:', len(train_sents))
print('test size:', len(test_sents))
print(unigram_tagger.evaluate(test_sents))
Exemple #30
0
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('TkAgg')
import nltk 
'''
☼ Use the Brown corpus reader nltk.corpus.brown.words() or the Web text corpus
reader nltk.corpus.webtext.words() to access some sample text in two different genres.
'''

from nltk.corpus import brown,webtext
romance_text = brown.words(categories='romance')
print brown.categories()
print webtext.fileids()
print webtext.words('firefox.txt')
Exemple #31
0
import nltk
from nltk.corpus import brown

for category in brown.categories():
    words = brown.words(categories = category)
    fdist = nltk.FreqDist([w.lower() for w in words])
    modals = ['can', 'could', 'may', 'might', 'must', 'will']
    print category + ' '
    for m in modals:
        print m + ":", fdist[m],
    print "\n"
Exemple #32
0
 def word_category(self, word):
     # print "Finding category"
     if (word.name in stopwords.words('english')):
         return 'stopwords'
     categories = brown.categories()
     return max(categories, key=lambda cat: self.cond_freq[cat][word.name])
Exemple #33
0
# Uw: average number of letters per word; Us: average number of words per sentence



from nltk.corpus import brown as brown

dict = {}
#-----------------------------
for cat in brown.categories():  # categories is a method, it must be called
	Uw = len("".join(list(brown.words(categories=cat))))/len(brown.words(categories=cat))
	Us = len(brown.words(categories=cat))/len(brown.sents(categories=cat))
	dict[cat] = 4.71*Uw + 0.5*Us - 21.43

#-----------------------------

dict = {cat : (4.71*(len("".join(list(brown.words(categories=cat))))/len(brown.words(categories=cat))) + 0.5*(len(brown.words(categories=cat))/len(brown.sents(categories=cat))) - 21.43) for cat in brown.categories() }

Uw = lambda brown,cat : len("".join(list(brown.words(categories=cat))))/len(brown.words(categories=cat))
Us = lambda brown,cat : len(brown.words(categories=cat))/len(brown.sents(categories=cat))

d = {cat : (4.71*Uw(brown,cat) + 0.5*Us(brown,cat) - 21.43) for cat in brown.categories() }

#4 -----------------------------------------------------

["".join([char for char in word if char in "aeiou"]) for word in words]





Exemple #34
0
from nltk.corpus import brown, movie_reviews, reuters
print(brown.categories())  # doctest: +NORMALIZE_WHITESPACE
print(movie_reviews.categories())
print(reuters.categories())  # doctest: +NORMALIZE_WHITESPACE +ELLIPSIS
print(brown.categories('ca01'))
print(brown.categories(['ca01', 'cb01']))
print(reuters.categories('training/9865'))
print(reuters.categories(['training/9865', 'training/9880']))
print(reuters.fileids('barley'))  # doctest: +ELLIPSIS
print(brown.tagged_words(categories='news'))
print(brown.sents(
    categories=['editorial', 'reviews']))  # doctest: +NORMALIZE_WHITESPACE
Exemple #35
0
def brown_demo():  # renamed so it does not shadow the brown corpus module
    print brown.categories()
    print len(brown.sents())
    print len(brown.words())
    assert i not in guten_dev
    assert i not in guten_test
    assert j not in guten_train
    assert j not in guten_test
    assert k not in guten_dev
    assert k not in guten_train

#############################
#Generating for Brown Corpus#
#############################

random.seed(22)
brown_train = []
brown_dev = []
brown_test = []
for cat in brown.categories():
    brown_train.append(
        random.sample(list(brown.fileids(categories=cat)),
                      int((0.6) * len(brown.fileids(categories=cat)))))
brown_train = [x for y in brown_train for x in y]

#Generating Dev set for Brown Corpus
for cat in brown.categories():
    brown_dev_1 = [
        i for i in brown.fileids(categories=cat) if i not in brown_train
    ]
    brown_dev.append(random.sample(brown_dev_1, int(0.5 * len(brown_dev_1))))
brown_dev = [x for y in brown_dev for x in y]

#Generating Test set for Brown Corpus
for cat in brown.categories():
Exemple #37
0
def nltk_demo():  # renamed so it does not shadow the nltk module used below
    """
    nltk: a brief introduction to text processing
    :return:
    """
    # 1. Corpora
    # list the categories contained in the corpus
    print(brown.categories())
    # inspect the brown corpus
    print('There are {} sentences in total'.format(len(brown.sents())))
    print('There are {} words in total'.format(len(brown.words())))

    # 2. Tokenization
    sentence = "Python is a widely used high-level programming language for general-purpose programming."
    tokens = nltk.word_tokenize(sentence)  # requires the punkt tokenizer model
    print(tokens)

    # 3. Jieba Chinese word segmentation
    seg_list = jieba.cut("欢迎来到小象学院", cut_all=True)
    print("Full mode: " + "/ ".join(seg_list))  # full mode

    seg_list = jieba.cut("欢迎来到小象学院", cut_all=False)
    print("Precise mode: " + "/ ".join(seg_list))  # precise mode

    # 4. Morphology
    # Stemming
    # PorterStemmer
    from nltk.stem.porter import PorterStemmer

    porter_stemmer = PorterStemmer()
    print(porter_stemmer.stem('looked'))
    print(porter_stemmer.stem('looking'))

    # SnowballStemmer
    from nltk.stem import SnowballStemmer

    snowball_stemmer = SnowballStemmer('english')
    print(snowball_stemmer.stem('looked'))
    print(snowball_stemmer.stem('looking'))

    # LancasterStemmer
    from nltk.stem.lancaster import LancasterStemmer

    lancaster_stemmer = LancasterStemmer()
    print(lancaster_stemmer.stem('looked'))
    print(lancaster_stemmer.stem('looking'))

    # Lemmatization
    from nltk.stem import WordNetLemmatizer  # requires the wordnet corpus

    wordnet_lematizer = WordNetLemmatizer()
    print(wordnet_lematizer.lemmatize('cats'))
    print(wordnet_lematizer.lemmatize('boxes'))
    print(wordnet_lematizer.lemmatize('are'))
    print(wordnet_lematizer.lemmatize('went'))

    # specifying the part of speech gives a more accurate lemma
    # lemmatize treats the word as a noun by default
    print(wordnet_lematizer.lemmatize('are', pos='v'))
    print(wordnet_lematizer.lemmatize('went', pos='v'))

    # part-of-speech tagging
    words = nltk.word_tokenize('Python is a widely used programming language.')
    print(nltk.pos_tag(words))  # requires averaged_perceptron_tagger

    # remove stop words
    from nltk.corpus import stopwords  # requires the stopwords corpus

    filtered_words = [
        word for word in words if word not in stopwords.words('english')
    ]
    print('Original words:', words)
    print('After stop-word removal:', filtered_words)

    # 5. A typical text preprocessing pipeline
    # raw text
    raw_text = 'Life is like a box of chocolates. You never know what you\'re gonna get.'

    # tokenization
    raw_words = nltk.word_tokenize(raw_text)

    # lemmatization
    wordnet_lematizer = WordNetLemmatizer()
    words = [wordnet_lematizer.lemmatize(raw_word) for raw_word in raw_words]

    # remove stop words
    filtered_words = [
        word for word in words if word not in stopwords.words('english')
    ]

    print('Raw text:', raw_text)
    print('Preprocessing result:', filtered_words)
Exemple #38
0
>>> from nltk.corpus import brown
>>> from nltk.book import *
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
>>> brown.categories()
['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
>>> cfd = nltk.ConditionalFreqDist()
>>> 
============= RESTART: C:/Users/Aurangzeb khan/Desktop/nltk1.py =============
*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
Exemple #39
0
import nltk

nltk.download('brown')
from nltk.corpus import brown
from nltk.probability import FreqDist

res = []
for i in ["what", "when", "where", "who", "why"]:
    aux = []
    for j in brown.categories():
        aux.append(j)
        aux.append(FreqDist(brown.words(categories=j)).get(i, 0))  # count the wh-word i, not the category name j
    res.append(i)
    res.append(aux)
print(res)
Exemple #40
0
 def conditional_frequencies(self):  # slow... maybe run it on a smaller subset?
     # print "Making conditional frequencies..."
     cfd = ConditionalFreqDist((genre, word)
                               for genre in brown.categories()
                               for word in brown.words(categories=genre))
     return cfd
Exemple #41
0
    print(fileid, webtext.raw(fileid)[:100], '...')

#%%
from nltk.corpus import brown

news_text = brown.words(categories='news')
fdist = FreqDist(w.lower() for w in news_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    #print(m + ':', fdist[m], )
    print m, ':', fdist[m], '  ',

#%%
#conditional frequencies

cdf = ConditionalFreqDist((genre, word) for genre in brown.categories()
                          for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cdf.tabulate(conditions=genres, samples=modals)
#%%
from nltk.corpus import inaugural

cdf = ConditionalFreqDist((target, fileid[:4])
                          for fileid in inaugural.fileids()
                          for w in inaugural.words(fileid)
                          for target in ['america', 'citizen']
                          if w.lower().startswith(target))

cdf.plot()
#%%
Exemple #42
0
sentences[1037]
# the longest sentence
long = max([len(s) for s in sentences])
[s for s in sentences if len(s) == long]

#%%
# web and chat text
from nltk.corpus import webtext
webtext.fileids()
for fileid in webtext.fileids():
    print(fileid,webtext.raw(fileid)[:60])

#%%
# the Brown corpus
from nltk.corpus import brown
brown.categories()
brown.words(categories='news')

news_words = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_words])
modals = ['can','could','may','might','must','will']

for m in modals:
    print(m,fdist[m]) 
         
#%%
# conditional frequency distribution
from nltk.corpus import brown
cfd = nltk.ConditionalFreqDist(
        (genre,word)
        for genre in brown.categories()
Exemple #43
0
def main():
	# levenshtein distance for lists of tokens
	#-----------------------------------------
    #PCL Solutions
    #-------------
    print "~~~~~~~~~Example)~~~~~~~~~~~~~~~"
    l1 = "Vladimir Levenshtein uebernahm dies im Jahre 1960 .".split()
    l2 = "Vladimir Iosifovich Levenshtein entwickelte dies im Jahre 1965 .".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "ld=3+1.3+4=8.3, ->%s" % (ld)
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"

    print "~~~~~~~~~2a and 2b)~~~~~~~~~~~~~"
    l1 = "Computerlinguistik 2 ist spannend .".split()
    l2 = "Computerlinguistik macht Spass und ist spannend !".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "ld= not 22.1, ->%s" % (ld)
    print "\nWhy is 22.1 not the optimal solution?\n" \
          "Our version proceeds as follows:\n" \
          "delete: 2 (cost 1 * 3)\n" \
          "insert: macht Spass und (cost 3 * 3.0)\n" \
          "substitute: . and ! (cost 1 * 0.1)\n" \
          "Total cost of 12.1, not 22.1\n" \
          "The substitution (go diagonal) with cost 16\n" \
          "is always avoided\n" \
          "and replaced by a delete (go down) and \n" \
          "an insert (go right)."
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"

    print "~~~~~~~~~2c)~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    l1 = "spricht Yoda in immer Raetseln .".split()
    l2 = "Yoda spricht immer in Raetseln !".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "ld=0.9, ->%s" % (ld)
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
    l1 = "das ist !".split()
    l2 = "ist das !".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "ld=0.4, ->%s" % (ld)
    print "~~~~~~~~~2d)~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    #load brown corpus sents
    print "Brown categories:\n %s\n" %brown.categories()
    brown_sents=brown.sents(categories = "romance")

    #find minimal distance between l1 and l2
    l1 = "I wish you loved me .".split()
    ld_min=1000  # starting value

    #iterate through sent in brown corpus
    print "\nTo find the best solution:\n" \
          "set the print_matrix flag to 0 " \
          "and iterate over the entire category " \
          "romance (remove [0:2] on the following line)\n"
    for sent in brown_sents[0:2]:

        #calculate distance
        l2 = sent
        ld = levenshtein_on_tokens(l1, l2)

        #print out best match so far
        print l1, "\n", l2
        print "ld= , ->%s" % (ld)
        if ld<ld_min:
            ld_min=ld
            l2_min=l2
        print "ld_min= ", ld_min
        print "l2_min= ", l2_min
        print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"


    # Additional:
    # Test cases
    print "\nAdditional test cases:\n"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"
    # ----------------------------------------
    l1 = "a b c d . a a".split()
    l2 = "a b c d . a c".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "1) ld= 1.3, ->%s" % (ld)
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"

    l1 = "a b c d . a 2".split()
    l2 = "a b c d . a 4".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "2) ld= 4.0, ->%s" % (ld)
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"

    l1 = "a b .".split()
    l2 = "a b !".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "3) ld= 0.1, ->%s" % (ld)
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"

    l1 = "a b c 4".split()
    l2 = "a b c .".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "4) ld= not 16.0, but 3.1, ->%s" % (ld)
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"

    l1 = "a b c d".split()
    l2 = "a e".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "5) ld= 7.3, ->%s" % (ld)
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"

    l1 = "a b c d .".split()
    l2 = "a e i o".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "6) ld= 4.0, ->%s" % (ld)
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"

    l1 = "a b c d !".split()
    l2 = "a b c d .".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "7) ld= 0.1, ->%s" % (ld)
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"

    l1 = "!".split()
    l2 = "b".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "8) ld= not 16, but 3.1 , ->%s" % (ld)
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"

    l1 = "! 3 4".split()
    l2 = "b ? a".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "9) ld=12.1 , ->%s" % (ld)
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"

    l1 = "l e v e n s h t e i n".split()
    l2 = "m e i l e n s t e i n".split()
    ld = levenshtein_on_tokens(l1, l2)
    print l1, "\n", l2
    print "10) ld=7.8 , ->%s" % (ld)
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n"


    # levenshtein distance for strings of characters
    # -----------------------------------------------
    print "\nAdditionally:"
    print "levenshtein distance for strings of chars\n" \
          "This algorithm was the skeleton of\n" \
          "the entire exercise"
    print "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
    s1 = "kitten"
    s2 = "sitting"
    ld = levenshtein_on_characters(s1, s2)
    print "\nld= ", ld
    print s1, "\n", s2
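# levenshtein_on_characters() is not defined in this excerpt; the following is an
# assumed, minimal stand-in using the standard dynamic-programming recurrence with
# unit costs (so "kitten" -> "sitting" gives 3). The weighted token-level variant
# used above is not reproduced here.
def levenshtein_on_characters(s1, s2):
    m, n = len(s1), len(s2)
    # dist[i][j] = edit distance between s1[:i] and s2[:j]
    dist = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(m + 1):
        dist[i][0] = i
    for j in range(n + 1):
        dist[0][j] = j
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            cost = 0 if s1[i - 1] == s2[j - 1] else 1
            dist[i][j] = min(dist[i - 1][j] + 1,         # deletion
                             dist[i][j - 1] + 1,         # insertion
                             dist[i - 1][j - 1] + cost)  # substitution
    return dist[m][n]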
Exemple #44
0
Sl. No.- 14542


@@@-----------S1: Train: D1-Train, Test: D1-Test----------------@@@

"""
"""
######  IMPORT BROWN CORPUS  #######
"""

import nltk
import re
nltk.download('brown')
from nltk.corpus import brown

categories=brown.categories()


brown_corpus_sents={}

for cat in categories:
    a=[]
    for sents in brown.sents(categories=cat):
        sents.insert(0,'<s>')
        sents.append('<e>')
        a.append(sents)
    brown_corpus_sents[cat]=a
           

BROWN={} 
for cat in categories:
Exemple #45
0
weight1 = np.random.uniform(low=0.0, high=0.0, size=(34, 17))
weight2 = np.random.uniform(low=0.0, high=0.0, size=(17, 17))
weight3 = np.random.uniform(low=0.0, high=0.0, size=(17, 15))

# print np.matrix(weight1)
for i in range(34):
    for j in range(17):
        weight1[i, j] = float(1 / (1.0 * 34))
for i in range(17):
    for j in range(17):
        weight2[i, j] = float(1 / (1.0 * 17))
for i in range(17):
    for j in range(15):
        weight3[i, j] = float(1 / (1.0 * 17))
classes = ["" for x in range(len(brown.categories()))]
count = 0
for i in brown.categories():
    classes[count] = i
    count += 1

print classes
output1 = [0.0] * 17
output2 = [0.0] * 17
output3 = [0.0] * 15
error1 = [0.0] * 17
error2 = [0.0] * 17
error3 = [0.0] * 15
index = -1
for f in brown.fileids():
    print "Fileids ", f
Exemple #46
0
from nltk.corpus import brown


def average_num_words_per_sentence(text, category):
    """returns the average number of words per sentence in a text by dividing number of words in a corpus by number of sentences in the corpus."""

    sent_num = len(text.sents(categories=category))
    word_num = len(text.words(categories=category))
    return word_num / sent_num


def average_num_letters(text, category):
    """finds the average number of letters per word in a corpus."""

    word_num = len(text.words(categories=category))
    smash_text = ''.join(text.words(categories=category))
    letters_len = len(smash_text)

    return letters_len / word_num


def ari(text, category):
    """Calculates the average readability of a text."""
    uw = average_num_letters(text, category)
    us = average_num_words_per_sentence(text, category)
    ari = (4.71 * uw) + (0.5 * us) - 21.43
    return ari


for category in brown.categories():
    print(category + ': ' + str(ari(brown, category)))

from nltk.corpus import webtext  # web text: more casual language samples
for fileid in webtext.fileids():
	print(fileid, webtext.raw(fileid)[:65], '...')

from nltk.corpus import nps_chat # instant messaging ... bit awk
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]

from nltk.corpus import brown
print (brown.categories())  # categories is a method; call it to list the categories


from nltk.corpus import brown
print (brown.categories())  #displays categories

print(brown.words(categories='news'))

print (brown.words(fileids=['cg23'])) #accessing specific documents

print (brown.sents(categories =['news' , 'editorial', 'reviews'])) # different categories


import nltk
from nltk.corpus import brown
fiction_text = brown.words(categories='fiction') #to easily access fiction
fdist = nltk.FreqDist(w.lower() for w in fiction_text)
qwords = ['what', 'when', 'where', 'who', 'why']
for i in qwords:
    print(i + ':', fdist[i], end=' ')  # count how many occur in this category; end=' ' makes it print all on one line
Exemple #48
0
for fileid in gutenberg.fileids():
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set(w.lower() for w in gutenberg.words(fileid)))
    avg_word_len = round(num_chars / num_words)
    avg_sent_len = round(num_words / num_sents)
    lexical_diversity = round(num_words / num_vocab)
    print(fileid, "  |  ", num_chars, "  |  ", num_words, "  |  ", num_sents,
          "  |  ", num_vocab, "  |  ", avg_word_len, "  |  ", avg_sent_len,
          "  |  ", lexical_diversity)

for fileid in webtext.fileids():
    print(fileid)

brown.categories()
brown.raw("cr09")

#stylistics - systematic differences between genres
# by use of modal verbs - [can could may might must will]
news_text = brown.words(categories='news')
hobbies_text = brown.words(categories='hobbies')
news_text_fdist = nltk.FreqDist(w.lower() for w in news_text)
hobbies_text_fdist = nltk.FreqDist(w.lower() for w in hobbies_text)
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m, ":", news_text_fdist[m], "  |  ", hobbies_text_fdist[m])

event_words = ["who", "what", "when", "where", "why"]
for m in event_words:
    print(m, ":", news_text_fdist[m], "  |  ", hobbies_text_fdist[m])
Exemple #49
0
def fun06():
    """fun06"""
    print brown.categories()
    print brown.words(categories='news')[:60]
    print brown.words(fileids=['cg22'])[:60]
    print brown.sents(categories=['news', 'editorial', 'reviews'])
Exemple #50
0
    print(field, webtext.raw(field)[:65], '...')

# chat text
from nltk.corpus import nps_chat

for field in nps_chat.fileids():
    print(field, nps_chat.posts(field)[:12])

chatroom = nps_chat.posts('10-19-20s_706posts.xml')
print("chatroom[123]= ", chatroom[123])

# 1.3. The Brown corpus: used to study systematic differences between genres (also known as stylistics)
from nltk.corpus import brown

show_subtitle("Distinguishing texts by categories")
print("brown.categories() =", brown.categories())
print("brown.words(categories='news')= ", brown.words(categories='news'))
print("brown.words(categories=['news', 'editorial', 'reviews'])= ",
      brown.words(categories=['news', 'editorial', 'reviews']))
print("brown.sents(categories=['news', 'editorial', 'reviews'])= ",
      brown.sents(categories=['news', 'editorial', 'reviews']))

show_subtitle("Distinguishing texts by fileids")
print("brown.words(fileids='cg22')= ", brown.words(fileids='cg22'))

news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print(m + ':', fdist[m], end=', ')
Exemple #51
0
def exercise29():
    """
    Readability measures are used to score the reading difficulty of a text, for the
    purposes of selecting texts of appropriate difficulty for language learners. Let us
    define μw to be the average number of letters per word, and μs to be the average
    number of words per sentence, in a given text. The Automated Readability Index (ARI)
    of the text is defined to be: 4.71 μw + 0.5 μs - 21.43. Compute the ARI score for
    various sections of the Brown Corpus, including section f (lore) and j (learned).
    Make use of the fact that nltk.corpus.brown.words() produces a sequence of words,
    while nltk.corpus.brown.sents() produces a sequence of sentences.
    """
    for category in brown.categories():
        print(category + ':' + str(ari(brown, category)))
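# ari() is not defined in this snippet; the following is a minimal assumed helper,
# consistent with the ARI formula in the docstring above and with the ari() shown
# in Exemple #46 earlier on this page.
def ari(text, category):
    words = text.words(categories=category)
    sents = text.sents(categories=category)
    mu_w = len(''.join(words)) / len(words)  # average letters per word
    mu_s = len(words) / len(sents)           # average words per sentence
    return 4.71 * mu_w + 0.5 * mu_s - 21.43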
Exemple #52
0
import nltk

nltk.download()

# 1. brown corpus
# 2. Inaugural speech
# 3. book corpus - frequency distribution can be done(most common words in textbook)
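# Hedged illustration of point 3 above (not in the original notebook): a frequency
# distribution of the most common words in a book-style corpus. Assumes the Gutenberg
# corpus is available; the file id below is just an example choice.
from nltk.corpus import gutenberg
fdist_book = nltk.FreqDist(w.lower() for w in gutenberg.words('austen-emma.txt'))
print(fdist_book.most_common(10))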

# In[2]:

from nltk.corpus import brown

# In[4]:

brown.categories()

# In[5]:

print(type(brown))

# In[10]:

brown.words(categories="adventure")[:100]

# In[11]:

len(brown.words(categories="adventure"))

# In[13]:
#!/usr/bin/python3
# coding: utf-8
# Brown Corpus: the Brown Corpus of Standard American English is considered the first general English corpus usable for computational linguistics
#     It contains one million words of American English text published in 1961; it is a sample of general English drawn from fiction, news and religious texts; a part-of-speech tagged version was later produced after extensive manual annotation
from nltk.corpus import brown
print(len(brown.fileids()))  # 500 documents
print(brown.fileids()[:5])  # ['ca01', 'ca02', 'ca03', 'ca04', 'ca05']
print(len(brown.words()))  # 1161192; 1161192 words in total
print(brown.words()[:5])  # ['The', 'Fulton', 'County', 'Grand', 'Jury']; print the first 5 words
print(len(brown.words('ca01')))  # 2242; a single document is fairly small
##################################################################
## tagged data
print(brown.tagged_words()[:3])  # [('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL')]; tags of the first 3 words
##################################################################
## categories
print(len(brown.categories()))  # 15 categories
print(brown.categories())  # ['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']
print(len(brown.words(categories='news')))  # 100554; words in a single category
print(len(brown.sents(categories=['news', 'editorial', 'reviews'])))  # 9371
# brown includes both tagged and untagged data
print(len(brown.words()))  # 1161192
print(len(brown.words(categories=brown.categories())))  # 1161192; every word belongs to some category
##################################################################
## paths
print(brown.abspath('ca01'))  # /home/coder352/nltk_data/corpora/brown/ca01
print(brown.abspaths())  # paths of all documents
##################################################################
## types
print(type(brown))  # <class 'nltk.corpus.reader.tagged.CategorizedTaggedCorpusReader'>
print(type(brown.words()))  # <class 'nltk.corpus.reader.util.ConcatenatedCorpusView'>
print(type(brown.words('ca01')))  # <class 'nltk.corpus.reader.tagged.TaggedCorpusView'>
Exemple #54
0
from nltk.corpus import brown

#Introduction to Brown Corpus
print(brown.categories())

#Accessing words to Brown Corpus

print(brown.words(categories='lore'))

#Introduction to Conditional Frequency Distribution
from nltk import ConditionalFreqDist  #imports statement

# pair_list [ (condition, word) ]
pair_list = [(category, word) for category in brown.categories()
             for word in brown.words(categories=category)]

print(pair_list[:10])

freqdist = ConditionalFreqDist(pair_list)
print(freqdist['lore']['the'])

#Conditional Method

#tabulate functions

category = ['adventure', 'lore', 'news']
samples = ['the', 'and', 'man']
freqdist.tabulate(conditions=category, samples=samples)
Exemple #55
0
    num_chars = len(gutenberg.raw(fileid))
    num_words = len(gutenberg.words(fileid))
    num_sents = len(gutenberg.sents(fileid))
    num_vocab = len(set([w.lower() for w in gutenberg.words(fileid)]))
    print int(num_chars/num_words), int(num_words/num_sents), int(num_words/num_vocab), fileid

# sentence segmentation
macbeth_sentences = gutenberg.sents('shakespeare-macbeth.txt')
longest_len = max([len(s) for s in macbeth_sentences])
# web chat corpora
from nltk.corpus import webtext
from nltk.corpus import nps_chat
chatroom = nps_chat.posts('10-19-20s_706posts.xml')
chatroom[123]
from nltk.corpus import brown
brown.categories()
brown.sents(categories=['news', 'editorial', 'reviews'])
news_text = brown.words(categories='news')
fdist = nltk.FreqDist([w.lower() for w in news_text])
modals = ['can', 'could', 'may', 'might', 'must', 'will']
for m in modals:
    print m + ':', fdist[m]


cfd = nltk.ConditionalFreqDist(
    (genre, word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
modals = ['can', 'could', 'may', 'might', 'must', 'will']
cfd.tabulate(conditions=genres, samples=modals)
Exemple #56
0
def main():
    tfidf = None
    word2vec = None
    similarityMatrix = None
    browndict = {}
    corporadict = None

    word2vec = None

    choice = ""
    while choice != "exit":
        choice = ""
        while choice not in ["tfidf", "word2vec", "exit"]:
            choice = input(
                "TF-IDF or Word2Vec? [TFIDF, Word2Vec, Exit]\n>").lower()

        if choice == "exit":
            break

        catType = ""
        while catType not in ["within", "between", "return"]:
            catType = input(
                "Within or between clusters? [Within, Between, Return]\n>"
            ).lower()

        if catType == "return":
            break

        # get all of the words for each document per category
        texts = []
        if catType == "within":
            for c in brown.categories():
                words = NormalizeWords(brown.words(categories=c))
                texts.append(words)
                # build a dictionary for me to use later
                browndict[c] = words
        elif catType == "between":
            for c in brown.categories():
                words = NormalizeWords(brown.words(categories=c))
                texts.append(words[:len(words) // 2])
                texts.append(words[len(words) // 2:])
                # build a dictionary for me to use later
                browndict[c + "1/2"] = words[:len(words) // 2]
                browndict[c + "2/2"] = words[len(words) // 2:]

        # create the corpora dictionary built from gensim
        corporadict = corpora.Dictionary(texts)
        # create a corpus for the training
        corpus = []
        for line in texts:
            corpus.append(corporadict.doc2bow(line))

        if choice == "tfidf":
            # create the tfidf model from our built corpus
            tfidf = TfidfModel(corpus=corpus)

            # build the similarity matrix
            similarityMatrix = MatrixSimilarity(corpus,
                                                num_features=len(corporadict))
        elif choice == "word2vec":
            word2vec = Word2Vec(brown.sents())

            # build term similiarity matrix from our models word-vector
            termSimilarityIndex = WordEmbeddingSimilarityIndex(word2vec.wv)

            # build sparse similarity matrix
            sparseSimiliarityMatrix = SparseTermSimilarityMatrix(
                termSimilarityIndex, corporadict)

            # build similarity word-vector
            WV_SimilarityMatrix = SoftCosineSimilarity(
                corpus, sparseSimiliarityMatrix)

        maxes = {}
        if choice == "tfidf":
            # Print out the code
            keys = list(browndict.keys())
            for i in range(len(keys) - 1):
                # Convert to a bag of words and to a tfidf vector, then query it.
                query_bow = corporadict.doc2bow(browndict[keys[i]])
                query_tfidf = tfidf[query_bow]

                # Get the similarity of every cluster
                query_similarity = similarityMatrix[query_tfidf]
                for j in range(i + 1, len(query_similarity)):
                    sim = query_similarity[j]
                    print(keys[i], "and", keys[j], "have a similarity of:",
                          sim)
                print("")
        elif choice == "word2vec":
            keys = list(browndict.keys())
            for i in range(len(keys) - 1):
                # Convert to a bag of words and query it
                query_bow = corporadict.doc2bow(browndict[keys[i]])

                # Get the similarity of every cluster
                query_similarity = WV_SimilarityMatrix[query_bow]
                for j in range(i + 1, len(query_similarity)):
                    sim = query_similarity[j]
                    print(keys[i], "and", keys[j], "have a similarity of:",
                          sim)
                print("")
Exemple #57
0
# -*- coding: utf-8 -*-
"""
Created on Tue Oct 23 15:19:53 2018

@author: kkonakan
"""

import nltk.corpus as nc
from nltk.corpus import brown

fname = 'milton-paradise.txt'
guten = nc.gutenberg
print(guten.fileids())
print(len(guten.raw(fname)))
print(guten.sents(fname)[0:5])
print(guten.words(fname)[0:10])
print(brown.categories())
import nltk
import random
from nltk.corpus import brown

#All of the different categories that are tagged in the corpus
print(brown.categories())

#A random sentence from the "humor" category
total = []
sentences = brown.sents(categories='humor')
for sentence in sentences:
    sentence = ' '.join(sentence)
    total.append(sentence)
print('\n', random.choice(total))

#The category with the most words
categ_count = {}
count = []

for category in brown.categories():
    words = brown.words(categories=category)
    count.append(len(words))
    categ_count[category] = len(words)
count = sorted(count)

for category in categ_count:
    if categ_count[category] == count[-1]:
        print('\n', category)
Exemple #59
0
from nltk.corpus import stopwords
from cPickle import dump
from pprint import pprint

train_dict = nltk.defaultdict(list)
test_dict = nltk.defaultdict(list)

def FDtoDIC(fd):
    out_dict = nltk.defaultdict(float)
    for key in fd.keys():
        out_dict[key] = fd[key]
    out_dict['N'] = fd.N()
    return out_dict


for category in set(brown.categories()).\
    difference(set(['humor', 'science_fiction'])):
    cat_files = brown.fileids(categories=category)
    random.shuffle(cat_files)
    size = int(len(cat_files) * 0.85)
    train, test = cat_files[:size], cat_files[size:]
    key_list = []
    for f in train:
        temp = brown.open(f).read().split()
        temp = [entry.split('/')[0] for entry in temp]
        temp = [entry for entry in temp if entry \
                not in stopwords.words('english')]
        train_dict[category].append(FDtoDIC(nltk.FreqDist(temp)))
        key_list.extend(train_dict[category][-1].keys())
    # compute the average sample for the given category
    key_list = set(key_list)
Exemple #60
0
# Module 3: Corpus
# Corpus structure challenge

from nltk.corpus import brown

# print(brown.fileids())

fileid = 'cl08'

# text = brown.words(fileid)
# print(text)

print(" Num of chars :", len(brown.raw(fileid)))
print(" Num of words :", len(brown.words(fileid)))
print(" Num of sentences :", len(brown.sents(fileid)))

print(" Categories:", brown.categories(fileid))