from nltk.corpus import state_union


def chapter2_exercise4():
    # Read in the texts of the State of the Union addresses, using the
    # state_union corpus reader. Count occurrences of men, women, and people
    # in each document. What has happened to the usage of these words over time?
    files = state_union.fileids()
    men = dict()
    women = dict()
    people = dict()
    for index, file in enumerate(files):
        words = state_union.words(fileids=[file])
        men[file] = words.count("men")
        women[file] = words.count("women")
        people[file] = words.count("people")
        print(file[:4], men[file], women[file], people[file], end=" ")
        if index % 6 == 5:
            print()
    print("\nMEN")
    for file, men_c in men.items():
        print(file[:4], men_c)
    print("\nWOMEN")
    for file, women_c in women.items():
        print(file[:4], women_c)
    print("\nPEOPLE")
    for file, people_c in people.items():
        print(file[:4], people_c)
    print("men:", sum(men.values()))
    print("women:", sum(women.values()))
    print("people:", sum(people.values()))
import nltk
from nltk.corpus import state_union


def state_union_men_stat():
    cfd = nltk.ConditionalFreqDist(
        (target, year[:4])
        for year in state_union.fileids()
        for w in state_union.words(year)
        for target in ['men', 'women', 'people']
        if w.lower().startswith(target))
    cfd.plot()
from nltk.corpus import state_union


def all_documents():
    documents = []
    for document in state_union.fileids():
        text = ""
        for word in state_union.words(document):
            text = text + " " + word
        documents.append((text, extract_president(document)))
    return documents
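# extract_president() is not defined in this snippet; a minimal sketch,
# assuming fileids shaped like "1945-Truman.txt" or "2001-GWBush-2.txt",
# where the president is the first purely alphabetic dash-separated part:
def extract_president(fileid):
    for part in fileid.split('.')[0].split('-'):
        if part.isalpha():
            return part
    return ""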
from nltk.corpus import state_union


def init():
    train = []
    test = []
    filenames = state_union.fileids()
    for i in range(len(filenames)):
        if i % 2 == 0:
            train.append(filenames[i])
        else:
            test.append(filenames[i])
    return (train, test)
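# The same alternating train/test split, written with slice steps; an
# equivalent sketch of init() above, not part of the original:
def init_sliced():
    filenames = state_union.fileids()
    return (filenames[0::2], filenames[1::2])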
import spacy
from nltk.corpus import state_union


def main():
    # the 'en' shortcut works on older spaCy releases; newer ones expect a
    # full model name such as 'en_core_web_sm'
    nlp = spacy.load('en')
    text = ''
    for file in state_union.fileids():
        text += state_union.raw(file)
    result_dictionary = bigram_text(text, nlp)
    i = 0
    for occurrences, bigram in result_dictionary.items():
        print(bigram, occurrences)
        i = i + 1
        if i > 100:
            break
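# bigram_text() is defined elsewhere in that project; a minimal sketch of a
# comparable helper, assuming it maps each adjacent token pair to its count
# (the name and the bigram -> count orientation are assumptions; the loop
# above unpacks items() the other way around):
from collections import Counter


def bigram_text_sketch(text, nlp):
    # nlp.max_length may need raising for the full concatenated corpus
    tokens = [t.text.lower() for t in nlp(text) if not t.is_space]
    return Counter(zip(tokens, tokens[1:]))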
import nltk


def ex4():
    from nltk.corpus import state_union
    tags = ["men", "women", "people"]
    # for fileid in state_union.fileids():
    #     words = state_union.words(fileid)
    #     fdist = nltk.FreqDist([w.lower() for w in words])
    #     print fileid + ": ",
    #     for tag in tags:
    #         print tag + "=" + str(fdist[tag]) + " ",
    #     print
    cfd = nltk.ConditionalFreqDist((target, fileid[0:4])
                                   for fileid in state_union.fileids()
                                   for w in state_union.words(fileid)
                                   for target in tags
                                   if w.lower() == target)
    cfd.plot()
import glob
from nltk.corpus import brown, reuters, state_union
from nltk.tokenize import word_tokenize

path = dir_path + "/*.txt"
list_txt = glob.glob(path)
all_toks_china = list()
for txt in list_txt:
    file_y = open(txt).read()
    tokens = word_tokenize(file_y)
    all_toks_china = all_toks_china + tokens

brown_cats = brown.categories()
all_toks_brown = list()
reuters_cats = reuters.categories()
all_toks_reuters = list()
state_union_cats = state_union.fileids()
all_toks_state_union = list()
complete_toks = list()

linux_words = open("../ref/words").read().split('\n')
linux_set = set(linux_words)

for cat in brown_cats:
    words = brown.words(categories=cat)
    tokens = [w.lower() for w in words]
    all_toks_brown = all_toks_brown + tokens
    complete_toks = complete_toks + tokens
for cat in reuters_cats:
    words = reuters.words(categories=cat)
    return entity_names  # end of extract_entity_names(), whose body is truncated above


def extract_entities(taggedText):
    '''
    Create map with entity and their counts
    :param taggedText: Parsed text (output of ne chunker) in tree form
    :return: dict of entities and their freq counts
    '''
    entity_names = []
    for tree in taggedText:
        entity_names.extend(extract_entity_names(tree))
    return entity_names


# get year and words for each file
extracted = [(state_union.raw(fileid), int(fileid[:4]))
             for fileid in state_union.fileids()]
docs, years = zip(*extracted)

# break text down into sentences, tokens
tokens = [nltk.word_tokenize(text) for text in docs]
sents = [nltk.sent_tokenize(text.replace("\n", " ")) for text in docs]
senttokens = [[nltk.word_tokenize(sent) for sent in entry] for entry in sents]

# get counts of unique words and plot over time
unique = [len(set(words)) for words in tokens]
plt.scatter(years, unique)
plt.show()

# get unique/total ratio
ratios = [float(len(set(words))) / float(len(words)) for words in tokens]
plt.scatter(years, ratios)
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908
>>> # problem 1
>>> from nltk.corpus import state_union
>>> cfd = nltk.ConditionalFreqDist((text, word)
...                                for text in state_union.fileids()
...                                for word in state_union.words(fileids=text))
>>> text = state_union.fileids()
>>> contexts = ['men', 'women', 'people']
>>> cfd.tabulate(conditions=text, samples=contexts)
                    men women people
    1945-Truman.txt   2     2     10
    1946-Truman.txt  12     7     49
    1947-Truman.txt   7     2     12
    1948-Truman.txt   4     1     22
    1949-Truman.txt   2     1     15
    1950-Truman.txt   6     2     15
    1951-Truman.txt   8     2      9
1953-Eisenhower.txt   3     0     17
1954-Eisenhower.txt   2     0     15
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Jun 13 11:30:48 2018

@author: vpapg
"""

# Read in the texts of the State of the Union addresses, using the state_union
# corpus reader. Count occurrences of men, women, and people in each document.
# What has happened to the usage of these words over time?

from nltk import ConditionalFreqDist, Text
from nltk.corpus import state_union

text = state_union.words()
print("Men:", text.count("men"))
print("Women:", text.count("women"))
Text(text).dispersion_plot(["men", "women"])

cfd = ConditionalFreqDist((target, fileid)
                          for fileid in state_union.fileids()
                          for w in state_union.words(fileid)
                          for target in ['men', 'women']
                          if w.lower().startswith(target))
cfd.plot()

# The word 'women' appears more in recent documents, so its usage increases
# over time
# Importing NLTK and downloading: tokenizer, tagger, stopwords, corpus
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords, state_union
from nltk.tokenize import word_tokenize
from nltk.collocations import TrigramCollocationFinder
from nltk import pos_tag

nltk.download('state_union')
nltk.download('stopwords')
nltk.download('tagsets')

# make corpusList ready
corpusList = []
for i in range(len(state_union.fileids())):
    corpusList.append(state_union.raw(state_union.fileids()[i]))

# concatenate all raw texts within corpusList
allTexts = " ".join(corpusList)

# get English stop words
stop_words = set(stopwords.words('english'))

# tokenize
tokens = word_tokenize(allTexts)

# tag tokens
tagged = pos_tag(tokens)

# convert tagged tuple into dataframe for the ease of manipulation
import sys
from nltk.corpus import gutenberg, state_union
from sklearn.feature_extraction.text import TfidfVectorizer

print 'Number of arguments:', len(sys.argv), 'arguments.'
print 'Argument List:', str(sys.argv)

phrase = sys.argv[1]
corpora = sys.argv[2]
corpus = []

# Check corpus
if corpora == "gutenberg":
    titles = gutenberg.fileids()
    for title in titles:
        corpus.append(gutenberg.raw(title))
elif corpora == "state_union":
    titles = state_union.fileids()
    for title in titles:
        corpus.append(state_union.raw(title))
else:
    print "Choose from gutenberg or state_union"
    exit(0)

vectorizer = TfidfVectorizer(min_df=1, stop_words="english")
X = vectorizer.fit_transform(corpus)
XA = X.toarray()
# print vectorizer.vocabulary_
print 'The dimensions of the TF.IDF matrix: '
print XA.shape
print 'TF.IDF computation for the ' + corpora + ' corpus is completed\n'
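# A short follow-on sketch: list the highest-weighted terms of the first
# document from the fitted vectorizer. get_feature_names() is the older
# scikit-learn spelling; newer releases use get_feature_names_out().
import numpy as np
feature_names = vectorizer.get_feature_names()
top = np.argsort(XA[0])[-10:][::-1]
for idx in top:
    print feature_names[idx], XA[0][idx]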
from nltk.corpus import state_union


def get_text():
    text = ''
    for file in state_union.fileids():
        text += state_union.raw(file)
    return text
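# Example use of get_text() (a sketch): the concatenated corpus is one long
# string, suitable for feeding a tokenizer in a single pass.
text = get_text()
print(len(text), "characters across", len(state_union.fileids()), "files")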
import nltk
from nltk.corpus import state_union

test = [fid for fid in state_union.fileids() if 'Johnson' in fid]
train = [fid for fid in state_union.fileids() if fid not in test]

print 'TEST:', ', '.join(test)
f = open('sou.test.txt', 'w')
for w in state_union.words(test):
    print>>f, w
f.close()

f = open('sou.norm.test.txt', 'w')
for s in state_union.sents(test):
    s = ' '.join(s).lower()
    s = s.replace("' s ", "'s ").replace(' .', '.')
    s = ' '.join(nltk.word_tokenize(s))
    print>>f, s
f.close()

print 'TRAIN:', ', '.join(train)
f = open('sou.train.txt', 'w')
for w in state_union.words(train):
    print>>f, w
f.close()

f = open('sou.norm.train.txt', 'w')
for s in state_union.sents(train):
    s = ' '.join(s).lower()
    s = s.replace("' s ", "'s ").replace(' .', '.')
    s = ' '.join(nltk.word_tokenize(s))
23. RP   Particle
24. SYM  Symbol
25. TO   to
26. UH   Interjection
27. VB   Verb, base form
28. VBD  Verb, past tense
29. VBG  Verb, gerund or present participle
30. VBN  Verb, past participle
31. VBP  Verb, non-3rd person singular present
32. VBZ  Verb, 3rd person singular present
33. WDT  Wh-determiner
34. WP   Wh-pronoun
35. WP$  Possessive wh-pronoun
36. WRB  Wh-adverb
'''

import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer, sent_tokenize, word_tokenize

state_union.fileids()
text = state_union.raw('2006-GWBush.txt')
train_text = state_union.raw('2005-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized1 = custom_sent_tokenizer.tokenize(text)
tagged1 = []
tokenized2 = sent_tokenize(text)
tagged2 = []

for sent in tokenized1:
    words = word_tokenize(sent)
    tagged = nltk.pos_tag(words)
    tagged1.append(tagged)
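# A quick comparison sketch (not in the original): the Punkt model trained on
# the 2005 speech and the default sentence tokenizer can split the same text
# differently.
print(len(tokenized1), "sentences from the trained tokenizer")
print(len(tokenized2), "sentences from sent_tokenize")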
# -*- coding: utf-8 -*-
import matplotlib
matplotlib.use('TkAgg')
import nltk
'''
☼ Read in the texts of the State of the Union addresses, using the
state_union corpus reader. Count occurrences of men, women, and people in
each document. What has happened to the usage of these words over time?
'''
from nltk.corpus import state_union

# print state_union.fileids()
targets = ['men', 'women', 'people']
pair = [(target, fileid[:4])
        for fileid in state_union.fileids()
        for word in state_union.words(fileid)
        for target in targets
        if word.lower() == target]
print pair
cfd = nltk.ConditionalFreqDist(pair)
cfd.plot()
# Assignment: 03
# Due Date: January 31st, 2018

import nltk

# Number 1 (2.4) in HW3
print('################ Number 1 ################')

# Generating list for each of the words through time
from nltk.corpus import state_union as su

total = []
men = []
women = []
people = []
for s in su.fileids():
    length_women = 0
    length_men = 0
    length_people = 0
    length = 0
    for w in su.words(s):
        if w.lower() == 'women':
            length_women += 1
            length += 1
        elif w.lower() == 'men':
            length_men += 1
            length += 1
        elif w.lower() == 'people':
            length_people += 1
            length += 1
    total.append(length)
from nltk.corpus import inaugural, state_union
from nltk.lm import Vocabulary

president_vocabulary = {}
for president in inaugural.fileids():
    vocab = Vocabulary(inaugural.words(president), unk_cutoff=2)
    president_vocabulary[president] = len(vocab)

inverse_vocabulary = [(value, key) for key, value in president_vocabulary.items()]
print(max(inverse_vocabulary)[1],
      max(inverse_vocabulary)[0])  # richest vocabulary: Harrison in 1841
print(min(inverse_vocabulary)[1],
      min(inverse_vocabulary)[0])  # poorest vocabulary: Washington in 1793

president_vocabulary_state_union = {}
for president in state_union.fileids():
    vocab = Vocabulary(state_union.words(president), unk_cutoff=2)
    president_vocabulary_state_union[president] = len(vocab)

inverse_vocabulary_state_union = [
    (value, key) for key, value in president_vocabulary_state_union.items()
]
print(max(inverse_vocabulary_state_union)[1],
      max(inverse_vocabulary_state_union)[0])  # richest vocabulary: Truman in 1946
print(min(inverse_vocabulary_state_union)[1],
      min(inverse_vocabulary_state_union)[0])  # poorest vocabulary: Johnson in 1963
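# A minimal sketch of what unk_cutoff=2 does above: tokens seen fewer than
# two times fall below the cutoff, so len() counts only the words at or above
# it plus the <UNK> label.
v = Vocabulary(["a", "a", "b"], unk_cutoff=2)
print(len(v))  # 2: "a" and "<UNK>" ("b" is below the cutoff)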
import nltk
from nltk.corpus import state_union


def state_union_ts(word_list):
    cfd = nltk.ConditionalFreqDist((word.lower(), fileid[:4])
                                   for fileid in state_union.fileids()
                                   for word in state_union.words(fileid)
                                   if word.lower() in word_list)
    return cfd
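# Example use (a sketch): the returned CFD has one condition per word, keyed
# by year, so it can be tabulated or plotted directly.
cfd = state_union_ts(['men', 'women', 'people'])
cfd.plot()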
from nltk.corpus import gutenberg
gutenberg.fileids()
gutenberg.words('austen-emma.txt')  # word tokens
len([w.lower() for w in gutenberg.words('austen-emma.txt') if w.isalpha()])  # words
len(list(set([w.lower() for w in gutenberg.words('austen-emma.txt') if w.isalpha()])))

# 3
from nltk.corpus import brown
brown.categories()
brown.words(categories='science_fiction')

# 4
from nltk.corpus import state_union
state_union.fileids()
words = ['men', 'women', 'people']
from nltk import ConditionalFreqDist
cfd = ConditionalFreqDist([(word, fileid)
                           for fileid in state_union.fileids()
                           for word in state_union.words(fileid)])
cfd.plot(conditions=words)

# 5
word = 'life'
from nltk.corpus import wordnet as wn
for syn in wn.synsets(word):
    for mer in syn.part_meronyms():
        print("Synset '{2}':\n\t{0}\n\npart meronym '{1}':\n\t{3} ".format(
            syn.definition(), mer.lemma_names()[0], syn.lemma_names()[0],
            mer.definition()))
    for mer in syn.member_meronyms():
        print("Synset '{2}':\n\t{0}\n\nmember meronym '{1}':\n\t{3} ".format(
            syn.definition(),
import nltk
from nltk.corpus import state_union


def question1():
    a = nltk.ConditionalFreqDist((x, id[:4])
                                 for id in state_union.fileids()
                                 for w in state_union.words(id)
                                 for x in ['men', 'women', 'people']
                                 if w.lower().startswith(x))
    a.plot()
from nltk.corpus import state_union
from nltk.tokenize import word_tokenize

# pres_avg_length = {}


def getPresFromSpeech(speech_id):
    # e.g. 2001-GWBush-1.txt
    words = speech_id.split('.')
    if len(words) > 0:
        single_words = words[0].split('-')
        if len(single_words) > 0:
            for word in single_words:
                if word.isalpha():
                    return word
    return ""


all_words = {}
for speech_id in state_union.fileids():
    text = state_union.raw(speech_id)
    words = word_tokenize(text)
    for word in words:
        if word not in all_words:
            all_words[word] = 1
        else:
            all_words[word] += 1

sent_len = []
word_len = []
pres_list = []
pres_sent_total = {}
pres_word_total = {}
pres_char_total = {}
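# Example (a sketch): the helper keeps only the alphabetic dash-separated
# part of the fileid.
print(getPresFromSpeech("2001-GWBush-1.txt"))  # -> GWBush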
# 4
import nltk
from nltk.corpus import state_union

for speech in state_union.fileids():
    words = state_union.words(fileids=[speech])
    fdist = nltk.FreqDist(w.lower() for w in words)
    print(speech)
    print("she: ", fdist["she"])
    print("he: ", fdist["he"])
    print("people: ", fdist["people"])
def tabulate(cfdist, words, categories):
    print('%-16s' % 'Category', end=' ')
    for word in words:
        print('%6s' % word, end=' ')
    print()
    for category in categories:
        print('%-16s' % category, end=' ')
        for word in words:
            print('%6d' % cfdist[category][word], end=' ')
        print()


cfd = nltk.ConditionalFreqDist(
    (fileid, word)
    for fileid in state_union.fileids()
    for word in state_union.words(fileid))


# In[47]:

tabulate(cfd, ['men', 'women', 'people'], state_union.fileids())


# In[55]:

# 5. Investigate the holonym-meronym relations for some nouns. Remember that
# there are three kinds of holonym-meronym relation, so you need to use:
# member_meronyms(), part_meronyms(), substance_meronyms(),
# member_holonyms(), part_holonyms(), and substance_holonyms().

wordnet.synset('book.n.01').part_holonyms()
wordnet.synset('book.n.01').substance_holonyms()
wordnet.synset('book.n.01').member_holonyms()
print(tempPhrase[-4:])
print(sorted(w.lower() for w in set(tempPhrase)))  # a plain sort puts capital letters first

# 2 Use the corpus module to explore austen-persuasion.txt. How many word
# tokens does this book have? How many word types?
austen_persuasion = gutenberg.words('austen-persuasion.txt')
print("Number of word tokens = ", len(austen_persuasion))
print("Number of word types = ", len(set(austen_persuasion)))

# 3 Use the Brown corpus reader nltk.corpus.brown.words() or the Web text
# corpus reader nltk.corpus.webtext.words() to access some sample text in
# two different genres.
print(brown.categories())
news_data = brown.words(categories='news')
religion_data = brown.words(categories='religion')

# 4 Read in the texts of the State of the Union addresses, using the
# state_union corpus reader. Count occurrences of men, women, and people in
# each document. What has happened to the usage of these words over time?
print(state_union.fileids())
# cfd for State of the Union speeches showing the count of the target words
# in each year's speech
cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in state_union.fileids()
                               for w in state_union.words(fileid)
                               for target in ['men', 'women']
                               if w.lower().startswith(target))
# cfd.plot()

# 5 Investigate the holonym-meronym relations for some nouns. Remember that
# there are three kinds of holonym-meronym relation, so you need to use:
# member_meronyms(), part_meronyms(), substance_meronyms(),
# member_holonyms(), part_holonyms(), and substance_holonyms().
house = wn.synsets('house')
print(house)
house = wn.synset('house.n.01')
print(house.lemma_names())
print(house.definition())
print(house.examples())
for word in words:
    if word[:2] == "sh":
        print(word, end=" ")
print("\n")

# b
print("Words longer than 4 characters:")
for word in words:
    if len(word) > 4:
        print(word, end=" ")
print("\n")

# Exercise 2
# a
files = list(state_union.fileids())
terms = ["men", "women", "people"]
statistics = nltk.ConditionalFreqDist((file, word)
                                      for file in state_union.fileids()
                                      for word in state_union.words(file)
                                      for term in terms
                                      if word.lower() == term)
statistics.tabulate(conditions=files, samples=terms)

# b
years_raw = sorted(set(int(year[:4]) for year in state_union.fileids()))
years = [str(year) for year in years_raw]
year_statistics = nltk.ConditionalFreqDist(
    (word.lower(), fileid[:4])
    for fileid in state_union.fileids()
    for word in state_union.words(fileid)
    for term in terms
import nltk
from nltk.corpus import state_union

# Plot usage of words over time
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men', 'women', 'people']
    if w.lower().startswith(target))
cfd.plot()
# In[1]:

# NLTK imports
import nltk
from nltk.corpus import webtext
from nltk.corpus import state_union
import numpy as np

nltk.download('state_union')
nltk.download('stopwords')
nltk.download('punkt')

print("\n\n")
print('The files are: ')
print(state_union.fileids())


# # TF.IDF Representation
# Computing the TF.IDF value of each word of each text in the corpus

# In[ ]:


# Compute the TF value of each word from a bag of words (bow)
def computeTF(wordDict, bow):
    tfDict = {}
    bowCount = len(bow)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bowCount)
    return tfDict
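# In[ ]:


# A companion sketch for the IDF side, assuming the same dict-of-counts bags
# of words; this computeIDF() is an assumption, not code from the original
# notebook:
import math


def computeIDF(docList):
    # document frequency: in how many bags each word appears
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for doc in docList:
        for word, count in doc.items():
            if count > 0:
                idfDict[word] += 1
    # inverse document frequency
    for word, df in idfDict.items():
        idfDict[word] = math.log(len(docList) / float(df))
    return idfDict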
# Author: Jack Keane
# Date: 3/25/20
# Description: Convert state of the union speeches into csv

# Libraries
from nltk.corpus import state_union
from nltk.tokenize import sent_tokenize
import string

# Code
speeches = state_union.fileids()
f = open("../acronym_data/state_union_data.csv", "w")
for s in speeches:
    speech = state_union.raw(s)
    sentences = sent_tokenize(speech.lower().replace("\n", " "))
    for sen in sentences:
        f.write(
            sen.translate(str.maketrans('', '', string.punctuation)) + "\n")
f.close()
# read texts from the State of the Union addresses using the state_union module
# determine the frequency of use of the words "men", "women", "people" in each document
import nltk
from nltk.corpus import state_union

state_files = state_union.fileids()
words = ['men', 'women', 'people']
cfd = nltk.ConditionalFreqDist(
    (text, word)
    for text in state_files
    for word in state_union.words(text))
cfd.tabulate(conditions=state_files, samples=words)

cfd = nltk.ConditionalFreqDist((target, fileid[:4])
                               for fileid in state_files
                               for word in state_union.words(fileid)
                               for target in words
                               if word.lower().startswith(target))
cfd.plot()

# analyze the frequency chart of modal verbs for different genres
# find other word use classes that also differ in different genres
import nltk
import nltk.corpus

corpus_name = nltk.corpus.brown
files = corpus_name.fileids()
modals = ['can', 'could', 'may', 'might', 'must', 'will']
commons = ['the', 'be', 'to', 'of', 'and', 'in', 'that']
adjectives = ['good', 'new', 'first', 'last', 'long']
genres = ['news', 'religion', 'hobbies', 'science_fiction', 'romance', 'humor']
cfd = nltk.ConditionalFreqDist((genre, word)
import nltk
from nltk.corpus import state_union
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from gensim import corpora, models

_files_all_speechs = state_union.fileids()
all_raw_speeches = []
for _file_ in _files_all_speechs:
    all_raw_speeches.append(state_union.raw(_file_))
# print('Number of Speeches:', len(all_raw_speeches))

all_categories = [x.split('-')[1].split('.')[0] for x in _files_all_speechs]
# print(all_categories)

stopwords = nltk.corpus.stopwords
eng_stopwords = stopwords.words('english')
wordnet_lemmatizer = WordNetLemmatizer()


def basic_preprocessing(text):
    text = text.lower()  # lowering
    text = re.sub(r'\[.*?\]', '', text)  # remove citation-style brackets
    text = word_tokenize(text)
    text = [word for word in text if word not in eng_stopwords]  # remove stop words
    text = [word for word in text if len(word) > 1]  # remove single-character tokens
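# Example use (a sketch), assuming basic_preprocessing() ends by returning
# the token list: build the gensim dictionary and BoW corpus for the models
# imported above.
processed = [basic_preprocessing(raw) for raw in all_raw_speeches]
dictionary = corpora.Dictionary(processed)
bow_corpus = [dictionary.doc2bow(doc) for doc in processed]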
len(persuasion)
len(set(persuasion))

# 3.
from nltk.corpus import brown
brown.fileids()
brown.categories()
brown.words(categories='adventure')

# 4.
from nltk.corpus import state_union
text = state_union.words()
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in ['men', 'women']
    if w.lower().startswith(target))
cfd.plot()

# 5.
wn.synset('fish.n.01').part_meronyms()
wn.synset('fish.n.01').member_meronyms()
wn.synset('leaf.n.01').substance_meronyms()
wn.synset('fish.n.01').member_holonyms()
wn.synset('leaf.n.01').substance_holonyms()

# 6. cannot translate among 3 languages at a time; loop to solve
from nltk.corpus import swadesh
def __init__(self):
    self.number_id = 40
    self.source_id = "state_union"
    self.titles = [name for name in state_union.fileids()]
    self.data = [state_union.raw(name) for name in self.titles]
    # Gives an accuracy of 88% on test data
    return [clf_bern, clf_tree, vectorizer]


[clf_bern, clf_tree, vectorizer] = train_questions()


# this method classifies to which category a new question belongs
def classify_question(question, vectorizer=vectorizer, clf=clf_bern):
    b = vectorizer.transform([question])
    b = b.toarray()
    return clf.predict(b)[0]


# loading the data set of different minutes of the meets
lisa = state_union.fileids()
dataset = []
for ele in lisa:
    dataset.append(state_union.raw(ele))
for i in range(len(dataset)):
    dataset[i] = dataset[i].encode('utf-8')


# this function finds the most important words in the nth meet
def important_words(n, dataset=dataset):
    data = dataset
    # removing punctuation and \n from the data (Python 2 str.translate)
    for i in range(len(data)):
        data[i] = data[i].translate(None, string.punctuation)
        data[i] = data[i].translate(None, "\n")