Example #1
from nltk.corpus.reader import WordListCorpusReader
import nltk

# print(nltk.data.find('corpora/cookbook'))
# print(nltk.data.find('corpora/cookbook/wordlist.txt'))

d = nltk.data.find('corpora/cookbook')
reader = WordListCorpusReader(d, ['wordlist.txt'])
print(reader.words())
print(reader.fileids())
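Example #1 only works if a cookbook corpus already exists somewhere on nltk.data.path. A minimal Python 3 setup sketch, assuming you are happy to create ~/nltk_data/corpora/cookbook and a small wordlist.txt there (the directory and file names follow the example, they are not required by NLTK):

import os
import nltk

# Create the corpus directory under the default user data path (an assumption;
# any directory already listed in nltk.data.path works just as well).
cookbook_dir = os.path.join(os.path.expanduser('~'), 'nltk_data', 'corpora', 'cookbook')
os.makedirs(cookbook_dir, exist_ok=True)

# Write a tiny wordlist, one word per line, so the reader has something to read.
with open(os.path.join(cookbook_dir, 'wordlist.txt'), 'w') as f:
    f.write('nltk\ncorpus\ncorpora\nwordnet\n')

# nltk.data.find should now resolve the directory used in Example #1.
print(nltk.data.find('corpora/cookbook'))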
Example #2
    def read_emails(self, path):
        # Get all files
        files = [f for f in listdir(path) if isfile(join(path, f))]

        # Drop the macOS metadata file if present
        try:
            files.remove('.DS_Store')
        except ValueError:
            pass

        reader = WordListCorpusReader(path, files)

        cleaner = Cleaner()

        emails = list()

        # Creates the Email Object out of each email file and appends to list
        for file_id in reader.fileids():
            with open(join(path, file_id), 'r') as current_file:
                cleaned_contents = cleaner.clean_file(current_file.read())
                split_email_header, split_email_body, split_email_file_id = self.divide(
                    cleaned_contents, file_id)
                emails.append(
                    Email(split_email_header, split_email_body,
                          split_email_file_id))

        # Return list of Email objects
        return emails
Example #3
    def find_info_type(self):
        type_list = []  #list of all types('abstract','speaker')
        content_list = []  #list with content
        reader = WordListCorpusReader(self.folder_name, [self.file_name])
        all_words = reader.words()

        #is the mail a proper one?
        if (all_words == []):
            return ([], [])

        #append the first tag of the mail ex:<0.1....>
        type_list.append("")
        content_list.append(all_words[0])

        for w in all_words[1:]:
            #search for a pattern like "Abstract: ..."
            match = re.search(r'^(\w+):', w)

            #using group functionality to split the topic from the content
            if match is not None:
                type_list.append(match.group(1))
                content = re.search(r'^(\w+:)(.*)', w)
                content_list.append(content.group(2))

            #otherwise treat the word as a continuation of the previous content entry
            elif (len(content_list) > 0):
                last_element = content_list[-1]
                extra_content = w
                last_element = last_element + "\n" + extra_content
                content_list[-1] = last_element

        #if type_list[0] is 'abstract', then content_list[0] holds the abstract content
        return (type_list, content_list)
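The header-splitting logic above hinges on the two re.search calls; a small standalone sketch of that pattern (the sample strings are invented for illustration):

import re

samples = ['Abstract: A short summary of the talk', 'just a continuation line']

for line in samples:
    match = re.search(r'^(\w+):(.*)', line)
    if match is not None:
        # group(1) is the field name, group(2) is everything after the colon
        print('type:', match.group(1), '| content:', match.group(2).strip())
    else:
        print('continuation:', line)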
Example #4
 def __init__(self, punctuation_marks: str, corpus_dir: str,
              corpus_files: list):
     reader = WordListCorpusReader(corpus_dir, corpus_files)
     self.vi_dict = set(reader.words())
     # Add punctuation marks to the dictionary, treating them as correctly spelled
     self.vi_dict.update(list(punctuation_marks))
     # Add a few special tokens (units and abbreviations)
     self.vi_dict.update(
         ['m', 'g', 'gt', 'kg', 'km', 'mm', 'cm', 'c', 'f', 't'])
     self.re_d = re.compile(r'\d')
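A rough sketch of how the dictionary built in this constructor might be used for spelling lookups; the directory dicts, the file vi_words.txt, and the helper looks_ok are all invented for illustration:

import re
from nltk.corpus.reader import WordListCorpusReader

# Hypothetical corpus location and file name.
reader = WordListCorpusReader('dicts', ['vi_words.txt'])
vocab = set(reader.words())
vocab.update(list('.,!?'))          # treat punctuation as "correct"
re_digit = re.compile(r'\d')

def looks_ok(token):
    # Tokens containing digits are accepted; everything else must be in the vocabulary.
    return bool(re_digit.search(token)) or token in vocab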
Example #5
def read_emails(path):
    files = [f for f in listdir(path) if isfile(join(path, f))]

    try:
        del (files[files.index('.DS_Store')])
    except:
        pass

    reader = WordListCorpusReader(path, files)

    text = clean(reader.raw())
    emails = split_emails(text, reader.fileids())

    return emails
Example #6
def addingCorpus():
    path = os.path.expanduser('~/nltk_data')
    if not os.path.exists(path):
        os.mkdir(path)
    print(os.path.exists(path))
    print(nltk.data.path)
    print(path in nltk.data.path)

    nltk.data.load('corpora/cookbook/cookbook.txt', format='raw')

    reader = WordListCorpusReader(os.path.join(path, 'corpora', 'cookbook'),
                                  ['wordlist.txt'])

    print(reader.words())
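Example #6 relies on ~/nltk_data already being on NLTK's search path. If you keep corpora somewhere else, the search path can be extended directly; a minimal sketch (the directory name is an assumption):

import nltk.data

custom_root = '/data/my_nltk_corpora'   # hypothetical directory
if custom_root not in nltk.data.path:
    nltk.data.path.append(custom_root)

print(nltk.data.path)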
Example #7
def tokenize_file(file, corpus_root, english_stops):            #tokenize input file, count words, characters, remove stopwords
    tokenizer = RegexpTokenizer(r'\w+')
    item_count = 0
    total_chars = 0
    word_count = 0
    wordlist = []

    reader = WordListCorpusReader(corpus_root, file)
    chunks = reader.words()

    for chunk in chunks:
        total_chars += len(chunk)
        word_tokens = tokenizer.tokenize(chunk)
        word_count += len(word_tokens)
        item_count += 1
        wordlist.extend(word_tokens)
    stopsout = [word for word in wordlist if word.lower() not in english_stops]
    return wordlist, stopsout, word_count, total_chars
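A possible call site for tokenize_file, assuming the stopword set comes from NLTK's stopwords corpus and that corpus_root/report.txt exists (the directory and file names are invented):

from nltk.corpus import stopwords

english_stops = set(stopwords.words('english'))

# 'corpus_root' and 'report.txt' are placeholders for a real directory and file.
words, words_no_stops, n_words, n_chars = tokenize_file(
    ['report.txt'], 'corpus_root', english_stops)
print(n_words, 'words,', n_chars, 'characters,', len(words_no_stops), 'after stopword removal')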
Example #8
def main():
    reader = WordListCorpusReader(path, ['banbagsfb.txt'])
    pages = line_tokenize(reader.raw())
    thispage = pages[4]

    """
    The easiest way to deal with strings in Python that contain escape characters and quotes is to triple double-quote the string (""") and prefix it with r. For example:
    my_str = r"""This string would "really "suck"" to write if I didn't
    know how to tell Python to parse it as "raw" text with the 'r' character and
    triple " quotes. Especially since I want \n to show up as a backlash followed
    by n. I don't want \0 to be the null byte either!"""

    The r means "take escape characters as literal". The triple double-quotes (""") prevent single-quotes, double-quotes, and double double-quotes from prematurely ending the string.

    """

    m = re.search("(\d)", thispage)
    thisitem = m.group(0)
    m = re.search("(\d\d\D\d\d)", thispage)
    thisdate = m.group(0)
    starturl = thispage.find('http')
    endurl = thispage.find(' ', starturl)-2
    thisurl = thispage[starturl:endurl] 
    soup = BeautifulSoup(thispage)
    newpage = ''.join(soup.findAll(text=True))
    html = replace_all(newpage, reps)
    html = html[11:len(html)]
    postdate = html[0:5]
    posttext = html[5:len(html)]
    print "post date = " + postdate
    print "post text = " + posttext

def replace_all(txt, reps):
    for i, j in reps.iteritems():
        txt = txt.replace(i, j)
    return txt

if __name__ == "__main__":
    main()
Example #9
    def __init__(self, config_file):

        try:

            self.config = ConfigParser.RawConfigParser()
            self.config.optionxform = str
            self.config.read(config_file)

            tokenizers = self.config.get('post_training_corpus', 'regex_file')
            self.config_tokenizer = json.load(open(tokenizers, "r"))

            self.isWordList = self.config.getboolean('postaggers',
                                                     'isWordList')
            self.wordlist = self.config.items('postaggers.wordlist')
            self.training_portion = self.config.getfloat(
                'post_training_corpus', 'training_portion')
            self.taggers_path = self.config.get('postaggers', 'save_to')
            self.max_ngrams = self.config.getint('postaggers', 'max_ngrams')
            self.tagger_extension_file = self.config.get(
                'postaggers', 'ext_file')
            corpus = []

            for key, corpus_file in self.config.items(
                    'post_training_corpus.corpus'):
                print "Generate model from file:", corpus_file
                corpus.append(corpus_file)

            self.corpusReader = ConllChunkCorpusReader(
                self.config.get('post_training_corpus', 'corpora'), corpus,
                ('NP', 'PP', 'VP', 'AP'))
            self.corpusSents = self.corpusReader.tagged_sents()

            self.wordListReader = WordListCorpusReader(
                self.config.get('post_training_corpus', 'wordlist_path'),
                r'.*\.txt')

            self.regex_list = []

            for key in self.config_tokenizer.keys():

                if self.config_tokenizer[key]['isolate'] == "True":
                    regex = self.config_tokenizer[key]['regex'].encode(
                        'utf-8').decode('utf-8')
                    post = self.config_tokenizer[key]['post']
                    self.regex_list.append((regex, post))

            #logging.info(self.regex_list)

        except Exception, e:

            print "Error :", str(e)
            pdb.set_trace()
Example #10
import nltk
from nltk.corpus.reader import WordListCorpusReader
reader = WordListCorpusReader('', ['computerscience.txt'])
words = [nltk.word_tokenize(i) for i in reader.words()]
from nltk.stem.porter import *
stemmer = PorterStemmer()
from nltk.stem import WordNetLemmatizer
lemmer = WordNetLemmatizer()

stemmed = [[stemmer.stem(y) for y in i] for i in words]
lemmed = [[lemmer.lemmatize(y) for y in i] for i in words]

print(stemmed)
Example #11
import nltk

from nltk.corpus import brown
from nltk.corpus.reader import WordListCorpusReader
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer

x = nltk.data.load('files/big.txt', format='text')

reader = WordListCorpusReader('files/', ['computerscience.txt'])
cs_text = reader.raw()
cs_words = []

cs_words = (nltk.word_tokenize(cs_text))

print(cs_words)

stemmer = PorterStemmer()
wnl = WordNetLemmatizer()

for word in cs_words:
	print(stemmer.stem(word))
	print(wnl.lemmatize(word))



Example #12
import os.path
path = os.path.expanduser('~/nltk_data')
if not os.path.exists(path): os.mkdir(path)
os.path.exists(path)
import nltk.data
#path in nltk.data.path
print path
''' note that this should be a path in the Git_Workspace on D:\ '''

''' load a sample wordlist '''
#import nltk.data
nltk.data.load('corpora/cookbook/GL_Sequent.txt', format='raw')
# expected output: 'nltk\n'

from nltk.corpus.reader import WordListCorpusReader
reader = WordListCorpusReader(path + '/corpora/cookbook/', ['GL_Sequent.txt'])
reader.words()

''' reading a tagged corpus '''
from nltk.corpus.reader import TaggedCorpusReader
reader = TaggedCorpusReader(path + '/corpora/cookbook/', r'.*\.pos')
reader.words()
reader.tagged_words()
reader.sents()
reader.tagged_sents()
reader.paras()
reader.tagged_paras()

''' using a different word tokenizer (SpaceTokenizer) '''
from nltk.tokenize import SpaceTokenizer
reader = TaggedCorpusReader(path + '/corpora/cookbook/', r'.*\.pos', word_tokenizer=SpaceTokenizer())
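The TaggedCorpusReader calls above assume at least one .pos file under corpora/cookbook. A Python 3 sketch that writes a tiny tagged file in the default word/TAG format so those calls return something (the file name sample.pos and its contents are invented):

import os
from nltk.corpus.reader import TaggedCorpusReader

cookbook = os.path.join(os.path.expanduser('~/nltk_data'), 'corpora', 'cookbook')
os.makedirs(cookbook, exist_ok=True)

# Default TaggedCorpusReader conventions: word/TAG tokens separated by spaces,
# one sentence per line, paragraphs separated by blank lines.
with open(os.path.join(cookbook, 'sample.pos'), 'w') as f:
    f.write('The/AT cat/NN sat/VBD ./.\n')

reader = TaggedCorpusReader(cookbook, r'.*\.pos')
print(reader.tagged_words())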
Example #13
from nltk.corpus import brown
from nltk.corpus.reader import WordListCorpusReader
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer
import re
from os import listdir
from os.path import isfile, join

wnl = WordNetLemmatizer()
stemmer = PorterStemmer()

tagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/training"
untagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/seminar_testdata/test_untagged"
general_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/Data"

l_names = WordListCorpusReader(general_data_filepath, ["names.family"]).words()

file_names = [
    f for f in listdir(untagged_data_filepath)
    if isfile(join(untagged_data_filepath, f))
]
file_names = file_names[1:]

reader = WordListCorpusReader(untagged_data_filepath, [file_names[0]])

corpus = reader.raw()
words = reader.words()


def get_tags_by_name(corpus, name):
    return re.findall(r"<" + name + r">.+</" + name + r">", corpus)
Example #14
 def __init__(self):
     '''
     Constructor for the BE06 word list corpora.

     @note: Initially, the constructor tries to load the corpora from a .pkl file. If this
     has not been created yet, a new instance is built by iterating through all files for BE06.
     '''
     
     try:
         #Attempt to open the .pkl file and load it.
         input = open("./Corpus/BE06/BE06.pkl", 'rb')
         reader = load(input)
         input.close()
     except IOError as e:
         filelist = []
         words = []
         
         #Find all .txt files in the /BE06 directory
         for files in os.listdir("./Corpus/BE06"):
             if files.endswith(".txt"):
                 filelist.append(files)
         
         if(len(filelist) == 500):
             #Iterate through whole list of file
             for name in filelist:
                 f = open("./Corpus/BE06/" + name)
             
                 lines = f.readlines()
                 
                 #Read each line in the file, tokenize it into words, and
                 #remove all punctuation
                 for line in lines:
                     tmp1 = nltk.sent_tokenize(line)
                     for lin in tmp1:
                         tmp = nltk.word_tokenize(lin)
                         for word in tmp:
                             for c in string.punctuation:
                                 word = word.replace(c, "")
                             words.append(word)
                         
                 f.close()
             
             #Write wordlist to output file.
             a = open("./Corpus/BE06/finalcorpa.txt", "wb") 
             for word in words:
                 if word not in ".,;!?\"":
                     a.write(word + '\n')   
                     
             a.close()    
             
             #Create the NLTK corpus and save a copy in the folder for later use
             reader = WordListCorpusReader('./Corpus/BE06', ['finalcorpa.txt'])
             output = open("./Corpus/BE06/BE06.pkl", 'wb')
             dump(reader, output, -1)
             output.close()
         else:
             reader = WordListCorpusReader('./Corpus/BE06', ['finalcorpa.txt'])
             output = open("./Corpus/BE06/BE06.pkl", 'wb')
             dump(reader, output, -1)
             output.close()
     
     #Store the built corpus on the instance
     self.corpa = reader
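The constructor's cache-or-rebuild idea can be reduced to a few lines; a minimal Python 3 sketch of the same pattern, where CACHE, CORPUS_DIR and wordlist.txt are invented names:

import os
import pickle
from nltk.corpus.reader import WordListCorpusReader

CACHE = 'cache.pkl'           # hypothetical cache location
CORPUS_DIR = '.'              # hypothetical corpus directory
CORPUS_FILE = 'wordlist.txt'  # hypothetical wordlist file

try:
    # Reuse a previously pickled reader if it exists.
    with open(CACHE, 'rb') as fh:
        reader = pickle.load(fh)
except OSError:
    # Otherwise build the reader from the wordlist and cache it for next time.
    reader = WordListCorpusReader(CORPUS_DIR, [CORPUS_FILE])
    with open(CACHE, 'wb') as fh:
        pickle.dump(reader, fh, -1)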
Example #15
import nltk.data
from nltk.corpus.reader import WordListCorpusReader
from nltk.corpus import names
from nltk.corpus.reader import TaggedCorpusReader
from nltk.tokenize import SpaceTokenizer
from nltk.corpus import treebank

wordlist = WordListCorpusReader("C:/nltk_data/corpora/cookbook", ['wordlist'])
print(wordlist.words())
print(wordlist.fileids())

print(names.fileids())
print(len(names.words('male.txt')))

reader = TaggedCorpusReader("C:/nltk_data/corpora/treebank/tagged",
                            r'.*\.pos',
                            word_tokenizer=SpaceTokenizer(),
                            tagset='en-brown')
print(reader.words('wsj_0001.pos'))
print(reader.tagged_words('wsj_0001.pos'))
print(reader.tagged_sents('wsj_0001.pos'))
print(reader.tagged_paras('wsj_0001.pos'))
print(reader.fileids())

print("\n")
print(reader.tagged_words('wsj_0001.pos', tagset='universal'))

print(treebank.tagged_words())
Example #16
# Create a corpus reader with all the files
reader = PlaintextCorpusReader('.', files)

# Set up a translation table for punctuation to the empty string
table = str.maketrans('', '', string.punctuation)

# Get a list of English stopwords without punctuation
english_stops = set(stopwords.words('english'))
english_stops_nopunct = {
    stopword.translate(table)
    for stopword in english_stops
}

# Load the insect wordlist of stems
insect_words = WordListCorpusReader('.', ['wordlists/insect-wordstems.txt'])

# A list to hold the frequency data
freq_data = []

count = 1
# Read each file in turn
for file in files:
    text = reader.raw(file)

    print(f'{count}: TOKENISING {file}')

    # Tokenise and normalise to lowercase
    tokens = word_tokenize(text.lower())

    # Remove all punctuation marks
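The example is cut off after the tokenisation step. One plausible continuation, using the variables it has already set up (table, english_stops_nopunct, insect_words, freq_data), is sketched below; this is only an illustration of where the code might be heading, not the original:

from nltk import word_tokenize

def count_insect_words(text, table, stops, insect_stems):
    # Tokenise, lowercase, strip punctuation, and drop stopwords.
    tokens = [t.translate(table) for t in word_tokenize(text.lower())]
    tokens = [t for t in tokens if t and t not in stops]
    # Count tokens that begin with any of the insect word stems.
    return sum(1 for t in tokens for stem in insect_stems if t.startswith(stem))

# e.g. freq_data.append((file, count_insect_words(text, table,
#                        english_stops_nopunct, insect_words.words())))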
Example #17
    n_score = ret.prob("negative")

    if max(p_score, n_score) <= cutoff:
        return "neutral"

    if p_score > n_score:
        return "positive"

    elif n_score > p_score:
        return "negative"

    else:
        return "neutral"


reader = WordListCorpusReader('/path/to/sentiment/files',
                              ['positive.txt', 'negative.txt'])

pos_feats = [(dict([(word, True)]), 'positive')
             for word in reader.words('positive.txt')]
neg_feats = [(dict([(word, True)]), 'negative')
             for word in reader.words('negative.txt')]
train_feats = pos_feats + neg_feats
classifier = NaiveBayesClassifier.train(train_feats)

t = Twitter(auth=OAuth("TOKEN", "TOKEN_KEY", "CON_SECRET", "CON_SECRET_KEY"))

connection = pymongo.Connection()
db = connection.twitter
mentions = db.mentions

screen_names = ["YOUR_ACCOUNT", "YOUR_OTHER_ACCOUNT"]
Example #18
		#print(fileName)
		counter = 0
		docWords = corpus.words(fileName)
		for word in docWords:
			#print(word)
			w = word.lower()
			if w in wordSet:
				# I could also use the fd.inc approach here and it's probably better; just showing another option.
				print(w + " is in " + fileName)
				counter += 1
		billCounts.append(counter)
	return billCounts

from nltk.corpus.reader import WordListCorpusReader
path = "/Volumes/Optibay-1TB/Dropbox/Content_Wilker/Gonzalez_Project/Gonzalez_Keywords"
reader = WordListCorpusReader(path, ['crime.txt']) #make an nltk word list

crime = reader.words()
crime = [word.lower().strip() for word in crime]	

crimeSet = set([w.lower() for w in crime])
crimeCount = make_count(billsCorpora, crimeSet)

fd = count_stems(billsCorpora)

counter = 0
#let's look at 200 of the most popular items and their counts

#you could use the csv writer methods or this which is kind of hacky
mywordlist = numpy.asarray([billsCorpora.fileids(), crimeCount])
mywordlist[0][1] #name
Example #19
x = nltk.data.load('big.txt', format='auto')
'''
reader = WordListCorpusReader('',['wordlist.txt','wordlist2.txt'])
print(reader.words())
print(reader.fileids())

stemmer = PorterStemmer()
print(stemmer.stem('running'))

wnl = WordNetLemmatizer()
print(wnl.lemmatize('dogs'))

'''

csReader = WordListCorpusReader('', ['computerscience.txt'])
wnl = WordNetLemmatizer()
stemmer = PorterStemmer()

### Concats list of words from reader
csWords = nltk.word_tokenize(' '.join(csReader.words()))
print(type(csWords))
for word in csWords:
    print("%s,%s" % (wnl.lemmatize(word), stemmer.stem(word)))
    
inputList = ['16/12/2016']
    
for inputString in inputList:
    print(re.findall(r'(.*?)[\s\-/\\](.*?)[\s\-/\\](.*)', inputString))
    
Example #20
#!/usr/bin/env python
# encoding: utf-8
"""
wordNet.py
Created by Aaron Erlich on 2013-02-13.
"""

import sys
import os
import nltk
from nltk.corpus.reader import WordListCorpusReader

path = ''  # insert your path here
#path = "/Volumes/Optibay-1TB/Dropbox/Content_Wilker/Gonzalez_Project/Gonzalez_Keywords"
reader = WordListCorpusReader(path, ['crime.txt']) #make an nltk word list

crime = reader.words()
crime = [word.lower().strip() for word in crime]

from nltk.corpus import wordnet

#lemmas are the distinct meanings of a word and each meaning's possible morphologies
#we see that lots of the student's words have both noun and verb meanings. Which does he care about?
#these words are polysemous -- they have similar but different meanings
for word in crime:
	print word
	print wordnet.synsets(word)
	print "\n"
	raw_input("Hit Enter")
	
[synset.lemma_names for synset in wordnet.synsets("stealing")]
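For a single word the synset/lemma pattern used above looks like this in current NLTK (note that lemma_names is a method in NLTK 3, whereas the 2013-era code above treats it as an attribute):

from nltk.corpus import wordnet

for synset in wordnet.synsets('stealing'):
    print(synset.name(), synset.definition())
    print('  lemmas:', synset.lemma_names())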
Example #21
onlyfilessbsa1 = [
    f for f in listdir(corpora + '/golden_test_subset_a')
    if isfile(join(corpora + '/golden_test_subset_a', f))
]
onlyfilessbsa2 = [
    f for f in listdir(corpora + '/golden_tagged_subset_a')
    if isfile(join(corpora + '/golden_tagged_subset_a', f))
]
testc = nltk.corpus.reader.plaintext.PlaintextCorpusReader(
    corpora + '/golden_test_subset_a', onlyfilessbsa1)
tagdc = nltk.corpus.reader.plaintext.PlaintextCorpusReader(
    corpora + '/golden_tagged_subset_a', onlyfilessbsa2)

# getting named entity corpora

names = WordListCorpusReader(
    nepath,
    ['male.txt', 'female.txt', 'family.txt'])  # list of names, from canvas
titles = WordListCorpusReader(nepath, ['titles.txt'])  # list of common titles
orgsuffs = WordListCorpusReader(
    nepath, ['orgsuff.txt'])  # list of organisation suffixes
daymonths = WordListCorpusReader(nepath,
                                 ['daymonths.txt'])  # list of days and months

# extracting named entities from tagged data
# regex patterns to match each tag
pattern1 = r'<ENAMEX TYPE="PERSON">(.*?)</ENAMEX>'
pattern2 = r'<ENAMEX TYPE="LOCATION">(.*?)</ENAMEX>'
pattern3 = r'<ENAMEX TYPE="ORGANIZATION">(.*?)</ENAMEX>'

# finding every example in the data, storing in sets
people = set(re.findall(pattern1, trainingcorpus.raw()))
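The ENAMEX patterns can be exercised on a small invented string to show what ends up in the sets:

import re

sample = ('<ENAMEX TYPE="PERSON">Ada Lovelace</ENAMEX> worked at '
          '<ENAMEX TYPE="ORGANIZATION">Analytical Engines Ltd</ENAMEX>.')

people = set(re.findall(r'<ENAMEX TYPE="PERSON">(.*?)</ENAMEX>', sample))
orgs = set(re.findall(r'<ENAMEX TYPE="ORGANIZATION">(.*?)</ENAMEX>', sample))
print(people, orgs)
# {'Ada Lovelace'} {'Analytical Engines Ltd'}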
Example #22
import os
import re
import sys
import json
import nltk.test
import abbreviations
import portuguese_tagger_processor
from sentilex import sentiLexPairRdd
from nltk.corpus.reader import WordListCorpusReader

__output_path = "result.json"

stopwords = nltk.corpus.stopwords.words('portuguese')
reader = WordListCorpusReader('.', ['symbols.txt'])
symbols = reader.words()
reader = WordListCorpusReader('.', ['positive_emoticons.txt'])
positive_emoticons = reader.words()
reader = WordListCorpusReader('.', ['negative_emoticons.txt'])
negative_emoticons = reader.words()

tweet_tokenizer = portuguese_tagger_processor.get_tweet_tokenizer()
tagger = portuguese_tagger_processor.get_tagger()
json_result = []
tweet_dict = {}


def count_positive_emoticons(tokens):
    counter = 0
    for emoticon in positive_emoticons:
        if emoticon in tokens:
            counter += 1
    return counter
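The same counting idea can be written as a generic helper over any wordlist-based vocabulary; a sketch, where positive_emoticons.txt is the file the example already reads and count_matches is an invented name:

from nltk.corpus.reader import WordListCorpusReader

# Hypothetical file: one emoticon per line.
pos_reader = WordListCorpusReader('.', ['positive_emoticons.txt'])

def count_matches(tokens, vocabulary):
    # How many entries of the vocabulary occur in the token list.
    return sum(1 for item in vocabulary if item in tokens)

print(count_matches([':)', 'bom', 'dia'], pos_reader.words()))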
Example #23
def word_normalizer(word):
    return filter(lambda word: word not in 'the,"inisarefromwithonfor1234567890asbywasretrieved.andof:;''()).{}[]-to\'&#^%160/20102011201220132009ateditwikipedia].isbn\x80\x93),.&#\xe0\xe1has\xd0\xd1\xb0worldthisthat|&amp201420072006200520042003200220012000+=-', word)

# Feature extractor
# Frequency count
def geo_features (word):
    return {'any_word':word}

# Initialize constants
NLTK_HOME = '/home/administrator/nltk_data'

l_list = []
# cleaning, tokenizing, normalizing

# Read the Corpus
state_reader = WordListCorpusReader(NLTK_HOME, ['state_files.txt'])
city_reader = WordListCorpusReader(NLTK_HOME, ['city_files.txt'])
train_file = '/app/ai/train_file.txt'
test_results_file = '/app/ai/test_city_results_file.txt'


# Store the URLs in  a list
urls = ([(url,'city') for url in city_reader.words()]+
        [(url,'state') for url in state_reader.words()]
        )

for url in list(urls):
    # Remove HTMLtabs after reading the URL
    raw = nltk.clean_html(urlopen(url[0]).read())
    print 'Finished cleaning html for ', url[0]
    # Compute the frequency distribution of the words
Example #24
def open_places_wordlist():
    path = '/Users/tim/mycode/time/wordlists/'
    wordlist = 'ga_gazetteer_wordlist.txt'
    reader = WordListCorpusReader(path, [wordlist])
    return reader
Example #25
    WP = 'WP'

    # 35. Possessive wh-pronoun
    WP_ = 'WP$'

    # 36. Wh-adverb
    WRB = 'WRB'

    @staticmethod
    def nounish(word, pos):
        # nltk apparently defaults to 'NN' for smileys :) so special-case those
        return pos in (POS.NN, POS.NNS, POS.NNP, POS.NNPS) and \
            any(c.isalpha() for c in word)


mass_noun_corpora = WordListCorpusReader('wordlist/massnoun', r'[a-z]+')
mass_nouns = mass_noun_corpora.words()

QUANTITY_POS_TAGS = frozenset((
    POS.JJ,
    POS.VBN,
    POS.VBP,
    POS.NN,
    POS.NNP,
    POS.RB,
    POS.RBR,
    POS.RBS,
))

bad_words_corpora = WordListCorpusReader('wordlist/shutterstock-bad-words', r'[a-z]{2,3}')
bad_words_en = bad_words_corpora.words('en')
Example #26
                     user='******',
                     passwd='Webrowse@123',
                     db='article')
cur = db.cursor()
'''

dataset = load_files(
    '/home/soumen/projects/scikit-learn/doc/tutorial/text_analytics/data/languages/paragraphs'
)  # Read an article

file_id_argv = open(sys.argv[1])
file_id = file_id_argv.read()
file_list = file_id.split('\n')
file_list.pop(-1)

italian_stopwords = WordListCorpusReader('.', ['stop-words-it-en.txt'])


def language_detection(text):
    """Description here"""
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(dataset.data)
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    X_test_counts = count_vect.transform(text)
    X_test_tfidf = tfidf_transformer.transform(X_test_counts)

    clf = MultinomialNB().fit(X_train_tfidf, dataset.target)

    predicted = clf.predict(X_test_tfidf)
Example #27
# from nltk.stem import WordNetLemmatizer
# wnl = WordNetLemmatizer()
# print(wnl.lemmatize('monsters'))
'''
In each of the above cases we have handled one word. 
Now print the stemmed and lemmatized versions of all the words in the document computerscience.txt
Preview the document. Here is an overview of what you need to do: 
    1. Load the file into a reader [ Hint: reader = WordListCorpusReader( ... ) ]
    2. use word_tokenize from nltk.tokenize to convert the text into words
    3. Loop through the text [Hint: Use the for statement]
    4. Lemmatize and Stem each word.
    5. Look at the difference between the two, notice how the lemmatizer makes mistakes in some cases - can you identify why and propose a solution?
'''

from nltk.corpus.reader import WordListCorpusReader
tokens = []
reader = WordListCorpusReader('./', ['computerscience.txt'])
for count, ele in enumerate(reader.words()):
    print(count, "\b:", ele, "\n")
    tokens += nltk.word_tokenize(ele)

print(tokens)
from nltk.stem.porter import *
stemmer = PorterStemmer()
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
for token in tokens:
    print(token)
    print(wnl.lemmatize(token))
    print(stemmer.stem(token))
Example #28
########## WORDLIST CORPUS READER ###############

#Basic Corpus Reader
from nltk.corpus.reader import WordListCorpusReader
#List of a few thousand names organized by gender
from nltk.corpus import names
#List of english words
from nltk.corpus import words

nltkDir="C:\\Users\\Matrix\\AppData\\Roaming\\nltk_data\\corpora\\cookbook\\"
#nltkFile="mywords.txt"
#source=nltkDir+nltkFile

### One File WordListCorpusReader
reader=WordListCorpusReader(nltkDir,['wordlist.txt'])
print reader.words()
print reader.fileids()

### MultiFile WordListCorpusReader
#To get the names of the files in the corpus use the "fileids" command
names.fileids()
print len(names.words('female.txt'))
print len(names.words('male.txt'))

words.fileids()
print len(words.words('en-basic'))
print len(words.words('en'))

###Chunked Corpus Reader
Example #29
File: parse.py Project: MMJ744/NLP
import nltk
from nltk import load_parser

from nltk.corpus.reader import WordListCorpusReader

reader = WordListCorpusReader('', ['words.txt'])
words = [nltk.word_tokenize(i) for i in reader.words()]
cp = load_parser('grammar.fcfg', trace=1)

# from nltk.corpus import treebank
# from nltk.tag import DefaultTagger
# train_set = treebank.tagged_sents()[:4000]
# test_set = treebank.tagged_sents()[2000:]
# from nltk.tag import UnigramTagger
# unigramTagger = UnigramTagger(train_set)
# from nltk.tag import BigramTagger, TrigramTagger
# bigramTagger = BigramTagger(train_set, cutoff=2)
# trigramTagger = TrigramTagger(train_set, cutoff=3)
# def backoff_tagger(train_sents, tagger_classes, backoff=None):
#    for cls in tagger_classes :
#        backoff = cls(train_sents, backoff=backoff)
#    return backoff
# tagger = backoff_tagger(train_set, [UnigramTagger, BigramTagger, TrigramTagger], backoff=DefaultTagger('NN'))
# for sentence in words:
#    print(tagger.tag(sentence))


for sentence in words:
    print(sentence)
    for tree in cp.parse(sentence):
        print(tree)