Example #1
from os import listdir
from os.path import isfile, join

from nltk.corpus.reader import WordListCorpusReader


def read_emails(path):
    # Collect only the plain files in the directory.
    files = [f for f in listdir(path) if isfile(join(path, f))]

    # Drop macOS Finder metadata if it is present.
    try:
        files.remove('.DS_Store')
    except ValueError:
        pass

    reader = WordListCorpusReader(path, files)

    # `clean` and `split_emails` are project helpers defined elsewhere.
    text = clean(reader.raw())
    emails = split_emails(text, reader.fileids())

    return emails
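
The `clean` and `split_emails` helpers are not shown above. As a rough sketch of what they might do (both bodies below are assumptions for illustration, not the original implementations):

import re

def clean(text):
    # Assumption: normalise line endings and collapse runs of blank lines.
    text = text.replace('\r\n', '\n')
    return re.sub(r'\n{3,}', '\n\n', text)

def split_emails(text, fileids):
    # Assumption: WordListCorpusReader.raw() concatenates the listed files,
    # so pair each file id with one blank-line-separated chunk.
    return list(zip(fileids, text.split('\n\n')))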
Example #2
import re

from bs4 import BeautifulSoup
from nltk.corpus.reader import WordListCorpusReader
from nltk.tokenize import line_tokenize


def main():
    # `path` is assumed to be defined elsewhere in the original script.
    reader = WordListCorpusReader(path, ['banbagsfb.txt'])
    pages = line_tokenize(reader.raw())
    thispage = pages[4]  # line_tokenize yields plain strings, so no .raw() call

    # The easiest way to deal with strings in Python that contain escape
    # characters and quotes is to triple-double-quote the string (""") and
    # prefix it with r. For example:
    #
    #   my_str = r"""This string would "really "suck"" to write if I didn't
    #   know how to tell Python to parse it as "raw" text with the 'r' character
    #   and triple " quotes. Especially since I want \n to show up as a backslash
    #   followed by n. I don't want \0 to be the null byte either!"""
    #
    # The r prefix means "take escape characters as literal". The triple
    # double-quotes (""") keep single quotes, double quotes, and doubled
    # double-quotes from prematurely ending the string.

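    # A quick check of the claim above: raw strings keep the backslash.
    assert len("\n") == 1    # escape sequence: one newline character
    assert len(r"\n") == 2   # raw string: backslash followed by 'n'
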
    # Pull out the first digit, the first dd?dd date-like token, and the URL.
    m = re.search(r"(\d)", thispage)
    thisitem = m.group(0)
    m = re.search(r"(\d\d\D\d\d)", thispage)
    thisdate = m.group(0)
    starturl = thispage.find('http')
    endurl = thispage.find(' ', starturl) - 2  # back off two characters from the following space
    thisurl = thispage[starturl:endurl]

    # Strip the markup, keeping only the text nodes.
    soup = BeautifulSoup(thispage, 'html.parser')
    newpage = ''.join(soup.find_all(string=True))

    # `reps` is assumed to be a dict of string replacements defined elsewhere.
    html = replace_all(newpage, reps)
    html = html[11:]          # skip the leading characters before the date
    postdate = html[0:5]      # five-character date
    posttext = html[5:]       # the rest is the post body
    print("post date = " + postdate)
    print("post text = " + posttext)

def replace_all(txt, reps):
    # Apply every old -> new substitution in the reps mapping.
    for old, new in reps.items():
        txt = txt.replace(old, new)
    return txt

if __name__ == "__main__":
    main()
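
replace_all expects a plain dict of literal substring replacements. A minimal hedged usage sketch (the mapping below is invented, not the reps used by the original script):

sample_reps = {'&amp;': '&', '&quot;': '"'}
print(replace_all('Cats &amp; &quot;dogs&quot;', sample_reps))  # Cats & "dogs"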
Example #3
import nltk
from nltk.corpus.reader import WordListCorpusReader
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Load big.txt as one raw string.
x = nltk.data.load('files/big.txt', format='text')

reader = WordListCorpusReader('files/', ['computerscience.txt'])
cs_text = reader.raw()
cs_words = nltk.word_tokenize(cs_text)

print(cs_words)

stemmer = PorterStemmer()
wnl = WordNetLemmatizer()

for word in cs_words:
    print(stemmer.stem(word))
    print(wnl.lemmatize(word))
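
Stemming and lemmatizing often disagree on the same token. A short hedged demo reusing the objects above (the word list is invented for illustration):

for w in ['studies', 'running', 'better']:
    print(w, '->', stemmer.stem(w), '/', wnl.lemmatize(w))
# e.g. 'studies' stems to 'studi' but lemmatizes (as a noun) to 'study'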



Example #4
import re
from os import listdir
from os.path import isfile, join

from nltk.corpus.reader import WordListCorpusReader

tagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/training"
untagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/seminar_testdata/test_untagged"
general_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/Data"

l_names = WordListCorpusReader(general_data_filepath, ["names.family"]).words()

file_names = [
    f for f in listdir(untagged_data_filepath)
    if isfile(join(untagged_data_filepath, f))
]
file_names = file_names[1:]  # skip the first entry, likely a hidden file such as .DS_Store

reader = WordListCorpusReader(untagged_data_filepath, [file_names[0]])

corpus = reader.raw()
words = reader.words()


def get_tags_by_name(corpus, name):
    # Return every <name>...</name> span found in the raw text.
    return re.findall(r"<" + name + r">.+</" + name + r">", corpus)


def tokenise(corpus):
    # Tokens are runs of characters other than whitespace or angle brackets,
    # each terminated by whitespace, a newline, or an angle bracket.
    return re.findall(r"([^\s<>]+)[\s\n<>]", corpus)


def get_name_of_poster(corpus):
    return re.findall()
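
get_tags_by_name and tokenise operate on the raw tagged text. A small hedged demo on an invented snippet (the tag names below are made up, not taken from the assignment data):

sample = "<speaker>Jane Doe</speaker> talks at <stime>3:00 pm</stime> today"
print(get_tags_by_name(sample, "speaker"))  # ['<speaker>Jane Doe</speaker>']
print(tokenise(sample))

Note that the greedy .+ in get_tags_by_name would swallow everything between the first opening and the last closing tag if one line held two tagged spans; .+? is the non-greedy alternative.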