Example #1
from os import listdir
from os.path import isfile, join

from nltk.corpus.reader import WordListCorpusReader


def read_emails(path):
    # Collect only the plain files in the directory.
    files = [f for f in listdir(path) if isfile(join(path, f))]

    # Drop macOS Finder metadata if it is present.
    try:
        files.remove('.DS_Store')
    except ValueError:
        pass

    reader = WordListCorpusReader(path, files)

    # `clean` and `split_emails` are project helpers defined elsewhere.
    text = clean(reader.raw())
    emails = split_emails(text, reader.fileids())

    return emails
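
The `clean` and `split_emails` helpers are not shown above. As a rough sketch of what they might do (both bodies below are assumptions for illustration, not the original implementations):

import re

def clean(text):
    # Assumption: normalise line endings and collapse runs of blank lines.
    text = text.replace('\r\n', '\n')
    return re.sub(r'\n{3,}', '\n\n', text)

def split_emails(text, fileids):
    # Assumption: WordListCorpusReader.raw() concatenates the listed files,
    # so pair each file id with one blank-line-separated chunk.
    return list(zip(fileids, text.split('\n\n')))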
Example #2
import re

from bs4 import BeautifulSoup
from nltk.corpus.reader import WordListCorpusReader
from nltk.tokenize import line_tokenize


def main():
    # `path` is assumed to be defined elsewhere in the original script.
    reader = WordListCorpusReader(path, ['banbagsfb.txt'])
    pages = line_tokenize(reader.raw())
    thispage = pages[4]  # line_tokenize yields plain strings, so no .raw() call

    # The easiest way to deal with strings in Python that contain escape
    # characters and quotes is to triple-double-quote the string (""") and
    # prefix it with r. For example:
    #
    #   my_str = r"""This string would "really "suck"" to write if I didn't
    #   know how to tell Python to parse it as "raw" text with the 'r' character
    #   and triple " quotes. Especially since I want \n to show up as a backslash
    #   followed by n. I don't want \0 to be the null byte either!"""
    #
    # The r prefix means "take escape characters as literal". The triple
    # double-quotes (""") keep single quotes, double quotes, and doubled
    # double-quotes from prematurely ending the string.

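    # A quick check of the claim above: raw strings keep the backslash.
    assert len("\n") == 1    # escape sequence: one newline character
    assert len(r"\n") == 2   # raw string: backslash followed by 'n'
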
    # Pull out the first digit, the first dd?dd date-like token, and the URL.
    m = re.search(r"(\d)", thispage)
    thisitem = m.group(0)
    m = re.search(r"(\d\d\D\d\d)", thispage)
    thisdate = m.group(0)
    starturl = thispage.find('http')
    endurl = thispage.find(' ', starturl) - 2  # back off two characters from the following space
    thisurl = thispage[starturl:endurl]

    # Strip the markup, keeping only the text nodes.
    soup = BeautifulSoup(thispage, 'html.parser')
    newpage = ''.join(soup.find_all(string=True))

    # `reps` is assumed to be a dict of string replacements defined elsewhere.
    html = replace_all(newpage, reps)
    html = html[11:]          # skip the leading characters before the date
    postdate = html[0:5]      # five-character date
    posttext = html[5:]       # the rest is the post body
    print("post date = " + postdate)
    print("post text = " + posttext)

def replace_all(txt, reps):
    # Apply every old -> new substitution in the reps mapping.
    for old, new in reps.items():
        txt = txt.replace(old, new)
    return txt

if __name__ == "__main__":
    main()
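
replace_all expects a plain dict of literal substring replacements. A minimal hedged usage sketch (the mapping below is invented, not the reps used by the original script):

sample_reps = {'&amp;': '&', '&quot;': '"'}
print(replace_all('Cats &amp; &quot;dogs&quot;', sample_reps))  # Cats & "dogs"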
Example #3
import nltk
from nltk.corpus.reader import WordListCorpusReader
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

# Load big.txt as one raw string.
x = nltk.data.load('files/big.txt', format='text')

reader = WordListCorpusReader('files/', ['computerscience.txt'])
cs_text = reader.raw()
cs_words = nltk.word_tokenize(cs_text)

print(cs_words)

stemmer = PorterStemmer()
wnl = WordNetLemmatizer()

for word in cs_words:
    print(stemmer.stem(word))
    print(wnl.lemmatize(word))
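
Stemming and lemmatizing often disagree on the same token. A short hedged demo reusing the objects above (the word list is invented for illustration):

for w in ['studies', 'running', 'better']:
    print(w, '->', stemmer.stem(w), '/', wnl.lemmatize(w))
# e.g. 'studies' stems to 'studi' but lemmatizes (as a noun) to 'study'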



Example #4
import re
from os import listdir
from os.path import isfile, join

from nltk.corpus.reader import WordListCorpusReader

tagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/training"
untagged_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/seminar_testdata/test_untagged"
general_data_filepath = "/Users/aledjackson/Documents/University/Second Year/Modules/Natural Language Processing/Assignments/assignment1/Data"

l_names = WordListCorpusReader(general_data_filepath, ["names.family"]).words()

file_names = [
    f for f in listdir(untagged_data_filepath)
    if isfile(join(untagged_data_filepath, f))
]
file_names = file_names[1:]  # skip the first entry, likely a hidden file such as .DS_Store

reader = WordListCorpusReader(untagged_data_filepath, [file_names[0]])

corpus = reader.raw()
words = reader.words()


def get_tags_by_name(corpus, name):
    # Return every <name>...</name> span found in the raw text.
    return re.findall(r"<" + name + r">.+</" + name + r">", corpus)


def tokenise(corpus):
    # Tokens are runs of characters other than whitespace or angle brackets,
    # each terminated by whitespace, a newline, or an angle bracket.
    return re.findall(r"([^\s<>]+)[\s\n<>]", corpus)


def get_name_of_poster(corpus):
    return re.findall()
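
get_tags_by_name and tokenise operate on the raw tagged text. A small hedged demo on an invented snippet (the tag names below are made up, not taken from the assignment data):

sample = "<speaker>Jane Doe</speaker> talks at <stime>3:00 pm</stime> today"
print(get_tags_by_name(sample, "speaker"))  # ['<speaker>Jane Doe</speaker>']
print(tokenise(sample))

Note that the greedy .+ in get_tags_by_name would swallow everything between the first opening and the last closing tag if one line held two tagged spans; .+? is the non-greedy alternative.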