from nltk.corpus import CategorizedPlaintextCorpusReader

corpus_root = "./files/"
cat_root = "../categories/"

# Hacky way to point cat.txt at a path outside the corpus root; a cleaner fix
# would be to rewrite the '.*\.txt' fileid regex so cat.txt could live inside
# the corpus root without being picked up as a document.
corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_file=cat_root + 'cat.txt',
                                          cat_delimiter='+')

# get all categories
cats = corpus.categories()
print(cats)
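
# Where the categories come from: with cat_file, each line of cat.txt maps a
# file id to its categories, joined by cat_delimiter ('+' here). A minimal
# sketch with made-up file names:
#
#   madonna_like_a_prayer.txt+POP
#   acdc_back_in_black.txt+ROCK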

# access corpus
raw = corpus.raw()

# access words, normal and for a category
words = corpus.words()
words_pop = corpus.words(categories="POP")
words_rock = corpus.words(categories="ROCK")

# access sents, normal and for a category
sents = corpus.sents()
sents_pop = corpus.sents(categories="POP")
sents_rock = corpus.sents(categories="ROCK")

# make lists
word_list = list(words)
sents_list = list(sents)

pop_word_list = list(words_pop)
pop_sents_list = list(sents_pop)
Example #2

# end-of-document markers for cropping transcripts; the spacing variants
# reflect inconsistent whitespace in the source files
doc_end = {}
#doc_end[0] = re.compile('END OF  MEETING')
doc_end[0] = "END OF  MEETING"
doc_end[1] = "END OF MEETING"
doc_end[2] = "END  OF  MEETING"
doc_end[3] = "END  OF MEETING"
doc_end[4] = "to  a malfunction  of the  recording equipment"
doc_end[5] = "END  OF  SESSION"

#doc_end[1] = re.compile('(?i)The Committee voted to authorize')
#doc_end[2] = re.compile('(?i)The vote encompassed approval of')
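
# data_fileids, data_m, doc_start, saveFile and crop_text come from earlier
# parts of the original script and are not shown here. A minimal sketch of
# what crop_text plausibly does, assuming doc_start is a single marker string
# (an assumption, not the original implementation):
def crop_text(raw, doc_start, doc_end):
    # keep the text between the start marker and the earliest end marker
    start = raw.find(doc_start)
    start = start if start >= 0 else 0
    ends = [raw.find(marker, start) for marker in doc_end.values()]
    ends = [e for e in ends if e >= 0]
    return raw[start:min(ends)] if ends else raw[start:]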

for f in data_fileids:
    year, fname = f.split('/')
    cropped_text = crop_text(data_m.raw(f), doc_start, doc_end)
    saveFile(fname, year, cropped_text)

corpus_root_cropped = '/Users/LENOVO USER/Desktop/FedTranscript1/cropped/'
data_c = CategorizedPlaintextCorpusReader(corpus_root_cropped,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin1')
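
# corpus_Stats is not shown in this snippet; a minimal stand-in (hypothetical)
# might just report per-category counts:
def corpus_Stats(corpus):
    for cat in corpus.categories():
        files = corpus.fileids(categories=cat)
        words = corpus.words(categories=cat)
        print(cat, "-", len(files), "files,", len(words), "words")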

#corpus_Stats(data_c)

#%%
import nltk
nltk.download('averaged_perceptron_tagger')
# note: paras() returns paragraphs (each a list of tokenized sentences),
# so sent_example actually holds one paragraph of the 2007-01-31 transcript
sent_example = data_c.paras('2007/20070131.txt')[473]
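
# The tagger download above suggests POS tagging comes next; a minimal
# continuation (an assumption, not in the source): tag the paragraph one
# tokenized sentence at a time.
tagged = [nltk.pos_tag(sent) for sent in sent_example]
print(tagged[0][:10])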
Example #3

import ProcessText
from nltk.corpus import CategorizedPlaintextCorpusReader

d1 = "judge people by what they say"

d1_processed = ProcessText.ProcessText.process(d1)  # processed copy (unused below)

documents = [d1]

#Read documents
reader = CategorizedPlaintextCorpusReader(
    r'\Users\JoeDi\Desktop\MSC\Idioms Corpera',
    r'.*\.txt',
    cat_pattern=r'(\w+)/*')

for w in reader.fileids():
    wd = reader.raw(w)
    documents.append(w + " " + wd)

print("Documents in the collection are: ")
print(documents)
print("\n")

from sklearn.feature_extraction.text import TfidfVectorizer

# build one tf-idf matrix over the whole collection (one row per document)
tfidf = TfidfVectorizer().fit_transform(documents)

print("Tf-idf weightings are:  ")
print(tfidf)
print("\n")
Example #4

import math
import os

from nltk.corpus import CategorizedPlaintextCorpusReader

# shuffleSent (used below) is defined elsewhere in the original script
def loadCorpus(category=None):

    corpus_root = "../corpus/lyric_corpus/files/"
    cat_root = "../categories/"

    if os.name != 'posix':
        corpus_root = "..\\corpus\\lyric_corpus\\files\\"

    # load the corpus
    # corpus = PlaintextCorpusReader(corpus_root, '.*\.txt')
    corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                              cat_file=cat_root + 'cat.txt',
                                              cat_delimiter='+')
    # print files in corpus
    # for file in corpus.fileids():
    # print(file)
    # access corpus

    raw = corpus.raw()
    words = corpus.words()
    # print (category)
    if category is None:
        sents = corpus.sents()
    else:
        sents = corpus.sents(categories=category)
    # sents_pop = corpus.sents(categories="POP")
    # sents_rock = corpus.sents(categories="ROCK")

    # shuffle, then split 80/20 into train and test
    shuffledSents = shuffleSent(sents)

    numberSents = len(shuffledSents)
    trainSize = math.floor(numberSents * 0.8)
    testSize = numberSents - trainSize
    # testSize = math.floor(numberSents*0.1)
    # devSize = len(shuffledSents)-trainSize - testSize

    trainCorpus = []
    testCorpus = []
    # devCorpus = []
    wholeCorpus = []
    testSents = []

    for i in range(numberSents):
        if i < trainSize:
            for word in shuffledSents[i]:
                trainCorpus.append(word)
                wholeCorpus.append(word)
        # elif(i < (trainSize + testSize)):
        #     for word in shuffledSents[i]:
        #         testCorpus.append(word)
        #         wholeCorpus.append(word)
        else:
            testSents.append(shuffledSents[i])
            for word in shuffledSents[i]:
                testCorpus.append(word)
                wholeCorpus.append(word)

    # testCorpus = []
    # trainCorpus = list(words)
    # for i in range(testSize):
    #     seed = random.randrange(0,numberSents - i)
    #     testCorpus.append(trainCorpus.pop(seed))

    return wholeCorpus, trainCorpus, testCorpus, testSents
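
# Example call (hypothetical; requires the shuffleSent helper and the lyric
# corpus layout from the original project):
# whole, train, test, test_sents = loadCorpus("POP")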
Example #5

import re
import nltk
from nltk.corpus import CategorizedPlaintextCorpusReader

corpus_root = 'C:\\MyData\\PythonPractice\\IMDB\\test'  #Path of IMDB Test Data
reader = CategorizedPlaintextCorpusReader(corpus_root,
                                          r'.*\.txt',
                                          cat_pattern=r'(\w+)/*')

r_neg = reader.fileids(categories=['neg'])
r_pos = reader.fileids(categories=['pos'])

global_shortlisted = []
TEST_GS_POS = []

# expand English contractions so that negations survive later filtering
def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    return phrase

for i in range(len(r_pos)):  # 12,500 positive reviews in the IMDB test set

    doc = reader.raw(r_pos[i])  # doc contains one movie review
    sentences = nltk.sent_tokenize(doc)
    senlen = len(sentences)
Example #6

# GET RAW TEXT of a COMMENT, given its fileid:
# corpus.raw([fileid])   # e.g. corpus.raw(corpus.fileids()[2]) prints the raw text of file index 2

# GET list of TOKENIZED SENTS for a COMMENT via index or fileid:
# sents = corpus.sents(corpus.fileids()[index])
# sents = corpus.sents([fileid])
"""
GET TOKENIZED PARAGRAPHS
para = corpus.paras([fileid])
comment
"""
"""
GET TOKENIZED COMMENT
para = corpus.paras([fileid])
comment
"""

# ITERATE OVER FILEIDS
for fileid in corpus.fileids()[22:23]:
    print(fileid)
    print(type(fileid))
    print(len(corpus.raw(fileid)))
    print(corpus.raw(fileid))

    #sents = get_raw_sentences(fileid)
    sents = get_raw_paragraph(fileid)  # helper defined elsewhere in the original script
    # print("SENT:  " + "\nSENT:  ".join(sents))
    words = corpus.words(fileid)
    print(words)