from nltk.corpus import CategorizedPlaintextCorpusReader

corpus_root = "./files/"
cat_root = "../categories/"

# Hacky way to specify the path for cat.txt, since cat_file is resolved
# relative to corpus_root. A better way would be to rewrite the fileid
# regex r'.*\.txt'...
corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_file=cat_root + 'cat.txt',
                                          cat_delimiter='+')

# get all categories
cats = corpus.categories()
print(cats)

# access the raw corpus text
raw = corpus.raw()

# access words, for the whole corpus and per category
words = corpus.words()
words_pop = corpus.words(categories="POP")
words_rock = corpus.words(categories="ROCK")

# access sentences, for the whole corpus and per category
sents = corpus.sents()
sents_pop = corpus.sents(categories="POP")
sents_rock = corpus.sents(categories="ROCK")

# materialize the lazy corpus views as plain lists
word_list = list(words)
sents_list = list(sents)
pop_word_list = list(words_pop)
pop_sents_list = list(sents_pop)
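# A minimal, self-contained sketch of the layout this reader expects. The
# file names, lyrics, and demo_root path below are illustrative assumptions,
# not the real corpus: cat.txt maps each fileid to its categories, joined by
# the '+' delimiter, and its path is resolved relative to the corpus root.
import os

demo_root = "./demo_files/"
os.makedirs(demo_root, exist_ok=True)
with open(os.path.join(demo_root, "song_001.txt"), "w") as f:
    f.write("hello hello from a pop song\n")
with open(os.path.join(demo_root, "song_002.txt"), "w") as f:
    f.write("loud guitars in a rock song\n")
with open(os.path.join(demo_root, "cat.txt"), "w") as f:
    f.write("song_001.txt+POP\nsong_002.txt+ROCK\n")

# the fileid regex deliberately excludes cat.txt itself
demo = CategorizedPlaintextCorpusReader(demo_root, r'song.*\.txt',
                                        cat_file='cat.txt', cat_delimiter='+')
print(demo.categories())             # ['POP', 'ROCK']
print(demo.words(categories="POP"))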
doc_end = {}
#doc_end[0] = re.compile('END OF MEETING')
doc_end[0] = "END OF MEETING"
doc_end[1] = "END OF MEETING"
doc_end[2] = "END OF MEETING"
doc_end[3] = "END OF MEETING"
doc_end[4] = "to a malfunction of the recording equipment"
doc_end[5] = "END OF SESSION"
#doc_end[1] = re.compile('(?i)The Committee voted to authorize')
#doc_end[2] = re.compile('(?i)The vote encompassed approval of')

# crop every transcript to the span between its start and end markers,
# then save it under <year>/<fname>
for f in data_fileids:
    year, fname = f.split('/')
    cropped_text = crop_text(data_m.raw(f), doc_start, doc_end)
    saveFile(fname, year, cropped_text)

corpus_root_cropped = '/Users/LENOVO USER/Desktop/FedTranscript1/cropped/'
data_c = CategorizedPlaintextCorpusReader(corpus_root_cropped, r'.*\.txt',
                                          cat_pattern=r'(\w+)/*',
                                          encoding='latin1')
#corpus_Stats(data_c)

#%%
import nltk
nltk.download('averaged_perceptron_tagger')

sent_example = data_c.paras('2007/20070131.txt')[473]
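# crop_text and saveFile are called above but not defined in this excerpt.
# A minimal sketch under the assumption that doc_start and doc_end are dicts
# of candidate plain-string markers, tried in order, and that the kept text
# runs from the first matching start marker to the first matching end marker:
import os

def crop_text(text, doc_start, doc_end):
    start = 0
    for marker in doc_start.values():
        pos = text.find(marker)
        if pos != -1:
            start = pos
            break
    end = len(text)
    for marker in doc_end.values():
        pos = text.find(marker, start)
        if pos != -1:
            end = pos
            break
    return text[start:end]

def saveFile(fname, year, text):
    # mirror the <year>/<fname> layout under the cropped/ corpus root
    out_dir = os.path.join('/Users/LENOVO USER/Desktop/FedTranscript1/cropped/', year)
    os.makedirs(out_dir, exist_ok=True)
    with open(os.path.join(out_dir, fname), 'w', encoding='latin1') as fp:
        fp.write(text)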
import ProcessText
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from sklearn.feature_extraction.text import TfidfVectorizer

d1 = "judge people by what they say"
d1_processed = ProcessText.ProcessText.process(d1)
documents = [d1]

# read each idiom document into the collection, prefixed with its fileid
reader = CategorizedPlaintextCorpusReader(
    r'\Users\JoeDi\Desktop\MSC\Idioms Corpera',
    r'.*\.txt', cat_pattern=r'(\w+)/*')
for w in reader.fileids():
    wd = reader.raw(w)
    documents.append(w + " " + wd)

print("Documents in the collection are: ")
print(documents)
print("\n")

# build a TF-IDF matrix over the collection (one row per document)
tfidf = TfidfVectorizer().fit_transform(documents)
print("Tf-idf weightings are: ")
print(tfidf)
print("\n")
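# A sketch of how the matrix can be used, assuming the goal is to find the
# idiom document closest to the query sentence d1 (row 0 of the matrix):
# cosine similarity between row 0 and every row of the TF-IDF matrix.
from sklearn.metrics.pairwise import cosine_similarity

similarities = cosine_similarity(tfidf[0:1], tfidf).flatten()
best = similarities[1:].argmax() + 1   # skip row 0, d1 compared with itself
print("Most similar document:", documents[best])
print("Cosine similarity:", similarities[best])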
import math
import os

from nltk.corpus.reader import CategorizedPlaintextCorpusReader


def loadCorpus(category=None):
    corpus_root = "../corpus/lyric_corpus/files/"
    cat_root = "../categories/"
    if not os.name == 'posix':
        corpus_root = "..\\corpus\\lyric_corpus\\files\\"

    # load the corpus
    # corpus = PlaintextCorpusReader(corpus_root, r'.*\.txt')
    corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                              cat_file=cat_root + 'cat.txt',
                                              cat_delimiter='+')
    # print files in corpus
    # for file in corpus.fileids():
    #     print(file)

    # access corpus
    raw = corpus.raw()
    words = corpus.words()
    # print(category)
    if category is None:
        sents = corpus.sents()
    else:
        sents = corpus.sents(categories=category)
    # sents_pop = corpus.sents(categories="POP")
    # sents_rock = corpus.sents(categories="ROCK")

    # shuffle the sentences and split them 80/20 into train and test
    shuffledSents = shuffleSent(sents)
    numberSents = len(shuffledSents)
    trainSize = math.floor(numberSents * 0.8)
    testSize = len(shuffledSents) - trainSize
    # testSize = math.floor(numberSents * 0.1)
    # devSize = len(shuffledSents) - trainSize - testSize

    trainCorpus = []
    testCorpus = []
    # devCorpus = []
    wholeCorpus = []
    testSents = []
    for i in range(numberSents):
        if i < trainSize:
            for word in shuffledSents[i]:
                trainCorpus.append(word)
                wholeCorpus.append(word)
        # elif i < (trainSize + testSize):
        #     for word in shuffledSents[i]:
        #         testCorpus.append(word)
        #         wholeCorpus.append(word)
        else:
            testSents.append(shuffledSents[i])
            for word in shuffledSents[i]:
                testCorpus.append(word)
                wholeCorpus.append(word)

    # testCorpus = []
    # trainCorpus = list(words)
    # for i in range(testSize):
    #     seed = random.randrange(0, numberSents - i)
    #     testCorpus.append(trainCorpus.pop(seed))

    return wholeCorpus, trainCorpus, testCorpus, testSents
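# shuffleSent is not defined in this excerpt. A minimal sketch, assuming it
# simply returns the corpus sentences in random order (name and behavior
# inferred from the call site):
import random

def shuffleSent(sents):
    shuffled = list(sents)  # copy the lazy corpus view into a plain list
    random.shuffle(shuffled)
    return shuffled

# usage, e.g. building the splits for one category:
# wholeCorpus, trainCorpus, testCorpus, testSents = loadCorpus("POP")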
import re

import nltk
from nltk.corpus.reader import CategorizedPlaintextCorpusReader

corpus_root = 'C:\\MyData\\PythonPractice\\IMDB\\test'  # path of the IMDB test data
reader = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt',
                                          cat_pattern=r'(\w+)/*')
r_neg = reader.fileids(categories=['neg'])
r_pos = reader.fileids(categories=['pos'])

global_shortlisted = []
TEST_GS_POS = []


def decontracted(phrase):
    """Expand common English contractions."""
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)
    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    return phrase


for i in range(0, 12500):
    doc = reader.raw(r_pos[i:i + 1])  # doc contains one positive movie review
    sentences = nltk.sent_tokenize(doc)
    senlen = len(sentences)
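# quick check of the expansion rules above (the sentence is illustrative):
print(decontracted("I can't believe it won't work; they're sure it'll pass"))
# -> I can not believe it will not work; they are sure it will pass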
# GET RAW TEXT of a COMMENT given its fileid:
#   corpus.raw([fileid])
#   my_corpus.raw(my_corpus.fileids()[2])  # raw text of file index 2 of the whole corpus

# GET list of TOKENIZED SENTS for a COMMENT via index or fileid:
#   sents = corpus.sents(corpus.fileids()[index])
#   sents = corpus.sents([fileid])

# GET TOKENIZED PARAGRAPHS of a COMMENT:
#   para = corpus.paras([fileid])

# ITERATE OVER FILEIDS
for fileid in corpus.fileids()[22:23]:
    print(fileid)
    print(type(fileid))
    print(len(corpus.raw(fileid)))
    print(corpus.raw(fileid))
    # sents = get_raw_sentences(fileid)
    sents = get_raw_paragraph(fileid)
    # print("SENT: " + "\nSENT: ".join(sents))
    words = corpus.words(fileid)
    print(words)
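# get_raw_sentences / get_raw_paragraph are called above but not defined in
# this excerpt. A minimal sketch, assuming they rebuild plain strings from the
# tokenized corpus views (names and joining behavior are assumptions):
def get_raw_sentences(fileid):
    # one string per tokenized sentence
    return [" ".join(sent) for sent in corpus.sents(fileid)]

def get_raw_paragraph(fileid):
    # one string per paragraph; a paragraph is a list of tokenized sentences
    return [" ".join(" ".join(sent) for sent in para)
            for para in corpus.paras(fileid)]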