def loadCorpus(category=None):
    """Load the lyric corpus and split its sentences 80/20 into train and test.

    The sentences are passed through ``shuffleSent`` (defined elsewhere in
    this file; presumably it randomises their order -- confirm there). The
    first ``floor(0.8 * n)`` sentences of the result form the training
    portion and the remainder forms the test portion. No separate dev split
    is produced.

    Args:
        category: Value forwarded to ``corpus.sents(categories=...)``.
            When ``None`` (the default) the sentences of the whole corpus
            are used.

    Returns:
        A 4-tuple ``(wholeCorpus, trainCorpus, testCorpus, testSents)``:

        * ``wholeCorpus`` -- flat list of every word, training words first,
          then test words.
        * ``trainCorpus`` -- flat list of the words of the training
          sentences.
        * ``testCorpus`` -- flat list of the words of the test sentences.
        * ``testSents`` -- list of the test sentences themselves.
    """
    corpus_root = "../corpus/lyric_corpus/files/"
    cat_root = "../categories/"
    if os.name != 'posix':
        corpus_root = "..\\corpus\\lyric_corpus\\files\\"

    # Raw string: '\.' is not a valid escape in a normal literal and triggers
    # a SyntaxWarning on recent Python versions; the pattern value is the same.
    corpus = CategorizedPlaintextCorpusReader(
        corpus_root,
        r'.*\.txt',
        cat_file=cat_root + 'cat.txt',
        cat_delimiter='+',
    )

    # Only the sentence view is needed; the former unused corpus.raw() and
    # corpus.words() calls were dropped (raw() read every file into memory).
    if category is None:
        sents = corpus.sents()
    else:
        sents = corpus.sents(categories=category)

    shuffledSents = shuffleSent(sents)
    trainSize = math.floor(len(shuffledSents) * 0.8)

    trainCorpus = []
    testCorpus = []
    testSents = []
    for index, sentence in enumerate(shuffledSents):
        if index < trainSize:
            trainCorpus.extend(sentence)
        else:
            testSents.append(sentence)
            testCorpus.extend(sentence)

    # Training sentences always precede test sentences in the shuffled order,
    # so the whole corpus is simply the two word lists back to back.
    wholeCorpus = trainCorpus + testCorpus

    return wholeCorpus, trainCorpus, testCorpus, testSents
corpus = CategorizedPlaintextCorpusReader(corpus_root, '.*\.txt', cat_file=cat_root+'cat.txt', cat_delimiter='+') # get all categories cats = corpus.categories() print(cats) # access corpus raw = corpus.raw() # access words, normal and for a category words = corpus.words() words_pop = corpus.words(categories="POP") words_rock = corpus.words(categories="ROCK") # access sents, normal and for a category sents = corpus.sents() sents_pop = corpus.sents(categories="POP") sents_rock = corpus.sents(categories="ROCK") # make lists word_list = list(words) sents_list = list(sents) pop_word_list = list(words_pop) pop_sents_list = list(sents_pop) rock_word_list = list(words_rock) rock_sents_list = list(sents_rock) '''
# --- Brown corpus: word / sentence / vocabulary counts -------------------
word_list_brown, sents_list_brown = brown.words(), brown.sents()
vocabulary_brown = set(word_list_brown)
brown_len_words, brown_len_sents, brown_len_vocab = (
    len(word_list_brown),
    len(sents_list_brown),
    len(vocabulary_brown),
)
# lexical_diversity is defined elsewhere; its result is stored as "richness".
brown_richness = lexical_diversity(word_list_brown)

# --- Lyric corpus: all categories -----------------------------------------
cats = corpus.categories()
print(len(cats))
print(cats)
num_files = len(corpus.fileids())

word_list, sents_list = list(corpus.words()), list(corpus.sents())
vocabulary = set(word_list)
total_len_words, total_len_sents, total_len_vocab = (
    len(word_list),
    len(sents_list),
    len(vocabulary),
)
total_richness = lexical_diversity(word_list)

# --- Lyric corpus: POP category only --------------------------------------
word_list_pop, sents_list_pop = (
    list(corpus.words(categories="POP")),
    list(corpus.sents(categories="POP")),
)
vocabulary_pop = set(word_list_pop)
pop_len_words, pop_len_sents, pop_len_vocab = (
    len(word_list_pop),
    len(sents_list_pop),
    len(vocabulary_pop),
)
pop_richness = lexical_diversity(word_list_pop)