def loadCorpus(category=None):
    """Load the lyric corpus and split its sentences 80/20 into train and test.

    The sentences of the corpus (all of them, or only those belonging to
    ``category``) are handed to ``shuffleSent`` (defined elsewhere; presumably
    it returns them in shuffled order -- confirm against its definition).
    The first 80% of the result, rounded down, form the training portion and
    the remaining sentences form the test portion.

    Args:
        category: Value forwarded to ``corpus.sents(categories=...)``.
            ``None`` (the default) selects every sentence in the corpus.

    Returns:
        A 4-tuple ``(wholeCorpus, trainCorpus, testCorpus, testSents)``:

        * ``wholeCorpus`` -- flat list of words: all training words followed
          by all test words.
        * ``trainCorpus`` -- flat list of the words of the training sentences.
        * ``testCorpus`` -- flat list of the words of the test sentences.
        * ``testSents`` -- list of the test sentences themselves (not
          flattened).
    """
    corpus_root = "../corpus/lyric_corpus/files/"
    cat_root = "../categories/"

    # Non-POSIX platforms get a backslash-separated corpus root.
    if os.name != 'posix':
        corpus_root = "..\\corpus\\lyric_corpus\\files\\"

    # Raw string: '\.' in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    corpus = CategorizedPlaintextCorpusReader(
        corpus_root,
        r'.*\.txt',
        cat_file=cat_root + 'cat.txt',
        cat_delimiter='+',
    )

    if category is None:
        sents = corpus.sents()
    else:
        sents = corpus.sents(categories=category)

    shuffledSents = shuffleSent(sents)

    # int() keeps the split point usable as a slice index even where
    # math.floor returns a float.
    trainSize = int(math.floor(len(shuffledSents) * 0.8))
    trainSents = shuffledSents[:trainSize]
    testSents = list(shuffledSents[trainSize:])

    trainCorpus = [word for sent in trainSents for word in sent]
    testCorpus = [word for sent in testSents for word in sent]
    # Training sentences always precede test sentences, so the whole corpus
    # is simply the concatenation of the two flat word lists.
    wholeCorpus = trainCorpus + testCorpus

    return wholeCorpus, trainCorpus, testCorpus, testSents
# Module-level reader over every '.txt' file under corpus_root, with
# categories read from 'cat.txt' ('+'-delimited).
# NOTE(review): corpus_root and cat_root are not assigned at module level in
# the visible code (they appear only as locals of loadCorpus) -- confirm they
# are defined earlier in the module, otherwise this line raises NameError.
# The pattern is a raw string so that '\.' is not treated as an (invalid)
# string escape sequence; the regex itself is unchanged.
corpus = CategorizedPlaintextCorpusReader(corpus_root, r'.*\.txt', cat_file=cat_root+'cat.txt', cat_delimiter='+')

# get all categories
cats = corpus.categories()
print(cats)

# access corpus (raw text of the whole corpus)
raw = corpus.raw()

# access words, normal and for a category
words = corpus.words()
words_pop = corpus.words(categories="POP")
words_rock = corpus.words(categories="ROCK")

# access sents, normal and for a category
sents = corpus.sents()
sents_pop = corpus.sents(categories="POP")
sents_rock = corpus.sents(categories="ROCK")

# make lists: materialise the reader's word/sentence sequences as plain lists
word_list = list(words)
sents_list = list(sents)

pop_word_list = list(words_pop)
pop_sents_list = list(sents_pop)

rock_word_list = list(words_rock)
rock_sents_list = list(sents_rock)


'''
word_list_brown = brown.words()
sents_list_brown = brown.sents()
vocabulary_brown = set(word_list_brown)
brown_len_words = len(word_list_brown)
brown_len_sents = len(sents_list_brown)
brown_len_vocab = len(vocabulary_brown)
brown_richness = lexical_diversity(word_list_brown)

# Lyric corpus
cats = corpus.categories()
print(len(cats))
print(cats)

num_files = len(corpus.fileids())
word_list = list(corpus.words())
sents_list = list(corpus.sents())
vocabulary = set(word_list)
total_len_words = len(word_list)
total_len_sents = len(sents_list)
total_len_vocab = len(vocabulary)
total_richness = lexical_diversity(word_list)

# POP
word_list_pop = list(corpus.words(categories="POP"))
sents_list_pop = list(corpus.sents(categories="POP"))
vocabulary_pop = set(word_list_pop)
pop_len_words = len(word_list_pop)
pop_len_sents = len(sents_list_pop)
pop_len_vocab = len(vocabulary_pop)
pop_richness = lexical_diversity(word_list_pop)