def train_lda(docs: List, outputFolder: str) -> None:
    """Train a 300-topic LDA model and export its per-word topic weights as text.

    The vocabulary is built from ``docs`` and pruned with
    ``filter_extremes(no_below=20, no_above=0.1, keep_n=1000000)`` before the
    bag-of-words corpus is created and handed to ``LdaMulticore``.

    The result is written to ``<outputFolder>/lda.model`` as UTF-8 text:
    a header line ``"<n_words> <n_topics>"`` followed by one line per
    vocabulary entry holding the word and its space-separated topic weights.

    Args:
        docs: Iterable of tokenised documents (each an iterable of tokens).
            It is materialised into a list because it is traversed twice.
        outputFolder: Directory that receives ``lda.model``. It is created
            if it does not exist; an empty string means the current
            working directory.

    Raises:
        ValueError: If no token survives the vocabulary pruning.
    """
    import os  # local import: keeps this function independent of file-level imports

    docs = list(docs)
    id2word = Dictionary(docs)
    id2word.filter_extremes(no_below=20, no_above=0.1, keep_n=1000000)
    if len(id2word) == 0:
        # Fail early with a clear message instead of passing an empty
        # vocabulary on to the trainer.
        raise ValueError(
            "vocabulary is empty after filter_extremes; cannot train LDA")
    corpus = [id2word.doc2bow(doc) for doc in docs]

    # Prepare the destination *before* the expensive training step so that a
    # missing folder cannot discard a finished model.
    path = os.path.join(outputFolder, "lda.model")
    if outputFolder:
        os.makedirs(outputFolder, exist_ok=True)

    print("Starting training...")
    lda = LdaMulticore(corpus, num_topics=300, id2word=id2word)

    # get_topics() is indexed (topic, word); transpose so each row is a word.
    matrix = np.transpose(lda.get_topics())
    n_words, n_topics = matrix.shape
    with open(path, "wt", encoding='utf-8') as f:
        f.write("{} {}\n".format(n_words, n_topics))
        f.writelines(
            id2word[idx] + " " + " ".join(str(x) for x in row) + "\n"
            for idx, row in enumerate(matrix)
        )
    print("Model saved to ", path)
def main():
    """Fit a 12-topic LDA model on the SBS-96 count table and save the topics.

    Reads the tab-separated count table (first row as header, first column
    as index), converts it to bag-of-words form with ``counts_to_bow``,
    trains ``LdaMulticore`` on 80% of the samples and writes the topic-term
    matrix to ``output/gensim_betas.tsv`` with the table's column labels as
    header.
    """
    import os  # local import: keeps this function independent of file-level imports

    counts_path = 'data/counts.ICGC-BRCA-EU_BRCA_22.SBS-96.tsv'
    output_path = 'output/gensim_betas.tsv'

    print('* Loading data from ', counts_path)
    data_df = pd.read_csv(counts_path, header=0, index_col=0, sep='\t')
    samples_bow = counts_to_bow(data_df.values)
    # The held-out 20% is not used any further in this function.
    samples_train, _samples_test = train_test_split(samples_bow, test_size=0.2)

    # Pin the vocabulary to the table's columns. Without an explicit id2word
    # the model sizes its vocabulary from the training split alone, so a
    # column absent from that split would leave `topics` narrower than
    # `data_df.columns` and break the DataFrame construction below.
    # NOTE(review): assumes counts_to_bow uses the column position as token
    # id -- the column labelling of `topics` already relies on this; confirm.
    id2word = dict(enumerate(data_df.columns.values))
    lda = LdaMulticore(samples_train, num_topics=12, id2word=id2word)
    topics = lda.get_topics()
    print(topics.shape)

    df = pd.DataFrame(data=topics, columns=data_df.columns.values)
    # Make sure the destination directory exists before writing.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, header=True, index=False, sep='\t')