def train_lda(
    docs: List,
    outputFolder: str,
    *,
    num_topics: int = 300,
    no_below: int = 20,
    no_above: float = 0.1,
    keep_n: int = 1000000,
) -> None:
    """Train an LDA model on tokenised documents and dump its weights as text.

    The weights are written to ``lda.model`` inside ``outputFolder``. The
    first line holds ``<number of terms> <number of topics>``; every following
    line holds one dictionary token followed by its space-separated weight in
    each topic.

    Args:
        docs: Iterable of documents, each in the form accepted by
            ``Dictionary`` / ``doc2bow`` (presumably lists of string tokens;
            confirm against callers).
        outputFolder: Folder that receives ``lda.model``. It is created if it
            does not exist yet.
        num_topics: Number of topics passed to ``LdaMulticore``.
        no_below: Forwarded to ``Dictionary.filter_extremes``.
        no_above: Forwarded to ``Dictionary.filter_extremes``.
        keep_n: Forwarded to ``Dictionary.filter_extremes``.

    Raises:
        OSError: If the output folder cannot be created or the file cannot be
            written.
    """
    # Local import so this function does not depend on a module-level
    # ``import os`` being present in the file.
    import os

    # Prepare the destination before the (potentially long) training run so
    # that an unusable output location fails immediately rather than after
    # the model has been trained.
    if outputFolder:
        os.makedirs(outputFolder, exist_ok=True)
    path = os.path.join(outputFolder, "lda.model")

    # ``docs`` is iterated twice below (dictionary build, then bag-of-words
    # conversion), so a one-shot iterable has to be materialised first.
    docs = list(docs)
    id2word = Dictionary(docs)
    id2word.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    corpus = [id2word.doc2bow(doc) for doc in docs]

    print("Starting training...")
    lda = LdaMulticore(corpus, num_topics=num_topics, id2word=id2word)

    # Transposed so that row ``i`` holds the weights of dictionary id ``i``.
    term_topic = np.transpose(lda.get_topics())
    n_terms, n_topics = term_topic.shape

    with open(path, "wt", encoding="utf-8") as f:
        f.write("{} {}\n".format(n_terms, n_topics))
        for term_id, weights in enumerate(term_topic):
            weights_text = " ".join(str(w) for w in weights)
            f.write(id2word[term_id] + " " + weights_text + "\n")

    print("Model saved to ", path)
def main():
    """Fit a 12-topic LDA model on the counts table and save its topic matrix.

    Reads the tab-separated counts file (first row as header, first column as
    index), converts the values with ``counts_to_bow``, splits the result with
    ``test_size=0.2`` and trains ``LdaMulticore`` on the training part only.
    The topic matrix is printed by shape and written, one row per topic with
    the input column names as header, to ``output/gensim_betas.tsv``.
    """
    counts_path = 'data/counts.ICGC-BRCA-EU_BRCA_22.SBS-96.tsv'
    print('* Loading data from ', counts_path)
    counts = pd.read_csv(counts_path, header=0, index_col=0, sep='\t')

    bow_samples = counts_to_bow(counts.values)
    # The held-out part of the split is not used anywhere in this function.
    train_samples, _ = train_test_split(bow_samples, test_size=0.2)

    model = LdaMulticore(train_samples, num_topics=12)
    betas = model.get_topics()
    print(betas.shape)

    # NOTE(review): assumes the topic matrix has exactly one column per input
    # column; no id2word mapping is passed to the model, so confirm this holds.
    pd.DataFrame(data=betas, columns=counts.columns.values).to_csv(
        'output/gensim_betas.tsv', header=True, index=False, sep='\t'
    )