import logging
import sys

from params.core import Core as Params
from dataset.core import Core as Dataset
from corpus.meta import Meta

logging.basicConfig(level=logging.INFO)
logging.info("# This script generates meta information about the corpus")

logging.info("# 1. Loading script params")
logging.info("# ================================")
scriptParams = Params()
params = scriptParams.get()
scriptParams.save(params.data_directory)

logging.info("# 2. Preprocessing data")
logging.info("# ================================")
dataset = Dataset(params.dataset_name, params.data_directory)
datasetToProcess = dataset.get(float(params.dataset_percentage), int(params.total_items))
if not datasetToProcess:
    logging.error('No dataset found')
    sys.exit()

data = datasetToProcess.getTrainingSet()

# Rebuild the corpus meta information from scratch
metaManager = Meta(datasetToProcess)
metaManager.remove()
metaManager.process()

print('Finished')
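The Meta class is part of the project's own packages and its internals are not shown here. As a rough illustration only, the sketch below assumes a list of tokenized documents and computes the kind of corpus-level metadata such a step typically records (document count, token count, vocabulary size, most frequent tokens); the function name corpus_meta and the toy data are hypothetical.

# Hypothetical sketch: assumes Meta.process() records basic corpus statistics
# over tokenized documents; not the project's actual implementation.
from collections import Counter

def corpus_meta(tokenized_docs):
    # Count every token across all documents
    token_counts = Counter(token for doc in tokenized_docs for token in doc)
    return {
        "documents": len(tokenized_docs),
        "tokens": sum(token_counts.values()),
        "vocabulary_size": len(token_counts),
        "top_tokens": token_counts.most_common(10),
    }

# Example usage with a toy corpus
docs = [["topic", "modeling", "corpus"], ["corpus", "statistics"]]
print(corpus_meta(docs))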
import os
import sys

# Make the shared TextMining packages importable from the Colab notebook
packagesPath = "/content/drive/My Drive/Colab Notebooks/packages/TextMining"
sys.path.append(packagesPath)

from dataset.core import Core as Dataset
from topic.lda import LDA
from topic.evaluate import Evaluate
from params.core import Core as Params

scriptParams = Params()
params = scriptParams.get()
scriptParams.save(params.data_directory)

dataset = Dataset(params.dataset_name, params.data_directory)
dataProcessor = dataset.get()

if params.type == 'lda':
    print(":::::::::::::: Evaluating ::::::::::::::")
    evaluationProcessor = Evaluate(dataProcessor, params)
    evaluationProcessor.process()
else:
    print(":::::::::::::: Train ::::::::::::::")
    # Train an LDA model from scratch with the configured hyperparameters
    lda = LDA(dataProcessor, os.path.join(params.data_directory, params.dataset_name))
    lda.remove()
    lda.setPerplexity(5)
    lda.setNumberOfTopics(6)
    lda.setNumberOfTotalTopFrequencyWord(10000)
    lda.setNumberOfIterations(1000)
    lda.train()
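The topic.lda.LDA wrapper's internals are not shown either. As a non-authoritative sketch, the snippet below assumes it wraps a gensim-style LDA and mirrors the hyperparameters configured above (6 topics, 1000 iterations) on a toy tokenized corpus; the toy documents and the passes/random_state values are assumptions for illustration.

# Hypothetical sketch: gensim LDA training with the same topic/iteration
# settings as the wrapper above; not the project's actual LDA class.
from gensim.corpora import Dictionary
from gensim.models import LdaModel

tokenized_docs = [
    ["text", "mining", "topic", "model"],
    ["lda", "topic", "inference"],
    ["corpus", "text", "preprocessing"],
]

# Build the bag-of-words representation expected by gensim
dictionary = Dictionary(tokenized_docs)
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_docs]

lda_model = LdaModel(
    corpus=bow_corpus,
    id2word=dictionary,
    num_topics=6,
    iterations=1000,
    passes=10,
    random_state=42,
)

# Inspect the learned topics and a perplexity-style score on the training corpus
print(lda_model.print_topics(num_topics=6, num_words=5))
print(lda_model.log_perplexity(bow_corpus))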