Ejemplo n.º 1
0
 # Load the persisted dictionary and LDA model whose file names come from
 # settings, then bundle them into the keyword arguments passed to the
 # clusterer calls below.
 dictionary_file = os.path.join(DICTIONARY_PATH, settings[DICTIONARY])
 dictionary = Dictionary.load(dictionary_file)
 # The n-gram size is taken from the length of dictionary entry 0
 # (presumably every entry has the same length -- TODO confirm).
 ngram_size = len(dictionary[0])
 transformer = NgramTransformer(ngram_size)
 lda_file = os.path.join(LDA_PATH, settings[LDA_MODEL])
 ldamodel = LdaModel.load(lda_file)
 logger.info('Clusterer model loaded!')

 kwargs = {
     'dictionary': dictionary,
     'ngramtransformer': transformer,
     'ldamodel': ldamodel,
     'method': 'LDA',
 }
 
 
 # Build the clusterer from settings and fit it on the training data it
 # supplies itself; kwargs is forwarded unchanged to fit().
 logger.info('Fitting clusterer')
 clusterer = Clusterer(settings)
 texts, labels = clusterer.get_training_data()
 clusterer.fit(texts, labels, **kwargs)
 logger.info('Fitting completed!')
 
 # TODO: implement get_params and set_params for clusterer tool to allow cross-validation for better score estimation
 # The score is computed on the same texts/labels that were used for
 # fitting, so it is a training-set score rather than a held-out estimate.
 logger.info('Evaluating score on training data')
 score = clusterer.score(texts, labels, **kwargs)
 logger.info('Score is {0}'.format(score))
 
 # The exporter is handed sys.stdout (presumably its CSV output target --
 # TODO confirm against CsvExporter).
 exporter = CsvExporter(segstorage, docstorage, args.clustermodel, sys.stdout)
 
 # Classify the stored segments named by settings[SEGMENT_NAME], one batch
 # at a time: load a batch, extract the segment values, predict labels.
 logger.info(u'Classifying segments with name {0}'.format(settings[SEGMENT_NAME]))
 # NOTE: `iter` shadows the builtin of the same name; the name is kept
 # because later statements may still refer to it.
 iter = segstorage.load_iterator(name=settings[SEGMENT_NAME])
 segments = load_next_n(iter)
 # Test the freshly loaded batch rather than `texts`: before the first pass
 # `texts` still holds the training data, so it says nothing about whether
 # any segments were actually loaded.
 # NOTE(review): relies on the loop body refilling `segments` via
 # load_next_n() before the next check -- confirm.
 while len(segments) > 0:
     texts = [s.value for s in segments]
     labels = clusterer.predict(texts, **kwargs)