Ejemplo n.º 1
0
    
    # Extra keyword arguments forwarded unchanged to every clusterer call
    # (fit / score / predict): the dictionary, n-gram transformer and LDA model
    # built earlier, plus the method name.
    kwargs = {'dictionary': dictionary,
                  'ngramtransformer': transformer,
                  'ldamodel': ldamodel,
                  'method': 'LDA'}
    
    
    # Fit the clusterer on the labelled training data it provides itself.
    logger.info('Fitting clusterer')
    clusterer = Clusterer(settings)
    texts, labels = clusterer.get_training_data()
    clusterer.fit(texts, labels, **kwargs)
    logger.info('Fitting completed!')
    
    # TODO: implement get_params and set_params for clusterer tool to allow cross-validation for better score estimation
    # NOTE: the score below is computed on the same texts/labels used for
    # fitting, so it is a training-set score rather than a held-out estimate.
    logger.info('Evaluating score on training data')
    score = clusterer.score(texts, labels, **kwargs)
    logger.info('Score is {0}'.format(score))
    
    # Classify all stored segments batch by batch and export the predictions
    # as CSV to stdout. Every exported row carries the single training-set
    # score computed above.
    exporter = CsvExporter(segstorage, docstorage, args.clustermodel, sys.stdout)
    try:
        logger.info(u'Classifying segments with name {0}'.format(settings[SEGMENT_NAME]))
        # Named `segment_iter` to avoid shadowing the builtin `iter`.
        segment_iter = segstorage.load_iterator(name=settings[SEGMENT_NAME])
        segments = load_next_n(segment_iter)
        # Loop on the batch that was just loaded. The previous condition
        # tested `texts` (initially the training texts, afterwards the
        # previous batch), which caused one extra iteration that called
        # predict/export with an empty batch once the iterator was exhausted.
        while len(segments) > 0:
            texts = [s.value for s in segments]
            labels = clusterer.predict(texts, **kwargs)
            exporter.export(segments, labels, [score] * len(segments))
            segments = load_next_n(segment_iter)
    finally:
        # Always release the exporter, even if prediction or export fails.
        exporter.close()
    logger.info('Completed successfully!\n')