def view_examples(self, name, n=500):
    """Render an HTML page showing up to *n* example segments with their cluster labels."""
    conf = self._setstorage.load(encode_name(name))
    model = Clusterer(conf)
    # Pull the raw segment texts the stored settings point at.
    loaded = self._segstorage.load(name=conf[SEGMENT_NAME], limit=int(n))
    docs = [seg.value for seg in loaded]
    assigned = model.assign_labels(docs)
    return ClusterHtml.html(docs, assigned)
def save_labels(self, **kwargs):
    """Persist user-supplied cluster labels for a named clusterer model.

    Expects kwargs['name'] (model name) and kwargs['labels'] (a JSON object
    mapping cluster keys to label strings).  Returns a JSON string:
    {'result': 'OK'} on success, or {'result': 'FAIL', 'error': <traceback>}
    on any failure.
    """
    try:
        pprint(kwargs)
        name = unicode(kwargs['name'])
        # Coerce both keys and values to unicode so what we store is uniform.
        labels = dict((unicode(key), unicode(value))
                      for key, value in json.loads(kwargs['labels']).iteritems())
        settings = self._setstorage.load(encode_name(name))
        clusterer = Clusterer(settings)
        clusterer.update_labels(labels)
        settings = dict(clusterer)
        self._setstorage.save(encode_name(settings[CLUSTERER_NAME]), settings)
        return json.dumps({'result': 'OK'})
    except Exception:
        # Was `except Exception, e` — Python-2-only comma syntax, and the
        # bound name was unused (format_exc() reads the active exception).
        error = traceback.format_exc()
        logger.error(error)
        return json.dumps({'result': 'FAIL', 'error': error})
def update(self, name, n=500, method='FastICA'):
    """Refit the named clusterer on up to *n* segments and return plot data.

    Returns a JSON string {'result': 'OK', 'data': ...} built from the
    transformed coordinates, assigned labels, and the input documents.
    """
    conf = self._setstorage.load(encode_name(name))
    model = Clusterer(conf)
    # Restore the supporting models referenced by the stored settings.
    ngram_dict = Dictionary.load(os.path.join(DICTIONARY_PATH, conf[DICTIONARY]))
    transformer = NgramTransformer(len(ngram_dict[0]))
    lda = LdaModel.load(os.path.join(LDA_PATH, conf[LDA_MODEL]))
    # Fetch the input documents.
    loaded = self._segstorage.load(name=conf[SEGMENT_NAME], limit=int(n))
    docs = [seg.value for seg in loaded]
    fit_args = {
        'dictionary': ngram_dict,
        'ngramtransformer': transformer,
        'ldamodel': lda,
        'method': method,
    }
    coords = model.fit_transform(docs, **fit_args)
    assigned = model.assign_labels(docs)
    payload = self._make_data(coords, assigned, docs)
    return json.dumps({'result': 'OK', 'data': payload})
logger.info('Loading clusterer model') settings = setstorage.load(encode_name(args.clustermodel)) dictionary = Dictionary.load(os.path.join(DICTIONARY_PATH, settings[DICTIONARY])) ngram_size = len(dictionary[0]) transformer = NgramTransformer(ngram_size) ldamodel = LdaModel.load(os.path.join(LDA_PATH, settings[LDA_MODEL])) logger.info('Clusterer model loaded!') kwargs = {'dictionary': dictionary, 'ngramtransformer': transformer, 'ldamodel': ldamodel, 'method': 'LDA'} logger.info('Fitting clusterer') clusterer = Clusterer(settings) texts, labels = clusterer.get_training_data() clusterer.fit(texts, labels, **kwargs) logger.info('Fitting completed!') # TODO: implement get_params and set_params for clusterer tool to allow cross-validation for better score estimation logger.info('Evaluating score on training data') score = clusterer.score(texts, labels, **kwargs) logger.info('Score is {0}'.format(score)) exporter = CsvExporter(segstorage, docstorage, args.clustermodel, sys.stdout) logger.info(u'Classifying segments with name {0}'.format(settings[SEGMENT_NAME])) iter = segstorage.load_iterator(name=settings[SEGMENT_NAME]) segments = load_next_n(iter) while len(texts) > 0:
def clear_labels(self, name):
    """Remove every stored cluster label from the named model and re-save its settings."""
    conf = self._setstorage.load(encode_name(name))
    model = Clusterer(conf)
    model.clear_labels()
    updated = dict(model)
    self._setstorage.save(encode_name(updated[CLUSTERER_NAME]), updated)