Example #1
 def view_examples(self, name, n=500):
     settings = self._setstorage.load(encode_name(name))
     clusterer = Clusterer(settings)
     
     # get the input
     segments = self._segstorage.load(name=settings[SEGMENT_NAME], limit=int(n))
     documents = [s.value for s in segments]
     
     labels = clusterer.assign_labels(documents)
     return ClusterHtml.html(documents, labels)
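
A minimal, hypothetical way to call this method is sketched below; tool stands for an instance of the class that defines view_examples, the clusterer name and output file are placeholders, and the return value is assumed to be an HTML string (as ClusterHtml.html suggests).

 import io

 # Hypothetical usage; 'tool' and the clusterer name are placeholders.
 html = tool.view_examples(u'my-clusterer', n=200)
 with io.open('cluster_examples.html', 'w', encoding='utf-8') as f:
     f.write(html)  # assumes ClusterHtml.html returns a unicode HTML string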
Example #2
 def save_labels(self, **kwargs):
     try:
         pprint(kwargs)
         name = unicode(kwargs['name'])
         # 'labels' arrives as a JSON-encoded mapping; coerce keys and values to unicode
         labels = dict((unicode(key), unicode(value))
                       for key, value in json.loads(kwargs['labels']).iteritems())
         
         settings = self._setstorage.load(encode_name(name))
         clusterer = Clusterer(settings)
         clusterer.update_labels(labels)
         
         # persist the updated settings (including the new labels) under the clusterer's own name
         settings = dict(clusterer)
         self._setstorage.save(encode_name(settings[CLUSTERER_NAME]), settings)
         return json.dumps({'result': 'OK'})
     except Exception:
         error = traceback.format_exc()
         logger.error(error)
         return json.dumps({'result': 'FAIL', 'error': error})
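
A hypothetical round trip for this handler is shown below; tool, the clusterer name, and the label values are placeholders, and the only requirement visible in the code is that labels be a JSON-encoded mapping.

 import json

 # Hypothetical call; 'tool' is an instance of the class that defines save_labels,
 # and the cluster-id -> label format of the mapping is an assumption.
 label_map = {u'0': u'billing questions', u'1': u'delivery complaints'}
 payload = json.dumps(label_map)
 response = json.loads(tool.save_labels(name=u'my-clusterer', labels=payload))
 if response['result'] != 'OK':
     print(response['error'])  # the handler returns the full traceback on failure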
Example #3
 def update(self, name, n=500, method='FastICA'):
     settings = self._setstorage.load(encode_name(name))
     clusterer = Clusterer(settings)
     
     # load the models
     dictionary = Dictionary.load(os.path.join(DICTIONARY_PATH, settings[DICTIONARY]))
     ngram_size = len(dictionary[0])  # n-gram size inferred from the length of the first dictionary token
     transformer = NgramTransformer(ngram_size)
     ldamodel = LdaModel.load(os.path.join(LDA_PATH, settings[LDA_MODEL]))
     
     # get the input
     segments = self._segstorage.load(name=settings[SEGMENT_NAME], limit=int(n))
     documents = [s.value for s in segments]
     
     # prepare args
     kwargs = {'dictionary': dictionary,
               'ngramtransformer': transformer,
               'ldamodel': ldamodel,
               'method': method}
     Xt = clusterer.fit_transform(documents, **kwargs)
     labels = clusterer.assign_labels(documents)
     data = self._make_data(Xt, labels, documents)
     return json.dumps({'result': 'OK',
                        'data': data})
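
Since update returns a JSON string, a caller would normally decode it before use. A minimal hypothetical consumer follows; tool and the clusterer name are placeholders, and the structure of data is whatever _make_data builds from the transformed points, labels, and documents.

 import json

 # Hypothetical consumer; 'tool' and the clusterer name are placeholders.
 response = json.loads(tool.update(u'my-clusterer', n=1000, method='FastICA'))
 if response['result'] == 'OK':
     data = response['data']  # structure comes from _make_data(Xt, labels, documents)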
Example #4
 logger.info('Loading clusterer model')
 settings = setstorage.load(encode_name(args.clustermodel))
 dictionary = Dictionary.load(os.path.join(DICTIONARY_PATH, settings[DICTIONARY]))
 ngram_size = len(dictionary[0])
 transformer = NgramTransformer(ngram_size)
 ldamodel = LdaModel.load(os.path.join(LDA_PATH, settings[LDA_MODEL]))
 logger.info('Clusterer model loaded!')
 
 kwargs = {'dictionary': dictionary,
           'ngramtransformer': transformer,
           'ldamodel': ldamodel,
           'method': 'LDA'}
 
 
 logger.info('Fitting clusterer')
 clusterer = Clusterer(settings)
 texts, labels = clusterer.get_training_data()
 clusterer.fit(texts, labels, **kwargs)
 logger.info('Fitting completed!')
 
 # TODO: implement get_params and set_params for clusterer tool to allow cross-validation for better score estimation
 logger.info('Evaluating score on training data')
 score = clusterer.score(texts, labels, **kwargs)
 logger.info('Score is {0}'.format(score))
 
 exporter = CsvExporter(segstorage, docstorage, args.clustermodel, sys.stdout)
 
 logger.info(u'Classifying segments with name {0}'.format(settings[SEGMENT_NAME]))
 iter = segstorage.load_iterator(name=settings[SEGMENT_NAME])
 segments = load_next_n(iter)
 while len(texts) > 0:
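
The listing is cut off inside the classification loop. Purely as an illustration, and not the original loop body, a batch loop assembled from calls that appear elsewhere in these examples might look like the sketch below; the export call is a placeholder, since CsvExporter's real interface is not shown.

 # Hypothetical sketch only; the original loop body is not shown in this listing.
 while len(segments) > 0:
     texts = [s.value for s in segments]      # same extraction used in the other examples
     labels = clusterer.assign_labels(texts)  # label the batch with the fitted clusterer
     exporter.export(segments, labels)        # placeholder; CsvExporter's interface is not shown
     segments = load_next_n(iter)             # next batch; an empty result ends the loop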
Example #5
 def clear_labels(self, name):
     settings = self._setstorage.load(encode_name(name))
     clusterer = Clusterer(settings)
     clusterer.clear_labels()
     # persist the cleared settings back under the clusterer's name
     settings = dict(clusterer)
     self._setstorage.save(encode_name(settings[CLUSTERER_NAME]), settings)
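
Clearing and then re-saving labels combines this method with save_labels from Example #2; a short hypothetical sequence follows, where tool, the clusterer name, and the label values are placeholders.

 import json

 # Hypothetical: drop the stored label mapping, then persist a fresh one.
 tool.clear_labels(u'my-clusterer')
 tool.save_labels(name=u'my-clusterer', labels=json.dumps({u'0': u'spam', u'1': u'ham'}))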