def handle(self, *args, **options):
    """Classify hit groups that do not have a classification yet.

    Reads the following entries of ``options``:

    * ``clear_all`` -- when truthy, delete every row of
      ``main_hitgroupclass`` and return without classifying anything.
    * ``begin`` / ``end`` -- when both are truthy, only hit groups having a
      ``hits_mv.start_time`` in ``[begin, end)`` are considered.
    * ``classifier_path`` -- path of the JSON file with the classifier
      probabilities; ``settings.CLASSIFIER_PATH`` is used when it is falsy.

    Unclassified hit groups are fetched ``self.BATCH_SIZE`` at a time and
    stored with ``bulk_create``; the loop ends when ``EmptyBatchException``
    is raised while a batch is being classified or stored.

    Raises:
        ImproperlyConfigured: if no classifier path is given in the options
            and ``settings.CLASSIFIER_PATH`` is not defined.
    """

    def _to_hit_group_class(results):
        # Lazily wrap each classification result in an unsaved HitGroupClass;
        # probabilities are persisted as a JSON string.
        for result in results:
            doc = result['document']
            prob = result['probabilities']
            yield HitGroupClass(
                group_id=doc['group_id'],
                classes=NaiveBayesClassifier.most_likely(result),
                probabilities=json.dumps(prob))

    if options['clear_all']:
        logger.info('Removing all existing classification')
        # A single raw DELETE is issued instead of going through the ORM
        # (HitGroupClass.objects.all().delete()); presumably for speed --
        # TODO confirm.
        execute_sql('DELETE FROM main_hitgroupclass;', commit=True)
        return

    if options['begin'] and options['end']:
        # XXX it can be slow.
        # ``group_id`` exists in both joined relations, so it has to be
        # qualified in the select list and in GROUP BY.
        # NOTE(review): ``begin`` and ``end`` are interpolated verbatim, so
        # they must already be valid SQL literals -- confirm against the
        # option definitions.
        query = '''
            SELECT content.group_id, title, description, keywords
            FROM main_hitgroupcontent as content
            JOIN hits_mv ON content.group_id = hits_mv.group_id
            WHERE
                NOT EXISTS(
                    SELECT * FROM main_hitgroupclass as class
                    WHERE content.group_id = class.group_id
                ) AND
                hits_mv.start_time >= {} AND
                hits_mv.start_time < {}
            GROUP BY content.group_id
            LIMIT {};
        '''.format(options['begin'], options['end'], self.BATCH_SIZE)
    else:
        query = '''
            SELECT group_id, title, description, keywords
            FROM main_hitgroupcontent as content
            WHERE NOT EXISTS(
                SELECT * FROM main_hitgroupclass as class
                WHERE content.group_id = class.group_id
            ) LIMIT {};
        '''.format(self.BATCH_SIZE)

    classifier_path = options['classifier_path']
    if not classifier_path:
        try:
            classifier_path = settings.CLASSIFIER_PATH
        except AttributeError:
            raise ImproperlyConfigured('Classifier path is not specified '
                                       'neither in the settings file '
                                       'nor in the command line')

    with open(classifier_path, 'r') as classifier_file:
        probabilities = json.load(classifier_file)

    classifier = NaiveBayesClassifier(probabilities=probabilities)

    logger.info('Classification of hit groups started. Processing in '
                'batches size of {}'.format(self.BATCH_SIZE))

    # The same query is re-run for every batch: rows classified by the
    # previous iteration are excluded by its NOT EXISTS clause.
    while True:
        models = query_to_dicts(query)
        logger.info('Batch classification started')
        try:
            results = _to_hit_group_class(classifier.classify_batch(models))
            HitGroupClass.objects.bulk_create(results)
        except EmptyBatchException:
            logger.info('Batch is empty no hit groups to classify')
            break
        logger.info('Batch classified successfully')
def _to_hit_group_class(results): for result in results: doc = result['document'] prob = result['probabilities'] yield HitGroupClass(group_id=doc['group_id'], classes=NaiveBayesClassifier.most_likely(result), probabilities=json.dumps(prob))
def hit_group_details(request, hit_group_id):
    """Render the details page of a single hit group.

    When no ``HitGroupContent`` with ``group_id == hit_group_id`` exists, or
    its requester has a ``RequesterProfile`` with ``is_public=False``, an
    info message is queued and the user is redirected to
    ``haystack_search``.

    If the hit group has no stored ``HitGroupClass`` yet, it is classified
    on the fly with the classifier loaded from ``settings.CLASSIFIER_PATH``
    and the result is saved. An ``IOError`` while doing so is ignored, and
    the page is then rendered with the default class label.

    The template ``main/hit_group_details.html`` receives the chart
    parameters, the class label, the hit group and the
    ``(start_time, hits_available)`` series read from ``hits_mv``.
    """
    try:
        hit_group = HitGroupContent.objects.get(group_id=hit_group_id)
        # Hit groups of non-public requesters are reported as missing.
        if RequesterProfile.objects.filter(
                requester_id=hit_group.requester_id,
                is_public=False).exists():
            raise HitGroupContent.DoesNotExist()
    except HitGroupContent.DoesNotExist:
        messages.info(request, 'Hitgroup with id "{0}" was not found!'.format(
            hit_group_id))
        return redirect('haystack_search')

    try:
        hit_group_class = HitGroupClass.objects.get(group_id=hit_group_id)
    except ObjectDoesNotExist:
        # TODO classification should be done on all models.
        hit_group_class = None
        try:
            with open(settings.CLASSIFIER_PATH, "r") as classifier_file:
                classifier = NaiveBayesClassifier(
                    probabilities=json.load(classifier_file)
                )
            classified = classifier.classify(hit_group)
            most_likely = classifier.most_likely(classified)
            document = classified["document"]
            hit_group_class = HitGroupClass(
                group_id=document.group_id,
                classes=most_likely,
                # Stored as a JSON string, the same way the batch
                # classification code serializes this field.
                probabilities=json.dumps(classified["probabilities"]))
            hit_group_class.save()
        except IOError:
            # We do not want make hit group details page unavailable when
            # classifier file does not exist.
            pass

    if hit_group_class is not None:
        hit_group_class_label = NaiveBayesClassifier.label(
            hit_group_class.classes
        )
    else:
        hit_group_class_label = NaiveBayesClassifier.label()

    params = {
        'multichart': False,
        'columns': HIT_DETAILS_COLUMNS,
        'title': '#Hits',
        'class': hit_group_class_label,
    }

    def hit_group_details_data_formater(rows):
        # Reshape each database row into the {'date', 'row'} form the
        # template is given.
        for cc in rows:
            yield {
                'date': cc['start_time'],
                'row': (str(cc['hits_available']),),
            }

    # NOTE(review): the id is interpolated into the SQL text. At this point
    # it has already matched an existing HitGroupContent row, but a bind
    # parameter would be safer if query_to_dicts supports one -- confirm.
    dicts = query_to_dicts(
        """ select start_time, hits_available from hits_mv
            where group_id = '{}' order by start_time asc """
        .format(hit_group_id))
    data = hit_group_details_data_formater(dicts)

    params['date_from'] = hit_group.occurrence_date
    params['date_to'] = datetime.datetime.utcnow()
    params['data'] = data
    params['hit_group'] = hit_group
    return direct_to_template(request, 'main/hit_group_details.html', params)