Esempio n. 1
0
 def handle(self, *args, **options):
     """Classify hit groups that have no classification yet, in batches.

     Options read from ``options``:
         clear_all: when truthy, delete every row of
             ``main_hitgroupclass`` and return without classifying.
         begin, end: when both are truthy, only hit groups that have a
             ``hits_mv`` row with ``begin <= start_time < end`` are
             selected.
         classifier_path: path of the JSON file holding the classifier
             probabilities; when falsy, ``settings.CLASSIFIER_PATH`` is
             used instead (and written back into ``options``).

     Raises:
         ImproperlyConfigured: if no classifier path is given in the
             options and the settings do not define ``CLASSIFIER_PATH``.
     """
     def _to_hit_group_class(results):
         """Lazily turn classifier results into unsaved HitGroupClass rows."""
         for result in results:
             doc = result['document']
             prob = result['probabilities']
             yield HitGroupClass(group_id=doc['group_id'],
                                 classes=NaiveBayesClassifier.most_likely(result),
                                 probabilities=json.dumps(prob))

     if options['clear_all']:
         logger.info('Removing all existing classification')
         # Raw SQL is used instead of HitGroupClass.objects.all().delete().
         execute_sql('DELETE FROM main_hitgroupclass;', commit=True)
         return

     if options['begin'] and options['end']:
         # XXX it can be slow.
         # Columns are qualified with ``content.`` because ``group_id``
         # exists in both joined tables, and every selected column is
         # listed in GROUP BY so the statement is valid on strict engines.
         # NOTE(review): ``begin``/``end`` are interpolated verbatim, so
         # the caller must pass valid SQL literals; switch to bound
         # parameters if query_to_dicts supports them -- TODO confirm.
         query = ''' SELECT content.group_id, content.title,
                            content.description, content.keywords
                     FROM main_hitgroupcontent as content
                     JOIN hits_mv
                     ON content.group_id = hits_mv.group_id
                     WHERE
                         NOT EXISTS(
                             SELECT * FROM main_hitgroupclass as class
                             WHERE content.group_id = class.group_id
                         ) AND
                         hits_mv.start_time >= {} AND
                         hits_mv.start_time < {}
                     GROUP BY content.group_id, content.title,
                              content.description, content.keywords
                     LIMIT {};
                 '''.format(options['begin'], options['end'], self.BATCH_SIZE)
     else:
         query = ''' SELECT group_id, title, description, keywords
                     FROM main_hitgroupcontent as content
                     WHERE NOT EXISTS(
                         SELECT * FROM main_hitgroupclass as class
                         WHERE content.group_id = class.group_id
                     ) LIMIT {};
                 '''.format(self.BATCH_SIZE)

     if not options['classifier_path']:
         try:
             options['classifier_path'] = settings.CLASSIFIER_PATH
         except AttributeError:
             raise ImproperlyConfigured('Classifier path is not specified '
                                        'neither in the settings file '
                                        'nor in the command line')

     # Only the JSON load needs the file; close it before the batch loop.
     with open(options['classifier_path'], 'r') as classifier_file:
         probabilities = json.load(classifier_file)
     classifier = NaiveBayesClassifier(probabilities=probabilities)

     logger.info('Classification of hit groups started. Processing in '
                 'batches size of {}'.format(self.BATCH_SIZE))
     while True:
         # The same query is re-issued on every pass; it is expected to
         # return a fresh batch because rows stored by the previous pass
         # no longer satisfy the NOT EXISTS filter.
         models = query_to_dicts(query)
         logger.info('Batch classification started')
         try:
             results = _to_hit_group_class(classifier.classify_batch(models))
             HitGroupClass.objects.bulk_create(results)
         except EmptyBatchException:
             # Presumably raised by the classifier when the query returned
             # no rows; this is the loop's only exit.
             logger.info('Batch is empty no hit groups to classify')
             break
         logger.info('Batch classified successfully')
Esempio n. 2
0
 def _to_hit_group_class(results):
     for result in results:
         doc = result['document']
         prob = result['probabilities']
         yield HitGroupClass(group_id=doc['group_id'],
                             classes=NaiveBayesClassifier.most_likely(result),
                             probabilities=json.dumps(prob))
Esempio n. 3
0
def hit_group_details(request, hit_group_id):
    """Render the details page of a single hit group.

    The hit group is looked up by ``group_id``. If it does not exist, or
    its requester has a ``RequesterProfile`` with ``is_public=False``, an
    info message is queued and the user is redirected to
    ``'haystack_search'``.

    If no ``HitGroupClass`` is stored for the group, the view tries to
    classify it on the fly with the classifier loaded from
    ``settings.CLASSIFIER_PATH`` and saves the result. A classifier file
    that cannot be opened is tolerated: the page is then rendered with the
    label returned by ``NaiveBayesClassifier.label()`` without arguments.

    Args:
        request: the current HTTP request.
        hit_group_id: the ``group_id`` of the hit group to display.

    Returns:
        The redirect response, or the rendered
        ``main/hit_group_details.html`` template response.
    """
    try:
        hit_group = HitGroupContent.objects.get(group_id=hit_group_id)
        # .exists() asks the database for presence only instead of loading
        # the matching profiles just to test truthiness.
        requester_is_private = RequesterProfile.objects.filter(
            requester_id=hit_group.requester_id, is_public=False).exists()
        if requester_is_private:
            # Treat groups of non-public requesters as if they were missing.
            raise HitGroupContent.DoesNotExist()
    except HitGroupContent.DoesNotExist:
        messages.info(request, 'Hitgroup with id "{0}" was not found!'.format(
            hit_group_id))
        return redirect('haystack_search')

    try:
        hit_group_class = HitGroupClass.objects.get(group_id=hit_group_id)
    except ObjectDoesNotExist:
        # TODO classification should be done on all models.
        hit_group_class = None
        try:
            # Keep the try body to the file access so only a missing or
            # unreadable classifier file is silenced.
            with open(settings.CLASSIFIER_PATH, "r") as classifier_file:
                probabilities = json.load(classifier_file)
        except IOError:
            # We do not want make hit group details page unavailable when
            # classifier file does not exist.
            pass
        else:
            classifier = NaiveBayesClassifier(probabilities=probabilities)
            classified = classifier.classify(hit_group)
            most_likely = classifier.most_likely(classified)
            document = classified["document"]
            # Probabilities are stored as a JSON string, matching how the
            # batch classification command in this code base saves them.
            hit_group_class = HitGroupClass(
                group_id=document.group_id,
                classes=most_likely,
                probabilities=json.dumps(classified["probabilities"]))
            hit_group_class.save()

    if hit_group_class is not None:
        hit_group_class_label = NaiveBayesClassifier.label(
            hit_group_class.classes)
    else:
        hit_group_class_label = NaiveBayesClassifier.label()

    params = {
        'multichart': False,
        'columns': HIT_DETAILS_COLUMNS,
        'title': '#Hits',
        'class': hit_group_class_label,
    }

    def hit_group_details_data_formater(rows):
        """Yield one chart point (date plus a one-element row) per record."""
        for cc in rows:
            yield {
                'date': cc['start_time'],
                'row': (str(cc['hits_available']),),
            }

    # NOTE(review): hit_group_id is interpolated into the SQL text. It is
    # only reached after the ORM lookup above matched an existing group,
    # but bound parameters would be safer if query_to_dicts accepts them
    # -- TODO confirm.
    dicts = query_to_dicts(
                """ select start_time, hits_available from hits_mv
                    where group_id = '{}' order by start_time asc """
                .format(hit_group_id))
    data = hit_group_details_data_formater(dicts)
    params['date_from'] = hit_group.occurrence_date
    params['date_to'] = datetime.datetime.utcnow()
    params['data'] = data
    params['hit_group'] = hit_group
    return direct_to_template(request, 'main/hit_group_details.html', params)