Example #1
def compute_dataset_num_contributions_per_day(store_key, dataset_id):
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)

        # Votes on this dataset's candidate annotations, aggregated per calendar day
        contributions = Vote.objects\
            .filter(candidate_annotation__sound_dataset__dataset=dataset)\
            .annotate(day=TruncDay('created_at'))\
            .values('day')\
            .annotate(count=Count('id'))\
            .values('day', 'count')

        # Earliest vote marks the start of the date range (made naive to match datetime.now())
        start_date = Vote.objects\
            .filter(candidate_annotation__sound_dataset__dataset=dataset)\
            .order_by('created_at')[0].created_at.replace(tzinfo=None)
        end_date = datetime.datetime.now()
        dates = [str(start_date + datetime.timedelta(days=x))[:10] for x in range(0, (end_date - start_date).days)]

        # Zero-fill every day in the range, then overwrite the days that have votes
        contributions_per_day = {d: 0 for d in dates}
        contributions_per_day.update({str(o['day'])[:10]: o['count'] for o in contributions})

        store.set(store_key, {'contribution_per_day': json.dumps([[day, count]
                                                                  for day, count in contributions_per_day.items()])})

        logger.info('Finished computing data for {0}'.format(store_key))

    except Dataset.DoesNotExist:
        pass
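The pattern above is worth isolating: seed a dict with one zero entry per calendar day, then overwrite only the days that actually have activity. A minimal self-contained sketch with made-up counts (note that Example #7 below additionally sorts the pairs before dumping them):

import datetime
import json

# Hypothetical aggregated rows, shaped like the .values('day', 'count') queryset above
rows = [{'day': datetime.datetime(2018, 3, 2), 'count': 4},
        {'day': datetime.datetime(2018, 3, 4), 'count': 1}]

start_date = datetime.datetime(2018, 3, 1)
end_date = datetime.datetime(2018, 3, 5)

# One 'YYYY-MM-DD' key per day in the range, all starting at zero
per_day = {str(start_date + datetime.timedelta(days=x))[:10]: 0
           for x in range((end_date - start_date).days)}
# Days with activity overwrite their zero entry
per_day.update({str(row['day'])[:10]: row['count'] for row in rows})

print(json.dumps(sorted(per_day.items())))
# [["2018-03-01", 0], ["2018-03-02", 4], ["2018-03-03", 0], ["2018-03-04", 1]]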
Example #2
def compute_dataset_basic_stats(store_key, dataset_id):
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        store.set(
            store_key, {
                'num_taxonomy_nodes': dataset.taxonomy.get_num_nodes(),
                'num_sounds': dataset.num_sounds_with_candidate,
                'num_annotations': dataset.num_annotations,
                'avg_annotations_per_sound': dataset.avg_annotations_per_sound,
                'percentage_validated_annotations': dataset.percentage_validated_annotations,
                'num_ground_truth_annotations': dataset.num_ground_truth_annotations,
                'num_verified_annotations': dataset.num_verified_annotations,
                'num_user_contributions': dataset.num_user_contributions,
                'percentage_verified_annotations': dataset.percentage_verified_annotations,
                'num_categories_reached_goal': dataset.num_categories_reached_goal,
                'num_non_omitted_nodes': dataset.num_non_omitted_nodes
            })
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass
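All of these task bodies share the same calling convention: a store key under which to cache the results, plus the dataset id. Exercising one directly looks like this (hypothetical key and id, assuming the surrounding Django project is configured):

# 'dataset-1-basic-stats' and dataset_id=1 are made-up values for illustration
compute_dataset_basic_stats('dataset-1-basic-stats', dataset_id=1)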
Example #3
def compute_dataset_top_contributed_categories(store_key, dataset_id, N=15):
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        nodes = dataset.taxonomy.taxonomynode_set.all()
        reference_date = datetime.datetime.today() - datetime.timedelta(days=7)
        top_categories = list()
        top_categories_last_week = list()

        for node in nodes:
            num_votes = Vote.objects.filter(candidate_annotation__taxonomy_node=node).count()
            top_categories.append((node.url_id, node.name, num_votes, node.omitted))
            num_votes_last_week = Vote.objects.filter(candidate_annotation__taxonomy_node=node, created_at__gt=reference_date).count()
            top_categories_last_week.append((node.url_id, node.name, num_votes_last_week, node.omitted))

        top_categories = sorted(top_categories, key=lambda x: x[2], reverse=True)  # Sort by number of votes
        top_categories_last_week = sorted(top_categories_last_week, key=lambda x: x[2], reverse=True)

        store.set(store_key, {'top_categories': top_categories[:N],
                              'top_categories_last_week': top_categories_last_week[:N]})

        logger.info('Finished computing data for {0}'.format(store_key))

    except Dataset.DoesNotExist:
        pass
Example #4
def compute_remaining_annotations_with_duration(store_key, dataset_id):
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        nodes = dataset.taxonomy.taxonomynode_set.all()
        remaining_categories = list()

        for node in nodes:
            non_gt_annotations = dataset.non_ground_truth_annotations_per_taxonomy_node(node.node_id)
            num_candidate_annotations_non_gt = non_gt_annotations.count()
            num_candidate_annotations_non_gt_max_10_sec = non_gt_annotations\
                .filter(sound_dataset__sound__extra_data__duration__lte=10).count()
            num_candidate_annotations_non_gt_max_20_sec = non_gt_annotations\
                .filter(sound_dataset__sound__extra_data__duration__lte=20,
                        sound_dataset__sound__extra_data__duration__gt=10).count()  # 10 < duration <= 20
            remaining_categories.append((node.url_id, node.name, num_candidate_annotations_non_gt,
                                         num_candidate_annotations_non_gt_max_10_sec,
                                         num_candidate_annotations_non_gt_max_20_sec,
                                         node.omitted))

        remaining_categories = sorted(remaining_categories, key=lambda x: x[3])  # Sort by number of remaining short (<=10s) annotations

        store.set(store_key, {'remaining_categories': remaining_categories})

        logger.info('Finished computing data for {0}'.format(store_key))

    except Dataset.DoesNotExist:
        pass
Example #5
def compute_taxonomy_tree(store_key):
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(short_name='fsd')
        taxonomy_tree = dataset.taxonomy.get_taxonomy_as_tree()
        store.set(store_key, taxonomy_tree)
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass
Example #6
def task_prerun(signal=None,
                sender=None,
                task_id=None,
                task=None,
                args=None,
                kwargs=None):
    # Set computing key
    computing_store_key = 'computing-{0}'.format(task.name)
    store.set(computing_store_key, {'running': True})
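The handler's signature (signal, sender, task_id, task, args, kwargs) matches what Celery's task_prerun signal sends, so it is presumably registered against that signal. A sketch of the usual wiring, assuming Celery is indeed the task runner here:

from celery import signals

# Run the handler before every task; Celery supplies the keyword
# arguments that the handler above declares
signals.task_prerun.connect(task_prerun)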
Example #7
def compute_dataset_num_ground_truth_per_day(store_key, dataset_id):
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)

        # Ground-truth annotations created directly from votes, per day
        num_ground_truth_not_from_propagation = GroundTruthAnnotation.objects\
            .filter(sound_dataset__dataset=dataset)\
            .filter(from_propagation=False)\
            .annotate(day=TruncDay('created_at'))\
            .values('day')\
            .annotate(count=Count('id'))\
            .values('day', 'count')

        # Ground-truth annotations created via propagation, per day
        num_ground_truth_from_propagation = GroundTruthAnnotation.objects\
            .filter(sound_dataset__dataset=dataset)\
            .filter(from_propagation=True)\
            .annotate(day=TruncDay('created_at'))\
            .values('day')\
            .annotate(count=Count('id'))\
            .values('day', 'count')

        start_date = GroundTruthAnnotation.objects\
            .filter(sound_dataset__dataset=dataset)\
            .order_by('created_at')[0].created_at.replace(tzinfo=None)
        end_date = datetime.datetime.now()
        dates = [str(start_date + datetime.timedelta(days=x))[:10] for x in range(0, (end_date - start_date).days)]

        num_ground_truth_not_from_propagation_per_day = {d: 0 for d in dates}
        num_ground_truth_not_from_propagation_per_day.update({str(o['day'])[:10]: o['count']
                                                              for o in num_ground_truth_not_from_propagation})
        num_ground_truth_from_propagation_per_day = {d: 0 for d in dates}
        num_ground_truth_from_propagation_per_day.update({str(o['day'])[:10]: o['count']
                                                          for o in num_ground_truth_from_propagation})

        store.set(store_key, {
            'num_ground_truth_not_from_propagation_per_day':
                json.dumps(sorted([[day, count]
                                   for day, count in num_ground_truth_not_from_propagation_per_day.items()],
                                  key=lambda x: x[0])),
            'num_ground_truth_from_propagation_per_day':
                json.dumps(sorted([[day, count]
                                   for day, count in num_ground_truth_from_propagation_per_day.items()],
                                  key=lambda x: x[0]))
        })

        logger.info('Finished computing data for {0}'.format(store_key))

    except Dataset.DoesNotExist:
        pass
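Sorting the [day, count] pairs by x[0] operates on the date strings directly, which is safe here: zero-padded 'YYYY-MM-DD' strings sort lexicographically in the same order as chronologically.

pairs = [['2018-03-10', 2], ['2018-03-02', 5], ['2018-02-28', 1]]
print(sorted(pairs, key=lambda x: x[0]))
# [['2018-02-28', 1], ['2018-03-02', 5], ['2018-03-10', 2]]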
Example #8
def compute_dataset_bad_mapping(store_key, dataset_id):
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        nodes = dataset.taxonomy.taxonomynode_set.all()
        reference_date = datetime.datetime.today() - datetime.timedelta(days=31)
        bad_mapping_categories = list()
        bad_mapping_categories_last_month = list()

        for node in nodes:
            num_PP = dataset.num_votes_with_value(node.node_id, 1.0)
            num_PNP = dataset.num_votes_with_value(node.node_id, 0.5)
            num_NP = dataset.num_votes_with_value(node.node_id, -1.0)
            num_U = dataset.num_votes_with_value(node.node_id, 0.0)
            try:
                bad_mapping_score = (num_NP + num_U) / (num_PP + num_PNP + num_NP + num_U)
            except ZeroDivisionError:
                bad_mapping_score = 0

            num_PP_last_month = dataset.num_votes_with_value_after_date(node.node_id, 1.0, reference_date)
            num_PNP_last_month = dataset.num_votes_with_value_after_date(node.node_id, 0.5, reference_date)
            num_NP_last_month = dataset.num_votes_with_value_after_date(node.node_id, -1.0, reference_date)
            num_U_last_month = dataset.num_votes_with_value_after_date(node.node_id, 0.0, reference_date)
            try:
                bad_mapping_score_last_month = (num_NP_last_month + num_U_last_month) / \
                                               (num_PP_last_month + num_PNP_last_month + num_NP_last_month + num_U_last_month)
            except ZeroDivisionError:
                bad_mapping_score_last_month = 0

            bad_mapping_categories.append((node.url_id, node.name, bad_mapping_score, node.omitted))
            bad_mapping_categories_last_month.append((node.url_id, node.name, bad_mapping_score_last_month, node.omitted))

        # Sort by mapping score, worst mappings first
        bad_mapping_categories = sorted(bad_mapping_categories, key=lambda x: x[2], reverse=True)
        bad_mapping_categories_last_month = sorted(bad_mapping_categories_last_month,
                                                   key=lambda x: x[2], reverse=True)

        store.set(store_key, {'bad_mapping_categories': bad_mapping_categories,
                              'bad_mapping_categories_last_month': bad_mapping_categories_last_month})

        logger.info('Finished computing data for {0}'.format(store_key))

    except Dataset.DoesNotExist:
        pass
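The score is the fraction of a category's votes that came back "not present" (-1.0) or "unsure" (0.0): a category whose candidate annotations are mostly rejected or uncertain scores close to 1. A quick worked example with made-up vote counts:

# Hypothetical counts of 1.0, 0.5, -1.0 and 0.0 votes for one category
num_PP, num_PNP, num_NP, num_U = 10, 5, 25, 10

bad_mapping_score = (num_NP + num_U) / (num_PP + num_PNP + num_NP + num_U)
print(bad_mapping_score)  # 35 / 50 = 0.7 -> most votes reject the mapping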
Example #9
def compute_dataset_difficult_agreement(store_key, dataset_id):
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        nodes = dataset.taxonomy.taxonomynode_set.all()
        reference_date = datetime.datetime.today() - datetime.timedelta(days=31)
        difficult_agreement_categories = list()
        difficult_agreement_categories_last_month = list()

        for node in nodes:
            ground_truth_annotations = node.ground_truth_annotations.filter(from_propagation=False)
            ground_truth_annotations_last_month = node.ground_truth_annotations.filter(from_propagation=False,
                                                                                       created_at__gt=reference_date)
            try:
                mean_votes_agreement = mean([annotation.from_candidate_annotation.votes.exclude(test='FA').count()
                                             for annotation in ground_truth_annotations])
            except StatisticsError:
                mean_votes_agreement = 0
            try:
                mean_votes_agreement_last_month = mean([annotation.from_candidate_annotation.votes.exclude(test='FA').count()
                                                        for annotation in ground_truth_annotations_last_month])
            except StatisticsError:
                mean_votes_agreement_last_month = 0

            difficult_agreement_categories.append((node.url_id, node.name, mean_votes_agreement, node.omitted))
            difficult_agreement_categories_last_month.append((node.url_id, node.name, mean_votes_agreement_last_month, node.omitted))

        # Keep only categories whose ground truth needed more than 2 votes on average
        difficult_agreement_categories = [category_name_votes for category_name_votes in difficult_agreement_categories
                                          if category_name_votes[2] > 2]
        difficult_agreement_categories = sorted(difficult_agreement_categories, key=lambda x: x[2], reverse=True)
        difficult_agreement_categories_last_month = [category_name_votes for category_name_votes
                                                     in difficult_agreement_categories_last_month
                                                     if category_name_votes[2] > 2]
        difficult_agreement_categories_last_month = sorted(difficult_agreement_categories_last_month,
                                                           key=lambda x: x[2], reverse=True)

        store.set(store_key, {'difficult_agreement_categories': difficult_agreement_categories,
                              'difficult_agreement_categories_last_month': difficult_agreement_categories_last_month})

        logger.info('Finished computing data for {0}'.format(store_key))

    except Dataset.DoesNotExist:
        pass
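The try/except blocks above are needed because statistics.mean raises StatisticsError on an empty sequence (a category with no qualifying ground-truth annotations yet) instead of returning 0:

from statistics import mean, StatisticsError

try:
    mean([])  # no data points
except StatisticsError:
    print('mean() of empty data raises StatisticsError, hence the fallback to 0')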
Example #10
def compute_annotators_ranking(store_key, dataset_id, N=10):
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        reference_date = timezone.now() - datetime.timedelta(days=7)
        current_day_date = timezone.now().replace(hour=0,
                                                  minute=0,
                                                  second=0,
                                                  microsecond=0)
        ranking = list()
        ranking_last_week = list()
        ranking_today = list()
        ranking_agreement_today = list()
        for user in User.objects.all():
            # all time
            n_annotations = CandidateAnnotation.objects.filter(
                created_by=user, sound_dataset__dataset=dataset,
                type='MA').count()
            n_votes = Vote.objects.filter(
                created_by=user,
                candidate_annotation__sound_dataset__dataset=dataset).count()
            ranking.append((user.username, n_annotations + n_votes))

            # last week
            n_annotations_last_week = CandidateAnnotation.objects.filter(
                created_at__gt=reference_date,
                created_by=user,
                sound_dataset__dataset=dataset,
                type='MA').count()
            n_votes_last_week = Vote.objects.filter(
                created_at__gt=reference_date,
                created_by=user,
                candidate_annotation__sound_dataset__dataset=dataset).count()
            ranking_last_week.append(
                (user.username, n_annotations_last_week + n_votes_last_week))

            # today
            agreement_score = 0
            n_annotations_today = CandidateAnnotation.objects.filter(
                created_at__gt=current_day_date,
                created_by=user,
                sound_dataset__dataset=dataset,
                type='MA').count()
            n_votes_today = Vote.objects.filter(
                created_at__gt=current_day_date,
                created_by=user,
                candidate_annotation__sound_dataset__dataset=dataset).count()

            ranking_today.append(
                (user.username, n_annotations_today + n_votes_today))

            # agreement score today
            votes = Vote.objects.filter(
                created_by=user,
                candidate_annotation__sound_dataset__dataset=dataset,
                created_at__gt=current_day_date)
            for vote in votes:
                all_vote_values = [
                    v.vote for v in vote.candidate_annotation.votes.all()
                ]
                if all_vote_values.count(vote.vote) > 1:
                    # At least one other vote has the same value: agreement
                    agreement_score += 1
                elif len(all_vote_values) > 1:
                    # Others voted, but none with the same value: disagreement
                    pass
                else:
                    # Only vote on this annotation so far: count half
                    agreement_score += 0.5
            try:
                ranking_agreement_today.append(
                    (user.username, agreement_score / float(n_votes_today)))
            except ZeroDivisionError:
                ranking_agreement_today.append((user.username, 0))

        ranking = sorted(ranking, key=lambda x: x[1],
                         reverse=True)  # Sort by number of contributions (annotations + votes)
        ranking_last_week = sorted(ranking_last_week,
                                   key=lambda x: x[1],
                                   reverse=True)
        ranking_today = sorted(ranking_today, key=lambda x: x[1], reverse=True)
        ranking_agreement_today = sorted(ranking_agreement_today,
                                         key=lambda x: x[1],
                                         reverse=True)

        store.set(
            store_key, {
                'ranking': ranking[:N],
                'ranking_last_week': ranking_last_week[:N],
                'ranking_today': ranking_today,
                'ranking_agreement_today': ranking_agreement_today
            })
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass
    except User.DoesNotExist:
        pass
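The least obvious part of this task is the per-vote agreement rule: a vote scores 1 when at least one other user cast the same value on the same candidate annotation, 0 when other votes exist but none agree, and 0.5 when it is the only vote so far. The same rule in isolation, over hypothetical vote values:

def vote_agreement(own_vote, all_vote_values):
    """all_vote_values includes own_vote, as in the queryset above."""
    if all_vote_values.count(own_vote) > 1:
        return 1     # someone else cast the same value
    elif len(all_vote_values) > 1:
        return 0     # others voted, but none agree
    return 0.5       # only vote so far

print(vote_agreement(1.0, [1.0, 1.0, -1.0]))  # 1
print(vote_agreement(1.0, [1.0, -1.0]))       # 0
print(vote_agreement(1.0, [1.0]))             # 0.5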
Example #11
def compute_dataset_taxonomy_stats(store_key, dataset_id):
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        node_ids = dataset.taxonomy.get_all_node_ids()
        from django.db import connection
        with connection.cursor() as cursor:
            cursor.execute(
                """
                    SELECT taxonomynode.node_id
                           , COUNT(candidateannotation.id)
                           , COUNT(DISTINCT(sound.id))
                        FROM datasets_candidateannotation candidateannotation
                  INNER JOIN datasets_sounddataset sounddataset
                          ON candidateannotation.sound_dataset_id = sounddataset.id
                  INNER JOIN datasets_sound sound
                          ON sound.id = sounddataset.sound_id
                  INNER JOIN datasets_taxonomynode taxonomynode
                          ON taxonomynode.id = candidateannotation.taxonomy_node_id
                       WHERE taxonomynode.node_id IN %s
                         AND sounddataset.dataset_id = %s
                    GROUP BY taxonomynode.node_id
                           """, (tuple(node_ids), dataset.id))
            node_n_annotations_n_sounds = cursor.fetchall()

        annotation_numbers = {}
        for node_id, num_ann, num_sounds in node_n_annotations_n_sounds:
            # In commit https://github.com/MTG/freesound-datasets/commit/0a748ec3e8481cc1ca4625bced24e0aee9d059d0 we
            # introduced a single SQL query that got num_ann, num_sounds and num_missing_votes in one go.
            # However, when tested in production, the query took hours to complete on a full-sized dataset.
            # To make it run in a reasonable amount of time we now issue one query per taxonomy node to get the
            # number of non-validated annotations. This should be refactored to use a single query that gets the
            # non-validated annotation counts for all nodes.
            num_missing_votes = dataset.num_non_validated_annotations_per_taxonomy_node(
                node_id)
            votes_stats = {
                'num_present_and_predominant': dataset.num_votes_with_value(node_id, 1.0),
                'num_present_not_predominant': dataset.num_votes_with_value(node_id, 0.5),
                'num_not_present': dataset.num_votes_with_value(node_id, -1.0),
                'num_unsure': dataset.num_votes_with_value(node_id, 0.0)
            }

            annotation_numbers[node_id] = {
                'num_annotations': num_ann,
                'num_sounds': num_sounds,
                'num_missing_votes': num_missing_votes,
                'votes_stats': votes_stats
            }

        nodes_data = []
        for node in dataset.taxonomy.get_all_nodes():
            try:
                counts = annotation_numbers[node.node_id]
            except KeyError:
                # Can happen if there are no annotations/sounds for a category
                counts = {
                    'num_sounds': 0,
                    'num_annotations': 0,
                    'num_missing_votes': 0,
                    'votes_stats': None,
                }
            node_stats = calculate_taxonomy_node_stats(
                dataset, node.as_dict(), counts['num_sounds'],
                counts['num_annotations'], counts['num_missing_votes'],
                counts['votes_stats'])
            node_stats.update({
                'id': node.node_id,
                'name': node.name,
            })
            nodes_data.append(node_stats)

        store.set(store_key, {
            'nodes_data': nodes_data,
        })
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass
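One detail of the raw query worth calling out: binding a Python tuple to the IN %s placeholder relies on the database driver expanding it into a parenthesized value list. psycopg2, the usual PostgreSQL driver under Django (an assumption about this project's setup), adapts tuples exactly that way:

from psycopg2.extensions import adapt

# A tuple adapts to an SQL value list, which is what "IN %s" expects
print(adapt((1, 2, 3)).getquoted())  # b'(1, 2, 3)'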