def compute_dataset_num_contributions_per_day(store_key, dataset_id):
    """Compute the number of Vote contributions per day for a dataset.

    Stores under `store_key` a dict with key 'contribution_per_day' whose value
    is a JSON-encoded list of [day, count] pairs ('YYYY-MM-DD'), sorted by day.
    Silently does nothing if the dataset does not exist.
    """
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        contributions = Vote.objects\
            .filter(candidate_annotation__sound_dataset__dataset=dataset)\
            .annotate(day=TruncDay('created_at'))\
            .values('day')\
            .annotate(count=Count('id'))\
            .values('day', 'count')
        # Guard against a dataset with no votes at all: the original indexed
        # [0] on the queryset, which raises IndexError and crashes the task.
        first_vote = Vote.objects\
            .filter(candidate_annotation__sound_dataset__dataset=dataset)\
            .order_by('created_at').first()
        if first_vote is None:
            store.set(store_key, {'contribution_per_day': json.dumps([])})
            logger.info('Finished computing data for {0}'.format(store_key))
            return
        start_date = first_vote.created_at.replace(tzinfo=None)
        end_date = datetime.datetime.now()
        # +1 so the current (possibly partial) day gets a zero baseline too;
        # previously today's counts were only appended, unordered, by update().
        dates = [str(start_date + datetime.timedelta(days=x))[:10]
                 for x in range(0, (end_date - start_date).days + 1)]
        contributions_per_day = {d: 0 for d in dates}
        contributions_per_day.update({str(o['day'])[:10]: o['count'] for o in contributions})
        # Sort chronologically, consistent with compute_dataset_num_ground_truth_per_day.
        store.set(store_key, {'contribution_per_day': json.dumps(
            sorted([[day, count] for day, count in contributions_per_day.items()],
                   key=lambda x: x[0]))})
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass
def compute_dataset_basic_stats(store_key, dataset_id):
    """Gather dataset-wide summary statistics and cache them under `store_key`.

    Silently does nothing if the dataset does not exist.
    """
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        # Most stored keys match dataset property names one-to-one.
        property_keys = (
            'num_annotations',
            'avg_annotations_per_sound',
            'percentage_validated_annotations',
            'num_ground_truth_annotations',
            'num_verified_annotations',
            'num_user_contributions',
            'percentage_verified_annotations',
            'num_categories_reached_goal',
            'num_non_omitted_nodes',
        )
        stats = {key: getattr(dataset, key) for key in property_keys}
        # These two are stored under names different from their accessors.
        stats['num_taxonomy_nodes'] = dataset.taxonomy.get_num_nodes()
        stats['num_sounds'] = dataset.num_sounds_with_candidate
        store.set(store_key, stats)
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass
def compute_dataset_top_contributed_categories(store_key, dataset_id, N=15):
    """Cache the top `N` taxonomy categories by number of votes, all-time and
    over the last 7 days, under `store_key`.

    Silently does nothing if the dataset does not exist.
    """
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        week_ago = datetime.datetime.today() - datetime.timedelta(days=7)
        all_time = []
        last_week = []
        for node in dataset.taxonomy.taxonomynode_set.all():
            node_votes = Vote.objects.filter(candidate_annotation__taxonomy_node=node)
            all_time.append(
                (node.url_id, node.name, node_votes.count(), node.omitted))
            last_week.append(
                (node.url_id, node.name,
                 node_votes.filter(created_at__gt=week_ago).count(), node.omitted))
        # Rank by vote count, most-voted first.
        all_time.sort(key=lambda entry: entry[2], reverse=True)
        last_week.sort(key=lambda entry: entry[2], reverse=True)
        store.set(store_key, {'top_categories': all_time[:N],
                              'top_categories_last_week': last_week[:N]})
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass
def compute_remaining_annotations_with_duration(store_key, dataset_id):
    """Cache, per taxonomy category, how many candidate annotations still lack
    ground truth, broken down by sound duration (<=10s, and 10-20s).

    Silently does nothing if the dataset does not exist.
    """
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        remaining_categories = []
        for node in dataset.taxonomy.taxonomynode_set.all():
            pending = dataset.non_ground_truth_annotations_per_taxonomy_node(node.node_id)
            total = pending.count()
            up_to_10_sec = pending.filter(
                sound_dataset__sound__extra_data__duration__lte=10).count()
            from_10_to_20_sec = pending.filter(
                sound_dataset__sound__extra_data__duration__lte=20,
                sound_dataset__sound__extra_data__duration__gt=10).count()
            remaining_categories.append(
                (node.url_id, node.name, total, up_to_10_sec,
                 from_10_to_20_sec, node.omitted))
        # Sorted by the count of short (<=10s) remaining annotations.
        remaining_categories.sort(key=lambda row: row[3])
        store.set(store_key, {'remaining_categories': remaining_categories})
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass
def compute_taxonomy_tree(store_key):
    """Cache the full taxonomy tree of the 'fsd' dataset under `store_key`.

    Silently does nothing if the 'fsd' dataset does not exist.
    """
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        fsd_dataset = Dataset.objects.get(short_name='fsd')
        store.set(store_key, fsd_dataset.taxonomy.get_taxonomy_as_tree())
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass
def task_prerun(signal=None, sender=None, task_id=None, task=None, args=None, kwargs=None):
    """Celery pre-run hook: mark the given task as currently running in the store."""
    store.set('computing-{0}'.format(task.name), {'running': True})
def compute_dataset_num_ground_truth_per_day(store_key, dataset_id):
    """Compute, per day, the number of ground truth annotations created for a
    dataset, split into propagated and non-propagated annotations.

    Stores under `store_key` two JSON-encoded, day-sorted [day, count] series.
    Silently does nothing if the dataset does not exist.
    """
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        start_date = GroundTruthAnnotation.objects\
            .filter(sound_dataset__dataset=dataset)\
            .order_by('created_at')[0].created_at.replace(tzinfo=None)
        end_date = datetime.datetime.now()
        dates = [str(start_date + datetime.timedelta(days=x))[:10]
                 for x in range(0, (end_date - start_date).days)]

        def _per_day_series(from_propagation):
            # One grouped query: ground truth annotations created per day, for
            # either propagated or non-propagated annotations. Returns the
            # JSON-encoded, day-sorted [day, count] series. (The original
            # duplicated this query + dict-merging logic for both flags.)
            rows = GroundTruthAnnotation.objects\
                .filter(sound_dataset__dataset=dataset)\
                .filter(from_propagation=from_propagation)\
                .annotate(day=TruncDay('created_at'))\
                .values('day')\
                .annotate(count=Count('id'))\
                .values('day', 'count')
            per_day = {d: 0 for d in dates}
            per_day.update({str(o['day'])[:10]: o['count'] for o in rows})
            return json.dumps(sorted([[day, count] for day, count in per_day.items()],
                                     key=lambda x: x[0]))

        store.set(store_key, {
            'num_ground_truth_not_from_propagation_per_day': _per_day_series(False),
            'num_ground_truth_from_propagation_per_day': _per_day_series(True),
        })
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass
def compute_dataset_bad_mapping(store_key, dataset_id):
    """Cache, per taxonomy category, a "bad mapping" score: the fraction of
    votes that are "not present" or "unsure", all-time and over the last 31 days.

    Silently does nothing if the dataset does not exist.
    """
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        nodes = dataset.taxonomy.taxonomynode_set.all()
        reference_date = datetime.datetime.today() - datetime.timedelta(days=31)

        def _bad_mapping_score(node_id, after=None):
            # (num not-present + num unsure) / total votes; 0 when no votes.
            # Vote values: 1.0 = PP, 0.5 = PNP, -1.0 = NP, 0.0 = unsure.
            if after is None:
                num_pp, num_pnp, num_np, num_u = (
                    dataset.num_votes_with_value(node_id, v)
                    for v in (1.0, 0.5, -1.0, 0.0))
            else:
                num_pp, num_pnp, num_np, num_u = (
                    dataset.num_votes_with_value_after_date(node_id, v, after)
                    for v in (1.0, 0.5, -1.0, 0.0))
            try:
                return (num_np + num_u) / (num_pp + num_pnp + num_np + num_u)
            except ZeroDivisionError:
                return 0

        bad_mapping_categories = []
        bad_mapping_categories_last_month = []
        for node in nodes:
            bad_mapping_categories.append(
                (node.url_id, node.name, _bad_mapping_score(node.node_id), node.omitted))
            bad_mapping_categories_last_month.append(
                (node.url_id, node.name,
                 _bad_mapping_score(node.node_id, reference_date), node.omitted))
        # Sort by mapping score, worst first. (The original followed each sort
        # with an identity list comprehension — dead code, removed.)
        bad_mapping_categories.sort(key=lambda x: x[2], reverse=True)
        bad_mapping_categories_last_month.sort(key=lambda x: x[2], reverse=True)
        store.set(store_key, {
            'bad_mapping_categories': bad_mapping_categories,
            'bad_mapping_categories_last_month': bad_mapping_categories_last_month})
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass
def compute_dataset_difficult_agreement(store_key, dataset_id):
    """Cache the taxonomy categories that need more than 2 votes on average to
    reach ground truth agreement, all-time and over the last 31 days.

    Silently does nothing if the dataset does not exist.
    """
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        month_ago = datetime.datetime.today() - datetime.timedelta(days=31)

        def _mean_agreement_votes(annotations):
            # Mean number of non-failed ('FA' test excluded) votes behind each
            # ground truth annotation; StatisticsError (empty input) counts as 0.
            try:
                return mean(
                    annotation.from_candidate_annotation.votes.exclude(test='FA').count()
                    for annotation in annotations)
            except StatisticsError:
                return 0

        all_time = []
        last_month = []
        for node in dataset.taxonomy.taxonomynode_set.all():
            gt_annotations = node.ground_truth_annotations.filter(from_propagation=False)
            all_time.append(
                (node.url_id, node.name,
                 _mean_agreement_votes(gt_annotations), node.omitted))
            last_month.append(
                (node.url_id, node.name,
                 _mean_agreement_votes(gt_annotations.filter(created_at__gt=month_ago)),
                 node.omitted))
        # Keep only categories averaging more than 2 votes, highest first.
        all_time = sorted((row for row in all_time if row[2] > 2),
                          key=lambda row: row[2], reverse=True)
        last_month = sorted((row for row in last_month if row[2] > 2),
                            key=lambda row: row[2], reverse=True)
        store.set(store_key, {
            'difficult_agreement_categories': all_time,
            'difficult_agreement_categories_last_month': last_month})
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass
def compute_annotators_ranking(store_key, dataset_id, N=10):
    """Compute per-user contribution rankings (all-time, last week, today) plus
    an agreement ranking over today's votes, and cache them under `store_key`.

    Only the all-time and last-week rankings are truncated to the top `N`.
    Silently does nothing if the dataset (or a user) does not exist.
    """
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        week_ago = timezone.now() - datetime.timedelta(days=7)
        today_start = timezone.now().replace(hour=0, minute=0, second=0, microsecond=0)
        ranking = list()
        ranking_last_week = list()
        ranking_today = list()
        ranking_agreement_today = list()
        for user in User.objects.all():
            # All-time: manually-added ('MA') candidate annotations plus votes.
            n_annotations = CandidateAnnotation.objects.filter(
                created_by=user, sound_dataset__dataset=dataset, type='MA').count()
            n_votes = Vote.objects.filter(
                created_by=user,
                candidate_annotation__sound_dataset__dataset=dataset).count()
            ranking.append((user.username, n_annotations + n_votes))

            # Last week.
            n_annotations_last_week = CandidateAnnotation.objects.filter(
                created_at__gt=week_ago, created_by=user,
                sound_dataset__dataset=dataset, type='MA').count()
            n_votes_last_week = Vote.objects.filter(
                created_at__gt=week_ago, created_by=user,
                candidate_annotation__sound_dataset__dataset=dataset).count()
            ranking_last_week.append(
                (user.username, n_annotations_last_week + n_votes_last_week))

            # Today. The queryset is reused for the agreement loop below — the
            # original issued the same filtered query a second time.
            votes_today = Vote.objects.filter(
                created_at__gt=today_start, created_by=user,
                candidate_annotation__sound_dataset__dataset=dataset)
            n_annotations_today = CandidateAnnotation.objects.filter(
                created_at__gt=today_start, created_by=user,
                sound_dataset__dataset=dataset, type='MA').count()
            n_votes_today = votes_today.count()
            ranking_today.append(
                (user.username, n_annotations_today + n_votes_today))

            # Agreement score over today's votes: +1 when another vote on the
            # same candidate annotation has the same value, +0.5 when the user's
            # vote is the only one so far, +0 when others voted but none agree.
            agreement_score = 0
            for vote in votes_today:
                all_vote_values = [
                    v.vote for v in vote.candidate_annotation.votes.all()
                ]
                if all_vote_values.count(vote.vote) > 1:
                    agreement_score += 1
                elif len(all_vote_values) > 1:
                    pass  # other votes exist but none agrees
                else:
                    agreement_score += 0.5
            try:
                ranking_agreement_today.append(
                    (user.username, agreement_score / float(n_votes_today)))
            except ZeroDivisionError:
                ranking_agreement_today.append((user.username, 0))

        # Sort every ranking by score, highest first.
        ranking = sorted(ranking, key=lambda x: x[1], reverse=True)
        ranking_last_week = sorted(ranking_last_week, key=lambda x: x[1], reverse=True)
        ranking_today = sorted(ranking_today, key=lambda x: x[1], reverse=True)
        ranking_agreement_today = sorted(ranking_agreement_today, key=lambda x: x[1], reverse=True)
        store.set(
            store_key, {
                'ranking': ranking[:N],
                'ranking_last_week': ranking_last_week[:N],
                'ranking_today': ranking_today,
                'ranking_agreement_today': ranking_agreement_today
            })
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass
    except User.DoesNotExist:
        pass
def compute_dataset_taxonomy_stats(store_key, dataset_id):
    """Compute per-taxonomy-node statistics for a dataset and cache them.

    Stores under `store_key` a dict {'nodes_data': [...]}, one entry per
    taxonomy node: the output of calculate_taxonomy_node_stats() augmented
    with the node's id and name. Silently does nothing if the dataset does
    not exist.
    """
    logger.info('Start computing data for {0}'.format(store_key))
    try:
        dataset = Dataset.objects.get(id=dataset_id)
        node_ids = dataset.taxonomy.get_all_node_ids()
        from django.db import connection
        with connection.cursor() as cursor:
            # Single raw query: per taxonomy node, count candidate annotations
            # and the distinct sounds they belong to, limited to this dataset.
            cursor.execute(
                """
                SELECT taxonomynode.node_id
                       , COUNT(candidateannotation.id)
                       , COUNT(DISTINCT(sound.id))
                FROM datasets_candidateannotation candidateannotation
                INNER JOIN datasets_sounddataset sounddataset
                           ON candidateannotation.sound_dataset_id = sounddataset.id
                INNER JOIN datasets_sound sound
                           ON sound.id = sounddataset.sound_id
                INNER JOIN datasets_taxonomynode taxonomynode
                           ON taxonomynode.id = candidateannotation.taxonomy_node_id
                WHERE taxonomynode.node_id IN %s
                      AND sounddataset.dataset_id = %s
                GROUP BY taxonomynode.node_id
                """, (tuple(node_ids), dataset.id))
            node_n_annotations_n_sounds = cursor.fetchall()
        annotation_numbers = {}
        for node_id, num_ann, num_sounds in node_n_annotations_n_sounds:
            # In commit https://github.com/MTG/freesound-datasets/commit/0a748ec3e8481cc1ca4625bced24e0aee9d059d0 we
            # introduced a single SQL query that go num_ann, num_sounds and num_missing_votes in one go.
            # However when tested in production we saw the query took hours to complete with full a sized dataset.
            # To make it work in a reasonable amount of time we now do a query to get nun validated annotations
            # for each node in the taxonomy. This should be refactored and use a single query to get all non
            # validated annotation counts for all nodes.
            num_missing_votes = dataset.num_non_validated_annotations_per_taxonomy_node(
                node_id)
            # Vote values: 1.0 = present & predominant, 0.5 = present not
            # predominant, -1.0 = not present, 0.0 = unsure.
            votes_stats = {
                'num_present_and_predominant': dataset.num_votes_with_value(node_id, 1.0),
                'num_present_not_predominant': dataset.num_votes_with_value(node_id, 0.5),
                'num_not_present': dataset.num_votes_with_value(node_id, -1.0),
                'num_unsure': dataset.num_votes_with_value(node_id, 0.0)
            }
            annotation_numbers[node_id] = {
                'num_annotations': num_ann,
                'num_sounds': num_sounds,
                'num_missing_votes': num_missing_votes,
                'votes_stats': votes_stats
            }
        nodes_data = []
        for node in dataset.taxonomy.get_all_nodes():
            try:
                counts = annotation_numbers[node.node_id]
            except KeyError:
                # Can happen if there are no annotations/sounds per a category
                counts = {
                    'num_sounds': 0,
                    'num_annotations': 0,
                    'num_missing_votes': 0,
                    'votes_stats': None,
                }
            node_stats = calculate_taxonomy_node_stats(
                dataset, node.as_dict(),
                counts['num_sounds'], counts['num_annotations'],
                counts['num_missing_votes'], counts['votes_stats'])
            node_stats.update({
                'id': node.node_id,
                'name': node.name,
            })
            nodes_data.append(node_stats)
        store.set(store_key, {
            'nodes_data': nodes_data,
        })
        logger.info('Finished computing data for {0}'.format(store_key))
    except Dataset.DoesNotExist:
        pass