def stem_dataset_sound_tags():
    """Compute and persist stemmed tags for every sound in the FSD dataset.

    For each sound, reads ``extra_data['tags']``, stems each tag and stores
    the result under ``extra_data['stemmed_tags']``.
    """
    logger.info('Start computing stem tags for FSD sounds')
    fsd_dataset = Dataset.objects.get(short_name='fsd')
    # Run every save inside one transaction so a mid-run failure
    # leaves no partially-updated dataset behind.
    with transaction.atomic():
        for snd in fsd_dataset.sounds.all():
            snd.extra_data['stemmed_tags'] = [stem(t) for t in snd.extra_data['tags']]
            snd.save()
    logger.info('Finished computing stem tags for FSD sounds')
def handle(self, *args, **options):
    """Management-command entry point: bulk-load sounds from a JSON dump.

    Reads a JSON file mapping freesound_id -> sound metadata, creates a
    ``Sound`` (with stemmed tags in ``extra_data``), links it to the target
    dataset via ``SoundDataset``, and creates one automatic ('AU')
    ``CandidateAnnotation`` per ASO node id, updating its priority score.

    Options:
        filepath: path to the JSON dump.
        dataset_short_name: short name of the target ``Dataset``.
        algorithm_name: value stored in ``CandidateAnnotation.algorithm``.
    """
    file_location = options['filepath']
    dataset_short_name = options['dataset_short_name']
    algorithm_name = options['algorithm_name']
    dataset = Dataset.objects.get(short_name=dataset_short_name)
    print('Loading data...')
    # Fix: close the file deterministically (original `json.load(open(...))`
    # leaked the handle until garbage collection).
    with open(file_location) as json_file:
        data = json.load(json_file)
    count = 0
    total = len(data)  # hoisted: invariant across the loop
    # Iterate all the sounds in chunks so we can do all transactions of a chunk atomically
    for chunk in chunks(list(data.keys()), 5000):
        with transaction.atomic():
            for sound_id in chunk:
                sound_data = data[sound_id]
                count += 1
                # Fix: print `count`, not `count + 1` — the counter is already
                # incremented above, so the old code started at "2 of N" and
                # ended past 100%.
                sys.stdout.write('\rCreating sound %i of %i (%.2f%%)'
                                 % (count, total, 100.0 * count / total))
                sys.stdout.flush()
                sound = Sound.objects.create(
                    name=sound_data['name'][:200],  # truncate to the model's max length
                    freesound_id=sound_id,
                    extra_data={
                        'tags': sound_data['tags'],
                        'stemmed_tags': [stem(tag) for tag in sound_data['tags']],
                        'duration': sound_data['duration'],
                        'username': sound_data['username'],
                        'license': sound_data['license'],
                        'description': sound_data['description'],
                        'previews': sound_data['previews'],
                        # 'analysis' is optional in the dump; default to {}
                        'analysis': sound_data.get('analysis', {}),
                    })
                sound_dataset = SoundDataset.objects.create(
                    dataset=dataset,
                    sound=sound)
                for node_id in sound_data['aso_ids']:
                    c = CandidateAnnotation.objects.create(
                        sound_dataset=sound_dataset,
                        type='AU',
                        algorithm=algorithm_name,
                        taxonomy_node=TaxonomyNode.objects.get(
                            node_id=node_id))
                    c.update_priority_score()
def mapping_category(request, short_name, node_id):
    """Monitor view for a taxonomy category's tag-to-sound mapping.

    GET renders the current mapping rules for the category; POST either
    previews ("run") a tag-based retrieval with quality statistics, or
    persists ("submit") the retrieved sounds as candidate annotations.
    Only dataset maintainers may access; others are redirected to the
    dataset page.
    """
    dataset = get_object_or_404(Dataset, short_name=short_name)
    if not dataset.user_is_maintainer(request.user):
        return HttpResponseRedirect(reverse('dataset', args=[dataset.short_name]))
    # node_id arrives URL-encoded
    node_id = unquote(node_id)
    node = dataset.taxonomy.get_element_at_id(node_id)
    if request.method == 'POST':
        # Each POST field is a list of values; take the first where scalar.
        run_or_submit = dict(request.POST).get('run-or-submit', ['run'])[0]
        positive_tags_raw = dict(request.POST).get('positive-tags', '')  # e.g. ['dog, cat', 'dog']
        negative_tags_raw = dict(request.POST).get('negative-tags', '')
        # NOTE(review): the '' defaults above are strings, so the
        # comprehensions below would iterate characters if the field is
        # absent — presumably the form always sends these; verify.
        preproc_positive = True if dict(request.POST).get('preproc-positive', ['true']) == ['true'] else False
        preproc_negative = True if dict(request.POST).get('preproc-negative', ['false']) == ['true'] else False
        # Normalize each comma-separated entry: strip spaces, lowercase,
        # optionally stem. Positive tags keep their grouping (list of lists,
        # one list per form entry); negative tags are flattened.
        positive_tags = [[stem(tag.replace(' ', '').lower()) if preproc_positive
                          else tag.replace(' ', '').lower()
                          for tag in tags.split(',')]
                         for tags in positive_tags_raw if tags != '']  # e.g. [['dog', 'cat'], ['dog']]
        # NOTE(review): here lower() is applied AFTER stem(), unlike the
        # positive branch (stem of lowercased tag) — confirm this asymmetry
        # is intentional.
        negative_tags = [stem(tag.replace(' ', '')).lower() if preproc_negative
                         else tag.replace(' ', '').lower()
                         for tags in negative_tags_raw
                         for tag in tags.split(',') if tags != '']
        results = dataset.retrieve_sound_by_tags(positive_tags, negative_tags, preproc_positive, preproc_negative)
        # Freesound ids of sounds already proposed as candidates for this node.
        candidates = list(node.candidate_annotations.values_list('sound_dataset__sound__freesound_id', flat=True))
        # Run the mapping strategy and return the retrieved sounds and some statistics
        if run_or_submit == 'run':
            quality_estimate = dataset.quality_estimate_mapping(results, node_id)
            freesound_ids = list(results.values_list('freesound_id', flat=True))
            # Shuffle so the preview shows an unbiased sample order.
            shuffle(freesound_ids)
            quality_estimate['freesound_ids'] = freesound_ids
            quality_estimate['num_sounds'] = len(freesound_ids)
            # Overlap between the retrieval and the existing candidates.
            num_common_sounds = len(list(set(candidates).intersection(set(freesound_ids))))
            stats = {
                'retrieved': quality_estimate,
                'mapping': node.quality_estimate,
                'num_common_sounds': num_common_sounds
            }
            return JsonResponse(stats)
        # Submit the retrieved sounds
        elif run_or_submit == 'submit':
            freesound_ids_str = dict(request.POST).get('freesound-ids', [None])[0]
            # Retrieved by Freesound IDs
            if freesound_ids_str:
                freesound_ids = freesound_ids_str.split(',')
                results = dataset.sounds.filter(freesound_id__in=freesound_ids)
                # Only add sounds not already candidates for this node.
                new_sounds = results.exclude(freesound_id__in=candidates)
                num_new_sounds = new_sounds.count()
                try:
                    # All-or-nothing: roll back the whole batch on any failure.
                    with transaction.atomic():
                        for sound in new_sounds:
                            # Manually-added ('MA') candidate annotation.
                            CandidateAnnotation.objects.create(
                                sound_dataset=sound.sounddataset_set.filter(dataset=dataset).first(),
                                type='MA',
                                algorithm='platform_manual: By Freesound ID',
                                taxonomy_node=node,
                                created_by=request.user
                            )
                # NOTE(review): bare except hides the actual failure; consider
                # narrowing and logging.
                except:
                    return JsonResponse({'error': True})
                return JsonResponse({'error': False,
                                     'num_candidates_added': num_new_sounds,
                                     'num_candidates_deleted': 0})
            # Retrieved by the tag based query
            else:
                add_or_replace = dict(request.POST).get('add-or-replace', ['add'])[0]
                voted_negative = dict(request.POST).get('voted-negative', [])
                # Drop sounds the maintainer explicitly voted out in the UI.
                results = results.exclude(freesound_id__in=voted_negative)
                # Human-readable record of the query, stored in `algorithm`.
                name_algorithm = str(positive_tags) + ' AND NOT ' + str(negative_tags)
                num_new_sounds = 0
                num_deleted = 0
                # Add the new candidates to the existing ones
                if add_or_replace == 'add':
                    new_sounds = results.exclude(freesound_id__in=candidates)
                    num_new_sounds = new_sounds.count()
                    try:
                        with transaction.atomic():
                            for sound in new_sounds:
                                # Automatic ('AU') candidate from the mapping query.
                                CandidateAnnotation.objects.create(
                                    sound_dataset=sound.sounddataset_set.filter(dataset=dataset).first(),
                                    type='AU',
                                    algorithm='platform_mapping: {}'.format(name_algorithm),
                                    taxonomy_node=node,
                                    created_by=request.user
                                )
                    except:
                        return JsonResponse({'error': True})
                # Replace the actual candidates with the retrieved ones (deletes only candidates never voted)
                elif add_or_replace == 'replace':
                    try:
                        with transaction.atomic():
                            new_sounds = results.exclude(freesound_id__in=candidates)
                            # Delete candidates outside the new retrieval that
                            # have never received a vote; keep voted ones.
                            num_deleted = node.candidate_annotations.exclude(sound_dataset__sound__in=results)\
                                .annotate(num_votes=Count('votes'))\
                                .filter(num_votes=0)\
                                .delete()[0]
                            num_new_sounds = new_sounds.count()
                            for sound in new_sounds:
                                CandidateAnnotation.objects.create(
                                    sound_dataset=sound.sounddataset_set.filter(dataset=dataset).first(),
                                    type='AU',
                                    algorithm='platform_mapping: {}'.format(name_algorithm),
                                    taxonomy_node=node,
                                    created_by=request.user
                                )
                    except:
                        return JsonResponse({'error': True})
                return JsonResponse({'error': False,
                                     'num_candidates_added': num_new_sounds,
                                     'num_candidates_deleted': num_deleted})
    elif request.method == 'GET':
        # Current rule from the taxonomy definition: [include tags, omit tags].
        mapping_rule = [dataset.taxonomy.data[node_id].get('fs_tags', ''),
                        dataset.taxonomy.data[node_id].get('omit_fs_tags', '')]
        # Distinct algorithms used by non-manual candidates for this node.
        platform_mapping_rules = list(set(node.candidate_annotations.exclude(type='MA')
                                          .values_list('algorithm', flat=True)))
        # NOTE(review): raises ValueError if 'tag_matching_mtg_1' is absent
        # from the list — confirm it is always present for this node.
        platform_mapping_rules.remove('tag_matching_mtg_1')
        # Split 'platform_mapping: <positive> AND NOT <negative>' into a
        # (positive, negative) pair for display.
        platform_mapping_rules_formated = [(m.split(' AND NOT ')[0].split('platform_mapping: ')[1],
                                            m.split(' AND NOT ')[1])
                                           for m in platform_mapping_rules]
        return render(request, 'monitor/mapping_category.html', {
            'dataset': dataset,
            'node': node,
            'mapping_rule': mapping_rule,
            'platform_mapping_rules': platform_mapping_rules_formated
        })