def validate_input_csv_file(csv_header, csv_lines, sounds_base_dir, username=None):
    """
    Validate the rows of a CSV file containing metadata to describe (and create) new Sound objects.

    Returns the list of lines after the validation process and a list of global errors (if any). Global errors
    (invalid header, no rows) stop the validation before any row is checked, so in that case the list of
    validated lines is empty.

    Each element in the returned list of lines is a dictionary with the keys 'line_no', 'line_original',
    'line_cleaned' and 'line_errors': the line number, the original line content, the cleaned line content
    (i.e. fields with cleaned data, or None if the row has a wrong number of columns) and a dictionary of errors
    for the line (if any). Lines that validated ok can be separated from lines that did not validate ok by
    checking whether there are any errors for them.

    Note that the 'is_explicit' value of each line is normalised to an integer in place (when it can be parsed),
    so 'line_original' reflects that normalisation.

    :param csv_header: header of the CSV (as returned by the 'get_csv_lines' function).
    :param csv_lines: lines of the CSV (as returned by the 'get_csv_lines' function); columns of each line are
        accessed by name.
    :param sounds_base_dir: directory where audio files referenced in CSV file lines should be found.
    :param username: username of the User to which sounds should be assigned to. If None, the username is read
        from the 'username' column of each line.
    :return: tuple - (lines_validated, global_errors)
    """
    lines_validated = []
    global_errors = []
    filenames_to_describe = set()  # Paths already described by a previous line (set gives O(1) lookups)

    # Import required sound models using apps.get_model (to avoid circular dependencies)
    License = apps.get_model('sounds', 'License')

    # Import sound form here to avoid circular dependency problems between sounds.models, sounds.forms and
    # utils.sound_upload.
    from sounds.forms import SoundCSVDescriptionForm

    # The expected header (and therefore the number of columns per row) depends on whether a username is given
    expected_header = EXPECTED_HEADER if username is None else EXPECTED_HEADER_NO_USERNAME

    # Check headers
    if csv_header != expected_header:
        global_errors.append('Invalid header. Header should be: <i>%s</i>' % ','.join(expected_header))

    # Check that there are lines for sounds
    if not csv_lines:
        global_errors.append('The file contains no lines with sound descriptions')

    # Individual rows are only checked when there are no global errors
    if global_errors:
        return lines_validated, global_errors

    from accounts.forms import filename_has_valid_extension

    geotag_format_error = "Invalid geotag format. Must be latitude, longitude and zoom " \
                          "separated by commas (e.g. 41.40348, 2.189420, 18)."
    is_explicit_error = 'Invalid value. Should be "1" if sound is explicit or "0" otherwise.'

    for n, line in enumerate(csv_lines):
        line_errors = defaultdict(str)
        line_cleaned = None

        if len(line) != len(expected_header):
            # Wrong number of columns: individual columns can't be validated, line_cleaned stays None
            line_errors['columns'] = 'Row should have %i columns but it has %i.' % (len(expected_header), len(line))
        else:
            # 1) Check that user exists
            # Use the same "is None" test as the header check so that an empty (but not None) username does not
            # trigger a lookup of a 'username' column that the line does not have.
            sound_username = username if username is not None else line['username']
            try:
                User.objects.get(username=sound_username)
            except User.DoesNotExist:
                line_errors['username'] = "User does not exist."

            # 2) Check that audio file is valid, that it exists on disk and that it has not been described yet
            # in another line of the CSV
            audio_filename = line['audio_filename']
            if not audio_filename.strip():
                line_errors['audio_filename'] = "Invalid audio filename."
            elif not filename_has_valid_extension(audio_filename):
                line_errors['audio_filename'] = "Invalid file extension."
            else:
                # NOTE(review): audio_filename comes from the CSV and is joined as-is, nothing here checks that
                # the resulting path stays inside sounds_base_dir (e.g. '../' or absolute paths) - confirm that
                # this is enforced elsewhere.
                src_path = os.path.join(sounds_base_dir, audio_filename)
                if not os.path.exists(src_path):
                    line_errors['audio_filename'] = "Audio file does not exist. This should be the name of " \
                                                    "one of the audio files you <a href='%s'>previously " \
                                                    "uploaded</a>." % reverse('accounts-describe')
                elif src_path in filenames_to_describe:
                    line_errors['audio_filename'] = "Audio file can only be described once."
                else:
                    filenames_to_describe.add(src_path)

            # 3) Check that all the other sound fields are ok
            try:
                license = License.objects.get(name=line['license'])
                license_id = license.id
                license_name = license.name
            except License.DoesNotExist:
                # Use values that will not validate so that the form reports the license error
                license_id = 0
                license_name = ''

            try:
                # Make sure is_explicit value is an integer (the library we use to parse xls/xlsx files treats
                # numbers as float and we need integer here). OverflowError covers infinite values and
                # TypeError covers values which are not a string or a number (e.g. None).
                line['is_explicit'] = int(float(line['is_explicit']))
            except (TypeError, ValueError, OverflowError):
                line_errors['is_explicit'] = is_explicit_error
            else:
                # Check that is_explicit is either 0 or 1
                if line['is_explicit'] not in (0, 1):
                    line_errors['is_explicit'] = is_explicit_error

            sound_fields = {
                'name': line['name'] or audio_filename,
                'description': line['description'],
                'license': license_id,
                'tags': line['tags'],
                'pack_name': line['pack_name'] or None,
                'is_explicit': str(line['is_explicit']) == '1'
            }

            # Convert geotag to text before inspecting it as parsed cells are not guaranteed to be strings
            geotag = line['geotag']
            geotag_text = '' if geotag is None else str(geotag)
            if geotag_text.strip():
                geoparts = geotag_text.split(',')
                if len(geoparts) == 3:
                    lat, lon, zoom = geoparts
                    sound_fields['lat'] = lat
                    sound_fields['lon'] = lon
                    sound_fields['zoom'] = zoom
                else:
                    line_errors['geotag'] = geotag_format_error

            form = SoundCSVDescriptionForm(sound_fields)
            if not form.is_valid():
                # If there are errors, add them to line_errors
                for field, errors in json.loads(form.errors.as_json()).items():
                    message = ' '.join(e['message'] for e in errors)
                    if field in ('lat', 'lon', 'zoom'):
                        # lat, lon and zoom errors are all reported under 'geotag'. Use .get() so that reading
                        # the current value does not insert an empty entry in the defaultdict.
                        previous = line_errors.get('geotag', '')
                        line_errors['geotag'] = '%s %s' % (previous, message) if previous else message
                    else:
                        line_errors[field] = message

                # Post-process some errors so they are more user-friendly
                geotag_error = line_errors.get('geotag', '')
                if 'Enter a whole number' in geotag_error or 'Enter a number' in geotag_error:
                    # Make geotag error messages more user-friendly when the problem is that at least one of the
                    # numbers is not formatted correctly
                    line_errors['geotag'] = geotag_format_error

            line_cleaned = form.cleaned_data
            line_cleaned.update({  # Update line_cleaned with the fields not returned by SoundCSVDescriptionForm
                'username': sound_username,
                'audio_filename': audio_filename,
                'license': license_name,  # Overwrite license with license name as License is not JSON serializable
                'tags': list(line_cleaned.get('tags', [])),  # Convert tags to List as Set is not JSON serializable
            })

        lines_validated.append({
            'line_no': n + 2,  # Show line number with l1 = header, l2 = first sound, and so on
            'line_original': line,
            'line_cleaned': line_cleaned,
            'line_errors': line_errors,
        })

    return lines_validated, global_errors
Beispiel #2
0
def validate_input_csv_file(csv_header, csv_lines, sounds_base_dir, username=None):
    """
    Validate the rows of a CSV file containing metadata to describe (and create) new Sound objects.

    Returns the list of lines after the validation process and a list of global errors (if any). Global errors
    (invalid header, no rows) stop the validation before any row is checked, so in that case the list of
    validated lines is empty.

    Each element in the returned list of lines is a dictionary with the keys 'line_no', 'line_original',
    'line_cleaned' and 'line_errors': the line number, the original line content, the cleaned line content
    (i.e. fields with cleaned data, or None if the row has a wrong number of columns) and a dictionary of errors
    for the line (if any). Lines that validated ok can be separated from lines that did not validate ok by
    checking whether there are any errors for them.

    Note that the 'is_explicit' value of each line is normalised to an integer in place (when it can be parsed),
    so 'line_original' reflects that normalisation.

    :param csv_header: header of the CSV (as returned by the 'get_csv_lines' function).
    :param csv_lines: lines of the CSV (as returned by the 'get_csv_lines' function); columns of each line are
        accessed by name.
    :param sounds_base_dir: directory where audio files referenced in CSV file lines should be found.
    :param username: username of the User to which sounds should be assigned to. If None, the username is read
        from the 'username' column of each line.
    :return: tuple - (lines_validated, global_errors)
    """
    lines_validated = []
    global_errors = []
    filenames_to_describe = set()  # Paths already described by a previous line (set gives O(1) lookups)

    # Import required sound models using apps.get_model (to avoid circular dependencies)
    License = apps.get_model('sounds', 'License')

    # Import sound form here to avoid circular dependency problems between sounds.models, sounds.forms and
    # utils.sound_upload.
    from sounds.forms import SoundCSVDescriptionForm

    # The expected header (and therefore the number of columns per row) depends on whether a username is given
    expected_header = EXPECTED_HEADER if username is None else EXPECTED_HEADER_NO_USERNAME

    # Check headers
    if csv_header != expected_header:
        global_errors.append('Invalid header. Header should be: <i>%s</i>' % ','.join(expected_header))

    # Check that there are lines for sounds
    if not csv_lines:
        global_errors.append('The file contains no lines with sound descriptions')

    # Individual rows are only checked when there are no global errors
    if global_errors:
        return lines_validated, global_errors

    from accounts.forms import filename_has_valid_extension

    geotag_format_error = "Invalid geotag format. Must be latitude, longitude and zoom " \
                          "separated by commas (e.g. 41.40348, 2.189420, 18)."
    is_explicit_error = 'Invalid value. Should be "1" if sound is explicit or "0" otherwise.'

    for n, line in enumerate(csv_lines):
        line_errors = defaultdict(str)
        line_cleaned = None

        if len(line) != len(expected_header):
            # Wrong number of columns: individual columns can't be validated, line_cleaned stays None
            line_errors['columns'] = 'Row should have %i columns but it has %i.' % (len(expected_header), len(line))
        else:
            # 1) Check that user exists
            # Use the same "is None" test as the header check so that an empty (but not None) username does not
            # trigger a lookup of a 'username' column that the line does not have.
            sound_username = username if username is not None else line['username']
            try:
                User.objects.get(username=sound_username)
            except User.DoesNotExist:
                line_errors['username'] = "User does not exist."

            # 2) Check that audio file is valid, that it exists on disk and that it has not been described yet
            # in another line of the CSV
            audio_filename = line['audio_filename']
            if not audio_filename.strip():
                line_errors['audio_filename'] = "Invalid audio filename."
            elif not filename_has_valid_extension(audio_filename):
                line_errors['audio_filename'] = "Invalid file extension."
            else:
                # NOTE(review): audio_filename comes from the CSV and is joined as-is, nothing here checks that
                # the resulting path stays inside sounds_base_dir (e.g. '../' or absolute paths) - confirm that
                # this is enforced elsewhere.
                src_path = os.path.join(sounds_base_dir, audio_filename)
                if not os.path.exists(src_path):
                    line_errors['audio_filename'] = "Audio file does not exist. This should be the name of " \
                                                    "one of the audio files you <a href='%s'>previously " \
                                                    "uploaded</a>." % reverse('accounts-describe')
                elif src_path in filenames_to_describe:
                    line_errors['audio_filename'] = "Audio file can only be described once."
                else:
                    filenames_to_describe.add(src_path)

            # 3) Check that all the other sound fields are ok
            try:
                license = License.objects.get(name=line['license'])
                license_id = license.id
                license_name = license.name
            except License.DoesNotExist:
                # Use values that will not validate so that the form reports the license error
                license_id = 0
                license_name = ''

            try:
                # Make sure is_explicit value is an integer (the library we use to parse xls/xlsx files treats
                # numbers as float and we need integer here). OverflowError covers infinite values and
                # TypeError covers values which are not a string or a number (e.g. None).
                line['is_explicit'] = int(float(line['is_explicit']))
            except (TypeError, ValueError, OverflowError):
                line_errors['is_explicit'] = is_explicit_error
            else:
                # Check that is_explicit is either 0 or 1
                if line['is_explicit'] not in (0, 1):
                    line_errors['is_explicit'] = is_explicit_error

            sound_fields = {
                'name': line['name'] or audio_filename,
                'description': line['description'],
                'license': license_id,
                'tags': line['tags'],
                'pack_name': line['pack_name'] or None,
                'is_explicit': str(line['is_explicit']) == '1'
            }

            # Convert geotag to text before inspecting it as parsed cells are not guaranteed to be strings
            geotag = line['geotag']
            geotag_text = '' if geotag is None else str(geotag)
            if geotag_text.strip():
                geoparts = geotag_text.split(',')
                if len(geoparts) == 3:
                    lat, lon, zoom = geoparts
                    sound_fields['lat'] = lat
                    sound_fields['lon'] = lon
                    sound_fields['zoom'] = zoom
                else:
                    line_errors['geotag'] = geotag_format_error

            form = SoundCSVDescriptionForm(sound_fields)
            if not form.is_valid():
                # If there are errors, add them to line_errors
                for field, errors in json.loads(form.errors.as_json()).items():
                    message = ' '.join(e['message'] for e in errors)
                    if field in ('lat', 'lon', 'zoom'):
                        # lat, lon and zoom errors are all reported under 'geotag'. Use .get() so that reading
                        # the current value does not insert an empty entry in the defaultdict.
                        previous = line_errors.get('geotag', '')
                        line_errors['geotag'] = '%s %s' % (previous, message) if previous else message
                    else:
                        line_errors[field] = message

                # Post-process some errors so they are more user-friendly
                geotag_error = line_errors.get('geotag', '')
                if 'Enter a whole number' in geotag_error or 'Enter a number' in geotag_error:
                    # Make geotag error messages more user-friendly when the problem is that at least one of the
                    # numbers is not formatted correctly
                    line_errors['geotag'] = geotag_format_error

            line_cleaned = form.cleaned_data
            line_cleaned.update({  # Update line_cleaned with the fields not returned by SoundCSVDescriptionForm
                'username': sound_username,
                'audio_filename': audio_filename,
                'license': license_name,  # Overwrite license with license name as License is not JSON serializable
                'tags': list(line_cleaned.get('tags', [])),  # Convert tags to List as Set is not JSON serializable
            })

        lines_validated.append({
            'line_no': n + 2,  # Show line number with l1 = header, l2 = first sound, and so on
            'line_original': line,
            'line_cleaned': line_cleaned,
            'line_errors': line_errors,
        })

    return lines_validated, global_errors