Example #1
def pipeline_restore_version(slug, file_id):
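    """Restore a previously uploaded version of a pipeline's data file.

    Shows a sample of the latest version alongside the version to be
    restored; on confirmation, the restored file is marked VERIFIED and
    re-processed in a background thread.
    """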
    pipeline = get_object_or_404(Pipeline, slug=slug)
    form = forms.RestoreVersionForm()
    is_form_valid = form.validate_on_submit()
    if is_form_valid and form.proceed.data != YES:
        return redirect(
            url_for(
                'uploader_views.pipeline_data_upload',
                slug=pipeline.slug,
            )
        )

    data_file_latest = pipeline.latest_version
    file_contents_latest, _ = CSVParser.get_csv_sample(
        data_file_latest.data_file_url, data_file_latest.delimiter, data_file_latest.quote
    )

    data_file_to_restore = get_object_or_404(PipelineDataFile, pipeline=pipeline, id=file_id)
    file_contents_to_restore, _ = CSVParser.get_csv_sample(
        data_file_to_restore.data_file_url,
        data_file_to_restore.delimiter,
        data_file_to_restore.quote,
    )

    if is_form_valid:
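        # User confirmed the restore: mark the file verified and re-process it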
        data_file_to_restore.state = DataUploaderFileState.VERIFIED.value
        data_file_to_restore.save()

        thread = process_pipeline_data_file(data_file_to_restore)
        thread.start()

        return redirect(
            url_for(
                'uploader_views.pipeline_data_uploaded',
                slug=pipeline.slug,
                file_id=data_file_to_restore.id,
            )
        )

    return render_uploader_template(
        'pipeline_restore_version.html',
        form=form,
        file_contents_latest=file_contents_latest,
        file_contents_to_restore=file_contents_to_restore,
        column_types_latest=dict(data_file_latest.column_types),
        column_types_to_restore=dict(data_file_to_restore.column_types),
        format_row_data=format_row_data,
    )
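
# The tests below exercise CSVParser.get_csv_sample: error reporting for
# malformed rows and headers, header deduplication, and data-type inference.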
def test_get_s3_file_sample_invalid_lines(mock_tabulator_stream, app):
    csv_string = 'hello,goodbye\nbad\n1,2'
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert not result
    assert err == (
        'Unable to process CSV file: row 2 has a different number of '
        'data points (1) than there are column headers (2)')

def test_get_s3_file_sample_empty_file(mock_tabulator_stream, app):
    csv_string = ' '
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert not result
    assert err == (
        'Unable to process CSV file: no headers found. The first line of the csv should '
        'contain the column headers.')

def test_get_s3_file_sample_when_empty_headers(mock_tabulator_stream, app):
    csv_string = 'hello,,goodbye,\n1,2,3,4\n5,6,7,8'
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert not err

    # column with empty header is ignored
    assert result == [('hello', 'integer', ['1', '5']),
                      ('goodbye', 'integer', ['3', '7'])]

def test_get_s3_file_sample_when_extra_data_column(mock_tabulator_stream, app):
    csv_string = 'hello,goodbye\n1,2,3\n4,5,6\n7,8,9'
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')

    assert err == (
        'Unable to process CSV file: row 2 has a different number of '
        'data points (3) than there are column headers (2)')
    assert not result

def test_get_s3_file_sample(mock_tabulator_stream, csv_string, delimiter,
                            quotechar, app):
    _mock_stream_return_values(mock_tabulator_stream, [csv_string], delimiter,
                               quotechar)
    result, err = CSVParser.get_csv_sample('',
                                           delimiter,
                                           number_of_lines_sample=2)
    assert not err
    assert len(result) == 2

def test_get_s3_file_sample_with_invalid_header_names(mock_tabulator_stream,
                                                      app):
    csv_string = 'spaces in header,weird :@£$% characters,Uppercase\n1,2,3\n4,5,6\n7,8,9'
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert err == (
        'Unable to process CSV file: column headers must start with a letter and may only '
        'contain lowercase letters, numbers, and underscores. Invalid headers: "spaces in '
        'header", "weird :@£$% characters", "Uppercase"')
    assert not result

def test_get_s3_file_sample_when_duplicate_header_names(
        mock_tabulator_stream, app):
    csv_string = 'hello,goodbye,goodbye\n1,2,3\n4,5,6\n7,8,9'
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert not err

    # duplicate headers are made unique
    assert result == [
        ('hello', 'integer', ['1', '4', '7']),
        ('goodbye_1', 'integer', ['2', '5', '8']),
        ('goodbye_2', 'integer', ['3', '6', '9']),
    ]

def test_get_s3_file_sample_infer_data_types_big_sample(
        mock_tabulator_stream, app):
    csv_string = 'int,bool,text,datetime,date,numeric,mix\n'
    for i in range(1000):
        if i == 900:
            csv_string += 'text,text,text,text,text,text,text\n'
            continue
        csv_string += '2000,true,test,2006-11-26T16:30:00Z,2004-01-01,3.1,test\n'

    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert not err
    # a single non-conforming row in the sample forces every column to 'text'
    for _column, column_type, _ in result:
        assert column_type == 'text'

Example #10
def _move_file_to_s3(file_url, organisation, dataset, delimiter, quote):
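    """Stream a CSV from S3 re-encoded as UTF-8 and write it into the datasets folder."""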
    bucket = app.config['s3']['bucket_url']
    file_name = file_url.split('/')[-1]
    full_url = os.path.join(bucket, file_url)
    utf_8_byte_stream = CSVParser.get_csv_as_utf_8_byte_stream(
        full_url=full_url,
        delimiter=delimiter,
        quotechar=quote,
    )
    file_info = FileInfo(file_url, utf_8_byte_stream)
    storage = StorageFactory.create(bucket)
    datasets_folder = app.config['s3']['datasets_folder']
    target_file_url = f'{datasets_folder}/{organisation}/{dataset}/{file_name}'
    storage.write_file(target_file_url, file_info.data)
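    # Rewind so callers can re-read the stream from the beginning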
    file_info.data.seek(0)
    return file_info

def test_get_s3_file_sample_infer_data_types(mock_tabulator_stream, app):
    csv_string = ('int,bool,text,datetime,date,numeric,mix\n'
                  '2000,true,test,2006-11-26T16:30:00Z,2004-01-01,3.1,test\n'
                  '13,false,test,2018-12-18T12:10:00Z,1998-12-26,-1,-2')
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert not err
    assert result == [
        ('int', 'integer', ['2000', '13']),
        ('bool', 'boolean', ['true', 'false']),
        ('text', 'text', ['test', 'test']),
        ('datetime', 'timestamp',
         ['2006-11-26T16:30:00Z', '2018-12-18T12:10:00Z']),
        ('date', 'date', ['2004-01-01', '1998-12-26']),
        ('numeric', 'numeric', ['3.1', '-1']),
        ('mix', 'text', ['test', '-2']),
    ]

def test_get_s3_file_sample_with_no_data(mock_tabulator_stream, app):
    csv_string = 'hello,goodbye'
    _mock_stream_return_values(mock_tabulator_stream, [csv_string])
    result, err = CSVParser.get_csv_sample('', ',')
    assert err == 'Unable to process CSV file: no data found'
    assert not result

def test_make_unique_headers(headers, unique_headers):
    assert CSVParser.make_unique_headers(headers) == unique_headers
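
test_make_unique_headers receives headers and unique_headers as parameters, but the parametrize decorator is not part of this listing. A minimal sketch of how it could be wired up, assuming pytest and expected values modeled on the duplicate-header test above (the cases are illustrative, not from the source):

import pytest

# Hypothetical parametrization; the real fixture values are not shown in the listing.
@pytest.mark.parametrize('headers,unique_headers', [
    (['hello', 'goodbye', 'goodbye'], ['hello', 'goodbye_1', 'goodbye_2']),
    (['hello', 'goodbye'], ['hello', 'goodbye']),
])
def test_make_unique_headers_sketch(headers, unique_headers):
    assert CSVParser.make_unique_headers(headers) == unique_headers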

Example #14
def pipeline_data_verify(slug, file_id):
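    """Ask the user to verify an uploaded data file before processing.

    Samples the new file (and the current latest version, if one exists),
    flags headers missing from the new file, and on confirmation stores
    the selected column types, marks the file VERIFIED and re-processes it.
    """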
    pipeline = get_object_or_404(Pipeline, slug=slug)
    pipeline_data_file = get_object_or_404(PipelineDataFile, pipeline=pipeline, id=file_id)

    form = forms.VerifyDataFileForm()
    is_form_valid = form.validate_on_submit()
    if is_form_valid and form.proceed.data != YES:
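        # User declined: delete the database record and the uploaded file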
        pipeline_data_file.delete()
        delete_file(pipeline_data_file)
        return redirect(url_for('uploader_views.pipeline_select'))

    new_file_contents, new_file_err = CSVParser.get_csv_sample(
        pipeline_data_file.data_file_url, pipeline_data_file.delimiter, pipeline_data_file.quote
    )

    current_file_contents, current_column_types, missing_headers = None, None, set()
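    # If a previous version exists, sample it for comparison and flag
    # headers missing from the new file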
    if pipeline.latest_version:
        data_file_latest = pipeline.latest_version
        current_file_contents, current_file_err = CSVParser.get_csv_sample(
            data_file_latest.data_file_url, data_file_latest.delimiter, data_file_latest.quote
        )
        missing_headers = get_missing_headers(
            current_version=current_file_contents, new_version=new_file_contents
        )
        current_column_types = dict(data_file_latest.column_types)

    if is_form_valid:
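        # Persist the column types chosen on the verify form, then process the file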
        selected_column_types = [
            (column, request.form[column]) for column, _, _ in new_file_contents
        ]
        pipeline_data_file.column_types = selected_column_types
        pipeline_data_file.state = DataUploaderFileState.VERIFIED.value
        pipeline_data_file.save()

        thread = process_pipeline_data_file(pipeline_data_file)
        thread.start()

        return redirect(
            url_for(
                'uploader_views.pipeline_data_uploaded',
                slug=pipeline.slug,
                file_id=pipeline_data_file.id,
            )
        )

    if new_file_err is None:
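        # The file parsed cleanly; additionally reject reserved column names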
        uploaded_columns = set(x[0] for x in new_file_contents)
        error_message = check_for_reserved_column_names(pipeline_data_file, uploaded_columns)
        if error_message:
            new_file_contents = None
            new_file_err = error_message

    if new_file_err is not None:
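        # Record the failure on the file and surface it as a form-level error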
        pipeline_data_file.state = DataUploaderFileState.FAILED.value
        pipeline_data_file.error_message = new_file_err
        pipeline_data_file.save()
        form.errors['non_field_errors'] = [new_file_err]

    return render_uploader_template(
        'pipeline_data_verify.html',
        pipeline=pipeline,
        new_file_contents=new_file_contents,
        current_file_contents=current_file_contents,
        current_column_types=current_column_types,
        data_types=DataUploaderDataTypes.values(),
        format_row_data=format_row_data,
        form=form,
        missing_headers=missing_headers,
    )