Python sniff_file_format Examples, ckanext.qa.sniff_format.sniff_file_format Python Examples

Example #1

0

Show file

File: tasks.py Project: thenets/ckanext-qa

def score_by_sniffing_data(archival, resource, score_reasons, log):
    '''
    Score a resource by sniffing the format of its cached file.

    The file at archival.cache_filepath is passed to sniff_file_format and
    the detected format is looked up in lib.resource_format_scores(). One
    string explaining the outcome is appended to score_reasons.

    Parameters:
      * archival - object whose cache_filepath attribute is read; may be
        None
      * resource - not used by this function
      * score_reasons - list that receives the explanation string
      * log - logger handed on to sniff_file_format

    Return values:
      * It returns a tuple: (score, format_string)
      * If the file is missing or its format is not recognized, both
        values are None
      * If the format is recognized but has no entry in the scores table,
        score is None
    '''
    if not archival or not archival.cache_filepath:
        score_reasons.append(
            _('This file had not been downloaded at the time of scoring it.'))
        return (None, None)

    # Analyse the cached file
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append(
            _('Cache filepath does not exist: "%s".') % filepath)
        return (None, None)

    # filepath is guaranteed to be non-empty here (checked by the first
    # guard), so no further "not downloaded" handling is needed.
    sniffed_format = sniff_file_format(filepath, log)
    if not sniffed_format:
        score_reasons.append(
            _('The format of the file was not recognized from its contents.'
              ))
        return (None, None)

    format_string = sniffed_format['format']
    score = lib.resource_format_scores().get(format_string)
    score_reasons.append(
        _('Content of file appeared to be format "%s" which receives openness score: %s.'
          ) % (format_string, score))
    return score, format_string

Example #2

0

Show file

def sniff(args):
    """Print the sniffed format of every file path in ``args``.

    When ``args`` is empty a message is printed and the process exits with
    status 1. For each path, either the detected format or an error line is
    printed.
    """
    from ckanext.qa.sniff_format import sniff_file_format

    if len(args) == 0:
        print('Not enough arguments', args)
        sys.exit(1)

    for path in args:
        detected = sniff_file_format(path)
        if not detected:
            print('ERROR: Could not recognise format of: %s' % path)
            continue
        print('Detected as: %s - %s' % (detected['format'], path))

Example #3

0

Show file

File: commands.py Project: tbalaz/test

    def sniff(self):
        """Print the detected format of each file path in ``self.args``.

        ``self.args[0]`` is skipped (presumably the sub-command name --
        confirm against the caller); every later entry is treated as a file
        path and passed to ``sniff_file_format``. If no file path was given,
        a message is printed and the process exits with status 1.
        """
        from ckanext.qa.sniff_format import sniff_file_format

        if len(self.args) < 2:
            # A single pre-formatted argument prints the same text under
            # both Python 2 and Python 3.
            print('Not enough arguments %s' % (self.args,))
            sys.exit(1)

        sniffer_log = logging.getLogger('ckanext.qa.sniffer')
        for filepath in self.args[1:]:
            format_ = sniff_file_format(filepath, sniffer_log)
            if format_:
                print('Detected as: %s - %s' % (format_['display_name'], filepath))
            else:
                print('ERROR: Could not recognise format of: %s' % filepath)

Example #4

0

Show file

File: test_sniff_format.py Project: CarlQLange/ckanext-qa

    def assert_file_has_format_sniffed_correctly(cls, format_extension, filepath):
        '''Assert that sniffing filepath yields the format named by format_extension.

        The expected format is format_extension with any '.zip' removed. The
        sniffed 'container' must be 'ZIP' when format_extension ends in
        '.zip' or '.gzip', and None otherwise.
        '''
        detected = sniff_file_format(filepath, log)
        assert detected, format_extension

        wanted_format = format_extension.replace('.zip', '')
        assert_equal(detected['format'].lower(), wanted_format)

        # gzip is lumped together with zip for simplicity now
        is_archive = format_extension.endswith(('.zip', '.gzip'))
        wanted_container = 'ZIP' if is_archive else None
        assert_equal(detected.get('container'), wanted_container)

Example #5

0

Show file

    def sniff(self):
        """Report the sniffed format of the files listed in ``self.args``.

        The first entry of ``self.args`` is ignored (presumably the
        sub-command name -- confirm against the caller). Each remaining
        entry is passed to ``sniff_file_format`` and one line is printed per
        file. With fewer than two entries, a message is printed and the
        process exits with status 1.
        """
        from ckanext.qa.sniff_format import sniff_file_format

        if len(self.args) < 2:
            # Formatting into one string keeps the output identical on
            # Python 2 and Python 3.
            print('Not enough arguments %s' % (self.args,))
            sys.exit(1)

        sniffer_log = logging.getLogger('ckanext.qa.sniffer')
        for filepath in self.args[1:]:
            format_ = sniff_file_format(filepath, sniffer_log)
            if not format_:
                print('ERROR: Could not recognise format of: %s' % filepath)
                continue
            print('Detected as: %s - %s' % (format_['display_name'],
                                            filepath))

Example #6

0

Show file

File: test_sniff_format.py Project: uk-gov-mirror/datagovuk.ckanext-qa

    def assert_file_has_format_sniffed_correctly(cls, format_extension, filepath):
        '''Check that the format sniffed from filepath matches format_extension.

        '.zip' is stripped from format_extension before the format names are
        compared (case-insensitively on the sniffed side). An extension
        ending in '.zip' or '.gzip' must also be reported with container
        'ZIP'; any other extension must have no container.
        '''
        result = sniff_file_format(filepath, log)
        assert result, format_extension

        bare_format = format_extension.replace('.zip', '')
        assert_equal(result['format'].lower(), bare_format)

        if format_extension.endswith('.zip') or format_extension.endswith('.gzip'):
            # gzip is lumped together with zip for simplicity now
            container = 'ZIP'
        else:
            container = None
        assert_equal(result.get('container'), container)

Example #7

0

Show file

File: test_sniff_format.py Project: derilinx/ckanext-qa

    def assert_file_has_format_sniffed_correctly(cls, format_extension, filepath):
        """Verify the sniffer's verdict for filepath against format_extension.

        Two things are asserted: the lower-cased sniffed format equals
        format_extension with ".zip" removed, and the sniffed container is
        "ZIP" for extensions ending in ".zip" or ".gzip" and None for all
        others.
        """
        sniffed = sniff_file_format(filepath, log)
        assert sniffed, format_extension

        format_without_zip = format_extension.replace(".zip", "")
        assert_equal(sniffed["format"].lower(), format_without_zip)

        # gzip is lumped together with zip for simplicity now
        zipped_suffixes = (".zip", ".gzip")
        container = None
        if any(format_extension.endswith(suffix) for suffix in zipped_suffixes):
            container = "ZIP"
        assert_equal(sniffed.get("container"), container)

Example #8

0

Show file

File: tasks.py Project: CarlQLange/ckanext-qa

def score_by_sniffing_data(archival, resource, score_reasons, log):
    """
    Score a resource by sniffing the format of its cached file.

    The file at archival.cache_filepath is passed to sniff_file_format and
    the detected format is looked up in lib.resource_format_scores(). One
    string explaining the outcome is appended to score_reasons.

    Parameters:
      * archival - object whose cache_filepath attribute is read; may be
        None
      * resource - not used by this function
      * score_reasons - list that receives the explanation string
      * log - logger handed on to sniff_file_format

    Return values:
      * It returns a tuple: (score, format_string)
      * If the file is missing or its format is not recognized, both
        values are None
      * If the format is recognized but has no entry in the scores table,
        score is None
    """
    if not archival or not archival.cache_filepath:
        score_reasons.append("This file had not been downloaded at the time of scoring it.")
        return (None, None)

    # Analyse the cached file
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append('Cache filepath does not exist: "%s".' % filepath)
        return (None, None)

    # filepath is guaranteed to be non-empty here (checked by the first
    # guard), so no further "not downloaded" handling is needed.
    sniffed_format = sniff_file_format(filepath, log)
    if not sniffed_format:
        score_reasons.append("The format of the file was not recognized from its contents.")
        return (None, None)

    format_string = sniffed_format["format"]
    score = lib.resource_format_scores().get(format_string)
    score_reasons.append(
        'Content of file appeared to be format "%s" which receives openness score: %s.'
        % (format_string, score)
    )
    return score, format_string

Example #9

0

Show file

File: tasks.py Project: zfbpb/data.gov.hr

def score_by_sniffing_data(archival, resource, score_reasons, log):
    '''
    Score a resource by sniffing the format of its cached file.

    The file at archival.cache_filepath is passed to sniff_file_format; the
    score and format name are taken from the 'openness' and 'display_name'
    entries of its result. One string explaining the outcome is appended to
    score_reasons.

    Parameters:
      * archival - object whose cache_filepath attribute is read; may be
        None
      * resource - not used by this function
      * score_reasons - list that receives the explanation string
      * log - logger handed on to sniff_file_format

    Return values:
      * It returns a tuple: (score, format_display_name)
      * If the file is missing or its format is not recognized, both
        values are None
    '''
    if not archival or not archival.cache_filepath:
        score_reasons.append('Datoteka nije preuzeta u vrijeme ocijenjivanja.')
        return (None, None)

    # Analyse the cached file
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append('Putanja predmemorije ne postoji: "%s".' %
                             filepath)
        return (None, None)

    # filepath is guaranteed to be non-empty here (checked by the first
    # guard), so no further "not downloaded" handling is needed.
    sniffed_format = sniff_file_format(filepath, log)
    if not sniffed_format:
        score_reasons.append('Format je nepoznat.')
        return (None, None)

    display_name = sniffed_format['display_name']
    openness = sniffed_format['openness']
    score_reasons.append(
        'Podaci su u formatu "%s" s ocjenom otvorenosti: %s.' %
        (display_name, openness))
    return openness, display_name

Example #10

0

Show file

File: tasks.py Project: tbalaz/test

def score_by_sniffing_data(archival, resource, score_reasons, log):
    '''
    Work out a resource's openness score from the contents of its cached file.

    sniff_file_format is run on archival.cache_filepath; its 'openness'
    entry is returned as the score and its 'display_name' entry as the
    format name. A single explanatory string is appended to score_reasons.

    Parameters:
      * archival - object whose cache_filepath attribute is read; may be
        None
      * resource - not used by this function
      * score_reasons - list that receives the explanation string
      * log - logger handed on to sniff_file_format

    Return values:
      * It returns a tuple: (score, format_display_name)
      * If the file is missing or its format is not recognized, both
        values are None
    '''
    if not archival or not archival.cache_filepath:
        score_reasons.append('Datoteka nije preuzeta u vrijeme ocijenjivanja.')
        return (None, None)

    # Analyse the cached file
    filepath = archival.cache_filepath
    if not os.path.exists(filepath):
        score_reasons.append('Putanja predmemorije ne postoji: "%s".' % filepath)
        return (None, None)

    # The first guard already rejected an empty cache_filepath, so from
    # here on the only remaining failure is an unrecognized format.
    sniffed_format = sniff_file_format(filepath, log)
    if not sniffed_format:
        score_reasons.append('Format je nepoznat.')
        return (None, None)

    display_name = sniffed_format['display_name']
    openness = sniffed_format['openness']
    score_reasons.append('Podaci su u formatu "%s" s ocjenom otvorenosti: %s.' % (display_name, openness))
    return openness, display_name

Example #11

0

Show file

    def validate_resource(self, id):
        """Validate an uploaded resource file and, on request, publish it.

        What happens depends on the request:

        * Non-POST: the validation page is rendered for the dataset.
        * POST with ``check_schema``: a schema is built from the submitted
          ``field_name`` / ``field_type`` values, ``file_path`` is validated
          against it, and the report is rendered together with an openness
          score (``None`` when the file format is not recognised).
        * POST with ``upload_data``: the resource metadata stored in Redis
          under ``file_path`` is loaded, the dataset state is set to
          'active', the resource is created from the local file, the local
          file is removed and the user is redirected to the dataset page.
        * Any other POST: an uploaded file with a supported extension is
          written under ``ckan.storage_path``, its headers are read and
          rendered, and the resource metadata is kept in Redis for one day.

        :param id: dataset identifier; passed to the template as
            ``pkg_name`` and to ``package_patch`` / ``redirect_to`` as
            ``id``
        :returns: the rendered ``tayside/package/validate_resource.html``
            template, or the result of ``toolkit.redirect_to`` after a
            successful upload
        """
        if toolkit.request.method == 'POST':
            data = dict_fns.unflatten(
                tuplize_dict(parse_params(toolkit.request.POST)))
            check_schema = toolkit.request.params.get('check_schema')
            upload_data = toolkit.request.params.get('upload_data')
            # NOTE(review): file_path is taken from the request and is later
            # opened and removed -- confirm it is constrained to the storage
            # directory before this reaches production.
            file_path = data.get('file_path')

            # Logic for validating a resource against a specified schema
            if check_schema:
                schema = {'fields': []}

                fields = data.get('field_name')
                field_type = data.get('field_type')

                # Schema is populated from data entered by the user
                for i, field in enumerate(fields):
                    schema['fields'].append({
                        'name': field,
                        'type': field_type[i]
                    })

                # File is validated with Goodtables
                report = validate(file_path, schema=schema)

                log = logging.getLogger('ckanext.tayside')

                # Score is calculated based on Sir Tim Berners-Lee's five star
                # of openness
                sniffed_format = sniff_file_format(file_path, log)
                # The sniffer may return nothing for unrecognised content;
                # report "no score" instead of failing on the subscript.
                score = None
                if sniffed_format:
                    score = resource_format_scores().get(
                        sniffed_format['format'])

                extra_vars = {
                    'report': report,
                    'pkg_name': id,
                    'stars': score,
                    'file_path': file_path
                }

                return toolkit.render('tayside/package/validate_resource.html',
                                      extra_vars=extra_vars)
            elif upload_data:
                # Handles creating a resource in CKAN

                # Metadata for the resource is stored in Redis
                # NOTE(review): the key is stored with a one-day expiry, so
                # r.get() can presumably return None here -- confirm how an
                # expired entry should be reported to the user.
                r = redis.StrictRedis()
                data = json.loads(r.get(file_path))
                data['package_id'] = id

                # Dataset's state is changed from 'draft' to 'active'
                toolkit.get_action('package_patch')({}, {
                    'id': id,
                    'state': 'active'
                })

                # FieldStorage instance is created which is needed to upload
                # the file to Filestore and Datastore
                fs = cgi.FieldStorage()
                fs.file = fs.make_file()
                fs.filename = data.get('url')

                # The context manager closes the source file even if the
                # copy fails part-way through.
                with open(file_path, 'r') as source:
                    fs.file.write(source.read())
                fs.file.seek(0)

                data['upload'] = fs

                try:
                    toolkit.get_action('resource_create')({}, data)
                except Exception:
                    extra_vars = {
                        'upload_error': 'An error occured while creating the '
                        'resource.',
                        'pkg_name': id
                    }

                    return toolkit.render(
                        'tayside/package/validate_resource.html',
                        extra_vars=extra_vars)

                # File is uploaded on Filestore, and now it is safe to be
                # removed from the temporary location
                os.remove(file_path)

                # Returning the result is harmless when redirect_to raises
                # and required when it hands back a response object.
                return toolkit.redirect_to(
                    controller='package', action='read', id=id)
            else:
                is_upload = isinstance(data.get('upload'), cgi.FieldStorage)
                supported_formats = ['csv', 'tsv', 'xls', 'xlsx', 'ods']
                # A missing url is treated as "no extension" rather than
                # raising on None.split().
                url = data.get('url') or ''
                current_format = url.split('.')[-1]

                if is_upload:
                    if current_format in supported_formats:
                        # Logic for storing the file locally and extracting
                        # it's headers (fields)
                        storage_path = config.get('ckan.storage_path')
                        file_path = storage_path + '/' + url

                        # Read the file
                        upload_stream = data.get('upload').file
                        upload_stream.seek(0)

                        # Write the file locally
                        with open(file_path, 'w') as target:
                            target.write(upload_stream.read())

                        # Inspect the headers (fields) of the file
                        with Stream(file_path, headers=1) as stream:
                            fields = stream.headers

                        extra_vars = {
                            'fields': fields,
                            'pkg_name': id,
                            'file_path': file_path
                        }

                        if is_redis_available():
                            # Store the metadata of the resource in Redis for
                            # later usage
                            r = redis.StrictRedis()
                            resource_data = {
                                'name': data.get('name'),
                                'description': data.get('description'),
                                'format': data.get('format'),
                                'url': data.get('url'),
                            }

                            r.set(file_path, json.dumps(resource_data))

                            # Store it for 1 day
                            r.expire(file_path, 86400)
                        else:
                            return toolkit.render(
                                'tayside/package/validate_resource.html',
                                {'redis_error': 'Redis not available'})

                        return toolkit.render(
                            'tayside/package/validate_resource.html',
                            extra_vars=extra_vars)
                    else:
                        extra_vars = {
                            'format_error': 'Format not supported.',
                            'pkg_name': id
                        }

                        return toolkit.render(
                            'tayside/package/validate_resource.html',
                            extra_vars=extra_vars)

                extra_vars = {
                    'format_error': 'No file provided for validation.',
                    'pkg_name': id
                }

                return toolkit.render('tayside/package/validate_resource.html',
                                      extra_vars=extra_vars)
        else:
            return toolkit.render('tayside/package/validate_resource.html',
                                  {'pkg_name': id})