Example #1
    def handle(self, *args, **options):
        if not settings.NMTK_SERVER:
            raise CommandError('The NMTK Server is not currently enabled')

        for m in models.DataFile.objects.filter(srid__gte=0):
            data_qs = data_output.getQuerySet(m)
            extent = data_qs.extent()
            geometry = Polygon.from_bbox(extent)
            m.extent = str(geometry)
            m.save()
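The extent handling in Example 1 (and in the importDataFile examples below) follows one GeoDjango pattern: build a Polygon from a 4-tuple bounding box, tag it with the source SRID, and reproject it to WGS84 (EPSG:4326). A minimal, self-contained sketch of that pattern, assuming GeoDjango's GEOS/GDAL libraries are installed; the bounding box and SRID 26918 below are made-up illustrations, not values from the NMTK code:

from django.contrib.gis.geos import Polygon

# Hypothetical bounding box (xmin, ymin, xmax, ymax) in EPSG:26918 (UTM 18N).
bbox = (585000.0, 4510000.0, 595000.0, 4520000.0)

extent = Polygon.from_bbox(bbox)   # axis-aligned rectangle built from the bbox
extent.srid = 26918                # declare the source projection
extent.transform(4326)             # reproject in place to lon/lat (WGS84)

print(extent.extent)               # transformed (xmin, ymin, xmax, ymax)
print(str(extent))                 # WKT/EWKT string, as Example 1 stores on m.extent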
Example #2
def importDataFile(datafile, job_id=None):
    from NMTK_server import models
    datafile.status_message = None
    try:
        loader = NMTKDataLoader(datafile.file.path,
                                srid=datafile.srid)
        destination = None
        for import_file in loader.extract_files():
            # Figure out where these files need to go.
            if not destination:
                destination = os.path.dirname(datafile.file.path)
                # the first file we get (when destination is None, it's our first
                # loop) is the one that needs to be in the model, handle that
                # here...
                if datafile.file.path != import_file:
                    f = open(import_file)
                    datafile.file.save(os.path.basename(import_file), File(f))
            else:
                shutil.copyfile(import_file,
                                os.path.join(destination,
                                             os.path.basename(import_file)))
            logger.debug('Created a new file for %s', import_file)

        if loader.is_spatial:
            datafile.srid = loader.info.srid
            datafile.srs = loader.info.srs
            datafile.geom_type = loader.info.type
            logger.debug('Loader extent is %s', loader.info.extent)
            extent = geos.Polygon.from_bbox(loader.info.extent)
            logger.debug("Extent is 'srid=%s;%s'::geometry", loader.info.srid,
                         extent,)
            if datafile.srid:
                extent.srid = int(loader.info.srid)
                extent.transform(4326)
            logger.debug("Extent is 'srid=%s;%s'::geometry", 4326,
                         extent,)
            datafile.extent = extent
        datafile.feature_count = loader.info.feature_count
        if not datafile.description:
            datafile.description = loader.info.format
        if loader.is_spatial and not datafile.srid:
            datafile.status = datafile.IMPORT_FAILED
            datafile.status_message = 'Please specify SRID for this file (unable to auto-identify SRID)'
        elif not job_id:
            datafile.status = datafile.IMPORTED
        else:
            datafile.status = datafile.IMPORT_RESULTS_COMPLETE
        datafile.fields = loader.info.fields
        # Create an empty file using ContentFile, then we can overwrite it
        # with the desired GeoJSON data.
        if loader.is_spatial:
            suffix = 'geojson'
        else:
            suffix = 'json'
        if datafile.status in (
                datafile.IMPORTED,
                datafile.IMPORT_RESULTS_COMPLETE):
            if datafile.geom_type == 99:
                field_attributes = {}
                # This is a raster...
                for pos, band in enumerate(loader.dl_instance.bands()):
                    field_attributes[pos + 1] = {
                        'type': band.type,
                        'field_name': 'pixel',
                        'min': band.min,
                        'max': band.max}
                datafile.field_attributes = field_attributes
            elif datafile.feature_count:
                logger.error('Working on saving the model!')
                datafile.processed_file.save('{0}.{1}'.format(datafile.pk, suffix),
                                             ContentFile(''))
                loader.export_json(datafile.processed_file.path)
                generate_datamodel(datafile, loader)
                # Here we load the spatialite data using the model that was created
                # by generate_datamodel.  We need to use this to get the range
                # and type information for each field...
                try:
                    field_attributes = {}
                    qs = getQuerySet(datafile)
                    field_mappings = [(django_model_fields.IntegerField, 'integer', int),
                                      # Required because nmtk_id is an
                                      # autofield..
                                      (django_model_fields.AutoField,
                                       'integer', int,),
                                      (django_model_fields.BooleanField,
                                       'boolean', bool),
                                      # Special case holding FIPS
                                      (django_model_fields.DecimalField,
                                       'float', float),
                                      (django_model_fields.TextField,
                                       'text', None),
                                      (django_model_fields.FloatField,
                                       'float', float),
                                      (django_model_fields.DateField,
                                       'date', None,),
                                      (django_model_fields.TimeField,
                                       'time', None,),
                                      (django_model_fields.DateTimeField,
                                       'datetime', None)]
                    if qs.count() > 0:
                        # Get a single row so that we can try to work with the
                        # fields.
                        sample_row = qs[0]
                        for field in sample_row._meta.fields:
                            field_name = field.name
                            db_column = field.db_column or field.name
                            # convert the django field type to a text string.
                            for ftype, field_type, caster in field_mappings:
                                if isinstance(field, (ftype,)):
                                    break
                            else:
                                logger.info(
                                    'Unable to map field of type %s (this is expected for GIS fields)', type(
                                        field, ))
                                continue
                            values_aggregates = qs.aggregate(
                                Count(field_name, distinct=True))
                            field_attributes[db_column] = {
                                'type': field_type,
                                'field_name': field_name,
                                'distinct': values_aggregates[
                                    '{0}__count'.format(field_name)]}
                            if field_attributes[db_column]['distinct'] < 10:
                                distinct_values = list(
                                    qs.order_by().values_list(
                                        field_name, flat=True).distinct())
                                if not caster:
                                    field_attributes[db_column][
                                        'values'] = distinct_values
                                else:
                                    field_attributes[db_column][
                                        'values'] = map(caster, distinct_values)
                            else:
                                logger.debug(
                                    'There are more than 10 values for %s (%s), enumerating..',
                                    db_column,
                                    field_attributes[db_column]['distinct'])
                                # formerly the aggregates happened above - with the count. However, Django doesn't
                                # allow those aggregates with boolean fields - so here we split it up to only do the
                                # aggregates in the cases where we have to (i.e.,
                                # the distinct values is above the threshold.)
                                values_aggregates = qs.aggregate(
                                    Max(field_name), Min(field_name), )
                                field_attributes[db_column]['min'] = values_aggregates[
                                    '{0}__min'.format(field_name)]
                                field_attributes[db_column]['max'] = values_aggregates[
                                    '{0}__max'.format(field_name)]
                                if caster:
                                    field_attributes[db_column]['min'] = caster(
                                        field_attributes[db_column]['min'])
                                    field_attributes[db_column]['max'] = caster(
                                        field_attributes[db_column]['max'])
                        datafile.field_attributes = field_attributes
                except Exception as e:
                    logger.exception('Failed to get range for model %s',
                                     datafile.pk)
        if job_id:
            try:
                job = models.Job.objects.get(pk=job_id)
                # There might be multiple results files from this job, so we will only
                # mark the job as complete if all the results files are
                # processed.
                if job.status != job.COMPLETE:
                    results_left = job.job_files.filter(
                        status=models.DataFile.PROCESSING_RESULTS).count()
                    if results_left == 0:
                        job.status = job.COMPLETE
                        models.JobStatus(message='Job Completed',
                                         timestamp=timezone.now(),
                                         job=job,
                                         category=models.JobStatus.CATEGORY_SYSTEM).save()
                    elif results_left == 1:
                        # Handle the potential race condition here - do we really need this?
                        # sort of.  Since it's possible that two files finish post-processing
                        # at the same time.  In such cases, a second should be more than enough
                        # time to get both committed as complete.
                        time.sleep(1)
                        job = models.Job.objects.get(pk=job_id)
                        if job.status != job.COMPLETE:
                            results_left = job.job_files.filter(
                                status=models.DataFile.PROCESSING_RESULTS).count()
                            if results_left == 0:
                                job.status = job.COMPLETE
                                models.JobStatus(message='Job Completed',
                                                 timestamp=timezone.now(),
                                                 job=job,
                                                 category=models.JobStatus.CATEGORY_SYSTEM).save()

            except:
                logger.exception('Failed to update job status to complete?!!')
    except Exception as e:
        logger.exception('Failed import process!')
        datafile.processed_file = None
        if not job_id:
            datafile.status = datafile.IMPORT_FAILED
        else:
            datafile.status = datafile.IMPORT_RESULTS_FAILED
        datafile.status_message = "%s" % (e,)
        if job_id:
            try:
                job = models.Job.objects.get(pk=job_id)
                job.status = job.POST_PROCESSING_FAILED
            except:
                logger.exception('Failed to update job status to failed?!!')

    if job_id:
        job.save()
    # Now we need to create the spatialite version of this thing.
    datafile.save()
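The field_mappings loop in importDataFile relies on Python's for/else: the else branch runs only when the inner loop finishes without hitting break, which is how fields with no mapping (typically the GIS geometry field) get logged and skipped. A stripped-down sketch of the same dispatch idiom, using stand-in classes rather than real Django model fields:

class IntegerField(object): pass
class TextField(object): pass
class GeometryField(object): pass   # stand-in for a field type with no mapping

field_mappings = [(IntegerField, 'integer', int),
                  (TextField, 'text', None)]

for field in (IntegerField(), GeometryField(), TextField()):
    for ftype, field_type, caster in field_mappings:
        if isinstance(field, ftype):
            break                    # mapping found; ftype/field_type/caster stay bound
    else:
        # No mapping matched: log and move on, as importDataFile does for GIS fields.
        print('skipping unmapped field %s' % type(field).__name__)
        continue
    print('%s -> %s (caster=%r)' % (type(field).__name__, field_type, caster))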
Example #3
def importDataFile(datafile, job_id=None):
    from NMTK_server import models
    logger = importDataFile.get_logger()
    datafile.status_message = None
    job = None
    try:
        loader = NMTKDataLoader(datafile.file.path,
                                srid=datafile.srid,
                                logger=logger)
        destination = None
        for import_file in loader.extract_files():
            # Figure out where these files need to go.
            if not destination:
                destination = os.path.dirname(datafile.file.path)
                # the first file we get (when destination is None, it's our first
                # loop) is the one that needs to be in the model, handle that
                # here...
                if datafile.file.path != import_file:
                    f = open(import_file)
                    datafile.file.save(os.path.basename(import_file), File(f))
            else:
                shutil.copyfile(import_file,
                                os.path.join(destination,
                                             os.path.basename(import_file)))
            logger.debug('Created a new file for %s', import_file)
        logger.info('The file is spatial? %s', loader.is_spatial)
        if loader.is_spatial:
            datafile.srid = loader.info.srid
            datafile.srs = loader.info.srs
            datafile.geom_type = loader.info.type
            logger.debug('Loader extent is %s', loader.info.extent)
            extent = geos.Polygon.from_bbox(loader.info.extent)
            logger.debug("Extent is 'srid=%s;%s'::geometry", loader.info.srid,
                         extent,)
            if datafile.srid:
                extent.srid = int(loader.info.srid)
                extent.transform(4326)
            logger.debug("Extent is 'srid=%s;%s'::geometry", 4326,
                         extent,)
            datafile.extent = extent
        datafile.feature_count = loader.info.feature_count
        if not datafile.description:
            datafile.description = loader.info.format
        future_status = datafile.status
        if loader.is_spatial and not datafile.srid:
            future_status = datafile.IMPORT_FAILED
            datafile.status_message = 'Please specify SRID for this file (unable to auto-identify SRID)'
        elif not job_id:
            future_status = datafile.IMPORTED
        else:
            future_status = datafile.IMPORT_RESULTS_COMPLETE

        # We need to merge these things..
        desired_field_order = datafile.fields or []
        # Now that we have a desired field order from the model, we can
        # go the next step of getting job data.
        if job_id:
            try:
                job = models.Job.objects.select_related('tool').get(pk=job_id)
            except Exception as e:
                logger.error('Failed to get job with id of %s', job_id,
                             exc_info=True)

        # From the job data we can get the tool config:
        config_field_list = config_namespace = None
        # Get the list of field names, with the unique ones first...
        tool_config_field_units = {}
        job_config_field_units = datafile.units or {}
        if job:
            tool_config = job.tool.toolconfig.json_config
            # there might be multiple input files, but we'll use the first
            # one as the basis for format for the output, since we don't
            # really have a better option.  The tool developer ought to
            # specify a list of fields in the output if they don't like
            # this behaviour, since this is just a "default" for the order.
            for t in job.tool.toolconfig.json_config['input']:
                if t.get('type', '').lower() == 'file':
                    config_namespace = t.get('name', None)
                    if config_namespace:
                        config_field_list = [f['name']
                                             for f in t.get('elements', []) if
                                             isinstance(f.get('name', None),
                                                        (str, unicode))]
                        # If there are units, then we store the units
                        # here, so we can use that with the field data.
                        for f in t.get('elements', []):
                            if 'units' in f:
                                tool_config_field_units[
                                    f['name']] = f.get('units', None)
                            elif 'description' in f:
                                tool_config_field_units[f['name']] = f.get(
                                    'description', None)

                    break
            # Now that we have a list of fields from the tool configuration,
            # get the input fields from the file for each of the tool fields,
            # since we want that to be the default order of output.
            if config_field_list:
                job_config = job.config[config_namespace]
                for f in config_field_list:
                    if f in job_config:
                        if job_config[f].get('type', None) == 'property':
                            if isinstance(job_config[f].get('value', None),
                                          (str, unicode)):
                                desired_field_order.append(
                                    job_config[f]['value'])
                            # Map the tool config field (f) to the selected data file field
                            # (job_config[f]['value']) so we can grab the units from the
                            # tool config.
                            if (datafile.units and f in datafile.units and
                                    'value' in job_config[f]):
                                job_config_field_units[
                                    job_config[f]['value']] = datafile.units.get(f, '')
                            # If the tool didn't give us the units to use for fields
                            # we can fall back to the tool config to see what they
                            # ought to be.
                            elif (f in tool_config_field_units and
                                  'value' in job_config[f]):
                                job_config_field_units[
                                    job_config[f]['value']] = tool_config_field_units.get(f, '')

        # Get the list of actual fields in the input datafile...
        available_fields = loader.info.fields
        # eliminate fields that are not in the list of output fields.
        logger.debug('Desired field order is: %s', desired_field_order)
        logger.debug('Loader provided field order is: %s', available_fields)
        ordered_fields = [field for field in desired_field_order
                          if field in available_fields]
        # Add in any fields using the order first, then following with
        # any fields not in the ordered list, but in the output list
        # of fields.
        datafile.fields = list(unique_everseen(
            ordered_fields + available_fields))

        logger.debug('Final field order is %s', datafile.fields)
        # Create an empty file using ContentFile, then we can overwrite it
        # with the desired GeoJSON data.
        if loader.is_spatial:
            suffix = 'geojson'
        else:
            suffix = 'json'
        if future_status in (
                datafile.IMPORTED,
                datafile.IMPORT_RESULTS_COMPLETE):
            if datafile.geom_type == 99:
                field_attributes = {}
                # This is a raster...
                for pos, band in enumerate(loader.dl_instance.bands()):
                    field_attributes[pos + 1] = {
                        'type': band.type,
                        'field_name': 'pixel',
                        'min': band.min,
                        'max': band.max}
                datafile.field_attributes = field_attributes
            elif datafile.feature_count:
                logger.error('Working on saving the model!')
                datafile.processed_file.save('{0}.{1}'.format(datafile.pk, suffix),
                                             ContentFile(''))
                loader.export_json(datafile.processed_file.path)
                try:
                    generate_datamodel(datafile, loader, logger)
                except Exception as e:
                    logger.error('Error generating data model: %s', e,
                                 exc_info=logger.isEnabledFor(logging.DEBUG))
                    raise e
                # Here we load the spatialite data using the model that was created
                # by generate_datamodel.  We need to use this to get the range
                # and type information for each field...
                try:
                    field_attributes = {}
                    qs = getQuerySet(datafile)
                    field_mappings = [(django_model_fields.IntegerField, 'integer', int),
                                      # Required because nmtk_id is an
                                      # autofield..
                                      (django_model_fields.AutoField,
                                       'integer', int,),
                                      (django_model_fields.BooleanField,
                                       'boolean', bool),
                                      # Special case holding FIPS
                                      (django_model_fields.DecimalField,
                                       'float', float),
                                      (django_model_fields.TextField,
                                       'text', None),
                                      (django_model_fields.FloatField,
                                       'float', float),
                                      (django_model_fields.DateField,
                                       'date', datetime.date.isoformat,),
                                      (django_model_fields.TimeField,
                                       'time', datetime.time.isoformat,),
                                      (django_model_fields.DateTimeField,
                                       'datetime', datetime.datetime.isoformat)]
                    if qs.count() > 0:
                        # Get a single row so that we can try to work with the
                        # fields.
                        sample_row = qs[0]
                        for field in sample_row._meta.fields:
                            field_name = field.name
                            db_column = field.db_column or field.name
                            # convert the django field type to a text string.
                            for ftype, field_type, caster in field_mappings:
                                if isinstance(field, (ftype,)):
                                    break
                            else:
                                logger.info(
                                    'Unable to map field of type %s (this is expected for GIS fields)', type(
                                        field, ))
                                continue
                            values_aggregates = qs.aggregate(
                                Count(field_name, distinct=True))
                            field_attributes[db_column] = {
                                'type': field_type,
                                'field_name': field_name,
                                'distinct': values_aggregates[
                                    '{0}__count'.format(field_name)]}
                            # Add the units from the config to the data.
                            if db_column in job_config_field_units:
                                field_attributes[db_column][
                                    'units'] = job_config_field_units[db_column]
                            if field_attributes[db_column]['distinct'] < 10:
                                distinct_values = [v for v in  
                                    qs.order_by().values_list(
                                        field_name, flat=True).distinct() if v is not None]
                                if not caster:
                                    field_attributes[db_column][
                                        'values'] = distinct_values
                                else:
                                    logger.info('Attempting to cast values: %s', distinct_values)
                                    field_attributes[db_column][
                                        'values'] = map(caster, distinct_values)
                            else:
                                logger.debug(
                                    'There are more than 10 values for %s (%s), enumerating..',
                                    db_column,
                                    field_attributes[db_column]['distinct'])
                                # formerly the aggregates happened above - with the count. However, Django doesn't
                                # allow those aggregates with boolean fields - so here we split it up to only do the
                                # aggregates in the cases where we have to (i.e.,
                                # the distinct values is above the threshold.)
                                values_aggregates = qs.aggregate(
                                    Max(field_name), Min(field_name), )
                                field_attributes[db_column]['min'] = values_aggregates[
                                    '{0}__min'.format(field_name)]
                                field_attributes[db_column]['max'] = values_aggregates[
                                    '{0}__max'.format(field_name)]
                                if caster:
                                    field_attributes[db_column]['min'] = caster(
                                        field_attributes[db_column]['min'])
                                    field_attributes[db_column]['max'] = caster(
                                        field_attributes[db_column]['max'])
                        datafile.field_attributes = field_attributes
                        datafile.units = job_config_field_units
                except Exception as e:
                    logger.exception('Failed to get range for model %s',
                                     datafile.pk)
        if job:
            try:
                # There might be multiple results files from this job, so we will only
                # mark the job as complete if all the results files are
                # processed.
                if job.status != job.COMPLETE:
                    results_left = job.job_files.filter(
                        status=models.DataFile.PROCESSING_RESULTS).count()
                    if results_left == 0:
                        job.status = job.COMPLETE
                        models.JobStatus(message='Job Completed',
                                         timestamp=timezone.now(),
                                         job=job,
                                         category=models.JobStatus.CATEGORY_SYSTEM).save()
                    elif results_left == 1:
                        # Handle the potential race condition here - do we really need this?
                        # sort of.  Since it's possible that two files finish post-processing
                        # at the same time.  In such cases, a second should be more than enough
                        # time to get both committed as complete.
                        time.sleep(1)
                        job = models.Job.objects.get(pk=job_id)
                        if job.status != job.COMPLETE:
                            results_left = job.job_files.filter(
                                status=models.DataFile.PROCESSING_RESULTS).count()
                            if results_left == 0:
                                job.status = job.COMPLETE
                                models.JobStatus(message='Job Completed',
                                                 timestamp=timezone.now(),
                                                 job=job,
                                                 category=models.JobStatus.CATEGORY_SYSTEM).save()

            except:
                logger.exception('Failed to update job status to complete?!!')
        datafile.status = future_status
    except Exception as e:
        logger.error('Failed import process!', exc_info=True)
        datafile.processed_file = None
        if not job_id:
            datafile.status = datafile.IMPORT_FAILED
        else:
            datafile.status = datafile.IMPORT_RESULTS_FAILED
        datafile.status_message = "%s" % (e,)
        if job_id:
            try:
                if not job:
                    job = models.Job.objects.get(pk=job_id)
                job.status = job.POST_PROCESSING_FAILED
                logger.info('Set post processing to failed for job %s', job.pk)
            except:
                logger.error(
                    'Failed to update job status to failed?!!', exc_info=True)

    if job:
        job.save()
    datafile.save()
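Example 3 above (and Example 5 below) merge the tool-driven field order with the loader's field list via unique_everseen, which keeps the first occurrence of each name while preserving order. The helper itself is not shown on this page; it presumably comes from the itertools recipes (or more_itertools). A minimal equivalent:

def unique_everseen(iterable):
    """Yield unique elements, preserving the order of first appearance."""
    seen = set()
    for item in iterable:
        if item not in seen:
            seen.add(item)
            yield item

ordered_fields = ['id', 'name']
available_fields = ['name', 'id', 'area', 'name']
print(list(unique_everseen(ordered_fields + available_fields)))
# -> ['id', 'name', 'area']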
Example #4
def importDataFile(datafile, job_id=None):
    from NMTK_server import models
    datafile.status_message=None
    try:
        loader=NMTKDataLoader(datafile.file.path, 
                              srid=datafile.srid)
        if loader.is_spatial:
            datafile.srid=loader.info.srid
            datafile.srs=loader.info.srs
            datafile.geom_type=loader.info.type
            logger.debug('Loader extent is %s', loader.info.extent)
            extent=geos.Polygon.from_bbox(loader.info.extent)
            logger.debug("Extent is 'srid=%s;%s'::geometry", loader.info.srid, 
                         extent,)
            if datafile.srid:
                extent.srid=int(loader.info.srid)
                extent.transform(4326)
            logger.debug("Extent is 'srid=%s;%s'::geometry", 4326, 
                         extent,)
            datafile.extent=extent
        datafile.feature_count=loader.info.feature_count
        if loader.is_spatial and not datafile.srid:
            datafile.status=datafile.IMPORT_FAILED
            datafile.status_message='Please specify SRID for this file (unable to auto-identify SRID)'
        elif not job_id:
            datafile.status=datafile.IMPORTED
        else:
            datafile.status=datafile.IMPORT_RESULTS_COMPLETE
        datafile.fields=loader.info.fields
        # Create an empty file using ContentFile, then we can overwrite it 
        # with the desired GeoJSON data.
        if loader.is_spatial: 
            suffix='geojson'
        else: 
            suffix='json'
        if datafile.status in (datafile.IMPORTED, datafile.IMPORT_RESULTS_COMPLETE):
            datafile.processed_file.save('{0}.{1}'.format(datafile.pk, suffix), 
                                         ContentFile(''))
            loader.export_json(datafile.processed_file.path)
            generate_datamodel(datafile, loader)
            # Here we load the spatialite data using the model that was created
            # by generate_datamodel.  We need to use this to get the range
            # and type information for each field...
            try:
                field_attributes={}
                qs=getQuerySet(datafile)
                field_mappings=[(django_model_fields.IntegerField, 'integer',),
                                (django_model_fields.AutoField, 'integer',), # Required because nmtk_id is an autofield..
                                (django_model_fields.BooleanField, 'boolean',),
                                (django_model_fields.DecimalField, 'float',), # Special case holding FIPS
                                (django_model_fields.TextField, 'text',),
                                (django_model_fields.FloatField,'float'),
                                (django_model_fields.DateField, 'date',),
                                (django_model_fields.TimeField, 'time'),
                                (django_model_fields.DateTimeField, 'datetime')]
                if qs.count() > 0:
                    # Get a single row so that we can try to work with the fields.
                    sample_row=qs[0]
                    for field in sample_row._meta.fields:
                        field_name=field.name
                        db_column=field.db_column or field.name
                        # convert the django field type to a text string.
                        for ftype, field_type in field_mappings:
                            if isinstance(field, (ftype,)):
                                break
                        else:
                            logger.info('Unable to map field of type %s (this is expected for GIS fields)', type(field,))
                            continue
                        values_aggregates=qs.aggregate(Count(field_name, distinct=True))
                        field_attributes[db_column]={'type': field_type, 
                                                     'field_name': field_name,
                                                     'distinct': values_aggregates['{0}__count'.format(field_name)]}
                        if field_attributes[db_column]['distinct'] < 10:
                            distinct_values=list(qs.order_by().values_list(field_name, flat=True).distinct())
                            field_attributes[db_column]['values']=distinct_values
                        else:
                            logger.debug('There are more than 10 values for %s (%s), enumerating..', db_column, 
                                         field_attributes[db_column]['distinct'])
                            # formerly the aggregates happened above - with the count. However, Django doesn't
                            # allow those aggregates with boolean fields - so here we split it up to only do the
                            # aggregates in the cases where we have to (i.e., the distinct values is above the threshold.)
                            values_aggregates=qs.aggregate(Max(field_name), Min(field_name), )
                            field_attributes[db_column]['min']= values_aggregates['{0}__min'.format(field_name)]
                            field_attributes[db_column]['max']= values_aggregates['{0}__max'.format(field_name)]
                    datafile.field_attributes=field_attributes
            except Exception as e:
                logger.exception('Failed to get range for model %s',
                                 datafile.pk)
        if job_id:
            try:
                job=models.Job.objects.get(pk=job_id)
                # There might be multiple results files from this job, so we will only
                # mark the job as complete if all the results files are processed.
                if job.status != job.COMPLETE:
                    results_left=job.job_files.filter(status=models.DataFile.PROCESSING_RESULTS).count()
                    if results_left == 0:
                        job.status=job.COMPLETE
                        models.JobStatus(message='Job Completed',
                                         timestamp=timezone.now(),
                                         job=job).save()
                    elif results_left == 1:
                        # Handle the potential race condition here - do we really need this?
                        # sort of.  Since it's possible that two files finish post-processing
                        # at the same time.  In such cases, a second should be more than enough
                        # time to get both committed as complete.
                        time.sleep(1)
                        job=models.Job.objects.get(pk=job_id)
                        if job.status != job.COMPLETE:
                            results_left=job.job_files.filter(status=models.DataFile.PROCESSING_RESULTS).count()
                            if results_left == 0:
                                job.status=job.COMPLETE
                                models.JobStatus(message='Job Completed',
                                                 timestamp=timezone.now(),
                                                 job=job).save()
                    
                    
            except:
                logger.exception('Failed to update job status to complete?!!')
    except Exception as e:
        # Mark the import as failed and record why, then persist the status.
        logger.exception('Failed import process!')
        datafile.processed_file=None
        if not job_id:
            datafile.status=datafile.IMPORT_FAILED
        else:
            datafile.status=datafile.IMPORT_RESULTS_FAILED
        datafile.status_message='%s' % (e,)
    datafile.save()
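The job-completion block in these importDataFile examples guards against a race where two results files finish post-processing at almost the same time: if exactly one file is still marked PROCESSING_RESULTS, it sleeps for a second and re-reads the job before deciding whether to mark it complete. A stripped-down sketch of that check-sleep-recheck shape; reload_job and count_unfinished below are hypothetical stand-ins, not NMTK functions:

import time

def finish_if_done(reload_job, count_unfinished):
    # reload_job() returns a fresh job dict; count_unfinished(job) counts
    # results files still being post-processed (both are stand-ins).
    job = reload_job()
    if job['status'] == 'COMPLETE':
        return job
    left = count_unfinished(job)
    if left == 0:
        job['status'] = 'COMPLETE'
    elif left == 1:
        # Another results file may be finishing right now; wait briefly and re-check.
        time.sleep(1)
        job = reload_job()
        if job['status'] != 'COMPLETE' and count_unfinished(job) == 0:
            job['status'] = 'COMPLETE'
    return job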
Example #5
def importDataFile(datafile, job_id=None):
    from NMTK_server import models
    logger = importDataFile.get_logger()
    datafile.status_message = None
    job = None
    try:
        loader = NMTKDataLoader(datafile.file.path,
                                srid=datafile.srid,
                                logger=logger)
        destination = None
        for import_file in loader.extract_files():
            # Figure out where these files need to go.
            if not destination:
                destination = os.path.dirname(datafile.file.path)
                # the first file we get (when destination is None, it's our first
                # loop) is the one that needs to be in the model, handle that
                # here...
                if datafile.file.path != import_file:
                    f = open(import_file)
                    datafile.file.save(os.path.basename(import_file), File(f))
            else:
                shutil.copyfile(
                    import_file,
                    os.path.join(destination, os.path.basename(import_file)))
            logger.debug('Created a new file for %s', import_file)
        logger.info('The file is spatial? %s', loader.is_spatial)
        if loader.is_spatial:
            datafile.srid = loader.info.srid
            datafile.srs = loader.info.srs
            datafile.geom_type = loader.info.type
            logger.debug('Loader extent is %s', loader.info.extent)
            extent = geos.Polygon.from_bbox(loader.info.extent)
            logger.debug(
                "Extent is 'srid=%s;%s'::geometry",
                loader.info.srid,
                extent,
            )
            if datafile.srid:
                extent.srid = int(loader.info.srid)
                extent.transform(4326)
            logger.debug(
                "Extent is 'srid=%s;%s'::geometry",
                4326,
                extent,
            )
            datafile.extent = extent
        datafile.feature_count = loader.info.feature_count
        if not datafile.description:
            datafile.description = loader.info.format
        future_status = datafile.status
        if loader.is_spatial and not datafile.srid:
            future_status = datafile.IMPORT_FAILED
            datafile.status_message = 'Please specify SRID for this file (unable to auto-identify SRID)'
        elif not job_id:
            future_status = datafile.IMPORTED
        else:
            future_status = datafile.IMPORT_RESULTS_COMPLETE

        # We need to merge these things..
        desired_field_order = datafile.fields or []
        # Now that we have a desired field order from the model, we can
        # go the next step of getting job data.
        if job_id:
            try:
                job = models.Job.objects.select_related('tool').get(pk=job_id)
            except Exception as e:
                logger.error('Failed to get job with id of %s',
                             job_id,
                             exc_info=True)

        # From the job data we can get the tool config:
        config_field_list = config_namespace = None
        # Get the list of field names, with the unique ones first...
        tool_config_field_units = {}
        job_config_field_units = datafile.units or {}
        if job:
            tool_config = job.tool.toolconfig.json_config
            # there might be multiple input files, but we'll use the first
            # one as the basis for format for the output, since we don't
            # really have a better option.  The tool developer ought to
            # specify a list of fields in the output if they don't like
            # this behaviour, since this is just a "default" for the order.
            for t in job.tool.toolconfig.json_config['input']:
                if t.get('type', '').lower() == 'file':
                    config_namespace = t.get('name', None)
                    if config_namespace:
                        config_field_list = [
                            f['name'] for f in t.get('elements', [])
                            if isinstance(f.get('name', None), (str, unicode))
                        ]
                        # If there are units, then we store the units
                        # here, so we can use that with the field data.
                        for f in t.get('elements', []):
                            if 'units' in f:
                                tool_config_field_units[f['name']] = f.get(
                                    'units', None)
                            elif 'description' in f:
                                tool_config_field_units[f['name']] = f.get(
                                    'description', None)

                    break
            # Now that we have a list of fields from the tool configuration,
            # get the input fields from the file for each of the tool fields,
            # since we want that to be the default order of output.
            if config_field_list:
                job_config = job.config[config_namespace]
                for f in config_field_list:
                    if f in job_config:
                        if job_config[f].get('type', None) == 'property':
                            if isinstance(job_config[f].get('value', None),
                                          (str, unicode)):
                                desired_field_order.append(
                                    job_config[f]['value'])
                            # Map the tool config field (f) to the selected data file field
                            # (job_config[f]['value']) so we can grab the units from the
                            # tool config.
                            if (datafile.units and f in datafile.units
                                    and 'value' in job_config[f]):
                                job_config_field_units[job_config[f][
                                    'value']] = datafile.units.get(f, '')
                            # If the tool didn't give us the units to use for fields
                            # we can fall back to the tool config to see what they
                            # ought to be.
                            elif (f in tool_config_field_units
                                  and 'value' in job_config[f]):
                                job_config_field_units[job_config[f][
                                    'value']] = tool_config_field_units.get(
                                        f, '')

        # Get the list of actual fields in the input datafile...
        available_fields = loader.info.fields
        # eliminate fields that are not in the list of output fields.
        logger.debug('Desired field order is: %s', desired_field_order)
        logger.debug('Loader provided field order is: %s', available_fields)
        ordered_fields = [
            field for field in desired_field_order if field in available_fields
        ]
        # Add in any fields using the order first, then following with
        # any fields not in the ordered list, but in the output list
        # of fields.
        datafile.fields = list(
            unique_everseen(ordered_fields + available_fields))

        logger.debug('Final field order is %s', datafile.fields)
        # Create an empty file using ContentFile, then we can overwrite it
        # with the desired GeoJSON data.
        if loader.is_spatial:
            suffix = 'geojson'
        else:
            suffix = 'json'
        if future_status in (datafile.IMPORTED,
                             datafile.IMPORT_RESULTS_COMPLETE):
            if datafile.geom_type == 99:
                field_attributes = {}
                # This is a raster...
                for pos, band in enumerate(loader.dl_instance.bands()):
                    field_attributes[pos + 1] = {
                        'type': band.type,
                        'field_name': 'pixel',
                        'min': band.min,
                        'max': band.max
                    }
                datafile.field_attributes = field_attributes
            elif datafile.feature_count:
                logger.error('Working on saving the model!')
                datafile.processed_file.save(
                    '{0}.{1}'.format(datafile.pk, suffix), ContentFile(''))
                loader.export_json(datafile.processed_file.path)
                try:
                    generate_datamodel(datafile, loader, logger)
                except Exception as e:
                    logger.error('Error generating data model: %s',
                                 e,
                                 exc_info=logger.isEnabledFor(logging.DEBUG))
                    raise e
                # Here we load the spatialite data using the model that was created
                # by generate_datamodel.  We need to use this to get the range
                # and type information for each field...
                try:
                    field_attributes = {}
                    qs = getQuerySet(datafile)
                    field_mappings = [
                        (django_model_fields.IntegerField, 'integer', int),
                        # Required because nmtk_id is an
                        # autofield..
                        (
                            django_model_fields.AutoField,
                            'integer',
                            int,
                        ),
                        (django_model_fields.BooleanField, 'boolean', bool),
                        # Special case holding FIPS
                        (django_model_fields.DecimalField, 'float', float),
                        (django_model_fields.TextField, 'text', None),
                        (django_model_fields.FloatField, 'float', float),
                        (
                            django_model_fields.DateField,
                            'date',
                            datetime.date.isoformat,
                        ),
                        (
                            django_model_fields.TimeField,
                            'time',
                            datetime.time.isoformat,
                        ),
                        (django_model_fields.DateTimeField, 'datetime',
                         datetime.datetime.isoformat)
                    ]
                    if qs.count() > 0:
                        # Get a single row so that we can try to work with the
                        # fields.
                        sample_row = qs[0]
                        for field in sample_row._meta.fields:
                            field_name = field.name
                            db_column = field.db_column or field.name
                            # convert the django field type to a text string.
                            for ftype, field_type, caster in field_mappings:
                                if isinstance(field, (ftype, )):
                                    break
                            else:
                                logger.info(
                                    'Unable to map field of type %s (this is expected for GIS fields)',
                                    type(field, ))
                                continue
                            values_aggregates = qs.aggregate(
                                Count(field_name, distinct=True))
                            field_attributes[db_column] = {
                                'type': field_type,
                                'field_name': field_name,
                                'distinct': values_aggregates[
                                    '{0}__count'.format(field_name)]}
                            # Add the units from the config to the data.
                            if db_column in job_config_field_units:
                                field_attributes[db_column][
                                    'units'] = job_config_field_units[
                                        db_column]
                            if field_attributes[db_column]['distinct'] < 10:
                                distinct_values = [
                                    v for v in qs.order_by().values_list(
                                        field_name, flat=True).distinct()
                                    if v is not None
                                ]
                                if not caster:
                                    field_attributes[db_column][
                                        'values'] = distinct_values
                                else:
                                    logger.info(
                                        'Attempting to cast values: %s',
                                        distinct_values)
                                    field_attributes[db_column][
                                        'values'] = map(
                                            caster, distinct_values)
                            else:
                                logger.debug(
                                    'There are more than 10 values for %s (%s), enumerating..',
                                    db_column,
                                    field_attributes[db_column]['distinct'])
                                # formerly the aggregates happened above - with the count. However, Django doesn't
                                # allow those aggregates with boolean fields - so here we split it up to only do the
                                # aggregates in the cases where we have to (i.e.,
                                # the distinct values is above the threshold.)
                                values_aggregates = qs.aggregate(
                                    Max(field_name),
                                    Min(field_name),
                                )
                                field_attributes[db_column][
                                    'min'] = values_aggregates[
                                        '{0}__min'.format(field_name)]
                                field_attributes[db_column][
                                    'max'] = values_aggregates[
                                        '{0}__max'.format(field_name)]
                                if caster:
                                    field_attributes[db_column][
                                        'min'] = caster(
                                            field_attributes[db_column]['min'])
                                    field_attributes[db_column][
                                        'max'] = caster(
                                            field_attributes[db_column]['max'])
                        datafile.field_attributes = field_attributes
                        datafile.units = job_config_field_units
                except Exception as e:
                    logger.exception('Failed to get range for model %s',
                                     datafile.pk)
        if job:
            try:
                # There might be multiple results files from this job, so we will only
                # mark the job as complete if all the results files are
                # processed.
                if job.status != job.COMPLETE:
                    results_left = job.job_files.filter(
                        status=models.DataFile.PROCESSING_RESULTS).count()
                    if results_left == 0:
                        job.status = job.COMPLETE
                        models.JobStatus(
                            message='Job Completed',
                            timestamp=timezone.now(),
                            job=job,
                            category=models.JobStatus.CATEGORY_SYSTEM).save()
                    elif results_left == 1:
                        # Handle the potential race condition here - do we really need this?
                        # sort of.  Since it's possible that two files finish post-processing
                        # at the same time.  In such cases, a second should be more than enough
                        # time to get both committed as complete.
                        time.sleep(1)
                        job = models.Job.objects.get(pk=job_id)
                        if job.status != job.COMPLETE:
                            results_left = job.job_files.filter(
                                status=models.DataFile.PROCESSING_RESULTS
                            ).count()
                            if results_left == 0:
                                job.status = job.COMPLETE
                                models.JobStatus(message='Job Completed',
                                                 timestamp=timezone.now(),
                                                 job=job,
                                                 category=models.JobStatus.CATEGORY_SYSTEM).save()

            except:
                logger.exception('Failed to update job status to complete?!!')
        datafile.status = future_status
    except Exception as e:
        logger.error('Failed import process!', exc_info=True)
        datafile.processed_file = None
        if not job_id:
            datafile.status = datafile.IMPORT_FAILED
        else:
            datafile.status = datafile.IMPORT_RESULTS_FAILED
        datafile.status_message = "%s" % (e, )
        if job_id:
            try:
                if not job:
                    job = models.Job.objects.get(pk=job_id)
                job.status = job.POST_PROCESSING_FAILED
                logger.info('Set post processing to failed for job %s', job.pk)
            except:
                logger.error('Failed to update job status to failed?!!',
                             exc_info=True)

    if job:
        job.save()
    datafile.save()
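importDataFile.get_logger() in Examples 3 and 5 is the pre-4.x Celery task API, which suggests these functions are registered as Celery tasks elsewhere in NMTK_server. A hedged sketch of what that registration and call site might look like under that assumption; the decorator import and the .delay() call below are illustrative, not taken from the NMTK source:

from celery.task import task          # old-style Celery task decorator (pre-4.x)

@task
def importDataFile(datafile, job_id=None):
    logger = importDataFile.get_logger()   # per-task logger, as in Examples 3 and 5
    # ... body as in Example 5 above ...

# Typical call site, once the uploaded DataFile row has been saved:
# importDataFile.delay(datafile, job_id=job.pk)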