Example #1
    def handle_location(self, collection, location):
        tmp_dir = tempfile.mkdtemp()
        shutil.copytree(collection.data_dir, join(tmp_dir, "backup"))
        annotations_file = tempfile.TemporaryFile()

        ingested_dir = join(collection.data_dir, "ingested", location.slug)
        pending_dir = join(collection.data_dir, "pending", location.slug)

        safe_makedirs(ingested_dir)
        safe_makedirs(pending_dir)

        try:
            # move all files from ingested dir to pending dir
            for path in glob.iglob(join(ingested_dir, "*")):
                os.rename(path, join(pending_dir, basename(path)))

            # save all annotations to a CSV
            writer = csv.writer(annotations_file, delimiter="\t")
            writer.writerow(["filename", "annotation"])

            for record in location.records.filter():
                for annotation in record.annotations.all():
                    writer.writerow([record.filename, annotation.text])

            annotations_file.seek(0)

            # delete all index file records in database
            location.index_files.all().delete()

            # re-ingest all index files in pending
            for path in glob.iglob(join(pending_dir, "*")):
                ingest(
                    collection.mission, collection.file_type, location.url,
                    basename(path)
                )

            # restore annotations and remove temporary file
            reader = csv.reader(annotations_file, delimiter="\t")
            next(reader)  # skip header
            for row in reader:
                models.Annotation.objects.create(
                    record=location.records.get(filename=row[0]),
                    text=row[1]
                )

        except:
            # restore backups
            shutil.rmtree(collection.data_dir)
            shutil.move(join(tmp_dir, "backup"), collection.data_dir)
            raise
        finally:
            shutil.rmtree(tmp_dir)
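
The handler above follows a reingest-with-rollback pattern: back up the collection's data directory, move the ingested index files back to pending, re-ingest them, restore the saved annotations, and roll the directory back if anything fails. A minimal sketch of driving it for a whole collection (the wrapper function and the concrete mission/file_type values are assumptions for illustration, not part of the original):

def reingest_collection(command, mission, file_type):
    # Hypothetical driver: re-ingest every location of one collection.
    collection = models.Collection.objects.get(
        mission=mission, file_type=file_type
    )
    for location in collection.locations.all():
        command.handle_location(collection, location)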
Example #2
def backup(logs=False,
           config=False,
           app=False,
           diff=None,
           incr=None,
           out_path=None):
    """ Task function to perform the backup of the given items: logs,
    configuration and application (currently not supported). Backup can be full
    (the default), differential (specify path to other backup or a truthy value
    (will select the last backup)) or incremental (specify datetime or
    timedelta). An output path can be specified or one will be generated.
    Returns the path to the backup ZIP.
    """
    if not logs and not config and not app:
        raise BackupError("One of logs, config or app must be specified.")

    if diff and incr:
        raise BackupError(
            "Differential and incremental backups are mutually exclusive.")

    timestamp = now().strftime("%Y%m%d-%H%M%S")

    # assure that the backup path exists
    if not out_path:
        safe_makedirs(BASE_PATH)

    if diff:
        backupper = DifferentialBackup(logs, config, app, diff)
        out_path = out_path or join(BASE_PATH,
                                    "backup.diff.%s.zip" % timestamp)
    elif incr:
        backupper = IncrementalBackup(logs, config, app, incr)
        out_path = out_path or join(BASE_PATH,
                                    "backup.incr.%s.zip" % timestamp)
    else:
        backupper = FullBackup(logs, config, app)
        out_path = out_path or join(BASE_PATH, "backup.%s.zip" % timestamp)

    try:
        backupper.perform(out_path, timestamp)
        os.chmod(out_path, 0660)
    except Exception:
        exc_info = sys.exc_info()
        try:
            os.unlink(out_path)
        except OSError:
            pass
        raise exc_info[0], exc_info[1], exc_info[2]

    return out_path
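
A hedged usage sketch for the task above; the argument values are illustrative only and assume the surrounding module is importable:

from datetime import timedelta

# Full backup of logs and configuration to a generated, timestamped path.
path = backup(logs=True, config=True)

# Differential backup against the most recent backup (truthy ``diff``).
path = backup(logs=True, diff=True)

# Incremental backup covering the last 24 hours.
path = backup(logs=True, incr=timedelta(hours=24))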
Example #3
def export_collection(mission, file_type, filename=None,
                      configuration=True, data=True, reschedule=False):
    """ Export the configuration and/or the data of a collection to a ZIP file.
    """

    collection = models.Collection.objects.get(
        mission=mission, file_type=file_type
    )

    if not configuration and not data:
        raise RuntimeError("Neither collection nor data export specified")

    # create a default filename if none was specified
    if not filename:
        exports_dir = join(collection.data_dir, "exports")
        safe_makedirs(exports_dir)
        new_filename = join(
            exports_dir, "export_%s.zip" % now().strftime("%Y%m%d-%H%M%S")
        )

    with collection.get_lock():
        ret_val = _export_collection_locked(
            collection, filename or new_filename, configuration, data
        )

        logger.info(
            "Successfully exported %s collection to %s" % (collection, filename)
        )

        if reschedule:
            try:
                interval = collection.configuration.export_interval
                schedule("export", now() + interval, {
                    "mission": mission,
                    "file_type": file_type,
                    "filename": filename,
                    "configuration": configuration,
                    "data": data,
                    "reschedule": True
                })
            except Exception as exc:
                logger.error(
                    "Failed to reschedule export for %s. Error was '%s'." % (
                        collection, exc
                    )
                )

        return ret_val
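
A minimal usage sketch; the mission, file type and output path values are placeholders, not from the original:

# Export configuration and data of one collection, letting the task pick a
# timestamped filename inside the collection's "exports" directory.
export_collection("MissionX", "TypeY", configuration=True, data=True)

# Export to an explicit path and reschedule according to the configured
# export_interval.
export_collection("MissionX", "TypeY", filename="/tmp/export.zip",
                  reschedule=True)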
Example #4
def on_collection_created(sender, instance, created, **kwargs):
    if sender is Collection and created:
        safe_makedirs(instance.config_dir)
        safe_makedirs(join(settings.MINV_LOCK_DIR, instance.mission))

        if not exists(join(instance.config_dir, "collection.conf")):
            with open(join(instance.config_dir, "collection.conf"), "w") as f:
                f.write(
                    render_to_string("inventory/collection/collection.conf"))

        safe_makedirs(instance.data_dir)
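
The function above is written as a Django post_save receiver; a minimal sketch of wiring it up (whether the original project registers it exactly like this, or via the @receiver decorator, is an assumption):

from django.db.models.signals import post_save

# Create the config, lock and data directories whenever a new Collection is saved.
post_save.connect(on_collection_created, sender=Collection)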
Example #5
def _harvest_locked(collection, url, reschedule):
    location = collection.locations.get(url=url)

    if location.location_type == "oads":
        harvester = OADSHarvester(location)
    elif location.location_type == "nga":
        harvester = NGAHarvester(location)
    else:
        raise HarvestingError(
            "Invalid location type '%s'." % location.location_type
        )

    # scan the source
    logger.debug("Scanning location %s." % location)
    available_index_files = harvester.scan()
    logger.debug("Successfully scanned location %s." % location)

    # categorize files
    inserted, updated, deleted = select_index_files(
        (base for base, url in available_index_files),
        location.index_files.values_list("filename", flat=True)
    )

    # directories for index files
    pending_dir = join(collection.data_dir, "pending", location.slug)
    ingested_dir = join(collection.data_dir, "ingested", location.slug)

    safe_makedirs(pending_dir)
    safe_makedirs(ingested_dir)

    failed_retrieve = []

    updated_to_retrieve = [u[1] for u in updated]

    # perform actual harvesting
    for index_file_name in itertools.chain(inserted, updated_to_retrieve):
        try:
            harvester.retrieve(
                join(url, index_file_name), index_file_name, pending_dir
            )
            logger.debug("Retrieved %s." % index_file_name)
        except:
            failed_retrieve.append(index_file_name)
            logger.debug("Failed to retrieve %s." % index_file_name)

    updated_to_delete = [u[0] for u in updated]

    # delete index files that are deleted or updated
    for index_file_name in itertools.chain(updated_to_delete, deleted):
        # delete model
        location.index_files.get(filename=index_file_name).delete()
        # remove ingested index file
        os.remove(join(ingested_dir, index_file_name))

    failed_ingest = []

    # finally ingest the updated and newly inserted index files
    for index_file_name in itertools.chain(updated_to_retrieve, inserted):
        try:
            index_file_name = extract_zipped_index_file(
                join(pending_dir, index_file_name)
            )

            ingest(
                collection.mission, collection.file_type, url,
                basename(index_file_name)
            )
            logger.debug("Ingested %s." % basename(index_file_name))
        except:
            failed_ingest.append(basename(index_file_name))
            logger.debug("Failed to ingest %s." % basename(index_file_name))

    logger.info("Finished harvesting for %s: %s" % (collection, location))
    if failed_retrieve:
        logger.error("Failed to retrieve %s" % ", ".join(failed_retrieve))
    if failed_ingest:
        logger.error("Failed to ingest %s" % ", ".join(failed_ingest))

    # if this was a scheduled harvest, reschedule it again
    if reschedule:
        try:
            interval = collection.configuration.harvest_interval
            schedule("harvest", now() + interval, {
                "mission": collection.mission,
                "file_type": collection.file_type,
                "url": location.url,
                "reschedule": True
            })
        except Exception as exc:
            logger.error(
                "Failed to reschedule harvest for %s %s. Error was '%s'." % (
                    collection, location, exc
                )
            )

    return failed_retrieve, failed_ingest
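
The leading underscore and the get_lock() usage in export_collection above suggest this function is meant to run while holding the collection lock; a hedged sketch of such a public wrapper (the wrapper itself is an assumption, not shown in the original):

def harvest(mission, file_type, url, reschedule=False):
    # Hypothetical wrapper: resolve the collection and serialize access to
    # its data directory with the collection lock, as export_collection does.
    collection = models.Collection.objects.get(
        mission=mission, file_type=file_type
    )
    with collection.get_lock():
        return _harvest_locked(collection, url, reschedule)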
Example #6
def ingest(mission, file_type, url, index_file_name):
    """ Function to ingest an indexfile into the collection identified by
    ``mission`` and ``file_type``. The indexfile must be located in the
    ``pending`` folder of the collections data directory.
    When ingested correctly, the index
    """

    collection = models.Collection.objects.get(
        mission=mission, file_type=file_type
    )
    location = models.Location.objects.get(url=url, collection=collection)

    # directories for index files
    pending_dir = join(collection.data_dir, "pending", location.slug)
    ingested_dir = join(collection.data_dir, "ingested", location.slug)
    failed_dir = join(collection.data_dir, "failed", location.slug)

    if (basename(index_file_name) != index_file_name and
            dirname(index_file_name) != pending_dir):
        raise IngestError(
            "Only pass the filename within the 'pending' directory, not the "
            "full path"
        )

    for dir_path in (pending_dir, ingested_dir, failed_dir):
        safe_makedirs(dir_path)

    path = join(pending_dir, index_file_name)
    if not exists(path):
        raise IngestError(
            "No such index file in pending directory: %s" % index_file_name
        )

    try:
        # parse index file name info
        s, e, u = basename(index_file_name).partition(".")[0].split("_")
        index_file = models.IndexFile(
            filename=index_file_name, location=location,
            begin_time=parse_index_time(s), end_time=parse_index_time(e),
            update_time=parse_index_time(u)
        )
        index_file.full_clean()
        index_file.save()

        # prepare value "preparators"
        meta = models.Record._meta
        preparations = {
            "filename": lambda v: basename(urlparse(v).path)
        }

        mapping = collection.get_metadata_field_mapping(url).items()

        if not mapping:
            raise IngestError("No metadata mapping configured for %s/%s %s"
                % (mission, file_type, url)
            )

        for target, _ in mapping:
            field = meta.get_field(target)
            if isinstance(field, DateTimeField):
                preparations[target] = parse_datetime  # TODO: necessary?
            elif isinstance(field, CharField) and field.choices:
                preparations[target] = lambda value: value[0].upper() if len(value) else None
            elif isinstance(field, MultiPolygonField):
                preparations[target] = parse_footprint
            elif isinstance(field, PointField):
                preparations[target] = parse_point
            elif isinstance(field, IntegerField):
                preparations[target] = parse_integer
            elif isinstance(field, FloatField):
                preparations[target] = parse_float

        count = 0
        with open(path) as f:
            reader = csv.DictReader(f, delimiter="\t")

            while True:
                records = []

                # iterate over the file's rows in chunks
                row = None
                chunk = islice(reader, 5000)

                for row in chunk:
                    record = models.Record(
                        index_file=index_file, location=location
                    )
                    for target, source in mapping:
                        try:
                            value = row[source]
                        except KeyError:
                            raise IngestionError(
                                "Index file '%s' has no such field '%s'."
                                % (index_file_name, source)
                            )
                        preparator = preparations.get(target)
                        if preparator:
                            value = preparator(value)

                        setattr(record, target, value)

                    records.append(record)
                    count += 1

                # exit when the slice was empty, i.e. the last line of the
                # file was read
                if row is None:
                    break
                else:
                    # save the next chunk of models to the DB
                    models.Record.objects.bulk_create(records)
                    logger.debug(
                        "Ingested chunk of %d records. "
                        "Current total %d records" % (len(records), count)
                    )

    except Exception as exc:
        # move file to failed directory
        os.rename(
            join(pending_dir, index_file_name),
            join(failed_dir, index_file_name)
        )
        logger.error(
            "Failed to ingest index file %s for %s (%s). Error was: %s"
            % (index_file_name, collection, location.url, exc)
        )
        logger.debug(traceback.format_exc())
        raise IngestionError(
            "Failed to ingest index file %s for %s (%s). Error was: %s"
            % (index_file_name, collection, location.url, exc)
        )
    else:
        # move file to ingested directory
        os.rename(
            join(pending_dir, index_file_name),
            join(ingested_dir, index_file_name)
        )
        logger.info(
            "Successfully ingested index file %s for %s (%s) with %d records"
            % (index_file_name, collection, location.url, count)
        )

    return count
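
A usage sketch for the function above: the file must already be in the location's "pending" directory, and its name must encode begin, end and update times separated by underscores, as the parser in the try block expects (all values below, including the timestamp format, are placeholders):

# Hypothetical call; returns the number of ingested records.
count = ingest(
    "MissionX", "TypeY", "http://oads.example.com/",
    "20150101T000000_20150102T000000_20150103T120000.csv"
)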
Example #7
def import_collection(filename, mission=None, file_type=None):
    """ Import a previously exported archive.
    """
    collections_qs = models.Collection.objects.filter(
        mission=mission, file_type=file_type
    )
    if collections_qs.exists():
        raise ImportException("Collection %s/%s already exists." % (
            mission, file_type
        ))

    if not zipfile.is_zipfile(filename):
        raise ImportException("File %s is not a ZIP file." % filename)

    with closing(zipfile.ZipFile(filename, "r")) as archive:
        manifest = json.loads(archive.read("manifest.json"))
        # TODO: better version check

        mission = mission or manifest["mission"]
        file_type = file_type or manifest["file_type"]

        collection = models.Collection.objects.create(
            mission=mission, file_type=file_type
        )

        if minv.__version__ != manifest["version"]:
            raise ImportException(
                "Cannot import file %s due to version mismatch: %r != %r"
                % (filename, minv.__version__, manifest["version"])
            )

        locations = json.loads(archive.read("locations.json"))

        for url, values in locations.items():
            models.Location.objects.create(
                collection=collection, url=url, location_type=values["type"]
            )

        try:
            archive.extract("collection.conf", collection.config_dir)
        except KeyError:
            pass

        slug_to_location = dict(
            (location.slug, location)
            for location in collection.locations.all()
        )

        # create a temporary directory tree to extract files to
        tmp_dir = tempfile.mkdtemp()

        # extract index files and ingest them
        members = [
            member for member in archive.namelist()
            if member.startswith("locations/") and
            basename(member) != "annotations.csv"
        ]
        try:
            for member in members:
                slug, _, index_filename = member[10:].partition("/")
                url = slug_to_location[slug].url

                directory = join(collection.data_dir, "pending", slug)
                safe_makedirs(directory)

                path = archive.extract(member, tmp_dir)
                move(path, directory)
                ingest(mission, file_type, url, index_filename)
        finally:
            rmtree(tmp_dir)

        # read annotations
        members = [
            member for member in archive.namelist()
            if member.startswith("locations/") and
            member.endswith("annotations.csv")
        ]
        for member in members:
            slug, _, index_filename = member[10:].partition("/")
            location = slug_to_location[slug]
            with closing(archive.open(member)) as annotations:
                reader = csv.reader(annotations)
                next(reader)  # skip header
                for record_filename, text in reader:
                    models.Annotation.objects.create(
                        record=models.Record.objects.get(
                            location=location, filename=record_filename
                        ),
                        text=text
                    )

    return collection
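
A minimal usage sketch; the archive path is a placeholder:

# Import a previously exported archive; mission and file_type default to the
# values stored in the archive's manifest.json.
collection = import_collection("/data/exports/export_20160101-000000.zip")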