def handle_location(self, collection, location):
    tmp_dir = tempfile.mkdtemp()
    shutil.copytree(collection.data_dir, join(tmp_dir, "backup"))
    annotations_file = tempfile.TemporaryFile()

    ingested_dir = join(collection.data_dir, "ingested", location.slug)
    pending_dir = join(collection.data_dir, "pending", location.slug)
    safe_makedirs(ingested_dir)
    safe_makedirs(pending_dir)

    try:
        # move all files from the ingested dir to the pending dir
        for path in glob.iglob(join(ingested_dir, "*")):
            os.rename(path, join(pending_dir, basename(path)))

        # save all annotations to a CSV
        writer = csv.writer(annotations_file, delimiter="\t")
        writer.writerow(["filename", "annotation"])
        for record in location.records.all():
            for annotation in record.annotations.all():
                writer.writerow([record.filename, annotation.text])
        annotations_file.seek(0)

        # delete all index file records in the database
        location.index_files.all().delete()

        # re-ingest all index files now in the pending dir
        for path in glob.iglob(join(pending_dir, "*")):
            ingest(
                collection.mission, collection.file_type, location.url,
                basename(path)
            )

        # restore the annotations; the temporary file is cleaned up
        # automatically
        reader = csv.reader(annotations_file, delimiter="\t")
        next(reader)  # skip header
        for row in reader:
            models.Annotation.objects.create(
                record=location.records.get(filename=row[0]), text=row[1]
            )
    except:
        # something went wrong: restore the backup of the data directory
        shutil.rmtree(collection.data_dir)
        shutil.move(join(tmp_dir, "backup"), collection.data_dir)
        raise
    finally:
        shutil.rmtree(tmp_dir)
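# Hedged usage sketch for the re-ingestion handler above: it is written as a
# method (note the ``self`` parameter), presumably on a command or task class
# of this codebase -- the surrounding class and the concrete mission,
# file-type and URL values below are assumptions for illustration only:
#
#   collection = models.Collection.objects.get(
#       mission="Landsat5", file_type="SIP-SCENE"
#   )
#   location = collection.locations.get(url="http://oads.example.com/data")
#   command.handle_location(collection, location)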
def backup(logs=False, config=False, app=False, diff=None, incr=None,
           out_path=None):
    """ Task function to perform a backup of the given items: logs,
    configuration and application (the latter is currently not supported).
    The backup can be full (the default), differential (pass the path to
    another backup, or any truthy value to select the most recent one) or
    incremental (pass a datetime or timedelta). An output path can be
    specified, otherwise one is generated. Returns the path to the backup
    ZIP.
    """
    if not logs and not config and not app:
        raise BackupError("One of logs, config or app must be specified.")
    if diff and incr:
        raise BackupError(
            "Differential and incremental backups are mutually exclusive."
        )

    timestamp = now().strftime("%Y%m%d-%H%M%S")

    # assure that the backup path exists
    if not out_path:
        safe_makedirs(BASE_PATH)

    if diff:
        backupper = DifferentialBackup(logs, config, app, diff)
        out_path = out_path or join(
            BASE_PATH, "backup.diff.%s.zip" % timestamp
        )
    elif incr:
        backupper = IncrementalBackup(logs, config, app, incr)
        out_path = out_path or join(
            BASE_PATH, "backup.incr.%s.zip" % timestamp
        )
    else:
        backupper = FullBackup(logs, config, app)
        out_path = out_path or join(BASE_PATH, "backup.%s.zip" % timestamp)

    try:
        backupper.perform(out_path, timestamp)
        os.chmod(out_path, 0660)
    except Exception:
        # remove the incomplete backup file and re-raise with the original
        # traceback (Python 2 three-expression raise)
        exc_info = sys.exc_info()
        try:
            os.unlink(out_path)
        except OSError:
            pass
        raise exc_info[0], exc_info[1], exc_info[2]

    return out_path
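# Hedged usage sketch for backup(): the call forms follow directly from the
# docstring; the concrete interval is an illustrative assumption.
#
#   from datetime import timedelta
#
#   path = backup(logs=True, config=True)                 # full backup
#   path = backup(logs=True, diff=True)                   # diff vs. last backup
#   path = backup(config=True, incr=timedelta(hours=12))  # incremental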
def export_collection(mission, file_type, filename=None, configuration=True,
                      data=True, reschedule=False):
    """ Export the configuration and/or the data of a collection to a ZIP
    file.
    """
    collection = models.Collection.objects.get(
        mission=mission, file_type=file_type
    )

    if not configuration and not data:
        raise RuntimeError("Neither configuration nor data export specified")

    # create a default filename if none was specified
    out_filename = filename
    if not out_filename:
        exports_dir = join(collection.data_dir, "exports")
        safe_makedirs(exports_dir)
        out_filename = join(
            exports_dir, "export_%s.zip" % now().strftime("%Y%m%d-%H%M%S")
        )

    with collection.get_lock():
        ret_val = _export_collection_locked(
            collection, out_filename, configuration, data
        )

    logger.info(
        "Successfully exported %s collection to %s"
        % (collection, out_filename)
    )

    if reschedule:
        try:
            interval = collection.configuration.export_interval
            # pass the original filename argument, so a rescheduled export
            # generates a fresh name when none was given
            schedule("export", now() + interval, {
                "mission": mission,
                "file_type": file_type,
                "filename": filename,
                "configuration": configuration,
                "data": data,
                "reschedule": True
            })
        except Exception as exc:
            logger.error(
                "Failed to reschedule export for %s. Error was '%s'." % (
                    collection, exc
                )
            )

    return ret_val
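# Hedged usage sketch for export_collection(); the mission and file-type
# values are illustrative assumptions:
#
#   path = export_collection(
#       "Landsat5", "SIP-SCENE", configuration=True, data=True
#   )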
def on_collection_created(sender, instance, created, **kwargs):
    if sender is Collection and created:
        safe_makedirs(instance.config_dir)
        safe_makedirs(join(settings.MINV_LOCK_DIR, instance.mission))
        if not exists(join(instance.config_dir, "collection.conf")):
            with open(join(instance.config_dir, "collection.conf"), "w") as f:
                f.write(
                    render_to_string("inventory/collection/collection.conf")
                )
        safe_makedirs(instance.data_dir)
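# The handler above follows Django's post_save signal signature
# (sender, instance, created, **kwargs). A minimal registration sketch --
# where this connect() call actually lives in the codebase is an assumption:
#
#   from django.db.models.signals import post_save
#   post_save.connect(on_collection_created, sender=Collection)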
def _harvest_locked(collection, url, reschedule):
    location = collection.locations.get(url=url)

    if location.location_type == "oads":
        harvester = OADSHarvester(location)
    elif location.location_type == "nga":
        harvester = NGAHarvester(location)
    else:
        raise HarvestingError(
            "Invalid location type '%s'." % location.location_type
        )

    # scan the source
    logger.debug("Scanning location %s." % location)
    available_index_files = harvester.scan()
    logger.debug("Successfully scanned location %s." % location)

    # categorize files into newly inserted, updated and deleted ones
    inserted, updated, deleted = select_index_files(
        (base for base, url in available_index_files),
        location.index_files.values_list("filename", flat=True)
    )

    # directories for index files
    pending_dir = join(collection.data_dir, "pending", location.slug)
    ingested_dir = join(collection.data_dir, "ingested", location.slug)
    safe_makedirs(pending_dir)
    safe_makedirs(ingested_dir)

    failed_retrieve = []

    # the second element of each 'updated' pair is the new file to retrieve
    updated_to_retrieve = [u[1] for u in updated]

    # perform the actual harvesting
    for index_file_name in itertools.chain(inserted, updated_to_retrieve):
        try:
            harvester.retrieve(
                join(url, index_file_name), index_file_name, pending_dir
            )
            logger.debug("Retrieved %s." % index_file_name)
        except:
            failed_retrieve.append(index_file_name)
            logger.debug("Failed to retrieve %s." % index_file_name)

    updated_to_delete = [u[0] for u in updated]

    # delete index files that were deleted or updated on the source
    for index_file_name in itertools.chain(updated_to_delete, deleted):
        # delete the model
        location.index_files.get(filename=index_file_name).delete()
        # remove the ingested index file
        os.remove(join(ingested_dir, index_file_name))

    failed_ingest = []

    # finally ingest the updated and newly inserted index files
    for index_file_name in itertools.chain(updated_to_retrieve, inserted):
        try:
            index_file_name = extract_zipped_index_file(
                join(pending_dir, index_file_name)
            )
            ingest(
                collection.mission, collection.file_type, url,
                basename(index_file_name)
            )
            logger.debug("Ingested %s." % basename(index_file_name))
        except:
            failed_ingest.append(basename(index_file_name))
            logger.debug("Failed to ingest %s." % basename(index_file_name))

    logger.info("Finished harvesting for %s: %s" % (collection, location))
    if failed_retrieve:
        logger.error("Failed to retrieve %s" % ", ".join(failed_retrieve))
    if failed_ingest:
        logger.error("Failed to ingest %s" % ", ".join(failed_ingest))

    # if this was a scheduled harvest, reschedule it again
    if reschedule:
        try:
            interval = collection.configuration.harvest_interval
            schedule("harvest", now() + interval, {
                "mission": collection.mission,
                "file_type": collection.file_type,
                "url": location.url,
                "reschedule": True
            })
        except Exception as exc:
            logger.error(
                "Failed to reschedule harvest for %s %s. Error was '%s'." % (
                    collection, location, exc
                )
            )

    return failed_retrieve, failed_ingest
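# Hedged sketch of how the "_locked" suffix is presumably meant: the caller
# acquires the collection lock before delegating, mirroring the pattern used
# by export_collection() above. The wrapper below is an assumption, not taken
# from this module:
#
#   def harvest(mission, file_type, url, reschedule=False):
#       collection = models.Collection.objects.get(
#           mission=mission, file_type=file_type
#       )
#       with collection.get_lock():
#           return _harvest_locked(collection, url, reschedule)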
def ingest(mission, file_type, url, index_file_name):
    """ Function to ingest an index file into the collection identified by
    ``mission`` and ``file_type``. The index file must be located in the
    ``pending`` folder of the collection's data directory. When ingested
    correctly, the index file is moved to the ``ingested`` directory; on
    failure it is moved to the ``failed`` directory.
    """
    collection = models.Collection.objects.get(
        mission=mission, file_type=file_type
    )
    location = models.Location.objects.get(url=url, collection=collection)

    # directories for index files
    pending_dir = join(collection.data_dir, "pending", location.slug)
    ingested_dir = join(collection.data_dir, "ingested", location.slug)
    failed_dir = join(collection.data_dir, "failed", location.slug)

    if (basename(index_file_name) != index_file_name and
            dirname(index_file_name) != pending_dir):
        raise IngestError(
            "Only pass the filename within the 'pending' directory, not the "
            "full path"
        )

    for dir_path in (pending_dir, ingested_dir, failed_dir):
        safe_makedirs(dir_path)

    path = join(pending_dir, index_file_name)
    if not exists(path):
        raise IngestError(
            "No such index file in pending directory: %s" % index_file_name
        )

    try:
        # parse the begin, end and update times from the index file name
        s, e, u = basename(index_file_name).partition(".")[0].split("_")
        index_file = models.IndexFile(
            filename=index_file_name, location=location,
            begin_time=parse_index_time(s), end_time=parse_index_time(e),
            update_time=parse_index_time(u)
        )
        index_file.full_clean()
        index_file.save()

        # prepare value "preparators": callables converting raw CSV values
        # to the types of the target model fields
        meta = models.Record._meta
        preparations = {
            "filename": lambda v: basename(urlparse(v).path)
        }

        mapping = collection.get_metadata_field_mapping(url).items()
        if not mapping:
            raise IngestError(
                "No metadata mapping configured for %s/%s %s"
                % (mission, file_type, url)
            )

        for target, _ in mapping:
            field = meta.get_field(target)
            if isinstance(field, DateTimeField):
                preparations[target] = parse_datetime  # TODO: necessary?
            elif isinstance(field, CharField) and field.choices:
                preparations[target] = (
                    lambda value: value[0].upper() if len(value) else None
                )
            elif isinstance(field, MultiPolygonField):
                preparations[target] = parse_footprint
            elif isinstance(field, PointField):
                preparations[target] = parse_point
            elif isinstance(field, IntegerField):
                preparations[target] = parse_integer
            elif isinstance(field, FloatField):
                preparations[target] = parse_float

        count = 0
        with open(path) as f:
            reader = csv.DictReader(f, delimiter="\t")
            while True:
                records = []
                # iterate over the file's rows in chunks
                row = None
                chunk = islice(reader, 5000)
                for row in chunk:
                    record = models.Record(
                        index_file=index_file, location=location
                    )
                    for target, source in mapping:
                        try:
                            value = row[source]
                        except KeyError:
                            raise IngestionError(
                                "Index file '%s' has no such field '%s'."
                                % (index_file_name, source)
                            )
                        preparator = preparations.get(target)
                        if preparator:
                            value = preparator(value)
                        setattr(record, target, value)
                    records.append(record)
                    count += 1

                # exit the loop when the chunk was empty, i.e the last row
                # of the file was read
                if row is None:
                    break
                else:
                    # save the next chunk of models to the DB
                    models.Record.objects.bulk_create(records)
                    logger.debug(
                        "Ingested chunk of %d records. "
                        "Current total %d records" % (len(records), count)
                    )
    except Exception as exc:
        # move the file to the failed directory
        os.rename(
            join(pending_dir, index_file_name),
            join(failed_dir, index_file_name)
        )
        logger.error(
            "Failed to ingest index file %s for %s (%s). Error was: %s"
            % (index_file_name, collection, location.url, exc)
        )
        logger.debug(traceback.format_exc())
        raise IngestionError(
            "Failed to ingest index file %s for %s (%s). Error was: %s"
            % (index_file_name, collection, location.url, exc)
        )
    else:
        # move the file to the ingested directory
        os.rename(
            join(pending_dir, index_file_name),
            join(ingested_dir, index_file_name)
        )
        logger.info(
            "Successfully ingested index file %s for %s (%s) with %d records"
            % (index_file_name, collection, location.url, count)
        )
    return count
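# Hedged usage sketch for ingest(): the file must already sit in the
# "pending" directory, and its name must carry three underscore-separated
# timestamps (begin, end, update) before the first dot, as parsed above.
# All concrete values below, including the timestamp format, are assumptions:
#
#   count = ingest(
#       "Landsat5", "SIP-SCENE", "http://oads.example.com/data",
#       "20100101-000000_20100102-000000_20100103-120000.index"
#   )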
def import_collection(filename, mission=None, file_type=None):
    """ Import a previously exported archive. """
    collections_qs = models.Collection.objects.filter(
        mission=mission, file_type=file_type
    )
    if collections_qs.exists():
        raise ImportException("Collection %s/%s already exists." % (
            mission, file_type
        ))

    if not zipfile.is_zipfile(filename):
        raise ImportException("File %s is not a ZIP file." % filename)

    with closing(zipfile.ZipFile(filename, "r")) as archive:
        manifest = json.loads(archive.read("manifest.json"))

        # TODO: better version check
        mission = mission or manifest["mission"]
        file_type = file_type or manifest["file_type"]

        collection = models.Collection.objects.create(
            mission=mission, file_type=file_type
        )

        if minv.__version__ != manifest["version"]:
            raise ImportException(
                "Cannot import file %s due to version mismatch: %r != %r"
                % (filename, minv.__version__, manifest["version"])
            )

        locations = json.loads(archive.read("locations.json"))
        for url, values in locations.items():
            models.Location.objects.create(
                collection=collection, url=url, location_type=values["type"]
            )

        try:
            archive.extract("collection.conf", collection.config_dir)
        except KeyError:
            pass

        slug_to_location = dict(
            (location.slug, location)
            for location in collection.locations.all()
        )

        # create a temporary directory tree to extract files to
        tmp_dir = tempfile.mkdtemp()

        # extract index files and ingest them
        members = [
            member for member in archive.namelist()
            if member.startswith("locations/")
            and basename(member) != "annotations.csv"
        ]
        try:
            for member in members:
                slug, _, index_filename = (
                    member[len("locations/"):].partition("/")
                )
                url = slug_to_location[slug].url
                directory = join(collection.data_dir, "pending", slug)
                safe_makedirs(directory)
                path = archive.extract(member, tmp_dir)
                move(path, directory)
                ingest(mission, file_type, url, index_filename)
        finally:
            rmtree(tmp_dir)

        # read the annotations
        members = [
            member for member in archive.namelist()
            if member.startswith("locations/")
            and member.endswith("annotations.csv")
        ]
        for member in members:
            slug = member[len("locations/"):].partition("/")[0]
            location = slug_to_location[slug]
            with closing(archive.open(member)) as annotations:
                reader = csv.reader(annotations)
                next(reader)  # skip header
                for record_filename, text in reader:
                    models.Annotation.objects.create(
                        record=models.Record.objects.get(
                            location=location, filename=record_filename
                        ),
                        text=text
                    )

    return collection
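# Hedged usage sketch for import_collection(): mission and file type may be
# omitted, in which case they fall back to the values in the archive's
# manifest. The archive path is an illustrative assumption:
#
#   collection = import_collection("/path/to/export_20160101-120000.zip")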