def handle_collection(self, collection, *args, **options):
    for index_file_name in args:
        try:
            # TODO: print number of records ingested
            ingest(
                collection.mission, collection.file_type, options["url"],
                index_file_name
            )
        except Exception as exc:
            raise CommandError(
                "Failed to ingest index file '%s'. Error was: %s"
                % (index_file_name, exc)
            )
def handle_location(self, collection, location):
    tmp_dir = tempfile.mkdtemp()
    shutil.copytree(collection.data_dir, join(tmp_dir, "backup"))
    annotations_file = tempfile.TemporaryFile()

    ingested_dir = join(collection.data_dir, "ingested", location.slug)
    pending_dir = join(collection.data_dir, "pending", location.slug)
    safe_makedirs(ingested_dir)
    safe_makedirs(pending_dir)

    try:
        # move all files from ingested dir to pending dir
        for path in glob.iglob(join(ingested_dir, "*")):
            os.rename(path, join(pending_dir, basename(path)))

        # save all annotations to a CSV
        writer = csv.writer(annotations_file, delimiter="\t")
        writer.writerow(["filename", "annotation"])
        for record in location.records.filter():
            for annotation in record.annotations.all():
                writer.writerow([record.filename, annotation.text])
        annotations_file.seek(0)

        # delete all index file records in database
        location.index_files.all().delete()

        # re-ingest all index files in pending
        for path in glob.iglob(join(pending_dir, "*")):
            ingest(
                collection.mission, collection.file_type, location.url,
                basename(path)
            )

        # restore annotations and remove temporary file
        reader = csv.reader(annotations_file, delimiter="\t")
        next(reader)  # skip header
        for row in reader:
            models.Annotation.objects.create(
                record=location.records.get(filename=row[0]), text=row[1]
            )
    except:
        # restore backups
        shutil.rmtree(collection.data_dir)
        shutil.move(join(tmp_dir, "backup"), collection.data_dir)
        raise
    finally:
        shutil.rmtree(tmp_dir)
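# Illustrative content of the temporary annotations file written above
# (tab-separated, a header row followed by one row per annotation; the
# filenames and annotation texts below are placeholders, not real data):
#
#   filename            annotation
#   index_file_1.txt    flagged for manual check
#   index_file_1.txt    re-ingested after gap report
#   index_file_2.txt    duplicate of index_file_1.txt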
def _harvest_locked(collection, url, reschedule):
    location = collection.locations.get(url=url)

    if location.location_type == "oads":
        harvester = OADSHarvester(location)
    elif location.location_type == "nga":
        harvester = NGAHarvester(location)
    else:
        raise HarvestingError(
            "Invalid location type '%s'." % location.location_type
        )

    # scan the source
    logger.debug("Scanning location %s." % location)
    available_index_files = harvester.scan()
    logger.debug("Successfully scanned location %s." % location)

    # categorize files into newly inserted, updated and deleted index files;
    # "updated" is consumed below as (old_filename, new_filename) pairs
    inserted, updated, deleted = select_index_files(
        (base for base, url in available_index_files),
        location.index_files.values_list("filename", flat=True)
    )

    # directories for index files
    pending_dir = join(collection.data_dir, "pending", location.slug)
    ingested_dir = join(collection.data_dir, "ingested", location.slug)
    safe_makedirs(pending_dir)
    safe_makedirs(ingested_dir)

    failed_retrieve = []
    updated_to_retrieve = [u[1] for u in updated]

    # perform actual harvesting
    for index_file_name in itertools.chain(inserted, updated_to_retrieve):
        try:
            harvester.retrieve(
                join(url, index_file_name), index_file_name, pending_dir
            )
            logger.debug("Retrieved %s." % index_file_name)
        except Exception:
            failed_retrieve.append(index_file_name)
            logger.debug("Failed to retrieve %s." % index_file_name)

    updated_to_delete = [u[0] for u in updated]

    # delete index files that are deleted or updated
    for index_file_name in itertools.chain(updated_to_delete, deleted):
        # delete model
        location.index_files.get(filename=index_file_name).delete()
        # remove ingested index file
        os.remove(join(ingested_dir, index_file_name))

    failed_ingest = []

    # finally ingest the updated and newly inserted index files
    for index_file_name in itertools.chain(updated_to_retrieve, inserted):
        try:
            index_file_name = extract_zipped_index_file(
                join(pending_dir, index_file_name)
            )
            ingest(
                collection.mission, collection.file_type, url,
                basename(index_file_name)
            )
            logger.debug("Ingested %s." % basename(index_file_name))
        except Exception:
            failed_ingest.append(basename(index_file_name))
            logger.debug("Failed to ingest %s." % basename(index_file_name))

    logger.info("Finished harvesting for %s: %s" % (collection, location))

    if failed_retrieve:
        logger.error("Failed to retrieve %s" % ", ".join(failed_retrieve))
    if failed_ingest:
        logger.error("Failed to ingest %s" % ", ".join(failed_ingest))

    # if this was a scheduled harvest, reschedule it again
    if reschedule:
        try:
            interval = collection.configuration.harvest_interval
            schedule("harvest", now() + interval, {
                "mission": collection.mission,
                "file_type": collection.file_type,
                "url": location.url,
                "reschedule": True
            })
        except Exception as exc:
            logger.error(
                "Failed to reschedule harvest for %s %s. Error was '%s'." % (
                    collection, location, exc
                )
            )

    return failed_retrieve, failed_ingest
def import_collection(filename, mission=None, file_type=None):
    """ Import a previously exported archive. """
    collections_qs = models.Collection.objects.filter(
        mission=mission, file_type=file_type
    )
    if collections_qs.exists():
        raise ImportException("Collection %s/%s already exists." % (
            mission, file_type
        ))

    if not zipfile.is_zipfile(filename):
        raise ImportException("File %s is not a ZIP file." % filename)

    with closing(zipfile.ZipFile(filename, "r")) as archive:
        manifest = json.loads(archive.read("manifest.json"))

        # TODO: better version check
        mission = mission or manifest["mission"]
        file_type = file_type or manifest["file_type"]

        collection = models.Collection.objects.create(
            mission=mission, file_type=file_type
        )

        if minv.__version__ != manifest["version"]:
            raise ImportException(
                "Cannot import file %s due to version mismatch: %r != %r"
                % (filename, minv.__version__, manifest["version"])
            )

        locations = json.loads(archive.read("locations.json"))
        for url, values in locations.items():
            models.Location.objects.create(
                collection=collection, url=url, location_type=values["type"]
            )

        try:
            archive.extract("collection.conf", collection.config_dir)
        except KeyError:
            pass

        slug_to_location = dict(
            (location.slug, location)
            for location in collection.locations.all()
        )

        # create a temporary directory tree to extract files to
        tmp_dir = tempfile.mkdtemp()

        # extract index files and ingest them
        members = [
            member for member in archive.namelist()
            if member.startswith("locations/")
            and basename(member) != "annotations.csv"
        ]
        try:
            for member in members:
                slug, _, index_filename = member[10:].partition("/")
                url = slug_to_location[slug].url
                directory = join(collection.data_dir, "pending", slug)
                safe_makedirs(directory)
                path = archive.extract(member, tmp_dir)
                move(path, directory)
                ingest(mission, file_type, url, index_filename)
        finally:
            rmtree(tmp_dir)

        # read annotations
        members = [
            member for member in archive.namelist()
            if member.startswith("locations/")
            and member.endswith("annotations.csv")
        ]
        for member in members:
            slug, _, index_filename = member[10:].partition("/")
            location = slug_to_location[slug]
            with closing(archive.open(member)) as annotations:
                reader = csv.reader(annotations)
                next(reader)  # skip header
                for record_filename, text in reader:
                    models.Annotation.objects.create(
                        record=models.Record.objects.get(
                            location=location, filename=record_filename
                        ),
                        text=text
                    )

    return collection
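# Hypothetical usage sketch (not part of the original module; the archive path,
# mission and file type below are placeholders for illustration only): restore
# a previously exported archive into a new collection, overriding the values
# recorded in the archive's manifest.
def _example_import_collection():
    collection = import_collection(
        "/tmp/exported_collection.zip",  # placeholder path to an exported archive
        mission="Landsat5",              # placeholder mission identifier
        file_type="SIP-SCENE",           # placeholder file type
    )
    return collection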