Example #1
    def handle_collection(self, collection, *args, **options):
        for index_file_name in args:
            try:
                # TODO: print number of records ingested
                ingest(collection.mission, collection.file_type,
                       options["url"], index_file_name)
            except Exception as exc:
                raise CommandError(
                    "Failed to ingest index file '%s'. Error was: %s" %
                    (index_file_name, exc))
Example #2
    def handle_location(self, collection, location):
        tmp_dir = tempfile.mkdtemp()
        shutil.copytree(collection.data_dir, join(tmp_dir, "backup"))
        annotations_file = tempfile.TemporaryFile()

        ingested_dir = join(collection.data_dir, "ingested", location.slug)
        pending_dir = join(collection.data_dir, "pending", location.slug)

        safe_makedirs(ingested_dir)
        safe_makedirs(pending_dir)

        try:
            # move all files from ingested dir to pending dir
            for path in glob.iglob(join(ingested_dir, "*")):
                os.rename(path, join(pending_dir, basename(path)))

            # save all annotations to a CSV
            writer = csv.writer(annotations_file, delimiter="\t")
            writer.writerow(["filename", "annotation"])

            for record in location.records.filter():
                for annotation in record.annotations.all():
                    writer.writerow([record.filename, annotation.text])

            annotations_file.seek(0)

            # delete all index file records in database
            location.index_files.all().delete()

            # re-ingest all index files in pending
            for path in glob.iglob(join(pending_dir, "*")):
                ingest(
                    collection.mission, collection.file_type, location.url,
                    basename(path)
                )

            # restore annotations from the temporary CSV file
            reader = csv.reader(annotations_file, delimiter="\t")
            next(reader)  # skip header
            for row in reader:
                models.Annotation.objects.create(
                    record=location.records.get(filename=row[0]),
                    text=row[1]
                )

        except:
            # restore backups
            shutil.rmtree(collection.data_dir)
            shutil.move(join(tmp_dir, "backup"), collection.data_dir)
            raise
        finally:
            shutil.rmtree(tmp_dir)
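
Several of these examples call a safe_makedirs helper that the snippets do not define. A minimal sketch, assuming it simply wraps os.makedirs and tolerates directories that already exist:

import errno
import os

def safe_makedirs(path):
    # create the full directory tree, ignoring only the error raised
    # when the path already exists (assumed behavior)
    try:
        os.makedirs(path)
    except OSError as exc:
        if exc.errno != errno.EEXIST:
            raise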
Example #3
def _harvest_locked(collection, url, reschedule):
    location = collection.locations.get(url=url)

    if location.location_type == "oads":
        harvester = OADSHarvester(location)
    elif location.location_type == "nga":
        harvester = NGAHarvester(location)
    else:
        raise HarvestingError(
            "Invalid location type '%s'." % location.location_type
        )

    # scan the source
    logger.debug("Scanning location %s." % location)
    available_index_files = harvester.scan()
    logger.debug("Successfully scanned location %s." % location)

    # categorize files
    inserted, updated, deleted = select_index_files(
        (base for base, url in available_index_files),
        location.index_files.values_list("filename", flat=True)
    )

    # directories for index files
    pending_dir = join(collection.data_dir, "pending", location.slug)
    ingested_dir = join(collection.data_dir, "ingested", location.slug)

    safe_makedirs(pending_dir)
    safe_makedirs(ingested_dir)

    failed_retrieve = []

    updated_to_retrieve = [u[1] for u in updated]

    # perform actual harvesting
    for index_file_name in itertools.chain(inserted, updated_to_retrieve):
        try:
            harvester.retrieve(
                join(url, index_file_name), index_file_name, pending_dir
            )
            logger.debug("Retrieved %s." % index_file_name)
        except Exception:
            failed_retrieve.append(index_file_name)
            logger.debug("Failed to retrieve %s." % index_file_name)

    updated_to_delete = [u[0] for u in updated]

    # delete index files that are deleted or updated
    for index_file_name in itertools.chain(updated_to_delete, deleted):
        # delete model
        location.index_files.get(filename=index_file_name).delete()
        # remove ingested index file
        os.remove(join(ingested_dir, index_file_name))

    failed_ingest = []

    # finally ingest the updated and newly inserted index files
    for index_file_name in itertools.chain(updated_to_retrieve, inserted):
        try:
            index_file_name = extract_zipped_index_file(
                join(pending_dir, index_file_name)
            )

            ingest(
                collection.mission, collection.file_type, url,
                basename(index_file_name)
            )
            logger.debug("Ingested %s." % basename(index_file_name))
        except Exception:
            failed_ingest.append(basename(index_file_name))
            logger.debug("Failed to ingest %s." % basename(index_file_name))

    logger.info("Finished harvesting for %s: %s" % (collection, location))
    if failed_retrieve:
        logger.error("Failed to retrieve %s" % ", ".join(failed_retrieve))
    if failed_ingest:
        logger.error("Failed to ingest %s" % ", ".join(failed_ingest))

    # if this was a scheduled harvest, reschedule it again
    if reschedule:
        try:
            interval = collection.configuration.harvest_interval
            schedule("harvest", now() + interval, {
                "mission": collection.mission,
                "file_type": collection.file_type,
                "url": location.url,
                "reschedule": True
            })
        except Exception as exc:
            logger.error(
                "Failed to reschedule harvest for %s %s. Error was '%s'." % (
                    collection, location, exc
                )
            )

    return failed_retrieve, failed_ingest
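
The categorization step above relies on a select_index_files helper that is not shown. A minimal sketch under one plausible reading, assuming a name present in both listings marks a potential update and is returned as an (existing_name, new_name) pair, which is what the delete-then-retrieve logic above expects; the real helper may instead compare timestamps encoded in the filenames:

def select_index_files(available, existing):
    # split index file names into newly inserted, updated and deleted;
    # assumption: a name found in both listings is a potential update
    available = set(available)
    existing = set(existing)
    inserted = available - existing
    deleted = existing - available
    updated = [(name, name) for name in available & existing]
    return inserted, updated, deleted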
Example #4
def import_collection(filename, mission=None, file_type=None):
    """ Import a previously exported archive.
    """
    collections_qs = models.Collection.objects.filter(
        mission=mission, file_type=file_type
    )
    if collections_qs.exists():
        raise ImportException("Collection %s/%s already exists." % (
            mission, file_type
        ))

    if not zipfile.is_zipfile(filename):
        raise ImportException("File %s is not a ZIP file." % filename)

    with closing(zipfile.ZipFile(filename, "r")) as archive:
        manifest = json.loads(archive.read("manifest.json"))
        # TODO: better version check

        mission = mission or manifest["mission"]
        file_type = file_type or manifest["file_type"]

        # check the version before creating any database objects, so a
        # mismatch does not leave an orphaned collection behind
        if minv.__version__ != manifest["version"]:
            raise ImportException(
                "Cannot import file %s due to version mismatch: %r != %r"
                % (filename, minv.__version__, manifest["version"])
            )

        collection = models.Collection.objects.create(
            mission=mission, file_type=file_type
        )

        locations = json.loads(archive.read("locations.json"))

        for url, values in locations.items():
            models.Location.objects.create(
                collection=collection, url=url, location_type=values["type"]
            )

        try:
            archive.extract("collection.conf", collection.config_dir)
        except KeyError:
            pass

        slug_to_location = dict(
            (location.slug, location)
            for location in collection.locations.all()
        )

        # create a temporary directory tree to extract files to
        tmp_dir = tempfile.mkdtemp()

        # extract index files and ingest them
        members = [
            member for member in archive.namelist()
            if member.startswith("locations/") and
            basename(member) != "annotations.csv"
        ]
        try:
            for member in members:
                slug, _, index_filename = member[10:].partition("/")
                url = slug_to_location[slug].url

                directory = join(collection.data_dir, "pending", slug)
                safe_makedirs(directory)

                path = archive.extract(member, tmp_dir)
                move(path, directory)
                ingest(mission, file_type, url, index_filename)
        finally:
            rmtree(tmp_dir)

        # read annotations
        members = [
            member for member in archive.namelist()
            if member.startswith("locations/") and
            member.endswith("annotations.csv")
        ]
        for member in members:
            slug, _, index_filename = member[10:].partition("/")
            location = slug_to_location[slug]
            with closing(archive.open(member)) as annotations:
                reader = csv.reader(annotations)
                next(reader)  # skip header
                for record_filename, text in reader:
                    models.Annotation.objects.create(
                        record=models.Record.objects.get(
                            location=location, filename=record_filename
                        ),
                        text=text
                    )

    return collection
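
A hedged usage sketch of the importer; the archive path, mission and file type below are made-up values for illustration:

try:
    collection = import_collection(
        "/data/exports/archive.zip",  # hypothetical path
        mission="Envisat", file_type="SAR"  # hypothetical values
    )
except ImportException as exc:
    print("Import failed: %s" % exc)
else:
    print("Imported %s" % collection)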