def update_perma(link_guid):
    """
    Update the vested/dark-archived status of a perma link, and download
    the assets if necessary.

    The link's metadata is always fetched from the upstream server. When no
    local Link exists for the GUID, the asset archive is downloaded,
    extracted into the current working directory, copied into
    default_storage, and a Link and an Asset are created from it. In both
    cases the Link's dark_archived/vested flags are then refreshed from the
    upstream metadata and, if downstream servers are configured, they are
    poked to fetch their own copy.

    Args:
        link_guid: GUID of the link to synchronise.

    Raises:
        requests.HTTPError: if the upstream server answers the metadata or
            the asset request with an error status.
    """
    # N.B. This function has two instances of downloading stuff from
    # the root server using a scheme that looks something like
    #    settings.SERVER + reverse("url_pattern")
    # This is nice because it means we don't have to repeat our URL
    # patterns from urls.py, but it hardcodes the fact that the root
    # server is another Perma instance. It's unclear which is the better
    # fact to abstract, but this is easier for now.

    upstream_address = settings.UPSTREAM_SERVER['address']
    upstream_headers = settings.UPSTREAM_SERVER.get('headers', {})

    ## First, let's get the metadata for this link. The metadata
    ## contains information about where we should place the assets (if
    ## we decide that we need them). This is also a fast check to make
    ## sure the link GUID is actually real.
    metadata_url = upstream_address + reverse("service_link_status", args=(link_guid,))
    metadata_response = requests.get(metadata_url, headers=upstream_headers)
    # Fail loudly on an upstream error instead of trying to parse an error
    # page as JSON.
    metadata_response.raise_for_status()
    metadata = metadata_response.json()

    ## Next, let's see if we need to get the assets. If we have the
    ## Link object for this GUID, we're going to assume we already
    ## have what we need. It would make a little more sense to use the
    ## Asset object here instead, but we're definitely going to need
    ## to do stuff to the Link object so we might as well get that
    ## instead. In practice they should be ~one to one.
    try:
        link = Link.objects.get(guid=link_guid)
    except Link.DoesNotExist:
        # Handled below, outside the except block, so that a failure while
        # downloading is not reported as a chained exception.
        link = None

    if link is None:
        ## We need to download the assets. We can download an archive
        ## from the assets server.
        assets_url = upstream_address + reverse("mirroring:link_assets", args=(link_guid,))

        # Temp paths can be relative because we're in run_in_tempdir()
        temp_zip_path = 'temp.zip'

        # Save remote zip file to disk, using streaming to avoid keeping
        # large files in RAM.
        assets_response = requests.get(assets_url, headers=upstream_headers, stream=True)
        try:
            # Don't write an upstream error page to disk as if it were a zip.
            assets_response.raise_for_status()
            with open(temp_zip_path, 'wb') as zip_file:
                for chunk in assets_response.iter_content(64 * 1024):
                    zip_file.write(chunk)
        finally:
            # A streamed response keeps its connection until it is fully
            # read or closed; always release it.
            assets_response.close()

        ## Extract the archive into the current (temp) directory.
        with zipfile.ZipFile(temp_zip_path, "r") as zipfh:
            zipfh.extractall()  # creates folder named [guid] in current temp dir

        temp_extracted_path = os.path.basename(metadata['path'])  # e.g. "1234-ABCD"
        dest_parent_path = os.path.dirname(metadata['path'])  # e.g. "2014/6/10/18/37"

        # Save all extracted files to default_storage, using the path in metadata.
        for root, _dirs, filenames in os.walk(temp_extracted_path):
            for filename in filenames:
                source_file_path = os.path.join(root, filename)  # e.g. "1234-ABCD/cap.png"
                # e.g. 2014/6/10/18/37/1234-ABCD/cap.png
                dest_file_path = os.path.join(dest_parent_path, source_file_path)
                with open(source_file_path, 'rb') as source_file:
                    default_storage.store_file(source_file, dest_file_path)

        ## We can now get some additional metadata that we'll need to
        ## create the Link object.
        with open(os.path.join(temp_extracted_path, "metadata.json"), "r") as fh:
            link_metadata = json.load(fh)

        ## We now have everything we need to initialize the Link object.
        link = Link(guid=link_guid)
        link.submitted_url = link_metadata["submitted_url"]
        link.submitted_title = link_metadata["submitted_title"]
        link.created_by = None  # XXX maybe we should do something with FakeUser here
        # We need to save this so that we can create an Asset object.
        link.save(pregenerated_guid=True)

        # HACK: the Link has auto_now_add=True, so the first save always
        # stamps the current time; overwrite it and save again.
        link.creation_timestamp = unserialize_datetime(link_metadata["creation_timestamp"])
        link.save()

        ## Lastly, let's create an Asset object for this Link.
        # NOTE(review): if anything fails after the first link.save() above,
        # the next run will find the Link and skip this whole branch,
        # leaving it without an Asset -- confirm whether this should be
        # wrapped in a transaction.
        asset = Asset(link=link)
        asset.base_storage_path = metadata["path"]
        asset.image_capture = metadata["image_capture"]
        asset.warc_capture = metadata["source_capture"]
        asset.pdf_capture = metadata["pdf_capture"]
        asset.text_capture = metadata["text_capture"]
        asset.save()

    ## We can now add some of the data we got from the metadata to the Link object
    link.dark_archived = metadata["dark_archived"]
    link.vested = metadata["vested"]
    link.save()

    # If we have sub-mirrors, poke them to get a copy from us.
    if settings.DOWNSTREAM_SERVERS:
        run_task(poke_mirrors, link_guid=link_guid)