Ejemplo n.º 1
0
    def export_warc(self):
        """Build this link's WARC file from its old-style Asset captures.

        Returns without doing anything when the link has no Asset, or when the
        file named by ``self.warc_storage_file()`` already exists in default
        storage. Otherwise a WARC is opened for writing and filled with:

        * the PDF capture or, failing that, the image capture, written as a
          resource record -- only when the stored name contains ``'cap'`` or
          ``'upload'``;
        * the WARC capture: the raw contents of ``archive.warc.gz``, or one
          resource record per file found under the asset's ``source``
          directory when the capture is ``source/index.html``.

        Finally the WARC is closed and the link's existing CDX lines are
        deleted.

        Returns:
            None.
        """
        # by using select_for_update and checking for existence of this file,
        # we make sure that we won't accidentally try to create the file multiple
        # times in parallel.
        asset = self.assets.select_for_update().first()
        if not asset:
            return  # this is not an old-style Link
        if default_storage.exists(self.warc_storage_file()):
            return

        guid = self.guid
        out = self.open_warc_for_writing()

        def write_resource_record(file_path, url, content_type):
            # Write one stored file into the WARC as a resource record, dated
            # with the creation time reported by storage. The source handle is
            # closed as soon as the record has been written, and whatever the
            # record writer returns is handed back to the caller.
            with default_storage.open(file_path) as source:
                return self.write_warc_resource_record(
                    source,
                    url.encode('utf8'),
                    content_type,
                    default_storage.created_time(file_path),
                    out)

        def write_metadata_record(metadata, target_headers):
            # Write a metadata record tied to the record described by
            # `target_headers`, an iterable of (name, value) pairs.
            # Not called at present: its only call site is disabled (see the
            # TODO in the PDF branch below).
            def header(name):
                # First value stored under `name`; raises StopIteration when
                # the header is missing.
                return next(v for k, v in target_headers if k == name)

            concurrent_to = header(warctools.WarcRecord.ID)
            warc_date = header(warctools.WarcRecord.DATE)
            url = header(warctools.WarcRecord.URL)
            self.write_warc_metadata_record(metadata, url, concurrent_to, warc_date, out)

        # write PDF capture
        if asset.pdf_capture and ('cap' in asset.pdf_capture or 'upload' in asset.pdf_capture):
            file_path = os.path.join(asset.base_storage_path, asset.pdf_capture)
            write_resource_record(file_path, "file:///%s/%s" % (guid, asset.pdf_capture), 'application/pdf')
            # TODO: the metadata record for the PDF is disabled. To enable it,
            # keep the value returned by write_resource_record above and call
            #   write_metadata_record({'role':'primary', 'user_upload':asset.user_upload}, headers)
            # NOTE(review): assumes write_warc_resource_record returns the
            # record's headers -- confirm before enabling.

        # write image capture (if it's not a PDF thumbnail)
        elif (asset.image_capture and ('cap' in asset.image_capture or 'upload' in asset.image_capture)):
            file_path = os.path.join(asset.base_storage_path, asset.image_capture)
            mime_type = get_mime_type(asset.image_capture)
            write_resource_record(file_path, "file:///%s/%s" % (guid, asset.image_capture), mime_type)

        if asset.warc_capture:
            # write WARC capture
            if asset.warc_capture == 'archive.warc.gz':
                file_path = os.path.join(asset.base_storage_path, asset.warc_capture)
                # Close the source archive once its bytes have been written out.
                with default_storage.open(file_path) as source:
                    self.write_warc_raw_data(source, out)

            # write wget capture
            elif asset.warc_capture == 'source/index.html':
                mime = MimeTypes()
                for root, _dirs, files in default_storage.walk(os.path.join(asset.base_storage_path, 'source')):
                    # Whatever follows the first occurrence of the asset's base
                    # path in `root`; used to build the per-file record URL.
                    rel_path = root.split(asset.base_storage_path, 1)[-1]
                    for file_name in files:
                        mime_type = mime.guess_type(file_name)[0]
                        write_resource_record(os.path.join(root, file_name),
                                              "file:///%s%s/%s" % (guid, rel_path, file_name), mime_type)

        self.close_warc_after_writing(out)

        # regenerate CDX index
        # NOTE(review): only the deletion happens here; presumably the lines
        # are rebuilt elsewhere -- confirm.
        self.cdx_lines.all().delete()
Ejemplo n.º 2
0
def migrate_assets(apps, schema_editor):
    """Re-point CDXLines at their Link and create Captures from every Asset.

    First, each CDXLine gets ``link_id`` copied from its asset's ``link_id``
    and is saved. Then every Asset is turned into Capture rows:

    * one for ``pdf_capture`` if it is set, otherwise one for
      ``image_capture`` if that is set;
    * one for ``warc_capture`` if it is set, independently of the above.

    A capture's status is ``'success'`` when the stored value looks like a
    finished capture (PDF: ends with ``.pdf``; image: contains ``cap`` or
    ``upload``; WARC: is ``archive.warc.gz`` or ``source/index.html``),
    ``'pending'`` when the value is the literal ``'pending'``, and
    ``'failed'`` otherwise. ``url`` is only set for successful captures.

    Captures are accumulated and inserted with ``bulk_create`` whenever more
    than 1000 are waiting, plus once more at the end for the remainder.

    Args:
        apps: object whose ``get_model(app_label, model_name)`` returns the
            ``CDXLine``, ``Asset`` and ``Capture`` models of the ``perma`` app.
        schema_editor: unused.

    Returns:
        None.
    """
    def capture_status(value, succeeded):
        # Anything that did not succeed is either still in progress (the
        # literal 'pending') or counts as failed.
        if succeeded:
            return 'success'
        return 'pending' if value == 'pending' else 'failed'

    # Update CDXLines to point to Link instead of Asset
    CDXLine = apps.get_model("perma", "CDXLine")
    # Parenthesised single-argument print: same output on Python 2, and
    # valid syntax on Python 3.
    print("Migrating CDXLines.")
    for line in CDXLine.objects.all().select_related('asset'):
        line.link_id = line.asset.link_id
        line.save()

    # Create Captures
    Asset = apps.get_model("perma", "Asset")
    Capture = apps.get_model("perma", "Capture")
    print("Migrating Assets.")
    obj_cache = []
    for i, asset in enumerate(Asset.objects.select_related('link').all()):
        # Progress marker on the first asset and every 1000th after it.
        if i % 1000 == 0:
            print(".")

        if asset.pdf_capture:
            pdf = asset.pdf_capture
            status = capture_status(pdf, pdf.endswith('.pdf'))
            obj_cache.append(Capture(
                link_id=asset.link_id,
                role='primary',
                status=status,
                url="file:///%s/%s" % (asset.link_id, pdf) if status == 'success' else None,
                record_type="resource",
                content_type="application/pdf",
                user_upload="upload" in pdf,
            ))

        elif asset.image_capture:
            image = asset.image_capture
            upload = "upload" in image
            status = capture_status(image, 'cap' in image or upload)
            obj_cache.append(Capture(
                link_id=asset.link_id,
                # An uploaded image is the primary capture; otherwise the
                # image is recorded as a screenshot.
                role='primary' if upload else 'screenshot',
                status=status,
                url="file:///%s/%s" % (asset.link_id, image) if status == 'success' else None,
                record_type="resource",
                content_type=get_mime_type(image) or '',
                user_upload=upload,
            ))

        if asset.warc_capture:
            warc = asset.warc_capture
            is_warc = warc == 'archive.warc.gz'
            status = capture_status(warc, is_warc or warc == 'source/index.html')
            url = None
            if status == 'success':
                # A real WARC is recorded against the originally submitted
                # URL; the other successful form points at the stored
                # index.html.
                url = asset.link.submitted_url if is_warc else "file:///%s/source/index.html" % asset.link_id
            obj_cache.append(Capture(
                link_id=asset.link_id,
                role='primary',
                status=status,
                url=url,
                record_type="response" if is_warc else "resource",
                content_type="text/html",
            ))

        # Flush periodically so the pending list stays small.
        if len(obj_cache) > 1000:
            Capture.objects.bulk_create(obj_cache)
            obj_cache = []

    Capture.objects.bulk_create(obj_cache)
Ejemplo n.º 3
0
def migrate_assets(apps, schema_editor):
    """Re-point CDXLines at their Link and create Captures from every Asset.

    First, each CDXLine gets ``link_id`` copied from its asset's ``link_id``
    and is saved. Then every Asset is turned into Capture rows:

    * one for ``pdf_capture`` if it is set, otherwise one for
      ``image_capture`` if that is set;
    * one for ``warc_capture`` if it is set, independently of the above.

    A capture's status is ``'success'`` when the stored value looks like a
    finished capture (PDF: ends with ``.pdf``; image: contains ``cap`` or
    ``upload``; WARC: is ``archive.warc.gz`` or ``source/index.html``),
    ``'pending'`` when the value is the literal ``'pending'``, and
    ``'failed'`` otherwise. ``url`` is only set for successful captures.

    Captures are accumulated and inserted with ``bulk_create`` whenever more
    than 1000 are waiting, plus once more at the end for the remainder.

    Args:
        apps: object whose ``get_model(app_label, model_name)`` returns the
            ``CDXLine``, ``Asset`` and ``Capture`` models of the ``perma`` app.
        schema_editor: unused.

    Returns:
        None.
    """
    def capture_status(value, succeeded):
        # Anything that did not succeed is either still in progress (the
        # literal 'pending') or counts as failed.
        if succeeded:
            return 'success'
        return 'pending' if value == 'pending' else 'failed'

    # Update CDXLines to point to Link instead of Asset
    CDXLine = apps.get_model("perma", "CDXLine")
    # Parenthesised single-argument print: same output on Python 2, and
    # valid syntax on Python 3.
    print("Migrating CDXLines.")
    for line in CDXLine.objects.all().select_related('asset'):
        line.link_id = line.asset.link_id
        line.save()

    # Create Captures
    Asset = apps.get_model("perma", "Asset")
    Capture = apps.get_model("perma", "Capture")
    print("Migrating Assets.")
    obj_cache = []
    for i, asset in enumerate(Asset.objects.select_related('link').all()):
        # Progress marker on the first asset and every 1000th after it.
        if i % 1000 == 0:
            print(".")

        if asset.pdf_capture:
            pdf = asset.pdf_capture
            status = capture_status(pdf, pdf.endswith('.pdf'))
            obj_cache.append(
                Capture(
                    link_id=asset.link_id,
                    role='primary',
                    status=status,
                    url="file:///%s/%s" % (asset.link_id, pdf)
                    if status == 'success' else None,
                    record_type="resource",
                    content_type="application/pdf",
                    user_upload="upload" in pdf,
                ))

        elif asset.image_capture:
            image = asset.image_capture
            upload = "upload" in image
            status = capture_status(image, 'cap' in image or upload)
            obj_cache.append(
                Capture(
                    link_id=asset.link_id,
                    # An uploaded image is the primary capture; otherwise the
                    # image is recorded as a screenshot.
                    role='primary' if upload else 'screenshot',
                    status=status,
                    url="file:///%s/%s" % (asset.link_id, image)
                    if status == 'success' else None,
                    record_type="resource",
                    content_type=get_mime_type(image) or '',
                    user_upload=upload,
                ))

        if asset.warc_capture:
            warc = asset.warc_capture
            is_warc = warc == 'archive.warc.gz'
            status = capture_status(warc, is_warc or warc == 'source/index.html')
            url = None
            if status == 'success':
                # A real WARC is recorded against the originally submitted
                # URL; the other successful form points at the stored
                # index.html.
                url = asset.link.submitted_url if is_warc else "file:///%s/source/index.html" % asset.link_id
            obj_cache.append(
                Capture(
                    link_id=asset.link_id,
                    role='primary',
                    status=status,
                    url=url,
                    record_type="response" if is_warc else "resource",
                    content_type="text/html",
                ))

        # Flush periodically so the pending list stays small.
        if len(obj_cache) > 1000:
            Capture.objects.bulk_create(obj_cache)
            obj_cache = []

    Capture.objects.bulk_create(obj_cache)