Ejemplo n.º 1
0
    def put_custom_record(self, environ, coll):
        """Forward the request body to the recorder as a custom WARC record.

        The body is read from ``environ["wsgi.input"]`` and sent with an HTTP
        PUT to ``self.custom_record_path``, formatted with the target url,
        the collection and the fixed record type ``"resource"``.

        :param dict environ: The WSGI environment dictionary for the request
        :param str coll: Collection name substituted into the PUT path
        :return: a JSON ``WbResponse``; ``{"error": "no url"}`` when the
            ``url`` query parameter is missing or empty, otherwise the JSON
            body decoded from the PUT response
        """
        # Drain the input stream; stop once read() returns an empty result.
        # (No per-chunk debug output: request handlers must not write to
        # stdout.)
        chunks = []
        while True:
            buff = environ["wsgi.input"].read()
            if not buff:
                break

            chunks.append(buff)

        data = b"".join(chunks)

        params = dict(parse_qsl(environ.get("QUERY_STRING")))

        rec_type = "resource"

        # The request's content type is passed through, defaulting to text.
        headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}

        target_uri = params.get("url")

        if not target_uri:
            return WbResponse.json_response({"error": "no url"})

        # An optional ``timestamp`` query parameter becomes the WARC-Date.
        timestamp = params.get("timestamp")
        if timestamp:
            headers["WARC-Date"] = timestamp_to_iso_date(timestamp)

        put_url = self.custom_record_path.format(url=target_uri,
                                                 coll=coll,
                                                 rec_type=rec_type)
        res = requests.put(put_url, headers=headers, data=data)

        # Relay whatever JSON the PUT endpoint answered with.
        return WbResponse.json_response(res.json())
Ejemplo n.º 2
0
    def serialize_json_pages(self,
                             pages,
                             id,
                             title,
                             desc=None,
                             has_text=False):
        """Yield a "json-pages-1.0" listing as newline-terminated JSON lines.

        The first line is a header carrying ``format`` and ``id`` plus, when
        truthy, ``title``, ``description`` and ``hasText``. Each following
        line describes one entry of ``pages``.

        :param pages: iterable of mappings with ``url`` and ``timestamp``
            keys and optional ``id``/``page_id``, ``title`` and ``text``
        :param id: identifier written into the header
        :param title: header title, omitted when falsy
        :param desc: header description, omitted when falsy
        :param has_text: when truthy the header gets ``"hasText": true``
        """
        header = {"format": "json-pages-1.0", "id": id}
        for key, value in (("title", title), ("description", desc)):
            if value:
                header[key] = value
        if has_text:
            header["hasText"] = True

        yield json.dumps(header) + "\n"

        for page in pages:
            iso_ts = timestamp_to_iso_date(page["timestamp"])

            # Fall back to a freshly generated id when the page has none.
            page_id = (page.get("id") or page.get("page_id")
                       or shortuuid.uuid())

            entry = {"id": page_id, "url": page["url"], "ts": iso_ts}

            name = page.get("title")
            if name:
                entry["title"] = name

            if "text" in page:
                entry["text"] = page["text"]

            yield json.dumps(entry) + "\n"
Ejemplo n.º 3
0
    def ingest(self, coll, text, params):
        """Index the text extracted from a JSON document.

        ``text`` is parsed as JSON and ``extract_text`` is run over its
        ``"root"`` entry; the pieces it produces are joined with newlines.
        Unless ``self.update_if_dupe`` reports the content as already known,
        a document is POSTed as JSON to ``self.solr_api``.

        :param coll: collection name stored in ``coll_s``
        :param text: JSON text containing a ``"root"`` entry
        :param params: mapping supplying ``url``, ``timestamp`` and
            ``hasScreenshot``
        """
        tree = json.loads(text)
        meta = {}
        body = "\n".join(extract_text(tree["root"], meta))

        page_url = params.get('url')
        raw_ts = params.get('timestamp')
        iso_ts = timestamp_to_iso_date(raw_ts)
        screenshot_flag = params.get('hasScreenshot') == '1'

        # Use the url as the title when extraction did not supply one.
        page_title = meta.get('title') or page_url

        checksum = self.get_digest(body)

        if self.update_if_dupe(checksum, coll, page_url, raw_ts, iso_ts):
            return

        document = {
            'coll_s': coll,
            'title_t': page_title,
            'content_t': body,
            'url_s': page_url,
            'digest_s': checksum,
            'timestamp_ss': raw_ts,
            'timestamp_dts': iso_ts,
            'has_screenshot_b': screenshot_flag,
        }

        requests.post(self.solr_api, json=document)
Ejemplo n.º 4
0
    def create_redirect_record(self,
                               url,
                               redirect_url,
                               timestamp,
                               status='301'):
        """Write an empty-bodied redirect 'response' record and return it.

        :param url: target URI of the record
        :param redirect_url: value of the ``Location`` HTTP header
        :param timestamp: timestamp converted to the ``WARC-Date`` header
        :param status: HTTP status code; ``' Redirect'`` is appended to it
        :return: the record created by (and written with) ``self.writer``
        """
        warc_headers = {'WARC-Date': timestamp_to_iso_date(timestamp)}

        # The redirect carries no body, so Content-Length is '0'.
        body = b''

        status_line = status + ' Redirect'
        http_headers = StatusAndHeaders(
            status_line,
            [('Content-Length', str(len(body))),
             ('Location', redirect_url)],
            protocol='HTTP/1.0')

        record = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(body),
            length=len(body),
            http_headers=http_headers,
            warc_headers_dict=warc_headers)

        self.writer.write_record(record)
        return record
Ejemplo n.º 5
0
    def ingest(self, text, params):
        """POST a document for already-extracted page text to the index.

        No duplicate check is performed here.

        :param bytes text: UTF-8 encoded page text
        :param params: mapping supplying ``url``, ``title``, ``timestamp``,
            ``hasScreenshot``, ``user``, ``coll``, ``rec`` and ``pid``
        """
        body = text.decode('utf-8')

        page_url = params.get('url')
        # The url stands in for a missing title.
        page_title = params.get('title') or page_url

        # Default to the current time when no timestamp was supplied.
        raw_ts = params.get('timestamp') or timestamp_now()
        iso_ts = timestamp_to_iso_date(raw_ts)
        screenshot_flag = params.get('hasScreenshot') == '1'

        checksum = self.get_digest(body)

        document = {
            'user_s': params.get('user'),
            'coll_s': params.get('coll'),
            'rec_s': params.get('rec'),
            'id': params.get('pid'),
            'title_t': page_title,
            'content_t': body,
            'url_s': page_url,
            'digest_s': checksum,
            'timestamp_s': raw_ts,
            'timestamp_dt': iso_ts,
            'has_screenshot_b': screenshot_flag,
        }

        requests.post(self.solr_api, json=document)
Ejemplo n.º 6
0
    def _set_fixed_dt(self, fixed_dt):
        """Normalise ``fixed_dt`` to an ISO date string.

        Matches of ``DATE_TIMESPLIT`` are removed, the remainder is padded
        via ``pad_timestamp(..., PAD_14_DOWN)`` and then converted with
        ``timestamp_to_iso_date``.

        :param fixed_dt: date/time string, or a falsy value
        :return: the ISO date string, or ``None`` when ``fixed_dt`` is falsy
        """
        if fixed_dt:
            stripped = DATE_TIMESPLIT.sub('', fixed_dt)
            padded = pad_timestamp(stripped, PAD_14_DOWN)
            return timestamp_to_iso_date(padded)

        return None
Ejemplo n.º 7
0
    def serialize_cdxj_pages(self, pages):
        """Yield a "cdxj-pages-1.0" listing, one line at a time.

        The first line is ``!meta 0`` followed by a JSON header. Each further
        line is ``<quoted title or url> <iso date> <json>`` for one page.

        :param dict pages: mapping whose values are page mappings with
            ``url`` and ``timestamp`` keys and optional ``title``/``text``
        """
        meta = {'format': 'cdxj-pages-1.0', 'title': 'All Pages'}
        yield '!meta 0 ' + json.dumps(meta)

        for page in pages.values():
            iso_ts = timestamp_to_iso_date(page['timestamp'])

            # The sort key is the title (or url), percent-quoted except for
            # the characters listed in ``safe``.
            label = quote(page.get('title') or page.get('url'),
                          safe=':/?&=')

            fields = {'url': page['url']}
            if 'text' in page:
                fields['text'] = page['text']

            yield ' '.join((label, iso_ts, json.dumps(fields)))
Ejemplo n.º 8
0
    def serialize_json_pages(self, pages):
        """Yield a "json-pages-1.0" listing as newline-terminated JSON lines.

        A fixed header line comes first, followed by one line per page with
        its ``url``, ISO ``ts`` and, when present, ``title`` and ``text``.

        :param dict pages: mapping whose values are page mappings with
            ``url`` and ``timestamp`` keys and optional ``title``/``text``
        """
        header = {'format': 'json-pages-1.0', 'title': 'All Pages'}
        yield json.dumps(header) + "\n"

        for page in pages.values():
            entry = {
                'url': page['url'],
                'ts': timestamp_to_iso_date(page['timestamp']),
            }

            # Optional fields are only emitted when available.
            page_title = page.get('title')
            if page_title:
                entry['title'] = page_title

            if 'text' in page:
                entry['text'] = page['text']

            yield json.dumps(entry) + "\n"
Ejemplo n.º 9
0
    def create_revisit_record(self, original, url, redirect_url, timestamp):
        """Write a revisit record for ``url`` that refers to ``original``.

        The payload digest and the refers-to date are copied from
        ``original.rec_headers``; the HTTP block is a 302 response whose
        ``Location`` is ``redirect_url``.

        :param original: record supplying ``WARC-Payload-Digest`` and
            ``WARC-Date`` through its ``rec_headers``
        :param url: target URI, also used as the refers-to URI
        :param redirect_url: value of the ``Location`` HTTP header
        :param timestamp: timestamp converted to the ``WARC-Date`` header
        """
        warc_headers = {'WARC-Date': timestamp_to_iso_date(timestamp)}

        http_headers = StatusAndHeaders(
            '302 Temp Redirect',
            [('Content-Length', '0'), ('Location', redirect_url)],
            protocol='HTTP/1.0')

        source_headers = original.rec_headers
        revisit = self.writer.create_revisit_record(
            url,
            digest=source_headers['WARC-Payload-Digest'],
            refers_to_uri=url,
            refers_to_date=source_headers['WARC-Date'],
            warc_headers_dict=warc_headers,
            http_headers=http_headers)

        self.writer.write_record(revisit)
Ejemplo n.º 10
0
    def make_transclusion_metadata(self, writer, url, record):
        """Write one 'resource' record per transclusion found for ``url``.

        ``self.transclusion_serializer.find_transclusions`` is queried with
        ``url`` and the Content-Type taken from ``record.rec_headers``. Each
        result is serialised as sorted, indented JSON and written under a
        ``urn:embeds:`` URI; ``self.count`` is incremented and the write is
        logged and passed to ``self.write_logfile``.

        :param writer: WARC writer used to create and write the records
        :param url: page url whose transclusions are looked up
        :param record: record whose ``Content-Type`` header is consulted
        """
        content_type = record.rec_headers.get('Content-Type')
        transclusions = self.transclusion_serializer.find_transclusions(
            url, content_type)

        for embed_target, timestamp, metadata in transclusions:
            embeds_url = 'urn:embeds:' + embed_target

            payload = json.dumps(metadata, indent=2, sort_keys=True)
            payload = payload.encode('utf-8')

            warc_date = timestamp_to_iso_date(timestamp)

            # WARC-Date is the transclusion's timestamp; the creation date
            # comes from the writer.
            warc_headers = {
                'WARC-Date': warc_date,
                'WARC-Creation-Date': writer._make_warc_date()
            }

            warc_content_type = 'application/vnd.youtube-dl_formats+json'

            embed_record = writer.create_warc_record(
                embeds_url,
                'resource',
                payload=BytesIO(payload),
                length=len(payload),
                warc_content_type=warc_content_type,
                warc_headers_dict=warc_headers)

            logging.debug(
                'Writing transclusion metadata at {0}'.format(embeds_url))

            writer.write_record(embed_record)
            self.count += 1

            self.logger.debug('Writing "{0}" ({1}) @ "{2}" from "{3}"'.format(
                embeds_url, warc_content_type, warc_date, '-'))

            log_entry = {
                'file': '-',
                'Record-Type': 'metadata',
                'URL': embeds_url,
                'timestamp': warc_date,
            }
            self.write_logfile(log_entry)
Ejemplo n.º 11
0
    def create_response_record(self, url, timestamp, text):
        """Write a '200 OK' response record holding ``text`` and return it.

        :param url: target URI of the record
        :param timestamp: timestamp converted to the ``WARC-Date`` header
        :param str text: body, stored UTF-8 encoded
        :return: the record created by (and written with) ``self.writer``
        """
        body = text.encode('utf-8')
        size = len(body)

        warc_headers = {'WARC-Date': timestamp_to_iso_date(timestamp)}

        http_headers = StatusAndHeaders('200 OK',
                                        [('Content-Length', str(size))],
                                        protocol='HTTP/1.0')

        record = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(body),
            length=size,
            http_headers=http_headers,
            warc_headers_dict=warc_headers)

        self.writer.write_record(record)
        return record
Ejemplo n.º 12
0
    def create_revisit_record(self, url, timestamp, redirect_url, original_dt):
        """Write a revisit record for ``url`` carrying a 302 redirect.

        :param url: target URI, also used as the refers-to URI
        :param timestamp: timestamp converted to the ``WARC-Date`` header
        :param redirect_url: value of the ``Location`` HTTP header
        :param original_dt: refers-to date of the record being revisited
        """
        # NOTE(review): fixed digest; looks like the base32 SHA-1 of an
        # empty payload - confirm.
        empty_digest = '3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ'

        warc_headers = {'WARC-Date': timestamp_to_iso_date(timestamp)}

        redirect_headers = [
            ('Content-Length', '0'),
            ('Location', redirect_url),
        ]
        http_headers = StatusAndHeaders('302 Temp Redirect',
                                        redirect_headers,
                                        protocol='HTTP/1.0')

        revisit = self.writer.create_revisit_record(
            url,
            digest=empty_digest,
            refers_to_uri=url,
            refers_to_date=original_dt,
            warc_headers_dict=warc_headers,
            http_headers=http_headers)

        self.writer.write_record(revisit)
Ejemplo n.º 13
0
    def generate_metadata(self, res, wacz):
        """Build the data-package metadata for an archive as a JSON string.

        Every member of ``wacz`` is read in full to record its path, hash
        (via ``support_hash_file`` with ``self.hash_type``) and byte length.
        Optional title/description and main-page information are added when
        available, followed by the creation time and version fields.

        :param res: object exposing ``desc``, ``title`` and ``date``; its
            values take precedence over the instance-level ones
        :param wacz: open archive providing ``infolist()`` and ``open()``
            (presumably a ``zipfile.ZipFile`` - confirm against the caller)
        :return: the metadata serialised with ``json.dumps(..., indent=2)``
        """
        # Fetch the member list once and walk it directly instead of
        # re-querying infolist() and indexing on every iteration.
        resources = []
        for info in wacz.infolist():
            with wacz.open(info, "r") as member:
                content = member.read()

            resources.append({
                "path": info.filename,
                "hash": support_hash_file(self.hash_type, content),
                "bytes": len(content),
            })

        package_dict = {"profile": "data-package", "resources": resources}

        # set optional metadata
        desc = res.desc or self.desc
        title = res.title or self.title

        if title:
            package_dict["title"] = title

        if desc:
            package_dict["description"] = desc

        if self.main_url:
            package_dict["mainPageURL"] = self.main_url
            if self.main_ts:
                package_dict["mainPageDate"] = timestamp_to_iso_date(
                    self.main_ts)

        # An explicit date on ``res`` overrides the one derived from main_ts.
        if res.date:
            package_dict["mainPageDate"] = res.date

        # Timezone-aware replacement for the deprecated utcnow(); the
        # formatted value is the same UTC timestamp.
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        package_dict["created"] = now_utc.strftime("%Y-%m-%dT%H:%M:%SZ")

        package_dict["wacz_version"] = WACZ_VERSION

        package_dict["software"] = "py-wacz " + get_py_wacz_version()

        return json.dumps(package_dict, indent=2)
Ejemplo n.º 14
0
    def create_record(cls, writer, url, text, timestamp):
        """Write ``text`` as a plain-text '200 OK' response record.

        :param writer: WARC writer used to create and write the record
        :param url: target URI of the record
        :param str text: body, stored UTF-8 encoded
        :param timestamp: when truthy, converted to the ``WARC-Date`` header
        """
        body = text.encode('utf-8')

        http_headers = StatusAndHeaders(
            '200 OK',
            [
                ('Content-Type', 'text/plain; charset="UTF-8"'),
                ('Content-Length', str(len(body))),
            ],
            protocol='HTTP/1.0')

        # Without a timestamp no WARC-Date is passed to the writer.
        if timestamp:
            warc_headers = {'WARC-Date': timestamp_to_iso_date(timestamp)}
        else:
            warc_headers = {}

        record = writer.create_warc_record(url, 'response',
                                           payload=BytesIO(body),
                                           length=len(body),
                                           http_headers=http_headers,
                                           warc_headers_dict=warc_headers)

        writer.write_record(record)
Ejemplo n.º 15
0
    def put_custom_record(self, environ, coll="$root"):
        """PUT the request body to the recorder as a custom WARC record.

        The body read from ``environ["wsgi.input"]`` is forwarded to
        ``self.put_custom_record_path`` with record type ``"resource"``.
        (Available only when recording)

        :param dict environ: The WSGI environment dictionary for the request
        :param str coll: The name of the collection the record is to be served from
        :return: a JSON ``WbResponse``; a ``400 Bad Request`` error when the
            ``url`` query parameter is missing, otherwise the JSON decoded
            from the PUT response
        """
        # Read the input stream until it reports no more data.
        body_parts = []
        piece = environ["wsgi.input"].read()
        while piece:
            body_parts.append(piece)
            piece = environ["wsgi.input"].read()

        data = b"".join(body_parts)

        query = dict(parse_qsl(environ.get("QUERY_STRING")))

        headers = {"Content-Type": environ.get("CONTENT_TYPE", "text/plain")}

        target_uri = query.get("url")
        if not target_uri:
            return WbResponse.json_response({"error": "no url"},
                                            status="400 Bad Request")

        # An optional timestamp sets the record's WARC-Date.
        timestamp = query.get("timestamp")
        if timestamp:
            headers["WARC-Date"] = timestamp_to_iso_date(timestamp)

        put_url = self.put_custom_record_path.format(url=target_uri,
                                                     coll=coll,
                                                     rec_type="resource")
        upstream = requests.put(put_url, headers=headers, data=data)

        return WbResponse.json_response(upstream.json())
Ejemplo n.º 16
0
    def create_record(cls, writer, url, text, timestamp):
        """Create and write a plain-text '200 OK' response record.

        :param writer: WARC writer used to create and write the record
        :param url: target URI of the record
        :param str text: body, stored UTF-8 encoded
        :param timestamp: when truthy, converted to the ``WARC-Date`` header
        """
        encoded = text.encode('utf-8')
        size = len(encoded)

        content_headers = [
            ('Content-Type', 'text/plain; charset="UTF-8"'),
            ('Content-Length', str(size)),
        ]
        http_headers = StatusAndHeaders('200 OK',
                                        content_headers,
                                        protocol='HTTP/1.0')

        # WARC-Date is only set when a timestamp was supplied.
        extra_warc_headers = {}
        if timestamp:
            extra_warc_headers['WARC-Date'] = timestamp_to_iso_date(timestamp)

        response_record = writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(encoded),
            length=size,
            http_headers=http_headers,
            warc_headers_dict=extra_warc_headers)

        writer.write_record(response_record)
Ejemplo n.º 17
0
    def generate_metadata(self, res):
        """Return the archive metadata as a YAML document.

        ``title`` and ``desc`` come from ``res`` with the instance values as
        fallback and are omitted when empty. ``textIndex`` is set to
        ``PAGE_INDEX`` only when ``res.text`` is truthy. ``pages`` lists
        every entry of ``self.pages`` with its title (or url), ISO date and
        url.

        :param res: object exposing ``desc``, ``title`` and ``text``
        :return: the output of ``yaml.dump`` for the assembled mapping
        """
        desc = res.desc or self.desc
        title = res.title or self.title

        data = {}
        if title:
            data['title'] = title

        if desc:
            data['desc'] = desc

        if res.text and PAGE_INDEX:
            data['textIndex'] = PAGE_INDEX

        page_entries = []
        for page in self.pages.values():
            page_entries.append({
                'title': page.get('title') or page.get('url'),
                'date': timestamp_to_iso_date(page['timestamp']),
                'url': page['url']
            })
        data['pages'] = page_entries

        return yaml.dump(data)
Ejemplo n.º 18
0
    def put_record(self, environ, coll, target_uri_format, rec_type, params, data):
        """PUT ``data`` to the recorder as a record of type ``rec_type``.

        :param dict environ: WSGI environment; supplies ``CONTENT_TYPE``
        :param coll: collection name; ``self.ensure_coll_exists`` is called
            with it first
        :param str target_uri_format: format string with a ``{url}`` field
            producing the record's target URI
        :param rec_type: record type substituted into the PUT path
        :param params: mapping supplying ``url`` and optional ``timestamp``
        :param data: request body passed to ``requests.put``
        :return: a JSON ``WbResponse``; ``{'error': 'no url'}`` when ``url``
            is missing, otherwise the JSON decoded from the PUT response
        """
        self.ensure_coll_exists(coll)

        url = params.get('url')
        if not url:
            return WbResponse.json_response({'error': 'no url'})

        headers = {'Content-Type': environ.get('CONTENT_TYPE', 'text/plain')}

        # An optional timestamp sets the record's WARC-Date.
        timestamp = params.get('timestamp')
        if timestamp:
            headers['WARC-Date'] = timestamp_to_iso_date(timestamp)

        put_url = self.custom_record_path.format(
            url=target_uri_format.format(url=url),
            coll=coll,
            rec_type=rec_type,
        )
        upstream = requests.put(put_url, headers=headers, data=data)

        return WbResponse.json_response(upstream.json())
Ejemplo n.º 19
0
    def _get_protocol_and_headers(self, headerline, parts):
        """Build WARC/1.0 headers from a split header line.

        Lines starting with ``filedesc://`` produce a ``warcinfo`` record,
        everything else a ``response`` record. ``parts`` is paired
        positionally with ``self.headernames``.

        Note: for non-``filedesc://`` lines ``parts[3]`` is overwritten in
        place with the HTTP response content type.

        :param str headerline: the raw header line
        :param list parts: the fields of that line
        :return: tuple ``('WARC/1.0', headers)`` where ``headers`` is a list
            of ``(name, value)`` pairs
        """
        is_warcinfo = headerline.startswith('filedesc://')

        if is_warcinfo:
            rec_type = 'warcinfo'
        else:
            rec_type = 'response'
            parts[3] = 'application/http;msgtype=response'

        headers = [
            ('WARC-Type', rec_type),
            ('WARC-Record-ID', StatusAndHeadersParser.make_warc_id()),
        ]

        for name, value in zip(self.headernames, parts):
            if name == 'WARC-Date':
                value = timestamp_to_iso_date(value)
            elif is_warcinfo and name == 'WARC-Target-URI':
                # warcinfo records carry a filename instead of a target
                # URI: drop the 'filedesc://' prefix.
                name = 'WARC-Filename'
                value = value[len('filedesc://'):]

            headers.append((name, value))

        return ('WARC/1.0', headers)