def write_playlist(page, timestamp, xpath, videos, filenames):
    """Record an embedded playlist and its media files as WARC records.

    Writes one metadata record against ``page`` listing every playlist URL,
    then one resource record per local file. Records are written through
    the module-level ``warcwriter``.

    Arguments:
    page -- URL used for the playlist metadata record.
    timestamp -- Value stored in the "embedding-timestamp" field.
    xpath -- Value stored in the "embedded-playlist-xpath" field.
    videos -- Comma-separated video URLs.
    filenames -- Comma-separated local file paths, one per URL, same order.

    Returns None. Prints an error and stops early if the URL and file
    counts differ, or as soon as an empty file is encountered.
    """
    urls = videos.split(",")
    files = filenames.split(",")
    if len(urls) != len(files):
        print("ERROR: Incorrect number of videos/filenames passed.")
        return

    headers = [
        (WarcRecord.TYPE, WarcRecord.METADATA),
        (WarcRecord.URL, page),
        (WarcRecord.DATE, warc_datetime_str(datetime.now())),
        (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
        (WarcRecord.CONTENT_TYPE, "text/plain"),
    ]
    lines = [
        "embedded-playlist-item-%s: %s\n" % (index, url)
        for index, url in enumerate(urls)
    ]
    lines.append(
        "embedding-timestamp: %s\nembedded-playlist-xpath: %s" % (timestamp, xpath)
    )
    warcwriter.write_record(headers, "text/plain", "".join(lines))

    for url, path in zip(urls, files):
        with open(path, "rb") as handle:
            data = handle.read()
        # A single truthiness test covers the empty case; the previous
        # "len(data) == 0 or data is None" could never reach its None check.
        if not data:
            print("ERROR: %s" % path)
            return
        mime, _encoding = mimetypes.guess_type(path)
        if mime is None:
            # guess_type() returns None for unknown extensions; avoid
            # emitting a None Content-Type by using the generic binary type.
            mime = "application/octet-stream"
        mtime = os.stat(path).st_mtime
        headers = [
            (WarcRecord.TYPE, WarcRecord.RESOURCE),
            (WarcRecord.URL, url),
            (WarcRecord.DATE, warc_datetime_str(datetime.fromtimestamp(mtime))),
            (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
            (WarcRecord.CONTENT_TYPE, mime),
        ]
        warcwriter.write_record(headers, mime, data)
def getvideo(page, timestamp=None):
    """Archive the videos referenced by ``page`` as WARC records.

    The page is fetched live, or through WAYBACK when ``timestamp`` is
    given. For every "externalId" value found in the page body, the media
    document at BBC_MEDIA/<externalId> is requested via
    try_archived_version() and the URL chosen by getbestvideo() is stored:
    as an HTTP response record when it starts with "http", otherwise as a
    "video/mp4" resource record obtained from streamvideo().

    Arguments:
    page -- URL of the embedding page.
    timestamp -- Optional timestamp string; when omitted the current time
                 is used, formatted as %Y%m%d%H%M%S.

    Returns None. Stops processing (remaining videos included) when a
    media or video request is not ok; skips a single video when streaming
    yields no data.
    """
    if timestamp is None:
        r = requests.get(page)
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    else:
        r = requests.get("%s/%s/%s" % (WAYBACK, timestamp, page))
    root = etree.fromstring(r.content, etree.HTMLParser())

    external_ids = re.findall("\"externalId\":\"([^\"]+)\"", r.content)
    for index, externalid in enumerate(external_ids):
        video_uuid = "<urn:uuid:%s>" % uuid.uuid1()
        media_url = "%s/%s" % (BBC_MEDIA, externalid)
        media = try_archived_version(media_url, timestamp)
        if not media.ok:
            print("ERROR: Couldn't find media; %s" % media_url)
            return

        # Only record the media response when it did not come from WAYBACK.
        if WAYBACK not in media.url:
            headers = [
                (WarcRecord.TYPE, WarcRecord.RESPONSE),
                (WarcRecord.URL, media.url),
                (WarcRecord.DATE, warc_datetime_str(parser.parse(media.headers["date"]))),
                (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
                (WarcRecord.CONCURRENT_TO, video_uuid),
                (WarcRecord.CONTENT_TYPE, "application/http; msgtype=response"),
            ]
            block = "".join([httpheaders(media.raw._original_response), media.content])
            warcwriter.write_record(headers, "application/http; msgtype=response", block)

        url = getbestvideo(etree.fromstring(media.content))
        print(url)
        if url.startswith("http"):
            video = requests.get(url)
            if not video.ok:
                print("ERROR: Couldn't find video; %s" % url)
                return
            video_url = video.url
            video_date = warc_datetime_str(parser.parse(video.headers["date"]))
            video_type = WarcRecord.RESPONSE
            content_type = "application/http; msgtype=response"
            videoblock = "".join([httpheaders(video.raw._original_response), video.content])
            writemetadata(video_url, video_uuid, base64.b64encode(r.content), index, page)
        else:
            video_url = url
            video_date = warc_datetime_str(datetime.now())
            video_type = WarcRecord.RESOURCE
            content_type = "video/mp4"
            # The original serialised the builtin ``object`` type, which lxml
            # cannot do. Serialise the index-th embedding <object> element,
            # i.e. the one writemetadata() names in its "element-xpath" field.
            # NOTE(review): assumes externalId order matches <object> order.
            matches = root.xpath(
                "(//object[param[@name='externalIdentifier']])[%i]" % (index + 1)
            )
            if matches:
                markup = etree.tostring(matches[0]).strip()
            else:
                # No matching element: store the whole page, as the HTTP
                # branch above does.
                markup = r.content
            writemetadata(video_url, video_uuid, base64.b64encode(markup), index, page)
            videoblock = streamvideo(video_url)
            # Truthiness covers both None and empty without calling len(None).
            if not videoblock:
                print("ERROR: Couldn't stream video; %s" % video_url)
                continue

        headers = [
            (WarcRecord.TYPE, video_type),
            (WarcRecord.URL, video_url),
            (WarcRecord.DATE, video_date),
            (WarcRecord.ID, video_uuid),
            (WarcRecord.CONTENT_TYPE, content_type),
        ]
        warcwriter.write_record(headers, content_type, videoblock)
def callback(warcwriter, body):
    """Parses messages, writing results to disk.

    Bodies starting with "{" are parsed by handle_json_message(), anything
    else by handle_pipe_message(). The selectors are POSTed as JSON to the
    web service; on a 200 response the returned HAR is passed to the
    message's URL handler and then written as a WARC metadata record.

    Arguments:
    warcwriter -- A python-warcwriterpool instance.
    body -- The incoming message body.

    Returns None. Any exception is logged rather than propagated.
    """
    try:
        logger.debug("Message received: %s." % body)
        parse = handle_json_message if body.startswith("{") else handle_pipe_message
        url, handler_id, selectors, url_handler = parse(body)

        ws = "%s/%s" % (settings.WEBSERVICE, url)
        logger.debug("Calling %s" % ws)
        r = requests.post(ws, data=json.dumps(selectors))
        if r.status_code != 200:
            logger.warning("None-200 response for %s; %s" % (body, r.content))
            return

        # Handle outlinks, passing original message...
        har = r.content
        url_handler(har, handler_id, body)

        record_headers = [
            (WarcRecord.TYPE, WarcRecord.METADATA),
            (WarcRecord.URL, url),
            (WarcRecord.CONTENT_TYPE, "application/json"),
            (WarcRecord.DATE, warc_datetime_str(datetime.now())),
            (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
        ]
        warcwriter.write_record(record_headers, "application/json", har)
    except Exception as e:
        logger.error("%s [%s]" % (str(e), body))
def callback(warcwriter, body):
    """Parses messages, writing results to disk.

    The message is parsed with handle_json_message(), the URL is rendered
    by POSTing the selectors to the web-render service, the returned HAR is
    written as a WARC metadata record, and the HAR is then handed to the
    message's URL handler together with an outward channel.

    Arguments:
    warcwriter -- A python-warcwriterpool instance.
    body -- The incoming message body.

    Returns True when the message was processed (the original message
    should be acked), False when it could not be parsed or an exception
    was raised while handling it.
    """
    try:
        logger.debug("Message received: %s." % body)
        try:
            (url, handler_id, selectors, url_handler) = handle_json_message(body)
        except Exception as e:
            # Pass both values as lazy logging arguments. The original passed
            # ``e`` with no matching placeholder, which makes the logging
            # module fail to format the record and drop the message.
            logger.error("Ignoring invalid (unparseable) message! \"%s\": %s", body, e)
            return False
        # Allow settings to override
        if settings.routing_key and not handler_id:
            handler_id = settings.routing_key

        # Start the render:
        logger.info("Requesting render of %s" % url)
        start_time = time.time()
        ws = "%s/%s" % (settings.webrender_url, urllib.quote(url))
        logger.debug("Calling %s" % ws)
        r = requests.post(ws, data=json.dumps(selectors))
        # NOTE(review): status_code is always a non-zero int, so this test is
        # always true and the else branch is dead. Possibly meant "== 200";
        # left unchanged because it alters which responses get archived.
        if r.status_code:
            # Get the HAR payload
            logger.debug("Got response. Reading.")
            har = r.content
            logger.debug("Got HAR.")

            # Write to the WARC
            wrid = uuid.uuid1()
            headers = [
                (WarcRecord.TYPE, WarcRecord.METADATA),
                (WarcRecord.URL, url),
                (WarcRecord.CONTENT_TYPE, "application/json"),
                (WarcRecord.DATE, warc_datetime_str(datetime.now())),
                (WarcRecord.ID, "<urn:uuid:%s>" % wrid),
            ]
            warcwriter.write_record(headers, "application/json", har)
            # TODO Also pull out the rendings as separate records?
            # see http://wpull.readthedocs.org/en/master/warc.html
            logger.debug("Written WARC.")

            # Send on embeds and outlinks, passing original message too...
            outchannel = setup_outward_channel(handler_id)
            url_handler(outchannel, har, handler_id, body)
            logger.debug("Sent messages.")

            # Record total elapsed time:
            end_time = time.time()
            logger.info("Rendered and recorded output for %s in %d seconds."
                        % (url, end_time - start_time))
            # It appears everything worked, so return True and ack the original message
            return True
        else:
            logger.warning("Invalid response code for %s; %s" % (body, r.content))
            return True
    except Exception as e:
        logger.exception("Exception %s %s when handling [%s]"
                         % (type(e).__name__, e, body))
        return False
def writemetadata(video_url, video_uuid, b64string, index, page):
    """Write a WARC metadata record describing how a video was embedded.

    The record is text/plain, is marked concurrent to ``video_uuid``, and
    is written through the module-level ``warcwriter``.

    Arguments:
    video_url -- URL the metadata record is written against.
    video_uuid -- Record ID of the video record this metadata accompanies.
    b64string -- Value stored in the "element-base64-string" field.
    index -- Zero-based position; stored one-based in "element-xpath".
    page -- Value stored in the "embedding-page" field.
    """
    # NOTE(review): ``timestamp`` is not a parameter here, so it must resolve
    # from an enclosing/module scope -- confirm that one exists.
    template = "embedding-page: %s\nembedding-timestamp: %s\nelement-xpath: (//object[param[@name='externalIdentifier']])[%i]\nelement-base64-string: %s"
    record_date = warc_datetime_str(datetime.now())
    record_id = "<urn:uuid:%s>" % uuid.uuid1()
    headers = [
        (WarcRecord.TYPE, WarcRecord.METADATA),
        (WarcRecord.URL, video_url),
        (WarcRecord.DATE, record_date),
        (WarcRecord.ID, record_id),
        (WarcRecord.CONCURRENT_TO, video_uuid),
        (WarcRecord.CONTENT_TYPE, "text/plain"),
    ]
    block = template % (page, timestamp, index + 1, b64string)
    warcwriter.write_record(headers, "text/plain", block)
def writemetadata(video_url, video_uuid, b64string, index, page):
    """Emit the text/plain WARC metadata record that accompanies a video.

    Arguments:
    video_url -- URL the metadata record is written against.
    video_uuid -- Record ID placed in the concurrent-to header.
    b64string -- Value stored in the "element-base64-string" field.
    index -- Zero-based position; written one-based into "element-xpath".
    page -- Value stored in the "embedding-page" field.

    The record is written through the module-level ``warcwriter``.
    """
    content_type = "text/plain"
    headers = [
        (WarcRecord.TYPE, WarcRecord.METADATA),
        (WarcRecord.URL, video_url),
        (WarcRecord.DATE, warc_datetime_str(datetime.now())),
        (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
        (WarcRecord.CONCURRENT_TO, video_uuid),
        (WarcRecord.CONTENT_TYPE, content_type),
    ]
    # NOTE(review): ``timestamp`` is read from outside this function (it is
    # not a parameter) -- verify it is defined at module level.
    values = (page, timestamp, index + 1, b64string)
    block = "embedding-page: %s\nembedding-timestamp: %s\nelement-xpath: (//object[param[@name='externalIdentifier']])[%i]\nelement-base64-string: %s" % values
    warcwriter.write_record(headers, content_type, block)
def write_playlist(page, timestamp, xpath, videos, filenames):
    """Write a playlist metadata record plus one resource record per file.

    Arguments:
    page -- URL used for the playlist metadata record.
    timestamp -- Value stored in the "embedding-timestamp" field.
    xpath -- Value stored in the "embedded-playlist-xpath" field.
    videos -- Comma-separated video URLs.
    filenames -- Comma-separated local file paths, paired with ``videos``
                 by position.

    Returns None. An error is printed and the function returns early when
    the two lists differ in length or when a file turns out to be empty.
    Records are written through the module-level ``warcwriter``.
    """
    urls = videos.split(",")
    files = filenames.split(",")
    if len(urls) != len(files):
        print("ERROR: Incorrect number of videos/filenames passed.")
        return

    headers = [
        (WarcRecord.TYPE, WarcRecord.METADATA),
        (WarcRecord.URL, page),
        (WarcRecord.DATE, warc_datetime_str(datetime.now())),
        (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
        (WarcRecord.CONTENT_TYPE, "text/plain"),
    ]
    items = "".join(
        "embedded-playlist-item-%s: %s\n" % (index, url)
        for index, url in enumerate(urls)
    )
    block = items + "embedding-timestamp: %s\nembedded-playlist-xpath: %s" % (
        timestamp, xpath)
    warcwriter.write_record(headers, "text/plain", block)

    for url, filename in zip(urls, files):
        with open(filename, "rb") as source:
            data = source.read()
        # read() never returns None, and the old None test sat after
        # len(data) anyway; a plain emptiness check is what is needed.
        if not data:
            print("ERROR: %s" % filename)
            return
        # guess_type() yields None for unrecognised extensions; fall back to
        # a generic binary type so the Content-Type header is always a string.
        mime = mimetypes.guess_type(filename)[0] or "application/octet-stream"
        mtime = os.stat(filename).st_mtime
        headers = [
            (WarcRecord.TYPE, WarcRecord.RESOURCE),
            (WarcRecord.URL, url),
            (WarcRecord.DATE, warc_datetime_str(datetime.fromtimestamp(mtime))),
            (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
            (WarcRecord.CONTENT_TYPE, mime),
        ]
        warcwriter.write_record(headers, mime, data)
def callback(warcwriter, body):
    """Parses messages, writing results to disk.

    The body is pipe-delimited: the first field is the URL, the optional
    second field is a directory for outlinks, and any further fields are
    extra selectors added after the default ":root". Each selector is
    POSTed (keyed by itself) to the web service; a 200 response is written
    as a WARC metadata record and, when a directory was given, passed to
    write_outlinks().

    Arguments:
    warcwriter -- A python-warcwriterpool instance.
    body -- The incoming message body.

    Returns None. Any exception is logged rather than propagated.
    """
    try:
        logger.debug("Message received: %s." % body)
        parts = body.split("|")
        url = parts[0]
        outlink_dir = parts[1] if len(parts) > 1 else None
        selectors = [":root"] + parts[2:]

        # Build up our POST data.
        data = dict((selector, selector) for selector in selectors)

        ws = "%s/%s" % (settings.WEBSERVICE, url)
        logger.debug("Calling %s" % ws)
        r = requests.post(ws, data=data)
        if r.status_code != 200:
            logger.warning("None-200 response for %s; %s" % (body, r.content))
            return

        har = r.content
        record_headers = [
            (WarcRecord.TYPE, WarcRecord.METADATA),
            (WarcRecord.URL, url),
            (WarcRecord.CONTENT_TYPE, "application/json"),
            (WarcRecord.DATE, warc_datetime_str(datetime.now())),
            (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
        ]
        warcwriter.write_record(record_headers, "application/json", har)
        if outlink_dir is not None:
            logger.debug("Writing outlinks to %s" % outlink_dir)
            write_outlinks(har, outlink_dir)
    except Exception as e:
        logger.error("%s [%s]" % (str(e), body))
def write_record(url, video_uuid, timestamp, xpath, page, concurrent_to=None):
    """Fetch ``url`` and store it as a WARC response record with metadata.

    Metadata is written first via write_metadata(): once per
    "timestamp/page" pair in the comma-separated ``args.multiple`` when
    that option is set, otherwise once for ``timestamp`` and ``page``.

    Arguments:
    url -- URL to download.
    video_uuid -- Record ID for the response record.
    timestamp -- Embedding timestamp (ignored when args.multiple is set).
    xpath -- XPath value passed through to write_metadata().
    page -- Embedding page (ignored when args.multiple is set).
    concurrent_to -- Optional record ID for a concurrent-to header.

    Exits the process with status 1 if the download is not ok.
    """
    response = requests.get(url)
    if not response.ok:
        print("ERROR: %s" % response.content)
        sys.exit(1)

    warcdate = warc_datetime_str(dateparser.parse(response.headers["date"]))
    if args.multiple:
        embeddings = [pair.split("/", 1) for pair in args.multiple.split(",")]
    else:
        embeddings = [(timestamp, page)]
    for stamp, embedding_page in embeddings:
        write_metadata(response.url, video_uuid, stamp, xpath, embedding_page, warcdate)

    content_type = "application/http; msgtype=response"
    headers = [
        (WarcRecord.TYPE, WarcRecord.RESPONSE),
        (WarcRecord.URL, response.url),
        (WarcRecord.DATE, warcdate),
        (WarcRecord.ID, video_uuid),
        (WarcRecord.CONTENT_TYPE, "application/http; msgtype=response"),
    ]
    if concurrent_to is not None:
        headers.append((WarcRecord.CONCURRENT_TO, concurrent_to))
    block = "".join([httpheaders(response.raw._original_response), response.content])
    warcwriter.write_record(headers, content_type, block)
def write_record(url, video_uuid, timestamp, xpath, page, concurrent_to=None):
    """Download ``url`` and archive the HTTP response, preceded by metadata.

    Arguments:
    url -- URL to download.
    video_uuid -- Record ID given to the response record.
    timestamp -- Embedding timestamp passed to write_metadata().
    xpath -- XPath value passed to write_metadata().
    page -- Embedding page passed to write_metadata().
    concurrent_to -- Optional record ID added as a concurrent-to header.

    When ``args.multiple`` is set it overrides ``timestamp``/``page``: it is
    split on "," and each item on its first "/" into a (timestamp, page)
    pair, with one metadata record written per pair.

    Prints the response body and exits with status 1 if the request fails.
    """
    fetched = requests.get(url)
    if not fetched.ok:
        print("ERROR: %s" % fetched.content)
        sys.exit(1)

    final_url = fetched.url
    warcdate = warc_datetime_str(dateparser.parse(fetched.headers["date"]))

    if not args.multiple:
        write_metadata(final_url, video_uuid, timestamp, xpath, page, warcdate)
    else:
        for pair in args.multiple.split(","):
            t, p = pair.split("/", 1)
            write_metadata(final_url, video_uuid, t, xpath, p, warcdate)

    headers = [
        (WarcRecord.TYPE, WarcRecord.RESPONSE),
        (WarcRecord.URL, final_url),
        (WarcRecord.DATE, warcdate),
        (WarcRecord.ID, video_uuid),
        (WarcRecord.CONTENT_TYPE, "application/http; msgtype=response"),
    ]
    if concurrent_to is not None:
        headers.append((WarcRecord.CONCURRENT_TO, concurrent_to))

    raw_headers = httpheaders(fetched.raw._original_response)
    block = "".join([raw_headers, fetched.content])
    warcwriter.write_record(headers, "application/http; msgtype=response", block)
def callback(warcwriter, body):
    """Parses messages, writing results to disk.

    Steps: parse the body with handle_json_message(), POST the selectors to
    the web-render service, write the returned HAR as a WARC metadata
    record, then pass the HAR on through the message's URL handler.

    Arguments:
    warcwriter -- A python-warcwriterpool instance.
    body -- The incoming message body.

    Returns True when handling completed (the original message should be
    acked), False for an unparseable message or any exception raised while
    handling it.
    """
    try:
        logger.debug("Message received: %s." % body)
        try:
            (url, handler_id, selectors, url_handler) = handle_json_message(body)
        except Exception as e:
            # The original call supplied ``e`` as a logging argument without
            # a placeholder, so the record failed to format and was lost.
            logger.error(
                "Ignoring invalid (unparseable) message! \"%s\": %s", body, e)
            return False

        # Allow settings to override
        if settings.routing_key and not handler_id:
            handler_id = settings.routing_key

        # Start the render:
        logger.info("Requesting render of %s" % url)
        start_time = time.time()
        ws = "%s/%s" % (settings.webrender_url, urllib.quote(url))
        logger.debug("Calling %s" % ws)
        r = requests.post(ws, data=json.dumps(selectors))

        # NOTE(review): this guard can never fire because status_code is
        # always a non-zero int. It may have been meant as "!= 200"; kept
        # as-is since tightening it changes which responses are archived.
        if not r.status_code:
            logger.warning("Invalid response code for %s; %s" % (body, r.content))
            return True

        # Get the HAR payload
        logger.debug("Got response. Reading.")
        har = r.content
        logger.debug("Got HAR.")

        # Write to the WARC
        wrid = uuid.uuid1()
        headers = [
            (WarcRecord.TYPE, WarcRecord.METADATA),
            (WarcRecord.URL, url),
            (WarcRecord.CONTENT_TYPE, "application/json"),
            (WarcRecord.DATE, warc_datetime_str(datetime.now())),
            (WarcRecord.ID, "<urn:uuid:%s>" % wrid),
        ]
        warcwriter.write_record(headers, "application/json", har)
        # TODO Also pull out the rendings as separate records?
        # see http://wpull.readthedocs.org/en/master/warc.html
        logger.debug("Written WARC.")

        # Send on embeds and outlinks, passing original message too...
        outchannel = setup_outward_channel(handler_id)
        url_handler(outchannel, har, handler_id, body)
        logger.debug("Sent messages.")

        # Record total elapsed time:
        end_time = time.time()
        logger.info("Rendered and recorded output for %s in %d seconds."
                    % (url, end_time - start_time))
        # It appears everything worked, so return True and ack the original message
        return True
    except Exception as e:
        logger.exception("Exception %s %s when handling [%s]"
                         % (type(e).__name__, e, body))
        return False
elif not args.filename: if args.y: r = requests.get( args.page ) if not r.ok: print "ERROR: %s" % r.content sys.exit( 1 ) ydl = youtube_dl.YoutubeDL() ydl.add_default_info_extractors() htmlparser = etree.HTMLParser() root = etree.fromstring( r.content, htmlparser ) for iframe in root.xpath( "//iframe[contains(@src,'www.youtube.com/embed/')]" ): yurl = iframe.attrib["src"] results = ydl.extract_info(yurl, download=False ) headers = [ ( WarcRecord.TYPE, WarcRecord.WARCINFO ), ( WarcRecord.DATE, warc_datetime_str( datetime.now() ) ), ( WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1() ), ] youtube_gdata = requests.get( YOUTUBE_DATA_URL % results["id"]) warcwriter.write_record( headers, "application/json", json.dumps( results, indent=8, separators=( ",", ":" ) ) ) xpath = "//iframe[contains(@src,'%s')]" % results["id"] for format in results["formats"]: date = warc_datetime_str( datetime.now() ) video_uuid = "<urn:uuid:%s>" % uuid.uuid1() write_record(format["url"], video_uuid, date, xpath, args.page, concurrent_to=None ) else: write_record( args.url, "<urn:uuid:%s>" % uuid.uuid1(), args.timestamp, args.xpath, args.page, concurrent_to=None ) elif args.filename: data = None with open( args.filename, "rb" ) as d:
if args.y: r = requests.get(args.page) if not r.ok: print "ERROR: %s" % r.content sys.exit(1) ydl = youtube_dl.YoutubeDL() ydl.add_default_info_extractors() htmlparser = etree.HTMLParser() root = etree.fromstring(r.content, htmlparser) for iframe in root.xpath( "//iframe[contains(@src,'www.youtube.com/embed/')]"): yurl = iframe.attrib["src"] results = ydl.extract_info(yurl, download=False) headers = [ (WarcRecord.TYPE, WarcRecord.WARCINFO), (WarcRecord.DATE, warc_datetime_str(datetime.now())), (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()), ] youtube_gdata = requests.get(YOUTUBE_DATA_URL % results["id"]) warcwriter.write_record( headers, "application/json", json.dumps(results, indent=8, separators=(",", ":"))) xpath = "//iframe[contains(@src,'%s')]" % results["id"] for format in results["formats"]: date = warc_datetime_str(datetime.now()) video_uuid = "<urn:uuid:%s>" % uuid.uuid1() write_record(format["url"], video_uuid, date, xpath, args.page,
def getvideo(page, timestamp=None):
    """Find the videos referenced by ``page`` and write them to the WARC.

    Arguments:
    page -- URL of the embedding page.
    timestamp -- Optional timestamp string. When given, the page is
                 requested through WAYBACK at that timestamp; otherwise it
                 is requested directly and the current time
                 (%Y%m%d%H%M%S) is used.

    For each "externalId" value in the page body the media document at
    BBC_MEDIA/<externalId> is requested with try_archived_version(), and
    the URL returned by getbestvideo() is archived: "http" URLs are
    downloaded and stored as response records, anything else is fetched
    with streamvideo() and stored as a "video/mp4" resource record.

    Returns None. A failed media or video request ends processing
    entirely; an empty stream only skips that video.
    """
    if timestamp is None:
        r = requests.get(page)
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    else:
        r = requests.get("%s/%s/%s" % (WAYBACK, timestamp, page))
    htmlparser = etree.HTMLParser()
    root = etree.fromstring(r.content, htmlparser)

    http_response_type = "application/http; msgtype=response"
    for index, externalid in enumerate(
            re.findall("\"externalId\":\"([^\"]+)\"", r.content)):
        video_uuid = "<urn:uuid:%s>" % uuid.uuid1()
        media_location = "%s/%s" % (BBC_MEDIA, externalid)
        media = try_archived_version(media_location, timestamp)
        if not media.ok:
            print("ERROR: Couldn't find media; %s" % media_location)
            return

        # The media response is recorded unless it was served from WAYBACK.
        if WAYBACK not in media.url:
            headers = [
                (WarcRecord.TYPE, WarcRecord.RESPONSE),
                (WarcRecord.URL, media.url),
                (WarcRecord.DATE,
                 warc_datetime_str(parser.parse(media.headers["date"]))),
                (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
                (WarcRecord.CONCURRENT_TO, video_uuid),
                (WarcRecord.CONTENT_TYPE, "application/http; msgtype=response"),
            ]
            block = "".join(
                [httpheaders(media.raw._original_response), media.content])
            warcwriter.write_record(headers, http_response_type, block)

        url = getbestvideo(etree.fromstring(media.content))
        print(url)
        if url.startswith("http"):
            video = requests.get(url)
            if not video.ok:
                print("ERROR: Couldn't find video; %s" % url)
                return
            video_url = video.url
            video_date = warc_datetime_str(parser.parse(video.headers["date"]))
            video_type = WarcRecord.RESPONSE
            content_type = "application/http; msgtype=response"
            videoblock = "".join(
                [httpheaders(video.raw._original_response), video.content])
            writemetadata(video_url, video_uuid, base64.b64encode(r.content),
                          index, page)
        else:
            video_url = url
            video_date = warc_datetime_str(datetime.now())
            video_type = WarcRecord.RESOURCE
            content_type = "video/mp4"
            # Previously this serialised the builtin ``object`` type, which
            # always raised. Look up the embedding <object> element using the
            # same expression writemetadata() records as "element-xpath".
            # NOTE(review): presumes externalId and <object> ordering agree.
            found = root.xpath(
                "(//object[param[@name='externalIdentifier']])[%i]" % (index + 1))
            # With no match, store the full page, mirroring the branch above.
            embedded = etree.tostring(found[0]).strip() if found else r.content
            writemetadata(video_url, video_uuid, base64.b64encode(embedded),
                          index, page)
            videoblock = streamvideo(video_url)
            # "not videoblock" handles None safely; len(None) would raise.
            if not videoblock:
                print("ERROR: Couldn't stream video; %s" % video_url)
                continue

        headers = [
            (WarcRecord.TYPE, video_type),
            (WarcRecord.URL, video_url),
            (WarcRecord.DATE, video_date),
            (WarcRecord.ID, video_uuid),
            (WarcRecord.CONTENT_TYPE, content_type),
        ]
        warcwriter.write_record(headers, content_type, videoblock)