Example #1
0
def write_playlist( page, timestamp, xpath, videos, filenames ):
    """Write a playlist metadata record, then one resource record per video file.

    Arguments:
    page -- URL of the embedding page; target URI of the metadata record.
    timestamp -- Value stored in the "embedding-timestamp" field.
    xpath -- Value stored in the "embedded-playlist-xpath" field.
    videos -- Comma-separated video URLs.
    filenames -- Comma-separated local file paths, one per entry in videos.

    Returns None. Prints an error and returns early when the two lists differ
    in length or when a file is empty; records already written stay written.

    """
    urls = videos.split( "," )
    files = filenames.split( "," )
    if len( urls ) != len( files ):
        print( "ERROR: Incorrect number of videos/filenames passed." )
        return
    headers = [
        ( WarcRecord.TYPE, WarcRecord.METADATA ),
        ( WarcRecord.URL, page ),
        ( WarcRecord.DATE, warc_datetime_str( datetime.now() ) ),
        ( WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1() ),
        ( WarcRecord.CONTENT_TYPE, "text/plain" ),
    ]
    # Collect the lines and join once instead of growing a string in a loop.
    lines = [ "embedded-playlist-item-%s: %s\n" % ( index, url ) for index, url in enumerate( urls ) ]
    lines.append( "embedding-timestamp: %s\nembedded-playlist-xpath: %s" % ( timestamp, xpath ) )
    warcwriter.write_record( headers, "text/plain", "".join( lines ) )
    for url, path in zip( urls, files ):
        with open( path, "rb" ) as handle:
            data = handle.read()
        # read() never returns None, so an emptiness test is all that is needed.
        if not data:
            print( "ERROR: %s" % path )
            return
        # guess_type() yields None for unknown extensions; fall back to a
        # generic binary type rather than writing None as the content type.
        mime = mimetypes.guess_type( path )[ 0 ] or "application/octet-stream"
        mtime = os.stat( path ).st_mtime
        headers = [
            ( WarcRecord.TYPE, WarcRecord.RESOURCE ),
            ( WarcRecord.URL, url ),
            ( WarcRecord.DATE, warc_datetime_str( datetime.fromtimestamp( mtime ) ) ),
            ( WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1() ),
            ( WarcRecord.CONTENT_TYPE, mime ),
        ]
        warcwriter.write_record( headers, mime, data )
Example #2
0
def getvideo( page, timestamp=None ):
    """Record every media item referenced by "externalId" values in a page.

    Arguments:
    page -- URL of the page to scan.
    timestamp -- Optional timestamp string. When given, the page is fetched
                 through WAYBACK at that timestamp; otherwise the page is
                 fetched directly and the current time (YYYYMMDDHHMMSS) is used.

    Returns None. Returns at the first media or video request that is not OK;
    a video that cannot be streamed only skips that item.

    """
    if timestamp is None:
        r = requests.get( page )
        timestamp = datetime.now().strftime( "%Y%m%d%H%M%S" )
    else:
        r = requests.get( "%s/%s/%s" % ( WAYBACK, timestamp, page ) )
    # The parsed tree is not consulted below (ids come from the regex); the
    # parse is kept so that an unparseable page still raises here as before.
    etree.fromstring( r.content, etree.HTMLParser() )
    for index, externalid in enumerate( re.findall( "\"externalId\":\"([^\"]+)\"", r.content ) ):
        video_uuid = "<urn:uuid:%s>" % uuid.uuid1()
        media_url = "%s/%s" % ( BBC_MEDIA, externalid )
        media = try_archived_version( media_url, timestamp )
        if not media.ok:
            print( "ERROR: Couldn't find media; %s" % media_url )
            return
        # Only record the media response when it did not come from WAYBACK.
        if WAYBACK not in media.url:
            headers = [
                ( WarcRecord.TYPE, WarcRecord.RESPONSE ),
                ( WarcRecord.URL, media.url ),
                ( WarcRecord.DATE, warc_datetime_str( parser.parse( media.headers[ "date" ] ) ) ),
                ( WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1() ),
                ( WarcRecord.CONCURRENT_TO, video_uuid ),
                ( WarcRecord.CONTENT_TYPE, "application/http; msgtype=response" ),
            ]
            block = "".join( [ httpheaders( media.raw._original_response ), media.content ] )
            warcwriter.write_record( headers, "application/http; msgtype=response", block )

        url = getbestvideo( etree.fromstring( media.content ) )
        print( url )
        if url.startswith( "http" ):
            video = requests.get( url )
            if not video.ok:
                print( "ERROR: Couldn't find video; %s" % url )
                return
            video_url = video.url
            video_date = warc_datetime_str( parser.parse( video.headers[ "date" ] ) )
            video_type = WarcRecord.RESPONSE
            content_type = "application/http; msgtype=response"
            videoblock = "".join( [ httpheaders( video.raw._original_response ), video.content ] )
            writemetadata( video_url, video_uuid, base64.b64encode( r.content ), index, page )
        else:
            video_url = url
            video_date = warc_datetime_str( datetime.now() )
            video_type = WarcRecord.RESOURCE
            content_type = "video/mp4"
            # NOTE(review): this branch used to serialise `object`, which is
            # the builtin type here (no element of that name is bound in this
            # function). Encode the page content, as the HTTP branch does.
            writemetadata( video_url, video_uuid, base64.b64encode( r.content ), index, page )
            videoblock = streamvideo( video_url )
            # Truthiness covers both None and empty without calling len() on None.
            if not videoblock:
                print( "ERROR: Couldn't stream video; %s" % video_url )
                continue
        headers = [
            ( WarcRecord.TYPE, video_type ),
            ( WarcRecord.URL, video_url ),
            ( WarcRecord.DATE, video_date ),
            ( WarcRecord.ID, video_uuid ),
            ( WarcRecord.CONTENT_TYPE, content_type ),
        ]
        warcwriter.write_record( headers, content_type, videoblock )
Example #3
0
def callback(warcwriter, body):
    """Handle one message: request a HAR for its URL and store it.

    A body starting with "{" is parsed by handle_json_message, anything else
    by handle_pipe_message. The selectors are POSTed as JSON to the web
    service; on a 200 response the payload is handed to the message's URL
    handler and then written as a WARC metadata record. Any exception is
    logged and swallowed.

    Arguments:
    warcwriter -- A python-warcwriterpool instance.
    body -- The incoming message body.

    """
    try:
        logger.debug("Message received: %s." % body)
        parse_message = handle_json_message if body.startswith("{") else handle_pipe_message
        url, handler_id, selectors, url_handler = parse_message(body)

        endpoint = "%s/%s" % (settings.WEBSERVICE, url)
        logger.debug("Calling %s" % endpoint)
        response = requests.post(endpoint, data=json.dumps(selectors))
        if response.status_code != 200:
            logger.warning("None-200 response for %s; %s" % (body, response.content))
            return

        # Hand the HAR to the outlink handler first, passing the original message.
        har = response.content
        url_handler(har, handler_id, body)

        json_type = "application/json"
        record_headers = [
            (WarcRecord.TYPE, WarcRecord.METADATA),
            (WarcRecord.URL, url),
            (WarcRecord.CONTENT_TYPE, json_type),
            (WarcRecord.DATE, warc_datetime_str(datetime.now())),
            (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
        ]
        warcwriter.write_record(record_headers, json_type, har)
    except Exception as error:
        logger.error("%s [%s]" % (str(error), body))
Example #4
0
def callback(warcwriter, body):
    """Render the URL named in a message and record the HAR in a WARC.

    Arguments:
    warcwriter -- A python-warcwriterpool instance.
    body -- The incoming message body.

    Returns True once a response has been received and processed, and False
    when the message cannot be parsed or an exception is raised while
    handling it.

    """
    try:
        logger.debug("Message received: %s." % body)
        try:
            (url, handler_id, selectors, url_handler) = handle_json_message(body)
        except Exception as e:
            # Supply body and e as logging arguments with matching
            # placeholders. Previously e was passed as an extra argument to an
            # already-formatted message, which makes the logging module report
            # a formatting error instead of emitting the message.
            logger.error("Ignoring invalid (unparseable) message! \"%s\" (%s)", body, e)
            return False
        # Allow settings to override
        if settings.routing_key and not handler_id:
            handler_id = settings.routing_key

        # Start the render:
        logger.info("Requesting render of %s" % url)
        start_time = time.time()
        ws = "%s/%s" % (settings.webrender_url, urllib.quote(url))
        logger.debug("Calling %s" % ws)
        r = requests.post(ws, data=json.dumps(selectors))
        # NOTE(review): any HTTP status code is truthy, so the else branch
        # below is not reached for a normal response. Confirm whether a
        # specific status (e.g. 200) was intended before tightening this.
        if r.status_code:
            # Get the HAR payload
            logger.debug("Got response. Reading.")
            har = r.content
            logger.debug("Got HAR.")
            # Write to the WARC
            wrid = uuid.uuid1()
            headers = [
                (WarcRecord.TYPE, WarcRecord.METADATA),
                (WarcRecord.URL, url),
                (WarcRecord.CONTENT_TYPE, "application/json"),
                (WarcRecord.DATE, warc_datetime_str(datetime.now())),
                (WarcRecord.ID, "<urn:uuid:%s>" % wrid),
            ]
            warcwriter.write_record(headers, "application/json", har)
            # TODO Also pull out the renderings as separate records?
            # see http://wpull.readthedocs.org/en/master/warc.html
            logger.debug("Written WARC.")
            # Send on embeds and outlinks, passing original message too...
            outchannel = setup_outward_channel(handler_id)
            url_handler(outchannel, har, handler_id, body)
            logger.debug("Sent messages.")
            # Record total elapsed time:
            end_time = time.time()
            logger.info("Rendered and recorded output for %s in %d seconds." % (url, end_time - start_time))
            # It appears everything worked, so return True and ack the original message
            return True
        else:
            logger.warning("Invalid response code for %s; %s" % (body, r.content))
            return True
    except Exception as e:
        logger.exception("Exception %s %s when handling [%s]" % (type(e).__name__, e, body))
        return False
Example #5
0
def writemetadata( video_url, video_uuid, b64string, index, page, embed_timestamp=None ):
    """Write a WARC metadata record describing where a video was embedded.

    Arguments:
    video_url -- Target URI of the metadata record.
    video_uuid -- Record ID of the video record; stored under
                  WarcRecord.CONCURRENT_TO.
    b64string -- Text stored in the "element-base64-string" field.
    index -- Zero-based element position; the recorded XPath uses index + 1.
    page -- Value stored in the "embedding-page" field.
    embed_timestamp -- Optional value for the "embedding-timestamp" field.
                       When omitted, the name `timestamp` is looked up outside
                       this function, exactly as before this parameter existed.

    """
    if embed_timestamp is None:
        # Legacy behaviour: `timestamp` is not a parameter here, so it is
        # resolved from the enclosing (module) scope.
        embed_timestamp = timestamp
    headers = [
        ( WarcRecord.TYPE, WarcRecord.METADATA ),
        ( WarcRecord.URL, video_url ),
        ( WarcRecord.DATE, warc_datetime_str( datetime.now() ) ),
        ( WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1() ),
        ( WarcRecord.CONCURRENT_TO, video_uuid ),
        ( WarcRecord.CONTENT_TYPE, "text/plain" ),
    ]
    block = "embedding-page: %s\nembedding-timestamp: %s\nelement-xpath: (//object[param[@name='externalIdentifier']])[%i]\nelement-base64-string: %s" % ( page, embed_timestamp, index+1, b64string )
    warcwriter.write_record( headers, "text/plain", block )
def writemetadata(video_url, video_uuid, b64string, index, page,
                  embed_timestamp=None):
    """Write a WARC metadata record describing where a video was embedded.

    Arguments:
    video_url -- Target URI of the metadata record.
    video_uuid -- Record ID of the video record; stored under
                  WarcRecord.CONCURRENT_TO.
    b64string -- Text stored in the "element-base64-string" field.
    index -- Zero-based element position; the recorded XPath uses index + 1.
    page -- Value stored in the "embedding-page" field.
    embed_timestamp -- Optional value for the "embedding-timestamp" field.
                       When omitted, the name `timestamp` is looked up outside
                       this function, exactly as before this parameter existed.

    """
    if embed_timestamp is None:
        # Legacy behaviour: `timestamp` is not a parameter here, so it is
        # resolved from the enclosing (module) scope.
        embed_timestamp = timestamp
    headers = [
        (WarcRecord.TYPE, WarcRecord.METADATA),
        (WarcRecord.URL, video_url),
        (WarcRecord.DATE, warc_datetime_str(datetime.now())),
        (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
        (WarcRecord.CONCURRENT_TO, video_uuid),
        (WarcRecord.CONTENT_TYPE, "text/plain"),
    ]
    block = "embedding-page: %s\nembedding-timestamp: %s\nelement-xpath: (//object[param[@name='externalIdentifier']])[%i]\nelement-base64-string: %s" % (
        page, embed_timestamp, index + 1, b64string)
    warcwriter.write_record(headers, "text/plain", block)
Example #7
0
def write_playlist(page, timestamp, xpath, videos, filenames):
    """Write a playlist metadata record, then one resource record per video file.

    Arguments:
    page -- URL of the embedding page; target URI of the metadata record.
    timestamp -- Value stored in the "embedding-timestamp" field.
    xpath -- Value stored in the "embedded-playlist-xpath" field.
    videos -- Comma-separated video URLs.
    filenames -- Comma-separated local file paths, one per entry in videos.

    Returns None. Prints an error and returns early when the two lists differ
    in length or when a file is empty; records already written stay written.

    """
    urls = videos.split(",")
    files = filenames.split(",")
    if len(urls) != len(files):
        print("ERROR: Incorrect number of videos/filenames passed.")
        return
    headers = [
        (WarcRecord.TYPE, WarcRecord.METADATA),
        (WarcRecord.URL, page),
        (WarcRecord.DATE, warc_datetime_str(datetime.now())),
        (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
        (WarcRecord.CONTENT_TYPE, "text/plain"),
    ]
    # Collect the lines and join once instead of growing a string in a loop.
    lines = ["embedded-playlist-item-%s: %s\n" % (index, url)
             for index, url in enumerate(urls)]
    lines.append("embedding-timestamp: %s\nembedded-playlist-xpath: %s" % (
        timestamp, xpath))
    warcwriter.write_record(headers, "text/plain", "".join(lines))
    for url, path in zip(urls, files):
        with open(path, "rb") as handle:
            data = handle.read()
        # read() never returns None, so an emptiness test is all that is needed.
        if not data:
            print("ERROR: %s" % path)
            return
        # guess_type() yields None for unknown extensions; fall back to a
        # generic binary type rather than writing None as the content type.
        mime = mimetypes.guess_type(path)[0] or "application/octet-stream"
        mtime = os.stat(path).st_mtime
        headers = [
            (WarcRecord.TYPE, WarcRecord.RESOURCE),
            (WarcRecord.URL, url),
            (WarcRecord.DATE,
             warc_datetime_str(datetime.fromtimestamp(mtime))),
            (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
            (WarcRecord.CONTENT_TYPE, mime),
        ]
        warcwriter.write_record(headers, mime, data)
def callback( warcwriter, body ):
    """Handle one pipe-separated message and record the resulting HAR.

    The body is split on "|": the first field is the URL, the optional second
    field is a directory for outlinks, and any further fields are selectors
    added after the default ":root". On a 200 response from the web service
    the payload is written as a WARC metadata record and, when a directory
    was given, passed to write_outlinks. Any exception is logged and swallowed.

    Arguments:
    warcwriter -- A python-warcwriterpool instance.
    body -- The incoming message body.

    """
    try:
        logger.debug( "Message received: %s." % body )
        fields = body.split( "|" )
        url = fields[ 0 ]
        outlink_dir = fields[ 1 ] if len( fields ) > 1 else None
        selectors = [ ":root" ] + fields[ 2: ]

        # POST data maps every selector to itself.
        data = dict( ( selector, selector ) for selector in selectors )

        endpoint = "%s/%s" % ( settings.WEBSERVICE, url )
        logger.debug( "Calling %s" % endpoint )
        response = requests.post( endpoint, data=data )
        if response.status_code != 200:
            logger.warning( "None-200 response for %s; %s" % ( body, response.content ) )
            return

        har = response.content
        json_type = "application/json"
        record_headers = [
            ( WarcRecord.TYPE, WarcRecord.METADATA ),
            ( WarcRecord.URL, url ),
            ( WarcRecord.CONTENT_TYPE, json_type ),
            ( WarcRecord.DATE, warc_datetime_str( datetime.now() ) ),
            ( WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1() ),
        ]
        warcwriter.write_record( record_headers, json_type, har )
        if outlink_dir is not None:
            logger.debug( "Writing outlinks to %s" % outlink_dir )
            write_outlinks( har, outlink_dir )
    except Exception as error:
        logger.error( "%s [%s]" % ( str( error ), body ) )
Example #9
0
def write_record( url, video_uuid, timestamp, xpath, page, concurrent_to=None ):
    """Fetch a URL and write its metadata record(s) and HTTP response record.

    When args.multiple is set it is read as comma-separated "timestamp/page"
    pairs and one metadata record is written per pair; otherwise a single
    metadata record is written from the timestamp and page arguments. Exits
    the process with status 1 when the request is not OK.

    Arguments:
    url -- URL to fetch.
    video_uuid -- Record ID used for the response record.
    timestamp -- Timestamp passed to write_metadata when args.multiple is unset.
    xpath -- XPath passed to write_metadata.
    page -- Page passed to write_metadata when args.multiple is unset.
    concurrent_to -- Optional record ID added under WarcRecord.CONCURRENT_TO.

    """
    response = requests.get( url )
    if not response.ok:
        print( "ERROR: %s" % response.content )
        sys.exit( 1 )
    warcdate = warc_datetime_str( dateparser.parse( response.headers[ "date" ] ) )

    # Normalise both modes to a sequence of (timestamp, page) pairs.
    if args.multiple:
        embeddings = ( pair.split( "/", 1 ) for pair in args.multiple.split( "," ) )
    else:
        embeddings = [ ( timestamp, page ) ]
    for embed_timestamp, embed_page in embeddings:
        write_metadata( response.url, video_uuid, embed_timestamp, xpath, embed_page, warcdate )

    response_type = "application/http; msgtype=response"
    record_headers = [
        ( WarcRecord.TYPE, WarcRecord.RESPONSE ),
        ( WarcRecord.URL, response.url ),
        ( WarcRecord.DATE, warcdate ),
        ( WarcRecord.ID, video_uuid ),
        ( WarcRecord.CONTENT_TYPE, response_type ),
    ]
    if concurrent_to is not None:
        record_headers.append( ( WarcRecord.CONCURRENT_TO, concurrent_to ) )
    payload = "".join( [ httpheaders( response.raw._original_response ), response.content ] )
    warcwriter.write_record( record_headers, response_type, payload )
Example #10
0
def write_record(url, video_uuid, timestamp, xpath, page, concurrent_to=None):
    """Fetch a URL and write its metadata record(s) and HTTP response record.

    When args.multiple is set it is read as comma-separated "timestamp/page"
    pairs and one metadata record is written per pair; otherwise a single
    metadata record is written from the timestamp and page arguments. Exits
    the process with status 1 when the request is not OK.

    Arguments:
    url -- URL to fetch.
    video_uuid -- Record ID used for the response record.
    timestamp -- Timestamp passed to write_metadata when args.multiple is unset.
    xpath -- XPath passed to write_metadata.
    page -- Page passed to write_metadata when args.multiple is unset.
    concurrent_to -- Optional record ID added under WarcRecord.CONCURRENT_TO.

    """
    fetched = requests.get(url)
    if not fetched.ok:
        print("ERROR: %s" % fetched.content)
        sys.exit(1)
    warcdate = warc_datetime_str(dateparser.parse(fetched.headers["date"]))

    # Normalise both modes to a sequence of (timestamp, page) pairs.
    if args.multiple:
        pairs = (item.split("/", 1) for item in args.multiple.split(","))
    else:
        pairs = [(timestamp, page)]
    for pair_timestamp, pair_page in pairs:
        write_metadata(fetched.url, video_uuid, pair_timestamp, xpath,
                       pair_page, warcdate)

    http_response_type = "application/http; msgtype=response"
    record_headers = [
        (WarcRecord.TYPE, WarcRecord.RESPONSE),
        (WarcRecord.URL, fetched.url),
        (WarcRecord.DATE, warcdate),
        (WarcRecord.ID, video_uuid),
        (WarcRecord.CONTENT_TYPE, http_response_type),
    ]
    if concurrent_to is not None:
        record_headers.append((WarcRecord.CONCURRENT_TO, concurrent_to))
    body = "".join([httpheaders(fetched.raw._original_response),
                    fetched.content])
    warcwriter.write_record(record_headers, http_response_type, body)
Example #11
0
def callback(warcwriter, body):
    """Render the URL named in a message and record the HAR in a WARC.

    Arguments:
    warcwriter -- A python-warcwriterpool instance.
    body -- The incoming message body.

    Returns True once a response has been received and processed, and False
    when the message cannot be parsed or an exception is raised while
    handling it.

    """
    try:
        logger.debug("Message received: %s." % body)
        try:
            (url, handler_id, selectors,
             url_handler) = handle_json_message(body)
        except Exception as e:
            # Supply body and e as logging arguments with matching
            # placeholders. Previously e was passed as an extra argument to an
            # already-formatted message, which makes the logging module report
            # a formatting error instead of emitting the message.
            logger.error(
                "Ignoring invalid (unparseable) message! \"%s\" (%s)",
                body, e)
            return False
        # Allow settings to override
        if settings.routing_key and not handler_id:
            handler_id = settings.routing_key

        # Start the render:
        logger.info("Requesting render of %s" % url)
        start_time = time.time()
        ws = "%s/%s" % (settings.webrender_url, urllib.quote(url))
        logger.debug("Calling %s" % ws)
        r = requests.post(ws, data=json.dumps(selectors))
        # NOTE(review): any HTTP status code is truthy, so the else branch
        # below is not reached for a normal response. Confirm whether a
        # specific status (e.g. 200) was intended before tightening this.
        if r.status_code:
            # Get the HAR payload
            logger.debug("Got response. Reading.")
            har = r.content
            logger.debug("Got HAR.")
            # Write to the WARC
            wrid = uuid.uuid1()
            headers = [
                (WarcRecord.TYPE, WarcRecord.METADATA),
                (WarcRecord.URL, url),
                (WarcRecord.CONTENT_TYPE, "application/json"),
                (WarcRecord.DATE, warc_datetime_str(datetime.now())),
                (WarcRecord.ID, "<urn:uuid:%s>" % wrid),
            ]
            warcwriter.write_record(headers, "application/json", har)
            # TODO Also pull out the renderings as separate records?
            # see http://wpull.readthedocs.org/en/master/warc.html
            logger.debug("Written WARC.")
            # Send on embeds and outlinks, passing original message too...
            outchannel = setup_outward_channel(handler_id)
            url_handler(outchannel, har, handler_id, body)
            logger.debug("Sent messages.")
            # Record total elapsed time:
            end_time = time.time()
            logger.info("Rendered and recorded output for %s in %d seconds." %
                        (url, end_time - start_time))
            # It appears everything worked, so return True and ack the original message
            return True
        else:
            logger.warning("Invalid response code for %s; %s" %
                           (body, r.content))
            return True
    except Exception as e:
        logger.exception("Exception %s %s when handling [%s]" %
                         (type(e).__name__, e, body))
        return False
Example #12
0
 elif not args.filename:
     if args.y:
         r = requests.get( args.page )
         if not r.ok:
             print "ERROR: %s" % r.content
             sys.exit( 1 )
         ydl = youtube_dl.YoutubeDL()
         ydl.add_default_info_extractors()
         htmlparser = etree.HTMLParser()
         root = etree.fromstring( r.content, htmlparser )
         for iframe in root.xpath( "//iframe[contains(@src,'www.youtube.com/embed/')]" ):
             yurl = iframe.attrib["src"]
             results = ydl.extract_info(yurl, download=False )
             headers = [
                 ( WarcRecord.TYPE, WarcRecord.WARCINFO ),
                 ( WarcRecord.DATE, warc_datetime_str( datetime.now() ) ),
                 ( WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1() ),
             ]
             youtube_gdata = requests.get( YOUTUBE_DATA_URL % results["id"])
             warcwriter.write_record( headers, "application/json", json.dumps( results, indent=8, separators=( ",", ":" ) ) )
             xpath = "//iframe[contains(@src,'%s')]" % results["id"]
             for format in results["formats"]:
                 date = warc_datetime_str( datetime.now() )
                 video_uuid = "<urn:uuid:%s>" % uuid.uuid1()
                 write_record(format["url"], video_uuid, date, xpath, args.page, concurrent_to=None )
     else:
         write_record( args.url, "<urn:uuid:%s>" % uuid.uuid1(), args.timestamp, args.xpath, args.page, concurrent_to=None )
     
 elif args.filename:
     data = None
     with open( args.filename, "rb" ) as d:
Example #13
0
 if args.y:
     r = requests.get(args.page)
     if not r.ok:
         print "ERROR: %s" % r.content
         sys.exit(1)
     ydl = youtube_dl.YoutubeDL()
     ydl.add_default_info_extractors()
     htmlparser = etree.HTMLParser()
     root = etree.fromstring(r.content, htmlparser)
     for iframe in root.xpath(
             "//iframe[contains(@src,'www.youtube.com/embed/')]"):
         yurl = iframe.attrib["src"]
         results = ydl.extract_info(yurl, download=False)
         headers = [
             (WarcRecord.TYPE, WarcRecord.WARCINFO),
             (WarcRecord.DATE, warc_datetime_str(datetime.now())),
             (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
         ]
         youtube_gdata = requests.get(YOUTUBE_DATA_URL % results["id"])
         warcwriter.write_record(
             headers, "application/json",
             json.dumps(results, indent=8, separators=(",", ":")))
         xpath = "//iframe[contains(@src,'%s')]" % results["id"]
         for format in results["formats"]:
             date = warc_datetime_str(datetime.now())
             video_uuid = "<urn:uuid:%s>" % uuid.uuid1()
             write_record(format["url"],
                          video_uuid,
                          date,
                          xpath,
                          args.page,
Example #14
0
def getvideo(page, timestamp=None):
    """Record every media item referenced by "externalId" values in a page.

    Arguments:
    page -- URL of the page to scan.
    timestamp -- Optional timestamp string. When given, the page is fetched
                 through WAYBACK at that timestamp; otherwise the page is
                 fetched directly and the current time (YYYYMMDDHHMMSS) is used.

    Returns None. Returns at the first media or video request that is not OK;
    a video that cannot be streamed only skips that item.

    """
    if timestamp is None:
        r = requests.get(page)
        timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
    else:
        r = requests.get("%s/%s/%s" % (WAYBACK, timestamp, page))
    # The parsed tree is not consulted below (ids come from the regex); the
    # parse is kept so that an unparseable page still raises here as before.
    etree.fromstring(r.content, etree.HTMLParser())
    response_type = "application/http; msgtype=response"
    for index, externalid in enumerate(
            re.findall("\"externalId\":\"([^\"]+)\"", r.content)):
        video_uuid = "<urn:uuid:%s>" % uuid.uuid1()
        media_url = "%s/%s" % (BBC_MEDIA, externalid)
        media = try_archived_version(media_url, timestamp)
        if not media.ok:
            print("ERROR: Couldn't find media; %s" % media_url)
            return
        # Only record the media response when it did not come from WAYBACK.
        if WAYBACK not in media.url:
            headers = [
                (WarcRecord.TYPE, WarcRecord.RESPONSE),
                (WarcRecord.URL, media.url),
                (WarcRecord.DATE,
                 warc_datetime_str(parser.parse(media.headers["date"]))),
                (WarcRecord.ID, "<urn:uuid:%s>" % uuid.uuid1()),
                (WarcRecord.CONCURRENT_TO, video_uuid),
                (WarcRecord.CONTENT_TYPE, response_type),
            ]
            block = "".join(
                [httpheaders(media.raw._original_response), media.content])
            warcwriter.write_record(headers, response_type, block)

        url = getbestvideo(etree.fromstring(media.content))
        print(url)
        if url.startswith("http"):
            video = requests.get(url)
            if not video.ok:
                print("ERROR: Couldn't find video; %s" % url)
                return
            video_url = video.url
            video_date = warc_datetime_str(parser.parse(video.headers["date"]))
            video_type = WarcRecord.RESPONSE
            content_type = response_type
            videoblock = "".join(
                [httpheaders(video.raw._original_response), video.content])
            writemetadata(video_url, video_uuid, base64.b64encode(r.content),
                          index, page)
        else:
            video_url = url
            video_date = warc_datetime_str(datetime.now())
            video_type = WarcRecord.RESOURCE
            content_type = "video/mp4"
            # NOTE(review): this branch used to serialise `object`, which is
            # the builtin type here (no element of that name is bound in this
            # function). Encode the page content, as the HTTP branch does.
            writemetadata(video_url, video_uuid, base64.b64encode(r.content),
                          index, page)
            videoblock = streamvideo(video_url)
            # Truthiness covers both None and empty without calling len() on None.
            if not videoblock:
                print("ERROR: Couldn't stream video; %s" % video_url)
                continue
        headers = [
            (WarcRecord.TYPE, video_type),
            (WarcRecord.URL, video_url),
            (WarcRecord.DATE, video_date),
            (WarcRecord.ID, video_uuid),
            (WarcRecord.CONTENT_TYPE, content_type),
        ]
        warcwriter.write_record(headers, content_type, videoblock)