def run_harchiver():
    """Maintain a connection to the AMQP queue and consume messages forever.

    On any error the channel is cancelled (requeueing unacknowledged
    messages), the WARC writer pool is cleaned up, and the connection is
    retried after a 15-second pause.  Ctrl-C tidies up and exits.
    """
    # Bug fix: bind warcwriter before the try block.  Previously it was first
    # assigned inside the try, so if WarcWriterPool(...) raised, the handlers'
    # `if warcwriter:` tests hit UnboundLocalError instead of retrying.
    warcwriter = None
    while True:
        channel = None
        try:
            logger.info("Setting up warc writer, in %s" % settings.output_directory)
            warcwriter = WarcWriterPool(gzip=True, output_dir=settings.output_directory)
            logger.info("Starting connection: %s" % (settings.amqp_url))
            parameters = pika.URLParameters(settings.amqp_url)
            connection = pika.BlockingConnection(parameters)
            channel = connection.channel()
            channel.exchange_declare(exchange=settings.exchange, type="direct", durable=True, auto_delete=False)
            channel.queue_declare(queue=settings.in_queue, durable=True, exclusive=False, auto_delete=False)
            channel.queue_bind(queue=settings.in_queue, exchange=settings.exchange, routing_key=settings.binding_key)
            # Bound the number of unacknowledged deliveries in flight.
            channel.basic_qos(prefetch_count=settings.qos_num)
            logger.info("Started connection: %s" % (settings.amqp_url))
            for method_frame, properties, body in channel.consume(settings.in_queue):
                handled = callback(warcwriter, body)
                if handled is True:
                    channel.basic_ack(method_frame.delivery_tag)
                else:
                    # Leave the message on the queue for a later retry.
                    channel.basic_reject(delivery_tag=method_frame.delivery_tag, requeue=True)
        except Exception as e:
            logger.error("Error: %s" % e)
            if channel and channel.is_open and not channel.is_closing:
                try:
                    # cancel() stops the consumer and returns the number of
                    # unprocessed messages that are requeued.
                    requeued_messages = channel.cancel()
                    logger.info("Requeued %i messages" % requeued_messages)
                except Exception:
                    # (was `except Exception as e`, which shadowed the outer e)
                    logger.warning("Could not cancel/shutdown neatly.")
            if warcwriter:
                warcwriter.cleanup()
                # Reset so a failure before the next assignment does not try
                # to clean up this (already cleaned) pool again.
                warcwriter = None
            logger.warning("Sleeping for 15 seconds before retrying...")
            time.sleep(15)
        except KeyboardInterrupt:
            # Tidy up:
            if warcwriter:
                warcwriter.cleanup()
            # quit
            sys.exit()
def run_harchiver():
    """Maintain a connection to the AMQP queue, consuming until interrupted.

    Any failure tears the consumer down (requeueing outstanding messages),
    releases the WARC writer pool, waits 15 seconds and reconnects.
    KeyboardInterrupt cleans up and exits the process.
    """
    # Fix: warcwriter must exist before the try block.  If WarcWriterPool(...)
    # raised on the very first statement of the try, both handlers' use of
    # `warcwriter` produced an UnboundLocalError and crashed the loop.
    warcwriter = None
    while True:
        channel = None
        try:
            logger.info("Setting up warc writer, in %s" % settings.output_directory)
            warcwriter = WarcWriterPool(gzip=True, output_dir=settings.output_directory)
            logger.info("Starting connection: %s" % (settings.amqp_url))
            parameters = pika.URLParameters(settings.amqp_url)
            connection = pika.BlockingConnection(parameters)
            channel = connection.channel()
            channel.exchange_declare(exchange=settings.exchange, type="direct", durable=True, auto_delete=False)
            channel.queue_declare(queue=settings.in_queue, durable=True, exclusive=False, auto_delete=False)
            channel.queue_bind(queue=settings.in_queue, exchange=settings.exchange, routing_key=settings.binding_key)
            # Cap in-flight unacknowledged messages for this consumer.
            channel.basic_qos(prefetch_count=settings.qos_num)
            logger.info("Started connection: %s" % (settings.amqp_url))
            for method_frame, properties, body in channel.consume(
                    settings.in_queue):
                handled = callback(warcwriter, body)
                if handled is True:
                    channel.basic_ack(method_frame.delivery_tag)
                else:
                    # Not handled: push it back onto the queue to retry later.
                    channel.basic_reject(
                        delivery_tag=method_frame.delivery_tag, requeue=True)
        except Exception as e:
            logger.error("Error: %s" % e)
            if channel and channel.is_open and not channel.is_closing:
                try:
                    # Stop consuming; pika requeues and counts the leftovers.
                    requeued_messages = channel.cancel()
                    logger.info("Requeued %i messages" % requeued_messages)
                except Exception:
                    # (previously `as e`, shadowing the outer exception)
                    logger.warning("Could not cancel/shutdown neatly.")
            if warcwriter:
                warcwriter.cleanup()
                # Avoid a second cleanup of the same pool on the next failure.
                warcwriter = None
            logger.warning("Sleeping for 15 seconds before retrying...")
            time.sleep(15)
        except KeyboardInterrupt:
            # Tidy up:
            if warcwriter:
                warcwriter.cleanup()
            # quit
            sys.exit()
def run_harchiver():
    """Maintains a connection to the queue."""
    # A single writer pool is shared across all reconnect attempts.
    warcwriter = WarcWriterPool(gzip=True, output_dir=settings.OUTPUT_DIRECTORY)
    while True:
        channel = None
        try:
            logger.info("Starting connection: %s" % (settings.AMQP_URL))
            conn = pika.BlockingConnection(pika.URLParameters(settings.AMQP_URL))
            channel = conn.channel()
            # Declare the exchange/queue pair and bind them together.
            channel.exchange_declare(exchange=settings.AMQP_EXCHANGE,
                                     type="direct", durable=True,
                                     auto_delete=False)
            channel.queue_declare(queue=settings.AMQP_QUEUE, durable=True,
                                  exclusive=False, auto_delete=False)
            channel.queue_bind(queue=settings.AMQP_QUEUE,
                               exchange=settings.AMQP_EXCHANGE,
                               routing_key=settings.AMQP_KEY)
            logger.info("Started connection: %s" % (settings.AMQP_URL))
            # Consume forever, acknowledging each message once processed.
            for frame, header, payload in channel.consume(settings.AMQP_QUEUE):
                callback(warcwriter, payload)
                channel.basic_ack(frame.delivery_tag)
        except Exception as e:
            logger.error("Error: %s" % e)
            if channel and channel.is_open and not channel.is_closing:
                try:
                    # Cancelling the consumer requeues unprocessed messages.
                    requeued = channel.cancel()
                    logger.info("Requeued %i messages" % requeued)
                except Exception as e:
                    logger.warning("Could not cancel/shutdown neatly.")
            logger.warning("Sleeping for 15 seconds before retrying...")
            time.sleep(15)
# NOTE(review): collapsed/whitespace-mangled fragment, left byte-identical.
# It is the tail of a getvideo(...) function (the `if` matching the `else:`
# below lies outside this view) followed by the script's __main__ entry point.
# Python 2 (print statement).  Observations for a future fix:
#   - `if len( videoblock ) == 0 or videoblock is None:` calls len() BEFORE
#     the None test, so a None videoblock raises TypeError instead of being
#     caught — reverse the operands.
#   - `etree.tostring( object )` reads a variable literally named `object`,
#     shadowing the builtin; presumably an lxml element — confirm and rename.
#   - In __main__, the else branch calls getvideo(sys.argv[1]) rather than
#     getvideo(arg); with several non-timestamped arguments only the first is
#     ever fetched — looks like a bug, confirm against intended CLI usage.
videoblock = "".join( [ httpheaders( video.raw._original_response ), video.content ] ) writemetadata( video_url, video_uuid, base64.b64encode(r.content), index, page ) else: video_url = url video_date = warc_datetime_str( datetime.now() ) video_type = WarcRecord.RESOURCE content_type = "video/mp4" writemetadata( video_url, video_uuid, base64.b64encode( etree.tostring( object ).strip() ), index, page ) videoblock = streamvideo( video_url ) if len( videoblock ) == 0 or videoblock is None: print "ERROR: Couldn't stream video; %s" % video_url continue headers = [ ( WarcRecord.TYPE, video_type ), ( WarcRecord.URL, video_url ), ( WarcRecord.DATE, video_date ), ( WarcRecord.ID, video_uuid ), ( WarcRecord.CONTENT_TYPE, content_type ), ] warcwriter.write_record( headers, content_type, videoblock ) if __name__ == "__main__": warcwriter = WarcWriterPool( gzip=True, write_warcinfo=False ) for arg in sys.argv[ 1: ]: if arg[ 0 ].isdigit(): timestamp, url = arg.split( "/", 1 ) getvideo( url, timestamp=timestamp ) else: getvideo(sys.argv[1]) warcwriter.cleanup()
# NOTE(review): collapsed fragment, incomplete at BOTH ends; left byte-identical.
# It opens with the stray closing `]` of a headers list begun outside this
# view, a write_record call, then an argparse-driven __main__ block that is
# cut off mid `for iframe in ...` loop at the end.  Python 2 (print
# statement).  Observations for a future fix:
#   - `warcwriter = WarcWriterPool(...)` is created unconditionally but no
#     cleanup() is visible in this fragment — verify the writer is closed on
#     every exit path, including the sys.exit(1) after a failed requests.get.
#   - The `-u`/`-m` options are parsed but not used in the visible code;
#     presumably consumed further down — confirm before removing.
] warcwriter.write_record( headers, mime, data ) if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "-m", dest="multiple", help="Multiple, comma-separated timestamp/page values." ) parser.add_argument( "-p", dest="page", help="Embedding page." ) parser.add_argument( "-t", dest="timestamp", help="Embedding page timestamp." ) parser.add_argument( "-x", dest="xpath", help="XPath to element." ) parser.add_argument( "-u", dest="url", help="Video URL." ) parser.add_argument( "-f", dest="filename", help="Filename on disk." ) parser.add_argument( "-l", dest="playlist", help="Playlist of videos." ) parser.add_argument( "-y", action="store_true", help="YouTube videos [iframes only]." ) args = parser.parse_args() warcwriter = WarcWriterPool( gzip=True, write_warcinfo=False ) if args.playlist: write_playlist( args.page, args.timestamp, args.xpath, args.playlist, args.filename ) elif not args.filename: if args.y: r = requests.get( args.page ) if not r.ok: print "ERROR: %s" % r.content sys.exit( 1 ) ydl = youtube_dl.YoutubeDL() ydl.add_default_info_extractors() htmlparser = etree.HTMLParser() root = etree.fromstring( r.content, htmlparser ) for iframe in root.xpath( "//iframe[contains(@src,'www.youtube.com/embed/')]" ): yurl = iframe.attrib["src"]
# NOTE(review): collapsed fragment, incomplete at BOTH ends; left byte-identical.
# A reformatted revision of the CLI block on the previous line: it begins
# mid-way through parser.add_argument("-m", ...) (the call's opening is
# outside this view) and is cut off inside `for iframe in root.xpath(` at the
# end.  Python 2 (print statement).  Same review notes apply: confirm
# warcwriter.cleanup() exists on all exit paths (none visible here, and
# sys.exit(1) fires after a failed requests.get), and confirm the parsed
# `-u`/`-m` options are consumed in the unseen remainder.
dest="multiple", help="Multiple, comma-separated timestamp/page values.") parser.add_argument("-p", dest="page", help="Embedding page.") parser.add_argument("-t", dest="timestamp", help="Embedding page timestamp.") parser.add_argument("-x", dest="xpath", help="XPath to element.") parser.add_argument("-u", dest="url", help="Video URL.") parser.add_argument("-f", dest="filename", help="Filename on disk.") parser.add_argument("-l", dest="playlist", help="Playlist of videos.") parser.add_argument("-y", action="store_true", help="YouTube videos [iframes only].") args = parser.parse_args() warcwriter = WarcWriterPool(gzip=True, write_warcinfo=False) if args.playlist: write_playlist(args.page, args.timestamp, args.xpath, args.playlist, args.filename) elif not args.filename: if args.y: r = requests.get(args.page) if not r.ok: print "ERROR: %s" % r.content sys.exit(1) ydl = youtube_dl.YoutubeDL() ydl.add_default_info_extractors() htmlparser = etree.HTMLParser() root = etree.fromstring(r.content, htmlparser) for iframe in root.xpath(
# Entry-point section of the tweet archiver: configure logging, parse the
# command line, then stream matching tweets into a WARC writer pool.
logger = logging.getLogger("archiver")
logger.setLevel(logging.WARNING)
logger.addHandler(logging.StreamHandler(sys.stdout))
logging.root.setLevel(logging.WARNING)

parser = argparse.ArgumentParser(description="Archiving tweets.")
parser.add_argument("-u", "--users", type=str,
                    help="Comma-separated list of users to follow.")
parser.add_argument("-t", "--terms", type=str,
                    help="Comma-separated list of terms to track.")
args = parser.parse_args()

# Split the comma-separated option values; absent flags yield empty lists.
users = args.users.split(",") if args.users is not None else []
terms = args.terms.split(",") if args.terms is not None else []

# At least one of --users/--terms is required.
if not (users or terms):
    parser.print_help()
    sys.exit(1)

w = WarcWriterPool(gzip=True)
try:
    if users:
        # The filter's follow= argument wants user IDs, so translate the
        # screen names first.
        users = screen_names_to_ids(auth, users)
    stream = tweepy.Stream(auth=auth, listener=StreamListener(writer=w))
    stream.filter(follow=users, track=terms)
except KeyboardInterrupt:
    sys.exit(0)
finally:
    # Fix: cleanup() previously ran only on KeyboardInterrupt, leaking the
    # writer pool when the stream raised any other exception (or returned).
    w.cleanup()
# NOTE(review): collapsed fragment, left byte-identical.  A reformatted
# revision of the getvideo(...) tail two entries above: it begins mid-body
# (the function's `def` and the branch structure above `video_url = url` are
# outside this view) and ends with the __main__ block.  Python 2 (print
# statement).  The same latent issues persist in this revision:
#   - `if len(videoblock) == 0 or videoblock is None:` — len() runs before
#     the None check, so None raises TypeError; swap the operands.
#   - `etree.tostring(object)` shadows the `object` builtin.
#   - __main__'s else branch calls getvideo(sys.argv[1]) instead of
#     getvideo(arg), so extra plain-URL arguments are ignored — confirm.
video_url = url video_date = warc_datetime_str(datetime.now()) video_type = WarcRecord.RESOURCE content_type = "video/mp4" writemetadata(video_url, video_uuid, base64.b64encode(etree.tostring(object).strip()), index, page) videoblock = streamvideo(video_url) if len(videoblock) == 0 or videoblock is None: print "ERROR: Couldn't stream video; %s" % video_url continue headers = [ (WarcRecord.TYPE, video_type), (WarcRecord.URL, video_url), (WarcRecord.DATE, video_date), (WarcRecord.ID, video_uuid), (WarcRecord.CONTENT_TYPE, content_type), ] warcwriter.write_record(headers, content_type, videoblock) if __name__ == "__main__": warcwriter = WarcWriterPool(gzip=True, write_warcinfo=False) for arg in sys.argv[1:]: if arg[0].isdigit(): timestamp, url = arg.split("/", 1) getvideo(url, timestamp=timestamp) else: getvideo(sys.argv[1]) warcwriter.cleanup()