# Mirror Common Crawl objects from S3 into an Azure blob container using
# server-side copies, keeping a bounded number of copies in flight.
#
# NOTE(review): `bucket`, `db` and `in_progress` are used below but are not
# defined in this fragment -- they are expected to be set up before this point.

# Names of the blobs already present under the crawl prefix; used to prune
# the work list up front.
existing = set(
    x.name
    for x in db.list_blobs(bucket, 'common-crawl/crawl-data/CC-MAIN-2014-23/')
)

# One blob path per stdin line, minus the ones that are already there.
todo = [line.rstrip() for line in sys.stdin.readlines()]
todo = [path for path in todo if path not in existing]

# Run until every path has been submitted and no submitted copy is pending.
while todo or in_progress:
    new_progress = set()
    for path in in_progress:
        props = db.get_blob_properties(bucket, path)
        status = props['x-ms-copy-status']
        if status == 'success':
            print('\t%s completed' % path)
            continue
        if status in ('failed', 'aborted'):
            # These states never turn into 'success'; if such a path stayed
            # in the in-flight set the outer loop could never terminate.
            print('\t%s %s' % (path, status))
            continue
        new_progress.add(path)
        # Short pause between polls of still-pending copies.
        time.sleep(0.25)

    print('Task queue length is %d' % len(new_progress))
    print('TODO queue length is %d' % len(todo))

    # Populate the queue: keep at most 256 copies in flight.
    while todo and len(new_progress) < 256:
        path = todo.pop()
        # If it exists, skip it -- only add if it's missing.  This also
        # covers any path the initial listing did not report.
        try:
            db.get_blob_properties(bucket, path)
        except WindowsAzureMissingResourceError:
            db.copy_blob(bucket, path, 'https://commoncrawl.s3.amazonaws.com/' + path)
            new_progress.add(path)

    in_progress = new_progress
    # Wait before the next polling round, but not once everything is done.
    if todo or in_progress:
        time.sleep(300)
# Mirror public-dataset objects from S3 into an Azure blob container using
# server-side copies, keeping a bounded number of copies in flight.

# Target container and the set of paths whose copy has been started but has
# not yet been seen to finish.
bucket = 'crawl-data'
in_progress = set()

# Names of the blobs already present under the crawl prefix; used to prune
# the work list up front.
existing = set(
    x.name
    for x in db.list_blobs(bucket, 'common-crawl/crawl-data/CC-MAIN-2014-23/')
)

# One blob path per stdin line, minus the ones that are already there.
todo = [line.rstrip() for line in sys.stdin.readlines()]
todo = [path for path in todo if path not in existing]

# Run until every path has been submitted and no submitted copy is pending.
while todo or in_progress:
    new_progress = set()
    for path in in_progress:
        props = db.get_blob_properties(bucket, path)
        status = props['x-ms-copy-status']
        if status == 'success':
            print('\t%s completed' % path)
            continue
        if status in ('failed', 'aborted'):
            # These states never turn into 'success'; if such a path stayed
            # in the in-flight set the outer loop could never terminate.
            print('\t%s %s' % (path, status))
            continue
        new_progress.add(path)
        # Short pause between polls of still-pending copies.
        time.sleep(0.25)

    print('Task queue length is %d' % len(new_progress))
    print('TODO queue length is %d' % len(todo))

    # Populate the queue: keep at most 256 copies in flight.
    while todo and len(new_progress) < 256:
        path = todo.pop()
        # If it exists, skip it -- only add if it's missing.  This also
        # covers any path the initial listing did not report.
        try:
            db.get_blob_properties(bucket, path)
        except WindowsAzureMissingResourceError:
            db.copy_blob(bucket, path, 'https://aws-publicdatasets.s3.amazonaws.com/' + path)
            new_progress.add(path)

    in_progress = new_progress
    # Wait before the next polling round, but not once everything is done.
    if todo or in_progress:
        time.sleep(300)
# Copy every blob of one container into another container of the same
# storage account.
#
# Usage: <script> <source-container> <target-container>
# The storage key is read interactively.
import sys

from azure.storage import BlobService

# Storage account that holds both the source and the target container.
ACCOUNT_NAME = 'videopathmobilefiles'

# Validate the arguments before prompting, so a bad invocation fails with a
# usage hint instead of an IndexError after the key has been typed in.
if len(sys.argv) < 3:
    sys.exit('usage: %s <source-container> <target-container>' % sys.argv[0])
source = sys.argv[1]
target = sys.argv[2]

key = raw_input("Please enter azure vidoepath blob storage key: ")
blob_service = BlobService(account_name=ACCOUNT_NAME, account_key=key)

print(source + " -> " + target)

# The target container is created with container-level public access.
blob_service.create_container(target, x_ms_blob_public_access='container')

# NOTE(review): only the blobs returned by this single list_blobs call are
# copied -- confirm whether the SDK pages large listings.
for b in blob_service.list_blobs(source):
    name = b.name
    # The copy source is addressed as /<account>/<container>/<blob>.
    source_path = '/' + ACCOUNT_NAME + '/' + source + '/' + name
    blob_service.copy_blob(target, name, source_path)
    print(name)
# Mirror Common Crawl objects from S3 into an Azure blob container using
# server-side copies, keeping a bounded number of copies in flight.

# Target container and the set of paths whose copy has been started but has
# not yet been seen to finish.
bucket = 'crawl-data'
in_progress = set()

# Names of the blobs already present under the crawl prefix; used to prune
# the work list up front.
existing = set(
    x.name
    for x in db.list_blobs(bucket, 'common-crawl/crawl-data/CC-MAIN-2014-23/')
)

# One blob path per stdin line, minus the ones that are already there.
todo = [line.rstrip() for line in sys.stdin.readlines()]
todo = [path for path in todo if path not in existing]

# Run until every path has been submitted and no submitted copy is pending.
while todo or in_progress:
    new_progress = set()
    for path in in_progress:
        props = db.get_blob_properties(bucket, path)
        status = props['x-ms-copy-status']
        if status == 'success':
            print('\t%s completed' % path)
            continue
        if status in ('failed', 'aborted'):
            # These states never turn into 'success'; if such a path stayed
            # in the in-flight set the outer loop could never terminate.
            print('\t%s %s' % (path, status))
            continue
        new_progress.add(path)
        # Short pause between polls of still-pending copies.
        time.sleep(0.25)

    print('Task queue length is %d' % len(new_progress))
    print('TODO queue length is %d' % len(todo))

    # Populate the queue: keep at most 256 copies in flight.
    while todo and len(new_progress) < 256:
        path = todo.pop()
        # If it exists, skip it -- only add if it's missing.  This also
        # covers any path the initial listing did not report.
        try:
            db.get_blob_properties(bucket, path)
        except WindowsAzureMissingResourceError:
            db.copy_blob(bucket, path, 'https://commoncrawl.s3.amazonaws.com/' + path)
            new_progress.add(path)

    in_progress = new_progress
    # Wait before the next polling round, but not once everything is done.
    if todo or in_progress:
        time.sleep(300)