Esempio n. 1
0
# Mirror Common Crawl objects listed on stdin (one path per line) into an
# Azure blob container, keeping a bounded number of copies in flight.
#
# NOTE(review): `db`, `bucket` and `in_progress` are not defined in this
# snippet; they are expected to be set up by code that precedes it.

# Names of blobs already present under the crawl prefix; these are skipped.
existing = set(
    x.name
    for x in db.list_blobs(bucket, 'common-crawl/crawl-data/CC-MAIN-2014-23/')
)
# Drop blank lines so an empty path is never submitted as a copy request.
todo = [line.rstrip() for line in sys.stdin]
todo = [path for path in todo if path and path not in existing]

while todo or in_progress:
    # Poll every in-flight copy and keep only the ones that are still running.
    new_progress = set()
    for path in in_progress:
        props = db.get_blob_properties(bucket, path)
        status = props['x-ms-copy-status']
        if status == 'success':
            print('\t%s completed' % path)
            continue
        if status in ('failed', 'aborted'):
            # Terminal states: such a copy will never reach 'success', so
            # keeping it queued would make this loop spin forever.
            sys.stderr.write('\t%s copy %s\n' % (path, status))
            continue
        new_progress.add(path)
        # Brief pause between polls of still-pending copies.
        time.sleep(0.25)
    print('Task queue length is %d' % len(new_progress))
    print('TODO queue length is %d' % len(todo))
    # Populate the queue, capped at 256 concurrent copies.
    while todo and len(new_progress) < 256:
        path = todo.pop()
        # If it exists, skip it -- only add if it's missing
        try:
            db.get_blob_properties(bucket, path)
        except WindowsAzureMissingResourceError:
            db.copy_blob(bucket, path,
                         'https://commoncrawl.s3.amazonaws.com/' + path)
            new_progress.add(path)
    in_progress = new_progress
    # Only wait for the next polling round if there is anything left to do.
    if todo or in_progress:
        time.sleep(300)
Esempio n. 2
0
###
# Mirror objects listed on stdin (one path per line) from the public S3
# bucket into an Azure blob container, keeping a bounded number of copies
# in flight. `db` is the blob service client, created outside this snippet.
bucket = 'crawl-data'
in_progress = set()

# Names of blobs already present under the crawl prefix; these are skipped.
existing = set(x.name for x in db.list_blobs(bucket, 'common-crawl/crawl-data/CC-MAIN-2014-23/'))
# Drop blank lines so an empty path is never submitted as a copy request.
todo = [line.rstrip() for line in sys.stdin]
todo = [path for path in todo if path and path not in existing]

while todo or in_progress:
  # Poll every in-flight copy and keep only the ones that are still running.
  new_progress = set()
  for path in in_progress:
    props = db.get_blob_properties(bucket, path)
    status = props['x-ms-copy-status']
    if status == 'success':
      print('\t%s completed' % path)
      continue
    if status in ('failed', 'aborted'):
      # Terminal states: such a copy will never reach 'success', so keeping
      # it queued would make this loop spin forever.
      sys.stderr.write('\t%s copy %s\n' % (path, status))
      continue
    new_progress.add(path)
    # Brief pause between polls of still-pending copies.
    time.sleep(0.25)
  print('Task queue length is %d' % len(new_progress))
  print('TODO queue length is %d' % len(todo))
  # Populate the queue, capped at 256 concurrent copies.
  while todo and len(new_progress) < 256:
    path = todo.pop()
    # If it exists, skip it -- only add if it's missing
    try:
      db.get_blob_properties(bucket, path)
    except WindowsAzureMissingResourceError:
      db.copy_blob(bucket, path, 'https://aws-publicdatasets.s3.amazonaws.com/' + path)
      new_progress.add(path)
  in_progress = new_progress
  # Only wait for the next polling round if there is anything left to do.
  if todo or in_progress:
    time.sleep(300)
Esempio n. 3
0
from azure.storage import BlobService
import sys

from getpass import getpass

# Copy every blob from one container to another within the same storage
# account. Usage: <script> SOURCE_CONTAINER TARGET_CONTAINER

# Storage account that holds both the source and the target container.
ACCOUNT_NAME = 'videopathmobilefiles'

# Validate the command line before prompting for the secret, so a missing
# argument yields a usage message instead of an IndexError.
if len(sys.argv) < 3:
    sys.exit("usage: %s SOURCE_CONTAINER TARGET_CONTAINER" % sys.argv[0])
source = sys.argv[1]
target = sys.argv[2]

# getpass reads the key without echoing it to the terminal.
key = getpass("Please enter azure videopath blob storage key: ")
blob_service = BlobService(account_name=ACCOUNT_NAME, account_key=key)

print(source + " -> " + target)

# The target container is created with container-level public access.
blob_service.create_container(target, x_ms_blob_public_access='container')

# Copy sources are passed as '/<account>/<container>/<blob>' paths.
source_prefix = '/' + ACCOUNT_NAME + '/' + source + '/'

for b in blob_service.list_blobs(source):
    blob_service.copy_blob(target, b.name, source_prefix + b.name)
    print(b.name)
Esempio n. 4
0
###
# Mirror Common Crawl objects listed on stdin (one path per line) into an
# Azure blob container, keeping a bounded number of copies in flight.
# `db` is the blob service client, created outside this snippet.
bucket = 'crawl-data'
in_progress = set()

# Names of blobs already present under the crawl prefix; these are skipped.
existing = set(x.name for x in db.list_blobs(bucket, 'common-crawl/crawl-data/CC-MAIN-2014-23/'))
# Drop blank lines so an empty path is never submitted as a copy request.
todo = [line.rstrip() for line in sys.stdin]
todo = [path for path in todo if path and path not in existing]

while todo or in_progress:
  # Poll every in-flight copy and keep only the ones that are still running.
  new_progress = set()
  for path in in_progress:
    props = db.get_blob_properties(bucket, path)
    status = props['x-ms-copy-status']
    if status == 'success':
      print('\t%s completed' % path)
      continue
    if status in ('failed', 'aborted'):
      # Terminal states: such a copy will never reach 'success', so keeping
      # it queued would make this loop spin forever.
      sys.stderr.write('\t%s copy %s\n' % (path, status))
      continue
    new_progress.add(path)
    # Brief pause between polls of still-pending copies.
    time.sleep(0.25)
  print('Task queue length is %d' % len(new_progress))
  print('TODO queue length is %d' % len(todo))
  # Populate the queue, capped at 256 concurrent copies.
  while todo and len(new_progress) < 256:
    path = todo.pop()
    # If it exists, skip it -- only add if it's missing
    try:
      db.get_blob_properties(bucket, path)
    except WindowsAzureMissingResourceError:
      db.copy_blob(bucket, path, 'https://commoncrawl.s3.amazonaws.com/' + path)
      new_progress.add(path)
  in_progress = new_progress
  # Only wait for the next polling round if there is anything left to do.
  if todo or in_progress:
    time.sleep(300)