def test_download_extract_archive(self):
     # Generate a gzipped tarfile
     output_filename = os.path.join(self.base_path, 'subfolder.tar.gz')
     output_dirname = os.path.join(self.base_path, 'subfolder')
     extracted_filename = os.path.join(output_dirname, 'subfolder_text.txt')
     with tarfile.open(output_filename, 'w:gz') as tar:
         tar.add(output_dirname, arcname='subfolder')
     shutil.rmtree(output_dirname)
     sha1_hash = download_from_google_storage.get_sha1(output_filename)
     input_filename = '%s/%s' % (self.base_url, sha1_hash)
     self.queue.put((sha1_hash, output_filename))
     self.queue.put((None, None))
     stdout_queue = Queue.Queue()
     download_from_google_storage._downloader_worker_thread(0,
                                                            self.queue,
                                                            True,
                                                            self.base_url,
                                                            self.gsutil,
                                                            stdout_queue,
                                                            self.ret_codes,
                                                            True,
                                                            True,
                                                            delete=False)
     expected_calls = [('check_call', ('ls', input_filename)),
                       ('check_call', ('cp', input_filename,
                                       output_filename))]
     if sys.platform != 'win32':
         expected_calls.append(
             ('check_call', ('stat', 'gs://sometesturl/%s' % sha1_hash)))
     expected_output = ['0> Downloading %s...' % output_filename]
     expected_output.extend([
         '0> Extracting 3 entries from %s to %s' %
         (output_filename, output_dirname)
     ])
     expected_ret_codes = []
     self.assertEqual(list(stdout_queue.queue), expected_output)
     self.assertEqual(self.gsutil.history, expected_calls)
     self.assertEqual(list(self.ret_codes.queue), expected_ret_codes)
     self.assertTrue(os.path.exists(output_dirname))
     self.assertTrue(os.path.exists(extracted_filename))
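For reference, here is a minimal sketch of the worker loop this test exercises, reconstructed only from the calls and messages the test asserts (expected_calls and expected_output). The function name, parameter names, and the .tar.gz extraction heuristic are assumptions, not the actual download_from_google_storage implementation.

import os
import sys
import tarfile


def downloader_worker_sketch(thread_num, work_queue, base_url, gsutil,
                             out_queue, extract=True):
    # Sketch only: inferred from the test above, not copied from depot_tools.
    while True:
        sha1_hash, output_filename = work_queue.get()
        if sha1_hash is None:  # (None, None) marks the end of the work queue.
            return
        file_url = '%s/%s' % (base_url, sha1_hash)
        gsutil.check_call('ls', file_url)  # Verify the object exists remotely.
        out_queue.put('%d> Downloading %s...' % (thread_num, output_filename))
        gsutil.check_call('cp', file_url, output_filename)  # Fetch the blob.
        if extract and output_filename.endswith('.tar.gz'):
            output_dirname = output_filename[:-len('.tar.gz')]
            with tarfile.open(output_filename, 'r:gz') as tar:
                out_queue.put('%d> Extracting %d entries from %s to %s' %
                              (thread_num, len(tar.getmembers()),
                               output_filename, output_dirname))
                tar.extractall(path=os.path.dirname(output_filename))
        if sys.platform != 'win32':
            # The test also expects a 'stat' call on non-Windows platforms.
            gsutil.check_call('stat', file_url)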
def upload_to_google_storage(input_filenames, base_url, gsutil, force, use_md5,
                             num_threads, skip_hashing):
    # We only want one MD5 calculation happening at a time to avoid HD thrashing.
    md5_lock = threading.Lock()

    # Start up all the worker threads plus the printer thread.
    all_threads = []
    ret_codes = Queue.Queue()
    ret_codes.put((0, None))
    upload_queue = Queue.Queue()
    upload_timer = time.time()
    stdout_queue = Queue.Queue()
    printer_thread = threading.Thread(target=printer_worker,
                                      args=[stdout_queue])
    printer_thread.daemon = True
    printer_thread.start()
    for thread_num in range(num_threads):
        t = threading.Thread(target=_upload_worker,
                             args=[
                                 thread_num, upload_queue, base_url, gsutil,
                                 md5_lock, force, use_md5, stdout_queue,
                                 ret_codes
                             ])
        t.daemon = True
        t.start()
        all_threads.append(t)

     # We want to hash everything in a single thread since it's faster.
     # The bottleneck is in disk IO, not CPU.
    hashing_start = time.time()
    for filename in input_filenames:
        if not os.path.exists(filename):
            stdout_queue.put('Main> Error: %s not found, skipping.' % filename)
            continue
        if os.path.exists('%s.sha1' % filename) and skip_hashing:
            stdout_queue.put(
                'Main> Found hash for %s, sha1 calculation skipped.' %
                filename)
            with open(filename + '.sha1', 'rb') as f:
                sha1_file = f.read(1024)
            if not re.match('^([a-z0-9]{40})$', sha1_file):
                print >> sys.stderr, 'Invalid sha1 hash file %s.sha1' % filename
                return 1
            upload_queue.put((filename, sha1_file))
            continue
        stdout_queue.put('Main> Calculating hash for %s...' % filename)
        sha1_sum = get_sha1(filename)
        with open(filename + '.sha1', 'wb') as f:
            f.write(sha1_sum)
        stdout_queue.put('Main> Done calculating hash for %s.' % filename)
        upload_queue.put((filename, sha1_sum))
    hashing_duration = time.time() - hashing_start

    # Wait for everything to finish.
    for _ in all_threads:
        upload_queue.put((None, None))  # To mark the end of the work queue.
    for t in all_threads:
        t.join()
    stdout_queue.put(None)
    printer_thread.join()

    # Print timing information.
    print 'Hashing %s files took %1f seconds' % (len(input_filenames),
                                                 hashing_duration)
    print 'Uploading took %1f seconds' % (time.time() - upload_timer)

    # See if we ran into any errors.
    max_ret_code = 0
    for ret_code, message in ret_codes.queue:
        max_ret_code = max(ret_code, max_ret_code)
        if message:
            print >> sys.stderr, message

    if not max_ret_code:
        print 'Success!'

    return max_ret_code
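
upload_to_google_storage relies on two helpers, printer_worker and _upload_worker, that are not part of this excerpt. The sketches below show one plausible shape for them, inferred from how they are called above; the bodies (and the assumption that gsutil.check_call returns a (code, out, err) tuple) are not taken from depot_tools.

def printer_worker(output_queue):
    # Drain messages until the main thread enqueues None after the workers join.
    while True:
        line = output_queue.get()
        if line is None:
            return
        print line


def _upload_worker(thread_num, upload_queue, base_url, gsutil, md5_lock,
                   force, use_md5, out_queue, ret_codes):
    # md5_lock, force and use_md5 drive re-upload checks in the real tool; they
    # are accepted but unused in this sketch.
    while True:
        filename, sha1_sum = upload_queue.get()
        if filename is None:  # (None, None) marks the end of the work queue.
            return
        file_url = '%s/%s' % (base_url, sha1_sum)
        out_queue.put('%d> Uploading %s...' % (thread_num, filename))
        code, _, err = gsutil.check_call('cp', filename, file_url)
        if code:
            ret_codes.put((code, 'Upload of %s failed: %s' % (filename, err)))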
 def test_get_sha1(self):
     lorem_ipsum = os.path.join(self.base_path, 'lorem_ipsum.txt')
     self.assertEqual(download_from_google_storage.get_sha1(lorem_ipsum),
                      '7871c8e24da15bad8b0be2c36edc9dc77e37727f')
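
Both the test above and upload_to_google_storage depend on get_sha1. A minimal sketch of a streaming SHA-1 helper with that shape (the 1 MB chunk size is arbitrary, not taken from depot_tools):

import hashlib


def get_sha1(filename):
    sha1 = hashlib.sha1()
    with open(filename, 'rb') as f:
        # Hash in chunks so large files are never held in memory all at once.
        while True:
            chunk = f.read(1024 * 1024)
            if not chunk:
                break
            sha1.update(chunk)
    return sha1.hexdigest()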