Example #1
def _upload_s3(datafiles, job_id, bucket_name='infernyx'):
    rval = []

    conn = boto.connect_s3()
    bucket = conn.get_bucket(bucket_name, validate=False)

    for tmp_file_list, _, tablename, columns in datafiles:
        s3_entries = []
        for tmpfile in tmp_file_list:
            with open(tmpfile) as f:
                md5 = compute_md5(f)

            k = Key(bucket)
            k.key = "%s-%s" % (job_id, tmpfile)

            _log(job_id, "->S3 %s/%s" % (bucket_name, k.key))
            k.set_contents_from_filename(tmpfile, md5=md5, replace=True)

            s3_entry = {"url": "s3://%s/%s" % (bucket_name, k.key), "mandatory": True}
            s3_entries.append(s3_entry)

        # upload the manifest
        prefix = tmp_file_list[0].rsplit('.')[0]
        manifest = ujson.dumps({"entries": s3_entries})
        manifest_key = Key(bucket)
        manifest_key.key = "%s.%s.manifest" % (job_id, prefix)
        _log(job_id, "->S3 %s/%s: %s" % (bucket_name, manifest_key.key, manifest))
        manifest_key.set_contents_from_string(manifest)

        # store manifest
        rval.append(DataFile(tmp_file_list, (bucket_name, manifest_key.key), tablename, columns))

    return rval
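
For reference, a minimal sketch (not part of the example above) of what boto.utils.compute_md5 returns for a local file before it is handed to set_contents_from_filename; the file name is a placeholder and boto is assumed to be installed.

from boto.utils import compute_md5

# Hypothetical data file produced by an earlier step
with open('part-00000.csv', 'rb') as fp:
    hex_digest, b64_digest, size = compute_md5(fp)

print(hex_digest)   # hex MD5; matches the object's ETag for a simple PUT
print(b64_digest)   # base64 MD5; sent as the Content-MD5 header
print(size)         # number of bytes hashed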
Example #2
def upload_file(bucket, key_name, file_path, remote_prefix=None, policy='public-read', metadata=None):
    if not metadata:
        metadata = {}

    if remote_prefix:
        key_name = '{0}/{1}'.format(remote_prefix, key_name)

    fd = open(file_path)
    md5 = compute_md5(fd)
    fd.close()

    current_md5 = None
    current_key = bucket.lookup(key_name)
    if current_key:
        current_md5 = current_key.get_metadata('fabix-md5')
        if current_md5 == md5[0]:
            for k, v in metadata.iteritems():
                current_key.set_metadata(k, v)
            puts("Skip file {0}".format(file_path))
            return current_key

    key = bucket.new_key(key_name)

    for k, v in metadata.iteritems():
        key.set_metadata(k, v)

    key.set_metadata('fabix-md5', md5[0])

    puts("Upload file {0}".format(file_path))
    key.set_contents_from_filename(file_path, md5=md5, policy=policy)
    return key
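
A hedged usage sketch for upload_file() above; the bucket name, key, paths and metadata are placeholders, and it assumes boto can find AWS credentials in the environment.

import boto

conn = boto.connect_s3()                    # credentials come from the environment
bucket = conn.get_bucket('my-assets', validate=False)

# Re-running is cheap: upload_file() skips the PUT when the stored
# 'fabix-md5' metadata matches the local file's MD5.
key = upload_file(bucket, 'css/site.css', '/srv/site/css/site.css',
                  remote_prefix='v2', metadata={'owner': 'webteam'})
print(key.key)                              # 'v2/css/site.css'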
Example #3
    def setUp(self):
        self.mock_s3 = moto.mock_s3()
        self.mock_s3.start()
        self.s3_conn = boto.connect_s3()
        self.s3_conn.create_bucket('source_bucket')

        self.source_bucket = self.s3_conn.get_bucket('source_bucket')
        self.data = [
            {
                'name': 'Roberto',
                'birthday': '12/05/1987'
            },
            {
                'name': 'Claudia',
                'birthday': '21/12/1985'
            },
        ]
        with closing(
                self.source_bucket.new_key('some_prefix/test_key')) as key:
            with TmpFile() as tmp_filename:
                with open(tmp_filename, 'w') as f:
                    f.write(json.dumps(self.data))
                with open(tmp_filename) as f:
                    self.key_md5 = compute_md5(f)
            key.metadata = {'total': 2, 'md5': self.key_md5}
            key.set_contents_from_string(json.dumps(self.data))
        self.tmp_bypass_resume_file = 'tests/data/tmp_s3_bypass_resume_persistence.pickle'
        shutil.copyfile('tests/data/s3_bypass_resume_persistence.pickle',
                        self.tmp_bypass_resume_file)
Example #4
def upload_file(bucket, key_name, file_path, remote_prefix=None, policy='public-read', metadata=None):
    if not metadata:
        metadata = {}

    if remote_prefix:
        key_name = '{0}/{1}'.format(remote_prefix, key_name)

    fd = open(file_path)
    md5 = compute_md5(fd)
    fd.close()

    current_md5 = None
    current_key = bucket.lookup(key_name)
    if current_key:
        current_md5 = current_key.get_metadata('fabix-md5')
        if current_md5 == md5[0]:
            for k, v in metadata.iteritems():
                current_key.set_metadata(k, v)
            puts("Skip file {0}".format(file_path))
            return current_key

    key = bucket.new_key(key_name)

    for k, v in metadata.iteritems():
        key.set_metadata(k, v)

    key.set_metadata('fabix-md5', md5[0])

    puts("Upload file {0}".format(file_path))
    key.set_contents_from_filename(file_path, md5=md5, policy=policy)
    return key
Example #5
def putter(put, put_queue, stat_queue, options):
    logger = logging.getLogger(
        '%s[putter-%d]' %
        (os.path.basename(sys.argv[0]), current_process().pid))
    connection, bucket = None, None
    file_object_cache = FileObjectCache()
    while True:
        args = put_queue.get()
        if args is None:
            put_queue.task_done()
            break
        key_name, value_kwargs = args
        value = Value(file_object_cache, **value_kwargs)
        try:
            if connection is None:
                connection = S3Connection(is_secure=options.secure,
                                          host=options.host)
            if bucket is None:
                bucket = connection.get_bucket(options.bucket)
            key = put(bucket, key_name, value)
            if key:
                if options.headers:
                    headers = dict(
                        tuple(header.split(':', 1))
                        for header in options.headers)
                else:
                    headers = {}
                if options.content_type:
                    if options.content_type == "guess":
                        headers['Content-Type'] = mimetypes.guess_type(
                            value.path)[0]
                    else:
                        headers['Content-Type'] = options.content_type

                content = value.get_content()
                md5 = value.md5
                if options.gzip:
                    headers['Content-Encoding'] = 'gzip'
                    string_io = StringIO()
                    gzip_file = GzipFile(compresslevel=9,
                                         fileobj=string_io,
                                         mode='w')
                    gzip_file.write(content)
                    gzip_file.close()
                    content = string_io.getvalue()
                    md5 = compute_md5(StringIO(content))
                if not options.dry_run:
                    key.set_contents_from_string(content,
                                                 headers,
                                                 md5=md5,
                                                 policy=options.grant)
                logger.info('%s -> %s' % (value.path, key.name))
                stat_queue.put(dict(size=value.get_size()))
            else:
                logger.info('skipping %s -> %s' % (value.path, key_name))
        except SSLError as exc:
            logger.error('%s -> %s (%s)' % (value.path, key_name, exc))
            put_queue.put(args)
            connection, bucket = None, None
        put_queue.task_done()
Example #6
def putter(put, put_queue, stat_queue, options):
	pid=current_process().pid
	log = logging.getLogger(os.path.basename(sys.argv[0]))
	connection, bucket = None, None
	file_object_cache = FileObjectCache()


	if options.gzip:
		pass
		#log.debug('These content types will be gzipped: %s' % unicode(gzip_content_types))
	while True:
		args = put_queue.get()
		#print args, pid
		if args is None:
			put_queue.task_done()
			break
		key_name, value_kwargs = args
		#print(666,value_kwargs)
		if options.gzip:
			key_name = '%s.gz' %  key_name
		value = Value(file_object_cache, **value_kwargs)
		try:
			if connection is None:
				connection = S3Connection(is_secure=options.secure, host=options.host)
			if bucket is None:
				bucket = connection.get_bucket(options.bucket, validate=False)
			key = put(bucket, key_name, value)
			if key:
				if value.should_copy_content():
					if options.headers:
						headers = dict(tuple(header.split(':', 1)) for header in options.headers)
					else:
						headers = {}

					content = value.get_content()
					md5 = value.md5  # keep md5 bound even when gzip is off (cf. Example #41)

					if options.gzip:
						headers['Content-Encoding'] = 'gzip'
						string_io = StringIO()
						gzip_file = GzipFile(compresslevel=1, fileobj=string_io, mode='w')
						gzip_file.write(content)
						gzip_file.close()
						content = string_io.getvalue()
						md5 = compute_md5(StringIO(content))
					if not options.dry_run:
						key.set_contents_from_string(content, headers, md5=md5, policy=options.grant, encrypt_key=options.encrypt_key)
				#log.info('%s %s> %s' % (value.path, 'z' if options.gzip else '-', key.name))
				stat_queue.put(dict(size=value.get_size()))
			else:
				log.info('skipping %s -> %s' % (value.path, key_name))
		except SSLError as exc:
			log.error('%s -> %s (%s)' % (value.path, key_name, exc))
			put_queue.put(args)
			connection, bucket = None, None
		except IOError as exc:
			log.error('%s -> %s (%s)' % (value.path, key_name, exc))
		put_queue.task_done()
Example #7
 def __get__(self, instance, owner):
     try:
         return (instance.md5, instance.b64md5)
     except AttributeError:
         instance.seek(0)
         (instance.md5, instance.b64md5, size) = compute_md5(
             instance, instance.bytes)
         instance.seek(0)
         return (instance.md5, instance.b64md5)
Example #8
 def test_catches_bad_md5(self):
     '''Make sure Riak CS catches a bad content-md5 header'''
     key_name = str(uuid.uuid4())
     bucket = self.conn.create_bucket(self.bucket_name)
     key = Key(bucket, key_name)
     s = StringIO('not the real content')
     x = compute_md5(s)
     with self.assertRaises(S3ResponseError):
         key.set_contents_from_string('this is different from the md5 we calculated', md5=x)
Example #9
def compute_localfile_md5sum(localfile):
    """
    Compute the hex-digested md5 checksum of the given ``localfile``.

    :param localfile: Path to a file on the local filesystem.
    """
    fp = open(localfile, 'rb')
    md5sum = compute_md5(fp)[0]
    fp.close()
    return md5sum
Example #10
 def _create_key_metadata(self, dump_path, md5=None):
     from boto.utils import compute_md5
     metadata = {}
     metadata['total'] = self._get_total_count(dump_path)
     if md5:
         metadata['md5'] = md5
     else:
         with open(dump_path, 'r') as f:
             metadata['md5'] = compute_md5(f)
     return metadata
Example #11
def compute_localfile_md5sum(localfile):
    """
    Compute the hex-digested md5 checksum of the given ``localfile``.

    :param localfile: Path to a file on the local filesystem.
    """
    fp = open(localfile, "rb")
    md5sum = compute_md5(fp)[0]
    fp.close()
    return md5sum
Example #12
 def _create_key_metadata(self, dump_path, md5=None):
     from boto.utils import compute_md5
     metadata = {}
     metadata['total'] = self._get_total_count(dump_path)
     if md5:
         metadata['md5'] = md5
     else:
         with open(dump_path, 'r') as f:
             metadata['md5'] = compute_md5(f)
     return metadata
Example #13
def upload_file(bucket, key_name, file_path, remote_prefix=None, policy='public-read'):
    if remote_prefix:
        key_name = '{0}/{1}'.format(remote_prefix, key_name)
    key = bucket.new_key(key_name)
    fd = open(file_path)
    md5 = compute_md5(fd)
    fd.close()
    key.set_metadata('fabix-md5', md5[0])
    key.set_contents_from_filename(file_path, md5=md5, policy=policy)
    return key
Example #14
 def _MakeFile(file_size):
   """Creates a temporary file of the given size and returns its path."""
   fd, fpath = tempfile.mkstemp(suffix='.bin', prefix='gsutil_test_file',
                                text=False)
   self.file_sizes[fpath] = file_size
   self.file_contents[fpath] = os.urandom(file_size)
   with os.fdopen(fd, 'wb') as f:
     f.write(self.file_contents[fpath])
   with open(fpath, 'rb') as f:
     self.file_md5s[fpath] = compute_md5(f)
   return fpath
Example #15
 def _MakeFile(file_size):
   """Creates a temporary file of the given size and returns its path."""
   fd, fpath = tempfile.mkstemp(suffix='.bin', prefix='gsutil_test_file',
                                text=False)
   self.file_sizes[fpath] = file_size
   self.file_contents[fpath] = os.urandom(file_size)
   with os.fdopen(fd, 'wb') as f:
     f.write(self.file_contents[fpath])
   with open(fpath, 'rb') as f:
     self.file_md5s[fpath] = compute_md5(f)
   return fpath
Example #16
def _upload_s3(datafile, key_id, access_key, bucket_name, key):
    with open(datafile) as f:
        md5 = compute_md5(f)

    conn = boto.connect_s3(key_id, access_key)
    bucket = conn.get_bucket(bucket_name, validate=False)

    k = Key(bucket)
    k.key = key

    k.set_contents_from_filename(datafile, md5=md5, replace=True)
    return "s3://%s/%s" % (bucket_name, k.key)
Example #17
def _upload_s3(datafile, key_id, access_key, bucket_name, key):
    with open(datafile) as f:
        md5 = compute_md5(f)

    conn = boto.connect_s3(key_id, access_key)
    bucket = conn.get_bucket(bucket_name, validate=False)

    k = Key(bucket)
    k.key = key

    k.set_contents_from_filename(datafile, md5=md5, replace=True)
    return "s3://%s/%s" % (bucket_name, k.key)
Example #18
def filedata(files, sftp, pathmatch='.*', noop=False):
    """Generator that yields file path, data as a tempfile,
    and tuple containing the etag/md5.

    :param files: List of files to work with.
    :type files: list

    :param sftp: paramiko.SFTPClient object.
    :type sftp: paramiko.SFTPClient

    :param pathmatch: Regex to match files, use to exclude unwatched files.
    :type pathmatch: str

    :param noop: Enable No Op to not download the file.
    :type noop: bool

    :returns: A generator containing str, tempfile.NamedTemporaryFile, tuple
    :rtype: generator
    """
    match = re.compile(pathmatch)
    for filepath in files:
        if re.search(match, filepath):
            # boto will set content type by file suffix
            suffix = os.path.splitext(filepath)[1]
            data = tempfile.NamedTemporaryFile(suffix=suffix)
            if not noop:
                try:
                    # sftp.get checks file sizes, so we don't manually stat
                    sftp.get(filepath, data.name)
                    # Seek file cursor to beginning before md5
                    data.seek(0)
                    md5 = compute_md5(data)
                    logging.info('Local MD5: ' + str(md5))
                    yield filepath, data, (md5[0], md5[1])
                except IOError:
                    logging.warning('Error downloading: ' + filepath)
            else:
                logging.info('NOOP: Downloading from SFTP: ' + filepath)
                md5 = compute_md5(data)
                yield filepath, data, (md5[0], md5[1])
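
A hedged sketch of driving the filedata() generator above; the SFTP host, credentials and file path are placeholders, and it assumes paramiko is installed and the remote file exists.

import paramiko

transport = paramiko.Transport(('sftp.example.com', 22))
transport.connect(username='user', password='secret')
sftp = paramiko.SFTPClient.from_transport(transport)

# Each iteration downloads one matching file into a NamedTemporaryFile and
# yields its hex/base64 MD5 pair, ready to pass as md5= on an S3 upload.
for path, tmp, (hex_md5, b64_md5) in filedata(['/srv/data/report.csv'], sftp,
                                              pathmatch=r'\.csv$'):
    print(path, hex_md5)

transport.close()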
Example #19
 def _save_metadata_for_key(self, key, dump_path, md5=None):
     from boto.exception import S3ResponseError
     from boto.utils import compute_md5
     try:
         key.set_metadata('total', self._get_total_count(dump_path))
         if md5:
             key.set_metadata('md5', md5)
         else:
             with open(dump_path, 'r') as f:
                 key.set_metadata('md5', compute_md5(f))
     except S3ResponseError:
         self.logger.warning(
                 'We have no READ_ACP/WRITE_ACP permissions, '
                 'so we could not add metadata info')
Example #20
def putter(put, put_queue, stat_queue, options):
    logger = logging.getLogger('%s[putter-%d]' % (os.path.basename(sys.argv[0]), current_process().pid))
    connection, bucket = None, None
    file_object_cache = FileObjectCache()
    while True:
        args = put_queue.get()
        if args is None:
            put_queue.task_done()
            break
        key_name, value_kwargs = args
        value = Value(file_object_cache, **value_kwargs)
        try:
            if connection is None:
                connection = S3Connection(is_secure=options.secure, host=options.host)
            if bucket is None:
                bucket = connection.get_bucket(options.bucket)
            key = put(bucket, key_name, value)
            if key:
                if options.headers:
                    headers = dict(tuple(header.split(':', 1)) for header in options.headers)
                else:
                    headers = {}
                if options.content_type:
                    if options.content_type == "guess":
                        headers['Content-Type'] = mimetypes.guess_type(value.path)[0]
                    else:
                        headers['Content-Type'] = options.content_type

                content = value.get_content()
                md5 = value.md5
                if options.gzip:
                    headers['Content-Encoding'] = 'gzip'
                    string_io = StringIO()
                    gzip_file = GzipFile(compresslevel=9, fileobj=string_io, mode='w')
                    gzip_file.write(content)
                    gzip_file.close()
                    content = string_io.getvalue()
                    md5 = compute_md5(StringIO(content))
                if not options.dry_run:
                    key.set_contents_from_string(content, headers, md5=md5, policy=options.grant)
                logger.info('%s -> %s' % (value.path, key.name))
                stat_queue.put(dict(size=value.get_size()))
            else:
                logger.info('skipping %s -> %s' % (value.path, key_name))
        except SSLError as exc:
            logger.error('%s -> %s (%s)' % (value.path, key_name, exc))
            put_queue.put(args)
            connection, bucket = None, None
        put_queue.task_done()
Example #21
 def _get_md5(self, key, tmp_filename):
     from boto.utils import compute_md5
     import re
     md5 = None
     md5_from_metadata = key.get_metadata('md5')
     if md5_from_metadata:
         match = re.match("\(\'(.*)\', u\'(.*)\', (.*)\)", str(md5_from_metadata))
         if match:
             groups = match.groups()
             md5 = (groups[0], unicode(groups[1]), int(groups[2]))
     # If it's not in metadata, let's compute it
     if md5 is None:
         with open(tmp_filename) as f:
             md5 = compute_md5(f)
     return md5
Example #22
 def _get_md5(self, key, tmp_filename):
     from boto.utils import compute_md5
     import re
     md5 = None
     md5_from_metadata = key.get_metadata('md5')
     if md5_from_metadata:
         match = re.match("\(\'(.*)\', u\'(.*)\', (.*)\)", str(md5_from_metadata))
         if match:
             groups = match.groups()
             md5 = (groups[0], unicode(groups[1]), int(groups[2]))
     # If it's not in metadata, let's compute it
     if md5 is None:
         with open(tmp_filename) as f:
             md5 = compute_md5(f)
     return md5
Example #23
 def _upload(retries_left=amount_of_retries):
     try:
         logging.info("Start uploading part #{0:d} of {1}".format(part_num, file_path))
         target_bucket = S3Connection(access_key_id, secret_access_key).get_bucket(target_bucket_name)
         for mp in target_bucket.get_all_multipart_uploads():
             if mp.id == multipart_id:
                 with FileChunkIO(file_path, 'r', offset=offset, bytes=bytes) as fp:
                     hex_digest, base64_digest, data_size = utils.compute_md5(fp, size=bytes)
                     mp.upload_part_from_file(fp=fp, part_num=part_num, cb=cb, num_cb=1,  md5=(hex_digest, base64_digest))
                 break
     except Exception as exc:
         if retries_left:
             _upload(retries_left=retries_left - 1)
         else:
             logging.error("Failed uploading part #{0:d} of {1}".format(part_num, file_path))
             raise exc
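
A minimal sketch of the per-chunk hashing the retrying helper above relies on; the file path and offsets are placeholders, and it assumes the filechunkio package is installed.

from boto.utils import compute_md5
from filechunkio import FileChunkIO

part_size = 5 * 1024 * 1024        # S3's minimum multipart part size (except the last part)
with FileChunkIO('/tmp/big.bin', 'r', offset=0, bytes=part_size) as fp:
    hex_digest, b64_digest, size = compute_md5(fp, size=part_size)

# (hex_digest, b64_digest) is the pair passed as md5= to upload_part_from_file()
print(hex_digest, size)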
Example #24
    def upload_file(self, file_path):
        self._currently_processing.add(file_path)
        key = Key(self.get_target_bucket())
        rel_path = str(file_path[self._watched_dir_offset:])
        key.key = rel_path

        if os.path.isfile(file_path) and os.stat(file_path).st_size > self._file_split_threshold_bytes:
            self.multipart_upload_file(file_path, key.key)
        else:
            fp = open(file_path, "r")
            hex_digest, base64_digest, data_size = utils.compute_md5(fp)
            key.set_contents_from_filename(file_path, cb=upload_progress_cb, num_cb=1, md5=(hex_digest, base64_digest))

        # Check in queue since the same file path may have been added again while this one was uploading    
        if os.path.isfile(file_path) and not self.is_queued(file_path):
            os.remove(file_path)
        self._currently_processing.discard(file_path)
Example #25
def _upload_s3(datafiles, job_id, bucket_name='infernyx'):
    rval = []
    for tmpfile, _, tablename, columns in datafiles:
        with open(tmpfile) as f:
            md5 = compute_md5(f)

        conn = boto.connect_s3()
        bucket = conn.get_bucket(bucket_name, validate=False)

        k = Key(bucket)
        k.key = "%s-%s" % (job_id, tmpfile)

        k.set_contents_from_filename(tmpfile, md5=md5, replace=True)

        rval.append(DataFile(tmpfile, (bucket_name, k.key), tablename, columns))
        _log(job_id, "->S3 %s/%s" % (bucket_name, k.key))
    return rval
Example #26
def _upload_s3(datafiles, job_id, bucket_name='infernyx'):
    rval = []
    for tmpfile, _, tablename, columns in datafiles:
        with open(tmpfile) as f:
            md5 = compute_md5(f)

        conn = boto.connect_s3()
        bucket = conn.get_bucket(bucket_name, validate=False)

        k = Key(bucket)
        k.key = "%s-%s" % (job_id, tmpfile)

        k.set_contents_from_filename(tmpfile, md5=md5, replace=True)

        rval.append(DataFile(tmpfile, (bucket_name, k.key), tablename,
                             columns))
        _log(job_id, "->S3 %s/%s" % (bucket_name, k.key))
    return rval
Example #27
 def _MakeFile(file_size):
   """Creates a temporary file of the given size and returns its path."""
   fd, fpath = tempfile.mkstemp(suffix='.bin', prefix='gsutil_test_file',
                                text=False)
   self.file_sizes[fpath] = file_size
   random_bytes = os.urandom(min(file_size, self.MAX_UNIQUE_RANDOM_BYTES))
   total_bytes = 0
   file_contents = ""
   while total_bytes < file_size:
     num_bytes = min(self.MAX_UNIQUE_RANDOM_BYTES, file_size - total_bytes)
     file_contents += random_bytes[:num_bytes]
     total_bytes += num_bytes
   self.file_contents[fpath] = file_contents
   with os.fdopen(fd, 'wb') as f:
     f.write(self.file_contents[fpath])
   with open(fpath, 'rb') as f:
     self.file_md5s[fpath] = compute_md5(f)
   return fpath
Example #28
 def _MakeFile(file_size):
   """Creates a temporary file of the given size and returns its path."""
   fd, fpath = tempfile.mkstemp(suffix='.bin', prefix='gsutil_test_file',
                                text=False)
   self.file_sizes[fpath] = file_size
   random_bytes = os.urandom(min(file_size, self.MAX_UNIQUE_RANDOM_BYTES))
   total_bytes = 0
   file_contents = ""
   while total_bytes < file_size:
     num_bytes = min(self.MAX_UNIQUE_RANDOM_BYTES, file_size - total_bytes)
     file_contents += random_bytes[:num_bytes]
     total_bytes += num_bytes
   self.file_contents[fpath] = file_contents
   with os.fdopen(fd, 'wb') as f:
     f.write(self.file_contents[fpath])
   with open(fpath, 'rb') as f:
     self.file_md5s[fpath] = compute_md5(f)
   return fpath
Example #29
    def test_bad_md5_leaves_old_object_alone(self):
        '''Github #705 Regression test:
           Make sure that overwriting an object using a bad md5
           simply leaves the old version in place.'''
        key_name = str(uuid.uuid4())
        bucket = self.conn.create_bucket(self.bucket_name)
        value = 'good value'

        good_key = Key(bucket, key_name)
        good_key.set_contents_from_string(value)

        bad_key = Key(bucket, key_name)
        s = StringIO('not the real content')
        x = compute_md5(s)
        try:
            bad_key.set_contents_from_string('this is different from the md5 we calculated', md5=x)
        except S3ResponseError:
            pass
        self.assertEqual(good_key.get_contents_as_string(), value)
Example #30
    async def _complete_multipart_upload(self, path, session_upload_id,
                                         parts_metadata):
        """This operation completes a multipart upload by assembling previously uploaded parts.

        Docs: https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html
        """

        payload = ''.join([
            '<?xml version="1.0" encoding="UTF-8"?><CompleteMultipartUpload>',
            ''.join([
                '<Part><PartNumber>{}</PartNumber><ETag>{}</ETag></Part>'.
                format(i + 1, xml.sax.saxutils.escape(part['ETAG']))
                for i, part in enumerate(parts_metadata)
            ]),
            '</CompleteMultipartUpload>',
        ]).encode('utf-8')
        headers = {
            'Content-Length': str(len(payload)),
            'Content-MD5': compute_md5(BytesIO(payload))[1],
            'Content-Type': 'text/xml',
        }
        params = {'uploadId': session_upload_id}
        complete_url = functools.partial(self.bucket.new_key(
            path.path).generate_url,
                                         settings.TEMP_URL_SECS,
                                         'POST',
                                         query_parameters=params,
                                         headers=headers)

        resp = await self.make_request(
            'POST',
            complete_url,
            data=payload,
            headers=headers,
            params=params,
            expects=(
                200,
                201,
            ),
            throws=exceptions.UploadError,
        )
        await resp.release()
Example #31
 def compute_md5(self, fp):
     """
     :type fp: file
     :param fp: File pointer to the file to MD5 hash.  The file pointer
                will be reset to the beginning of the file before the
                method returns.
     :rtype: tuple
     :return: A tuple containing the hex digest version of the MD5 hash
              as the first element and the base64 encoded version of the
              plain digest as the second element.
     """
     tup = compute_md5(fp)
     # Returned values are MD5 hash, base64 encoded MD5 hash, and file size.
     # The internal implementation of compute_md5() needs to return the
     # file size but we don't want to return that value to the external
     # caller because it changes the class interface (i.e. it might
     # break some code) so we consume the third tuple value here and
     # return the remainder of the tuple to the caller, thereby preserving
     # the existing interface.
     self.size = tup[2]
     return tup[0:2]
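
A small sketch contrasting the module-level boto.utils.compute_md5, which returns a 3-tuple, with the 2-tuple the wrapper above hands back; the in-memory payload is just a stand-in.

from io import BytesIO
from boto.utils import compute_md5

tup = compute_md5(BytesIO(b'hello world'))
print(tup)        # (hex digest, base64 digest, size); size is 11 here
print(tup[0:2])   # the slice the wrapper returns after recording the size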
Example #32
 def compute_md5(self, fp):
     """
     :type fp: file
     :param fp: File pointer to the file to MD5 hash.  The file pointer
                will be reset to the beginning of the file before the
                method returns.
     :rtype: tuple
     :return: A tuple containing the hex digest version of the MD5 hash
              as the first element and the base64 encoded version of the
              plain digest as the second element.
     """
     tup = compute_md5(fp)
     # Returned values are MD5 hash, base64 encoded MD5 hash, and file size.
     # The internal implementation of compute_md5() needs to return the
     # file size but we don't want to return that value to the external
     # caller because it changes the class interface (i.e. it might
     # break some code) so we consume the third tuple value here and
     # return the remainder of the tuple to the caller, thereby preserving
     # the existing interface.
     self.size = tup[2]
     return tup[0:2]
Example #33
    async def _complete_multipart_upload(self, path, session_upload_id, parts_metadata):
        """This operation completes a multipart upload by assembling previously uploaded parts.

        Docs: https://docs.aws.amazon.com/AmazonS3/latest/API/mpUploadComplete.html
        """

        payload = ''.join([
            '<?xml version="1.0" encoding="UTF-8"?><CompleteMultipartUpload>',
            ''.join(
                ['<Part><PartNumber>{}</PartNumber><ETag>{}</ETag></Part>'.format(
                    i + 1,
                    xml.sax.saxutils.escape(part['ETAG'])
                ) for i, part in enumerate(parts_metadata)]
            ),
            '</CompleteMultipartUpload>',
        ]).encode('utf-8')
        headers = {
            'Content-Length': str(len(payload)),
            'Content-MD5': compute_md5(BytesIO(payload))[1],
            'Content-Type': 'text/xml',
        }
        params = {'uploadId': session_upload_id}
        complete_url = functools.partial(
            self.bucket.new_key(path.path).generate_url,
            settings.TEMP_URL_SECS,
            'POST',
            query_parameters=params,
            headers=headers
        )

        resp = await self.make_request(
            'POST',
            complete_url,
            data=payload,
            headers=headers,
            params=params,
            expects=(200, 201, ),
            throws=exceptions.UploadError,
        )
        await resp.release()
Example #34
def _upload_s3(datafiles, job_id, bucket_name='infernyx'):
    rval = []

    conn = boto.connect_s3()
    bucket = conn.get_bucket(bucket_name, validate=False)

    for tmp_file_list, _, tablename, columns in datafiles:
        s3_entries = []
        for tmpfile in tmp_file_list:
            with open(tmpfile) as f:
                md5 = compute_md5(f)

            k = Key(bucket)
            k.key = "%s-%s" % (job_id, tmpfile)

            _log(job_id, "->S3 %s/%s" % (bucket_name, k.key))
            k.set_contents_from_filename(tmpfile, md5=md5, replace=True)

            s3_entry = {
                "url": "s3://%s/%s" % (bucket_name, k.key),
                "mandatory": True
            }
            s3_entries.append(s3_entry)

        # upload the manifest
        prefix = tmp_file_list[0].rsplit('.')[0]
        manifest = ujson.dumps({"entries": s3_entries})
        manifest_key = Key(bucket)
        manifest_key.key = "%s.%s.manifest" % (job_id, prefix)
        _log(job_id,
             "->S3 %s/%s: %s" % (bucket_name, manifest_key.key, manifest))
        manifest_key.set_contents_from_string(manifest)

        # store manifest
        rval.append(
            DataFile(tmp_file_list, (bucket_name, manifest_key.key), tablename,
                     columns))

    return rval
Example #35
def sync_dir_up(bucket_name, local_path, remote_prefix=None):
    puts("Sync directory {0} with bucket {1}".format(bucket_name, local_path))
    conn = boto.connect_s3()
    bucket = conn.get_bucket(bucket_name)

    for root, dirs, files in os.walk(local_path):
        for fname in files:
            file_path = os.path.join(root, fname)
            key_name = get_key_name(local_path, file_path)

            key = bucket.lookup(key_name)
            if key:
                key_md5 = key.get_metadata('fabix-md5')
                fd = open(file_path)
                md5 = compute_md5(fd)
                fd.close()
                if md5[0] == key_md5:
                    puts("Skipping {0} (MD5 match)".format(file_path))
                    continue

            puts("Upload file {0}".format(file_path))
            upload_file(bucket, key_name, file_path, remote_prefix=remote_prefix)
Example #36
    def setUp(self):
        self.mock_s3 = moto.mock_s3()
        self.mock_s3.start()
        self.s3_conn = boto.connect_s3()
        self.s3_conn.create_bucket('source_bucket')

        self.source_bucket = self.s3_conn.get_bucket('source_bucket')
        self.data = [
            {'name': 'Roberto', 'birthday': '12/05/1987'},
            {'name': 'Claudia', 'birthday': '21/12/1985'},
        ]
        with closing(self.source_bucket.new_key('some_prefix/test_key')) as key:
            with TmpFile() as tmp_filename:
                with open(tmp_filename, 'w') as f:
                    f.write(json.dumps(self.data))
                with open(tmp_filename) as f:
                    self.key_md5 = compute_md5(f)
            key.metadata = {'total': 2, 'md5': self.key_md5}
            key.set_contents_from_string(json.dumps(self.data))
        self.tmp_bypass_resume_file = 'tests/data/tmp_s3_bypass_resume_persistence.pickle'
        shutil.copyfile('tests/data/s3_bypass_resume_persistence.pickle',
                        self.tmp_bypass_resume_file)
Example #37
 def _md5_checksum_metadata(source_path):
     checksum = {}
     with open(source_path, "rb") as fd:
         hex_digest, b64_digest, data_size = compute_md5(fd)
         checksum['b64_digest'] = b64_digest
     return checksum
Example #38
 def calculate_md5(self):
     if self.md5 is None:
         self.md5 = compute_md5(StringIO(self.get_content()))
     return self.md5
Example #39
    async def _delete_folder(self, path, **kwargs):
        """Query for recursive contents of folder and delete in batches of 1000

        Called from: func: delete if not path.is_file

        Calls: func: self._check_region
               func: self.make_request
               func: self.bucket.generate_url

        :param *ProviderPath path: Path to be deleted

        On S3, folders are not first-class objects, but are instead inferred
        from the names of their children.  A regular DELETE request issued
        against a folder will not work unless that folder is completely empty.
        To fully delete an occupied folder, we must delete all of its
        constituent objects.  Amazon provides a bulk delete operation to simplify this.
        """
        await self._check_region()

        more_to_come = True
        content_keys = []
        query_params = {'prefix': path.path}
        marker = None

        while more_to_come:
            if marker is not None:
                query_params['marker'] = marker

            resp = await self.make_request(
                'GET',
                self.bucket.generate_url(settings.TEMP_URL_SECS,
                                         'GET',
                                         query_parameters=query_params),
                params=query_params,
                expects=(200, ),
                throws=exceptions.MetadataError,
            )

            contents = await resp.read()
            parsed = xmltodict.parse(
                contents, strip_whitespace=False)['ListBucketResult']
            more_to_come = parsed.get('IsTruncated') == 'true'
            contents = parsed.get('Contents', [])

            if isinstance(contents, dict):
                contents = [contents]

            content_keys.extend([content['Key'] for content in contents])
            if len(content_keys) > 0:
                marker = content_keys[-1]

        # A query against a non-existent folder does not return 404
        if len(content_keys) == 0:
            raise exceptions.NotFoundError(str(path))

        while len(content_keys) > 0:
            key_batch = content_keys[:1000]
            del content_keys[:1000]

            payload = '<?xml version="1.0" encoding="UTF-8"?>'
            payload += '<Delete>'
            payload += ''.join(
                map(
                    lambda x: '<Object><Key>{}</Key></Object>'.format(
                        xml.sax.saxutils.escape(x)), key_batch))
            payload += '</Delete>'
            payload = payload.encode('utf-8')
            md5 = compute_md5(BytesIO(payload))

            query_params = {'delete': ''}
            headers = {
                'Content-Length': str(len(payload)),
                'Content-MD5': md5[1],
                'Content-Type': 'text/xml',
            }

            # We depend on a customized version of boto that can make query parameters part of
            # the signature.
            url = functools.partial(
                self.bucket.generate_url,
                settings.TEMP_URL_SECS,
                'POST',
                query_parameters=query_params,
                headers=headers,
            )
            resp = await self.make_request(
                'POST',
                url,
                params=query_params,
                data=payload,
                headers=headers,
                expects=(
                    200,
                    204,
                ),
                throws=exceptions.DeleteError,
            )
            await resp.release()
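
A hedged sketch isolating the Content-MD5 computation the bulk delete above performs; the payload is a stand-in rather than a real key list.

from io import BytesIO
from boto.utils import compute_md5

payload = ('<?xml version="1.0" encoding="UTF-8"?>'
           '<Delete><Object><Key>obsolete/key-1</Key></Object></Delete>').encode('utf-8')
hex_md5, b64_md5, size = compute_md5(BytesIO(payload))

headers = {
    'Content-Length': str(size),   # same as len(payload)
    'Content-MD5': b64_md5,        # S3 expects the base64 digest here, not the hex one
    'Content-Type': 'text/xml',
}
print(headers)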
Example #40
 def calculate_md5(self):
     if self.md5 is None:
         self.md5 = compute_md5(StringIO(self.get_content()))
     return self.md5
Example #41
def putter(put, put_queue, stat_queue, options):
    pid = current_process().pid
    log = logging.getLogger(os.path.basename(sys.argv[0]))
    connection, bucket = None, None
    file_object_cache = FileObjectCache()
    # Figure out what content types we want to gzip
    if not options.gzip_type:  # default
        gzip_content_types = GZIP_CONTENT_TYPES
    elif 'all' in options.gzip_type:
        gzip_content_types = GZIP_ALL
    else:
        gzip_content_types = options.gzip_type
    if 'guess' in gzip_content_types:
        # don't bother removing 'guess' from the list since nothing will match it
        gzip_content_types.extend(GZIP_CONTENT_TYPES)
    if options.gzip:
        pass
        #log.debug('These content types will be gzipped: %s' % unicode(gzip_content_types))
    while True:
        args = put_queue.get()
        #print args, pid
        if args is None:
            put_queue.task_done()
            break
        key_name, value_kwargs = args
        #print(666,value_kwargs)
        if options.gzip:
            key_name = '%s.gz' % key_name
        value = Value(file_object_cache, **value_kwargs)
        should_gzip = False
        try:
            if connection is None:
                connection = S3Connection(is_secure=options.secure,
                                          host=options.host)
            if bucket is None:
                bucket = connection.get_bucket(options.bucket, validate=False)
            key = put(bucket, key_name, value)
            if key:
                if value.should_copy_content():
                    if options.headers:
                        headers = dict(
                            tuple(header.split(':', 1))
                            for header in options.headers)
                    else:
                        headers = {}

                    content_type = None
                    if options.content_type:
                        if options.content_type == 'guess':
                            content_type = mimetypes.guess_type(value.path)[0]
                        elif options.content_type == 'magic':
                            content_type = mimetypes.guess_type(value.path)[0]
                            if content_type is None:
                                content_type = magic.from_file(value.path,
                                                               mime=True)
                        else:
                            content_type = options.content_type
                        headers['Content-Type'] = content_type

                    content = value.get_content()
                    md5 = value.md5
                    should_gzip = options.gzip and (
                        content_type and content_type in gzip_content_types
                        or gzip_content_types == GZIP_ALL)
                    if should_gzip:
                        headers['Content-Encoding'] = 'gzip'
                        string_io = StringIO()
                        gzip_file = GzipFile(compresslevel=1,
                                             fileobj=string_io,
                                             mode='w')
                        gzip_file.write(content)
                        gzip_file.close()
                        content = string_io.getvalue()
                        md5 = compute_md5(StringIO(content))
                    if not options.dry_run:
                        key.set_contents_from_string(
                            content,
                            headers,
                            md5=md5,
                            policy=options.grant,
                            encrypt_key=options.encrypt_key)
                #log.info('%s %s> %s' % (value.path, 'z' if should_gzip else '-', key.name))
                stat_queue.put(dict(size=value.get_size()))
            else:
                log.info('skipping %s -> %s' % (value.path, key_name))
        except SSLError as exc:
            log.error('%s -> %s (%s)' % (value.path, key_name, exc))
            put_queue.put(args)
            connection, bucket = None, None
        except IOError as exc:
            log.error('%s -> %s (%s)' % (value.path, key_name, exc))
        put_queue.task_done()
Example #42
    async def _delete_folder(self, path, **kwargs):
        """Query for recursive contents of folder and delete in batches of 1000

        Called from: func: delete if not path.is_file

        Calls: func: self._check_region
               func: self.make_request
               func: self.bucket.generate_url

        :param *ProviderPath path: Path to be deleted

        On S3, folders are not first-class objects, but are instead inferred
        from the names of their children.  A regular DELETE request issued
        against a folder will not work unless that folder is completely empty.
        To fully delete an occupied folder, we must delete all of its
        constituent objects.  Amazon provides a bulk delete operation to simplify this.
        """
        await self._check_region()

        more_to_come = True
        content_keys = []
        query_params = {'prefix': path.path}
        marker = None

        while more_to_come:
            if marker is not None:
                query_params['marker'] = marker

            resp = await self.make_request(
                'GET',
                self.bucket.generate_url(settings.TEMP_URL_SECS, 'GET', query_parameters=query_params),
                params=query_params,
                expects=(200, ),
                throws=exceptions.MetadataError,
            )

            contents = await resp.read()
            parsed = xmltodict.parse(contents, strip_whitespace=False)['ListBucketResult']
            more_to_come = parsed.get('IsTruncated') == 'true'
            contents = parsed.get('Contents', [])

            if isinstance(contents, dict):
                contents = [contents]

            content_keys.extend([content['Key'] for content in contents])
            if len(content_keys) > 0:
                marker = content_keys[-1]

        # A query against a non-existent folder does not return 404
        if len(content_keys) == 0:
            raise exceptions.NotFoundError(str(path))

        while len(content_keys) > 0:
            key_batch = content_keys[:1000]
            del content_keys[:1000]

            payload = '<?xml version="1.0" encoding="UTF-8"?>'
            payload += '<Delete>'
            payload += ''.join(map(
                lambda x: '<Object><Key>{}</Key></Object>'.format(xml.sax.saxutils.escape(x)),
                key_batch
            ))
            payload += '</Delete>'
            payload = payload.encode('utf-8')
            md5 = compute_md5(BytesIO(payload))

            query_params = {'delete': ''}
            headers = {
                'Content-Length': str(len(payload)),
                'Content-MD5': md5[1],
                'Content-Type': 'text/xml',
            }

            # We depend on a customized version of boto that can make query parameters part of
            # the signature.
            url = functools.partial(
                self.bucket.generate_url,
                settings.TEMP_URL_SECS,
                'POST',
                query_parameters=query_params,
                headers=headers,
            )
            resp = await self.make_request(
                'POST',
                url,
                params=query_params,
                data=payload,
                headers=headers,
                expects=(200, 204, ),
                throws=exceptions.DeleteError,
            )
            await resp.release()