Code example #1
File: s3.py  Project: dancaspi/redshift-unloader2
    def delete(self, keys: List[str]) -> None:
        logger.debug("Remove %s object(s) from S3", len(keys))

        # DeleteObjects accepts a limited number of keys per request, so the
        # key list is removed in slices of MAX_DELETE_OBJECTS.
        for i in range(0, len(keys), MAX_DELETE_OBJECTS):
            sliced_keys = keys[i:i + MAX_DELETE_OBJECTS]
            self.__bucket.delete_objects(
                Delete={'Objects': [{'Key': key} for key in sliced_keys]})
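The slicing above exists because the S3 DeleteObjects API accepts at most 1,000 keys per request. A minimal sketch, assuming a plausible module setup for the MAX_DELETE_OBJECTS constant and the boto3 bucket handle used by this class (the surrounding module is not part of the listing):

import logging
from typing import List

import boto3

logger = logging.getLogger(__name__)

# DeleteObjects rejects requests with more than 1,000 keys, hence the slicing.
MAX_DELETE_OBJECTS = 1000


class S3:
    def __init__(self, bucket: str) -> None:
        self.__bucket = boto3.resource('s3').Bucket(bucket)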
Code example #2
    def get_columns(self, query: str, add_quotes: bool = True) -> List[str]:
        quote = '"' if add_quotes else ''
        sql = self.__generate_get_columns_sql(query)
        logger.debug("query: %s", sql)

        # cursor.description is populated after execute() and holds one entry
        # per result column, from which the (optionally quoted) names are taken.
        self.__cursor.execute(sql)
        return [
            f'{quote}{column.name}{quote}'
            for column in self.__cursor.description
        ]
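The column names come from cursor.description, which the DB-API cursor fills in after execute(). The __generate_get_columns_sql helper is not included in the listing; a common way to obtain only the metadata is to wrap the query with LIMIT 0, so a hedged sketch of that helper, under that assumption, could look like:

    def __generate_get_columns_sql(self, query: str) -> str:
        # Assumption: wrapping the query and returning zero rows is enough to
        # populate cursor.description without scanning the underlying tables.
        return f"SELECT * FROM ({query}) AS t LIMIT 0"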
Code example #3
    def unload(self,
               query: str,
               s3_uri: str,
               manifest: bool = False,
               delimiter: Optional[str] = None,
               fixed_width: Optional[str] = None,
               encrypted: bool = False,
               gzip: bool = False,
               add_quotes: bool = False,
               null_string: Optional[str] = None,
               escape: bool = False,
               allow_overwrite: bool = False,
               parallel: bool = True,
               max_file_size: Optional[str] = None) -> bool:
        options: Dict[str, Optional[str]] = {}

        if manifest:
            options['MANIFEST'] = None
        if delimiter is not None:
            options['DELIMITER'] = f"'{delimiter}'"
        if fixed_width is not None:
            options['FIXEDWIDTH'] = f"'{fixed_width}'"
        if encrypted:
            options['ENCRYPTED'] = None
        if gzip:
            options['GZIP'] = None
        if add_quotes:
            options['ADDQUOTES'] = None
        if null_string is not None:
            options['NULL'] = f"'{null_string}'"
        if escape:
            options['ESCAPE'] = None
        if allow_overwrite:
            options['ALLOWOVERWRITE'] = None
        options['PARALLEL'] = 'ON' if parallel else 'OFF'
        if max_file_size is not None:
            options['MAXFILESIZE'] = max_file_size

        sql = self.__generate_unload_sql(self.__escaped_query(query), s3_uri,
                                         self.__credential, options)
        logger.debug("query: %s", sql)

        self.__cursor.execute(sql)
        return True
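In the options dict, a value of None marks a bare keyword (MANIFEST, GZIP, ...) while a string value follows its keyword (DELIMITER ',', PARALLEL OFF, ...). The __generate_unload_sql helper is not shown in the listing; a sketch consistent with that convention, assuming the standard Redshift UNLOAD ... TO ... CREDENTIALS syntax, might be:

    def __generate_unload_sql(self, query: str, s3_uri: str, credential: str,
                              options: Dict[str, Optional[str]]) -> str:
        # Render each option as "KEYWORD" or "KEYWORD value" depending on
        # whether a value was supplied.
        rendered = ' '.join(
            key if value is None else f'{key} {value}'
            for key, value in options.items()
        )
        return (f"UNLOAD ('{query}') TO '{s3_uri}' "
                f"CREDENTIALS '{credential}' {rendered}")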
Code example #4
    def unload(self, query: str, filename: str,
               delimiter: str = ',', add_quotes: bool = True, escape: bool = True,
               null_string: str = '', with_header: bool = True, write_local: bool = True, remove_from_s3: bool = True) -> None:
        session_id = self.__generate_session_id()
        logger.debug("Session id: %s", session_id)

        s3_path = self.__generate_path("/tmp/redshift-unloader", session_id, '/')
        local_path = self.__generate_path(tempfile.gettempdir(), session_id)

        logger.debug("Get columns")
        columns = self.__redshift.get_columns(query, add_quotes) if with_header else None

        logger.debug("Unload")
        self.__redshift.unload(
            query,
            self.__s3.uri(s3_path),
            gzip=True,
            parallel=True,
            delimiter=delimiter,
            null_string=null_string,
            add_quotes=add_quotes,
            escape=escape,
            allow_overwrite=True)
        
        logger.debug("Fetch the list of objects")
        s3_keys = self.__s3.list(s3_path.lstrip('/'))

        if write_local:
            local_files = [os.path.join(local_path, os.path.basename(key)) for key in s3_keys]

            logger.debug("Create temporary directory: %s", local_path)
            os.mkdir(local_path, 0o700)

            logger.debug("Download all objects")
            for s3_key, local_file in zip(s3_keys, local_files):
                self.__s3.download(key=s3_key, filename=local_file)

            logger.debug("Merge all objects")
            with open(filename, 'wb') as out:
                # The header row is gzip-compressed so that, concatenated with
                # the gzipped parts below, the result is one valid gzip stream.
                if columns is not None:
                    out.write(gzip.compress((delimiter.join(columns) + os.linesep).encode()))

                for local_file in local_files:
                    logger.debug("Merge %s into result file", local_file)

                    with open(local_file, 'rb') as read:
                        shutil.copyfileobj(read, out, 2 * MB)

        if remove_from_s3:
            logger.debug("Remove all objects in S3")
            self.__s3.delete(s3_keys)

        if write_local:
            logger.debug("Remove temporary directory in local")
            shutil.rmtree(local_path)
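A hypothetical end-to-end call of the method above; the Unloader constructor arguments are assumptions for illustration and are not taken from the listing. Because the parts are unloaded with gzip=True and merged byte-for-byte after a gzip-compressed header, the local result is itself one gzip stream:

# Hypothetical usage; constructor parameters are assumed, not shown in the listing.
from redshift_unloader import Unloader

unloader = Unloader(
    host='example.redshift.amazonaws.com', port=5439,
    user='analytics', password='...', database='dev',
    access_key_id='...', secret_access_key='...',
    region='us-east-1', bucket='my-staging-bucket')

# Export a query result to a single gzip-compressed CSV with a header row.
unloader.unload("SELECT * FROM events", "/tmp/events.csv.gz", delimiter=',')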
Code example #5
File: s3.py  Project: dancaspi/redshift-unloader2
    def download(self, key: str, filename: str) -> None:
        logger.debug("Download %s to %s", key, filename)
        self.__bucket.download_file(Key=key, Filename=filename)
Code example #6
    def unload_to_list(self,
                       query: str,
                       delimiter: str = ',',
                       add_quotes: bool = True,
                       escape: bool = True,
                       null_string: str = '',
                       with_header: bool = True) -> str:

        startTime = time.time()
        lines = io.StringIO()
        master_list = []

        print(psutil.virtual_memory())
        logger.debug("Get columns")
        columns = self.__redshift.get_columns(
            query, add_quotes) if with_header else None
        if columns is not None:
            master_list.append((delimiter.join(columns) + os.linesep).encode())
            lines.write(delimiter.join(columns) + os.linesep)

        session_id = self.__generate_session_id()
        logger.debug("Session id: %s", session_id)

        s3_path = self.__generate_path("/tmp/redshift-unloader", session_id,
                                       '/')

        logger.debug("Unload")
        self.__redshift.unload(query,
                               self.__s3.uri(s3_path),
                               gzip=True,
                               parallel=True,
                               delimiter=delimiter,
                               null_string=null_string,
                               add_quotes=add_quotes,
                               escape=escape,
                               allow_overwrite=True)

        logger.debug("Fetch the list of objects")
        s3_keys = self.__s3.list(s3_path.lstrip('/'))

        # Set up the queue that holds all the S3 keys to fetch
        q = Queue(maxsize=0)
        # Use up to 5 download threads, or one per key if there are fewer
        num_threads = min(5, len(s3_keys))

        # One result slot per key; each worker writes into its own index
        results = [{} for _ in s3_keys]
        # Load the queue with (index, key) tuples so each download keeps its position
        for i in range(len(s3_keys)):
            q.put((i, s3_keys[i]))

        for i in range(num_threads):
            logger.debug("Starting thread %s", i)
            worker = Thread(target=self.s3get, args=(q, results))
            # Daemon threads let the main program exit even if a worker hangs
            worker.daemon = True
            worker.start()

        #print("Waiting on queue to empty")

        #now we wait until the queue has been processed
        q.join()
        print(psutil.virtual_memory())
        for i in range(len(s3_keys)):
            #master_list.append(results[i])
            lines.write(results[i])
            results[i] = ''

        logging.info('All tasks completed.')

        #print(master_list)
        logger.debug("Remove all objects in S3")
        self.__s3.delete(s3_keys)
        print(psutil.virtual_memory())
        endTime = time.time()
        print("start ", startTime)
        print("end   ", endTime)
        #return master_list
        #csv.reader(lines..getvalue()A
        lines.seek(0)
        #return pd.read_csv(lines.read(), sep=",")
        return lines.read()
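The worker passed as target=self.s3get is not part of the listing. Given that results[i] must end up holding text that can be written to the StringIO buffer, a hedged sketch of that worker, assuming a hypothetical in-memory fetch helper on the S3 wrapper, could be:

    def s3get(self, q: Queue, results: list) -> None:
        while True:
            index, key = q.get()
            try:
                # Hypothetical helper: the listing only shows download-to-file,
                # so an in-memory read is assumed here.
                raw = self.__s3.read(key)
                # Each part is a gzip-compressed text chunk; keep it at its
                # original index so the merged output preserves row order.
                results[index] = gzip.decompress(raw).decode()
            except Exception:
                logger.exception("Failed to fetch %s", key)
                results[index] = ''
            finally:
                # task_done() lets q.join() in unload_to_list return once every
                # queued key has been handled.
                q.task_done()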