def delete(self, keys: List[str]) -> None:
    logger.debug("Remove %s object(s) from S3", len(keys))

    # delete_objects accepts a bounded number of keys per request,
    # so remove the objects in chunks of MAX_DELETE_OBJECTS.
    for i in range(0, len(keys), MAX_DELETE_OBJECTS):
        sliced_keys = keys[i:i + MAX_DELETE_OBJECTS]

        self.__bucket.delete_objects(
            Delete={'Objects': [{'Key': key} for key in sliced_keys]})
def get_columns(self, query: str, add_quotes: bool = True) -> List[str]:
    quote = '"' if add_quotes else ''
    sql = self.__generate_get_columns_sql(query)

    logger.debug("query: %s", sql)

    # Execute the metadata query and read the column names from the cursor
    # description, optionally wrapping each name in double quotes.
    self.__cursor.execute(sql)

    return [
        f'{quote}{column.name}{quote}'
        for column in self.__cursor.description
    ]
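
# NOTE: __generate_get_columns_sql is not shown in this section. The helper
# below is only a sketch of the assumed approach, not the library's actual
# code: wrap the user query so it returns no rows while still populating the
# cursor description with column metadata.
def __generate_get_columns_sql_sketch(query: str) -> str:
    # Hypothetical: LIMIT 0 keeps the probe query cheap.
    return f"SELECT * FROM ({query}) LIMIT 0"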
def unload(self,
           query: str,
           s3_uri: str,
           manifest: bool = False,
           delimiter: Optional[str] = None,
           fixed_width: Optional[str] = None,
           encrypted: bool = False,
           gzip: bool = False,
           add_quotes: bool = False,
           null_string: Optional[str] = None,
           escape: bool = False,
           allow_overwrite: bool = False,
           parallel: bool = True,
           max_file_size: Optional[str] = None) -> bool:
    # Build the UNLOAD option clause. Options whose value is None are
    # emitted as bare keywords (e.g. MANIFEST, GZIP).
    options: Dict[str, Optional[str]] = {}

    if manifest:
        options['MANIFEST'] = None
    if delimiter is not None:
        options['DELIMITER'] = f"'{delimiter}'"
    if fixed_width is not None:
        options['FIXEDWIDTH'] = f"'{fixed_width}'"
    if encrypted:
        options['ENCRYPTED'] = None
    if gzip:
        options['GZIP'] = None
    if add_quotes:
        options['ADDQUOTES'] = None
    if null_string is not None:
        options['NULL'] = f"'{null_string}'"
    if escape:
        options['ESCAPE'] = None
    if allow_overwrite:
        options['ALLOWOVERWRITE'] = None
    options['PARALLEL'] = 'ON' if parallel else 'OFF'
    if max_file_size is not None:
        options['MAXFILESIZE'] = max_file_size

    sql = self.__generate_unload_sql(self.__escaped_query(query), s3_uri,
                                     self.__credential, options)

    logger.debug("query: %s", sql)

    self.__cursor.execute(sql)
    return True
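
# NOTE: __generate_unload_sql is not shown here. The sketch below illustrates
# one plausible way the options dict above could be rendered into an UNLOAD
# statement (an assumption, not the library's actual helper): keys mapped to
# None become bare keywords (MANIFEST, GZIP, ...), the rest become "KEY value".
def __generate_unload_sql_sketch(query: str, s3_uri: str, credential: str,
                                 options: Dict[str, Optional[str]]) -> str:
    rendered = ' '.join(key if value is None else f"{key} {value}"
                        for key, value in options.items())
    return (f"UNLOAD ('{query}') TO '{s3_uri}' "
            f"CREDENTIALS '{credential}' {rendered}")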
def unload(self,
           query: str,
           filename: str,
           delimiter: str = ',',
           add_quotes: bool = True,
           escape: bool = True,
           null_string: str = '',
           with_header: bool = True,
           write_local: bool = True,
           remove_from_s3: bool = True) -> None:
    session_id = self.__generate_session_id()
    logger.debug("Session id: %s", session_id)

    s3_path = self.__generate_path("/tmp/redshift-unloader", session_id, '/')
    local_path = self.__generate_path(tempfile.gettempdir(), session_id)

    logger.debug("Get columns")
    columns = self.__redshift.get_columns(query, add_quotes) if with_header else None

    logger.debug("Unload")
    self.__redshift.unload(query,
                           self.__s3.uri(s3_path),
                           gzip=True,
                           parallel=True,
                           delimiter=delimiter,
                           null_string=null_string,
                           add_quotes=add_quotes,
                           escape=escape,
                           allow_overwrite=True)

    # Both the local merge and the S3 cleanup need the list of unloaded objects.
    logger.debug("Fetch the list of objects")
    s3_keys = self.__s3.list(s3_path.lstrip('/'))

    if write_local:
        local_files = [os.path.join(local_path, os.path.basename(key)) for key in s3_keys]

        logger.debug("Create temporary directory: %s", local_path)
        os.mkdir(local_path, 0o700)

        logger.debug("Download all objects")
        for s3_key, local_file in zip(s3_keys, local_files):
            self.__s3.download(key=s3_key, filename=local_file)

        logger.debug("Merge all objects")
        with open(filename, 'wb') as out:
            # The unloaded parts are gzip-compressed, so the header line is
            # compressed as well before being written to the merged file.
            if columns is not None:
                out.write(gzip.compress((delimiter.join(columns) + os.linesep).encode()))

            for local_file in local_files:
                logger.debug("Merge %s into result file", local_file)
                with open(local_file, 'rb') as read:
                    shutil.copyfileobj(read, out, 2 * MB)

    if remove_from_s3:
        logger.debug("Remove all objects in S3")
        self.__s3.delete(s3_keys)

    if write_local:
        logger.debug("Remove temporary directory in local")
        shutil.rmtree(local_path)
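
# Usage sketch (assumption: the class name and constructor arguments are not
# shown in this section, so the values below are illustrative only):
#
#   unloader = RedshiftUnloader(host='...', port=5439, user='...',
#                               password='...', database='...',
#                               s3_bucket='...', access_key_id='...',
#                               secret_access_key='...', region='...')
#   unloader.unload("SELECT * FROM some_table", "/tmp/some_table.csv.gz",
#                   delimiter=',', with_header=True)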
def download(self, key: str, filename: str) -> None:
    logger.debug("Download %s to %s", key, filename)

    self.__bucket.download_file(Key=key, Filename=filename)
def unload_to_list(self,
                   query: str,
                   delimiter: str = ',',
                   add_quotes: bool = True,
                   escape: bool = True,
                   null_string: str = '',
                   with_header: bool = True) -> str:
    start_time = time.time()
    lines = io.StringIO()
    logger.debug("Memory usage: %s", psutil.virtual_memory())

    logger.debug("Get columns")
    columns = self.__redshift.get_columns(query, add_quotes) if with_header else None

    if columns is not None:
        logger.debug("Columns: %s", columns)
        lines.write(delimiter.join(columns) + os.linesep)

    session_id = self.__generate_session_id()
    logger.debug("Session id: %s", session_id)

    s3_path = self.__generate_path("/tmp/redshift-unloader", session_id, '/')

    logger.debug("Unload")
    self.__redshift.unload(query,
                           self.__s3.uri(s3_path),
                           gzip=True,
                           parallel=True,
                           delimiter=delimiter,
                           null_string=null_string,
                           add_quotes=add_quotes,
                           escape=escape,
                           allow_overwrite=True)

    logger.debug("Fetch the list of objects")
    s3_keys = self.__s3.list(s3_path.lstrip('/'))

    # Queue one (index, key) task per unloaded object; each worker stores its
    # part at the matching position in the shared results list.
    q = Queue(maxsize=0)
    num_threads = min(5, len(s3_keys))
    results = ['' for _ in s3_keys]

    for i, s3_key in enumerate(s3_keys):
        q.put((i, s3_key))

    for i in range(num_threads):
        logger.debug("Starting thread %s", i)
        worker = Thread(target=self.s3get, args=(q, results))
        # Daemon threads let the main program exit even if a worker hangs.
        worker.daemon = True
        worker.start()

    # Wait until every queued object has been fetched.
    q.join()
    logger.debug("Memory usage: %s", psutil.virtual_memory())

    # Concatenate the parts in key order, releasing each one as it is written.
    for i in range(len(s3_keys)):
        lines.write(results[i])
        results[i] = ''

    logger.debug("All tasks completed")

    logger.debug("Remove all objects in S3")
    self.__s3.delete(s3_keys)
    logger.debug("Memory usage: %s", psutil.virtual_memory())

    logger.debug("Elapsed: %.2f seconds", time.time() - start_time)

    lines.seek(0)
    return lines.read()
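
# NOTE: the s3get worker referenced by unload_to_list is not shown in this
# section. The method below is a minimal sketch of what it is assumed to do:
# drain (index, key) tasks from the queue, download and decompress each gzip
# part, store the decoded text at its index in the shared results list, and
# mark the task done so q.join() can return.
def s3get(self, q: Queue, results: List[str]) -> None:
    while not q.empty():
        index, key = q.get()
        try:
            # Download the part to a temporary file, then gunzip it into the
            # shared results slot for this key.
            with tempfile.NamedTemporaryFile() as tmp:
                self.__s3.download(key=key, filename=tmp.name)
                with gzip.open(tmp.name, 'rt') as part:
                    results[index] = part.read()
        finally:
            q.task_done()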