def __store_files(self) -> None:
    """Save each source file to its destinations and record it in the db.

    For every file produced by the task this:
      1. builds the final output file (name/params/zip/gpg) via ``File.save``,
      2. pushes it to the SFTP/FTP/SMB destinations enabled on the task,
         skipping empty files when the destination is configured to do so,
      3. always stores a historical copy on the default SMB backup share,
      4. records name, path, hash and size in a ``TaskFile`` row.

    Raises:
        RunnerException: if the output file cannot be created.
    """
    if not self.source_files:
        # nothing was produced by the source step; nothing to store.
        return

    RunnerLog(
        self.task,
        self.run_id,
        8,
        "Storing output file%s..." % ("s" if len(self.source_files) != 1 else ""),
    )

    for file_counter, this_file in enumerate(self.source_files, 1):
        # NOTE(review): query_output_size is a single task-level value, so on
        # multi-file runs every file reuses it for the empty-file checks
        # below — confirm that is intended.
        this_file_size = (
            self.query_output_size
            if self.query_output_size is not None
            else Path(this_file.name).stat().st_size
        )

        # get file name. if no name specified in task setting, then use temp name.
        try:
            file_name, file_path, file_hash = File(
                task=self.task,
                run_id=self.run_id,
                data_file=this_file,
                params=self.param_loader,
            ).save()
        except BaseException as e:
            # chain the cause so the original traceback is preserved.
            raise RunnerException(
                self.task, self.run_id, 11, f"Failed to create data file.\n{e}"
            ) from e

        self.output_files.append(file_path)

        if len(self.source_files) > 1:
            RunnerLog(
                self.task,
                self.run_id,
                8,
                f"Storing file {file_counter} of {len(self.source_files)}...",
            )

        # store
        # send to sftp
        if self.task.destination_sftp == 1 and self.task.destination_sftp_conn:
            if (self.task.destination_sftp_dont_send_empty_file == 1
                    and this_file_size == 0):
                RunnerLog(
                    self.task,
                    self.run_id,
                    8,
                    "Skipping SFTP, file is empty.",
                )
            else:
                Sftp(
                    task=self.task,
                    run_id=self.run_id,
                    connection=self.task.destination_sftp_conn,
                    directory=self.temp_path,
                ).save(
                    overwrite=self.task.destination_sftp_overwrite,
                    file_name=file_name,
                )

        # send to ftp
        if self.task.destination_ftp == 1 and self.task.destination_ftp_conn:
            if (self.task.destination_ftp_dont_send_empty_file == 1
                    and this_file_size == 0):
                RunnerLog(
                    self.task,
                    self.run_id,
                    8,
                    "Skipping FTP, file is empty.",
                )
            else:
                Ftp(
                    task=self.task,
                    run_id=self.run_id,
                    connection=self.task.destination_ftp_conn,
                    directory=self.temp_path,
                ).save(
                    overwrite=self.task.destination_ftp_overwrite,
                    file_name=file_name,
                )

        # save to smb
        if self.task.destination_smb == 1 and self.task.destination_smb_conn:
            if (self.task.destination_smb_dont_send_empty_file == 1
                    and this_file_size == 0):
                RunnerLog(
                    self.task,
                    self.run_id,
                    8,
                    "Skipping SMB, file is empty.",
                )
            else:
                Smb(
                    task=self.task,
                    run_id=self.run_id,
                    connection=self.task.destination_smb_conn,
                    directory=self.temp_path,
                ).save(
                    overwrite=self.task.destination_smb_overwrite,
                    file_name=file_name,
                )

        # save historical copy on the default/backup share (connection=None).
        smb_path = Smb(
            task=self.task,
            run_id=self.run_id,
            connection=None,  # "default",
            directory=self.temp_path,
        ).save(overwrite=1, file_name=file_name)

        # log file details
        db.session.add(
            TaskFile(
                name=file_name,
                path=smb_path,
                task_id=self.task.id,
                job_id=self.run_id,
                file_hash=file_hash,
                size=file_size(str(os.path.getsize(file_path))),
            ))
        db.session.commit()
def __send_email(self) -> None:
    """Send the completion email for this run, when the task enables it.

    The email embeds all run logs, optionally embeds/attaches the output
    files, and is suppressed entirely when error-only email is enabled and
    errors occurred, or when an attachment is empty and the task opts out
    of sending blank files.

    Raises:
        RunnerException: if the email template cannot be loaded.
    """
    # all logs for this run, newest first; rendered into the email body.
    logs = (
        TaskLog.query.filter_by(
            task_id=self.task.id, job_id=self.run_id).order_by(
                TaskLog.status_date.desc())  # type: ignore[union-attr]
        .all())
    # error logs for this run decide whether a completion email goes out.
    error_logs = (TaskLog.query.filter_by(task_id=self.task.id,
                                          job_id=self.run_id,
                                          error=1).order_by(
                                              TaskLog.status_date).all())
    date = str(datetime.datetime.now())

    # pylint: disable=broad-except
    try:
        template = env.get_template("email/email.html.j2")
    except BaseException as e:
        raise RunnerException(self.task, self.run_id, 8,
                              f"Failed to get email template.\n{e}")

    # success email: only send when completion email is on, and either the
    # run had no errors or error-only email is not enabled.
    if self.task.email_completion == 1 and (
        (len(error_logs) < 1 and self.task.email_error == 1)
            or self.task.email_error != 1):
        RunnerLog(self.task, self.run_id, 8, "Sending completion email.")

        output: List[List[str]] = []  # rows embedded into the email body
        empty = 0  # set to 1 if any output file is zero bytes
        attachments: List[str] = []

        if self.task.email_completion_file == 1 and len(
                self.output_files) > 0:
            for output_file in self.output_files:
                if self.task.email_completion_file_embed == 1:
                    with open(output_file, newline="") as csvfile:
                        output.extend(list(csv.reader(csvfile)))

                # check attachment file size if the task
                # should not send blank files
                if (self.task.email_completion_dont_send_empty_file == 1
                        and output_file
                        # if query and data is blank, or other types and file is 0
                        and os.path.getsize(output_file) == 0):
                    empty = 1

                attachments.append(output_file)

            if empty == 1:
                # abort the whole email rather than send a blank attachment.
                RunnerLog(
                    self.task,
                    self.run_id,
                    8,
                    "Not sending completion email, file is empty.",
                )
                return

        Smtp(
            task=self.task,
            run_id=self.run_id,
            recipients=self.task.email_completion_recipients,
            subject="Project: %s / Task: %s / Run: %s %s" % (
                self.task.project.name,
                self.task.name,
                self.run_id,
                date,
            ),
            message=template.render(
                task=self.task,
                success=1,
                date=date,
                logs=logs,
                output=output,
                host=app.config["WEB_HOST"],
            ),
            short_message=self.task.email_completion_message
            or f"Atlas Hub job {self.task} completed successfully.",
            attachments=attachments,
        )
def __get_source(self) -> None:
    """Load the task's source data into ``self.source_files``.

    ``task.source_type_id`` selects the source:
      1 = database query (postgres or sql server),
      2 = smb file, 3 = sftp file, 4 = ftp file,
      6 = ssh command (runs remotely; produces no local file here).

    Raises:
        RunnerException: if the query cannot be loaded or fails to run.
    """
    if self.task.source_type_id == 1:  # sql
        external_db = self.task.source_database_conn

        try:
            RunnerLog(self.task, self.run_id, 8, "Loading query...")
            query = self.__get_query()
        except BaseException as e:
            raise RunnerException(self.task, self.run_id, 8,
                                  f"Failed to load query.\n{e}") from e

        RunnerLog(self.task, self.run_id, 8,
                  "Starting query run, waiting for results...")

        # database_type.id: 1 = postgres, 2 = sql server. The two runners
        # are constructed identically; only the class and the error code
        # used for failures differ, so dispatch via a table instead of
        # duplicating the whole try/except block.
        runners = {1: (Postgres, 21), 2: (SqlServer, 20)}

        if external_db.database_type.id in runners:
            runner_cls, error_code = runners[external_db.database_type.id]
            try:
                self.query_output_size, self.source_files = runner_cls(
                    task=self.task,
                    run_id=self.run_id,
                    connection=em_decrypt(external_db.connection_string,
                                          app.config["PASS_KEY"]),
                    timeout=external_db.timeout
                    or app.config["DEFAULT_SQL_TIMEOUT"],
                    directory=self.temp_path,
                ).run(query)
            except ValueError as message:
                raise RunnerException(self.task, self.run_id, error_code,
                                      message) from message
            except BaseException as message:
                raise RunnerException(
                    self.task, self.run_id, error_code,
                    f"Failed to run query.\n{message}") from message

        RunnerLog(
            self.task,
            self.run_id,
            8,
            f"Query completed.\nData file {self.source_files[0].name} created. Data size: {file_size(str(Path(self.source_files[0].name).stat().st_size))}.",
        )

    elif self.task.source_type_id == 2:  # smb file
        file_name = self.param_loader.insert_file_params(
            self.task.source_smb_file)

        file_name = DateParsing(
            task=self.task,
            run_id=self.run_id,
            date_string=file_name,
        ).string_to_date()

        self.source_files = Smb(
            task=self.task,
            run_id=self.run_id,
            connection=self.task.source_smb_conn,
            directory=self.temp_path,
        ).read(file_name=file_name)

    elif self.task.source_type_id == 3:  # sftp file
        RunnerLog(self.task, self.run_id, 9, "Loading data from server...")

        file_name = self.param_loader.insert_file_params(
            self.task.source_sftp_file)

        file_name = DateParsing(
            task=self.task,
            run_id=self.run_id,
            date_string=file_name,
        ).string_to_date()

        self.source_files = Sftp(
            task=self.task,
            run_id=self.run_id,
            connection=self.task.source_sftp_conn,
            directory=self.temp_path,
        ).read(file_name=file_name)

    elif self.task.source_type_id == 4:  # ftp file
        RunnerLog(self.task, self.run_id, 13, "Loading data from server...")

        file_name = self.param_loader.insert_file_params(
            self.task.source_ftp_file)

        file_name = DateParsing(
            task=self.task,
            run_id=self.run_id,
            date_string=file_name,
        ).string_to_date()

        self.source_files = Ftp(
            task=self.task,
            run_id=self.run_id,
            connection=self.task.source_ftp_conn,
            directory=self.temp_path,
        ).read(file_name=file_name)

    elif self.task.source_type_id == 6:  # ssh command
        query = self.__get_query()

        Ssh(
            task=self.task,
            run_id=self.run_id,
            connection=self.task.source_ssh_conn,
            command=query,
        ).run()
def __process(self) -> None:
    """Fetch the task's processing script and run it against the source files.

    ``task.processing_type_id`` selects where the script comes from:
      1 = smb, 2 = sftp, 3 = ftp, 4 = git url, 5 = other url, 6 = source code.
    The script text is written to ``<temp_path>/<run_id>.py`` and executed by
    ``PyProcesser``; non-empty output replaces ``self.data_files``.

    Raises:
        RunnerException: when the script cannot be fetched or written, or
            there is not enough information to locate one.
    """
    RunnerLog(self.task, self.run_id, 8, "Starting processing script...")

    # default script location; remotely fetched code is written here.
    processing_script_name = self.temp_path / (self.run_id + ".py")

    my_file = ""
    if (self.task.processing_type_id == 1
            and self.task.processing_smb_id is not None):
        # BUGFIX: this branch previously read task.source_smb_file; the
        # processing branch must use the processing file setting, matching
        # the sftp/ftp branches below.
        file_name = self.param_loader.insert_file_params(
            self.task.processing_smb_file)
        file_name = DateParsing(
            task=self.task,
            run_id=self.run_id,
            date_string=file_name,
        ).string_to_date()
        my_file = Path(
            Smb(
                task=self.task,
                run_id=self.run_id,
                directory=self.temp_path,
                connection=self.task.processing_smb_conn,
            ).read(file_name)[0].name).read_text("utf8")

    elif (self.task.processing_type_id == 2
          and self.task.processing_sftp_id is not None):
        file_name = self.param_loader.insert_file_params(
            self.task.processing_sftp_file)
        file_name = DateParsing(
            task=self.task,
            run_id=self.run_id,
            date_string=file_name,
        ).string_to_date()
        my_file = Path(
            Sftp(
                task=self.task,
                run_id=self.run_id,
                connection=self.task.processing_sftp_conn,
                directory=self.temp_path,
            ).read(file_name=file_name)[0].name).read_text("utf8")

    elif (self.task.processing_type_id == 3
          and self.task.processing_ftp_id is not None):
        file_name = self.param_loader.insert_file_params(
            self.task.processing_ftp_file)
        file_name = DateParsing(
            task=self.task,
            run_id=self.run_id,
            date_string=file_name,
        ).string_to_date()
        # BUGFIX: previously connected with task.source_ftp_conn; the
        # processing branch must use the processing connection, matching
        # the sftp branch above.
        my_file = Path(
            Ftp(
                task=self.task,
                run_id=self.run_id,
                connection=self.task.processing_ftp_conn,
                directory=self.temp_path,
            ).read(file_name=file_name)[0].name).read_text("utf8")

    elif self.task.processing_type_id == 4 and self.task.processing_git is not None:
        # if a dir is specified then download all files
        if (self.task.processing_command is not None
                and self.task.processing_command != ""):
            try:
                # embed url-quoted credentials into the clone url.
                url = (re.sub(
                    r"(https?://)(.+?)",
                    r"\1<username>:<password>@\2",
                    self.task.processing_git,
                    flags=re.IGNORECASE,
                ).replace(
                    "<username>",
                    urllib.parse.quote(app.config["GIT_USERNAME"])).replace(
                        "<password>",
                        urllib.parse.quote(app.config["GIT_PASSWORD"])))

                cmd = ("$(which git) clone -q --depth 1 " +
                       '--recurse-submodules --shallow-submodules %s "%s"' %
                       (url, str(self.temp_path)))

                Cmd(
                    self.task,
                    self.run_id,
                    cmd,
                    "Repo cloned.",
                    "Failed to clone repo: %s" % (self.task.processing_git, ),
                ).shell()

            # pylint: disable=broad-except
            except BaseException as e:
                # chained so the clone failure is visible in the traceback.
                raise RunnerException(
                    self.task, self.run_id, 8,
                    "Processor failed to clone repo.") from e

        # otherwise get py file
        else:
            my_file = self.source_loader.gitlab(self.task.processing_git)

    elif self.task.processing_type_id == 5 and self.task.processing_url is not None:
        if self.task.processing_command is not None:
            try:
                cmd = ("$(which git) clone -q --depth 1 " +
                       '--recurse-submodules --shallow-submodules %s "%s"' %
                       (self.task.processing_url, str(self.temp_path)))

                Cmd(
                    task=self.task,
                    run_id=self.run_id,
                    cmd=cmd,
                    success_msg="Repo cloned",
                    error_msg="Failed to clone repo: %s" %
                    (self.task.processing_url, ),
                ).shell()

                # NOTE(review): no path separator is inserted between
                # temp_path and processing_command here — this relies on
                # processing_command starting with "/"; confirm.
                processing_script_name = str(self.temp_path) + (
                    self.task.processing_command
                    if self.task.processing_command is not None else "")

            # pylint: disable=broad-except
            except BaseException as e:
                raise RunnerException(
                    self.task, self.run_id, 8,
                    "Processor failed to clone repo.") from e

        else:
            my_file = self.source_loader.web_url(self.task.processing_url)

    elif (self.task.processing_type_id == 6
          and self.task.processing_code is not None):
        my_file = self.task.processing_code

    elif self.task.processing_type_id > 0:
        raise RunnerException(
            self.task,
            self.run_id,
            8,
            "Processing error, Not enough information to run a processing script from.",
        )

    try:
        if my_file != "" and self.task.processing_type_id > 0:
            Path(processing_script_name).parent.mkdir(parents=True,
                                                      exist_ok=True)
            with open(processing_script_name, "w") as text_file:
                text_file.write(my_file)
            RunnerLog(self.task, self.run_id, 8, "Processing script created.")

    # pylint: disable=broad-except
    except BaseException as e:
        raise RunnerException(self.task, self.run_id, 8,
                              f"Processing script failure:\n{e}") from e

    # run processing script
    output = PyProcesser(
        task=self.task,
        run_id=self.run_id,
        directory=self.temp_path,
        source_files=self.source_files,
        script=self.task.processing_command or processing_script_name.name,
    ).run()

    # allow processer to rename file
    if output:
        RunnerLog(self.task, self.run_id, 8,
                  f"Processing script output:\n{output}")
        self.data_files = output
def save(self, overwrite: int, file_name: str) -> str:  # type: ignore[return]
    """Load data into network file path, creating location if not existing.

    :param overwrite: 1 to replace an existing remote file; any other value
        leaves an existing remote file untouched and returns its path.
    :param file_name: name of the local file (inside ``self.dir``) to upload.
    :returns: the destination path on the share.
    :raises RunnerException: if the upload fails for any reason.
    """
    try:
        # destination: the connection's configured path, or (for the backup
        # share, when connection is None) project/task/run-id folders.
        if self.connection is not None:
            dest_path = str(
                Path(self.connection.path or "").joinpath(file_name))
        else:
            dest_path = str(
                Path(
                    Path(sanitize_filename(self.task.project.name)) /
                    sanitize_filename(self.task.name) /
                    sanitize_filename(self.task.last_run_job_id) / file_name))

        # path must be created one folder at a time.. the docs say the path will all be
        # created if not existing, but it doesn't seem to be the case :)
        my_dir = dest_path.split("/")[:-1]

        path_builder = ""
        for my_path in my_dir:
            path_builder += my_path + "/"

            try:
                # listPath raises OperationFailure when the folder is missing.
                self.conn.listPath(self.share_name, path_builder)
            # pylint: disable=broad-except
            except OperationFailure:
                self.conn.createDirectory(self.share_name, path_builder)

        # NOTE: this for/else always runs (the loop has no break); it simply
        # sequences the overwrite check after every folder exists.
        # pylint: disable=useless-else-on-loop
        else:
            if overwrite != 1:
                try:
                    # try to get security of the file. if it doesn't exist,
                    # we crash and then can create the file.
                    self.conn.getSecurity(self.share_name, dest_path)
                    RunnerLog(
                        self.task,
                        self.run_id,
                        10,
                        "File already exists and will not be loaded",
                    )
                    return dest_path
                # pylint: disable=broad-except
                except BaseException:
                    pass

        # buffering=0 hands the raw file object straight to storeFile.
        with open(str(self.dir.joinpath(file_name)), "rb",
                  buffering=0) as file_obj:
            uploaded_size = self.conn.storeFile(self.share_name, dest_path,
                                                file_obj)

        server_name = ("backup" if self.connection is None else
                       self.connection.server_name)

        RunnerLog(
            self.task,
            self.run_id,
            10,
            f"{file_size(uploaded_size)} uploaded to {server_name} server.",
        )

        return dest_path
    # pylint: disable=broad-except
    except BaseException as e:
        raise RunnerException(self.task, self.run_id, 10,
                              f"Failed to save file on server.\n{e}")
def save(self) -> Tuple[str, str, str]:
    """Create and save the file.

    Builds the final output file from the raw data file: resolves the
    destination name (params + date parsing), rewrites the data through the
    csv writer per the destination file type, optionally gpg-encrypts,
    computes the md5 hash, and optionally wraps everything in a zip.

    returns [filename, filepath, md5 hash hexdigest] of final file.
    """
    if (self.task.destination_file_name is None
            or self.task.destination_file_name == ""):
        RunnerLog(
            self.task,
            self.run_id,
            11,
            f"No filename specified, {Path(self.data_file.name).name} will be used.",
        )

    if (self.task.destination_file_name != ""
            and self.task.destination_file_name is not None):
        # insert params
        self.file_name = self.params.insert_file_params(
            self.task.destination_file_name.strip())

        # parse python dates
        self.file_name = DateParsing(self.task, self.run_id,
                                     self.file_name).string_to_date()

    else:
        # fall back to the temp data file's own name.
        self.file_name = Path(self.data_file.name).name

    # 4 is other
    if self.task.destination_file_type_id != 4 and self.task.file_type is not None:
        self.file_name += "." + (self.task.file_type.ext or "csv")

    self.file_path = str(Path(self.base_path).joinpath(self.file_name))

    # if the source name matches the destination name, rename the source and update tmp file name.
    if self.data_file.name == self.file_path:
        data_file_as_path = Path(self.data_file.name)
        new_data_file_name = str(
            data_file_as_path.parent /
            (data_file_as_path.stem + "_tmp" + data_file_as_path.suffix))
        os.rename(self.data_file.name, new_data_file_name)
        self.data_file.name = new_data_file_name  # type: ignore[misc]

    with open(self.data_file.name, "r", newline="") as data_file:
        reader = csv.reader(data_file)

        with open(self.file_path, mode="w") as myfile:
            # if csv (1) or text (2) and had delimiter
            if (self.task.destination_file_type_id == 1
                    or self.task.destination_file_type_id == 2
                    or self.task.destination_file_type_id == 4) and (
                        self.task.destination_ignore_delimiter is None
                        or self.task.destination_ignore_delimiter != 1):
                # custom delimiter is only honored for txt (2) / other (4);
                # the unicode_escape decode turns e.g. "\\t" into a real tab.
                wrtr = (
                    csv.writer(
                        myfile,
                        delimiter=str(self.task.destination_file_delimiter)
                        .encode("utf-8").decode("unicode_escape"),
                        quoting=self.__quote_level(),
                    )
                    if self.task.destination_file_delimiter is not None
                    and len(self.task.destination_file_delimiter) > 0
                    and (self.task.destination_file_type_id == 2
                         or self.task.destination_file_type_id == 4
                         )  # txt or other
                    else csv.writer(
                        myfile,
                        quoting=self.__quote_level(),
                    ))
                for row in reader:
                    # strip stray wrapping quotes from each string field.
                    new_row = [(x.strip('"').strip("'") if isinstance(
                        x, str) else x) for x in row]

                    if (self.task.destination_file_type_id == 1
                            or self.task.destination_file_type_id == 2
                            or self.task.destination_file_type_id == 4
                            ) and (self.task.destination_file_line_terminator
                                   is not None
                                   and self.task.destination_file_line_terminator
                                   != ""):
                        # custom terminator is appended as an extra column.
                        new_row.append(
                            self.task.destination_file_line_terminator)

                    wrtr.writerow(new_row)

            # if xlxs (3)
            elif self.task.destination_file_type_id == 3:
                wrtr = csv.writer(
                    myfile,
                    dialect="excel",
                    quoting=self.__quote_level(),
                )
                for row in reader:
                    new_row = [(x.strip('"').strip("'") if isinstance(
                        x, str) else x) for x in row]
                    wrtr.writerow(new_row)

            else:
                # no rewriting needed; copy the data through untouched.
                for line in data_file:
                    myfile.write(line)

    RunnerLog(
        self.task,
        self.run_id,
        11,
        f"File {self.file_name} created. Size: {file_size(Path(self.file_path).stat().st_size)}.\n{self.file_path}",
    )

    # encrypt file
    if self.task.file_gpg == 1:
        gpg = gnupg.GPG("/usr/local/bin/gpg")

        # import the key
        keychain = gpg.import_keys(
            em_decrypt(self.task.file_gpg_conn.key, app.config["PASS_KEY"]))

        # set it to trusted
        gpg.trust_keys(keychain.fingerprints, "TRUST_ULTIMATE")

        # encrypt file
        with open(self.file_path, "rb") as my_file:
            encrypt_status = gpg.encrypt_file(
                file=my_file,
                recipients=keychain.fingerprints,
                output=self.file_path + ".gpg",
            )

        # remove key
        gpg.delete_keys(keychain.fingerprints)

        # update global file name
        if not encrypt_status.ok:
            raise RunnerException(
                self.task,
                self.run_id,
                11,
                "File failed to encrypt.\n%s\n%s\n%s" % (
                    self.file_path,
                    encrypt_status.status,
                    encrypt_status.stderr,
                ),
            )

        self.file_path = self.file_path + ".gpg"
        self.file_name = self.file_name + ".gpg"

        RunnerLog(
            self.task,
            self.run_id,
            11,
            "File encrypted.\n%s\n%s\n%s" %
            (self.file_path, encrypt_status.status, encrypt_status.stderr),
        )

    # get file hash.. after encrypting
    with open(self.file_path, "rb") as my_file:
        while True:
            chunk = my_file.read(8192)
            if not chunk:
                break
            self.file_hash.update(chunk)

    RunnerLog(self.task, self.run_id, 11,
              f"File md5 hash: {self.file_hash.hexdigest()}")

    # create zip
    if self.task.destination_create_zip == 1:
        self.zip_name = DateParsing(
            self.task, self.run_id,
            str(self.task.destination_zip_name)).string_to_date()

        # parse params
        self.zip_name = self.params.insert_file_params(self.zip_name)

        # ensure exactly one ".zip" suffix.
        self.zip_name = self.zip_name.replace(".zip", "") + ".zip"

        with zipfile.ZipFile(
                str(Path(self.base_path).joinpath(self.zip_name)),
                "w") as zip_file:
            zip_file.write(
                self.file_path,
                compress_type=zipfile.ZIP_DEFLATED,
                arcname=self.file_name,
            )

        # now we change all file stuff to our zip.
        self.file_name = self.zip_name
        self.file_path = str(Path(self.base_path).joinpath(self.zip_name))

        RunnerLog(self.task, self.run_id, 11,
                  f"ZIP archive created.\n{self.file_path}")

    return self.file_name, self.file_path, self.file_hash.hexdigest()
def shell(self) -> str:
    """Run input command as a shell command.

    :returns: decoded stdout (stderr is merged into stdout) on success.
    :raises RunnerException: when the command exits non-zero, its output
        contains "Error", or any other failure occurs. Any
        ``user:password@host`` credentials embedded in the output are masked
        before logging.
    """

    def _mask(text: str) -> str:
        # hide the password portion of "user:password@host" url credentials.
        return re.sub(
            r"(?<=:)([^:]+?)(?=@)",
            "*****",
            text,
            flags=re.IGNORECASE | re.MULTILINE,
        )

    # BUGFIX: out was previously unbound when check_output raised anything
    # other than CalledProcessError, turning the final handler into a
    # NameError that masked the real failure.
    out = ""
    try:
        out_bytes = subprocess.check_output(
            self.cmd, stderr=subprocess.STDOUT, shell=True
        )
        out = out_bytes.decode("utf-8")

        if "Error" in out:
            raise RunnerException(
                self.task,
                self.run_id,
                17,
                self.error_msg + ("\n" if out != "" else "") + _mask(out),
            )

        RunnerLog(
            self.task,
            self.run_id,
            17,
            self.success_msg + (("\n" + out) if out != "" else ""),
        )

        return out

    except subprocess.CalledProcessError as e:
        out = e.output.decode("utf-8")
        raise RunnerException(
            self.task,
            self.run_id,
            17,
            self.error_msg + (("\n" + out) if out != "" else "") + "\n" +
            _mask(str(e)),
        ) from e

    except BaseException as e:
        raise RunnerException(
            self.task,
            self.run_id,
            17,
            "Command failed.\n" + (("\n" + out) if out != "" else "") + "\n" +
            _mask(str(e)),
        ) from e
def run(self) -> None:
    """Run an SSH Command.

    First, this will make a connection then run the command.

    The stdout/stderr channels are drained with ``select`` until the remote
    side closes and our buffers are empty; a non-zero exit status or any
    stderr output is treated as a failure.

    Some code from https://stackoverflow.com/a/32758464 - thanks!

    :returns: Output from command.
    :raises RunnerException: if the connection or command fails.
    """
    self.__connect()

    # per-read select/exec timeout, in seconds.
    timeout = 600

    try:
        RunnerLog(
            self.task,
            self.run_id,
            19,
            "Starting command.",
        )

        # pylint: disable=W0612
        stdin, stdout, stderr = self.session.exec_command(  # noqa: S601
            self.command, timeout=timeout
        )

        channel = stdout.channel

        # we won't be sending any input; close our side of the pipe.
        stdin.close()
        channel.shutdown_write()

        stderr_data = b""
        stdout_data = b""

        while (
            not channel.closed
            or channel.recv_ready()
            or channel.recv_stderr_ready()
        ):
            got_chunk = False
            readq, _, _ = select.select([stdout.channel], [], [], timeout)

            for chunk in readq:
                if chunk.recv_ready():
                    stdout_data += stdout.channel.recv(len(chunk.in_buffer))
                    got_chunk = True

                if chunk.recv_stderr_ready():
                    stderr_data += stderr.channel.recv_stderr(
                        len(chunk.in_stderr_buffer)
                    )
                    got_chunk = True

            if (
                not got_chunk
                and stdout.channel.exit_status_ready()
                and not stderr.channel.recv_stderr_ready()
                and not stdout.channel.recv_ready()
            ):
                # indicate that we're not going to read from this channel anymore
                stdout.channel.shutdown_read()

                # close the channel
                stdout.channel.close()

                # exit as remote side is finished and our buffers are empty
                break

            time.sleep(0.01)

        out = stdout_data.decode("utf-8") or "None"
        err = stderr_data.decode("utf-8") or "None"

        # any stderr output counts as a failure, not just a non-zero exit.
        if stdout.channel.recv_exit_status() != 0 or stderr_data != b"":
            raise ValueError(
                f"Command stdout: {out}\nCommand stderr: {err}",
            )

        RunnerLog(
            self.task,
            self.run_id,
            19,
            f"Command output:\n{out}",
        )

    except BaseException as e:
        raise RunnerException(
            self.task, self.run_id, 19, f"Failed to run command.\n{e}"
        )

    self.__close()
def gitlab(self, url: str) -> str:
    """Get source code from gitlab using authentication.

    Converts a raw/blob web url into a GitLab API url, fetches the file
    with the configured private token, and returns its text. ``.sql``
    files are cached on the task and passed through ``cleanup()``. On
    fetch failure the task's cached query is used when cache is enabled.

    :param url: gitlab raw/blob url of the file to fetch.
    :returns: the source text (or a placeholder for html responses).
    :raises RunnerException: on failure with no usable cache, or when no
        url was given.
    """
    # pylint: disable=too-many-statements
    if ".git" in str(url):
        # repository urls (not file urls) are handled elsewhere.
        return ""

    if url:
        try:
            # convert the "raw" url into an api url
            branch = urllib.parse.quote(
                urllib.parse.unquote(
                    re.findall(r"\/(?:raw|blob)\/(.+?)\/", url)[0]),
                safe="",
            )

            project = urllib.parse.quote(
                urllib.parse.unquote(
                    re.findall(r"\.(?:com|net|org)\/(.+?)\/-", url)[0]),
                safe="",
            )

            file_path = urllib.parse.quote(
                urllib.parse.unquote(
                    re.findall(r"\/(?:raw|blob)\/.+?\/(.+?)$", url)[0]),
                safe="",
            )

            api_url = "%sapi/v4/projects/%s/repository/files/%s/raw?ref=%s" % (
                app.config["GIT_URL"],
                project,
                file_path,
                branch,
            )

            headers = {
                "PRIVATE-TOKEN": app.config["GIT_TOKEN"],
                "Connection": "close",
            }
            page = requests.get(
                api_url, verify=app.config["GIT_VERIFY_SSL"],
                headers=headers)  # noqa: S501

            if page.status_code != 200:
                raise Exception("Failed to get code: " + page.text)

            if url.lower().endswith(".sql"):
                self.query = page.text
                self.db_type = ("mssql"
                                if self.task.source_database_conn
                                and self.task.source_database_conn.type_id == 2
                                else None)

                # save query cache before cleanup.
                if self.run_id or self.refresh_cache:
                    self.task.source_cache = self.query
                    db.session.commit()

                    if self.refresh_cache:
                        RunnerLog(
                            self.task,
                            self.run_id,
                            15,
                            "Source cache manually refreshed.",
                        )

                # insert params
                return self.cleanup()

            # non-sql files are returned as-is, unless the server answered
            # with an html page instead of raw code.
            return (page.text if not page.text.startswith("<!DOCTYPE") else
                    "Visit URL to view code")

        # pylint: disable=broad-except
        except BaseException as e:
            # only use cache if we have a run id. Otherwise failures are from code preview.
            if (self.run_id and self.task.enable_source_cache == 1
                    and self.task.source_cache):
                RunnerLog(
                    self.task,
                    self.run_id,
                    15,
                    f"Failed to get source from {url}. Using cached query.\nFull trace:\n{e}",
                )

                self.db_type = ("mssql"
                                if self.task.source_database_conn
                                and self.task.source_database_conn.type_id == 2
                                else None)

                self.query = self.task.source_cache

                return self.cleanup()

            elif (self.run_id and self.task.enable_source_cache == 1
                  and not self.task.source_cache):
                raise RunnerException(
                    self.task,
                    self.run_id,
                    15,
                    f"Failed to get source from {url}. Cache enabled, but no cache available.\n{e}",
                )
            else:
                raise RunnerException(
                    self.task,
                    self.run_id,
                    15,
                    f"Failed to get source from {url}.\n{e}",
                )

    raise RunnerException(self.task, self.run_id, 15,
                          "No url specified to get source from.")