def run(self) -> bool:
    """
    Runs the singer tap shell command.

    Returns:
        False on failure, otherwise the result of the shell command
    """
    from .. import shell

    command = self.shell_command()

    # Materialize the in-memory tap config as a temporary JSON file; without
    # one, a pre-existing config file on disk is required.
    uses_temp_config = bool(self._tap_config)
    if uses_temp_config:
        temp_config_path = pathlib.Path(config.config_dir()) / f'{self.config_file_name}.tmp'
        with open(temp_config_path, 'w') as temp_config_file:
            json.dump(self.tap_config, temp_config_file)
    elif not os.path.exists(self.config_file_path()):
        log(message=f"The tap config '{self.config_file_path()}' does not exist.",
            is_error=True)
        return False

    try:
        return shell.singer_run_shell_command(command)
    finally:
        # always clean up the temp config file, even when the command failed
        if uses_temp_config:
            os.remove(temp_config_path)
def _pre_run(self) -> bool: if not os.path.exists(self.destination_path()): log(message= f"The destination path '{self.destination_path()}' does not exist.", is_error=True) return False return True
def run(self) -> bool:
    """
    Reads a data frame from the database via SQL and writes it to an AVRO file.

    Returns:
        False on failure
    """
    # resolve the AVRO schema and the SQL query to execute
    schema = self.get_schema()
    sql_query = self.get_sql_query()

    # query the data frame from the database
    logger.log('Read data from SQL', format=logger.Format.ITALICS)
    data_frame = read_dataframe(self.db_alias, sql_query)

    # dump the data frame as an AVRO file into the data directory
    target_path = str(pathlib.Path(config.data_dir()) / self.file_name)
    logger.log(f'Write to AVRO file {target_path}', format=logger.Format.ITALICS)
    pdx.to_avro(target_path, data_frame, schema=schema)
    return True
def get_schema(self):
    """Returns the AVRO schema: loaded from `schema_file_name` when set, otherwise `self.schema`."""
    if not self.schema_file_name:
        return self.schema
    path = str(self.schema_file_path().absolute())
    logger.log(f'Load AVRO schema from file {path}', format=logger.Format.ITALICS)
    with open(path, 'r') as schema_file:
        return json.load(schema_file)
def run(self) -> bool:
    """
    Runs the singer target command.

    When `stream_selection` is set, marks the selected streams in a copy of the
    catalog and saves it to a temporary file; also writes a temporary target
    config file. Both temp files are removed after the command ran.

    Returns:
        False on failure
    """
    # create temp catalog (if necessary)
    tmp_catalog_file_path = None
    if self.stream_selection:
        tmp_catalog_file_path = self.catalog_file_path()
        catalog = SingerCatalog(self.catalog_file_name)
        has_error = False
        # stream_selection is either a list of stream names ...
        if isinstance(self.stream_selection, list):
            for stream_name in self.stream_selection:
                if stream_name in catalog.streams:
                    catalog.streams[stream_name].mark_as_selected()
                else:
                    log(message=f"Could not find stream '{stream_name}' in catalog for selection",
                        is_error=True)
                    has_error = True
        # ... or a dict mapping stream names to the properties to select
        elif isinstance(self.stream_selection, dict):
            for stream_name, properties in self.stream_selection.items():
                if stream_name in catalog.streams:
                    catalog.streams[stream_name].mark_as_selected(properties=properties)
                else:
                    log(message=f"Could not find stream '{stream_name}' in catalog for selection",
                        is_error=True)
                    has_error = True
        else:
            raise Exception(
                f'Unexpected type of stream_selection: {self.stream_selection.__class__.__name__}')
        # abort before running the command when any selected stream is unknown
        if has_error:
            return False
        catalog.save(tmp_catalog_file_path)

    # create temp target config file
    target_config = {}
    self._create_target_config(target_config)
    tmp_target_config_path = self._target_config_path()
    with open(tmp_target_config_path, 'w') as target_config_file:
        json.dump(target_config, target_config_file)

    # run command
    try:
        # run pre-checks before calling run
        if not self._pre_run():
            return False
        # execute shell command
        if not super().run():
            return False
    finally:
        # clean up the temp files regardless of success or failure
        if self.stream_selection:
            os.remove(tmp_catalog_file_path)
        os.remove(tmp_target_config_path)
    return True
def run(self):
    """
    Consumes the singer tap's stderr stream line by line and forwards each
    message to the logger according to its log level. Sets `_has_error` when
    an ERROR or CRITICAL message was seen.
    """
    self._has_error = False
    for line in self.process.stderr:
        # a singer log line starts with its level, separated by a space;
        # lines without a space are treated as level NOTSET
        level, separator, remainder = line.partition(' ')
        if not separator:
            level, message = 'NOTSET', line
        else:
            message = remainder

        if level == 'INFO':
            if message.startswith('METRIC:'):
                # This data could be used for showing execution statistics; see also https://github.com/singer-io/getting-started/blob/96a0f7addec517fcf5155284744c648fe4f16902/docs/SYNC_MODE.md#metric-messages
                logger.log(message, format=logger.Format.ITALICS)
            else:
                logger.log(message, format=logger.Format.VERBATIM)
        elif level in ('NOTSET', 'WARNING'):
            logger.log(message, format=logger.Format.VERBATIM)
        elif level == 'DEBUG':
            pass  # DEBUG messages are ignored
        elif level in ('ERROR', 'CRITICAL'):
            self._has_error = True
            logger.log(message, format=logger.Format.VERBATIM, is_error=True)
def write_mondrian_schema():
    """
    Generates the Mondrian schema XML file for all configured data sets.

    Returns:
        True (always succeeds unless the generation raises)
    """
    import mara_mondrian.schema_generation

    file_name = pathlib.Path('.mondrian-schema.xml')
    logger.log(f'Writing {file_name}', logger.Format.ITALICS)
    mara_mondrian.schema_generation.write_mondrian_schema(
        # reuse the path computed above (was duplicated as a second literal)
        file_name=file_name,
        data_set_tables={data_set: ('mondrian', data_set.id())
                         for data_set in mara_schema.config.data_sets()},
        personal_data=False,
        high_cardinality_attributes=False)
    return True
def get_sql_query(self):
    """
    Returns the SQL query to execute: read from `sql_file_path` when set,
    overridden by `sql_query` when set, with `replace` substitutions applied.
    """
    query = None
    if self.sql_file_path:
        file_path = str(self.sql_file_path().absolute())
        logger.log(f'Read SQL query from file {file_path}', format=logger.Format.ITALICS)
        with open(file_path, 'r') as sql_file:
            query = sql_file.read()
    if self.sql_query:
        # an explicitly set query takes precedence over the file content
        query = self.sql_query
    if self.replace:
        for old, new in self.replace:
            query = query.replace(old, new)
    return query
def run(self):
    """Fails when any file in the pipeline's base directory matches `self.pattern`."""
    from mara_pipelines import shell
    from mara_pipelines.logging import logger

    base_directory = self.parent.parent.base_path()
    exclude_args = ' --exclude=__init__.py --exclude=\*.md --exclude=\*.pyc'

    # cd'ing && grepping in . allows us to show short filenames;
    # the "(...) || true" ensures we do not get any output if nothing is found
    command = f'(cd "{base_directory}" && egrep --recursive {exclude_args} "{self.pattern}" .) || true'
    result = shell.run_shell_command(command)

    # because of the `|| true`, the result is either True (no matches) or the matching lines
    if result is True:
        return True

    logger.log(f"Please don't use the pattern '{self.pattern}' in this pipeline. Matching lines:",
               format=logger.Format.ITALICS)
    logger.log('\n'.join(result), format=logger.Format.ITALICS)
    return False
def run(self) -> bool:
    """Loads google analytics data into the target table; returns False on failure."""
    logger.log(
        f'Loading google analytics data {self.view_id} ({self.dimensions} {self.metrics}) into {self.target_db_alias}.{self.target_table_name}...')
    succeeded = super().run()
    if not succeeded:
        logger.log('Error while loading google analytics data.')
        return False
    logger.log('Finished loading google analytics data.')
    return True
def run(self) -> bool:
    """Loads a google sheet into the target table; returns False on failure."""
    logger.log(
        f'Loading google sheet {self.spreadsheet_key} into {self.target_db_alias}.{self.target_table_name}...')
    succeeded = super().run()
    if not succeeded:
        logger.log(f'Error while loading google sheet {self.spreadsheet_key}.')
        return False
    logger.log(f'Finished loading google sheet {self.spreadsheet_key}.')
    return True
def read_process_stdout():
    # Closure over `process`, `output_lines` and `logger` from the enclosing
    # scope: mirrors the subprocess's stdout into `output_lines` while echoing
    # each line through the logger in (near) real time.
    for stdout_line in process.stdout:
        output_lines.append(stdout_line)
        logger.log(stdout_line, format=logger.Format.VERBATIM)
def singer_run_shell_command(command: str, log_command: bool = True):
    """
    Runs a command in a bash shell and logs the output of the command in
    (near) real-time according to the singer specification:
    https://github.com/singer-io/getting-started/blob/master/docs/SPEC.md#output

    Args:
        command: The command to run
        log_command: When true, then the command itself is logged before execution

    Returns:
        Either (in order)
        - False when the exit code of the command was not 0
        - True when there was no output to stdout
        - The output to stdout, as an array of lines
    """
    import shlex, subprocess, threading

    if log_command:
        logger.log(command, format=logger.Format.ITALICS)

    process = subprocess.Popen(shlex.split(config.bash_command_string()) + ['-c', command],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE,
                               universal_newlines=True)

    # keep stdout output
    output_lines = []

    # unfortunately, only file descriptors and the system streams can be passed to
    # subprocess.Popen(..) (and not custom streams without a file handle).
    # So in order to be able to log the output in real-time, we have to
    # drain the output streams of the process from two separate threads
    def read_process_stdout():
        for line in process.stdout:
            output_lines.append(line)
            logger.log(line, format=logger.Format.VERBATIM)

    read_stdout_thread = threading.Thread(target=read_process_stdout)
    read_stdout_thread.start()

    read_singertaplog_thread = SingerTapReadLogThread(process=process)
    read_singertaplog_thread.start()

    # The reader threads exit when their pipes reach EOF (i.e. the process has
    # ended); joining them first (instead of busy-polling the process) both
    # avoids wasting CPU and guarantees all output has been consumed before
    # the exit code is inspected.
    read_stdout_thread.join()
    read_singertaplog_thread.join()
    exitcode = process.wait()

    if read_singertaplog_thread.has_error:
        logger.log('Singer tap error occurred', is_error=True, format=logger.Format.ITALICS)
        return False

    if exitcode != 0:
        logger.log(f'exit code {exitcode}', is_error=True, format=logger.Format.ITALICS)
        return False

    return output_lines or True