def test_row_str_and_repr():
    """Check the text and HTML renderings of a single ``Row``."""
    # local import: only needed to hand the HTML markup to pandas
    from io import StringIO

    r = Row({'a': 1, 'b': 2})
    # NOTE(review): confirm the spacing in this literal matches the actual
    # rendered output, the comparison below is exact
    expected = ' a b\n--- ---\n 1 2'

    # exact match: a substring check would also accept empty or truncated
    # output
    assert str(r) == expected
    assert repr(r) == expected

    # parse html representation with pandas; the markup is wrapped in
    # StringIO because passing literal HTML to read_html is deprecated
    html = pd.read_html(StringIO(r._repr_html_()))[0]
    assert html.to_dict() == {'a': {0: 1}, 'b': {0: 2}}
def status(self, return_code_diff=False):
    """Return a summary of the current task status

    Parameters
    ----------
    return_code_diff : bool, optional
        If True and the product reports an outdated code dependency, a
        'Code diff' entry (output of ``self.dag.differ.get_diff``) is
        included. Defaults to False

    Returns
    -------
    Row
        Row with the entries 'name', 'Last updated',
        'Outdated dependencies', 'Outdated code', 'Product',
        'Doc (short)' and 'Location', plus 'Code diff' when applicable
    """
    p = self.product

    data = {}
    data['name'] = self.name

    if p.timestamp is not None:
        dt = datetime.fromtimestamp(p.timestamp)
        # absolute date plus a relative description, e.g. "... ago"
        date_h = dt.strftime('%b %d, %y at %H:%M')
        time_h = humanize.naturaltime(dt)
        data['Last updated'] = '{} ({})'.format(time_h, date_h)
    else:
        # no timestamp stored in the product
        data['Last updated'] = 'Has not been run'

    data['Outdated dependencies'] = p._outdated_data_dependencies()
    outd_code = p._outdated_code_dependency()
    data['Outdated code'] = outd_code

    # the diff is only computed on request and when the code is outdated
    if outd_code and return_code_diff:
        data['Code diff'] = self.dag.differ.get_diff(
            p.stored_source_code,
            self.source_code,
            language=self.source.language)

    data['Product'] = str(self.product)
    data['Doc (short)'] = self.source.doc_short
    data['Location'] = self.source.loc

    return Row(data)
def test_table_auto_size(monkeypatch):
    """A table with column_width='auto' spans the reported terminal width,
    including after the terminal size changes
    """
    TerminalSize = namedtuple('TerminalSize', ['columns'])

    def set_terminal_columns(columns):
        # replace shutil.get_terminal_size with a stub reporting `columns`
        monkeypatch.setattr(shutil, 'get_terminal_size',
                            lambda: TerminalSize(columns))

    def widest_line(text):
        return max(len(line) for line in text.splitlines())

    set_terminal_columns(80)

    row = Row({'a': '1' * 60, 'b': '1' * 60})
    table = Table([row, row], column_width='auto')

    assert widest_line(str(table)) == 80

    # simulate resize
    set_terminal_columns(120)

    assert widest_line(str(table)) == 120
def test_table_str_and_repr(monkeypatch):
    """Check the text and HTML renderings of a two-row ``Table``."""
    # local import: only needed to hand the HTML markup to pandas
    from io import StringIO

    # replace the shutil module used by `table` so the reported terminal
    # width is 6 columns
    mock = Mock()
    mock.get_terminal_size().columns = 6
    monkeypatch.setattr(table, 'shutil', mock)

    r = Row({'a': 1, 'b': 2})
    t = Table([r, r])
    expected = ' a b\n--- ---\n 1 2\n 1 2'

    assert str(t) == expected
    assert repr(t) == expected

    # parse html representation with pandas; the markup is wrapped in
    # StringIO because passing literal HTML to read_html is deprecated
    html = pd.read_html(StringIO(t._repr_html_()))[0]
    assert html.to_dict(orient='list') == {'a': [1, 1], 'b': [2, 2]}
def status(self, return_code_diff=False, sections=None):
    """Return a summary of the current task status

    Parameters
    ----------
    return_code_diff : bool, optional
        If True and the product reports an outdated code dependency, a
        'Code diff' entry (output of ``self.dag.differ.get_diff``) is
        included. Defaults to False

    sections : list, optional
        Sections to include. Defaults to "name", "last_run", "outdated",
        "product", "doc", "location". Other recognized values: "type",
        "status", "client", "outdated_dependencies", "outdated_code",
        "product_type", "product_client"

    Returns
    -------
    Row
        Row with one entry per requested section, plus 'Code diff' when
        applicable
    """
    # None (or an empty list) selects the default sections
    sections = sections or [
        'name', 'last_run', 'outdated', 'product', 'doc', 'location'
    ]

    p = self.product

    data = {}

    if 'name' in sections:
        data['name'] = self.name

    if 'type' in sections:
        data['type'] = type(self).__name__

    if 'status' in sections:
        data['status'] = self.exec_status.name

    if 'client' in sections:
        # FIXME: all tasks should have a client property
        data['client'] = (repr(self.client)
                          if hasattr(self, 'client') else None)

    if 'last_run' in sections:
        if p.metadata.timestamp is not None:
            dt = datetime.fromtimestamp(p.metadata.timestamp)
            # absolute date plus a relative description
            date_h = dt.strftime('%b %d, %y at %H:%M')
            time_h = humanize.naturaltime(dt)
            data['Last run'] = '{} ({})'.format(time_h, date_h)
        else:
            data['Last run'] = 'Has not been run'

    # both checks always run: their results feed several sections and the
    # optional code diff
    outd_data = p._outdated_data_dependencies()
    outd_code = p._outdated_code_dependency()

    # human-readable reason(s) the task is outdated, False if there are none
    reasons = []

    if outd_code:
        reasons.append('Source code')

    if outd_data:
        reasons.append('Upstream')

    outd = ' & '.join(reasons) if reasons else False

    if 'outdated' in sections:
        data['Outdated?'] = outd

    if 'outdated_dependencies' in sections:
        data['Outdated dependencies'] = outd_data

    if 'outdated_code' in sections:
        data['Outdated code'] = outd_code

    # the diff is only computed on request and when the code is outdated
    if outd_code and return_code_diff:
        data['Code diff'] = self.dag.differ.get_diff(
            p.metadata.stored_source_code,
            str(self.source),
            extension=self.source.extension)

    if 'product_type' in sections:
        data['Product type'] = type(self.product).__name__

    if 'product' in sections:
        data['Product'] = repr(self.product)

    if 'product_client' in sections:
        # FIXME: all products should have a client property
        data['Product client'] = (repr(self.product.client) if hasattr(
            self.product, 'client') else None)

    if 'doc' in sections:
        data['Doc (short)'] = _doc_short(self.source.doc)

    if 'location' in sections:
        data['Location'] = self.source.loc

    return Row(data)
def test_convert_to_dict():
    """Table.to_dict returns the row values grouped by column"""
    row = Row({'a': 1, 'b': 2})
    rows = [row, row]

    as_dict = Table(rows, column_width=None).to_dict()

    assert as_dict == {'a': [1, 1], 'b': [2, 2]}
def test_convert_to_pandas():
    """Table.to_pandas returns a data frame with one column per key"""
    row = Row({'a': 1, 'b': 2})
    two_rows = Table([row, row], column_width=None)

    expected = pd.DataFrame({'a': [1, 1], 'b': [2, 2]})
    converted = two_rows.to_pandas()

    assert expected.equals(converted)
def test_create_build_report():
    """BuildReport adds a 'Percentage' column next to 'Elapsed (s)'"""
    one_second = Row({'Elapsed (s)': 1})
    rows = [one_second, one_second]

    report = BuildReport(rows)

    # two rows with the same elapsed time: 50 each
    assert report == {'Elapsed (s)': [1, 1], 'Percentage': [50, 50]}
def test_table_values():
    """Table.values exposes the row values grouped by column"""
    row = Row({'a': 1, 'b': 2})
    rows = [row, row]

    values = Table(rows, column_width=None).values

    assert values == {'a': [1, 1], 'b': [2, 2]}
def test_select_multiple_cols_in_table():
    """Indexing a Table with a list of keys returns those columns"""
    row = Row({'a': 1, 'b': 2})
    two_rows = Table([row, row], column_width=None)

    selected = two_rows[['a', 'b']]

    assert selected == {'a': [1, 1], 'b': [2, 2]}
def test_error_if_row_initialized_with_non_mapping():
    """Row raises TypeError when initialized with a list"""
    not_a_mapping = []

    with pytest.raises(TypeError):
        Row(not_a_mapping)
def test_select_multiple_cols_in_row():
    """Indexing a Row with a list of keys returns those entries"""
    row = Row({'a': 1, 'b': 2})

    selected = row[['a', 'b']]

    assert selected == {'a': 1, 'b': 2}
def test_table_wrap():
    """With column_width=3, no rendered line is wider than both columns
    plus the separator
    """
    row = Row({'a': 'abc d', 'b': 'abc d'})
    wrapped = Table([row, row], column_width=3)

    line_widths = map(len, str(wrapped).splitlines())

    # Max expected length: 3 (col a) + 2 (whitespace) + 3 (col b) = 8
    assert max(line_widths) == 8
def test_table_iter():
    """Iterating over a Table yields its column names"""
    row = Row({'a': 1, 'b': 2})
    two_rows = Table([row, row])

    columns = set(iter(two_rows))

    assert columns == {'a', 'b'}
def test_row_str_setitem():
    """A value assigned to a Row key is returned when reading that key"""
    row = Row({'a': 1, 'b': 2})

    row['a'] = 10
    updated = row['a']

    assert updated == 10
def build(self, force=False):
    """Run the task if needed by checking its dependencies

    Parameters
    ----------
    force : bool, optional
        If True, skip the dependency checks and always run.
        Defaults to False

    Returns
    -------
    self
        The task itself. A Row with the keys 'name', 'Ran?' and
        'Elapsed (s)' is stored in ``self.build_report``

    Raises
    ------
    TaskBuildError
        If the product does not exist after running, or if the on_finish
        callback raises an exception
    """
    # TODO: if this is run in a task that has upstream dependencies
    # it will fail with a useless error since self.params does not have
    # upstream yet (added after rendering)

    # NOTE: should i fetch metadata here? I need to make sure I have
    # the latest before building

    self._logger.info(f'-----\nChecking {repr(self)}....')

    # do not run unless some of the conditions below match...
    run = False
    elapsed = 0

    if force:
        self._logger.info('Forcing run, skipping checks...')
        run = True
    else:
        # not forcing, need to check dependencies...
        p_exists = self.product.exists()

        # check dependencies only if the product exists and there is
        # metadata
        if p_exists and self.product.metadata is not None:

            outdated_data_deps = self.product._outdated_data_dependencies()
            outdated_code_dep = self.product._outdated_code_dependency()

            self._logger.info('Checking dependencies...')

            if outdated_data_deps:
                run = True
                self._logger.info('Outdated data deps...')
            else:
                self._logger.info('Up-to-date data deps...')

            if outdated_code_dep:
                run = True
                self._logger.info('Outdated code dep...')
            else:
                self._logger.info('Up-to-date code dep...')
        else:
            run = True

            # just log why it will run
            if not p_exists:
                self._logger.info('Product does not exist...')

            if self.product.metadata is None:
                self._logger.info('Product metadata is None...')

            self._logger.info('Running...')

    if run:
        self._logger.info(f'Starting execution: {repr(self)}')

        then = datetime.now()

        try:
            self.run()
        except Exception:
            tb = traceback.format_exc()

            if self.on_failure:
                try:
                    self.on_failure(self, tb)
                except Exception:
                    # a failing callback must not hide the original error
                    self._logger.exception('Error executing on_failure '
                                           'callback')

            # bare raise: propagate the original exception unchanged
            raise

        now = datetime.now()
        elapsed = (now - then).total_seconds()
        self._logger.info(f'Done. Operation took {elapsed:.1f} seconds')

        # update metadata
        self.product.timestamp = datetime.now().timestamp()
        self.product.stored_source_code = self.source_code
        self.product.save_metadata()

        # TODO: also check that the Products were updated:
        # if they did not exist, they must exist now, if they already
        # exist, timestamp must be recent equal to the datetime.now()
        # used. maybe run fetch metadata again and validate?

        if not self.product.exists():
            raise TaskBuildError(f'Error building task "{self}": '
                                 'the task ran successfully but product '
                                 f'"{self.product}" does not exist yet '
                                 '(task.product.exist() returned False)')

        if self.on_finish:
            try:
                # pass the client only if the callback declares a
                # "client" argument
                if 'client' in inspect.getfullargspec(self.on_finish).args:
                    self.on_finish(self, client=self.client)
                else:
                    self.on_finish(self)
            except Exception as e:
                # chain explicitly so the callback's error is the cause
                raise TaskBuildError('Exception when running on_finish '
                                     'for task {}: {}'.format(self, e)) from e

    else:
        self._logger.info(f'No need to run {repr(self)}')

    self._logger.info('-----\n')

    self._status = TaskStatus.Executed

    # let downstream tasks refresh their status
    for t in self._get_downstream():
        t._update_status()

    self.build_report = Row({
        'name': self.name,
        'Ran?': run,
        'Elapsed (s)': elapsed,
    })

    return self
def test_select_col_in_table():
    """Indexing a Table with a single key returns that column as a list"""
    row = Row({'a': 1, 'b': 2})
    two_rows = Table([row, row], column_width=None)

    column_a = two_rows['a']

    assert column_a == [1, 1]
def test_rows2columns():
    """rows2columns collects the values of each key across rows"""
    rows = [Row({'a': value}) for value in (1, 2)]

    columns = rows2columns(rows)

    assert columns == {'a': [1, 2]}