def __init__(self, datadir, pipeline):
    self.task_queue = pipeline
    self.datadir = datadir
    params = docker.Config(self.datadir).get_parameters()
    self.login = params.get('#login')
    self.password = params.get('#password')
    self.out_cols = ['AVAILABILITY', 'COUNTRY', 'CSE_ID', 'CSE_URL',
                     'DISTRCHAN', 'ESHOP', 'FREQ', 'HIGHLIGHTED_POSITION',
                     'MATERIAL', 'POSITION', 'PRICE', 'RATING',
                     'REVIEW_COUNT', 'SOURCE', 'SOURCE_ID', 'STOCK', 'TOP',
                     'TS', 'URL']
    self.all_cols = self.out_cols + ['DATE', 'ZBOZI_SHOP_ID', 'MATCHING_ID']
    self.export_table = 'results'
    self.daily_uploads_file = 'zbozi_products.csv'
    self.previous_df = self.load_previous_ids()
    try:
        # load next url from file, if previous run ended early
        keep_scraping = pd.read_csv(
            f'{self.datadir}in/tables/keep_scraping.csv', dtype=object)
        logging.debug(str(keep_scraping))
        next_url = keep_scraping.iloc[0, 0]
        if next_url and str(next_url).lower() not in ['none', 'nan', 'false']:
            self.next_url = next_url
        else:
            raise IndexError()
    except (IndexError, FileNotFoundError):
        logging.warning('No next_url, starting from scratch')
        self.next_url = ('/v1/shop/items'
                         '?paired=True&limit=1000&loadProductDetail=False')
def test_empty_storage(self):
    cfg = docker.Config(
        os.path.join(os.getenv('KBC_DATADIR', ''), '..', 'data2'))
    self.assertEqual(cfg.get_expected_output_tables(), [])
    self.assertEqual(cfg.get_expected_output_files(), [])
    self.assertEqual(cfg.get_input_tables(), [])
    self.assertEqual(cfg.get_input_files(), [])
    self.assertEqual(cfg.get_parameters(), {})
def test_get_file_manifest(self):
    cfg = docker.Config()
    files = cfg.get_input_files()
    file1 = cfg.get_file_manifest(files[0])
    self.assertEqual(151971405, file1['id'])
    self.assertEqual('21702.strip.print.gif', file1['name'])
    self.assertEqual(['dilbert'], file1['tags'])
    file2 = cfg.get_file_manifest('151971405_21702.strip.print.gif')
    self.assertEqual(file1, file2)
def test_table_manifest_error_column_delete_2(self):
    cfg = docker.Config()
    # join the temp dir and file name properly (the original concatenated
    # them inside a single-argument os.path.join, which joined nothing)
    some_file = os.path.join(tempfile.mkdtemp('kbc-test'), 'some-table.csv')
    with self.assertRaises(TypeError):
        cfg.write_table_manifest(some_file, delete_where={
            "column": "a",
            "values": "b"
        })
def test_table_manifest_minimal(self):
    cfg = docker.Config()
    some_file = os.path.join(tempfile.mkdtemp('kbc-test'), 'some-table.csv')
    cfg.write_table_manifest(some_file, primary_key=['foo', 'bar'])
    manifest_filename = some_file + '.manifest'
    with open(manifest_filename) as manifest_file:
        config = json.load(manifest_file)
    self.assertEqual({'primary_key': ['foo', 'bar']}, config)
    os.remove(manifest_filename)
def test_get_parameters(self):
    cfg = docker.Config()
    params = cfg.get_parameters()
    self.assertEqual({
        'fooBar': {
            'bar': 24,
            'foo': 42
        },
        'baz': 'bazBar'
    }, params)
    self.assertEqual(params['fooBar']['foo'], 42)
    self.assertEqual(params['fooBar']['bar'], 24)
def test_get_input_tables(self):
    cfg = docker.Config()
    tables = cfg.get_input_tables()
    self.assertEqual(len(tables), 2)
    for table in tables:
        if table['destination'] == 'sample.csv':
            self.assertEqual(table['source'], 'in.c-main.test')
            self.assertTrue(os.path.isfile(table['full_path']))
        else:
            self.assertEqual('in.c-main.test2', table['source'])
            self.assertTrue(os.path.isfile(table['full_path']))
def __init__(self, mandatory_params, data_path=None):
    # fetch data folder from ENV by default
    if not data_path:
        data_path = os.environ.get('KBC_DATADIR')
    self.kbc_config_id = os.environ.get('KBC_CONFIGID')
    self.data_path = data_path
    self.configuration = docker.Config(data_path)
    self.cfg_params = self.configuration.get_parameters()
    self.tables_out_path = os.path.join(data_path, 'out', 'tables')
    self.tables_in_path = os.path.join(data_path, 'in', 'tables')
    self._mandatory_params = mandatory_params
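# A minimal sketch (not part of the component above) of how the stored
# `_mandatory_params` might be checked at startup. `validate_params` is a
# hypothetical helper added here only to illustrate the intent of keeping
# the mandatory-parameter list on the instance.
def validate_params(cfg_params: dict, mandatory_params: list):
    # collect every mandatory key missing from the loaded configuration
    missing = [p for p in mandatory_params if p not in cfg_params]
    if missing:
        raise ValueError(
            f'Missing mandatory configuration parameters: {missing}')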
def __init__(self):
    self.utctime_started_datetime = datetime.datetime.utcnow()
    self.datadir = os.getenv("KBC_DATADIR", "/data/")
    cfg = docker.Config(self.datadir)
    self.parameters = cfg.get_parameters()
    # log parameters (excluding sensitive designated by '#')
    logging.info({k: v for k, v in self.parameters.items() if "#" not in k})
    self.wanted_columns = self.parameters.get("wanted_columns")
    self.columns_mapping = self.parameters.get("columns_mapping")
    self.allowed_file_patterns = self.parameters.get("allowed_file_patterns")
    self.forbidden_file_patterns = self.parameters.get(
        "forbidden_file_patterns")
    self.last_timestamp_filename = self.parameters.get(
        "last_timestamp_filename")
    self.input_filelist_filename = self.parameters.get(
        "input_filelist_filename")
    self.input_fileset = {}
    self.files_to_process = []
    self.last_processed_timestamp = None
    self.max_timestamp_this_run_tz = None
    self.max_timestamp_this_run = None
def test_file_manifest(self):
    cfg = docker.Config()
    some_file = os.path.join(tempfile.mkdtemp('kbc-test'), 'someFile.txt')
    cfg.write_file_manifest(some_file, file_tags=['foo', 'bar'],
                            is_public=True, is_permanent=False, notify=True)
    manifest_filename = some_file + '.manifest'
    with open(manifest_filename) as manifest_file:
        config = json.load(manifest_file)
    self.assertEqual(
        {
            'is_public': True,
            'is_permanent': False,
            'notify': True,
            'tags': ['foo', 'bar']
        },
        config)
    os.remove(manifest_filename)
def test_table_manifest_full(self):
    cfg = docker.Config()
    some_file = os.path.join(tempfile.mkdtemp('kbc-test'), 'some-table.csv')
    cfg.write_table_manifest(
        some_file,
        columns=['foo', 'bar'],
        destination='some-destination',
        primary_key=['foo'],
        incremental=True,
        metadata={'bar': 'kochba'},
        column_metadata={'bar': {'foo': 'gogo'}},
        delete_where={
            'column': 'lilly',
            'values': ['a', 'b'],
            'operator': 'eq'
        })
    manifest_filename = some_file + '.manifest'
    with open(manifest_filename) as manifest_file:
        config = json.load(manifest_file)
    self.assertEqual(
        {
            'destination': 'some-destination',
            'columns': ['foo', 'bar'],
            'primary_key': ['foo'],
            'incremental': True,
            'metadata': [{'key': 'bar', 'value': 'kochba'}],
            'column_metadata': {'bar': [{'key': 'foo', 'value': 'gogo'}]},
            'delete_where_column': 'lilly',
            'delete_where_values': ['a', 'b'],
            'delete_where_operator': 'eq'
        },
        config)
    os.remove(manifest_filename)
def run(datadir):
    cfg = kbc_py.Config(datadir)
    datadir_path = Path(datadir)
    in_base_path = datadir_path / 'in/files'
    out_base_path = datadir_path / 'out/files'
    params = validate_expand_defaults(cfg.get_parameters())

    print("Datadir: " + str(list(str(d) for d in datadir_path.glob("**"))))

    output_params = params["output"]
    feature_format = feature_output_formats[output_params["featureFormat"]]
    include_additional_fields = output_params["includeAdditionalColumns"]

    input_format_params = params["input"]["format"]
    for format_name, format_params in input_format_params.items():
        in_format = input_formats[format_name]
        enabled = format_params["enabled"]
        glob_pattern = format_params["glob"]
        if not enabled:
            continue
        matching_files = list(in_base_path.glob(glob_pattern))
        print(f"Files matching {glob_pattern} in {in_base_path}: "
              f"{[str(f) for f in matching_files]}")
        for full_in_path in matching_files:
            relative_path = Path(full_in_path).relative_to(in_base_path)
            target_relative_path = relative_path.with_suffix(".csv")
            full_out_path = out_base_path / target_relative_path
            print(f"Converting {relative_path} (as {format_name}) "
                  f"to {target_relative_path}")
            full_out_path.parent.mkdir(parents=True, exist_ok=True)
            # fixed encoding typo: "utf=8" -> "utf-8"
            with open(str(full_out_path), mode="wt", encoding="utf-8") as out:
                convert(str(full_in_path), out, in_format, feature_format,
                        include_additional_fields)
def run(datadir):
    cfg = docker.Config(datadir)
    parameters = cfg.get_parameters()
    print("Hello World!")
    print(parameters)

    in_file = datadir + '/in/tables/source.csv'
    out_file = datadir + '/out/tables/destination.csv'
    with open(in_file, mode='rt', encoding='utf-8') as in_file, \
            open(out_file, mode='wt', encoding='utf-8') as out_file:
        lazy_lines = (line.replace('\0', '') for line in in_file)
        reader = csv.DictReader(lazy_lines, dialect='kbc')
        writer = csv.DictWriter(out_file, dialect='kbc',
                                fieldnames=reader.fieldnames)
        writer.writeheader()
        for row in reader:
            writer.writerow({
                'id': int(row['id']) * 42,
                'sound': row['sound'] + 'ping'
            })
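# Illustrative only: the per-row transform in run() above behaves like this
# (the row values are made up, not taken from any real source.csv):
row = {'id': '2', 'sound': 'beep'}
transformed = {'id': int(row['id']) * 42, 'sound': row['sound'] + 'ping'}
assert transformed == {'id': 84, 'sound': 'beepping'}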
def parse_configs():
    kbc_datadir = os.getenv("KBC_DATADIR", "/data/")
    cfg = docker.Config(kbc_datadir)
    parameters = cfg.get_parameters()
    # log parameters (excluding sensitive designated by '#')
    logging.info({k: v for k, v in parameters.items() if "#" not in k})
    input_filename = parameters.get("input_filename")
    # read unique product ids
    with open(f'{kbc_datadir}in/tables/{input_filename}.csv') as input_file:
        product_ids = {
            str(pid.replace('"', ''))
            # read all input file rows, except the header
            for pid in input_file.read().split(os.linesep)[1:]
            if re.match('"[0-9]+"$', pid)
        }
    return product_ids, parameters
def run(datadir):
    cfg = docker.Config(datadir)
    parameters = cfg.get_parameters()
    c_parent = parameters.get('parentColumn', 'categoryParentId')
    c_child = parameters.get('idColumn', 'categoryId')

    # get input and output table and validate them
    tables = cfg.get_input_tables()
    if len(tables) != 1:
        raise ValueError("Input mapping must contain one table only.")
    in_table = tables[0]
    tables = cfg.get_expected_output_tables()
    if len(tables) != 1:
        raise ValueError("Output mapping must contain one table only.")
    out_table = tables[0]
    # physical location of the source file with source data
    in_file_path = in_table['full_path']
    # physical location of the target file with output data
    out_file_path = out_table['full_path']

    roots = set()
    relations = {}
    with open(in_file_path, mode='rt', encoding='utf-8') as in_file:
        lazy_lines = (line.replace('\0', '') for line in in_file)
        csv_reader = csv.DictReader(lazy_lines, dialect='kbc')
        input_csv_header = csv_reader.fieldnames
        if c_child not in input_csv_header:
            raise ValueError('Column ' + c_child + ' not present in table')
        if c_parent not in input_csv_header:
            raise ValueError('Column ' + c_parent + ' not present in table')
        roots, relations, rest = parse_tree(csv_reader, c_parent, c_child)

    with open(out_file_path, mode='wt', encoding='utf-8') as out_file:
        out_csv_header = input_csv_header + ['levels', 'root']
        writer = csv.DictWriter(out_file, fieldnames=out_csv_header,
                                dialect='kbc')
        writer.writeheader()
        for child, level, root in walk_tree(roots, relations):
            base_row = {c_child: child, 'levels': level, 'root': root}
            rest_row = rest[child]
            out_row = {**base_row, **rest_row}
            writer.writerow(out_row)
def __init__(self):
    self.datadir = os.getenv('KBC_DATADIR', '/data/')
    cfg = docker.Config(self.datadir)
    parameters = cfg.get_parameters()
    # log parameters (excluding sensitive designated by '#')
    logging.info({k: v for k, v in parameters.items() if "#" not in k})
    self.previous_timestamp_filename = parameters.get(
        'previous_timestamp_filename')
    self.filename_pattern = parameters.get('filename_pattern')
    self.server = parameters.get('server')
    self.port = int(parameters.get('port'))
    self.user = parameters.get('username')
    self.password = parameters.get('#password')
    self.passphrase = parameters.get('#passphrase')
    self.rsa_key = parameters.get('#key')
    self.sftp_folder = '/upload/'
    self.files_to_process = []
    self.last_timestamp = 0
    self.previous_timestamp = 0
    (self.common_fields, self.highlighted_fields, self.cheapest_fields,
     self.mall_fields, self.constant_fields, self.observed_fields) = (
        None, None, None, None, None, None)
def test_success_run(tmpdir, datadir_and_results):
    dir_name, expected_levels, expected_roots, out_file_name = \
        datadir_and_results
    src = 'tests/data/' + dir_name
    dst = str(tmpdir.realpath()) + "/" + dir_name
    copy_tree(src, dst)
    run(dst)
    current = dst + "/out/tables/" + out_file_name
    cfg = docker.Config(dst)
    parameters = cfg.get_parameters()
    c_child = parameters.get('idColumn', 'categoryId')
    with open(current, mode='rt', encoding='utf-8') as in_file:
        lazy_lines = (line.replace('\0', '') for line in in_file)
        csv_reader = csv.DictReader(lazy_lines, dialect='kbc')
        row_count = 0
        for row in csv_reader:
            child = row[c_child]
            level = row['levels']
            root = row['root']
            assert expected_levels[child] == level
            assert expected_roots[child] == root
            row_count += 1
        assert row_count == len(expected_levels)
    job_url = request.json()['url']
    wait_for_job(token, job_url)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger()
    try:
        logging_gelf_handler = logging_gelf.handlers.GELFTCPSocketHandler(
            host=os.getenv('KBC_LOGGER_ADDR'),
            port=int(os.getenv('KBC_LOGGER_PORT')))
        # remove stdout logging when running inside keboola
        logger.removeHandler(logger.handlers[0])
    except TypeError:
        logging_gelf_handler = logging.StreamHandler()
    logging_gelf_handler.setFormatter(
        logging_gelf.formatters.GELFFormatter(null_character=True))
    logger.addHandler(logging_gelf_handler)

    kbc_datadir = os.getenv('KBC_DATADIR', '/data/')
    cfg = docker.Config(kbc_datadir)
    parameters = cfg.get_parameters()
    # log parameters (excluding sensitive designated by '#')
    logging.info({k: v for k, v in parameters.items() if "#" not in k})
    kbc_token = parameters['#token']
    for table in parameters["tables"]:
        delete_table_rows(kbc_token, table, kbc_datadir)
# imports added so the excerpt is self-contained (os, time, and the keboola
# docker module are all used below); print statements converted from the
# original Python 2 syntax
import os
import time
from datetime import date, timedelta
import csv
from keboola import docker
from pyvirtualdisplay import Display

print("Python libraries loaded.")

display = Display(visible=0, size=(1024, 768))
display.start()

print("Current Working Directory is ... " + os.getcwd())
print("Config taken from ... " + os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)) + 'data/')

# initialize KBC configuration
cfg = docker.Config(
    os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + 'data/')
# loads application parameters - user defined
parameters = cfg.get_parameters()

### PARAMETERS ####
# date
scrape_date = str(time.strftime("%Y-%m-%d"))
# mode
mode = parameters.get('Mode')
# mode = 'summary'
# mode = 'by_category'
print("Mode is ... " + mode)

### DEFINITION OF PARAMETERS ###
def load() -> dict:
    cfg = docker.Config('/data/')
    params = cfg.get_parameters()

    # check required fields
    required = ('timezone', 'date_from', 'date_to', "#private_key",
                "#client_email", "token_uri", "network_code")
    for r in required:
        if r not in params:
            raise ValueError(f'Missing required field "{r}".')

    # validate timezone type
    allowed_timezones = ('PUBLISHER', 'PROPOSAL_LOCAL', 'AD_EXCHANGE')
    if params['timezone'] not in allowed_timezones:
        raise ValueError(
            f"Invalid timezone. Choose one from {allowed_timezones}")

    # handle default dimensions
    if "dimensions" not in params:
        print("[INFO]: Dimensions field is empty -> use default")
        params['dimensions'] = DEFAULT_DIMENSIONS
    # add date column to dimensions - depends on timezone type
    if params["timezone"] in ("PUBLISHER", "PROPOSAL_LOCAL"):
        params['dimensions'].append("DATE")
    elif params["timezone"] == "AD_EXCHANGE":
        params['dimensions'].append("AD_EXCHANGE_DATE")
    print(f"[INFO]: Selected dimensions: {params['dimensions']}")

    # handle default metrics
    if "metrics" not in params:
        print("[INFO]: Metrics field is empty -> use default")
        params['metrics'] = DEFAULT_METRICS
    print(f"[INFO]: Selected metrics: {params['metrics']}")

    # parse date range
    date_from = dateparser.parse(params['date_from'])
    date_to = dateparser.parse(params['date_to'])
    if not date_from:
        raise ValueError(f"Invalid date format '{params['date_from']}'")
    if not date_to:
        raise ValueError(f"Invalid date format '{params['date_to']}'")
    params['date_from'] = date_from.date()
    params['date_to'] = date_to.date()

    # create file with private key
    key_file = "/tmp/private_key.json"
    params['private_key_file'] = Config.private_key_file(params, key_file)

    # set max retries count for retryable decorator
    if 'max_retries' not in params:
        params["max_retries"] = DEFAULT_MAX_RETRIES

    if 'dimension_attributes' in params:
        print("[INFO]: Selected dimension attributes:"
              f" {params['dimension_attributes']}")

    if 'currency' not in params:
        for metric in params['metrics']:
            if metric.startswith("AD_EXCHANGE"):
                print("[INFO]: Currency is not set, but AD_EXCHANGE metric"
                      " is present. Using CZK as default currency")
                params['currency'] = "CZK"
                break

    return params
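# An illustrative parameters object that load() above would accept; every
# value here is made up, and keys prefixed with '#' are encrypted by KBC.
# dateparser also accepts relative expressions such as "7 days ago".
example_params = {
    "timezone": "PUBLISHER",
    "date_from": "7 days ago",
    "date_to": "today",
    "#private_key": "-----BEGIN PRIVATE KEY-----...",
    "#client_email": "service-account@example.iam.gserviceaccount.com",
    "token_uri": "https://oauth2.googleapis.com/token",
    "network_code": "12345678",
}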
def test_get_data_dir(self):
    cfg = docker.Config()
    self.assertEqual(os.getenv('KBC_DATADIR', ''), cfg.get_data_dir())
def test_get_action_empty_config(self):
    cfg = docker.Config(
        os.path.join(os.getenv('KBC_DATADIR', ''), '..', 'data2'))
    action = cfg.get_action()
    self.assertEqual(action, '')
def test_get_action(self):
    cfg = docker.Config()
    action = cfg.get_action()
    self.assertEqual(action, 'test')
# imports added so the excerpt is self-contained (os, time, pandas, and
# timedelta are all used below); the original Czech comment is translated
import os
import time
from datetime import timedelta
import pandas as pd
from keboola import docker  # for communicating with parameters and input/output mapping
import warnings
import arrow

# Parameters
data_folder = '/data/'

warnings.filterwarnings("ignore", message="numpy.dtype size changed")

print("Python libraries loaded.")
print(f"Current Working Directory is ... {os.getcwd()}")
print(f"Config taken from ... {data_folder}")

# initialize KBC configuration
cfg = docker.Config(data_folder)
parameters = cfg.get_parameters()


# Get unix time of start and end date
def unix_times(start, end):
    dates = list(pd.date_range(start=start, end=end, freq='D'))
    # print(dates)
    dates.append(dates[-1] + timedelta(days=1))
    # print(dates)
    date_timetuples = [date_.timetuple() for date_ in dates]
    # print(date_timetuples)
    unix_dates = list(map(str, map(int, map(time.mktime, date_timetuples))))
    return dates, unix_dates
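# Illustrative usage of unix_times(): a three-day range yields four day
# boundaries (the end date is extended by one day) and their unix timestamps
# as strings; the exact timestamp values depend on the local timezone.
dates, unix_dates = unix_times('2019-01-01', '2019-01-03')
assert len(dates) == 4 and len(unix_dates) == 4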
def get_csv_schema(file_path: str) -> list:
    data_dir = path.realpath(
        path.join(path.dirname(file_path), '../..')) + path.sep
    return docker.Config(data_dir).get_file_manifest(file_path)['columns']
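# Illustrative call with a hypothetical path: for a table stored under
# /data/in/tables/, data_dir resolves two levels up to /data/ and the column
# list is read from the table's .manifest file.
# columns = get_csv_schema('/data/in/tables/products.csv')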
def test_get_oauthapi_appsecret(self):
    cfg = docker.Config()
    self.assertEqual(cfg.get_oauthapi_appsecret(), "myappsecret")
    datefmt="%Y-%m-%d %H:%M:%S")
"""
logger = logging.getLogger()
logging_gelf_handler = logging_gelf.handlers.GELFTCPSocketHandler(
    host=os.getenv('KBC_LOGGER_ADDR'),
    port=int(os.getenv('KBC_LOGGER_PORT'))
)
logging_gelf_handler.setFormatter(
    logging_gelf.formatters.GELFFormatter(null_character=True))
logger.addHandler(logging_gelf_handler)
# removes the initial stdout logging
logger.removeHandler(logger.handlers[0])
"""

### Access the supplied rules
cfg = docker.Config('/data/')
params = cfg.get_parameters()
client_id = params['client_id']
client_secret = params['#client_secret']
api_endpoint = params['api_endpoint']
looker_objects = params['looker_objects']
logging.info("Successfully fetched all parameters.")
# logging.debug("Fetched parameters are :" + str(params))

### Get proper list of tables
cfg = docker.Config('/data/')
in_tables = cfg.get_input_tables()
out_tables = cfg.get_expected_output_tables()
logging.info("IN tables mapped: " + str(in_tables))
def test_get_oauthapi_appkey(self):
    cfg = docker.Config()
    self.assertEqual(cfg.get_oauthapi_appkey(), "myappkey")
from datetime import datetime
import pandas as pd
import json
import os
import shutil
from keboola import docker
# BlockBlobService / BaseBlobService come from the legacy azure-storage-blob
# (<= 2.x) SDK; the exact import path depends on the installed version

in_tables_dir = '/data/in/tables/'
out_tables_dir = '/data/out/tables/'
out_data_dir = '/data/out/'
in_config_dir = '/data/'
date_col_default = 'date'
suffix_delimiter = '-'
csv_suffix = '.csv'
config_suffix = '.config'

# get KBC parameters
cfg = docker.Config(in_config_dir)
# loads application parameters - user defined
parameters = cfg.get_parameters()
account_key = parameters.get('account_key')
account_name = parameters.get('account_name')
data_container = parameters.get('data_container')
config_container = parameters.get('config_container')
date_col = parameters.get('date_col')
# when date_col is not in params, set to default value
if not date_col:
    date_col = date_col_default

block_blob_service = BlockBlobService(account_name=account_name,
                                      account_key=account_key)
base_blob_service = BaseBlobService(account_name=account_name,
def test_register_csv_dialect(self):
    docker.Config().register_csv_dialect()
    self.assertIn("kbc", csv.list_dialects())
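# A minimal sketch of why the test above matters: once register_csv_dialect()
# has run, csv readers and writers elsewhere in these components can refer to
# the "kbc" dialect by name. The file path below is hypothetical.
import csv
from keboola import docker

docker.Config('/data/').register_csv_dialect()
with open('/data/in/tables/source.csv', encoding='utf-8') as f:
    for row in csv.DictReader(f, dialect='kbc'):
        pass  # each row is a dict keyed by the header columns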