Example 1
    def __init__(self, datadir, pipeline):
        self.task_queue = pipeline
        self.datadir = datadir
        params = docker.Config(self.datadir).get_parameters()
        self.login = params.get('#login')
        self.password = params.get('#password')
        self.out_cols = ['AVAILABILITY', 'COUNTRY', 'CSE_ID', 'CSE_URL', 'DISTRCHAN', 'ESHOP', 'FREQ',
                         'HIGHLIGHTED_POSITION', 'MATERIAL', 'POSITION', 'PRICE', 'RATING',
                         'REVIEW_COUNT', 'SOURCE', 'SOURCE_ID', 'STOCK', 'TOP', 'TS', 'URL']
        self.all_cols = self.out_cols + ['DATE', 'ZBOZI_SHOP_ID', 'MATCHING_ID']
        self.export_table = 'results'
        self.daily_uploads_file = 'zbozi_products.csv'
        self.previous_df = self.load_previous_ids()

        try:
            # load next url from file, if previous run ended early
            keep_scraping = pd.read_csv(f'{self.datadir}in/tables/keep_scraping.csv', dtype=object)
            logging.debug(str(keep_scraping))
            next_url = keep_scraping.iloc[0, 0]
            if next_url and str(next_url).lower() not in ['none', 'nan', 'false']:
                self.next_url = next_url
            else:
                raise IndexError()

        except (IndexError, FileNotFoundError):
            logging.warning('No next_url, starting from scratch')
            self.next_url = '/v1/shop/items?paired=True&limit=1000&loadProductDetail=False'
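For context: the resume protocol above expects a one-column keep_scraping.csv among the input tables. Below is a minimal sketch of the counterpart write a run might perform on exit so the next run can resume; the save_checkpoint helper and the output mapping back to keep_scraping.csv are assumptions, not part of the original component.

import pandas as pd

def save_checkpoint(datadir, next_url):
    # hypothetical counterpart to the loader above: persist the next URL
    # (or None when finished) so it can be mapped back to
    # in/tables/keep_scraping.csv for the following run
    pd.DataFrame({'next_url': [next_url]}).to_csv(
        f'{datadir}out/tables/keep_scraping.csv', index=False)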
Example 2
 def test_empty_storage(self):
     cfg = docker.Config(
         os.path.join(os.getenv('KBC_DATADIR', ''), '..', 'data2'))
     self.assertEqual(cfg.get_expected_output_tables(), [])
     self.assertEqual(cfg.get_expected_output_files(), [])
     self.assertEqual(cfg.get_input_tables(), [])
     self.assertEqual(cfg.get_input_files(), [])
     self.assertEqual(cfg.get_parameters(), {})
Example 3
 def test_get_file_manifest(self):
     cfg = docker.Config()
     files = cfg.get_input_files()
     file1 = cfg.get_file_manifest(files[0])
     self.assertEqual(151971405, file1['id'])
     self.assertEqual('21702.strip.print.gif', file1['name'])
     self.assertEqual(['dilbert'], file1['tags'])
     file2 = cfg.get_file_manifest('151971405_21702.strip.print.gif')
     self.assertEqual(file1, file2)
Example 4
 def test_table_manifest_error_column_delete_2(self):
     cfg = docker.Config()
     some_file = os.path.join(
         tempfile.mkdtemp('kbc-test'), 'some-table.csv')
     with self.assertRaises(TypeError):
         cfg.write_table_manifest(some_file,
                                  delete_where={
                                      "column": "a",
                                      "values": "b"
                                  })
Example 5
 def test_table_manifest_minimal(self):
     cfg = docker.Config()
     some_file = os.path.join(
         tempfile.mkdtemp('kbc-test'), 'some-table.csv')
     cfg.write_table_manifest(some_file, primary_key=['foo', 'bar'])
     manifest_filename = some_file + '.manifest'
     with open(manifest_filename) as manifest_file:
         config = json.load(manifest_file)
     self.assertEqual({'primary_key': ['foo', 'bar']}, config)
     os.remove(manifest_filename)
Example 6
 def test_get_parameters(self):
     cfg = docker.Config()
     params = cfg.get_parameters()
     self.assertEqual({
         'fooBar': {
             'bar': 24,
             'foo': 42
         },
         'baz': 'bazBar'
     }, params)
     self.assertEqual(params['fooBar']['foo'], 42)
     self.assertEqual(params['fooBar']['bar'], 24)
Example 7
    def test_get_input_tables(self):
        cfg = docker.Config()
        tables = cfg.get_input_tables()

        self.assertEqual(len(tables), 2)
        for table in tables:
            if table['destination'] == 'sample.csv':
                self.assertEqual(table['source'], 'in.c-main.test')
                self.assertTrue(os.path.isfile(table['full_path']))
            else:
                self.assertEqual('in.c-main.test2', table['source'])
                self.assertTrue(os.path.isfile(table['full_path']))
Example 8
    def __init__(self, mandatory_params, data_path=None):
        # fetch data folder from ENV by default
        if not data_path:
            data_path = os.environ.get('KBC_DATADIR')

        self.kbc_config_id = os.environ.get('KBC_CONFIGID')
        
        self.data_path = data_path
        self.configuration = docker.Config(data_path)
        self.cfg_params = self.configuration.get_parameters()
        self.tables_out_path = os.path.join(data_path, 'out', 'tables')
        self.tables_in_path = os.path.join(data_path, 'in', 'tables')

        self._mandatory_params = mandatory_params
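The mandatory_params argument implies a validation step that is not shown in this excerpt. A minimal sketch of what such a check could look like; the validate_config method is a hypothetical addition, not the component's actual code.

    def validate_config(self):
        # hypothetical check: fail fast when a required parameter is missing
        missing = [p for p in self._mandatory_params
                   if p not in self.cfg_params]
        if missing:
            raise ValueError(f'Missing mandatory parameters: {missing}')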
Example 9
 def __init__(self):
     self.utctime_started_datetime = datetime.datetime.utcnow()
     self.datadir = os.getenv("KBC_DATADIR", "/data/")
     cfg = docker.Config(self.datadir)
     self.parameters = cfg.get_parameters()
     # log parameters (excluding sensitive designated by '#')
     logging.info({k: v for k, v in self.parameters.items() if "#" not in k})
     self.wanted_columns = self.parameters.get("wanted_columns")
     self.columns_mapping = self.parameters.get("columns_mapping")
     self.allowed_file_patterns = self.parameters.get("allowed_file_patterns")
     self.forbidden_file_patterns = self.parameters.get("forbidden_file_patterns")
     self.last_timestamp_filename = self.parameters.get("last_timestamp_filename")
     self.input_filelist_filename = self.parameters.get("input_filelist_filename")
     self.input_fileset = {}
     self.files_to_process = []
     self.last_processed_timestamp = None
     self.max_timestamp_this_run_tz = None
     self.max_timestamp_this_run = None
Example 10
 def test_file_manifest(self):
     cfg = docker.Config()
     some_file = os.path.join(tempfile.mkdtemp('kbc-test'), 'someFile.txt')
     cfg.write_file_manifest(some_file,
                             file_tags=['foo', 'bar'],
                             is_public=True,
                             is_permanent=False,
                             notify=True)
     manifest_filename = some_file + '.manifest'
     with open(manifest_filename) as manifest_file:
         config = json.load(manifest_file)
     self.assertEqual(
         {
             'is_public': True,
             'is_permanent': False,
             'notify': True,
             'tags': ['foo', 'bar']
         }, config)
     os.remove(manifest_filename)
Example 11
 def test_table_manifest_full(self):
     cfg = docker.Config()
     some_file = os.path.join(
         tempfile.mkdtemp('kbc-test'), 'some-table.csv')
     cfg.write_table_manifest(some_file,
                              columns=['foo', 'bar'],
                              destination='some-destination',
                              primary_key=['foo'],
                              incremental=True,
                              metadata={'bar': 'kochba'},
                              column_metadata={'bar': {
                                  'foo': 'gogo'
                              }},
                              delete_where={
                                  'column': 'lilly',
                                  'values': ['a', 'b'],
                                  'operator': 'eq'
                              })
     manifest_filename = some_file + '.manifest'
     with open(manifest_filename) as manifest_file:
         config = json.load(manifest_file)
     self.assertEqual(
         {
             'destination': 'some-destination',
             'columns': ['foo', 'bar'],
             'primary_key': ['foo'],
             'incremental': True,
             'metadata': [{
                 'key': 'bar',
                 'value': 'kochba'
             }],
             'column_metadata': {
                 'bar': [{
                     'key': 'foo',
                     'value': 'gogo'
                 }]
             },
             'delete_where_column': 'lilly',
             'delete_where_values': ['a', 'b'],
             'delete_where_operator': 'eq'
         }, config)
     os.remove(manifest_filename)
Example 12
def run(datadir):
    cfg = kbc_py.Config(datadir)

    datadir_path = Path(datadir)
    in_base_path = datadir_path / 'in/files'
    out_base_path = datadir_path / 'out/files'

    params = validate_expand_defaults(cfg.get_parameters())
    print("Datadir: " + str(list(str(d) for d in datadir_path.glob("**"))))

    output_params = params["output"]
    feature_format = feature_output_formats[output_params["featureFormat"]]
    include_additional_fields = output_params["includeAdditionalColumns"]

    input_format_params = params["input"]["format"]
    for format_name, format_params in input_format_params.items():
        in_format = input_formats[format_name]
        enabled = format_params["enabled"]
        glob_pattern = format_params["glob"]
        if not enabled:
            continue

        matching_files = list(in_base_path.glob(glob_pattern))
        print(f"Files matching {glob_pattern} in {in_base_path}: "
              f"{[str(f) for f in matching_files]}")

        for full_in_path in matching_files:
            relative_path = Path(full_in_path).relative_to(in_base_path)
            target_relative_path = relative_path.with_suffix(".csv")

            full_out_path = out_base_path / target_relative_path

            print(f"Converting {relative_path} (as {format_name}) "
                  f"to {target_relative_path}")

            full_out_path.parent.mkdir(parents=True, exist_ok=True)
            with open(str(full_out_path), mode="wt", encoding="utf-8") as out:
                convert(str(full_in_path),
                        out,
                        in_format,
                        feature_format,
                        include_additional_fields)
Example 13
def run(datadir):
    cfg = docker.Config(datadir)
    parameters = cfg.get_parameters()
    print("Hello World!")
    print(parameters)
    in_path = datadir + '/in/tables/source.csv'
    out_path = datadir + '/out/tables/destination.csv'
    with open(in_path, mode='rt', encoding='utf-8') as in_file, \
            open(out_path, mode='wt', encoding='utf-8') as out_file:
        lazy_lines = (line.replace('\0', '') for line in in_file)
        reader = csv.DictReader(lazy_lines, dialect='kbc')
        writer = csv.DictWriter(out_file,
                                dialect='kbc',
                                fieldnames=reader.fieldnames)
        writer.writeheader()
        for row in reader:
            writer.writerow({
                'id': int(row['id']) * 42,
                'sound': row['sound'] + 'ping'
            })
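Note that both the reader and the writer above use a csv dialect named 'kbc'. The library provides a helper that registers it (see the last example in this section), so a run would typically be preceded by something like:

cfg = docker.Config(datadir)
cfg.register_csv_dialect()  # makes the 'kbc' dialect available to the csv module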
Example 14
def parse_configs():
    kbc_datadir = os.getenv("KBC_DATADIR", "/data/")
    cfg = docker.Config(kbc_datadir)
    parameters = cfg.get_parameters()

    # log parameters (excluding sensitive designated by '#')
    logging.info({k: v for k, v in parameters.items() if "#" not in k})

    input_filename = parameters.get("input_filename")

    # read unique product ids
    with open(f'{kbc_datadir}in/tables/{input_filename}.csv') as input_file:
        product_ids = {
            pid.replace('"', '')
            for pid
            # read all input file rows, except the header; splitlines()
            # matches text-mode newline translation (os.linesep does not)
            in input_file.read().splitlines()[1:]
            if re.match('"[0-9]+"$', pid)
        }
    return product_ids, parameters
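To illustrate the filter above: only rows consisting of a quoted integer survive the regex, and the quotes are then stripped. A self-contained check with made-up sample rows:

import re

sample_rows = ['"product_id"', '"12345"', 'abc', '"67890"']
ids = {pid.replace('"', '') for pid in sample_rows[1:]
       if re.match('"[0-9]+"$', pid)}
assert ids == {'12345', '67890'}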
Example 15
def run(datadir):
    cfg = docker.Config(datadir)
    parameters = cfg.get_parameters()
    c_parent = parameters.get('parentColumn', 'categoryParentId')
    c_child = parameters.get('idColumn', 'categoryId')

    # get input and output table and validate them
    tables = cfg.get_input_tables()
    if len(tables) != 1:
        raise ValueError("Input mapping must contain one table only.")
    in_table = tables[0]
    tables = cfg.get_expected_output_tables()
    if len(tables) != 1:
        raise ValueError("Output mapping must contain one table only.")
    out_table = tables[0]
    # physical location of the source file with source data
    in_file_path = in_table['full_path']
    # physical location of the target file with output data
    out_file_path = out_table['full_path']
    roots = set()
    relations = {}
    with open(in_file_path, mode='rt', encoding='utf-8') as in_file:
        lazy_lines = (line.replace('\0', '') for line in in_file)
        csv_reader = csv.DictReader(lazy_lines, dialect='kbc')
        input_csv_header = csv_reader.fieldnames
        if c_child not in input_csv_header:
            raise ValueError('Column ' + c_child + ' not present in table')
        if c_parent not in input_csv_header:
            raise ValueError('Column ' + c_parent + ' not present in table')
        roots, relations, rest = parse_tree(csv_reader, c_parent, c_child)

    with open(out_file_path, mode='wt', encoding='utf-8') as out_file:
        out_csv_header = input_csv_header + ['levels', 'root']
        writer = csv.DictWriter(out_file, fieldnames=out_csv_header,
                                dialect='kbc')
        writer.writeheader()
        for child, level, root in walk_tree(roots, relations):
            base_row = {c_child: child, 'levels': level, 'root': root}
            rest_row = rest[child]
            out_row = {**base_row, **rest_row}
            writer.writerow(out_row)
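parse_tree and walk_tree are not part of this excerpt; the sketch below reconstructs their apparent contracts from the call sites above. The bodies, and in particular the level convention, are assumptions rather than the component's actual implementation.

def parse_tree(csv_reader, c_parent, c_child):
    # assumed contract: roots is the set of ids without a parent,
    # relations maps child id -> parent id, and rest keeps each row's
    # remaining columns keyed by child id
    roots, relations, rest = set(), {}, {}
    for row in csv_reader:
        child, parent = row[c_child], row[c_parent]
        rest[child] = {k: v for k, v in row.items() if k != c_child}
        if parent:
            relations[child] = parent
        else:
            roots.add(child)
    return roots, relations, rest


def walk_tree(roots, relations):
    # assumed contract: yield (child, level, root) for every node,
    # where level counts the hops from the node's root
    for node in roots:
        yield node, 0, node
    for child in relations:
        level, node = 0, child
        while node in relations:
            node = relations[node]
            level += 1
        yield child, level, node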
Example 16
 def __init__(self):
     self.datadir = os.getenv('KBC_DATADIR', '/data/')
     cfg = docker.Config(self.datadir)
     parameters = cfg.get_parameters()
     # log parameters (excluding sensitive designated by '#')
     logging.info({k: v for k, v in parameters.items() if "#" not in k})
     self.previous_timestamp_filename = parameters.get(
         'previous_timestamp_filename')
     self.filename_pattern = parameters.get('filename_pattern')
     self.server = parameters.get('server')
     self.port = int(parameters.get('port'))
     self.user = parameters.get('username')
     self.password = parameters.get('#password')
     self.passphrase = parameters.get('#passphrase')
     self.rsa_key = parameters.get('#key')
     self.sftp_folder = '/upload/'
     self.files_to_process = []
     self.last_timestamp = 0
     self.previous_timestamp = 0
     (self.common_fields, self.highlighted_fields, self.cheapest_fields,
      self.mall_fields, self.constant_fields,
      self.observed_fields) = None, None, None, None, None, None
Example 17
def test_success_run(tmpdir, datadir_and_results):
    dir_name, expected_levels, expected_roots, out_file_name = \
        datadir_and_results
    src = 'tests/data/' + dir_name
    dst = str(tmpdir.realpath()) + "/" + dir_name
    copy_tree(src, dst)
    run(dst)
    current = dst + "/out/tables/" + out_file_name
    cfg = docker.Config(dst)
    parameters = cfg.get_parameters()
    c_child = parameters.get('idColumn', 'categoryId')
    with open(current, mode='rt', encoding='utf-8') as in_file:
        lazy_lines = (line.replace('\0', '') for line in in_file)
        csv_reader = csv.DictReader(lazy_lines, dialect='kbc')
        row_count = 0
        for row in csv_reader:
            child = row[c_child]
            level = row['levels']
            root = row['root']
            assert expected_levels[child] == level
            assert expected_roots[child] == root
            row_count = row_count + 1
        assert row_count == len(expected_levels)
Example 18
    job_url = request.json()['url']
    wait_for_job(token, job_url)


if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger()
    try:
        logging_gelf_handler = logging_gelf.handlers.GELFTCPSocketHandler(
            host=os.getenv('KBC_LOGGER_ADDR'),
            port=int(os.getenv('KBC_LOGGER_PORT')))
        # remove stdout logging when running inside keboola
        logger.removeHandler(logger.handlers[0])
    except TypeError:
        logging_gelf_handler = logging.StreamHandler()

    logging_gelf_handler.setFormatter(
        logging_gelf.formatters.GELFFormatter(null_character=True))
    logger.addHandler(logging_gelf_handler)

    kbc_datadir = os.getenv('KBC_DATADIR', '/data/')
    cfg = docker.Config(kbc_datadir)
    parameters = cfg.get_parameters()
    # log parameters (excluding sensitive designated by '#')
    logging.info({k: v for k, v in parameters.items() if "#" not in k})

    kbc_token = parameters['#token']

    for table in parameters["tables"]:
        delete_table_rows(kbc_token, table, kbc_datadir)
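wait_for_job is referenced above but not shown. A minimal polling sketch, assuming the job detail URL returns JSON with a status field and that the X-StorageApi-Token header carries the token; both are assumptions about the API shape.

import time
import requests

def wait_for_job(token, job_url, poll_seconds=5):
    # hypothetical poller; terminal states assumed to be 'success'/'error'
    while True:
        job = requests.get(
            job_url, headers={'X-StorageApi-Token': token}).json()
        if job.get('status') in ('success', 'error'):
            return job
        time.sleep(poll_seconds)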
Example 19
import os
import time
from datetime import date, timedelta
import csv
from keboola import docker
from pyvirtualdisplay import Display

print("Python libraries loaded.")

display = Display(visible=0, size=(1024, 768))
display.start()

print "Current Working Directory is ... " + os.getcwd()

print "Config taken from ... " + os.path.abspath(
    os.path.join(os.getcwd(), os.pardir)) + 'data/'

# initialize KBC configuration
cfg = docker.Config(
    os.path.abspath(os.path.join(os.getcwd(), os.pardir)) + 'data/')
# loads application parameters - user defined
parameters = cfg.get_parameters()

### PARAMETERS ###

# date
scrape_date = str(time.strftime("%Y-%m-%d"))

# mode
mode = parameters.get('Mode')
# mode = 'summary'
# mode = 'by_category'
print("Mode is ... " + mode)

### DEFINITION OF PARAMETERS ###
Example 20
    def load() -> dict:
        cfg = docker.Config('/data/')
        params = cfg.get_parameters()

        # check required fields
        required = ('timezone', 'date_from', 'date_to', "#private_key",
                    "#client_email", "token_uri", "network_code")
        for r in required:
            if r not in params:
                raise ValueError(f'Missing required field "{r}".')

        # validate timezone type
        allowed_timezones = ('PUBLISHER', 'PROPOSAL_LOCAL', 'AD_EXCHANGE')
        if params['timezone'] not in allowed_timezones:
            raise ValueError(
                f"Invalid timezone. Choose one from {allowed_timezones}")

        # handle default dimensions
        if "dimensions" not in params:
            print("[INFO]: Dimensions field is empty -> use default")
            params['dimensions'] = DEFAULT_DIMENSIONS

            # add date column to dimensions - depends on timezone type
            if params["timezone"] in ("PUBLISHER", "PROPOSAL_LOCAL"):
                params['dimensions'].append("DATE")
            elif params["timezone"] == "AD_EXCHANGE":
                params['dimensions'].append("AD_EXCHANGE_DATE")

        print(f"[INFO]: Selected dimensions: {params['dimensions']}")

        # handle default metrics
        if "metrics" not in params:
            print("[INFO]:Metrics field is empty -> use default")
            params['metrics'] = DEFAULT_METRICS

        print(f"[INFO]: Selected metrics: {params['metrics']}")

        # parse date range
        date_from = dateparser.parse(params['date_from'])
        date_to = dateparser.parse(params['date_to'])

        if not date_from:
            raise ValueError(f"Invalid date format '{params['date_from']}'")

        if not date_to:
            raise ValueError(f"Invalid date format '{params['date_to']}'")

        params['date_from'] = date_from.date()
        params['date_to'] = date_to.date()

        # create file with private key
        key_file = "/tmp/private_key.json"
        params['private_key_file'] = Config.private_key_file(params, key_file)

        # set max retries count for retryable decorator
        if 'max_retries' not in params:
            params["max_retries"] = DEFAULT_MAX_RETRIES

        if 'dimension_attributes' in params:
            print("[INFO]: Selected dimension attributes:"
                  f" {params['dimension_attributes']}")

        if 'currency' not in params:
            for metric in params['metrics']:
                if metric.startswith("AD_EXCHANGE"):
                    print("[INFO]: Currency is not set, but AD_EXCHANGE metric"
                          " is present. Using CZK as default currency")
                    params['currency'] = "CZK"
                    break

        return params
Example 21
 def test_get_data_dir(self):
     cfg = docker.Config()
     self.assertEqual(os.getenv('KBC_DATADIR', ''), cfg.get_data_dir())
Example 22
 def test_get_action_empty_config(self):
     cfg = docker.Config(
         os.path.join(os.getenv('KBC_DATADIR', ''), '..', 'data2'))
     action = cfg.get_action()
     self.assertEqual(action, '')
Example 23
 def test_get_action(self):
     cfg = docker.Config()
     action = cfg.get_action()
     self.assertEqual(action, 'test')
Example 24
import os
import time
from datetime import timedelta
import pandas as pd
from keboola import docker  # for working with parameters and input/output mapping
import warnings
import arrow

# Parameters
data_folder = '/data/'

warnings.filterwarnings("ignore", message="numpy.dtype size changed")

print("Python libraries loaded.")

print(f"Current Working Directory is ... {os.getcwd()}")
print(f"Config taken from ... {data_folder}")

# initialize KBC configuration
cfg = docker.Config(data_folder)
parameters = cfg.get_parameters()


# Get unix time of start and end date
def unix_times(start, end):
    dates = list(pd.date_range(start=start, end=end, freq='D'))
    # include one day past the end date
    dates.append(dates[-1] + timedelta(days=1))
    date_timetuples = [date_.timetuple() for date_ in dates]
    unix_dates = list(map(str, map(int, map(time.mktime, date_timetuples))))
    return dates, unix_dates
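A quick usage sketch of the helper above; the returned lists hold one entry more than the requested range because of the appended trailing day:

dates, unix_dates = unix_times('2021-01-01', '2021-01-03')
print(len(dates), len(unix_dates))  # 4 4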

Example 25
def get_csv_schema(file_path: str) -> list:
    data_dir = path.realpath(path.join(path.dirname(file_path),
                                       '../..')) + path.sep
    return docker.Config(data_dir).get_file_manifest(file_path)['columns']
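A hedged usage example: for a file two levels below the data folder, the helper resolves the data directory and reads the column list from the file's manifest. The path below is hypothetical, and the file is expected to have a manifest carrying a 'columns' key.

columns = get_csv_schema('/data/in/tables/products.csv')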
Example 26
 def test_get_oauthapi_appsecret(self):
     cfg = docker.Config()
     self.assertEqual(cfg.get_oauthapi_appsecret(), "myappsecret")
Example 27
    datefmt="%Y-%m-%d %H:%M:%S")
"""
logger = logging.getLogger()
logging_gelf_handler = logging_gelf.handlers.GELFTCPSocketHandler(
    host=os.getenv('KBC_LOGGER_ADDR'),
    port=int(os.getenv('KBC_LOGGER_PORT'))
    )
logging_gelf_handler.setFormatter(logging_gelf.formatters.GELFFormatter(null_character=True))
logger.addHandler(logging_gelf_handler)

# removes the initial stdout logging
logger.removeHandler(logger.handlers[0])
"""

### Access the supplied rules
cfg = docker.Config('/data/')
params = cfg.get_parameters()
client_id = params['client_id']
client_secret = params['#client_secret']
api_endpoint = params['api_endpoint']
looker_objects = params['looker_objects']

logging.info("Successfully fetched all parameters.")

#logging.debug("Fetched parameters are :" + str(params))

### Get proper list of tables
cfg = docker.Config('/data/')
in_tables = cfg.get_input_tables()
out_tables = cfg.get_expected_output_tables()
logging.info("IN tables mapped: " + str(in_tables))
Example 28
 def test_get_oauthapi_appkey(self):
     cfg = docker.Config()
     self.assertEqual(cfg.get_oauthapi_appkey(), "myappkey")
Example 29
from datetime import datetime
import pandas as pd
import json
import os, shutil
from keboola import docker
# blob-service imports assumed from the azure-storage-blob 2.x SDK
from azure.storage.blob import BlockBlobService
from azure.storage.blob.baseblobservice import BaseBlobService

in_tables_dir = '/data/in/tables/'
out_tables_dir = '/data/out/tables/'
out_data_dir = '/data/out/'
in_config_dir = '/data/'
date_col_default = 'date'
suffix_delimiter = '-'
csv_suffix = '.csv'
config_suffix = '.config'

# get KBC parameters
cfg = docker.Config(in_config_dir)
# loads application parameters - user defined
parameters = cfg.get_parameters()
account_key = parameters.get('account_key')
account_name = parameters.get('account_name')
data_container = parameters.get('data_container')
config_container = parameters.get('config_container')
date_col = parameters.get('date_col')

# when date_col is not in params, set to default value
if not date_col:
    date_col = date_col_default

block_blob_service = BlockBlobService(account_name=account_name,
                                      account_key=account_key)
base_blob_service = BaseBlobService(account_name=account_name,
Example 30
 def test_register_csv_dialect(self):
     docker.Config().register_csv_dialect()
     self.assertIn("kbc", csv.list_dialects())