def process_file(
        file_path, parser, handle_package, show_progress=False):
    """
    Process a single file with specified plugin.
    """
    if not os.path.exists(file_path):
        raise Exception("Could not find file '{0}'".format(file_path))

    _directory, filename = os.path.split(file_path)

    with open(file_path) as data_file:
        stop_event = threading.Event()
        condition = compose(not_, stop_event.is_set)

        if show_progress:
            start_progress_reporter(data_file, condition)

        try:
            for package in parser.packages(data_file, filename):
                handle_package(package)
        except DataError as exc:
            raise ParseError("{0!s} at position {1:d}".format(
                exc, data_file.tell()))
        except Exception:
            stack_trace = traceback.format_exc()
            position = data_file.tell()
            message = "{0} at position {1:d}".format(stack_trace, position)
            raise Exception(message)
        finally:
            stop_event.set()
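The `compose(not_, stop_event.is_set)` call builds the keep-running predicate for the progress reporter: it stays true until the event is set in the `finally` block. A minimal sketch of that behaviour, assuming a right-to-left `compose` (the order pinned down by `test_compose` further down this page):

import threading
from functools import reduce
from operator import not_


def compose(*fs):
    # right-to-left: compose(f, g)(x) == f(g(x))
    return reduce(lambda f, g: lambda *a, **kw: f(g(*a, **kw)), fs)


stop_event = threading.Event()
condition = compose(not_, stop_event.is_set)

assert condition() is True    # event not set: the progress reporter keeps running
stop_event.set()
assert condition() is False   # event set: the reporter's loop condition goes false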
Example #3
def load_csv(profile, csv_file):
    """
    Return tuple (column_names, data_rows).

    column_names - a list with selected column names
    data_rows - an iterator over row tuples (dn, timestamp, values)
    """
    csv_reader = csv.reader(csv_file, dialect=profile.dialect(csv_file))

    header = next(csv_reader)

    fields = profile.field_selector(header)
    values = ValuesExtractor(fields)

    check_header(header, fields)

    header_checks = [
        check for check in
        [
            profile.timestamp.header_check(),
            profile.identifier.header_check()
        ]
        if check is not None
    ]

    for check in header_checks:
        check(header)

    record_checks = [
        check for check in
        [
            profile.timestamp.record_check(),
            profile.identifier.record_check()
        ]
        if check is not None
    ]

    include_record = partial(record_passes_checks, record_checks)

    include_row = create_row_check(header)

    records = filter(
        include_record,
        (
            dict(zip(header, row))
            for line_nr, row in enumerate(csv_reader)
            if include_row(line_nr, row)
        )
    )

    extract_raw_data_row = compose(tuple, raw_data_row_extractor(
        profile.identifier.from_record,
        profile.timestamp.from_record,
        values.from_record
    ))

    return fields, map(extract_raw_data_row, records)
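The `records` pipeline above is lazy end to end: a generator pairs each CSV row with the header to build a dict, and `filter` drops the dicts that fail the record checks. A stripped-down version of that pattern; the identifier check and the body of `record_passes_checks` are assumptions standing in for the real profile machinery:

import csv
import io
from functools import partial

csv_file = io.StringIO(
    "dn,timestamp,counter_a\n"
    "Cell=1,2012-12-11T13:15:00,42\n"
    ",2012-12-11T13:30:00,7\n"
)
csv_reader = csv.reader(csv_file)
header = next(csv_reader)

# hypothetical record check: the identifier column must not be empty
record_checks = [lambda record: bool(record["dn"])]


def record_passes_checks(checks, record):
    # assumed semantics: keep a record only if every check accepts it
    return all(check(record) for check in checks)


include_record = partial(record_passes_checks, record_checks)

records = filter(
    include_record,
    (dict(zip(header, row)) for row in csv_reader)
)

print(list(records))   # only the row with a non-empty 'dn' survives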
Example #4
    def execute(self):
        datasource_name = self.description["datasource"]

        try:
            datasource = get_datasource(self.minerva_context.writer_conn, datasource_name)
        except NoSuchDataSourceError:
            raise HarvestError("no datasource with name '{}'".format(datasource_name))

        parser_config = self.description.get("parser_config", {})
        uri = self.description["uri"]

        update_existence = parser_config.get("update_existence", None)

        datatype = self.description["datatype"]

        try:
            plugin = self.plugins[datatype]
        except KeyError:
            raise HarvestError("could not load parser plugin '{}'".format(datatype))

        storagetype = plugin.storagetype()

        try:
            storage_provider = self.minerva_context.storage_providers[storagetype]
        except KeyError:
            raise HarvestError("could not load '{}' storage provider plugin".format(storagetype))

        dispatch_raw_datapackage = partial(storage_provider.store_raw, datasource)

        if update_existence:
            dispatch_raw_datapackage = partial(dispatch_raw_and_mark_existing,
                    dispatch_raw_datapackage, update_existence,
                    self.existence.mark_existing)

        dispatch_raw = compose(dispatch_raw_datapackage, storage_provider.RawDataPackage)

        parser = plugin.create_parser(dispatch_raw, parser_config)

        encoding = self.description.get("encoding", "utf-8")

        datastream = open_uri(uri, encoding)

        logging.debug("opened uri '{}'".format(uri))

        try:
            parser.parse(datastream, os.path.basename(uri))
        except Exception as exc:
            stacktrace = traceback.format_exc()

            execute_action(uri, self.description.get("on_failure", DEFAULT_ACTION))

            raise JobError(stacktrace)
        else:
            execute_action(uri, self.description.get("on_success", DEFAULT_ACTION))

        if update_existence:
            self.existence.flush(datetime.now())
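The interesting wiring here is `dispatch_raw`: `partial` pins the datasource onto the store function, and `compose` chains the package constructor in front of it, so the parser only ever calls a single callable. A schematic sketch of that pattern with toy stand-ins for the storage provider API (not the real minerva classes):

from functools import partial, reduce


def compose(*fs):
    # right-to-left: compose(f, g)(x) == f(g(x))
    return reduce(lambda f, g: lambda *a, **kw: f(g(*a, **kw)), fs)


class RawDataPackage:
    """Toy stand-in for storage_provider.RawDataPackage."""
    def __init__(self, rows):
        self.rows = rows


def store_raw(datasource, package):
    """Toy stand-in for storage_provider.store_raw."""
    print("storing {} row(s) for datasource '{}'".format(len(package.rows), datasource))


dispatch_raw_datapackage = partial(store_raw, "pm-data")
dispatch_raw = compose(dispatch_raw_datapackage, RawDataPackage)

# the parser would call this once per parsed package
dispatch_raw([("Cell=1", 42)])   # storing 1 row(s) for datasource 'pm-data'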
Example #5
def load_csv(profile, csv_file):
    """
    Return tuple (column_names, data_rows).

    column_names - a list with selected column names
    data_rows - an iterator over row tuples (dn, timestamp, values)
    """
    csv_reader = create_csv_reader(profile, csv_file)

    header = next(csv_reader)

    fields = profile.field_selector(header)
    values = ValuesExtractor(fields)

    check_header(header, fields)

    header_checks = [
        check for check in
        [
            profile.timestamp.header_check(),
            profile.identifier.header_check()
        ]
        if check is not None
    ]

    for check in header_checks:
        check(header)

    record_checks = [
        check for check in
        [
            profile.timestamp.record_check(),
            profile.identifier.record_check()
        ]
        if check is not None
    ]

    include_record = partial(record_passes_checks, record_checks)

    include_row = create_row_check(header)

    records = filter(
        include_record,
        (
            dict(zip(header, [item.decode('utf-8') for item in row]))
            for line_nr, row in enumerate(csv_reader)
            if profile.ignore_field_mismatches or include_row(line_nr, row)
        )
    )

    extract_raw_data_row = compose(tuple, raw_data_row_extractor(
        profile.identifier.from_record,
        profile.timestamp.from_record,
        values.from_record
    ))

    return fields, map(extract_raw_data_row, records)
Example #6
    def _connect(self):
        handler_map = {
            psycopg2.OperationalError:
            lambda exc: logging.error(
                "could not connect to database ({}), waiting".format(exc))
        }

        retry_condition = compose(not_, self.stop_event.is_set)

        return retry_while(self.connect_fn, handler_map, retry_condition)
Example #8
    def __init__(self, template, regex):
        self.template = template
        self.regex = regex

        self.fields = re.findall("{([^}]+)}", template)

        #composed identifier (e.g. '{fld1}-{fld2}, {fld1}:{fld2}')
        get_identifier = expand_kwargs(template.format)

        extract_ident = partial(extract_identifier, regex)

        self.record_to_dn = compose(extract_ident, get_identifier)
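`record_to_dn` is again a two-step pipeline: fill the template from a record dict, then pull the distinguished name out of the formatted string with the regex. A toy walk-through under stated assumptions; the `expand_kwargs` and `extract_identifier` bodies below, plus the template and pattern, are guesses for illustration, not the real helpers:

import re
from functools import partial, reduce


def compose(*fs):
    # right-to-left: compose(f, g)(x) == f(g(x))
    return reduce(lambda f, g: lambda *a, **kw: f(g(*a, **kw)), fs)


def expand_kwargs(func):
    # assumption: adapt func(**kwargs) so it can be called with a record dict
    return lambda record: func(**record)


def extract_identifier(regex, text):
    # assumption: return the part of `text` matched by `regex`
    return re.search(regex, text).group(0)


template = "{region}-{cell}"   # hypothetical identifier template
regex = r"\w+-\d+"             # hypothetical identifier pattern

get_identifier = expand_kwargs(template.format)
extract_ident = partial(extract_identifier, regex)
record_to_dn = compose(extract_ident, get_identifier)

print(record_to_dn({"region": "North", "cell": "12"}))   # North-12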
Example #9
    def execute(self, cursor, state):
        partition = self.partition(state)
        datapackage = self.datapackage(state)

        try:
            try:
                store_batch_insert(cursor, partition.table(), datapackage,
                        state["modified"])
            except Exception as exc:
                logging.debug("exception: {}".format(type(exc).__name__))
                raise  # bare raise preserves the original traceback
        except NoSuchTable:
            data_types = compose(DataPackage.deduce_data_types, self.datapackage)
            trend_names = compose(attrgetter("trend_names"), self.datapackage)
            fix = CreatePartition(self.partition, trend_names, data_types)
            return insert_before(fix)
        except NoSuchColumnError:
            data_types = compose(DataPackage.deduce_data_types, self.datapackage)
            trend_names = compose(attrgetter("trend_names"), self.datapackage)
            fix = CheckColumnsExist(self.partition, trend_names, data_types)
            return insert_before(fix)
        except UniqueViolation:
            fix = Update(self.partition, self.datapackage)
            return replace(fix)
        except DataTypeMismatch:
            data_types = compose(DataPackage.deduce_data_types, self.datapackage)
            trend_names = compose(attrgetter("trend_names"), self.datapackage)
            fix = CheckColumnTypes(self.partition, trend_names, data_types)
            return insert_before(fix)
Example #10
    def execute(self, cursor, state):
        partition = self.partition(state)
        datapackage = self.datapackage(state)

        try:
            store_copy_from(cursor, partition.table(), datapackage,
                    state["modified"])
        except NoCopyInProgress:
            return no_op
        except NoSuchTable:
            data_types = compose(DataPackage.deduce_data_types, self.datapackage)
            trend_names = compose(attrgetter("trend_names"), self.datapackage)
            fix = CreatePartition(self.partition, trend_names, data_types)
            return insert_before(fix)
        except NoSuchColumnError:
            data_types = compose(DataPackage.deduce_data_types, self.datapackage)
            trend_names = compose(attrgetter("trend_names"), self.datapackage)
            fix = CheckColumnsExist(self.partition, trend_names, data_types)
            return insert_before(fix)
        except UniqueViolation:
            fix = Update(self.partition, self.datapackage)
            return replace(fix)
        except DataTypeMismatch:
            data_types = compose(DataPackage.deduce_data_types, self.datapackage)
            trend_names = compose(attrgetter("trend_names"), self.datapackage)
            fix = CheckColumnTypes(self.partition, trend_names, data_types)
            return insert_before(fix)
Example #11
def test_compose():
    composed = compose(add_one, times_two, add_one, add_one)

    assert_equal(composed(1), 7)
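This test pins down the order of application: the rightmost function runs first, so 1 becomes 2, 3, 6 and finally 7. A minimal implementation consistent with that behaviour (a sketch only; minerva's own `compose` may be written differently):

from functools import reduce


def compose(*functions):
    """Right-to-left composition: compose(f, g)(x) == f(g(x))."""
    return reduce(
        lambda f, g: lambda *args, **kwargs: f(g(*args, **kwargs)),
        functions
    )


def add_one(x):
    return x + 1


def times_two(x):
    return x * 2


assert compose(add_one, times_two, add_one, add_one)(1) == 7
assert compose(times_two, add_one)(2) == 6   # matches test_compose_pair below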
Example #12
        }

    @classmethod
    def from_dict(cls, d):
        """Return DataPackage constructed from the dictionary."""
        return cls(
            attribute_names=d["attribute_names"],
            rows=d["rows"]
        )


snd = itemgetter(1)

types_from_values = partial(map, datatype.deduce_from_value)

row_to_types = compose(types_from_values, itemgetter(2))


def create_copy_from_line(data_types, row):
    """Return line compatible with COPY FROM command."""
    entity_id, timestamp, attributes = row

    value_mappers = map(value_mapper_by_type.get, data_types)

    values = chain(
        (str(entity_id), str(timestamp)),
        zipapply(value_mappers, attributes)
    )

    return "\t".join(values) + "\n"
Example #13
from minerva.node import MinervaContext
from minerva_transform.types import Transformation
from minerva_db import reset_db, with_connection, \
        get_dummy_datasource, get_dummy_entitytype, TIMEZONE, add_function_set, \
        add_function_mapping, render_result

from util import render_datapackage

from minerva.storage.trend.store import CopyFrom
from minerva.storage.trend.types_v4 import DataPackage, TrendStore3
from minerva.storage.trend.granularity import create_granularity


tzinfo = pytz.timezone(TIMEZONE)

local_timestamp = compose(tzinfo.localize, datetime)

src_timestamp_1 = local_timestamp(2012, 12, 11, 13, 15, 0)
src_timestamp_2 = local_timestamp(2012, 12, 11, 13, 30, 0)
src_timestamp_3 = local_timestamp(2012, 12, 11, 13, 45, 0)
src_timestamp_4 = local_timestamp(2012, 12, 11, 14, 0, 0)

modified_a = local_timestamp(2012, 12, 11, 14, 3, 27)
modified_b = local_timestamp(2012, 12, 11, 14, 7, 14)

dest_timestamp = local_timestamp(2012, 12, 11, 14, 0, 0)

granularity = create_granularity("900")

trend_names = ("counter_a", "counter_b")
source_1_1 = DataPackage(granularity, src_timestamp_1, trend_names, [(1000, (4, 0))])
Example #14
    def record_check(self):
        return compose(operator.not_, any_field_empty(self.fields))
Example #15
    with closing(conn.cursor()) as cursor:
        column_names = transformation.function_set.get_dest_columns(cursor)

    rows = [(row[0], ([transformation.function_set.id],) + row[2:])
        for row in transformed_rows if row[0]]

    plugin = get_plugin("trend")(conn, api_version=4)

    datapackage = plugin.DataPackage(transformation.function_set.dest_trendstore.granularity,
            transformation.dest_timestamp, column_names, rows)

    return plugin.store_txn(transformation.function_set.dest_trendstore, datapackage)


row_has_entity_id = compose(truth, head)


def function_set_from_row(cursor, row):
    id, name, description, mapping_signature, source_datasource_ids, \
        source_entitytype_id, source_granularity_str, dest_datasource_id, \
        dest_entitytype_id, dest_granularity_str, filter_sub_query, group_by, \
        relation_type_id, enabled = row

    get_datasource = partial(get_datasource_by_id, cursor)
    get_entitytype = partial(get_entitytype_by_id, cursor)

    source_granularity = create_granularity(str(source_granularity_str))
    dest_granularity = create_granularity(str(dest_granularity_str))

    source_datasources = map(get_datasource, source_datasource_ids)
Example #16
def test_compose_pair():
    composed = compose(times_two, add_one)

    assert_equal(composed(2), 6)