Esempio n. 1
0
    def load_base_csv(self, data_file, languages, groups=None, translation_filename=None, translation_extra=None, keys_ex=None, validate=True):
        """Loads a base data map object from a csv.

        groups is a list of additional fields (name is automatically included)
        that nest via groupname_subfield.

        If translation_filename is given, translations are merged into the map;
        a missing translation file only prints a warning instead of failing.
        When languages is non-empty the resulting map is validated (errors are
        fatal only if validate is True).
        """
        data_file = self.get_data_path(data_file)

        # None sentinels instead of mutable defaults ([] shared across calls);
        # copy so the caller's list is never mutated.
        groups = ['name'] + list(groups or [])
        translation_extra = list(translation_extra or [])
        keys_ex = list(keys_ex or [])

        rows = [group_fields(row, groups=groups) for row in read_csv(data_file)]

        basemap = DataMap(languages=languages, keys_ex=keys_ex)
        basemap.extend(rows)

        if translation_filename:
            try:
                translations = fix_id(self.load_list_csv(translation_filename))
                # Translations always join on name, plus any extra fields.
                groups = set(['name'] + translation_extra)
                merge_list(basemap, translations, groups=groups, many=False)
            except FileNotFoundError:
                # Best effort: a missing translation file is not fatal.
                print(f"Warning: Could not find translation file {translation_filename}")

        if languages:
            self._validate_base_map(data_file, basemap, languages, error=validate)

        return basemap
Esempio n. 2
0
 def group_fields(self, data):
     """Nest prefixed flat fields of a single mapping using this object's
     configured groups (__groups__, __translation_groups__, and any
     detected prefixes).

     Raises TypeError if data is not a mapping (e.g. a list was passed
     without many=true).
     """
     # collections.Mapping was a deprecated alias removed in Python 3.10;
     # the canonical location is collections.abc.Mapping.
     if not isinstance(data, collections.abc.Mapping):
         raise TypeError("Invalid data type, perhaps you forgot many=true?")
     groups = (list(self.__groups__ or []) +
               list(self.__translation_groups__ or []) +
               self.identify_prefixes())
     # Delegates to the module-level group_fields helper.
     return group_fields(data, groups=groups)
Esempio n. 3
0
    def extend_base(self, filename, *, groups=None):
        """Merges additional csv data into the underlying data map.

        The first column of the csv anchors each row to an existing entry;
        remaining fields are grouped (via groups, with 'name' always
        included) and merged in. Returns self for chaining.
        """
        filename = self._get_filename(filename)
        dataitems = self.reader.load_list_csv(filename)
        if not dataitems:
            return self

        # None sentinel instead of a mutable default; copy to avoid
        # mutating the caller's list.
        groups = set(['name'] + list(groups or []))

        # todo: have it check the first column name and allow joins on other languages

        # Get first column name, whose values will anchor the data to merge
        first_column_name = next(iter(dataitems[0].keys()))

        results = {}
        for item in dataitems:
            key = item[first_column_name]

            # Remove the join from the subdata
            item.pop(first_column_name)

            results[key] = group_fields(item, groups=groups)

        self.data_map.merge(results)

        return self
Esempio n. 4
0
def unflatten(obj_list, *, nest, groups=None, leaftype):
    """Performs the reverse of flatten.
    Turns a CSV (list of objects) into a nested object.

    Nest is a list of fields used to walk through the nesting.

    TODO: Remove groups and leaftype and leave that to a post-step.
    Wait to see what the post-load abstraction will be before doing that.
    """
    # None sentinel instead of a mutable default argument.
    groups = groups or []

    if leaftype not in ('list', 'dict'):
        raise Exception("Unsupported leaf type")

    # This is a recursive algorithm
    if not nest:
        # BASE CASE: nothing more to nest, performs groups on entries.
        # NOTE: use == for string comparison; `is` on literals relies on
        # interning and raises SyntaxWarning on modern CPython.
        if leaftype == 'list':
            return [util.group_fields(obj, groups=groups) for obj in obj_list]
        if leaftype == 'dict':
            # Exactly one row per key in dict mode (enforced below on recursion).
            return util.group_fields(obj_list[0], groups=groups)

    else:
        current_nest = nest[0]
        remaining_nest = nest[1:]

        # Phase one, start grouping rows by the current nest field
        grouped_rows = {}
        for mapping in obj_list:
            key = mapping.pop(current_nest)
            grouped_rows.setdefault(key, []).append(mapping)

        # Phase 2, unflatten recursively
        results = {}
        for key, items in grouped_rows.items():
            # Validation. Make sure it recurses correctly
            if leaftype != 'list' and len(items) > 1:
                raise Exception(
                    f"Found multiple entries for {current_nest}:{key}, " +
                    "which is invalid in this leaf type")

            # Recursive call
            results[key] = unflatten(items,
                                     nest=remaining_nest,
                                     groups=groups,
                                     leaftype=leaftype)

        return results
Esempio n. 5
0
    def load_base_csv(self, data_file, groups=None, validate=True):
        """Loads a base data map object from a csv.

        groups is a list of additional fields (name is automatically included)
        that nest via groupname_subfield. The resulting map is validated;
        errors are fatal only if validate is True.
        """
        data_file = self.get_data_path(data_file)

        # None sentinel instead of a mutable default ([] is shared across
        # calls); copy so the caller's list is never mutated.
        groups = ['name'] + list(groups or [])

        rows = read_csv(data_file)
        rows = [group_fields(row, groups=groups) for row in rows]

        basemap = DataMap()
        basemap.extend(rows)
        self._validate_base_map(data_file, basemap, error=validate)

        return basemap
Esempio n. 6
0
 def group_fields(self, data):
     """Nest prefixed flat fields of a single mapping using this object's
     configured __groups__.

     Raises TypeError if data is not a mapping (e.g. a list was passed
     without many=true).
     """
     # collections.Mapping was a deprecated alias removed in Python 3.10;
     # the canonical location is collections.abc.Mapping.
     if not isinstance(data, collections.abc.Mapping):
         raise TypeError("Invalid data type, perhaps you forgot many=true?")
     groups = self.__groups__ or []
     # Delegates to the module-level group_fields helper.
     return group_fields(data, groups=groups)
Esempio n. 7
0
def merge_list(base,
               rows: typing.Sequence[dict],
               key=None,
               groups=None,
               many=False):
    """Routine to merge lists of dictionaries together using one or more keys.

    The keys used are determined by the first sequential key of the first row.
    If the key is an id, it will join on that, but if it is a name, it will
    join on that and key_ex fields.

    rows must be indexable (annotated Sequence, not Iterable: rows[0] is read
    to pick the key column). If key is given, matched rows are attached to the
    base entry under that attribute (a list when many=True, a single dict
    otherwise); with no key, the row's fields are joined directly into the
    base entry.

    Raises ValueError for a missing key with many=True or duplicate matches
    with many=False, and Exception when rows cannot be joined to base.
    """
    def create_key_fields(data_map, column_name):
        # id columns join on 'id'; language-suffixed name columns join on
        # name_<lang> plus the map's extra key fields.
        lang = derive_lang(column_name)

        key_fields = []
        if lang is None:
            key_fields.append('id')
        else:
            key_fields.append(f'name_{lang}')
            key_fields.extend(data_map.keys_ex)

        return key_fields

    def create_key_fn(key_fields):
        # 'row' rather than 'dict': don't shadow the builtin.
        def derive_key(row):
            items = []
            for k in key_fields:
                # base_-prefixed fields take priority over plain ones.
                if f'base_{k}' in row:
                    items.append(row[f'base_{k}'])
                else:
                    items.append(row[k])
            return tuple(str(i) for i in items)

        return derive_key

    if many and not key:
        raise ValueError('Key must have a value')

    if not rows:
        return

    # None sentinel instead of a mutable default argument.
    groups = groups or []

    # Create keying function from the first column of the first row
    first_column = next(iter(rows[0].keys()))
    key_fields = create_key_fields(base, first_column)
    derive_key = create_key_fn(key_fields)

    # group rows
    keyed_data = {}
    for row in rows:
        row_key = derive_key(row)

        # Delete key fields. Its possible for base_name_en AND name_en to be in the same row.
        # Therefore, prioritize deleting base_ versions first
        for k in key_fields:
            if f'base_{k}' in row:
                del row[f'base_{k}']
            elif k in row:
                del row[k]

        if groups:
            row = util.group_fields(row, groups=groups)
        entry = keyed_data.setdefault(row_key, [])
        entry.append(row)
        if not many and len(entry) > 1:
            raise ValueError(
                f"Key {row_key} has too many matching entries in sub data")

    # Group base by the same derived key
    base = {derive_key(e): e for e in base.values()}

    # Test the keys to see that sub's keys exist in base
    # (was a stray bare-string statement; a comment is what was intended)
    unlinked = [k for k in keyed_data.keys() if k not in base.keys()]
    if unlinked:
        raise Exception(
            "Several entries in sub data map cannot be joined. Their keys are "
            + ','.join('None' if e is None else str(e) for e in unlinked))

    for data_key, data_entries in keyed_data.items():
        base_entry = base[data_key]
        if key:
            if many:
                base_entry[key] = data_entries
            else:
                base_entry[key] = data_entries[0]
        elif isinstance(data_entries[0], abc.Mapping):
            util.joindicts(base_entry, data_entries[0])
        else:
            # We cannot merge a dictionary with a non-dictionary
            raise Exception(
                "Invalid data, the data map must be a dictionary for a keyless merge"
            )
Esempio n. 8
0
def test_group_fields():
    """Fields sharing a configured prefix collapse into a nested dict."""
    flat = {'level': 2, 'description_en': 'test', 'description_ja': None}
    result = util.group_fields(flat, groups=('description', ))

    assert result == {
        'level': 2,
        'description': {'en': 'test', 'ja': None},
    }, "description should have been grouped"