Exemple #1
0
def match_column_identifier(column_names, c, zero_based=False):
    """
    Determine what column a single column id (name or index) matches in a series of column names.
    Note that integer values are *always* treated as positional identifiers. If you happen to have
    column names which are also integers, you must specify them using a positional index.

    :param column_names: Sequence of column names to match against.
    :param c: A column name, or an integer / digit string giving a position.
    :param zero_based: If true, positions are used as given; otherwise they are
        treated as 1-based and 1 is subtracted.
    :returns: The zero-based index of the matched column.
    :raises ColumnIdentifierError: If ``c`` is neither an existing name nor an
        integer, or if the resulting index is negative or past the last column.
    """
    # Non-numeric strings that name an existing column resolve by name.
    if isinstance(c, basestring) and not c.isdigit() and c in column_names:
        return column_names.index(c)

    # Everything else must be convertible to an integer position. Only the
    # conversion failures are caught so that KeyboardInterrupt/SystemExit and
    # unrelated errors are not masked as identifier errors.
    try:
        index = int(c)
    except (TypeError, ValueError, OverflowError):
        raise ColumnIdentifierError('Column identifier "%s" is neither an integer, nor a existing column\'s name.' % (c,))

    if not zero_based:
        index -= 1

    # Fail out if index is 0-based
    if index < 0:
        raise ColumnIdentifierError('Column 0 is not valid; columns are 1-based.')

    # With no columns at all there is no "last named column" to report;
    # indexing column_names[-1] below would raise IndexError instead.
    if not column_names:
        raise ColumnIdentifierError('Index %i is not valid; there are no named columns.' % index)

    # Fail out if index is out of range
    if index >= len(column_names):
        raise ColumnIdentifierError('Index %i is beyond the last named column, "%s" at index %i.' % (index, column_names[-1], len(column_names) - 1))

    return index
Exemple #2
0
def match_column_identifier(column_names, c, column_offset=1):
    """
    Resolve one column identifier (a name or a position) to its zero-based index in
    ``column_names``.

    Strings made only of digits, and integers, are always read as positions; a column
    whose name is itself a number can therefore only be selected by its position.
    ``column_offset`` is subtracted from a position to obtain the zero-based index.

    Raises ``ColumnIdentifierError`` when the identifier is neither a known name nor an
    integer, or when the resulting index falls outside ``column_names``.
    """
    names_this_column = (isinstance(c, six.string_types)
                         and not c.isdigit()
                         and c in column_names)
    if names_this_column:
        return column_names.index(c)

    # Anything that is not a known name has to parse as an integer position.
    try:
        position = int(c) - column_offset
    except ValueError:
        known_names = repr(column_names)[1:-1]
        raise ColumnIdentifierError(
            "Column '%s' is invalid. It is neither an integer nor a column name. "
            "Column names are: %s" % (c, known_names))

    # Positions before the first column are rejected.
    if position < 0:
        raise ColumnIdentifierError(
            "Column %i is invalid. Columns are 1-based." % (position + column_offset))

    # Positions past the last column are rejected, reporting the last valid one.
    column_count = len(column_names)
    if position >= column_count:
        raise ColumnIdentifierError(
            "Column %i is invalid. The last column is '%s' at index %i." % (
                position + column_offset,
                column_names[-1],
                column_count - 1 + column_offset))

    return position
Exemple #3
0
def match_column_identifier(column_names, c):
    """
    Determine what column a single column id (name or index) matches in a series of column names.

    An identifier that equals one of ``column_names`` is resolved by name first;
    anything else must be an integer (or integer-like string) and is treated as
    a 1-based position.

    :param column_names: Sequence of column names to match against.
    :param c: A column name or a 1-based position.
    :returns: The zero-based index of the matched column.
    :raises ColumnIdentifierError: If ``c`` is neither an existing name nor an
        integer, or if the position is below 1 or past the last column.
    """
    if c in column_names:
        return column_names.index(c)

    # Fail out if neither a column name nor an integer. Only conversion
    # failures are caught, so KeyboardInterrupt/SystemExit and unrelated
    # errors are not disguised as identifier errors.
    try:
        index = int(c) - 1
    except (TypeError, ValueError, OverflowError):
        raise ColumnIdentifierError(
            'Column identifier "%s" is neither a index, nor a existing column\'s name.'
            % (c,))

    # Fail out if index is 0-based
    if index < 0:
        raise ColumnIdentifierError(
            'Columns 0 is not valid; columns are 1-based.')

    # With no columns at all there is no "last named column" to report;
    # indexing column_names[-1] below would raise IndexError instead.
    if not column_names:
        raise ColumnIdentifierError(
            'Index %i is not valid; there are no named columns.' % index)

    # Fail out if index is out of range
    if index >= len(column_names):
        raise ColumnIdentifierError(
            'Index %i is beyond the last named column, "%s" at index %i.' %
            (index, column_names[-1], len(column_names) - 1))

    return index
Exemple #4
0
def standardize_patterns(column_names, patterns):
    """
    Normalise ``patterns`` into a dict mapping column keys to functions produced by
    ``pattern_as_function``.

    ``patterns`` may be a mapping or a plain sequence. For a mapping, entries with a
    falsy value are dropped, and any key that appears in ``column_names`` is replaced
    by that name's integer position. A ``ColumnIdentifierError`` is raised when that
    position is itself already a key of the mapping. For a sequence, each item is keyed
    by its position in the sequence.
    """
    try:
        # Mapping input: keep only non-empty patterns, converted to functions.
        patterns = {
            key: pattern_as_function(spec)
            for key, spec in patterns.items()
            if spec
        }
        # Without column names there is nothing to translate keys against.
        if not column_names:
            return patterns

        by_position = {}
        for key, matcher in patterns.items():
            if key not in column_names:
                # Not a known name: keep the key exactly as supplied.
                by_position[key] = matcher
                continue
            position = column_names.index(key)
            if position in patterns:
                raise ColumnIdentifierError(
                    "Column %s has index %i which already has a pattern." %
                    (key, position))
            by_position[position] = matcher
        return by_position
    except AttributeError:
        # Sequence input: key each pattern by where it sits in the sequence.
        return {
            position: pattern_as_function(spec)
            for position, spec in enumerate(patterns)
        }
0
def standardize_modifiers(column_names, modifiers):
    """
    Given modifiers in any of the permitted input forms, return a dict whose keys
    are column indices and whose values are functions which return a modified value.
    If modifiers is a dictionary and any of its keys are values in column_names, the
    returned dictionary will have those keys replaced with the integer position of
    that value in column_names

    :param column_names: Sequence of column names, or a falsy value to skip
        name-to-index translation.
    :param modifiers: Either a mapping of column key -> modifier spec, or a
        sequence of modifier specs (keyed by their position in the sequence).
    :returns: dict mapping column keys/indices to the result of
        ``modifier_as_function`` for each spec.
    :raises ColumnIdentifierError: If a named column's index is already a key
        of the modifiers mapping.
    """
    # Only the ".items()" probe decides between mapping and sequence input, so
    # an AttributeError raised elsewhere is not mistaken for "not a mapping".
    try:
        items = modifiers.items()
    except AttributeError:
        # Sequence of modifiers: enumerate the sequence itself. A sequence has
        # no ".values()", so calling it here could only raise again.
        return dict((idx, modifier_as_function(x))
                    for idx, x in enumerate(modifiers))

    # Dictionary of modifiers
    modifiers = dict((k, modifier_as_function(v)) for k, v in items)
    if not column_names:
        return modifiers

    p2 = {}
    for k in modifiers:
        if k in column_names:
            idx = column_names.index(k)
            if idx in modifiers:
                raise ColumnIdentifierError(
                    "Column %s has index %i which already has a pattern." %
                    (k, idx))
            p2[idx] = modifiers[k]
        else:
            p2[k] = modifiers[k]
    return p2
Exemple #6
0
def parse_column_identifiers(ids, column_names):
    """
    Parse a comma-separated list of column indices AND/OR names into a list of integer indices.
    Ranges of integers can be specified with two integers separated by a '-' or ':' character. Ranges of
    non-integers (e.g. column names) are not supported.
    Note: Column indices are 1-based.

    Either end of a range may be omitted: a missing start means the first
    column and a missing end means the last column (inclusive).

    :param ids: Comma-separated identifier string, or a falsy value for "all columns".
    :param column_names: Sequence of column names.
    :returns: List of zero-based column indices, in the order given.
    :raises ColumnIdentifierError: If an identifier matches nothing and is not
        a valid integer range.
    """
    # If not specified, return all columns (as a list, like every other path)
    if not ids:
        return list(range(len(column_names)))

    columns = []

    for c in ids.split(','):
        c = c.strip()

        try:
            columns.append(match_column_identifier(column_names, c))
        except ColumnIdentifierError:
            # Not a single column: see whether it looks like a range,
            # otherwise let the original error propagate.
            if ':' in c:
                a, b = c.split(':', 1)
            elif '-' in c:
                a, b = c.split('-', 1)
            else:
                raise
        else:
            continue

        try:
            start = int(a) if a else 1
            # Range ends are inclusive and 1-based, so the exclusive stop is
            # one past the requested end; for an open end that is one past
            # the last column.
            stop = int(b) + 1 if b else len(column_names) + 1
        except ValueError:
            raise ColumnIdentifierError("Invalid range %s. Ranges must be two integers separated by a - or : character." % c)

        for x in range(start, stop):
            columns.append(match_column_identifier(column_names, x))

    return columns
Exemple #7
0
def standardize_modifiers(cnames, modifiers):
    """
    Normalise ``modifiers`` into a dict mapping column keys to the result of
    ``spec2modifier``.

    ``modifiers`` may be a mapping or a plain sequence; falsy specs are skipped in
    both cases. Sequence items are keyed by their position. For a mapping, any key
    found in ``cnames`` is replaced by that name's integer position, and a
    ``ColumnIdentifierError`` is raised if that position is already a key.
    """
    # TODO: csvkit.grep.standardize_patterns could be refactored to support
    #       this process here as well...
    try:
        # Probe for a mapping while discarding empty specs.
        kept = [(key, spec) for key, spec in modifiers.items() if spec]
    except AttributeError:
        # No .items(): treat the input as a sequence of specs.
        return dict(
            (position, spec2modifier(spec))
            for position, spec in enumerate(modifiers)
            if spec)

    converted = dict((key, spec2modifier(spec)) for key, spec in kept)
    if not cnames:
        return converted

    remapped = {}
    for key, modifier in converted.items():
        if key not in cnames:
            # Not a known column name: keep the key as supplied.
            remapped[key] = modifier
            continue
        position = cnames.index(key)
        if position in converted:
            raise ColumnIdentifierError(
                'Column %s has index %i which already has a pattern.' %
                (key, position))
        remapped[position] = modifier
    return remapped
Exemple #8
0
def _expand_column_identifiers(ids, column_names, column_offset):
    """Expand a comma-separated identifier string into a list of zero-based indices."""
    columns = []

    for c in ids.split(','):
        try:
            columns.append(
                match_column_identifier(column_names, c, column_offset))
        except ColumnIdentifierError:
            # Not a single column: see whether it looks like a range,
            # otherwise let the original error propagate.
            if ':' in c:
                a, b = c.split(':', 1)
            elif '-' in c:
                a, b = c.split('-', 1)
            else:
                raise
        else:
            continue

        try:
            # An omitted start means the first column and an omitted end means
            # the last column; both are expressed in the caller's numbering
            # (column_offset). Range ends are inclusive, hence the "+ 1".
            start = int(a) if a else column_offset
            stop = int(b) + 1 if b else len(column_names) + column_offset
        except ValueError:
            raise ColumnIdentifierError(
                "Invalid range %s. Ranges must be two integers separated by a - or : character."
                % c)

        for x in range(start, stop):
            columns.append(
                match_column_identifier(column_names, x, column_offset))

    return columns


def parse_column_identifiers(ids,
                             column_names,
                             column_offset=1,
                             excluded_columns=None):
    """
    Parse a comma-separated list of column indices AND/OR names into a list of integer indices.
    Ranges of integers can be specified with two integers separated by a '-' or ':' character.
    Ranges of non-integers (e.g. column names) are not supported.
    Note: Column indices are 1-based.

    :param ids: Comma-separated identifiers to include, or a falsy value for
        "all columns".
    :param column_names: Sequence of column names; if empty, the result is ``[]``.
    :param column_offset: Number of the first column in ``ids`` and
        ``excluded_columns`` (1 by default).
    :param excluded_columns: Comma-separated identifiers (same syntax as
        ``ids``) to remove from the result.
    :returns: List of zero-based column indices, included columns in the order
        given, minus any excluded ones.
    :raises ColumnIdentifierError: If an identifier matches nothing and is not
        a valid integer range.
    """
    if not column_names:
        return []

    if not ids and not excluded_columns:
        return list(range(len(column_names)))

    if ids:
        columns = _expand_column_identifiers(ids, column_names, column_offset)
    else:
        columns = range(len(column_names))

    if not excluded_columns:
        return list(columns)

    # Indices are ints, so a set gives the same membership answers as the
    # list it replaces without rescanning it for every column.
    excludes = set(
        _expand_column_identifiers(excluded_columns, column_names,
                                   column_offset))

    return [c for c in columns if c not in excludes]