def match_column_identifier(column_names, c, zero_based=False):
    """
    Determine what column a single column id (name or index) matches in a
    series of column names.

    Note that integer values are *always* treated as positional identifiers.
    If you happen to have column names which are also integers, you must
    specify them using a positional index.

    ``column_names`` is the sequence of names to match against. ``c`` is a
    column name, an integer, or a string of digits. When ``zero_based`` is
    false (the default) positions count from 1, otherwise from 0.

    Returns the zero-based index of the matched column. Raises
    ColumnIdentifierError if ``c`` is neither an existing column name nor a
    position inside ``column_names``.
    """
    # ``basestring`` only exists on Python 2; fall back to ``str`` elsewhere
    # so the name lookup below does not die with a NameError.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    # Digit-only strings are deliberately skipped here: they are positions.
    if isinstance(c, string_types) and not c.isdigit() and c in column_names:
        return column_names.index(c)

    try:
        index = int(c)
    # Fail out if neither a column name nor an integer. Only conversion
    # failures are translated, so KeyboardInterrupt/SystemExit pass through.
    except (TypeError, ValueError, OverflowError):
        raise ColumnIdentifierError('Column identifier "%s" is neither an integer, nor a existing column\'s name.' % (c,))

    if not zero_based:
        index -= 1

    # Fail out if index is 0-based
    if index < 0:
        raise ColumnIdentifierError('Column 0 is not valid; columns are 1-based.')

    # Fail out if index is out of range
    if index >= len(column_names):
        # With no columns there is no "last named column" to report;
        # indexing column_names[-1] would leak an IndexError.
        if not column_names:
            raise ColumnIdentifierError('Index %i is invalid; there are no named columns.' % index)
        raise ColumnIdentifierError('Index %i is beyond the last named column, "%s" at index %i.' % (index, column_names[-1], len(column_names) - 1))

    return index
def match_column_identifier(column_names, c, column_offset=1):
    """
    Resolve one column identifier (a name or a position) to a zero-based
    index into ``column_names``.

    Integer-looking values are *always* read as positions, never as names.
    A column whose name is itself an integer can therefore only be selected
    by its position.

    ``column_offset`` is subtracted from a positional identifier to obtain
    the zero-based index (1 by default, i.e. positions count from 1).

    Raises ColumnIdentifierError when the identifier is not a known name and
    is not a position inside ``column_names``.
    """
    names_a_column = (
        isinstance(c, six.string_types)
        and not c.isdigit()
        and c in column_names
    )
    if names_a_column:
        return column_names.index(c)

    try:
        position = int(c) - column_offset
    except ValueError:
        # Not a column name and not an integer either
        known_names = repr(column_names)[1:-1]
        raise ColumnIdentifierError(
            "Column '%s' is invalid. It is neither an integer nor a column name. "
            "Column names are: %s" % (c, known_names))

    # Position falls before the first column
    if position < 0:
        raise ColumnIdentifierError(
            "Column %i is invalid. Columns are 1-based." % (position + column_offset))

    # Position falls after the last column
    column_count = len(column_names)
    if position >= column_count:
        raise ColumnIdentifierError(
            "Column %i is invalid. The last column is '%s' at index %i."
            % (position + column_offset, column_names[-1], column_count - 1 + column_offset))

    return position
def match_column_identifier(column_names, c):
    """
    Determine what column a single column id (name or index) matches in a
    series of column names.

    ``c`` is first looked up as a column name; if it is not one, it is
    interpreted as a 1-based position.

    Returns the zero-based index of the matched column. Raises
    ColumnIdentifierError if ``c`` is neither an existing column name nor a
    position inside ``column_names``.
    """
    if c in column_names:
        return column_names.index(c)

    try:
        index = int(c) - 1
    # Fail out if neither a column name nor an integer. Only conversion
    # failures are translated, so KeyboardInterrupt/SystemExit pass through.
    except (TypeError, ValueError, OverflowError):
        raise ColumnIdentifierError(
            'Column identifier "%s" is neither a index, nor a existing column\'s name.' % (c,))

    # Fail out if index is 0-based
    if index < 0:
        raise ColumnIdentifierError(
            'Columns 0 is not valid; columns are 1-based.')

    # Fail out if index is out of range
    if index >= len(column_names):
        # With no columns there is no "last named column" to report;
        # indexing column_names[-1] would leak an IndexError.
        if not column_names:
            raise ColumnIdentifierError(
                'Index %i is invalid; there are no named columns.' % index)
        raise ColumnIdentifierError(
            'Index %i is beyond the last named column, "%s" at index %i.'
            % (index, column_names[-1], len(column_names) - 1))

    return index
def standardize_patterns(column_names, patterns):
    """
    Normalise ``patterns`` into a dict mapping a column key to the result of
    ``pattern_as_function`` for that column's pattern.

    ``patterns`` may be a dictionary or a sequence:

    * Dictionary: entries with a falsy pattern are dropped. When
      ``column_names`` is given, any key that is one of the column names is
      replaced by that name's integer position; other keys are kept as-is.
      A ColumnIdentifierError is raised if that position is itself already a
      key of the dictionary.
    * Sequence: each element is keyed by its position in the sequence.
    """
    try:
        # Dictionary form: convert every non-empty pattern
        patterns = {
            key: pattern_as_function(spec)
            for key, spec in patterns.items()
            if spec
        }

        if not column_names:
            return patterns

        by_index = {}
        for key, matcher in patterns.items():
            if key not in column_names:
                by_index[key] = matcher
                continue

            position = column_names.index(key)
            if position in patterns:
                raise ColumnIdentifierError(
                    "Column %s has index %i which already has a pattern." % (key, position))
            by_index[position] = matcher

        return by_index
    except AttributeError:
        # Sequence form: key each pattern by where it sits in the sequence
        return {
            position: pattern_as_function(item)
            for position, item in enumerate(patterns)
        }
def standardize_modifiers(column_names, modifiers):
    """
    Given modifiers in any of the permitted input forms, return a dict whose
    keys are column indices and whose values are the result of
    ``modifier_as_function`` for each modifier.

    If modifiers is a dictionary and any of its keys are values in
    column_names, the returned dictionary will have those keys replaced with
    the integer position of that value in column_names. A
    ColumnIdentifierError is raised if that position is itself already a key
    of the dictionary.

    If modifiers is a sequence, each element is keyed by its position in the
    sequence.
    """
    # Only the ``.items()`` probe decides "dictionary or sequence"; keeping
    # the try this narrow stops an AttributeError raised while converting a
    # modifier from being mistaken for "not a dictionary".
    try:
        items = modifiers.items()
    except AttributeError:
        # Sequence of modifiers. Iterate the sequence itself: it has no
        # ``.values()`` any more than it has ``.items()``.
        return {idx: modifier_as_function(x) for idx, x in enumerate(modifiers)}

    # Dictionary of modifiers
    modifiers = {k: modifier_as_function(v) for k, v in items}

    if not column_names:
        return modifiers

    p2 = {}
    for k in modifiers:
        if k in column_names:
            idx = column_names.index(k)
            # The same column addressed both by name and by index is ambiguous
            if idx in modifiers:
                raise ColumnIdentifierError(
                    "Column %s has index %i which already has a pattern." % (k, idx))
            p2[idx] = modifiers[k]
        else:
            p2[k] = modifiers[k]

    return p2
def parse_column_identifiers(ids, column_names):
    """
    Parse a comma-separated list of column indices AND/OR names into a list
    of integer indices. Ranges of integers can be specified with two
    integers separated by a '-' or ':' character. Ranges of non-integers
    (e.g. column names) are not supported. Either end of a range may be
    omitted: a missing start means the first column, a missing end means the
    last column.

    Note: Column indices are 1-based.

    Returns a list of indices as produced by ``match_column_identifier``;
    when ``ids`` is empty, the indices of all columns. Raises
    ColumnIdentifierError for an identifier that is neither a column, a
    position, nor a valid range.
    """
    # If not specified, return all columns (as a list, like every other path)
    if not ids:
        return list(range(len(column_names)))

    columns = []

    for c in ids.split(','):
        c = c.strip()

        try:
            columns.append(match_column_identifier(column_names, c))
        except ColumnIdentifierError:
            # Not a single column; see whether it is a range instead
            if ':' in c:
                a, b = c.split(':', 1)
            elif '-' in c:
                a, b = c.split('-', 1)
            else:
                raise

            try:
                a = int(a) if a else 1
                # ``b`` becomes an exclusive bound. Identifiers are 1-based,
                # so the last column is number len(column_names) and an
                # open-ended range must stop at len(column_names) + 1.
                b = int(b) + 1 if b else len(column_names) + 1
            except ValueError:
                raise ColumnIdentifierError(
                    "Invalid range %s. Ranges must be two integers separated by a - or : character." % (c,))

            for x in range(a, b):
                columns.append(match_column_identifier(column_names, x))

    return columns
def standardize_modifiers(cnames, modifiers):
    """
    Normalise ``modifiers`` into a dict mapping a column key to the result
    of ``spec2modifier`` for that column's modifier spec.

    ``modifiers`` may be a dictionary or a sequence; falsy specs are dropped
    in both forms. For a sequence, each remaining spec is keyed by its
    position. For a dictionary, when ``cnames`` is given, any key that is one
    of the column names is replaced by that name's integer position; a
    ColumnIdentifierError is raised if that position is itself already a key.
    """
    # TODO: csvkit.grep.standardize_patterns could be refactored to support
    # this process here as well...
    try:
        # Dictionary form if ``.items()`` is available
        specs = {key: spec for key, spec in modifiers.items() if spec}
    except AttributeError:
        # Otherwise treat it as a sequence keyed by position
        return {
            position: spec2modifier(spec)
            for position, spec in enumerate(modifiers)
            if spec
        }

    converted = {key: spec2modifier(spec) for key, spec in specs.items()}

    if not cnames:
        return converted

    by_index = {}
    for key, modifier in converted.items():
        if key not in cnames:
            by_index[key] = modifier
            continue

        position = cnames.index(key)
        if position in converted:
            raise ColumnIdentifierError(
                'Column %s has index %i which already has a pattern.' % (key, position))
        by_index[position] = modifier

    return by_index
def _expand_column_spec(spec, column_names, column_offset):
    """
    Expand one comma-separated identifier list (names, positions and
    integer ranges) into a list of indices from ``match_column_identifier``.
    """
    indices = []

    for c in spec.split(','):
        try:
            indices.append(
                match_column_identifier(column_names, c, column_offset))
        except ColumnIdentifierError:
            # Not a single column; see whether it is a range instead
            if ':' in c:
                a, b = c.split(':', 1)
            elif '-' in c:
                a, b = c.split('-', 1)
            else:
                raise

            try:
                # An omitted start means the first column, whose identifier
                # is ``column_offset``.
                a = int(a) if a else column_offset
                # ``b`` becomes an exclusive bound. The last column's
                # identifier is len(column_names) - 1 + column_offset.
                b = int(b) + 1 if b else len(column_names) + column_offset
            except ValueError:
                raise ColumnIdentifierError(
                    "Invalid range %s. Ranges must be two integers separated by a - or : character." % (c,))

            for x in range(a, b):
                indices.append(
                    match_column_identifier(column_names, x, column_offset))

    return indices


def parse_column_identifiers(ids, column_names, column_offset=1, excluded_columns=None):
    """
    Parse a comma-separated list of column indices AND/OR names into a list
    of integer indices. Ranges of integers can be specified with two
    integers separated by a '-' or ':' character. Ranges of non-integers
    (e.g. column names) are not supported. Either end of a range may be
    omitted: a missing start means the first column, a missing end means the
    last column.

    ``excluded_columns`` uses the same syntax as ``ids``; every column it
    selects is removed from the result. When ``ids`` is empty, all columns
    are selected before exclusions are applied.

    Note: Column indices are 1-based by default; ``column_offset`` is the
    number of the first column and is passed to ``match_column_identifier``.

    Returns a list of indices (empty when ``column_names`` is empty). Raises
    ColumnIdentifierError for an identifier that is neither a column, a
    position, nor a valid range.
    """
    if not column_names:
        return []

    if not ids and not excluded_columns:
        return list(range(len(column_names)))

    if ids:
        columns = _expand_column_spec(ids, column_names, column_offset)
    else:
        columns = range(len(column_names))

    if not excluded_columns:
        return list(columns)

    # A set makes each membership test O(1); the order of ``columns`` is kept
    excludes = set(
        _expand_column_spec(excluded_columns, column_names, column_offset))

    return [c for c in columns if c not in excludes]