def genfromtxt(fname, dtype=float, comments='#', delimiter=None, skiprows=0,
               converters=None, missing='', missing_values=None, usecols=None,
               names=None, excludelist=None, deletechars=None,
               case_sensitive=True, unpack=None, usemask=False, loose=True):
    """
    Load data from a text file.

    Each line past the first `skiprows` ones is split at the `delimiter`
    character, and characters following the `comments` character are
    discarded.

    Parameters
    ----------
    fname : file or string
        File or filename to read.  If the filename extension is `.gz` or
        `.bz2`, the file is first decompressed.
    dtype : data-type
        Data type of the resulting array.  If this is a flexible data-type,
        the resulting array will be 1-dimensional, and each row will be
        interpreted as an element of the array.  In this case, the number of
        columns used must match the number of fields in the data-type, and
        the names of each field will be set by the corresponding name of the
        dtype.  If None, the dtypes will be determined by the contents of
        each column, individually.
    comments : {string}, optional
        The character used to indicate the start of a comment.  All the
        characters occurring on a line after a comment are discarded.
    delimiter : {string}, optional
        The string used to separate values.  By default, any consecutive
        whitespace acts as the delimiter.
    skiprows : {int}, optional
        Number of lines to skip at the beginning of the file.
    converters : {None, dictionary}, optional
        A dictionary mapping column number to a function that will convert
        values in the column to a number.  Converters can also be used to
        provide a default value for missing data:
        ``converters = {3: lambda s: float(s or 0)}``.
    missing : {string}, optional
        A string representing a missing value, irrespective of the column
        where it appears (e.g., `'missing'` or `'unused'`).
    missing_values : {None, dictionary}, optional
        A dictionary mapping a column number to a string indicating whether
        the corresponding field should be masked.
    usecols : {None, sequence}, optional
        Which columns to read, with 0 being the first.  For example,
        ``usecols = (1, 4, 5)`` will extract the 2nd, 5th and 6th columns.
    names : {None, True, string, sequence}, optional
        If `names` is True, the field names are read from the first valid
        line after the first `skiprows` lines.  If `names` is a sequence or
        a single string of comma-separated names, the names will be used to
        define the field names in a flexible dtype.  If `names` is None, the
        names of the dtype fields will be used, if any.
    excludelist : {sequence}, optional
        A list of names to exclude.  This list is appended to the default
        list ['return', 'file', 'print'].  Excluded names have an underscore
        appended: for example, `file` would become `file_`.
    deletechars : {string}, optional
        A string combining invalid characters that must be deleted from the
        names.
    case_sensitive : {True, False, 'upper', 'lower'}, optional
        If True, field names are case-sensitive.  If False or 'upper', field
        names are converted to upper case.  If 'lower', field names are
        converted to lower case.
    unpack : {bool}, optional
        If True, the returned array is transposed, so that arguments may be
        unpacked using ``x, y, z = genfromtxt(...)``.
    usemask : {bool}, optional
        If True, return a masked array.  If False, return a regular array.

    Returns
    -------
    out : MaskedArray
        Data read from the text file.

    Notes
    -----
    * When spaces are used as delimiters, or when no delimiter has been
      given as input, there should not be any missing data between two
      fields.
    * When the variables are named (either by a flexible dtype or with
      `names`), there must not be any header in the file (else a
      :exc:`ValueError` exception is raised).

    Warnings
    --------
    * Individual values are not stripped of spaces by default.  When using
      a custom converter, make sure the function does remove spaces.

    See Also
    --------
    numpy.loadtxt : equivalent function when no data is missing.

    """
    #
    if usemask:
        from numpy.ma import MaskedArray, make_mask_descr
    # Check the input dictionary of converters
    user_converters = converters or {}
    if not isinstance(user_converters, dict):
        errmsg = "The input argument 'converters' should be a valid dictionary "\
                 "(got '%s' instead)"
        raise TypeError(errmsg % type(user_converters))
    # Check the input dictionary of missing values
    user_missing_values = missing_values or {}
    if not isinstance(user_missing_values, dict):
        errmsg = "The input argument 'missing_values' should be a valid "\
                 "dictionary (got '%s' instead)"
        raise TypeError(errmsg % type(missing_values))
    defmissing = [_.strip() for _ in missing.split(',')] + ['']

    # Initialize the filehandle, the LineSplitter and the NameValidator
    # fhd = _to_filehandle(fname)
    if isinstance(fname, basestring):
        fhd = np.lib._datasource.open(fname)
    elif not hasattr(fname, 'read'):
        raise TypeError("The input should be a string or a filehandle. "\
                        "(got %s instead)" % type(fname))
    else:
        fhd = fname
    split_line = LineSplitter(delimiter=delimiter, comments=comments,
                              autostrip=False)._handyman
    validate_names = NameValidator(excludelist=excludelist,
                                   deletechars=deletechars,
                                   case_sensitive=case_sensitive)

    # Get the first valid lines after the first skiprows ones
    for i in xrange(skiprows):
        fhd.readline()
    first_values = None
    while not first_values:
        first_line = fhd.readline()
        if first_line == '':
            raise IOError('End-of-file reached before encountering data.')
        if names is True:
            first_values = first_line.strip().split(delimiter)
        else:
            first_values = split_line(first_line)
    if names is True:
        fval = first_values[0].strip()
        if fval in comments:
            del first_values[0]

    # Check the columns to use
    if usecols is not None:
        usecols = list(usecols)
    nbcols = len(usecols or first_values)

    # Check the names and overwrite the dtype.names if needed
    if dtype is not None:
        dtype = np.dtype(dtype)
    dtypenames = getattr(dtype, 'names', None)
    if names is True:
        names = validate_names([_.strip() for _ in first_values])
        first_line = ''
    elif _is_string_like(names):
        names = validate_names([_.strip() for _ in names.split(',')])
    elif names:
        names = validate_names(names)
    elif dtypenames:
        dtype.names = validate_names(dtypenames)
    if names and dtypenames:
        dtype.names = names

    # If usecols is a list of names, convert to a list of indices
    if usecols:
        for (i, current) in enumerate(usecols):
            if _is_string_like(current):
                usecols[i] = names.index(current)

    # If user_missing_values has names as keys, transform them to indices
    missing_values = {}
    for (key, val) in user_missing_values.iteritems():
        # If val is a list, flatten it. In any case, add missing & '' to the list
        if isinstance(val, (list, tuple)):
            val = [str(_) for _ in val]
        else:
            val = [str(val), ]
        val.extend(defmissing)
        if _is_string_like(key):
            try:
                missing_values[names.index(key)] = val
            except ValueError:
                pass
        else:
            missing_values[key] = val

    # Initialize the default converters
    if dtype is None:
        # Note: we can't use a [...]*nbcols, as we would have 3 times the same
        # ... converter, instead of 3 different converters.
        converters = [StringConverter(None,
                                      missing_values=missing_values.get(_, defmissing))
                      for _ in range(nbcols)]
    else:
        flatdtypes = flatten_dtype(dtype)
        # Initialize the converters
        if len(flatdtypes) > 1:
            # Flexible type : get a converter from each dtype
            converters = [StringConverter(dt,
                                          missing_values=missing_values.get(i, defmissing),
                                          locked=True)
                          for (i, dt) in enumerate(flatdtypes)]
        else:
            # Set to a default converter (but w/ different missing values)
            converters = [StringConverter(dtype,
                                          missing_values=missing_values.get(_, defmissing),
                                          locked=True)
                          for _ in range(nbcols)]
    missing_values = [_.missing_values for _ in converters]

    # Update the converters to use the user-defined ones
    uc_update = []
    for (i, conv) in user_converters.iteritems():
        # If the converter is specified by column names, use the index instead
        if _is_string_like(i):
            i = names.index(i)
        if usecols:
            try:
                i = usecols.index(i)
            except ValueError:
                # Unused converter specified
                continue
        converters[i].update(conv, default=None,
                             missing_values=missing_values[i],
                             locked=True)
        uc_update.append((i, conv))
    # Make sure we have the corrected keys in user_converters...
    user_converters.update(uc_update)

    # Reset the names to match the usecols
    if (not first_line) and usecols:
        names = [names[_] for _ in usecols]

    rows = []
    append_to_rows = rows.append
    if usemask:
        masks = []
        append_to_masks = masks.append
    # Parse each line
    for line in itertools.chain([first_line, ], fhd):
        values = split_line(line)
        # Skip an empty line
        if len(values) == 0:
            continue
        # Select only the columns we need
        if usecols:
            values = [values[_] for _ in usecols]
        # Check whether we need to update the converter
        if dtype is None:
            for (converter, item) in zip(converters, values):
                converter.upgrade(item)
        # Store the values
        append_to_rows(tuple(values))
        if usemask:
            append_to_masks(tuple([val.strip() in mss
                                   for (val, mss) in zip(values, missing_values)]))

    # Convert each value according to the converter:
    # We want to modify the list in place to avoid creating a new one...
    if loose:
        conversionfuncs = [conv._loose_call for conv in converters]
    else:
        conversionfuncs = [conv._strict_call for conv in converters]
    for (i, vals) in enumerate(rows):
        rows[i] = tuple([convert(val)
                         for (convert, val) in zip(conversionfuncs, vals)])

    # Reset the dtype
    data = rows
    if dtype is None:
        # Get the dtypes from the types of the converters
        coldtypes = [conv.type for conv in converters]
        # Find the columns with strings...
        strcolidx = [i for (i, v) in enumerate(coldtypes)
                     if v in (type('S'), np.string_)]
        # ... and take the largest number of chars.
        for i in strcolidx:
            coldtypes[i] = "|S%i" % max(len(row[i]) for row in data)
        #
        if names is None:
            # If the dtype is uniform, don't define names, else use ''
            base = set([c.type for c in converters if c._checked])
            if len(base) == 1:
                (ddtype, mdtype) = (list(base)[0], np.bool)
            else:
                ddtype = [('', dt) for dt in coldtypes]
                mdtype = [('', np.bool) for dt in coldtypes]
        else:
            ddtype = zip(names, coldtypes)
            mdtype = zip(names, [np.bool] * len(coldtypes))
        output = np.array(data, dtype=ddtype)
        if usemask:
            outputmask = np.array(masks, dtype=mdtype)
    else:
        # Overwrite the initial dtype names if needed
        if names and dtype.names:
            dtype.names = names
        flatdtypes = flatten_dtype(dtype)
        # Case 1. We have a structured type
        if len(flatdtypes) > 1:
            # Nested dtype, eg [('a', int), ('b', [('b0', int), ('b1', 'f4')])]
            # First, create the array using a flattened dtype:
            # [('a', int), ('b1', int), ('b2', float)]
            # Then, view the array using the specified dtype.
            if has_nested_fields(dtype):
                if 'O' in (_.char for _ in flatdtypes):
                    errmsg = "Nested fields involving objects "\
                             "are not supported..."
                    raise NotImplementedError(errmsg)
                rows = np.array(data, dtype=[('', t) for t in flatdtypes])
                output = rows.view(dtype)
            else:
                output = np.array(data, dtype=dtype)
            # Now, process the rowmasks the same way
            if usemask:
                rowmasks = np.array(masks,
                                    dtype=np.dtype([('', np.bool)
                                                    for t in flatdtypes]))
                # Construct the new dtype
                mdtype = make_mask_descr(dtype)
                outputmask = rowmasks.view(mdtype)
        # Case #2. We have a basic dtype
        else:
            # We used some user-defined converters
            if user_converters:
                ishomogeneous = True
                descr = []
                for (i, ttype) in enumerate([conv.type for conv in converters]):
                    # Keep the dtype of the current converter
                    if i in user_converters:
                        ishomogeneous &= (ttype == dtype.type)
                        if ttype == np.string_:
                            ttype = "|S%i" % max(len(row[i]) for row in data)
                        descr.append(('', ttype))
                    else:
                        descr.append(('', dtype))
                # So we changed the dtype ?
                if not ishomogeneous:
                    # We have more than one field
                    if len(descr) > 1:
                        dtype = np.dtype(descr)
                    # We have only one field: drop the name if not needed.
                    else:
                        dtype = np.dtype(ttype)
            #
            output = np.array(data, dtype)
            if usemask:
                if dtype.names:
                    mdtype = [(_, np.bool) for _ in dtype.names]
                else:
                    mdtype = np.bool
                outputmask = np.array(masks, dtype=mdtype)
    # Try to take care of the missing data we missed
    if usemask and output.dtype.names:
        for (name, conv) in zip(names or (), converters):
            missing_values = [conv(_) for _ in conv.missing_values if _ != '']
            for mval in missing_values:
                outputmask[name] |= (output[name] == mval)
    # Construct the final array
    if usemask:
        output = output.view(MaskedArray)
        output._mask = outputmask
    if unpack:
        return output.squeeze().T
    return output.squeeze()
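

# A minimal usage sketch of genfromtxt on data with missing fields, assuming
# the Python 2 StringIO module is available; the helper name
# _example_genfromtxt_missing and the sample data are illustrative only.
def _example_genfromtxt_missing():
    from StringIO import StringIO
    data = StringIO("1,2,3\n4,,6")
    # With dtype=None each column gets its own StringConverter, and with
    # usemask=True the empty field shows up as True in the result's mask.
    arr = genfromtxt(data, dtype=None, delimiter=',', names="a,b,c",
                     usemask=True)
    return arr, arr.mask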
def savetxt(fname, X, fmt='%.18e', delimiter=' '):
    """
    Save an array to a text file.

    Parameters
    ----------
    fname : filename or a file handle
        If the filename ends in ``.gz``, the file is automatically saved in
        compressed gzip format.  The load() command understands gzipped
        files transparently.
    X : array_like
        Data.
    fmt : string or sequence of strings
        A single format (%10.5f), a sequence of formats, or a multi-format
        string, e.g. 'Iteration %d -- %10.5f', in which case `delimiter` is
        ignored.
    delimiter : str
        Character separating columns.

    See Also
    --------
    save : Save an array to a binary file in NumPy format
    savez : Save several arrays into an .npz compressed archive

    Notes
    -----
    Further explanation of the `fmt` parameter
    (``%[flag]width[.precision]specifier``):

    flags:
        ``-`` : left justify

        ``+`` : Forces to precede result with + or -.

        ``0`` : Left pad the number with zeros instead of space (see width).

    width:
        Minimum number of characters to be printed.  The value is not
        truncated if it has more characters.

    precision:
        - For integer specifiers (e.g. ``d,i,o,x``), the minimum number of
          digits.
        - For ``e, E`` and ``f`` specifiers, the number of digits to print
          after the decimal point.
        - For ``g`` and ``G``, the maximum number of significant digits.
        - For ``s``, the maximum number of characters.

    specifiers:
        ``c`` : character
        ``d`` or ``i`` : signed decimal integer
        ``e`` or ``E`` : scientific notation with ``e`` or ``E``.
        ``f`` : decimal floating point
        ``g,G`` : use the shorter of ``e,E`` or ``f``
        ``o`` : signed octal
        ``s`` : string of characters
        ``u`` : unsigned decimal integer
        ``x,X`` : unsigned hexadecimal integer

    This is not an exhaustive specification.

    Examples
    --------
    >>> savetxt('test.out', x, delimiter=',')   # X is an array
    >>> savetxt('test.out', (x, y, z))          # x, y, z equal sized 1D arrays
    >>> savetxt('test.out', x, fmt='%1.4e')     # use exponential notation

    """
    if _is_string_like(fname):
        if fname.endswith('.gz'):
            import gzip
            fh = gzip.open(fname, 'wb')
        else:
            fh = file(fname, 'w')
    elif hasattr(fname, 'seek'):
        fh = fname
    else:
        raise ValueError('fname must be a string or file handle')

    X = np.asarray(X)

    # Handle 1-dimensional arrays
    if X.ndim == 1:
        # Common case -- 1d array of numbers
        if X.dtype.names is None:
            X = np.atleast_2d(X).T
            ncol = 1
        # Complex dtype -- each field indicates a separate column
        else:
            ncol = len(X.dtype.descr)
    else:
        ncol = X.shape[1]

    # `fmt` can be a string with multiple insertion points or a
    # list of formats.  E.g. '%10.5f\t%10d' or ('%10.5f', '%10d')
    if type(fmt) in (list, tuple):
        if len(fmt) != ncol:
            raise AttributeError('fmt has wrong shape.  %s' % str(fmt))
        format = delimiter.join(fmt)
    elif type(fmt) is str:
        if fmt.count('%') == 1:
            fmt = [fmt, ] * ncol
            format = delimiter.join(fmt)
        elif fmt.count('%') != ncol:
            raise AttributeError('fmt has wrong number of %% formats.  %s'
                                 % fmt)
        else:
            format = fmt

    for row in X:
        fh.write(format % tuple(row) + '\n')
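

# A minimal sketch of the two `fmt` forms accepted by savetxt above, assuming
# the Python 2 StringIO module; writing to an in-memory buffer and the helper
# name _example_savetxt_formats are illustrative only.
def _example_savetxt_formats():
    from StringIO import StringIO
    x = np.array([[1, 1.5], [2, 2.25]])
    buf = StringIO()
    # A sequence of formats is joined with `delimiter`, one format per column.
    savetxt(buf, x, fmt=('%d', '%.2f'), delimiter=',')
    # A single format containing one '%' is repeated for every column.
    savetxt(buf, x, fmt='%.3e', delimiter=' ')
    return buf.getvalue()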
def loadtxt(fname, dtype=float, comments='#', delimiter=None, converters=None,
            skiprows=0, usecols=None, unpack=False):
    """
    Load data from a text file.

    Each row in the text file must have the same number of values.

    Parameters
    ----------
    fname : file or string
        File or filename to read.  If the filename extension is ``.gz`` or
        ``.bz2``, the file is first decompressed.
    dtype : data-type
        Data type of the resulting array.  If this is a record data-type,
        the resulting array will be 1-dimensional, and each row will be
        interpreted as an element of the array.  In this case, the number of
        columns used must match the number of fields in the data-type.
    comments : string, optional
        The character used to indicate the start of a comment.
    delimiter : string, optional
        The string used to separate values.  By default, this is any
        whitespace.
    converters : dict, optional
        A dictionary mapping column number to a function that will convert
        that column to a float.  E.g., if column 0 is a date string:
        ``converters = {0: datestr2num}``.  Converters can also be used to
        provide a default value for missing data:
        ``converters = {3: lambda s: float(s or 0)}``.
    skiprows : int, optional
        Skip the first `skiprows` lines.
    usecols : sequence, optional
        Which columns to read, with 0 being the first.  For example,
        ``usecols = (1, 4, 5)`` will extract the 2nd, 5th and 6th columns.
    unpack : bool, optional
        If True, the returned array is transposed, so that arguments may be
        unpacked using ``x, y, z = loadtxt(...)``.

    Returns
    -------
    out : ndarray
        Data read from the text file.

    See Also
    --------
    scipy.io.loadmat : reads Matlab(R) data files

    Examples
    --------
    >>> from StringIO import StringIO   # StringIO behaves like a file object
    >>> c = StringIO("0 1\\n2 3")
    >>> np.loadtxt(c)
    array([[ 0.,  1.],
           [ 2.,  3.]])

    >>> d = StringIO("M 21 72\\nF 35 58")
    >>> np.loadtxt(d, dtype={'names': ('gender', 'age', 'weight'),
    ...                      'formats': ('S1', 'i4', 'f4')})
    array([('M', 21, 72.0), ('F', 35, 58.0)],
          dtype=[('gender', '|S1'), ('age', '<i4'), ('weight', '<f4')])

    >>> c = StringIO("1,0,2\\n3,0,4")
    >>> x, y = np.loadtxt(c, delimiter=',', usecols=(0, 2), unpack=True)
    >>> x
    array([ 1.,  3.])
    >>> y
    array([ 2.,  4.])

    """
    user_converters = converters

    if usecols is not None:
        usecols = list(usecols)

    isstring = False
    if _is_string_like(fname):
        isstring = True
        if fname.endswith('.gz'):
            import gzip
            fh = seek_gzip_factory(fname)
        elif fname.endswith('.bz2'):
            import bz2
            fh = bz2.BZ2File(fname)
        else:
            fh = file(fname)
    elif hasattr(fname, 'readline'):
        fh = fname
    else:
        raise ValueError('fname must be a string or file handle')
    X = []

    def flatten_dtype(dt):
        """Unpack a structured data-type."""
        if dt.names is None:
            return [dt]
        else:
            types = []
            for field in dt.names:
                tp, bytes = dt.fields[field]
                flat_dt = flatten_dtype(tp)
                types.extend(flat_dt)
            return types

    def split_line(line):
        """Chop off comments, strip, and split at delimiter."""
        line = line.split(comments)[0].strip()
        if line:
            return line.split(delimiter)
        else:
            return []

    try:
        # Make sure we're dealing with a proper dtype
        dtype = np.dtype(dtype)
        defconv = _getconv(dtype)

        # Skip the first `skiprows` lines
        for i in xrange(skiprows):
            fh.readline()

        # Read until we find a line with some values, and use
        # it to estimate the number of columns, N.
        first_vals = None
        while not first_vals:
            first_line = fh.readline()
            if first_line == '':  # EOF reached
                raise IOError('End-of-file reached before encountering data.')
            first_vals = split_line(first_line)
        N = len(usecols or first_vals)

        dtype_types = flatten_dtype(dtype)
        if len(dtype_types) > 1:
            # We're dealing with a structured array, each field of
            # the dtype matches a column
            converters = [_getconv(dt) for dt in dtype_types]
        else:
            # All fields have the same dtype
            converters = [defconv for i in xrange(N)]

        # By preference, use the converters specified by the user
        for i, conv in (user_converters or {}).iteritems():
            if usecols:
                try:
                    i = usecols.index(i)
                except ValueError:
                    # Unused converter specified
                    continue
            converters[i] = conv

        # Parse each line, including the first
        for i, line in enumerate(itertools.chain([first_line], fh)):
            vals = split_line(line)
            if len(vals) == 0:
                continue

            if usecols:
                vals = [vals[i] for i in usecols]

            # Convert each value according to its column and store
            X.append(tuple([conv(val)
                            for (conv, val) in zip(converters, vals)]))
    finally:
        if isstring:
            fh.close()

    if len(dtype_types) > 1:
        # We're dealing with a structured array, with a dtype such as
        # [('x', int), ('y', [('s', int), ('t', float)])]
        #
        # First, create the array using a flattened dtype:
        # [('x', int), ('s', int), ('t', float)]
        #
        # Then, view the array using the specified dtype.
        X = np.array(X, dtype=np.dtype([('', t) for t in dtype_types]))
        X = X.view(dtype)
    else:
        X = np.array(X, dtype)

    X = np.squeeze(X)
    if unpack:
        return X.T
    else:
        return X
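

# A minimal sketch of loadtxt reading into a nested structured dtype, which
# exercises the flatten-then-view path above; assumes the Python 2 StringIO
# module, and the helper name _example_loadtxt_nested is illustrative only.
def _example_loadtxt_nested():
    from StringIO import StringIO
    c = StringIO("1 2 3.0\n4 5 6.0")
    dt = [('x', int), ('y', [('s', int), ('t', float)])]
    # Internally, rows are parsed with the flattened dtype
    # [('x', int), ('s', int), ('t', float)] and then viewed as `dt`.
    return loadtxt(c, dtype=dt)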