def test_easy_dtype(self): "Test ndtype on dtypes" # Simple case ndtype = float assert_equal(easy_dtype(ndtype), np.dtype(float)) # As string w/o names ndtype = "i4, f8" assert_equal(easy_dtype(ndtype), np.dtype([('f0', "i4"), ('f1', "f8")])) # As string w/o names but different default format assert_equal(easy_dtype(ndtype, defaultfmt="field_%03i"), np.dtype([('field_000', "i4"), ('field_001', "f8")])) # As string w/ names ndtype = "i4, f8" assert_equal(easy_dtype(ndtype, names="a, b"), np.dtype([('a', "i4"), ('b', "f8")])) # As string w/ names (too many) ndtype = "i4, f8" assert_equal(easy_dtype(ndtype, names="a, b, c"), np.dtype([('a', "i4"), ('b', "f8")])) # As string w/ names (not enough) ndtype = "i4, f8" assert_equal(easy_dtype(ndtype, names=", b"), np.dtype([('f0', "i4"), ('b', "f8")])) # ... (with different default format) assert_equal(easy_dtype(ndtype, names="a", defaultfmt="f%02i"), np.dtype([('a', "i4"), ('f00', "f8")])) # As list of tuples w/o names ndtype = [('A', int), ('B', float)] assert_equal(easy_dtype(ndtype), np.dtype([('A', int), ('B', float)])) # As list of tuples w/ names assert_equal(easy_dtype(ndtype, names="a,b"), np.dtype([('a', int), ('b', float)])) # As list of tuples w/ not enough names assert_equal(easy_dtype(ndtype, names="a"), np.dtype([('a', int), ('f0', float)])) # As list of tuples w/ too many names assert_equal(easy_dtype(ndtype, names="a,b,c"), np.dtype([('a', int), ('b', float)])) # As list of types w/o names ndtype = (int, float, float) assert_equal(easy_dtype(ndtype), np.dtype([('f0', int), ('f1', float), ('f2', float)])) # As list of types w names ndtype = (int, float, float) assert_equal(easy_dtype(ndtype, names="a, b, c"), np.dtype([('a', int), ('b', float), ('c', float)])) # As simple dtype w/ names ndtype = np.dtype(float) assert_equal(easy_dtype(ndtype, names="a, b, c"), np.dtype([(_, float) for _ in ('a', 'b', 'c')])) # As simple dtype w/o names (but multiple fields) ndtype = np.dtype(float) assert_equal( 
easy_dtype(ndtype, names=['', '', ''], defaultfmt="f%02i"), np.dtype([(_, float) for _ in ('f00', 'f01', 'f02')]))
def test_easy_dtype(self): "Test ndtype on dtypes" # Simple case ndtype = float assert_equal(easy_dtype(ndtype), np.dtype(float)) # As string w/o names ndtype = "i4, f8" assert_equal(easy_dtype(ndtype), np.dtype([("f0", "i4"), ("f1", "f8")])) # As string w/o names but different default format assert_equal(easy_dtype(ndtype, defaultfmt="field_%03i"), np.dtype([("field_000", "i4"), ("field_001", "f8")])) # As string w/ names ndtype = "i4, f8" assert_equal(easy_dtype(ndtype, names="a, b"), np.dtype([("a", "i4"), ("b", "f8")])) # As string w/ names (too many) ndtype = "i4, f8" assert_equal(easy_dtype(ndtype, names="a, b, c"), np.dtype([("a", "i4"), ("b", "f8")])) # As string w/ names (not enough) ndtype = "i4, f8" assert_equal(easy_dtype(ndtype, names=", b"), np.dtype([("f0", "i4"), ("b", "f8")])) # ... (with different default format) assert_equal(easy_dtype(ndtype, names="a", defaultfmt="f%02i"), np.dtype([("a", "i4"), ("f00", "f8")])) # As list of tuples w/o names ndtype = [("A", int), ("B", float)] assert_equal(easy_dtype(ndtype), np.dtype([("A", int), ("B", float)])) # As list of tuples w/ names assert_equal(easy_dtype(ndtype, names="a,b"), np.dtype([("a", int), ("b", float)])) # As list of tuples w/ not enough names assert_equal(easy_dtype(ndtype, names="a"), np.dtype([("a", int), ("f0", float)])) # As list of tuples w/ too many names assert_equal(easy_dtype(ndtype, names="a,b,c"), np.dtype([("a", int), ("b", float)])) # As list of types w/o names ndtype = (int, float, float) assert_equal(easy_dtype(ndtype), np.dtype([("f0", int), ("f1", float), ("f2", float)])) # As list of types w names ndtype = (int, float, float) assert_equal(easy_dtype(ndtype, names="a, b, c"), np.dtype([("a", int), ("b", float), ("c", float)])) # As simple dtype w/ names ndtype = np.dtype(float) assert_equal(easy_dtype(ndtype, names="a, b, c"), np.dtype([(_, float) for _ in ("a", "b", "c")])) # As simple dtype w/o names (but multiple fields) ndtype = np.dtype(float) assert_equal( 
easy_dtype(ndtype, names=["", "", ""], defaultfmt="f%02i"), np.dtype([(_, float) for _ in ("f00", "f01", "f02")]), )
def test_easy_dtype(self): "Test ndtype on dtypes" # Simple case ndtype = float assert_equal(easy_dtype(ndtype), np.dtype(float)) # As string w/o names ndtype = "i4, f8" assert_equal(easy_dtype(ndtype), np.dtype([("f0", "i4"), ("f1", "f8")])) # As string w/o names but different default format assert_equal( easy_dtype(ndtype, defaultfmt="field_%03i"), np.dtype([("field_000", "i4"), ("field_001", "f8")]), ) # As string w/ names ndtype = "i4, f8" assert_equal(easy_dtype(ndtype, names="a, b"), np.dtype([("a", "i4"), ("b", "f8")])) # As string w/ names (too many) ndtype = "i4, f8" assert_equal(easy_dtype(ndtype, names="a, b, c"), np.dtype([("a", "i4"), ("b", "f8")])) # As string w/ names (not enough) ndtype = "i4, f8" assert_equal(easy_dtype(ndtype, names=", b"), np.dtype([("f0", "i4"), ("b", "f8")])) # ... (with different default format) assert_equal( easy_dtype(ndtype, names="a", defaultfmt="f%02i"), np.dtype([("a", "i4"), ("f00", "f8")]), ) # As list of tuples w/o names ndtype = [("A", int), ("B", float)] assert_equal(easy_dtype(ndtype), np.dtype([("A", int), ("B", float)])) # As list of tuples w/ names assert_equal(easy_dtype(ndtype, names="a,b"), np.dtype([("a", int), ("b", float)])) # As list of tuples w/ not enough names assert_equal(easy_dtype(ndtype, names="a"), np.dtype([("a", int), ("f0", float)])) # As list of tuples w/ too many names assert_equal(easy_dtype(ndtype, names="a,b,c"), np.dtype([("a", int), ("b", float)])) # As list of types w/o names ndtype = (int, float, float) assert_equal(easy_dtype(ndtype), np.dtype([("f0", int), ("f1", float), ("f2", float)])) # As list of types w names ndtype = (int, float, float) assert_equal( easy_dtype(ndtype, names="a, b, c"), np.dtype([("a", int), ("b", float), ("c", float)]), ) # As simple dtype w/ names ndtype = np.dtype(float) assert_equal( easy_dtype(ndtype, names="a, b, c"), np.dtype([(_, float) for _ in ("a", "b", "c")]), ) # As simple dtype w/o names (but multiple fields) ndtype = np.dtype(float) assert_equal( 
easy_dtype(ndtype, names=["", "", ""], defaultfmt="f%02i"), np.dtype([(_, float) for _ in ("f00", "f01", "f02")]), )
def _stata_fmt_to_code(fmt):
    """Map a Stata display-format suffix to 'f8' (numeric) or 's' (string)."""
    # Numeric display formats end in one of these letters; strings, time
    # formats and anything unrecognised are kept as strings.
    if fmt.lower()[-1:] in ('g', 'e', 'f', 'h', 'x', 'l'):
        return 'f8'
    return 's'


def _clean_stata_row(line, formats, comma_cols, missing):
    """Return a copy of one raw row with missing values and commas resolved."""
    row = list(line)
    for col, val in enumerate(row):
        if val is None:
            # Missing observation: substitute the per-type placeholder.
            row[col] = missing[formats[col]]
        elif col in comma_cols:
            # Comma formats may arrive as strings ("1,000") or, sometimes,
            # already as numbers; normalise both.
            try:
                val = val.replace(',', '')
            except AttributeError:
                val = str(val)
            if formats[col] == 'f8':
                val = float(val)
            row[col] = val
    return row


def genfromdta(fname, excludelist=None, missing_flt=-999., missing_str=""):
    """
    Returns a structured ndarray from a Stata .dta file.

    Parameters
    ----------
    fname : str or filehandle
        Path to a Stata .dta file, or an object with a ``read`` attribute.
        A file opened here from a path is closed before returning; a handle
        supplied by the caller is left open.
    excludelist : optional
        Currently unused; kept for interface compatibility.
    missing_flt : numeric
        Value stored in place of a missing observation in a float column.
    missing_str : str
        Value stored in place of a missing observation in a string column.

    Returns
    -------
    data : ndarray
        One-dimensional structured array with ``header['nobs']`` records and
        one field per Stata variable.

    Raises
    ------
    TypeError
        If `fname` is neither a string nor an object with a ``read``
        attribute.

    Notes
    -----
    The dtype is built from the Stata display formats (see
    http://www.stata.com/help.cgi?format).  Numeric formats are converted to
    float64.  String formats, time formats and any format the parser does
    not understand are converted to strings, since no time parser is
    written yet.
    """
    #TODO: extend to get data from online
    #TODO: need to write a time parser
    #TODO: add informative error message similar to genfromtxt
    # `basestring` only exists on Python 2; fall back to `str` elsewhere.
    try:
        string_types = basestring
    except NameError:
        string_types = str

    opened_here = None
    if isinstance(fname, string_types):
        opened_here = open(fname, 'rb')
        source = opened_here
    elif hasattr(fname, 'read'):
        source = fname
    else:
        raise TypeError("The input should be a string or a filehandle. "
                        "(got %s instead)" % type(fname))

    try:
        fhd = StataReader(source, missing_values=False)
        #TODO: does this need to handle the byteorder?
        header = fhd.file_headers()
        nobs = header['nobs']
        varnames = header['varlist']

        # Reduce every display format to a type code, remembering which
        # columns use a comma format.  Time formats are checked first so a
        # 'c' inside e.g. "%tc" is not mistaken for a comma flag.
        formats = []
        comma_cols = set()
        for col, raw_fmt in enumerate(header['fmtlist']):
            spec = raw_fmt.split('.')[-1]
            if 't' in spec:
                spec = 't'
            elif 'c' in spec:
                comma_cols.add(col)
                spec = spec[:-1]    # drop the trailing comma flag
            formats.append(_stata_fmt_to_code(spec))

        #TODO: this is surely not the best way to handle data types
        convert_missing = {'f8': missing_flt, 's': missing_str}

        # The whole file has to be read first to find the string lengths.
        #TODO: this is going to be miserably slow
        rows = [_clean_stata_row(line, formats, comma_cols, convert_missing)
                for line in fhd.dataset()]
    finally:
        if opened_here is not None:
            opened_here.close()

    # Size each string column to its longest value; 'S' is the byte-string
    # code (same dtype as the deprecated 'a' alias).
    field_types = []
    for col, code in enumerate(formats):
        if code == 's':
            widths = [len(str(row[col])) for row in rows]
            code = "S%i" % (max(widths) if widths else 1)
        field_types.append(code)

    dt = easy_dtype(list(zip(varnames, field_types)))
    data = np.zeros((nobs), dtype=dt)   # init final array
    for i, row in enumerate(rows):
        data[i] = tuple(row)

    #TODO: make it possible to return plain array if all 'f8' for example
    return data