Exemple #1
0
 def test_easy_dtype(self):
     "Test ndtype on dtypes"
     # Simple case
     ndtype = float
     assert_equal(easy_dtype(ndtype), np.dtype(float))
     # As string w/o names
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype),
                  np.dtype([('f0', "i4"), ('f1', "f8")]))
     # As string w/o names but different default format
     assert_equal(easy_dtype(ndtype, defaultfmt="field_%03i"),
                  np.dtype([('field_000', "i4"), ('field_001', "f8")]))
     # As string w/ names
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype, names="a, b"),
                  np.dtype([('a', "i4"), ('b', "f8")]))
     # As string w/ names (too many)
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype, names="a, b, c"),
                  np.dtype([('a', "i4"), ('b', "f8")]))
     # As string w/ names (not enough)
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype, names=", b"),
                  np.dtype([('f0', "i4"), ('b', "f8")]))
     # ... (with different default format)
     assert_equal(easy_dtype(ndtype, names="a", defaultfmt="f%02i"),
                  np.dtype([('a', "i4"), ('f00', "f8")]))
     # As list of tuples w/o names
     ndtype = [('A', int), ('B', float)]
     assert_equal(easy_dtype(ndtype), np.dtype([('A', int), ('B', float)]))
     # As list of tuples w/ names
     assert_equal(easy_dtype(ndtype, names="a,b"),
                  np.dtype([('a', int), ('b', float)]))
     # As list of tuples w/ not enough names
     assert_equal(easy_dtype(ndtype, names="a"),
                  np.dtype([('a', int), ('f0', float)]))
     # As list of tuples w/ too many names
     assert_equal(easy_dtype(ndtype, names="a,b,c"),
                  np.dtype([('a', int), ('b', float)]))
     # As list of types w/o names
     ndtype = (int, float, float)
     assert_equal(easy_dtype(ndtype),
                  np.dtype([('f0', int), ('f1', float), ('f2', float)]))
     # As list of types w names
     ndtype = (int, float, float)
     assert_equal(easy_dtype(ndtype, names="a, b, c"),
                  np.dtype([('a', int), ('b', float), ('c', float)]))
     # As simple dtype w/ names
     ndtype = np.dtype(float)
     assert_equal(easy_dtype(ndtype, names="a, b, c"),
                  np.dtype([(_, float) for _ in ('a', 'b', 'c')]))
     # As simple dtype w/o names (but multiple fields)
     ndtype = np.dtype(float)
     assert_equal(
         easy_dtype(ndtype, names=['', '', ''], defaultfmt="f%02i"),
         np.dtype([(_, float) for _ in ('f00', 'f01', 'f02')]))
Exemple #2
0
 def test_easy_dtype(self):
     "Test ndtype on dtypes"
     # Simple case
     ndtype = float
     assert_equal(easy_dtype(ndtype), np.dtype(float))
     # As string w/o names
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype), np.dtype([('f0', "i4"),
                                                ('f1', "f8")]))
     # As string w/o names but different default format
     assert_equal(easy_dtype(ndtype, defaultfmt="field_%03i"),
                  np.dtype([('field_000', "i4"), ('field_001', "f8")]))
     # As string w/ names
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype, names="a, b"),
                  np.dtype([('a', "i4"), ('b', "f8")]))
     # As string w/ names (too many)
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype, names="a, b, c"),
                  np.dtype([('a', "i4"), ('b', "f8")]))
     # As string w/ names (not enough)
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype, names=", b"),
                  np.dtype([('f0', "i4"), ('b', "f8")]))
     # ... (with different default format)
     assert_equal(easy_dtype(ndtype, names="a", defaultfmt="f%02i"),
                  np.dtype([('a', "i4"), ('f00', "f8")]))
     # As list of tuples w/o names
     ndtype = [('A', int), ('B', float)]
     assert_equal(easy_dtype(ndtype), np.dtype([('A', int), ('B', float)]))
     # As list of tuples w/ names
     assert_equal(easy_dtype(ndtype, names="a,b"),
                  np.dtype([('a', int), ('b', float)]))
     # As list of tuples w/ not enough names
     assert_equal(easy_dtype(ndtype, names="a"),
                  np.dtype([('a', int), ('f0', float)]))
     # As list of tuples w/ too many names
     assert_equal(easy_dtype(ndtype, names="a,b,c"),
                  np.dtype([('a', int), ('b', float)]))
     # As list of types w/o names
     ndtype = (int, float, float)
     assert_equal(easy_dtype(ndtype),
                  np.dtype([('f0', int), ('f1', float), ('f2', float)]))
     # As list of types w names
     ndtype = (int, float, float)
     assert_equal(easy_dtype(ndtype, names="a, b, c"),
                  np.dtype([('a', int), ('b', float), ('c', float)]))
     # As simple dtype w/ names
     ndtype = np.dtype(float)
     assert_equal(easy_dtype(ndtype, names="a, b, c"),
                  np.dtype([(_, float) for _ in ('a', 'b', 'c')]))
     # As simple dtype w/o names (but multiple fields)
     ndtype = np.dtype(float)
     assert_equal(
         easy_dtype(ndtype, names=['', '', ''], defaultfmt="f%02i"),
         np.dtype([(_, float) for _ in ('f00', 'f01', 'f02')]))
Exemple #3
0
 def test_easy_dtype(self):
     "Test ndtype on dtypes"
     # Simple case
     ndtype = float
     assert_equal(easy_dtype(ndtype), np.dtype(float))
     # As string w/o names
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype), np.dtype([("f0", "i4"), ("f1", "f8")]))
     # As string w/o names but different default format
     assert_equal(easy_dtype(ndtype, defaultfmt="field_%03i"), np.dtype([("field_000", "i4"), ("field_001", "f8")]))
     # As string w/ names
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype, names="a, b"), np.dtype([("a", "i4"), ("b", "f8")]))
     # As string w/ names (too many)
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype, names="a, b, c"), np.dtype([("a", "i4"), ("b", "f8")]))
     # As string w/ names (not enough)
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype, names=", b"), np.dtype([("f0", "i4"), ("b", "f8")]))
     # ... (with different default format)
     assert_equal(easy_dtype(ndtype, names="a", defaultfmt="f%02i"), np.dtype([("a", "i4"), ("f00", "f8")]))
     # As list of tuples w/o names
     ndtype = [("A", int), ("B", float)]
     assert_equal(easy_dtype(ndtype), np.dtype([("A", int), ("B", float)]))
     # As list of tuples w/ names
     assert_equal(easy_dtype(ndtype, names="a,b"), np.dtype([("a", int), ("b", float)]))
     # As list of tuples w/ not enough names
     assert_equal(easy_dtype(ndtype, names="a"), np.dtype([("a", int), ("f0", float)]))
     # As list of tuples w/ too many names
     assert_equal(easy_dtype(ndtype, names="a,b,c"), np.dtype([("a", int), ("b", float)]))
     # As list of types w/o names
     ndtype = (int, float, float)
     assert_equal(easy_dtype(ndtype), np.dtype([("f0", int), ("f1", float), ("f2", float)]))
     # As list of types w names
     ndtype = (int, float, float)
     assert_equal(easy_dtype(ndtype, names="a, b, c"), np.dtype([("a", int), ("b", float), ("c", float)]))
     # As simple dtype w/ names
     ndtype = np.dtype(float)
     assert_equal(easy_dtype(ndtype, names="a, b, c"), np.dtype([(_, float) for _ in ("a", "b", "c")]))
     # As simple dtype w/o names (but multiple fields)
     ndtype = np.dtype(float)
     assert_equal(
         easy_dtype(ndtype, names=["", "", ""], defaultfmt="f%02i"),
         np.dtype([(_, float) for _ in ("f00", "f01", "f02")]),
     )
Exemple #4
0
 def test_easy_dtype(self):
     "Test ndtype on dtypes"
     # Simple case
     ndtype = float
     assert_equal(easy_dtype(ndtype), np.dtype(float))
     # As string w/o names
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype), np.dtype([("f0", "i4"),
                                                ("f1", "f8")]))
     # As string w/o names but different default format
     assert_equal(
         easy_dtype(ndtype, defaultfmt="field_%03i"),
         np.dtype([("field_000", "i4"), ("field_001", "f8")]),
     )
     # As string w/ names
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype, names="a, b"),
                  np.dtype([("a", "i4"), ("b", "f8")]))
     # As string w/ names (too many)
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype, names="a, b, c"),
                  np.dtype([("a", "i4"), ("b", "f8")]))
     # As string w/ names (not enough)
     ndtype = "i4, f8"
     assert_equal(easy_dtype(ndtype, names=", b"),
                  np.dtype([("f0", "i4"), ("b", "f8")]))
     # ... (with different default format)
     assert_equal(
         easy_dtype(ndtype, names="a", defaultfmt="f%02i"),
         np.dtype([("a", "i4"), ("f00", "f8")]),
     )
     # As list of tuples w/o names
     ndtype = [("A", int), ("B", float)]
     assert_equal(easy_dtype(ndtype), np.dtype([("A", int), ("B", float)]))
     # As list of tuples w/ names
     assert_equal(easy_dtype(ndtype, names="a,b"),
                  np.dtype([("a", int), ("b", float)]))
     # As list of tuples w/ not enough names
     assert_equal(easy_dtype(ndtype, names="a"),
                  np.dtype([("a", int), ("f0", float)]))
     # As list of tuples w/ too many names
     assert_equal(easy_dtype(ndtype, names="a,b,c"),
                  np.dtype([("a", int), ("b", float)]))
     # As list of types w/o names
     ndtype = (int, float, float)
     assert_equal(easy_dtype(ndtype),
                  np.dtype([("f0", int), ("f1", float), ("f2", float)]))
     # As list of types w names
     ndtype = (int, float, float)
     assert_equal(
         easy_dtype(ndtype, names="a, b, c"),
         np.dtype([("a", int), ("b", float), ("c", float)]),
     )
     # As simple dtype w/ names
     ndtype = np.dtype(float)
     assert_equal(
         easy_dtype(ndtype, names="a, b, c"),
         np.dtype([(_, float) for _ in ("a", "b", "c")]),
     )
     # As simple dtype w/o names (but multiple fields)
     ndtype = np.dtype(float)
     assert_equal(
         easy_dtype(ndtype, names=["", "", ""], defaultfmt="f%02i"),
         np.dtype([(_, float) for _ in ("f00", "f01", "f02")]),
     )
Exemple #5
0
def genfromdta(fname, excludelist=None, missing_flt=-999., missing_str=""):
    """
    Returns an ndarray from a Stata .dta file.

    Parameters
    ----------
    fname : str or filehandle
        Path of a Stata .dta file, or an object with a ``read`` attribute.
        A path is opened in binary mode and closed again once all rows have
        been read; a handle supplied by the caller is left open.
    excludelist : optional
        Not used at present; kept so existing callers keep working.
    missing_flt : float, optional
        Value stored where a float column holds a missing (``None``) entry.
    missing_str : str, optional
        Value stored where a string column holds a missing (``None``) entry.

    Returns
    -------
    data : ndarray
        One-dimensional structured array of length ``nobs`` (taken from the
        file header) with one field per entry of the header's ``varlist``.

    Raises
    ------
    TypeError
        If `fname` is neither a string nor an object with a ``read``
        attribute.

    Notes
    -----
    The dtype is built from the Stata display formats, see
    http://www.stata.com/help.cgi?format.  Formats ending in g, e, f, h, x
    or l (with or without a trailing comma flag ``c``) become float64.
    String formats, time formats and any format the parser doesn't
    understand are converted to string; a time parser is not written yet.
    """
    # TODO: extend to get data from online
    try:
        string_types = basestring  # Python 2
    except NameError:
        string_types = str  # Python 3

    opened = None
    if isinstance(fname, string_types):
        opened = source = open(fname, 'rb')
    elif not hasattr(fname, 'read'):
        raise TypeError("The input should be a string or a filehandle. "\
                "(got %s instead)" % type(fname))
    else:
        source = fname

    try:
        fhd = StataReader(source, missing_values=False)
        # TODO: does this need to handle the byteorder?
        header = fhd.file_headers()
        nobs = header['nobs']
        varnames = header['varlist']
        # header['data_label'] and header['vlblist'] are not used: a plain
        # ndarray has nowhere to keep the labels.

        # TODO: needs to be made more flexible when change types
        formats, comma_cols = _dta_column_kinds(header['fmtlist'])
        # TODO: this is surely not the best way to handle data types
        convert_missing = {'f8': missing_flt, 's': missing_str}

        # The whole file is read first because the string widths are only
        # known once every row has been seen.
        # TODO: this is going to be miserably slow; have a closer look at
        # numpy.genfromtxt and revisit this
        rows = [_dta_clean_row(line, formats, comma_cols, convert_missing)
                for line in fhd.dataset()]
    finally:
        # Only close what this function opened itself.
        if opened is not None:
            opened.close()

    # TODO: add informative error message similar to genfromtxt
    # Size each string column to its longest value.
    for col, kind in enumerate(formats):
        if kind == 's':
            widths = [len(str(row[col])) for row in rows]
            # 'S' is the same dtype as the deprecated 'a' alias; an empty
            # dataset falls back to width 1 instead of failing in max().
            formats[col] = "S%i" % (max(widths) if widths else 1)

    # list(...) keeps this working where zip returns an iterator.
    dt = easy_dtype(list(zip(varnames, formats)))
    data = np.zeros((nobs), dtype=dt)  # init final array
    for i, row in enumerate(rows):
        data[i] = tuple(row)

    # TODO: make it possible to return plain array if all 'f8' for example
    return data


def _dta_column_kinds(fmtlist):
    """Map Stata display formats to 'f8'/'s' kinds and find comma columns.

    Returns ``(kinds, comma_cols)``: a list with ``'f8'`` or ``'s'`` for
    each format, and the set of column indices whose format carries a
    trailing comma flag.
    """
    to_flt = ('g', 'e', 'f', 'h', 'x', 'l')  # how to deal with x
                                             # and double-precision
    kinds = []
    comma_cols = set()
    for col, full_fmt in enumerate(fmtlist):
        code = full_fmt.split('.')[-1]
        if 't' in code:
            # Time formats win over the comma test: '%tc' contains a 'c'
            # but is a time format, not a comma format.
            code = 't'
        elif 'c' in code:
            comma_cols.add(col)
            code = code[:-1]
        # The slice keeps an empty code from raising; it maps to 's'.
        last = code[-1:].lower()
        kinds.append('f8' if last in to_flt else 's')
    return kinds, comma_cols


def _dta_clean_row(line, kinds, comma_cols, convert_missing):
    """Return `line` as a list with missing values filled, commas removed."""
    # doesn't handle missing value objects
    # Untested for commas and string missing
    row = list(line)
    for col, value in enumerate(row):
        if value is None:
            row[col] = convert_missing[kinds[col]]
        elif col in comma_cols:
            as_float = kinds[col] == 'f8'
            try:
                value = ''.join(value.split(','))
            except (AttributeError, TypeError):
                # sometimes a format, say gc is read as a float or int;
                # such values go to float() directly so no digits are lost
                # in a round trip through str().
                if not as_float:
                    value = str(value)
            row[col] = float(value) if as_float else value
    return row