def train(fpath):
    """Train and evaluate a GraphLab factorization recommender on a ratings CSV.

    Parameters
    ----------
    fpath : str
        Path handed to ``pd.read_csv``. The file must provide the
        ``DateTime``, ``SubId``, ``UserId`` and ``Rating`` columns used below.

    Returns
    -------
    The model returned by
    ``gl.recommender.factorization_recommender.create``, fitted on the
    training split. Its evaluation on the held-out split is printed.
    """
    df = pd.read_csv(fpath)
    df = df.drop(['DateTime'], axis=1)
    df.SubId = np.object_(np.int64(df.SubId))
    df.UserId = np.object_(df.UserId)
    df.Rating = np.int64(df.Rating)

    # Drop every user with fewer than 10 ratings. The counts are computed
    # once and applied as a vectorized mask instead of a Python-level loop
    # doing one label lookup per row; row order and index labels are kept.
    ratings_per_user = df.UserId.value_counts()
    sparse_users = ratings_per_user[ratings_per_user < 10].index
    df = df[~df.UserId.isin(sparse_users)]

    sf = gl.SFrame(df)
    # Single-argument print(...) behaves the same as a Python 2 print
    # statement and is also valid Python 3.
    print('finished reading in data')
    training, test = gl.recommender.util.random_split_by_user(
        sf,
        user_id='UserId',
        item_id='SubId',
        item_test_proportion=0.2,
        random_seed=1234)
    rcmder = gl.recommender.factorization_recommender.create(
        training,
        user_id='UserId',
        item_id='SubId',
        target='Rating',
        regularization=1e-5)
    print('finished training model')
    print(rcmder.evaluate(test, target='Rating'))
    return rcmder
def train(fpath):
    """Grid-search, train and evaluate a GraphLab item-similarity recommender.

    The ratings are split into ``dataset``/``test`` and ``dataset`` is split
    again into ``training``/``validate``. Every combination of similarity
    type and threshold is fitted on ``training`` and scored by overall RMSE
    on ``validate``; the best combination is then refitted on ``dataset`` and
    its RMSE on ``test`` is printed.

    Parameters
    ----------
    fpath : str
        Path handed to ``pd.read_csv``. The file must provide the
        ``DateTime``, ``SubId``, ``UserId`` and ``Rating`` columns used below.

    Returns
    -------
    The model returned by
    ``gl.recommender.item_similarity_recommender.create`` for the best
    combination, fitted on ``dataset``.
    """
    df = pd.read_csv(fpath)
    df = df.drop(['DateTime'], axis=1)
    df.SubId = np.object_(np.int64(df.SubId))
    df.UserId = np.object_(df.UserId)
    df.Rating = np.int64(df.Rating)

    # Drop every user with fewer than 10 ratings. The counts are computed
    # once and applied as a vectorized mask instead of a Python-level loop
    # doing one label lookup per row; row order and index labels are kept.
    ratings_per_user = df.UserId.value_counts()
    sparse_users = ratings_per_user[ratings_per_user < 10].index
    df = df[~df.UserId.isin(sparse_users)]

    sf = gl.SFrame(df)
    # Single-argument print(...) behaves the same as a Python 2 print
    # statement and is also valid Python 3.
    print('finished reading in data')
    dataset, test = gl.recommender.util.random_split_by_user(
        sf,
        user_id='UserId',
        item_id='SubId',
        item_test_proportion=0.2,
        random_seed=2345)
    training, validate = gl.recommender.util.random_split_by_user(
        dataset,
        user_id='UserId',
        item_id='SubId',
        item_test_proportion=0.25,
        random_seed=3456)

    stype = ['jaccard', 'cosine', 'pearson']
    thres = [10 ** e for e in range(-8, 1)]
    res = {}
    # Start from +inf rather than an arbitrary large number so the reported
    # best RMSE is always one that was actually measured.
    min_rmse = float('inf')
    coor_min_rmse = (stype[0], thres[0])
    for j in stype:
        for i in thres:
            rcmder = gl.recommender.item_similarity_recommender.create(
                training,
                user_id='UserId',
                item_id='SubId',
                target='Rating',
                threshold=i,
                similarity_type=j)
            rmse = rcmder.evaluate(validate, metric='rmse',
                                   target='Rating')['rmse_overall']
            res[(j, i)] = rmse
            # Strict '<' keeps the first combination seen on ties.
            if rmse < min_rmse:
                min_rmse = rmse
                coor_min_rmse = (j, i)
    print(res)
    print('best combination is {} with RMSE {}'.format(coor_min_rmse, min_rmse))

    # Refit the winning combination on training + validation data.
    rcmder = gl.recommender.item_similarity_recommender.create(
        dataset,
        user_id='UserId',
        item_id='SubId',
        target='Rating',
        threshold=coor_min_rmse[1],
        similarity_type=coor_min_rmse[0])
    print('finished training model')
    print(rcmder.evaluate(test, metric='rmse', target='Rating'))
    return rcmder
Ejemplo n.º 3
0
 def test_for_object_scalar_creation(self, level=rlevel):
     """Regression test for ticket #816 (``np.object_`` construction).

     ``np.object_()`` must give ``None``, a Python scalar must come back as
     its plain Python type, and a list must come back as an object-dtype
     ndarray.
     """
     no_arg = np.object_()
     from_int = np.object_(3)
     from_float = np.object_(3.0)
     from_list = np.object_([4,5])
     from_mixed = np.object_([None, {}, []])

     assert no_arg is None
     assert type(from_int) is int
     assert type(from_float) is float
     assert type(from_list) is np.ndarray
     for arr in (from_list, from_mixed):
         assert arr.dtype == object
Ejemplo n.º 4
0
 def test_for_object_scalar_creation(self):
     import numpy as np
     import sys
     a = np.object_()
     b = np.object_(3)
     b2 = np.object_(3.0)
     c = np.object_([4, 5])
     d = np.array([None])[0]
     assert a is None
     assert type(b) is int
     assert type(b2) is float
     assert type(c) is np.ndarray
     assert c.dtype == object
     assert type(d) is type(None)
     if '__pypy__' in sys.builtin_module_names:
         skip('not implemented yet')
     e = np.object_([None, {}, []])
     assert e.dtype == object
Ejemplo n.º 5
0
 def test_isscalar_numpy_array_scalars(self):
     """``is_scalar`` must report True for each NumPy scalar built below."""
     def expect_scalar(candidate):
         self.assertTrue(is_scalar(candidate))

     expect_scalar(np.int64(1))
     expect_scalar(np.float64(1.))
     expect_scalar(np.int32(1))
     expect_scalar(np.object_('foobar'))
     expect_scalar(np.str_('foobar'))
     expect_scalar(np.unicode_(u('foobar')))
     expect_scalar(np.bytes_(b'foobar'))
     expect_scalar(np.datetime64('2014-01-01'))
     expect_scalar(np.timedelta64(1, 'h'))
Ejemplo n.º 6
0
 def test_isscalar_numpy_array_scalars(self):
     """``lib.isscalar`` must report True for each NumPy scalar built below."""
     def expect_scalar(candidate):
         self.assertTrue(lib.isscalar(candidate))

     expect_scalar(np.int64(1))
     expect_scalar(np.float64(1.0))
     expect_scalar(np.int32(1))
     expect_scalar(np.object_("foobar"))
     expect_scalar(np.str_("foobar"))
     expect_scalar(np.unicode_(u("foobar")))
     expect_scalar(np.bytes_(b"foobar"))
     expect_scalar(np.datetime64("2014-01-01"))
     expect_scalar(np.timedelta64(1, "h"))
Ejemplo n.º 7
0
 def test_generic_roundtrip(self):
     """Round-trip assorted NumPy scalars and check value and type survive."""
     samples = (
         np.int_(1),
         np.int32(-2),
         np.float_(2.5),
         np.nan,
         -np.inf,
         np.inf,
         np.datetime64('2014-01-01'),
         np.str_('foo'),
         np.unicode_('bar'),
         np.object_({'a': 'b'}),
         np.complex_(1 - 2j),
     )
     for original in samples:
         restored = self.roundtrip(original)
         assert_equal(restored, original)
         # The restored object must be an instance of the original's type.
         same_type = isinstance(restored, type(original))
         self.assertTrue(same_type)
Ejemplo n.º 8
0
def test_generic_roundtrip():
    """Round-trip assorted NumPy scalars and check value and type survive."""
    samples = (
        np.int_(1),
        np.int32(-2),
        np.float_(2.5),
        np.nan,
        -np.inf,
        np.inf,
        np.datetime64('2014-01-01'),
        np.str_('foo'),
        np.unicode_('bar'),
        np.object_({'a': 'b'}),
        np.complex_(1 - 2j),
    )
    for original in samples:
        restored = roundtrip(original)
        assert_equal(restored, original)
        # The restored object must be an instance of the original's type.
        assert isinstance(restored, type(original))
Ejemplo n.º 9
0
 def test_generic_roundtrip(self):
     """Round-trip assorted NumPy scalars and check value and type survive.

     Skipped via ``self.skip`` when ``self.should_skip`` is set.
     """
     if self.should_skip:
         return self.skip('numpy is not importable')

     samples = (
         np.int_(1),
         np.int32(-2),
         np.float_(2.5),
         np.nan,
         -np.inf,
         np.inf,
         np.datetime64('2014-01-01'),
         np.str_('foo'),
         np.unicode_('bar'),
         np.object_({'a': 'b'}),
         np.complex_(1 - 2j),
     )
     for original in samples:
         restored = self.roundtrip(original)
         assert_equal(restored, original)
         # The restored object must be an instance of the original's type.
         same_type = isinstance(restored, type(original))
         self.assertTrue(same_type)
Ejemplo n.º 10
0
 def test_generic_roundtrip(self):
     """Round-trip assorted NumPy scalars and check value and type survive.

     Skipped via ``self.skip`` when ``self.should_skip`` is set.
     """
     if self.should_skip:
         return self.skip("numpy is not importable")

     samples = (
         np.int_(1),
         np.int32(-2),
         np.float_(2.5),
         np.nan,
         -np.inf,
         np.inf,
         np.datetime64("2014-01-01"),
         np.str_("foo"),
         np.unicode_("bar"),
         np.object_({"a": "b"}),
         np.complex_(1 - 2j),
     )
     for original in samples:
         restored = self.roundtrip(original)
         assert_equal(restored, original)
         # The restored object must be an instance of the original's type.
         same_type = isinstance(restored, type(original))
         self.assertTrue(same_type)
Ejemplo n.º 11
0
                       " - Enter for yes, continue\n"
                       " - n then Enter for no, abort\n"
                       ">>> ".format(muteswanfile.name))
    if ohgoonthen == "n":
        quit()
# Materialise every row produced by birdreader.
birdtable = list(birdreader)
# The last row is expected to start with the "END" sentinel. An empty table
# or an empty last row (presumably what a blank trailing line produces --
# confirm against how birdreader is built) is treated as "END not found"
# instead of raising IndexError before the user can be asked.
if not birdtable or not birdtable[-1] or birdtable[-1][0] != "END":
    ohgoonthen = input("END row not found ({} may be malformed), continue?\n"
                       " - Enter for yes, continue\n"
                       " - n then Enter for no, abort\n"
                       ">>> ".format(birdfile.name))
    if ohgoonthen == "n":
        quit()

# Extract column headers (and print them, to make sure they're what you expect)
muteswanheader = np.object_(muteswantable[0])
print("\n" + str(muteswanheader), "\n")
birdheader = np.object_(birdtable[0])
print("\n" + str(birdheader), "\n")
# Extract body of the data table, to be indexed by [row, column] (here removes headers and "END" row)
muteswandata = np.object_(muteswantable[1:-1])
birddata = np.object_(birdtable[1:-1])
# Extract data of location column
muteswanloc = muteswandata[:, 0]
birdloc = birddata[:, 0]
# Extract population data, to be indexed by [site, year]
muteswanpop = muteswandata[:, 1:-5]
birdpop = birddata[:, 1:-5]

# Function that finds all the strings containing non-numeric characters, vectorised for use on the data.
# The pattern is compiled once here rather than inside the per-element lambda.
_nonnumeric_search = re.compile(r"[^0-9]").search
nonnumericmatcher = np.vectorize(lambda x: bool(_nonnumeric_search(x)))
Ejemplo n.º 12
0
def assert_equal_matlab_format(a, b):
    """Assert that ``a`` matches the original ``b`` under the rules below.

    Dispatches on the type of ``b`` (dict, Python collection, other
    non-numpy value, numpy value) and recurses into dict values and
    object-array elements. A failed ``assert`` statement raises
    ``AssertionError``.
    """
    # Compares a and b for equality. b is always the original. If they
    # are dictionaries, a must be a structured ndarray and they must
    # have the same set of keys, after which their values must all be
    # compared. If they are a collection type (list, tuple, set,
    # frozenset, or deque), then the comparison must be made with b
    # converted to an object array. If the original is not a numpy type
    # (isn't or doesn't inherit from np.generic or np.ndarray), then it
    # is a matter of converting it to the appropriate numpy
    # type. Otherwise, both are supposed to be numpy types. For object
    # arrays, each element must be iterated over to be compared. Then,
    # if it isn't a string type, then they must have the same dtype,
    # shape, and all elements. All strings are converted to numpy.str_
    # on read. If it is empty, it has shape (1, 0). A numpy.str_ has all
    # of its strings per row compacted together. A numpy.bytes_ string
    # has to have the same thing done, but then it needs to be converted
    # up to UTF-32 and to numpy.str_ through uint32.
    #
    # In all cases, we expect things to be at least two dimensional
    # arrays.
    if type(b) == dict:
        # dict -> structured ndarray whose field names are the dict keys;
        # element 0 of each field is compared with the dict value.
        assert type(a) == np.ndarray
        assert a.dtype.names is not None
        assert set(a.dtype.names) == set(b.keys())
        for k in b:
            assert_equal_matlab_format(a[k][0], b[k])
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        # Python collections are compared as object arrays.
        assert_equal_matlab_format(a, np.object_(list(b)))
    elif not isinstance(b, (np.generic, np.ndarray)):
        # b is a plain (non-numpy) Python value.
        if b is None:
            # It should be np.zeros(shape=(1, 0), dtype='float64')
            assert type(a) == np.ndarray
            assert a.dtype == np.dtype('float64')
            assert a.shape == (1, 0)
        elif (sys.hexversion >= 0x03000000 \
                and isinstance(b, (bytes, str, bytearray))) \
                or (sys.hexversion < 0x03000000 \
                and isinstance(b, (bytes, unicode, bytearray))):
            # String-like value (the type set depends on Python 2 vs 3).
            if len(b) == 0:
                assert_equal(a, np.zeros(shape=(1, 0), dtype='U'))
            elif isinstance(b, (bytes, bytearray)):
                assert_equal(a, np.atleast_2d(np.unicode_(b.decode())))
            else:
                assert_equal(a, np.atleast_2d(np.unicode_(b)))
        else:
            # Any other Python value: compare against its 2-D array form.
            assert_equal(a, np.atleast_2d(np.array(b)))
    else:
        # b is a numpy scalar or ndarray.
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if len(b) == 0 and (b.shape == tuple() \
                        or b.shape == (0, )):
                    # Empty string: expected as an empty (1, 0) 'U' array.
                    assert_equal(a, np.zeros(shape=(1, 0), dtype='U'))
                elif b.dtype.char == 'U':
                    # Re-view so the last axis collapses into one 'U' item
                    # per row (itemsize//4 = characters per element).
                    c = np.atleast_1d(b)
                    c = np.atleast_2d(c.view(np.dtype('U' \
                        + str(c.shape[-1]*c.dtype.itemsize//4))))
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                elif b.dtype.char == 'S':
                    # Collapse the last axis into one 'S' item per row, widen
                    # each byte to uint32, then reinterpret as a 'U' string.
                    c = np.atleast_1d(b)
                    c = c.view(np.dtype('S' \
                        + str(c.shape[-1]*c.dtype.itemsize)))
                    c = np.uint32(c.view(np.dtype('uint8')))
                    c = c.view(np.dtype('U' + str(c.shape[-1])))
                    c = np.atleast_2d(c)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                    # NOTE(review): this `pass` is a no-op.
                    pass
                else:
                    # NOTE(review): appears unreachable -- dtype.char is
                    # already known to be 'U' or 'S' in this branch.
                    c = np.atleast_2d(b)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
            else:
                c = np.atleast_2d(b)
                # An empty complex number gets turned into a real
                # number when it is stored.
                if np.prod(c.shape) == 0 \
                        and b.dtype.name.startswith('complex'):
                    c = np.real(c)
                # If it is structured, check that the field names are
                # the same, in the same order, and then go through them
                # one by one. Otherwise, make sure the dtypes and shapes
                # are the same before comparing all values.
                if b.dtype.names is None and a.dtype.names is None:
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                else:
                    assert a.dtype.names is not None
                    assert b.dtype.names is not None
                    assert set(a.dtype.names) == set(b.dtype.names)
                    assert a.dtype.names == b.dtype.names
                    a = a.flatten()
                    b = b.flatten()
                    for k in b.dtype.names:
                        for index, x in np.ndenumerate(a):
                            # NOTE(review): this delegates to
                            # assert_equal_from_matlab (defined elsewhere)
                            # rather than recursing -- confirm intended.
                            assert_equal_from_matlab(a[k][index], b[k][index])
        else:
            # Object array: same dtype/shape as the 2-D form of b, then
            # compare element by element.
            c = np.atleast_2d(b)
            assert a.dtype == c.dtype
            assert a.shape == c.shape
            for index, x in np.ndenumerate(a):
                assert_equal_matlab_format(a[index], c[index])
Ejemplo n.º 13
0
def assert_equal_none_format(a, b):
    """Assert that ``a`` matches the original ``b`` under the rules below.

    Dispatches on the type of ``b`` (dict, Python collection, other
    non-numpy value, numpy value) and recurses into dict values and
    object-array elements. A failed ``assert`` statement raises
    ``AssertionError``.
    """
    # Compares a and b for equality. b is always the original. If they
    # are dictionaries, a must be a structured ndarray and they must
    # have the same set of keys, after which their values must all be
    # compared. If they are a collection type (list, tuple, set,
    # frozenset, or deque), then the comparison must be made with b
    # converted to an object array. If the original is not a numpy type
    # (isn't or doesn't inherit from np.generic or np.ndarray), then it
    # is a matter of converting it to the appropriate numpy
    # type. Otherwise, both are supposed to be numpy types. For object
    # arrays, each element must be iterated over to be compared. Then,
    # if it isn't a string type, then they must have the same dtype,
    # shape, and all elements. If it is an empty string, then it would
    # have been stored as just a null byte (recurse to do that
    # comparison). If it is a bytes ('S') type, the dtype, shape, and
    # elements must all be the same. If it is a unicode ('U') type, we
    # must convert to uint32 and then everything can be compared.
    if type(b) == dict:
        # dict -> structured ndarray whose field names are the dict keys;
        # element 0 of each field is compared with the dict value.
        assert type(a) == np.ndarray
        assert a.dtype.names is not None
        assert set(a.dtype.names) == set(b.keys())
        for k in b:
            assert_equal_none_format(a[k][0], b[k])
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        # Python collections are compared as object arrays.
        assert_equal_none_format(a, np.object_(list(b)))
    elif not isinstance(b, (np.generic, np.ndarray)):
        # b is a plain (non-numpy) Python value.
        if b is None:
            # It should be np.float64([])
            assert type(a) == np.ndarray
            assert a.dtype == np.float64([]).dtype
            assert a.shape == (0, )
        # NOTE(review): both sides of this version check test the same
        # types, so the hexversion comparison has no effect here.
        elif (sys.hexversion >= 0x03000000 \
                and isinstance(b, (bytes, bytearray))) \
                or (sys.hexversion < 0x03000000 \
                and isinstance(b, (bytes, bytearray))):
            assert a == np.bytes_(b)
        elif (sys.hexversion >= 0x03000000 \
                and isinstance(b, str)) \
                or (sys.hexversion < 0x03000000 \
                and isinstance(b, unicode)):
            # Text string: convert to a numpy unicode scalar and recurse.
            assert_equal_none_format(a, np.unicode_(b))
        else:
            # Any other Python value: convert to a numpy scalar ([()]
            # extracts the 0-d element) and recurse.
            assert_equal_none_format(a, np.array(b)[()])
    else:
        # b is a numpy scalar or ndarray.
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if b.dtype.char == 'S' and b.shape == tuple() \
                        and len(b) == 0:
                    # Empty 0-d byte string: compare against a zero-filled
                    # 0-d 'S' array.
                    assert_equal(a, \
                        np.zeros(shape=tuple(), dtype=b.dtype.char))
                elif b.dtype.char == 'U':
                    # Unicode strings are compared through their uint32
                    # view (an empty uint32 array for an empty scalar).
                    if b.shape == tuple() and len(b) == 0:
                        c = np.uint32(())
                    else:
                        c = np.atleast_1d(b).view(np.uint32)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                else:
                    # Non-empty 'S' data: dtype, shape and values must match.
                    assert a.dtype == b.dtype
                    assert a.shape == b.shape
                    npt.assert_equal(a, b)
            else:
                assert a.dtype == b.dtype
                # Now, if b.shape is just all ones, then a.shape will
                # just be (1,). Otherwise, we need to compare the shapes
                # directly. Also, dimensions need to be squeezed before
                # comparison in this case.
                assert np.prod(a.shape) == np.prod(b.shape)
                assert a.shape == b.shape \
                    or (np.prod(b.shape) == 1 and a.shape == (1,))
                if np.prod(a.shape) == 1:
                    a = np.squeeze(a)
                    b = np.squeeze(b)
                npt.assert_equal(a, b)
        else:
            # Object array: same dtype/shape, then compare element by
            # element.
            assert a.dtype == b.dtype
            assert a.shape == b.shape
            for index, x in np.ndenumerate(a):
                assert_equal_none_format(a[index], b[index])
Ejemplo n.º 14
0
def assert_equal_none_format(a, b, options=None):
    """Assert that ``a`` matches the original ``b`` under the rules below.

    Dispatches on the type of ``b`` (dict/OrderedDict, Python collection,
    other non-numpy value, numpy value) and recurses into containers and
    object-array elements, passing ``options`` along. A failed ``assert``
    statement raises ``AssertionError``.

    ``options`` is read for ``dict_like_keys_name`` and
    ``dict_like_values_name`` when a dict has keys that cannot be used as
    field names. NOTE(review): with the default ``options=None`` that path
    would fail on the attribute access -- confirm callers always pass it.
    """
    # Compares a and b for equality. b is always the original. If they
    # are dictionaries, a must be a structured ndarray and they must
    # have the same set of keys, after which their values must all be
    # compared. If they are a collection type (list, tuple, set,
    # frozenset, or deque), then the comparison must be made with b
    # converted to an object array. If the original is not a numpy type
    # (isn't or doesn't inherit from np.generic or np.ndarray), then it
    # is a matter of converting it to the appropriate numpy
    # type. Otherwise, both are supposed to be numpy types. For object
    # arrays, each element must be iterated over to be compared. Then,
    # if it isn't a string type, then they must have the same dtype,
    # shape, and all elements. If it is an empty string, then it would
    # have been stored as just a null byte (recurse to do that
    # comparison). If it is a bytes ('S') type, the dtype, shape, and
    # elements must all be the same. If it is a unicode ('U') type, we
    # must convert to uint32 and then everything can be compared. Big
    # longs and ints get written as numpy.bytes_.
    if type(b) == dict or (sys.hexversion >= 0x2070000
                           and type(b) == collections.OrderedDict):
        assert type(a) == np.ndarray
        assert a.dtype.names is not None

        # Determine if any of the keys could not be stored as str. If
        # they all can be, then the dtype field names should be the
        # keys. Otherwise, they should be 'keys' and 'values'.
        all_str_keys = True
        # Per-version converters: tp_conv turns a key into text, and
        # tp_conv_str turns it into the form used for dtype field names
        # (UTF-8 encoded on Python 2).
        if sys.hexversion >= 0x03000000:
            tp_str = str
            tp_bytes = bytes
            converters = {tp_str: lambda x: x,
                          tp_bytes: lambda x: x.decode('UTF-8'),
                          np.bytes_:
                          lambda x: bytes(x).decode('UTF-8'),
                          np.unicode_: lambda x: str(x)}
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x)
        else:
            tp_str = unicode
            tp_bytes = str
            converters = {tp_str: lambda x: x,
                          tp_bytes: lambda x: x.decode('UTF-8'),
                          np.bytes_:
                          lambda x: bytes(x).decode('UTF-8'),
                          np.unicode_: lambda x: unicode(x)}
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x).encode('UTF-8')
        tps = tuple(converters.keys())
        for k in b.keys():
            if type(k) not in tps:
                all_str_keys = False
                break
            # Only the success of the conversion matters; k_str itself is
            # not used afterwards.
            # NOTE(review): the bare except also swallows
            # KeyboardInterrupt/SystemExit.
            try:
                k_str = tp_conv(k)
            except:
                all_str_keys = False
                break
        if all_str_keys:
            # Field names are the converted keys; element 0 of each field
            # is compared with the dict value.
            assert set(a.dtype.names) == set([tp_conv_str(k)
                                              for k in b.keys()])
            for k in b:
                assert_equal_none_format(a[tp_conv_str(k)][0],
                                         b[k], options)
        else:
            # Keys and values live in two fields named by options.
            names = (options.dict_like_keys_name,
                     options.dict_like_values_name)
            assert set(a.dtype.names) == set(names)
            keys = a[names[0]]
            values = a[names[1]]
            assert_equal_none_format(keys, tuple(b.keys()), options)
            assert_equal_none_format(values, tuple(b.values()), options)
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        # Python collections are compared as object arrays.
        assert_equal_none_format(a, np.object_(list(b)), options)
    elif not isinstance(b, (np.generic, np.ndarray)):
        # b is a plain (non-numpy) Python value.
        if b is None:
            # It should be np.float64([])
            assert type(a) == np.ndarray
            assert a.dtype == np.float64([]).dtype
            assert a.shape == (0, )
        # NOTE(review): both sides of this version check test the same
        # types, so the hexversion comparison has no effect here.
        elif (sys.hexversion >= 0x03000000 \
                and isinstance(b, (bytes, bytearray))) \
                or (sys.hexversion < 0x03000000 \
                and isinstance(b, (bytes, bytearray))):
            assert a == np.bytes_(b)
        elif (sys.hexversion >= 0x03000000 \
                and isinstance(b, str)) \
                or (sys.hexversion < 0x03000000 \
                and isinstance(b, unicode)):
            # Text string: convert to a numpy unicode scalar and recurse.
            assert_equal_none_format(a, np.unicode_(b), options)
        elif (sys.hexversion >= 0x03000000 \
                and type(b) == int) \
                or (sys.hexversion < 0x03000000 \
                and type(b) == long):
            # Integers outside the tested bounds are compared as
            # numpy.bytes_, the rest as int64.
            # NOTE(review): the bounds are not exactly the int64 range
            # [-2**63, 2**63 - 1] -- confirm they mirror the writer.
            if b > 2**63 or b < -(2**63 - 1):
                assert_equal_none_format(a, np.bytes_(b), options)
            else:
                assert_equal_none_format(a, np.int64(b), options)
        else:
            # Any other Python value: convert to a numpy scalar ([()]
            # extracts the 0-d element) and recurse.
            assert_equal_none_format(a, np.array(b)[()], options)
    else:
        # b is a numpy scalar or ndarray.
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if b.dtype.char == 'S' and b.shape == tuple() \
                        and len(b) == 0:
                    # Empty 0-d byte string: compare against a zero-filled
                    # 0-d 'S' array.
                    assert_equal(a, \
                        np.zeros(shape=tuple(), dtype=b.dtype.char), \
                        options)
                elif b.dtype.char == 'U':
                    # Unicode strings are compared through their uint32
                    # view (an empty uint32 array for an empty scalar).
                    if b.shape == tuple() and len(b) == 0:
                        c = np.uint32(())
                    else:
                        c = np.atleast_1d(b).view(np.uint32)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                else:
                    # Non-empty 'S' data: dtype, shape and values must match.
                    assert a.dtype == b.dtype
                    assert a.shape == b.shape
                    npt.assert_equal(a, b)
            else:
                # Now, if b.shape is just all ones, then a.shape will
                # just be (1,). Otherwise, we need to compare the shapes
                # directly. Also, dimensions need to be squeezed before
                # comparison in this case.
                assert np.prod(a.shape) == np.prod(b.shape)
                assert a.shape == b.shape \
                    or (np.prod(b.shape) == 1 and a.shape == (1,))
                if np.prod(a.shape) == 1:
                    a = np.squeeze(a)
                    b = np.squeeze(b)
                # If there was a null in the dtype, then it was written
                # as a Group so the field order could have changed.
                if '\\x00' in str(b.dtype):
                    assert set(a.dtype.descr) == set(b.dtype.descr)
                    # Reorder the fields of a.
                    c = np.empty(shape=b.shape, dtype=b.dtype)
                    for n in b.dtype.names:
                        c[n] = a[n]
                else:
                    c = a
                assert c.dtype == b.dtype
                # RuntimeWarnings raised during the comparison are
                # silenced.
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore', RuntimeWarning)
                    npt.assert_equal(c, b)
        else:
            # Object array: same dtype/shape, then compare element by
            # element.
            assert a.dtype == b.dtype
            assert a.shape == b.shape
            for index, x in np.ndenumerate(a):
                assert_equal_none_format(a[index], b[index], options)
Ejemplo n.º 15
0
def assert_equal_matlab_format(a, b, options=None):
    """Assert that ``a`` matches the original ``b`` under the rules below.

    Dispatches on the type of ``b`` (dict/OrderedDict, Python collection,
    other non-numpy value, numpy value) and recurses into containers and
    object-array elements, passing ``options`` along. A failed ``assert``
    statement raises ``AssertionError``.

    ``options`` is read for ``dict_like_keys_name`` and
    ``dict_like_values_name`` when a dict has keys that cannot be used as
    field names. NOTE(review): with the default ``options=None`` that path
    would fail on the attribute access -- confirm callers always pass it.
    """
    # Compares a and b for equality. b is always the original. If they
    # are dictionaries, a must be a structured ndarray and they must
    # have the same set of keys, after which their values must all be
    # compared. If they are a collection type (list, tuple, set,
    # frozenset, or deque), then the comparison must be made with b
    # converted to an object array. If the original is not a numpy type
    # (isn't or doesn't inherit from np.generic or np.ndarray), then it
    # is a matter of converting it to the appropriate numpy
    # type. Otherwise, both are supposed to be numpy types. For object
    # arrays, each element must be iterated over to be compared. Then,
    # if it isn't a string type, then they must have the same dtype,
    # shape, and all elements. All strings are converted to numpy.str_
    # on read unless they were stored as a numpy.bytes_ due to having
    # non-ASCII characters. If it is empty, it has shape (1, 0). A
    # numpy.str_ has all of its strings per row compacted together. A
    # numpy.bytes_ string has to have the same thing done, but then it
    # needs to be converted up to UTF-32 and to numpy.str_ through
    # uint32. Big longs and ints end up getting converted to UTF-16
    # uint16's when written and read back as UTF-32 numpy.unicode_.
    #
    # In all cases, we expect things to be at least two dimensional
    # arrays.
    if type(b) == dict or (sys.hexversion >= 0x2070000
                           and type(b) == collections.OrderedDict):
        assert type(a) == np.ndarray
        assert a.dtype.names is not None

        # Determine if any of the keys could not be stored as str. If
        # they all can be, then the dtype field names should be the
        # keys. Otherwise, they should be 'keys' and 'values'.
        all_str_keys = True
        # Per-version converters: tp_conv turns a key into text, and
        # tp_conv_str turns it into the form used for dtype field names
        # (UTF-8 encoded on Python 2).
        if sys.hexversion >= 0x03000000:
            tp_str = str
            tp_bytes = bytes
            converters = {tp_str: lambda x: x,
                          tp_bytes: lambda x: x.decode('UTF-8'),
                          np.bytes_:
                          lambda x: bytes(x).decode('UTF-8'),
                          np.unicode_: lambda x: str(x)}
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x)
        else:
            tp_str = unicode
            tp_bytes = str
            converters = {tp_str: lambda x: x,
                          tp_bytes: lambda x: x.decode('UTF-8'),
                          np.bytes_:
                          lambda x: bytes(x).decode('UTF-8'),
                          np.unicode_: lambda x: unicode(x)}
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x).encode('UTF-8')
        tps = tuple(converters.keys())
        for k in b.keys():
            if type(k) not in tps:
                all_str_keys = False
                break
            # Only the success of the conversion matters; k_str itself is
            # not used afterwards.
            # NOTE(review): the bare except also swallows
            # KeyboardInterrupt/SystemExit.
            try:
                k_str = tp_conv(k)
            except:
                all_str_keys = False
                break
        if all_str_keys:
            # Field names are the converted keys; element 0 of each field
            # is compared with the dict value.
            assert set(a.dtype.names) == set([tp_conv_str(k)
                                              for k in b.keys()])
            for k in b:
                assert_equal_matlab_format(a[tp_conv_str(k)][0],
                                           b[k], options)
        else:
            # Keys and values live in element 0 of two fields named by
            # options.
            names = (options.dict_like_keys_name,
                     options.dict_like_values_name)
            assert set(a.dtype.names) == set(names)
            keys = a[names[0]][0]
            values = a[names[1]][0]
            assert_equal_matlab_format(keys, tuple(b.keys()), options)
            assert_equal_matlab_format(values, tuple(b.values()),
                                       options)
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        # Python collections are compared as object arrays.
        assert_equal_matlab_format(a, np.object_(list(b)), options)
    elif not isinstance(b, (np.generic, np.ndarray)):
        # b is a plain (non-numpy) Python value.
        if b is None:
            # It should be np.zeros(shape=(1, 0), dtype='float64')
            assert type(a) == np.ndarray
            assert a.dtype == np.dtype('float64')
            assert a.shape == (1, 0)
        elif (sys.hexversion >= 0x03000000 \
                and isinstance(b, (bytes, str, bytearray))) \
                or (sys.hexversion < 0x03000000 \
                and isinstance(b, (bytes, unicode, bytearray))):
            # String-like value (the type set depends on Python 2 vs 3).
            if len(b) == 0:
                assert_equal(a, np.zeros(shape=(1, 0), dtype='U'),
                             options)
            elif isinstance(b, (bytes, bytearray)):
                # ASCII-decodable bytes are expected back as unicode;
                # anything else stays numpy.bytes_.
                # NOTE(review): the bare except catches more than decode
                # errors.
                try:
                    c = np.unicode_(b.decode('ASCII'))
                except:
                    c = np.bytes_(b)
                assert_equal(a, np.atleast_2d(c), options)
            else:
                assert_equal(a, np.atleast_2d(np.unicode_(b)), options)
        elif (sys.hexversion >= 0x03000000 \
                and type(b) == int) \
                or (sys.hexversion < 0x03000000 \
                and type(b) == long):
            # Integers outside the tested bounds are compared as
            # numpy.unicode_, the rest as int64.
            # NOTE(review): the bounds are not exactly the int64 range
            # [-2**63, 2**63 - 1] -- confirm they mirror the writer.
            if b > 2**63 or b < -(2**63 - 1):
                assert_equal(a, np.atleast_2d(np.unicode_(b)), options)
            else:
                assert_equal(a, np.atleast_2d(np.int64(b)), options)
        else:
            # Any other Python value: compare against its 2-D array form.
            assert_equal(a, np.atleast_2d(np.array(b)), options)
    else:
        # b is a numpy scalar or ndarray.
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if len(b) == 0 and (b.shape == tuple() \
                        or b.shape == (0, )):
                    # Empty string: expected as an empty (1, 0) 'U' array.
                    assert_equal(a, np.zeros(shape=(1, 0),
                                             dtype='U'), options)
                elif b.dtype.char == 'U':
                    # Re-view so the last axis collapses into one 'U' item
                    # per row (itemsize//4 = characters per element).
                    c = np.atleast_1d(b)
                    c = np.atleast_2d(c.view(np.dtype('U' \
                        + str(c.shape[-1]*c.dtype.itemsize//4))))
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                elif b.dtype.char == 'S':
                    c = np.atleast_1d(b).view(np.ndarray)
                    # Only when every byte is < 128 are the bytes widened
                    # to uint32 and reinterpreted as a 'U' string; otherwise
                    # c stays as bytes.
                    if np.all(c.view(np.uint8) < 128):
                        c = c.view(np.dtype('S' \
                            + str(c.shape[-1]*c.dtype.itemsize)))
                        # NOTE(review): the next line's uint8 view is
                        # repeated inside the following statement.
                        c = c.view(np.dtype('uint8'))
                        c = np.uint32(c.view(np.dtype('uint8')))
                        c = c.view(np.dtype('U' + str(c.shape[-1])))
                    c = np.atleast_2d(c)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                    # NOTE(review): this `pass` is a no-op.
                    pass
                else:
                    # NOTE(review): appears unreachable -- dtype.char is
                    # already known to be 'U' or 'S' in this branch.
                    c = np.atleast_2d(b)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    with warnings.catch_warnings():
                        warnings.simplefilter('ignore', RuntimeWarning)
                        npt.assert_equal(a, c)
            else:
                c = np.atleast_2d(b)
                # An empty complex number gets turned into a real
                # number when it is stored.
                if np.prod(c.shape) == 0 \
                        and b.dtype.name.startswith('complex'):
                    c = np.real(c)
                # If it is structured, check that the field names are
                # the same, in the same order, and then go through them
                # one by one. Otherwise, make sure the dtypes and shapes
                # are the same before comparing all values.
                if b.dtype.names is None and a.dtype.names is None:
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    # RuntimeWarnings raised during the comparison are
                    # silenced.
                    with warnings.catch_warnings():
                        warnings.simplefilter('ignore', RuntimeWarning)
                        npt.assert_equal(a, c)
                else:
                    assert a.dtype.names is not None
                    assert b.dtype.names is not None
                    assert set(a.dtype.names) == set(b.dtype.names)
                    # The ordering of fields must be preserved if the
                    # MATLAB_fields attribute could be used, which can
                    # only be done if there are no non-ascii characters
                    # in any of the field names.
                    if sys.hexversion >= 0x03000000:
                        allfields = ''.join(b.dtype.names)
                    else:
                        allfields = unicode('').join( \
                            [nm.decode('UTF-8') \
                            for nm in b.dtype.names])
                    if np.all(np.array([ord(ch) < 128 \
                            for ch in allfields])):
                        assert a.dtype.names == b.dtype.names
                    a = a.flatten()
                    b = b.flatten()
                    for k in b.dtype.names:
                        for index, x in np.ndenumerate(a):
                            # NOTE(review): this delegates to
                            # assert_equal_from_matlab (defined elsewhere)
                            # rather than recursing -- confirm intended.
                            assert_equal_from_matlab(a[k][index],
                                                     b[k][index],
                                                     options)
        else:
            # Object array: same dtype/shape as the 2-D form of b, then
            # compare element by element.
            c = np.atleast_2d(b)
            assert a.dtype == c.dtype
            assert a.shape == c.shape
            for index, x in np.ndenumerate(a):
                assert_equal_matlab_format(a[index], c[index], options)
Ejemplo n.º 16
0
def vcf2numpy(filename,
              return_mat=True,
              return_taxa=True,
              return_vrnt_chrgrp=True,
              return_vrnt_phypos=True,
              return_vrnt_name=True):
    """
    Extract information from a vcf file.

    Parameters
    ----------
    filename : str
        String indicating VCF file name.
    return_mat : bool
        Whether to return the genotype matrix. The genotype matrix is formatted
        as (m x n x p) where m is the number of chromosome phases (2, diploid
        for almost all cases), n is the number of taxa/individuals, p is the
        number of genomic markers/variants.
    return_taxa : bool
        Whether to return the taxa/individual name array.
    return_vrnt_chrgrp : bool
        Whether to return the variant chromosome number array. This is the
        chromosome number the marker/variant is assigned.
    return_vrnt_phypos : bool
        Whether to return the variant chromosome physical position array. This
        is the physical position on the assigned chromosome for the
        marker/variant.
    return_vrnt_name : bool
        Whether to return the variant name array.

    Returns
    -------
    out : dict
        A dictionary containing the desired data types. A field is only
        present when its ``return_*`` flag is true. Possible fields are:
            Field         | Data type     | Description
            --------------|---------------|----------------
            "mat"         | numpy.int8    | genotype matrix
            "taxa"        | numpy.object_ | taxa/individual name array
            "vrnt_chrgrp" | numpy.int64   | variant chromosome number array
            "vrnt_phypos" | numpy.int64   | variant chromosome physical position array
            "vrnt_name"   | numpy.object_ | variant name array

    Notes
    -----
    Chromosome names are converted with ``int()``, so a CHROM value that is
    not an integer literal raises ``ValueError`` when ``return_vrnt_chrgrp``
    is true.
    """
    # make VCF iterator from the file name given by the caller
    vcf = cyvcf2.VCF(filename)

    # extract taxa names from vcf header
    taxa = vcf.samples

    # make empty lists to store extracted values
    mat = []
    vrnt_chrgrp = []
    vrnt_phypos = []
    vrnt_name = []

    # iterate through VCF file and accumulate variants; only the requested
    # fields are collected so unused lists stay empty
    for variant in vcf:
        if return_vrnt_chrgrp:
            # append chromosome integer
            vrnt_chrgrp.append(int(variant.CHROM))

        if return_vrnt_phypos:
            # append variant position coordinates
            vrnt_phypos.append(variant.POS)

        if return_vrnt_name:
            # append marker name
            vrnt_name.append(str(variant.ID))

        if return_mat:
            # extract allele states + whether they are phased or not
            phases = numpy.int8(variant.genotypes)

            # append genotype states: keep the first two columns (the two
            # allele states) and drop whatever follows them
            mat.append(phases[:, 0:2].copy())

    # construct a dictionary of values
    out_dict = {}

    if return_mat:
        # the stacked list is indexed (variant, taxon, phase); reverse the
        # axes to get the documented (phase, taxon, variant) layout
        out_dict["mat"] = numpy.int8(mat).transpose(2, 1, 0)
    if return_taxa:
        out_dict["taxa"] = numpy.object_(taxa)  # convert to object array
    if return_vrnt_chrgrp:
        out_dict["vrnt_chrgrp"] = numpy.int64(
            vrnt_chrgrp)  # convert to int64 array
    if return_vrnt_phypos:
        out_dict["vrnt_phypos"] = numpy.int64(
            vrnt_phypos)  # convert to int64 array
    if return_vrnt_name:
        out_dict["vrnt_name"] = numpy.object_(
            vrnt_name)  # convert to object array

    # return output dictionary
    return out_dict
Ejemplo n.º 17
0
# Default value of every output column, keyed by column name.
# For Done, 0 represents incomplete, 1 represents complete for count check, lasers check and cover check
coldefaults = dict(
    [("Frame", np.arange(capn))]
    + [(col, np.nan) for col in dfenv.columns]
    + [("ScaleOK", -1)]
    + [(i, np.nan) for i in scalestats]
    + [(name, 0) for name in faunanames]
    + [(covertype, 0) for covertype in covertypes]
    + [("Done", 0), ("LastEdited", "nan")]
)
try:
    # Filenames are of format "AllData-Z.csv", where Z is the version nr.
    # The file that sorts last under getfileno is the one loaded.
    csvnames = sorted(glob("AllData-*.csv"), key=getfileno)
    if not csvnames:
        # Placeholder so the "No ... found" message in the handler below
        # still has something to print.
        csvinname = "Matching Files"
        raise FileNotFoundError
    csvinname = csvnames[-1]
    dfout = pd.read_csv(csvinname)
    csvoutname = "AllData-{}.csv".format(getfileno(csvinname) + 1)
    coldefaultskeys = np.object_(list(coldefaults))
    colsinmask = np.isin(coldefaultskeys, dfout.columns)
    if not colsinmask.all():
        # The loaded file lacks some expected columns: add each one with
        # its default value.
        print(coldefaultskeys[~colsinmask], "not found, adding")
        dfout = dfout.assign(**dict((key, coldefaults[key])
                                    for key in coldefaultskeys[~colsinmask]))
    print("Loaded in", csvinname)
except FileNotFoundError:
    # Nothing to continue from: start version 0 from the defaults alone.
    csvoutname = "AllData-0.csv"
    print("No {} found, creating new DataFrame".format(csvinname))
    print("{} frames displayed and in".format(np.ceil(capn / skipspeed)), csvoutname)
    dfout = pd.DataFrame(coldefaults)
# Row position of the first row whose Frame value equals pos.
dataoutindex = np.argwhere(dfout["Frame"] == pos)[0][0]

# TODO: (After assignment) add help interface, automatic graph scaling, etc
try:
    while True:
Ejemplo n.º 18
0
                                             '/..'
                                             '/..')
        table = read_html(table.get_attribute("innerHTML"))[0]
        for i in range(3):
            for j in range(5):
                backyearbutton.click()
            table = driver.find_element_by_xpath('//table[@class="maintable"]'
                                                 '/tbody[@id="wr_webs_report"]'
                                                 '/..'
                                                 '/..')
            table = read_html(table.get_attribute("innerHTML"))[0]
            muteswantable = pd.concat([table, table[table.columns[2:7]]], axis=1)
        cols = muteswantable.columns.tolist()
        cols = [cols[0]] + cols[12:] + cols[2:7] + cols[8:11]
        muteswantable = muteswantable[cols]
        muteswantable = np.object_(muteswantable)

        for row in muteswantable:
            muteswanwriter.writerow(row)

        if muteswanpage == totalmuteswanpages:
            muteswanwriter.writerow(["END"] * 12)
            break
        else:
            nextpagebutton.click()
            muteswanpage += 1
    # winsound.Beep(2500, 1000)
    muteswanfile.close()
    print(" - Data table extracted.\n\n", muteswantable, "\n")

    # Finds the location dropdown menu and clicks it
Ejemplo n.º 19
0
                maskname="at sites with WWT centres")
        showsave("WWT site " + goosenames[k])
    for k in range(len(ducknames)):
        plotpop(ducknames[k],
                duckpops[k],
                ducklocs[k],
                selectsites(ducklocs[k], WWTsitenames),
                maskname="at sites with WWT centres")
        showsave("WWT site " + ducknames[k])

# Plot the population of all swan species at each site with a WWT centre
if "WWTcombinedpop" in whichplots:
    for sitename in WWTsitenames:
        plotpop("Swan",
                np.object_([
                    swanpops[k][swanlocs[k] == sitename]
                    for k in range(len(swannames))
                ]),
                swannames,
                maskname="at " + sitename)
        showsave("Swan Species at " + sitename)
        plotpop("Goose",
                np.object_([
                    goosepops[k][gooselocs[k] == sitename]
                    for k in range(len(goosenames))
                ]),
                goosenames,
                maskname="at " + sitename)
        showsave("Goose Species at " + sitename)
        plotpop("Duck",
                np.object_([
                    duckpops[k][ducklocs[k] == sitename]
Ejemplo n.º 20
0
def assert_equal_matlab_format(a, b, options=None):
    """Recursively assert that ``a`` is the MATLAB-format form of ``b``.

    Parameters
    ----------
    a : object
        The value being checked (expected to be a numpy type).
    b : object
        The original value that ``a`` is compared against.
    options : object, optional
        Passed through to every nested comparison. Its
        ``dict_like_keys_name`` and ``dict_like_values_name`` attributes
        are read only when ``b`` is a dict with a key that cannot be used
        as a field name.

    Raises
    ------
    AssertionError
        If ``a`` does not match the form expected for ``b``.
    """
    # Compares a and b for equality. b is always the original. If they
    # are dictionaries, a must be a structured ndarray and they must
    # have the same set of keys, after which their values must all be
    # compared. If they are a collection type (list, tuple, set,
    # frozenset, or deque), then the comparison must be made with b
    # converted to an object array. If the original is not a numpy type
    # (isn't or doesn't inherit from np.generic or np.ndarray), then it
    # is a matter of converting it to the appropriate numpy
    # type. Otherwise, both are supposed to be numpy types. For object
    # arrays, each element must be iterated over to be compared. Then,
    # if it isn't a string type, then they must have the same dtype,
    # shape, and all elements. All strings are converted to numpy.str_
    # on read unless they were stored as a numpy.bytes_ due to having
    # non-ASCII characters. If it is empty, it has shape (1, 0). A
    # numpy.str_ has all of its strings per row compacted together. A
    # numpy.bytes_ string has to have the same thing done, but then it
    # needs to be converted up to UTF-32 and to numpy.str_ through
    # uint32. Big longs and ints end up getting converted to UTF-16
    # uint16's when written and read back as UTF-32 numpy.unicode_.
    #
    # In all cases, we expect things to be at least two dimensional
    # arrays.

    # np.unicode_ was removed in NumPy 2.0, where np.str_ is the same
    # type. Older NumPy (including every release usable from Python 2,
    # where np.str_ is the bytes type) still has np.unicode_, so it is
    # preferred when present.
    np_unicode = getattr(np, 'unicode_', np.str_)

    if type(b) == dict or (sys.hexversion >= 0x2070000
                           and type(b) == collections.OrderedDict):
        assert_equal_nose(type(a), np.ndarray)
        assert a.dtype.names is not None

        # Determine if any of the keys could not be stored as str. If
        # they all can be, then the dtype field names should be the
        # keys. Otherwise, they should be 'keys' and 'values'.
        all_str_keys = True
        if sys.hexversion >= 0x03000000:
            tp_str = str
            tp_bytes = bytes
            converters = {
                tp_str: lambda x: x,
                tp_bytes: lambda x: x.decode('UTF-8'),
                np.bytes_: lambda x: bytes(x).decode('UTF-8'),
                np_unicode: lambda x: str(x)
            }
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x)
        else:
            tp_str = unicode
            tp_bytes = str
            converters = {
                tp_str: lambda x: x,
                tp_bytes: lambda x: x.decode('UTF-8'),
                np.bytes_: lambda x: bytes(x).decode('UTF-8'),
                np_unicode: lambda x: unicode(x)
            }
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x).encode('UTF-8')
        tps = tuple(converters.keys())
        for k in b.keys():
            if type(k) not in tps:
                all_str_keys = False
                break
            try:
                # Only the success of the conversion matters here.
                tp_conv(k)
            except UnicodeError:
                all_str_keys = False
                break
        if all_str_keys:
            assert_equal_nose(set(a.dtype.names),
                              set([tp_conv_str(k) for k in b.keys()]))
            for k in b:
                assert_equal_matlab_format(a[tp_conv_str(k)][0], b[k], options)
        else:
            names = (options.dict_like_keys_name,
                     options.dict_like_values_name)
            assert_equal_nose(set(a.dtype.names), set(names))
            keys = a[names[0]][0]
            values = a[names[1]][0]
            assert_equal_matlab_format(keys, tuple(b.keys()), options)
            assert_equal_matlab_format(values, tuple(b.values()), options)
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        assert_equal_matlab_format(a, np.object_(list(b)), options)
    elif not isinstance(b, (np.generic, np.ndarray)):
        if b is None:
            # It should be np.zeros(shape=(1, 0), dtype='float64')
            assert_equal_nose(type(a), np.ndarray)
            assert_equal_nose(a.dtype, np.dtype('float64'))
            assert_equal_nose(a.shape, (1, 0))
        elif (sys.hexversion >= 0x03000000 \
                and isinstance(b, (bytes, str, bytearray))) \
                or (sys.hexversion < 0x03000000 \
                and isinstance(b, (bytes, unicode, bytearray))):
            if len(b) == 0:
                assert_equal(a, np.zeros(shape=(1, 0), dtype='U'), options)
            elif isinstance(b, (bytes, bytearray)):
                # ASCII-only bytes are expected back as a unicode string;
                # anything else is expected back as bytes.
                try:
                    c = np_unicode(b.decode('ASCII'))
                except UnicodeError:
                    c = np.bytes_(b)
                assert_equal(a, np.atleast_2d(c), options)
            else:
                assert_equal(a, np.atleast_2d(np_unicode(b)), options)
        elif (sys.hexversion >= 0x03000000 \
                and type(b) == int) \
                or (sys.hexversion < 0x03000000 \
                and type(b) == long):
            # NOTE(review): these bounds are one off from the int64 range
            # (2**63 itself reaches np.int64() below and overflows, and
            # -2**63 is treated as too big). Left unchanged in case it
            # deliberately mirrors the writer - confirm.
            if b > 2**63 or b < -(2**63 - 1):
                assert_equal(a, np.atleast_2d(np_unicode(b)), options)
            else:
                assert_equal(a, np.atleast_2d(np.int64(b)), options)
        else:
            assert_equal(a, np.atleast_2d(np.array(b)), options)
    else:
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if len(b) == 0 and (b.shape == tuple() \
                        or b.shape == (0, )):
                    assert_equal(a, np.zeros(shape=(1, 0), dtype='U'), options)
                elif b.dtype.char == 'U':
                    # Merge the strings of the last axis into one string
                    # per row (4 bytes per UTF-32 character).
                    c = np.atleast_1d(b)
                    c = np.atleast_2d(c.view(np.dtype('U' \
                        + str(c.shape[-1]*c.dtype.itemsize//4))))
                    assert_equal_nose(a.dtype, c.dtype)
                    assert_equal_nose(a.shape, c.shape)
                    npt.assert_equal(a, c)
                elif b.dtype.char == 'S':
                    c = np.atleast_1d(b).view(np.ndarray)
                    # Pure ASCII bytes are widened byte -> uint32 -> 'U';
                    # otherwise c stays a bytes array.
                    if np.all(c.view(np.uint8) < 128):
                        c = c.view(np.dtype('S' \
                            + str(c.shape[-1]*c.dtype.itemsize)))
                        c = c.view(np.dtype('uint8'))
                        c = np.uint32(c.view(np.dtype('uint8')))
                        c = c.view(np.dtype('U' + str(c.shape[-1])))
                    c = np.atleast_2d(c)
                    assert_equal_nose(a.dtype, c.dtype)
                    assert_equal_nose(a.shape, c.shape)
                    npt.assert_equal(a, c)
                else:
                    # Defensive only: dtype.char is 'U' or 'S' here, and
                    # both are handled above.
                    c = np.atleast_2d(b)
                    assert_equal_nose(a.dtype, c.dtype)
                    assert_equal_nose(a.shape, c.shape)
                    with warnings.catch_warnings():
                        warnings.simplefilter('ignore', RuntimeWarning)
                        npt.assert_equal(a, c)
            else:
                c = np.atleast_2d(b)
                # An empty complex number gets turned into a real
                # number when it is stored.
                if np.prod(c.shape) == 0 \
                        and b.dtype.name.startswith('complex'):
                    c = np.real(c)
                # If it is structured, check that the field names are
                # the same, in the same order, and then go through them
                # one by one. Otherwise, make sure the dtypes and shapes
                # are the same before comparing all values.
                if b.dtype.names is None and a.dtype.names is None:
                    assert_equal_nose(a.dtype, c.dtype)
                    assert_equal_nose(a.shape, c.shape)
                    with warnings.catch_warnings():
                        warnings.simplefilter('ignore', RuntimeWarning)
                        npt.assert_equal(a, c)
                else:
                    assert a.dtype.names is not None
                    assert b.dtype.names is not None
                    assert_equal_nose(set(a.dtype.names), set(b.dtype.names))
                    # The ordering of fields must be preserved if the
                    # MATLAB_fields attribute could be used, which can
                    # only be done if there are no non-ascii characters
                    # in any of the field names.
                    if sys.hexversion >= 0x03000000:
                        allfields = ''.join(b.dtype.names)
                    else:
                        allfields = unicode('').join( \
                            [nm.decode('UTF-8') \
                            for nm in b.dtype.names])
                    if np.all(np.array([ord(ch) < 128 \
                            for ch in allfields])):
                        assert_equal_nose(a.dtype.names, b.dtype.names)
                    a = a.flatten()
                    b = b.flatten()
                    for k in b.dtype.names:
                        for index, x in np.ndenumerate(a):
                            assert_equal_from_matlab(a[k][index], b[k][index],
                                                     options)
        else:
            # Object arrays: compare element by element.
            c = np.atleast_2d(b)
            assert_equal_nose(a.dtype, c.dtype)
            assert_equal_nose(a.shape, c.shape)
            for index, x in np.ndenumerate(a):
                assert_equal_matlab_format(a[index], c[index], options)
Ejemplo n.º 21
0
                       o["data-reg"], o["data-migratoriness"], o["data-ranginess"]]
                      for o in iddatasoup.find_all("option")[1:]]
        print(" -  - Writing site names to 'birdnames.csv'...")
        # Save these to the file for later use
        birdnameswriter = csv.writer(birdiddatafile)
        birdnameswriter.writerow(["Name", "ID", "Taxon", "IsSummerMigrant", "MigrantStatus", "Range"])
        for row in birdiddata:
            print(row)
            birdnameswriter.writerow(row)
    else:
        # Otherwise read the names from a previously saved file
        birdiddatafile = open("birdnames.csv", "r", encoding="utf-8", newline="")
        birdiddata = [row for row in csv.reader(birdiddatafile)]
    # Always sure to close the file afterwards (Although python would probably clean up otherwise anyway)
    birdiddatafile.close()
    birdiddata = np.object_(birdiddata[1:])
    print(birdiddata)
    birdnames = birdiddata[:, 0]
    print(" -  - Bird names etc ready.\n\n", birdnames, "\n - Bird names etc extracted.")

    while True:
        # Take input of regex to match a set of bird names whose tables to download
        birdname = input("\nEnter bird name or regex pattern, case insensitive\n>>> ")
        print("Finding bird name matches...")
        matcher = re.compile(birdname, re.IGNORECASE)
        chosenbirdnames = list(filter(matcher.search, birdnames))
        if len(chosenbirdnames) == 0:
            print(" - No bird name matches found.")
            # Simply go back and ask again if none match the request
            continue
        print(" - Bird name matches found ({}).\n\n".format(len(chosenbirdnames)), "\n".join(chosenbirdnames), sep="")
Ejemplo n.º 22
0
def assert_equal_none_format(a, b, options=None):
    """Recursively assert that ``a`` is the expected stored form of ``b``.

    Parameters
    ----------
    a : object
        The value being checked (expected to be a numpy type).
    b : object
        The original value that ``a`` is compared against.
    options : object, optional
        Passed through to every nested comparison. Its
        ``dict_like_keys_name`` and ``dict_like_values_name`` attributes
        are read only when ``b`` is a dict with a key that cannot be used
        as a field name.

    Raises
    ------
    AssertionError
        If ``a`` does not match the form expected for ``b``.
    """
    # Compares a and b for equality. b is always the original. If they
    # are dictionaries, a must be a structured ndarray and they must
    # have the same set of keys, after which their values must all be
    # compared. If they are a collection type (list, tuple, set,
    # frozenset, or deque), then the comparison must be made with b
    # converted to an object array. If the original is not a numpy type
    # (isn't or doesn't inherit from np.generic or np.ndarray), then it
    # is a matter of converting it to the appropriate numpy
    # type. Otherwise, both are supposed to be numpy types. For object
    # arrays, each element must be iterated over to be compared. Then,
    # if it isn't a string type, then they must have the same dtype,
    # shape, and all elements. If it is an empty string, then it would
    # have been stored as just a null byte (recurse to do that
    # comparison). If it is a bytes_ type, the dtype, shape, and
    # elements must all be the same. If it is string_ type, we must
    # convert to uint32 and then everything can be compared. Big longs
    # and ints get written as numpy.bytes_.

    # np.unicode_ was removed in NumPy 2.0, where np.str_ is the same
    # type. Older NumPy (including every release usable from Python 2,
    # where np.str_ is the bytes type) still has np.unicode_, so it is
    # preferred when present.
    np_unicode = getattr(np, 'unicode_', np.str_)

    if type(b) == dict or (sys.hexversion >= 0x2070000
                           and type(b) == collections.OrderedDict):
        assert_equal_nose(type(a), np.ndarray)
        assert a.dtype.names is not None

        # Determine if any of the keys could not be stored as str. If
        # they all can be, then the dtype field names should be the
        # keys. Otherwise, they should be 'keys' and 'values'.
        all_str_keys = True
        if sys.hexversion >= 0x03000000:
            tp_str = str
            tp_bytes = bytes
            converters = {
                tp_str: lambda x: x,
                tp_bytes: lambda x: x.decode('UTF-8'),
                np.bytes_: lambda x: bytes(x).decode('UTF-8'),
                np_unicode: lambda x: str(x)
            }
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x)
        else:
            tp_str = unicode
            tp_bytes = str
            converters = {
                tp_str: lambda x: x,
                tp_bytes: lambda x: x.decode('UTF-8'),
                np.bytes_: lambda x: bytes(x).decode('UTF-8'),
                np_unicode: lambda x: unicode(x)
            }
            tp_conv = lambda x: converters[type(x)](x)
            tp_conv_str = lambda x: tp_conv(x).encode('UTF-8')
        tps = tuple(converters.keys())
        for k in b.keys():
            if type(k) not in tps:
                all_str_keys = False
                break
            try:
                # Only the success of the conversion matters here.
                tp_conv(k)
            except UnicodeError:
                all_str_keys = False
                break
        if all_str_keys:
            assert_equal_nose(set(a.dtype.names),
                              set([tp_conv_str(k) for k in b.keys()]))
            for k in b:
                assert_equal_none_format(a[tp_conv_str(k)][0], b[k], options)
        else:
            names = (options.dict_like_keys_name,
                     options.dict_like_values_name)
            assert set(a.dtype.names) == set(names)
            keys = a[names[0]]
            values = a[names[1]]
            assert_equal_none_format(keys, tuple(b.keys()), options)
            assert_equal_none_format(values, tuple(b.values()), options)
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        assert_equal_none_format(a, np.object_(list(b)), options)
    elif not isinstance(b, (np.generic, np.ndarray)):
        if b is None:
            # It should be np.float64([])
            assert_equal_nose(type(a), np.ndarray)
            assert_equal_nose(a.dtype, np.float64([]).dtype)
            assert_equal_nose(a.shape, (0, ))
        elif isinstance(b, (bytes, bytearray)):
            # Same check on Python 2 and 3, so no version guard is needed.
            assert_equal_nose(a, np.bytes_(b))
        elif (sys.hexversion >= 0x03000000 \
                and isinstance(b, str)) \
                or (sys.hexversion < 0x03000000 \
                and isinstance(b, unicode)):
            assert_equal_none_format(a, np_unicode(b), options)
        elif (sys.hexversion >= 0x03000000 \
                and type(b) == int) \
                or (sys.hexversion < 0x03000000 \
                and type(b) == long):
            # NOTE(review): these bounds are one off from the int64 range
            # (2**63 itself reaches np.int64() below and overflows, and
            # -2**63 is treated as too big). Left unchanged in case it
            # deliberately mirrors the writer - confirm.
            if b > 2**63 or b < -(2**63 - 1):
                assert_equal_none_format(a, np.bytes_(b), options)
            else:
                assert_equal_none_format(a, np.int64(b), options)
        else:
            assert_equal_none_format(a, np.array(b)[()], options)
    else:
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if b.dtype.char == 'S' and b.shape == tuple() \
                        and len(b) == 0:
                    assert_equal(a, \
                        np.zeros(shape=tuple(), dtype=b.dtype.char), \
                        options)
                elif b.dtype.char == 'U':
                    # Unicode strings are compared as their uint32 code
                    # points.
                    if b.shape == tuple() and len(b) == 0:
                        c = np.uint32(())
                    else:
                        c = np.atleast_1d(b).view(np.uint32)
                    assert_equal_nose(a.dtype, c.dtype)
                    assert_equal_nose(a.shape, c.shape)
                    npt.assert_equal(a, c)
                else:
                    assert_equal_nose(a.dtype, b.dtype)
                    assert_equal_nose(a.shape, b.shape)
                    npt.assert_equal(a, b)
            else:
                # Now, if b.shape is just all ones, then a.shape will
                # just be (1,). Otherwise, we need to compare the shapes
                # directly. Also, dimensions need to be squeezed before
                # comparison in this case.
                assert_equal_nose(np.prod(a.shape), np.prod(b.shape))
                assert a.shape == b.shape \
                    or (np.prod(b.shape) == 1 and a.shape == (1,))
                if np.prod(a.shape) == 1:
                    a = np.squeeze(a)
                    b = np.squeeze(b)
                # If there was a null in the dtype, then it was written
                # as a Group so the field order could have changed.
                if '\\x00' in str(b.dtype):
                    assert_equal_nose(set(a.dtype.descr), set(b.dtype.descr))
                    # Reorder the fields of a.
                    c = np.empty(shape=b.shape, dtype=b.dtype)
                    for n in b.dtype.names:
                        c[n] = a[n]
                else:
                    c = a
                assert_equal_nose(c.dtype, b.dtype)
                with warnings.catch_warnings():
                    warnings.simplefilter('ignore', RuntimeWarning)
                    npt.assert_equal(c, b)
        else:
            # Object arrays: compare element by element.
            assert_equal_nose(a.dtype, b.dtype)
            assert_equal_nose(a.shape, b.shape)
            for index, x in np.ndenumerate(a):
                assert_equal_none_format(a[index], b[index], options)
def train(fpath, min_ratings=50):
    """Tune and train a factorization recommender on a ratings CSV.

    The CSV is read with pandas and must contain the columns ``DateTime``
    (dropped), ``SubId``, ``UserId`` and ``Rating``. Users with fewer
    than ``min_ratings`` rows are removed first, then items with fewer
    than ``min_ratings`` rows among what is left. The remaining data is
    split by user into a test part and a rest part, and the rest is split
    again into training and validation parts. Every combination of
    ``num_factors`` and ``regularization`` in the grid below is trained
    on the training part and scored by overall RMSE on the validation
    part; the best combination is then retrained on training plus
    validation data and its RMSE on the test part is printed.

    Parameters
    ----------
    fpath : str
        Path of the ratings CSV file.
    min_ratings : int, optional
        Minimum number of ratings a user, and then an item, needs in
        order to be kept. Defaults to 50.

    Returns
    -------
    The recommender trained with the best hyper-parameters found.
    """
    df = pd.read_csv(fpath)
    df = df.drop(['DateTime'], axis=1)
    df.SubId = np.object_(np.int64(df.SubId))
    df.UserId = np.object_(df.UserId)
    df.Rating = np.int64(df.Rating)

    # Remove users, then items, with fewer than min_ratings ratings. The
    # item counts are taken after the user filter has been applied, so
    # the order of the two columns matters.
    for column in ('UserId', 'SubId'):
        counts = df[column].value_counts()
        rare = counts[counts < min_ratings].index
        df = df[~df[column].isin(rare)]

    sf = gl.SFrame(df)
    print('finished reading in data')
    dataset, test = gl.recommender.util.random_split_by_user(
        sf,
        user_id='UserId',
        item_id='SubId',
        item_test_proportion=0.2,
        random_seed=2345)
    training, validate = gl.recommender.util.random_split_by_user(
        dataset,
        user_id='UserId',
        item_id='SubId',
        item_test_proportion=0.25,
        random_seed=3456)

    # Hyper-parameter grid: 8..128 latent factors x 7 regularization values.
    factor_grid = [2**e for e in range(3, 8)]
    reg_grid = [1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3]
    res = {}
    min_rmse = float('inf')
    # Fall back to the first grid point if no score ever compares lower
    # (for example when every RMSE is NaN).
    coor_min_rmse = (factor_grid[0], reg_grid[0])
    for num_factors in factor_grid:
        for regularization in reg_grid:
            candidate = gl.recommender.factorization_recommender.create(
                training,
                user_id='UserId',
                item_id='SubId',
                target='Rating',
                regularization=regularization,
                num_factors=num_factors)
            rmse = candidate.evaluate(validate,
                                      metric='rmse',
                                      target='Rating')['rmse_overall']
            res[(num_factors, regularization)] = rmse
            if rmse < min_rmse:
                min_rmse = rmse
                coor_min_rmse = (num_factors, regularization)
    print(res)
    print('best combination is {} with RMSE {}'.format(coor_min_rmse, min_rmse))

    # Retrain on training + validation data with the winning settings.
    rcmder = gl.recommender.factorization_recommender.create(
        dataset,
        user_id='UserId',
        item_id='SubId',
        target='Rating',
        regularization=coor_min_rmse[1],
        num_factors=coor_min_rmse[0])
    print('finished training model')
    print(rcmder.evaluate(test, metric='rmse', target='Rating'))
    return rcmder
Ejemplo n.º 24
0
def assert_equal_none_format(a, b):
    """Recursively assert that ``a`` is the expected stored form of ``b``.

    Parameters
    ----------
    a : object
        The value being checked (expected to be a numpy type).
    b : object
        The original value that ``a`` is compared against.

    Raises
    ------
    AssertionError
        If ``a`` does not match the form expected for ``b``.
    """
    # Compares a and b for equality. b is always the original. If they
    # are dictionaries, a must be a structured ndarray and they must
    # have the same set of keys, after which their values must all be
    # compared. If they are a collection type (list, tuple, set,
    # frozenset, or deque), then the comparison must be made with b
    # converted to an object array. If the original is not a numpy type
    # (isn't or doesn't inherit from np.generic or np.ndarray), then it
    # is a matter of converting it to the appropriate numpy
    # type. Otherwise, both are supposed to be numpy types. For object
    # arrays, each element must be iterated over to be compared. Then,
    # if it isn't a string type, then they must have the same dtype,
    # shape, and all elements. If it is an empty string, then it would
    # have been stored as just a null byte (recurse to do that
    # comparison). If it is a bytes_ type, the dtype, shape, and
    # elements must all be the same. If it is string_ type, we must
    # convert to uint32 and then everything can be compared.

    # np.unicode_ was removed in NumPy 2.0, where np.str_ is the same
    # type. Older NumPy (including every release usable from Python 2,
    # where np.str_ is the bytes type) still has np.unicode_, so it is
    # preferred when present.
    np_unicode = getattr(np, 'unicode_', np.str_)

    if type(b) == dict:
        assert type(a) == np.ndarray
        assert a.dtype.names is not None
        assert set(a.dtype.names) == set(b.keys())
        for k in b:
            assert_equal_none_format(a[k][0], b[k])
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        assert_equal_none_format(a, np.object_(list(b)))
    elif not isinstance(b, (np.generic, np.ndarray)):
        if b is None:
            # It should be np.float64([])
            assert type(a) == np.ndarray
            assert a.dtype == np.float64([]).dtype
            assert a.shape == (0, )
        elif isinstance(b, (bytes, bytearray)):
            # Same check on Python 2 and 3, so no version guard is needed.
            assert a == np.bytes_(b)
        elif (sys.hexversion >= 0x03000000 \
                and isinstance(b, str)) \
                or (sys.hexversion < 0x03000000 \
                and isinstance(b, unicode)):
            assert_equal_none_format(a, np_unicode(b))
        else:
            # Any other Python scalar: compare against its numpy scalar.
            assert_equal_none_format(a, np.array(b)[()])
    else:
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if b.dtype.char == 'S' and b.shape == tuple() \
                        and len(b) == 0:
                    assert_equal(a, \
                        np.zeros(shape=tuple(), dtype=b.dtype.char))
                elif b.dtype.char == 'U':
                    # Unicode strings are compared as their uint32 code
                    # points.
                    if b.shape == tuple() and len(b) == 0:
                        c = np.uint32(())
                    else:
                        c = np.atleast_1d(b).view(np.uint32)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                else:
                    assert a.dtype == b.dtype
                    assert a.shape == b.shape
                    npt.assert_equal(a, b)
            else:
                assert a.dtype == b.dtype
                # Now, if b.shape is just all ones, then a.shape will
                # just be (1,). Otherwise, we need to compare the shapes
                # directly. Also, dimensions need to be squeezed before
                # comparison in this case.
                assert np.prod(a.shape) == np.prod(b.shape)
                assert a.shape == b.shape \
                    or (np.prod(b.shape) == 1 and a.shape == (1,))
                if np.prod(a.shape) == 1:
                    a = np.squeeze(a)
                    b = np.squeeze(b)
                npt.assert_equal(a, b)
        else:
            # Object arrays: compare element by element.
            assert a.dtype == b.dtype
            assert a.shape == b.shape
            for index, x in np.ndenumerate(a):
                assert_equal_none_format(a[index], b[index])
Ejemplo n.º 25
0
def assert_equal_matlab_format(a, b):
    # Compares a and b for equality. b is always the original. If they
    # are dictionaries, a must be a structured ndarray and they must
    # have the same set of keys, after which their values must all be
    # compared. If they are a collection type (list, tuple, set,
    # frozenset, or deque), then the comparison must be made with b
    # converted to an object array. If the original is not a numpy type
    # (isn't or doesn't inherit from np.generic or np.ndarray), then it
    # is a matter of converting it to the appropriate numpy
    # type. Otherwise, both are supposed to be numpy types. For object
    # arrays, each element must be iterated over to be compared. Then,
    # if it isn't a string type, then they must have the same dtype,
    # shape, and all elements. All strings are converted to numpy.str_
    # on read. If it is empty, it has shape (1, 0). A numpy.str_ has all
    # of its strings per row compacted together. A numpy.bytes_ string
    # has to have the same thing done, but then it needs to be converted
    # up to UTF-32 and to numpy.str_ through uint32.
    #
    # In all cases, we expect things to be at least two dimensional
    # arrays.
    #
    # Nothing is returned; a mismatch surfaces as an AssertionError
    # from one of the assert statements or assert helpers below.
    if type(b) == dict:
        assert type(a) == np.ndarray
        assert a.dtype.names is not None
        assert set(a.dtype.names) == set(b.keys())
        for k in b:
            # Only element 0 of each field is compared against the dict
            # value (presumably because a has a single element per
            # field when it was written from a dict -- TODO confirm).
            assert_equal_matlab_format(a[k][0], b[k])
    elif type(b) in (list, tuple, set, frozenset, collections.deque):
        assert_equal_matlab_format(a, np.object_(list(b)))
    elif not isinstance(b, (np.generic, np.ndarray)):
        if b is None:
            # It should be equivalent to
            # np.zeros(shape=(1, 0), dtype='float64')
            assert type(a) == np.ndarray
            assert a.dtype == np.dtype('float64')
            assert a.shape == (1, 0)
        # The version check selects the text type name that exists on
        # the running interpreter (str on Python 3, unicode on Python 2).
        elif (sys.hexversion >= 0x03000000 \
                and isinstance(b, (bytes, str, bytearray))) \
                or (sys.hexversion < 0x03000000 \
                and isinstance(b, (bytes, unicode, bytearray))):
            if len(b) == 0:
                # Empty strings of any kind are expected back as an
                # empty (1, 0) unicode array.
                assert_equal(a, np.zeros(shape=(1, 0), dtype='U'))
            elif isinstance(b, (bytes, bytearray)):
                # Byte strings are decoded so that the comparison is
                # made against a unicode value.
                assert_equal(a, np.atleast_2d(np.unicode_(b.decode())))
            else:
                assert_equal(a, np.atleast_2d(np.unicode_(b)))
        else:
            # Any other Python scalar is compared as a 2D numpy array.
            assert_equal(a, np.atleast_2d(np.array(b)))
    else:
        if b.dtype.name != 'object':
            if b.dtype.char in ('U', 'S'):
                if len(b) == 0 and (b.shape == tuple() \
                        or b.shape == (0, )):
                    assert_equal(a, np.zeros(shape=(1, 0),
                                 dtype='U'))
                elif b.dtype.char == 'U':
                    # Reinterpret the last axis as one long unicode
                    # string per row: the new string length is the total
                    # number of bytes along that axis divided by 4 (each
                    # UTF-32 character takes 4 bytes).
                    c = np.atleast_1d(b)
                    c = np.atleast_2d(c.view(np.dtype('U' \
                        + str(c.shape[-1]*c.dtype.itemsize//4))))
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                elif b.dtype.char == 'S':
                    # Same per-row compaction as for 'U', followed by
                    # widening each byte to a uint32 so the buffer can
                    # be viewed as a UTF-32 unicode string.
                    c = np.atleast_1d(b)
                    c = c.view(np.dtype('S' \
                        + str(c.shape[-1]*c.dtype.itemsize)))
                    c = np.uint32(c.view(np.dtype('uint8')))
                    c = c.view(np.dtype('U' + str(c.shape[-1])))
                    c = np.atleast_2d(c)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                    pass
                else:
                    # NOTE(review): dtype.char is already known to be
                    # 'U' or 'S' here, so this branch looks unreachable.
                    c = np.atleast_2d(b)
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
            else:
                c = np.atleast_2d(b)
                # An empty complex number gets turned into a real
                # number when it is stored.
                if np.prod(c.shape) == 0 \
                        and b.dtype.name.startswith('complex'):
                    c = np.real(c)
                # If it is structured, check that the field names are
                # the same, in the same order, and then go through them
                # one by one. Otherwise, make sure the dtypes and shapes
                # are the same before comparing all values.
                if b.dtype.names is None and a.dtype.names is None:
                    assert a.dtype == c.dtype
                    assert a.shape == c.shape
                    npt.assert_equal(a, c)
                else:
                    assert a.dtype.names is not None
                    assert b.dtype.names is not None
                    assert set(a.dtype.names) == set(b.dtype.names)
                    assert a.dtype.names == b.dtype.names
                    # Both are flattened so that the same index can be
                    # used on a and b regardless of their shapes.
                    a = a.flatten()
                    b = b.flatten()
                    for k in b.dtype.names:
                        for index, x in np.ndenumerate(a):
                            # NOTE(review): the field values are checked
                            # with assert_equal_from_matlab rather than
                            # this function -- confirm this is intended.
                            assert_equal_from_matlab(a[k][index],
                                                     b[k][index])
        else:
            # Object arrays: dtype and shape must match the 2D version
            # of b, then every element is compared recursively.
            c = np.atleast_2d(b)
            assert a.dtype == c.dtype
            assert a.shape == c.shape
            for index, x in np.ndenumerate(a):
                assert_equal_matlab_format(a[index], c[index])
Ejemplo n.º 26
0
    np.timedelta64(np.iinfo(np.int64).min + 1, "ms"),
    np.timedelta64(42, "us"),
    np.timedelta64(np.iinfo(np.int64).max, "us"),
    np.timedelta64(np.iinfo(np.int64).min + 1, "us"),
    np.timedelta64(42, "ns"),
    np.timedelta64(np.iinfo(np.int64).max, "ns"),
    np.timedelta64(np.iinfo(np.int64).min + 1, "ns"),
    "",
    "one",
    "1",
    True,
    False,
    np.bool_(True),
    np.bool_(False),
    np.str_("asdf"),
    np.object_("asdf"),
]

# Decimal inputs, each built from its string literal so the stored value
# is exactly the digits written here.
DECIMAL_VALUES = [
    Decimal(literal) for literal in ("100", "0.0042", "1.0042")
]


@pytest.mark.parametrize("value", SCALAR_VALUES + DECIMAL_VALUES)
def test_scalar_host_initialization(value):
    """Check that a Scalar built from ``value`` reports that value and is valid."""
    scalar = cudf.Scalar(value)

    # The value read back from the scalar must equal the input.
    np.testing.assert_equal(scalar.value, value)
    assert scalar.is_valid() is True
def _drop_rare_values(df, column, min_count):
    """Return the rows of ``df`` whose ``column`` value occurs at least ``min_count`` times."""
    counts = df[column].value_counts()
    rare = counts[counts < min_count].index
    # Vectorised membership test instead of a Python-level loop over rows.
    return df[~df[column].isin(rare)]


def train(fpath, min_ratings=50):
    """Grid-search and train a GraphLab factorization recommender.

    The CSV at ``fpath`` is loaded, its ``DateTime`` column is dropped, and
    users and then items with fewer than ``min_ratings`` ratings are removed
    (item counts are taken after the user filter has been applied).

    The data is split with ``random_split_by_user`` into ``dataset`` / test
    (``item_test_proportion=0.2``), and ``dataset`` is split again into
    training / validation (``item_test_proportion=0.25``). Every combination
    of ``num_factors`` and ``regularization`` from the grids below is trained
    on the training part and scored by overall RMSE on the validation part.
    The best combination is then retrained on ``dataset`` and its RMSE on the
    test part is printed.

    Parameters
    ----------
    fpath : path of the CSV file; it must provide the ``DateTime``,
        ``SubId``, ``UserId`` and ``Rating`` columns used below.
    min_ratings : minimum number of ratings a user, and then an item, must
        have to be kept. Defaults to 50.

    Returns
    -------
    The recommender trained on ``dataset`` with the selected hyperparameters.
    """
    df = pd.read_csv(fpath)
    df = df.drop(['DateTime'], axis=1)
    df.SubId = np.object_(np.int64(df.SubId))
    df.UserId = np.object_(df.UserId)
    df.Rating = np.int64(df.Rating)
    # Order matters: items are counted only over the rows that survive the
    # user filter.
    df = _drop_rare_values(df, 'UserId', min_ratings)
    df = _drop_rare_values(df, 'SubId', min_ratings)

    sf = gl.SFrame(df)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print('finished reading in data')
    dataset, test = gl.recommender.util.random_split_by_user(
        sf,
        user_id='UserId',
        item_id='SubId',
        item_test_proportion=0.2,
        random_seed=2345)
    training, validate = gl.recommender.util.random_split_by_user(
        dataset,
        user_id='UserId',
        item_id='SubId',
        item_test_proportion=0.25,
        random_seed=3456)

    numf = [2 ** e for e in range(3, 8)]
    regl = [1e-6, 3e-6, 1e-5, 3e-5, 1e-4, 3e-4, 1e-3]
    res = {}
    min_rmse = float('inf')
    # Fallback used if no candidate ever scores below the initial bound.
    coor_min_rmse = (numf[0], regl[0])
    for num_factors in numf:
        for regularization in regl:
            candidate = gl.recommender.factorization_recommender.create(
                training,
                user_id='UserId',
                item_id='SubId',
                target='Rating',
                regularization=regularization,
                num_factors=num_factors)
            rmse = candidate.evaluate(validate, metric='rmse',
                                      target='Rating')['rmse_overall']
            res[(num_factors, regularization)] = rmse
            # Strict comparison: on ties the first combination tried wins.
            if rmse < min_rmse:
                min_rmse = rmse
                coor_min_rmse = (num_factors, regularization)
    print(res)
    print('best combination is {} with RMSE {}'.format(coor_min_rmse, min_rmse))

    # Retrain on training + validation data with the selected combination.
    rcmder = gl.recommender.factorization_recommender.create(
        dataset,
        user_id='UserId',
        item_id='SubId',
        target='Rating',
        regularization=coor_min_rmse[1],
        num_factors=coor_min_rmse[0])
    print('finished training model')
    print(rcmder.evaluate(test, metric='rmse', target='Rating'))
    return rcmder