Example #1
def has_non_float(expr, variables):
    '''
    Whether the given expression contains an integer or boolean variable or literal.

    Parameters
    ----------
    expr : str
        The expression to check
    variables : dict-like
        `Variable` and `Function` objects for all the identifiers used in `expr`

    Returns
    -------
    has_non_float : bool
        Whether `expr` has an integer or boolean in it
    '''
    identifiers = get_identifiers_recursively([expr], variables,
                                              include_numbers=True)
    # Check whether there is an integer literal in the expression:
    for name in identifiers:
        if name not in variables:
            try:
                int(name)
                # if this worked, this was an integer literal
                return True
            except (TypeError, ValueError):
                pass  # not an integer literal
    non_float_var = any((name in variables and
                         isinstance(variables[name], Variable) and
                         (np.issubdtype(variables[name].dtype, np.integer) or
                          np.issubdtype(variables[name].dtype, np.bool_)))
                        for name in identifiers)
    return non_float_var
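
A minimal, self-contained sketch of the literal check above, outside of the surrounding framework's machinery: an identifier counts as an integer literal exactly when int() accepts it.

def looks_like_int_literal(token):
    # mirrors the try/except above: int() accepts "3" but rejects "3.0" and names
    try:
        int(token)
        return True
    except (TypeError, ValueError):
        return False

assert looks_like_int_literal("3")
assert not looks_like_int_literal("3.0")
assert not looks_like_int_literal("tau")
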
Example #2
    def autotyped(cls, data, units=None):
        """
        Automatically choose between Component and CategoricalComponent,
        based on the input data type.

        :param data: The data to pack into a Component
        :type data: Array-like
        :param units: Optional units
        :type units: str

        :returns: A Component (or subclass)
        """
        data = np.asarray(data)

        if np.issubdtype(data.dtype, np.object_):
            return CategoricalComponent(data, units=units)

        n = coerce_numeric(data)
        thresh = 0.5
        try:
            use_categorical = np.issubdtype(data.dtype, np.character) and \
                np.isfinite(n).mean() <= thresh
        except TypeError:  # isfinite not supported. non-numeric dtype
            use_categorical = True

        if use_categorical:
            return CategoricalComponent(data, units=units)
        else:
            return Component(n, units=units)
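
A minimal sketch (not the library's actual API) of the dtype checks that autotyped relies on: object arrays go straight to the categorical branch, string arrays are candidates for it, and numeric arrays stay numeric.

import numpy as np

data_obj = np.array([{'a': 1}, None], dtype=object)   # object dtype
data_str = np.array(['x', 'y', 'z'])                  # unicode string dtype
data_num = np.array([1.0, 2.5, 3.0])                  # float dtype

print(np.issubdtype(data_obj.dtype, np.object_))      # True  -> CategoricalComponent
print(np.issubdtype(data_str.dtype, np.character))    # True  -> categorical if mostly non-numeric
print(np.issubdtype(data_num.dtype, np.character))    # False -> numeric Component
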
Example #3
    def _smoketest(self, spxlu, check, dtype):
        if np.issubdtype(dtype, np.complexfloating):
            A = self.A + 1j*self.A.T
        else:
            A = self.A

        A = A.astype(dtype)
        lu = spxlu(A)

        rng = random.RandomState(1234)

        # Input shapes
        for k in [None, 1, 2, self.n, self.n+2]:
            msg = "k=%r" % (k,)

            if k is None:
                b = rng.rand(self.n)
            else:
                b = rng.rand(self.n, k)

            if np.issubdtype(dtype, np.complexfloating):
                b = b + 1j*rng.rand(*b.shape)
            b = b.astype(dtype)

            x = lu.solve(b)
            check(A, b, x, msg)

            x = lu.solve(b, 'T')
            check(A.T, b, x, msg)

            x = lu.solve(b, 'H')
            check(A.T.conj(), b, x, msg)
Example #4
    def entrymean(self, m, axis=None):
        """Average a matrix over the given axis. If the axis is None,
        average over both rows and columns, returning a scalar.
        (via some SciPy function)
        """
        # Mimic numpy's casting.  The int32/int64 check works around numpy
        # 1.5.x behavior of np.issubdtype, see gh-2677.
        if (np.issubdtype(m.dtype, np.float_) or
                np.issubdtype(m.dtype, np.int_) or
                m.dtype in [np.dtype('int32'), np.dtype('int64')] or
                np.issubdtype(m.dtype, np.bool_)):
            res_dtype = np.float_
        elif np.issubdtype(m.dtype, np.complex_):
            res_dtype = np.complex_
        else:
            res_dtype = m.dtype

        m = m.astype(res_dtype)
        mu = m.sum(None) / m.getnnz()
        # if user or item has no ratings (stripped from training data), set to 0
        b_i = m.sum(0)
        b_u = m.sum(1)
        with np.errstate(invalid='ignore'):
            b_i = (b_i / m.getnnz(axis=0)) - mu
            b_u = (b_u.T / m.getnnz(axis=1)) - mu
        b_i[np.isnan(b_i)] = 0
        b_u[np.isnan(b_u)] = 0

        return mu, np.array(b_i)[0], np.array(b_u)[0]
Example #5
def assert_image_equal(actual, expected):
    if np.issubdtype(actual.dtype, np.integer):
        assert_equal(actual, expected)
    else:
        if np.issubdtype(expected.dtype, np.integer):
            expected = expected/float(np.iinfo(expected.dtype).max)
        assert_allclose(actual, expected, atol=1/256.)
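
A hypothetical call, assuming the assert_image_equal above is importable: when the actual image is float and the expected one is 8-bit, the integer side is rescaled to [0, 1] before the tolerance check.

import numpy as np

img_uint8 = np.array([[0, 128, 255]], dtype=np.uint8)
img_float = img_uint8 / 255.0

# actual is float, expected is integer: expected is divided by np.iinfo(np.uint8).max
assert_image_equal(img_float, img_uint8)
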
Example #6
    def sum(self, axis=None):
        """Sum the matrix over the given axis.  If the axis is None, sum
        over both rows and columns, returning a scalar.
        """
        # We use multiplication by an array of ones to achieve this.
        # For some sparse matrix formats more efficient methods are
        # possible -- these should override this function.
        m, n = self.shape

        # Mimic numpy's casting.
        if np.issubdtype(self.dtype, np.float_):
            res_dtype = np.float_
        elif (np.issubdtype(self.dtype, np.int_) or
              np.issubdtype(self.dtype, np.bool_)):
            res_dtype = np.int_
        elif np.issubdtype(self.dtype, np.complex_):
            res_dtype = np.complex_
        else:
            res_dtype = self.dtype

        if axis is None:
            # sum over rows and columns
            return (self * np.asmatrix(np.ones((n, 1), dtype=res_dtype))).sum()

        if axis < 0:
            axis += 2
        if axis == 0:
            # sum over columns
            return np.asmatrix(np.ones((1, m), dtype=res_dtype)) * self
        elif axis == 1:
            # sum over rows
            return self * np.asmatrix(np.ones((n, 1), dtype=res_dtype))
        else:
            raise ValueError("axis out of bounds")
Example #7
    def mean(self, axis=None):
        """Average the matrix over the given axis.  If the axis is None,
        average over both rows and columns, returning a scalar.
        """
        # Mimic numpy's casting.  The int32/int64 check works around numpy
        # 1.5.x behavior of np.issubdtype, see gh-2677.
        if (np.issubdtype(self.dtype, np.float_) or
                np.issubdtype(self.dtype, np.int_) or
                self.dtype in [np.dtype('int32'), np.dtype('int64')] or
                np.issubdtype(self.dtype, np.bool_)):
            res_dtype = np.float_
        elif np.issubdtype(self.dtype, np.complex_):
            res_dtype = np.complex_
        else:
            res_dtype = self.dtype

        if axis is None:
            return self.sum(None) * 1.0 / (self.shape[0]*self.shape[1])

        if axis < 0:
            axis += 2
        if axis == 0:
            mean = self.astype(res_dtype).sum(0)
            mean *= 1.0 / self.shape[0]
            return mean
        elif axis == 1:
            mean = self.astype(res_dtype).sum(1)
            mean *= 1.0 / self.shape[1]
            return mean
        else:
            raise ValueError("axis out of bounds")
Example #8
def RATWriteArray(rat, array, field, start=0):
    """
    Pure Python implementation of writing a chunk of the RAT
    from a numpy array. Type of array is coerced to one of the types
    (int, double, string) supported. Called from RasterAttributeTable.WriteArray
    """
    if array is None:
        raise ValueError("Expected an array, got None")

    # if not already an ndarray, convert it so that lists etc. are handled
    if not isinstance(array, numpy.ndarray):
        array = numpy.array(array)

    if array.ndim != 1:
        raise ValueError("Expected array of dim 1")

    if (start + array.size) > rat.GetRowCount():
        raise ValueError("Array too big to fit into RAT from start position")

    if numpy.issubdtype(array.dtype, numpy.integer):
        # some type of integer - coerce to a standard int
        # TODO: must check this is fine on all platforms
        # confusingly, numpy's default int is 64-bit even if the native int is 32-bit
        array = array.astype(numpy.int32)
    elif numpy.issubdtype(array.dtype, numpy.floating):
        # some type of floating point - coerce to double
        array = array.astype(numpy.double)
    elif numpy.issubdtype(array.dtype, numpy.character):
        # cast away any kind of Unicode etc.
        array = array.astype(numpy.character)
    else:
        raise ValueError("Array not of a supported type (integer, double or string)")

    return RATValuesIONumPyWrite(rat, field, start, array)
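
A small sketch of those coercion rules in isolation (no GDAL involved): integer columns become int32, floating columns become float64, and string columns are recognised via numpy.character.

import numpy as np

print(np.array([1, 2], dtype=np.int64).astype(np.int32).dtype)    # int32
print(np.array([1.5], dtype=np.float32).astype(np.double).dtype)  # float64
print(np.issubdtype(np.array(['a', 'bc']).dtype, np.character))   # True -> string column
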
Example #9
 def _set_dtype(self, dtype, union=False):
     if np.issubdtype(dtype, np.complexfloating) \
            or np.issubdtype(self.dtype, np.complexfloating):
         self.dtype = np.complex_
     else:
         if not union or self.dtype != np.complex_:
             self.dtype = np.float_
Example #10
def fftconvolve(in1, in2, in3=None, mode="full"):
    """Convolve two N-dimensional arrays using FFT. See convolve.

    Copied from scipy, but used here to try out an inverse filter;
    it doesn't work, or I can't get it to work.
    """
    s1 = array(in1.shape)
    s2 = array(in2.shape)
    complex_result = (np.issubdtype(in1.dtype, np.complex) or
                      np.issubdtype(in2.dtype, np.complex))
    size = s1+s2-1

    # Always use 2**n-sized FFT
    fsize = 2**np.ceil(np.log2(size))
    IN1 = fftn(in1,fsize)
    #IN1 *= fftn(in2,fsize)
    IN1 /= fftn(in2,fsize)  # use inverse filter
    # note the inverse is elementwise not matrix inverse
    # is this correct, NO  doesn't seem to work
    fslice = tuple([slice(0, int(sz)) for sz in size])
    ret = ifftn(IN1)[fslice].copy()
    del IN1
    if not complex_result:
        ret = ret.real
    if mode == "full":
        return ret
    elif mode == "same":
        if product(s1,axis=0) > product(s2,axis=0):
            osize = s1
        else:
            osize = s2
        return _centered(ret,osize)
    elif mode == "valid":
        return _centered(ret,abs(s2-s1)+1)
Example #11
 def transform_python_types(self, obj):
     """handle special scalars, default to default json encoder
     """
     # Pandas Timestamp
     if is_pandas and isinstance(obj, pd.tslib.Timestamp):
         return obj.value / 10**6.0  #nanosecond to millisecond
     elif np.issubdtype(type(obj), np.float):
         return float(obj)
     elif np.issubdtype(type(obj), np.int):
         return int(obj)
     elif np.issubdtype(type(obj), np.bool_):
         return bool(obj)
     # Datetime
     # datetime is a subclass of date.
     elif isinstance(obj, dt.datetime):
         return calendar.timegm(obj.timetuple()) * 1000. + obj.microsecond / 1000.
     # Date
     elif isinstance(obj, dt.date):
         return calendar.timegm(obj.timetuple()) * 1000.
     # Numpy datetime64
     elif isinstance(obj, np.datetime64):
         epoch_delta = obj - np.datetime64('1970-01-01T00:00:00Z')
         return (epoch_delta / np.timedelta64(1, 'ms'))
     # Time
     elif isinstance(obj, dt.time):
         return (obj.hour * 3600 + obj.minute * 60 + obj.second) * 1000 + obj.microsecond / 1000.
     elif is_dateutil and isinstance(obj, relativedelta):
         return dict(years=obj.years, months=obj.months, days=obj.days, hours=obj.hours,
             minutes=obj.minutes, seconds=obj.seconds, microseconds=obj.microseconds)
     # Decimal
     elif isinstance(obj, decimal.Decimal):
         return float(obj)
     else:
         return super(BokehJSONEncoder, self).default(obj)
Example #12
    def from_numpy_dtype(self, dt):
        """
        From Numpy dtype.

        >>> from datashape import CType
        >>> from numpy import dtype
        >>> CType.from_numpy_dtype(dtype('int32'))
        ctype("int32")
        >>> CType.from_numpy_dtype(dtype('i8'))
        ctype("int64")
        >>> CType.from_numpy_dtype(dtype('M8'))
        DateTime(None)
        >>> CType.from_numpy_dtype(dtype('U30'))
        ctype("string[30, 'U32']")
        """
        try:
            return Type.lookup_type(dt.name)
        except KeyError:
            pass
        if np.issubdtype(dt, np.datetime64):
            unit, _ = np.datetime_data(dt)
            defaults = {'D': date_, 'Y': date_, 'M': date_, 'W': date_}
            return defaults.get(unit, datetime_)
        elif np.issubdtype(dt, np.timedelta64):
            unit, _ = np.datetime_data(dt)
            return TimeDelta(unit=unit)
        elif np.issubdtype(dt, np.unicode_):
            return String(dt.itemsize // 4, 'U32')
        elif np.issubdtype(dt, np.str_) or np.issubdtype(dt, np.bytes_):
            return String(dt.itemsize, 'ascii')
        raise NotImplementedError("NumPy datatype %s not supported" % dt)
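
For reference, np.datetime_data (used above) returns the (unit, count) pair of a datetime64/timedelta64 dtype, which is what drives the date vs. datetime decision:

import numpy as np

print(np.datetime_data(np.dtype('M8[D]')))   # ('D', 1)  -> mapped to a date type above
print(np.datetime_data(np.dtype('M8[ns]')))  # ('ns', 1) -> mapped to datetime
print(np.datetime_data(np.dtype('m8[ms]')))  # ('ms', 1) -> TimeDelta(unit='ms')
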
Example #13
    def to_json(o, level=0):
        ''' format JSON with no line break between list items '''
        INDENT = 3
        SPACE = " "
        NEWLINE = "\n"

        ret = ""
        if isinstance(o, dict):
            ret += "{" + NEWLINE
            comma = ""
            for k,v in o.iteritems():
                ret += comma
                comma = ",\n"
                ret += SPACE * INDENT * (level+1)
                ret += '"' + str(k) + '":' + SPACE
                ret += Utils.to_json(v, level + 1)

            ret += NEWLINE + SPACE * INDENT * level + "}"
        elif isinstance(o, basestring):
            ret += '"' + o + '"'
        elif isinstance(o, list):
            ret += "[" + ",".join([Utils.to_json(e, level+1) for e in o]) + "]"
        elif isinstance(o, bool):
            ret += "true" if o else "false"
        elif isinstance(o, int):
            ret += str(o)
        elif isinstance(o, float):
            ret += '%.7g' % o
        elif isinstance(o, numpy.ndarray) and numpy.issubdtype(o.dtype, numpy.integer):
            ret += "[" + ','.join(map(str, o.flatten().tolist())) + "]"
        elif isinstance(o, numpy.ndarray) and numpy.issubdtype(o.dtype, numpy.inexact):
            ret += "[" + ','.join(map(lambda x: '%.7g' % x, o.flatten().tolist())) + "]"
        else:
            raise TypeError("Unknown type '%s' for json serialization" % str(type(o)))
        return ret
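
The two ndarray branches above, shown in isolation: integer arrays are flattened and joined as-is, while inexact (floating) arrays go through the '%.7g' format.

import numpy

a_int = numpy.arange(3)
a_flt = numpy.array([1.0, 1.0 / 3.0])

print("[" + ','.join(map(str, a_int.flatten().tolist())) + "]")            # [0,1,2]
print("[" + ','.join('%.7g' % x for x in a_flt.flatten().tolist()) + "]")  # [1,0.3333333]
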
Example #14
def into(a, b, **kwargs):
    dialect = b.dialect.copy()
    del dialect['lineterminator']
    dates = [i for i, typ in enumerate(b.schema[0].types)
               if 'date' in str(typ)]
    schema = b.schema
    if '?' in str(schema):
        schema = dshape(str(schema).replace('?', ''))

    dtypes = valmap(to_numpy_dtype, schema[0].dict)

    datenames = [name for name in dtypes
                      if np.issubdtype(dtypes[name], np.datetime64)]

    dtypes = dict((k, v) for k, v in dtypes.items()
                         if not np.issubdtype(v, np.datetime64))

    if 'strict' in dialect:
        del dialect['strict']

    # Pass only keyword arguments appropriate for read_csv
    kws = keywords(pd.read_csv)
    options = toolz.merge(dialect, kwargs)
    options = toolz.keyfilter(lambda k: k in kws, options)

    if b.open == gzip.open:
        options['compression'] = 'gzip'

    return pd.read_csv(b.path,
                       skiprows=1 if b.header else 0,
                       dtype=dtypes,
                       parse_dates=datenames,
                       names=b.columns,
                       **options)
Example #15
def find_linear_scale(data):
    scale = []
    scale_name = []
    linear_scale = False
    longest = None
    if type(data.columns) == pd.MultiIndex:
        for n, l in enumerate(data.columns.levels):
            if l.dtype == np.dtype('O'):  # Object; maybe str?
                if longest is None or len(l) > longest:
                    longest = len(l)

            elif np.issubdtype(l.dtype, np.integer) or np.issubdtype(l.dtype, np.float):
                linear_scale = True
                scale = [v[n] for v in data.columns.values]
                scale_name = data.columns.names[n]

                if np.issubdtype(l.dtype, np.float):
                    # Prefer float scales, assume more accurate
                    break
    else:
        scale = []
        linear_scale = True
        for x in data.columns.values:
            try:
                scale.append(float(x))
            except (TypeError, ValueError):
                linear_scale = False
                break

    return scale, linear_scale, scale_name
Example #16
 def __init__(self, shape, size, dtype=np.uint8, saturation=None,
              hard_radius=None, signal=None, noise=0,
              feat_func=feat_gauss, **feat_kwargs):
     self.ndim = len(shape)
     self.shape = shape
     self.dtype = dtype
     self.image = Frame(np.zeros(shape, dtype=dtype))
     self.size = validate_tuple(size, self.ndim)
     self.isotropic = np.all([self.size[1:] == self.size[:-1]])
     self.feat_func = feat_func
     self.feat_kwargs = feat_kwargs
     self.noise = noise
     if saturation is None and np.issubdtype(dtype, np.integer):
         self.saturation = np.iinfo(dtype).max
     elif saturation is None and np.issubdtype(dtype, np.float):
         self.saturation = 1
     else:
         self.saturation = saturation
     if signal is None:
         self.signal = self.saturation
     else:
         self.signal = signal
     self.center = tuple([s // 2 for s in shape])
     self.hard_radius = hard_radius
     self._coords = []
     self.pos_columns = ['z', 'y', 'x'][-self.ndim:]
     if self.isotropic:
         self.size_columns = ['size']
     else:
         self.size_columns = ['size_z', 'size_y', 'size_x'][-self.ndim:]
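
A sketch of the saturation default used above: integer dtypes saturate at their maximum representable value, float dtypes at 1.

import numpy as np

for dt in (np.uint8, np.uint16, np.float64):
    if np.issubdtype(dt, np.integer):
        print(dt, np.iinfo(dt).max)   # uint8 -> 255, uint16 -> 65535
    elif np.issubdtype(dt, np.floating):
        print(dt, 1)                  # float64 -> 1
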
Example #17
    def _diff(self):
        if self.a.shape != self.b.shape:
            self.diff_dimensions = (self.a.shape, self.b.shape)
            # Don't do any further comparison if the dimensions differ
            # TODO: Perhaps we could, however, diff just the intersection
            # between the two images
            return

        # Find the indices where the values are not equal
        # If neither a nor b are floating point, ignore self.tolerance
        if not ((np.issubdtype(self.a.dtype, float) or
                 np.issubdtype(self.a.dtype, complex)) or
                (np.issubdtype(self.b.dtype, float) or
                 np.issubdtype(self.b.dtype, complex))):
            tolerance = 0
        else:
            tolerance = self.tolerance

        diffs = where_not_allclose(self.a, self.b, atol=0.0, rtol=tolerance)

        self.diff_total = len(diffs[0])

        if self.diff_total == 0:
            # Then we're done
            return

        if self.numdiffs < 0:
            numdiffs = self.diff_total
        else:
            numdiffs = self.numdiffs

        self.diff_pixels = [(idx, (self.a[idx], self.b[idx]))
                            for idx in islice(izip(*diffs), 0, numdiffs)]
        self.diff_ratio = float(self.diff_total) / float(len(self.a.flat))
Example #18
    def Execute(self):

        self.PrintLog('Converting Numpy Array to vtkImageData')
        self.Image = vtk.vtkImageData()
        self.Image.SetDimensions(self.ArrayDict['Dimensions'])
        self.Image.SetOrigin(self.ArrayDict['Origin'])
        self.Image.SetSpacing(self.ArrayDict['Spacing'])
        self.Image.SetExtent((0, self.ArrayDict['Dimensions'][0] - 1,
                                0, self.ArrayDict['Dimensions'][1] - 1,
                                0, self.ArrayDict['Dimensions'][2] - 1,))


        self.PrintLog('converting point data')
        for pointKey in self.ArrayDict['PointData'].keys():
            if np.issubdtype(self.ArrayDict['PointData'][pointKey].dtype, np.floating):
                pointDataArrayType = vtk.VTK_FLOAT
            else:
                for checkDt in [int, np.uint8, np.uint16, np.uint32, np.uint64]:
                    if np.issubdtype(self.ArrayDict['PointData'][pointKey].dtype, checkDt):
                        pointDataArrayType = vtk.VTK_INT
                        break
                    else:
                        continue

            flatArray = self.ArrayDict['PointData'][pointKey].ravel(order='F')

            pointDataArray = dsa.numpyTovtkDataArray(flatArray, name=pointKey, array_type=pointDataArrayType)

            self.Image.GetPointData().SetActiveScalars(pointKey)
            self.Image.GetPointData().SetScalars(pointDataArray)
Example #19
def fftconvolve(in1, in2, mode="full"):
    """Convolve two N-dimensional arrays using FFT. See convolve.

    """
    s1 = array(in1.shape)
    s2 = array(in2.shape)
    complex_result = (np.issubdtype(in1.dtype, np.complex) or
                      np.issubdtype(in2.dtype, np.complex))
    size = s1+s2-1
    IN1 = fftn(in1,size)
    IN1 *= fftn(in2,size)
    ret = ifftn(IN1)
    del IN1
    if not complex_result:
        ret = ret.real
    if mode == "full":
        return ret
    elif mode == "same":
        if product(s1,axis=0) > product(s2,axis=0):
            osize = s1
        else:
            osize = s2
        return _centered(ret,osize)
    elif mode == "valid":
        return _centered(ret,abs(s2-s1)+1)
Example #20
def isEqual(left, right, eps=None, masked_equal=True):
  ''' This function checks if two numpy arrays or scalars are equal within machine precision, and returns a scalar logical. '''
  diff_type = "Both arguments to function 'isEqual' must be of the same class!"
  if isinstance(left,np.ndarray):
    # ndarray
    if not isinstance(right,np.ndarray): raise TypeError(diff_type)
    if not left.dtype==right.dtype:
      right = right.astype(left.dtype) # casting='same_kind' doesn't work...
    if np.issubdtype(left.dtype, np.inexact): # also catch float32 etc
      if eps is None: return ma.allclose(left, right, masked_equal=masked_equal)
      else: return ma.allclose(left, right, masked_equal=masked_equal, atol=eps)
    elif np.issubdtype(left.dtype, np.integer) or np.issubdtype(left.dtype, np.bool):
      return np.all( left == right ) # need to use numpy's all()
  elif isinstance(left,(float,np.inexact)):
    # numbers
    if not isinstance(right,(float,np.inexact)): raise TypeError(diff_type)
    if eps is None: eps = 100.*floateps # default
    if ( isinstance(left,float) or isinstance(right,float) ) or left.dtype.itemsize == right.dtype.itemsize:
      return np.absolute(left-right) <= eps
    else:
      if left.dtype.itemsize < right.dtype.itemsize: right = left.dtype.type(right)
      else: left = right.dtype.type(left)
      return np.absolute(left-right) <= eps  
  elif isinstance(left,(int,bool,np.integer,np.bool)):
    # logicals
    if not isinstance(right,(int,bool,np.integer,np.bool)): raise TypeError(diff_type)
    return left == right
  else: raise TypeError(left)
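
Hypothetical calls against isEqual, assuming the module-level floateps (something like np.finfo(float).eps) is defined; float arrays go through ma.allclose, integer arrays through an exact comparison.

import numpy as np

print(isEqual(np.array([1.0, 2.0]), np.array([1.0, 2.0 + 1e-12])))  # True (allclose)
print(isEqual(np.array([1, 2, 3]), np.array([1, 2, 3])))            # True (exact match)
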
Example #21
    def test_random_like(self):
        """
        Test that the random_like function produces sensible data
        """

        # Try for floats and complex data
        for dtype in [np.float32, np.float64, np.complex64, np.complex128]:
            # Test random array creation with same
            # shape and type as existing array
            shape = (np.random.randint(1, 50), np.random.randint(1, 50))
            ary = np.empty(shape=shape, dtype=dtype)    
            random_ary = mbu.random_like(ary)

            # Test that the shape and type are correct
            self.assertTrue(random_ary.shape == ary.shape)
            self.assertTrue(random_ary.dtype == dtype)

            # Test that we're getting complex data out
            if np.issubdtype(dtype, np.complexfloating):
                proportion_cplx = np.sum(np.iscomplex(random_ary)) / random_ary.size
                self.assertTrue(proportion_cplx > 0.9)

            # Test random array creation with supplied shape and type
            shape = (np.random.randint(1, 50), np.random.randint(1, 50))
            random_ary = mbu.random_like(shape=shape, dtype=dtype)

            # Test that the shape and type are correct
            self.assertTrue(random_ary.shape == shape)
            self.assertTrue(random_ary.dtype == dtype)

            # Test that we're getting complex data out
            if np.issubdtype(dtype, np.complexfloating):
                proportion_cplx = np.sum(np.iscomplex(random_ary)) / random_ary.size
                self.assertTrue(proportion_cplx > 0.9)
Example #22
def half_fft_convolve(in1, in2, size, mode = 'full', return_type='real'):
    """
    Rewrite of fftconvolve from scipy.signal ((c) Travis Oliphant 1999-2002)
    to deal with fft convolution where one signal is not fft transformed
    and the other one is.  Application is, for example, in a loop where
    convolution happens repeatedly with different kernels over the same
    signal.  First input is not transformed, second input is.
    """
    s1 = np.array(in1.shape)
    s2 = size - s1 + 1
    complex_result = (np.issubdtype(in1.dtype, np.complex) or
                      np.issubdtype(in2.dtype, np.complex))

    # Always use 2**n-sized FFT
    fsize = 2 ** np.ceil(np.log2(size))
    IN1 = fftn(in1, fsize)
    IN1 *= in2
    fslice = tuple([slice(0, int(sz)) for sz in size])
    ret = ifftn(IN1)[fslice].copy()
    del IN1
    if not complex_result:
        ret = ret.real
    if return_type == 'real':
        ret = ret.real
    if mode == 'full':
        return ret
    elif mode == 'same':
        if np.product(s1, axis=0) > np.product(s2, axis=0):
            osize = s1
        else:
            osize = s2
        return _centered(ret, osize)
    elif mode == 'valid':
        return _centered(ret, abs(s2 - s1) + 1)
Example #23
  def DtypeToNumberConverter(self, dtype):
    """Converts a Numpy dtype to a converter method if applicable.

      The converter method takes in a numpy array of objects of the provided
      dtype and returns a numpy array of the numbers backing that object for
      statistical analysis. Returns None if no converter is necessary.

    Args:
      dtype: The numpy dtype to make a converter for.

    Returns:
      The converter method or None.
    """
    if np.issubdtype(dtype, np.datetime64):

      def DatetimesToNumbers(dt_list):
        return np.array([pd.Timestamp(dt).value for dt in dt_list])

      return DatetimesToNumbers
    elif np.issubdtype(dtype, np.timedelta64):

      def TimedeltasToNumbers(td_list):
        return np.array([pd.Timedelta(td).value for td in td_list])

      return TimedeltasToNumbers
    else:
      return None
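
A brief illustration of what the datetime converter produces (assuming pandas is available): nanoseconds since the Unix epoch, as plain integers suitable for statistics.

import numpy as np
import pandas as pd

dts = np.array(['2020-01-01', '2020-01-02'], dtype='datetime64[D]')
nanos = np.array([pd.Timestamp(dt).value for dt in dts])
print(nanos)  # [1577836800000000000 1577923200000000000]
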
Example #24
def load_image(image_file):
    """
    Loads an analyze/nifti image, generally for 3D images. 
    Casts input as 32-bit float (if float) or 32-bit uint (if int).
    
    Parameters
    ----------
    image_file: str
        path to data
    
    Returns
    -------
    dat: nparray
        Image as numpy array (i.e., 3D array)
    """
    img = nib.load(image_file)
    dat = img.get_data()
    # Ensure that data is cast as at least 32-bit
    if np.issubdtype(dat.dtype, float):
        dat = dat.astype("float32")
        # Check for negative values
        if (dat < 0).any():
            print "found negative values, setting to zero (see file: %s)" % image_file
            dat[dat < 0] = 0
    elif np.issubdtype(dat.dtype, int):
        dat = dat.astype("uint32")
    else:
        msg = "Error: Unknown datatype %s" % dat.dtype
        print msg
        raise Exception(msg)
    return dat
Example #25
def from_bcolz(x, chunksize=None, categorize=True, index=None, **kwargs):
    """ Read dask Dataframe from bcolz.ctable

    Parameters
    ----------

    x : bcolz.ctable
        Input data
    chunksize : int (optional)
        The size of blocks to pull out from ctable.  Ideally as large as can
        comfortably fit in memory
    categorize : bool (defaults to True)
        Automatically categorize all string dtypes
    index : string (optional)
        Column to make the index

    See Also
    --------

    from_array: more generic function not optimized for bcolz
    """
    import dask.array as da
    import bcolz
    if isinstance(x, (str, unicode)):
        x = bcolz.ctable(rootdir=x)
    bc_chunklen = max(x[name].chunklen for name in x.names)
    if chunksize is None and bc_chunklen > 10000:
        chunksize = bc_chunklen

    categories = dict()
    if categorize:
        for name in x.names:
            if (np.issubdtype(x.dtype[name], np.string_) or
                    np.issubdtype(x.dtype[name], np.unicode_) or
                    np.issubdtype(x.dtype[name], np.object_)):
                a = da.from_array(x[name], chunks=(chunksize * len(x.names),))
                categories[name] = da.unique(a)

    columns = tuple(x.dtype.names)
    divisions = (0,) + tuple(range(-1, len(x), chunksize))[1:]
    if divisions[-1] != len(x) - 1:
        divisions = divisions + (len(x) - 1,)
    new_name = 'from_bcolz' + next(tokens)
    dsk = dict(((new_name, i),
                (dataframe_from_ctable,
                 x,
                 (slice(i * chunksize, (i + 1) * chunksize),),
                 None, categories))
               for i in range(0, int(ceil(len(x) / chunksize))))

    result = DataFrame(dsk, new_name, columns, divisions)

    if index:
        assert index in x.names
        a = da.from_array(x[index], chunks=(chunksize * len(x.names),))
        q = np.linspace(0, 100, len(x) // chunksize + 2)
        divisions = da.percentile(a, q).compute()
        return set_partition(result, index, divisions, **kwargs)
    else:
        return result
Example #26
def has_inf_or_nan(datum, tensor):
  """A predicate for whether a tensor consists of any bad numerical values.

  This predicate is common enough to merit definition in this module.
  Bad numerical values include nans and infs.
  The signature of this function follows the requirement of DebugDumpDir's
  find() method.

  Args:
    datum: (DebugTensorDatum) Datum metadata.
    tensor: (numpy.ndarray or None) Value of the tensor. None represents
      an uninitialized tensor.

  Returns:
    (bool) True if and only if tensor consists of any nan or inf values.
  """

  _ = datum  # Datum metadata is unused in this predicate.
  if tensor is None:
    # Uninitialized tensor doesn't have bad numerical values.
    return False
  elif (np.issubdtype(tensor.dtype, np.float) or
        np.issubdtype(tensor.dtype, np.complex) or
        np.issubdtype(tensor.dtype, np.integer)):
    return np.any(np.isnan(tensor)) or np.any(np.isinf(tensor))
  else:
    return False
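
A quick self-contained check mirroring the predicate above: for numeric tensors, any NaN or Inf makes it fire.

import numpy as np

t = np.array([1.0, np.inf, 3.0])
print(np.any(np.isnan(t)) or np.any(np.isinf(t)))   # True (contains inf)

t = np.array([1, 2, 3])
print(np.any(np.isnan(t)) or np.any(np.isinf(t)))   # False
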
Example #27
    def load_onto_vtk(self, vtk_data):
        """ Load the stored information onto a vtk data container.

        Parameters
        ----------
        vtk_data : vtkPointData or vtkCellData
            The vtk container to load the value onto.

        Data are loaded onto the vtk container based on their data
        type. The name of the added array is the name of the CUBA key
        (i.e. :samp:`{CUBA}.name`). Currently only scalars and three
        dimensional vectors are supported.

        """
        def replacer(data):
            return nan if data is None else data

        for cuba in self.keys:
            default = dummy_cuba_value(cuba)
            if (numpy.issubdtype(type(default), numpy.float) or
                    numpy.issubdtype(type(default), numpy.int)):
                data = numpy.array(self._data[cuba], dtype=float)
                index = vtk_data.add_array(data)
                vtk_data.get_array(index).name = cuba.name
            elif isinstance(default, numpy.ndarray) and default.size == 3:
                nan = numpy.array([None, None, None], dtype=float)
                data = numpy.array(
                    tuple(replacer(data) for data in self._data[cuba]),
                    dtype=numpy.float)
                index = vtk_data.add_array(data)
                vtk_data.get_array(index).name = cuba.name
            else:
                message = 'property {!r} is currently ignored'
                warnings.warn(message.format(cuba))
Example #28
def test_apply_loop_invariant_optimisation_integer():
    variables = {'v': Variable('v', scalar=False),
                 'N': Constant('N', 10),
                 'b': Variable('b', scalar=True, dtype=int),
                 'c': Variable('c', scalar=True, dtype=int),
                 'd': Variable('d', scalar=True, dtype=int),
                 'y': Variable('y', scalar=True, dtype=float),
                 'z': Variable('z', scalar=True, dtype=float),
                 'w': Variable('w', scalar=True, dtype=float),
                 }
    statements = [Statement('v', '=', 'v % (2*3*N)', '', np.float32),
                  # integer version doesn't get rewritten but float version does
                  Statement('a', ':=', 'b//(c//d)', '', int),
                  Statement('x', ':=', 'y/(z/w)', '', float),
                  ]
    scalar, vector = optimise_statements([], statements, variables)
    assert len(scalar) == 3
    assert np.issubdtype(scalar[0].dtype, np.signedinteger)
    assert scalar[0].var == '_lio_1'
    expr = scalar[0].expr.replace(' ', '')
    assert expr=='6*N' or expr=='N*6'
    assert np.issubdtype(scalar[1].dtype, np.signedinteger)
    assert scalar[1].var == '_lio_2'
    expr = scalar[1].expr.replace(' ', '')
    assert expr=='b//(c//d)'
    assert np.issubdtype(scalar[2].dtype, np.floating)
    assert scalar[2].var == '_lio_3'
    expr = scalar[2].expr.replace(' ', '')
    assert expr=='(y*w)/z' or expr=='(w*y)/z'
Example #29
    def mean(self, axis=None):
        """Average the matrix over the given axis.  If the axis is None,
        average over both rows and columns, returning a scalar.
        """
        # Mimic numpy's casting.
        if (np.issubdtype(self.dtype, np.float_) or
                np.issubdtype(self.dtype, np.integer) or
                np.issubdtype(self.dtype, np.bool_)):
            res_dtype = np.float_
        elif np.issubdtype(self.dtype, np.complex_):
            res_dtype = np.complex_
        else:
            res_dtype = self.dtype

        if axis is None:
            return self.sum(None) * 1.0 / (self.shape[0]*self.shape[1])

        if axis < 0:
            axis += 2
        if axis == 0:
            mean = self.astype(res_dtype).sum(0)
            mean *= 1.0 / self.shape[0]
            return mean
        elif axis == 1:
            mean = self.astype(res_dtype).sum(1)
            mean *= 1.0 / self.shape[1]
            return mean
        else:
            raise ValueError("axis out of bounds")
Example #30
    def _argsortData(self, data, order):
        if data.ndim == 1:
            indices = np.argsort(data, kind='mergesort')
            if order == Qt.DescendingOrder:
                indices = indices[::-1]
            # Always sort NaNs last
            if np.issubdtype(data.dtype, np.number):
                indices = np.roll(indices, -np.isnan(data).sum())
        else:
            assert np.issubdtype(data.dtype, np.number), \
                'We do not deal with non numeric values in sorting by ' \
                'multiple values'
            if order == Qt.DescendingOrder:
                data[:, -1] = -data[:, -1]

            # In order to make sure NaNs always appear at the end, insert an
            # indicator whether NaN or not. Note that the data array must
            # contain an empty column of zeros at index -2 since inserting an
            # extra column after the fact can result in a MemoryError for data
            # with a large amount of variables
            assert np.all(data[:, -2] == 0), \
                'Add an empty column of zeros at index -2 to accommodate NaNs'
            np.isnan(data[:, -1], out=data[:, -2])

            indices = np.lexsort(np.flip(data.T, axis=0))

        return indices
Example #31
def solve_bvp(fun,
              bc,
              x,
              y,
              p=None,
              S=None,
              fun_jac=None,
              bc_jac=None,
              tol=1e-3,
              max_nodes=1000,
              verbose=0,
              args=()):
    """Solve a boundary-value problem for a system of ODEs.
    This function numerically solves a first order system of ODEs subject to
    two-point boundary conditions::
        dy / dx = f(x, y, p) + S * y / (x - a), a <= x <= b
        bc(y(a), y(b), p) = 0
    Here x is a 1-dimensional independent variable, y(x) is an n-dimensional
    vector-valued function and p is a k-dimensional vector of unknown
    parameters which is to be found along with y(x). For the problem to be
    determined there must be n + k boundary conditions, i.e. bc must be an
    (n + k)-dimensional function.
    The last singular term in the right-hand side of the system is optional.
    It is defined by an n-by-n matrix S, such that the solution must satisfy
    S y(a) = 0. This condition will be forced during iterations, so it must not
    contradict boundary conditions. See [2]_ for an explanation of how this term
    is handled when solving BVPs numerically.
    Problems in a complex domain can be solved as well. In this case y and p
    are considered to be complex, and f and bc are assumed to be complex-valued
    functions, but x stays real. Note that f and bc must be complex
    differentiable (satisfy Cauchy-Riemann equations [4]_), otherwise you
    should rewrite your problem for real and imaginary parts separately. To
    solve a problem in a complex domain, pass an initial guess for y with a
    complex data type (see below).
    Parameters
    ----------
    fun : callable
        Right-hand side of the system. The calling signature is ``fun(x, y)``,
        or ``fun(x, y, p)`` if parameters are present. All arguments are
        ndarray: ``x`` with shape (m,), ``y`` with shape (n, m), meaning that
        ``y[:, i]`` corresponds to ``x[i]``, and ``p`` with shape (k,). The
        return value must be an array with shape (n, m) and with the same
        layout as ``y``.
    bc : callable
        Function evaluating residuals of the boundary conditions. The calling
        signature is ``bc(ya, yb)``, or ``bc(ya, yb, p)`` if parameters are
        present. All arguments are ndarray: ``ya`` and ``yb`` with shape (n,),
        and ``p`` with shape (k,). The return value must be an array with
        shape (n + k,).
    x : array_like, shape (m,)
        Initial mesh. Must be a strictly increasing sequence of real numbers
        with ``x[0]=a`` and ``x[-1]=b``.
    y : array_like, shape (n, m)
        Initial guess for the function values at the mesh nodes, i-th column
        corresponds to ``x[i]``. For problems in a complex domain pass `y`
        with a complex data type (even if the initial guess is purely real).
    p : array_like with shape (k,) or None, optional
        Initial guess for the unknown parameters. If None (default), it is
        assumed that the problem doesn't depend on any parameters.
    S : array_like with shape (n, n) or None
        Matrix defining the singular term. If None (default), the problem is
        solved without the singular term.
    fun_jac : callable or None, optional
        Function computing derivatives of f with respect to y and p. The
        calling signature is ``fun_jac(x, y)``, or ``fun_jac(x, y, p)`` if
        parameters are present. The return must contain 1 or 2 elements in the
        following order:
            * df_dy : array_like with shape (n, n, m) where an element
              (i, j, q) equals to d f_i(x_q, y_q, p) / d (y_q)_j.
            * df_dp : array_like with shape (n, k, m) where an element
              (i, j, q) equals to d f_i(x_q, y_q, p) / d p_j.
        Here q numbers nodes at which x and y are defined, whereas i and j
        number vector components. If the problem is solved without unknown
        parameters df_dp should not be returned.
        If `fun_jac` is None (default), the derivatives will be estimated
        by the forward finite differences.
    bc_jac : callable or None, optional
        Function computing derivatives of bc with respect to ya, yb and p.
        The calling signature is ``bc_jac(ya, yb)``, or ``bc_jac(ya, yb, p)``
        if parameters are present. The return must contain 2 or 3 elements in
        the following order:
            * dbc_dya : array_like with shape (n, n) where an element (i, j)
              equals to d bc_i(ya, yb, p) / d ya_j.
            * dbc_dyb : array_like with shape (n, n) where an element (i, j)
              equals to d bc_i(ya, yb, p) / d yb_j.
            * dbc_dp : array_like with shape (n, k) where an element (i, j)
              equals to d bc_i(ya, yb, p) / d p_j.
        If the problem is solved without unknown parameters dbc_dp should not
        be returned.
        If `bc_jac` is None (default), the derivatives will be estimated by
        the forward finite differences.
    tol : float, optional
        Desired tolerance of the solution. If we define ``r = y' - f(x, y)``
        where y is the found solution, then the solver tries to achieve on each
        mesh interval ``norm(r / (1 + abs(f))) < tol``, where ``norm`` is
        estimated in a root mean squared sense (using a numerical quadrature
        formula). Default is 1e-3.
    max_nodes : int, optional
        Maximum allowed number of the mesh nodes. If exceeded, the algorithm
        terminates. Default is 1000.
    verbose : {0, 1, 2}, optional
        Level of algorithm's verbosity:
            * 0 (default) : work silently.
            * 1 : display a termination report.
            * 2 : display progress during iterations.
    Returns
    -------
    Bunch object with the following fields defined:
    sol : PPoly
        Found solution for y as `scipy.interpolate.PPoly` instance, a C1
        continuous cubic spline.
    p : ndarray or None, shape (k,)
        Found parameters. None, if the parameters were not present in the
        problem.
    x : ndarray, shape (m,)
        Nodes of the final mesh.
    y : ndarray, shape (n, m)
        Solution values at the mesh nodes.
    yp : ndarray, shape (n, m)
        Solution derivatives at the mesh nodes.
    rms_residuals : ndarray, shape (m - 1,)
        RMS values of the relative residuals over each mesh interval (see the
        description of `tol` parameter).
    niter : int
        Number of completed iterations.
    status : int
        Reason for algorithm termination:
            * 0: The algorithm converged to the desired accuracy.
            * 1: The maximum number of mesh nodes is exceeded.
            * 2: A singular Jacobian encountered when solving the collocation
              system.
    message : string
        Verbal description of the termination reason.
    success : bool
        True if the algorithm converged to the desired accuracy (``status=0``).
    Notes
    -----
    This function implements a 4th-order collocation algorithm with the
    control of residuals similar to [1]_. A collocation system is solved
    by a damped Newton method with an affine-invariant criterion function as
    described in [3]_.
    Note that in [1]_  integral residuals are defined without normalization
    by interval lengths. So their definition is different by a multiplier of
    h**0.5 (h is an interval length) from the definition used here.
    .. versionadded:: 0.18.0
    References
    ----------
    .. [1] J. Kierzenka, L. F. Shampine, "A BVP Solver Based on Residual
           Control and the MATLAB PSE", ACM Trans. Math. Softw., Vol. 27,
           Number 3, pp. 299-316, 2001.
    .. [2] L.F. Shampine, P. H. Muir and H. Xu, "A User-Friendly Fortran BVP
           Solver".
    .. [3] U. Ascher, R. Mattheij and R. Russell "Numerical Solution of
           Boundary Value Problems for Ordinary Differential Equations".
    .. [4] `Cauchy-Riemann equations
            <https://en.wikipedia.org/wiki/Cauchy-Riemann_equations>`_ on
            Wikipedia.
    Examples
    --------
    In the first example we solve Bratu's problem::
        y'' + k * exp(y) = 0
        y(0) = y(1) = 0
    for k = 1.
    We rewrite the equation as a first order system and implement its
    right-hand side evaluation::
        y1' = y2
        y2' = -exp(y1)
    >>> def fun(x, y):
    ...     return num.vstack((y[1], -num.exp(y[0])))
    Implement evaluation of the boundary condition residuals:
    >>> def bc(ya, yb):
    ...     return num.array([ya[0], yb[0]])
    Define the initial mesh with 5 nodes:
    >>> x = num.linspace(0, 1, 5)
    This problem is known to have two solutions. To obtain both of them we
    use two different initial guesses for y. We denote them by subscripts
    a and b.
    >>> y_a = num.zeros((2, x.size))
    >>> y_b = num.zeros((2, x.size))
    >>> y_b[0] = 3
    Now we are ready to run the solver.
    >>> from scipy.integrate import solve_bvp
    >>> res_a = solve_bvp(fun, bc, x, y_a)
    >>> res_b = solve_bvp(fun, bc, x, y_b)
    Let's plot the two found solutions. We take advantage of having the
    solution in a spline form to produce a smooth plot.
    >>> x_plot = num.linspace(0, 1, 100)
    >>> y_plot_a = res_a.sol(x_plot)[0]
    >>> y_plot_b = res_b.sol(x_plot)[0]
    >>> import matplotlib.pyplot as plt
    >>> plt.plot(x_plot, y_plot_a, label='y_a')
    >>> plt.plot(x_plot, y_plot_b, label='y_b')
    >>> plt.legend()
    >>> plt.xlabel("x")
    >>> plt.ylabel("y")
    >>> plt.show()
    We see that the two solutions have similar shape, but differ in scale
    significantly.
    In the second example we solve a simple Sturm-Liouville problem::
        y'' + k**2 * y = 0
        y(0) = y(1) = 0
    It is known that a non-trivial solution y = A * sin(k * x) is possible for
    k = pi * n, where n is an integer. To establish the normalization constant
    A = 1 we add a boundary condition::
        y'(0) = k
    Again we rewrite our equation as a first order system and implement its
    right-hand side evaluation::
        y1' = y2
        y2' = -k**2 * y1
    >>> def fun(x, y, p):
    ...     k = p[0]
    ...     return num.vstack((y[1], -k**2 * y[0]))
    Note that parameters p are passed as a vector (with one element in our
    case).
    Implement the boundary conditions:
    >>> def bc(ya, yb, p):
    ...     k = p[0]
    ...     return num.array([ya[0], yb[0], ya[1] - k])
    Set up the initial mesh and the guess for y. We aim to find the solution for
    k = 2 * pi, to achieve that we set values of y to approximately follow
    sin(2 * pi * x):
    >>> x = num.linspace(0, 1, 5)
    >>> y = num.zeros((2, x.size))
    >>> y[0, 1] = 1
    >>> y[0, 3] = -1
    Run the solver with 6 as an initial guess for k.
    >>> sol = solve_bvp(fun, bc, x, y, p=[6])
    We see that the found k is approximately correct:
    >>> sol.p[0]
    6.28329460046
    And finally plot the solution to see the anticipated sinusoid:
    >>> x_plot = num.linspace(0, 1, 100)
    >>> y_plot = sol.sol(x_plot)[0]
    >>> plt.plot(x_plot, y_plot)
    >>> plt.xlabel("x")
    >>> plt.ylabel("y")
    >>> plt.show()
    """
    x = num.asarray(x, dtype=float)
    if x.ndim != 1:
        raise ValueError("`x` must be 1 dimensional.")
    h = num.diff(x)
    if num.any(h <= 0):
        raise ValueError("`x` must be strictly increasing.")
    a = x[0]

    y = num.asarray(y)
    if num.issubdtype(y.dtype, num.complexfloating):
        dtype = complex
    else:
        dtype = float
    y = y.astype(dtype, copy=False)

    if y.ndim != 2:
        raise ValueError("`y` must be 2 dimensional.")
    if y.shape[1] != x.shape[0]:
        raise ValueError("`y` is expected to have {} columns, but actually "
                         "has {}.".format(x.shape[0], y.shape[1]))

    if p is None:
        p = num.array([])
    else:
        p = num.asarray(p, dtype=dtype)
    if p.ndim != 1:
        raise ValueError("`p` must be 1 dimensional.")

    if tol < 100 * EPS:
        warn("`tol` is too low, setting to {:.2e}".format(100 * EPS))
        tol = 100 * EPS

    if verbose not in [0, 1, 2]:
        raise ValueError("`verbose` must be in [0, 1, 2].")

    n = y.shape[0]
    k = p.shape[0]

    if S is not None:
        S = num.asarray(S, dtype=dtype)
        if S.shape != (n, n):
            raise ValueError("`S` is expected to have shape {}, "
                             "but actually has {}".format((n, n), S.shape))

        # Compute I - S^+ S to impose necessary boundary conditions.
        B = num.identity(n) - num.dot(pinv(S), S)

        y[:, 0] = num.dot(B, y[:, 0])

        # Compute (I - S)^+ to correct derivatives at x=a.
        D = pinv(num.identity(n) - S)
    else:
        B = None
        D = None

    fun_wrapped, bc_wrapped, fun_jac_wrapped, bc_jac_wrapped = wrap_functions(
        fun, bc, fun_jac, bc_jac, k, a, S, D, dtype)

    f = fun_wrapped(x, y, p, args)
    if f.shape != y.shape:
        raise ValueError("`fun` return is expected to have shape {}, "
                         "but actually has {}.".format(y.shape, f.shape))

    bc_res = bc_wrapped(y[:, 0], y[:, -1], p, args)
    if bc_res.shape != (n + k, ):
        raise ValueError("`bc` return is expected to have shape {}, "
                         "but actually has {}.".format((n + k, ),
                                                       bc_res.shape))

    status = 0
    iteration = 0
    if verbose == 2:
        print_iteration_header()

    while True:
        m = x.shape[0]

        col_fun, jac_sys = prepare_sys(n, m, k, fun_wrapped, bc_wrapped,
                                       fun_jac_wrapped, bc_jac_wrapped, x, h,
                                       args)
        y, p, singular = solve_newton(n, m, h, col_fun, bc_wrapped, jac_sys, y,
                                      p, B, tol, args)
        iteration += 1

        col_res, y_middle, f, f_middle = collocation_fun(
            fun_wrapped, y, p, x, h, args)
        # This relation is not trivial, but can be verified.
        r_middle = 1.5 * col_res / h
        sol = create_spline(y, f, x, h)
        rms_res = estimate_rms_residuals(fun_wrapped, sol, x, h, p, r_middle,
                                         f_middle, args)
        max_rms_res = num.max(rms_res)

        if singular:
            status = 2
            break

        insert_1, = num.nonzero((rms_res > tol) & (rms_res < 100 * tol))
        insert_2, = num.nonzero(rms_res >= 100 * tol)
        nodes_added = insert_1.shape[0] + 2 * insert_2.shape[0]

        if m + nodes_added > max_nodes:
            status = 1
            if verbose == 2:
                nodes_added = "({})".format(nodes_added)
                print_iteration_progress(iteration, max_rms_res, m,
                                         nodes_added)
            break

        if verbose == 2:
            print_iteration_progress(iteration, max_rms_res, m, nodes_added)

        if nodes_added > 0:
            x = modify_mesh(x, insert_1, insert_2)
            h = num.diff(x)
            y = sol(x)
        else:
            status = 0
            break

    if verbose > 0:
        if status == 0:
            print("Solved in {} iterations, number of nodes {}, "
                  "maximum relative residual {:.2e}.".format(
                      iteration, x.shape[0], max_rms_res))
        elif status == 1:
            print("Number of nodes is exceeded after iteration {}, "
                  "maximum relative residual {:.2e}.".format(
                      iteration, max_rms_res))
        elif status == 2:
            print("Singular Jacobian encountered when solving the collocation "
                  "system on iteration {}, maximum relative residual {:.2e}.".
                  format(iteration, max_rms_res))

    if p.size == 0:
        p = None

    return BVPResult(sol=sol,
                     p=p,
                     x=x,
                     y=y,
                     yp=f,
                     rms_residuals=rms_res,
                     niter=iteration,
                     status=status,
                     message=TERMINATION_MESSAGES[status],
                     success=status == 0)
Example #32
 def test_same(self):
     for cls in (np.float32, np.int32):
         for w1, w2 in itertools.product(self.wrappers, repeat=2):
             assert_(np.issubdtype(w1(cls), w2(cls)))
Example #33
 def test_both_abstract(self):
     assert_(np.issubdtype(np.floating, np.inexact))
     assert_(not np.issubdtype(np.inexact, np.floating))
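
For context, a few concrete checks on the abstract dtype hierarchy these tests exercise:

import numpy as np

print(np.issubdtype(np.float64, np.floating))            # True
print(np.issubdtype(np.float64, np.inexact))             # True
print(np.issubdtype(np.int32, np.signedinteger))         # True
print(np.issubdtype(np.complex128, np.complexfloating))  # True
print(np.issubdtype(np.floating, np.inexact))            # True  (abstract vs. abstract)
print(np.issubdtype(np.inexact, np.floating))            # False
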
Example #34
def array2vtk(num_array, vtk_array=None):
    """Converts a real numpy Array (or a Python list) to a VTK array
    object.

    This function only works for real arrays.  Complex arrays are NOT
    handled.  It also works for multi-component arrays.  However, only
    1, and 2 dimensional arrays are supported.  This function is very
    efficient, so large arrays should not be a problem.

    Even in cases when no copy of the numpy array data is performed,
    a reference to the array is cached.  The passed array can
    therefore be deleted safely in all circumstances.

    Parameters
    ----------

    - num_array : numpy array or Python list/tuple

      The input array must be 1 or 2D.  A copy of the numeric array
      data passed is made in the following circumstances:

       1. A Python list/tuple was passed.
       2. A non-contiguous numpy array was passed.
       3. A `vtkBitArray` instance was passed as the second argument.
       4. The types of the `vtk_array` and the `num_array` are not
          equivalent to each other.  For example if one is an integer
          array and the other a float.

    - vtk_array : `vtkDataArray` (default: `None`)

      If an optional `vtkDataArray` instance, is passed as an argument
      then a new array is not created and returned.  The passed array
      is itself returned.

    """

    z = numpy.asarray(num_array)

    shape = z.shape
    assert len(shape) < 3, \
        "Only arrays of dimensionality 2 or lower are allowed!"
    assert not numpy.issubdtype(z.dtype, numpy.complexfloating), \
        "Complex numpy arrays cannot be converted to vtk arrays."\
        "Use real() or imag() to get a component of the array before"\
        " passing it to vtk."

    # First create an array of the right type by using the typecode.
    # Bit arrays need special casing.
    bit_array = False
    if vtk_array is None:
        vtk_typecode = get_vtk_array_type(z.dtype)
        result_array = create_vtk_array(vtk_typecode)
    elif vtk_array.GetDataType() == vtkConstants.VTK_BIT:
        vtk_typecode = vtkConstants.VTK_CHAR
        result_array = create_vtk_array(vtkConstants.VTK_CHAR)
        bit_array = True
    else:
        vtk_typecode = vtk_array.GetDataType()
        result_array = vtk_array

    # Find the shape and set number of components.
    if len(shape) == 1:
        result_array.SetNumberOfComponents(1)
    else:
        result_array.SetNumberOfComponents(shape[1])

    result_array.SetNumberOfTuples(shape[0])

    # Ravel the array appropriately.
    arr_dtype = get_numeric_array_type(vtk_typecode)
    if numpy.issubdtype(z.dtype, arr_dtype):
        z_flat = numpy.ravel(z)
    else:
        z_flat = numpy.ravel(z).astype(arr_dtype)

    # Point the VTK array to the numpy data.  The last argument (1)
    # tells the array not to deallocate.
    result_array.SetVoidArray(getbuffer(z_flat), len(z_flat), 1)

    if bit_array:
        # Handle bit arrays -- they have to be copied.  Note that bit
        # arrays are used ONLY when the user has passed one as an
        # argument to this function.
        vtk_array.SetNumberOfTuples(result_array.GetNumberOfTuples())
        vtk_array.SetNumberOfComponents(result_array.GetNumberOfComponents())
        for i in range(result_array.GetNumberOfComponents()):
            vtk_array.CopyComponent(i, result_array, i)
        result_array = vtk_array
    else:
        # Save a reference to the flatted array in the array cache.
        # This prevents the user from deleting or resizing the array
        # and getting into serious trouble.  This is only done for
        # non-bit array cases where the data is not copied.
        global _array_cache
        _array_cache.add(result_array, z_flat)

    return result_array
Example #35
def assign_policies_to_panel(cases_df, policies, cases_level, aggregate_vars=[], get_latlons=True, errors='raise'):
    """Assign all policy variables from `policies` to `cases_df`
    Args:
        cases_df (pandas.DataFrame): table to assign policy variables to, 
            typically with case data already assigned
        policies (pandas.DataFrame): table of policies, listed by date and regions affected
        cases_level (int): Administrative unit level used for analysis of policy effects,
            typically the lowest level which pop-weights have been applied to
        aggregate_vars (list of str): list of policy variables where optional version
            should be treated independently of mandatory version

    Returns:
        pandas.DataFrame: a version of `cases_df` with all policies from `policies` assigned as new columns
    """

    # Make sure policies input doesn't change unexpectedly
    policies = policies.copy()
    
    # Convert 'optional' to indicator variable
    if not np.issubdtype(policies['optional'].dtype, np.number):
        policies['optional'] = policies['optional'].replace({"Y":1, "N":0})
        # fill any nans with 0
        policies['optional'] = policies['optional'].fillna(0).astype(int)
    
    policies['optional'] = policies['optional'].fillna(0)
    if errors == 'raise':
        assert len(policies['optional'].unique()) <= 2
    elif errors == 'warn':
        if len(policies['optional'].unique()) > 2:
            print('there were more than two values for optional: {0}'.format(policies['optional'].unique()))

    policies['date_end'] = policies['date_end'].fillna(pd.to_datetime('2099-12-31'))

    # Assign population columns to `policies` and `cases_df`
    policies, cases_df = cpop.assign_all_populations(policies, cases_df, cases_level, get_latlons=get_latlons, errors=errors)

    # Assign policy_level to distinguish policies specified at different admin-unit levels
    policies['policy_level'] = policies.apply(get_policy_level, axis=1)

    # Treat policies in `aggregate_vars` as independent policies (just like mandatory policies)
    # Set optional to 0 to avoid applying normal optional logic in `get_policy_vals()`
    for policy in aggregate_vars:
        mask = (policies['policy'] == policy) & (policies['optional'] == 1)
        policies.loc[mask, 'policy'] = policies.loc[mask, 'policy'] + '_opt'
        policies.loc[mask, 'optional'] = 0

    policy_list = list(policies['policy'].unique())
    policy_popwts = [p + '_popwt' for p in policy_list if p not in exclude_from_popweights]

    date_min = cases_df['date'].min()
    date_max = cases_df['date'].max()

    # Initialize panel with same structure as `cases_df`
    policy_panel = pd.DataFrame(
        index=pd.MultiIndex.from_product([
            pd.date_range(date_min, date_max), 
            sorted(cases_df[f'adm{cases_level}_name'].unique())
        ]), 
        columns=policy_list + policy_popwts).reset_index().rename(
            columns={'level_0':'date', 'level_1':f'adm{cases_level}_name'}
        ).fillna(0)
    
    # Assign each policy one-by-one to the panel
    for policy in policy_list:
        policy_pickle_dict = dict()

        # Get Series of 4-tuples for mandatory pop-weighted, mandatory indicator,
        # optional pop-weighted, optional indicator
        tmp = policy_panel.apply(lambda row: get_policy_vals(policies, policy, row['date'], row[f'adm{cases_level}_name'], cases_level, policy_pickle_dict), axis=1)
        
        # Assign regular policy indicator
        policy_panel[policy] = tmp.apply(lambda x: x[1])

        # Assign opt-column if there's anything there
        opt_col = tmp.apply(lambda x: x[3])
        use_opt_col = opt_col.sum() > 0
        if use_opt_col:
            policy_panel[policy + '_opt'] = tmp.apply(lambda x: x[3])

        # Assign pop-weighted column if it's not excluded from pop-weighting, and opt-pop-weighted if
        # Optional and pop-weighted are both used
        if policy not in exclude_from_popweights:
            policy_panel[policy + '_popwt'] = tmp.apply(lambda x: x[0])
            if use_opt_col:
                policy_panel[policy + '_opt_popwt'] = tmp.apply(lambda x: x[2])
        
    policy_panel = count_policies_enacted(policy_panel, policy_list)

    # Merge panel with `cases_df`
    merged = pd.merge(cases_df, policy_panel, left_on=['date', f'adm{cases_level}_name'], right_on=['date', f'adm{cases_level}_name'])
    
    return merged
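# --- Illustration (not part of the function above): the 'optional' normalization
# --- step in isolation, with made-up data.
import numpy as np
import pandas as pd

policies = pd.DataFrame({'policy': ['lockdown', 'school_closure', 'travel_ban'],
                         'optional': ['Y', 'N', np.nan]})
if not np.issubdtype(policies['optional'].dtype, np.number):
    policies['optional'] = policies['optional'].replace({"Y": 1, "N": 0})
    # fill any nans with 0
    policies['optional'] = policies['optional'].fillna(0).astype(int)
print(policies['optional'].tolist())  # [1, 0, 0]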
Beispiel #36
0
def vq(obs, code_book, check_finite=True):
    """
    Assign codes from a code book to observations.

    Assigns a code from a code book to each observation. Each
    observation vector in the 'M' by 'N' `obs` array is compared with the
    centroids in the code book and assigned the code of the closest
    centroid.

    The features in `obs` should have unit variance, which can be
    achieved by passing them through the whiten function. The code
    book can be created with the k-means algorithm or a different
    encoding algorithm.

    Parameters
    ----------
    obs : ndarray
        Each row of the 'M' x 'N' array is an observation. The columns are
        the "features" seen during each observation. The features must be
        whitened first using the whiten function or something equivalent.
    code_book : ndarray
        The code book is usually generated using the k-means algorithm.
        Each row of the array holds a different code, and the columns are
        the features of the code.

         >>> #              f0    f1    f2   f3
         >>> code_book = [
         ...             [  1.,   2.,   3.,   4.],  #c0
         ...             [  1.,   2.,   3.,   4.],  #c1
         ...             [  1.,   2.,   3.,   4.]]  #c2

    check_finite : bool, optional
        Whether to check that the input matrices contain only finite numbers.
        Disabling may give a performance gain, but may result in problems
        (crashes, non-termination) if the inputs do contain infinities or NaNs.
        Default: True

    Returns
    -------
    code : ndarray
        A length M array holding the code book index for each observation.
    dist : ndarray
        The distortion (distance) between the observation and its nearest
        code.

    Examples
    --------
    >>> from numpy import array
    >>> from scipy.cluster.vq import vq
    >>> code_book = array([[1.,1.,1.],
    ...                    [2.,2.,2.]])
    >>> features  = array([[  1.9,2.3,1.7],
    ...                    [  1.5,2.5,2.2],
    ...                    [  0.8,0.6,1.7]])
    >>> vq(features,code_book)
    (array([1, 1, 0],'i'), array([ 0.43588989,  0.73484692,  0.83066239]))

    """
    obs = _asarray_validated(obs, check_finite=check_finite)
    code_book = _asarray_validated(code_book, check_finite=check_finite)
    ct = np.common_type(obs, code_book)

    c_obs = obs.astype(ct, copy=False)
    c_code_book = code_book.astype(ct, copy=False)

    if np.issubdtype(ct, np.float64) or np.issubdtype(ct, np.float32):
        return _vq.vq(c_obs, c_code_book)
    return py_vq(obs, code_book, check_finite=False)
Beispiel #37
0
def diff_array(array1, array2, showdiffs=10, raiseondiff=False):
    if len(array1) != len(array2):
        print("length is different: %d vs %d" % (len(array1), len(array2)))
        ids1 = array1['id']
        ids2 = array2['id']
        all_ids = np.union1d(ids1, ids2)
        notin1 = np.setdiff1d(all_ids, ids1)
        notin2 = np.setdiff1d(all_ids, ids2)
        if len(notin1):
            print("the following ids are not present in file 1:", notin1)
        elif len(notin2):
            print("the following ids are not present in file 2:", notin2)
        else:
            # some ids must be duplicated
            if len(ids1) > len(all_ids):
                print("file 1 contain duplicate ids:", end=' ')
                uniques, dupes = unique_dupes(ids1)
                print(dupes)
                array1 = array1[uniques]
            if len(ids2) > len(all_ids):
                print("file 2 contain duplicate ids:", end=' ')
                uniques, dupes = unique_dupes(ids2)
                print(dupes)
                array2 = array2[uniques]

    fields1 = get_fields(array1)
    fields2 = get_fields(array2)
    fnames1 = set(array1.dtype.names)
    fnames2 = set(array2.dtype.names)
    # use merge_items instead of fnames1 | fnames2 to preserve ordering
    for fname, _ in merge_items(fields1, fields2):
        print("  - %s:" % fname, end=' ')
        if fname not in fnames1:
            print("missing in file 1")
            continue
        elif fname not in fnames2:
            print("missing in file 2")
            continue
        col1, col2 = array1[fname], array2[fname]
        if np.issubdtype(col1.dtype, np.inexact):
            if len(col1) == len(col2):
                both_nan = np.isnan(col1) & np.isnan(col2)
                eq = np.all(both_nan | (col1 == col2))
            else:
                eq = False
        else:
            eq = np.array_equal(col1, col2)

        if eq:
            print("ok")
        else:
            print("different", end=' ')
            if len(col1) != len(col2):
                print("(length)")
            else:
                diff = (col1 != col2).nonzero()[0]
                print("(%d differences)" % len(diff))
                ids = array1['id']
                if len(diff) > showdiffs:
                    diff = diff[:showdiffs]
                print(
                    PrettyTable(
                        [['id', fname + ' (file1)', fname + ' (file2)']] +
                        [[ids[idx], col1[idx], col2[idx]] for idx in diff]))
            if raiseondiff:
                raise Exception('different')
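# --- Illustration (not part of the function above): the NaN-aware float
# --- comparison used for inexact columns, on its own.
import numpy as np

col1 = np.array([1.0, np.nan, 3.0])
col2 = np.array([1.0, np.nan, 3.0])
both_nan = np.isnan(col1) & np.isnan(col2)
print(np.all(both_nan | (col1 == col2)))  # True, although np.nan != np.nan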
Beispiel #38
0
def is_complex(dtype):
    """Returns whether this is a complex floating point type."""
    dtype = tf.as_dtype(dtype)
    if hasattr(dtype, 'is_complex'):
        return dtype.is_complex
    return np.issubdtype(np.dtype(dtype), np.complexfloating)
Beispiel #39
0
def is_floating(dtype):
    """Returns whether this is a (non-quantized, real) floating point type."""
    dtype = tf.as_dtype(dtype)
    if hasattr(dtype, 'is_floating'):
        return dtype.is_floating
    return np.issubdtype(np.dtype(dtype), np.floating)
Beispiel #40
0
def upload(meta,
           cache,
           image,
           offset,
           mip,
           compress=None,
           cdn_cache=None,
           parallel=1,
           progress=False,
           delete_black_uploads=False,
           non_aligned_writes=False,
           location=None,
           location_bbox=None,
           location_order='F',
           use_shared_memory=False,
           use_file=False,
           green=False):
    """Upload img to vol with offset. This is the primary entry point for uploads."""

    if not np.issubdtype(image.dtype, np.dtype(meta.dtype).type):
        raise ValueError("""
      The uploaded image data type must match the volume data type. 

      Volume: {}
      Image: {}
      """.format(meta.dtype, image.dtype))

    shape = Vec(*image.shape)[:3]
    offset = Vec(*offset)[:3]
    bounds = Bbox(offset, shape + offset)

    is_aligned = check_grid_aligned(meta,
                                    image,
                                    bounds,
                                    mip,
                                    throw_error=(non_aligned_writes == False))

    if is_aligned:
        upload_aligned(
            meta,
            cache,
            image,
            offset,
            mip,
            compress=compress,
            cdn_cache=cdn_cache,
            parallel=parallel,
            progress=progress,
            location=location,
            location_bbox=location_bbox,
            location_order=location_order,
            use_shared_memory=use_shared_memory,
            use_file=use_file,
            delete_black_uploads=delete_black_uploads,
            green=green,
        )
        return

    # Upload the aligned core
    expanded = bounds.expand_to_chunk_size(meta.chunk_size(mip),
                                           meta.voxel_offset(mip))
    retracted = bounds.shrink_to_chunk_size(meta.chunk_size(mip),
                                            meta.voxel_offset(mip))
    core_bbox = retracted.clone() - bounds.minpt

    if not core_bbox.subvoxel():
        core_img = image[core_bbox.to_slices()]
        upload_aligned(
            meta,
            cache,
            core_img,
            retracted.minpt,
            mip,
            compress=compress,
            cdn_cache=cdn_cache,
            parallel=parallel,
            progress=progress,
            location=location,
            location_bbox=location_bbox,
            location_order=location_order,
            use_shared_memory=use_shared_memory,
            use_file=use_file,
            delete_black_uploads=delete_black_uploads,
            green=green,
        )

    # Download the shell, paint, and upload
    all_chunks = set(
        chunknames(expanded, meta.bounds(mip), meta.key(mip),
                   meta.chunk_size(mip)))
    core_chunks = set(
        chunknames(retracted, meta.bounds(mip), meta.key(mip),
                   meta.chunk_size(mip)))
    shell_chunks = all_chunks.difference(core_chunks)

    def shade_and_upload(img3d, bbox):
        # decode is returning non-writable chunk
        # we're throwing them away so safe to write
        img3d.setflags(write=1)
        shade(img3d, bbox, image, bounds)
        threaded_upload_chunks(
            meta,
            cache,
            img3d,
            mip,
            ((Vec(0, 0, 0), Vec(*img3d.shape[:3]), bbox.minpt, bbox.maxpt), ),
            compress=compress,
            cdn_cache=cdn_cache,
            progress=progress,
            n_threads=0,
            delete_black_uploads=delete_black_uploads,
            green=green,
        )

    compress_cache = should_compress(meta.encoding(mip),
                                     compress,
                                     cache,
                                     iscache=True)

    download_chunks_threaded(meta,
                             cache,
                             mip,
                             shell_chunks,
                             fn=shade_and_upload,
                             fill_missing=False,
                             progress=progress,
                             compress_cache=compress_cache,
                             green=green)
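# --- Illustration (not part of the function above): the dtype guard at the top
# --- of upload(), exercised with a hypothetical stand-in for `meta`.
import numpy as np

class FakeMeta:
    dtype = 'uint8'   # hypothetical volume dtype

image = np.zeros((4, 4, 4), dtype=np.uint16)
if not np.issubdtype(image.dtype, np.dtype(FakeMeta.dtype).type):
    print("dtype mismatch: volume={} image={}".format(FakeMeta.dtype, image.dtype))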
Beispiel #41
0
def validate_cuba_keyword(value, key):
    ''' Validate the given `value` against `key` such that
    shape and type of value matches what was specified

    Parameters
    ----------
    value : object
       any value

    key : str
       CUBA key, can be stripped of 'CUBA.'

    Returns
    -------
    None

    Raises
    ------
    TypeError
        - if key is a CUBA keyword with shape and the value's shape
          or type does not match
        - if key corresponds to a class defined by the meta data and
          the value is not an instance of that class
    '''
    from . import api
    from simphony.core.keywords import KEYWORDS

    # Sanitising, although generated code already did
    key = key.replace('CUBA.', '')

    # Class name, e.g. cuds_item -> CUDSItem
    class_name = to_camel_case(key)

    # The corresponding class in the metadata
    api_class = getattr(api, class_name, None)

    # Keyword name in KEYWORDS
    keyword_name = key.upper()

    if api_class:
        if not isinstance(value, api_class):
            message = '{0!r} is not an instance of {1}'
            raise TypeError(message.format(value, api_class))
    elif keyword_name in KEYWORDS:
        keyword = KEYWORDS[keyword_name]

        # Check type
        value_arr = numpy.asarray(value)

        if not numpy.issubdtype(value_arr.dtype, keyword.dtype):
            message = ('value has dtype {dtype1} while {key} '
                       'needs to be a {dtype2}')
            raise TypeError(
                message.format(dtype1=value_arr.dtype,
                               key=key,
                               dtype2=keyword.dtype))
        # FIXME: STRING
        # cuba.yml gives a fix length for the shape of string
        # It actually means the maximum length of the string
        # We will skip checking validating it for now
        if keyword.dtype is str and value_arr.dtype.char[0] in ('S', 'U'):
            warnings.warn('Value is a string, its shape is not validated. '
                          'Please fix the cuba.yml shape syntax.')
            return

        check_cuba_shape(value, keyword_name)
    else:
        message = '{} is not defined in CUBA keyword or meta data'
        warnings.warn(message.format(key.upper()))
Beispiel #42
0
def is_integer(dtype):
    """Returns whether this is a (non-quantized) integer type."""
    dtype = tf.as_dtype(dtype)
    if hasattr(dtype, 'is_integer') and not callable(dtype.is_integer):
        return dtype.is_integer
    return np.issubdtype(np.dtype(dtype), np.integer)
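# --- Illustration (not part of the helpers above): the pure-NumPy checks that
# --- is_complex / is_floating / is_integer fall back to when the TensorFlow
# --- dtype does not expose the corresponding attribute.
import numpy as np

print(np.issubdtype(np.dtype(np.complex64), np.complexfloating))  # True
print(np.issubdtype(np.dtype(np.float32), np.floating))           # True
print(np.issubdtype(np.dtype(np.int32), np.integer))              # True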
Beispiel #43
0
np.issctype(object)
np.issctype("S8")

np.obj2sctype(list)
np.obj2sctype(list, default=None)
np.obj2sctype(list, default=np.string_)

np.issubclass_(np.int32, int)
np.issubclass_(np.float64, float)
np.issubclass_(np.float64, (int, float))

np.issubsctype("int64", int)
np.issubsctype(np.array([1]), np.array([1]))

np.issubdtype("S1", np.string_)
np.issubdtype(np.float64, np.float32)

np.sctype2char("S1")
np.sctype2char(list)

np.find_common_type([], [np.int64, np.float32, complex])
np.find_common_type((), (np.int64, np.float32, complex))
np.find_common_type([np.int64, np.float32], [])
np.find_common_type([np.float32], [np.int64, np.float64])

np.cast[int]
np.cast["i8"]
np.cast[np.int64]

np.nbytes[int]
Beispiel #44
0
def _reduce(tf_fn,
            a,
            axis=None,
            dtype=None,
            keepdims=None,
            promote_int=_TO_INT64,
            tf_bool_fn=None,
            preserve_bool=False):
    """A general reduction function.

  Args:
    tf_fn: the TF reduction function.
    a: the array to be reduced.
    axis: (optional) the axis along which to do the reduction. If None, all
      dimensions are reduced.
    dtype: (optional) the dtype of the result.
    keepdims: (optional) whether to keep the reduced dimension(s).
    promote_int: how to promote integer and bool inputs. There are three
      choices: (1) _TO_INT64: always promote them to int64 or uint64; (2)
      _TO_FLOAT: always promote them to a float type (determined by
      dtypes.default_float_type); (3) None: don't promote.
    tf_bool_fn: (optional) the TF reduction function for bool inputs. It
      will only be used if `dtype` is explicitly set to `np.bool_` or if `a`'s
      dtype is `np.bool_` and `preserve_bool` is True.
    preserve_bool: a flag to control whether to use `tf_bool_fn` if `a`'s dtype
      is `np.bool_` (some reductions such as np.sum convert bools to
      integers, while others such as np.max preserve bools).

  Returns:
    An ndarray.
  """
    if dtype:
        dtype = utils.result_type(dtype)
    if keepdims is None:
        keepdims = False
    a = array_creation.asarray(a, dtype=dtype)
    if ((dtype == np.bool_ or preserve_bool and a.dtype == np.bool_)
            and tf_bool_fn is not None):
        return utils.tensor_to_ndarray(
            tf_bool_fn(input_tensor=a.data, axis=axis, keepdims=keepdims))
    if dtype is None:
        dtype = a.dtype
        if np.issubdtype(dtype, np.integer) or dtype == np.bool_:
            if promote_int == _TO_INT64:
                # If a is an integer/bool type and whose bit width is less than 64,
                # numpy up-casts it to 64-bit.
                if dtype == np.bool_:
                    is_signed = True
                    width = 8  # We can use any number here that is less than 64
                else:
                    is_signed = np.issubdtype(dtype, np.signedinteger)
                    width = np.iinfo(dtype).bits
                if width < 64:
                    if is_signed:
                        dtype = np.int64
                    else:
                        dtype = np.uint64
                    a = a.astype(dtype)
            elif promote_int == _TO_FLOAT:
                a = a.astype(dtypes.default_float_type())

    return utils.tensor_to_ndarray(
        tf_fn(input_tensor=a.data, axis=axis, keepdims=keepdims))
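# --- Illustration (not part of the function above): NumPy's own reductions show
# --- the same _TO_INT64-style promotion that _reduce reproduces.
import numpy as np

a = np.arange(5, dtype=np.int32)
print(np.sum(a).dtype)   # int64 on most 64-bit platforms: small ints are up-cast
b = np.array([True, False, True])
print(np.sum(b).dtype)   # int64 as well: np.sum converts bools to integers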
Beispiel #45
0
def iscomplexobj(x):
    x = np_array_ops.array(x)
    return np.issubdtype(x.dtype.as_numpy_dtype, np.complexfloating)
Beispiel #46
0
def test_voxel_pooling(ml, pos_dtype, feat_dtype, position_fn, feature_fn):
    # yapf: disable

    points = np.array([
        # 3 points in voxel
        [0.5, 0.5, 0.5],
        [0.7, 0.2, 0.3],
        [0.7, 0.5, 0.9],
        # 2 points in another voxel
        [1.4, 1.5, 1.4],
        [1.7, 1.2, 1.3],
        ], dtype=pos_dtype)

    features = np.array([
        # 3 points in voxel
        [1,1],
        [2,1],
        [3,1],
        # 2 points in another voxel
        [4,1],
        [5,1],
        ], dtype=feat_dtype)

    # yapf: enable

    voxel_size = 1
    ans = mltest.run_op(ml, ml.device, True, ml.ops.voxel_pooling, points,
                        features, voxel_size, position_fn, feature_fn)

    if position_fn == 'average':
        expected_positions = np.stack(
            [np.mean(points[:3], axis=0),
             np.mean(points[3:], axis=0)])
    elif position_fn == 'center':
        expected_positions = np.array([[0.5, 0.5, 0.5], [1.5, 1.5, 1.5]],
                                      dtype=pos_dtype)
    elif position_fn == 'nearest_neighbor':
        expected_positions = np.array([points[0], points[3]], dtype=pos_dtype)

    assert len(ans.pooled_positions) == 2

    # compute assignment
    if np.linalg.norm(ans.pooled_positions[0] -
                      expected_positions[0]) < np.linalg.norm(
                          ans.pooled_positions[0] - expected_positions[1]):
        index = [0, 1]
    else:
        index = [1, 0]

    np.testing.assert_allclose(ans.pooled_positions, expected_positions[index])

    if feature_fn == 'average':
        if np.issubdtype(feat_dtype, np.integer):
            expected_features = np.stack([
                np.sum(features[:3], axis=0) // 3,
                np.sum(features[3:], axis=0) // 2
            ])
        else:
            expected_features = np.stack(
                [np.mean(features[:3], axis=0),
                 np.mean(features[3:], axis=0)])
    elif feature_fn == 'max':
        expected_features = np.stack(
            [np.max(features[:3], axis=0),
             np.max(features[3:], axis=0)])
    elif feature_fn == 'nearest_neighbor':
        expected_features = np.array([features[0], features[3]])

    np.testing.assert_allclose(ans.pooled_features, expected_features[index])
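# --- Illustration (not part of the test above): the integer branch of the
# --- expected 'average' feature, which floors the mean so it stays integral.
import numpy as np

features = np.array([[1, 1], [2, 1], [3, 1]], dtype=np.int32)
if np.issubdtype(features.dtype, np.integer):
    pooled = np.sum(features, axis=0) // features.shape[0]
else:
    pooled = np.mean(features, axis=0)
print(pooled)  # [2 1]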
Beispiel #47
0
def _daal_check_array(array,
                      accept_sparse=False,
                      *,
                      accept_large_sparse=True,
                      dtype="numeric",
                      order=None,
                      copy=False,
                      force_all_finite=True,
                      ensure_2d=True,
                      allow_nd=False,
                      ensure_min_samples=1,
                      ensure_min_features=1,
                      estimator=None):
    """Input validation on an array, list, sparse matrix or similar.

    By default, the input is checked to be a non-empty 2D array containing
    only finite values. If the dtype of the array is object, attempt
    converting to float, raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : string, boolean or list/tuple of strings (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool (default=True)
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.

        .. versionadded:: 0.20

    dtype : string, type, list of types or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : 'F', 'C' or None (default=None)
        Whether an array will be forced to be fortran or c-style.
        When order is None (default), then if copy=False, nothing is ensured
        about the memory layout of the output array; otherwise (copy=True)
        the memory layout of the returned array is kept as close as possible
        to the original array.

    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf, np.nan, pd.NA in array. The
        possibilities are:

        - True: Force all values of array to be finite.
        - False: accepts np.inf, np.nan, pd.NA in array.
        - 'allow-nan': accepts only np.nan and pd.NA values in array. Values
          cannot be infinite.

        .. versionadded:: 0.20
           ``force_all_finite`` accepts the string ``'allow-nan'``.

        .. versionchanged:: 0.23
           Accepts `pd.NA` and converts it into `np.nan`

    ensure_2d : boolean (default=True)
        Whether to raise a value error if array is not 2D.

    allow_nd : boolean (default=False)
        Whether to allow array.ndim > 2.

    ensure_min_samples : int (default=1)
        Make sure that the array has a minimum number of samples in its first
        axis (rows for a 2D array). Setting to 0 disables this check.

    ensure_min_features : int (default=1)
        Make sure that the 2D array has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.

    estimator : str or estimator instance (default=None)
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    array_converted : object
        The converted and validated array.
    """
    if force_all_finite not in (True, False, 'allow-nan'):
        raise ValueError('force_all_finite should be a bool or "allow-nan"'
                         '. Got {!r} instead'.format(force_all_finite))

    if estimator is not None:
        if isinstance(estimator, str):
            estimator_name = estimator
        else:
            estimator_name = estimator.__class__.__name__
    else:
        estimator_name = "Estimator"
    context = " by %s" % estimator_name if estimator is not None else ""

    array_orig = array

    # a branch for heterogeneous pandas.DataFrame
    if is_DataFrame(array) and get_number_of_types(array) > 1:
        from pandas.api.types import is_sparse
        if hasattr(array, 'sparse') or \
                not array.dtypes.apply(is_sparse).any():
            return _pandas_check_array(array, array_orig, force_all_finite,
                                       ensure_min_samples, ensure_min_features,
                                       copy, context)

    # store whether originally we wanted numeric dtype
    dtype_numeric = isinstance(dtype, str) and dtype == "numeric"

    dtype_orig = getattr(array, "dtype", None)
    if not hasattr(dtype_orig, 'kind'):
        # not a data type (e.g. a column named dtype in a pandas DataFrame)
        dtype_orig = None

    # check if the object contains several dtypes (typically a pandas
    # DataFrame), and store them. If not, store None.
    dtypes_orig = None
    has_pd_integer_array = False
    if hasattr(array, "dtypes") and hasattr(array.dtypes, '__array__'):
        # throw warning if columns are sparse. If all columns are sparse, then
        # array.sparse exists and sparsity will be preserved (later).
        with suppress(ImportError):
            from pandas.api.types import is_sparse
            if not hasattr(array, 'sparse') and \
                    array.dtypes.apply(is_sparse).any():
                warnings.warn("pandas.DataFrame with sparse columns found."
                              "It will be converted to a dense numpy array.")

        dtypes_orig = list(array.dtypes)
        # pandas boolean dtype __array__ interface coerces bools to objects
        for i, dtype_iter in enumerate(dtypes_orig):
            if dtype_iter.kind == 'b':
                dtypes_orig[i] = np.dtype(object)
            elif dtype_iter.name.startswith(("Int", "UInt")):
                # name looks like an Integer Extension Array, now check for
                # the dtype
                with suppress(ImportError):
                    from pandas import (Int8Dtype, Int16Dtype, Int32Dtype,
                                        Int64Dtype, UInt8Dtype, UInt16Dtype,
                                        UInt32Dtype, UInt64Dtype)
                    if isinstance(
                            dtype_iter,
                        (Int8Dtype, Int16Dtype, Int32Dtype, Int64Dtype,
                         UInt8Dtype, UInt16Dtype, UInt32Dtype, UInt64Dtype)):
                        has_pd_integer_array = True

        if all(isinstance(dtype, np.dtype) for dtype in dtypes_orig):
            dtype_orig = np.result_type(*dtypes_orig)

    if dtype_numeric:
        if dtype_orig is not None and dtype_orig.kind == "O":
            # if input is object, convert to float.
            dtype = np.float64
        else:
            dtype = None

    if isinstance(dtype, (list, tuple)):
        if dtype_orig is not None and dtype_orig in dtype:
            # no dtype conversion required
            dtype = None
        else:
            # dtype conversion required. Let's select the first element of the
            # list of accepted types.
            dtype = dtype[0]

    if has_pd_integer_array:
        # If there are any pandas integer extension arrays, cast the whole
        # input to `dtype` up front.
        array = array.astype(dtype)

    # When all dataframe columns are sparse, convert to a sparse array
    if hasattr(array, 'sparse') and array.ndim > 1:
        # DataFrame.sparse only supports `to_coo`
        array = array.sparse.to_coo()

    if sp.issparse(array):
        _ensure_no_complex_data(array)
        array = _ensure_sparse_format(array,
                                      accept_sparse=accept_sparse,
                                      dtype=dtype,
                                      copy=copy,
                                      force_all_finite=force_all_finite,
                                      accept_large_sparse=accept_large_sparse)
    else:
        # If np.array(..) gives ComplexWarning, then we convert the warning
        # to an error. This is needed because specifying a non complex
        # dtype to the function converts complex to real dtype,
        # thereby passing the test made in the lines following the scope
        # of warnings context manager.
        with warnings.catch_warnings():
            try:
                warnings.simplefilter('error', ComplexWarning)
                if dtype is not None and np.dtype(dtype).kind in 'iu':
                    # Conversion float -> int should not contain NaN or
                    # inf (numpy#14412). We cannot use casting='safe' because
                    # then conversion float -> int would be disallowed.
                    array = np.asarray(array, order=order)
                    if array.dtype.kind == 'f':
                        _daal_assert_all_finite(array,
                                                allow_nan=False,
                                                msg_dtype=dtype)
                    array = array.astype(dtype, casting="unsafe", copy=False)
                else:
                    array = np.asarray(array, order=order, dtype=dtype)
            except ComplexWarning:
                raise ValueError("Complex data not supported\n"
                                 "{}\n".format(array))

        # It is possible that the np.array(..) gave no warning. This happens
        # when no dtype conversion happened, for example dtype = None. The
        # result is that np.array(..) produces an array of complex dtype
        # and we need to catch and raise exception for such cases.
        _ensure_no_complex_data(array)  # doing nothing for DataFrame

        if ensure_2d:
            # If input is scalar raise error
            if array.ndim == 0:
                raise ValueError(
                    "Expected 2D array, got scalar array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array))
            # If input is 1D raise error
            if array.ndim == 1:
                raise ValueError(
                    "Expected 2D array, got 1D array instead:\narray={}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.".format(array))

        # in the future np.flexible dtypes will be handled like object dtypes
        if dtype_numeric and np.issubdtype(array.dtype, np.flexible):
            warnings.warn(
                "Beginning in version 0.22, arrays of bytes/strings will be "
                "converted to decimal numbers if dtype='numeric'. "
                "It is recommended that you convert the array to "
                "a float dtype before using it in scikit-learn, "
                "for example by using "
                "your_array = your_array.astype(np.float64).",
                FutureWarning,
                stacklevel=2)

        # make sure we actually converted to numeric:
        if dtype_numeric and array.dtype.kind == "O":
            array = array.astype(np.float64)
        if not allow_nd and array.ndim >= 3:
            raise ValueError("Found array with dim %d. %s expected <= 2." %
                             (array.ndim, estimator_name))

        if force_all_finite:
            _daal_assert_all_finite(array,
                                    allow_nan=force_all_finite == 'allow-nan')

    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError(
                "Found array with %d sample(s) (shape=%s) while a"
                " minimum of %d is required%s." %
                (n_samples, array.shape, ensure_min_samples, context))

    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError(
                "Found array with %d feature(s) (shape=%s) while"
                " a minimum of %d is required%s." %
                (n_features, array.shape, ensure_min_features, context))

    if copy and np.may_share_memory(array, array_orig):
        array = np.array(array, dtype=dtype, order=order)

    return array
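# --- Illustration (not part of the function above): the validation contract
# --- mirrors scikit-learn's check_array; with dtype="numeric", object input is
# --- coerced to float64 (assumes scikit-learn is installed).
import numpy as np
from sklearn.utils import check_array

X = np.array([[1, 2], [3, 4]], dtype=object)
X_checked = check_array(X, dtype="numeric")
print(X_checked.dtype, X_checked.shape)  # float64 (2, 2)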
Beispiel #48
0
def assert_column_equal(
    left,
    right,
    check_dtype=True,
    check_column_type="equiv",
    check_less_precise=False,
    check_exact=False,
    check_datetimelike_compat=False,
    check_categorical=True,
    check_category_order=True,
    obj="ColumnBase",
):
    """
    Check that left and right columns are equal

    This function is intended to compare two columns and output
    any differences. Additional parameters allow varying the strictness
    of the equality checks performed.

    Parameters
    ----------
    left : Column
        left Column to compare
    right : Column
        right Column to compare
    check_dtype : bool, default True
        Whether to check the Column dtype is identical.
    check_column_type : bool or {‘equiv’}, default ‘equiv’
        Whether to check the columns class, dtype and
        inferred_type are identical. Currently this check is not
        performed; the parameter exists for API parity with pandas.
    check_less_precise : bool or int, default False
        Not yet supported
    check_exact : bool, default False
        Whether to compare number exactly.
    check_datetimelike_compat : bool, default False
        Compare datetime-like which is comparable ignoring dtype.
    check_categorical : bool, default True
        Whether to compare internal Categorical exactly.
    check_category_order : bool, default True
        Whether to compare category order of internal Categoricals
    obj : str, default ‘ColumnBase’
        Specify object name being compared, internally used to
        show appropriate assertion message.
    """
    if check_dtype is True:
        if (is_categorical_dtype(left) and is_categorical_dtype(right)
                and not check_categorical):
            pass
        else:
            if type(left) != type(right) or left.dtype != right.dtype:
                msg1 = f"{left.dtype}"
                msg2 = f"{right.dtype}"
                raise_assert_detail(obj, "Dtypes are different", msg1, msg2)

    if check_datetimelike_compat:
        if np.issubdtype(left.dtype, np.datetime64):
            right = right.astype(left.dtype)
        elif np.issubdtype(right.dtype, np.datetime64):
            left = left.astype(right.dtype)

        if np.issubdtype(left.dtype, np.datetime64):
            if not left.equals(right):
                raise AssertionError(
                    f"[datetimelike_compat=True] {left.values} "
                    f"is not equal to {right.values}.")
            return

    if check_exact and check_categorical:
        if is_categorical_dtype(left) and is_categorical_dtype(right):
            left_cat = left.cat().categories
            right_cat = right.cat().categories

            if check_category_order:
                assert_index_equal(
                    left_cat,
                    right_cat,
                    exact=check_dtype,
                    check_exact=True,
                    check_categorical=False,
                )
                assert_column_equal(
                    left.codes,
                    right.codes,
                    check_dtype=check_dtype,
                    check_exact=True,
                    check_categorical=False,
                    check_category_order=False,
                )

            if left.ordered != right.ordered:
                msg1 = f"{left.ordered}"
                msg2 = f"{right.ordered}"
                raise_assert_detail("{obj} category", "Orders are different",
                                    msg1, msg2)

    if (not check_dtype and is_categorical_dtype(left)
            and is_categorical_dtype(right)):
        left = left.astype(left.categories.dtype)
        right = right.astype(right.categories.dtype)

    columns_equal = False
    try:
        columns_equal = left.equals(right)
    except TypeError as e:
        if str(e) != "Categoricals can only compare with the same type":
            raise e
        if is_categorical_dtype(left) and is_categorical_dtype(right):
            left = left.astype(left.categories.dtype)
            right = right.astype(right.categories.dtype)
    if not columns_equal:
        msg1 = f"{left.to_array()}"
        msg2 = f"{right.to_array()}"
        try:
            diff = left.apply_boolean_mask(left != right).size
            diff = diff * 100.0 / left.size
        except BaseException:
            diff = 100.0
        raise_assert_detail(
            obj,
            f"values are different ({np.round(diff, 5)} %)",
            msg1,
            msg2,
        )
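# --- Illustration (not part of the function above): the datetimelike-compat
# --- branch keys off np.datetime64 subtypes.
import numpy as np

print(np.issubdtype(np.dtype('datetime64[ns]'), np.datetime64))  # True
print(np.issubdtype(np.dtype('int64'), np.datetime64))           # False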
Beispiel #49
0
    'age_smoke': 'int64',
    'fagerstromtotal': 'int64',
    'weekday': 'object',
    'hour_of_day': 'int64',
    'part_of_day_afternoon': 'object',
    'part_of_day_evening': 'object',
    'part_of_day_morning': 'object',
    'part_of_day_night': 'object',
    'episode_type': 'object'
}
X = X.astype(convert_dict)

# standardise all features in X:

scaler = StandardScaler()
num_cols = X.columns[X.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
X[num_cols] = scaler.fit_transform(X[num_cols])

del X['episode_type']

y = final_df.loc[:, final_df.columns == 'episode_type']
y = y.astype('int')

# create training and testing set of data split 70/30:

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)
columns = X_train.columns
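# --- Illustration (not part of the snippet above): the numeric-column selection
# --- used before scaling, reproduced on a small made-up DataFrame.
import numpy as np
import pandas as pd

df = pd.DataFrame({'age': [20, 30], 'city': ['a', 'b'], 'score': [1.5, 2.5]})
num_cols = df.columns[df.dtypes.apply(lambda c: np.issubdtype(c, np.number))]
print(list(num_cols))  # ['age', 'score']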
Beispiel #50
0
def average(a, axis=None, weights=None, returned=False):  # pylint: disable=missing-docstring
    if axis is not None and not isinstance(axis, six.integer_types):
        # TODO(wangpeng): Support tuple of ints as `axis`
        raise ValueError('`axis` must be an integer. Tuple of ints is not '
                         'supported yet. Got type: %s' % type(axis))
    a = np_array_ops.array(a)
    if weights is None:  # Treat all weights as 1
        if not np.issubdtype(a.dtype.as_numpy_dtype, np.inexact):
            a = a.astype(
                np_utils.result_type(a.dtype, np_dtypes.default_float_type()))
        avg = math_ops.reduce_mean(a, axis=axis)
        if returned:
            if axis is None:
                weights_sum = array_ops.size(a)
            else:
                weights_sum = array_ops.shape(a)[axis]
            weights_sum = math_ops.cast(weights_sum, a.dtype)
    else:
        if np.issubdtype(a.dtype.as_numpy_dtype, np.inexact):
            out_dtype = np_utils.result_type(a.dtype, weights)
        else:
            out_dtype = np_utils.result_type(a.dtype, weights,
                                             np_dtypes.default_float_type())
        a = np_array_ops.array(a, out_dtype)
        weights = np_array_ops.array(weights, out_dtype)

        def rank_equal_case():
            control_flow_ops.Assert(
                math_ops.reduce_all(
                    array_ops.shape(a) == array_ops.shape(weights)),
                [array_ops.shape(a),
                 array_ops.shape(weights)])
            weights_sum = math_ops.reduce_sum(weights, axis=axis)
            avg = math_ops.reduce_sum(a * weights, axis=axis) / weights_sum
            return avg, weights_sum

        if axis is None:
            avg, weights_sum = rank_equal_case()
        else:

            def rank_not_equal_case():
                control_flow_ops.Assert(
                    array_ops.rank(weights) == 1, [array_ops.rank(weights)])
                weights_sum = math_ops.reduce_sum(weights)
                axes = ops.convert_to_tensor([[axis], [0]])
                avg = math_ops.tensordot(a, weights, axes) / weights_sum
                return avg, weights_sum

            # We condition on rank rather than shape equality, because if we do the
            # latter, when the shapes are partially unknown but the ranks are known
            # and different, np_utils.cond will run shape checking on the true branch,
            # which will raise a shape-checking error.
            avg, weights_sum = np_utils.cond(
                math_ops.equal(array_ops.rank(a), array_ops.rank(weights)),
                rank_equal_case, rank_not_equal_case)

    avg = np_array_ops.array(avg)
    if returned:
        weights_sum = np_array_ops.broadcast_to(weights_sum,
                                                array_ops.shape(avg))
        return avg, weights_sum
    return avg
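# --- Illustration (not part of the function above): NumPy's np.average shows the
# --- weighted-sum semantics the TF version reimplements.
import numpy as np

a = np.array([1.0, 2.0, 3.0])
w = np.array([3.0, 1.0, 0.0])
avg, weights_sum = np.average(a, weights=w, returned=True)
print(avg, weights_sum)  # 1.25 4.0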
Beispiel #51
0
    def test_frame_add_datetime64_column(self):
        rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
        df = DataFrame(index=np.arange(len(rng)))

        df['A'] = rng
        assert np.issubdtype(df['A'].dtype, np.dtype('M8[ns]'))
Beispiel #52
0
def tw_matrices_to_lists(doc_word, doc_tag):
    """
    Modification of matrix_to_lists from lda.utils into its tag-word analogue
    Convert (sparse) matrices of counts into arrays of tagwords and doc indices

    Parameters
    ----------
    doc_word : array or sparse matrix (D, W)
    doc_tag : array or sparse matrix (D, T)

    Returns
    -------
    (TS, WS, DS) : Tuple of three arrays
        TS[k] contains the tag in the kth tag-word in the corpus
        WS[k] contains the word in the kth tag-word in the corpus
        DS[k] contains the document index for the kth tag-word

    """
    if np.count_nonzero(doc_word.sum(axis=1)) != doc_word.shape[0]:
        logger.warning("all zero row in document-word matrix found")
    if np.count_nonzero(doc_word.sum(axis=0)) != doc_word.shape[1]:
        logger.warning("all zero column in document-word matrix found")
    if np.count_nonzero(doc_tag.sum(axis=1)) != doc_tag.shape[0]:
        logger.warning("all zero row in document-tag matrix found")
    if np.count_nonzero(doc_tag.sum(axis=0)) != doc_tag.shape[1]:
        logger.warning("all zero column in document-tag matrix found")

    dw_sparse = True
    try:
        # if doc_word is a scipy sparse matrix
        doc_word = doc_word.copy().tolil()
    except AttributeError:
        dw_sparse = False

    dt_sparse = True
    try:
        # if doc_tag is a scipy sparse matrix
        doc_tag = doc_tag.copy().tolil()
    except AttributeError:
        dt_sparse = False

    if (dw_sparse and not np.issubdtype(doc_word.dtype, int)) or (dt_sparse and not np.issubdtype(doc_tag.dtype, int)):
        raise ValueError("expected sparse matrix with integer values, found float values")

    #Obtain doc id + word/tag id lists for nonzero entries in doc_word and doc_tag
    dw_doc_i, dw_word_i = np.nonzero(doc_word)
    if dw_sparse:
        dw_counts_i = np.array(list(doc_word[i, j] for i, j in zip(dw_doc_i, dw_word_i)))
    else:
        dw_counts_i = doc_word[dw_doc_i, dw_word_i]
    dt_doc_i, dt_tag_i = np.nonzero(doc_tag)

    #group the words and tags by doc id, each iterator returns at each step (doc_id, iter_of_words/tags)
    dw_word_gb_doc = it.groupby(zip(dw_doc_i, dw_word_i, dw_counts_i), lambda x: x[0])
    dt_tag_gb_doc = it.groupby(zip(dt_doc_i,dt_tag_i), lambda x: x[0])
    #get an iterator that returns at each step for different doc_ids, (iter_of_words X iter_of_tags) * word repetition
    #in the form (doc_id, word_id, tag_id) * repetition
    doc_tagword_iter = it.chain.from_iterable(
                            it.repeat((tup[0][0], tup[0][1], tup[1][1]), tup[0][2]) for tup in (it.chain.from_iterable(
                                it.product(doc_words[1], doc_tags[1]) for doc_words, doc_tags in zip(dw_word_gb_doc, dt_tag_gb_doc)
                            ))
                        )

    #doc-tagword array
    DTWS = np.array(list(doc_tagword_iter))

    #return TS, WS, DS
    return DTWS[:,2], DTWS[:,1], DTWS[:,0]
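# --- Usage sketch (not part of the function above); assumes the enclosing module
# --- imports numpy as np and itertools as `it` and defines a `logger`, as the
# --- function body implies.
import numpy as np

doc_word = np.array([[2, 0],
                     [0, 1]])   # 2 docs x 2 words, word 0 appears twice in doc 0
doc_tag = np.array([[1, 0],
                    [0, 1]])    # 2 docs x 2 tags
TS, WS, DS = tw_matrices_to_lists(doc_word, doc_tag)
print(DS, WS, TS)  # [0 0 1] [0 0 1] [0 0 1]: doc 0 yields (word 0, tag 0) twice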
Beispiel #53
0
def specshow(data,
             x_coords=None,
             y_coords=None,
             x_axis=None,
             y_axis=None,
             sr=22050,
             hop_length=512,
             fmin=None,
             fmax=None,
             bins_per_octave=12,
             **kwargs):
    '''Display a spectrogram/chromagram/cqt/etc.


    Parameters
    ----------
    data : np.ndarray [shape=(d, n)]
        Matrix to display (e.g., spectrogram)

    sr : number > 0 [scalar]
        Sample rate used to determine time scale in x-axis.

    hop_length : int > 0 [scalar]
        Hop length, also used to determine time scale in x-axis

    x_axis : None or str

    y_axis : None or str
        Range for the x- and y-axes.

        Valid types are:

        - None, 'none', or 'off' : no axis decoration is displayed.

        Frequency types:

        - 'linear', 'fft', 'hz' : frequency range is determined by
          the FFT window and sampling rate.
        - 'log' : the spectrum is displayed on a log scale.
        - 'mel' : frequencies are determined by the mel scale.
        - 'cqt_hz' : frequencies are determined by the CQT scale.
        - 'cqt_note' : pitches are determined by the CQT scale.

        All frequency types are plotted in units of Hz.

        Categorical types:

        - 'chroma' : pitches are determined by the chroma filters.
          Pitch classes are arranged at integer locations (0-11).

        - 'tonnetz' : axes are labeled by Tonnetz dimensions (0-5)
        - 'frames' : markers are shown as frame counts.


        Time types:

        - 'time' : markers are shown as milliseconds, seconds,
          minutes, or hours
        - 'lag' : like time, but past the half-way point counts
          as negative values.

        All time types are plotted in units of seconds.

        Other:

        - 'tempo' : markers are shown as beats-per-minute (BPM)
            using a logarithmic scale.

    x_coords : np.ndarray [shape=data.shape[1]+1]
    y_coords : np.ndarray [shape=data.shape[0]+1]

        Optional positioning coordinates of the input data.
        These can be used to explicitly set the location of each
        element `data[i, j]`, e.g., for displaying beat-synchronous
        features in natural time coordinates.

        If not provided, they are inferred from `x_axis` and `y_axis`.

    fmin : float > 0 [scalar] or None
        Frequency of the lowest spectrogram bin.  Used for Mel and CQT
        scales.

        If `y_axis` is `cqt_hz` or `cqt_note` and `fmin` is not given,
        it is set by default to `note_to_hz('C1')`.

    fmax : float > 0 [scalar] or None
        Used for setting the Mel frequency scales

    bins_per_octave : int > 0 [scalar]
        Number of bins per octave.  Used for CQT frequency scale.

    kwargs : additional keyword arguments
        Arguments passed through to `matplotlib.pyplot.pcolormesh`.


    Returns
    -------
    axes
        The axis handle for the figure.


    See Also
    --------
    cmap : Automatic colormap detection

    matplotlib.pyplot.pcolormesh


    Examples
    --------
    Visualize an STFT power spectrum

    >>> import matplotlib.pyplot as plt
    >>> y, sr = librosa.load(librosa.util.example_audio_file())
    >>> plt.figure(figsize=(12, 8))

    >>> D = librosa.amplitude_to_db(librosa.stft(y), ref=np.max)
    >>> plt.subplot(4, 2, 1)
    >>> librosa.display.specshow(D, y_axis='linear')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Linear-frequency power spectrogram')


    Or on a logarithmic scale

    >>> plt.subplot(4, 2, 2)
    >>> librosa.display.specshow(D, y_axis='log')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Log-frequency power spectrogram')


    Or use a CQT scale

    >>> CQT = librosa.amplitude_to_db(librosa.cqt(y, sr=sr), ref=np.max)
    >>> plt.subplot(4, 2, 3)
    >>> librosa.display.specshow(CQT, y_axis='cqt_note')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Constant-Q power spectrogram (note)')

    >>> plt.subplot(4, 2, 4)
    >>> librosa.display.specshow(CQT, y_axis='cqt_hz')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Constant-Q power spectrogram (Hz)')


    Draw a chromagram with pitch classes

    >>> C = librosa.feature.chroma_cqt(y=y, sr=sr)
    >>> plt.subplot(4, 2, 5)
    >>> librosa.display.specshow(C, y_axis='chroma')
    >>> plt.colorbar()
    >>> plt.title('Chromagram')


    Force a grayscale colormap (white -> black)

    >>> plt.subplot(4, 2, 6)
    >>> librosa.display.specshow(D, cmap='gray_r', y_axis='linear')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Linear power spectrogram (grayscale)')


    Draw time markers automatically

    >>> plt.subplot(4, 2, 7)
    >>> librosa.display.specshow(D, x_axis='time', y_axis='log')
    >>> plt.colorbar(format='%+2.0f dB')
    >>> plt.title('Log power spectrogram')


    Draw a tempogram with BPM markers

    >>> plt.subplot(4, 2, 8)
    >>> Tgram = librosa.feature.tempogram(y=y, sr=sr)
    >>> librosa.display.specshow(Tgram, x_axis='time', y_axis='tempo')
    >>> plt.colorbar()
    >>> plt.title('Tempogram')
    >>> plt.tight_layout()


    Draw beat-synchronous chroma in natural time

    >>> plt.figure()
    >>> tempo, beat_f = librosa.beat.beat_track(y=y, sr=sr, trim=False)
    >>> beat_f = librosa.util.fix_frames(beat_f, x_max=C.shape[1])
    >>> Csync = librosa.util.sync(C, beat_f, aggregate=np.median)
    >>> beat_t = librosa.frames_to_time(beat_f, sr=sr)
    >>> ax1 = plt.subplot(2,1,1)
    >>> librosa.display.specshow(C, y_axis='chroma', x_axis='time')
    >>> plt.title('Chroma (linear time)')
    >>> ax2 = plt.subplot(2,1,2, sharex=ax1)
    >>> librosa.display.specshow(Csync, y_axis='chroma', x_axis='time',
    ...                          x_coords=beat_t)
    >>> plt.title('Chroma (beat time)')
    >>> plt.tight_layout()
    '''

    kwargs.setdefault('shading', 'flat')

    if np.issubdtype(data.dtype, np.complexfloating):
        warnings.warn('Trying to display complex-valued input. '
                      'Showing magnitude instead.')
        data = np.abs(data)

    kwargs.setdefault('cmap', cmap(data))

    all_params = dict(kwargs=kwargs,
                      sr=sr,
                      fmin=fmin,
                      fmax=fmax,
                      bins_per_octave=bins_per_octave,
                      hop_length=hop_length)

    # Get the x and y coordinates
    y_coords = __mesh_coords(y_axis, y_coords, data.shape[0], **all_params)
    x_coords = __mesh_coords(x_axis, x_coords, data.shape[1], **all_params)

    axes = plt.gca()
    out = axes.pcolormesh(x_coords, y_coords, data, **kwargs)
    plt.sci(out)

    axes.set_xlim(x_coords.min(), x_coords.max())
    axes.set_ylim(y_coords.min(), y_coords.max())

    # Set up axis scaling
    __scale_axes(axes, x_axis, 'x')
    __scale_axes(axes, y_axis, 'y')

    # Construct tickers and locators
    __decorate_axis(axes.xaxis, x_axis)
    __decorate_axis(axes.yaxis, y_axis)

    return axes
Beispiel #54
0
def pad_sequences(sequences,
                  maxlen=None,
                  dtype='int32',
                  padding='pre',
                  truncating='pre',
                  value=0.,
                  eos=1.):
    """
    Pads sequences to the same length.

    This function transforms a list of
    `num_samples` sequences (lists of integers)
    into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
    `num_timesteps` is either the `maxlen` argument if provided,
    or the length of the longest sequence otherwise.

    Sequences that are shorter than `num_timesteps`
    are padded with `value` at the end.

    Sequences longer than `num_timesteps` are truncated
    so that they fit the desired length.
    The position where padding or truncation happens is determined by
    the arguments `padding` and `truncating`, respectively.

    Pre-padding is the default.

    # Arguments
        sequences: List of lists, where each element is a sequence.
        maxlen: Int, maximum length of all sequences.
        dtype: Type of the output sequences.
            To pad sequences with variable length strings, you can use `object`.
        padding: String, 'pre' or 'post':
            pad either before or after each sequence.
        truncating: String, 'pre' or 'post':
            remove values from sequences larger than
            `maxlen`, either at the beginning or at the end of the sequences.
        value: Float or String, padding value.
        eos: End-of-sentence index used to terminate each sequence
            (appended in the truncating='post' branch).
    # Returns
        x: Numpy array with shape `(len(sequences), maxlen)`

    # Raises
        ValueError: In case of invalid values for `truncating` or `padding`,
            or in case of invalid shape for a `sequences` entry.
    """
    if not hasattr(sequences, '__len__'):
        raise ValueError('`sequences` must be iterable.')
    lengths = []
    for x in sequences:
        if not hasattr(x, '__len__'):
            raise ValueError('`sequences` must be a list of iterables. '
                             'Found non-iterable: ' + str(x))
        lengths.append(len(x))

    num_samples = len(sequences)
    if maxlen is None:
        maxlen = np.max(lengths)

    # take the sample shape from the first non empty sequence
    # checking for consistency in the main loop below.
    sample_shape = tuple()
    for s in sequences:
        if len(s) > 0:
            sample_shape = np.asarray(s).shape[1:]
            break

    is_dtype_str = np.issubdtype(dtype, np.str_) or np.issubdtype(
        dtype, np.unicode_)
    if isinstance(value,
                  six.string_types) and dtype != object and not is_dtype_str:
        raise ValueError(
            "`dtype` {} is not compatible with `value`'s type: {}:\n"
            "You should set `dtype=object` for variable length strings.".
            format(dtype, type(value)))

    x = np.full((num_samples, maxlen + 1) + sample_shape, value, dtype=dtype)
    for idx, s in enumerate(sequences):
        if not len(s):
            continue  # empty list/array was found
        if truncating == 'pre':
            trunc = s[-maxlen:]
        elif truncating == 'post':
            trunc = s[:maxlen] + [float(eos)]
        else:
            raise ValueError('Truncating type "%s" '
                             'not understood' % truncating)

        # check `trunc` has expected shape
        trunc = np.asarray(trunc, dtype=dtype)
        if trunc.shape[1:] != sample_shape:
            raise ValueError('Shape of sample %s of sequence at position %s '
                             'is different from expected shape %s' %
                             (trunc.shape[1:], idx, sample_shape))

        if padding == 'post':
            x[idx, :len(trunc)] = trunc
        elif padding == 'pre':
            x[idx, -len(trunc):] = trunc
        else:
            raise ValueError('Padding type "%s" not understood' % padding)
    return x
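A minimal usage sketch for the function above, assuming `numpy` is imported as `np` and `six` is available as in the surrounding module; the sequence values are hypothetical:

import numpy as np

# Hypothetical sequences of token ids; with maxlen=4 the result has
# maxlen + 1 columns because an extra column is reserved for `eos`.
seqs = [[1, 2, 3], [4, 5], [6]]
padded = pad_sequences(seqs, maxlen=4, padding='pre', truncating='pre')
print(padded.shape)   # (3, 5)
print(padded[0])      # [0 0 1 2 3] -- left-padded with the default value 0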
Example #55
0
def infer_problem_type(y: Series, silent=False) -> str:
    """ Identifies which type of prediction problem we are interested in (if user has not specified).
        I.e. binary classification, multi-class classification, or regression.
    """
    if len(y) == 0:
        raise ValueError("provided labels cannot have length = 0")
    # Remove missing values from y (there should not be any though, as they were
    # already removed in Learner.general_data_processing())
    y = y.dropna()
    num_rows = len(y)

    unique_values = y.unique()

    MULTICLASS_LIMIT = 1000  # if numeric and class count would be above this amount, assume it is regression
    if num_rows > 1000:
        REGRESS_THRESHOLD = 0.05  # if the unique-ratio is less than this, we assume multiclass classification, even when labels are integers
    else:
        REGRESS_THRESHOLD = 0.1

    unique_count = len(unique_values)
    if unique_count == 2:
        problem_type = BINARY
        reason = "only two unique label-values observed"
    elif y.dtype.name in ['object', 'category']:
        problem_type = MULTICLASS
        reason = f"dtype of label-column == {y.dtype.name}"
    elif np.issubdtype(y.dtype, np.floating):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <=
                                                    MULTICLASS_LIMIT):
            try:
                can_convert_to_int = np.array_equal(y, y.astype(int))
                if can_convert_to_int:
                    problem_type = MULTICLASS
                    reason = "dtype of label-column == float, but few unique label-values observed and label-values can be converted to int"
                else:
                    problem_type = REGRESSION
                    reason = "dtype of label-column == float and label-values can't be converted to int"
            except:
                problem_type = REGRESSION
                reason = "dtype of label-column == float and label-values can't be converted to int"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == float and many unique label-values observed"
    elif np.issubdtype(y.dtype, np.integer):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <=
                                                    MULTICLASS_LIMIT):
            problem_type = MULTICLASS  # TODO: Check if integers are from 0 to n-1 for n unique values, if they have a wide spread, it could still be regression
            reason = "dtype of label-column == int, but few unique label-values observed"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == int and many unique label-values observed"
    else:
        raise NotImplementedError(f'label dtype {y.dtype} not supported!')
    if not silent:
        logger.log(
            25,
            f"AutoGluon infers your prediction problem is: '{problem_type}' (because {reason})."
        )

        # TODO: Move this outside of this function so it is visible even if problem type was not inferred.
        if problem_type in [BINARY, MULTICLASS]:
            if unique_count > 10:
                logger.log(
                    20,
                    f'\tFirst 10 (of {unique_count}) unique label values:  {list(unique_values[:10])}'
                )
            else:
                logger.log(
                    20,
                    f'\t{unique_count} unique label values:  {list(unique_values)}'
                )
        elif problem_type == REGRESSION:
            y_max = y.max()
            y_min = y.min()
            y_mean = y.mean()
            y_stddev = y.std()
            logger.log(
                20,
                f'\tLabel info (max, min, mean, stddev): ({y_max}, {y_min}, {round(y_mean, 5)}, {round(y_stddev, 5)})'
            )

        logger.log(
            25,
            f"\tIf '{problem_type}' is not the correct problem_type, please manually specify the problem_type argument in fit() (You may specify problem_type as one of: {[BINARY, MULTICLASS, REGRESSION]})"
        )
    return problem_type
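A short usage sketch, assuming the `BINARY`/`MULTICLASS`/`REGRESSION` constants and `logger` are defined as in the surrounding AutoGluon module, and that `numpy` and `pandas` are imported as `np` and `pd`:

import numpy as np
import pandas as pd

# Two unique label values -> inferred as binary classification.
y_binary = pd.Series([0, 1, 1, 0, 1])
print(infer_problem_type(y_binary, silent=True))      # e.g. 'binary'

# Many distinct float labels -> inferred as regression.
y_regression = pd.Series(np.random.rand(500))
print(infer_problem_type(y_regression, silent=True))  # e.g. 'regression'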
Example #56
0
    def test_frame_ctor_datetime64_column(self):
        rng = date_range('1/1/2000 00:00:00', '1/1/2000 1:59:50', freq='10s')
        dates = np.asarray(rng)

        df = DataFrame({'A': np.random.randn(len(rng)), 'B': dates})
        assert np.issubdtype(df['B'].dtype, np.dtype('M8[ns]'))
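The assertion above relies on `np.issubdtype` treating nanosecond datetimes as a `datetime64` subtype; a small illustration with hypothetical values:

import numpy as np

dates = np.array(['2000-01-01', '2000-01-02'], dtype='datetime64[ns]')
print(np.issubdtype(dates.dtype, np.dtype('M8[ns]')))   # True
print(np.issubdtype(dates.dtype, np.datetime64))        # True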
Example #57
0
def check_array(array, accept_sparse=False, accept_large_sparse=True,
                dtype="numeric", order=None, copy=False, force_all_finite=True,
                ensure_2d=True, allow_nd=False, ensure_min_samples=1,
                ensure_min_features=1, estimator=None) -> Tensor:

    """Input validation on a tensor, list, sparse matrix or similar.

    By default, the input is checked to be a non-empty 2D array containing
    only finite values. If the dtype of the tensor is object, attempt
    converting to float, raising on failure.

    Parameters
    ----------
    array : object
        Input object to check / convert.

    accept_sparse : string, boolean or list/tuple of strings (default=False)
        String[s] representing allowed sparse matrix formats, such as 'csc',
        'csr', etc. If the input is sparse but not in the allowed format,
        it will be converted to the first listed format. True allows the input
        to be any format. False means that a sparse matrix input will
        raise an error.

    accept_large_sparse : bool (default=True)
        If a CSR, CSC, COO or BSR sparse matrix is supplied and accepted by
        accept_sparse, accept_large_sparse=False will cause it to be accepted
        only if its indices are stored with a 32-bit dtype.

    dtype : string, type, list of types or None (default="numeric")
        Data type of result. If None, the dtype of the input is preserved.
        If "numeric", dtype is preserved unless array.dtype is object.
        If dtype is a list of types, conversion on the first type is only
        performed if the dtype of the input is not in the list.

    order : 'F', 'C' or None (default=None)
        Whether a tensor will be forced to be fortran or c-style.
        When order is None (default), then if copy=False, nothing is ensured
        about the memory layout of the output tensor; otherwise (copy=True)
        the memory layout of the returned tensor is kept as close as possible
        to the original tensor.

    copy : boolean (default=False)
        Whether a forced copy will be triggered. If copy=False, a copy might
        be triggered by a conversion.

    force_all_finite : boolean or 'allow-nan', (default=True)
        Whether to raise an error on np.inf and np.nan in tensor. The
        possibilities are:

        - True: Force all values of tensor to be finite.
        - False: accept both np.inf and np.nan in tensor.
        - 'allow-nan': accept only np.nan values in tensor. Values cannot
          be infinite.

        For object dtyped data, only np.nan is checked and not np.inf.

    ensure_2d : boolean (default=True)
        Whether to raise a value error if tensor is not 2D.

    allow_nd : boolean (default=False)
        Whether to allow tensor.ndim > 2.

    ensure_min_samples : int (default=1)
        Make sure that the tensor has a minimum number of samples in its first
        axis (rows for a 2D tensor). Setting to 0 disables this check.

    ensure_min_features : int (default=1)
        Make sure that the 2D tensor has some minimum number of features
        (columns). The default value of 1 rejects empty datasets.
        This check is only enforced when the input data has effectively 2
        dimensions or is originally 1D and ``ensure_2d`` is True. Setting to 0
        disables this check.

    estimator : str or estimator instance (default=None)
        If passed, include the name of the estimator in warning messages.

    Returns
    -------
    array_converted : object
        The converted and validated tensor.
    """

    # store whether originally we wanted numeric dtype
    dtype_numeric = isinstance(dtype, str) and dtype == "numeric"

    dtype_orig = getattr(array, "dtype", None)
    if not hasattr(dtype_orig, 'kind'):
        # not a data type (e.g. a column named dtype in a pandas DataFrame)
        dtype_orig = None

    if dtype_numeric:
        if dtype_orig is not None and dtype_orig.kind == "O":
            # if input is object, convert to float.
            dtype = np.float64
        else:
            dtype = None

    if isinstance(dtype, (list, tuple)):
        if dtype_orig is not None and dtype_orig in dtype:
            # no dtype conversion required
            dtype = None
        else:
            # dtype conversion required. Let's select the first element of the
            # list of accepted types.
            dtype = dtype[0]

    if force_all_finite not in (True, False, 'allow-nan'):
        raise ValueError('force_all_finite should be a bool or "allow-nan"'
                         f'. Got {force_all_finite!r} instead')

    if estimator is not None:
        if isinstance(estimator, str):
            estimator_name = estimator
        else:
            estimator_name = estimator.__class__.__name__
    else:
        estimator_name = "Estimator"
    context = f" by {estimator_name}" if estimator is not None else ""

    if (hasattr(array, 'issparse') and array.issparse()) or issparse(array):
        _ensure_no_complex_data(array)
        array = mt.asarray(array)
        array = _ensure_sparse_format(array, accept_sparse=accept_sparse,
                                      dtype=dtype, copy=copy,
                                      force_all_finite=force_all_finite,
                                      accept_large_sparse=accept_large_sparse)
    else:
        # If np.array(..) gives ComplexWarning, then we convert the warning
        # to an error. This is needed because specifying a non complex
        # dtype to the function converts complex to real dtype,
        # thereby passing the test made in the lines following the scope
        # of warnings context manager.
        with warnings.catch_warnings():
            try:
                warnings.simplefilter('error', ComplexWarning)
                array = mt.asarray(array, dtype=dtype, order=order)
            except ComplexWarning:
                raise ValueError(f"Complex data not supported\n{array}\n")

        # It is possible that the np.array(..) gave no warning. This happens
        # when no dtype conversion happened, for example dtype = None. The
        # result is that np.array(..) produces an array of complex dtype
        # and we need to catch and raise exception for such cases.
        _ensure_no_complex_data(array)

        if ensure_2d:
            # If input is scalar raise error
            if array.ndim == 0:
                raise ValueError(
                    f"Expected 2D array, got scalar array instead:\narray={array}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.")
            # If input is 1D raise error
            if array.ndim == 1:
                raise ValueError(
                    f"Expected 2D array, got 1D array instead:\narray={array}.\n"
                    "Reshape your data either using array.reshape(-1, 1) if "
                    "your data has a single feature or array.reshape(1, -1) "
                    "if it contains a single sample.")

        # in the future np.flexible dtypes will be handled like object dtypes
        if dtype_numeric and np.issubdtype(array.dtype, np.flexible):
            warnings.warn(
                "Beginning in version 0.22, arrays of bytes/strings will be "
                "converted to decimal numbers if dtype='numeric'. "
                "It is recommended that you convert the array to "
                "a float dtype before using it in scikit-learn, "
                "for example by using "
                "your_array = your_array.astype(np.float64).",
                FutureWarning)

        # make sure we actually converted to numeric:
        if dtype_numeric and array.dtype.kind == "O":
            array = array.astype(np.float64)
        if not allow_nd and array.ndim >= 3:
            raise ValueError("Found array with dim %d. %s expected <= 2."
                             % (array.ndim, estimator_name))
        if force_all_finite:
            array = _assert_all_finite(
                array, allow_nan=force_all_finite == 'allow-nan', check_only=False)

    if ensure_min_samples > 0:
        n_samples = _num_samples(array)
        if n_samples < ensure_min_samples:
            raise ValueError("Found array with %d sample(s) (shape=%s) while a"
                             " minimum of %d is required%s."
                             % (n_samples, array.shape, ensure_min_samples,
                                context))

    if ensure_min_features > 0 and array.ndim == 2:
        n_features = array.shape[1]
        if n_features < ensure_min_features:
            raise ValueError("Found array with %d feature(s) (shape=%s) while"
                             " a minimum of %d is required%s."
                             % (n_features, array.shape, ensure_min_features,
                                context))

    if copy:
        array = mt.array(array, dtype=dtype, order=order)

    return array
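A small usage sketch, assuming `mt` (e.g. mars.tensor) and the private helpers referenced above (`_ensure_no_complex_data`, `_assert_all_finite`, `_num_samples`, ...) are available as in the surrounding module:

# Validate a small 2-D list and coerce it to a numeric tensor.
X = [[1, 2], [3, 4], [5, 6]]
X_checked = check_array(X, dtype="numeric", ensure_2d=True)
print(X_checked.shape)   # (3, 2)

# A 1-D input raises a ValueError asking for an explicit reshape:
# check_array([1, 2, 3])  # ValueError: Expected 2D array, got 1D array instead ...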
Example #58
0
def rdpg_corr(X, Y, r, rescale=False, directed=False, loops=False):
    r"""
    Samples a random graph pair based on the latent positions in X (and
    optionally in Y)
    If only X :math:`\in\mathbb{R}^{n\times d}` is given, the P matrix is calculated as
    :math:`P = XX^T`. If X, Y :math:`\in\mathbb{R}^{n\times d}` is given, then
    :math:`P = XY^T`. These operations correspond to the dot products between a set of
    latent positions, so each row in X or Y represents the latent positions in
    :math:`\mathbb{R}^{d}` for a single vertex in the random graph
    Note that this function may also rescale or clip the resulting P
    matrix to get probabilities between 0 and 1, or remove loops.
    A binary random graph is then sampled from the P matrix described
    by X (and possibly Y).
    Read more in the :ref:`tutorials <simulations_tutorials>`

    Parameters
    ----------
    X: np.ndarray, shape (n_vertices, n_dimensions)
        latent position from which to generate a P matrix
        if Y is given, interpreted as the left latent position

    Y: np.ndarray, shape (n_vertices, n_dimensions) or None, optional
        right latent position from which to generate a P matrix

    r: float
        The value of the correlation between the same vertices in two graphs.

    rescale: boolean, optional (default=False)
        when rescale is True, will subtract the minimum value in
        P (if it is below 0) and divide by the maximum (if it is
        above 1) to ensure that P has entries between 0 and 1. If
        False, elements of P outside of [0, 1] will be clipped.

    directed: boolean, optional (default=False)
        If False, output adjacency matrix will be symmetric. Otherwise, output adjacency
        matrix will be asymmetric.

    loops: boolean, optional (default=False)
        If False, no edges will be sampled in the diagonal. Diagonal elements in P
        matrix are removed prior to rescaling (see above) which may affect behavior.
        Otherwise, edges are sampled in the diagonal.

    Returns
    -------
    G1: ndarray (n_vertices, n_vertices)
        Adjacency matrix of the first sampled graph, generated from the
        probability matrix P implied by the latent positions

    G2: ndarray (n_vertices, n_vertices)
        Adjacency matrix of the second sampled graph, correlated with G1
        according to r

    References
    ----------
    .. [1] Vince Lyzinski, Donniell E. Fishkind, Carey E. Priebe.
       "Seeded graph matching for correlated Erdös-Rényi graphs".
       The Journal of Machine Learning Research, January 2014

    Examples
    --------
    >>> np.random.seed(1234)
    >>> X = np.random.dirichlet([1, 1], size=5)
    >>> Y = None

    Generate random latent positions using 2-dimensional Dirichlet distribution.
    Then sample a correlated RDPG graph pair:

    >>> rdpg_corr(X, Y, 0.3, rescale=False, directed=False, loops=False)
    (array([[0., 1., 0., 1., 0.],
           [1., 0., 0., 1., 1.],
           [0., 0., 0., 0., 0.],
           [1., 1., 0., 0., 0.],
           [0., 1., 0., 0., 0.]]), array([[0., 1., 0., 1., 0.],
           [1., 0., 0., 0., 1.],
           [0., 0., 0., 0., 0.],
           [1., 0., 0., 0., 0.],
           [0., 1., 0., 0., 0.]]))
    """
    # check r
    if not np.issubdtype(type(r), np.floating):
        raise TypeError("r is not of type float.")
    elif r < -1 or r > 1:
        msg = "r must between -1 and 1."
        raise ValueError(msg)

    # check directed and loops
    if type(directed) is not bool:
        raise TypeError("directed is not of type bool.")
    if type(loops) is not bool:
        raise TypeError("loops is not of type bool.")

    # check dimensions of X and Y
    if Y is not None:
        if type(X) is not np.ndarray or type(Y) is not np.ndarray:
            raise TypeError("Latent positions must be numpy.ndarray")
        if X.ndim != 2 or Y.ndim != 2:
            raise ValueError(
                "Latent positions must have dimension 2 (n_vertices, n_dimensions)"
            )
        if X.shape != Y.shape:
            raise ValueError(
                "Dimensions of latent positions X and Y must be the same")
    if Y is None:
        Y = X

    P = p_from_latent(X, Y, rescale=rescale, loops=loops)
    n = P.shape[0]
    R = np.full((n, n), r)
    G1, G2 = sample_edges_corr(P, R, directed=directed, loops=loops)
    return G1, G2
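The pair returned above is produced by `sample_edges_corr`; as a hedged illustration (not necessarily the exact internals of that helper), one standard construction for correlated Bernoulli edges with marginal probabilities P and edgewise correlation r is sketched below. It assumes r and P are such that the conditional probabilities stay in [0, 1], which always holds for r >= 0:

import numpy as np

def sample_correlated_edges_sketch(P, r, rng=None):
    # Draw G1 ~ Bernoulli(P) elementwise, then draw G2 with conditional
    # probabilities P + r*(1 - P) where G1 == 1 and P*(1 - r) where G1 == 0.
    # Both graphs then have marginal edge probabilities P, and corresponding
    # edges have correlation r.
    rng = np.random.default_rng() if rng is None else rng
    G1 = rng.binomial(1, P)
    P2 = np.where(G1 == 1, P + r * (1 - P), P * (1 - r))
    G2 = rng.binomial(1, P2)
    return G1, G2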
Example #59
0
    def fit(self, df):
        """Main fit method for SAR.

        Args:
            df (pd.DataFrame): User item rating dataframe
        """

        # generate continuous indices if this hasn't been done
        if self.index2item is None:
            self.set_index(df)

        logger.info("Collecting user affinity matrix")
        if not np.issubdtype(df[self.col_rating].dtype, np.number):
            raise TypeError("Rating column data type must be numeric")

        # copy the DataFrame to avoid modification of the input
        select_columns = [self.col_user, self.col_item, self.col_rating]
        if self.time_decay_flag:
            select_columns += [self.col_timestamp]
        temp_df = df[select_columns].copy()

        if self.time_decay_flag:
            logger.info("Calculating time-decayed affinities")
            temp_df = self.compute_time_decay(df=temp_df,
                                              decay_column=self.col_rating)
        else:
            # without time decay use the latest user-item rating in the dataset as the affinity score
            logger.info("De-duplicating the user-item counts")
            temp_df = temp_df.drop_duplicates([self.col_user, self.col_item],
                                              keep="last")

        logger.info("Creating index columns")
        # add mapping of user and item ids to indices
        temp_df.loc[:, self.col_item_id] = temp_df[self.col_item].apply(
            lambda item: self.item2index.get(item, np.NaN))
        temp_df.loc[:, self.col_user_id] = temp_df[self.col_user].apply(
            lambda user: self.user2index.get(user, np.NaN))

        if self.normalize:
            logger.info("Calculating normalization factors")
            temp_df[self.col_unity_rating] = 1.0
            if self.time_decay_flag:
                temp_df = self.compute_time_decay(
                    df=temp_df, decay_column=self.col_unity_rating)
            self.unity_user_affinity = self.compute_affinity_matrix(
                df=temp_df, rating_col=self.col_unity_rating)

        # affinity matrix
        logger.info("Building user affinity sparse matrix")
        self.user_affinity = self.compute_affinity_matrix(
            df=temp_df, rating_col=self.col_rating)

        # calculate item co-occurrence
        logger.info("Calculating item co-occurrence")
        item_cooccurrence = self.compute_coocurrence_matrix(df=temp_df)

        # free up some space
        del temp_df

        self.item_frequencies = item_cooccurrence.diagonal()

        logger.info("Calculating item similarity")
        if self.similarity_type is COOCCUR:
            logger.info("Using co-occurrence based similarity")
            self.item_similarity = item_cooccurrence
        elif self.similarity_type is JACCARD:
            logger.info("Using jaccard based similarity")
            self.item_similarity = jaccard(item_cooccurrence).astype(
                df[self.col_rating].dtype)
        elif self.similarity_type is LIFT:
            logger.info("Using lift based similarity")
            self.item_similarity = lift(item_cooccurrence).astype(
                df[self.col_rating].dtype)
        else:
            raise ValueError("Unknown similarity type: {}".format(
                self.similarity_type))

        # free up some space
        del item_cooccurrence

        logger.info("Done training")
Example #60
0
    def get_attr_info(self, variable=None, flag=False):
        """
        Get ARM quality control definitions from the ARM standard
        bit_#_description, ... attributes and return as dictionary.
        Will attempt to guess if the flag is integer or bit packed
        based on what attributes are set.

        Parameters
        ----------
        variable : str
            Variable name to get attribute information. If set to None
            will get global attributes.
        flag : bool
            Optional flag indicating whether the QC is expected to be
            integer rather than bit-packed. flag=True indicates integer QC;
            the default (False) assumes bit-packed QC.

        Returns
        -------
        attributes dictionary : dict or None
            A dictionary containing the attribute information converted from
            ARM QC to CF QC. All keys include 'flag_meanings', 'flag_masks',
            'flag_values', 'flag_assessments', 'flag_tests', 'arm_attributes'.
            Returns None if none found.

        """
        string = 'bit'
        if flag:
            string = 'flag'
        else:
            found_string = False
            try:
                if self._obj.attrs['qc_bit_comment']:
                    string = 'bit'
                    found_string = True
            except KeyError:
                pass

            if found_string is False:
                try:
                    if self._obj.attrs['qc_flag_comment']:
                        string = 'flag'
                        found_string = True
                except KeyError:
                    pass

            if found_string is False:
                var = self.matched_qc_variables
                if len(var) > 0:
                    try:
                        if self._obj[variable].attrs[
                                'flag_method'] == 'integer':
                            string = 'flag'
                        found_string = True
                        del self._obj[variable].attrs['flag_method']
                    except KeyError:
                        pass

        try:
            if variable:
                attr_description_pattern = (r"(^" + string +
                                            r")_([0-9]+)_(description$)")
                attr_assessment_pattern = (r"(^" + string +
                                           r")_([0-9]+)_(assessment$)")
                attr_comment_pattern = (r"(^" + string +
                                        r")_([0-9]+)_(comment$)")
                attributes = self._obj[variable].attrs
            else:
                attr_description_pattern = (r"(^qc_" + string +
                                            r")_([0-9]+)_(description$)")
                attr_assessment_pattern = (r"(^qc_" + string +
                                           r")_([0-9]+)_(assessment$)")
                attr_comment_pattern = (r"(^qc_" + string +
                                        r")_([0-9]+)_(comment$)")
                attributes = self._obj.attrs
        except KeyError:
            return None

        assessment_bit_num = []
        description_bit_num = []
        comment_bit_num = []
        flag_masks = []
        flag_meanings = []
        flag_assessments = []
        flag_comments = []
        arm_attributes = []

        dtype = np.int32
        for att_name in attributes:
            try:
                description = re.match(attr_description_pattern, att_name)
                description_bit_num.append(int(description.groups()[1]))
                flag_meanings.append(attributes[att_name])
                arm_attributes.append(att_name)
            except AttributeError:
                pass

            try:
                assessment = re.match(attr_assessment_pattern, att_name)
                assessment_bit_num.append(int(assessment.groups()[1]))
                flag_assessments.append(attributes[att_name])
                arm_attributes.append(att_name)
            except AttributeError:
                pass

            try:
                comment = re.match(attr_comment_pattern, att_name)
                comment_bit_num.append(int(comment.groups()[1]))
                flag_comments.append(attributes[att_name])
                arm_attributes.append(att_name)
            except AttributeError:
                pass

        if variable is not None:
            # Try and get the data type from the variable if it is an integer
            # If not an integer make the flag values integers.
            try:
                dtype = self._obj[variable].values.dtype
                if np.issubdtype(dtype, np.integer):
                    pass
                else:
                    dtype = np.int32
            except AttributeError:
                pass

        # Sort on bit number to ensure correct description order
        index = np.argsort(description_bit_num)
        flag_meanings = np.array(flag_meanings)
        description_bit_num = np.array(description_bit_num)
        flag_meanings = flag_meanings[index]
        description_bit_num = description_bit_num[index]

        # Sort on bit number to ensure correct assessment order
        if len(flag_assessments) > 0:
            if len(flag_assessments) < len(flag_meanings):
                for ii in range(1, len(flag_meanings) + 1):
                    if ii not in assessment_bit_num:
                        assessment_bit_num.append(ii)
                        flag_assessments.append('')
            index = np.argsort(assessment_bit_num)
            flag_assessments = np.array(flag_assessments)
            flag_assessments = flag_assessments[index]

        # Sort on bit number to ensure correct comment order
        if len(flag_comments) > 0:
            if len(flag_comments) < len(flag_meanings):
                for ii in range(1, len(flag_meanings) + 1):
                    if ii not in comment_bit_num:
                        comment_bit_num.append(ii)
                        flag_comments.append('')
            index = np.argsort(comment_bit_num)
            flag_comments = np.array(flag_comments)
            flag_comments = flag_comments[index]

        # Convert bit number to mask number
        if len(description_bit_num) > 0:
            flag_masks = np.array(description_bit_num)
            flag_masks = np.left_shift(1, flag_masks - 1)

        # build dictionary to return values
        if len(flag_masks) > 0 or len(description_bit_num) > 0:
            return_dict = dict()
            return_dict['flag_meanings'] = list(
                np.array(flag_meanings, dtype=str))

            if len(flag_masks) > 0 and max(flag_masks) > np.iinfo(
                    np.uint32).max:
                flag_mask_dtype = np.uint64
            else:
                flag_mask_dtype = np.uint32

            if flag:
                return_dict['flag_values'] = list(
                    np.array(description_bit_num, dtype=dtype))
                return_dict['flag_masks'] = list(
                    np.array([], dtype=flag_mask_dtype))
            else:
                return_dict['flag_values'] = list(np.array([], dtype=dtype))
                return_dict['flag_masks'] = list(
                    np.array(flag_masks, dtype=flag_mask_dtype))

            return_dict['flag_assessments'] = list(
                np.array(flag_assessments, dtype=str))
            return_dict['flag_tests'] = list(
                np.array(description_bit_num, dtype=dtype))
            return_dict['flag_comments'] = list(
                np.array(flag_comments, dtype=str))
            return_dict['arm_attributes'] = arm_attributes

        else:
            # If nothing to return set to None
            return_dict = None

        return return_dict
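A small illustration of the bit-number to `flag_masks` conversion used above, assuming bits are numbered from 1 as in the ARM attributes:

import numpy as np

bit_numbers = np.array([1, 2, 3, 4])
flag_masks = np.left_shift(1, bit_numbers - 1)
print(list(flag_masks))   # [1, 2, 4, 8]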