Example #1
def __init__(self):
    self.windows = self.param['window']
    self.cols = self.param['col']
    self.types = self.param['type']
    self.translation_cols = self.param.get('translation')
    self.scale_cols = self.param.get('scale')
    self.move_window_mapping = {
        "mean": lambda c, s, t, w: bn.move_mean(c, w) * s + t,
        "std": lambda c, s, t, w: bn.move_std(c, w) * s,
        "var": lambda c, s, t, w: bn.move_var(c, w) * s * s,
        "min": lambda c, s, t, w: bn.move_min(c, w) * s + t,
        "max": lambda c, s, t, w: bn.move_max(c, w) * s + t,
        "rank": lambda c, s, t, w: bn.move_rank(c, w),
        "sum": lambda c, s, t, w: bn.move_sum(c, w) * s + t * w,
        "ema": lambda c, s, t, w: F.ema(
            c, 2.0 / (w + 1), start_indices=self.base.start_indices) * s + t,
        "rsi": lambda c, s, t, w: F.rsi(
            c, w, start_indices=self.base.start_indices),
        "psy": lambda c, s, t, w: F.psy(
            c, w, start_indices=self.base.start_indices),
        "bias": lambda c, s, t, w: F.bias(
            c, w, start_indices=self.base.start_indices),
    }
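For context, a minimal sketch (hypothetical column data, scale 1 and translation 0) of how one entry of this mapping is applied:

import numpy as np
import bottleneck as bn

col = np.random.rand(250)                    # hypothetical column data
scale, translation, window = 1.0, 0.0, 20

mean_fn = lambda c, s, t, w: bn.move_mean(c, w) * s + t
rolling_mean = mean_fn(col, scale, translation, window)  # first 19 values are NaN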
Example #2
def test_move_std_sqrt():
    "Test move_std for neg sqrt."

    a = [
        0.0011448196318903589, 0.00028718669878572767, 0.00028718669878572767,
        0.00028718669878572767, 0.00028718669878572767
    ]
    err_msg = "Square root of negative number. ndim = %d"
    b = bn.move_std(a, window=3)
    assert_true(np.isfinite(b[2:]).all(), err_msg % 1)

    a2 = np.array([a, a])
    b = bn.move_std(a2, window=3, axis=1)
    assert_true(np.isfinite(b[:, 2:]).all(), err_msg % 2)

    a3 = np.array([[a, a], [a, a]])
    b = bn.move_std(a3, window=3, axis=2)
    assert_true(np.isfinite(b[:, :, 2:]).all(), err_msg % 3)
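For reference, bn.move_std emits NaN for the first window-1 positions unless min_count is lowered, which is why the test above only inspects b[2:]:

import numpy as np
import bottleneck as bn

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
print(bn.move_std(x, window=3))               # [nan nan 0.81649658 0.81649658 0.81649658]
print(bn.move_std(x, window=3, min_count=1))  # shorter windows at the start instead of NaN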
Example #3
def test_move_std_sqrt():
    "Test move_std for neg sqrt."

    a = [0.0011448196318903589,
         0.00028718669878572767,
         0.00028718669878572767,
         0.00028718669878572767,
         0.00028718669878572767]
    err_msg = "Square root of negative number. ndim = %d"
    b = bn.move_std(a, window=3)
    assert_true(np.isfinite(b[2:]).all(), err_msg % 1)

    a2 = np.array([a, a])
    b = bn.move_std(a2, window=3, axis=1)
    assert_true(np.isfinite(b[:, 2:]).all(), err_msg % 2)

    a3 = np.array([[a, a], [a, a]])
    b = bn.move_std(a3, window=3, axis=2)
    assert_true(np.isfinite(b[:, :, 2:]).all(), err_msg % 3)
Example #4
def Stddev(A, n):
    '''
    n-day moving standard deviation
    n >= 2
    '''
    if n < 2:
        # print("computing stddev: n cannot be less than 2, returning the input")
        return A
    result = bk.move_std(A, n, min_count=2, axis=0, ddof=1)
    result = fillna(result)  # fill NaNs with each day's mean across all stocks; per broadcasting, transpose before filling
    result[np.isnan(A)] = np.nan
    return result
Example #5
def numpy_normxcorr(templates, stream, pads, *args, **kwargs):
    """
    Compute the normalized cross-correlation using numpy and bottleneck.

    :param templates: 2D Array of templates
    :type templates: np.ndarray
    :param stream: 1D array of continuous data
    :type stream: np.ndarray
    :param pads: List of ints of pad lengths in the same order as templates
    :type pads: list

    :return: np.ndarray of cross-correlations
    :return: np.ndarray channels used
    """
    import bottleneck
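    # Assumes next_fast_len (scipy.fft) and the private _centered helper
    # (scipy.signal) are imported at module level, as in Example #9 below.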

    # Generate a template mask
    used_chans = ~np.isnan(templates).any(axis=1)
    # Currently have to use float64 as bottleneck runs into issues with other
    # types: https://github.com/kwgoodman/bottleneck/issues/164
    stream = stream.astype(np.float64)
    templates = templates.astype(np.float64)
    template_length = templates.shape[1]
    stream_length = len(stream)
    assert stream_length > template_length, "Template must be shorter than " \
                                            "stream"
    fftshape = next_fast_len(template_length + stream_length - 1)
    # Set up normalizers
    stream_mean_array = bottleneck.move_mean(
        stream, template_length)[template_length - 1:]
    stream_std_array = bottleneck.move_std(
        stream, template_length)[template_length - 1:]
    # because stream_std_array is in the denominator of res, nan all 0s
    stream_std_array[stream_std_array == 0] = np.nan
    # Normalize and flip the templates
    norm = ((templates - templates.mean(axis=-1, keepdims=True)) /
            (templates.std(axis=-1, keepdims=True) * template_length))
    norm_sum = norm.sum(axis=-1, keepdims=True)
    stream_fft = np.fft.rfft(stream, fftshape)
    template_fft = np.fft.rfft(np.flip(norm, axis=-1), fftshape, axis=-1)
    res = np.fft.irfft(template_fft * stream_fft,
                       fftshape)[:, 0:template_length + stream_length - 1]
    res = ((_centered(res,
                      (templates.shape[0], stream_length - template_length +
                       1))) - norm_sum * stream_mean_array) / stream_std_array
    res[np.isnan(res)] = 0.0

    for i, pad in enumerate(pads):
        res[i] = np.append(res[i], np.zeros(pad))[pad:]
    return res.astype(np.float32), used_chans
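An illustrative call on synthetic data (assuming the scipy helpers noted above are importable in the module), just to show the shapes involved:

import numpy as np

rng = np.random.default_rng(0)
stream_data = rng.standard_normal(2000)
templates = np.vstack([stream_data[100:300], stream_data[500:700]])  # two templates cut from the data

ccs, used_chans = numpy_normxcorr(templates, stream_data, pads=[0, 0])
print(ccs.shape)    # (2, 1801): one correlation trace per template
print(used_chans)   # [ True  True]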
Example #6
def moving_std(x: np.ndarray, n: int) -> np.ndarray:
    if bottleneck_found:
        return bn.move_std(x, n)[n - 1:]

    sums = np.empty(x.size - n + 1)
    sqrs = np.empty(x.size - n + 1)

    tab = np.cumsum(x) / n
    sums[0] = tab[n - 1]
    sums[1:] = tab[n:] - tab[:-n]

    tab = np.cumsum(x * x) / n
    sqrs[0] = tab[n - 1]
    sqrs[1:] = tab[n:] - tab[:-n]

    return np.sqrt(sqrs - sums * sums)
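As a sanity check (illustrative, not from the source), both paths compute the population (ddof=0) moving standard deviation over complete windows:

import numpy as np
import bottleneck as bn

x = np.random.rand(1000)
n = 20
bn_result = bn.move_std(x, n)[n - 1:]                      # bottleneck path
naive = np.sqrt(np.convolve(x * x, np.ones(n) / n, 'valid')
                - np.convolve(x, np.ones(n) / n, 'valid') ** 2)
print(np.allclose(bn_result, naive))                       # True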
Example #7
def get_mean_hierarchy_assignment(assignments, params_full):
    steps = assignments.shape[0]
    assign = _get_MPEAR(assignments)
    clusters = np.unique(assign)

    params = np.zeros((clusters.size, params_full.shape[2]))
    for i, cluster in enumerate(clusters):
        cells_cl_idx = assign == cluster
        cells = np.nonzero(cells_cl_idx)[0]
        other = np.nonzero(~cells_cl_idx)[0]
        # Paper - section 2.3: first criteria
        if cells.size == 1:
            same_cluster = np.ones(steps).astype(bool)
        else:
            same_cluster = 0 == bn.nansum(bn.move_std(assignments[:, cells],
                                                      2,
                                                      axis=1),
                                          axis=1)
        # Paper - section 2.3: second criteria
        cl_ids = assignments[:, cells[0]]
        other_cl_id = assignments[:, other]
        no_others = [cl_ids[j] not in other_cl_id[j] for j in range(steps)]

        # At least criterion 1 fulfilled
        if any(same_cluster):
            # Both criteria fulfilled in at least 1 posterior sample
            if any(same_cluster & no_others):
                step_idx = np.argwhere(same_cluster & no_others).flatten()
            else:
                step_idx = np.argwhere(same_cluster).flatten()

            for step in step_idx:
                cl_id = np.argwhere(np.unique(assignments[step]) == cl_ids[step]) \
                    .flatten()[0]
                params[i] += params_full[step][cl_id]
            params[i] /= step_idx.size
        # If not, take parameters from all posterior samples
        else:
            for step, step_assign in enumerate(assignments):
                cl_id_all = np.unique(step_assign)
                cl_id, cnt = np.unique(step_assign[cells], return_counts=True)
                cl_id_new = np.argwhere(np.in1d(cl_id_all, cl_id)).flatten()
                params[i] += np.dot(cnt, params_full[step][cl_id_new])
            params[i] /= steps * cells.size

    params_df = pd.DataFrame(params).T[assign]
    return assign, params_df
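For intuition (an illustrative sketch, not part of the source): the bn.move_std(..., 2, axis=1) expression above is zero everywhere only when all selected cells carry the same label in a posterior sample, so summing it flags the samples that satisfy the first criterion.

import numpy as np
import bottleneck as bn

assignments = np.array([[0, 0, 0],    # sample 0: all cells in the same cluster
                        [0, 1, 0]])   # sample 1: one cell differs
same_cluster = 0 == bn.nansum(bn.move_std(assignments, 2, axis=1), axis=1)
print(same_cluster)                   # [ True False]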
Example #8
    def _set_data(self):
        try:
            endcol1 = self.dl.dates_to_indices(self.identified_date)
        except KeyError:
            argwhere = np.argwhere(self.dl.dates > self.identified_date)
            if not len(argwhere):
                raise KeyError('It seems that {} is neither a valid date, nor a date when data is available'.format(
                    self.identified_date))
            else:
                endcol1 = argwhere[0][0]
                self.identified_date = self.dl.dates[endcol1]
        # endcol = min(endcol1 + self.MAX_OBS_DAYS + self.MAX_HLD_DAYS + 1, len(self.dl))
        endcol = min(endcol1 + int((self.MAX_OBS_DAYS + self.MAX_HLD_DAYS) * 1.1), len(self.dl))
        startcol = endcol1 - 252
        assert startcol >= 0, '%s is too early to have enough data required for computation' % self.identified_date
        # according to the R implementation this should subtract 251, but it is changed to 252 here
        # so the identified day can also be the trading trigger/activation day
        self._identified_date_id = endcol1
        pair_prices = self.dl['PRCCD', self.pair][:, startcol:endcol]
        pair_wealth = self.dl['CUM_WEALTH', self.pair][:, startcol:endcol]
        pair_prices = pair_prices[:, :1] * pair_wealth / pair_wealth[:, :1]

        has_na = np.isnan(pair_prices[:, 252:]).any(axis=0)
        self._data_dict['has_na'] = has_na  # start from identified
        self._data_dict['cum_na'] = has_na.cumsum()  # start from identified
        # Note: actually no missing values were found during my experiment.
        # this block might be redundant

        pair_prices = foward_fillna_2darray(pair_prices)
        ratio = np.log(pair_prices[0] / pair_prices[1])
        self._data_dict['ratio_history'] = ratio
        mean_mv = bn.move_mean(ratio, window=252, min_count=200)[251:]
        sd_mv = bn.move_std(ratio, window=252, min_count=200, ddof=1)[251:]
        # min_count is used to address the extreme case where the first 50 days are all missing data.
        # this is plausible under the parameter settings of the correlation computation
        ub_mv = mean_mv + 2. * sd_mv  # start from identified - 1
        lb_mv = mean_mv - 2. * sd_mv  # start from identified - 1
        ratio = ratio[251:]  # start from identified - 1

        self._data_dict['ratio'] = ratio[1:]  # start from identified
        self._data_dict['above_upper'] = np.ediff1d(np.where(ratio >= ub_mv, 1, 0))  # start from identified
        self._data_dict['above_mean'] = np.ediff1d(np.where(ratio >= mean_mv, 1, 0))
        self._data_dict['below_mean'] = np.ediff1d(np.where(ratio <= mean_mv, 1, 0))
        self._data_dict['below_lower'] = np.ediff1d(np.where(ratio <= lb_mv, 1, 0))
        self._data_dict['in_flag'] = bn.nansum(self.dl['IN_US_1', self.pair][:, endcol1:endcol], axis=0) == 2
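The rolling 252-day bands above reduce to the following pattern (a minimal sketch on synthetic data; parameters as in the source):

import numpy as np
import bottleneck as bn

ratio = np.cumsum(np.random.randn(600)) * 0.01                   # synthetic log price ratio
mean_mv = bn.move_mean(ratio, window=252, min_count=200)[251:]
sd_mv = bn.move_std(ratio, window=252, min_count=200, ddof=1)[251:]
ub_mv = mean_mv + 2.0 * sd_mv                                    # upper band
lb_mv = mean_mv - 2.0 * sd_mv                                    # lower band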
Example #9
def scipy_normxcorr(templates, stream, pads):
    """
    Compute the normalized cross-correlation of multiple templates with data.

    :param templates: 2D Array of templates
    :type templates: np.ndarray
    :param stream: 1D array of continuous data
    :type stream: np.ndarray
    :param pads: List of ints of pad lengths in the same order as templates
    :type pads: list

    :return: np.ndarray of cross-correlations
    :return: np.ndarray channels used
    """
    import bottleneck
    from scipy.signal.signaltools import _centered

    # Generate a template mask
    used_chans = ~np.isnan(templates).any(axis=1)
    # Currently have to use float64 as bottleneck runs into issues with other
    # types: https://github.com/kwgoodman/bottleneck/issues/164
    stream = stream.astype(np.float64)
    templates = templates.astype(np.float64)
    template_length = templates.shape[1]
    stream_length = len(stream)
    fftshape = next_fast_len(template_length + stream_length - 1)
    # Set up normalizers
    stream_mean_array = bottleneck.move_mean(
        stream, template_length)[template_length - 1:]
    stream_std_array = bottleneck.move_std(
        stream, template_length)[template_length - 1:]
    # Normalize and flip the templates
    norm = ((templates - templates.mean(axis=-1, keepdims=True)) /
            (templates.std(axis=-1, keepdims=True) * template_length))
    norm_sum = norm.sum(axis=-1, keepdims=True)
    stream_fft = np.fft.rfft(stream, fftshape)
    template_fft = np.fft.rfft(np.flip(norm, axis=-1), fftshape, axis=-1)
    res = np.fft.irfft(template_fft * stream_fft,
                       fftshape)[:, 0:template_length + stream_length - 1]
    res = ((_centered(res, stream_length - template_length + 1)) -
           norm_sum * stream_mean_array) / stream_std_array
    res[np.isnan(res)] = 0.0
    for i in range(len(pads)):
        res[i] = np.append(res[i], np.zeros(pads[i]))[pads[i]:]
    return res.astype(np.float32), used_chans
Example #10
def genweight(datname, dpath, wpath):
    """
    Combine time series with statistical weights calculated from scatter

    Arguments:
    - `datname`: Identifier of data file
    - `dpath`  : Path to data file (time series).
    - `wpath`  : Path to scatter file (with same time points!)
    """

    # Pretty print
    print('Generating weights for {0} !'.format(dpath))

    # Load data and weights
    t, d = np.loadtxt(dpath, unpack=True)
    tt, sig = np.loadtxt(wpath, unpack=True)

    # Check that times are indeed the same
    tdif = t - tt
    if np.any(tdif != 0):
        print('Error! Not the same time points! Quitting!')
        exit()

    # Moving variance (Hans: M = 50 - 100)
    M = 70
    movstd = bn.move_std(sig, M, min_count=1)
    movvar = np.square(movstd)

    # Remove first point
    x = 1
    t = t[x:]
    d = d[x:]
    movvar = movvar[x:]

    # Calculate weights from scatter (1 / variance)
    w = np.divide(1.0, movvar)

    # Save
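    # NOTE: 'star' below is assumed to be defined at module level; the
    # datname argument is otherwise unused in this snippet.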
    outfile = star + '_with-weights.txt'
    np.savetxt(outfile, np.transpose([t, d, w]), fmt='%.15e', delimiter='\t')

    # Done!
    print('Done!\n')
Example #11
    def __init__(self, signal, time, dtS=0.0002):
        """

        Parameters
        ----------
        signal : ndarray
            signal to be analyzed
        time : ndarray
            time basis
        dtS : float
            At init we also compute a normalized
            signal, where the normalization is of the form
            (x-<x>)/std(x) and the mean and standard deviation
            are a rolling mean and standard deviation over
            a window of dtS/dt points
        Dependencies
        ------------
            numpy
            scipy
            pycwt https://github.com/regeirk/pycwt.git
            astropy for better histogram function
            bottleneck (https://pypi.python.org/pypi/Bottleneck)
            for moving average
        """

        self.sig = copy.deepcopy(signal)
        self.time = copy.deepcopy(time)
        self.dt = (self.time.max() - self.time.min()) / (self.time.size - 1)
        self.nsamp = self.time.size
        self.signorm = (self.sig - self.sig.mean()) / self.sig.std()
        # since the moments of the signal are
        # fundamental quantities, we compute them
        # at initialization
        self.moments()
        _nPoint = int(dtS / self.dt)
        self.rmsnorm = (
            self.sig - bottleneck.move_mean(self.sig, _nPoint, min_count=1)
        ) / bottleneck.move_std(self.sig, _nPoint, min_count=1)
Example #12
    def _set_data_for_visualization(self, days_before_identification=21, days_after_close=10):
        startcol = max(self._identified_date_id - 252 - days_before_identification + 1, 0)
        endcol = min(self._identified_date_id + self.MAX_OBS_DAYS + self.MAX_HLD_DAYS + days_after_close + 1,
                     len(self.dl))
        pair_prices = self.dl['PRCCD', self.pair][:, startcol:endcol]
        pair_wealth = self.dl['CUM_WEALTH', self.pair][:, startcol:endcol]
        pair_prices = pair_prices[:, :1] * pair_wealth / pair_wealth[:, :1]
        pair_prices = foward_fillna_2darray(pair_prices)

        ratio = np.log(pair_prices[0] / pair_prices[1])
        mean_mv = bn.move_mean(ratio, window=252, min_count=200)[251:]
        sd_mv = bn.move_std(ratio, window=252, min_count=200, ddof=1)[251:]
        ub_mv = mean_mv + 2. * sd_mv  # start from identified - days_before_identification
        lb_mv = mean_mv - 2. * sd_mv  # start from identified - days_before_identification
        ratio = ratio[251:]  # start from identified - days_before_identification

        idtf_idx = self._identified_date_id - startcol - 251
        open_idx = self.dl.dates_to_indices(self.open_date) - self._identified_date_id + idtf_idx
        close_idx = self.dl.dates_to_indices(self.close_date) - self._identified_date_id + idtf_idx
        end_idx = close_idx + days_after_close

        return {'ratio': ratio[:end_idx + 1], 'upper': ub_mv[:end_idx + 1], 'lower': lb_mv[:end_idx + 1],
                'mean': mean_mv[:end_idx + 1],
                'open_idx': open_idx, 'close_idx': close_idx, 'idtf_idx': idtf_idx}
Example #13
p.xlabel(f1short)
p.ylabel(f2short)
p.axis('equal')
#p.show()
#sys.exit()
###########################

##### WINDOW DISTRIBUTION ##########
windows = decSort2[decSort1.argsort()] - np.arange(decSort2.size)
p.subplot(2, 3, 5)
p.plot(windows, 'ro', alpha=0.5)
p.title('Rank Differences with standard deviations')
p.xlabel('Gene expression rank')
p.ylabel(f1short + '-' + f2short)

y = bn.move_std(windows, 10)
p.plot(y, 'b')
p.plot(-y, 'b')
###########################

##### VENN BUBBLES ##########
thresh1 = data1['FPKM'] > 0
thresh2 = data2['FPKM'] > 0

set1 = set(data1[thresh1]['tracking_id'])
set2 = set(data2[thresh2]['tracking_id'])

aandb = len(set1.intersection(set2))
a = len(set1.difference(set2))
b = len(set2.difference(set1))
Example #14
def height_plot_across_folders(folder_list, inputsuffix='allz2.dat', 
                               label='Mean Light Weighted Age [Gyr]', 
                               col=6, errcol=None, lowhigh=False, 
                               order=5, ylims=None, bigpoints=False,
                               binz=True, combine_all=False, plot_std=False,
                               exclude=[[],[],[],[],[],[]]):

    axlist = []
    
    plist = [6,3,4,2,1,5]
    #color_list = ['blue','turquoise','chartreuse','yellow','tomato','red']
    color_list = ['blue','seagreen','darkorange','crimson','dimgray','mediumorchid','lightblue']
    style_list = ['-','-','-','-','-','-','-']

    if not isinstance(col,list):
        col = [col] * len(folder_list)

    for i in range(6):                
        pointing = plist[i]

        ax = plt.figure().add_subplot(111)
        ax.set_xlabel('|Height [kpc]|')
        ax.set_ylabel(label)
        ax.set_title('{}\nP{}'.format(time.asctime(),pointing))

        for f, folder in enumerate(folder_list):
            color = color_list[f]
            style = style_list[f]
            
            dat = glob('{}/*P{}*{}'.format(folder, pointing, inputsuffix))[0]
            print(dat)
            loc = glob('{}/*P{}*locations.dat'.format(folder, pointing))[0]
            print(loc)
            print('Excluding: ', exclude[pointing-1])
    
            if errcol is None:
                td = np.loadtxt(dat, usecols=(col[f],), unpack=True)
            else:
                if lowhigh:
                    td, low, high = np.loadtxt(dat, usecols=(col[f],errcol,errcol+1), unpack=True)
                    te = np.vstack((low,high))
                else:
                    td, te = np.loadtxt(dat, usecols=(col[f],errcol), unpack=True)                
            r, tz = np.loadtxt(loc, usecols=(4,5), unpack=True)

            exarr = np.array(exclude[pointing-1])-1 #because aps are 1-indexed
            td = np.delete(td,exarr)
            r = np.delete(r,exarr)
            tz = np.delete(tz,exarr)
            if errcol is not None:
                if lowhigh:
                    te = np.delete(te,exarr,axis=1)
                else:
                    te = np.delete(te,exarr)

            alpha=1.0
            if combine_all and f == 0:
                bigD = np.zeros(td.size)
                alpha=0.3
            
            if binz:
                z = np.array([])
                d = np.array([])
                e = np.array([])
                while tz.size > 0:
                    zi = tz[0]
                    idx = np.where(np.abs(tz - zi) < 0.05)
                    d = np.r_[d,np.mean(td[idx])]
                    e = np.r_[e,np.std(td[idx])]
                    z = np.r_[z,np.abs(zi)]
                    tz = np.delete(tz, idx)
                    td = np.delete(td, idx)
            else:
                z = tz
                d = td
                if errcol is None:
                    e = np.zeros(tz.size)
                else:
                    e = te

            if combine_all:
                bigD = np.vstack((bigD,d))
                bigz = z

            gidx = d == d
            d = d[gidx]
            z = z[gidx]
            if lowhigh:
                e = e[:,gidx]
            else:
                e = e[gidx]

            sidx = np.argsort(z)
            dp = np.r_[d[sidx][order::-1],d[sidx]]
            zp = np.r_[z[sidx][order::-1],z[sidx]]
            mean = bn.move_mean(dp,order)[order+1:]
            std = bn.move_std(dp,order)[order+1:]
            spl = spi.UnivariateSpline(z[sidx],d[sidx])
            mean = spl(z[sidx])
            # mean = np.convolve(d[sidx],np.ones(order)/order,'same')
            # std = np.sqrt(np.convolve((d - mean)**2,np.ones(order)/order,'same'))
        
            # ax.plot(z[sidx],mean,color=color, ls=style, label=folder, alpha=alpha)
            # ax.fill_between(z[sidx],mean-std,mean+std, alpha=0.1, color=color)

            # print d.shape, np.sum(e,axis=0).shape
            # d = d/np.sum(e,axis=0)
            # e = np.diff(e,axis=0)[0]
            # print e.shape

            ax.errorbar(z, d, yerr=e, fmt='.', color=color,alpha=alpha,capsize=0, label=folder)

        ax.set_xlim(-0.1,2.6)
        
        if ylims is not None:
            ax.set_ylim(*ylims)
        ax.legend(loc=0,numpoints=1)

        if combine_all:
            sidx = np.argsort(bigz)
            bigD = bigD[1:]
            bigMean = bn.nanmean(bigD,axis=0)
            bigStd = bn.nanstd(bigD,axis=0)
            bigspl = spi.UnivariateSpline(bigz[sidx],bigMean[sidx])
            bigFit = bigspl(bigz[sidx])
            
            ax.plot(bigz[sidx], bigFit, 'k-', lw=2)
            ax.errorbar(bigz, bigMean, yerr=bigStd, fmt='.', color='k',capsize=0)

        axlist.append(ax)
    
        if combine_all and plot_std:
            ax2 = plt.figure().add_subplot(111)
            ax2.set_xlabel('|Height [kpc]|')
            ax2.set_ylabel(r'$\delta$'+label)
            ax2.set_title(ax.get_title())
            ax2.plot(bigz, bigStd, 'k')
            axlist.append(ax2)

    return axlist
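One detail worth noting (an illustrative sketch, not from the source): the np.r_ padding used above prepends a reversed copy of the first `order` points so that, after slicing off order+1 values, the bn.move_mean / bn.move_std output lines up with the data and has no leading NaNs.

import numpy as np
import bottleneck as bn

order = 5
d = np.random.rand(30)
dp = np.r_[d[order::-1], d]             # prepend a reversed copy of the head
std = bn.move_std(dp, order)[order + 1:]
print(std.shape, np.isnan(std).any())   # (30,) False -> aligned with d, no NaNs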
Example #15
def plot_heights_with_err(inputsuffix,label=r'$\tau_{\mathrm{V,Balm}}$',basedir='.',
                          col=1, errcol=2, lowhigh=False, order=5, bigorder=60, 
                          s=None, ylims=None, labelr=False, bigpoints=False,
                          plotfit=True, exclude=exclude, printdate=True, printfit=True):

    zz = np.array([])
    dd = np.array([])
    if lowhigh:
        ee = np.array([[],[]])
    else:
        ee = np.array([])
    axlist = []

    bigax = plt.figure().add_subplot(111)
    bigax.set_xlabel(r'$|z| \mathrm{\ [kpc]}$')
    bigax.set_ylabel(label)
    
    plist = [6,3,4,2,1,5]
    color_list = ['blue','seagreen','sienna','orange','yellowgreen','darkturquoise']
    style_list = ['-','-','-','--','--','--']

    for i in range(6):
        pointing = plist[i]
        color = color_list[i]
        style = style_list[i]

        dat = glob('{}/*P{}*{}'.format(basedir, pointing, inputsuffix))[0]
        print(dat)
        loc = glob('{}/*P{}*locations.dat'.format(basedir, pointing))[0]
        print(loc)
        print('Excluding: ', exclude[pointing-1])
    
        if errcol is not None:
            if lowhigh:
                data, Lerr, Herr = np.loadtxt(dat, usecols=(col,errcol,errcol+1), unpack=True)
                err = np.vstack((Lerr,Herr))
            else:
                data, err = np.loadtxt(dat, usecols=(col,errcol), unpack=True)
        else:
            data = np.loadtxt(dat, usecols=(col,), unpack=True)
            err = np.ones(data.size)*0.01

        r, z = np.loadtxt(loc, usecols=(4,5), unpack=True)
        avgr = np.mean(r)

        ax = plt.figure().add_subplot(111)
        ax.set_xlabel('|Height [kpc]|')
        ax.set_ylabel(label)
        if labelr:
            ax.set_title('{:4.0f} kpc'.format(avgr))
            linelabel = '{:4.0f} kpc'.format(avgr)
        else:
            ax.set_title('{}\nP{}'.format(time.asctime(),pointing))
            linelabel = 'P{}'.format(pointing)

        exarr = np.array(exclude[pointing-1])-1 #because aps are 1-indexed
        data = np.delete(data,exarr)
        r = np.delete(r,exarr)
        z = np.delete(z,exarr)

        gidx = data == data
        data = data[gidx]
        z = z[gidx]
        if lowhigh:
            err = np.delete(err,exarr,axis=1)
            err = err[:,gidx]
            ee = np.hstack((ee,err))
        else:
            err = np.delete(err,exarr)
            err = err[gidx]
            ee = np.r_[ee,err]
        
        zz = np.r_[zz,z]
        dd = np.r_[dd,data]
        sidx = np.argsort(z)
        data_pad = np.r_[data[sidx][order::-1],data[sidx]]
        z_pad = np.r_[z[sidx][order::-1],z[sidx]]
        # mean = bn.move_mean(data_pad,order)[order+1:]
        std = bn.move_std(data_pad,order)[order+1:]
        spl = spi.UnivariateSpline(z[sidx],data[sidx])
        mean = spl(z[sidx])
        # mean = np.convolve(d[sidx],np.ones(order)/order,'same')
        # std = np.sqrt(np.convolve((d - mean)**2,np.ones(order)/order,'same'))

        bigax.errorbar(z, data, yerr=err, fmt='.', label=linelabel, color=color, capsize=0)
        
        # ax.plot(z[sidx],mean,color=color, ls=style)
        # ax.fill_between(z[sidx],mean-std,mean+std, alpha=0.1, color=color)

        ax.errorbar(z, data, yerr=err, fmt='.', color=color, capsize=0)
        ax.set_xlim(-0.1,2.6)
        
        if ylims is not None:
            ax.set_ylim(*ylims)
        
        axlist.append(ax)
        
    if printdate:
        plot_title = time.asctime()
    else:
        plot_title = ''
    if plotfit:
        sidx = np.argsort(zz)
        big_data_pad = np.r_[dd[sidx][bigorder::-1],dd[sidx]]
        big_z_pad = np.r_[zz[sidx][bigorder::-1],zz[sidx]]
        big_e_pad = np.r_[ee[sidx][bigorder::-1],ee[sidx]]
        big_sum = bn.move_sum(big_data_pad/big_e_pad,bigorder)[bigorder+1:]
        big_weight = bn.move_sum(1./big_e_pad,bigorder)[bigorder+1:]
        big_mean = big_sum/big_weight

        # std = bn.move_std(data_pad,order)[order+1:]
        # big_spl = spi.UnivariateSpline(zz[sidx],dd[sidx],w = 1./ee[sidx]**2, k=k, s=s)
        # big_mean = big_spl(zz[sidx])
        # big_pc = np.polyfit(zz[sidx], dd[sidx], polydeg, w=1./ee[sidx]**2)
        # big_poly = np.poly1d(big_pc)
        # big_mean = big_poly(zz[sidx])
        
        p = np.poly1d(np.polyfit(zz[sidx],big_mean,1))
        print(p.coeffs)
        
        # bigax.plot(zz[sidx],big_mean,'-k',lw=2)
        bigax.plot(zz[sidx],p(zz[sidx]),'--k',lw=2)
        if printdate:
            plot_title += '\n'
        if printfit:
            plot_title += label+'$={:4.2f}z{:+4.2f}$'.format(p.coeffs[0],p.coeffs[1])

    bigax.set_title(plot_title)
    bigax.legend(loc=0, numpoints=1, scatterpoints=1)
    bigax.set_xlim(-0.1,2.6)

    print(zz.size)

    if ylims is not None:
        bigax.set_ylim(*ylims)

    axlist = [bigax] + axlist
    
    return axlist
Example #16
def simple_plot(inputsuffix='allz2.dat', label='Mean Light Weighted Age [Gyr]', 
                col=62, order=5, ylims=None, labelr=False, bigpoints=False,
                exclude=[[],[],[],[],[],[]]):

    zz = np.array([])
    dd = np.array([])

    axlist = []

    bigax = plt.figure().add_subplot(111)
    bigax.set_xlabel('|Height [kpc]|')
    bigax.set_ylabel(label)
    
    plist = [6,3,4,2,1,5]
    #color_list = ['blue','turquoise','chartreuse','yellow','tomato','red']
    color_list = ['blue','seagreen','sienna','sienna','seagreen','blue']
    style_list = ['-','-','-','--','--','--']

    for i in range(6):
        pointing = plist[i]
        color = color_list[i]
        style = style_list[i]

        dat = glob('*P{}*{}'.format(pointing, inputsuffix))[0]
        print(dat)
        loc = glob('*P{}*locations.dat'.format(pointing))[0]
        print(loc)
        print('Excluding: ', exclude[pointing-1])
    
        td = np.loadtxt(dat, usecols=(col,), unpack=True)
        r, tz = np.loadtxt(loc, usecols=(4,5), unpack=True)
        avgr = np.mean(r)

        ax = plt.figure().add_subplot(111)
        ax.set_xlabel('|Height [kpc]|')
        ax.set_ylabel(label)
        if labelr:
            ax.set_title('{:4.0f} kpc'.format(avgr))
            linelabel = '{:4.0f} kpc'.format(avgr)
        else:
            ax.set_title('{}\nP{}'.format(time.asctime(),pointing))
            linelabel = 'P{}'.format(pointing)

        exarr = np.array(exclude[pointing-1])-1 #because aps are 1-indexed
        td = np.delete(td,exarr)
        t = np.delete(r,exarr)
        tz = np.delete(tz,exarr)

        z = np.array([])
        d = np.array([])
        e = np.array([])
        while tz.size > 0:
            zi = tz[0]
            idx = np.where(np.abs(tz - zi) < 0.05)
            d = np.r_[d,np.mean(td[idx])]
            e = np.r_[e,np.std(td[idx])]
            z = np.r_[z,np.abs(zi)]
            tz = np.delete(tz, idx)
            td = np.delete(td, idx)

        gidx = d == d
        d = d[gidx]
        z = z[gidx]
        e = e[gidx]

        sidx = np.argsort(z)
        dp = np.r_[d[sidx][order::-1],d[sidx]]
        zp = np.r_[z[sidx][order::-1],z[sidx]]
        mean = bn.move_mean(dp,order)[order+1:]
        std = bn.move_std(dp,order)[order+1:]
        spl = spi.UnivariateSpline(z[sidx],d[sidx])
        mean = spl(z[sidx])
        # mean = np.convolve(d[sidx],np.ones(order)/order,'same')
        # std = np.sqrt(np.convolve((d - mean)**2,np.ones(order)/order,'same'))

        bigax.plot(z[sidx],mean, label=linelabel, color=color, ls=style)
        bigax.fill_between(z[sidx],mean-std,mean+std, alpha=0.1, color=color)
        if bigpoints:
            bigax.errorbar(z, d, yerr=e, fmt='.', color=color, alpha=0.6, capsize=0)
        
        ax.plot(z[sidx],mean,color=color, ls=style)
        ax.fill_between(z[sidx],mean-std,mean+std, alpha=0.1, color=color)

        ax.errorbar(z, d, yerr=e, fmt='.', color=color)
        ax.set_xlim(-0.1,2.6)
        
        if ylims is not None:
            ax.set_ylim(*ylims)
        
        axlist.append(ax)

    bigax.legend(loc=0, numpoints=1, scatterpoints=1)

    bigax.set_title(time.asctime())
    bigax.set_xlim(-0.1,2.6)

    if ylims is not None:
        bigax.set_ylim(*ylims)

    axlist = [bigax] + axlist

    return axlist
Example #17
        data, wav_params = wavLoad(input_file)
        fs = wav_params[2]
    except IOError as e:
        print("Could not read file: %s" % e)
        sys.exit(-1)


    # cast to mono
    if len(data.shape) == 2:
        data = data.sum(axis=0)  # should this be .mean()?

    window_frames = int(fs * window_seconds)
    silence_frames = int(fs * silence_seconds)

    print "Analyzing"
    move_std = move_std(data, window=window_frames/2)
    mean_std = nanmean(move_std)

    print "move_std shape: ", move_std.shape
    print "len(data): ", len(data)

    widgets = ["Creating file", Bar(), ETA()]
    pbar = ProgressBar(widgets=widgets, maxval=len(data)).start()

    new_data = []
    silence_count = 0
    for i, d in pbar(enumerate(data)):
        new_data.append(d)
        if not np.isnan(move_std[i]) and move_std[i] < mean_std:
            silence_count += 1
        else:
Example #18
def m66(current_skyline_app, parent_pid, timeseries, algorithm_parameters):
    """
    A time series' data points are anomalous if the 6th median is 6 standard
    deviations (six-sigma) from the time series' 6th median standard deviation
    and this persists for x_windows, where ``x_windows = int(window / 2)``.
    This algorithm finds SIGNIFICANT changepoints in a time series, similar to
    PELT and Bayesian Online Changepoint Detection; however, it is more robust
    to instantaneous outliers and more conditionally selective of changepoints.

    :param current_skyline_app: the Skyline app executing the algorithm.  This
        will be passed to the algorithm by Skyline.  This is **required** for
        error handling and logging.  You do not have to worry about handling the
        argument in the scope of the custom algorithm itself,  but the algorithm
        must accept it as the first argument.
    :param parent_pid: the parent pid which is executing the algorithm, this is
        **required** for error handling and logging.  You do not have to worry
        about handling this argument in the scope of algorithm, but the
        algorithm must accept it as the second argument.
    :param timeseries: the time series as a list e.g. ``[[1578916800.0, 29.0],
        [1578920400.0, 55.0], ... [1580353200.0, 55.0]]``
    :param algorithm_parameters: a dictionary of any required parameters for the
        custom_algorithm and algorithm itself for example:
        ``algorithm_parameters={
            'nth_median': 6,
            'sigma': 6,
            'window': 5,
            'return_anomalies': True,
        }``
    :type current_skyline_app: str
    :type parent_pid: int
    :type timeseries: list
    :type algorithm_parameters: dict
    :return: True, False or None
    :rtype: boolean

    Example CUSTOM_ALGORITHMS configuration:

    'm66': {
        'namespaces': [
            'skyline.analyzer.run_time', 'skyline.analyzer.total_metrics',
            'skyline.analyzer.exceptions'
        ],
        'algorithm_source': '/opt/skyline/github/skyline/skyline/custom_algorithms/m66.py',
        'algorithm_parameters': {
            'nth_median': 6, 'sigma': 6, 'window': 5, 'resolution': 60,
            'minimum_sparsity': 0, 'determine_duration': False,
            'return_anomalies': True, 'save_plots_to': False,
            'save_plots_to_absolute_dir': False, 'filename_prefix': False
        },
        'max_execution_time': 1.0,
        'consensus': 1,
        'algorithms_allowed_in_consensus': ['m66'],
        'run_3sigma_algorithms': False,
        'run_before_3sigma': False,
        'run_only_if_consensus': False,
        'use_with': ['crucible', 'luminosity'],
        'debug_logging': False,
    },

    """

    # You MUST define the algorithm_name
    algorithm_name = 'm66'

    # Define the default state of None and None, anomalous does not default to
    # False as that is not correct, False is only correct if the algorithm
    # determines the data point is not anomalous.  The same is true for the
    # anomalyScore.
    anomalous = None
    anomalyScore = None

    return_anomalies = False
    anomalies = []
    anomalies_dict = {}
    anomalies_dict['algorithm'] = algorithm_name

    realtime_analysis = False

    current_logger = None
    dev_null = None

    # If you wanted to log, you can but this should only be done during
    # testing and development
    def get_log(current_skyline_app):
        current_skyline_app_logger = current_skyline_app + 'Log'
        current_logger = logging.getLogger(current_skyline_app_logger)
        return current_logger

    start = timer()

    # Use the algorithm_parameters to determine the sample_period
    debug_logging = None
    try:
        debug_logging = algorithm_parameters['debug_logging']
    except:
        debug_logging = False
    if debug_logging:
        try:
            current_logger = get_log(current_skyline_app)
            current_logger.debug(
                'debug :: %s :: debug_logging enabled with algorithm_parameters - %s'
                % (algorithm_name, str(algorithm_parameters)))
        except Exception as e:
            # This except pattern MUST be used in ALL custom algorithms to
            # facilitate the traceback from any errors.  We want the algorithm
            # to run super fast and without spamming the log with lots of errors.
            # But we do not want the function returning and not reporting
            # anything to the log, so the pythonic except is used to "sample" any
            # algorithm errors to a tmp file and report once per run rather than
            # spewing tons of errors into the log e.g. analyzer.log
            dev_null = e
            record_algorithm_error(current_skyline_app, parent_pid,
                                   algorithm_name, traceback.format_exc())
            # Return None and None as the algorithm could not determine True or False
            del dev_null
            if current_skyline_app == 'webapp':
                return (anomalous, anomalyScore, anomalies, anomalies_dict)
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            return (anomalous, anomalyScore)

    # Allow the m66 parameters to be passed in the algorithm_parameters
    window = 6
    try:
        window = algorithm_parameters['window']
    except KeyError:
        window = 6
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    nth_median = 6
    try:
        nth_median = algorithm_parameters['nth_median']
    except KeyError:
        nth_median = 6
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    n_sigma = 6
    try:
        n_sigma = algorithm_parameters['sigma']
    except KeyError:
        n_sigma = 6
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    resolution = 0
    try:
        resolution = algorithm_parameters['resolution']
    except KeyError:
        resolution = 0
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    determine_duration = False
    try:
        determine_duration = algorithm_parameters['determine_duration']
    except KeyError:
        determine_duration = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    minimum_sparsity = 0
    try:
        minimum_sparsity = algorithm_parameters['minimum_sparsity']
    except KeyError:
        minimum_sparsity = 0
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    shift_to_start_of_window = True
    try:
        shift_to_start_of_window = algorithm_parameters[
            'shift_to_start_of_window']
    except KeyError:
        shift_to_start_of_window = True
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    save_plots_to = False
    try:
        save_plots_to = algorithm_parameters['save_plots_to']
    except KeyError:
        save_plots_to = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    save_plots_to_absolute_dir = False
    try:
        save_plots_to_absolute_dir = algorithm_parameters[
            'save_plots_to_absolute_dir']
    except KeyError:
        save_plots_to_absolute_dir = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e
    filename_prefix = False
    try:
        filename_prefix = algorithm_parameters['filename_prefix']
    except KeyError:
        filename_prefix = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    if debug_logging:
        current_logger.debug('debug :: algorithm_parameters :: %s' %
                             (str(algorithm_parameters)))

    return_anomalies = False
    try:
        return_anomalies = algorithm_parameters['return_anomalies']
    except KeyError:
        return_anomalies = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    try:
        realtime_analysis = algorithm_parameters['realtime_analysis']
    except KeyError:
        realtime_analysis = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    save_plots_to = False
    try:
        save_plots_to = algorithm_parameters['save_plots_to']
    except KeyError:
        save_plots_to = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    save_plots_to_absolute_dir = False
    try:
        save_plots_to_absolute_dir = algorithm_parameters[
            'save_plots_to_absolute_dir']
    except KeyError:
        save_plots_to_absolute_dir = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e
    filename_prefix = False
    try:
        filename_prefix = algorithm_parameters['filename_prefix']
    except KeyError:
        filename_prefix = False
    except Exception as e:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        dev_null = e

    try:
        base_name = algorithm_parameters['base_name']
    except Exception as e:
        # This except pattern MUST be used in ALL custom algorithms to
        # facilitate the traceback from any errors.  We want the algorithm
        # to run super fast and without spamming the log with lots of errors.
        # But we do not want the function returning and not reporting
        # anything to the log, so the pythonic except is used to "sample" any
        # algorithm errors to a tmp file and report once per run rather than
        # spewing tons of errors into the log e.g. analyzer.log
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        # Return None and None as the algorithm could not determine True or False
        dev_null = e
        del dev_null
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (False, None, anomalies)
        return (False, None)
    if debug_logging:
        current_logger.debug('debug :: %s :: base_name - %s' %
                             (algorithm_name, str(base_name)))

    anomalies_dict['metric'] = base_name
    anomalies_dict['anomalies'] = {}

    use_bottleneck = True
    if save_plots_to:
        use_bottleneck = False
    if use_bottleneck:
        import bottleneck as bn

    # ALWAYS WRAP YOUR ALGORITHM IN try and the BELOW except
    try:
        start_preprocessing = timer()

        # INFO: Sorting time series of 10079 data points took 0.002215 seconds
        timeseries = sorted(timeseries, key=lambda x: x[0])
        if debug_logging:
            current_logger.debug('debug :: %s :: time series of length - %s' %
                                 (algorithm_name, str(len(timeseries))))

        # Testing the data to ensure it meets minimum requirements, in the case
        # of Skyline's use of the m66 algorithm this means that:
        # - the time series must have at least 75% of its full_duration
        do_not_use_sparse_data = False
        if current_skyline_app == 'luminosity':
            do_not_use_sparse_data = True

        if minimum_sparsity == 0:
            do_not_use_sparse_data = False

        total_period = 0
        total_datapoints = 0

        calculate_variables = False
        if do_not_use_sparse_data:
            calculate_variables = True
        if determine_duration:
            calculate_variables = True

        if calculate_variables:
            try:
                start_timestamp = int(timeseries[0][0])
                end_timestamp = int(timeseries[-1][0])
                total_period = end_timestamp - start_timestamp
                total_datapoints = len(timeseries)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug(
                        'debug_logging :: %s :: SystemExit called, exiting - %s'
                        % (algorithm_name, e))
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid,
                                       algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error(
                        'error :: debug_logging :: %s :: failed to determine total_period and total_datapoints'
                        % (algorithm_name))
                timeseries = []
            if not timeseries:
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)

            if current_skyline_app == 'analyzer':
                # Default the required period for analyzer to 18 hours
                period_required = int(FULL_DURATION * 0.75)
            else:
                # Determine from timeseries
                if total_period < FULL_DURATION:
                    period_required = int(FULL_DURATION * 0.75)
                else:
                    period_required = int(total_period * 0.75)

            if determine_duration:
                period_required = int(total_period * 0.75)

        if do_not_use_sparse_data:
            # If the time series does not have 75% of its full_duration it does
            # not have sufficient data to sample
            try:
                if total_period < period_required:
                    if debug_logging:
                        current_logger.debug(
                            'debug :: %s :: time series does not have sufficient data'
                            % (algorithm_name))
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)
            except SystemExit as e:
                if debug_logging:
                    current_logger.debug(
                        'debug_logging :: %s :: SystemExit called, exiting - %s'
                        % (algorithm_name, e))
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)
            except:
                traceback_msg = traceback.format_exc()
                record_algorithm_error(current_skyline_app, parent_pid,
                                       algorithm_name, traceback_msg)
                if debug_logging:
                    current_logger.error(traceback_msg)
                    current_logger.error(
                        'error :: debug_logging :: %s :: failed to determine if time series has sufficient data'
                        % (algorithm_name))
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)

            # If the time series does not have 75% of its full_duration
            # datapoints it does not have sufficient data to sample

            # Determine resolution from the last 30 data points
            # INFO took 0.002060 seconds
            if not resolution:
                resolution_timestamps = []
                metric_resolution = False
                for metric_datapoint in timeseries[-30:]:
                    timestamp = int(metric_datapoint[0])
                    resolution_timestamps.append(timestamp)
                timestamp_resolutions = []
                if resolution_timestamps:
                    last_timestamp = None
                    for timestamp in resolution_timestamps:
                        if last_timestamp:
                            resolution = timestamp - last_timestamp
                            timestamp_resolutions.append(resolution)
                            last_timestamp = timestamp
                        else:
                            last_timestamp = timestamp
                    try:
                        del resolution_timestamps
                    except:
                        pass
                if timestamp_resolutions:
                    try:
                        timestamp_resolutions_count = Counter(
                            timestamp_resolutions)
                        ordered_timestamp_resolutions_count = timestamp_resolutions_count.most_common(
                        )
                        metric_resolution = int(
                            ordered_timestamp_resolutions_count[0][0])
                    except SystemExit as e:
                        if debug_logging:
                            current_logger.debug(
                                'debug_logging :: %s :: SystemExit called, exiting - %s'
                                % (algorithm_name, e))
                        if current_skyline_app == 'webapp':
                            return (anomalous, anomalyScore, anomalies,
                                    anomalies_dict)
                        if return_anomalies:
                            return (anomalous, anomalyScore, anomalies)
                        return (anomalous, anomalyScore)
                    except:
                        traceback_msg = traceback.format_exc()
                        record_algorithm_error(current_skyline_app, parent_pid,
                                               algorithm_name, traceback_msg)
                        if debug_logging:
                            current_logger.error(traceback_msg)
                            current_logger.error(
                                'error :: debug_logging :: %s :: failed to determine if time series has sufficient data'
                                % (algorithm_name))
                    try:
                        del timestamp_resolutions
                    except:
                        pass
            else:
                metric_resolution = resolution

            minimum_datapoints = None
            if metric_resolution:
                minimum_datapoints = int(period_required / metric_resolution)
            if minimum_datapoints:
                if total_datapoints < minimum_datapoints:
                    if debug_logging:
                        current_logger.debug(
                            'debug :: %s :: time series does not have sufficient data, minimum_datapoints required is %s and time series has %s'
                            % (algorithm_name, str(minimum_datapoints),
                               str(total_datapoints)))
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)

            # Is the time series fully populated?
            # full_duration_datapoints = int(full_duration / metric_resolution)
            total_period_datapoints = int(total_period / metric_resolution)
            # minimum_percentage_sparsity = 95
            minimum_percentage_sparsity = 90
            sparsity = int(total_datapoints / (total_period_datapoints / 100))
            if sparsity < minimum_percentage_sparsity:
                if debug_logging:
                    current_logger.debug(
                        'debug :: %s :: time series does not have sufficient data, minimum_percentage_sparsity required is %s and time series has %s'
                        % (algorithm_name, str(minimum_percentage_sparsity),
                           str(sparsity)))
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)
            if len(set(item[1] for item in timeseries)) == 1:
                if debug_logging:
                    current_logger.debug(
                        'debug :: %s :: time series does not have sufficient variability, all the values are the same'
                        % algorithm_name)
                anomalous = False
                anomalyScore = 0.0
                if current_skyline_app == 'webapp':
                    return (anomalous, anomalyScore, anomalies, anomalies_dict)
                if return_anomalies:
                    return (anomalous, anomalyScore, anomalies)
                return (anomalous, anomalyScore)

        end_preprocessing = timer()
        preprocessing_runtime = end_preprocessing - start_preprocessing
        if debug_logging:
            current_logger.debug(
                'debug :: %s :: preprocessing took %.6f seconds' %
                (algorithm_name, preprocessing_runtime))

        if not timeseries:
            if debug_logging:
                current_logger.debug('debug :: %s :: m66 not run as no data' %
                                     (algorithm_name))
            anomalies = []
            if current_skyline_app == 'webapp':
                return (anomalous, anomalyScore, anomalies, anomalies_dict)
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            return (anomalous, anomalyScore)
        if debug_logging:
            current_logger.debug('debug :: %s :: timeseries length: %s' %
                                 (algorithm_name, str(len(timeseries))))

        anomalies_dict['timestamp'] = int(timeseries[-1][0])
        anomalies_dict['from_timestamp'] = int(timeseries[0][0])

        start_analysis = timer()
        try:
            # bottleneck is used because it is much faster
            # pd dataframe method (1445 data point - 24hrs): took 0.077915 seconds
            # bottleneck method (1445 data point - 24hrs): took 0.005692 seconds
            # numpy and pandas rolling
            # 2021-07-30 12:37:31 :: 2827897 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 136.93 seconds
            # 2021-07-30 12:44:53 :: 2855884 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 148.82 seconds
            # 2021-07-30 12:48:41 :: 2870822 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 145.62 seconds
            # 2021-07-30 12:55:00 :: 2893634 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 139.00 seconds
            # 2021-07-30 12:59:31 :: 2910443 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 144.80 seconds
            # 2021-07-30 13:02:31 :: 2922928 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 143.35 seconds
            # 2021-07-30 14:12:56 :: 3132457 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 129.25 seconds
            # 2021-07-30 14:22:35 :: 3164370 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 125.72 seconds
            # 2021-07-30 14:28:24 :: 3179687 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 222.43 seconds
            # 2021-07-30 14:33:45 :: 3179687 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 244.00 seconds
            # 2021-07-30 14:36:27 :: 3214047 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 141.10 seconds
            # numpy and bottleneck
            # 2021-07-30 16:41:52 :: 3585162 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 73.92 seconds
            # 2021-07-30 16:46:46 :: 3585162 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 68.84 seconds
            # 2021-07-30 16:51:48 :: 3585162 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 70.55 seconds
            # numpy and bottleneck (passing resolution and not calculating in m66)
            # 2021-07-30 16:57:46 :: 3643253 :: cloudbursts :: find_cloudbursts completed on 1530 metrics in 65.59 seconds

            if use_bottleneck:
                if len(timeseries) < 10:
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)

                x_np = np.asarray([x[1] for x in timeseries])
                # Fast Min-Max scaling
                data = (x_np - x_np.min()) / (x_np.max() - x_np.min())
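                # NOTE: a constant series would make the denominator here zero;
                # the all-values-identical check in the preprocessing above is
                # relied on to have returned before this point in that case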

                # m66 - calculate to nth_median
                median_count = 0
                while median_count < nth_median:
                    median_count += 1
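                    # bn.move_median yields NaN for the first (window - 1)
                    # positions; those NaNs propagate through each successive
                    # median pass and through bn.move_std below, where they
                    # are zeroed by np.nan_to_num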
                    rolling_median_s = bn.move_median(data, window=window)
                    median = rolling_median_s.tolist()
                    data = median
                    if median_count == nth_median:
                        break

                # m66 - calculate the moving standard deviation for the
                # nth_median array
                rolling_std_s = bn.move_std(data, window=window)
                std_nth_median_array = np.nan_to_num(rolling_std_s,
                                                     copy=False,
                                                     nan=0.0,
                                                     posinf=None,
                                                     neginf=None)
                std_nth_median = std_nth_median_array.tolist()
                if debug_logging:
                    current_logger.debug(
                        'debug :: %s :: std_nth_median calculated with bn' %
                        (algorithm_name))
            else:
                df = pd.DataFrame(timeseries, columns=['date', 'value'])
                df['date'] = pd.to_datetime(df['date'], unit='s')
                datetime_index = pd.DatetimeIndex(df['date'].values)
                df = df.set_index(datetime_index)
                df.drop('date', axis=1, inplace=True)
                original_df = df.copy()
                # MinMax scale
                df = (df - df.min()) / (df.max() - df.min())
                # window = 6
                data = df['value'].tolist()

                if len(data) < 10:
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)

                # m66 - calculate to nth_median
                median_count = 0
                while median_count < nth_median:
                    median_count += 1
                    s = pd.Series(data)
                    rolling_median_s = s.rolling(window).median()
                    median = rolling_median_s.tolist()
                    data = median
                    if median_count == nth_median:
                        break

                # m66 - calculate the moving standard deviation for the
                # nth_median array
                s = pd.Series(data)
                rolling_std_s = s.rolling(window).std()
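                # NOTE: pandas rolling().std() uses ddof=1 (sample std)
                # whereas bn.move_std defaults to ddof=0, so the two code
                # paths can differ slightly for the same window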

                nth_median_column = 'std_nth_median_%s' % str(nth_median)
                df[nth_median_column] = rolling_std_s.tolist()
                std_nth_median = df[nth_median_column].fillna(0).tolist()

            # m66 - calculate the standard deviation for the entire nth_median
            # array
            metric_stddev = np.std(std_nth_median)
            std_nth_median_n_sigma = []
            anomalies_found = False

            for value in std_nth_median:
                # m66 - if the value in the nth median array is greater than
                # n_sigma times the metric_stddev the datapoint is anomalous
                if value > (metric_stddev * n_sigma):
                    std_nth_median_n_sigma.append(1)
                    anomalies_found = True
                else:
                    std_nth_median_n_sigma.append(0)
            std_nth_median_n_sigma_column = 'std_median_%s_%s_sigma' % (
                str(nth_median), str(n_sigma))
            if not use_bottleneck:
                df[std_nth_median_n_sigma_column] = std_nth_median_n_sigma

            anomalies = []
            # m66 - only label datapoints as anomalous if the n_sigma triggers
            # persist for more than (window / 2) consecutive datapoints
            if anomalies_found:
                current_triggers = []
                for index, item in enumerate(timeseries):
                    if std_nth_median_n_sigma[index] == 1:
                        current_triggers.append(index)
                    else:
                        if len(current_triggers) > int(window / 2):
                            for trigger_index in current_triggers:
                                # Shift the anomaly back towards the start of
                                # the window to account for the lag introduced
                                # by the repeated rolling windows
                                if shift_to_start_of_window:
                                    anomalies.append(
                                        timeseries[(trigger_index -
                                                    (window * int(
                                                        (nth_median / 2))))])
                                else:
                                    anomalies.append(timeseries[trigger_index])
                        current_triggers = []
                # Process any remaining current_triggers
                if len(current_triggers) > int(window / 2):
                    for trigger_index in current_triggers:
                        # Shift the anomaly back towards the start of the
                        # window to account for the lag introduced by the
                        # repeated rolling windows
                        if shift_to_start_of_window:
                            anomalies.append(
                                timeseries[(trigger_index - (window * int(
                                    (nth_median / 2))))])
                        else:
                            anomalies.append(timeseries[trigger_index])
            if not anomalies:
                anomalous = False

            if anomalies:
                anomalous = True
                anomalies_data = []
                anomaly_timestamps = [int(item[0]) for item in anomalies]
                for item in timeseries:
                    if int(item[0]) in anomaly_timestamps:
                        anomalies_data.append(1)
                    else:
                        anomalies_data.append(0)
                if not use_bottleneck:
                    df['anomalies'] = anomalies_data
                anomalies_list = []
                for ts, value in timeseries:
                    if int(ts) in anomaly_timestamps:
                        anomalies_list.append([int(ts), value])
                        anomalies_dict['anomalies'][int(ts)] = value

            if anomalies and save_plots_to:
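                # NOTE: the plot below uses original_df and df['anomalies'],
                # which are only built in the pandas (not use_bottleneck)
                # branch; with use_bottleneck the plot call raises and is
                # logged as an error by the except handler below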
                try:
                    from adtk.visualization import plot
                    metric_dir = base_name.replace('.', '/')
                    timestamp_dir = str(int(timeseries[-1][0]))
                    save_path = '%s/%s/%s/%s' % (save_plots_to, algorithm_name,
                                                 metric_dir, timestamp_dir)
                    if save_plots_to_absolute_dir:
                        save_path = '%s' % save_plots_to
                    anomalies_dict['file_path'] = save_path
                    save_to_file = '%s/%s.%s.png' % (save_path, algorithm_name,
                                                     base_name)
                    if filename_prefix:
                        save_to_file = '%s/%s.%s.%s.png' % (
                            save_path, filename_prefix, algorithm_name,
                            base_name)
                    save_to_path = os_path_dirname(save_to_file)
                    title = '%s\n%s - median %s %s-sigma persisted (window=%s)' % (
                        base_name, algorithm_name, str(nth_median),
                        str(n_sigma), str(window))

                    if not os_path_exists(save_to_path):
                        try:
                            mkdir_p(save_to_path)
                        except Exception as e:
                            current_logger.error(
                                'error :: %s :: failed to create dir - %s - %s'
                                % (algorithm_name, save_to_path, e))
                    if os_path_exists(save_to_path):
                        try:
                            plot(original_df['value'],
                                 anomaly=df['anomalies'],
                                 anomaly_color='red',
                                 title=title,
                                 save_to_file=save_to_file)
                            if debug_logging:
                                current_logger.debug(
                                    'debug :: %s :: plot saved to - %s' %
                                    (algorithm_name, save_to_file))
                            anomalies_dict['image'] = save_to_file
                        except Exception as e:
                            current_logger.error(
                                'error :: %s :: failed to plot - %s - %s' %
                                (algorithm_name, base_name, e))
                    anomalies_file = '%s/%s.%s.anomalies_list.txt' % (
                        save_path, algorithm_name, base_name)
                    with open(anomalies_file, 'w') as fh:
                        fh.write(str(anomalies_list))
                        # os.chmod(anomalies_file, mode=0o644)
                    data_file = '%s/data.txt' % (save_path)
                    with open(data_file, 'w') as fh:
                        fh.write(str(anomalies_dict))
                except SystemExit as e:
                    if debug_logging:
                        current_logger.debug(
                            'debug_logging :: %s :: SystemExit called during save plot, exiting - %s'
                            % (algorithm_name, e))
                    if current_skyline_app == 'webapp':
                        return (anomalous, anomalyScore, anomalies,
                                anomalies_dict)
                    if return_anomalies:
                        return (anomalous, anomalyScore, anomalies)
                    return (anomalous, anomalyScore)
                except Exception as e:
                    traceback_msg = traceback.format_exc()
                    record_algorithm_error(current_skyline_app, parent_pid,
                                           algorithm_name, traceback_msg)
                    if debug_logging:
                        current_logger.error(traceback_msg)
                        current_logger.error(
                            'error :: %s :: failed to plot or save anomalies file - %s - %s'
                            % (algorithm_name, base_name, e))

            try:
                del df
            except:
                pass
        except SystemExit as e:
            if debug_logging:
                current_logger.debug(
                    'debug_logging :: %s :: SystemExit called during analysis, exiting - %s'
                    % (algorithm_name, e))
            if current_skyline_app == 'webapp':
                return (anomalous, anomalyScore, anomalies, anomalies_dict)
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            return (anomalous, anomalyScore)
        except:
            traceback_msg = traceback.format_exc()
            record_algorithm_error(current_skyline_app, parent_pid,
                                   algorithm_name, traceback_msg)
            if debug_logging:
                current_logger.error(traceback_msg)
                current_logger.error(
                    'error :: debug_logging :: %s :: failed to run on ts' %
                    (algorithm_name))
            if current_skyline_app == 'webapp':
                return (anomalous, anomalyScore, anomalies, anomalies_dict)
            if return_anomalies:
                return (anomalous, anomalyScore, anomalies)
            return (anomalous, anomalyScore)

        end_analysis = timer()
        analysis_runtime = end_analysis - start_analysis

        if debug_logging:
            current_logger.debug(
                'debug :: analysis with %s took %.6f seconds' %
                (algorithm_name, analysis_runtime))

        if anomalous:
            anomalyScore = 1.0
        else:
            anomalyScore = 0.0

        if debug_logging:
            current_logger.info(
                '%s :: anomalous - %s, anomalyScore - %s' %
                (algorithm_name, str(anomalous), str(anomalyScore)))

        if debug_logging:
            end = timer()
            processing_runtime = end - start
            current_logger.info('%s :: completed in %.6f seconds' %
                                (algorithm_name, processing_runtime))
        try:
            del timeseries
        except:
            pass
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        return (anomalous, anomalyScore)
    except SystemExit as e:
        if debug_logging:
            current_logger.debug(
                'debug_logging :: %s :: SystemExit called (before StopIteration), exiting - %s'
                % (algorithm_name, e))
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (anomalous, anomalyScore, anomalies)
        return (anomalous, anomalyScore)
    except StopIteration:
        # This except pattern MUST be used in ALL custom algorithms to
        # facilitate the traceback from any errors.  We want the algorithm to
        # run super fast and without spamming the log with lots of errors.
        # But we do not want the function returning without reporting
        # anything to the log, so the pythonic except is used to "sample" any
        # algorithm errors to a tmp file and report once per run rather than
        # spewing tons of errors into the log e.g. analyzer.log
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (False, None, anomalies)
        return (False, None)
    except:
        record_algorithm_error(current_skyline_app, parent_pid, algorithm_name,
                               traceback.format_exc())
        # Return None and None as the algorithm could not determine True or False
        if current_skyline_app == 'webapp':
            return (anomalous, anomalyScore, anomalies, anomalies_dict)
        if return_anomalies:
            return (False, None, anomalies)
        return (False, None)

    if current_skyline_app == 'webapp':
        return (anomalous, anomalyScore, anomalies, anomalies_dict)
    if return_anomalies:
        return (anomalous, anomalyScore, anomalies)
    return (anomalous, anomalyScore)
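
The bottleneck path above is the core of the technique: Min-Max scale the
values, take the rolling median nth_median times, take the moving standard
deviation of the result, and flag points that exceed n_sigma times the
overall standard deviation of that series. Below is a minimal, self-contained
sketch of just that core; the function name, the defaults (window=6,
nth_median=6, n_sigma=6) and the omission of the (window / 2) trigger
persistence step are assumptions for illustration, not the Skyline
implementation itself.

import numpy as np
import bottleneck as bn


def m66_core(values, window=6, nth_median=6, n_sigma=6):
    """Sketch of the bottleneck-based core used above: return a 0/1 list
    marking datapoints whose nth-median moving std exceeds n_sigma times
    the overall std of that moving std series."""
    x_np = np.asarray(values, dtype=np.float64)
    if x_np.size < 10 or x_np.max() == x_np.min():
        return [0] * int(x_np.size)
    # Fast Min-Max scaling
    data = (x_np - x_np.min()) / (x_np.max() - x_np.min())
    # m66 - calculate to nth_median
    for _ in range(nth_median):
        data = bn.move_median(data, window=window)
    # moving standard deviation of the nth_median array, leading NaNs -> 0
    std_nth_median = np.nan_to_num(bn.move_std(data, window=window), nan=0.0)
    metric_stddev = np.std(std_nth_median)
    return [1 if value > (metric_stddev * n_sigma) else 0
            for value in std_nth_median]

The per-series timings noted in the comments above (0.077915 seconds for the
pandas dataframe method versus 0.005692 seconds with bottleneck on 1445
datapoints) are the motivation for this path.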
Ejemplo n.º 19
0
 def time_move_std(self, dtype, shape, window):
     bn.move_std(self.arr, window)
Ejemplo n.º 20
0
 def time_move_std(self, dtype, shape, order, axis, window):
     bn.move_std(self.arr, window, axis=axis)
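
Ejemplos n.º 19 and 20 only show the timed method of what look like asv-style
benchmark classes. Below is a minimal sketch of the kind of class such a
method would sit in; the class name, the parameter values and the setup
method are assumptions, since the fragments above only define time_move_std
and imply that self.arr exists.

import numpy as np
import bottleneck as bn


class TimeMoveStd:
    # Hypothetical asv-style parameterisation; only dtype, shape and window
    # are implied by the time_move_std signature above
    params = (["float64"], [(100000,)], [10, 100])
    param_names = ["dtype", "shape", "window"]

    def setup(self, dtype, shape, window):
        # build the array the timed method operates on
        self.arr = np.random.RandomState(0).standard_normal(shape).astype(dtype)

    def time_move_std(self, dtype, shape, window):
        bn.move_std(self.arr, window)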