Example #1
    def ForwardPropagate(self):
        """
        This function will
        (1) fetch observation data from self.o_data (size T x dim_o),
        (2) perform forward propagation, and
        (3) save results in self.s_data (size T x M x dim_s); it also computes
            the forecast error and saves it in error (size T x (K+1)).
        """
        error = np.zeros((self.T, self.K + 1))

        # deal with t=0
        feed = {self.o_t: self.o_data[0],
                self.s_old: self.sess.run(self.s_0)}
        s_pre, prob, o_forecast = self.sess.run(
            [self.s_new, self.s_new_w, self.o_forecast], feed)
        for i in range(self.K + 1):
            error[0, i] = np.sum((self.o_data[i] - np.array(o_forecast[i])) ** 2)
        util.resample(self.s_data[0, :], s_pre, prob[:, 0])

        # deal with t>0
        for t in range(1, self.T - self.K):
            feed = {self.o_t: self.o_data[t],
                    self.s_old: self.s_data[t - 1, :, :]}
            s_pre, prob, o_forecast = self.sess.run(
                [self.s_new, self.s_new_w, self.o_forecast], feed)
            for i in range(self.K + 1):
                error[t, i] = np.sum((self.o_data[t + i] - np.array(o_forecast[i])) ** 2)
            util.resample(self.s_data[t, :], s_pre, prob[:, 0])

        return error
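
None of these examples ship the util module they call, so where the call sites pin down a signature, a hedged sketch follows the example. Here util.resample(out, particles, weights) is invoked purely for its side effect on out, which reads as in-place multinomial particle resampling; a minimal sketch under that assumption (names and behavior are inferred, this is not the original helper):

import numpy as np

def resample(out, particles, weights):
    """In-place multinomial resampling: draw out.shape[0] particles
    with replacement, with probability proportional to weights."""
    w = np.asarray(weights, dtype=np.float64)
    w = w / w.sum()                      # normalize weights to a distribution
    idx = np.random.choice(len(w), size=out.shape[0], p=w)
    out[:] = np.asarray(particles)[idx]  # overwrite the caller's buffer in place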
Example #2
def create_input_files(tile_id):

    print "Getting extent of", tile_id
    xmin, ymin, xmax, ymax = uu.coords(tile_id)

    # # Soil tiles are already processed, so there's no need to include them here.
    # # Below is the old code for tile-izing the histosol soil raster.
    # # Leaving this in case I ever add in soil processing again.
    # print "clip soil"
    # extra_param = ['-tr', '.00025', '.00025', '-dstnodata', '0']
    # clip_soil_tile = util.clip('hwsd_oc_final.tif', '{}_soil.tif'.format(tile_id), xmin, ymin, xmax, ymax, extra_param)
    #
    # print "removing no data flag from soil"
    # cmd = ['gdal_edit.py', '-unsetnodata', clip_soil_tile]
    # subprocess.check_call(cmd)
    #
    # print "uploading soil tile to s3"
    # util.upload(clip_soil_tile, cn.soil_C_processed_dir)

    print "Rasterizing ecozone"
    rasterized_eco_zone_tile = util.rasterize(
        'fao_ecozones_bor_tem_tro.shp',
        "{}_fao_ecozones_bor_tem_tro.tif".format(tile_id), xmin, ymin, xmax,
        ymax, '.008', 'Byte', 'recode', '0')

    print "Resampling eco zone"
    resampled_ecozone = util.resample(
        rasterized_eco_zone_tile,
        "{0}_{1}.tif".format(tile_id, cn.pattern_fao_ecozone_processed))

    print "Uploading processed ecozone"
    util.upload(resampled_ecozone, cn.fao_ecozone_processed_dir)

    print "Clipping srtm"
    tile_srtm = util.clip('srtm.vrt', '{}_srtm.tif'.format(tile_id), xmin,
                          ymin, xmax, ymax)

    print "Resampling srtm"
    tile_res_srtm = util.resample(
        tile_srtm, '{0}_{1}.tif'.format(tile_id, cn.pattern_srtm))

    print "Uploading processed srtm"
    util.upload(tile_res_srtm, cn.srtm_processed_dir)

    print "Clipping precipitation"
    clipped_precip_tile = util.clip('add_30s_precip.tif',
                                    '{}_clip_precip.tif'.format(tile_id), xmin,
                                    ymin, xmax, ymax)

    print "Resampling precipitation"
    resample_precip_tile = util.resample(
        clipped_precip_tile, '{0}_{1}.tif'.format(tile_id, cn.pattern_precip))

    print "Uploading processed precipitation"
    util.upload(resample_precip_tile, cn.precip_processed_dir)
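
In this example util.resample(input_tif, output_tif) writes a resampled copy of a GeoTIFF, alongside GDAL-based util.clip and util.rasterize helpers. A plausible sketch is a thin gdalwarp wrapper; the target resolution and resampling method below are assumptions, not taken from the original util:

import subprocess

def resample(input_tif, output_tif, target_res='0.00025'):
    # Hypothetical gdalwarp wrapper: regrid onto a square target_res raster.
    cmd = ['gdalwarp', '-tr', target_res, target_res,
           '-r', 'near', '-overwrite', input_tif, output_tif]
    subprocess.check_call(cmd)
    return output_tif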
Example #3
    def summary_turnover(self, by=None):
        """Returns a turnover-related metrics summary DataFrame."""
        index = ['turnover_t', 'turnover_h', 'turnover_d']
        tvr_t, tvr_h, tvr_d = self.get_turnover()
        res = {
            'turnover_t': util.resample(tvr_t, how='mean', by=by),
            'turnover_h': util.resample(tvr_h, how='mean', by=by),
            'turnover_d': util.resample(tvr_d, how='mean', by=by),
        }
        res = pd.Series(res) if by is None else pd.DataFrame(res).T
        res = res.reindex(index)
        return pd.DataFrame({'ALL': res}) if by is None else res
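
The quant examples (#3, #5, #7, #9, #16, #19) share a util.resample(series, how=..., by=...) that aggregates a daily series: how is 'mean', 'count', or 'ir' (mean divided by standard deviation), and by is None for a single aggregate or a pandas offset alias for grouped output. A sketch consistent with that usage, assuming a DatetimeIndex:

import pandas as pd

def resample(ser, how='mean', by=None):
    # by=None collapses the series to one value; otherwise group by a
    # pandas offset alias such as 'M' (monthly) or 'A' (yearly).
    if by is None:
        return ser.mean() / ser.std() if how == 'ir' else ser.agg(how)
    grouped = ser.resample(by)
    if how == 'ir':
        return grouped.mean() / grouped.std()
    return grouped.agg(how)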
Example #4
    def fit(self, data, verbose=False):
        """
        Fits a consensus matrix for each number of clusters
        Args:
          * data -> (examples,attributes) format
          * verbose -> whether to print progress
        """
        N = data.shape[0]  # number of points
        Mk = np.zeros((self.K_ - self.L_, N, N))
        # Is counts, for each pair of points, how often both appeared in the
        # resampled data for the current number of clusters
        Is = np.zeros((N, N))
        for k in range(self.L_, self.K_):  # for each number of clusters
            i_ = k - self.L_
            if verbose:
                print("At k = %d, aka. iteration = %d" % (k, i_))
            for h in range(self.H_):  # resample H times
                if verbose:
                    print("\tAt resampling h = %d, (k = %d)" % (h, k))
                resampled_indices, resample_data = util.resample(data, self.resample_proportion_)
                Mh = self.cluster_(n_clusters=k).fit_predict(resample_data)
                # find indexes of elements from same clusters with bisection
                # on sorted array => this is more efficient than brute force search
                id_clusts = np.argsort(Mh)
                sorted_ = Mh[id_clusts]  # 0000000000111111111111222222
                for i in range(k):  # for each cluster
                    ia = bisect.bisect_left(sorted_, i)
                    ib = bisect.bisect_right(sorted_, i)
                    cluster_indices = id_clusts[ia:ib]
                    is_ = resampled_indices[cluster_indices]
                    ids_ = np.array(list(combinations(is_, 2))).T  # get all pairs of i-th cluster
                    # sometimes only one element is in a cluster (no combinations)
                    if ids_.size != 0:
                        Mk[i_, ids_[0], ids_[1]] += 1
                # increment counts
                ids_2 = np.array(list(combinations(resampled_indices, 2))).T
                Is[ids_2[0], ids_2[1]] += 1
            Is += Is.T
            Mk[i_] /= Is + 1e-8  # consensus matrix
            Mk[i_] += Mk[i_].T  # Mk[i_] is upper triangular (with zeros on diagonal), we now make it symmetric
            Mk[i_] += np.eye(N)
            # Mk[i_, range(N), range(N)] = 1  # always with self, fill the diag
            Is.fill(0)  # reset counter
        self.Mk = Mk

        # fits areas under the CDFs
        self.Ak = np.zeros(self.K_ - self.L_)
        for i, m in enumerate(Mk):
            hist, bins = np.histogram(m.ravel(), density=True)
            # builtin sum: calling np.sum on a generator is deprecated
            self.Ak[i] = sum(h * (b - a)
                             for b, a, h in zip(bins[1:], bins[:-1], np.cumsum(hist)))

        # fits differences between areas under CDFs
        self.deltaK = np.array([(Ab - Aa) / Aa if i > 2 else Aa
                                for Ab, Aa, i in zip(self.Ak[1:], self.Ak[:-1], range(self.L_, self.K_ - 1))])
        # parenthesized: without it, argmax runs even when deltaK is empty and raises
        self.bestK = (np.argmax(self.deltaK) + self.L_) \
            if self.deltaK.size > 0 else self.L_
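
In the consensus-clustering examples (#4, #11, #12), util.resample(data, proportion) returns (indices, rows) for a random subset of the data. A minimal sketch under that reading, sampling without replacement as is standard for consensus clustering:

import numpy as np

def resample(data, proportion):
    """Pick round(N * proportion) distinct rows; return (indices, rows)."""
    n = int(round(data.shape[0] * proportion))
    idx = np.random.choice(data.shape[0], size=n, replace=False)
    return idx, data[idx]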
Example #5
    def summary_turnover(self, by=None, freq='daily'):
        """Returns a turnover-related metrics summary Series/Dataframe.

        :param str freq: Which frequency of statistics is of interest? 'daily'(default): only returns turnover, AC1, rAC1; 'weekly': returns also AC5, rAC5; 'monthly': returns also AC20, rAC20

        These metrics are:
           * turnover: average daily turnover
           * AC1: average daily 1-day auto-corrwithelation
           * AC5: average daily 5-day auto-corrwithelation
           * AC20: average daily 20-day auto-corrwithelation
           * rAC1: average daily 1-day rank auto-corrwithelation
           * rAC5: average daily 5-day rank auto-corrwithelation
           * rAC20: average daily 20-day rank auto-corrwithelation
        """

        index = ['turnover', 'AC1', 'rAC1']
        tmp = {
            'turnover': util.resample(self.get_turnover(), how='mean', by=by),
            'AC1': util.resample(self.get_ac(1), how='mean', by=by),
            'rAC1': util.resample(self.get_ac(1, rank=True), how='mean',
                                  by=by),
        }
        if freq == 'weekly':
            index.extend(['AC5', 'rAC5'])
            tmp.update({
                'AC5':
                util.resample(self.get_ac(5), how='mean', by=by),
                'rAC5':
                util.resample(self.get_ac(5, rank=True), how='mean', by=by),
            })
        elif freq == 'monthly':
            index.extend(['AC5', 'rAC5', 'AC20', 'rAC20'])
            tmp.update({
                'AC5':
                util.resample(self.get_ac(5), how='mean', by=by),
                'rAC5':
                util.resample(self.get_ac(5, rank=True), how='mean', by=by),
                'AC20':
                util.resample(self.get_ac(20), how='mean', by=by),
                'rAC20':
                util.resample(self.get_ac(20, rank=True), how='mean', by=by),
            })
        res = pd.Series(tmp) if by is None else pd.DataFrame(tmp).T
        res = res.reindex(index)
        return res
Example #6
def estimate(trainX, trainY, resample_num):
    sample_pos_means = []
    sample_pos_covs = []
    sample_neg_means = []
    sample_neg_covs = []

    for i in range(resample_num):
        [sampledX, sampledY] = util.resample(trainX, trainY)
        [positiveX, negativeX] = util.split(sampledX, sampledY)

        sample_pos_means.append(np.mean(positiveX, 0))
        sample_neg_means.append(np.mean(negativeX, 0))
        sample_pos_covs.append(np.cov(np.array(positiveX).T))
        sample_neg_covs.append(np.cov(np.array(negativeX).T))

    nominal_pos_mean = np.mean(sample_pos_means, 0)
    nominal_neg_mean = np.mean(sample_neg_means, 0)
    nominal_pos_cov = np.mean(sample_pos_covs, 0)
    nominal_neg_cov = np.mean(sample_neg_covs, 0)

    sample_pos_means_cov = np.cov(np.array(sample_pos_means).T)
    sample_neg_means_cov = np.cov(np.array(sample_neg_means).T)
    #log(sample_pos_means_cov)
    #log(sample_neg_means_cov)
    # Cholesky factorizations act as positive-definiteness checks here
    # (np.linalg.cholesky raises LinAlgError otherwise); results are discarded.
    np.linalg.cholesky(sample_pos_means_cov +
                       np.eye(sample_pos_means_cov.shape[0]) * 1e-8)
    np.linalg.cholesky(sample_neg_means_cov +
                       np.eye(sample_neg_means_cov.shape[0]) * 1e-8)
    P_pos = np.linalg.inv(sample_pos_means_cov +
                          np.eye(sample_pos_means_cov.shape[0]) *
                          1e-8) / len(trainX)
    P_neg = np.linalg.inv(sample_neg_means_cov +
                          np.eye(sample_neg_means_cov.shape[0]) *
                          1e-8) / len(trainX)
    np.linalg.cholesky(P_pos + np.eye(sample_neg_means_cov.shape[0]) * 1e-3)
    np.linalg.cholesky(P_neg + np.eye(sample_neg_means_cov.shape[0]) * 1e-3)

    rho_pos = 0
    rho_neg = 0

    for cov_matrix in sample_pos_covs:
        dis = util.F_norm(cov_matrix - nominal_pos_cov)
        rho_pos = max(dis, rho_pos)

    for cov_matrix in sample_neg_covs:
        dis = util.F_norm(cov_matrix - nominal_neg_cov)
        rho_neg = max(dis, rho_neg)

    return [
        nominal_pos_mean, P_pos, nominal_neg_mean, P_neg, nominal_pos_cov,
        rho_pos, nominal_neg_cov, rho_neg
    ]
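
Here util.resample(trainX, trainY) behaves as a paired bootstrap: each call draws a same-size dataset with replacement so the spread of per-sample means and covariances can be measured. A sketch under that assumption (util.split, which separates the classes, is not sketched):

import numpy as np

def resample(X, y):
    """Paired bootstrap: sample len(X) rows with replacement."""
    X, y = np.asarray(X), np.asarray(y)
    idx = np.random.choice(len(X), size=len(X), replace=True)
    return [X[idx], y[idx]]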
Example #7
    def summary_ir(self, by=None, freq='daily'):
        """Returns a IR-related metrics summary Series/Dataframe.

        :param str freq: Which frequency of statistics is of interest?
           'daily' (default): only returns IR1, rIR1;
           'weekly': returns also IR5, rIR5;
           'monthly': returns also IR20, rIR20

        These metrics are:
           * IR1: mean(IC(1)) / std(IC(1))
           * IR5: mean(IC(5)) / std(IC(5))
           * IR20: mean(IC(20)) / std(IC(20))
           * rIR1: mean(rank IC(1)) / std(rank IC(1))
           * rIR5: mean(rank IC(5)) / std(rank IC(5))
           * rIR20: mean(rank IC(20)) / std(rank IC(20))
        """

        index = ['days', 'IR1', 'rIR1']
        tmp = {
            'days': util.resample(self.get_ic(1), how='count', by=by),
            'IR1': util.resample(self.get_ic(1), how='ir', by=by),
            'rIR1': util.resample(self.get_ic(1, rank=True), how='ir', by=by),
        }

        if freq == 'weekly':
            index.extend(['IR5', 'rIR5'])
            tmp.update({
                'IR5':
                util.resample(self.get_ic(5), how='ir', by=by),
                'rIR5':
                util.resample(self.get_ic(5, rank=True), how='ir', by=by),
            })
        elif freq == 'monthly':
            index.extend(['IR5', 'rIR5', 'IR20', 'rIR20'])
            tmp.update({
                'IR5':
                util.resample(self.get_ic(5), how='ir', by=by),
                'rIR5':
                util.resample(self.get_ic(5, rank=True), how='ir', by=by),
                'IR20':
                util.resample(self.get_ic(20), how='ir', by=by),
                'rIR20':
                util.resample(self.get_ic(20, rank=True), how='ir', by=by),
            })
        res = pd.Series(tmp) if by is None else pd.DataFrame(tmp).T
        res = res.reindex(index)
        return res
Example #9
 def summary_ir(self, by=None, freq='daily'):
     """Returns a IR-related metrics summary Dataframe."""
     index = ['days', 'IR_t', 'rIR_t', 'IR_h', 'rIR_h', 'IR_d', 'rIR_d']
     ic_t, ic_h, ic_d = self.get_ic()
     ric_t, ric_h, ric_d = self.get_ic(rank=True)
     res = {
         'days': util.resample(ic_t, how='count', by=by),
         'IR_t': util.resample(ic_t, how='ir', by=by),
         'rIR_t': util.resample(ric_t, how='ir', by=by),
         'IR_h': util.resample(ic_h, how='ir', by=by),
         'rIR_h': util.resample(ric_h, how='ir', by=by),
         'IR_d': util.resample(ic_d, how='ir', by=by),
         'rIR_d': util.resample(ric_d, how='ir', by=by),
     }
     res = pd.Series(res) if by is None else pd.DataFrame(res).T
     res = res.reindex(index)
     return pd.DataFrame({'ALL': res}) if by is None else res
Example #10
def predict(path: str, data_x: np.ndarray):
    # Pretreatment
    data_x = [data_x]
    data_x, length = util.resample(data_x, 600)
    data_x = util.reshape(data_x, length)
    for i in range(len(data_x)):
        data_x[i, :, 0] = util.regularize(data_x[i, :, 0])
        data_x[i, :, 1] = util.regularize(data_x[i, :, 1])
        data_x[i, :, 2] = util.regularize(data_x[i, :, 2])

    with tf.Session() as sess:
        # the metagraph is stored at '<path>.meta', not in a '.meta' subpath
        saver = tf.train.import_meta_graph(path + '.meta')
        saver.restore(sess, path)
        graph = tf.get_default_graph()
        placehold_x = graph.get_tensor_by_name('input/data_x:0')
        predict_value = graph.get_tensor_by_name('accuracy/predict:0')
        keep_prob = graph.get_tensor_by_name('keep_prob:0')

        return sess.run(predict_value, feed_dict={placehold_x: data_x, keep_prob: 1})[0][0]
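
Examples #10 and #17 call util.resample(data_x, 600) to bring every recorded signal to a common length of 600 samples before reshaping. A sketch using scipy.signal.resample; returning the target length alongside the array matches how both callers unpack the result, though that detail is inferred:

import numpy as np
from scipy.signal import resample as _scipy_resample

def resample(signals, length):
    """Resample each (possibly variable-length) signal to `length` samples."""
    out = np.array([_scipy_resample(np.asarray(s, dtype=np.float64), length, axis=0)
                    for s in signals])
    return out, length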
Example #11
def resampled_mv(X, alg, n_base_partitions=30, resample_proportion=0.8):
    """Majority voting with resampling"""
    N = X.shape[0]
    ca = np.zeros((N, N))
    Is = np.zeros((N, N))

    for h in range(n_base_partitions):
        resampled_indices, resampled_data = resample(X, resample_proportion)
        alg.fit(resampled_data)

        if hasattr(alg, 'predict'):
            Mh = alg.predict(resampled_data)
        else:
            Mh = alg.labels_

        id_clusts = np.argsort(Mh)
        sorted_ = Mh[id_clusts]

        k = len(np.unique(sorted_))

        for i in range(k):  # for each cluster
            ia = bisect.bisect_left(sorted_, i)
            ib = bisect.bisect_right(sorted_, i)
            cluster_indices = id_clusts[ia:ib]
            is_ = resampled_indices[cluster_indices]
            ids_ = np.array(list(combinations(is_, 2))).T

            if ids_.size != 0:
                ca[ids_[0], ids_[1]] += 1

        ids_2 = np.array(list(combinations(resampled_indices, 2))).T
        Is[ids_2[0], ids_2[1]] += 1
    Is += Is.T
    ca = ca / (Is + 1e-8)
    ca += ca.T
    ca += np.eye(N)

    labels = mv_consensus(ca)

    return labels
Example #12
    def fit_from_cfg(self, data):
        self.X = data
        N = data.shape[0]  # number of points
        Mk = np.zeros((N, N))
        # Is counts, for each pair of points, how often both appeared
        # in the resampled data
        Is = np.zeros((N, N))

        for h in range(self.H_):  # resample H times
            resampled_indices, resample_data = util.resample(data, self.resample_proportion_)
            self.cluster_.fit(resample_data)
            if hasattr(self.cluster_, 'predict'):
                Mh = self.cluster_.predict(resample_data)
            else:
                Mh = self.cluster_.labels_

            id_clusts = np.argsort(Mh)
            sorted_ = Mh[id_clusts]  # 0000000000111111111111222222

            k = len(np.unique(sorted_))

            for i in range(k):  # for each cluster
                ia = bisect.bisect_left(sorted_, i)
                ib = bisect.bisect_right(sorted_, i)
                cluster_indices = id_clusts[ia:ib]
                is_ = resampled_indices[cluster_indices]
                ids_ = np.array(list(combinations(is_, 2))).T  # get all pairs of i-th cluster
                # sometimes only one element is in a cluster (no combinations)
                if ids_.size != 0:
                    Mk[ids_[0], ids_[1]] += 1
            # increment counts
            ids_2 = np.array(list(combinations(resampled_indices, 2))).T
            Is[ids_2[0], ids_2[1]] += 1
        Is += Is.T
        Mk /= Is + 1e-8  # consensus matrix
        Mk += Mk.T  # Mk is upper triangular (with zeros on diagonal); make it symmetric
        Mk += np.eye(N)
        Is.fill(0)  # reset counter
        self.Mk = Mk
Example #13
    def _resample(self, tmp_env, frame_rate):
        return resample(tmp_env, frame_rate, self.sr)
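
Examples #13 and #15 use resample(signal, from_rate, to_rate) for sample-rate conversion; the argument order is taken from the call sites, while polyphase filtering is an implementation assumption:

import numpy as np
from math import gcd
from scipy.signal import resample_poly

def resample(sig, from_rate, to_rate):
    """Convert sig from from_rate Hz to to_rate Hz."""
    g = gcd(int(from_rate), int(to_rate))
    return resample_poly(np.asarray(sig, dtype=np.float64),
                         int(to_rate) // g, int(from_rate) // g)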
Example #14
def create_hdf5(series_list, output_dir, resample=False, max_series=1e5):
    hdf5_fh = h5py.File(os.path.join(output_dir, 'data.hdf5'), 'a')
    for group_name in ('series', 'aneurysm_masks'):
        if group_name not in hdf5_fh:
            hdf5_fh.create_group('/{}'.format(group_name))

    assert len(series_list) < 1e5, 'Too many series for 5-digit IDs.'
    for i, s in enumerate(series_list):
        if i >= max_series:
            break
        dset_path = '/series/{:05d}'.format(i + 1)
        if dset_path in hdf5_fh:
            continue
        print('Processing series {} from study {}...'.format(
            s.series_number, s.study_name))
        pixel_arrays = []
        is_valid_series = True
        for slice_name in tqdm(s.slice_names, total=len(s), unit=' slices'):
            # Process and write slices
            dcm_path = os.path.join(s.dcm_dir, slice_name + '.dcm')
            dcm = util.read_dicom(dcm_path)
            try:
                pixel_arrays.append(util.dcm_to_raw(dcm))
            except NotImplementedError:
                print('Unsupported image format, not converting study: {}'.
                      format(s.study_name))
                is_valid_series = False
                break
        if not is_valid_series:
            continue

        volume = np.stack(pixel_arrays)

        aneurysm_mask_path = os.path.join(s.dcm_dir, 'aneurysm_mask.npy')
        if os.path.exists(aneurysm_mask_path):
            s.aneurysm_mask_path = aneurysm_mask_path
            aneurysm_mask = np.transpose(np.load(s.aneurysm_mask_path),
                                         [2, 0, 1])
        else:
            s.aneurysm_mask_path = None
            aneurysm_mask = None

        assert aneurysm_mask is None or aneurysm_mask.shape == volume.shape, \
            'Mismatched aneurysm mask and volume shapes: {} and {}'.format(aneurysm_mask.shape, volume.shape)
        if len(s) > 0 and resample:
            util.print_err('Resampling volume... Shape before: {}'.format(
                volume.shape))
            tick = time.time()
            dcm = util.read_dicom(
                os.path.join(s.dcm_dir, s.slice_names[0] + '.dcm'))
            volume, real_scale = util.resample(volume, dcm.SliceThickness,
                                               dcm.PixelSpacing, (1.5, 1., 1.))
            util.print_err('Shape after: {}. Resample took {} s.'.format(
                volume.shape,
                time.time() - tick))
            if aneurysm_mask is not None:
                util.print_err(
                    'Resampling mask... Shape before: {}, count before: {}.'.
                    format(aneurysm_mask.shape, np.sum(aneurysm_mask > 0)))
                tick = time.time()
                aneurysm_mask, mask_scale = util.resample(
                    aneurysm_mask, dcm.SliceThickness, dcm.PixelSpacing,
                    (1.5, 1., 1.))
                util.print_err(
                    'Mask shape after: {}, count after: {}. Resample took {} s.'
                    .format(aneurysm_mask.shape, np.sum(aneurysm_mask > 0),
                            time.time() - tick))
                if not aneurysm_mask.any():
                    raise RuntimeError(
                        'Mask has zero volume after resampling.')

                if s.is_aneurysm:
                    # Recompute slice numbers where the aneurysm lives
                    s.aneurysm_bounds = get_aneurysm_range(aneurysm_mask)
                    s.aneurysm_ranges = [s.aneurysm_bounds]
                    s.absolute_range = [0, aneurysm_mask.shape[0]]

        # Create one dataset for the volume (int16), one for the mask (bool)
        s.dset_path = dset_path
        hdf5_fh.create_dataset(s.dset_path,
                               data=volume,
                               dtype='i2',
                               chunks=True)

        if aneurysm_mask is not None:
            s.aneurysm_mask_path = '/aneurysm_masks/{:05d}'.format(i + 1)
            hdf5_fh.create_dataset(s.aneurysm_mask_path,
                                   data=aneurysm_mask,
                                   dtype='?',
                                   chunks=True)

    # Print summary
    util.print_err('Series: {}'.format(len(hdf5_fh['/series'])))
    util.print_err('Aneurysm Masks: {}'.format(len(
        hdf5_fh['/aneurysm_masks'])))

    # Dump pickle and JSON (updated dset_path and mask_path attributes)
    util.print_err('Dumping pickle file...')
    with open(os.path.join(output_dir, 'series_list.pkl'), 'wb') as pkl_fh:
        pickle.dump(series_list, pkl_fh)
    util.print_err('Dumping JSON file...')
    with open(os.path.join(output_dir, 'series_list.json'), 'w') as json_file:
        json.dump([dict(series) for series in series_list],
                  json_file,
                  indent=4,
                  sort_keys=True,
                  default=util.json_encoder)

    # Clean up
    hdf5_fh.close()
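
In this example util.resample(volume, slice_thickness, pixel_spacing, target) regrids a CT volume to the target (z, y, x) spacing in millimeters and returns the volume plus a second value the caller binds as real_scale. A sketch with scipy.ndimage.zoom; because the output shape is rounded to integers, the realized zoom factor differs slightly from the requested one, which is presumably what the second return value reports (an inference):

import numpy as np
from scipy import ndimage

def resample(volume, slice_thickness, pixel_spacing, target=(1.5, 1., 1.)):
    spacing = np.array([float(slice_thickness),
                        float(pixel_spacing[0]), float(pixel_spacing[1])])
    factor = spacing / np.asarray(target, dtype=np.float64)
    new_shape = np.round(np.array(volume.shape) * factor)
    real_factor = new_shape / np.array(volume.shape)  # achievable zoom factor
    return ndimage.zoom(volume, real_factor, mode='nearest'), real_factor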
Example #15
def calc_stoi(clean_sig, bad_sig, fs_signal):
    if len(clean_sig) != len(bad_sig):
        raise ValueError('clean signal and bad signal must have the same length')

    x, y = np.array(clean_sig), np.array(bad_sig)

    fs = 10000
    N_frame = 256
    K = 512
    J = 15
    mn = 150
    H, _ = _thirdoct(fs, K, J, mn)
    N = 30
    Beta = -15
    dyn_range = 40

    if fs_signal != fs:
        x = util.resample(x, fs_signal, fs)
        y = util.resample(y, fs_signal, fs)

    x, y = _rm_silent_frame(x, y, dyn_range, N_frame, N_frame // 2)
    if len(x) <= 0:
        raise ValueError("Signal contains no speech fragments")

    x_hat = _stdft(x, N_frame, N_frame // 2, K)
    y_hat = _stdft(y, N_frame, N_frame // 2, K)

    x_hat = np.transpose(x_hat[:, 0:K // 2 + 1])
    y_hat = np.transpose(y_hat[:, 0:K // 2 + 1])

    X, Y = [], []

    for i in range(x_hat.shape[1]):
        X.append(np.sqrt(H.dot(np.abs(x_hat[:, i])**2)))
        Y.append(np.sqrt(H.dot(np.abs(y_hat[:, i])**2)))
    X = np.array(X)
    Y = np.array(Y)
    X = X.T
    Y = Y.T

    c = 10**(-Beta / 20.)

    score, count = 0., 0
    for m in range(N, X.shape[1] + 1):
        X_seg = X[:, m - N:m]
        Y_seg = Y[:, m - N:m]

        Y_square_sum = np.sum(np.square(Y_seg), axis=1)
        Y_square_sum[Y_square_sum <= 0] = np.finfo(np.float64).eps
        alpha = np.sqrt(np.sum(np.square(X_seg), axis=1) / Y_square_sum)
        alpha = np.reshape(alpha, [len(alpha), 1])
        aY_seg = Y_seg * np.tile(alpha, [1, N])

        for j in range(J):
            aX = X_seg[j, :] + X_seg[j, :] * c  # clip level; np.dot with a scalar is plain multiplication
            Y_prime = [min(x, y) for x, y in zip(aY_seg[j, :], aX)]
            Y_prime = np.array(Y_prime)
            s = _correlation_coefficient(X_seg[j, :], Y_prime)
            score += s
            count += 1

    score /= max(count, 1)

    return score
Example #16
    def get_ir(self, rank=False, by=None):
        ic_t, ic_h, ic_d = self.get_ic(rank=rank)
        return (util.resample(ic_t, how='ir', by=by),
                util.resample(ic_h, how='ir', by=by),
                util.resample(ic_d, how='ir', by=by))
Example #17
def train():
    TIMESTAMP = "{0:%Y-%m-%d-%H-%M/}".format(datetime.now())
    log.log_info('program start')
    data, num_good, num_bad = util.load_train_data(num_data // 2)
    log.log_debug('Data loading completed')

    # resample
    data, length = util.resample(data, 600)
    data = util.reshape(data, length)
    good_data_origin = data[:num_good, :]
    bad_data_origin = data[num_good:, :]

    # extract bad data for test and train
    permutation = list(np.random.permutation(len(bad_data_origin)))
    shuffled_bad_data = bad_data_origin[permutation, :]
    test_bad_data = shuffled_bad_data[:int(num_bad * 0.3), :]
    train_bad_data_origin = shuffled_bad_data[int(num_bad * 0.3):, :]
    # extract corresponding good data for test and train
    permutation = list(np.random.permutation(len(good_data_origin)))
    shuffled_good_data = good_data_origin[permutation, :]
    test_good_data = shuffled_good_data[:len(test_bad_data), :]
    train_good_data = shuffled_good_data[len(test_bad_data):, :]

    assert len(test_bad_data) == len(test_good_data)
    # construct test data
    test_y = np.array([1.] * len(test_good_data) + [0.] * len(test_bad_data), dtype=float).reshape(
        (len(test_bad_data) + len(test_good_data), 1))
    test_x = np.vstack((test_good_data, test_bad_data))

    # expand the number of bad data for train
    train_x = np.vstack((train_good_data, train_bad_data_origin))
    train_y = np.array([1.] * len(train_good_data) + [0.] * len(train_bad_data_origin), dtype=float).reshape(
        (len(train_bad_data_origin) + len(train_good_data), 1))

    train_x, train_y, num_expand = util.expand(train_x, train_y)

    # regularize
    for i in range(len(train_x)):
        train_x[i, :, 0] = util.regularize(train_x[i, :, 0])
        train_x[i, :, 1] = util.regularize(train_x[i, :, 1])
        train_x[i, :, 2] = util.regularize(train_x[i, :, 2])
    for i in range(len(test_x)):
        test_x[i, :, 0] = util.regularize(test_x[i, :, 0])
        test_x[i, :, 1] = util.regularize(test_x[i, :, 1])
        test_x[i, :, 2] = util.regularize(test_x[i, :, 2])

    # random
    train_x, train_y = util.shuffle_data(train_x, train_y)

    log.log_debug('prepare completed')
    log.log_info('convolution layers: ' + str(conv_layers))
    log.log_info('filters: ' + str(filters))
    log.log_info('full connected layers: ' + str(fc_layers))
    log.log_info('learning rate: %f' % learning_rate)
    log.log_info('keep prob: ' + str(keep_prob))
    log.log_info('the number of expanding bad data: ' + str(num_expand))
    log.log_info('mini batch size: ' + str(mini_batch_size))

    if mini_batch_size != 0:
        assert mini_batch_size <= len(train_x)

    cnn = Cnn(conv_layers, fc_layers, filters, learning_rate)
    (m, n_W0, n_C0) = train_x.shape
    n_y = train_y.shape[1]

    # construct the computation graph
    cnn.initialize(n_W0, n_C0, n_y)
    cost = cnn.cost()
    optimizer = cnn.get_optimizer(cost)
    predict, accuracy = cnn.predict()

    init = tf.global_variables_initializer()
    saver = tf.train.Saver()

    with tf.Session() as sess:

        # log for tensorboard
        merged = tf.summary.merge_all()
        train_writer = tf.summary.FileWriter("resource/tsb/train/" + TIMESTAMP, sess.graph)
        test_writer = tf.summary.FileWriter("resource/tsb/test/" + TIMESTAMP)

        if enable_debug:
            sess = tf_debug.LocalCLIDebugWrapperSession(sess)

        sess.run(init)

        for i in range(1, num_epochs + 1):
            if mini_batch_size != 0:
                num_mini_batches = int(m / mini_batch_size)
                mini_batches = util.random_mini_batches(train_x, train_y, mini_batch_size)

                cost_value = 0
                for mini_batch in mini_batches:
                    (mini_batch_x, mini_batch_y) = mini_batch
                    _, temp_cost = sess.run([optimizer, cost], feed_dict={cnn.x: mini_batch_x, cnn.y: mini_batch_y,
                                                                          cnn.keep_prob: keep_prob})
                    cost_value += temp_cost
                cost_value /= num_mini_batches
            else:
                _, cost_value = sess.run([optimizer, cost],
                                         feed_dict={cnn.x: train_x, cnn.y: train_y, cnn.keep_prob: keep_prob})

            # disable dropout
            summary_train, train_accuracy = sess.run([merged, accuracy],
                                                     feed_dict={cnn.x: train_x, cnn.y: train_y,
                                                                cnn.keep_prob: 1})
            summary_test, test_accuracy = sess.run([merged, accuracy],
                                                   feed_dict={cnn.x: test_x, cnn.y: test_y, cnn.keep_prob: 1})

            train_writer.add_summary(summary_train, i - 1)
            test_writer.add_summary(summary_test, i - 1)

            if print_detail and (i % 10 == 0 or i == 1):
                info = '\nIteration %d\n' % i + \
                       'Cost: %f\n' % cost_value + \
                       'Train accuracy: %f\n' % train_accuracy + \
                       'Test accuracy: %f' % test_accuracy
                log.log_info(info)

            # stop when test>0.95 and train>0.99
            if test_accuracy >= 0.95 and train_accuracy >= 0.99:
                info = '\nIteration %d\n' % i + \
                       'Cost: %f\n' % cost_value + \
                       'Train accuracy: %f\n' % train_accuracy + \
                       'Test accuracy: %f' % test_accuracy
                log.log_info(info)
                saver.save(sess, "resource/model/" + TIMESTAMP)
                break
            saver.save(sess, "resource/model/" + TIMESTAMP)
        train_writer.close()
        test_writer.close()

    log.log_info('program end')
Example #18
from util import resample, get_list_of_files
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

file_list = get_list_of_files('./', 'log')

bids = []
asks = []

for file in tqdm(file_list):
    bid, ask = resample(file, '1Min')
    bids.append(bid)
    asks.append(ask)

# DataFrame.append was removed in pandas 2.0; concatenate instead
main = pd.concat([pd.DataFrame(bid) for bid in bids])
main1 = pd.concat([pd.DataFrame(ask) for ask in asks])

main.sort_index(inplace=True)
main1.sort_index(inplace=True)

main.to_csv(r'bid_ohlc_1min.csv')
main1.to_csv(r'ask_ohlc_1min.csv')

# # PART 2
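
Here resample(file, '1Min') evidently parses one tick log and returns per-minute OHLC bars for bid and ask. A sketch under an assumed log layout (timestamp, bid, ask columns), which the original does not specify:

import pandas as pd

def resample(path, rule):
    # Hypothetical layout: one tick per line as "timestamp,bid,ask".
    ticks = pd.read_csv(path, names=['time', 'bid', 'ask'],
                        parse_dates=['time'], index_col='time')
    return ticks['bid'].resample(rule).ohlc(), ticks['ask'].resample(rule).ohlc()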
Example #19
    def get_ir(self, n=1, rank=False, by=None):
        return util.resample(self.get_ic(n=n, rank=rank), how='ir', by=by)