Ejemplo n.º 1
0
def cp_tune(dataT):
    """Smooth a PCA-derived signal against two blended chromosome covariates.

    The correlation matrix of the columns of ``dataT`` is reduced to two
    principal components.  Their explained-variance ratios, renormalised to
    sum to one, serve as mixing weights for

    * the response: first-component scores blended with the column means
      of ``dataT``;
    * two predictors: blends of ``mnorm(exp(1 - gene_density))`` and
      ``chr_len`` with the weights in either order.

    ``gene_density``, ``chr_len``, ``mnorm``, ``PCA`` and ``loess_1d`` are
    taken from the enclosing module.

    Parameters
    ----------
    dataT : ndarray
        2-D array; its columns are the variables that get correlated.
        NOTE(review): the column count presumably has to match the length of
        ``gene_density`` / ``chr_len`` -- confirm against the caller.

    Returns
    -------
    tuple
        ``(GX, LX)`` -- element ``[1]`` of the ``loess_1d`` result for each
        of the two predictors (``frac=2/3``).
    """
    reducer = PCA(n_components=2)
    scores = reducer.fit_transform(np.corrcoef(dataT.T))

    # Renormalise so the two weights sum to exactly one.
    ratios = reducer.explained_variance_ratio_[0:2]
    ratios = ratios / np.sum(ratios)
    w_first, w_second = ratios[0], ratios[1]

    def blend(density_weight, length_weight):
        # Weighted mix of the transformed gene density and chromosome length.
        return (density_weight * mnorm(np.exp(1 - gene_density))
                + length_weight * chr_len)

    response = w_second * scores[:, 0] + w_first * np.mean(dataT, axis=0)
    predictor_a = blend(w_first, w_second)
    predictor_b = blend(w_second, w_first)

    smooth_a = loess_1d(predictor_a, response, frac=2 / 3.0)[1]
    smooth_b = loess_1d(predictor_b, response, frac=2 / 3.0)[1]
    return (smooth_a, smooth_b)
Ejemplo n.º 2
0
def test_loess_1d():
    """
    Usage example for loess_1d

    Draws a noisy sine curve with a fixed random seed, perturbs roughly 10%
    of the samples to act as outliers, smooths the result with ``loess_1d``
    and plots the noisy data, the fit, the true curve and the points whose
    returned weight is below 0.34.

    """
    n_samples = 200
    noise_sigma = 0.4
    np.random.seed(123)

    x = np.sort(np.random.uniform(-1, 1, n_samples))
    truth = np.sin(3 * x)
    noisy = np.random.normal(truth, noise_sigma)

    # Perturb ~10% of the samples; indices are drawn with replacement, so
    # fewer distinct points than `n_outliers` may actually be hit.
    n_outliers = int(n_samples * 0.1)
    hit = np.random.randint(0, n_samples, n_outliers)
    noisy[hit] += np.random.normal(0, 5 * noise_sigma, n_outliers)

    xout, yout, fit_weights = loess_1d(x, noisy, frac=0.3)

    # Low weights mark the points treated as outliers in the plot.
    flagged = fit_weights < 0.34

    plt.clf()
    plt.plot(x, noisy, 'ro', label='Noisy')
    plt.plot(xout, yout, 'b', linewidth=4, label='LOESS')
    plt.plot(x, truth, color='limegreen', linewidth=4, label='True')
    plt.plot(x[flagged], noisy[flagged], '+k', ms=20, label='Outliers')
    plt.legend(loc='lower right')
    plt.pause(1)
Ejemplo n.º 3
0
    def find_outlires(self):
        """Replace outliers in ``self.dfToClean['DiffElLoad']`` by a LOESS fit.

        The rows are processed in groups: first the rows whose ``WorkType``
        equals ``'WorkDay'``, then all remaining rows; within each of those,
        one group per (``Year``, ``Month``).  For every group

        1. ``DiffElLoad`` is smoothed against ``Time`` with
           ``loess_1d.loess_1d(..., frac=0.2)``;
        2. residuals (observed minus smoothed) are computed;
        3. the 15th/85th residual percentiles ``qL``/``qH`` define the band
           ``[qL - 2*(qH - qL), qH + 2*(qH - qL)]``;
        4. rows whose residual falls outside the band get their
           ``DiffElLoad`` in ``self.dfToClean`` overwritten with the
           smoothed value.

        The total number of replaced values is printed at the end.

        Rows are addressed by index label (``.loc``); the removed
        ``DataFrame.ix`` accessor is no longer used.
        NOTE(review): this assumes the index labels of ``self.dfToClean``
        are unique -- confirm.
        """
        # clear by work types, working and other
        work_types_flags = ['WorkDay', 'no']
        # multiplier applied to the inter-percentile range
        coef_conf = 2.0
        countOfReplace = 0

        for wt in work_types_flags:
            # select the rows of the current work type; boolean .loc returns
            # a copy, so later writes to self.dfToClean do not feed back into
            # the groups still to be fitted
            is_workday = self.dfToClean['WorkType'] == work_types_flags[0]
            if wt == work_types_flags[0]:
                workingDays = self.dfToClean.loc[is_workday, :]
            else:
                workingDays = self.dfToClean.loc[~is_workday, :]

            for yr in workingDays.Year.unique():
                myDf = workingDays.loc[workingDays['Year'] == yr, :]

                for mn in myDf['Month'].unique():
                    # rows of the current month in the current year
                    month_df = myDf.loc[myDf['Month'] == mn, :]
                    observed = month_df['DiffElLoad'].values

                    # get results from loess fitting
                    xout, yout, weigts = loess_1d.loess_1d(
                        month_df['Time'].values, observed, frac=0.2)
                    smoothed = np.asarray(yout)

                    # residuals of the fit relative to the initial data
                    residuals = observed - smoothed

                    # acceptance band built from the 15th/85th percentiles
                    qL, qH = np.percentile(residuals, [15, 85])
                    my_iqr = qH - qL
                    lower_conf = qL - (coef_conf * my_iqr)
                    top_conf = qH + (coef_conf * my_iqr)

                    # replace every value whose residual is outside the band
                    outlier_mask = (residuals < lower_conf) | (residuals > top_conf)
                    if outlier_mask.any():
                        outlier_ids = month_df.index[outlier_mask]
                        self.dfToClean.loc[outlier_ids, 'DiffElLoad'] = \
                            smoothed[outlier_mask]
                        countOfReplace += int(outlier_mask.sum())

        print('Count of all outliers = ', countOfReplace)
Ejemplo n.º 4
0
def bleach_fit(brange, frange, intensity, fitter):
    """Fit decay in intensity for bleach correction.

    Parameters
    ----------
    brange : array_like of int
        Indices into ``intensity`` used to fit the decay model.  The
        ``'linear'`` branch calls ``brange.reshape``, so an ndarray is
        required there.
    frange : ndarray
        Positions at which the ``'linear'`` and ``'exponential'`` models are
        evaluated.
    intensity : sequence
        Intensity values, indexable by the entries of ``brange``.
    fitter : str
        One of ``'linear'`` (ridge regression, ``alpha=1000``),
        ``'exponential'`` (``curve_fit`` on ``exp_func``) or ``'loess'``.

    Returns
    -------
    ndarray
        Correction factors ``pred[0] / pred``: the first predicted value
        divided by every predicted value.

    Raises
    ------
    ValueError
        If the chosen model cannot be fitted (the underlying error is
        chained as ``__cause__``), or if ``fitter`` is not one of the
        supported names.
    """
    intensity_values = np.array([intensity[x] for x in brange])

    # Choose type of decay
    if fitter == 'linear':
        # Fitting regularized linear model
        reg = linear_model.Ridge(alpha=1000, fit_intercept=True)
        try:
            reg.fit(brange.reshape(-1, 1), intensity_values.reshape(-1, 1))
        except Exception as err:
            raise ValueError('Fit not found - try a larger range') from err
        pred = reg.predict(frange.reshape(-1, 1))

    elif fitter == 'exponential':
        # Fitting exponential model
        guess = (intensity[0], 0.001, 0)
        try:
            popt, _ = curve_fit(exp_func, brange, intensity_values, p0=guess)
        except Exception as err:
            raise ValueError('Fit not found - try a larger range') from err
        pred = exp_func(frange, *popt)

    elif fitter == 'loess':
        # Fitting loess model
        # NOTE(review): with xnew=None the fit is evaluated at `brange`,
        # not at `frange` like the other two branches -- confirm intended.
        try:
            _, pred, _ = loess_1d.loess_1d(brange,
                                           intensity_values,
                                           xnew=None,
                                           degree=1,
                                           frac=0.5,
                                           npoints=None,
                                           rotate=False,
                                           sigy=None)
        except Exception as err:
            raise ValueError('Fit not found - try a larger range') from err

    else:
        # Previously an unknown name fell through and failed later with an
        # UnboundLocalError on `pred`.
        raise ValueError(
            "Unknown fitter %r - expected 'linear', 'exponential' or 'loess'"
            % (fitter,))

    # Bleach corrected intensity values
    corr = np.divide(pred[0], pred)

    return corr
Ejemplo n.º 5
0
# Benchmark fragment: fits the same (x, y) data with four smoothers, prints
# the elapsed time of each and plots all results together.  `x`, `y`, `x0`,
# `f` and the first `t0` are expected to be defined before this point.

# Kernel ridge regression (scikit-learn), evaluated on the grid x0.
# NOTE(review): `degree` is presumably ignored for kernel='rbf' -- confirm.
clf = KernelRidge(kernel='rbf', gamma=0.1, degree=5)
clf.fit(x[:,None], y)
f_kernelridge = clf.predict(x0[:,None])
print("Scikit-Learn: ", time()-t0)

# Lowess GitHub library
t0 = time()
f_lowess = lowess(x, y, x0, deg=2, l=0.5)
print("Lowess GitHub library: ", time()-t0)

# Statsmodels: returns an array whose first column is x and second column is
# the smoothed value; note the (y, x) argument order.
t0 = time()
res = statslowess(y, x, return_sorted=True, frac=0.1, it=0)
x_stats = res[:,0]
f_stats = res[:,1]
print("Statsmodels: ", time()-t0)

# Loess from PyPI
# NOTE(review): other call sites in this file pass the evaluation grid as
# `xnew`; confirm that the installed loess_1d accepts `x0`.
t0 = time()
x_loess, f_loess, w_loess = loess_1d(x, y, degree=2, frac=0.1, x0=x0)
print("Loess for PyPI: ", time()-t0)

# Overlay raw samples, ground truth f(x0) and every smoother's output.
plt.plot(x, y, '.', markersize=1)
plt.plot(x0, f(x0), '--', label='Ground truth')
plt.plot(x_stats, f_stats, label='Statsmodels')
plt.plot(x0, f_lowess, label='LOWESS')
plt.plot(x0, f_loess, label='LOESS')
plt.plot(x0, f_kernelridge, label='Kernel Ridge')
plt.legend()
plt.show()
Ejemplo n.º 6
0
from loess.loess_1d import loess_1d

# Compare statsmodels' `lowess` with `loess_1d` on an unevenly sampled noisy
# sine wave, then save and display the figure.
np.random.seed(1234)

# Generate some data
x = np.arange(0, 10, 0.1)
y = np.sin(x) + 0.2 * np.random.randn(len(x))

# Eliminate some, so that we don't have equal sampling distances
cur_ind = np.where((x > 5) & (x < 6))
x_space = np.delete(x, cur_ind)
y_space = np.delete(y, cur_ind)
plt.plot(x_space, y_space, '.', label='rawdata')

# Smooth the data with Lowess, from the package "statsmodels"
# (columns of the result are x and the smoothed y)
smoothed = lowess(y_space, x_space, frac=0.1)
index, data = smoothed.T
plt.plot(index, data, label='lowess')

# Smooth with Loess, from the package "loess"
x_out, y_out, weights = loess_1d(x_space, y_space, frac=0.1)
plt.plot(x_out, y_out, label='loess')
plt.legend()

# Save and show the image
out_file = 'loess.jpg'
# The `quality` keyword of savefig was removed from matplotlib; the JPEG
# quality is passed to Pillow through `pil_kwargs` instead.
plt.savefig(out_file, dpi=200, pil_kwargs={'quality': 90})
print(f'Image saved to {out_file}')

plt.show()
Ejemplo n.º 7
0
# Script fragment: smooths per-facet currents along z with LOESS and writes a
# tab-separated table.  `facet_currents`, `z_centroids`, `zmin`, `zmax`, `l`,
# `r` and `I_OML` are expected to be defined before this point.
print("Number of points: {}".format(len(facet_currents)))

# Optional third command-line argument: grid spacing in z.
if len(sys.argv) > 3:
    dz = float(sys.argv[3])
else:
    dz = 0.0001

# LOESS span expressed as a fraction of `l`.
# NOTE(review): `l` is presumably the total length zmax - zmin -- confirm.
window = 7.5e-3
frac = window / l
# z0 = np.arange(zmin, zmax + dz/2, dz)
z0 = np.linspace(zmin, zmax, int(l / dz) + 1, endpoint=True)
# res = lowess(facet_currents, z_centroids, it=0, return_sorted=True, frac=frac)
# Is = res[:,1]
# NOTE(review): other call sites in this file pass the evaluation grid as
# `xnew`; confirm that the installed loess_1d accepts `x0`.
z_loess, Is, w_loess = loess_1d(z_centroids,
                                facet_currents,
                                degree=2,
                                frac=frac,
                                x0=z0)

# Scale both the smoothed and the raw values by the circumference 2*pi*r
# (in place).
Is *= 2 * np.pi * r
facet_currents *= 2 * np.pi * r

# Output file path is the second command-line argument.
with open(sys.argv[2], 'w') as file:
    file.write("#:xaxis\tz\n")
    file.write("#:name\tz\tOML\tI\tg\ti\n")
    file.write("#:units\tm\tA/m\tA/m\tdimensionless\tA/m\n")

    # NOTE(review): zip stops at the shortest input; z0 (regular grid) and
    # facet_currents (one entry per facet) may differ in length and are not
    # necessarily aligned row by row -- confirm.
    for z, facet_current, I in zip(z0, facet_currents, Is):
        file.write("{}\t{}\t{}\t{}\t{}\n".format(z, I_OML, I, I / I_OML,
                                                 facet_current))
    def run_loraccs(self, ref_img_band, tgt_img_band, band_num, band_name,
                    band_max_spectra, loess_frac, tgt_img_fp, outdir):
        '''
        Runs the LORACCS method.

        For every non-zero value of the target band, the co-located non-zero
        reference values are collected and averaged (after trimming the
        lowest and highest 2.5%).  A LOESS curve (degree 2) is fitted through
        those means and evaluated over the whole spectral range; the full
        target band read from ``tgt_img_fp`` is then remapped through that
        curve.

        Side effects: changes the working directory to ``outdir``, writes
        ``<band>_2d_hist.png``, ``<band>_df.csv`` and
        ``<band>_LORACCS_full_spectra_plot.png`` there, and shows both
        figures.

        Parameters
        ----------
        ref_img_band, tgt_img_band : ndarray
            2-D arrays indexed ``[row][pixel]``; zeros are ignored.
            NOTE(review): integer dtype is presumably required because the
            unique target values are passed to ``range`` -- confirm.
        band_num : int
            Band index passed to ``read`` on the opened target image.
        band_name : str
            Used in plot titles and output file names.
        band_max_spectra : int
            Upper limit applied to reference values, the spectral range and
            the fitted values.
        loess_frac : float
            ``frac`` argument of ``loess_1d``.
        tgt_img_fp : str
            Path of the full target image, opened with ``rasterio``.
        outdir : str
            Output directory.

        Returns
        -------
        ndarray
            ``uint16`` band array transformed by the LORACCS method.
        '''

        os.chdir(outdir)

        # Plot 2d histogram
        index = (ref_img_band > 0) & (tgt_img_band > 0)
        ref_img_band_sub = ref_img_band[index]
        tgt_img_band_sub = tgt_img_band[index]

        plt.hist2d(
            tgt_img_band_sub,
            ref_img_band_sub,
            bins=200,
            cmin=5,
            cmap=plt.cm.jet,
        )
        plt.colorbar()
        plt.title('%s Band 2D Histogram' % band_name)
        plt.xlabel('Target')
        plt.ylabel('Reference')
        save_fig = '%s_2d_hist.png' % band_name
        plt.savefig(save_fig)
        plt.show()

        ### Extract spectral values into a dict

        # Get unique values from target image
        tgt_uniq = np.unique(tgt_img_band)

        if 0 in tgt_uniq:
            tgt_uniq = tgt_uniq[tgt_uniq != 0]

        # One list of reference values per unique target value
        counts_dict = dict()
        for uniq in tgt_uniq:
            counts_dict[uniq] = []

        img_rows = range(0, tgt_img_band.shape[0])
        img_row_pixel = range(0, tgt_img_band.shape[1])

        for band_row in img_rows:  # iterate through rows
            for pixel in img_row_pixel:  # iterate through pixels
                tgt_val = tgt_img_band[band_row][pixel]
                ref_val = ref_img_band[band_row][pixel]
                if tgt_val != 0 and ref_val != 0:
                    # Every entry is a list, so a plain append suffices (the
                    # former bare try/except around it could never trigger).
                    counts_dict[tgt_val].append(ref_val)

        # Generate stats
        if max(tgt_uniq) < band_max_spectra:
            spec_range = list(range(min(tgt_uniq), max(tgt_uniq)))
        else:
            spec_range = list(range(min(tgt_uniq), band_max_spectra))

        print('Maxiumum spectral value being set to: ', max(spec_range))

        stats_df = pd.DataFrame()
        stats_df['Spec_vals'] = spec_range
        # Float from the start: means are written into this column below, and
        # assigning floats into an integer column is deprecated in pandas.
        stats_df['Mean'] = 0.0
        #stats_df['Std'] = std
        stats_df['Pixels'] = 0

        for uniq in tgt_uniq:
            values = np.array(counts_dict[uniq])

            if len(values) > 5:
                # Subset out values to get rid of outliers
                sub = np.sort(values)
                sub = sub[sub < band_max_spectra]
                val_sub = sub[int(len(sub) * .025):int(len(sub) * .975)]
                mean = np.mean(val_sub)
                stats_df.loc[stats_df['Spec_vals'] == uniq, 'Mean'] = mean
                stats_df.loc[stats_df['Spec_vals'] == uniq,
                             'Pixels'] = len(values)

        # Remove all NaN
        stats_df = stats_df.fillna(0)
        stats_df_valid = stats_df[stats_df.Mean != 0]
        # Remove entries with pixel count less than 6
        stats_df_valid = stats_df_valid[stats_df_valid.Pixels > 5]

        ### Create model

        # Set up params for LOESS: fit on the valid rows, evaluate on the
        # whole spectral range
        x = stats_df_valid.Spec_vals.values

        xnew = stats_df.Spec_vals.values

        y = stats_df_valid.Mean.values

        # Run LOESS
        xout, yout, wout = loess_1d(x,
                                    y,
                                    xnew=xnew,
                                    frac=loess_frac,
                                    degree=2,
                                    rotate=False)

        # Save values into the dataframe
        stats_df['Mean_LOESS'] = yout

        # Remove any bad LOESS values (rare)
        stats_df = stats_df[
            stats_df['Mean_LOESS'].values < band_max_spectra].copy()
        stats_df = stats_df[stats_df['Mean_LOESS'].values != 0].copy()

        # Save the data to CSV
        stats_df.to_csv('%s_df.csv' % band_name, index=False)

        ### Plot result of LORACCS along with histogram
        fig, ax = plt.subplots(nrows=1, figsize=(6, 4))

        for_plot = stats_df.copy()
        for_plot = for_plot[for_plot['Pixels'] != 0]

        x = for_plot['Spec_vals'].values
        y1 = for_plot['Mean_LOESS'].values
        y2 = for_plot['Pixels'].values
        y3 = for_plot['Mean'].values

        # Plot histogram
        ax.bar(x, y2, width=1, color='lightgray')
        gray_patch = mpatches.Patch(color='lightgray', label='Histogram')

        # Set plot to have two y axes
        ax2 = ax.twinx()

        # Original target values as a scatterplot
        ax2.scatter(x,
                    y3,
                    color='tab:gray',
                    marker='.',
                    label='Mean Reference')

        #LORACCS regression line
        ax2.plot(x,
                 y1,
                 color='tab:orange',
                 label='LORACCS Target',
                 linewidth=2)

        # Fix tick marks
        ylabs = ax2.get_yticks()
        ax2.yaxis.tick_left()
        ax2.set_yticklabels(ylabs, fontsize=13)
        ax2.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))

        y2labs = ax.get_yticks()

        ax.yaxis.tick_right()
        ax.set_yticklabels(y2labs, fontsize=13)
        ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))

        xlabs = ax2.get_xticks()
        ax2.set_xticklabels(xlabs, fontsize=13)
        ax2.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))

        ax.set_title('LORACCS Model: %s Band' % band_name, fontsize=20)
        ax.set_xlabel('Target Spectral Values', fontsize=15)

        ax.yaxis.set_label_position('right')
        ax.set_ylabel('Reference Histogram', fontsize=15)

        ax2.yaxis.set_label_position('left')
        ax2.set_ylabel('Reference Spectral Values', fontsize=15)

        ax.legend(fontsize=12, loc='upper left', handles=[gray_patch])
        ax2.legend(fontsize=12, loc='lower right')

        save_fig = '%s_LORACCS_full_spectra_plot.png' % band_name
        plt.savefig(save_fig)
        plt.show()

        ### Transform image using filled-in LORACCS function

        # Read the requested band of the target image as a numpy array; the
        # context manager closes the dataset once the band is in memory.
        with rasterio.open(tgt_img_fp) as full_tgt_img:
            data = full_tgt_img.read(band_num)

        spec_vals_dict = dict(zip(stats_df.Spec_vals, stats_df.Mean_LOESS))

        # Change the data type in preparation for changing values
        data = data.astype('float32')

        # Loop through spectral values, replace with new value / 100000.  Division
        # necessary so already replaced values are not overwritten
        for spec_val in spec_vals_dict:
            data[data == spec_val] = spec_vals_dict[spec_val] / 100000

        # Multiply by 100000 to restore proper values, return dtype
        data = data * 100000
        data = data.astype('uint16')

        return data  # Returns band array transformed by the LORACCS method
Ejemplo n.º 9
0
    def run_loraccs(self, ref_img_band, tgt_img_band, band_num, band_name, 
                    band_max_spectra, loess_frac, tgt_img_fp, outdir):
        
        '''
        Runs the LORACCS method.

        For every value of the target band, the co-located non-zero reference
        values are collected and averaged (after trimming the lowest and
        highest 2.5%).  A LOESS curve (degree 2) is fitted through the means
        of values seen more than five times; spectral values without a fitted
        mean are filled by straight-line interpolation between the nearest
        fitted neighbours; the full target band read from ``tgt_img_fp`` is
        then remapped through the filled curve.

        Side effects: changes the working directory to ``outdir``, writes
        ``<band>_2d_hist.png``, ``<band>_df.csv``,
        ``<band> full spectra.csv`` and
        ``<band>_LORACCS_full_spectra_plot.png`` there, and shows both
        figures.

        Parameters
        ----------
        ref_img_band, tgt_img_band : ndarray
            2-D arrays indexed ``[row][pixel]``; zeros are ignored.
        band_num : int
            Band number passed to ``GetRasterBand``.
        band_name : str
            Used in plot titles and output file names.
        band_max_spectra : int
            Upper limit applied to reference values, fitted values and the
            filled spectral range.
        loess_frac : float
            ``frac`` argument of ``loess_1d``.
        tgt_img_fp : str
            Path of the full target image, opened with ``gdal``.
        outdir : str
            Output directory.

        Returns
        -------
        ndarray
            ``uint16`` band array transformed by the LORACCS method.
        '''
        
        os.chdir(outdir)

        # Plot 2d histogram
        index = (ref_img_band>0)&(tgt_img_band>0)
        ref_img_band_sub = ref_img_band[index]
        tgt_img_band_sub = tgt_img_band[index]

        plt.hist2d(tgt_img_band_sub, ref_img_band_sub, bins=200, cmin = 5, cmap=plt.cm.jet, )
        plt.colorbar()
        plt.title('%s Band 2D Histogram' %band_name)
        plt.xlabel('Target')
        plt.ylabel('Reference')
        save_fig = '%s_2d_hist.png' %band_name
        plt.savefig(save_fig)
        plt.show()   

        ### Extract spectral values into a dict

        # Get unique values from target image; one list of reference values
        # per unique target value
        tgt_uniq = np.unique(tgt_img_band)
        counts_dict = dict()
        for uniq in tgt_uniq:
            counts_dict[uniq] = []

        img_rows = range(0, tgt_img_band.shape[0])
        img_row_pixel = range(0, tgt_img_band.shape[1])

        for band_row in img_rows:       # iterate through rows
            for pixel in img_row_pixel: # iterate through pixels
                tgt_val = tgt_img_band[band_row][pixel]
                ref_val = ref_img_band[band_row][pixel]
                if tgt_val != 0 and ref_val != 0:
                    # Every entry is a list, so a plain append suffices (the
                    # former bare try/except around it could never trigger).
                    counts_dict[tgt_val].append(ref_val)

        # Generate stats
        for uniq in tgt_uniq:
            values = np.array(counts_dict[uniq])
            
            pixels = len(values)

            # Subset out values to get rid of outliers
            sub = np.sort(values)
            sub = sub[sub < band_max_spectra]
            val_sub = sub[int(len(sub) * .025) : int(len(sub) * .975)]

            try:
                mean = np.mean(val_sub)
                std = np.std(val_sub)
            except Exception:
                # Best-effort fallback to the untrimmed values; narrowed from
                # a bare `except` so KeyboardInterrupt/SystemExit propagate.
                print('Exception used')
                mean = np.mean(counts_dict[uniq])
                std = np.std(counts_dict[uniq]) 

            new_dict = {'values' : counts_dict[uniq], 'mean' : mean, 'std' : std, 'pixels' : pixels}
            counts_dict[uniq] = new_dict  

        # Create pandas DataFrame of values
        spec_vals = tgt_uniq
        mean = [counts_dict[uniq]['mean'] for uniq in tgt_uniq]
        std = [counts_dict[uniq]['std'] for uniq in tgt_uniq]
        pix = [counts_dict[uniq]['pixels'] for uniq in tgt_uniq]

        stats_df = pd.DataFrame()
        stats_df['Spec_vals'] = spec_vals
        stats_df['Mean'] = mean
        stats_df['Std'] = std
        stats_df['Pixels'] = pix
        # Remove all NaN
        stats_df = stats_df.fillna(0)
        stats_df_valid = stats_df[stats_df.Mean != 0]
        # Remove entries with pixel count less than 6.  Take an explicit copy:
        # a column is added below, and assigning into a filtered slice is
        # unreliable (SettingWithCopy).
        stats_df_valid = stats_df_valid[stats_df_valid.Pixels > 5].copy()

        ### Create model
        
        # Set up params for LOESS
        x = stats_df_valid.Spec_vals.values
        y = stats_df_valid.Mean.values

        # Run LOESS
        xout, yout, wout = loess_1d(x, y, frac=loess_frac, degree=2, rotate=False)

        # Save values into the dataframe
        stats_df_valid['Mean_LOESS'] = yout
        
        # Remove any bad LOESS values (rare)
        stats_df_valid = stats_df_valid[stats_df_valid['Mean_LOESS'].values < band_max_spectra].copy()
        stats_df_valid = stats_df_valid[stats_df_valid['Mean_LOESS'].values != 0].copy()
        
        # Save the data to CSV
        stats_df_valid.to_csv('%s_df.csv' %band_name, index=False)

        # Fill gaps in spectra
        min_spectra = min(stats_df_valid.Spec_vals.values)
        max_spectra = max(stats_df_valid.Spec_vals.values)

        if max_spectra > band_max_spectra:
            reasonable_spec_vals = stats_df_valid[stats_df_valid['Spec_vals'] < band_max_spectra]
            max_spectra = reasonable_spec_vals['Spec_vals'].values[-1]

        print('Maxiumum spectral value being set to: ', max_spectra)

        spectral_range = range(int(min_spectra), int(max_spectra+1))

        full_spectra = pd.DataFrame()
        full_spectra['Spec_vals'] = spectral_range
        full_spectra = full_spectra.merge(stats_df_valid, how='left', on='Spec_vals')
        full_spectra.drop(['Std'], axis=1, inplace=True)
        full_spectra.rename(columns={'Mean':'Org_Mean'}, inplace=True)
        full_spectra['Missing'] = pd.isna(full_spectra['Mean_LOESS']) # Identify missing spectral values
                
        all_y_values = []

        # Predict missing spectral values.  The first and last rows of
        # full_spectra are fitted values taken from stats_df_valid, so the
        # neighbour searches below stop inside the frame.
        for item in range(0, len(full_spectra)):
                        
            if full_spectra['Missing'].iloc[item]:

                # Find nearest fitted value before the gap
                n = item - 1
                while full_spectra['Missing'].iloc[n]:
                    n = n - 1

                x1 = full_spectra['Spec_vals'].iloc[n]
                y1 = full_spectra['Mean_LOESS'].iloc[n]  

                # Find nearest fitted value after the gap
                n = item + 1
                while full_spectra['Missing'].iloc[n]:
                    n = n + 1

                x2 = full_spectra['Spec_vals'].iloc[n]
                y2 = full_spectra['Mean_LOESS'].iloc[n]  

                # Predict new spectra value using the equation of a line between points
                new_x = full_spectra['Spec_vals'].iloc[item]
                new_y = self.get_new_spec_val(x1, x2, y1, y2, new_x)

            else: 
                new_y = full_spectra['Mean_LOESS'].iloc[item]
            
            all_y_values.append(new_y)
        
        full_spectra['Filled_LOESS']=all_y_values
        full_spectra.fillna(0, inplace=True)
        
        ### Write full spectra data frame to csv
        full_spectra.to_csv('%s full spectra.csv' %band_name, index=False)

        ### Plot result of LORACCS along with histogram
        fig, ax = plt.subplots(nrows=1, figsize=(6,4))

        # Only the spectral values that had a fitted mean are plotted
        for_plot = full_spectra.copy()
        for_plot = for_plot[~for_plot['Missing']]

        x=for_plot['Spec_vals'].values
        y1=for_plot['Filled_LOESS'].values
        y2=for_plot['Pixels'].values
        y3=for_plot['Org_Mean'].values

        # Plot histogram
        ax.bar(x, y2, width=1, color='lightgray')
        gray_patch = mpatches.Patch(color='lightgray', label='Histogram')

        # Set plot to have two y axes
        ax2 = ax.twinx()

        # Original target values as a scatterplot 
        ax2.scatter(x, y3, color='tab:gray', marker='.', label='Mean Reference')

        #LORACCS regression line
        ax2.plot(x, y1, color='tab:orange', label='LORACCS Target', linewidth=2)

        # Fix tick marks
        ylabs = ax2.get_yticks()
        ax2.yaxis.tick_left()
        ax2.set_yticklabels(ylabs, fontsize=13)
        ax2.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))

        y2labs = ax.get_yticks()

        ax.yaxis.tick_right()
        ax.set_yticklabels(y2labs, fontsize=13)
        ax.yaxis.set_major_formatter(FormatStrFormatter('%.0f'))

        xlabs = ax2.get_xticks()
        ax2.set_xticklabels(xlabs, fontsize=13)
        ax2.xaxis.set_major_formatter(FormatStrFormatter('%.0f'))

        ax.set_title('LORACCS Model: %s Band' %band_name, fontsize=20)
        ax.set_xlabel('Target Spectral Values', fontsize=15)

        ax.yaxis.set_label_position('right')
        ax.set_ylabel('Reference Histogram', fontsize=15)        

        ax2.yaxis.set_label_position('left')
        ax2.set_ylabel('Reference Spectral Values', fontsize=15)

        ax.legend(fontsize=12, loc='upper left', handles=[gray_patch])
        ax2.legend(fontsize=12, loc='lower right')

        save_fig = '%s_LORACCS_full_spectra_plot.png' %band_name
        plt.savefig(save_fig)
        plt.show()

        ### Transform image using filled-in LORACCS function

        # Read in target image
        full_tgt_img = gdal.Open(tgt_img_fp)

        # Get bands
        band_data = full_tgt_img.GetRasterBand(band_num)

        # Read in as numpy arrays
        data = gdal_array.BandReadAsArray(band_data)

        spec_vals_dict = dict(zip(full_spectra.Spec_vals, full_spectra.Filled_LOESS))

        # Change the data type in preparation for changing values
        data = data.astype('float')

        # Loop through spectral values, replace with new value / 100000.  Division
        # necessary so already replaced values are not overwritten
        for spec_val in spec_vals_dict:   
            data[data == spec_val] = spec_vals_dict[spec_val] / 100000

        # Multiply by 100000 to restore proper values, return dtype
        data = data*100000
        data = data.astype('uint16')

        return data # Returns band array transformed by the LORACCS method
Ejemplo n.º 10
0
    def preOutlierDetection(self, frame: pd.DataFrame, options: dict) -> dict:
        """
        This function utilizes the loess method to strip the seasonality from
        the target column and determine a trend. Based on the difference
        between the trend and the actual target column, outliers are 
        identified as a function of the options['outlierStdevMultiplier']
        value, which should be an int or a float.

        A row is an outlier when the absolute difference between its target
        value and the loess trend exceeds ``mean + multiplier * stdev`` of
        those differences. Interior outliers are replaced in X_INTERPOLATED
        by the mean of their two neighbours; an outlier in the first or last
        row is replaced by the trend value instead.

        The frame is modified in place. The helper columns X_INDEX, X_TREND
        and X_TREND_DIFF are removed again before returning.

        NOTE(review): the neighbour lookup (index - 1 / index + 1) and the
        way the trend is appended assume a default 0..n-1 integer index with
        all rows that have a null target at the end -- confirm.

        Parameters
        ----------
        frame : pd.DataFrame
            pandas dataframe that includes the data to be forecast
        options : dict
            dictionary that includes at least 'seasonalityBandwidth', 
            'targetColumn', 'outlierStdevMultiplier'

        Returns
        -------
        dict
            Will return with two keys, 'frame' which will be the original
            pandas dataframe but now with the X_INTERPOLATED and X_OUTLIER
            columns, and 'options', the value of which is whatever dictionary
            was originally passed through the options parameter.

        """
        targetColumn = options['targetColumn']

        frame['X_INDEX'] = frame.index.values
        frame['X_INTERPOLATED'] = frame[targetColumn]

        # split the data into past/future based on null in target column
        nullIdx = frame[targetColumn].isnull()
        futureData = frame[nullIdx]
        historicalData = frame[~nullIdx]

        x = np.asarray(historicalData['X_INDEX'].tolist())
        y = np.asarray(historicalData[params.getParam('targetColumn',
                                                      options)].tolist())
        bandwidth = params.getParam('seasonalityBandwidth', options)
        xout, yout, weights = lo.loess_1d(x, y, frac=bandwidth, degree=2)

        # trend for the historical rows, followed by the (null) target values
        # of the future rows so the column has one entry per row
        frame['X_TREND'] = np.append(
            yout, np.asarray(futureData[targetColumn].tolist()))
        frame['X_TREND_DIFF'] = frame[targetColumn] - frame['X_TREND']

        stdev = frame['X_TREND_DIFF'].std()
        avg = frame['X_TREND_DIFF'].mean()

        mult = params.getParam('outlierStdevMultiplier', options)

        # Outliers are identified by the number of standard deviations
        # (``mult``) from the mean. This is not the mean of the target column
        # itself but the mean of the difference between the target column and
        # the loess trend computed above.
        threshold = avg + mult * stdev
        lastIndex = frame.shape[0] - 1

        frame['X_OUTLIER'] = 0
        for index in frame.index:
            diff = abs(frame.loc[index, 'X_TREND_DIFF'])
            if diff > threshold:
                # strictly interior rows only: the last row has no successor,
                # so it must take the trend fallback like the first row
                if 0 < index < lastIndex:
                    replacement = mean([
                        frame.loc[index - 1, 'X_INTERPOLATED'],
                        frame.loc[index + 1, 'X_INTERPOLATED']
                    ])
                else:
                    replacement = frame.loc[index, 'X_TREND']
                # single-step .loc writes; chained frame[col][index] = ...
                # assignments may silently write to a temporary copy
                frame.loc[index, 'X_INTERPOLATED'] = replacement
                frame.loc[index, 'X_OUTLIER'] = 1

        # drop() returns a new frame unless inplace=True; the previous call
        # discarded its result and therefore removed nothing
        frame.drop(columns=['X_TREND', 'X_TREND_DIFF', 'X_INDEX'],
                   inplace=True)

        fdict = dict()
        fdict['frame'] = frame
        fdict['options'] = options

        return fdict
Ejemplo n.º 11
0
    def prepare(self, frame: pd.DataFrame, options: dict) -> dict:
        """
        Prepare the data for forecasting.

        Steps, in order:

        1. Optionally (``options['scalePredictors']``) min-max scale every
           predictor column into a new ``X_<name>`` column and point
           ``options['predictorColumns']`` at the scaled columns.
        2. Cast target and predictor columns to float.
        3. Create ``X_<target>``, a copy of ``X_INTERPOLATED`` (when present)
           or of the target column in which exact zeros are replaced by a
           very small random number; ``options['targetColumn']`` is updated
           to that new column.
        4. Build the historical / future / evaluation (hold-out) row masks.
        5. Fit a loess trend on the full history and add the ``X_TREND``,
           ``X_TREND_DIFF`` and ``X_TREND_RATIO`` columns.

        Both ``frame`` and ``options`` are modified in place.

        Parameters
        ----------
        frame : pd.DataFrame
            pandas dataframe that includes all the information necessary to
            forecast
        options : dict
            dictionary that includes at least the keys 'scalePredictors',
            'predictorColumns', 'targetColumn', 'numHoldoutRows', and
            'seasonalityBandwidth'

        Returns
        -------
        dict
            dictionary with the keys 'historicalIdx', 'futureIdx' and
            'evalIdx' (row masks for the training, forecasting and hold-out
            periods, measured by the index of the dataframe), 'frame' (the
            dataframe with the additions above) and 'options'

        """
        # fixed seed so the zero replacements are reproducible
        random.seed(158923)
        targetColumn = params.getParam('targetColumn', options)

        frame['X_INDEX'] = frame.index.values

        # scale predictors between 0 and 1
        try:
            scaledCols = []
            if params.getParam('scalePredictors', options):
                for predCol in params.getParam('predictorColumns', options):
                    scaledName = 'X_' + predCol
                    colMin = frame[predCol].min()
                    colMax = frame[predCol].max()
                    frame[scaledName] = (frame[predCol] - colMin) / (colMax - colMin)
                    scaledCols.append(scaledName)
                options['predictorColumns'] = scaledCols
        except Exception as e:
            print("Unable to scale predictors: ", e)

        # ensure predictors and target are float
        frame[targetColumn] = frame[targetColumn].astype(float)
        predictorCols = params.getParam('predictorColumns', options)
        frame[predictorCols] = frame[predictorCols].astype(float)

        newTargetColumn = 'X_' + targetColumn

        # if we have done outlier detection there will be an interpolated
        # column that has the interpolated actuals
        if 'X_INTERPOLATED' in frame:
            sourceColumn = 'X_INTERPOLATED'
        else:
            sourceColumn = params.getParam('targetColumn', options)

        # copy of the target with exact zeros replaced by a very small number
        frame[newTargetColumn] = [
            value if value != 0.0 else random.random() / 1E5
            for value in frame[sourceColumn]
        ]

        options['targetColumn'] = newTargetColumn

        # split the data into past/future based on null in target column;
        # the full history is used for trending/smoothing (but NOT modeling)
        lastNonNullIdx = self.lastNonNullIndex(frame[targetColumn])
        fullHistoricalData = frame[frame['X_INDEX'] <= lastNonNullIdx]
        numHoldoutRows = params.getParam('numHoldoutRows', options)
        fullFutureData = frame[frame['X_INDEX'] > lastNonNullIdx]

        # history minus hold-out is what gets stored for future modeling
        trainCutoff = lastNonNullIdx - numHoldoutRows
        historicalIdx = frame['X_INDEX'] <= trainCutoff
        futureIdx = frame['X_INDEX'] > trainCutoff

        if numHoldoutRows > 0:
            # hold-out rows: after the training cutoff, up to and including
            # the last non-null row
            holdoutEnd = trainCutoff + numHoldoutRows
            evalIdx = [
                idx > trainCutoff and idx <= holdoutEnd
                for idx in frame['X_INDEX']
            ]
        else:
            evalIdx = historicalIdx

        x = np.asarray(fullHistoricalData['X_INDEX'].tolist())
        y = np.asarray(fullHistoricalData[newTargetColumn].tolist())
        bandwidth = params.getParam('seasonalityBandwidth', options)
        xout, yout, weights = lo.loess_1d(x, y, frac=bandwidth, degree=2)

        # trend for the historical rows followed by the target values of the
        # future rows, so the column has one entry per row
        frame['X_TREND'] = np.append(
            yout, np.asarray(fullFutureData[targetColumn].tolist()))
        #for use with additive seasonality?
        frame['X_TREND_DIFF'] = frame[targetColumn] - frame['X_TREND']
        #for use with multiplicative seasonality?
        frame['X_TREND_RATIO'] = frame[targetColumn] / frame['X_TREND']

        return {
            'historicalIdx': historicalIdx,
            'futureIdx': futureIdx,
            'evalIdx': evalIdx,
            'frame': frame,
            'options': options,
        }
Ejemplo n.º 12
0
# Script fragment (Python 2: uses `xrange`).  `samples`, `chrm`, `sd`,
# `mnorm` and `loess_1d` are expected to be defined before this point.

# One row per sample, one column per entry of `chrm`.
distArray = np.zeros((samples, len(chrm)))

# 23 per-chromosome values.
# NOTE(review): units/source of the densities are not shown here -- confirm.
gene_density = np.array([
    7.86, 4.87, 5.20, 3.77, 4.62, 5.86, 5.37, 4.36, 5.30, 5.27, 9.16, 7.37,
    2.65, 5.37, 5.33, 8.67, 13.68, 3.29, 22.53, 8.22, 4.43, 8.15, 5.19
])
# 23 per-chromosome lengths (presumably in base pairs -- confirm).
chr_len = np.array([249250621,243199373,198022430,191154276,180915260,171115067,159138663,146364022,141213431,135534747,135006516,133851895,115169878,107349540, \
   102531392,90354753,81195210,78077248,59128983,63025520,48129895,51304566,155270560])

# Rescale the lengths by 1e6 before normalising.
chr_len = chr_len / 1E6

# Both arrays are passed through the out-of-view helper `mnorm`.
gene_density = mnorm(gene_density)

chr_len = mnorm(chr_len)

# Smooth chromosome length against gene density; `z` holds the full return
# value of loess_1d (not unpacked here).
z = loess_1d(gene_density, chr_len, frac=2. / 3)

# One coordinate file per sample: <sd>/csn_<k>_coor.txt for k = 1..samples.
file_list = [
    sd + '/csn_' + str(_) + '_coor.txt' for _ in xrange(1, samples + 1)
]

for findex, file in enumerate(file_list):
    #print findex
    G_coor = np.loadtxt(file)
    chrcoords = [[] for _ in chrm]
    chrcenter = np.zeros((len(chrm), 3))
    for i in xrange(len(node_label)):
        if str(i) in cnodes:
            chrcounts[node_label[i]] += 1
            chrcenter[node_label[i]] += G_coor[i, :]
    for i in xrange(len(chrm)):