import numpy as np
import wfdb
from sklearn.preprocessing import MinMaxScaler

# PATH, x_data and denoise are expected to be defined at module level elsewhere.

def cut_series(data, time, fs):
    """Split a signal into windows of `time` seconds at sampling rate `fs`,
    scale each window to [0, 1] and append it to the global x_data list."""
    cut_length = time * fs
    number = int(len(data) / cut_length)
    for i in range(number):
        cut = data[i * cut_length:(i + 1) * cut_length].reshape(-1, 1)
        cut = MinMaxScaler(feature_range=(0, 1)).fit_transform(cut)
        x_data.append(cut.flatten())


def read_single_data(type, number):
    """Read a single record, denoise it, scale it to [0, 1] and append it to x_data."""
    file_path = PATH + type + number
    record = wfdb.rdrecord(file_path, channels=[0])  # read the record
    rdata = record.p_signal.flatten()
    data = denoise(rdata)  # remove noise
    new_data = np.array(data).reshape(-1, 1)
    new_data = MinMaxScaler(feature_range=(0, 1)).fit_transform(new_data)
    x_data.append(new_data.flatten())
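A minimal usage sketch for the two helpers above (hypothetical driver code, assuming it runs in the same module so that x_data, PATH and denoise resolve; the signal and sampling rate are synthetic stand-ins rather than a record read with wfdb):

import numpy as np

x_data = []                           # global list both helpers append to
fs = 360                              # assumed sampling rate in Hz
signal = np.random.randn(30 * fs)     # synthetic stand-in for a denoised channel
cut_series(signal, 10, fs)            # three non-overlapping 10-second windows
print(len(x_data), x_data[0].shape)   # -> 3 (3600,)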
Example 3
def rescaler(series,
             max_range_old,
             min_range_old,
             max_range_new,
             min_range_new,
             invert_sign=False):
    """Rescale a series while maintaining the distance of its highest and
    lowest values to the highest and lowest values of the desired range."""
    import pandas as pd
    from sklearn.preprocessing import MinMaxScaler

    min_series_old = series.min()
    max_series_old = series.max()
    # Half-width of the old and new ranges.
    minmax_to_mid_old = abs(max_range_old) - abs(
        (max_range_old + min_range_old) / 2)
    minmax_to_mid_new = abs(max_range_new) - abs(
        (max_range_new + min_range_new) / 2)
    # Calculate how far the series max and min are from the limits of the old range.
    diff_range_series_max_old = abs(max_range_old) - abs(max_series_old)
    diff_range_series_min_old = abs(min_range_old) - abs(min_series_old)

    frac_to_max = diff_range_series_max_old / minmax_to_mid_old
    frac_to_min = diff_range_series_min_old / minmax_to_mid_old

    diff_range_series_max_new = frac_to_max * minmax_to_mid_new
    diff_range_series_min_new = frac_to_min * minmax_to_mid_new

    max_series_new = max_range_new - diff_range_series_max_new
    min_series_new = min_range_new - diff_range_series_min_new

    # Scale the data to the derived target range.
    series = MinMaxScaler(
        feature_range=(min_series_new, max_series_new)).fit_transform(
            series.to_numpy().reshape(-1, 1))
    series = pd.Series(series.flatten())
    # Aridity has to be inverted, as a value of 1 in the files provided by
    # Wouter corresponds to -1 with respect to the values in his paper.
    if invert_sign:
        series = -series
    return series
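A small usage sketch, assuming rescaler is available as defined above: a series living in the old range [0, 1] is rescaled into [-1, 1] while its margins to the range edges are preserved.

import pandas as pd

s = pd.Series([0.2, 0.5, 0.9])        # values inside the old range [0, 1]
rescaled = rescaler(s,
                    max_range_old=1, min_range_old=0,
                    max_range_new=1, min_range_new=-1)
print(rescaled)                       # spans [-0.6, 0.8] instead of the full [-1, 1]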
Example 4
    def calc_pca(self,
                 write=True,
                 dom_data=(None, None),
                 explain_fa_dfilepath=None):
        """
        Perform Principal Component Analysis. The steps in this method follow
        https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5065523/
        Args:
            write: if True, write the output to a file.
            dom_data: (flag, data) tuple used when PCA is performed per domain.
                dom_data[0]: flag indicating whether domain data is present.
                dom_data[1]: the domain data subset.
            explain_fa_dfilepath: file path for writing factor analysis results per domain.
        Returns:
            scores: list of scores for towns in alphabetical order.
        """
        # Check if the function is acting on domain data, if True, then set Kaiser condition cutoff to 0.01,
        # else the cutoff is 1.0
        data = None
        cut_off = None
        if dom_data[0]:
            data = dom_data[1]
            cut_off = 1e-2
        else:
            data = pd.read_csv(self.data, index_col=0)
            cut_off = 1.0

        # Convert data to numpy array for further processing
        loaded_data = np.array(data)

        #find the number of variables
        num_var = loaded_data.shape[1]

        #calc is the cutoff to find which eigenvector correlations are significant
        calc = np.abs(np.sqrt(1 / num_var))

        #cov_mat is the covariance matrix of the data
        cov_mat = np.cov(loaded_data.T)

        #perform PCA on the covariance matrix
        self.pca = PCA()
        self.pca.fit(cov_mat)

        # Kaiser criterion: components with eigenvalues > cut_off should be retained
        selected_components = np.argwhere(
            self.pca.explained_variance_ > cut_off).flatten()
        selected_vr = self.pca.explained_variance_ratio_[selected_components]

        # second criterion : Among selected components, retain those with proportion of variance  > 10%
        mod_idxs = np.argwhere(selected_vr.flatten() > 0.1)
        selected_vr = selected_vr[selected_vr > 0.1]
        selected_components = selected_components[mod_idxs].flatten()

        # Check number of selected PCs
        # If selected PCs are less than 1 because both of the above conditions aren't met, change PCs to 1
        self.n = len(selected_components)
        if self.n < 1:
            self.n = 1

        # Weights assigned to every PC
        weights = np.exp(
            (np.log(self.pca.explained_variance_) -
             np.log(logsumexp(self.pca.explained_variance_[:self.n]))))

        # Choose only n selected PCs
        weights = weights[:self.n]

        # Keep only the eigenvector correlations that satisfy the indicator variable loading cutoff
        self.var_load = self.pca.components_.T[:, :self.n]
        self.var_load[np.abs(self.var_load) < calc] = 0

        # Calculate feature scores for every feature
        factor_scores = self.var_load @ weights

        # Calculate the health score for every town by multiplying the original data values by the feature scores.
        # The final health score for every town is a linear combination of these weighted values.
        health_status = np.array([loaded_data @ factor_scores]).T
        health_status = MinMaxScaler().fit_transform(health_status)
        health_status = health_status.flatten()

        # Round scores to 2 decimal places
        scores = [round(x, 2) for x in health_status]

        # Get town names
        towns = self.extract_towns()

        if write:
            # Combine the towns, health scores to write to a file
            health_scores = sorted(zip(towns, scores), key=lambda x: x[1])

            with open(self.output, "w", newline="") as f:
                writer = csv.writer(f)
                writer.writerow(['Town', 'Health Score'])
                writer.writerows(health_scores)

        return scores
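The component-selection logic above can be exercised in isolation. The sketch below uses synthetic data and an assumed cutoff of 1.0 to reproduce the Kaiser criterion followed by the 10% variance-proportion filter.

import numpy as np
from sklearn.decomposition import PCA

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 6))                    # synthetic observations x variables
pca = PCA().fit(np.cov(X.T))                     # PCA on the covariance matrix, as above

keep = np.argwhere(pca.explained_variance_ > 1.0).flatten()   # Kaiser criterion
keep = keep[pca.explained_variance_ratio_[keep] > 0.1]        # keep only >10% of variance
n = max(len(keep), 1)                            # fall back to a single component
print(n, pca.explained_variance_ratio_[:n])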
Example 5
        # Map the five class labels in cle onto a binary {-1, 1} labelling.
        s_rel, m_rel = sce, np.copy(cle)
        m_rel[m_rel == 2] = -1
        m_rel[m_rel == 1] = -1
        m_rel[m_rel == 3] = 1
        m_rel[m_rel == 4] = 1
        m_rel[m_rel == 0] = 1

        rel = performance(s_rel, m_rel)
        with open(outpath + 'ex_Loc_Rel.txt', "a") as f:
            print(alg, loc, rel, file=f)
        print("Locality: ", loc, ", Relativity: ", rel)

        # Colour the points after the training split by their normalised score.
        fig = plt.figure(1, figsize=(4, 4), dpi=100)
        sc = -1 / (1 + scores[training_len:])
        sc = sc.reshape(-1, 1)
        sc2 = MinMaxScaler().fit_transform(sc)
        plt.scatter(data[training_len:, 0], data[training_len:, 1], s=1, cmap='coolwarm', c=sc2.flatten())
        plt.colorbar(ticks=[np.min(sc2), np.max(sc2)])
        plt.title(alg + ' scores', fontsize=14)
        plt.savefig(outpath + 'ex' + alg + '.png')

        # Binary view: highlight the top 100 outliers.
        fig = plt.figure(2, figsize=(4, 4), dpi=100)
        plt.scatter(data[training_len:, 0], data[training_len:, 1], s=1, cmap='Reds', c=scbin.flatten())
        plt.colorbar(ticks=[np.min(scbin), np.max(scbin)])
        plt.title(alg + ' top 100 outliers', fontsize=14)
        plt.savefig(outpath + 'ex_bin_' + alg + '.png')

        # Crop the saved figure and paste it onto a white 400x400 canvas.
        img = Image.open(outpath + 'ex_bin_' + alg + '.png')
        left, top, right, bottom = 0, 0, 313, 400
        img_res = img.crop((left, top, right, bottom))
        img_fin = Image.new("RGB", (400, 400), (255, 255, 255))
        img_fin.paste(img_res, (left, top, right, bottom))
        img_fin.save(outpath + 'ex_bin_' + alg + '.png')
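A self-contained sketch of the score-colouring step above, with synthetic 2-D points and synthetic outlier scores standing in for data and scores: the negative-reciprocal transform plus MinMaxScaler maps arbitrary scores into [0, 1] for the colormap.

import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

rng = np.random.default_rng(0)
pts = rng.normal(size=(500, 2))                  # stand-in for data[training_len:]
raw_scores = rng.exponential(size=500)           # stand-in for the detector's scores

sc = -1 / (1 + raw_scores)                       # same transform as above
sc2 = MinMaxScaler().fit_transform(sc.reshape(-1, 1))
plt.scatter(pts[:, 0], pts[:, 1], s=1, cmap='coolwarm', c=sc2.flatten())
plt.colorbar(ticks=[sc2.min(), sc2.max()])
plt.show()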
Example 6
    def preprocess_df(self, df):
        # temp = df[['id','original_mql_at','joined_at','converted']]
        if self.config['metric'] == 'conversion time':
            df = df.loc[df['converted'] == True]

        df['join_days'] = df.apply(lambda x: abs(
            pd.Timedelta(x['joined_at'] - x['original_mql_at']).days)
                                   if x['converted'] == True else np.NaN,
                                   axis=1)
        days1 = df.loc[df['converted'] == True, 'join_days'].values
        fig, ax = plt.subplots(1, 1, figsize=(8, 6))
        ax = sns.kdeplot(days1)
        fig.savefig(
            os.path.join(self.dir_['result'], 'hist_conversion_days.png'))

        days1 = np.log(days1 + 1)

        fig, ax = plt.subplots(1, 1, figsize=(8, 6))
        ax = sns.kdeplot(days1)
        fig.savefig(
            os.path.join(self.dir_['result'], 'log_hist_conversion_days.png'))

        employees = df['number_of_employees'].values
        fig, ax = plt.subplots(1, 1, figsize=(8, 6))
        ax = sns.kdeplot(employees)
        fig.savefig(os.path.join(self.dir_['result'],
                                 'log_hist_employees.png'))

        df['plan cost'] = 39.0 + df['lead_signup_size'] * 6.0
        plancost = df['plan cost'].values
        fig, ax = plt.subplots(1, 1, figsize=(8, 6))
        ax = sns.kdeplot(plancost)
        fig.savefig(os.path.join(self.dir_['result'], 'log_hist_plancost.png'))
        # to_log = ['join_days','number_of_employees', 'plan cost', 'lead_signup_size']
        # to_log = ['join_days']

        # df[to_log] = df[to_log].applymap(lambda x: np.log(x + 1))

        #log transform the data,

        # cats = pd.qcut(days1, 4)
        # print pd.value_counts(cats)
        #
        # cats = pd.cut(days1, 4)
        # print pd.value_counts(cats)
        # sys.exit(0)

        days1 = days1[:, np.newaxis]
        days1_idx = df.loc[df['converted'] == True, 'id'].values

        days1 = MinMaxScaler().fit_transform(days1)
        days1 = days1.flatten()

        # Bin the scaled conversion times into quartiles and invert the labels so
        # that faster conversions receive higher class values (4 = fastest quartile).
        days1 = pd.qcut(days1, 4, labels=False)
        days1 = 4 - days1
        # Non-converted leads get class 0.
        days0_idx = df.loc[df['converted'] == False, 'id'].values
        y0 = np.zeros(days0_idx.shape[0])
        y = np.concatenate([days1, y0])
        idx = np.concatenate([days1_idx, days0_idx])
        temp2 = pd.DataFrame({'id': idx, 'Class': y})
        df = pd.merge(left=df,
                      right=temp2.set_index('id', drop=True),
                      left_on=['id'],
                      how='left',
                      right_index=True)

        df['Class'] = df['Class'].astype(int)
        df = self.process_x(df)

        return df
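The class-label construction at the end of preprocess_df can be illustrated on its own; the sketch below uses hypothetical conversion times to show how pd.qcut plus the 4 - label inversion assigns the highest class to the fastest conversions.

import numpy as np
import pandas as pd

days = np.array([1, 3, 7, 20, 45, 90])       # hypothetical conversion times in days
labels = pd.qcut(days, 4, labels=False)      # quartile labels 0 (fastest) .. 3 (slowest)
classes = 4 - labels                         # invert: 4 = fastest quartile, 1 = slowest
print(pd.DataFrame({'days': days, 'class': classes}))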