def resample(self):
        """
        """

        # Start with the minority class
        underx = self.x[self.y == self.minc]
        undery = self.y[self.y == self.minc]

        # Import the k-NN classifier
        from sklearn.neighbors import NearestNeighbors

        # Create a k-NN to fit the whole data
        nn_obj = NearestNeighbors(n_neighbors=self.size_ngh)

        # Fit the whole dataset
        nn_obj.fit(self.x)

        idx_to_exclude = []
        # Loop over all the classes and edit each of them
        for key in self.ucd.keys():

            # Get the sample of the current class
            sub_samples_x = self.x[self.y == key]

            # Get the samples associated
            idx_sub_sample = np.nonzero(self.y == key)[0]

            # Find the NN for the current class
            nnhood_idx = nn_obj.kneighbors(sub_samples_x, return_distance=False)

            # Check whether each neighbour shares the label of the
            # current class
            nnhood_label = (self.y[nnhood_idx] == key)

            # Flag the samples whose neighbourhood is not unanimous
            # (AND across the neighbours, then negate)
            nnhood_bool = np.logical_not(np.all(nnhood_label, axis=1))

            # For the minority class, remove the offending neighbours rather
            # than the minority samples themselves
            if key == self.minc:
                # Get the index to exclude
                idx_to_exclude += nnhood_idx[np.nonzero(nnhood_label[np.nonzero(nnhood_bool)])].tolist()
            else:
                # Get the index to exclude
                idx_to_exclude += idx_sub_sample[np.nonzero(nnhood_bool)].tolist()

        # Create a vector with the sample to select
        sel_idx = np.ones(self.y.shape)
        sel_idx[idx_to_exclude] = 0

        # Get the samples from the majority classes
        sel_x = np.squeeze(self.x[np.nonzero(sel_idx), :])
        sel_y = self.y[np.nonzero(sel_idx)]

        underx = concatenate((underx, sel_x), axis=0)
        undery = concatenate((undery, sel_y), axis=0)

        if self.verbose:
            print("Under-sampling performed: " + str(Counter(undery)))

        return underx, undery
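This filter closely resembles what imbalanced-learn now ships as
NeighbourhoodCleaningRule. A minimal usage sketch with the modern
fit_resample API (the data set here is made up for illustration):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import NeighbourhoodCleaningRule

# Toy imbalanced data: roughly a 9:1 class ratio
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                           random_state=0)
ncr = NeighbourhoodCleaningRule(n_neighbors=3)
X_res, y_res = ncr.fit_resample(X, y)
print(Counter(y), "->", Counter(y_res))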
    def resample(self):
        """
        """

        # Start with the minority class
        underx = self.x[self.y == self.minc]
        undery = self.y[self.y == self.minc]

        # Import the K-NN classifier
        from sklearn.neighbors import KNeighborsClassifier

        # Loop over the other classes and under-sample each of them
        for key in self.ucd.keys():

            # Skip the minority class
            if key == self.minc:
                continue

            # Randomly draw n_seeds_S samples from the current majority class
            maj_sample = sample(self.x[self.y == key],
                                self.n_seeds_S)

            # Create the set C
            C_x = np.append(self.x[self.y == self.minc],
                            maj_sample,
                            axis=0)
            C_y = np.append(self.y[self.y == self.minc],
                            [key] * self.n_seeds_S)

            # Create the set S
            S_x = self.x[self.y == key]
            S_y = self.y[self.y == key]

            # Create a k-NN classifier
            knn = KNeighborsClassifier(n_neighbors=self.size_ngh,
                                       **self.kwargs)

            # Fit C into the knn
            knn.fit(C_x, C_y)

            # Classify on S
            pred_S_y = knn.predict(S_x)

            # Keep the samples of S that the k-NN misclassified
            sel_x = np.squeeze(S_x[np.nonzero(pred_S_y != S_y), :])
            sel_y = S_y[np.nonzero(pred_S_y != S_y)]

            underx = concatenate((underx, sel_x), axis=0)
            undery = concatenate((undery, sel_y), axis=0)

        if self.verbose:
            print("Under-sampling performed: " + str(Counter(undery)))

        return underx, undery
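This seed-and-misclassify step is the condensed-nearest-neighbour stage
used by One-Sided Selection. A minimal sketch with the modern
imbalanced-learn API (parameter values are illustrative):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import OneSidedSelection

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                           random_state=0)
oss = OneSidedSelection(n_seeds_S=1, random_state=0)
X_res, y_res = oss.fit_resample(X, y)
print(Counter(y), "->", Counter(y_res))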
Example #4
 def evaluate_performance(self):
     # make a prediction
     X = self.test_X  # np.expand_dims(self.test_X, axis=-1)
     yhat = self.model_type.value.predict(X)
     test_X = self.test_X.reshape((self.test_X.shape[0], self.test_X.shape[2]))
     # invert scaling for forecast
     inv_yhat = np.concatenate((yhat, test_X[:, 1:]), axis=1)
     inv_yhat = self.transformer.inverse_transform(inv_yhat)
     inv_yhat = inv_yhat[:, 0]
     # invert scaling for actual
     test_y = self.test_y.reshape((len(self.test_y), 1))
     inv_y = np.concatenate((test_y, test_X[:, 1:]), axis=1)
     inv_y = self.transformer.inverse_transform(inv_y)
     inv_y = inv_y[:, 0]
     # calculate RMSE
     rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
     logging.debug('Test RMSE: %.3f' % rmse)
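The concatenate-then-invert dance is needed because a scaler fitted on
the full feature matrix can only invert arrays of the same width. A
self-contained sketch of the pattern (the shapes and column layout are
made up):

import numpy as np
from sklearn.preprocessing import MinMaxScaler

data = np.random.rand(100, 3)                  # column 0 is the target
scaler = MinMaxScaler().fit(data)
scaled = scaler.transform(data)

yhat_scaled = scaled[:, :1]                    # stand-in for predictions
# Re-attach the remaining scaled columns so the width matches the
# scaler, invert, then keep only the target column.
inv = scaler.inverse_transform(
    np.concatenate((yhat_scaled, scaled[:, 1:]), axis=1))
yhat = inv[:, 0]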
    def resample(self):
        """
        ???

        :return:
        """

        # Create the clustering object
        from sklearn.cluster import KMeans
        kmeans = KMeans(random_state=self.rs)
        kmeans.set_params(**self.kwargs)

        # Start with the minority class
        underx = self.x[self.y == self.minc]
        undery = self.y[self.y == self.minc]

        # Loop over the other classes and under-sample each of them
        for key in self.ucd.keys():
            # Skip the minority class.
            if key == self.minc:
                continue

            # Set the number of clusters to be no more than the number of
            # samples
            if self.ratio * self.ucd[self.minc] > self.ucd[key]:
                n_clusters = self.ucd[key]
            else:
                n_clusters = int(self.ratio * self.ucd[self.minc])

            # Set the number of clusters and find the centroids
            kmeans.set_params(n_clusters=n_clusters)
            kmeans.fit(self.x[self.y == key])
            centroids = kmeans.cluster_centers_

            # Concatenate to the minority class
            underx = concatenate((underx, centroids), axis=0)
            undery = concatenate((undery, ones(n_clusters) * key), axis=0)

        if self.verbose:
            print("Under-sampling performed: " + str(Counter(undery)))

        return underx, undery
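A minimal sketch of the same idea with the modern imbalanced-learn API
(ClusterCentroids and fit_resample are from current releases; the data
set is made up):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import ClusterCentroids

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                           random_state=0)
cc = ClusterCentroids(random_state=0)
X_res, y_res = cc.fit_resample(X, y)
print(Counter(y), "->", Counter(y_res))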
    def resample(self):
        """
        ...
        """

        # Start with the minority class
        underx = self.x[self.y == self.minc]
        undery = self.y[self.y == self.minc]

        # Loop over the other classes, picking samples at random
        for key in self.ucd.keys():
            # Skip the minority class
            if key == self.minc:
                continue

            # Set the ratio to be no more than the number of samples available
            if self.ratio * self.ucd[self.minc] > self.ucd[key]:
                num_samples = self.ucd[key]
            else:
                num_samples = int(self.ratio * self.ucd[self.minc])

            # Pick some elements at random
            seed(self.rs)
            if self.replacement:
                indx = randint(low=0, high=self.ucd[key], size=num_samples)
            else:
                indx = sample(range((self.y == key).sum()), num_samples)

            # Concatenate to the minority class
            underx = concatenate((underx, self.x[self.y == key].iloc[indx]),
                                 axis=0)
            undery = concatenate((undery, self.y[self.y == key].iloc[indx]),
                                 axis=0)

        if self.verbose:
            print("Under-sampling performed: " + str(Counter(undery)))

        return underx, undery
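A minimal sketch with the modern imbalanced-learn equivalent (the
RandomUnderSampler class and fit_resample are from current releases):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                           random_state=0)
rus = RandomUnderSampler(replacement=False, random_state=0)
X_res, y_res = rus.fit_resample(X, y)
print(Counter(y), "->", Counter(y_res))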
Example #9
def calc_returns(split_data):
    '''
    Calculate annual returns for periods optimized over slices (of size
    HINDSIGHT) of past data. Gives an idea of what kind of results to
    realistically expect.
    '''
    annual_returns = []
    max_return = float('-inf')
    min_return = float('inf')
    for i in range(2, len(split_data)):
        test_year = split_data[i]
        optimize_period = pd.concat(split_data[i - HINDSIGHT:i])
        print('optimize period:')
        print(optimize_period)
        periods = optimize(optimize_period)
        print('periods:')
        print(periods)
        profit = run_analysis(periods, test_year)
        annual_returns.append(profit)
        if profit > max_return: max_return = profit
        if profit < min_return: min_return = profit
    return annual_returns, max_return, min_return
    def gen_feature_dict(self):
        if not self.train_file or not self.test_file:
            raise Exception("provide file for train and test sets")
        if not self.numeric_cols:
            raise Exception("provide which columns are numeric")

        self.df_train = pd.read_csv(self.train_file)
        self.df_test = pd.read_csv(self.test_file)
        df = pd.concat([self.df_train, self.df_test])

        for col in df.columns:
            if col in self.ignore_cols:
                continue
            if col in self.numeric_cols:
                self.feature_to_type[col] = 'numeric'
            else:
                le = LabelEncoder()
                le.fit(df[col])
                self.feature_to_encoder[col] = le
                self.feature_to_type[col] = 'cat'
            self.columns.append(col)
        return self.df_train, self.df_test
def main():
    parser = argparse.ArgumentParser(
        description="Reads benchmark_results filenames from rados bench and" \
        " plots the results"
    )
    parser.add_argument("--paths",
                        nargs="+",
                        required=False,
                        default=["bench_results.txt"],
                        help="The path of a file(s)")
    args = parser.parse_args()
    paths = args.paths
    if len(paths) <= 1:
        plt = plot_bench_results(paths[0])
    else:
        dfs = []
        for path in paths:
            print("path", path)
            df = bench_results_to_df(path)
            df = add_rolling_results(df)
            df["filename"] = path
            dfs.append(df)
        master_df = pd.concat(dfs, axis=1)
        master_df.plot(x="sec", y="MA_30s_ops")
    def resample(self):
        """
        Main method of all children classes.

        :return: Over-sampled data set.
        """

        # Start by separating minority class features and target values.
        minx = self.x[self.y == self.minc]
        miny = self.y[self.y == self.minc]

        # If regular SMOTE is to be performed
        if self.kind == 'regular':
            # Print if verbose is true
            if self.verbose:
                print("Finding the %i nearest neighbours..." % self.k, end="")

            # Look for the k nearest neighbours, excluding, of course, the
            # point itself.
            self.nearest_neighbour_.fit(minx)

            # Matrix with the k nearest neighbours' indexes for each minority
            # element.
            nns = self.nearest_neighbour_.kneighbors(minx,
                                                     return_distance=False)[:, 1:]

            # Print status if verbose is true
            if self.verbose:
                print("done!")

                # Creating synthetic samples
                print("Creating synthetic samples...", end="")

            # --- Generating synthetic samples
            # Use the static method make_samples to generate the minority
            # samples
            sx, sy = self.make_samples(x=minx,
                                       nn_data=minx,
                                       y_type=self.minc,
                                       nn_num=nns,
                                       n_samples=int(self.ratio * len(miny)),
                                       step_size=1.0,
                                       random_state=self.rs,
                                       verbose=self.verbose)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            ret_x = concatenate((self.x, sx), axis=0)
            ret_y = concatenate((self.y, sy), axis=0)

            return ret_x, ret_y

        if (self.kind == 'borderline1') or (self.kind == 'borderline2'):

            if self.verbose:
                print("Finding the %i nearest neighbours..." % self.m, end="")

            # Find the NNs for all samples in the data set.
            self.nearest_neighbour_.fit(self.x)

            if self.verbose:
                print("done!")

            # Boolean array with True for minority samples in danger
            danger_index = [self.in_danger(x, self.y, self.m, miny[0],
                            self.nearest_neighbour_) for x in minx]

            # Turn into a numpy array
            danger_index = asarray(danger_index)

            # If all minority samples are safe, return the original data set.
            if not any(danger_index):
                if self.verbose:
                    print('There are no samples in danger. No borderline '
                          'synthetic samples created.')

                # All are safe, nothing to be done here.
                return self.x, self.y

            # If we got here, it is because some samples are in danger; we
            # need to find the NNs among the minority class to create the new
            # synthetic samples.
            #
            # We start by changing the number of NNs to consider from m + 1
            # to k + 1
            self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour_.fit(minx)

            # Indexes of the k nearest minority neighbours of each sample in
            # danger
            nns = self.nearest_neighbour_.kneighbors(minx[danger_index],
                                                     return_distance=False)[:, 1:]

            # The borderline-1 and borderline-2 variants diverge here
            if self.kind == 'borderline1':
                # Create synthetic samples for borderline points.
                sx, sy = self.make_samples(minx[danger_index],
                                           minx,
                                           miny[0],
                                           nns,
                                           int(self.ratio * len(miny)),
                                           random_state=self.rs,
                                           verbose=self.verbose)

                # Concatenate the newly generated samples to the original data set
                ret_x = concatenate((self.x, sx), axis=0)
                ret_y = concatenate((self.y, sy), axis=0)

                return ret_x, ret_y

            else:
                # Split the number of synthetic samples between only minority
                # (type 1), or minority and majority (with reduced step size)
                # (type 2).
                np.random.seed(self.rs)

                # The fraction is sampled from a beta distribution centered
                # around 0.5 with variance ~0.01
                fractions = betavariate(alpha=10, beta=10)

                # Only minority
                sx1, sy1 = self.make_samples(minx[danger_index],
                                             minx,
                                             self.minc,
                                             nns,
                                             fractions * (int(self.ratio * len(miny)) + 1),
                                             step_size=1,
                                             random_state=self.rs,
                                             verbose=self.verbose)

                # Only majority with smaller step size
                sx2, sy2 = self.make_samples(minx[danger_index],
                                             self.x[self.y != self.minc],
                                             self.minc, nns,
                                             (1 - fractions) * int(self.ratio * len(miny)),
                                             step_size=0.5,
                                             random_state=self.rs,
                                             verbose=self.verbose)

                # Concatenate the newly generated samples to the original data set
                ret_x = np.concatenate((self.x, sx1, sx2), axis=0)
                ret_y = np.concatenate((self.y, sy1, sy2), axis=0)

                return ret_x, ret_y

        if self.kind == 'svm':
            # The SVM SMOTE model fits a support vector machine classifier to
            # the data and uses the support vectors to provide a notion of the
            # boundary, unlike regular SMOTE, where that notion relies on the
            # proportion of nearest neighbours belonging to each class.

            # Fit the SVM to the full data
            self.svm_.fit(self.x, self.y)

            # Find the support vectors and their corresponding indexes
            support_index = self.svm_.support_[self.y[self.svm_.support_] == self.minc]
            support_vector = self.x[support_index]

            # First, find the nn of all the samples to identify samples in danger
            # and noisy ones
            if self.verbose:
                print("Finding the %i nearest neighbours..." % self.m, end="")

            # As usual, fit a nearest neighbour model to the data
            self.nearest_neighbour_.fit(self.x)

            if self.verbose:
                print("done!")

            # Now, get rid of noisy support vectors

            # Boolean array with True for noisy support vectors
            noise_bool = []
            for x in support_vector:
                noise_bool.append(self.is_noise(x, self.y, self.minc,
                                                self.nearest_neighbour_))

            # Turn into an array
            noise_bool = asarray(noise_bool)

            # Remove noisy support vectors
            support_vector = support_vector[np.logical_not(noise_bool)]

            # Find which support vectors are in danger (interpolation) or not
            # (extrapolation)
            danger_bool = [self.in_danger(x,
                                          self.y,
                                          self.m,
                                          self.minc,
                                          self.nearest_neighbour_)
                           for x in support_vector]

            # Turn into an array
            danger_bool = asarray(danger_bool)

            # Safe support vectors are those not in danger
            safety_bool = np.logical_not(danger_bool)

            if self.verbose:
                print("Out of {0} support vectors, {1} are noisy, "
                      "{2} are in danger "
                      "and {3} are safe.".format(support_vector.shape[0],
                                                 noise_bool.sum().astype(int),
                                                 danger_bool.sum().astype(int),
                                                 safety_bool.sum().astype(int)
                                                 )
                      )

                # Proceed to find support vectors NNs among the minority class
                print("Finding the %i nearest neighbours..." % self.k, end="")

            self.nearest_neighbour_.set_params(**{'n_neighbors': self.k + 1})
            self.nearest_neighbour_.fit(minx)

            if self.verbose:
                print("done!")
                print("Creating synthetic samples...", end="")

            # Split the number of synthetic samples between interpolation and
            # extrapolation

            # The fractions are sampled from a beta distribution with mean
            # 0.5 and variance ~0.01
            np.random.seed(self.rs)
            fractions = betavariate(alpha=10, beta=10)

            # Interpolate samples in danger
            if (np.count_nonzero(danger_bool) > 0):
                nns = self.nearest_neighbour_.kneighbors(support_vector[danger_bool],
                                                         return_distance=False)[:, 1:]

                sx1, sy1 = self.make_samples(support_vector[danger_bool],
                                             minx,
                                             self.minc, nns,
                                             fractions * (int(self.ratio * len(minx)) + 1),
                                             step_size=1,
                                             random_state=self.rs,
                                             verbose=self.verbose)

            # Extrapolate safe samples
            if (np.count_nonzero(safety_bool) > 0):
                nns = self.nearest_neighbour_.kneighbors(support_vector[safety_bool],
                                                         return_distance=False)[:, 1:]
                
                sx2, sy2 = self.make_samples(support_vector[safety_bool],
                                             minx,
                                             self.minc, nns,
                                             (1 - fractions) * int(self.ratio * len(minx)),
                                             step_size=-self.out_step,
                                             random_state=self.rs,
                                             verbose=self.verbose)

            if self.verbose:
                print("done!")

            # Concatenate the newly generated samples to the original data set
            if ((np.count_nonzero(danger_bool) > 0) and
                    (np.count_nonzero(safety_bool) > 0)):
                ret_x = concatenate((self.x, sx1, sx2), axis=0)
                ret_y = concatenate((self.y, sy1, sy2), axis=0)
            # No support vectors are in danger
            elif np.count_nonzero(danger_bool) == 0:
                ret_x = concatenate((self.x, sx2), axis=0)
                ret_y = concatenate((self.y, sy2), axis=0)
            # All the support vectors are in danger
            elif np.count_nonzero(safety_bool) == 0:
                ret_x = concatenate((self.x, sx1), axis=0)
                ret_y = concatenate((self.y, sy1), axis=0)

            return ret_x, ret_y
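The three branches above correspond to what imbalanced-learn now ships
as three separate classes. A minimal sketch with the modern API (class
names and fit_resample are from current releases; parameter values are
illustrative):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, SVMSMOTE

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                           random_state=0)
for sampler in (SMOTE(k_neighbors=5, random_state=0),
                BorderlineSMOTE(kind='borderline-1', random_state=0),
                SVMSMOTE(random_state=0)):
    X_res, y_res = sampler.fit_resample(X, y)
    print(type(sampler).__name__, Counter(y_res))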
Example #13
    def resample(self):
        """
        Over samples the minority class by randomly picking samples with
        replacement.

        :return:
            overx, overy: The features and target values of the over-sampled
            data set.
        """

        # Start with the majority class
        overx = self.x[self.y == self.maxc]
        overy = self.y[self.y == self.maxc]

        # Loop over the other classes, over-sampling each at random
        for key in self.ucd.keys():
            if key == self.maxc:
                continue

            # If the ratio given is too large such that the minority becomes a
            # majority, clip it.
            if self.ratio * self.ucd[key] > self.ucd[self.maxc]:
                num_samples = self.ucd[self.maxc] - self.ucd[key]
            else:
                num_samples = int(self.ratio * self.ucd[key])

            if (self.method == 'replacement'):
                # Pick some elements at random
                seed(self.rs)
                indx = randint(low=0, high=self.ucd[key], size=num_samples)

                # Concatenate to the majority class
                overx = concatenate((overx, self.x[self.y == key],
                                     self.x[self.y == key].iloc[indx]),
                                    axis=0)

                overy = concatenate((overy, self.y[self.y == key],
                                     self.y[self.y == key].iloc[indx]),
                                    axis=0)

            elif (self.method == 'gaussian-perturbation'):
                # Pick the index of the samples which will be modified
                seed(self.rs)
                indx = randint(low=0, high=self.ucd[key], size=num_samples)

                # Generate the new samples
                sam_pert = []
                for i in indx:
                    pert = np.random.normal(self.mean_gaussian,
                                            self.std_gaussian,
                                            self.x[self.y == key][i].shape)
                    sam_pert.append(self.x[self.y == key][i] + pert)

                # Convert the list to numpy array
                sam_pert = np.array(sam_pert)

                # Concatenate to the majority class
                overx = concatenate((overx, self.x[self.y == key], sam_pert),
                                    axis=0)

                overy = concatenate((overy, self.y[self.y == key],
                                     self.y[self.y == key].iloc[indx]),
                                    axis=0)

        if self.verbose:
            print("Over-sampling performed: " + str(Counter(overy)))

        # Return over sampled dataset
        return overx, overy
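A minimal sketch of plain random over-sampling with the modern
imbalanced-learn API (RandomOverSampler and fit_resample are from
current releases):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                           random_state=0)
ros = RandomOverSampler(random_state=0)
X_res, y_res = ros.fit_resample(X, y)
print(Counter(y), "->", Counter(y_res))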
Example #14
Merging: pandas.merge(frame1, frame2, on='id'); the on parameter names the column to merge on.
To merge on several keys, pass a list: on=['id', 'brand'].

When the column names differ:
pandas.merge(frame1, frame2, left_on='id', right_on='sid')
The how parameter selects the join type: 'outer', 'left' or 'right'.

Merging on the index: set right_index and left_index to True
pd.merge(fr1, fr2, right_index=True, left_index=True)
The DataFrame join() method is better suited to index merges:
fr1.join(fr2)  joins on the index; the column names must not overlap (important!)

Concatenation: the concatenate() function works on ndarray objects
np.concatenate([array1, array2], axis=1)  column-wise concatenation

Axis-wise concatenation of Series and DataFrame objects:
pd.concat([ser1, ser2])  defaults to axis=0 and an outer join, dropping missing data
The join parameter changes the join type:
pd.concat([ser1, ser2], axis=1, join='inner')  inner join
The keys parameter creates a hierarchical index on the concatenation axis:
pd.concat([ser1, ser2], axis=1, keys=[1, 2])  labels the ser1 and ser2 data

6.2.1 Combining: merging and concatenation cannot combine data sets
whose indexes overlap completely or partially.
The combine_first() function combines Series objects while aligning their data:

ser1.combine_first(ser2)  aligns the data to ser1
Partial combination:
ser[1:3].combine_first(ser2[:3])
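A small runnable illustration of the operations above (the frames and
values are made up):

import pandas as pd

frame1 = pd.DataFrame({'id': ['a', 'b'], 'price': [1, 2]})
frame2 = pd.DataFrame({'id': ['a', 'c'], 'color': ['red', 'blue']})

# Merge on a key column; how= selects the join type
print(pd.merge(frame1, frame2, on='id', how='outer'))

# Concatenate along the rows (axis=0 is the default)
print(pd.concat([frame1, frame1]))

# combine_first aligns on the index and fills holes from the other object
ser1 = pd.Series([1.0, None], index=['x', 'y'])
ser2 = pd.Series([10.0, 20.0], index=['x', 'y'])
print(ser1.combine_first(ser2))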
Example #15
        idx = df[
            (df["Subject"] == "GreBla5671F") &
            (df["Date"] >= datetime.date(2020, 1, 7)) &
            (df["Date"] <= datetime.date(2020, 1, 10))
        ].index
        df[idx, "Condition"] = "MonthLater"

    return df


if __name__ == "__main__":
    from configs.active_config import config
    from analysis.download_scripts.project_lesions_2021 import download

    try:
        download()
    except Exception:
        pass

    subject_dfs = []
    for subject in config.subjects:
        # Preprocessing steps
        df = run_pipeline_subject(subject, config)
        subject_dfs.append(df)

    full_df = pd.concat(subject_dfs).reset_index(drop=True)
    full_df.to_csv(
        os.path.join(config.metadata_dir, "TrialsData.csv"),
        index=False
    )
Example #17
        columns=['file', 'xmin', 'ymin', 'xmax', 'ymax', 'conf', 'class'])
    detz = 0

    for ff in range(len(valid_files)):
        # for ff in [213,686,856,867,956,967]:
        if ff % 1000 == 0:
            print(ff)
        image_name = valid_files[ff]
        prefix = image_name[:-4]
        if image_name[-4:] == ".png":
            image_in = cv2.imread(test_img_in + image_name)
            # print(test_img_in + image_name)
            # image_in = cv2.imread(valid_image_folder + image_name)
            dummy_array = np.zeros((1, 1, 1, 1, n_anchors, 4))
            image_in = image_in / 255.
            image_in = image_in[:, :, ::-1]
            image_in = np.expand_dims(image_in, 0)
            netout = sess.run(y_pred, feed_dict={img_out: image_in})
            netout = np.reshape(netout, [1, boxy, boxx, n_anchors, out_len])
            boxes_pred = convert2box(netout)
            if len(boxes_pred) > 0:
                print(boxes_pred)
                boxes_pred["file"] = np.repeat(image_name, boxes_pred.shape[0])
                detect_all = pd.concat([detect_all, boxes_pred], axis=0)
                detz = detz + boxes_pred.shape[0]

    print("Detections:", detz)

    detect_all.to_csv("E:/CF_Calcs/BenchmarkSets/GFRC/core_test/detz.csv",
                      index=False)
Example #18
"""
create dummy dataframe about dragon ball z characters earth location and other information
"""

name_data_one = {"name": ["goku", "gohan"], "power": [200, 400], city": ["NY", "SEA"]}
name_data_two = {"name": ["srijan", "chuck"], "power": [400, 500], city": ["DEN", "SFO"]}
dragon_ball_data_one = pd.DataFrame(data=name_data_one)
dragon_ball__data_two = pd.DataFrame(data=name_data_two)

"""
Concatenate two dataframes

"""

pd.concat([dragon_ball_data_one, dragon_ball_data_two], axis=0) #concatenate along rows - stack vertically

pd.concat([dragon_ball_data_one, dragon_ball_data_two], axis=1) #concatenate along columns - stack horizontally


"""
Join/Merge two dataframes

"""

pd.merge(dragon_ball_data_one, dragon_ball_data_two, on="name", how="inner")


"""
Loop over dataframes
    def resample(self):
        """
        """

        # Start with the minority class
        underx = self.x[self.y == self.minc]
        undery = self.y[self.y == self.minc]

        # Import the K-NN classifier
        from sklearn.neighbors import KNeighborsClassifier

        # Loop over the other classes and under-sample each of them
        for key in self.ucd.keys():

            # Skip the minority class
            if key == self.minc:
                continue

            # Randomly draw n_seeds_S samples from the current majority class
            maj_sample = sample(self.x[self.y == key], self.n_seeds_S)

            # Create the set C
            C_x = np.append(self.x[self.y == self.minc], maj_sample, axis=0)
            C_y = np.append(self.y[self.y == self.minc],
                            [key] * self.n_seeds_S)

            # Create the set S
            S_x = self.x[self.y == key]
            S_y = self.y[self.y == key]

            # Create a k-NN classifier
            knn = KNeighborsClassifier(n_neighbors=self.size_ngh,
                                       **self.kwargs)

            # Fit C into the knn
            knn.fit(C_x, C_y)

            # Classify on S
            pred_S_y = knn.predict(S_x)

            # Keep the samples of S that the k-NN misclassified
            sel_x = np.squeeze(S_x[np.nonzero(pred_S_y != S_y), :])
            sel_y = S_y[np.nonzero(pred_S_y != S_y)]

            underx = concatenate((underx, sel_x), axis=0)
            undery = concatenate((undery, sel_y), axis=0)

        from sklearn.neighbors import NearestNeighbors

        # Find the nearest neighbour of every point
        nn = NearestNeighbors(n_neighbors=2)
        nn.fit(underx)
        nns = nn.kneighbors(underx, return_distance=False)[:, 1]

        # Send the information to is_tomek function to get boolean vector back
        if self.verbose:
            print("Looking for majority Tomek links...")
        links = self.is_tomek(undery, nns, self.minc, self.verbose)

        if self.verbose:
            print("Under-sampling "
                  "performed: " + str(Counter(undery[logical_not(links)])))

        # Return data set without majority Tomek links.
        return underx[logical_not(links)], undery[logical_not(links)]
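The final cleaning stage matches imbalanced-learn's TomekLinks, and
OneSidedSelection bundles both stages. A minimal sketch of the Tomek
step alone (current imbalanced-learn API; the data set is made up):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import TomekLinks

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                           random_state=0)
tl = TomekLinks()
X_res, y_res = tl.fit_resample(X, y)
print(Counter(y), "->", Counter(y_res))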
    def resample(self):
        """
        """

        # Start with the minority class
        underx = self.x[self.y == self.minc]
        undery = self.y[self.y == self.minc]

        # For each element of the current class, find the set of NN
        # of the minority class
        from sklearn.neighbors import NearestNeighbors

        # Call the constructor of the NN
        nn_obj = NearestNeighbors(n_neighbors=self.size_ngh, **self.kwargs)

        # Fit the minority class since we want to know the distances
        # to those points
        nn_obj.fit(self.x[self.y == self.minc])

        # Loop over the other classes and under-sample each of them
        for key in self.ucd.keys():

            # Skip the minority class
            if key == self.minc:
                continue

            # Set the ratio to be no more than the number of samples available
            if self.ratio * self.ucd[self.minc] > self.ucd[key]:
                num_samples = self.ucd[key]
            else:
                num_samples = int(self.ratio * self.ucd[self.minc])

            # Get the samples corresponding to the current class
            sub_samples_x = self.x[self.y == key]
            sub_samples_y = self.y[self.y == key]

            if self.version == 1:
                # Find the NN
                dist_vec, idx_vec = nn_obj.kneighbors(sub_samples_x,
                                                      n_neighbors=self.size_ngh)

                # Select the right samples
                sel_x, sel_y = self.__SelectionDistBased__(dist_vec,
                                                           num_samples,
                                                           key,
                                                           sel_strategy='nearest')
            elif self.version == 2:
                # Find the NN
                dist_vec, idx_vec = nn_obj.kneighbors(sub_samples_x,
                                                      n_neighbors=self.y[self.y == self.minc].size)

                # Select the right samples
                sel_x, sel_y = self.__SelectionDistBased__(dist_vec,
                                                           num_samples,
                                                           key,
                                                           sel_strategy='nearest')
            elif self.version == 3:
                # We need a new NN object to fit the current class
                nn_obj_cc = NearestNeighbors(n_neighbors=self.ver3_samp_ngh,
                                             **self.kwargs)
                nn_obj_cc.fit(sub_samples_x)

                # Find the set of NN to the minority class
                dist_vec, idx_vec = nn_obj_cc.kneighbors(self.x[self.y == self.minc])

                # Create the subset containing the samples found during the NN
                # search. Linearize the indexes and remove the double values
                idx_vec = np.unique(idx_vec.reshape(-1))

                # Create the subset
                sub_samples_x = sub_samples_x[idx_vec, :]
                sub_samples_y = sub_samples_y[idx_vec]

                # Compute the NN considering the current class
                dist_vec, idx_vec = nn_obj.kneighbors(sub_samples_x,
                                                      n_neighbors=self.size_ngh)

                sel_x, sel_y = self.__SelectionDistBased__(dist_vec,
                                                           num_samples,
                                                           key,
                                                           sel_strategy='farthest')

            underx = concatenate((underx, sel_x), axis=0)
            undery = concatenate((undery, sel_y), axis=0)

        if self.verbose:
            print("Under-sampling performed: " + str(Counter(undery)))

        return underx, undery
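These three branches are the NearMiss-1, -2 and -3 heuristics. A
minimal sketch with the modern imbalanced-learn API (the NearMiss class
and its version parameter are from current releases):

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import NearMiss

X, y = make_classification(n_samples=1000, weights=[0.9, 0.1],
                           random_state=0)
for version in (1, 2, 3):
    nm = NearMiss(version=version)
    X_res, y_res = nm.fit_resample(X, y)
    print("NearMiss-%d:" % version, Counter(y_res))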
Example #23
                    epochs=50,
                    batch_size=72,
                    validation_data=(test_X, test_y),
                    verbose=2,
                    shuffle=False)
# plot history
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()

# make a prediction
yhat = model.predict(test_X)
test_X = test_X.reshape((test_X.shape[0], n_hours * n_features))
# invert scaling for forecast
inv_yhat = concatenate((yhat, test_X[:, -7:]), axis=1)
inv_yhat = X_scaler.inverse_transform(inv_yhat)
inv_yhat = inv_yhat[:, 0]
# invert scaling for actual
test_y = test_y.reshape((len(test_y), 1))
inv_y = concatenate((test_y, test_X[:, -7:]), axis=1)
inv_y = scaler.inverse_transform(inv_y)
inv_y = inv_y[:, 0]
# calculate RMSE
rmse = sqrt(mean_squared_error(inv_y, inv_yhat))
print('Test RMSE: %.3f' % rmse)
'''
# Model configuration, compilation and execution
for train_index, validation_index in kf.split(X):  # Train the model below, then test it.
    print("loop num : ", len(accuracy)+1)
    print("TRAIN: %d" % len(train_index), "TEST: %d" % len(validation_index))
    def resample(self):
        """
        Over-samples the minority classes by randomly picking samples with
        replacement, or by Gaussian perturbation of existing samples.

        :return:
            overx, overy: The features and target values of the over-sampled
            data set.
        """

        # Start with the majority class
        overx = self.x[self.y == self.maxc]
        overy = self.y[self.y == self.maxc]

        # Loop over the other classes to over-sample them
        for key in self.ucd.keys():
            if key == self.maxc:
                continue

            # If the ratio given is too large such that the minority becomes a
            # majority, clip it.
            if self.ratio * self.ucd[key] > self.ucd[self.maxc]:
                num_samples = self.ucd[self.maxc] - self.ucd[key]
            else:
                num_samples = int(self.ratio * self.ucd[key])

            if self.method == 'replacement':
                # Pick some elements at random
                seed(self.rs)
                indx = randint(low=0, high=self.ucd[key], size=num_samples)

                # Concatenate to the majority class
                overx = concatenate((overx,
                                     self.x[self.y == key],
                                     self.x[self.y == key][indx]), axis=0)

                overy = concatenate((overy,
                                     self.y[self.y == key],
                                     self.y[self.y == key][indx]), axis=0)

            elif self.method == 'gaussian-perturbation':
                # Pick the index of the samples which will be modified
                seed(self.rs)
                indx = randint(low=0, high=self.ucd[key], size=num_samples)

                # Generate the new samples by adding Gaussian noise to the
                # selected samples
                sam_pert = []
                for i in indx:
                    pert = np.random.normal(self.mean_gaussian,
                                            self.std_gaussian,
                                            self.x[self.y == key][i].shape)
                    sam_pert.append(self.x[self.y == key][i] + pert)

                # Convert the list to numpy array
                sam_pert = np.array(sam_pert)

                # Concatenate to the majority class
                overx = concatenate((overx,
                                     self.x[self.y == key],
                                     sam_pert), axis=0)

                overy = concatenate((overy,
                                     self.y[self.y == key],
                                     self.y[self.y == key][indx]), axis=0)

        if self.verbose:
            print("Over-sampling performed: " + str(Counter(overy)))

        # Return the over-sampled dataset
        return overx, overy
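For reference, the gaussian-perturbation branch reduces to: copy randomly
chosen samples of a class and add zero-centred noise to each copy. A
minimal standalone sketch of the same idea (function and variable names
here are illustrative, not part of the class above):

import numpy as np

def gaussian_oversample(x, n_new, mean=0.0, std=0.1, random_state=None):
    # Draw n_new rows of x with replacement and perturb each with
    # Gaussian noise of the given mean and standard deviation
    rng = np.random.RandomState(random_state)
    idx = rng.randint(0, x.shape[0], size=n_new)
    noise = rng.normal(mean, std, size=(n_new, x.shape[1]))
    return x[idx] + noise

# Usage: triple a 10-sample class
minority = np.random.rand(10, 3)
augmented = np.vstack((minority,
                       gaussian_oversample(minority, 20, std=0.05)))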
Example #25
def prepare_dect2(csv_path, image_source_dir, data_save_dir, df_train_vin,
                  df_val_vin, k=5, heatmap=True, txt_prefix='dec2_',
                  yaml_name='detect2.yaml', nc=6, ext='.png'):

    class_names = [
        'Atelectasis', 'Cardiomegaly', 'Infiltration', 'Nodule/Mass',
        'Pleural effusion', 'Pneumothorax'
    ]

    class_map = dict(Atelectasis=0,
                     Cardiomegaly=1,
                     Infiltrate=2,
                     Mass=3,
                     Nodule=3,
                     Effusion=4,
                     Pneumothorax=5)

    # adding features to the dataframe
    print('preparing csv file ...')
    df = prepare_nih_bbox_csv(csv_path, image_source_dir, class_map)
    print('done preparing ^^')
    print()

    # splitting train / val dataset
    print('splitting train / val dataset ...')
    df = stratified_kfold_split(df, k=k, heatmap=heatmap)
    print('done splitting data ^^')
    print()

    fold = 0
    df_train = df[df.fold != fold]
    df_val = df[df.fold == fold]

    # preparing train / val dirs
    img_train_dir = os.path.join(data_save_dir, 'images', 'train')
    img_val_dir = os.path.join(data_save_dir, 'images', 'val')
    label_train_dir = os.path.join(data_save_dir, 'labels', 'train')
    label_val_dir = os.path.join(data_save_dir, 'labels', 'val')

    os.makedirs(img_train_dir, exist_ok=True)
    os.makedirs(img_val_dir, exist_ok=True)
    os.makedirs(label_train_dir, exist_ok=True)
    os.makedirs(label_val_dir, exist_ok=True)

    # copying images to the appropriate dirs
    # creating .txt labels files
    print('segregating data ...')
    segregate_data(df_train, img_train_dir, label_train_dir)
    segregate_data(df_val, img_val_dir, label_val_dir)
    print('done segregating data ^^')
    print()

    df_train['image_new_path'] = df_train.image_id.apply(
        lambda x: os.path.join(img_train_dir, x + ext))
    df_val['image_new_path'] = df_val.image_id.apply(
        lambda x: os.path.join(img_val_dir, x + ext))

    print('filtering vin data ...')
    df_train_vin = filter_vin_to_nih(df_train_vin)
    df_val_vin = filter_vin_to_nih(df_val_vin)
    print('done filtering vin data ^^')
    print()

    # concatenate each pair of dataframes
    print('concatenating dataframes ...')
    df_train = pd.concat([df_train, df_train_vin],
                         axis=0,
                         ignore_index=True)
    df_val = pd.concat([df_val, df_val_vin], axis=0, ignore_index=True)
    print('done concatenating ^^')
    print()

    # prepare .txt files
    train_txt = os.path.join(data_save_dir, txt_prefix + 'train.txt')
    val_txt = os.path.join(data_save_dir, txt_prefix + 'val.txt')

    print('preparing .txt files ...')
    prepare_txt(train_txt, df_train.image_new_path.unique())
    prepare_txt(val_txt, df_val.image_new_path.unique())
    print('done preparing .txt files ^^')
    print()

    # prepare .yaml file
    print('preparing .yaml files ...')
    prepare_yaml(data_save_dir, yaml_name, (train_txt, val_txt), nc,
                 class_names)

    return df_train, df_val
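The prepare_yaml helper is not shown here. Judging from its arguments, it
presumably writes a YOLOv5-style dataset config pointing at the two .txt
image lists. A hypothetical minimal version, under that assumption:

import os

def prepare_yaml(data_save_dir, yaml_name, txt_paths, nc, class_names):
    # Write a YOLOv5-style dataset .yaml (the exact format the real
    # helper emits is assumed, not taken from this repository)
    train_txt, val_txt = txt_paths
    content = ("train: %s\n"
               "val: %s\n"
               "nc: %d\n"
               "names: %s\n" % (train_txt, val_txt, nc, class_names))
    with open(os.path.join(data_save_dir, yaml_name), 'w') as f:
        f.write(content)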
Example #26
                        bsobj.findAll('cite')[3].get_text(),
                        bsobj.findAll('cite')[4].get_text(),
                        bsobj.findAll('a')[0].attrs['href'],
                        bsobj.findAll('a')[0].get_text(),
                        bsobj.findAll('a')[1].attrs['href'],
                        bsobj.findAll('a')[1].attrs['title'],
                        bsobj.findAll('cite')[2].get_text(),
                        bsobj.findAll('a')[2].attrs['href'] 
                if bool(bsobj.findAll('cite')[2].find('a')) else None
            ] for bsobj in bsObj.find("ul",{"class":"newlist"}).findAll("li")],columns=urls_colnames)
    return temp_array
    

pool = Pool()
total_list = pool.map(get_urls_info, raw_pool)
temp = pd.concat(total_list, axis=0)
temp.回复数 = temp.回复数.apply(lambda x: int(x))  # 回复数 = reply count
temp.to_csv("C:/Users/User/Desktop/华南BOSS/NLP+策略/store.csv")
temp1 = temp[temp.回复数 > 4]
url_pool = [i for i in temp1["帖子内链"]]  # 帖子内链 = post permalink
sub_pool=[url_pool[4]]
sub_pool.append(url_pool[22])
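Since the file path above suggests this runs on Windows, note that
multiprocessing.Pool must be created under a main guard there, or the
worker processes will re-import and re-execute the module on start-up.
A minimal pattern for the pool section above:

from multiprocessing import Pool

if __name__ == '__main__':
    pool = Pool()
    total_list = pool.map(get_urls_info, raw_pool)
    pool.close()
    pool.join()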

#=================== Return sub-forum post info =====================
def get_suburls_info(url):
    resp = requests.get(url, headers=headers)
    bsObj = BeautifulSoup(resp.text, "lxml")
    temp_array1 = np.array([[
                           int(bsobj.findAll('span')[0].get_text()),
                           int(bsobj.findAll('span')[1].get_text()),
                           bsobj.find('span', {'class': 'l6'}).get_text(),  # posting time
    def resample(self):
        """
        """

        # Start with the minority class
        underx = self.x[self.y == self.minc]
        undery = self.y[self.y == self.minc]

        # For each element of the current class, find the set of NN
        # of the minority class
        from sklearn.neighbors import NearestNeighbors

        # Call the constructor of the NN
        nn_obj = NearestNeighbors(n_neighbors=self.size_ngh, **self.kwargs)

        # Fit on the minority class since we want to know the distance
        # to these points
        nn_obj.fit(self.x[self.y == self.minc])

        # Loop over the other classes to under-sample them
        for key in self.ucd.keys():

            # Skip the minority class
            if key == self.minc:
                continue

            # Keep at most ratio * (minority size) samples, clipped to the
            # number of samples available in the current class
            if self.ratio * self.ucd[self.minc] > self.ucd[key]:
                num_samples = self.ucd[key]
            else:
                num_samples = int(self.ratio * self.ucd[self.minc])

            # Get the samples corresponding to the current class
            sub_samples_x = self.x[self.y == key]
            sub_samples_y = self.y[self.y == key]

            if self.version == 1:
                # Find the NN
                dist_vec, idx_vec = nn_obj.kneighbors(
                    sub_samples_x, n_neighbors=self.size_ngh)

                # Select the right samples
                sel_x, sel_y = self.__SelectionDistBased__(
                    dist_vec, num_samples, key, sel_strategy='nearest')
            elif self.version == 2:
                # Find the NN
                dist_vec, idx_vec = nn_obj.kneighbors(
                    sub_samples_x,
                    n_neighbors=self.y[self.y == self.minc].size)

                # Select the right samples
                sel_x, sel_y = self.__SelectionDistBased__(
                    dist_vec, num_samples, key, sel_strategy='nearest')
            elif self.version == 3:
                # We need a new NN object to fit the current class
                nn_obj_cc = NearestNeighbors(n_neighbors=self.ver3_samp_ngh,
                                             **self.kwargs)
                nn_obj_cc.fit(sub_samples_x)

                # Find the set of NN to the minority class
                dist_vec, idx_vec = nn_obj_cc.kneighbors(
                    self.x[self.y == self.minc])

                # Create the subset containing the samples found during the NN
                # search. Flatten the indices and remove duplicates
                idx_vec = np.unique(idx_vec.reshape(-1))

                # Create the subset
                sub_samples_x = sub_samples_x[idx_vec, :]
                sub_samples_y = sub_samples_y[idx_vec]

                # Compute the NN considering the current class
                dist_vec, idx_vec = nn_obj.kneighbors(
                    sub_samples_x, n_neighbors=self.size_ngh)

                sel_x, sel_y = self.__SelectionDistBased__(
                    dist_vec, num_samples, key, sel_strategy='farthest')

            underx = concatenate((underx, sel_x), axis=0)
            undery = concatenate((undery, sel_y), axis=0)

        if self.verbose:
            print("Under-sampling performed: " + str(Counter(undery)))

        return underx, undery
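The private __SelectionDistBased__ helper is not listed with this example.
From the three call sites, it presumably ranks the candidate samples by
their average distance to their minority-class neighbours and keeps
num_samples of them: nearest first for versions 1 and 2, farthest first
for version 3. A sketch under exactly those assumptions (the real method
also takes the class key and uses instance state; the standalone signature
below is hypothetical):

import numpy as np

def selection_dist_based(sub_x, sub_y, dist_vec, num_samples,
                         sel_strategy='nearest'):
    # Average distance of each candidate to its minority neighbours
    dist_avg = dist_vec.mean(axis=1)

    # Rank candidates: closest first, or farthest first
    order = np.argsort(dist_avg)
    if sel_strategy == 'farthest':
        order = order[::-1]

    # Keep the requested number of samples
    sel = order[:num_samples]
    return sub_x[sel], sub_y[sel]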