Code example #1
def create_metric(soft, metric, release, fold=3, borderline_smote=False):
    all_metrics = []
    for i in range(release):
        path = ('F:\\orca-master\\exampledata\\mData\\ordinalRegressionData\\Three severity\\'
                + metric + '\\' + soft + '\\' + str(i + 1) + '_code&network_metrics&bugs.csv')
        auto_spearman_metric, auto_spearman_metric_data = getAutoSpearmanMetric(
            path)
        all_metrics.append(auto_spearman_metric)
        for k in range(fold):
            if borderline_smote:
                # Apply Borderline-SMOTE to rebalance the training data
                auto_spearman_metric_data = auto_spearman_metric_data.dropna(
                    axis=1)
                x = auto_spearman_metric_data.iloc[:, 0:-1]
                y = auto_spearman_metric_data.iloc[:, -1:]

                bord_smote = BorderlineSMOTE(random_state=16,
                                             kind="borderline-1")
                x_res, y_res = bord_smote.fit_resample(x, y)
                auto_spearman_metric_data = pd.merge(x_res,
                                                     y_res,
                                                     how='left',
                                                     left_index=True,
                                                     right_index=True)
            save_path = ('F:\\orca-master\\exampledata\\' + metric + '\\' + soft + '\\'
                         + str(fold) + '-fold\\' + soft + str(i + 1)
                         + '\\matlab\\train_' + soft + str(i + 1) + '.' + str(k))
            tmp = shuffle(auto_spearman_metric_data)
            tmp.to_csv(save_path, header=None, index=False, sep=" ")
    return all_metrics
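A side note on the merge step above: `fit_resample` returns x and y with a fresh 0..n-1 index, so an index-aligned `pd.concat` is a simpler equivalent of the left merge on indexes. A minimal sketch with hypothetical values:

import pandas as pd

# x_res / y_res stand in for the resampled outputs above (hypothetical values)
x_res = pd.DataFrame({"loc": [10, 20], "cbo": [1, 2]})
y_res = pd.DataFrame({"bug": [0, 1]})

# Index-aligned column concatenation; same result as the left merge on indexes
merged = pd.concat([x_res, y_res], axis=1)
print(merged)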
Code example #2
def main(path, began, is_cent_data=0, iterations=30, temperature=5, attractive_force=1, repulsive_force=0.4, speed=0.02, k=0.5):
    # Load the data
    # Read the data as a normalized DataFrame
    my_data = tool.unitilize_data(tool.read_KEEL_data(path, began))
    # Prepare training and test data
    # Process the data with the FR model and the SMOTE model to form a new dataset
    # Build a model on the processed data and make predictions
    # Whether to center the data
    if is_cent_data != 0:
        cent_point = get_cent_point(my_data)
    else:
        cent_point = np.array(my_data)
    fr_data = pd.DataFrame(fr(cent_point, iterations, temperature, attractive_force, repulsive_force, speed, k))
    fr_data_x = fr_data.iloc[:, 0:-1]
    fr_data_y = fr_data.iloc[:, -1]
    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    # Randomly split into training and test data
    X_train, X_test, Y_train, Y_test = train_test_split(fr_data_x, fr_data_y, test_size=0.3, random_state=42)
    # Generate a balanced dataset from the sampled training data
    my_B_SMOTE = B_SMOTE()
    fr_data_x_smote, fr_data_y_smote = my_B_SMOTE.fit_sample(X_train, Y_train)

    # Data after processing
    print("Processed data:")
    train(fr_data_x_smote, fr_data_y_smote, X_test, Y_test)

    # Data without processing

    X_train_org, X_test_org, Y_train_org, Y_test_org = train_test_split(my_data.iloc[:, 0:-1], my_data.iloc[:, -1], test_size=0.3, random_state=42)

    data_x_org, data_y_org = my_B_SMOTE.fit_sample(X_train_org, Y_train_org)
    print("Unprocessed data:")
    train(data_x_org, data_y_org, X_test_org, Y_test_org)
Code example #3
File: sk_proc.py  Project: ikem55/NRAsystem
 def del_set_smote_data(self):
     """ 学習データのSMOTE処理を行い学習データを更新する  """
     # 対象数が少ない場合はサンプリングレートを下げる
     positive_count_train = self.y_train.sum()
     negative_count_train = len(self.y_train) - positive_count_train
     print("check y_train value 0:" + str(negative_count_train) + " 1:" +
           str(positive_count_train))
     if positive_count_train >= 6:
         smote = BorderlineSMOTE()
         # fit_resample replaces the deprecated fit_sample API
         self.X_train, self.y_train = smote.fit_resample(
             self.X_train, self.y_train)
     else:
         print("----- RandomOverSampler ----- ")
         ros = RandomOverSampler(
             # sampling_strategy={1: self.X_train.shape[0], 0: self.X_train.shape[0] // 3}, random_state=71)
             sampling_strategy={
                 1: negative_count_train,
                 0: negative_count_train
             },
             random_state=71)
         # Apply to the training data
         self.X_train, self.y_train = ros.fit_resample(
             self.X_train, self.y_train)
     print("-- after sampling: " +
           str(np.unique(self.y_train, return_counts=True)))
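The dict passed to the sampler above maps each class label to its target count after resampling, so {1: negative_count_train, 0: negative_count_train} brings both classes up to the majority size. A small self-contained illustration on toy data (not from the project):

from collections import Counter
from imblearn.over_sampling import RandomOverSampler

X = [[0], [1], [2], [3], [4], [5], [6], [7]]
y = [0, 0, 0, 0, 0, 0, 1, 1]  # 6 negatives, 2 positives

# Request 6 samples per class after resampling
ros = RandomOverSampler(sampling_strategy={0: 6, 1: 6}, random_state=71)
X_res, y_res = ros.fit_resample(X, y)
print(Counter(y_res))  # Counter({0: 6, 1: 6})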
Code example #4
File: model_manager.py  Project: hackjinlee/HCC_prj
def oversample_with_smote(x_train, y_train, iterator=10):
    '''
    Oversample the data with SMOTE.
    :param x_train: input data for the model
    :param y_train: targets for the model to predict
    :param iterator: number of sampling repetitions
    :return: oversampled X, Y
    '''
    sm = BorderlineSMOTE()
    # fit_resample replaces the deprecated fit_sample API
    x_train_sm, y_train_sm = sm.fit_resample(x_train, y_train)
    x_train_fin = []
    y_train_fin = []
    for i in range(iterator):
        temp_x = []
        temp_y = []
        indexes = list(range(len(y_train_sm)))
        random.shuffle(indexes)
        cnt = 0
        max_cnt = len(y_train_sm) // 10
        for j in indexes:
            x = x_train_sm[j]
            y = y_train_sm[j]
            if y == i % 2:
                temp_x.append(x)
                temp_y.append(y)
            elif cnt < max_cnt:
                temp_x.append(x)
                temp_y.append(y)
                cnt += 1
        x_sm_new, y_sm_new = sm.fit_resample(temp_x, temp_y)
        x_train_fin.extend(x_sm_new)
        y_train_fin.extend(y_sm_new)
    return x_train_fin, y_train_fin
Code example #5
 def train(self, gridsearch=False):
     tic = time.time()
     self.set_pipeline()
     X_train_preproc = self.pipeline_feature.fit_transform(self.X_train)
     bm = BorderlineSMOTE(random_state=2,
                          sampling_strategy='minority',
                          k_neighbors=1,
                          m_neighbors=20)
     self.X_train_smote, self.y_train_smote = bm.fit_resample(
         X_train_preproc, self.y_train)
     if gridsearch:
         self.model = RandomizedSearchCV(
             estimator=self.get_estimator(),
             param_distributions=self.model_params,
             n_iter=10,
             cv=2,
             verbose=5,
             random_state=42,
             n_jobs=None,
         )
         self.model.fit(self.X_train_smote, self.y_train_smote)
         self.mlflow_log_metric("train_time", int(time.time() - tic))
         print(colored(f'best score: {self.model.best_score_}', "blue"))
         print(colored(f'best params: {self.model.best_params_}', "blue"))
         self.model = self.model.best_estimator_
     else:
         self.model = self.get_estimator()
         self.model.fit(self.X_train_smote, self.y_train_smote)
         self.mlflow_log_metric("train_time", int(time.time() - tic))
Code example #6
File: process.py  Project: paradiser/imbalanced-learn
def oversample(x, y, method):
    randomstate = 42
    if method == 'No Sample':
        # No resampling
        return x, y
    elif method == 'random':
        # Random oversampling (sampling_strategy is assumed to be defined at module scope)
        ros = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=randomstate)
        X_resampled, y_resampled = ros.fit_resample(x, y)
    elif method == 'SMOTE':
        # SMOTE
        X_resampled, y_resampled = SMOTE(sampling_strategy=sampling_strategy, random_state=randomstate).fit_resample(x, y)
    elif method == 'Sparse SMOTE':
        # Sparse SMOTE
        X_resampled, y_resampled = SparseSMOTE(sampling_strategy=sampling_strategy, random_state=randomstate).fit_resample(x, y)
    elif method == 'SMOTEBorderline-1':
        # Borderline-SMOTE, borderline-1 variant
        X_resampled, y_resampled = BorderlineSMOTE(sampling_strategy=sampling_strategy, kind='borderline-1', random_state=randomstate).fit_resample(x, y)
    elif method == 'SMOTEBorderline-2':
        # Borderline-SMOTE, borderline-2 variant
        X_resampled, y_resampled = BorderlineSMOTE(sampling_strategy=sampling_strategy, kind='borderline-2', random_state=randomstate).fit_resample(x, y)
    elif method == 'SVMSMOTE':
        # SVM-SMOTE
        X_resampled, y_resampled = SVMSMOTE(sampling_strategy=sampling_strategy, random_state=randomstate).fit_resample(x, y)
    elif method == 'ADASYN':
        # ADASYN
        X_resampled, y_resampled = ADASYN(sampling_strategy=sampling_strategy, random_state=randomstate).fit_resample(x, y)
    elif method == 'mwmote':
        # MWMOTE
        X_resampled, y_resampled = MWMOTE.MWMOTE(x, y, N=1000, return_mode='append')
    # Count samples per class after resampling
    # from collections import Counter
    # print(sorted(Counter(y_resampled).items()))
    return X_resampled, y_resampled
Code example #7
        def Smote_bd(
                data,
                label):  # A sample is "in danger" when at least half of its nearest neighbours belong to other classes; the random neighbour b used for interpolation may come from a different class than minority sample a
            from imblearn.over_sampling import BorderlineSMOTE

            smote = BorderlineSMOTE(random_state=0)
            # fit_resample replaces the deprecated fit_sample API
            data_smote_bd, label_smote_bd = smote.fit_resample(data, label)
            return data_smote_bd, label_smote_bd
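The inline comment above describes the "danger" criterion behind Borderline-SMOTE. A minimal, self-contained sketch on synthetic data (not from the source project) showing how the two kind variants are invoked and that both rebalance the classes:

from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import BorderlineSMOTE

X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
print("before:", Counter(y))

for kind in ("borderline-1", "borderline-2"):
    X_res, y_res = BorderlineSMOTE(kind=kind, random_state=0).fit_resample(X, y)
    # borderline-1 interpolates only toward minority neighbours;
    # borderline-2 may also interpolate toward majority neighbours
    print(kind, Counter(y_res))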
Code example #8
def bordersmote(x, y):
    # Borderline-SMOTE
    k_neighbors = math.ceil(sum(y) * 0.01)
    m_neighbors = math.ceil(sum(y) * 0.01)
    
    bordersmote = BorderlineSMOTE(sampling_strategy=1, 
                                  k_neighbors=k_neighbors, 
                                  m_neighbors=m_neighbors)
    
    return bordersmote.fit_resample(x, y)
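One caveat with the 1% heuristic above: SMOTE-style samplers require k_neighbors to be strictly smaller than the minority-class count, otherwise fit_resample raises an error. A hedged helper (hypothetical name, assuming a 0/1 target with 1 as the minority class) that clamps the heuristic:

import math

def safe_neighbor_count(y, frac=0.01):
    # k_neighbors must be strictly less than the number of minority samples
    minority = sum(y)  # assumes a 0/1 target with 1 as the minority class
    if minority <= 1:
        raise ValueError("need at least 2 minority samples for SMOTE")
    k = max(1, math.ceil(minority * frac))
    return min(k, minority - 1)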
Code example #9
File: dam3.py  Project: amir-abolfazli/DAM3
    def oversample_remainingSet(self, instances, labels, kind='borderline-1'):
        """oversamples remaining set (using BorderlineSMOTE) after a drift is detected."""
        if len(np.unique(labels)) >= 2:
            minority_class = collections.Counter(labels.tolist()).most_common()[-1][0]

            if np.sum(labels == minority_class) > self.n_neighbors:
                oversample = BorderlineSMOTE(k_neighbors=self.n_neighbors, m_neighbors=5, kind=kind, random_state=self.random_state)
                instances, labels = oversample.fit_resample(instances, labels)

        return instances, labels
Code example #10
def over_under_sampling(x, y):
    print('Generating synthetic samples...')
    over = BorderlineSMOTE()
    # under = RandomUnderSampler(sampling_strategy=0.5)
    # steps = [('o', over), ('u', under)]
    # pipeline = Pipeline(steps=steps)
    # x, y = pipeline.fit_resample(x, y)
    x, y = over.fit_resample(x, y.idxmax(axis=1))
    y = pd.get_dummies(y)
    return x, y
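The idxmax/get_dummies round trip above exists because BorderlineSMOTE expects a 1-D label vector while the caller keeps y one-hot encoded. A tiny demonstration of the round trip on hypothetical labels:

import pandas as pd

y_onehot = pd.DataFrame({"cat": [1, 0, 1], "dog": [0, 1, 0]})
labels = y_onehot.idxmax(axis=1)   # ['cat', 'dog', 'cat'] -- what fit_resample receives
y_back = pd.get_dummies(labels)    # back to one-hot after resampling
print(y_back)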
Code example #11
def test_borderline_smote(kind, data):
    bsmote = BorderlineSMOTE(kind=kind, random_state=42)
    bsmote_nn = BorderlineSMOTE(kind=kind, random_state=42,
                                k_neighbors=NearestNeighbors(n_neighbors=6),
                                m_neighbors=NearestNeighbors(n_neighbors=11))

    X_res_1, y_res_1 = bsmote.fit_resample(*data)
    X_res_2, y_res_2 = bsmote_nn.fit_resample(*data)

    assert_allclose(X_res_1, X_res_2)
    assert_array_equal(y_res_1, y_res_2)
Code example #12
def up_sampling(X_train, y_train, ratio=2):
    pos_num = (y_train == 1).sum()
    if pos_num == 0:
        return X_train, y_train
    pos_sap_num = int(pos_num * ratio)
    X_train.fillna(0, inplace=True)
    smo = BorderlineSMOTE(sampling_strategy={1: pos_sap_num},
                          random_state=2019,
                          n_jobs=8)
    X_train, y_train = smo.fit_resample(X_train, y_train)

    return X_train, y_train
Code example #13
def borderline_smote(X,
                     y,
                     visualize=False,
                     pca2d=True,
                     pca3d=True,
                     tsne=True,
                     pie_evr=True):
    sm = BorderlineSMOTE(random_state=42)
    X_res, y_res = sm.fit_resample(X, y)
    if visualize:
        hist_over_and_undersampling(y_res)
        pca_general(X_res, y_res, d2=pca2d, d3=pca3d, pie_evr=pie_evr)
    return X_res, y_res
Code example #14
def resample(X, Y, resampling):
  X_resampled, y_resampled = X, Y
  if resampling == 'oversampling':
    from imblearn.over_sampling import RandomOverSampler
    ros = RandomOverSampler(random_state=0)
    X_resampled, y_resampled = ros.fit_resample(X, Y)
  if resampling == 'undersampling':
    from imblearn.under_sampling import ClusterCentroids
    cc = ClusterCentroids(random_state=0)
    X_resampled, y_resampled = cc.fit_resample(X, Y)
  if resampling == 'smote':
    from imblearn.over_sampling import BorderlineSMOTE
    # from imblearn.over_sampling import SMOTE
    X_resampled, y_resampled = BorderlineSMOTE().fit_resample(X, Y)
  return X_resampled.fillna(0), y_resampled.fillna(0)
Code example #15
File: results.py  Project: Wwinson/publications
def generate_oversamplers(factor):
    """Generate a list of oversamplers that pre-apply undersampling."""
    if factor is None:
        return [('BENCHMARK METHOD', None, {})]
    return [('NO OVERSAMPLING',
             UnderOverSampler(oversampler=None, factor=factor), {}),
            ('RANDOM OVERSAMPLING',
             UnderOverSampler(oversampler=RandomOverSampler(),
                              factor=factor), {}),
            ('SMOTE', UnderOverSampler(oversampler=SMOTE(), factor=factor), {
                'oversampler__k_neighbors': [3, 5]
            }),
            ('BORDERLINE SMOTE',
             UnderOverSampler(oversampler=BorderlineSMOTE(), factor=factor), {
                 'oversampler__k_neighbors': [3, 5]
             }),
            ('G-SMOTE',
             UnderOverSampler(oversampler=GeometricSMOTE(), factor=factor), {
                 'oversampler__k_neighbors': [3, 5],
                 'oversampler__selection_strategy':
                 ['combined', 'minority', 'majority'],
                 'oversampler__truncation_factor':
                 [-1.0, -0.5, .0, 0.25, 0.5, 0.75, 1.0],
                 'oversampler__deformation_factor':
                 [.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0]
             })]
Code example #16
def test_combine_results_multiple():
    """Test the combination of experimental results for different
    datasets, oversamplers and classifiers."""

    # Clone and fit experiments
    experiment1 = (clone(EXPERIMENT).set_params(
        oversamplers=[('bsmote', BorderlineSMOTE(), {
            'k_neighbors': [2, 5]
        })],
        classifiers=[('gbc', GradientBoostingClassifier(), {})],
        scoring=['accuracy', 'f1'],
    ).fit(DATASETS[:-1]))
    experiment2 = (clone(EXPERIMENT).set_params(
        scoring=['accuracy', 'f1']).fit(DATASETS[-1:]))

    # Extract combined results
    combined_results = combine_results(experiment1.results_,
                                       experiment2.results_)
    results = combined_results.reset_index()

    # Assertions
    assert set(results.Dataset) == {'A', 'B', 'C'}
    assert set(results.Oversampler) == {'random', 'smote', 'bsmote'}
    assert set(results.Classifier) == {'dtc', 'knc', 'gbc'}
    assert set([scorer[0] for scorer in combined_results.columns
                ]) == set(['accuracy', 'f1'])
    pd.testing.assert_frame_equal(
        combined_results,
        pd.concat([experiment1.results_, experiment2.results_]).sort_index(),
    )
Code example #17
def perform_smote_undersample(x, y, smote_type='regular', strategy='auto', seed=16, binary=False):
	_np.random.seed(seed)
	if smote_type == 'regular':
		sm = SMOTE(random_state=seed, k_neighbors=3, sampling_strategy=strategy, n_jobs=14)
	elif smote_type == 'borderline':
		sm = BorderlineSMOTE(random_state=seed, k_neighbors=5, sampling_strategy=strategy, n_jobs=14)
	if len(y.shape) > 1:
		x, y = sm.fit_resample(x, y[:,1].reshape(-1))
	else:
		x, y = sm.fit_resample(x, y)
	y = y.reshape(-1).astype(_np.int8)
	#print('Head of y: {}'.format(y[:6]))

	if binary:
		y_binary = _np.zeros((y.shape[0], 2))
		for i in range(y.shape[0]):
			#print('i: {} y[i]= {}'.format(i, y[i]))
			y_binary[i, y[i]] = 1

		_np.random.seed(seed)
		# NOTE: this shuffles the one-hot labels independently of x; the x/y pairing
		# is preserved only if x is shuffled elsewhere with the same seed
		_np.random.shuffle(y_binary)
		y = y_binary
	#print('Head y_binary: {}'.format(y_binary[:6, :]))

	return x, y
Code example #18
def upper_region():
    X = data_frame.drop([TOP_LEVEL_TARGET, SECOND_LEVEL_TARGET], axis=1)  # Features - drop region, class
    y = data_frame[TOP_LEVEL_TARGET]  # Labels
    print("Region Count ", (pd.DataFrame(y)).groupby(TOP_LEVEL_TARGET).size())

    kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)  # random_state requires shuffle=True

    model = tune_random_forest(RandomForestClassifier(random_state=42), X, y)
    clf = svm.SVC(kernel='linear', C=1, random_state=0)
    knn = KNeighborsClassifier(n_neighbors=10)
    gb = GradientBoostingClassifier()
    gb = tune_gb(X, y)
    lr = LogisticRegression(multi_class='ovr')
    rf = RandomForestClassifier(n_estimators=200, max_depth=20)

    pipeline = Pipeline(
        [
            ('ROS', BorderlineSMOTE()),
            ('model', clf)
        ]
    )
    scoring = ['accuracy', 'f1_micro', 'precision_micro', 'recall_micro']
    cv_results = cross_validate(pipeline, X, y, cv=kfold, scoring=scoring)
    # print('%f (%f)' % (cv_results.mean(), cv_results.std())) - error
    print(sorted(cv_results.keys()))
    print(cv_results['fit_time'].mean())
    print(cv_results['score_time'].mean())
    print(cv_results['test_accuracy'].mean())
    print(cv_results['test_f1_micro'].mean())
    print(cv_results['test_precision_micro'].mean())
    print(cv_results['test_recall_micro'].mean())

    # NOTE: cross_validate fits clones, so clf itself is still unfitted when dumped here
    joblib.dump(clf, filename='../resources/models/parent_classifier.pkl')
Code example #19
def sampler(X, y, over_pct=0.1, under_pct=0.2):
    over = BorderlineSMOTE(random_state=42, sampling_strategy=over_pct)
    under = RandomUnderSampler(random_state=42, sampling_strategy=under_pct)
    steps = [('o', over), ('u', under)]
    pipeline = Pipeline(steps=steps)
    X, y = pipeline.fit_resample(X, y)
    return X, y
Code example #20
File: ml-helpper.py  Project: HaritzSaiz/ml-helpper
def oversample_borderline_SMOTE(df, variant=1, debug=True):
    X = df.values[:, :-1]
    y = df.values[:, -1].astype(int)
    if debug:
        print('Original dataset shape %s' % Counter(y))
    if variant == 1:
        sm = BorderlineSMOTE(random_state=0, kind="borderline-1")
    else:
        sm = BorderlineSMOTE(random_state=0, kind="borderline-2")

    X_res, y_res = sm.fit_resample(X, y)
    df_resampled = pd.DataFrame(X_res, columns=df.columns[:-1])
    df_resampled.insert(len(df_resampled.columns), df.columns[-1], y_res)
    if debug:
        print('Resampled dataset shape %s' % Counter(y_res))
    return df_resampled
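On a recent imbalanced-learn (0.8 or later), fit_resample accepts and returns pandas objects directly, so the values/rebuild dance above can be skipped. A sketch under that version assumption (hypothetical function name, same last-column-is-target layout):

import pandas as pd
from imblearn.over_sampling import BorderlineSMOTE

def oversample_borderline_smote_df(df: pd.DataFrame) -> pd.DataFrame:
    # Last column is the target, as in oversample_borderline_SMOTE above
    X_df, y_sr = df.iloc[:, :-1], df.iloc[:, -1].astype(int)
    X_res, y_res = BorderlineSMOTE(random_state=0, kind="borderline-1").fit_resample(X_df, y_sr)
    return X_res.assign(**{df.columns[-1]: y_res})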
Code example #21
def smote_tomek(x_train, y_train):
    oversample = BorderlineSMOTE(sampling_strategy=0.5,
                                 random_state=0,
                                 k_neighbors=5,
                                 m_neighbors=10,
                                 n_jobs=-1,
                                 kind='borderline-1')
    X, y = oversample.fit_resample(x_train, y_train)

    tom_lin = TomekLinks(sampling_strategy='majority', n_jobs=-1)
    X, y = tom_lin.fit_resample(X, y)
    # print(len([i for i in y_train.values if i==1]))
    # print(len([i for i in y.values if i==1]))
    # print(len(y_train))
    # print(len(y))
    return X, y
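The over-then-clean combination in smote_tomek can also be packaged as one resampler with imbalanced-learn's Pipeline, which exposes fit_resample when its final step is a sampler (imblearn.combine.SMOTETomek is the packaged alternative, but it expects a plain SMOTE instance for its smote parameter, so the pipeline form is the safer way to plug in BorderlineSMOTE). A sketch with the same settings:

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.under_sampling import TomekLinks

resampler = Pipeline([
    ('over', BorderlineSMOTE(sampling_strategy=0.5, random_state=0,
                             k_neighbors=5, m_neighbors=10, kind='borderline-1')),
    ('clean', TomekLinks(sampling_strategy='majority')),
])
# X, y = resampler.fit_resample(x_train, y_train)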
Code example #22
    def __init__(self, lemmatization=False):
        BugCoupleModel.__init__(self, lemmatization)

        self.sampler = BorderlineSMOTE(random_state=0)
        self.calculate_importance = False

        cleanup_functions = [
            feature_cleanup.responses(),
            feature_cleanup.hex(),
            feature_cleanup.dll(),
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
            feature_cleanup.crash(),
        ]

        self.extraction_pipeline = Pipeline(
            [
                ("bug_extractor", bug_features.BugExtractor([], cleanup_functions)),
                (
                    "union",
                    ColumnTransformer([("text", self.text_vectorizer(), "text")]),
                ),
            ]
        )

        self.clf = LinearSVCWithLabelEncoding(LinearSVC())
Code example #23
def Borderline_DBSCAN(train_data, label, eps=20.1, min_samples=5):
    label_index = 0
    if label == 'c':
        label_index = 1
    if label == 'b':
        label_index = 0

    print(train_data['label'].value_counts())
    boSMOTE = BorderlineSMOTE(kind='borderline-1')
    x, y = boSMOTE.fit_resample(train_data.iloc[:, :-1], train_data.iloc[:, -1])

    # print(boSMOTE.sample)
    # NOTE: `.sample` here and `danger_fit` below are not part of stock
    # imbalanced-learn / scikit-learn; they appear to come from this project's
    # patched forks, which expose the generated borderline ("danger") samples.
    BMG_sample = boSMOTE.sample[label_index][1]
    BMG_sample = pd.DataFrame(BMG_sample, columns=train_data.columns.values.tolist()[:-1])
    BMG_sample['label'] = label


    max_sample = []
    min_sample = []
    # print(train_data.shape[0])
    for temp in range(train_data.shape[0]):
        if train_data.iloc[temp, -1] == label:
            min_sample.append(train_data.iloc[temp, :].values)
        else:
            max_sample.append(train_data.iloc[temp, :].values)

    max_sample = pd.DataFrame(max_sample, columns=train_data.columns.values.tolist())
    min_sample = pd.DataFrame(min_sample, columns=train_data.columns.values.tolist())
    mergeSample = pd.concat([max_sample, BMG_sample], ignore_index=False)
    # print(min_sample.shape[0])
    # print(max_sample.shape[0])
    # print("**9**")
    # print(mergeSample.shape[0])
    dbsc = DBSCAN(eps=eps, min_samples=min_samples).danger_fit(X=mergeSample, danger_sample=BMG_sample)
    array_neighborhoods = dbsc.neighborhoods
    neighborhoods_index = []
    array_n_neighbors = dbsc.n_neighbors
    for temp in range(len(array_n_neighbors)):
        if array_n_neighbors[temp] >= 5:
            for i in range(array_n_neighbors[temp]):
                neighborhoods_index.append(array_neighborhoods[temp][i])
    new_sample_index = list(set(neighborhoods_index))
    num_sample = BMG_sample.shape[0]
    # print(array_neighborhoods)
    # print(len(new_sample_index))
    # print(train_data.shape[0])
    return min_sample, mergeSample, new_sample_index, num_sample
Code example #24
    def __init__(self, lemmatization=False):
        BugModel.__init__(self, lemmatization)

        self.sampler = BorderlineSMOTE(random_state=0)
        self.calculate_importance = False

        feature_extractors = [
            bug_features.has_str(),
            bug_features.has_regression_range(),
            bug_features.severity(),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.product(),
            # TODO: We would like to use the component at the time of filing too,
            # but we can't because the rollback script doesn't support changes to
            # components yet.
            # bug_features.component(),
            bug_features.num_words_title(),
            bug_features.num_words_comments(),
            bug_features.keywords(),
        ]

        cleanup_functions = [
            feature_cleanup.fileref(),
            feature_cleanup.url(),
            feature_cleanup.synonyms(),
        ]

        self.extraction_pipeline = Pipeline(
            [
                (
                    "bug_extractor",
                    bug_features.BugExtractor(
                        feature_extractors, cleanup_functions, rollback=True
                    ),
                ),
                (
                    "union",
                    ColumnTransformer(
                        [
                            ("data", DictVectorizer(), "data"),
                            ("title", self.text_vectorizer(min_df=0.0001), "title"),
                            (
                                "comments",
                                self.text_vectorizer(min_df=0.0001),
                                "comments",
                            ),
                        ]
                    ),
                ),
            ]
        )

        self.clf = xgboost.XGBClassifier(n_jobs=16)
        self.clf.set_params(predictor="cpu_predictor")
Code example #25
File: bugtype.py  Project: rajibmitra/bugbug
    def __init__(self, lemmatization=False, historical=False):
        BugModel.__init__(self, lemmatization)

        self.sampler = BorderlineSMOTE(random_state=0)

        feature_extractors = [
            bug_features.has_str(),
            bug_features.severity(),
            # Ignore keywords that would make the ML completely skewed
            # (we are going to use them as 100% rules in the evaluation phase).
            bug_features.keywords(set(keyword_dict.keys())),
            bug_features.is_coverity_issue(),
            bug_features.has_crash_signature(),
            bug_features.has_url(),
            bug_features.has_w3c_url(),
            bug_features.has_github_url(),
            bug_features.whiteboard(),
            bug_features.patches(),
            bug_features.landings(),
            bug_features.title(),
            bug_features.blocked_bugs_number(),
            bug_features.ever_affected(),
            bug_features.affected_then_unaffected(),
            bug_features.product(),
            bug_features.component(),
        ]

        cleanup_functions = [
            feature_cleanup.url(),
            feature_cleanup.fileref(),
            feature_cleanup.synonyms(),
        ]

        self.extraction_pipeline = Pipeline([
            (
                "bug_extractor",
                bug_features.BugExtractor(feature_extractors,
                                          cleanup_functions),
            ),
            (
                "union",
                ColumnTransformer([
                    ("data", DictVectorizer(), "data"),
                    ("title", self.text_vectorizer(min_df=0.001), "title"),
                    (
                        "first_comment",
                        self.text_vectorizer(min_df=0.001),
                        "first_comment",
                    ),
                    (
                        "comments",
                        self.text_vectorizer(min_df=0.001),
                        "comments",
                    ),
                ]),
            ),
        ])

        self.clf = OneVsRestClassifier(xgboost.XGBClassifier(n_jobs=16))
Code example #26
 def classification(self,X,Y):
     X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
     #text_clf = Pipeline([('tfidf', TfidfVectorizer()),('clf', MultinomialNB())])
     vectorizer = TfidfVectorizer()
     # vectorizer2 = TfidfVectorizer()
     X_train_tfidf = vectorizer.fit_transform(X_train)
     X_test_tfidf = vectorizer.transform(X_test)
     sm = BorderlineSMOTE()
     X_res, Y_res = sm.fit_resample(X_train_tfidf, y_train)
     clf = MultinomialNB()
     clf.fit(X_res, Y_res)
     prediction = clf.predict(X_test_tfidf)
     print(prediction)
     final_time = datetime.datetime.now() - start_time
     print(final_time)
     print(metrics.classification_report(y_test,prediction))
     print(metrics.roc_auc_score(y_test, prediction))
Code example #27
def imbalanced_sampler(input_data, input_labels, method='SMOTE'):
    if method == 'SMOTE':
        sampler = BorderlineSMOTE(n_jobs=4, random_state=RANDOM_STATE)
    elif method == 'Near Miss':
        sampler = NearMiss(n_jobs=4)  # NearMiss is deterministic; recent imblearn versions do not accept random_state
    else:
        print('Invalid sampler type. Only `SMOTE` (Borderline) and `Near Miss` are supported...')
        sys.exit(0)
    # TODO save samples by class to reduce file size
    max_class_num = np.max(input_labels)
    class_range = np.arange(1, max_class_num + 1)  # include every class label from 1 through the maximum
    x_sampled, y_sampled = sampler.fit_resample(input_data, input_labels)
    for i in class_range:
        idx = np.argwhere(y_sampled == i)
        pickle.dump(x_sampled[idx][:], open(method + '_Class_' + str(i) + '_data_samples.pkl', 'wb'))
        pickle.dump(y_sampled[idx], open(method + '_Class_' + str(i) + '_label_samples.pkl', 'wb'))
    return x_sampled, y_sampled
Code example #28
    def fit(self, x, y, sampling="under", show_info=False):
        """
        训练集成器

        :param x:样本
        :param y:标签
        """
        IR = len(y[y == 1]) / len(y[y == 0])
        # 下采样
        if sampling == "under":
            sampling_interval = 1 / (IR * np.log2(IR))  # 采样间隔
            balance_rate = 1 / IR  # 平衡采样率
            start_sampling_rate = balance_rate + sampling_interval
            if show_info:
                print("下采样")
                print("采样前 IR=%.2f" % IR)
                print("平衡采样率 %.4f 采样间隔 %.4f" %
                      (balance_rate, sampling_interval))
            for i in range(self.n_estimator):
                # The sampling rate keeps shrinking, so fewer samples are kept
                sampling_rate = start_sampling_rate - pow(2, i + 1) / pow(
                    2, self.n_estimator) * sampling_interval

                # Density-based sampling
                # x_train, y_train = DBUSampler(sampling_rate=sampling_rate, show_info=False).fit_resample(x, y)

                # Random under-sampling of the majority class
                x_train, y_train = myRandomSampler().under_sampling(
                    x, y, sampling_rate)

                if show_info:
                    print("current sampling rate: %.4f" % sampling_rate)
                    IR = len(y_train[y_train == 1]) / len(
                        y_train[y_train == 0])
                    print("IR after sampling = %.2f" % IR)

                self.classifiers[i].fit(x_train, y_train)
        else:
            # Over-sampling
            sampling_interval = len(y[y == 1]) / len(y[y == 0]) - 1
            if show_info:
                print("Over-sampling")
                print("IR before sampling = %.2f" % IR)

            for i in range(self.n_estimator):
                sampling_rate = 1 + math.log(
                    i + 1, self.n_estimator) * sampling_interval
                n_sampling = int(sampling_rate * len(y[y == 0]))
                x_train, y_train = BorderlineSMOTE(sampling_strategy={
                    0: n_sampling
                }).fit_resample(x, y)
                if show_info:
                    print("current sampling rate %.4f" % sampling_rate)
                    IR = len(y_train[y_train == 1]) / len(
                        y_train[y_train == 0])
                    print("IR after sampling = %.2f" % IR)

                self.classifiers[i].fit(x_train, y_train)
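To make the under-sampling schedule above concrete, here is a standalone walk through the per-estimator rates for a hypothetical IR of 4 and five estimators (pure arithmetic, no project code involved):

import numpy as np

IR, n_estimator = 4.0, 5
sampling_interval = 1 / (IR * np.log2(IR))   # 0.125
balance_rate = 1 / IR                        # 0.25
start = balance_rate + sampling_interval     # 0.375
for i in range(n_estimator):
    rate = start - 2 ** (i + 1) / 2 ** n_estimator * sampling_interval
    print(f"estimator {i}: sampling_rate = {rate:.4f}")
# rates shrink from ~0.3672 down to the balance rate 0.25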
Code example #29
def Resampling(train_x, train_y, resampling_method):
    train_y.data = LabelEncoder().fit_transform(train_y.data)
    # summarize distribution

    # Uncomment the line below to show a pie chart of the class distribution before resampling
    #plotGraphics.piePlot(train_y, "Before Resampling")

    # ---- UNDER-SAMPLING ------ #
    if resampling_method == "ClusterCentroids":
        resample = ClusterCentroids(voting='hard', random_state=42)

    if resampling_method == "CondensedNearestNeighbour":
        resample = CondensedNearestNeighbour(n_neighbors=7, random_state=42)

    if resampling_method == "EditedNearestNeighbours":
        resample = EditedNearestNeighbours(n_neighbors=7,
                                           kind_sel='mode',
                                           n_jobs=-1)

    if resampling_method == "RepeatedEditedNearestNeighbours":
        resample = RepeatedEditedNearestNeighbours(n_neighbors=7,
                                                   kind_sel='mode',
                                                   n_jobs=-1)

    if resampling_method == "AllKNN":
        resample = AllKNN(n_neighbors=7,
                          kind_sel='mode',
                          allow_minority=True,
                          n_jobs=-1)

    if resampling_method == "NearMiss":
        resample = NearMiss(n_neighbors=7, n_jobs=-1)

    if resampling_method == "NeighbourhoodCleaningRule":
        resample = NeighbourhoodCleaningRule(n_neighbors=7, kind_sel='all')

    if resampling_method == "RandomUnderSampler":
        resample = RandomUnderSampler(random_state=42)

    if resampling_method == "TomekLinks":
        resample = TomekLinks(n_jobs=-1)

    # ---- OVER-SAMPLING ------ #
    if resampling_method == "BorderlineSMOTE":
        resample = BorderlineSMOTE(random_state=42, n_jobs=-1)

    if resampling_method == "KMeansSMOTE":
        resample = KMeansSMOTE(random_state=42)

    if resampling_method == "RandomUnderSampler":
        resample = RandomOverSampler(random_state=42)

    if resampling_method == "SMOTE":
        resample = SMOTE(random_state=42, n_jobs=-1)

    # transform the dataset
    train_x.data, train_y.data = resample.fit_resample(train_x.data,
                                                       train_y.data)
Code example #30
File: Preprocessing.py  Project: luer17/creditmodel
    def over_sample(self,
                    method="BorderLine",
                    sampling_strategy="minority",
                    random_state=42,
                    k_neighbors=5,
                    n_neighbors=10,
                    kind="borderline-1"):
        """
        过采样方法
        :param method: str, option: ADASYN, BorderLine,KMeans,Random,SVM
        :param sampling_strategy:str or dict, option: 'minority','not majority','all','auto', {1:n,0:m}
        :param random_state:int
        :param k_neighbors:int
        :param n_neighbors:int
        :param kind:str, borderline-1,borderline-2
        :return:df
        """
        feature_name = self._df.columns.difference(["id",
                                                    self._target]).tolist()
        X = self._df[feature_name].values
        y = self._df[self._target].values

        print("Original label shape {}".format(Counter(y)))

        if method == "ADASYN":
            overSm = ADASYN(sampling_strategy=sampling_strategy,
                            random_state=random_state,
                            n_neighbors=k_neighbors)
        elif method == "BorderLine":
            overSm = BorderlineSMOTE(sampling_strategy=sampling_strategy,
                                     random_state=random_state,
                                     k_neighbors=k_neighbors,
                                     m_neighbors=n_neighbors,
                                     kind=kind)
        elif method == "KMeans":
            overSm = KMeansSMOTE(sampling_strategy=sampling_strategy,
                                 random_state=random_state,
                                 k_neighbors=k_neighbors)
        elif method == "Random":
            overSm = RandomOverSampler(sampling_strategy=sampling_strategy,
                                       random_state=random_state)
        elif method == "SVM":
            overSm = SVMSMOTE(sampling_strategy=sampling_strategy,
                              random_state=random_state,
                              k_neighbors=k_neighbors,
                              m_neighbors=n_neighbors,
                              out_step=0.5)
        else:
            print("不支持{}该抽样方法".format(method))
            return self._df

        X_res, y_res = overSm.fit_resample(X, y)
        print("overSample label shape {}".format(Counter(y_res)))
        _data = np.concatenate([X_res, y_res.reshape(len(X_res), 1)], axis=1)
        df_new = pd.DataFrame(data=_data,
                              columns=feature_name + [self._target])
        return df_new
Code example #31
    def use_parameters(self, X_train, selected_features):
        """
        Default parameters.

        """

        test_scaler = [
            StandardScaler(),
            RobustScaler(),
            QuantileTransformer(),
            Normalizer()
        ]
        test_sampling = [
            modelutil.Nosampler(),
            ClusterCentroids(),
            RandomUnderSampler(),
            # NearMiss(version=1),
            # EditedNearestNeighbours(),
            # AllKNN(),
            # CondensedNearestNeighbour(random_state=0),
            # InstanceHardnessThreshold(random_state=0,
            #                          estimator=LogisticRegression(solver='lbfgs', multi_class='auto')),
            RandomOverSampler(random_state=0),
            SMOTE(),
            BorderlineSMOTE(),
            SMOTEENN(),
            SMOTETomek(),
            ADASYN()
        ]
        test_C = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2, 1e3]
        test_C_linear = [1e-3, 1e-2, 1e-1, 1e0, 1e1, 1e2]

        # gamma default parameters
        param_scale = 1 / (X_train.shape[1] * np.mean(X_train.var()))

        parameters = [{
            'scaler': test_scaler,
            'sampling': test_sampling,
            'feat__cols': selected_features,
            'model__C': test_C_linear,  # default C=1
            'model__kernel': ['linear']
        }]

        # If no missing values, only one imputer strategy shall be used
        if X_train.isna().sum().sum() > 0:
            # `parameters` is a one-element list of dicts, so index it first
            parameters[0]['imputer__strategy'] = [
                'mean', 'median', 'most_frequent'
            ]
            print("Missing values used. Test different imputer strategies")
        else:
            print("No missing values. No imputer necessary")

            print("Selected Parameters: ", parameters)
        # else:
        print("Parameters defined in the input: ", parameters)

        return parameters
Code example #32
def test_borderline_smote_wrong_kind(data):
    bsmote = BorderlineSMOTE(kind='rand')
    with pytest.raises(ValueError, match='The possible "kind" of algorithm'):
        bsmote.fit_resample(*data)