Example #1
 def pre_training_without_SMOTE(self, pre_training_ratio):
     # assumes module-level imports: numpy as np, collections.Counter,
     # sklearn.linear_model.ElasticNet
     clf = ElasticNet(max_iter=10000)
     # self.smote_for_positive()
     positive_index = np.where(self.labels == 1)
     self.labels = [0 if label == -1 else label for label in self.labels]
     # verify the count of labeling negative
     label_negative_count = int(pre_training_ratio *
                                np.bincount(self.labels)[0])
     # fit pre-training model
     clf.fit(self.samples, self.labels)
     # get and sort distances
     distances = clf.predict(self.samples)  # public equivalent of the private _decision_function
     distances_copy = distances.copy()
     distances_copy.sort()
     # get threshold
     negative_threshold = distances_copy[label_negative_count]
     self.labels = np.array(
         [-1 if label == 0 else label for label in self.labels])
     label_negative_count_cur = 0
     for i in range(self.length):
         if label_negative_count_cur >= label_negative_count:
             break
         if distances[i] <= negative_threshold and i not in positive_index[0]:
             self.labels[i] = 0
             label_negative_count_cur += 1
     print(Counter(self.labels))
     return self.labels[:self.length]
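The method above fits ElasticNet on PU-style labels (known positives as 1, everything else as 0), then marks the lowest-scoring fraction of the unlabeled pool as reliable negatives. A minimal self-contained sketch of that idea on synthetic data; the data, the ratio, the small alpha, and all names below are illustrative assumptions, not part of the original class:

import numpy as np
from collections import Counter
from sklearn.linear_model import ElasticNet

rng = np.random.default_rng(0)

# Synthetic PU data: 1 = known positive, -1 = unlabeled.
X = rng.normal(size=(200, 5))
labels = np.full(200, -1)
labels[X[:, 0] > 1.0] = 1              # pretend a high first feature marks positives

pre_training_ratio = 0.3               # fraction of unlabeled to call negative

# Fit the regressor with unlabeled treated as 0, as in the method above
# (small alpha so the toy problem keeps a nonzero coefficient).
y = np.where(labels == 1, 1, 0)
clf = ElasticNet(alpha=0.01, max_iter=10000)
clf.fit(X, y)
scores = clf.predict(X)                # the linear score the method thresholds

# Everything scoring below the cut, and not a known positive, becomes a
# reliable negative (the original additionally caps how many are added).
n_neg = int(pre_training_ratio * np.sum(y == 0))
threshold = np.sort(scores)[n_neg]
labels[(scores <= threshold) & (labels == -1)] = 0
print(Counter(labels))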
Example #2
 def pre_training(self, pre_training_ratio):
     # clf = SVC(kernel='linear', probability=True, class_weight='balanced')
     # clf = SGDClassifier(loss='log', max_iter=10000)
     # clf = LogisticRegression(max_iter=10000)
     clf = ElasticNet(max_iter=10000)
     # clf = RandomForestClassifier()
     self.smote_for_positive()
     positive_index = np.where(self.labels == 1)
     self.labels = [0 if label == -1 else label for label in self.labels]
     # verify the count of labeling negative
     label_negative_count = int(pre_training_ratio *
                                np.bincount(self.labels)[0])
     # fit pre-training model
     clf.fit(self.samples, self.labels)
     # get and sort distances
     distances = clf.predict(self.samples)  # public equivalent of _decision_function
     # proba = clf.predict_proba(self.samples)
     # print([distances[index] for index in positive_index])
     distances_copy = distances.copy()
     distances_copy.sort()
     # get threshold
     negative_threshold = distances_copy[label_negative_count]
     self.labels = np.array([
         0 if distance < negative_threshold else -1
         for distance in distances
     ])
     self.labels[positive_index] = 1
     return self.labels[:self.length]
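Example #2 differs from #1 in calling self.smote_for_positive() before fitting, i.e. oversampling the scarce positives first, and in relabeling every sample from the distance threshold rather than only topping up negatives. If that helper behaves like standard SMOTE, the oversampling step can be approximated with imbalanced-learn; a hedged sketch, assuming imblearn is installed and that smote_for_positive is SMOTE-like:

import numpy as np
from imblearn.over_sampling import SMOTE

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = np.zeros(200, dtype=int)
y[:30] = 1                       # 30 known positives, 170 unlabeled treated as 0

# Oversample the positives so the pre-training fit is not dominated by the
# unlabeled majority; fit_resample returns the augmented arrays.
X_res, y_res = SMOTE(random_state=0).fit_resample(X, y)
print(np.bincount(y), '->', np.bincount(y_res))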
Example #3
    def add_reliable_samples(self,
                             class_prior,
                             speed,
                             add_ratio,
                             real_label,
                             model=RandomForestClassifier(n_estimators=100)):
        # note: the default estimator is built once at definition time and
        # shared across calls; pass a fresh model to avoid stale fitted state
        max_label_count = int(add_ratio * self.length)
        label_count = 0
        positive_label_count = 0
        positive_count_each_round = int(speed * class_prior)
        negative_count_each_round = int(speed * (1 - class_prior))
        while label_count < max_label_count:
            self.smote_for_positive()
            labeled_index = np.where(self.labels != -1)[0]
            unlabeled_index = np.where(self.labels == -1)[0]
            labeled_set = self.samples[labeled_index]
            unlabeled_set = self.samples[unlabeled_index]
            if len(unlabeled_set) == 0:
                break
            '''
                random forest
            '''
            clf = model
            clf.fit(labeled_set, self.labels[labeled_index])
            prob_list = clf.predict_proba(unlabeled_set)[:, 1]
            '''
                linear model
            '''
            clf_2 = ElasticNet(max_iter=10000)
            clf_2.fit(labeled_set, self.labels[labeled_index])
            dis_list = clf_2.predict(unlabeled_set)  # linear score

            sorted_probs = np.sort(prob_list)  # ascending copy of the RF probabilities
            '''probability thresholds for negative and positive selection'''
            negative_threshold = sorted_probs[negative_count_each_round - 1]
            positive_threshold = sorted_probs[-positive_count_each_round]
            '''use the linear model to confirm the probability-based selection'''
            select_dis = np.sort(dis_list[prob_list <= negative_threshold])
            negative_dis_threshold = select_dis[negative_count_each_round - 1]

            select_dis = np.sort(dis_list[prob_list >= positive_threshold])
            positive_dis_threshold = select_dis[-positive_count_each_round]

            positive_count_cur = 0
            negative_count_cur = 0

            for i in range(len(prob_list)):
                if (prob_list[i] <= negative_threshold
                        and dis_list[i] <= negative_dis_threshold
                        and negative_count_cur < negative_count_each_round):
                    label_count += 1
                    negative_count_cur += 1
                    self.labels[unlabeled_index[i]] = 0
                elif (prob_list[i] >= positive_threshold
                        and dis_list[i] >= positive_dis_threshold
                        and positive_count_cur < positive_count_each_round):
                    label_count += 1
                    positive_label_count += 1
                    positive_count_cur += 1
                    self.labels[unlabeled_index[i]] = real_label[unlabeled_index[i]]
                    print(real_label[unlabeled_index[i]])
                if label_count > max_label_count:
                    break
            print(prob_list.max(), prob_list.min())
            print(label_count, positive_label_count)
        print('finish add reliable samples')
        return self.labels[:self.length], positive_label_count
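The acceptance rule inside the loop above adds an unlabeled sample only when the random-forest probability and the linear-model score both place it in the same extreme tail. A standalone sketch of just that two-model agreement step, with synthetic scores standing in for prob_list and dis_list; all counts and names below are illustrative:

import numpy as np

rng = np.random.default_rng(0)
n_unlabeled = 100
prob_list = rng.random(n_unlabeled)        # stand-in for RF predict_proba[:, 1]
dis_list = rng.normal(size=n_unlabeled)    # stand-in for the linear-model score

positive_per_round = 5
negative_per_round = 15

# Thresholds from the probability ranking.
sorted_probs = np.sort(prob_list)
neg_prob_thr = sorted_probs[negative_per_round - 1]
pos_prob_thr = sorted_probs[-positive_per_round]

# Thresholds from the linear scores of the prob-selected candidates.
neg_dis_thr = np.sort(dis_list[prob_list <= neg_prob_thr])[negative_per_round - 1]
pos_dis_thr = np.sort(dis_list[prob_list >= pos_prob_thr])[-positive_per_round]

# A sample is accepted only when both rankings agree on its tail.
new_negatives = (prob_list <= neg_prob_thr) & (dis_list <= neg_dis_thr)
new_positives = (prob_list >= pos_prob_thr) & (dis_list >= pos_dis_thr)
print(new_negatives.sum(), 'negatives,', new_positives.sum(), 'positives accepted')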