Example #1
0
def _load_revc_psednc_vecs(path, revc_kmer, psednc, lamada):
    """Return the combined RevcKmer + PseDNC feature matrix for one file.

    The file at ``path`` is opened twice so that each extractor receives its
    own fresh handle.  The RevcKmer vectors are kept whole; from the PseDNC
    vectors only the last ``lamada`` columns are appended.
    """
    with open(path) as fp:
        revc_kmer_vecs = np.array(revc_kmer.make_revckmer_vec(fp))
    with open(path) as fp:
        psednc_vecs = np.array(psednc.make_psednc_vec(fp))
    return np.column_stack((revc_kmer_vecs, psednc_vecs[:, -lamada:]))


def cv5_smote_revc_psednc(fold_path, filename, k):
    """Write train/test feature files for 5 folds, oversampling the positives.

    For every fold index ``i`` in 0..4 the files ``test_neg_i``, ``test_pos_i``,
    ``train_neg_i`` and ``train_pos_i`` are read, converted to RevcKmer +
    PseDNC vectors, and written out through ``write_libsvm`` with label 1 for
    positive and -1 for negative vectors.  The positive training vectors are
    extended with the vectors returned by two ``smote.smote`` calls (N=100 and
    N=50) before writing.

    Args:
        fold_path: String prepended verbatim to every input and output file
            name (no path separator is inserted).
        filename: Stem of the output files, which are named
            ``<filename>_6_0.8_test_<i>.txt`` / ``..._train_<i>.txt``.
        k: Passed to ``RevcKmer`` as its ``k`` argument.
    """
    lamada = 6
    w = 0.8
    revc_kmer = RevcKmer(k=k, normalize=True, upto=True)
    psednc = PseDNC(lamada, w)
    # Tag embedded in the output file names; constant across folds.
    n_lamada = "_".join([str(lamada), str(w)])

    for i in range(5):
        fold = str(i)

        # Generate RevcKmer_PseDNC vecs for the four partitions of this fold.
        test_neg_vecs = _load_revc_psednc_vecs(
            fold_path + "test_neg_" + fold, revc_kmer, psednc, lamada)
        test_pos_vecs = _load_revc_psednc_vecs(
            fold_path + "test_pos_" + fold, revc_kmer, psednc, lamada)
        train_neg_vecs = _load_revc_psednc_vecs(
            fold_path + "train_neg_" + fold, revc_kmer, psednc, lamada)
        train_pos_vecs = _load_revc_psednc_vecs(
            fold_path + "train_pos_" + fold, revc_kmer, psednc, lamada)

        # Generate synthetic vecs from the positive training vecs only.
        synthetic1 = (smote.smote(train_pos_vecs, N=100, k=5)).tolist()
        synthetic2 = (smote.smote(train_pos_vecs, N=50, k=5)).tolist()
        # np.vstack is the supported spelling; np.row_stack is a deprecated
        # alias of it.
        synthetic = np.vstack((synthetic1, synthetic2))

        # Write test file (no synthetic vectors are added to the test set).
        write_file = fold_path + filename + '_' + n_lamada + "_test_" + fold + ".txt"
        test_vecs = test_pos_vecs.tolist() + test_neg_vecs.tolist()
        test_vecs_labels = [1] * len(test_pos_vecs) + [-1] * len(test_neg_vecs)
        write_libsvm(test_vecs, test_vecs_labels, write_file)

        # Write train file: real positives, synthetic positives, then negatives.
        write_file = fold_path + filename + '_' + n_lamada + "_train_" + fold + ".txt"
        augmented_pos_vecs = train_pos_vecs.tolist() + synthetic.tolist()
        train_vecs = augmented_pos_vecs + train_neg_vecs.tolist()
        train_vecs_labels = [1] * len(augmented_pos_vecs) + [-1] * len(train_neg_vecs)
        write_libsvm(train_vecs, train_vecs_labels, write_file)
Example #2
0
    def _oversample(self, class_name, rate):
        """
        Oversample examples of a class
        :param class_name: string, class name
        :param rate: float, rate of oversampling, 1 corresponds to 100%
        :return: nothing, generated examples are added to self._df_synthetic
        """
        # Build the row mask for the requested class once and reuse it.
        in_class = self._df['Class'].isin([class_name])
        n_examples = self._df['Id'][in_class].count()
        # All synthetic rows get the class-column values of the first example.
        labels = self._df[self._class_columns][in_class].values[0]

        # One flattened image per row, used as the input matrix for smote.
        images = np.zeros((n_examples, self._image_height * self._image_width))
        # Series.items() is used because Series.iteritems() was removed in
        # pandas 2.0; both yield (index, value) pairs.
        for row, (_, f) in enumerate(self._df.Image[in_class].items()):
            images[row] = self._read_image(f).flatten()

        # Number of synthetic examples to request, rounded up.
        n = int(math.ceil(n_examples * rate))

        synthetic_examples = smote(images, n, n_neighbours=5)

        df = pd.DataFrame(index=np.arange(0, n), columns=self._df_synthetic.columns.values)

        for i, img in enumerate(synthetic_examples):
            # Write through df.at / df.loc directly: assigning on the
            # temporary row returned by df.loc[i] is chained assignment and
            # is not guaranteed to modify df.
            df.at[i, 'Id'] = 's_{}_{}'.format(class_name, i)
            # .at stores the 2-D array as a single cell value.
            df.at[i, 'Image'] = img.reshape((self._image_height, self._image_width))
            df.loc[i, self._class_columns] = labels

        # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
        self._df_synthetic = pd.concat([self._df_synthetic, df], ignore_index=True)
Example #3
0
def cv5_smote_revc_kmer(fold_path, filename, k):
    """Write RevcKmer train/test files for 5 folds with oversampled positives.

    For each fold index 0..4 the four partition files (``test_neg_``,
    ``test_pos_``, ``train_neg_``, ``train_pos_`` followed by the index) are
    converted to RevcKmer vectors.  The positive training vectors are extended
    with the output of ``smote.smote(..., N=200, k=5)``.  Both sets are written
    through ``write_libsvm`` with label 1 for positives and -1 for negatives.

    ``fold_path`` is prepended verbatim to every file name; ``filename`` is the
    stem of the ``<filename>_test_<i>.txt`` / ``<filename>_train_<i>.txt``
    outputs; ``k`` is forwarded to ``RevcKmer``.
    """
    extractor = RevcKmer(k=k, normalize=True, upto=True)

    for fold in range(5):
        tag = str(fold)

        # Feature vectors for the four partitions of this fold.
        with open(fold_path + "test_neg_" + tag) as handle:
            neg_test = extractor.make_revckmer_vec(handle)
        with open(fold_path + "test_pos_" + tag) as handle:
            pos_test = extractor.make_revckmer_vec(handle)
        with open(fold_path + "train_neg_" + tag) as handle:
            neg_train = extractor.make_revckmer_vec(handle)
        with open(fold_path + "train_pos_" + tag) as handle:
            # Only this partition is turned into an array, for smote.
            pos_train = np.array(extractor.make_revckmer_vec(handle))

        # Synthetic positives derived from the real positive training vectors.
        oversampled = smote.smote(pos_train, N=200, k=5)

        # Test set: positives first, then negatives, untouched by smote.
        test_path = fold_path + filename + "_test_" + tag + ".txt"
        test_labels = [1] * len(pos_test) + [-1] * len(neg_test)
        write_libsvm(pos_test + neg_test, test_labels, test_path)

        # Train set: real + synthetic positives, then negatives.
        train_path = fold_path + filename + "_train_" + tag + ".txt"
        all_pos_train = pos_train.tolist() + oversampled.tolist()
        train_labels = [1] * len(all_pos_train) + [-1] * len(neg_train)
        write_libsvm(all_pos_train + neg_train, train_labels, train_path)
Example #4
0
def readSmoteDataset(file, properties):
    """Read ``tera/<file>.csv``, run it through ``smote`` and return the result.

    Args:
        file: Base name of the CSV file, without directory or extension.
        properties: Not used by this function.

    Returns:
        A 2-tuple: the first element of the ``smote`` result converted to a
        NumPy array, and the last element of the ``smote`` result unchanged.
    """
    prefix = "tera/"
    suffix = ".csv"
    # The context manager closes the handle once the data has been read, even
    # if smote() raises; the original left the file open.
    # NOTE(review): 'rb' is the Python 2 convention for csv.reader; Python 3's
    # csv module expects a text-mode file - confirm the target interpreter.
    with open(prefix + file + suffix, 'rb') as finput:
        reader = csv.reader(finput, delimiter=',')
        dataread = smote(reader)
        # keeping the same format as Joe's code
        return np.array(dataread[0]), dataread[-1]
Example #5
0
def contents(file, sep= The.reader.sep, bad= The.reader.bad, k=3):
    """Read a delimited data file into a list of ``(row number, values)`` entries.

    The first ``k`` columns of every row are dropped.  Entry 0 holds the
    header names; every later entry holds numeric values.

    When the module-level ``SMOTE`` flag is ``True`` the rows come from
    ``smote(csv.reader(...))`` (comma-delimited) and every entry is a
    two-item list.  The header is taken from the raw first line split on
    ``','``, so its last name keeps the line terminator.

    Otherwise the file is parsed line by line: matches of ``bad`` are removed,
    the line is split on ``sep``, and the values are converted with ``float``.
    If any value of a row is not a number, the whole row is encoded as 1 for
    ``'Y'`` and 0 for anything else.  In this mode the header entry is a list
    and the data entries are tuples.

    Args:
        file: Path of the file to read.
        sep: Field separator used in the non-SMOTE mode.
        bad: Regular expression whose matches are stripped from each line in
            the non-SMOTE mode.
        k: Number of leading columns to discard.

    Returns:
        The list of entries described above.
    """
    import csv
    if SMOTE is True:
        # Read everything that needs the handle inside the with-block so the
        # file is closed afterwards.
        with open(file, "r") as f:
            reader = csv.reader(f, delimiter=',')
            dataread = smote(reader)
            # Rewind to pick up the header line as the first entry.
            f.seek(0)
            header = f.readline().split(',')[k:]
        dataread[0].insert(0, header)
        ret = []
        for n, line in enumerate(dataread[0]):
            if n != 0:
                line = [float(l) for l in line]
            ret.append([n, line])
        return ret

    ret = []
    with open(file) as f:
        for n, line in enumerate(f):
            line = re.sub(bad, "", line)  # kill white space
            fields = line.split(sep)[k:]
            if n == 0:
                ret.append([n, fields])
                continue
            try:
                ret.append((n, [float(x) for x in fields]))
            except ValueError:
                # float() rejects non-numeric text: treat the row as Y/N flags.
                ret.append((n, [1 if x == 'Y' else 0 for x in fields]))
    return ret
 def doSmote(self):
     """Resample the training data with ``smote.smote`` and store the result.

     ``train_X`` and ``train_y`` are joined column-wise and handed to
     ``smote.smote(..., 5)``; the frame returned by its ``fit_transform()``
     is relabelled with the column names of ``self.data`` plus
     ``self.class_label``.  Afterwards ``self.train_y`` holds the class-label
     column, and ``self.data`` and ``self.train_X`` both refer to the same
     frame with that column dropped.
     """
     combined = pd.concat([self.train_X, self.train_y], axis=1)
     # Names for the resampled frame: current data columns, then the label.
     labelled_columns = self.data.columns.values.tolist() + [self.class_label]

     resampled = smote.smote(combined, 5).fit_transform()
     self.data = resampled
     resampled.columns = labelled_columns

     # Take the label column out before it is dropped in place below.
     self.train_y = resampled[self.class_label]
     resampled.drop([self.class_label], axis=1, inplace=True)
     self.train_X = resampled
Example #7
0
def smotify(model=MODEL(), rows=None, k=5, factor = 100):
  """Generate smote clones for every class of ``model``.

  ``klazzify(model, rows)`` is called first; afterwards ``model.classes`` is
  read as a mapping from class key to that class's rows.  Each class is
  passed to ``smote`` with ``N = factor``, except classes smaller than the
  midpoint between the largest and smallest class size, which get a
  proportionally larger ``N``.

  :param model: model object; NOTE the default ``MODEL()`` is created once,
                when the function is defined, and shared by all calls
  :param rows: rows to classify; ``model._rows`` is used when None
  :param k: forwarded to ``smote`` as ``k``
  :param factor: base value of ``N`` passed to ``smote``
  :return: list concatenating the clones returned by ``smote`` for each class
  """
  # Identity test: "== None" would compare element-wise for array-like rows.
  if rows is None:
    rows = model._rows

  klazzify(model, rows)
  sizes = [len(model.classes[key]) for key in model.classes]
  maxLen, minLen = max(sizes), min(sizes)
  clones = []
  for key in model.classes:
    size = len(model.classes[key])
    f = factor
    # Under-represented classes get a larger oversampling amount.
    if size < ((maxLen + minLen)/2) :
      f = (factor*(maxLen + minLen)/2)/size
    clones += smote(model, model.classes[key], k=k, N = int(ceil(f)))
  return clones
Example #8
0
def smote_sample(x, y, N):
    """Return ``x`` extended with synthetic 'True' rows, in shuffled row order.

    ``x`` is split with ``true_false_split`` into its 'True' and 'False'
    rows.  ``smote(x_true, 5, N)`` supplies the synthetic rows, whose column 2
    is overwritten with 1 before they are stacked with the real rows.

    :param x: data rows
    :param y: labels, passed to ``count_true`` and ``true_false_split``
    :param N: forwarded to ``smote`` as its third argument
    :return: array of the 'False' rows, 'True' rows and synthetic rows,
             shuffled along the first axis
    """
    # The two counts are not used further down; the calls are kept as they were.
    true_count = count_true(y)
    false_count = np.size(y) - true_count

    positives = true_false_split(x, y, 'True')
    negatives = true_false_split(x, y, 'False')

    # generate synthetic true values
    synthetic = smote(positives, 5, N)
    # fix column of lowest sym_order to 1 for these, so they are classed as having a sym
    synthetic[:, 2] = np.ones(synthetic.shape[0])

    all_positives = np.vstack((positives, synthetic))
    combined = np.vstack((negatives, all_positives))
    np.random.shuffle(combined)
    return combined
Example #9
0
  def tune_SMOTE(train_pd):
    """
    Tune the smote parameters on a 70/30 split of `train_pd`.

    70% of the rows are sampled as the new training set; the rest form the
    tuning set.  Reads `isWhat`, `learner`, `target_class` and `goal` from
    the enclosing scope.

    :param train_pd: pandas DataFrame holding the training rows
    :return: (params, new_train) when "_TunedSmote" is in `isWhat`,
             otherwise None
    """
    train_len = len(train_pd)
    # random.sample requires a real sequence (enforced since Python 3.11), so
    # the index is materialised as a list first.
    new_train_index = random.sample(list(train_pd.index), int(train_len * 0.7))
    # The sampled values are index labels, hence .loc (.ix was removed in
    # pandas 1.0).
    new_train = train_pd.loc[new_train_index]
    if "_TunedSmote" not in isWhat:
      return None

    new_tune = train_pd.drop(new_train_index)
    A_smote = smote(new_train)
    # One [low, high] range per non-majority label: 50%..150% of the
    # majority count.
    num_range = [[int(A_smote.get_majority_num() * 0.5),
                  int(A_smote.get_majority_num() * 1.5)]] * (
                  A_smote.label_num - 1)
    params_to_tune = {"k": [2, 20], "up_to_num": num_range}
    tuner = DE_Tune_SMOTE(learner, smote, params_to_tune, new_train,
                          new_tune, target_class, goal)
    params = tuner.Tune()
    return params, new_train
Example #10
0
            fpr, tpr = cross_validation(X, y[:, 0], clf, thresh)
            expB[j, i + 1, :] = [fpr, tpr]

            # Gaussian Naive Bayes classifier on Plain Under-sampled data
            clf = GaussianNB()
            fpr, tpr = cross_validation(X, y[:, 0], clf, NBthresh)
            print 'Gauss NB Class Priors [-ve class, +ve class]: ', clf.class_prior_
            priors.append(clf.class_prior_)
            expC[j, i + 1, :] = [fpr, tpr]

            # k-NN classifier on Plain Under-sampled data
            clf = KNeighborsClassifier(n_neighbors=KNN)
            fpr, tpr = cross_validation(X, y[:, 0], clf, thresh)
            expE[j, i + 1, :] = [fpr, tpr]

            dataC = smote.smote(data, minority_overSample_percent[j], 5)

            X = dataC[:, [1, 0]]
            y = dataC[:, [data.shape[1] - 1]]

            unique, counts = np.unique(dataC[:, [data.shape[1] - 1]],
                                       return_counts=True)
            freq = dict(zip(unique, counts))
            nPositive = freq[1.0]
            nNegative = freq[0.0]
            print 'SMOTED by ' + str(minority_overSample_percent[j]) + ' %'
            print '+ve Class: ', nPositive, ' -ve Class: ', nNegative

            dataC = smote.underSMOTE(dataC, majority_underSample_percent[i])

            unique, counts = np.unique(dataC[:, [data.shape[1] - 1]],
Example #11
0
def cross_val(pd_data, learner, target_class, goal, isWhat="", fold=5,
              repeats=2):
  """
  Run repeated stratified k-fold cross-validation of `learner` on `pd_data`.

  The last column of `pd_data` is used as the class label and all other
  columns as features.  Substrings of `isWhat` switch on optional steps:
  "Smote" resamples each training fold with `smote`, "_TunedSmote"
  additionally tunes the smote parameters first, and "_TunedLearner" tunes
  the learner before it is evaluated.

  :param pd_data: pandas DataFrame, class label in the last column
  :param learner: callable invoked as
                  learner(train_X, train_Y, test_X, test_Y, goal)
  :param target_class: forwarded to the tuners
  :param goal: forwarded to the learner and the tuners
  :param isWhat: string of option flags, see above
  :param fold: number of folds per repeat
  :param repeats: number of times the whole k-fold run is repeated
  :return: (avg_evaluation, F): the summed tuning evaluation divided by
           repeats * fold, and the object threaded through every learn() call
  """

  def tune_learner(train_X):
    """Tune the learner on a split of the current training fold.

    Reads `train_Y` from the enclosing scope.  Returns tuner.Tune(), which
    the caller unpacks as (params, evaluation).
    """
    train_len = len(train_X)
    # The sample size must be an integer; a float size is rejected by NumPy.
    # NOTE(review): np.random.choice samples with replacement by default.
    new_train_index = np.random.choice(range(train_len), int(train_len * 0.7))
    new_tune_index = list(set(range(train_len)) - set(new_train_index))
    new_train_X = train_X[new_train_index]
    new_train_Y = train_Y[new_train_index]
    new_tune_X = train_X[new_tune_index]
    new_tune_Y = train_Y[new_tune_index]
    clf = learner(new_train_X, new_train_Y, new_tune_X, new_tune_Y, goal)
    tuner = DE_Tune_ML(clf, clf.get_param(), target_class, goal)
    return tuner.Tune()

  def tune_SMOTE(train_pd):
    """Tune the smote parameters on a 70/30 split of `train_pd`.

    Returns (params, new_train) when "_TunedSmote" is in `isWhat`, otherwise
    None.
    """
    train_len = len(train_pd)
    # random.sample requires a real sequence (enforced since Python 3.11).
    new_train_index = random.sample(list(train_pd.index), int(train_len * 0.7))
    # The sampled values are index labels, hence .loc (.ix was removed in
    # pandas 1.0).
    new_train = train_pd.loc[new_train_index]
    if "_TunedSmote" not in isWhat:
      return None

    new_tune = train_pd.drop(new_train_index)
    A_smote = smote(new_train)
    # One [low, high] range per non-majority label: 50%..150% of the
    # majority count.
    num_range = [[int(A_smote.get_majority_num() * 0.5),
                  int(A_smote.get_majority_num() * 1.5)]] * (
                  A_smote.label_num - 1)
    params_to_tune = {"k": [2, 20], "up_to_num": num_range}
    tuner = DE_Tune_SMOTE(learner, smote, params_to_tune, new_train,
                          new_tune, target_class, goal)
    params = tuner.Tune()
    return params, new_train

  F = {}
  total_evaluation = 0
  # range() works on Python 2 and 3; xrange() exists on Python 2 only.
  for _ in range(repeats):
    # NOTE(review): labels passed to the constructor and direct iteration look
    # like the legacy sklearn.cross_validation.StratifiedKFold API - confirm
    # against this file's imports before upgrading scikit-learn.
    kf = StratifiedKFold(pd_data.loc[:, pd_data.columns[-1]].values, fold, shuffle=True)
    for train_index, test_index in kf:
      # The fold indices are row positions, so select by position.
      train_pd = pd_data.iloc[train_index]
      test_pd = pd_data.iloc[test_index]
      if "Smote" in isWhat:
        k = 5
        up_to_num = []
        if "_TunedSmote" in isWhat:
          params, train_pd = tune_SMOTE(train_pd)
          # use new training data not original, because some are used as tuning
          k = params["k"]
          up_to_num = params["up_to_num"]
        train_pd = smote(train_pd, k, up_to_num).run()

      # Features are every column but the last; the label is the last column.
      train_X = train_pd.loc[:, train_pd.columns[:-1]].values
      train_Y = train_pd.loc[:, train_pd.columns[-1]].values
      test_X = test_pd.loc[:, test_pd.columns[:-1]].values
      test_Y = test_pd.loc[:, test_pd.columns[-1]].values
      params, evaluation = tune_learner(train_X) if "_TunedLearner" in isWhat else ({},0)
      F = learner(train_X, train_Y, test_X, test_Y, goal).learn(F, **params)
      total_evaluation +=evaluation
  avg_evaluation = total_evaluation / (repeats * fold)
  return avg_evaluation, F
Example #12
0
        #

        minoritySamples, majoritySamples = getSeparatedSamples('Input/diabetes.csv')

        print ("Number of Miniority Samples:" + str(len(minoritySamples)))
        print ("Number of Majority Samples:" + str(len(majoritySamples)))
        # print minoritySamples[0]

        minorityCounter = len(minoritySamples)
        majorityCounter = len(majoritySamples)

        underSampledMajoritySamples = underSample(minorityCounter, 100, majoritySamples, majorityCounter)
        underSampleOnlyHelper(minoritySamples)

        smoteHelper(underSampledMajoritySamples)
        smote(minorityCounter, 100, 5, minoritySamples)

        # plotROC(majoritySamples, minoritySamples)
        # treeClassifierLogisticRegression(majoritySamples, minoritySamples)
        # treeClassifier(majoritySamples, minoritySamples)


        treeClassifier2('Output/diabetes_Smote.csv')
        treeClassifier2('Output/diabetes_Under.csv')
        # plotConvexHull()

        naiveBayes(majoritySamples, minoritySamples)

        # csvfile.close

    except Exception as error:
    num_pos = sum([1 for sample_id in _Y if _Y[sample_id] == 1])
    num_neg = sum([1 for sample_id in _Y if not _Y[sample_id] == -1])

    return _X, _Y, num_pos, num_neg


if __name__ == '__main__':
    # Command line: <experiment name> <smote neighbours> <enn neighbours>
    exp_name = sys.argv[1]
    neighbours_smote, neighbours_enn = int(sys.argv[2]), int(sys.argv[3])

    # Output directory for this experiment; os.makedirs raises if it exists.
    path = './{}'.format(exp_name)
    os.makedirs(path)

    problems = ['1']

    for p in problems:
        # Pipeline per problem: read -> smote -> enn -> save.
        X, Y, num_pos, num_neg = read_data(p)
        X, Y, num_pos, num_neg = smote(X, Y, num_pos, num_neg, neighbours_smote)
        X, Y, num_pos, num_neg = enn(X, Y, neighbours_enn)

        path_file = '{}/{}_X.tsv'.format(path, p)
        save_data(path_file, X, Y)