Example #1
    def initialize(self, X, k, random_seed, method='naive'):
        if method == 'naive':
            # Randomly pick k data points to be the centroids of the k clusters
            centroids = resample(X, n_samples=k, random_state=random_seed, replace=False)
        elif method == 'kmeans++': # https://en.wikipedia.org/wiki/K-means%2B%2B
            # Step 1: Choose one center uniformly at random from among the data points
            centroids = resample(X, n_samples=1, random_state=random_seed, replace=False)
            N = len(X)
            # Sample the remaining k-1 centroids
            rng = np.random.RandomState(random_seed)  # seed the weighted draws so they are reproducible
            for i in range(1, k):
                distances = [ -1 ] * N
                # Step 2: For each data point x, compute D(x)
                for j in range(N):
                    # The distance between x and the nearest center that has already been chosen
                    distances[j] = min(np.linalg.norm(X[j] - centroid) for centroid in centroids)

                # Step 3: Choose one new data point at random as a new center,
                # using a weighted probability distribution where a point x is chosen with probability proportional to D(x)^2
                square_distances = [ distance ** 2 for distance in distances ]
                total_square_distance = sum(square_distances)
                # Already-selected points are naturally excluded, because their probability is 0
                probabilities = [ square_distance / total_square_distance for square_distance in square_distances ]

                new_centroid_index = rng.choice(range(N), size=1, replace=False, p=probabilities)[0]

                centroids = np.append(centroids, [ X[new_centroid_index] ], axis=0)

        return centroids
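As a standalone sanity check of the 'naive' branch above: sklearn.utils.resample with replace=False draws k distinct rows to serve as initial centroids (a minimal sketch on toy data, not part of the original class):

import numpy as np
from sklearn.utils import resample

X = np.random.RandomState(0).rand(100, 2)
centroids = resample(X, n_samples=3, random_state=42, replace=False)
print(centroids.shape)  # (3, 2): three distinct data points picked as initial centroids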
Example #2
def run_scikit_digits(epochs=0, layers=0, neuron_count=0):
    """ Run Handwritten Digits dataset from Scikit-Learn.  Learning set is split
    into 70% for training, 15% for testing, and 15% for validation.

    Parameters
    ----------
    epochs : int
        Number of iterations of the training loop over the whole dataset
    layers : int
        Number of layers (excluding the input layer but including the output
        layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), not counting
        the bias term

    Attributes
    ----------
    target_values : list
        The possible values for each training vector

    """

    # Imported from linear_neuron
    temp_digits = datasets.load_digits()
    digits = utils.resample(temp_digits.data, random_state=3)
    temp_answers = utils.resample(temp_digits.target, random_state=3)
    # images = utils.resample(temp_digits.images, random_state=0)
    num_of_training_vectors = 1250
    answers, answers_to_test, validation_answers = (
        temp_answers[:num_of_training_vectors],
        temp_answers[num_of_training_vectors : num_of_training_vectors + 260],
        temp_answers[num_of_training_vectors + 260 :],
    )
    training_set, testing_set, validation_set = (
        digits[:num_of_training_vectors],
        digits[num_of_training_vectors : num_of_training_vectors + 260],
        digits[num_of_training_vectors + 260 :],
    )

    ###########
    # network.visualization(training_set[10], answers[10])
    # network.visualization(training_set[11], answers[11])
    # network.visualization(training_set[12], answers[12])

    network = Network(layers, neuron_count, training_set[0])
    network.train(training_set, answers, epochs)
    with open("my_net.pickle", "wb") as f:
        dill.dump(network, f)
    # To reload the pickled network later:
    # with open('my_net.pickle', 'rb') as fr:
    #     network = dill.load(fr)
    # guess_list = network.run_unseen(testing_set)
    return network.run_unseen(testing_set)
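A hypothetical invocation sketch; the epochs/layers/neuron_count values below are illustrative rather than from the source, and `Network` comes from the surrounding project:

guesses = run_scikit_digits(epochs=50, layers=2, neuron_count=[30, 10])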
Example #3
def test_resample():
    # Border case not worth mentioning in doctests
    assert resample() is None

    # Check that invalid arguments yield ValueError
    assert_raises(ValueError, resample, [0], [0, 1])
    assert_raises(ValueError, resample, [0, 1], [0, 1],
                  replace=False, n_samples=3)
    assert_raises(ValueError, resample, [0, 1], [0, 1], meaning_of_life=42)
    # Issue:6581, n_samples can be more when replace is True (default).
    assert_equal(len(resample([1, 2], n_samples=5)), 5)
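For reference, a small sketch of the behaviors this test exercises: paired inputs are resampled with the same indices, and n_samples may exceed the input length when replace=True (the default):

from sklearn.utils import resample

a = [0, 1, 2, 3]
b = ['w', 'x', 'y', 'z']
ra, rb = resample(a, b, n_samples=2, random_state=0)  # ra and rb stay index-aligned
big = resample(a, n_samples=10, random_state=0)       # allowed because replace=True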
Example #4
    def resample_training_dataset(self, labels, feature_array, sizes=(5000, 500)):
        """
        Inputs:
            - labels
            - features
            - sizes: tuple with, for each class (0, 1, etc.), the number of training chunks you want,
              e.g. for 5000 baseline and 500 seizure chunks, sizes = (5000, 500), since class 0 is
              baseline and class 1 is seizure.
        Takes labels and features and returns resampled (class-balanced) versions of both.

        WARNING: Up-sampling the target class prevents the random forest OOB estimate from being accurate.
        """
        if len(labels.shape) == 1:
            labels = labels[:, None]

        resampled_labels = []
        resampled_features = []
        for i, label in enumerate(np.unique(labels.astype('int'))):
            class_inds = np.where(labels == label)[0]

            class_labels = labels[class_inds]
            class_features = feature_array[class_inds, :]

            if class_features.shape[0] < sizes[i]:  # need to oversample
                # Use a distinct loop variable (j) so the enumerate index i is not clobbered
                class_features_duplicated = np.vstack([class_features for j in range(int(sizes[i] / class_features.shape[0]))])
                class_labels_duplicated = np.vstack([class_labels for j in range(int(sizes[i] / class_labels.shape[0]))])
                n_extra_needed = sizes[i] - class_labels_duplicated.shape[0]
                extra_features = resample(class_features, n_samples=n_extra_needed, random_state=7, replace=False)
                extra_labels = resample(class_labels, n_samples=n_extra_needed, random_state=7, replace=False)

                boot_array  = np.vstack([class_features_duplicated,extra_features])
                boot_labels = np.vstack([class_labels_duplicated,extra_labels])

            elif class_features.shape[0] > sizes[i]:  # need to undersample
                boot_array  = resample(class_features, n_samples=sizes[i], random_state=7, replace=False)
                boot_labels = resample(class_labels, n_samples=sizes[i], random_state=7, replace=False)

            elif class_features.shape[0] == sizes[i]:
                logging.debug('label ' + str(label) + ' already had exactly the requested number of samples, doing nothing!')
                boot_array  = class_features
                boot_labels = class_labels
            else:
                # Unreachable given the <, >, == checks above; fail loudly if it ever happens
                raise ValueError('Unexpected class size: {} vs requested {}'.format(class_features.shape[0], sizes[i]))
            resampled_features.append(boot_array)
            resampled_labels.append(boot_labels)
        # stack both up...
        resampled_labels = np.vstack(resampled_labels)
        resampled_features = np.vstack(resampled_features)

        logging.debug('Original label counts: '+str(pd.Series(labels[:,0]).value_counts()))
        logging.debug('Resampled label counts: '+str(pd.Series(resampled_labels[:,0]).value_counts()))

        return resampled_labels, resampled_features
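A hedged usage sketch for the method above; `clf` is a stand-in for whatever object the surrounding project defines this method on, and the sizes are illustrative:

import numpy as np

labels = np.array([0] * 900 + [1] * 100)
features = np.random.RandomState(0).rand(1000, 8)
# Request 500 baseline (class 0) chunks and 150 seizure (class 1) chunks
res_labels, res_features = clf.resample_training_dataset(labels, features, sizes=(500, 150))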
Example #5
def run_mnist(epochs, layers, neuron_count):
    """ Run Mnist dataset and output a guess list on the Kaggle test_set

    Parameters
    ----------
    epochs : int
        Number of iterations of the training loop over the whole dataset
    layers : int
        Number of layers (excluding the input layer but including the output
        layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), not counting
        the bias term

    Attributes
    ----------
    target_values : list
        The possible values for each training vector

    """

    with open('train.csv', 'r') as f:
        reader = csv.reader(f)
        t = list(reader)
        train = [[int(x) for x in y] for y in t[1:]]

    with open('test.csv', 'r') as f:
        reader = csv.reader(f)
        raw_nums = list(reader)
        test_set = [[int(x) for x in y] for y in raw_nums[1:]]

    ans_train = [x[0] for x in train]
    train_set = [x[1:] for x in train]
    ans_train.pop(0)
    train_set.pop(0)

    train_set = utils.resample(train_set, random_state=2)
    ans_train = utils.resample(ans_train, random_state=2)

    network = Network(layers, neuron_count, train_set[0])
    network.train(train_set, ans_train, epochs)

    # For validation purposes
    # guess_list = network.run_unseen(train_set[4000:4500])
    # network.report_results(guess_list, ans_train[4000:4500])
    # guess_list = network.run_unseen(train_set[4500:5000])
    # network.report_results(guess_list, ans_train[4500:5000])

    guess_list = network.run_unseen(test_set)
    with open('digits.txt', 'w') as d:
        for elem in guess_list:
            d.write(str(elem)+'\n')
Example #6
def test_resample_stratified():
    # Make sure resample can stratify
    rng = np.random.RandomState(0)
    n_samples = 100
    p = .9
    X = rng.normal(size=(n_samples, 1))
    y = rng.binomial(1, p, size=n_samples)

    _, y_not_stratified = resample(X, y, n_samples=10, random_state=0,
                                   stratify=None)
    assert np.all(y_not_stratified == 1)

    _, y_stratified = resample(X, y, n_samples=10, random_state=0, stratify=y)
    assert not np.all(y_stratified == 1)
    assert np.sum(y_stratified) == 9  # nine 1s, one 0
Example #7
    def eval_prox_random(self, n_sample_node=5, sample_nodes=[]):
        cs = self.cs
        measurements = {}
        nodes = cs.nodes()

        test_nodes = []
        if len(sample_nodes):
            if type(sample_nodes[0]) is str:
                test_nodes = sample_nodes
            elif type(sample_nodes[0]) is int:
                test_nodes = [nodes[i] for i in sample_nodes]
        else:
            test_nodes = resample(nodes, n_samples=n_sample_node)

        # NAE (normalized absolute error) of coordinate-based proximity vs ground-truth proximity
        coor_test = self.coor_all[test_nodes]

        ground_prox = (
            cs.proximity_to(sources=test_nodes, dests=cs.nodes()).as_matrix().transpose()
        )  # shape: test_nodes x all_nodes
        coor_prox = np.dot(coor_test.as_matrix().transpose(), self.coor_all.as_matrix())

        nae = pd.Series.combine(
            pd.Series(coor_prox.flatten()), pd.Series(ground_prox.flatten()), lambda c, g: abs(c - g) / g
        )
        nae_plot = pd.Series(np.linspace(0.0, 1.0, num=len(nae)), index=nae.order())
        measurements["nae"] = nae
        measurements["nae_plot"] = nae_plot

        return measurements
Example #8
def bootstrap_auc(df, col, pred_col, n_bootstrap=1000):
    """
    Calculate the bootstrapped AUC for a given col trying to predict a pred_col.

    Parameters
    ----------
    df : pandas.DataFrame
    col : str
        column to retrieve the values from
    pred_col : str
        the column we're trying to predict
    n_bootstrap : int
        the number of bootstrap samples

    Returns
    -------
    list : AUCs for each sampling
    """
    scores = np.zeros(n_bootstrap)
    old_len = len(df)
    df.dropna(subset=[col], inplace=True)
    new_len = len(df)
    if new_len < old_len:
        logger.info("Dropping NaN values in %s to go from %d to %d rows" % (col, old_len, new_len))
    preds = df[pred_col].astype(int)
    for i in range(n_bootstrap):
        sampled_counts, sampled_pred = resample(df[col], preds)
        if is_single_class(sampled_pred, col=pred_col):
            continue
        scores[i] = roc_auc_score(sampled_pred, sampled_counts)
    return scores
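A hedged usage sketch (toy DataFrame; it assumes the module-level `logger` and the `is_single_class` helper used above are available in scope):

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
df = pd.DataFrame({'score': rng.rand(200), 'label': rng.randint(0, 2, 200)})
aucs = bootstrap_auc(df, col='score', pred_col='label', n_bootstrap=100)
print(np.percentile(aucs, [2.5, 97.5]))  # rough bootstrap CI for the AUC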
Example #9
def boot_estimates(model, X, y, nboot):
    '''
    Evaluate coefficient estimates for nboot bootstrap samples
    '''
    coefs = [np.hstack([model.fit(iX, iy).intercept_, model.fit(iX, iy).coef_.ravel()]) 
            for iX, iy in (resample(X, y) for i in xrange(nboot))]  
    return np.vstack(coefs)
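A minimal calling sketch (Python 2, to match the xrange above), with scikit-learn's LinearRegression standing in for model:

import numpy as np
from sklearn.linear_model import LinearRegression

rng = np.random.RandomState(0)
X = rng.rand(50, 2)
y = X.dot(np.array([1.5, -2.0])) + 0.1 * rng.randn(50)
coefs = boot_estimates(LinearRegression(), X, y, nboot=100)
print coefs.shape  # (100, 3): intercept plus two slopes per bootstrap sample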
Example #10
File: utils.py Project: DSsoto/Sub8
def balanced_resample(data, labels):
    """Do a balanced resampling of data and labels, returning them
    See the test routine at the bottom for an example of behavior
    """
    most_common, num_required = mstats.mode(labels)
    possible_labels = np.unique(labels)

    data_resampled = []
    labels_resampled = []

    for possible_label in possible_labels:
        in_this_label = labels == possible_label

        data_buffered = np.array([])
        data_buffered = np.reshape(data_buffered, (0, data.shape[1]))
        labels_buffered = np.array([])

        while len(data_buffered) < num_required:
            data_buffered = np.vstack([data_buffered, data[in_this_label]])
            labels_buffered = np.hstack([labels_buffered, labels[in_this_label]])

        single_data_resampled, single_labels_resampled = utils.resample(
            data_buffered,
            labels_buffered,
            n_samples=int(num_required),
            replace=True
        )
        data_resampled.append(single_data_resampled)
        labels_resampled.append(single_labels_resampled)

    return np.vstack(data_resampled).astype(data.dtype), np.hstack(labels_resampled).astype(labels.dtype)
Example #11
def run_method_usage(methods, cases):
    methods = [m[0] for m in methods]
    # Bootstrap the percentage error bars:
    percents = []
    for i in range(10000):
        nc = resample(cases)
        percents.append(100 * np.sum(nc, axis=0) / len(nc))
    percents = np.array(percents)
    mean_percents = np.mean(percents, axis=0)
    std_percents = np.std(percents, axis=0) * 1.96  # 95% normal-approximation error bars
    inds = np.argsort(mean_percents).tolist()
    inds.reverse()
    avg_usage = np.mean(mean_percents)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    x=np.arange(len(methods))
    ax.plot(x,[avg_usage]*len(methods),'-',color='0.25',lw=1,alpha=0.2)
    ax.bar(x, mean_percents[inds], 0.6, color=paired[0],linewidth=0,
           yerr=std_percents[inds],ecolor=paired[1])
    #ax.set_title('Method Occurrence')
    ax.set_ylabel('Occurrence %',fontsize=30)
    ax.set_xlabel('Method',fontsize=30)
    ax.set_xticks(np.arange(len(methods)))
    ax.set_xticklabels(np.array(methods)[inds],fontsize=8)
    fig.autofmt_xdate()
    fix_axes()
    plt.tight_layout()
    fig.savefig(figure_path+'method_occurrence.pdf', bbox_inches=0)
    fig.show()
    return inds,mean_percents[inds]
Example #12
    def fit(self, dataSet):
        for clt in self.forest:
            randSet = resample(dataSet)
            # print "randSet size = %d" % len(randSet)
            target = [x[0] for x in randSet]
            train = [x[1:] for x in randSet]
            clt.fit(train, target)
Example #13
def downsample(y, sizes=[30000, 3000]):
    # classes = Counter(y)
    res = []
    for class_i, sz in enumerate(sizes):
        # Indices of the samples belonging to this class
        indices = [idx for idx, in_class in enumerate(y == class_i) if in_class]
        res.append(resample(indices, replace=True, n_samples=sz))
    return tuple(res)
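A quick sketch of calling it on toy labels (sizes chosen smaller than the defaults for illustration):

import numpy as np

y = np.array([0] * 500 + [1] * 50)
idx_class0, idx_class1 = downsample(y, sizes=[300, 30])
print(len(idx_class0), len(idx_class1))  # 300 30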
Example #14
def Reduce_scikit_kmeans(img, number_of_colors):
    from time import time
    t0 = time()
    from sklearn.cluster import KMeans
    img_64 = np.array(img, dtype=np.float64) / 255
    w, h, d = tuple(img_64.shape)
    assert d == 3
    image_array = np.reshape(img_64, (w * h, d))

    LOGGER.info("shape=%s", image_array.shape)
    from sklearn.utils import resample
    image_array_sample = resample(
        image_array,
        replace=True,
        n_samples=min([image_array.shape[0], 1000]),
        random_state=1
    )

    kmeans = KMeans(
        n_clusters=number_of_colors,
        random_state=1,
        precompute_distances=True).fit(image_array_sample)

    labels = kmeans.predict(image_array)
    LOGGER.info("ms=%s", ms(t0))

    return kmeans.cluster_centers_, labels
Example #15
    def fit(self, X, Y):
        num_examples = len(X)
        data_indices = np.arange(num_examples)
        self.data = X
        Y = np.array(Y, dtype=float)

        sample = resample(data_indices, replace=False, n_samples=min(20, num_examples), random_state=0)
        for i in sample:
            y = Y[i]
            self.S.add(i)
            self.y[i] = y
            self.alpha[i] = 0.0
            self.g[i] = y
       

        for epoch in xrange(5):
            min_delta = float('inf')
            for i in data_indices:
                self.process(i, Y[i])
                delta = self.reprocess()
                min_delta = min(min_delta, delta)
            if min_delta < self.tau: break

            data_indices = shuffle(data_indices)

        while True:
            delta = self.reprocess()
            if delta < self.tau: break
Example #16
    def test_mnist(self):
        mnist = fetch_mldata('MNIST original')
        X, Y = resample(mnist.data, mnist.target, replace=False, n_samples=1000, random_state=0)
        X = X.astype(float)
        Y = [1 if y == 0 else -1 for y in Y]

        svm = LASVM(C=10, tau=0.001)
        svm.fit(X, Y)

        X_test, Y_test = resample(mnist.data, mnist.target, replace=False, n_samples=300, random_state=2)
        X_test = X_test.astype(float)
        Y_test = [1 if y == 0 else -1 for y in Y_test]
        Y_predict = svm.predict(X_test)
        percent_correct = np.sum(Y_predict == Y_test) / 300.0

        self.assertGreater(percent_correct, 0.95)
Example #17
def show_bootstrap_statistics(clf, X, y, features):
    num_features = len(features)

    coefs = []
    for i in range(num_features):
        coefs.append([])

    for _ in range(BOOTSTRAP_ITERATIONS):
        X_sample, y_sample = resample(X, y)
        clf.fit(X_sample, y_sample)
        for i, c in enumerate(get_normalized_coefs(clf)):
            coefs[i].append(c)

    poi_index = features.index('POI')
    building_index = features.index('Building')
    coefs[building_index] = coefs[poi_index]

    intervals = []

    print()
    print('***** Bootstrap statistics *****')
    print('{:<20}{:<20}{:<10}{:<10}'.format('Feature', '95% interval', 't-value', 'Pr(>|t|)'))
    print()
    for i, cs in enumerate(coefs):
        values = np.array(cs)
        lo = np.percentile(values, 2.5)
        hi = np.percentile(values, 97.5)
        interval = '({:.3f}, {:.3f})'.format(lo, hi)
        tv = np.mean(values) / np.std(values)
        pr = (1.0 - t.cdf(x=abs(tv), df=len(values))) * 2.0  # two-tailed Pr(>|t|)

        stv = '{:.3f}'.format(tv)
        spr = '{:.3f}'.format(pr)
        print('{:<20}{:<20}{:<10}{:<10}'.format(features[i], interval, stv, spr))
Example #18
def test_resample_stratify_2dy():
    # Make sure y can be 2d when stratifying
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 1))
    y = rng.randint(0, 2, size=(n_samples, 2))
    X, y = resample(X, y, n_samples=50, random_state=rng, stratify=y)
    assert y.ndim == 2
Example #19
def make_pred_prob_plot_data(model, df, column):
    dfc = df.copy()
    rng = np.linspace(df[column].min(), df[column].max())
    probs = []
    for val in rng:
        dfc[column] = val
        pred_probs = model.predict_proba(dfc)[:, 1]
        probs.append([boot_sample.mean() for boot_sample in (resample(pred_probs) for _ in xrange(1000))])
    return rng, np.array(probs).T
Example #20
def bootstrap_auc(y_c, y_pred, N=100):
    """Bootstrap the AUC score."""
    scores = []
    for i in xrange(N):
        res_y = resample(np.column_stack([y_c, y_pred]))
        scores.append(roc_auc_score(res_y[:, 0], res_y[:, 1]))

    print 'Score is :', '%.4f' % np.mean(scores),
    print '+-', '%.4f' % np.std(scores)
Example #21
    def _balance(self, class0_k, class1_k):
        """Balances collection with their respective coefficients for classes
        Collection should be sorted with 1 labels go before 0s.
        """
        import numpy as np
        from sklearn.utils import resample
        class1_count = len([1 for x in self.labels if x])
        class0_count = len(self.labels) - class1_count
        class1_col = self.collection[:class1_count]
        class0_col = self.collection[class1_count:]

        num_class0 = int(class0_count*class0_k)
        num_class1 = int(class1_count*class1_k)

        class0_col = resample(class0_col, replace=False, n_samples=num_class0, random_state=1)
        class1_col = resample(class1_col, replace=False, n_samples=num_class1, random_state=1)
        col = np.concatenate([class1_col, class0_col])
        labels = np.concatenate((np.ones(num_class1), np.zeros(num_class0)))
        return col, labels
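A hedged standalone sketch of the same balancing idea with plain lists in place of the class state (counts are illustrative):

import numpy as np
from sklearn.utils import resample

collection = ['pos'] * 6 + ['neg'] * 10   # 1-labelled items first, as the docstring requires
class1_col = resample(collection[:6], replace=False, n_samples=3, random_state=1)
class0_col = resample(collection[6:], replace=False, n_samples=5, random_state=1)
col = np.concatenate([class1_col, class0_col])
labels = np.concatenate((np.ones(3), np.zeros(5)))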
Example #22
def resample_split(X, y, state):
    # Bootstrap sample of row indices forms the training set
    train_index = resample(range(0, len(X)), random_state=state)
    X_train = X[train_index]
    y_train = y[train_index]
    # The out-of-bag rows (never drawn) form the test set
    test_index = [i for i in range(len(X)) if i not in train_index]
    X_test = [X[i] for i in test_index]
    y_test = [y[i] for i in test_index]
    return X_train, y_train, X_test, y_test, test_index
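A quick sketch: resample draws a bootstrap sample of row indices, and the out-of-bag rows (roughly 37% on average) become the test set:

import numpy as np

X = np.arange(20).reshape(10, 2)
y = np.arange(10)
X_train, y_train, X_test, y_test, test_index = resample_split(X, y, state=0)
print(len(X_train), len(X_test))  # 10 bootstrap rows plus the out-of-bag remainder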
Example #23
def test_resample_stratify_sparse_error():
    # stratify must be an array-like, not a sparse matrix
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 2))
    y = rng.randint(0, 2, size=n_samples)
    stratify = sp.csr_matrix(y)
    with pytest.raises(TypeError, match='A sparse matrix was passed'):
        X, y = resample(X, y, n_samples=50, random_state=rng,
                        stratify=stratify)
Example #24
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('prediction', type=str)
    parser.add_argument('--test_listfile', type=str, default='../data/length-of-stay/test/listfile.csv')
    parser.add_argument('--n_iters', type=int, default=1000)
    parser.add_argument('--save_file', type=str, default='los_results.json')
    args = parser.parse_args()

    pred_df = pd.read_csv(args.prediction, index_col=False, dtype={'period_length': np.float32,
                                                                   'y_true': np.float32})
    test_df = pd.read_csv(args.test_listfile, index_col=False, dtype={'period_length': np.float32,
                                                                   'y_true': np.float32})

    df = test_df.merge(pred_df, left_on=['stay', 'period_length'], right_on=['stay', 'period_length'],
                       how='left', suffixes=['_l', '_r'])
    assert (df['prediction'].isnull().sum() == 0)
    assert (df['y_true_l'].equals(df['y_true_r']))

    metrics = [('Kappa', 'kappa'),
               ('MAD', 'mad'),
               ('MSE', 'mse'),
               ('MAPE', 'mape')]

    data = np.zeros((df.shape[0], 2))
    data[:, 0] = np.array(df['prediction'])
    data[:, 1] = np.array(df['y_true_l'])

    results = dict()
    results['n_iters'] = args.n_iters
    ret = print_metrics_regression(data[:, 1], data[:, 0], verbose=0)
    for (m, k) in metrics:
        results[m] = dict()
        results[m]['value'] = ret[k]
        results[m]['runs'] = []

    for i in range(args.n_iters):
        cur_data = sk_utils.resample(data, n_samples=len(data))
        ret = print_metrics_regression(cur_data[:, 1], cur_data[:, 0], verbose=0)
        for (m, k) in metrics:
            results[m]['runs'].append(ret[k])

    for (m, k) in metrics:
        runs = results[m]['runs']
        results[m]['mean'] = np.mean(runs)
        results[m]['median'] = np.median(runs)
        results[m]['std'] = np.std(runs)
        results[m]['2.5% percentile'] = np.percentile(runs, 2.5)
        results[m]['97.5% percentile'] = np.percentile(runs, 97.5)
        del results[m]['runs']

    print "Saving the results in {} ...".format(args.save_file)
    with open(args.save_file, 'w') as f:
        json.dump(results, f)

    print results
Example #25
def run_scikit_digits(epochs, layers, neuron_count):
    """ Run Handwritten Digits dataset from Scikit-Learn.  Learning set is split
    into 70% for training, 15% for testing, and 15% for validation.

    Parameters
    ----------
    epochs : int
        Number of iterations of the training loop over the whole dataset
    layers : int
        Number of layers (excluding the input layer but including the output
        layer)
    neuron_count : list
        The number of neurons in each of the layers (in order), not counting
        the bias term

    Attributes
    ----------
    target_values : list
        The possible values for each training vector

    """

    # Imported from linear_neuron
    temp_digits = datasets.load_digits()
    digits = utils.resample(temp_digits.data, random_state=3)
    temp_answers = utils.resample(temp_digits.target, random_state=3)
    # images = utils.resample(temp_digits.images, random_state=0)
    num_of_training_vectors = 1250
    answers, answers_to_test, validation_answers = (
        temp_answers[:num_of_training_vectors],
        temp_answers[num_of_training_vectors : num_of_training_vectors + 260],
        temp_answers[num_of_training_vectors + 260 :],
    )
    training_set, testing_set, validation_set = (
        digits[:num_of_training_vectors],
        digits[num_of_training_vectors : num_of_training_vectors + 260],
        digits[num_of_training_vectors + 260 :],
    )

    ###########
    # network.visualization(training_set[10], answers[10])
    # network.visualization(training_set[11], answers[11])
    # network.visualization(training_set[12], answers[12])

    network = Network(layers, neuron_count, training_set[0])
    network.train(training_set, answers, epochs)
    guess_list = network.run_unseen(testing_set)
    network.report_results(guess_list, answers_to_test)
    valid_list = network.run_unseen(validation_set)
    network.report_results(valid_list, validation_answers)
Example #26
def test_resample_stratified_replace():
    # Make sure stratified resampling supports the replace parameter
    rng = np.random.RandomState(0)
    n_samples = 100
    X = rng.normal(size=(n_samples, 1))
    y = rng.randint(0, 2, size=n_samples)

    X_replace, _ = resample(X, y, replace=True, n_samples=50,
                            random_state=rng, stratify=y)
    X_no_replace, _ = resample(X, y, replace=False, n_samples=50,
                               random_state=rng, stratify=y)
    assert np.unique(X_replace).shape[0] < 50
    assert np.unique(X_no_replace).shape[0] == 50

    # make sure n_samples can be greater than X.shape[0] if we sample with
    # replacement
    X_replace, _ = resample(X, y, replace=True, n_samples=1000,
                            random_state=rng, stratify=y)
    assert X_replace.shape[0] == 1000
    assert np.unique(X_replace).shape[0] == 100
Example #27
def bootstrap(arr, n_boots):
    '''
    variables:
        arr = the data that we are random sampling from
        n_boots = the number of bootstraps we want to make
    
    returns:
        list of lists containing the number of bootstrap
        samples we wanted to make
    '''
    return [resample(arr) for _ in xrange(n_boots)]
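Usage is a one-liner (Python 2, given the xrange):

boots = bootstrap([1, 2, 3, 4, 5], n_boots=3)  # three bootstrap resamples of the list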
Example #28
    def _contin_value_plot():
        # We need to set up an array of x_i values to search over
        if num_linspace:
            # Create an interval over +-1 std of the mean of the x column
            mean, std = df[column].mean(), df[column].std()
            lower = np.max([mean - (std), df[column].min()])
            upper = np.min([mean + (std), df[column].max()])
            x_i = np.linspace(lower, upper, num=num_linspace)
        else:
            # If num_linspace=None, make x_i the unique values
            x_i = np.unique(df[column])

        # For each value in our search space, set the entire column in question to that value and run model.predict or model.predict_proba
        # Average out those predictions and add it to a list of y_hats that we are keeping track of
        preds = []
        for val in x_i:
            print val
            dfc[column] = val
            if classification:
                class_ind = list(model.classes_).index(class_pred)
                pred = model.predict_proba(dfc)[:, class_ind]
            else:
                pred = model.predict(dfc)

            preds.append([boot_sample.mean() for boot_sample in (resample(pred) for _ in xrange(1000))])
        probs = np.array(preds)
        prob_means = probs.mean(axis=1)
        lower_bounds = np.percentile(probs, q=10, axis=1)
        upper_bounds = np.percentile(probs, q=90, axis=1)

        # Create the fill to indicate the confidence bounds
        ax1.fill_between(x_i, lower_bounds, upper_bounds, facecolor=cmap[0], alpha=0.25)
        # Plot the predictions
        ax1.plot(x_i, prob_means, c=cmap[1], linewidth=2)
        ax1.tick_params(axis="x", labelsize=14)
        ax1.tick_params(axis="y", labelsize=13)

        if freq:
            ax2 = ax1.twinx()
            if num_linspace:
                ax2.hist(
                    df.loc[(df[column] >= mean - std) & (df[column] <= mean + std), column].values,
                    facecolor=cmap[1],
                    alpha=0.4,
                )
            else:
                ax2.hist(df[column].values, facecolor=cmap[1], alpha=0.4)
            ax2.set_ylabel("Frequency")

        # Set xlims to mirror the min and max datapoint
        if xlim is None:
            ax1.set_xlim([x_i.min(), x_i.max()])
        else:
            ax1.set_xlim(xlim)
Example #29
    def _discrete_value_plot(scatter_num):
        # Create an array of the unique discrete bins
        labels = np.unique(dfc[column])

        # Create list for keeping track of predictions
        preds = []
        for label in labels:
            # Set all of that column to that particular label
            dfc[column] = label
            # Make predictions using inputed model
            if classification:
                pred = model.predict_proba(dfc)[:, 1]
            else:
                pred = model.predict(dfc)

            # Append array of means of bootstrapped predictions
            preds.append(
                np.array([boot_sample.mean() for boot_sample in (resample(pred) for _ in xrange(1000))]).reshape(-1, 1)
            )

        # Probably do this irrespective of discrete vs contin
        # fig, ax1 = plt.subplots(figsize=figsize)

        # Create the boxplots for each label and alter colors
        bp = plt.boxplot(preds, sym="", whis=[5, 95], labels=labels)  # , widths=0.35)
        plt.setp(bp["boxes"], color=cmap[0])
        plt.setp(bp["whiskers"], color=cmap[0])
        plt.setp(bp["caps"], color=cmap[0])

        # Fill the boxes with color
        for idx in xrange(len(labels)):
            box = bp["boxes"][idx]
            boxCoords = box.get_xydata()
            boxPolygon = Polygon(boxCoords, facecolor=cmap[0], alpha=0.7)
            ax1.add_patch(boxPolygon)

        # Set the xtick labels
        xtickNames = plt.setp(ax1, xticklabels=labels)
        plt.setp(xtickNames, rotation=-45, fontsize=14)

        # Superimpose jittered scatter plot if scatter is set to True
        if scatter:
            # If the number of points to plot was not set with scatter_num
            # Set the number to 200 * the number of discrete bins
            if not scatter_num:
                scatter_num = 200 * len(labels)
            # Make the number of points per bin proportional to that label's
            # representation in the original dataset
            num_per_label = [int((df[column] == label).mean() * scatter_num) for label in labels]
            for idx, num in enumerate(num_per_label):
                y_data = np.random.choice(preds[idx].flatten(), size=num)
                x_data = [bp["whiskers"][idx * 2].get_xdata()[0]] * num
                jittered_x = _rand_jitter(x_data, bp["boxes"][idx])
                ax1.scatter(jittered_x, y_data, c=cmap[1], alpha=0.6)
Example #30
def balanced_index(targets):
    class_index = {0: [], 1: []}
    for i, c in enumerate(targets):
        class_index[c].append(i)
    minor_class = 0 if len(class_index[0]) < len(class_index[1]) else 1
    balanced_class_index = resample(class_index[1 - minor_class],
                                    n_samples=len(class_index[minor_class]),
                                    replace=False,
                                    random_state=5)
    index_ = np.concatenate((class_index[minor_class], balanced_class_index))
    index_.sort()
    return index_
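A small sketch with binary targets (the function hard-codes classes 0 and 1):

targets = [0, 0, 0, 1, 1, 1, 1, 1]
idx = balanced_index(targets)
print(idx)  # the three 0-positions plus three randomly chosen 1-positions, sorted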
Example #31
import pandas as pd
from sklearn.utils import resample
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn import svm
# SFS below is assumed to be mlxtend's SequentialFeatureSelector
from mlxtend.feature_selection import SequentialFeatureSelector as SFS

test_day = ['2020-01-19', '2020-02-01', '2020-02-02', '2020-02-08']
# test_day = ['2020-02-09', '2020-02-15', '2020-02-16', '2020-02-22']
training_x, training_y = DataGenerator.get_data(test_day, is_training=True)

# Up-sample
training_df = pd.concat([training_x, training_y], axis=1)

minor_df = training_df[training_df.win == 1]
major_df = training_df[training_df.win == 0]

minor_df_upsample = resample(minor_df,
                             replace=True,
                             n_samples=len(major_df),
                             random_state=1)

new_training_df = pd.concat([major_df, minor_df_upsample], axis=0)

training_y = new_training_df['win']
training_x = new_training_df.drop(['win'], axis=1)

scaler = MinMaxScaler()
training_x = scaler.fit_transform(training_x, training_y)

model = svm.SVC(C=1, kernel='linear', random_state=0)

feature_selection = SFS(model,
                        forward=False,
                        cv=10,
Example #32
train = covtype.loc[0:15119, :]
test = covtype.loc[15120:, :]

# In[5]:

# Features - target
X_train = train.loc[:, 0:53]
y_train = train.loc[:, 54]

X_test = test.loc[:, 0:53]
y_test = test.loc[:, 54]

# In[3]:

# Train set sampling (2000 samples)
train_sample = resample(train, n_samples=2000, random_state=0)

X_train_sample = train_sample.loc[:, 0:53]
y_train_sample = train_sample.loc[:, 54]

# In[6]:

# Scale features data in [0,1] range
X_train_sample_minmax = minmax_scaler.fit_transform(X_train_sample)

X_train_minmax = minmax_scaler.fit_transform(X_train)

X_test_minmax = minmax_scaler.transform(X_test)  # reuse the scaler fitted on the training data

# In[7]:
Example #33
def resampleProcedure(data):
    data_r = resample(data)
    return data_r
Example #34
def roc_calculate(Ytrue, Yscore, bootnum=1000, metric=None, val=None):
    """Calculates required metrics for the roc plot function (fpr, tpr, and tpr_ci).

    Parameters
    ----------
    Ytrue : array-like, shape = [n_samples]
        Binary label for samples (0s and 1s)

    Yscore : array-like, shape = [n_samples]
        Predicted y score for samples

    bootnum : int (default 1000)
        Number of bootstrap resamples used for the confidence intervals

    metric, val : optional
        If metric is given, a cutoff is derived via get_spec_sens_cuttoff(Ytrue, Yscore, metric, val)
        and additional stats with bootstrap confidence intervals are returned

    Returns
    -------
    fpr : array-like, shape = [n_samples]
        False positive rates.

    tpr : array-like, shape = [n_samples]
        True positive rates.

    tpr_ci : array-like, shape = [n_samples, 2]
        True positive rates 95% confidence intervals [lowci, uppci].
    """

    # Get fpr, tpr
    fpr, tpr, threshold = metrics.roc_curve(Ytrue, Yscore, pos_label=1, drop_intermediate=False)

    # fpr, tpr with drop_intermediates for fpr = 0 (useful for plot... since we plot specificity on x-axis, we don't need intermediates when fpr=0)
    tpr0 = tpr[fpr == 0][-1]
    tpr = np.concatenate([[tpr0], tpr[fpr > 0]])
    fpr = np.concatenate([[0], fpr[fpr > 0]])

    # if metric is provided, calculate stats
    if metric is not None:
        specificity, sensitivity, threshold = get_spec_sens_cuttoff(Ytrue, Yscore, metric, val)
        stats = get_stats(Ytrue, Yscore, specificity)
        stats["val_specificity"] = specificity
        stats["val_sensitivity"] = specificity
        stats["val_cutoffscore"] = threshold

    # bootstrap using vertical averaging
    tpr_boot = []
    boot_stats = []
    for i in range(bootnum):
        # Resample and get tpr, fpr
        Ytrue_res, Yscore_res = resample(Ytrue, Yscore)
        fpr_res, tpr_res, threshold_res = metrics.roc_curve(Ytrue_res, Yscore_res, pos_label=1, drop_intermediate=False)

        # Drop intermediates when fpr=0
        tpr0_res = tpr_res[fpr_res == 0][-1]
        tpr_res = np.concatenate([[tpr0_res], tpr_res[fpr_res > 0]])
        fpr_res = np.concatenate([[0], fpr_res[fpr_res > 0]])

        # Vertical averaging... use closest fpr_res to fpr, and append the corresponding tpr
        idx = [np.abs(i - fpr_res).argmin() for i in fpr]
        tpr_list = tpr_res[idx]
        tpr_boot.append(tpr_list)

        # if metric is provided, calculate stats
        if metric is not None:
            stats_res = get_stats(Ytrue_res, Yscore_res, specificity)
            boot_stats.append(stats_res)

    # Get CI for bootstat
    if metric is not None:
        bootci_stats = {}
        for i in boot_stats[0].keys():
            stats_i = [k[i] for k in boot_stats]
            stats_i = np.array(stats_i)
            stats_i = stats_i[~np.isnan(stats_i)]  # Remove nans
            try:
                lowci = np.percentile(stats_i, 2.5)
                uppci = np.percentile(stats_i, 97.5)
            except IndexError:
                lowci = np.nan
                uppci = np.nan
            bootci_stats[i] = [lowci, uppci]

    # Get CI for tpr
    tpr_lowci = np.percentile(tpr_boot, 2.5, axis=0)
    tpr_uppci = np.percentile(tpr_boot, 97.5, axis=0)

    # Add the starting 0
    tpr = np.insert(tpr, 0, 0)
    fpr = np.insert(fpr, 0, 0)
    tpr_lowci = np.insert(tpr_lowci, 0, 0)
    tpr_uppci = np.insert(tpr_uppci, 0, 0)

    # Concatenate tpr_ci
    tpr_ci = np.array([tpr_lowci, tpr_uppci])

    if metric is None:
        return fpr, tpr, tpr_ci
    else:
        return fpr, tpr, tpr_ci, stats, bootci_stats
Example #35
    def __init__(self, config):
        """
        The constructor of the DataGenerator class. It loads the training
        labels and the images.

        Parameters
        ----------
            config: dict
                a dictionary with necessary information for the dataloader
                (e.g batch size)
        """
        cwd = os.getenv("DATA_PATH")
        if cwd is None:
            print("Set your DATA_PATH env first")
            sys.exit(1)
        self.config = config
        try:
            if self.config.augment:
                pass
        except AttributeError:
            self.config.augment = False

        # Read csv file
        tmp = pd.read_csv(os.path.abspath(os.path.join(cwd, 'train.csv')),
                          delimiter=',',
                          engine='python')
        # A vector of images id.
        image_ids = tmp["Id"]
        data_path = os.path.join(cwd, 'train')
        print(data_path)
        self.n = len(image_ids)

        # For each id sublist of the 4 filenames [batch_size, 4]
        self.filenames = np.asarray([[
            os.path.join(cwd, 'train', id + '_' + c + '.png')
            for c in ['red', 'green', 'yellow', 'blue']
        ] for id in image_ids])
        # Labels
        self.labels = tmp["Target"].values
        # To one-hot representation of labels:
        # before split, e.g. ['22 0', '12 23 0']
        # after split: [['22', '0'], ['12', '23', '0']]
        # after binarizing: one-hot rows
        binarizer = MultiLabelBinarizer(classes=np.arange(28))
        self.labels = [[int(c) for c in l.split(' ')] for l in self.labels]
        self.labels = binarizer.fit_transform(self.labels)

        # Build a validation set
        try:
            self.train_filenames, self.val_filenames,\
                self.train_labels, self.val_labels = train_test_split(
                    self.filenames, self.labels,
                    test_size=self.config.val_split,
                    random_state=42)
        except AttributeError:
            print('WARN: val_split not set - using 0.1')
            self.train_filenames, self.val_filenames,\
                self.train_labels, self.val_labels = train_test_split(
                    self.filenames, self.labels,
                    test_size=0.1, random_state=42)

        print("Shape of training data: {}".format(self.train_filenames.shape))
        print("Shape of training labels: {}".format(self.train_labels.shape))

        # Get list of all possible images (incl. augmented if exist)
        data_train_folder = os.path.join(cwd, 'train')

        # Augment training data if specified in config file (and if possible)
        if self.config.augment:
            print("Getting augmented dataset...")
            filter_list = ['yellow', 'red', 'blue', 'green']
            aug_train_list = []
            aug_train_labels = []

            for i in range(0, self.train_filenames.shape[0]):
                filename = self.train_filenames[i][0] \
                    .rsplit('/')[-1].rsplit('_')[0]
                print("Augmenting {}".format(filename))
                temp_rot = []
                temp_rev = []
                counter = 1
                while True:
                    test_f = os.path.join(
                        data_train_folder,
                        filename + '_rot{}'.format(counter) + '_' +
                        filter_list[0] + '.png')
                    if os.path.isfile(test_f) is False:
                        break
                    temp_rot = [
                        os.path.join(
                            data_train_folder, filename +
                            '_rot{}'.format(counter) + '_' + f + '.png')
                        for f in filter_list
                    ]
                    temp_rev = [
                        os.path.join(
                            data_train_folder, filename +
                            '_rev{}'.format(counter) + '_' + f + '.png')
                        for f in filter_list
                    ]
                    flag = True
                    if SKIP_CHECK is False:
                        try:
                            for fname in temp_rev:
                                with open(fname, 'rb') as f:
                                    # Check header of file
                                    flag = flag and (f.read(4) == b'\x89PNG')
                            for fname in temp_rot:
                                with open(fname, 'rb') as f:
                                    # Check header of file
                                    flag = flag and (f.read(4) == b'\x89PNG')
                        except IOError as e:
                            print(e)
                            flag = False
                    if flag is True:
                        aug_train_list.append(temp_rot)
                        aug_train_labels.append(self.train_labels[i])
                        aug_train_list.append(temp_rev)
                        aug_train_labels.append(self.train_labels[i])
                    else:
                        print("corrupted images found")
                        print(temp_rot)
                        print(temp_rev)

                    counter += 1

            try:
                # Append list of all aug filenames to training set
                self.train_filenames = np.vstack(
                    (self.train_filenames, np.asarray(aug_train_list)))
                self.train_labels = np.vstack(
                    (self.train_labels, np.asarray(aug_train_labels)))
                # Append list of all aug filenames to 'all' set
                self.filenames = np.vstack(
                    (self.filenames, np.asarray(aug_train_list)))
                self.labels = np.vstack(
                    (self.labels, np.asarray(aug_train_labels)))
            # aug_train_list is empty (no aug data available)
            except ValueError:
                print('No augmented data found. Please augment first')

        # New label frequency
        print("New label distribution: {}".format(
            self.train_labels.sum(axis=0)))

        self.n_train = len(self.train_labels)
        self.n_val = len(self.val_labels)
        self.n = len(self.labels)

        if hasattr(config, 'random_state'):
            random_state = config.random_state
        else:
            random_state = 42
        np.random.seed(random_state)
        if hasattr(config, 'bootstrap_size'):
            n_samples = int(config.bootstrap_size * self.n_train)
            new_indices = resample(np.arange(self.n_train),
                                   n_samples=n_samples,
                                   random_state=random_state)
            self.train_filenames = self.train_filenames[new_indices]
            self.train_labels = self.train_labels[new_indices]
            self.n_train = len(self.train_labels)

        print('Size of training set is {}'.format(self.n_train))
        print('Size of validation set is {}'.format(self.n_val))
        # Compute class weights
        self.class_weights = (self.n_train) * np.reshape(
            1 / np.sum(self.train_labels, axis=0), (1, -1))
        # Number batches per epoch
        self.train_batches_per_epoch = int(
            (self.n_train - 1) / self.config.batch_size) + 1
        self.val_batches_per_epoch = int(
            (self.n_val - 1) / self.config.batch_size) + 1
        self.all_batches_per_epoch = int(
            (self.n - 1) / self.config.batch_size) + 1
Example #36
tmp = pd.DataFrame()
df2 = pd.DataFrame()

few = ['spy.', 'perl.', 'phf.', 'multihop.', 'ftp_write.', 'loadmodule.', 'rootkit.', 'imap.', 'warezmaster.', 'land.', 'buffer_overflow.', 'guess_passwd.', 'pod.']
for fff in few:
    tmp = df.loc[df['41'] == fff]
    df2 = pd.concat([df2, tmp])
    df.drop(df[df['41'] == fff].index, inplace=True)


########### SMOTE smaller categories ###############
print("\t> Synthetically generating new smaller categories...")


td = df.loc[df['41'] == 'normal.']
td = resample(td, replace=False, n_samples=450, random_state=1)  # size of the 'normal.' sample paired with each small category for SMOTE

for i in range(42):
    df2.rename(columns={i: str(i)}, inplace=True)

few = ['multihop.', 'ftp_write.', 'loadmodule.', 'rootkit.', 'imap.', 'warezmaster.', 'land.', 'buffer_overflow.', 'guess_passwd.', 'pod.']
smotenc = SMOTENC([1, 2, 3, 6, 11, 20, 21], random_state=1)

for smaller in few:
    tt = df2.loc[df2['41'] == smaller]
    df_tmp = pd.concat([tt, td])

    X_tmp = df_tmp.iloc[:, :-1]
    Y_tmp = np.array(df_tmp.iloc[:, -1])
    Y_tmp = Y_tmp.reshape(len(Y_tmp), 1)
Example #37
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(len(X_train))
print(len(X_test))
# df = df.iloc[row_selected, :]
X_train = pd.DataFrame(X_train, columns=X.columns)
y_train = pd.DataFrame(y_train, columns=['TARGET'])
train = pd.concat((X_train, y_train), axis=1)

# Separate majority and minority classes
df_majority = train[train['TARGET'] == 0]
df_minority = train[train['TARGET'] == 1]
print(len(df_majority))
print(len(df_minority))
# Downsample majority class
df_majority_undersampled = resample(df_majority,
                                    replace=False,    # sample without replacement
                                    n_samples=len(df_minority),  # to match minority class
                                    random_state=123)  # reproducible results

# Combine minority class with downsampled majority class
train = pd.concat([df_majority_undersampled, df_minority])
X_train_new = train.drop(columns=['TARGET'])
y_train_new = train['TARGET']

# Display new class counts
# train['TARGET'].value_counts()
unique, counts = np.unique(y_train_new, return_counts=True)

y_pos = np.arange(len(unique))


plt.bar(y_pos, counts, align='center', alpha=0.5)
Example #38
                                                    test_size=0.2,
                                                    random_state=42)

# # PROCESSING TRAIN DATA(UPSAMPLING)

# In[15]:

X_train.y.value_counts()

# In[16]:

#upsampling of the dataset
from sklearn.utils import resample
major = X_train[X_train['y'] == -1]
minor = X_train[X_train['y'] == 1]
upsampled = resample(minor, replace=True, n_samples=1516, random_state=123)
newupsampled = pd.concat([major, upsampled])

# In[17]:

newupsampled.y.value_counts()

# In[18]:

ynew_train = newupsampled['y']
X = newupsampled.drop('y', axis=1)

# In[19]:

X.shape
Example #39
def outliers_removal(df, output_dir, log, detect_based_on_structure_features):
    """
    Remove outlier cells from the dataset based on Gaussian-KDE density
    estimates of cell/nuclear metrics and, optionally, structure-volume metrics.
    """

    # Load dataset
    cells = initial_parsing(df=df)

    # %% Threshold for determining outliers
    cell_dens_th_CN = 1e-20  # for cell-nucleus metrics across all cells
    cell_dens_th_S = 1e-10  # for structure volume metrics

    # Remove outliers

    # %% Remove cells that lack a Structure Volume value
    cells_ao = cells[["CellId", "structure_name"]].copy()
    cells_ao["Outlier"] = "No"
    CellIds_remove = cells.loc[cells["Structure volume"].isnull(),
                               "CellId"].values
    cells_ao.loc[cells_ao["CellId"].isin(CellIds_remove),
                 "Outlier"] = "yes_missing_structure_volume"
    cells = cells.drop(cells[cells["CellId"].isin(CellIds_remove)].index)
    cells.reset_index(drop=True)
    log.info(
        f"Removing {len(CellIds_remove)} cells that lack a Structure Volume measurement value"
    )
    log.info(f"Shape of remaining dataframe: {cells.shape}")

    # %% Feature set for cell and nuclear features
    cellnuc_metrics = [
        "Cell surface area",
        "Cell volume",
        "Cell height",
        "Nuclear surface area",
        "Nuclear volume",
        "Nucleus height",
        "Cytoplasmic volume",
    ]
    cellnuc_abbs = [
        "Cell area",
        "Cell vol",
        "Cell height",
        "Nuc area",
        "Nuc vol",
        "Nuc height",
        "Cyto vol",
    ]

    # %% All metrics including height
    L = len(cellnuc_metrics)
    pairs = np.zeros((int(L * (L - 1) / 2), 2)).astype(int)
    i = 0
    for f1 in np.arange(L):
        for f2 in np.arange(L):
            if f2 > f1:
                pairs[i, :] = [f1, f2]
                i += 1

    # %% The typical six scatter plots
    xvec = [1, 1, 6, 1, 4, 6]
    yvec = [4, 6, 4, 0, 3, 3]
    pairs2 = np.stack((xvec, yvec)).T

    # %% Just one
    xvec = [1]
    yvec = [4]

    # %% Parameters
    nbins = 100
    N = 10000
    fac = 1000
    Rounds = 5

    # %% For all pairs compute densities
    remove_cells = cells["CellId"].to_frame().copy()
    for i, xy_pair in enumerate(pairs):

        metricX = cellnuc_metrics[xy_pair[0]]
        metricY = cellnuc_metrics[xy_pair[1]]
        log.info(f"{metricX} vs {metricY}")

        # data
        x = cells[metricX].to_numpy() / fac
        y = cells[metricY].to_numpy() / fac

        # density estimate, repeat because of probabilistic nature of density estimate
        # used here
        for r in np.arange(Rounds):
            remove_cells[f"{metricX} vs {metricY}_{r}"] = np.nan
            log.info(f"Round {r + 1} of {Rounds}")
            rs = int(r)
            xS, yS = resample(x,
                              y,
                              replace=False,
                              n_samples=np.amin([N, len(x)]),
                              random_state=rs)
            k = gaussian_kde(np.vstack([xS, yS]))
            cell_dens = k(np.vstack([x.flatten(), y.flatten()]))
            cell_dens = cell_dens / np.sum(cell_dens)
            remove_cells.loc[remove_cells.index[np.arange(len(cell_dens))],
                             f"{metricX} vs {metricY}_{r}", ] = cell_dens

    # %% Summarize across repeats
    remove_cells_summary = cells["CellId"].to_frame().copy()
    for i, xy_pair in enumerate(pairs):
        metricX = cellnuc_metrics[xy_pair[0]]
        metricY = cellnuc_metrics[xy_pair[1]]
        log.info(f"{metricX} vs {metricY}")
        metricX = cellnuc_metrics[xy_pair[0]]
        metricY = cellnuc_metrics[xy_pair[1]]
        filter_col = [
            col for col in remove_cells
            if col.startswith(f"{metricX} vs {metricY}")
        ]
        x = remove_cells[filter_col].to_numpy()
        pos = np.argwhere(np.any(x < cell_dens_th_CN, axis=1))
        y = x[pos, :].squeeze()

        fig, axs = plt.subplots(1, 2, figsize=(16, 9))
        xr = np.log(x.flatten())
        xr = np.delete(xr, np.argwhere(np.isinf(xr)))
        axs[0].hist(xr, bins=100)
        axs[0].set_title("Histogram of cell probabilities (log scale)")
        axs[0].set_yscale("log")
        im = axs[1].imshow(np.log(y), aspect="auto")
        plt.colorbar(im)
        axs[1].set_title("Heatmap with low probability cells (log scale)")

        plot_save_path = f"{output_dir}/{metricX}_vs_{metricY}_cellswithlowprobs.png"
        plt.savefig(plot_save_path, format="png", dpi=150)
        plt.close("all")

        remove_cells_summary[f"{metricX} vs {metricY}"] = np.median(x, axis=1)

    # %% Identify cells to be removed
    CellIds_remove_dict = {}
    CellIds_remove = np.empty(0, dtype=int)
    for i, xy_pair in enumerate(pairs):
        metricX = cellnuc_metrics[xy_pair[0]]
        metricY = cellnuc_metrics[xy_pair[1]]
        CellIds_remove_dict[f"{metricX} vs {metricY}"] = np.argwhere(
            remove_cells_summary[f"{metricX} vs {metricY}"].to_numpy() <
            cell_dens_th_CN)
        CellIds_remove = np.union1d(
            CellIds_remove, CellIds_remove_dict[f"{metricX} vs {metricY}"])
        log.info(len(CellIds_remove))

    # %% Plot and remove outliers
    plotname = "CellNucleus"
    oplot(
        cellnuc_metrics,
        cellnuc_abbs,
        pairs2,
        cells,
        True,
        output_dir,
        f"{plotname}_6_org_fine",
        0.5,
        [],
    )
    oplot(
        cellnuc_metrics,
        cellnuc_abbs,
        pairs2,
        cells,
        True,
        output_dir,
        f"{plotname}_6_org_thick",
        2,
        [],
    )
    oplot(
        cellnuc_metrics,
        cellnuc_abbs,
        pairs2,
        cells,
        True,
        output_dir,
        f"{plotname}_6_outliers",
        2,
        CellIds_remove_dict,
    )
    oplot(
        cellnuc_metrics,
        cellnuc_abbs,
        pairs,
        cells,
        True,
        output_dir,
        f"{plotname}_21_org_fine",
        0.5,
        [],
    )
    oplot(
        cellnuc_metrics,
        cellnuc_abbs,
        pairs,
        cells,
        True,
        output_dir,
        f"{plotname}_21_org_thick",
        2,
        [],
    )
    oplot(
        cellnuc_metrics,
        cellnuc_abbs,
        pairs,
        cells,
        True,
        output_dir,
        f"{plotname}_21_outliers",
        2,
        CellIds_remove_dict,
    )
    log.info(cells.shape)
    CellIds_remove = (cells.loc[cells.index[CellIds_remove],
                                "CellId"].squeeze().to_numpy())
    cells_ao.loc[cells_ao["CellId"].isin(CellIds_remove),
                 "Outlier"] = "yes_abnormal_cell_or_nuclear_metric"
    cells = cells.drop(cells.index[cells["CellId"].isin(CellIds_remove)])
    log.info(
        f"Removing {len(CellIds_remove)} cells due to abnormal cell or nuclear metric"
    )
    log.info(cells.shape)
    oplot(
        cellnuc_metrics,
        cellnuc_abbs,
        pairs2,
        cells,
        True,
        output_dir,
        f"{plotname}_6_clean_thick",
        2,
        [],
    )
    oplot(
        cellnuc_metrics,
        cellnuc_abbs,
        pairs2,
        cells,
        True,
        output_dir,
        f"{plotname}_6_clean_fine",
        0.5,
        [],
    )
    oplot(
        cellnuc_metrics,
        cellnuc_abbs,
        pairs,
        cells,
        True,
        output_dir,
        f"{plotname}_21_clean_thick",
        2,
        [],
    )
    oplot(
        cellnuc_metrics,
        cellnuc_abbs,
        pairs,
        cells,
        True,
        output_dir,
        f"{plotname}_21_clean_fine",
        0.5,
        [],
    )

    # %% Feature sets for structures
    selected_metrics = [
        "Cell volume",
        "Cell surface area",
        "Nuclear volume",
        "Nuclear surface area",
    ]
    selected_metrics_abb = ["Cell Vol", "Cell Area", "Nuc Vol", "Nuc Area"]
    selected_structures = [
        "LMNB1",
        "ST6GAL1",
        "TOMM20",
        "SEC61B",
        "ATP2A2",
        "LAMP1",
        "RAB5A",
        "SLC25A17",
        "TUBA1B",
        "TJP1",
        "NUP153",
        "FBL",
        "NPM1",
        "SON",
    ]
    structure_metric = "Structure volume"

    # %% Parameters
    N = 1000
    fac = 1000
    Rounds = 5

    if detect_based_on_structure_features:

        # We may want to skip this part when running the test dataset
        # or any small dataset that does not have enough cells per
        # structure.

        # %% For all pairs compute densities
        remove_cells = cells["CellId"].to_frame().copy()
        for xm, metric in enumerate(selected_metrics):
            for ys, struct in enumerate(selected_structures):

                # data
                x = (cells.loc[cells["structure_name"] == struct,
                               [metric]].squeeze().to_numpy() / fac)
                y = (cells.loc[cells["structure_name"] == struct,
                               [structure_metric]].squeeze().to_numpy() / fac)

                # density estimate, repeat because of probabilistic nature of density
                # estimate used here
                for r in np.arange(Rounds):
                    if ys == 0:
                        remove_cells[
                            f"{metric} vs {structure_metric}_{r}"] = np.nan
                    rs = int(r)
                    xS, yS = resample(x,
                                      y,
                                      replace=False,
                                      n_samples=np.amin([N, len(x)]),
                                      random_state=rs)
                    k = gaussian_kde(np.vstack([xS, yS]))
                    cell_dens = k(np.vstack([x.flatten(), y.flatten()]))
                    cell_dens = cell_dens / np.sum(cell_dens)
                    remove_cells.loc[
                        cells["structure_name"] == struct,
                        f"{metric} vs {structure_metric}_{r}"] = cell_dens

    # remove_cells = pd.read_csv(data_root_extra / 'structures.csv')
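    # Note: the summary below assumes `remove_cells` exists; when
    # detect_based_on_structure_features is False it must be loaded instead,
    # e.g. from a previously saved CSV as in the commented-out line above.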

    # %% Summarize across repeats
    remove_cells_summary = cells["CellId"].to_frame().copy()
    for xm, metric in enumerate(selected_metrics):
        log.info(metric)

        filter_col = [
            col for col in remove_cells
            if col.startswith(f"{metric} vs {structure_metric}")
        ]
        x = remove_cells[filter_col].to_numpy()
        pos = np.argwhere(np.any(x < cell_dens_th_S, axis=1))
        y = x[pos, :].squeeze()

        fig, axs = plt.subplots(1, 2, figsize=(16, 9))
        xr = np.log(x.flatten())
        xr = np.delete(xr, np.argwhere(np.isinf(xr)))
        axs[0].hist(xr, bins=100)
        axs[0].set_title("Histogram of cell probabilities (log scale)")
        axs[0].set_yscale("log")
        im = axs[1].imshow(np.log(y), aspect="auto")
        plt.colorbar(im)
        axs[1].set_title("Heatmap with low probability cells (log scale)")

        plot_save_path = (
            f"{output_dir}/{metric}_vs_{structure_metric}_cellswithlowprobs.png"
        )
        plt.savefig(plot_save_path, format="png", dpi=150)
        plt.close(fig)  # avoid accumulating open figures across metrics

        remove_cells_summary[f"{metric} vs {structure_metric}"] = np.median(
            x, axis=1)

    # %% Identify cells to be removed
    CellIds_remove_dict = {}
    CellIds_remove = np.empty(0, dtype=int)
    for xm, metric in enumerate(selected_metrics):
        log.info(metric)
        CellIds_remove_dict[f"{metric} vs {structure_metric}"] = np.argwhere(
            remove_cells_summary[f"{metric} vs {structure_metric}"].to_numpy()
            < cell_dens_th_S)
        CellIds_remove = np.union1d(
            CellIds_remove,
            CellIds_remove_dict[f"{metric} vs {structure_metric}"])
        log.info(len(CellIds_remove))

    # %% Plot and remove outliers
    plotname = "Structures"
    splot(
        selected_metrics,
        selected_metrics_abb,
        selected_structures[0:7],
        structure_metric,
        cells,
        True,
        output_dir,
        f"{plotname}_1_org_fine",
        0.5,
        [],
    )
    splot(
        selected_metrics,
        selected_metrics_abb,
        selected_structures[7:14],
        structure_metric,
        cells,
        True,
        output_dir,
        f"{plotname}_2_org_fine",
        0.5,
        [],
    )
    splot(
        selected_metrics,
        selected_metrics_abb,
        selected_structures[0:7],
        structure_metric,
        cells,
        True,
        output_dir,
        f"{plotname}_1_org_thick",
        2,
        [],
    )
    splot(
        selected_metrics,
        selected_metrics_abb,
        selected_structures[7:14],
        structure_metric,
        cells,
        True,
        output_dir,
        f"{plotname}_2_org_thick",
        2,
        [],
    )
    splot(
        selected_metrics,
        selected_metrics_abb,
        selected_structures[0:7],
        structure_metric,
        cells,
        True,
        output_dir,
        f"{plotname}_1_outliers",
        2,
        CellIds_remove_dict,
    )
    splot(
        selected_metrics,
        selected_metrics_abb,
        selected_structures[7:14],
        structure_metric,
        cells,
        True,
        output_dir,
        f"{plotname}_2_outliers",
        2,
        CellIds_remove_dict,
    )
    log.info(cells.shape)
    CellIds_remove = (cells.loc[cells.index[CellIds_remove],
                                "CellId"].squeeze().to_numpy())
    cells_ao.loc[cells_ao["CellId"].isin(CellIds_remove),
                 "Outlier"] = "yes_abnormal_structure_volume_metrics"
    cells = cells.drop(cells.index[cells["CellId"].isin(CellIds_remove)])
    log.info(
        f"Removing {len(CellIds_remove)} cells due to structure volume metrics"
    )
    log.info(cells.shape)
    splot(
        selected_metrics,
        selected_metrics_abb,
        selected_structures[0:7],
        structure_metric,
        cells,
        True,
        output_dir,
        f"{plotname}_1_clean_fine",
        0.5,
        [],
    )
    splot(
        selected_metrics,
        selected_metrics_abb,
        selected_structures[7:14],
        structure_metric,
        cells,
        True,
        output_dir,
        f"{plotname}_2_clean_fine",
        0.5,
        [],
    )
    splot(
        selected_metrics,
        selected_metrics_abb,
        selected_structures[0:7],
        structure_metric,
        cells,
        True,
        output_dir,
        f"{plotname}_1_clean_thick",
        2,
        [],
    )
    splot(
        selected_metrics,
        selected_metrics_abb,
        selected_structures[7:14],
        structure_metric,
        cells,
        True,
        output_dir,
        f"{plotname}_2_clean_thick",
        2,
        [],
    )

    # %% Final diagnostic plot
    cells = initial_parsing(df=df)
    CellIds_remove_dict = {}

    for i, xy_pair in enumerate(pairs):
        metricX = cellnuc_metrics[xy_pair[0]]
        metricY = cellnuc_metrics[xy_pair[1]]
        CellIds_remove_dict[f"{metricX} vs {metricY}"] = np.argwhere(
            (cells_ao["Outlier"] == "yes_abnormal_cell_or_nuclear_metric"
             ).to_numpy())
    oplot(
        cellnuc_metrics,
        cellnuc_abbs,
        pairs2,
        cells,
        True,
        output_dir,
        "Check_cellnucleus",
        2,
        CellIds_remove_dict,
    )

    CellIds_remove_dict = {}
    for xm, metric in enumerate(selected_metrics):
        CellIds_remove_dict[f"{metric} vs {structure_metric}"] = np.argwhere(
            ((cells_ao["Outlier"] == "yes_abnormal_structure_volume_metrics")
             | (cells_ao["Outlier"]
                == "yes_abnormal_cell_or_nuclear_metric")).to_numpy())
    splot(
        selected_metrics,
        selected_metrics_abb,
        selected_structures[0:7],
        structure_metric,
        cells,
        True,
        output_dir,
        "Check_structures_1",
        2,
        CellIds_remove_dict,
    )
    splot(
        selected_metrics,
        selected_metrics_abb,
        selected_structures[7:14],
        structure_metric,
        cells,
        True,
        output_dir,
        "Check_structures_2",
        2,
        CellIds_remove_dict,
    )

    cells_ao = cells_ao.set_index("CellId", drop=True)

    return cells_ao
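
# A minimal, self-contained sketch (toy data; the threshold value is an
# assumption analogous to cell_dens_th_S above) of the density-threshold
# outlier flagging used in this pipeline: fit a 2D KDE on a subsample, score
# every point, normalize, and flag the lowest-density points.
import numpy as np
from scipy.stats import gaussian_kde
from sklearn.utils import resample

rng = np.random.default_rng(0)
x = rng.normal(size=500)
y = 2 * x + rng.normal(scale=0.5, size=500)
xS, yS = resample(x, y, replace=False, n_samples=200, random_state=0)
kde = gaussian_kde(np.vstack([xS, yS]))
dens = kde(np.vstack([x, y]))
dens = dens / np.sum(dens)
outliers = np.argwhere(dens < 1e-4)  # hypothetical threshold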
Example #40
0
m = X_.shape[0]
batch_size = 11
steps_per_epoch = m // batch_size

graph = topological_sort(feed_dict)
trainables = [W1, b1, W2, b2]

print("Total number of examples = {}".format(m))

# Step 4
for i in range(epochs):
    loss = 0
    for j in range(steps_per_epoch):
        # Step 1
        # Randomly sample a batch of examples
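        # sklearn's resample defaults to replace=True, so examples can repeat
        # within a batch and across batches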
        X_batch, y_batch = resample(X_, y_, n_samples=batch_size)

        # Reset value of X and y Inputs
        X.value = X_batch
        y.value = y_batch

        # Step 2
        forward_and_backward(graph)

        # Step 3
        sgd_update(trainables)

        loss += graph[-1].value

    print("Epoch: {}, Loss: {:.3f}".format(i+1, loss/steps_per_epoch))
Example #41
0
     print(f'% insufficient data for H{horizon} T{change}')
     continue  # not enough data
 if verbose:
     print(
         f'%%% Selecting features for H{horizon} T{change} with {n} data points'
     )
 for replica in range(replicas):  # run replicas only if splits can technically be made
     parts = []
     goal = n // present
     for label in [0, 1, 2]:  # resample to balance the classes
         if label in counters:
             matches = data[data.label == label]
             lm = len(matches)
             if lm >= MINIMUM:  # disregard the nearly-absent class instead of gross oversampling
                 part = resample(matches,
                                 replace=lm < goal,
                                 n_samples=goal)
                 parts.append(part)
     training, testing = train_test_split(pd.concat(parts),
                                          test_size=0.3)
     expected = [l for l in testing[labels]]  # a simple list
     trainData = training[features].to_numpy()
     start = time()
     preproc = FRFS()  # very slow, perform on a subset
     sample = resample(training, replace=False, n_samples=ss)
     selected = preproc.process(sample[features].to_numpy(),
                                np.reshape(sample[labels].to_numpy(), (ss, 1)))
     fstimes.append(1000 * (time() - start))  # ms
     for pos in range(len(selected)):  # update the usage counters
         if selected[pos]:
             i = features[pos]
Example #42
0
# print "ddd ",d_arr
# col4=[]
# for q in arr:
#     q1 = [' '.join(x) for x in ngrams(q, 1)]# q1:mang cac 1-grams
#     q2 = [' '.join(x) for x in ngrams(q, 2)]  # q2: mang cac phan tu 2-grams
#     print "q2 ",q2
#     q3 = [' '.join(x.replace(' ','_') for x in q2)]
#     print "q3  ",q3
#     y=q1+q3
#     z = " ".join(y)
#     print "yyyy ",z
#     col4.append(z)
# print "col4 ",col4
#
# a =['a b','c d']
# b = ['a b','c d','e f']

row  = np.array([0, 3, 1, 0])
# print row.shape
# X2 = np.array([[1., 0.], [2., 1.], [0., 0.]])
X = np.array([[1., 0.], [2., 1.], [0., 0.],[3., 5.], [0, 0]])
# print X.shape
y = np.array([0, 1, 2, 3, 4])
# y2 = np.array([0, 1, 2])
# X_sparse = coo_matrix(X)
# print X_sparse
X, y = resample(X, y, n_samples=7)
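# With the default replace=True, n_samples (7) may exceed len(X) (5): rows repeat.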

print(X)

print(y)
Example #43
0
        'threshold': pd.Series(threshold, index=i)
    })
    roc_t = roc.iloc[(roc.tf - 0).abs().argsort()[:1]]  # .ix was removed in pandas 1.0
    return list(roc_t['threshold'])


# Load dataset
print('===> loading dataset')
data = pd.read_csv('~/repos/dataset/HR.csv')
dataset = data.rename(columns={'left': 'class'})

# Unbalanced dataset
# Upsampling
minority = dataset[dataset['class'] == 1]
minority_upsampled = resample(minority,
                              replace=True,
                              n_samples=11428,
                              random_state=123)

# Downsampling
majority = dataset[dataset['class'] == 0]
majority_downsampled = resample(majority,
                                replace=False,
                                n_samples=3571,
                                random_state=123)

dataset = pd.concat([minority, majority_downsampled])

# Transform features
dataset = pd.get_dummies(dataset, columns=['sales', 'salary'])

# Selection features
Example #44
0
'''
data=lung_cancer[lung_cancer['YEAR_DX'].between(2004,2010)].drop(columns=drop_cols+['survival_classes'])
target=lung_cancer[lung_cancer['YEAR_DX'].between(2004,2010)]['survival_classes']
data=pd.get_dummies(data,prefix=catg_cols,columns=catg_cols,drop_first=False)

class_weights = dict(enumerate(class_weight.compute_class_weight('balanced',
                                             pd.np.unique(target),target)))
'''
data = lung_cancer[lung_cancer['YEAR_DX'].between(
    2004, 2010)].drop(columns=drop_cols)
low_survival = data[data['survival_classes'] == '<=6months']
mid_survival = data[data['survival_classes'] == '0.5-2yrs']
high_survival = data[data['survival_classes'] == '>2yrs']

mid_survival = resample(mid_survival,
                        replace=True,
                        n_samples=len(low_survival),
                        random_state=21)
high_survival = resample(high_survival,
                         replace=True,
                         n_samples=len(low_survival),
                         random_state=21)

data_upsampled = pd.concat([low_survival, mid_survival, high_survival], axis=0)

data = data_upsampled.drop(columns=['survival_classes'])
target = data_upsampled['survival_classes']
data = pd.get_dummies(data,
                      prefix=catg_cols,
                      columns=catg_cols,
                      drop_first=False)
Example #45
0
def Bootstrap(x1, x2, y, N_boot=500, method='ols', degrees=5, random_state=42):
    """
    Computes bias^2, variance and the mean squared error using the bootstrap
    resampling method for the provided data and regression method.
    
    Arguments:
    x1: 1D numpy array, covariate
    x2: 1D numpy array, covariate
    y: 1D numpy array, response values
    N_boot: integer type, the number of bootstrap samples
    method: string type, accepts 'ols', 'ridge' or 'lasso' as arguments
    degrees: integer type, polynomial degree for generating the design matrix
    random_state: integer, ensures the same split when using the train_test_split functionality
    
    Returns: Bias_vec, Var_vec, MSE_vec, betaVariance_vec
             numpy arrays. Bias, Variance, MSE and the variance of beta for the predicted model
    """
    ##split x1, x2 and y arrays as a train and test data and generate design matrix
    x1_train, x1_test, x2_train, x2_test, y_train, y_test = train_test_split(
        x1, x2, y, test_size=0.2, random_state=random_state)
    y_pred_test = np.zeros((y_test.shape[0], N_boot))
    X_test = designMatrix(x1_test, x2_test, degrees)

    betaMatrix = np.zeros((X_test.shape[1], N_boot))

    ##resample and fit the corresponding method on the train data
    for i in range(N_boot):
        x1_, x2_, y_ = resample(x1_train, x2_train, y_train)
        X_train = designMatrix(x1_, x2_, degrees)
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
        X_train[:, 0] = 1
        X_test = designMatrix(x1_test, x2_test, degrees)
        X_test = scaler.transform(X_test)
        X_test[:, 0] = 1

        if method == 'ols':
            manual_regression = linregOwn(method='ols')
            beta = manual_regression.fit(X_train, y_)
        if method == 'ridge':
            manual_regression = linregOwn(method='ridge')
            beta = manual_regression.fit(X_train, y_, lambda_=0.05)
        if method == 'lasso':
            manual_regression = linregOwn(method='lasso')
            beta = manual_regression.fit(X_train, y_, lambda_=0.05)

        ##predict on the same test data
        y_pred_test[:, i] = np.dot(X_test, beta)
        betaMatrix[:, i] = beta
    y_test = y_test.reshape(len(y_test), 1)

    Bias_vec = []
    Var_vec = []
    MSE_vec = []
    betaVariance_vec = []
    MSE = np.mean(np.mean((y_test - y_pred_test)**2, axis=1, keepdims=True))
    bias = np.mean((y_test - np.mean(y_pred_test, axis=1, keepdims=True))**2)
    variance = np.mean(np.var(y_pred_test, axis=1, keepdims=True))
    betaVariance = np.var(betaMatrix, axis=1)
    print("-------------------------------------------------------------")
    print("Degree: %d" % degrees)
    print('MSE:', np.round(MSE, 3))
    print('Bias^2:', np.round(bias, 3))
    print('Var:', np.round(variance, 3))
    print('{} >= {} + {} = {}'.format(MSE, bias, variance, bias + variance))
    print("-------------------------------------------------------------")

    Bias_vec.append(bias)
    Var_vec.append(variance)
    MSE_vec.append(MSE)
    betaVariance_vec.append(betaVariance)
    return Bias_vec, Var_vec, MSE_vec, betaVariance_vec
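
# Hypothetical usage sketch, assuming designMatrix and linregOwn from the
# surrounding project are importable and a noisy toy surface is acceptable:
import numpy as np
x1 = np.random.rand(200)
x2 = np.random.rand(200)
y = np.sin(np.pi * x1) * np.cos(np.pi * x2) + 0.1 * np.random.randn(200)
bias, var, mse, beta_var = Bootstrap(x1, x2, y, N_boot=100, method='ols')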
Example #46
0
np.exp(0.11018577)
predictions = logreg.predict(X_test_sc)
# Making the Confusion Matrix and Accuracy_score¶
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, predictions)
cm
cm = pd.DataFrame(cm, columns=['Predicted Negative','Predicted Positive'], index=['Actual Negative','Actual Positive'])
cm
accuracy_score(y_test, predictions)
df['class'].value_counts()
df_v1['class'].value_counts()
df_v1.shape
df_v1['class'].value_counts()
df_v1_maj = df_v1[ df_v1['class'] == 'ckd' ]
df_v1_min = df_v1[ df_v1['class'] == 'notckd' ]
df_upsample = resample(df_v1_maj, replace = True, n_samples = 4850, random_state = 42)
df_upsample = pd.concat([df_upsample, df_v1_min])
df_upsample['class'].value_counts()
X = df_upsample[v1_features]
y = df_upsample['class']
X_poly = poly.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, random_state = 42)
ss.fit(X_train)
X_train_sc = ss.transform(X_train)
X_test_sc = ss.transform(X_test)
logreg.fit(X_train_sc, y_train)
# Earlier score was 0.96666666666
logreg.score(X_train_sc, y_train)
# Earlier score was 0.96
logreg.score(X_test_sc, y_test)
predictions = logreg.predict(X_test_sc)
Example #47
0
len(df_no_missing)

# **29,932** samples is a relatively large number for a **Support Vector Machine**, so let's downsample. To make sure we get **1,000** of each category, we start by splitting the data into two **dataframes**, one for people that did not default and one for people that did.

# In[ ]:

df_no_default = df_no_missing[df_no_missing['DEFAULT'] == 0]
df_default = df_no_missing[df_no_missing['DEFAULT'] == 1]

# Now downsample the dataset that did not default...

# In[ ]:

df_no_default_downsampled = resample(df_no_default,
                                     replace=False,
                                     n_samples=1000,
                                     random_state=42)
len(df_no_default_downsampled)

# Now downsample the dataset that defaulted...

# In[ ]:

df_default_downsampled = resample(df_default,
                                  replace=False,
                                  n_samples=1000,
                                  random_state=42)
len(df_default_downsampled)

# Now let's merge the two downsampled datasets into a single **dataframe** and print out the total number of samples to make sure everything is hunky dory.
Example #48
0
""" A note on bootstrapping estimates:
To generate the bootstrap estimates we modify the 
sess_estimates.py script only slightly to include an additional loop
in the run_estimates(.) function, then gather these additional results.
"""

from sklearn.utils import resample

#[...]

for bootstrapi in range(num_bootstraps):
    X_index = range(X.shape[0])
    resamp = resample(X_index, random_state=9889)

    ycurr = y[resamp]
    Xcurr = X[resamp]

    ycurr.index = range(len(ycurr))

    skf = StratifiedKFold(y, n_folds=10, shuffle=True, random_state=1234)
    #[...]
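# (Hypothetical bridge; the elided code above is not shown in the source.)
# Each bootstrap iteration would append its estimate to a list, e.g.
# samples.append(estimate), with sample_mean = np.mean(samples).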
sample_variance = np.var(samples)

print("sample mean = {} and sample variance = {}".format(sample_mean, sample_variance))


# part (c)
from sklearn.utils import resample


N_resample = 1000       # number of bootstrap resamples
std = []
mean = []

# re-sample, compute corresponding mean & std, and store them
for i in range(N_resample):
    re_samples = resample(samples, n_samples=len(samples), replace=True)

    mean.append(np.mean(re_samples))
    std.append(np.std(re_samples))

# sort mean and std
mean = np.sort(mean)
std = np.sort(std)


# compute 95% confidence interval
a = 95
per_1_mean = np.percentile(mean, (100-a)/2, interpolation='nearest')
per_2_mean = np.percentile(mean, (100+a)/2, interpolation='nearest')
print('The ', str((100-a)/2), '% percentile mean is: ', str(per_1_mean))
print('The ', str((100+a)/2), '% percentile mean is: ', str(per_2_mean))
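
# Note: NumPy 1.22+ renames np.percentile's `interpolation` keyword to
# `method`; the old name still works but emits a deprecation warning.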
Example #50
0
def create_upsampled_test_data(dataset, labels, num_of_classes):
    len_dataset = len(dataset)

    freq_array = [0 for x in range(num_of_classes)]
    new_label = numpy.reshape(labels, (len_dataset, 1))

    for label in labels:
        freq_array[label - 1] += 1

    concat_data_label = numpy.concatenate((dataset, new_label), axis=1)
    max_no_of_class = max(freq_array)
    index_of_max_class = numpy.argmax(freq_array)
    len_concat_data_label = len(concat_data_label[0])

    # upsample
    for label in range(num_of_classes):
        new_array = numpy.zeros((freq_array[label], len_concat_data_label))
        index = 0
        for count in range(0, len_dataset):
            if label == labels[count] - 1:
                new_array[index] = concat_data_label[count]
                index += 1
        if label == index_of_max_class:
            upsampled = copy.deepcopy(new_array)
        else:
            upsampled = resample(copy.deepcopy(new_array),
                                 n_samples=max_no_of_class)

        if label == 0:
            resampled_arr = copy.deepcopy(upsampled)
        else:
            resampled_arr = numpy.concatenate((resampled_arr, upsampled),
                                              axis=0)

    resampled_arr = shuffle(resampled_arr)
    len_upsampled_col = len(resampled_arr[0])
    len_upsampled_row = len(resampled_arr)
    labels_upsampled = numpy.array(resampled_arr[:, [len_upsampled_col - 1]])
    resampled_arr = numpy.delete(resampled_arr, [len_upsampled_col - 1],
                                 axis=1)
    len_upsampled_col -= 1

    with open("training_set.csv", 'w') as file:
        for row in range(len_upsampled_row):
            for col in range(len_upsampled_col):
                file.write(str(resampled_arr[row][col]))
                if col != len_upsampled_col - 1:  # no comma after the last column
                    file.write(",")
            file.write("\n")

    with open("training_labels.csv", 'w') as file:
        for row in range(len_upsampled_row):
            file.write(str(labels_upsampled[row][0]))
            file.write("\n")

    labels_upsampled = numpy.reshape(labels_upsampled, len(labels_upsampled))
    labels_matrix_upsampled = create_label_matrix(labels_upsampled)

    return resampled_arr, labels_matrix_upsampled
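
# Hypothetical usage sketch, assuming create_label_matrix from the same module
# and the imports (numpy, copy, sklearn.utils resample/shuffle) used above:
import numpy
toy_data = numpy.random.rand(10, 4)
toy_labels = numpy.array([1, 1, 1, 1, 1, 1, 2, 2, 3, 3])
X_up, y_mat = create_upsampled_test_data(toy_data, toy_labels, num_of_classes=3)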
Example #51
0
def bootstrap_heat_capacity(frame_begin=0,
                            sample_spacing=1,
                            frame_end=-1,
                            plot_file='heat_capacity_boot.pdf',
                            output_data="output/output.nc",
                            num_intermediate_states=0,
                            frac_dT=0.05,
                            conf_percent='sigma',
                            n_trial_boot=200):
    """
    Calculate and plot the heat capacity curve, with uncertainty determined using bootstrapping.
    Uncorrelated datasets are selected using a random starting frame, repeated n_trial_boot 
    times. Uncertainty in melting point and full-width half maximum of the C_v curve are also returned.
    
    :param frame_begin: index of first frame defining the range of samples to use as a production period (default=0)
    :type frame_begin: int
    
    :param sample_spacing: spacing of uncorrelated data points, for example determined from pymbar timeseries subsampleCorrelatedData (default=1)
    :type sample_spacing: int
    
    :param frame_end: index of last frame to include in heat capacity calculation (default=-1)
    :type frame_end: int

    :param output_data: Path to the output data for a NetCDF-formatted file containing replica exchange simulation data (default = "output/output.nc")                                                                                          
    :type output_data: str    
    
    :param num_intermediate_states: The number of states to insert between existing states in 'temperature_list' (default=0)
    :type num_intermediate_states: int

    :param frac_dT: The fraction difference between temperature points used to calculate finite difference derivatives (default=0.05)
    :type frac_dT: float
    
    :param conf_percent: Confidence level in percent for outputting uncertainties (default = 'sigma', i.e. 68.27% = 1 sigma)
    :type conf_percent: float
    
    :param n_trial_boot: number of trials to run for generating bootstrapping uncertainties
    :type n_trial_boot: int
    
    :returns:
       - T_list ( List( float * unit.simtk.temperature ) ) - The temperature list corresponding to the heat capacity values in 'C_v'
       - C_v_values ( List( float * kJ/mol/K ) ) - The heat capacity values for all (including inserted intermediates) states
       - C_v_uncertainty ( Tuple ( np.array(float) * kJ/mol/K ) ) - confidence interval for all C_v_values computed from bootstrapping
       - Tm_value ( float * unit.simtk.temperature ) - Melting point mean value computed from bootstrapping
       - Tm_uncertainty ( Tuple ( float * unit.simtk.temperature ) ) - confidence interval for melting point computed from bootstrapping
       - FWHM_value ( float * unit.simtk.temperature ) - C_v full width half maximum mean value computed from bootstrapping
       - FWHM_uncertainty ( Tuple ( float * unit.simtk.temperature ) ) - confidence interval for C_v full width half maximum computed from bootstrapping
    
    """

    # extract reduced energies and the state indices from the .nc
    reporter = MultiStateReporter(output_data, open_mode="r")
    analyzer = ReplicaExchangeAnalyzer(reporter)
    (
        replica_energies_all,
        unsampled_state_energies,
        neighborhoods,
        replica_state_indices,
    ) = analyzer.read_energies()

    # Store data for each sampling trial:
    C_v_values_boot = {}
    C_v_uncertainty_boot = {}

    Tm_boot = np.zeros(n_trial_boot)
    Cv_height = np.zeros(n_trial_boot)
    FWHM = np.zeros(n_trial_boot)

    for i_boot in range(n_trial_boot):

        # Select production frames to analyze
        # Here we can potentially change the reference frame for each bootstrap trial.
        ref_shift = np.random.randint(sample_spacing)
        # ***We should check if these energies arrays will be the same size for
        # different reference frames
        replica_energies = replica_energies_all[:, :,
                                                (frame_begin +
                                                 ref_shift)::sample_spacing]

        # Get all possible sample indices
        sample_indices_all = np.arange(0, len(replica_energies[0, 0, :]))
        # n_samples should match the size of the sliced replica energy dataset
        sample_indices = resample(sample_indices_all,
                                  replace=True,
                                  n_samples=len(sample_indices_all))

        n_state = replica_energies.shape[0]

        replica_energies_resample = np.zeros_like(replica_energies)
        # replica_energies is [n_states x n_states x n_frame]

        # Select the sampled frames from array_folded_states and replica_energies:
        j = 0
        for i in sample_indices:
            replica_energies_resample[:, :, j] = replica_energies[:, :, i]
            j += 1

        # Run heat capacity expectation calculation:
        C_v_values_boot[i_boot], C_v_uncertainty_boot[
            i_boot], T_list = get_heat_capacity(
                output_data=output_data,
                num_intermediate_states=num_intermediate_states,
                frac_dT=frac_dT,
                plot_file=None,
                bootstrap_energies=replica_energies_resample,
            )

        if i_boot == 0:
            # Get units:
            C_v_unit = C_v_values_boot[0][0].unit
            T_unit = T_list[0].unit

        # Compute the melting point:
        max_index = np.argmax(C_v_values_boot[i_boot])
        Tm_boot[i_boot] = T_list[max_index].value_in_unit(T_unit)

        # Compute the peak height, relative to lowest C_v value in the temp range:
        Cv_height[i_boot] = (
            np.max(C_v_values_boot[i_boot]) -
            np.min(C_v_values_boot[i_boot])).value_in_unit(C_v_unit)

        # Compute the FWHM:
        # C_v value at half-maximum:
        mid_val = np.min(C_v_values_boot[i_boot]).value_in_unit(
            C_v_unit) + Cv_height[i_boot] / 2

        #***Note: this assumes that there is only a single heat capacity peak, with
        # monotonic behavior on each side of the peak.

        half_lo_found = False
        half_hi_found = False

        T_half_lo = None
        T_half_hi = None

        # Reverse scan for lower half:
        k = 1
        while not half_lo_found:
            index = max_index - k
            if index < 0:
                # The lower range does not contain the lower midpoint
                break
            else:
                curr_val = C_v_values_boot[i_boot][index].value_in_unit(
                    C_v_unit)
                prev_val = C_v_values_boot[i_boot][index +
                                                   1].value_in_unit(C_v_unit)

            if curr_val <= mid_val:
                # The lower midpoint lies within T[index] and T[index+1]
                # Interpolate solution:
                T_half_lo = T_list[index] + (mid_val - curr_val) * (
                    T_list[index + 1] - T_list[index]) / (prev_val - curr_val)
                half_lo_found = True
            else:
                k += 1

        # Forward scan for upper half:
        m = 1

        while not half_hi_found:
            index = max_index + m
            if index == len(T_list):
                # The upper range does not contain the upper midpoint
                break
            else:
                curr_val = C_v_values_boot[i_boot][index].value_in_unit(
                    C_v_unit)
                prev_val = C_v_values_boot[i_boot][index -
                                                   1].value_in_unit(C_v_unit)
            if curr_val <= mid_val:
                # The upper midpoint lies within T[index] and T[index-1]
                # Interpolate solution:
                T_half_hi = T_list[index] + (mid_val - curr_val) * (
                    T_list[index - 1] - T_list[index]) / (prev_val - curr_val)
                half_hi_found = True
            else:
                m += 1

        if half_lo_found and half_hi_found:
            FWHM[i_boot] = (T_half_hi - T_half_lo).value_in_unit(T_unit)
        elif half_lo_found and not half_hi_found:
            FWHM[i_boot] = 2 * (Tm_boot[i_boot] -
                                T_half_lo.value_in_unit(T_unit))
        elif half_hi_found and not half_lo_found:
            FWHM[i_boot] = 2 * (T_half_hi.value_in_unit(T_unit) -
                                Tm_boot[i_boot])

    # Compute uncertainty at all temps in T_list over the n_trial_boot trials performed:

    # Convert dicts to array
    arr_C_v_values_boot = np.zeros((n_trial_boot, len(T_list)))

    for i_boot in range(n_trial_boot):
        arr_C_v_values_boot[i_boot, :] = C_v_values_boot[i_boot].value_in_unit(
            C_v_unit)

    # Compute mean values:
    C_v_values = np.mean(arr_C_v_values_boot, axis=0) * C_v_unit
    Cv_height_value = np.mean(Cv_height) * C_v_unit
    Tm_value = np.mean(Tm_boot) * T_unit
    FWHM_value = np.mean(FWHM) * T_unit

    # Compute confidence intervals:
    if conf_percent == 'sigma':
        # Use analytical standard deviation instead of percentile method:

        # C_v values:
        C_v_std = np.std(arr_C_v_values_boot, axis=0)
        C_v_uncertainty = (-C_v_std * C_v_unit, C_v_std * C_v_unit)

        # C_v peak height:
        Cv_height_std = np.std(Cv_height)
        Cv_height_uncertainty = (-Cv_height_std * C_v_unit,
                                 Cv_height_std * C_v_unit)

        # Melting point:
        Tm_std = np.std(Tm_boot)
        Tm_uncertainty = (-Tm_std * T_unit, Tm_std * T_unit)

        # Full width half maximum:
        FWHM_std = np.std(FWHM)
        FWHM_uncertainty = (-FWHM_std * T_unit, FWHM_std * T_unit)

    else:
        # Compute specified confidence interval:
        p_lo = (100 - conf_percent) / 2
        p_hi = 100 - p_lo

        # C_v values:
        C_v_diff = arr_C_v_values_boot - np.mean(arr_C_v_values_boot, axis=0)
        C_v_conf_lo = np.percentile(C_v_diff,
                                    p_lo,
                                    axis=0,
                                    interpolation='linear')
        C_v_conf_hi = np.percentile(C_v_diff,
                                    p_hi,
                                    axis=0,
                                    interpolation='linear')

        C_v_uncertainty = (C_v_conf_lo * C_v_unit, C_v_conf_hi * C_v_unit)

        # C_v peak height:
        Cv_height_diff = Cv_height - np.mean(Cv_height)
        Cv_height_conf_lo = np.percentile(Cv_height_diff,
                                          p_lo,
                                          interpolation='linear')
        Cv_height_conf_hi = np.percentile(Cv_height_diff,
                                          p_hi,
                                          interpolation='linear')

        Cv_height_uncertainty = (Cv_height_conf_lo * C_v_unit,
                                 Cv_height_conf_hi * C_v_unit)

        # Melting point:
        Tm_diff = Tm_boot - np.mean(Tm_boot)
        Tm_conf_lo = np.percentile(Tm_diff, p_lo, interpolation='linear')
        Tm_conf_hi = np.percentile(Tm_diff, p_hi, interpolation='linear')

        Tm_uncertainty = (Tm_conf_lo * T_unit, Tm_conf_hi * T_unit)

        # Full width half maximum:
        FWHM_diff = FWHM - np.mean(FWHM)
        FWHM_conf_lo = np.percentile(FWHM_diff, p_lo, interpolation='linear')
        FWHM_conf_hi = np.percentile(FWHM_diff, p_hi, interpolation='linear')

        FWHM_uncertainty = (FWHM_conf_lo * T_unit, FWHM_conf_hi * T_unit)

    # Plot and return the heat capacity (with units)
    if plot_file is not None:
        plot_heat_capacity(C_v_values,
                           C_v_uncertainty,
                           T_list,
                           file_name=plot_file)

    return T_list, C_v_values, C_v_uncertainty, Tm_value, Tm_uncertainty, Cv_height_value, Cv_height_uncertainty, FWHM_value, FWHM_uncertainty
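
# Hypothetical usage sketch, assuming a replica-exchange NetCDF file exists at
# the default path "output/output.nc":
(T_list, C_v_values, C_v_uncertainty, Tm_value, Tm_uncertainty,
 Cv_height_value, Cv_height_uncertainty, FWHM_value,
 FWHM_uncertainty) = bootstrap_heat_capacity(frame_begin=100,
                                             sample_spacing=5,
                                             n_trial_boot=50)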
Example #52
0
def train_and_evaluate(
    model: BertForSequenceClassification,
    tokenizer: BertTokenizer,
    condition_type: str,
    sampling_bin: int,
    n: int,
    metrics_output_path: str,
):
    """Train and evaluate the model on N conditions.
    @param model is the model to encode CLS tokens with.
    @param tokenizer is a BERT tokenizer.
    @param condition_type are we using the icd/medcat extracted conditions?
    @param sampling_bin is the frequency bin to sample from.
    @param n is the number of conditions to sample from the bin.
    @return all AUCs and precision @ K scores.
    """
    ### Get Relevant Data

    subject_id_to_patient_info = get_subject_id_to_patient_info(
        condition_type=condition_type)
    condition_code_to_count = get_condition_code_to_count(
        condition_type=condition_type)
    condition_code_to_description = get_condition_code_to_descriptions(
        condition_type=condition_type)

    set_to_use = filter_condition_code_by_count(condition_code_to_count,
                                                min_count=0,
                                                max_count=500000)

    binned_conditions = get_frequency_bins(condition_code_to_count,
                                           condition_type)

    subject_ids = sorted(list(subject_id_to_patient_info.keys()))
    train_subject_ids, test_subject_ids = train_test_split(subject_ids,
                                                           train_size=0.5,
                                                           random_state=2021,
                                                           shuffle=True)

    ### Filter conditions in each bin so we have at least one positive training example
    ### and one positive test example.
    ### Otherwise, we can't train an LR model or calculate roc_auc_score.

    train_set_conditions = get_non_zero_count_conditions(
        set_to_use, train_subject_ids, subject_id_to_patient_info)
    test_set_conditions = get_non_zero_count_conditions(
        set_to_use, test_subject_ids, subject_id_to_patient_info)
    binned_conditions = [
        set(bin_) & train_set_conditions & test_set_conditions
        for bin_ in binned_conditions
    ]
    binned_conditions = [sorted(list(bin_)) for bin_ in binned_conditions]

    ### Sample condition in selected bin

    condition_bin = binned_conditions[sampling_bin]
    np.random.seed(2021)
    sampled_conditions = np.random.choice(condition_bin, size=n, replace=False)

    ## Train a Classifier for Each Condition

    auc_score_list, precision_at_10_list = [], []
    for condition in tqdm(sampled_conditions):

        desc = condition_code_to_description[condition]
        train_templates = []
        train_labels = []
        for subject_id in train_subject_ids:
            patient_info = subject_id_to_patient_info[subject_id]
            template = generate_name_condition_template(
                patient_info.FIRST_NAME, patient_info.LAST_NAME,
                patient_info.GENDER, desc)
            label = condition in patient_info.CONDITIONS

            train_templates.append(template)
            train_labels.append(label)

        ## Resample to Upsample positive examples

        negative_indices = [i for i, x in enumerate(train_labels) if x == 0]
        positive_indices = [i for i, x in enumerate(train_labels) if x == 1]

        positive_indices = resample(positive_indices,
                                    replace=True,
                                    n_samples=len(negative_indices),
                                    random_state=2021)
        total_indices = negative_indices + positive_indices

        ### Divide Train Set into Train and Validation Set

        training_indices, validation_indices = train_test_split(
            total_indices, train_size=0.85, random_state=2021, shuffle=True)

        # Note: this does not guarantee that the validation templates contain a
        # positive label, or, if there is only one, that it stays out of the validation set.
        validation_templates = [train_templates[i] for i in validation_indices]
        validation_labels = [train_labels[i] for i in validation_indices]

        np.random.seed(2021)
        np.random.shuffle(training_indices)
        train_templates = [train_templates[i] for i in training_indices]
        train_labels = [train_labels[i] for i in training_indices]

        ### Train the BERT Model

        train_dataset = get_as_dataset(tokenizer, train_templates,
                                       train_labels)
        validation_dataset = get_as_dataset(tokenizer, validation_templates,
                                            validation_labels)

        clf = train_model(model, train_dataset, validation_dataset)

        ### Get Test Templates

        test_templates = []
        test_labels = []
        for subject_id in test_subject_ids:
            patient_info = subject_id_to_patient_info[subject_id]
            template = generate_name_condition_template(
                patient_info.FIRST_NAME, patient_info.LAST_NAME,
                patient_info.GENDER, desc)
            label = condition in patient_info.CONDITIONS

            test_templates.append(template)
            test_labels.append(label)

        ### Get Test Predictions
        test_dataset = get_as_dataset(tokenizer, test_templates, test_labels)
        test_predictions = clf.predict(test_dataset)

        test_predictions = test_predictions.predictions[:, 1]

        ### Calculate Metrics

        auc_score = roc_auc_score(test_labels, test_predictions)
        precision_at_10 = precision_at_k(test_labels, test_predictions, k=10)

        auc_score_list.append(auc_score)
        precision_at_10_list.append(precision_at_10)

    from experiments.MLM.common import mean_std_as_string

    with open(f"{metrics_output_path}/results.txt", "w") as f:
        f.write(mean_std_as_string("Model AUC", auc_score_list))
        f.write(mean_std_as_string("Model P@K", precision_at_10_list))
Example #53
0
        inplace=True)

# Checking for missing values
print(df.isnull().values.any())

# Checking for imbalances in cases for each class
print(df.team_placement.value_counts())

# Resampling imbalanced data
df_majority = df[df.team_placement == -10]
df_minority = df[df.team_placement != -10]

# Downsample majority class
df_majority_downsampled = resample(
    df_majority,
    replace=False,  # sample without replacement
    n_samples=135167,  # to match minority class
    random_state=42)  # reproducible results

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Randomly marking 70% rows for training
df_downsampled['is_train'] = np.random.uniform(0, 1,
                                               len(df_downsampled)) <= .70

# Setting team_placement as categorical
change = {"team_placement": {-1: "!top 10", 1: "top 10"}}
df_downsampled.replace(change, inplace=True)
df_downsampled["team_placement"] = df_downsampled["team_placement"].astype(
    'category')
    # splitting up testing and training sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)
    # concatenate our training data back together
    upsample_input = pd.concat([X_train, y_train], axis=1)

    # separate minority and majority classes
    Worst_qual = upsample_input[upsample_input.Takeover_Quality == 1]
    Bad_qual = upsample_input[upsample_input.Takeover_Quality == 2]
    Good_qual = upsample_input[upsample_input.Takeover_Quality == 3]

    #-----------------------------------------------------
    # Upsample the minority classes to match the majority
    Worst_qual_upsampled = resample(
        Worst_qual,
        replace=True,  # sample with replacement
        n_samples=len(Good_qual),  # match number in majority class
        random_state=27)  # reproducible results

    Bad_qual_upsampled = resample(
        Bad_qual,
        replace=True,  # sample with replacement
        n_samples=len(Good_qual),  # match number in majority class
        random_state=27)  # reproducible results

    # combine the majority class with the upsampled minority classes
    upsampled = pd.concat(
        [Bad_qual_upsampled, Good_qual, Worst_qual_upsampled])

    # check new class counts
    print(upsampled.Takeover_Quality.value_counts())  # 63079
Example #55
0
x_treino, x_teste, y_treino, y_teste = train_test_split(x,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=378)

# concatenate our training data back together
X = pd.concat([x_treino, y_treino], axis=1)

# separate minority and majority classes
not_ordem = X[X['LocalMax'] == 0].copy()
ordem = X[X['LocalMax'] == 1].copy()

# upsample minority
ordem_upsampled = resample(
    ordem,
    replace=True,  # sample with replacement
    n_samples=len(not_ordem),  # match number in majority class
    random_state=378)  # reproducible results

# combine majority and upsampled minority
upsampled = pd.concat([not_ordem, ordem_upsampled])

x_treino = upsampled[[c for c in df_cluster_dia.columns if c not in cols_rem]]
y_treino = upsampled['LocalMax']

display(y_treino.value_counts())

#xgb.fit(x_treino, y_treino, eval_set = [(x_treino, y_treino), (x_teste, y_teste)], eval_metric=f1_score)
param = {
    'max_depth': 10,
    'eta': 2,
    stratify=y_train_temp_less)  # training split = 80%, validation split = 10%

# Take minority data samples from dataframe to array
neutral_array = df_neutral.to_numpy()

# Shuffle the data samples of minority class
np.random.shuffle(neutral_array)

# Split minority class Neutral in 80:10:10 ratio.
train_neutral = neutral_array[0:869, :]
val_neutral = neutral_array[869:978, :]
test_neutral = neutral_array[978:1087, :]

# Resample Neutral data to match majority class samples.
train_neutral_resampled = resample(train_neutral,
                                   n_samples=1017,
                                   replace=True,
                                   random_state=0)
val_neutral_resampled = resample(val_neutral,
                                 n_samples=127,
                                 replace=True,
                                 random_state=0)
test_neutral_resampled = resample(test_neutral,
                                  n_samples=127,
                                  replace=True,
                                  random_state=0)

# Separate features and target labels for Neutral data.
X_train_neutral = train_neutral_resampled[:, 0:62]
X_val_neutral = val_neutral_resampled[:, 0:62]
X_test_neutral = test_neutral_resampled[:, 0:62]
y_train_neutral = train_neutral_resampled[:, 62]
Example #57
0
    plt.ylabel('True Positive Rate')
    plt.title(heading)
    plt.legend(loc="lower right")
    plt.show()


#Main
#Read the Data
youTubeTrendingData = pd.read_csv("TrendingVideos.csv",
                                  encoding="UTF-8",
                                  index_col='video_id')
youTubeNonTrendingData = pd.read_csv("NonTrendingVideos.csv",
                                     encoding="UTF-8",
                                     index_col='V_id')
youTubeTrendingData = resample(
    youTubeTrendingData, replace=False, n_samples=len(youTubeNonTrendingData)
)  #Resampling of Data for balancing Class Labels
#Pre-processing the Data
youTubeData, youTubeTrendingData, youTubeNonTrendingData = preProcessTheData(
    youTubeTrendingData, youTubeNonTrendingData)
#Processing the combined Trending and Non Trending Data
youTubeData = processTheData(youTubeData, youTubeTrendingData)
#Drop unused features
youTubeDataForFeatureSelection = youTubeData.drop([
    'category_id', 'description', 'obtained_date', 'publish_time',
    'thumbnail_link'
],
                                                  axis=1)
#Divide the Data into features and class labels
X = youTubeDataForFeatureSelection.iloc[:, [0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 11,
                                            12, 13, 14, 15, 16, 17, 18, 19, 20,
                  swa,
              ],
              class_weight=weight)

    ypred = model.predict(X_test)
    ypred = np.argmax(ypred, axis=1)
    test_acc = balanced_accuracy_score(y_test, ypred)
    return model, test_acc


scores, members = list(), list()
used = list()
for co in range(20):  #20 splits
    # select indexes
    ix = [i for i in range(len(X))]
    train_ix = resample(ix, replace=True,
                        n_samples=1000)  #generate a new set with 1000 samples
    test_ix = [x for x in ix if x not in train_ix]
    print('Model {} of {}'.format(co + 1, 20))
    print('Unique training data: {}, testing data: {}'.format(
        X.shape[0] - len(test_ix), len(test_ix)))
    # select data
    X_train, y_train = X[train_ix], y[train_ix]
    X_test, y_test = X[test_ix], y[test_ix]
    # train each model
    model, test_acc = trainModel(X_train, y_train, X_test, y_test)
    print('Test accuracy: {:3.3f}'.format(test_acc))

    scores.append(test_acc)
    members.append(model)  #this list will hold all trained models
    used += train_ix
Example #59
0
def split_train_test(pos_data, neg_data, NEG_SIZE_TRAIN=NEG_SIZE_TRAIN):
    """ Split the data into a test set and a validation set
    """
    m_total_pos = pos_data.shape[0]
    m_total_neg = neg_data.shape[0]

    pos_data_train, pos_data_test = cross_validation.train_test_split(
        pos_data, test_size=0.20, random_state=random_state)
    del pos_data  # don't have access to it anymore, agh!

    m_neg_train = pos_data_train.shape[0] * NEG_SIZE_TRAIN
    m_neg_test = pos_data_test.shape[0] * NEG_SIZE_TEST
    assert neg_data.shape[0] >= (m_neg_train + m_neg_test)

    # Split the negative data into training and validation sets
    neg_data = np.array(neg_data)
    neg_data = utils.shuffle(neg_data, random_state=random_state)

    neg_data_train = neg_data[m_neg_test:]
    neg_data_test = neg_data[:m_neg_test]
    del neg_data  # don't have access to it anymore, agh!

    o_neg_train = neg_data_train.shape[0] // m_neg_train  # integer division; used as a count below
    assert neg_data_train.shape[0] >= o_neg_train * m_neg_train

    # Cut the negative training examples to be an exact multiple of the positive training examples
    if SPLIT_DATA_BY == 'cut' or SPLIT_DATA_BY == 'reshape':
        neg_data_train = neg_data_train[:o_neg_train * m_neg_train]

    # Split negative examples (for the training set) into o_neg_train-sized batches
    if SPLIT_DATA_BY == 'reshape' and o_neg_train >= 2:
        neg_data_train_new = np.empty(
            (m_neg_train, neg_data_train.shape[1], o_neg_train), dtype=float)
        for o_idx in range(o_neg_train):
            neg_data_train_new[:, :,
                               o_idx] = neg_data_train[m_neg_train *
                                                       o_idx:m_neg_train *
                                                       (o_idx + 1), :]
        neg_data_train = neg_data_train_new
        assert neg_data_train.shape[0] == m_neg_train

    # Sample negative data to generate different training examples
    if SPLIT_DATA_BY == 'resample':
        o_neg_train = O_NEG_RESAMPLED
        neg_data_train_new = np.empty(
            (m_neg_train, neg_data_train.shape[1], o_neg_train), dtype=float)
        for o_idx in range(o_neg_train):
            neg_data_train_new[:, :, o_idx] = utils.resample(
                neg_data_train,
                replace=True,
                n_samples=m_neg_train,
                random_state=random_state)
        neg_data_train = neg_data_train_new
        assert neg_data_train.shape[0] == m_neg_train

    print("Number of negative training datasets: %i" % o_neg_train)

    if SPLIT_DATA_BY == 'cut1':
        neg_data_train = neg_data_train[:m_neg_train]

    print("m_pos_train: %i, m_neg_train: %i, m_pos_total: %i, m_neg_total: %i" % \
    (pos_data_train.shape[0], neg_data_train.shape[0], m_total_pos, m_total_neg))

    print(pos_data_train.shape, pos_data_test.shape, neg_data_train.shape,
          neg_data_test.shape)

    return pos_data_train, pos_data_test, neg_data_train, neg_data_test
print("Test Set:"% test.columns,test.shape,len(test))

def clean_text(df,text_field):
    df[text_field] = df[text_field].str.lower()
    df[text_field] = df[text_field].apply(lambda elem: re.sub(
        r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)|^rt|http.+?", "", elem))
    return df

test_clean = clean_text(test,"tweet")
train_clean = clean_text(train,"tweet")

# Upsampling: we repeatedly take samples with replacement from the minority
# class until it is the same size as the majority class
train_majority = train_clean[train_clean.label==0]
train_minority = train_clean[train_clean.label==1]

train_minority_upsampled = resample(train_minority,replace=True,n_samples=len(train_majority),random_state=123)

train_upsampled = pd.concat([train_minority_upsampled,train_majority])
train_upsampled['label'].value_counts()


train_majority = train_clean[train_clean.label==0]
train_minority = train_clean[train_clean.label==1]

train_majority_downsampled = resample(train_majority,replace=False,n_samples=len(train_minority),random_state=123)

train_downsampled = pd.concat([train_majority_downsampled,train_minority])
train_downsampled['label'].value_counts()


X_train,X_test,y_train,y_test =train_test_split(train_upsampled['tweet'],train_upsampled['label'],random_state = 0)