def _get_node_distance_matrix(self, datapoint, som_array):
    """Get distance between datapoint and each SOM node using the selected metric.

    Parameters
    ----------
    datapoint : np.array, shape=(X.shape[1])
        Datapoint = one row of the dataset `X`
    som_array : np.array
        Weight vectors of the SOM,
        shape = (self.n_rows, self.n_columns, X.shape[1])

    Returns
    -------
    distmat : np.array of float
        Distance between datapoint and each SOM node

    """
    # algorithms on the full matrix
    if self.distance_metric == "euclidean":
        return np.linalg.norm(som_array - datapoint, axis=2)

    # node-by-node algorithms
    distmat = np.zeros((self.n_rows, self.n_columns))
    if self.distance_metric == "manhattan":
        for node in self.node_list_:
            distmat[node] = dist.cityblock(
                som_array[node[0], node[1]], datapoint)

    elif self.distance_metric == "mahalanobis":
        for node in self.node_list_:
            som_node = som_array[node[0], node[1]]
            cov = np.cov(np.stack((datapoint, som_node), axis=0),
                         rowvar=False)
            cov_pinv = np.linalg.pinv(cov)   # pseudo-inverse
            distmat[node] = dist.mahalanobis(
                datapoint, som_node, cov_pinv)

    elif self.distance_metric == "tanimoto":
        # Note that this is a binary distance measure.
        # Therefore, the vectors have to be converted.
        # Source: Melssen 2006, Supervised Kohonen networks for
        # classification problems
        # VERY SLOW ALGORITHM!!!
        threshold = 0.5
        for node in self.node_list_:
            som_node = som_array[node[0], node[1]]
            distmat[node] = dist.rogerstanimoto(
                binarize(datapoint.reshape(1, -1), threshold=threshold,
                         copy=True),
                binarize(som_node.reshape(1, -1), threshold=threshold,
                         copy=True))

    elif self.distance_metric == "spectralangle":
        for node in self.node_list_:
            som_node = som_array[node[0], node[1]]
            # normalize by the norm of the node's weight vector and the
            # datapoint, not by the norm of the full weight tensor
            distmat[node] = np.arccos(np.divide(
                np.dot(som_node, datapoint),
                np.multiply(np.linalg.norm(som_node),
                            np.linalg.norm(datapoint))))

    return distmat
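# Standalone illustration (not part of the class above): the spectral-angle distance
# between one datapoint and every node of a toy SOM, written in vectorized NumPy.
# `som_array` and `datapoint` are made-up toy inputs; the point is that each node is
# normalized by its own weight-vector norm rather than by the full weight tensor.
import numpy as np

rng = np.random.default_rng(0)
som_array = rng.random((2, 3, 4))      # (n_rows, n_columns, n_features)
datapoint = rng.random(4)              # one row of X

dots = np.einsum("rcf,f->rc", som_array, datapoint)
node_norms = np.linalg.norm(som_array, axis=2)
cosines = dots / (node_norms * np.linalg.norm(datapoint))
spectral_angles = np.arccos(np.clip(cosines, -1.0, 1.0))
print(spectral_angles.shape)           # (2, 3): one distance per node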
def binarize_image(image, method='li', **kwargs): """Binarize image using one of the available methods: 'isodata', 'li', 'otsu', 'sauvola', and 'boolean'. Defaults to 'li'. Extra keyword arguments are passed in as is to the corresponding scikit-image thresholding function. The 'boolean' method refers to simple thresholding from a grey-scale image. If a 'threshold' kwarg is not passed to the 'boolean' method, 'li' thresholding is performed. For reference Sezgin M. and Sankur B. (2004) "Survey over Image Thresholding Techniques and Quantitative Performance Evaluation" Journal of Electronic Imaging, 13(1): 146-165 DOI:10.1117/1.1631315 """ if image.ndim != 2: # image is not gray-scale image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if np.unique(image).size == 2: # image is already binary return image boolean_threshold = kwargs.get('threshold', None) if method == 'boolean' and boolean_threshold: preprocessing.binarize(image, threshold=boolean_threshold, copy=False) return convert(image) if method not in ('sauvola', 'isodata', 'otsu', 'li'): method = 'li' thresh_func = getattr(filters.thresholding, "threshold_{}".format(method)) threshold = thresh_func(image, **kwargs) # OpenCV can't write black and white images using boolean values, it needs # at least a 8bits 1-channel image ranged from 0 (black) to 255 (white) return convert(image <= threshold)
def binarize_encode_target_columns(apache_df_list):
    """
    Binarize the predictions and encode the actual target columns for the
    apache prediction tables.

    NOTE - all column names are the same names as when they were queried from
    the database. This will not work if you have renamed predictedicumortality,
    predictedhospitalmortality, actualicumortality or actualhospitalmortality.
    If you have, please rename them back or change them directly in this function.

    Parameters
    ------------
    apache_df_list: list of dataframe objects
        The dataframes on which the operations will be performed.

    Returns
    ------------
    None, directly makes changes to the dataframes listed in apache_df_list.
    Four new columns will be added:
        icu_death_prediction_label : class labels from the predictedicumortality column
        hosp_death_prediction_label : class labels from the predictedhospitalmortality column
        icu_deaths : class labels for the actualicumortality column
        hosp_deaths : class labels for the actualhospitalmortality column
    """
    # set the threshold
    threshold = 0.5

    # loop through the dataframes, binarize predictions and encode labels for the established truth
    for df in apache_df_list:
        # binarize predictions
        icu_death_predictions = binarize(
            df['predictedicumortality'].values.reshape(-1, 1), threshold=threshold)
        hosp_death_predictions = binarize(
            df['predictedhospitalmortality'].values.reshape(-1, 1), threshold=threshold)
        df['icu_death_prediction_label'] = icu_death_predictions
        df['hosp_death_prediction_label'] = hosp_death_predictions

        # encode labels for the actual outcomes
        df['icu_deaths'] = df['actualicumortality'].map(
            lambda status: 0 if status == 'ALIVE' else 1)
        df['hosp_deaths'] = df['actualhospitalmortality'].map(
            lambda status: 0 if status == 'ALIVE' else 1)
def train(self, X_train, y_train, silent = False): '''train the model, X_train contains the tweet in each row''' if self.useTfIdf: self.vectorizer = TfidfVectorizer(ngram_range=(self.ngram_s, self.ngram_e), tokenizer=lambda x: x.split(), lowercase=False, preprocessor=lambda x: x) else: self.vectorizer = CountVectorizer(ngram_range=(self.ngram_s, self.ngram_e), tokenizer=lambda x: x.split(), lowercase=False, preprocessor=lambda x: x) if self.multinomial: self.model = MultinomialNaiveBayes() else: self.model = BernoulliNaiveBayes() self.vectorizer.fit(X_train.astype('str')) #assert len(self.vectorizer.stop_words_) == 0 #we don't want preprocess by scikit learn, we already performed it #print(self.vectorizer.get_feature_names()) if not silent: print('vectorizer trained') X_train_bow = self.vectorizer.transform(X_train.astype('str')) if not self.multinomial: binarize(X_train_bow, copy=False) if not silent: print('train data vectorized') self.model.train(X_train_bow, y_train) if not silent: print('model trained')
def getSrlRepresentation(cas, intensity=False, log=False, bnrz=False, representationSize=200): from sklearn.preprocessing import binarize model = models.Word2Vec.load('models/word2vec/srlModel') ret = [None] * len(cas.sentences) for i, sentence in enumerate(cas.srlSentences): numRows = sum([len(clause) for clause in sentence]) altSentence = np.zeros((numRows, representationSize)) currentRow = 0 for clause in sentence: for j, (role, text) in enumerate(clause.iteritems()): word = str((role, text)) try: altWord = np.multiply( np.add(np.divide(model[word], 2.0), 0.5), 255) if intensity else model[word] altWord = np.multiply( binarize(altWord, threshold=255.0 / 2.0), 255) if bnrz and intensity else altWord altWord = binarize( altWord) if bnrz and not intensity else altWord altSentence[currentRow, :] = altWord except: altSentence[currentRow, :] = altSentence[ j - 1, :] if j != 0 else np.zeros(representationSize) currentRow += 1 ret[i] = altSentence return ret
def main(): logging.info(u"Getting clusters data") uid_to_ug = get_ug_data(args.user_cluster) bid_to_bg, bg_iids = get_bg_data(args.booking_cluster) logging.info("Reading training data") training_df = pd.read_csv(args.training_csv) tr_m = get_matrix(training_df, uid_to_ug, bid_to_bg) logging.info(u"Training matrix: %s", get_sparse_matrix_info(tr_m)) logging.info("Reading testing data") # we don't care about repetitive actions in the testing testing_df = pd.read_csv(args.testing_csv)[["code", "propcode"]].drop_duplicates() logging.info("Preparing similarity matrix") sim_m = get_similarity_matrix(tr_m) logging.info("Testing hit ratio at top-%s", args.top_k) recs_m = get_topk_recs(tr_m, sim_m, binarize(tr_m), args.top_k) logging.info(u"Hit ratio: %.3f", hit_ratio(recs_m, testing_df, uid_to_ug, bg_iids)) if args.top_k_iid_per_uid: recs_m = get_topk_recs(tr_m, sim_m, binarize(tr_m)) store_data_for_eval(recs_m, testing_df, uid_to_ug, bg_iids)
def test_preprocessing_assignment(self): iris = datasets.load_iris() df = pdml.ModelFrame(iris) original_columns = df.data.columns df['sepal length (cm)'] = df[ 'sepal length (cm)'].preprocessing.binarize(threshold=6) self.assertIsInstance(df, pdml.ModelFrame) binarized = pp.binarize(np.atleast_2d(iris.data[:, 0]), threshold=6) expected = np.hstack([binarized.T, iris.data[:, 1:]]) self.assert_numpy_array_almost_equal(df.data.values, expected) tm.assert_index_equal(df.data.columns, original_columns) # recreate data iris = datasets.load_iris() df = pdml.ModelFrame(iris) target_columns = ['sepal length (cm)', 'sepal width (cm)'] df[target_columns] = df[target_columns].preprocessing.binarize( threshold=6) self.assertIsInstance(df, pdml.ModelFrame) binarized = pp.binarize(iris.data[:, 0:2], threshold=6) expected = np.hstack([binarized, iris.data[:, 2:]]) self.assert_numpy_array_almost_equal(df.data.values, expected) tm.assert_index_equal(df.data.columns, original_columns)
def superimpose_two_masks(mask_fn1, mask_fn2): img_in = cv2.imread(mask_fn1, cv2.IMREAD_GRAYSCALE) img_in = binarize(img_in, threshold=50, copy=True) img_side = cv2.imread(mask_fn2, cv2.IMREAD_GRAYSCALE) img_side = binarize(img_side, threshold=50, copy=True) composite = cv2.bitwise_or(img_in,img_side) return composite
def binarize(pred, threshold=0.5): # Batch_wise if pred.ndim == 3: return np.array( [pre.binarize(sub, threshold=threshold) for sub in pred]) else: return pre.binarize(pred, threshold=threshold)
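# Why the wrapper above iterates over the batch axis: scikit-learn's binarize expects
# 2-D input, so a 3-D prediction tensor is handled slice by slice. Toy values only.
import numpy as np
from sklearn import preprocessing as pre

pred = np.array([[[0.2, 0.7], [0.9, 0.4]],
                 [[0.6, 0.1], [0.3, 0.8]]])            # (batch, rows, cols)
out = np.array([pre.binarize(p, threshold=0.5) for p in pred])
print(out.shape)                                       # (2, 2, 2)
print(out[0])                                          # [[0. 1.] [1. 0.]]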
def bns(X, y):
    """
    Implements the bi-normal separation scoring.
    """
    # binarization: from counts to presence/absence
    binarize(X, threshold=0.0, copy=False)

    # one column per class
    Y = LabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:  # binary problem case
        Y = np.append(1 - Y, Y, axis=1)

    pos = np.sum(Y, axis=0)
    neg = Y.shape[0] - pos

    tp = safe_sparse_dot(X.T, Y)
    fp = np.sum(tp, axis=1).reshape(-1, 1) - tp

    tpr = bounded(tp / pos.astype(float))
    fpr = bounded(fp / neg.astype(float))

    bns = np.abs(_z_score(tpr) - _z_score(fpr))

    return bns[:, 1], None
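# Self-contained sketch of the bi-normal separation idea used above (the `bounded` and
# `_z_score` helpers are not shown in this snippet): BNS(f) = |F^-1(tpr) - F^-1(fpr)|,
# with F^-1 the standard-normal inverse CDF and both rates clipped away from 0 and 1.
# The counts below are made up for illustration.
import numpy as np
from scipy.stats import norm

tp, pos = 40, 50        # positive-class documents containing the feature / total positives
fp, neg = 5, 100        # negative-class documents containing the feature / total negatives

eps = 0.0005
tpr = np.clip(tp / pos, eps, 1 - eps)
fpr = np.clip(fp / neg, eps, 1 - eps)
print(abs(norm.ppf(tpr) - norm.ppf(fpr)))   # the BNS score for this single feature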
def jaccard_sim(self): '''given a sparse matrix, calculate jaccard sim ** ref : http://na-o-ys.github.io/others/2015-11-07-sparse-vector-similarities.html ''' if self.kind == 'ubcf': # assure binarize sp matrix and astype int16 mat = binarize(self.inter).astype('int16') elif self.kind == 'ibcf': # assure binarize sp matrix and astype int16 mat = binarize(self.inter.T).astype('int16') rows_sum = mat.getnnz(axis=1).astype('int16') # ab = mat.dot(mat.T).astype('float16') # mat x t(mat) # for rows aa = np.repeat(rows_sum, ab.getnnz(axis=1)) # for columns bb = rows_sum[ab.indices] similarities = ab.tocoo(copy=True) similarities.data /= (aa + bb - ab.data) del aa, bb, ab # large memory cost similarities = similarities.astype('float32') # similarities.setdiag(0) ## similarities = similarities.tocsr() similarities.eliminate_zeros() sparsity = float(similarities.nnz / mat.shape[0]**2) * 100 print( 'similarity (jaccard) matrix built ({}), \nsparsity of similarity: {:.2f} %' .format(self.kind, sparsity)) self.sim = similarities
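# Small self-contained check of the sparse Jaccard trick used above: with binary
# interactions, mat.dot(mat.T) gives the intersection sizes and the row nnz counts
# give the set sizes, so jaccard = ab / (|A| + |B| - ab). Toy matrix only.
import numpy as np
from scipy.sparse import csr_matrix

mat = csr_matrix(np.array([[1, 1, 0, 1],
                           [1, 0, 0, 1],
                           [0, 1, 1, 0]]))
rows_sum = mat.getnnz(axis=1)
ab = mat.dot(mat.T).toarray()
jaccard = ab / (rows_sum[:, None] + rows_sum[None, :] - ab)
print(np.round(jaccard, 2))    # diagonal 1.0; rows 0 and 1 share 2 of 3 items -> 0.67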
def test_binarize(self): iris = datasets.load_iris() df = pdml.ModelFrame(iris) result = df.preprocessing.binarize() expected = pp.binarize(iris.data) self.assertTrue(isinstance(result, pdml.ModelFrame)) self.assert_numpy_array_almost_equal(result.data.values, expected) self.assert_index_equal(result.columns, df.data.columns) result = df.preprocessing.binarize(threshold=5) expected = pp.binarize(iris.data, threshold=5) self.assertTrue(isinstance(result, pdml.ModelFrame)) self.assert_numpy_array_almost_equal(result.data.values, expected) self.assert_index_equal(result.columns, df.data.columns) s = df['sepal length (cm)'] self.assertTrue(isinstance(s, pdml.ModelSeries)) result = s.preprocessing.binarize() expected = pp.binarize(iris.data[:, 0])[0] self.assertTrue(isinstance(result, pdml.ModelSeries)) self.assert_numpy_array_almost_equal(result.values, expected) self.assertEqual(result.name, 'sepal length (cm)') result = s.preprocessing.binarize(threshold=6) expected = pp.binarize(iris.data[:, 0], threshold=6)[0] self.assertTrue(isinstance(result, pdml.ModelSeries)) self.assert_numpy_array_almost_equal(result.values, expected) self.assertEqual(result.name, 'sepal length (cm)')
def ig(X, y):
    """
    This method calculates the information gain for two random variables I(X, Y).
    """
    # binarization: from counts to presence/absence
    binarize(X, threshold=0.0, copy=False)

    # one column per class
    Y = LabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:  # binary problem case
        Y = np.append(1 - Y, Y, axis=1)

    Y_prob = (np.sum(Y, axis=0, dtype=np.float64) / len(Y)).reshape(-1, 1)

    # calculate the class entropy H(Y)
    class_entropy = _entropy(Y_prob)

    X_y_count = safe_sparse_dot(Y.T, X)
    # TODO XXX FIXME check whether this probability is computed correctly
    X_y_prob = \
        X_y_count / np.sum(X_y_count, axis=0, dtype=np.float64)

    # calculate the conditional entropy of the class given the feature H(y|f_i)
    cond_entropy = _entropy(X_y_prob)
    # TODO XXX FIXME check whether the conditional entropy is computed correctly

    print("class:", class_entropy)
    print("cond_entropy:", cond_entropy)

    infogain = class_entropy - cond_entropy

    return infogain, None
def greedy_cailp(positive_coverage, negative_coverage, k=20): num_features = positive_coverage.shape[1] num_positive_tweets = positive_coverage.shape[0] num_negative_tweets = negative_coverage.shape[0] positive_bin = binarize(positive_coverage) negative_bin = binarize(negative_coverage) positive_lil = positive_bin.tolil() negative_lil = negative_bin.tolil() selected_features = [] for i in range(k): print(i) scores = (positive_lil.sum(axis=0) / float(num_positive_tweets)) \ - (negative_lil.sum(axis=0) / float(num_negative_tweets)) selected_feature = scores.argmax() if selected_feature not in selected_features: covered_pos_tweets = list( positive_lil[:, selected_feature].nonzero()[0]) utils.delete_row_lil(positive_lil, covered_pos_tweets) covered_neg_tweets = list( negative_lil[:, selected_feature].nonzero()[0]) utils.delete_row_lil(negative_lil, covered_neg_tweets) selected_features.append(selected_feature) else: break return selected_features
def main(): logging.info("Reading training data") training_df = pd.read_csv(args.training_csv) tr_m, uid_to_row, iid_to_col = get_training_matrix_and_indices(training_df) logging.info("Training matrix: %s", get_sparse_matrix_info(tr_m)) logging.info("Reading testing data") testing_df = pd.read_csv(args.testing_csv)[["code", "propcode"]].drop_duplicates() logging.info("Preparing similarity matrix") sim_m = get_similarity_matrix(tr_m) logging.info("Testing hit ratio at top-%s", args.top_k) recs_m = get_topk_recs( normalize(tr_m), sim_m, binarize(tr_m), args.top_k, ) logging.info("Hit ratio: %.3f", hit_ratio(recs_m, testing_df, uid_to_row, iid_to_col)) if args.top_k_iid_per_uid: recs_m = get_topk_recs( tr_m, sim_m, binarize(tr_m) ) store_data_for_eval(recs_m, testing_df, uid_to_row, iid_to_col)
def meanThreshold(adata, groupby, threshold, return_df=False, layer=None, use_raw=False, transformation="log1p"): """Binarize gene expression for groups aggregated by mean. Returns: adata object with updated uns.gene_call """ from sklearn.preprocessing import binarize import pandas as pd df = get_adata_df(adata, layer=layer, use_raw=use_raw, transformation=transformation) result = df.groupby(by=adata.obs[groupby], axis=1).mean() binarize(result, threshold=threshold, copy=False) if return_df is True: return result else: adata.uns.update({"gene_call": result}) return adata
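# Self-contained sketch of the aggregation step above, with a made-up genes x cells
# frame: average the expression per group of columns, then binarize at the threshold.
# (Transposing for the groupby is an equivalent formulation of the column-wise mean.)
import pandas as pd
from sklearn.preprocessing import binarize

df = pd.DataFrame([[0.0, 2.0, 4.0], [1.0, 1.0, 0.0]],
                  index=["gene1", "gene2"], columns=["c1", "c2", "c3"])
groups = pd.Series(["A", "A", "B"], index=df.columns)

result = df.T.groupby(groups).mean().T      # mean expression per gene and group
gene_call = pd.DataFrame(binarize(result, threshold=0.5),
                         index=result.index, columns=result.columns)
print(gene_call)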
def get_prediction_metrics(self): print("Getting prediction metrics") df = self.get_predictions_as_df(self.predictions) metrics = {} prediction_metrics = {} annotation_metrics = {} prediction_metrics["event_count"] = len(df["start"]) prediction_metrics["mean_duration"] = df["duration"].mean() if len( df["start"]) > 0 else 0 # Hour * hz prediction_metrics[ "recording_length_minutes"] = self.last_predicted_index / (60 * 10) if prediction_metrics["recording_length_minutes"] > 0: prediction_metrics["calculated_ahi"] = ( prediction_metrics["event_count"] / prediction_metrics["recording_length_minutes"]) * 60 metrics["prediction"] = prediction_metrics if self.ground_truth is not None: df = self.get_predictions_as_df(self.ground_truth) annotation_metrics["event_count"] = len(df["start"]) annotation_metrics["mean_duration"] = df["duration"].mean() if len( df["start"]) > 0 else 0 annotation_metrics[ "annotation_length_minutes"] = self.ground_truth_length / (60 * 10) metric_end = int( float(max(self.ground_truth_length, self.last_predicted_index))) if annotation_metrics["annotation_length_minutes"] > 0: annotation_metrics["calculated_ahi"] = ( annotation_metrics["event_count"] / annotation_metrics["annotation_length_minutes"]) * 60 predictions = self.predictions[:metric_end] ground_truth = self.ground_truth[:metric_end] ground_truth_binary = np.ravel( binarize(ground_truth.reshape(1, -1), 0)) predictions_binary = np.ravel( binarize(predictions.reshape(1, -1), 0)) annotation_metrics["accuracy_score"] = accuracy_score( ground_truth_binary, predictions_binary) annotation_metrics["f1_score"] = f1_score(ground_truth_binary, predictions_binary) annotation_metrics["precision_score"] = precision_score( ground_truth_binary, predictions_binary) annotation_metrics["recall_score"] = recall_score( ground_truth_binary, predictions_binary) metrics["annotation"] = annotation_metrics return metrics
def partition(features, a, probs, w): ig = 0 ap = {} # multiply a by row and re-sparsify x = features.index(w) ap['yes'] = a.multiply(binarize(a[x])).tocsr() # a_no is whatever's left of 'a' after removing a_yes ap['no'] = a - ap['yes'] # sum a's columns and binarize qk = binarize(a.sum(axis=0))[0] pk = {} pk['no'] = binarize(ap['no'].sum(axis=0))[0] # pk['yes'] is whatever's left of qk after removing pk['no'] pk['yes'] = qk - pk['no'] ap['yes'] = ap['yes'].multiply(pk['yes'].reshape(-1, 1).T).tocsr() # for qk and both pk's, multiply by static probs vector, then normalize qk = qk * probs ig_c = {} ig_uc = {} ig_c['yes'] = 0.0 ig_uc['yes'] = 0.0 ig_c['no'] = 0.0 ig_uc['no'] = 0.0 if np.sum(qk) > 0: qk_num = len(np.where(qk != 0)[0]) qk = normalize(qk) if VERBOSE: print(CRED + 'qk ' + str(qk_num) + CEND, '\n', a.A, '\n', qk) for d in ['yes', 'no']: pk[d] = pk[d] * probs if np.sum(pk[d]) > 0: pkd_nz = np.where(pk[d] != 0)[0] pk_num = len(pkd_nz) pk[d] = normalize(pk[d]) qk_nz = np.where(qk != 0)[0] ig_uc[d] = entropy(pk=pk[d][qk_nz], qk=qk[qk_nz], base=2) ig_c[d] = (pk_num / qk_num) * ig_uc[d] ig += ig_c[d] if VERBOSE: print(CRED + 'pk[' + d + '] ' + str(pk_num) + CEND, '\n', ap[d].A, '\n', pk[d]) if VERBOSE: print(ig) return ig, ap['yes'], ig_uc, ig_c
def binarizer():
    a = [[-1, 3, -2], [5, -7, -4]]
    b = preprocessing.binarize(a)                   # default threshold=0.0 -> [[0, 1, 0], [1, 0, 0]]
    print(b)
    print(preprocessing.binarize(a, threshold=-2))  # values > -2 become 1 -> [[1, 1, 0], [1, 0, 0]]
    estimator = preprocessing.Binarizer()           # estimator form of the same transformation
    print(estimator.transform(a))
def perform_test(self, X_test, silent=False): X_test_bow = self.vectorizer.transform(X_test.astype('str')) if not self.multinomial: binarize(X_test_bow, copy=False) if not silent: print('test data vectorized') y_score = self.model.multi_prediction_score(X_test_bow) y_pred = self.model.multi_predict_class_from_score(y_score, threshold=self.threshold) return y_score, y_pred
def predict(self, X): ''' Predict class labels. ''' if self.mode == 'average': return binarize(self.predict_proba(X)[:, [1]], 0.5) else: res = binarize(X, 0.5) return np.apply_along_axis( lambda x: np.bincount(x.astype(int), self.weights).argmax(), axis=1, arr=res)
def Binarize(self, column=None):
    """ Feature Binarization: thresholding numerical features to get boolean values """
    try:
        # the dataset is assumed to be stored on the instance as `self._dataset`
        if column is None:
            self._dataset = preprocessing.binarize(self._dataset)
        else:
            self._dataset[column] = preprocessing.binarize(self._dataset[column])
    except Exception as e:
        print("Binarize failed!")
        print(e)
def multibinarize(x, thresholds): if hasattr(x, "fillna"): x = x.fillna(0).values.reshape(-1, 1) else: x = x.reshape(-1, 1) res = None for threshold in thresholds: if res is None: res = binarize(x, threshold) else: res += binarize(x, threshold) return res[:, 0]
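# Standalone sketch of the idea behind `multibinarize`: one indicator per threshold,
# summed into an ordinal bucket index. Uses the keyword `threshold=` form of
# scikit-learn's binarize; values and thresholds below are illustrative only.
import numpy as np
from sklearn.preprocessing import binarize

x = np.array([0.1, 1.5, 2.5, 3.5]).reshape(-1, 1)
buckets = sum(binarize(x, threshold=t) for t in (1, 2, 3))
print(buckets[:, 0])    # [0. 1. 2. 3.]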
def process_nps(nps): print("processing meanings ...") probs = [] pairs = [] features = [] chunks = [] vectors = [] chunk_size = 1000 # add adjective features for alist in nps.adjs.unique(): for w in alist.split(','): if w not in features: features.append(w) # add noun features for w in nps.noun.unique(): if w not in features: features.append(w) # create vector for each NP total = len(nps) for i, row in nps.iterrows(): print_progress(i + 1, total) vector = [0] * len(features) vector[features.index(row['noun'])] = 1 for adj in row['adjs'].split(','): vector[features.index(adj)] = 1 if WEIGHTED_PROBS: probs.append(np.clip(row['count'], 0, 100)) else: probs.append(1) vectors.append(vector) if len(vectors) > chunk_size: chunks.append(csr_matrix(binarize(np.array(vectors).T)).tocsr()) vectors = [] chunks.append(csr_matrix(binarize(np.array(vectors).T)).tocsr()) print("") print("combining vectors...") a_orig = hstack(chunks).tocsr() print("normalizing probabilities ...") probs = normalize(np.array(probs)) print('total feature vectors:', len(probs)) return features, probs, a_orig
def get_score(X, y, clf, scoring = 'accuracy'): from sklearn.preprocessing import binarize if scoring == 'accuracy': from sklearn.metrics import accuracy_score score = accuracy_score(y, binarize(clf.predict(X), 0.5)) elif scoring =='f1': from sklearn.metrics import f1_score score = f1_score(y, binarize(clf.predict(X), 0.5)) else: score = clf.score(X, y) return score
def binary_bow(self, n=None):
    data_test = self.data_test
    data_train = self.data_train
    if n:
        X_te = binarize(np.array(data_test[0][0:n].todense()))
        X_tr = binarize(np.array(data_train[0][0:n].todense()))
        small_test = X_te, data_test[1][0:n]
        small_train = X_tr, data_train[1][0:n]
        return small_train, small_test
    return data_train, data_test
def evaluate(self, pred_all, test, method='precision'):
    """
    params
    ======
    pred_all: (ndarray) predicted/recommended result for each user
    test: (csr_matrix) testing set (test.shape[0] should be the same as pred_all.shape[0])
    method: (str) evaluation method, 'precision' (default) or 'recall'

    attribute
    =========
    precision
    recall
    """
    assert type(test) == sp.csr_matrix
    assert test.shape[0] == pred_all.shape[0]
    if method == 'precision':
        test_lil = binarize(test).tolil()  # binarize and transform to lil
        prec_array = np.zeros(pred_all.shape[0])
        num_of_test_data = 0
        for user, items in enumerate(test_lil.rows):
            prec_array[user] = len(np.intersect1d(
                items, pred_all[user, ])) / len(pred_all[user, ])
            if items != []:
                num_of_test_data += 1
        # return np.sum(prec_array)/num_of_test_data
        self.precision = np.sum(prec_array) / num_of_test_data
        print("\n-------------")
        print("model: {},\ntopN: {}".format(self.kind, self.topN))
        print("precision: {:.2f} %".format(self.precision * 100))

    if method == 'recall':
        test_coo = binarize(test).tocoo()  # binarize and transform to coo
        score = 0
        nonzero_rowsets = set(test_coo.row)
        for row, col, v in zip(test_coo.row, test_coo.col, test_coo.data):
            if col in pred_all[row, ]:
                score += 1
        self.recall = score / len(nonzero_rowsets)
        print("\n-------------")
        print("model: {},\ntopN: {}".format(self.kind, self.topN))
        print("recall:{:.2f} %".format(score / len(test_coo.data) * 100))
def get_recs(self, ug_id, iid_recs, top_clusters=None, min_iid_per_bg=None): bg_recs_row = self.ug_bg_recs_m[ug_id] bg_mask = binarize( self.item_dp.get_iid_per_bg_row(binarize(iid_recs), min_iid_per_bg) ) bg_recs_row = bg_recs_row.multiply(bg_mask) if top_clusters is not None: arg_ids = np.argsort(bg_recs_row.data)[-top_clusters:] rows, cols = bg_recs_row.nonzero() bg_recs_row = csr_matrix( (bg_recs_row.data[arg_ids], (rows[arg_ids], cols[arg_ids])), shape=bg_recs_row.shape ) return bg_recs_row
def getRepresentation(cas, intensity=False, log=False, bnrz=False, representationSize=200): from sklearn.preprocessing import binarize ret = [None]*len(cas.tokens) for i, sentence in enumerate(cas.tokens): altSentence = np.zeros((len(sentence.split()), representationSize)) for j, word in enumerate(sentence.split()): try: altWord = np.multiply(np.add(np.divide(model[word], 2.0), 0.5), 255) if intensity else model[word] altWord = np.multiply(binarize(altWord, threshold=255.0/2.0), 255) if bnrz and intensity else altWord altWord = binarize(altWord) if bnrz and not intensity else altWord altSentence[j,:] = altWord except: altSentence[j,:] = altSentence[j-1,:] if j != 0 else np.zeros(representationSize) ret[i] = altSentence return ret
def PreprocessingData(processType): if processType=="Normalization": AlgorithmOperation.train_X = preprocessing.normalize(AlgorithmOperation.train_X, norm='l2') AlgorithmOperation.test_X = preprocessing.normalize(AlgorithmOperation.test_X, norm='l2') elif processType=="Scale": AlgorithmOperation.train_X =preprocessing.scale(AlgorithmOperation.train_X) AlgorithmOperation.test_X =preprocessing.scale(AlgorithmOperation.test_X) elif processType=="Binarization": AlgorithmOperation.train_X =preprocessing.binarize(AlgorithmOperation.train_X) AlgorithmOperation.test_X =preprocessing.binarize(AlgorithmOperation.test_X) elif processType=="Polynomial Feature": poly=preprocessing.PolynomialFeatures(2) AlgorithmOperation.train_X = poly.fit_transform(AlgorithmOperation.train_X) AlgorithmOperation.test_X = poly.fit_transform(AlgorithmOperation.test_X)
def example2():
    """Method 2 [recommended]
    """
    X = np.array([[1, -1, 2],   ## the "f" dtype is essential: for the preprocessing, the matrix elements must be floating point
                  [2, 0, 0],
                  [0, 1, -1]], dtype="f")

    print("binarized X = \n%s\n" % preprocessing.binarize(X, threshold=1.1))
def train_step(self, x, y): # perform following steps: # -reset the gradients # -propagate through the network # -calculate the loss # -compute gradient by backward propagation # -update weights # -return the loss # TODO if self._cuda: x = x.clone().detach().cuda() y = y.clone().detach().cuda().squeeze() #x = t.tensor(x, dtype=t.float).cuda() #y = t.tensor(y, dtype=t.float).cuda().squeeze() self._optim.zero_grad() y_pred = self._model(x) y_predTmp = y_pred.clone() y_predTmp = binarize(y_predTmp.cpu().detach().numpy(), threshold=0.5) # numpy array w/o grad y_pred.data = t.tensor(y_predTmp, dtype=t.float).cuda() loss = self._crit(y_pred, y.float()) loss.backward() self._optim.step() return loss
def test_model(LRM, data, TBI=False): global GAMMA y = list(data.TBResult.values) test_recs = list(data.StudyNum.values) X = data.drop(['StudyNum','TBResult'],axis=1) probs = LRM.predict_proba(X)[:,1] """ Calculate AUC acc using ROC analysis """ # Get FPR and TPR for the test set fpr, tpr, thresh = roc_curve(y,probs) # Calc AUC acc auc_acc = auc(fpr,tpr) pred = map(int,binarize(np.array(probs).reshape(1,-1),threshold = GAMMA)[0]) if not TBI: ACC,SENS,SPEC = eval_model(pred, y) return [ACC,SENS,SPEC,auc_acc] else: ACC,SENS,SPEC = eval_model(pred, y, probs=probs, test_px=test_recs, TBI=True) return [ACC,SENS,SPEC]
def run_test(seqs, label_seqs, sess, preds_T, input_PHs, label_PHs, mask_PHs, seq_length_PH, loss_T, options): all_losses = [] all_preds = [] all_labels = [] batch_size = options['batch_size'] for idx in xrange(len(label_seqs) / batch_size): batch_x = seqs[idx * batch_size:(idx + 1) * batch_size] batch_y = label_seqs[idx * batch_size:(idx + 1) * batch_size] inputs, _, masks, seq_length = mime_util.st_preprocess_hf_aux( batch_x, options) preds, loss = sess.run( [preds_T, loss_T], feed_dict={ input_PHs[0]: inputs[0], input_PHs[1]: inputs[1], input_PHs[2]: inputs[2], mask_PHs[0]: masks[0], mask_PHs[1]: masks[1], mask_PHs[2]: masks[2], label_PHs[-1]: batch_y, seq_length_PH: seq_length, }) all_losses.append(loss) all_preds.extend(list(preds)) all_labels.extend(batch_y) auc = roc_auc_score(all_labels, all_preds) aucpr = average_precision_score(all_labels, all_preds) accuracy = (np.array(all_labels) == np.squeeze( binarize(np.array(all_preds).reshape(-1, 1), threshold=.5))).mean() return np.mean(all_losses), auc, aucpr
def resc(patch):
    """
    :param patch: [image, mask]
    :return: random rescaling of the pair [image, mask]

    --- Rescaling reinforces axon size diversity ---
    """
    # pick a single random scale factor
    scale = random.choice([0.5, 0.75, 1.0, 1.5, 2.0])

    image_rescale = rescale(patch[0], scale)
    mask_rescale = rescale(patch[1], scale)
    s_r = mask_rescale.shape[0]
    q_h, r_h = divmod(256 - s_r, 2)

    if q_h > 0:
        # the rescaled patch is smaller than 256: pad it back to 256x256
        image_rescale = np.pad(image_rescale, (q_h, q_h + r_h), mode="reflect")
        mask_rescale = np.pad(mask_rescale, (q_h, q_h + r_h), mode="reflect")
    else:
        # the rescaled patch is larger than 256: crop a random 256x256 patch
        patches = extract_patch(image_rescale, mask_rescale, 256)
        i = np.random.randint(len(patches))
        image_rescale, mask_rescale = patches[i]

    mask_rescale = preprocessing.binarize(np.array(mask_rescale), threshold=0.001)
    data_rescale = [image_rescale, mask_rescale]

    return data_rescale
def do_transformations(self):
    # binarize counts
    if self.transform == 'binarize':
        print("Binarizing")
        self.feature_counts = binarize(self.feature_counts, copy=False)
        #self.feature_counts = sparse.csr_matrix(self.feature_counts > 0, dtype=int)
    elif self.transform == 'tfidf':
        print("Doing tf-idf transform")
        #doc_sums = self.feature_counts.sum(axis=1)
        #if np.min(doc_sums) == 0:
        #    doc_sums[doc_sums == 0] = 1.0
        #tf = sparse.csr_matrix(self.feature_counts.multiply(1.0/doc_sums))
        n_items, n_features = self.feature_counts.shape
        tf = normalize(self.feature_counts, norm='l1', axis=1, copy=False)
        doc_counts = self.vocab.get_all_doc_counts()
        n_docs = doc_counts.max()
        # add one to avoid zeros which might screw up the matrix size
        idf = sparse.csr_matrix(np.log(float(n_docs + 1) / doc_counts), dtype=float)
        print(tf.shape, idf.shape)
        self.feature_counts = tf.multiply(idf)
        assert self.feature_counts.shape == (n_items, n_features)
    elif self.transform == 'normalizel1' or self.transform == 'normalize':
        print("Normalizing rows")
        self.feature_counts = normalize(self.feature_counts, norm='l1', axis=1, copy=False)
    elif self.transform == 'normalizel2':
        print("Normalizing rows")
        self.feature_counts = normalize(self.feature_counts, norm='l2', axis=1, copy=False)

    if self.scale_factor is not None:
        self.feature_counts = self.feature_counts * self.scale_factor
def elastic_transform(image, gt, alpha, sigma, random_state=None):
    """
    :param image: image
    :param gt: ground truth
    :param alpha: deformation coefficient (high alpha -> strong deformation)
    :param sigma: std of the gaussian filter (high sigma -> smooth deformation)
    :param random_state:
    :return: deformation of the pair [image, mask]
    """
    if random_state is None:
        random_state = np.random.RandomState(None)

    shape = image.shape

    d = 4
    sub_shape = (shape[0] // d, shape[0] // d)  # integer division so the shape stays integral

    deformations_x = random_state.rand(*sub_shape) * 2 - 1
    deformations_y = random_state.rand(*sub_shape) * 2 - 1

    deformations_x = np.repeat(np.repeat(deformations_x, d, axis=1), d, axis=0)
    deformations_y = np.repeat(np.repeat(deformations_y, d, axis=1), d, axis=0)

    dx = gaussian_filter(deformations_x, sigma, mode="constant", cval=0) * alpha
    dy = gaussian_filter(deformations_y, sigma, mode="constant", cval=0) * alpha

    x, y = np.meshgrid(np.arange(shape[0]), np.arange(shape[1]))
    indices = np.reshape(y + dy, (-1, 1)), np.reshape(x + dx, (-1, 1))

    elastic_image = map_coordinates(image, indices, order=1).reshape(shape)
    elastic_gt = map_coordinates(gt, indices, order=1).reshape(shape)
    elastic_gt = preprocessing.binarize(np.array(elastic_gt), threshold=0.5)

    return [elastic_image, elastic_gt]
def getPredictions(image_data, threshold, allShipping=False): """ This function returns np arrays of true labels, predicted labels, and predicted probabilities. image_data: generated from Keras image generator, in batch format threshold: the probability at which a classification should be considered shipping (1) allShipping: whether all image_data has a true shipping classification (eg. for the PHMSA data that is assumed to all have shipping activity) """ all_true = np.zeros(0) all_pred = np.zeros(0) pred_prob = np.zeros(0) for i in range(len(image_data)): image_batch, label_batch = image_data[i] if (not allShipping): all_true = np.append(all_true, get_true_labels(label_batch)) y_pred_prob = model.predict_proba(image_batch)[:, 1] y_pred_class = binarize([y_pred_prob], threshold)[0] all_pred = np.append(all_pred, y_pred_class) pred_prob = np.append(pred_prob, y_pred_prob) if (allShipping): all_true = np.repeat(1, len(all_pred)) return all_true, all_pred, pred_prob
def roc_auc(y_true, y_pred, jump=0.01): ''' Area under ROC (Receiver Operating Characteristics) curve Parameters ---------- y_true: numpy.ndarray Targets y_pred: numpy.ndarray Class probability References ---------- .. [1] https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc Returns ------- roc_auc_score: float ROC AUC score ''' y_true, y_pred = y_true.reshape(-1, 1), y_pred.reshape(-1, 1) x = [] y = [] for thr in np.arange(0.01, 1 + jump, jump): y_pred_bin = binarize(y_pred, thr) tn, fp, fn, tp = confusion_binary(y_true, y_pred_bin) tpr = tp / (tp + fn) fpr = fp / (tn + fp) y.append(tpr) x.append(fpr) x = np.array(x) y = np.array(y) return np.abs(np.trapz(y, x)) # Why trapz gives negative value?
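# Self-contained cross-check of the threshold-sweep AUC above: scikit-learn's
# roc_curve plus the same trapezoidal rule should agree with roc_auc_score.
# Labels and scores below are toy values.
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score

y_true = np.array([0, 0, 1, 1, 1, 0])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.7, 0.2])

fpr, tpr, _ = roc_curve(y_true, y_score)
print(abs(np.trapz(tpr, fpr)))          # trapezoidal area under the ROC points
print(roc_auc_score(y_true, y_score))   # the built-in scorer gives the same value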
def op_vs_ip(subid, image_types, imagepaths, op_direc, overlays): img_data_group=[] img_shape_group=[] ol_data_group=[] ol_shape_group=[] for i, path in enumerate(imagepaths): axial_slice, cor_slice, sag_slice, img_aspect_axial, img_aspect_cor, img_aspect_sag = pull_midslices(path) if os.path.isfile(overlays[i]): axial_slice_ol, cor_slice_ol, sag_slice_ol, img_aspect_axial_ol, img_aspect_cor_ol, img_aspect_sag_ol = pull_midslices(overlays[i]) ol_data_group.append([axial_slice_ol, cor_slice_ol, sag_slice_ol]) ol_shape_group.append([img_aspect_axial_ol, img_aspect_cor_ol, img_aspect_sag_ol]) else: ol_data_group.append(['null','null','null']) ol_shape_group.append(['null','null','null']) ## Append to Matrices img_data_group.append([axial_slice, cor_slice, sag_slice]) img_shape_group.append([img_aspect_axial,img_aspect_cor,img_aspect_sag]) my_cmap=plt.cm.gray fig, axarr = plt.subplots(ncols=np.shape(img_shape_group)[1], nrows=np.shape(img_shape_group)[0], figsize=(np.shape(img_shape_group)[0]*5,np.shape(img_shape_group)[1]*5)) plt.suptitle(subid+' File Comparison', fontsize=20) titlearray=['Axial', 'Coronal', 'Saggital'] for x in range(0,np.shape(img_shape_group)[0]): for y in range(0,np.shape(img_shape_group)[1]): im = axarr[x, y].imshow(img_data_group[x][y], cmap=my_cmap, aspect=img_shape_group[x][y]) axarr[x, y].set_xlabel('(Right) Radiological Convention (Left)', fontsize=10) axarr[x, y].set_title(image_types[x]+' '+titlearray[y]) #divider = make_axes_locatable(axarr[x, y]) #cax_ = divider.append_axes("right", size="5%", pad=0.05) #cbar = plt.colorbar(im, cax=cax_, ticks=MultipleLocator(round(np.max(img_data_group[x][y])/5, 1))) axarr[x, y].xaxis.set_visible(False) axarr[x, y].yaxis.set_visible(False) if os.path.isfile(overlays[x]): if x == 1: thresh=0.25 if x == 2: thresh=0.4 sl=np.array(ol_data_group[x][y]).astype(np.float64) sl=filters.sobel(sl) sl=preprocessing.binarize(sl, np.max(sl)*thresh) sl[sl < 1] = 'Nan' axarr[x, y].imshow(sl, cmap='autumn', aspect=ol_shape_group[x][y]) #plt.show() plt.tight_layout() plt.autoscale() plt.savefig(op_direc)
def load_data(self, features, X_threshold): """ Load data into c_data """ from neurosynth.analysis.reduce import average_within_regions # Load Masks by studies matrix # ADD FEATURE TO FILTER BY FEATURES masks_by_studies = average_within_regions(self.dataset, self.mask_img, threshold = self.thresh) study_ids = self.dataset.feature_table.data.index print "Loading data from neurosynth..." pb = tools.ProgressBar(len(list(masks_by_studies)), start=True) self.ids_by_masks = [] self.data_by_masks = [] for mask in masks_by_studies: m_ids = study_ids[np.where(mask == True)[0]] self.ids_by_masks.append(m_ids) self.data_by_masks.append(self.dataset.get_feature_data(ids=m_ids)) pb.next() self.mask_num = masks_by_studies.shape[0] self.mask_pairs = list(itertools.permutations(range(0, self.mask_num), 2)) filename = path.join(mkdtemp(), 'c_data.dat') self.c_data = np.memmap(filename, dtype='object', mode='w+', shape=(self.mask_num, self.mask_num)) # Load data for pair in self.mask_pairs: reg1_ids = self.ids_by_masks[pair[0]] reg2_ids = self.ids_by_masks[pair[1]] reg1_set = list(set(reg1_ids) - set(reg2_ids)) reg2_set = list(set(reg2_ids) - set(reg1_ids)) x1 = self.data_by_masks[pair[0]] x1 = np.array(x1)[np.where(np.in1d(reg1_ids, reg1_set))[0]] x2 = self.data_by_masks[pair[1]] x2 = np.array(x2)[np.where(np.in1d(reg2_ids, reg2_set))[0]] y = np.array([0]*len(reg1_set) + [1]*len(reg2_set)) X = np.vstack((x1, x2)) if X_threshold is not None: X = binarize(X, X_threshold) from neurosynth.analysis.classify import regularize X = regularize(X, method='scale') self.c_data[pair] = (X, y) if self.memsave: self.data_by_masks = [] self.ids_by_masks = []
def transform(self, X): """Compute the Jaccard similarity for all pairs of elements in ``X``. Rows i in ``X`` are assumed to represent pairs, where ``X[i, :n_features]`` and ``X[i, n_features:]`` correspond to their two individual elements, each representing a set. Calling ``transform`` computes the Jaccard similarity between these sets, i.e. such that ``Xt[i]`` is the Jaccard similarity of ``X[i, :n_features]`` and ``X[i, n_features:]``. Parameters ---------- :param X: array-like, shape (n_samples, n_features) Input data. Returns ------- :returns: Xt array-like, shape (n_samples, 1) The transformed data. """ n_samples, n_features_all = X.shape n_features = n_features_all // 2 X = binarize(X) X1 = X[:, :n_features] X2 = X[:, n_features:] sparse = sp.issparse(X) if sparse and not sp.isspmatrix_csr(X): X = X.tocsr() if sparse: if X.data.sum() == 0: return np.zeros((n_samples, 1)) numerator = np.asarray(X1.multiply(X2).sum(axis=1)).ravel() X_sum = X1 + X2 X_sum.data[X_sum.data != 0.] = 1 M = X_sum.sum(axis=1) A = M.getA() denominator = A.reshape(-1,) else: if len(X[X.nonzero()]) == 0.: return np.zeros((n_samples, 1)) numerator = (X1 * X2).sum(axis=1) X_sum = X1 + X2 X_sum[X_sum.nonzero()] = 1 denominator = X_sum.sum(axis=1) with np.errstate(divide="ignore", invalid="ignore"): Xt = numerator / denominator Xt[np.where(denominator == 0)[0]] = 0. return np.array(Xt).reshape(-1, 1)
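# Toy illustration of what the transformer above computes: each row of X holds two
# binary set indicators side by side, and the output is their Jaccard similarity.
import numpy as np

n_features = 3
X = np.array([[1, 1, 0, 1, 0, 0],     # {a, b} vs {a}    -> 1/2
              [1, 0, 1, 1, 0, 1]])    # {a, c} vs {a, c} -> 1
X1, X2 = X[:, :n_features], X[:, n_features:]
intersection = (X1 * X2).sum(axis=1)
union = ((X1 + X2) > 0).sum(axis=1)
print(intersection / union)           # [0.5 1. ]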
def eval_model(preds, y_ref, probs = [], test_px = [], TBI = False):
    global GAMMA

    if len(preds) != len(y_ref):
        print "Predicted labels and test labels don't have the same dimensions!"
        print "Predicted: ", len(preds), "; Tests: ", len(y_ref)
        exit()

    if not TBI:
        CM = confusion_matrix(y_ref, preds)
        TP = CM[1,1]
        TN = CM[0,0]
        FP = CM[0,1]
        FN = CM[1,0]

        ACC = (TP+TN)/float(TP+TN+FP+FN)
        SENS = TP/float(TP+FN)
        SPEC = TN/float(TN+FP)

        return ACC,SENS,SPEC

    else:
        i = np.arange(len(test_px))
        df = pd.DataFrame({"Recording": pd.Series(test_px, index = i),
                           "Prediction": pd.Series(preds, index = i),
                           "Reference": pd.Series(y_ref, index = i),
                           "Probabilities": pd.Series(probs, index = i)
                           }).sort_values(by="Recording")

        y_test_rec = []
        TBI_list = []
        for name, group in df.groupby("Recording"):
            l = group.Reference.iloc[0]
            y_test_rec.append(l)

            TB_prob = sum(group.Probabilities.values) / float(len(group.Probabilities))
            TBI_list.append(TB_prob)

        diagnosis_list = map(int, binarize(np.array(TBI_list).reshape(1,-1), threshold = GAMMA)[0])

        CM = confusion_matrix(y_test_rec, diagnosis_list)
        TP = CM[1,1]
        TN = CM[0,0]
        FP = CM[0,1]
        FN = CM[1,0]

        ACC = (TP+TN)/float(TP+TN+FP+FN)
        SENS = TP/float(TP+FN)
        SPEC = TN/float(TN+FP)

        return ACC, SENS, SPEC
def predictClass(self, threshold=0.5): # prediction # self.pred_y = self.model.predict(self.test_set_X) if self.is_keras: self.pred_y_prob = self.model.predict_proba(self.test_set_X)[:, 0] else: self.pred_y_prob = self.model.predict_proba(self.test_set_X)[:, 1] self.pred_y = binarize(self.pred_y_prob.reshape(1, -1), threshold)[0].astype(int)
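# Minimal end-to-end sketch of predicting classes at a custom probability threshold,
# mirroring `predictClass` above; the data and classifier are toy stand-ins.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import binarize

X = np.array([[0.0], [0.5], [1.0], [1.5], [2.0], [2.5]])
y = np.array([0, 0, 0, 1, 1, 1])

clf = LogisticRegression().fit(X, y)
proba_pos = clf.predict_proba(X)[:, 1]
pred_strict = binarize(proba_pos.reshape(1, -1), threshold=0.8)[0].astype(int)
print(proba_pos.round(2))
print(pred_strict)    # only samples with P(y=1) > 0.8 are labelled positive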
def prob_maximum_low(self, x=None, y=None, show=True): x, y = self.proxy_xy(x, y) fiter = self.get_fiter() y_prob = MlFiterExcute.run_prob_cv_estimator(fiter, x, y, n_folds=10) l_pb = y_prob[y_prob < y_prob.mean()].mean() y_prob_l = binarize(y_prob.reshape(-1, 1), l_pb) if show: self.scores(y_prob_l, y) return l_pb
def binarize(df):
    """
    Binarize the data.

    :param df: input DataFrame
    :returns: the binarized data
    """
    if not isinstance(df, pd.DataFrame):
        raise Exception("df is not DataFrame!")
    return preprocessing.binarize(df)
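# Minimal usage sketch for the wrapper above: scikit-learn's binarize accepts a
# DataFrame but returns a plain ndarray of 0/1 values (default threshold is 0).
import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({"a": [-1.0, 0.0, 2.0], "b": [0.5, -0.5, 0.0]})
print(preprocessing.binarize(df))
# [[0. 1.]
#  [0. 0.]
#  [1. 0.]]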
def transform(self, X): #If the binarize option is set to true, we need now to recompute "f", our binarized word counter if(self.bina == True): f_hat = binarize(X, threshold = 0.0) else : f_hat = X f_tilde = f_hat.multiply(self.r) return f_tilde
def validation(data, px, y_px): global GAMMA # convert px and y_px for broadcasting px = np.array(px) y_px = np.array(y_px) # vanilla LogReg classifier LRM = LogisticRegression() skf = StratifiedKFold(y_px, n_folds = N_FOLDS, shuffle = True) print "Running",N_FOLDS,"Stratified Splits" probs = [] # Probabilities during validation preds = [] # Predictions made y_ref = [] # Labels as they were used in validation val_recs = [] # List of recordings as they were used in validation for train_idx, val_idx in skf: # Separate train and val sets using indexes X_train, y_train, X_val, y_val, val_px = leave_out_fold(data, px, train_idx, val_idx) # Train the LRM LRM.fit(X_train, y_train) # Save this LRM performance probs.extend(list(LRM.predict_proba(X_val)[:,1])) preds.extend(list(LRM.predict(X_val))) y_ref.extend(y_val) val_recs.extend(val_px) fpr, tpr, thresholds = roc_curve(y_ref, probs) """ Do ROC analysis and get optimal threshold for sens ~= spec """ i = np.arange(len(tpr)) roc = pd.DataFrame({'fpr' : pd.Series(fpr, index=i), 'tpr' : pd.Series(tpr, index = i), '1-fpr' : pd.Series(1-fpr, index = i), 'tf' : pd.Series(tpr - (1-fpr), index = i), 'thresholds' : pd.Series(thresholds, index = i) }) idx = (roc.tf).abs().argmin() thresh = roc.thresholds.iloc[idx] auc_acc = auc(fpr,tpr) # Perform classification with optimal threshold preds_opt = map(int, binarize(np.array(probs).reshape(1,-1), threshold=thresh)[0]) GAMMA = thresh ACC, SENS, SPEC = eval_model(preds_opt, y_ref) return [ACC,SENS,SPEC,auc_acc]
def test_model(LRM, test_data, TBI = 0, save = 0): global GAMMA """ Evaluate a trained Logistic Regression model Inputs: ======= LRM: Trained Logistic Regression Model test_data: Data to test the LRM on return: Flag - To return [spec, sens, acc] or just acc TBI: Flag - To compute results using TBI or not """ # Get the labels y = test_data.TBResult.values # Get the names of the recordings in the test set test_recs = test_data.StudyNum.values # Keep the feature data for training X = test_data.drop(["StudyNum","TBResult"], axis = 1) probs = LRM.predict_proba(X)[:,1] """ Calculate AUC acc using ROC analysis """ # Get FPR and TPR for the test set fpr, tpr, thresh = roc_curve(y,probs) # Calc AUC acc auc_acc = auc(fpr,tpr) pred = map(int,binarize(np.array(probs).reshape(1,-1),threshold = GAMMA)[0]) # Do the same thing but with pandas # i = np.arange(len(probs)) # temp_df = pd.DataFrame({'StudyNum': pd.Series(test_recs, index = i), # 'Probability': pd.Series(probs, index = i), # 'TBResult': pd.Series(y, index = i), # 'Pred': 0}) # # This makes all predictions = 1 where Prob >= Gamma # temp_df.ix[temp_df.Probability >= GAMMA,'Pred'] = 1 # pred = temp_df.Pred.values if TBI == 0: acc,sens,spec = eval_model(pred, y) return [acc, sens, spec, auc_acc] else: acc,sens,spec = eval_model(pred, y, probs = probs, test_recs = test_recs, TBI = 1, save = save) return [acc, sens, spec]
def load_data(self, features, X_threshold): """ Load data into c_data """ # Load data for each mask self.load_mask_data(features) filename = path.join(mkdtemp(), 'c_data.dat') self.c_data = np.memmap(filename, dtype='object', mode='w+', shape=(self.mask_num)) all_ids = self.dataset.image_table.ids # If a low thresh is set, then get ids for studies at that threshold if self.thresh_low is not None: ids_by_masks_low = [] from neurosynth.analysis.reduce import average_within_regions masks_by_studies_low = average_within_regions( self.dataset, self.mask_img, threshold=self.thresh_low) for mask in masks_by_studies_low: m_ids = np.array(all_ids)[np.where(mask == True)[0]] ids_by_masks_low.append(m_ids) # Set up data into c_data for num, on_ids in enumerate(self.ids_by_masks): # If a low threshold is set, then use that to filter "off_ids", otherwise use "on_ids" if self.thresh_low is not None: off_ids = list(set(all_ids) - set(ids_by_masks_low[num])) else: off_ids = list(set(all_ids) - set(on_ids)) on_data = self.data_by_masks[num].dropna() off_data = self.dataset.get_feature_data(ids=off_ids).dropna() y = np.array([0] * off_data.shape[0] + [1] * on_data.shape[0]) X = np.vstack((np.array(off_data), np.array(on_data))) from neurosynth.analysis.classify import regularize X = regularize(X, method='scale') if X_threshold is not None: X = binarize(X, X_threshold) self.c_data[num] = (X, y) if self.memsave: self.data_by_masks = [] self.ids_by_masks = [] self.comparisons = range(0, self.mask_num) self.comp_dims = (self.mask_num, )
def modelEval(name, model, X, y, binarize_threshold):
    X_train, X_test, y_train, y_test = train_test_split(X_kbest, y, test_size=0.2,
                                                        stratify=y, random_state=rs)
    meancvscore = cross_val_score(model, X, y, n_jobs=-1, verbose=1).mean()
    print('Model %s cross_val_score: %f' % (name, meancvscore))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_adj = binarize(model.predict_proba(X_test)[:, 1],
                          threshold=binarize_threshold, copy=False).transpose()
    print('Model %s classification metrics:' % name)
    doClassifMetrics(y_test, y_pred)
    print('Model %s using prediction threshold %f:' % (name, binarize_threshold))
    doClassifMetrics(y_test, y_pred_adj)
def test_preprocessing_assignment(self): iris = datasets.load_iris() df = pdml.ModelFrame(iris) original_columns = df.data.columns df['sepal length (cm)'] = df['sepal length (cm)'].preprocessing.binarize(threshold=6) self.assertTrue(isinstance(df, pdml.ModelFrame)) binarized = pp.binarize(np.atleast_2d(iris.data[:, 0]), threshold=6) expected = np.hstack([binarized.T, iris.data[:, 1:]]) self.assert_numpy_array_almost_equal(df.data.values, expected) self.assert_index_equal(df.data.columns, original_columns) # recreate data iris = datasets.load_iris() df = pdml.ModelFrame(iris) target_columns = ['sepal length (cm)', 'sepal width (cm)'] df[target_columns] = df[target_columns].preprocessing.binarize(threshold=6) self.assertTrue(isinstance(df, pdml.ModelFrame)) binarized = pp.binarize(iris.data[:, 0:2], threshold=6) expected = np.hstack([binarized, iris.data[:, 2:]]) self.assert_numpy_array_almost_equal(df.data.values, expected) self.assert_index_equal(df.data.columns, original_columns)
def getFeaturesUnigrams(sentence): def normalizeFeatures(values, mn, mx): return np.divide(np.subtract(values, mn), float(mx-mn)) featureDict = {} for i, word in enumerate(sentence.split()): try: representation = model[word] representation = binarize(representation) representation = normalizeFeatures(representation, 0, 1) for j, vectorEntry in enumerate(representation): featureDict[str(i*len(representation)+j)] = vectorEntry except KeyError: continue return featureDict
def load_data(self, features, X_threshold): """ Load data into c_data """ # Load data for each mask self.load_mask_data(features) # Set up pair-wise data self.comparisons = list( itertools.combinations(range(0, self.mask_num), 2)) filename = path.join(mkdtemp(), 'c_data.dat') self.c_data = np.memmap(filename, dtype='object', mode='w+', shape=(self.mask_num, self.mask_num)) # Filter data and arrange into c_data for pair in self.comparisons: x1 = self.data_by_masks[pair[0]] x2 = self.data_by_masks[pair[1]] reg1_ids = self.ids_by_masks[pair[0]] reg2_ids = self.ids_by_masks[pair[1]] if self.remove_overlap is True: reg1_set = list(set(reg1_ids) - set(reg2_ids)) reg2_set = list(set(reg2_ids) - set(reg1_ids)) x1 = np.array(x1)[np.where(np.in1d(reg1_ids, reg1_set))[0]] x2 = np.array(x2)[np.where(np.in1d(reg2_ids, reg2_set))[0]] reg1_ids = reg1_set reg2_ids = reg2_set y = np.array([0] * len(reg1_ids) + [1] * len(reg2_ids)) X = np.vstack((x1, x2)) if X_threshold is not None: X = binarize(X, X_threshold) from neurosynth.analysis.classify import regularize X = regularize(X, method='scale') self.c_data[pair] = (X, y) if self.memsave: self.data_by_masks = [] self.ids_by_masks = [] self.comp_dims = (self.mask_num, self.mask_num)
def get_score(X, y, clf, scoring = 'accuracy'): from sklearn.preprocessing import binarize prediction = binarize(clf.predict(X), 0.5) if scoring == 'accuracy': from sklearn.metrics import accuracy_score score = accuracy_score(y, prediction) elif scoring =='f1': from sklearn.metrics import f1_score score = f1_score(y, prediction) else: score = scoring(y, prediction) return prediction, score
def getFeaturesBigrams(sentence): def normalizeFeatures(values, mn, mx): return np.divide(np.subtract(values, mn), float(mx-mn)) featureDict = {} sentence = sentence.split() bigramSentence = [b for b in zip(sentence[:-1], sentence[1:])] for i, (w1, w2) in enumerate(bigramSentence): try: representation = model[w1 + '_' + w2] representation = binarize(representation) representation = normalizeFeatures(representation, 0, 1) for j, vectorEntry in enumerate(representation): featureDict[str(i*len(representation)+j)] = vectorEntry except KeyError: continue return featureDict
def read_train(train_file):
    lines = []
    y = []
    vectorizer = CountVectorizer(min_df=3)
    tf_idf = TfidfTransformer()
    for parts in utils.read_train(train_file):
        is_blocked = parts[8]
        desc = cleantext.clean(parts[4], False)
        lines.append(desc)
        y.append(int(is_blocked))
    # keep the fitted vectorizer and its output matrix under separate names
    X_counts = vectorizer.fit_transform(lines)
    X_nb = tf_idf.fit_transform(X_counts)
    X_log = binarize(X_counts)
    return X_nb, X_log, numpy.asarray(y)