Example #1
    def _get_node_distance_matrix(self, datapoint, som_array):
        """Get distance of datapoint and node using Euclidean distance.

        Parameters
        ----------
        datapoint : np.array, shape=(X.shape[1])
            Datapoint = one row of the dataset `X`
        som_array : np.array
            Weight vectors of the SOM,
            shape = (self.n_rows, self.n_columns, X.shape[1])

        Returns
        -------
        distmat : np.array of float
            Distance between datapoint and each SOM node

        """
        # algorithms on the full matrix
        if self.distance_metric == "euclidean":
            return np.linalg.norm(som_array - datapoint, axis=2)

        # node-by-node algorithms
        distmat = np.zeros((self.n_rows, self.n_columns))
        if self.distance_metric == "manhattan":
            for node in self.node_list_:
                distmat[node] = dist.cityblock(
                    som_array[node[0], node[1]], datapoint)

        elif self.distance_metric == "mahalanobis":
            for node in self.node_list_:
                som_node = som_array[node[0], node[1]]
                cov = np.cov(np.stack((datapoint, som_node), axis=0),
                             rowvar=False)
                cov_pinv = np.linalg.pinv(cov)   # pseudo-inverse
                distmat[node] = dist.mahalanobis(
                    datapoint, som_node, cov_pinv)

        elif self.distance_metric == "tanimoto":
            # Note that this is a binary distance measure.
            # Therefore, the vectors have to be converted.
            # Source: Melssen 2006, Supervised Kohonen networks for
            #         classification problems
            # VERY SLOW ALGORITHM!!!
            threshold = 0.5
            for node in self.node_list_:
                som_node = som_array[node[0], node[1]]
                distmat[node] = dist.rogerstanimoto(
                    binarize(datapoint.reshape(1, -1), threshold=threshold,
                             copy=True),
                    binarize(som_node.reshape(1, -1), threshold=threshold,
                             copy=True))

        elif self.distance_metric == "spectralangle":
            for node in self.node_list_:
                som_node = som_array[node[0], node[1]]
                distmat[node] = np.arccos(np.divide(
                    np.dot(som_node, datapoint),
                    np.multiply(np.linalg.norm(som_node),
                                np.linalg.norm(datapoint))))

        return distmat
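A minimal, self-contained sketch (not part of the original class) of what the vectorized Euclidean branch computes; the toy grid shape and random data are assumptions made only for illustration:

import numpy as np

som_array = np.random.rand(2, 3, 4)   # hypothetical 2x3 SOM grid of 4-dimensional weight vectors
datapoint = np.random.rand(4)         # one row of the dataset

# vectorized Euclidean distances, shape (2, 3): one distance per node
distmat = np.linalg.norm(som_array - datapoint, axis=2)

# the same result computed node by node, as the slower metric branches do
distmat_loop = np.zeros((2, 3))
for row in range(2):
    for col in range(3):
        distmat_loop[row, col] = np.linalg.norm(som_array[row, col] - datapoint)

assert np.allclose(distmat, distmat_loop)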
Example #2
 def predict(self, X):
     ''' Predict class labels. '''
     if self.mode == 'average':
         return binarize(self.predict_proba(X)[:,[1]], 0.5)
     else:
         res = binarize(X, 0.5)
         return np.apply_along_axis(lambda x: np.bincount(x.astype(int), self.weights).argmax(), axis=1, arr=res)
Example #3
def binarize_image(image, method='li', **kwargs):
    """Binarize image using one of the available methods: 'isodata',
    'li', 'otsu', 'sauvola', and 'boolean'. Defaults to 'li'.
    Extra keyword arguments are passed in as is to the corresponding
    scikit-image thresholding function. The 'boolean' method refers to simple
    thresholding from a grey-scale image. If a 'threshold' kwarg is not passed
    to the 'boolean' method, 'li' thresholding is performed.
    For reference
    Sezgin M. and Sankur B. (2004) "Survey over Image Thresholding Techniques
    and Quantitative Performance Evaluation" Journal of Electronic Imaging,
    13(1): 146-165 DOI:10.1117/1.1631315
    """
    if image.ndim != 2:
        # image is not gray-scale
        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    if np.unique(image).size == 2:
        # image is already binary
        return image
    boolean_threshold = kwargs.get('threshold', None)
    if method == 'boolean' and boolean_threshold:
        preprocessing.binarize(image, threshold=boolean_threshold, copy=False)
        return convert(image)
    if method not in ('sauvola', 'isodata', 'otsu', 'li'):
        method = 'li'
    thresh_func = getattr(filters.thresholding, "threshold_{}".format(method))
    threshold = thresh_func(image, **kwargs)
    # OpenCV can't write black-and-white images using boolean values; it needs
    # at least an 8-bit, single-channel image ranging from 0 (black) to 255 (white)
    return convert(image <= threshold)
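A rough standalone sketch of the thresholding path, assuming scikit-image is installed; the original `convert` helper is not reproduced here and presumably maps the boolean mask to a 0/255 uint8 image, as the comment above suggests:

import numpy as np
from skimage import filters

# hypothetical grey-scale image: a dark square on a bright background
image = np.full((64, 64), 200, dtype=np.uint8)
image[16:48, 16:48] = 30

thresh = filters.threshold_li(image)          # scalar threshold chosen by Li's method
binary = image <= thresh                      # boolean mask, True where the image is dark
binary_u8 = (binary * 255).astype(np.uint8)   # stand-in for convert()
print(thresh, binary_u8.min(), binary_u8.max())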
Example #4
def binarize_encode_target_columns(apache_df_list):
    """
    Function to binarize predictions and encode the actual 
    target columns for the apache prediction tables
    
    NOTE - all column names are assumed to be the same names under
    which they were queried from the database. The function will not
    work if you have renamed
    
    predictedicumortality, predictedhospitalmortality,
    actualicumortality, actualhospitalmortality.
    
    If you have, please rename them back or change
    them directly in this function.
    
    Parameters
    ------------
    apache_df_list: list of dataframe objects
                    The dataframes on which the operations
                    will be performed
                    
    Returns
    ------------
    None, directly makes changes to the dataframes listed in 
    apache_df_list. Four new columns will be added
    
    icu_death_prediction_label : class labels from the 
                                 predictedicumortality column
                                 
    hosp_death_predictions_label : class labels from the 
                                   predictedhospitalmortality column
                                   
    icu_deaths : class labels for the actualicumortality column
    
    hosp_deaths : class labels for the actualhospitalmortality column
    
    """
    # Grab the dataframes
    apache_df_list = apache_df_list

    # set the threshold
    threshold = 0.5

    # loop through the dataframes, binarize predictions, and encode labels for the established truth
    for df in apache_df_list:
        # binarize predictions
        icu_death_predictions = binarize(
            df['predictedicumortality'].values.reshape(-1, 1),
            threshold=threshold)
        hosp_death_predictions = binarize(
            df['predictedhospitalmortality'].values.reshape(-1, 1),
            threshold=threshold)
        df['icu_death_prediction_label'] = icu_death_predictions
        df['hosp_death_prediction_label'] = hosp_death_predictions

        # encode labels for the actual data
        df['icu_deaths'] = df['actualicumortality'].map(
            lambda status: 0 if status == 'ALIVE' else 1)
        df['hosp_deaths'] = df['actualhospitalmortality'].map(
            lambda status: 0 if status == 'ALIVE' else 1)
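A small sketch of the per-column step with a made-up dataframe (not the actual apache tables): reshape the probability column to 2-D, binarize at 0.5, and map the status strings to 0/1.

import pandas as pd
from sklearn.preprocessing import binarize

df = pd.DataFrame({
    "predictedicumortality": [0.12, 0.73, 0.55],
    "actualicumortality": ["ALIVE", "EXPIRED", "ALIVE"],
})

df["icu_death_prediction_label"] = binarize(
    df["predictedicumortality"].values.reshape(-1, 1), threshold=0.5).ravel()
df["icu_deaths"] = df["actualicumortality"].map(
    lambda status: 0 if status == "ALIVE" else 1)
print(df)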
Example #5
    def train(self, X_train, y_train, silent = False):
        '''train the model, X_train contains the tweet in each row'''
        if self.useTfIdf:
            self.vectorizer = TfidfVectorizer(ngram_range=(self.ngram_s, self.ngram_e), tokenizer=lambda x: x.split(), lowercase=False, preprocessor=lambda x: x)
        else:
            self.vectorizer = CountVectorizer(ngram_range=(self.ngram_s, self.ngram_e), tokenizer=lambda x: x.split(), lowercase=False, preprocessor=lambda x: x)

        if self.multinomial:
            self.model = MultinomialNaiveBayes()
        else:
            self.model = BernoulliNaiveBayes()

        self.vectorizer.fit(X_train.astype('str'))
        #assert len(self.vectorizer.stop_words_) == 0 #we don't want preprocess by scikit learn, we already performed it
        #print(self.vectorizer.get_feature_names())
        
        if not silent:
            print('vectorizer trained')
        
        X_train_bow = self.vectorizer.transform(X_train.astype('str'))
        if not self.multinomial:
            binarize(X_train_bow, copy=False)
        if not silent:
            print('train data vectorized')

        self.model.train(X_train_bow, y_train)
        if not silent:
            print('model trained')
Example #6
def getSrlRepresentation(cas,
                         intensity=False,
                         log=False,
                         bnrz=False,
                         representationSize=200):
    from sklearn.preprocessing import binarize
    model = models.Word2Vec.load('models/word2vec/srlModel')
    ret = [None] * len(cas.sentences)
    for i, sentence in enumerate(cas.srlSentences):
        numRows = sum([len(clause) for clause in sentence])
        altSentence = np.zeros((numRows, representationSize))
        currentRow = 0
        for clause in sentence:
            for j, (role, text) in enumerate(clause.iteritems()):
                word = str((role, text))
                try:
                    altWord = np.multiply(
                        np.add(np.divide(model[word], 2.0), 0.5),
                        255) if intensity else model[word]
                    altWord = np.multiply(
                        binarize(altWord, threshold=255.0 /
                                 2.0), 255) if bnrz and intensity else altWord
                    altWord = binarize(
                        altWord) if bnrz and not intensity else altWord
                    altSentence[currentRow, :] = altWord
                except:
                    altSentence[currentRow, :] = altSentence[
                        j - 1, :] if j != 0 else np.zeros(representationSize)
                currentRow += 1
        ret[i] = altSentence
    return ret
Example #7
def main():
    logging.info(u"Getting clusters data")
    uid_to_ug = get_ug_data(args.user_cluster)
    bid_to_bg, bg_iids = get_bg_data(args.booking_cluster)

    logging.info("Reading training data")
    training_df = pd.read_csv(args.training_csv)
    tr_m = get_matrix(training_df, uid_to_ug, bid_to_bg)
    logging.info(u"Training matrix: %s", get_sparse_matrix_info(tr_m))

    logging.info("Reading testing data")
    # we don't care about repetitive actions in the testing
    testing_df = pd.read_csv(args.testing_csv)[["code",
                                                "propcode"]].drop_duplicates()

    logging.info("Preparing similarity matrix")
    sim_m = get_similarity_matrix(tr_m)

    logging.info("Testing hit ratio at top-%s", args.top_k)
    recs_m = get_topk_recs(tr_m, sim_m, binarize(tr_m), args.top_k)
    logging.info(u"Hit ratio: %.3f",
                 hit_ratio(recs_m, testing_df, uid_to_ug, bg_iids))

    if args.top_k_iid_per_uid:
        recs_m = get_topk_recs(tr_m, sim_m, binarize(tr_m))
        store_data_for_eval(recs_m, testing_df, uid_to_ug, bg_iids)
Example #8
    def test_preprocessing_assignment(self):
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        original_columns = df.data.columns
        df['sepal length (cm)'] = df[
            'sepal length (cm)'].preprocessing.binarize(threshold=6)
        self.assertIsInstance(df, pdml.ModelFrame)
        binarized = pp.binarize(np.atleast_2d(iris.data[:, 0]), threshold=6)
        expected = np.hstack([binarized.T, iris.data[:, 1:]])
        self.assert_numpy_array_almost_equal(df.data.values, expected)
        tm.assert_index_equal(df.data.columns, original_columns)

        # recreate data
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        target_columns = ['sepal length (cm)', 'sepal width (cm)']
        df[target_columns] = df[target_columns].preprocessing.binarize(
            threshold=6)
        self.assertIsInstance(df, pdml.ModelFrame)
        binarized = pp.binarize(iris.data[:, 0:2], threshold=6)
        expected = np.hstack([binarized, iris.data[:, 2:]])
        self.assert_numpy_array_almost_equal(df.data.values, expected)
        tm.assert_index_equal(df.data.columns, original_columns)
Example #9
def superimpose_two_masks(mask_fn1, mask_fn2):
    img_in = cv2.imread(mask_fn1, cv2.IMREAD_GRAYSCALE)
    img_in = binarize(img_in, threshold=50, copy=True)
    img_side = cv2.imread(mask_fn2, cv2.IMREAD_GRAYSCALE)
    img_side = binarize(img_side, threshold=50, copy=True)
    composite = cv2.bitwise_or(img_in,img_side)
    return composite
Example #10
def binarize(pred, threshold=0.5):
    # Batch_wise
    if pred.ndim == 3:
        return np.array(
            [pre.binarize(sub, threshold=threshold) for sub in pred])
    else:
        return pre.binarize(pred, threshold=threshold)
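For reference, a quick demo of the scikit-learn semantics this wrapper relies on: preprocessing.binarize maps values strictly greater than the threshold to 1 and everything else to 0, and works on 2-D input, which is why the wrapper loops over the leading axis for 3-D batches.

import numpy as np
from sklearn import preprocessing as pre

pred = np.array([[0.2, 0.5, 0.8],
                 [0.49, 0.51, 1.0]])
print(pre.binarize(pred, threshold=0.5))
# [[0. 0. 1.]
#  [0. 1. 1.]]  (0.5 itself is not strictly greater than the threshold)

batch = np.stack([pred, 1 - pred])                                    # shape (2, 2, 3)
batch_bin = np.array([pre.binarize(sub, threshold=0.5) for sub in batch])
print(batch_bin.shape)                                                # (2, 2, 3)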
Example #11
def bns(X, y):
    """
    Implements the bi-normal separation scoring.
    """

    # binarization: from counts to presence/absence
    binarize(X, threshold=0.0, copy=False)

    # one column per class
    Y = LabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1: # binary problem case
        Y = np.append(1-Y, Y, axis=1)

    pos = np.sum(Y, axis=0)
    neg = Y.shape[0] - pos

    tp = safe_sparse_dot(X.T, Y)
    fp = np.sum(tp, axis=1).reshape(-1, 1) - tp

    tpr = bounded(tp/pos.astype(float))
    fpr = bounded(fp/neg.astype(float))

    bns = np.abs(_z_score(tpr) - _z_score(fpr))

    return bns[:,1], None
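A self-contained sketch of the bi-normal separation idea, assuming _z_score is the standard-normal inverse CDF and bounded clips the rates away from 0 and 1 (neither helper is shown above); scipy's norm.ppf stands in for both here:

import numpy as np
from scipy.stats import norm

def bns_score(tp, fp, pos, neg, eps=0.0005):
    """Bi-normal separation for one feature: |F^-1(tpr) - F^-1(fpr)|."""
    tpr = np.clip(tp / float(pos), eps, 1 - eps)   # clip so ppf stays finite
    fpr = np.clip(fp / float(neg), eps, 1 - eps)
    return abs(norm.ppf(tpr) - norm.ppf(fpr))

# toy counts: the feature occurs in 40 of 50 positive and 5 of 100 negative documents
print(bns_score(tp=40, fp=5, pos=50, neg=100))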
Example #12
    def jaccard_sim(self):
        '''given a sparse matrix, calculate jaccard sim

        ** ref : http://na-o-ys.github.io/others/2015-11-07-sparse-vector-similarities.html
        '''
        if self.kind == 'ubcf':
            # make sure the sparse matrix is binarized and cast to int16
            mat = binarize(self.inter).astype('int16')
        elif self.kind == 'ibcf':
            # make sure the sparse matrix is binarized and cast to int16
            mat = binarize(self.inter.T).astype('int16')

        rows_sum = mat.getnnz(axis=1).astype('int16')  #
        ab = mat.dot(mat.T).astype('float16')  # mat x t(mat)
        # for rows
        aa = np.repeat(rows_sum, ab.getnnz(axis=1))
        # for columns
        bb = rows_sum[ab.indices]

        similarities = ab.tocoo(copy=True)
        similarities.data /= (aa + bb - ab.data)
        del aa, bb, ab  # large memory cost
        similarities = similarities.astype('float32')
        # similarities.setdiag(0) ##
        similarities = similarities.tocsr()
        similarities.eliminate_zeros()
        sparsity = float(similarities.nnz / mat.shape[0]**2) * 100
        print(
            'similarity (jaccard) matrix built ({}), \nsparsity of similarity: {:.2f} %'
            .format(self.kind, sparsity))
        self.sim = similarities
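The vectorized formula above is just intersection over union on binarized rows; a tiny check with a made-up interaction matrix (outside the original class, dense for readability):

import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import binarize

inter = csr_matrix(np.array([[3, 0, 1, 0],
                             [1, 1, 0, 0],
                             [0, 2, 1, 1]]))
mat = binarize(inter).astype('int16')

rows_sum = mat.getnnz(axis=1)          # set size per row
ab = mat.dot(mat.T).toarray()          # intersection size for every row pair
union = rows_sum[:, None] + rows_sum[None, :] - ab
jaccard = ab / union.astype(float)

# row 0 = {0, 2}, row 1 = {0, 1}: intersection 1, union 3 -> 1/3
print(jaccard[0, 1])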
Example #13
    def test_binarize(self):
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        result = df.preprocessing.binarize()
        expected = pp.binarize(iris.data)

        self.assertTrue(isinstance(result, pdml.ModelFrame))
        self.assert_numpy_array_almost_equal(result.data.values, expected)
        self.assert_index_equal(result.columns, df.data.columns)

        result = df.preprocessing.binarize(threshold=5)
        expected = pp.binarize(iris.data, threshold=5)

        self.assertTrue(isinstance(result, pdml.ModelFrame))
        self.assert_numpy_array_almost_equal(result.data.values, expected)
        self.assert_index_equal(result.columns, df.data.columns)

        s = df['sepal length (cm)']
        self.assertTrue(isinstance(s, pdml.ModelSeries))
        result = s.preprocessing.binarize()
        expected = pp.binarize(iris.data[:, 0])[0]

        self.assertTrue(isinstance(result, pdml.ModelSeries))
        self.assert_numpy_array_almost_equal(result.values, expected)
        self.assertEqual(result.name, 'sepal length (cm)')

        result = s.preprocessing.binarize(threshold=6)
        expected = pp.binarize(iris.data[:, 0], threshold=6)[0]

        self.assertTrue(isinstance(result, pdml.ModelSeries))
        self.assert_numpy_array_almost_equal(result.values, expected)
        self.assertEqual(result.name, 'sepal length (cm)')
Example #14
def ig(X, y):
    """
    This method calculates the information gain for two random variables I(X, Y).
    """

    # binarization: from counts to presence/absence
    binarize(X, threshold=0.0, copy=False)

    # one column per class
    Y = LabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1: # binary problem case
        Y = np.append(1-Y, Y, axis=1)

    Y_prob = (np.sum(Y, axis=0, dtype=np.float64) / len(Y)).reshape(-1, 1)

    # calculate the class entropy H(Y)
    class_entropy = _entropy(Y_prob)

    X_y_count = safe_sparse_dot(Y.T, X)
    # TODO XXX FIXME check whether this probability is computed correctly
    X_y_prob = \
        X_y_count / np.sum(X_y_count, axis=0, dtype=np.float64)

    # calculate the conditional entropy of the class given the feature H(y|f_i)
    cond_entropy = _entropy(X_y_prob) # TODO XXX FIXME check whether the conditional entropy is computed correctly
    print "class:", class_entropy
    print "cond_entropy:", cond_entropy

    infogain = class_entropy - cond_entropy

    return infogain, None
Example #15
def greedy_cailp(positive_coverage, negative_coverage, k=20):
    num_features = positive_coverage.shape[1]
    num_positive_tweets = positive_coverage.shape[0]
    num_negative_tweets = negative_coverage.shape[0]
    positive_bin = binarize(positive_coverage)
    negative_bin = binarize(negative_coverage)
    positive_lil = positive_bin.tolil()
    negative_lil = negative_bin.tolil()
    selected_features = []
    for i in range(k):
        print(i)
        scores = (positive_lil.sum(axis=0) / float(num_positive_tweets)) \
                - (negative_lil.sum(axis=0) / float(num_negative_tweets))
        selected_feature = scores.argmax()
        if selected_feature not in selected_features:
            covered_pos_tweets = list(
                positive_lil[:, selected_feature].nonzero()[0])
            utils.delete_row_lil(positive_lil, covered_pos_tweets)
            covered_neg_tweets = list(
                negative_lil[:, selected_feature].nonzero()[0])
            utils.delete_row_lil(negative_lil, covered_neg_tweets)
            selected_features.append(selected_feature)
        else:
            break
    return selected_features
Example #16
def main():
    logging.info("Reading training data")
    training_df = pd.read_csv(args.training_csv)
    tr_m, uid_to_row, iid_to_col = get_training_matrix_and_indices(training_df)
    logging.info("Training matrix: %s", get_sparse_matrix_info(tr_m))

    logging.info("Reading testing data")
    testing_df = pd.read_csv(args.testing_csv)[["code", "propcode"]].drop_duplicates()

    logging.info("Preparing similarity matrix")
    sim_m = get_similarity_matrix(tr_m)

    logging.info("Testing hit ratio at top-%s", args.top_k)
    recs_m = get_topk_recs(
        normalize(tr_m),
        sim_m,
        binarize(tr_m),
        args.top_k,
    )
    logging.info("Hit ratio: %.3f", hit_ratio(recs_m, testing_df, uid_to_row, iid_to_col))

    if args.top_k_iid_per_uid:
        recs_m = get_topk_recs(
            tr_m,
            sim_m,
            binarize(tr_m)
        )
        store_data_for_eval(recs_m, testing_df, uid_to_row, iid_to_col)
Example #17
def meanThreshold(adata,
                  groupby,
                  threshold,
                  return_df=False,
                  layer=None,
                  use_raw=False,
                  transformation="log1p"):
    """Binarize gene expression for groups aggregated by mean.

    Returns: adata object with updated uns.gene_call
    """
    from sklearn.preprocessing import binarize
    import pandas as pd
    df = get_adata_df(adata,
                      layer=layer,
                      use_raw=use_raw,
                      transformation=transformation)
    result = df.groupby(by=adata.obs[groupby], axis=1).mean()
    binarize(result, threshold=threshold, copy=False)

    if return_df is True:
        return result
    else:
        adata.uns.update({"gene_call": result})
        return adata
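A rough standalone sketch of the aggregation step with toy data (plain pandas rather than AnnData): average expression per group, then binarize the group means at the chosen threshold.

import numpy as np
import pandas as pd
from sklearn.preprocessing import binarize

# hypothetical genes x cells expression frame plus one group label per cell
df = pd.DataFrame(np.array([[0.0, 2.0, 4.0, 0.0],
                            [1.0, 1.0, 0.0, 0.0]]),
                  index=["geneA", "geneB"], columns=["c1", "c2", "c3", "c4"])
groups = pd.Series(["g1", "g1", "g2", "g2"], index=df.columns)

result = df.T.groupby(groups).mean().T            # genes x groups mean expression
gene_call = binarize(result.values, threshold=0.5)
print(pd.DataFrame(gene_call, index=result.index, columns=result.columns))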
Example #18
    def get_prediction_metrics(self):
        print("Getting prediction metrics")
        df = self.get_predictions_as_df(self.predictions)
        metrics = {}
        prediction_metrics = {}
        annotation_metrics = {}

        prediction_metrics["event_count"] = len(df["start"])
        prediction_metrics["mean_duration"] = df["duration"].mean() if len(
            df["start"]) > 0 else 0
        # Hour * hz
        prediction_metrics[
            "recording_length_minutes"] = self.last_predicted_index / (60 * 10)
        if prediction_metrics["recording_length_minutes"] > 0:
            prediction_metrics["calculated_ahi"] = (
                prediction_metrics["event_count"] /
                prediction_metrics["recording_length_minutes"]) * 60

        metrics["prediction"] = prediction_metrics

        if self.ground_truth is not None:
            df = self.get_predictions_as_df(self.ground_truth)

            annotation_metrics["event_count"] = len(df["start"])
            annotation_metrics["mean_duration"] = df["duration"].mean() if len(
                df["start"]) > 0 else 0

            annotation_metrics[
                "annotation_length_minutes"] = self.ground_truth_length / (60 *
                                                                           10)
            metric_end = int(
                float(max(self.ground_truth_length,
                          self.last_predicted_index)))

            if annotation_metrics["annotation_length_minutes"] > 0:
                annotation_metrics["calculated_ahi"] = (
                    annotation_metrics["event_count"] /
                    annotation_metrics["annotation_length_minutes"]) * 60

            predictions = self.predictions[:metric_end]
            ground_truth = self.ground_truth[:metric_end]
            ground_truth_binary = np.ravel(
                binarize(ground_truth.reshape(1, -1), 0))
            predictions_binary = np.ravel(
                binarize(predictions.reshape(1, -1), 0))

            annotation_metrics["accuracy_score"] = accuracy_score(
                ground_truth_binary, predictions_binary)
            annotation_metrics["f1_score"] = f1_score(ground_truth_binary,
                                                      predictions_binary)
            annotation_metrics["precision_score"] = precision_score(
                ground_truth_binary, predictions_binary)
            annotation_metrics["recall_score"] = recall_score(
                ground_truth_binary, predictions_binary)

            metrics["annotation"] = annotation_metrics

        return metrics
Example #19
def partition(features, a, probs, w):
    ig = 0
    ap = {}

    # multiply a by row and re-sparsify
    x = features.index(w)
    ap['yes'] = a.multiply(binarize(a[x])).tocsr()

    # a_no is whatever's left of 'a' after removing a_yes
    ap['no'] = a - ap['yes']

    # sum a's columns and binarize
    qk = binarize(a.sum(axis=0))[0]

    pk = {}
    pk['no'] = binarize(ap['no'].sum(axis=0))[0]

    # pk['yes'] is whatever's left of qk after removing pk['no']
    pk['yes'] = qk - pk['no']

    ap['yes'] = ap['yes'].multiply(pk['yes'].reshape(-1, 1).T).tocsr()

    # for qk and both pk's, multiply by static probs vector, then normalize
    qk = qk * probs

    ig_c = {}
    ig_uc = {}
    ig_c['yes'] = 0.0
    ig_uc['yes'] = 0.0
    ig_c['no'] = 0.0
    ig_uc['no'] = 0.0

    if np.sum(qk) > 0:
        qk_num = len(np.where(qk != 0)[0])
        qk = normalize(qk)

        if VERBOSE:
            print(CRED + 'qk ' + str(qk_num) + CEND, '\n', a.A, '\n', qk)

        for d in ['yes', 'no']:
            pk[d] = pk[d] * probs
            if np.sum(pk[d]) > 0:
                pkd_nz = np.where(pk[d] != 0)[0]
                pk_num = len(pkd_nz)
                pk[d] = normalize(pk[d])
                qk_nz = np.where(qk != 0)[0]
                ig_uc[d] = entropy(pk=pk[d][qk_nz], qk=qk[qk_nz], base=2)
                ig_c[d] = (pk_num / qk_num) * ig_uc[d]
                ig += ig_c[d]

                if VERBOSE:
                    print(CRED + 'pk[' + d + '] ' + str(pk_num) + CEND, '\n',
                          ap[d].A, '\n', pk[d])

    if VERBOSE:
        print(ig)

    return ig, ap['yes'], ig_uc, ig_c
Example #20
def binarizer():
    a = [[-1, 3, -2], [5, -7, -4]]

    b = preprocessing.binarize(a)
    print(b)
    print(preprocessing.binarize(a, threshold=-2))

    bin = preprocessing.Binarizer()
    print(bin.transform(a))
Example #21
 def perform_test(self, X_test, silent=False):
     X_test_bow = self.vectorizer.transform(X_test.astype('str'))
     if not self.multinomial:
         binarize(X_test_bow, copy=False)
     if not silent:
         print('test data vectorized')
     
     y_score = self.model.multi_prediction_score(X_test_bow)
     y_pred = self.model.multi_predict_class_from_score(y_score, threshold=self.threshold)
     return y_score, y_pred
Example #22
 def predict(self, X):
     ''' Predict class labels. '''
     if self.mode == 'average':
         return binarize(self.predict_proba(X)[:, [1]], 0.5)
     else:
         res = binarize(X, 0.5)
         return np.apply_along_axis(
             lambda x: np.bincount(x.astype(int), self.weights).argmax(),
             axis=1,
             arr=res)
Example #23
 def Binarize(self, column=None):
     """ Feature Binarization, thresholding numerical features
         to get boolean values """
     try:
         if column is None:
             self._dataset = preprocessing.binarize(self._dataset)
         else:
             self._dataset[column] = preprocessing.binarize(self._dataset[column])
     except Exception as n:
         print("Binarize failed!")
         print(n)
Example #24
def multibinarize(x, thresholds):
    if hasattr(x, "fillna"):
        x = x.fillna(0).values.reshape(-1, 1)
    else:
        x = x.reshape(-1, 1)
    res = None
    for threshold in thresholds:
        if res is None:
            res = binarize(x, threshold)
        else:
            res += binarize(x, threshold)
    return res[:, 0]
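Usage sketch (restated with the threshold passed by keyword, as recent scikit-learn versions expect): with several thresholds, each value is mapped to the number of thresholds it exceeds, giving a small ordinal encoding.

import numpy as np
from sklearn.preprocessing import binarize

def multibinarize_kw(x, thresholds):
    x = x.reshape(-1, 1)
    res = None
    for threshold in thresholds:
        b = binarize(x, threshold=threshold)
        res = b if res is None else res + b
    return res[:, 0]

x = np.array([0.1, 0.4, 0.7, 1.5])
print(multibinarize_kw(x, thresholds=[0.25, 0.5, 1.0]))   # [0. 1. 2. 3.]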
Example #25
def process_nps(nps):
    print("processing meanings ...")
    probs = []
    pairs = []
    features = []

    chunks = []
    vectors = []
    chunk_size = 1000

    # add adjective features
    for alist in nps.adjs.unique():
        for w in alist.split(','):
            if w not in features:
                features.append(w)

    # add noun features
    for w in nps.noun.unique():
        if w not in features:
            features.append(w)

    # create vector for each NP
    total = len(nps)

    for i, row in nps.iterrows():
        print_progress(i + 1, total)
        vector = [0] * len(features)
        vector[features.index(row['noun'])] = 1
        for adj in row['adjs'].split(','):
            vector[features.index(adj)] = 1
        if WEIGHTED_PROBS:
            probs.append(np.clip(row['count'], 0, 100))
        else:
            probs.append(1)
        vectors.append(vector)

        if len(vectors) > chunk_size:
            chunks.append(csr_matrix(binarize(np.array(vectors).T)).tocsr())
            vectors = []
    chunks.append(csr_matrix(binarize(np.array(vectors).T)).tocsr())

    print("")

    print("combining vectors...")
    a_orig = hstack(chunks).tocsr()

    print("normalizing probabilities ...")
    probs = normalize(np.array(probs))

    print('total feature vectors:', len(probs))

    return features, probs, a_orig
Example #26
def get_score(X, y, clf, scoring = 'accuracy'):
    from sklearn.preprocessing import binarize

    if scoring == 'accuracy':
        from sklearn.metrics import accuracy_score
        score = accuracy_score(y, binarize(clf.predict(X), 0.5))
    elif scoring =='f1':
        from sklearn.metrics import f1_score
        score = f1_score(y, binarize(clf.predict(X), 0.5))
    else:
        score = clf.score(X, y)

    return score
Example #27
    def binary_bow(self, n=None):
        data_test = self.data_test
        data_train = self.data_train

        if (n):
            X_te = binarize(np.array(data_test[0][0:n].todense()))
            X_tr = binarize(np.array(data_train[0][0:n].todense()))

            small_test = X_te, data_test[1][0:n]
            small_train = X_tr, data_train[1][0:n]

            return small_train, small_test

        return data_train, data_test
Example #28
    def evaluate(self, pred_all, test, method='precision'):
        """
        params
        ======
        
        pred_all:(ndarray) 
            predicted/recommended result for each user 
            
        test:(csr_matrix)
            testing sets(test.shape[0] should be same as pred_all.shape[0])
            
        method: (str) precision(default), recall
            evaluate method 
            
        attribute
        =========
        precision
        recall

        """
        assert type(test) == sp.csr_matrix
        assert test.shape[0] == pred_all.shape[0]

        if method == 'precision':
            test_lil = binarize(test).tolil()  # binarize and transform to lil
            prec_array = np.zeros(pred_all.shape[0])
            num_of_test_data = 0
            for user, items in enumerate(test_lil.rows):
                prec_array[user] = len(np.intersect1d(
                    items, pred_all[user, ])) / len(pred_all[user, ])
                if items != []:
                    num_of_test_data += 1
#            return np.sum(prec_array)/num_of_test_data
            self.precision = np.sum(prec_array) / num_of_test_data
            print("\n-------------")
            print("model: {},\ntopN: {}".format(self.kind, self.topN))
            print("precision: {:.2f} %".format(self.precision * 100))

        if method == 'recall':
            test_coo = binarize(test).tocoo()  # binarize and transform to coo
            score = 0
            nonzero_rowsets = set(test_coo.row)
            for row, col, v in zip(test_coo.row, test_coo.col, test_coo.data):
                if col in pred_all[row, ]:
                    score += 1
            self.recall = score / len(nonzero_rowsets)
            print("\n-------------")
            print("model: {},\ntopN: {}".format(self.kind, self.topN))
            print("recall:{:.2f} %".format(score / len(test_coo.data) * 100))
Example #29
    def get_recs(self, ug_id, iid_recs, top_clusters=None, min_iid_per_bg=None):
        bg_recs_row = self.ug_bg_recs_m[ug_id]
        bg_mask = binarize(
            self.item_dp.get_iid_per_bg_row(binarize(iid_recs), min_iid_per_bg)
        )
        bg_recs_row = bg_recs_row.multiply(bg_mask)

        if top_clusters is not None:
            arg_ids = np.argsort(bg_recs_row.data)[-top_clusters:]
            rows, cols = bg_recs_row.nonzero()
            bg_recs_row = csr_matrix(
                (bg_recs_row.data[arg_ids], (rows[arg_ids], cols[arg_ids])),
                shape=bg_recs_row.shape
            )
        return bg_recs_row
Example #30
def getRepresentation(cas, intensity=False, log=False, bnrz=False, representationSize=200):
    from sklearn.preprocessing import binarize
    ret = [None]*len(cas.tokens)
    for i, sentence in enumerate(cas.tokens):
        altSentence = np.zeros((len(sentence.split()), representationSize))
        for j, word in enumerate(sentence.split()):
            try:
                altWord = np.multiply(np.add(np.divide(model[word], 2.0), 0.5), 255) if intensity else model[word]
                altWord = np.multiply(binarize(altWord, threshold=255.0/2.0), 255) if bnrz and intensity else altWord
                altWord = binarize(altWord) if bnrz and not intensity else altWord
                altSentence[j,:] = altWord
            except:
                altSentence[j,:] = altSentence[j-1,:] if j != 0 else np.zeros(representationSize)
        ret[i] = altSentence
    return ret
Example #31
    def PreprocessingData(processType):

        if processType=="Normalization":
            AlgorithmOperation.train_X = preprocessing.normalize(AlgorithmOperation.train_X, norm='l2')
            AlgorithmOperation.test_X = preprocessing.normalize(AlgorithmOperation.test_X, norm='l2')
        elif processType=="Scale":
            AlgorithmOperation.train_X =preprocessing.scale(AlgorithmOperation.train_X)
            AlgorithmOperation.test_X =preprocessing.scale(AlgorithmOperation.test_X)
        elif processType=="Binarization":
            AlgorithmOperation.train_X =preprocessing.binarize(AlgorithmOperation.train_X)
            AlgorithmOperation.test_X =preprocessing.binarize(AlgorithmOperation.test_X)
        elif processType=="Polynomial Feature":
            poly=preprocessing.PolynomialFeatures(2)
            AlgorithmOperation.train_X = poly.fit_transform(AlgorithmOperation.train_X)
            AlgorithmOperation.test_X = poly.fit_transform(AlgorithmOperation.test_X)
Example #32
def example2():
    """方法2[推荐]
    """
    X = np.array([[1, -1,  2], ## "f"非常重要,为了标准化,矩阵元素必须是浮点类型
                  [2,  0,  0],
                  [0,  1, -1]], dtype = "f")
    print("binarized X = \n%s\n" % preprocessing.binarize(X, threshold=1.1))
Example #33
    def train_step(self, x, y):
        # perform following steps:
        # -reset the gradients
        # -propagate through the network
        # -calculate the loss
        # -compute gradient by backward propagation
        # -update weights
        # -return the loss
        # TODO

        if self._cuda:
            x = x.clone().detach().cuda()
            y = y.clone().detach().cuda().squeeze()

            #x = t.tensor(x, dtype=t.float).cuda()
            #y = t.tensor(y, dtype=t.float).cuda().squeeze()

        self._optim.zero_grad()
        y_pred = self._model(x)

        y_predTmp = y_pred.clone()
        y_predTmp = binarize(y_predTmp.cpu().detach().numpy(),
                             threshold=0.5)  # numpy array w/o grad
        y_pred.data = t.tensor(y_predTmp, dtype=t.float).cuda()

        loss = self._crit(y_pred, y.float())
        loss.backward()
        self._optim.step()
        return loss
Example #34
def test_model(LRM, data, TBI=False):

	global GAMMA

	y = list(data.TBResult.values)

	test_recs = list(data.StudyNum.values)

	X = data.drop(['StudyNum','TBResult'],axis=1)

	probs = LRM.predict_proba(X)[:,1]

	"""
	Calculate AUC acc using ROC analysis
	"""
	# Get FPR and TPR for the test set
	fpr, tpr, thresh = roc_curve(y,probs)
	# Calc AUC acc
	auc_acc = auc(fpr,tpr)

	pred = map(int,binarize(np.array(probs).reshape(1,-1),threshold = GAMMA)[0])


	if not TBI:
		ACC,SENS,SPEC = eval_model(pred, y)
		return [ACC,SENS,SPEC,auc_acc]

	else:
		ACC,SENS,SPEC = eval_model(pred, y, probs=probs, test_px=test_recs, TBI=True)
		return [ACC,SENS,SPEC]
Example #35
def run_test(seqs, label_seqs, sess, preds_T, input_PHs, label_PHs, mask_PHs,
             seq_length_PH, loss_T, options):
    all_losses = []
    all_preds = []
    all_labels = []
    batch_size = options['batch_size']
    for idx in xrange(len(label_seqs) / batch_size):
        batch_x = seqs[idx * batch_size:(idx + 1) * batch_size]
        batch_y = label_seqs[idx * batch_size:(idx + 1) * batch_size]
        inputs, _, masks, seq_length = mime_util.st_preprocess_hf_aux(
            batch_x, options)
        preds, loss = sess.run(
            [preds_T, loss_T],
            feed_dict={
                input_PHs[0]: inputs[0],
                input_PHs[1]: inputs[1],
                input_PHs[2]: inputs[2],
                mask_PHs[0]: masks[0],
                mask_PHs[1]: masks[1],
                mask_PHs[2]: masks[2],
                label_PHs[-1]: batch_y,
                seq_length_PH: seq_length,
            })
        all_losses.append(loss)
        all_preds.extend(list(preds))
        all_labels.extend(batch_y)
    auc = roc_auc_score(all_labels, all_preds)
    aucpr = average_precision_score(all_labels, all_preds)
    accuracy = (np.array(all_labels) == np.squeeze(
        binarize(np.array(all_preds).reshape(-1, 1), threshold=.5))).mean()
    return np.mean(all_losses), auc, aucpr
Example #36
def resc(patch):
    """
    :param patch:  [image,mask]
    :return: random rescaling of the pair [image,mask]

    --- Rescaling reinforces axons size diversity ---
    """


    # pick one rescaling factor; wrap it in a list so the loop below iterates once
    s = [random.choice([0.5, 0.75, 1.0, 1.5, 2.0])]
    data_rescale = []
    for scale in s:

        image_rescale = rescale(patch[0], scale)
        mask_rescale = rescale(patch[1], scale)
        s_r = mask_rescale.shape[0]
        q_h, r_h = divmod(256-s_r,2)

        if q_h > 0 :
            image_rescale = np.pad(image_rescale,(q_h, q_h+r_h), mode = "reflect")
            mask_rescale = np.pad(mask_rescale,(q_h, q_h+r_h), mode = "reflect")
        else :
            patches = extract_patch(image_rescale,mask_rescale, 256)
            i = np.random.randint(len(patches), size=1)
            image_rescale,mask_rescale = patches[i]

        mask_rescale = preprocessing.binarize(np.array(mask_rescale), threshold=0.001)
        data_rescale = [image_rescale, mask_rescale]

    return data_rescale
Example #37
    def do_transformations(self):
        # binarize counts
        if self.transform == 'binarize':
            print "Binarizing"
            self.feature_counts = binarize(self.feature_counts, copy=False)
            #self.feature_counts = sparse.csr_matrix(self.feature_counts > 0, dtype=int)

        elif self.transform == 'tfidf':
            print "Doing tf-idf transform"
            #doc_sums = self.feature_counts.sum(axis=1)
            #if np.min(doc_sums) == 0:
            #    doc_sums[doc_sums == 0] = 1.0
            #tf = sparse.csr_matrix(self.feature_counts.multiply(1.0/doc_sums))

            n_items, n_features = self.feature_counts.shape
            tf = normalize(self.feature_counts, norm='l1', axis=1, copy=False)
            doc_counts = self.vocab.get_all_doc_counts()
            n_docs = doc_counts.max()
            # add one to avoid zeros which might screw up the matrix size
            idf = sparse.csr_matrix(np.log(float(n_docs+1) / doc_counts), dtype=float)
            print tf.shape, idf.shape
            self.feature_counts = tf.multiply(idf)
            assert self.feature_counts.shape == (n_items, n_features)

        elif self.transform == 'normalizel1' or self.transform == 'normalize':
            print "Normalizing rows"
            self.feature_counts = normalize(self.feature_counts, norm='l1', axis=1, copy=False)

        elif self.transform == 'normalizel2':
            print "Normalizing rows"
            self.feature_counts = normalize(self.feature_counts, norm='l2', axis=1, copy=False)

        if self.scale_factor is not None:
            self.feature_counts = self.feature_counts * self.scale_factor
Example #38
def elastic_transform(image, gt, alpha, sigma, random_state=None):
    """
    :param image: image
    :param gt: ground truth
    :param alpha: deformation coefficient (high alpha -> strong deformation)
    :param sigma: std of the gaussian filter. (high sigma -> smooth deformation)
    :param random_state:
    :return: deformation of the pair [image,mask]
    """

    if random_state is None:
        random_state = np.random.RandomState(None)

    shape = image.shape

    d = 4
    sub_shape = (shape[0]/d, shape[0]/d)

    deformations_x = random_state.rand(*sub_shape) * 2 - 1
    deformations_y = random_state.rand(*sub_shape) * 2 - 1

    deformations_x = np.repeat(np.repeat(deformations_x, d, axis=1), d, axis = 0)
    deformations_y = np.repeat(np.repeat(deformations_y, d, axis=1), d, axis = 0)

    dx = gaussian_filter(deformations_x, sigma, mode="constant", cval=0) * alpha
    dy = gaussian_filter(deformations_y, sigma, mode="constant", cval=0) * alpha

    x, y = np.meshgrid(np.arange(shape[0]), np.arange(shape[1]))
    indices = np.reshape(y+dy, (-1, 1)), np.reshape(x+dx, (-1, 1))

    elastic_image = map_coordinates(image, indices, order=1).reshape(shape)
    elastic_gt = map_coordinates(gt, indices, order=1).reshape(shape)
    elastic_gt = preprocessing.binarize(np.array(elastic_gt), threshold=0.5)

    return [elastic_image, elastic_gt]
Example #39
def getPredictions(image_data, threshold, allShipping=False):
    """
    This function returns np arrays of true labels, predicted labels, and predicted probabilities.
    image_data: generated from Keras image generator, in batch format
    threshold: the probability at which a classification should be considered shipping (1)
    allShipping: whether all image_data has a true shipping classification (eg. for the PHMSA data that is assumed to all have shipping activity)
    """
    all_true = np.zeros(0)
    all_pred = np.zeros(0)
    pred_prob = np.zeros(0)

    for i in range(len(image_data)):
        image_batch, label_batch = image_data[i]

        if (not allShipping):
            all_true = np.append(all_true, get_true_labels(label_batch))

        y_pred_prob = model.predict_proba(image_batch)[:, 1]
        y_pred_class = binarize([y_pred_prob], threshold)[0]

        all_pred = np.append(all_pred, y_pred_class)
        pred_prob = np.append(pred_prob, y_pred_prob)

    if (allShipping):
        all_true = np.repeat(1, len(all_pred))

    return all_true, all_pred, pred_prob
Example #40
def roc_auc(y_true, y_pred, jump=0.01):
    '''
    Area under ROC (Receiver Operating Characteristics)  curve

    Parameters
    ----------
    y_true: numpy.ndarray
        Targets
    y_pred: numpy.ndarray
        Class probability

    References
    ----------
    .. [1] https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc

    Returns
    -------
    roc_auc_score: float
        ROC AUC score
    '''
    y_true, y_pred = y_true.reshape(-1, 1), y_pred.reshape(-1, 1)
    x = []
    y = []
    for thr in np.arange(0.01, 1 + jump, jump):
        y_pred_bin = binarize(y_pred, thr)
        tn, fp, fn, tp = confusion_binary(y_true, y_pred_bin)
        tpr = tp / (tp + fn)
        fpr = fp / (tn + fp)
        y.append(tpr)
        x.append(fpr)
    x = np.array(x)
    y = np.array(y)
    return np.abs(np.trapz(y, x)) # Why trapz gives negative value?
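On the trailing question: the thresholds sweep upward, so fpr (the x values) comes out in descending order, and np.trapz over a descending x yields a negative area; the abs restores the usual orientation. A quick cross-check against scikit-learn with toy labels (an assumption, not data from the original):

import numpy as np
from sklearn.metrics import roc_auc_score

y_true = np.array([0, 0, 1, 1, 0, 1])
y_pred = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.9])

xs, ys = [], []
for thr in np.arange(0.01, 1.01, 0.01):
    y_bin = (y_pred >= thr).astype(int)
    tp = np.sum((y_true == 1) & (y_bin == 1))
    fn = np.sum((y_true == 1) & (y_bin == 0))
    fp = np.sum((y_true == 0) & (y_bin == 1))
    tn = np.sum((y_true == 0) & (y_bin == 0))
    ys.append(tp / (tp + fn))
    xs.append(fp / (fp + tn))

print(np.abs(np.trapz(ys, xs)))       # close to ...
print(roc_auc_score(y_true, y_pred))  # ... the reference value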
Example #41
def op_vs_ip(subid, image_types, imagepaths, op_direc, overlays):
	
	
	img_data_group=[]
	img_shape_group=[]
	ol_data_group=[]
	ol_shape_group=[]
	for i, path in enumerate(imagepaths):	

		axial_slice, cor_slice, sag_slice, img_aspect_axial, img_aspect_cor, img_aspect_sag = pull_midslices(path)
		if os.path.isfile(overlays[i]):
			axial_slice_ol, cor_slice_ol, sag_slice_ol, img_aspect_axial_ol, img_aspect_cor_ol, img_aspect_sag_ol = pull_midslices(overlays[i])
			ol_data_group.append([axial_slice_ol, cor_slice_ol, sag_slice_ol])
			ol_shape_group.append([img_aspect_axial_ol, img_aspect_cor_ol, img_aspect_sag_ol])
		else:
			ol_data_group.append(['null','null','null'])
			ol_shape_group.append(['null','null','null'])
		## Append to Matrices
		img_data_group.append([axial_slice, cor_slice, sag_slice])
		img_shape_group.append([img_aspect_axial,img_aspect_cor,img_aspect_sag])
		


	my_cmap=plt.cm.gray


	fig, axarr = plt.subplots(ncols=np.shape(img_shape_group)[1], nrows=np.shape(img_shape_group)[0], figsize=(np.shape(img_shape_group)[0]*5,np.shape(img_shape_group)[1]*5))
	plt.suptitle(subid+' File Comparison', fontsize=20)	
	
	titlearray=['Axial', 'Coronal', 'Sagittal']
	
	for x in range(0,np.shape(img_shape_group)[0]):
		for y in range(0,np.shape(img_shape_group)[1]):
			im = axarr[x, y].imshow(img_data_group[x][y], cmap=my_cmap, aspect=img_shape_group[x][y])
			axarr[x, y].set_xlabel('(Right) Radiological Convention (Left)', fontsize=10)
			axarr[x, y].set_title(image_types[x]+' '+titlearray[y])
			#divider = make_axes_locatable(axarr[x, y])
			#cax_ = divider.append_axes("right", size="5%", pad=0.05)
			#cbar = plt.colorbar(im, cax=cax_, ticks=MultipleLocator(round(np.max(img_data_group[x][y])/5, 1)))
			axarr[x, y].xaxis.set_visible(False)
			axarr[x, y].yaxis.set_visible(False)




			if os.path.isfile(overlays[x]):
				if x == 1:
					thresh=0.25
				if x == 2:
					thresh=0.4
				sl=np.array(ol_data_group[x][y]).astype(np.float64)
				sl=filters.sobel(sl)
				sl=preprocessing.binarize(sl, np.max(sl)*thresh)
				sl[sl < 1] = 'Nan'
				axarr[x, y].imshow(sl, cmap='autumn', aspect=ol_shape_group[x][y])

	#plt.show()
	plt.tight_layout()
	plt.autoscale()
	plt.savefig(op_direc)
Example #42
    def load_data(self, features, X_threshold):
        """ Load data into c_data """
        from neurosynth.analysis.reduce import average_within_regions

        # Load Masks by studies matrix

        # ADD FEATURE TO FILTER BY FEATURES
        masks_by_studies = average_within_regions(self.dataset, self.mask_img, threshold = self.thresh)

        study_ids = self.dataset.feature_table.data.index

        print "Loading data from neurosynth..."

        pb = tools.ProgressBar(len(list(masks_by_studies)), start=True)

        self.ids_by_masks = []
        self.data_by_masks = []
        for mask in masks_by_studies:

            m_ids = study_ids[np.where(mask == True)[0]]
            self.ids_by_masks.append(m_ids)
            self.data_by_masks.append(self.dataset.get_feature_data(ids=m_ids))
            pb.next()

        self.mask_num = masks_by_studies.shape[0]    
        self.mask_pairs = list(itertools.permutations(range(0, self.mask_num), 2))

        filename = path.join(mkdtemp(), 'c_data.dat')
        self.c_data = np.memmap(filename, dtype='object',
                                mode='w+', shape=(self.mask_num, self.mask_num))
        # Load data
        for pair in self.mask_pairs:
            reg1_ids = self.ids_by_masks[pair[0]]
            reg2_ids = self.ids_by_masks[pair[1]]

            reg1_set = list(set(reg1_ids) - set(reg2_ids))
            reg2_set = list(set(reg2_ids) - set(reg1_ids))

            x1 = self.data_by_masks[pair[0]]
            x1 = np.array(x1)[np.where(np.in1d(reg1_ids, reg1_set))[0]]

            x2 = self.data_by_masks[pair[1]]
            x2 = np.array(x2)[np.where(np.in1d(reg2_ids, reg2_set))[0]] 

            y = np.array([0]*len(reg1_set) + [1]*len(reg2_set))

            X = np.vstack((x1, x2))

            if X_threshold is not None:
                X = binarize(X, X_threshold)

            from neurosynth.analysis.classify import regularize
            X = regularize(X, method='scale')

            self.c_data[pair] = (X, y)

        if self.memsave:
            self.data_by_masks = []
            self.ids_by_masks = []
Example #43
    def transform(self, X):
        """Compute the Jaccard similarity for all pairs of elements in ``X``.

        Rows i in ``X`` are assumed to represent pairs, where
        ``X[i, :n_features]`` and ``X[i, n_features:]`` correspond to their two
        individual elements, each representing a set. Calling ``transform``
        computes the Jaccard similarity between these sets, i.e. such that
        ``Xt[i]`` is the Jaccard similarity of ``X[i, :n_features]`` and
        ``X[i, n_features:]``.

        Parameters
        ----------
        :param X: array-like, shape (n_samples, n_features)
            Input data.

        Returns
        -------
        :returns: Xt array-like, shape (n_samples, 1)
            The transformed data.
        """
        n_samples, n_features_all = X.shape
        n_features = n_features_all // 2

        X = binarize(X)
        X1 = X[:, :n_features]
        X2 = X[:, n_features:]

        sparse = sp.issparse(X)

        if sparse and not sp.isspmatrix_csr(X):
            X = X.tocsr()

        if sparse:
            if X.data.sum() == 0:
                return np.zeros((n_samples, 1))

            numerator = np.asarray(X1.multiply(X2).sum(axis=1)).ravel()

            X_sum = X1 + X2
            X_sum.data[X_sum.data != 0.] = 1
            M = X_sum.sum(axis=1)
            A = M.getA()
            denominator = A.reshape(-1,)

        else:
            if len(X[X.nonzero()]) == 0.:
                return np.zeros((n_samples, 1))

            numerator = (X1 * X2).sum(axis=1)

            X_sum = X1 + X2
            X_sum[X_sum.nonzero()] = 1
            denominator = X_sum.sum(axis=1)

        with np.errstate(divide="ignore", invalid="ignore"):
            Xt = numerator / denominator
            Xt[np.where(denominator == 0)[0]] = 0.

        return np.array(Xt).reshape(-1, 1)
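A minimal usage sketch with made-up pairs: each row concatenates the indicator vectors of two sets, and the transform returns their Jaccard similarity per row (dense path shown here).

import numpy as np

# two pairs over a universe of 3 items; each row is [set1 indicators | set2 indicators]
X = np.array([[1, 1, 0, 1, 0, 0],   # {0, 1} vs {0}  -> 1/2
              [1, 0, 1, 0, 1, 0]])  # {0, 2} vs {1}  -> 0/3
n_features = X.shape[1] // 2
X1, X2 = X[:, :n_features], X[:, n_features:]

numerator = (X1 * X2).sum(axis=1)
union = ((X1 + X2) > 0).sum(axis=1)
print(numerator / union)             # [0.5 0. ]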
Example #44
def eval_model(preds, y_ref, probs = [], test_px = [], TBI = False):

	global GAMMA

	if len(preds) != len(y_ref):
		print "Predicted labels and test labels dont have the same dimensions!"
		print "Predicted: ", n_pred, "; Tests: ", n_test
		exit()


	if not TBI:
		CM = confusion_matrix(y_ref, preds)

		TP = CM[1,1]
		TN = CM[0,0]
		FP = CM[0,1]
		FN = CM[1,0]

		ACC = (TP+TN)/float(TP+TN+FP+FN)
		SENS = TP/float(TP+FN)
		SPEC = TN/float(TN+FP)

		return ACC,SENS,SPEC

	else:

		i = np.arange(len(test_px))

		df = pd.DataFrame({"Recording": pd.Series(test_px,index = i),
                            "Prediction": pd.Series(preds,index = i),
                            "Reference": pd.Series(y_ref,index = i),
                            "Probabilities": pd.Series(probs,index = i)
                            }).sort_values(by="Recording")

		y_test_rec 	= []
		TBI_list	= []

		for name, group in df.groupby("Recording"):
			l = group.Reference.iloc[0]
			y_test_rec.append(l)

			TB_prob = sum(group.Probabilities.values) / float(len(group.Probabilities))
			TBI_list.append(TB_prob)


		diagnosis_list = map(int,binarize(np.array(TBI_list).reshape(1,-1),threshold = GAMMA)[0])

		CM = confusion_matrix(y_test_rec, diagnosis_list)

		TP = CM[1,1]
		TN = CM[0,0]
		FP = CM[0,1]
		FN = CM[1,0]

		ACC = (TP+TN)/float(TP+TN+FP+FN)
		SENS = TP/float(TP+FN)
		SPEC = TN/float(TN+FP)

		return ACC, SENS, SPEC
Example #45
 def predictClass(self, threshold=0.5):
     # prediction
     # self.pred_y = self.model.predict(self.test_set_X)
     if self.is_keras:
         self.pred_y_prob = self.model.predict_proba(self.test_set_X)[:, 0]
     else:
         self.pred_y_prob = self.model.predict_proba(self.test_set_X)[:, 1]
     self.pred_y = binarize(self.pred_y_prob.reshape(1, -1), threshold)[0].astype(int)
Example #46
 def prob_maximum_low(self, x=None, y=None, show=True):
     x, y = self.proxy_xy(x, y)
     fiter = self.get_fiter()
     y_prob = MlFiterExcute.run_prob_cv_estimator(fiter, x, y, n_folds=10)
     l_pb = y_prob[y_prob < y_prob.mean()].mean()
     y_prob_l = binarize(y_prob.reshape(-1, 1), l_pb)
     if show:
         self.scores(y_prob_l, y)
     return l_pb
Example #47
 def binarize(df):
     """
     Binarize the data
         :param df: the input DataFrame
         :returns: the binarized data
     """   
     if not isinstance(df, pd.DataFrame):
         raise Exception("df is not DataFrame!")
     return preprocessing.binarize(df)
Example #48
 def transform(self, X):
     # If the binarize option is set to true, we now need to recompute "f", our binarized word counts
     if(self.bina == True):
         f_hat = binarize(X, threshold = 0.0)
     else :
         f_hat = X
     
     f_tilde = f_hat.multiply(self.r)
     return f_tilde
Example #49
def validation(data, px, y_px):
	global GAMMA

	# convert px and y_px for broadcasting
	px = np.array(px)
	y_px = np.array(y_px)

	# vanilla LogReg classifier
	LRM = LogisticRegression()

	skf = StratifiedKFold(y_px, n_folds = N_FOLDS, shuffle = True)

	print "Running",N_FOLDS,"Stratified Splits"
	probs 		= []		# Probabilities during validation
	preds 		= []		# Predictions made
	y_ref 		= []		# Labels as they were used in validation
	val_recs 	= []		# List of recordings as they were used in validation
	for train_idx, val_idx in skf:

		# Separate train and val sets using indexes
		X_train, y_train, X_val, y_val, val_px = leave_out_fold(data, px, train_idx, val_idx)

		# Train the LRM
		LRM.fit(X_train, y_train)

		# Save this LRM performance
		probs.extend(list(LRM.predict_proba(X_val)[:,1]))
		preds.extend(list(LRM.predict(X_val)))
		y_ref.extend(y_val)
		val_recs.extend(val_px)

	fpr, tpr, thresholds = roc_curve(y_ref, probs)

	"""
	Do ROC analysis and get optimal threshold
	for sens ~= spec
	"""
	i = np.arange(len(tpr))
	roc = pd.DataFrame({'fpr' : pd.Series(fpr, index=i),
	                    'tpr' : pd.Series(tpr, index = i),
	                    '1-fpr' : pd.Series(1-fpr, index = i),
	                    'tf' : pd.Series(tpr - (1-fpr), index = i),
	                    'thresholds' : pd.Series(thresholds, index = i)
	                    })
	idx = (roc.tf).abs().argmin()
	thresh = roc.thresholds.iloc[idx]
	auc_acc = auc(fpr,tpr)

	# Perform classification with optimal threshold
	preds_opt = map(int, binarize(np.array(probs).reshape(1,-1), threshold=thresh)[0])

	GAMMA = thresh

	ACC, SENS, SPEC = eval_model(preds_opt, y_ref)

	return [ACC,SENS,SPEC,auc_acc]
Example #50
def test_model(LRM, test_data, TBI = 0, save = 0):

    global GAMMA  

    """
    Evaluate a trained Logistic Regression model

    Inputs:
    =======
    LRM:        Trained Logistic Regression Model
    test_data:  Data to test the LRM on
    return:     Flag - To return [spec, sens, acc] or just acc
    TBI:        Flag - To compute results using TBI or not

    """  

    # Get the labels
    y = test_data.TBResult.values

    # Get the names of the recordings in the test set
    test_recs = test_data.StudyNum.values

    # Keep the feature data for training
    X = test_data.drop(["StudyNum","TBResult"], axis = 1)

    probs = LRM.predict_proba(X)[:,1]

    """
    Calculate AUC acc using ROC analysis
    """
    # Get FPR and TPR for the test set
    fpr, tpr, thresh = roc_curve(y,probs)
    # Calc AUC acc
    auc_acc = auc(fpr,tpr)

    pred = map(int,binarize(np.array(probs).reshape(1,-1),threshold = GAMMA)[0])
   
    # Do the same thing but with pandas
    # i = np.arange(len(probs))
    # temp_df = pd.DataFrame({'StudyNum': pd.Series(test_recs, index = i),
    #                         'Probability': pd.Series(probs, index = i),
    #                         'TBResult': pd.Series(y, index = i),
    #                         'Pred': 0})

    # # This makes all predictions = 1 where Prob >= Gamma
    # temp_df.ix[temp_df.Probability >= GAMMA,'Pred'] = 1
    # pred = temp_df.Pred.values

    if TBI == 0:
        acc,sens,spec = eval_model(pred, y)
        return [acc, sens, spec, auc_acc]

    else:
        acc,sens,spec = eval_model(pred, y, probs = probs, test_recs = test_recs, TBI = 1, save = save)
        return [acc, sens, spec]
Example #51
def getSrlRepresentation(cas, intensity=False, log=False, bnrz=False, representationSize=200):
    from sklearn.preprocessing import binarize
    model = models.Word2Vec.load('models/word2vec/srlModel')
    ret = [None]*len(cas.sentences)
    for i, sentence in enumerate(cas.srlSentences):
        numRows = sum([len(clause) for clause in sentence])
        altSentence = np.zeros((numRows, representationSize))
        currentRow = 0
        for clause in sentence:
            for j, (role, text) in enumerate(clause.iteritems()):
                word = str((role, text))
                try:
                    altWord = np.multiply(np.add(np.divide(model[word], 2.0), 0.5), 255) if intensity else model[word]
                    altWord = np.multiply(binarize(altWord, threshold=255.0/2.0), 255) if bnrz and intensity else altWord
                    altWord = binarize(altWord) if bnrz and not intensity else altWord
                    altSentence[currentRow,:] = altWord
                except:
                    altSentence[currentRow,:] = altSentence[j-1,:] if j != 0 else np.zeros(representationSize)
                currentRow += 1
        ret[i] = altSentence
    return ret
Example #52
    def load_data(self, features, X_threshold):
        """ Load data into c_data """
        # Load data for each mask
        self.load_mask_data(features)

        filename = path.join(mkdtemp(), 'c_data.dat')
        self.c_data = np.memmap(filename, dtype='object',
                                mode='w+', shape=(self.mask_num))

        all_ids = self.dataset.image_table.ids

        # If a low thresh is set, then get ids for studies at that threshold
        if self.thresh_low is not None:
            ids_by_masks_low = []
            from neurosynth.analysis.reduce import average_within_regions
            masks_by_studies_low = average_within_regions(
                self.dataset, self.mask_img, threshold=self.thresh_low)
            for mask in masks_by_studies_low:
                m_ids = np.array(all_ids)[np.where(mask == True)[0]]
                ids_by_masks_low.append(m_ids)       

        # Set up data into c_data
        for num, on_ids in enumerate(self.ids_by_masks):

            # If a low threshold is set, then use that to filter "off_ids", otherwise use "on_ids"
            if self.thresh_low is not None:
                off_ids = list(set(all_ids) - set(ids_by_masks_low[num]))
            else:
                off_ids = list(set(all_ids) - set(on_ids))

            on_data = self.data_by_masks[num].dropna()

            off_data = self.dataset.get_feature_data(ids=off_ids).dropna()

            y = np.array([0] * off_data.shape[0] + [1] * on_data.shape[0])

            X = np.vstack((np.array(off_data), np.array(on_data)))

            from neurosynth.analysis.classify import regularize
            X = regularize(X, method='scale')

            if X_threshold is not None:
                X = binarize(X, X_threshold)

            self.c_data[num] = (X, y)

        if self.memsave:
            self.data_by_masks = []
            self.ids_by_masks = []

        self.comparisons = range(0, self.mask_num)

        self.comp_dims = (self.mask_num, )
Example #53
def modelEval(name, model, X, y, binarize_threshold):
    X_train, X_test, y_train, y_test = train_test_split(X_kbest, y, test_size=0.2,
        stratify = y, random_state = rs)
    meancvscore = cross_val_score(model, X, y, n_jobs=-1, verbose=1).mean()
    print 'Model %s cross_val_score: %f' % (name, meancvscore)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_adj = binarize(model.predict_proba(X_test)[:,1],
        threshold = binarize_threshold, copy=False).transpose()
    print 'Model %s classification metrics:' % name
    doClassifMetrics(y_test, y_pred)
    print 'Model %s using prediction threshold %f:' % (name, binarize_threshold)
    doClassifMetrics(y_test, y_pred_adj)
Example #54
    def test_preprocessing_assignment(self):
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        original_columns = df.data.columns
        df['sepal length (cm)'] = df['sepal length (cm)'].preprocessing.binarize(threshold=6)
        self.assertTrue(isinstance(df, pdml.ModelFrame))
        binarized = pp.binarize(np.atleast_2d(iris.data[:, 0]), threshold=6)
        expected = np.hstack([binarized.T, iris.data[:, 1:]])
        self.assert_numpy_array_almost_equal(df.data.values, expected)
        self.assert_index_equal(df.data.columns, original_columns)

        # recreate data
        iris = datasets.load_iris()
        df = pdml.ModelFrame(iris)

        target_columns = ['sepal length (cm)', 'sepal width (cm)']
        df[target_columns] = df[target_columns].preprocessing.binarize(threshold=6)
        self.assertTrue(isinstance(df, pdml.ModelFrame))
        binarized = pp.binarize(iris.data[:, 0:2], threshold=6)
        expected = np.hstack([binarized, iris.data[:, 2:]])
        self.assert_numpy_array_almost_equal(df.data.values, expected)
        self.assert_index_equal(df.data.columns, original_columns)
Example #55
def getFeaturesUnigrams(sentence):
    def normalizeFeatures(values, mn, mx):
        return np.divide(np.subtract(values, mn), float(mx-mn))
    featureDict = {}
    for i, word in enumerate(sentence.split()):
        try:
            representation = model[word]
            representation = binarize(representation)
            representation = normalizeFeatures(representation, 0, 1)
            for j, vectorEntry in enumerate(representation):
                featureDict[str(i*len(representation)+j)] = vectorEntry
        except KeyError:
            continue
    return featureDict
Example #56
    def load_data(self, features, X_threshold):
        """ Load data into c_data """
        # Load data for each mask
        self.load_mask_data(features)

        # Set up pair-wise data
        self.comparisons = list(
            itertools.combinations(range(0, self.mask_num), 2))

        filename = path.join(mkdtemp(), 'c_data.dat')
        self.c_data = np.memmap(filename, dtype='object',
                                mode='w+', shape=(self.mask_num, self.mask_num))

        # Filter data and arrange into c_data
        for pair in self.comparisons:

            x1 = self.data_by_masks[pair[0]]
            x2 = self.data_by_masks[pair[1]]

            reg1_ids = self.ids_by_masks[pair[0]]
            reg2_ids = self.ids_by_masks[pair[1]]

            if self.remove_overlap is True:
                reg1_set = list(set(reg1_ids) - set(reg2_ids))
                reg2_set = list(set(reg2_ids) - set(reg1_ids))

                x1 = np.array(x1)[np.where(np.in1d(reg1_ids, reg1_set))[0]]
                x2 = np.array(x2)[np.where(np.in1d(reg2_ids, reg2_set))[0]]

                reg1_ids = reg1_set
                reg2_ids = reg2_set
                
            y = np.array([0] * len(reg1_ids) + [1] * len(reg2_ids))

            X = np.vstack((x1, x2))

            if X_threshold is not None:
                X = binarize(X, X_threshold)

            from neurosynth.analysis.classify import regularize
            X = regularize(X, method='scale')

            self.c_data[pair] = (X, y)

        if self.memsave:
            self.data_by_masks = []
            self.ids_by_masks = []

        self.comp_dims = (self.mask_num, self.mask_num)
Example #57
def get_score(X, y, clf, scoring = 'accuracy'):
    from sklearn.preprocessing import binarize

    prediction = binarize(clf.predict(X), 0.5)

    if scoring == 'accuracy':
        from sklearn.metrics import accuracy_score
        score = accuracy_score(y, prediction)
    elif scoring =='f1':
        from sklearn.metrics import f1_score
        score = f1_score(y, prediction)
    else:
        score = scoring(y, prediction)

    return prediction, score
Example #58
def getFeaturesBigrams(sentence):
    def normalizeFeatures(values, mn, mx):
        return np.divide(np.subtract(values, mn), float(mx-mn))
    featureDict = {}
    sentence = sentence.split()
    bigramSentence = [b for b in zip(sentence[:-1], sentence[1:])]
    for i, (w1, w2) in enumerate(bigramSentence):
        try:
            representation = model[w1 + '_' + w2]
            representation = binarize(representation)
            representation = normalizeFeatures(representation, 0, 1)
            for j, vectorEntry in enumerate(representation):
                featureDict[str(i*len(representation)+j)] = vectorEntry
        except KeyError:
            continue
    return featureDict
Example #59
def read_train(train_file):
    lines = []
    y = []
    vectorizer = CountVectorizer(min_df=3)
    tf_idf = TfidfTransformer()

    for parts in utils.read_train(train_file):
        is_blocked = parts[8]
        desc = cleantext.clean(parts[4], False)
        lines.append(desc)
        y.append(int(is_blocked))

    vectorizer = vectorizer.fit_transform(lines)
    X_nb = tf_idf.fit_transform(vectorizer)
    X_log = binarize(vectorizer)

    return X_nb, X_log, numpy.asarray(y)