def summary(n_samples,
            min_n_clusters,
            max_n_clusters,
            n_features,
            random_state=0,
            n_loops=10):
    gm_01_scores = []
    gm_02_scores = []
    xm_scores = []

    for n_cluster in range(min_n_clusters, max_n_clusters + 1):
        total_gm_01_score = 0
        total_gm_02_score = 0
        total_xm_score = 0

        for i in range(n_loops):
            X, y = datasets.make_blobs(n_samples=n_samples,
                                       n_features=n_features,
                                       centers=n_cluster,
                                       random_state=random_state)
            X = StandardScaler().fit_transform(X)

            X_1 = X.copy()
            X_2 = X.copy()
            X_3 = X.copy()

            gm_01 = GMeans_01().fit(X_1)
            gm_02 = GMeans_02().fit(X_2)
            xm = XMeans().fit(X_3)

            gm_01_score = silhouette_score(X_1,
                                           gm_01.labels,
                                           metric='euclidean')
            gm_02_score = silhouette_score(X_2,
                                           gm_02.labels,
                                           metric='euclidean')
            xm_score = silhouette_score(X_3, xm.labels_, metric='euclidean')

            # print "xm = {}".format(xm_score)

            total_gm_01_score += gm_01_score
            total_gm_02_score += gm_02_score
            total_xm_score += xm_score

        total_gm_01_score /= n_loops
        total_gm_02_score /= n_loops
        total_xm_score /= n_loops

        print "n_samples = {}, n_features = {}, n_cluster = {}, gm_01_score = {}, gm_02_score = {}, xm_score = {}" \
            .format(n_samples, n_features, n_cluster, total_gm_01_score, total_gm_02_score, total_xm_score)

        gm_01_scores.append(total_gm_01_score)
        gm_02_scores.append(total_gm_02_score)
        xm_scores.append(total_xm_score)

    return gm_01_scores, gm_02_scores, xm_scores
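
# Usage sketch (an assumption, not part of the original snippet): GMeans_01,
# GMeans_02 and XMeans are project-local estimators that must be importable,
# along with numpy, sklearn.datasets, StandardScaler and silhouette_score.
# gm1_s, gm2_s, xm_s = summary(n_samples=500, min_n_clusters=2,
#                              max_n_clusters=6, n_features=2, n_loops=5)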
Example #2
    def predict(self, X=None):
        """Calculate connectivity-based outlier factor for each sample in A

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            New data to predict.

        Returns
        -------
        probs : array, shape (n_samples,)
            Outlier probabilities determined by stochastic outlier selection.
        """

        if X is None:
            if self.X_ is None:
                raise Exception("No data")
            X = self.X_

        log_format = '%(asctime)-15s  [%(levelname)s] - %(name)s: %(message)s'
        logging.basicConfig(format=log_format, level=logging.INFO)
        logger = logging.getLogger('SOS')

        if self.verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.ERROR)

        if self.standard_scale:
            X = StandardScaler().fit_transform(X.copy())
        return sos(X, self.metric, self.perplexity, logger=logger)
Example #3
def test_transformers_data_not_an_array():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non negative data, for things
    # like NMF
    X -= X.min() - .1

    for name, Transformer in transformers:
        # XXX: some transformers are transforming the input
        # data. This is a bug that we'll fix later. Right now we copy
        # the data each time
        this_X = NotAnArray(X.copy())
        this_y = NotAnArray(np.asarray(y))
        if name in dont_test:
            continue
        # these don't actually fit the data:
        if name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
            continue
        # And these want multivariate output
        if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'):
            continue
        yield check_transformer, name, Transformer, this_X, this_y
Example #4
def get_feature_patches(FV, patch_size, patch_shift, input_shape):
    FV = StandardScaler(copy=False).fit_transform(FV)
    # FV should be of the shape (nFeatures, nFrames)
    if any(np.array([9, 10, 21, 22, 39]) == np.shape(FV)[1]):
        FV = FV.T
    patches = np.empty([])

    if np.shape(FV)[1] < patch_size:
        FV1 = FV.copy()
        while np.shape(FV)[1] <= patch_size:
            FV = np.append(FV, FV1, axis=1)

    numPatches = int(np.ceil(np.shape(FV)[1] / patch_shift))
    patches = PatchExtractor(patch_size=(np.shape(FV)[0], patch_size),
                             max_patches=numPatches).transform(
                                 np.expand_dims(FV, axis=0))
    # print('sklearn splitting: ', time.clock()-startTime, np.shape(patches))

    # print('Patches: ', np.shape(patches))
    if (np.shape(patches)[1] == 9) or (np.shape(patches)[1] == 10):
        diff_dim = input_shape[0] - np.shape(patches)[1]
        zero_padding = np.zeros(
            (np.shape(patches)[0], diff_dim, np.shape(patches)[2]))
        patches = np.append(patches, zero_padding, axis=1)
    elif np.shape(patches)[1] == 22:
        patches = patches[:, :21, :]
    elif np.shape(patches)[1] == 39:
        first_7_cep_dim = np.array(
            list(range(0, 7)) + list(range(13, 20)) + list(range(26, 33)))
        patches = patches[:, first_7_cep_dim, :]
    # print('Patches: ', np.shape(patches))

    return patches
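
# A small sanity check for the function above; a sketch assuming a
# 21-dimensional feature stream, so none of the padding/truncation branches
# fire. The demo values (500 frames, patch_size=68, patch_shift=34) are
# illustrative only.
if __name__ == '__main__':
    import numpy as np
    from sklearn.preprocessing import StandardScaler
    from sklearn.feature_extraction.image import PatchExtractor

    demo_FV = np.random.rand(500, 21)  # (nFrames, nFeatures) on input
    demo_patches = get_feature_patches(demo_FV, patch_size=68,
                                       patch_shift=34, input_shape=(21, 68))
    print(np.shape(demo_patches))  # expected: (numPatches, 21, 68)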
Example #5
def get_feature_patches(FV, patch_size, patch_shift, input_shape):
    FV = StandardScaler(copy=False).fit_transform(FV)
    # FV should be of the shape (nFeatures, nFrames)
    if any(np.array([9, 10, 21, 22, 39]) == np.shape(FV)[1]):
        FV = FV.T
    patches = np.empty([])

    if np.shape(FV)[1] < patch_size:
        # print('Size append: ', np.shape(FV), patch_size)
        FV1 = FV.copy()
        while np.shape(FV)[1] <= patch_size:
            FV = np.append(FV, FV1, axis=1)

    numPatches = int(np.ceil(np.shape(FV)[1] / patch_shift))
    patches = PatchExtractor(patch_size=(np.shape(FV)[0], patch_size),
                             max_patches=numPatches).transform(
                                 np.expand_dims(FV, axis=0))

    patches_mean = np.mean(patches, axis=2)
    patches_var = np.var(patches, axis=2)
    patches_mean_var = np.append(patches_mean, patches_var, axis=1)
    # print('sklearn splitting: ', time.clock()-startTime, np.shape(patches))

    # print('Patches: ', np.shape(patches))
    if np.shape(patches_mean_var)[1] != 2 * input_shape[0]:  # This condition checks for 39 CC
        if np.shape(patches_mean_var)[1] == 44:
            patches_mean_var = patches_mean_var[:, list(range(0, 21)) +
                                                list(range(22, 43))]
        elif np.shape(patches_mean_var)[1] == 78:
            first_7_cep_dim = np.array(
                list(range(0, 7)) + list(range(13, 20)) +
                list(range(26, 33)) + list(range(39, 46)) +
                list(range(52, 59)) + list(range(65, 72)))
            patches_mean_var = patches_mean_var[:, first_7_cep_dim]
    # print('patches_mean_var: ', np.shape(patches_mean_var))

    return patches_mean_var
Example #6
    coef = np.zeros(n_features)
    coef[:n_relevant_features] = coef_min + rng.rand(n_relevant_features)

    # the correlation of our design: variables correlated by blocks of 3
    corr = np.zeros((n_features, n_features))
    for i in range(0, n_features, block_size):
        corr[i:i + block_size, i:i + block_size] = 1 - conditioning
    corr.flat[::n_features + 1] = 1
    corr = linalg.cholesky(corr)

    # our design
    x = rng.normal(size=(n_samples, n_features))
    x = np.dot(x, corr)
    x[:n_relevant_features] /= np.abs(linalg.svdvals(
        x[:n_relevant_features])).max()
    x = StandardScaler().fit_transform(x.copy())

    # the output variable
    y = np.dot(x, coef)
    y /= np.std(y)
    y += noise_level * rng.normal(size=n_samples)
    mi = mutual_incoherence(x[:, :n_relevant_features],
                            x[:, n_relevant_features:])

    # Plot stability selection path, using a high eps for early stopping
    # of the path, to save computing time
    alpha_grid, scores_path = lasso_stability_path(x,
                                                   y,
                                                   random_state=42,
                                                   eps=0.05)

    plt.figure()
Example #7
def dj_rec(track_id):
    '''Get similar songs based on the audio features of the given track.'''
    neighbors = 4
    max_distance = 5.0
    '''
    [:-10] restricts the pool to the 10 closest related artists; removing
    it uses all 20 related artists, returning more candidate songs but
    roughly doubling prediction time.'''
    rel_artists = sp.artist_related_artists(
        sp.track(track_id=track_id)['artists'][0]['id'])['artists'][:-10]
    artist_log = []
    for a in rel_artists:
        artist_log.append(a['id'])
    feat_log = []
    for artist in artist_log:
        for track in sp.artist_top_tracks(artist)['tracks']:
            feat_log.append(sp.audio_features(track['id'])[0])

    catalog = pd.DataFrame.from_dict(feat_log)

    root = pd.DataFrame.from_dict(sp.audio_features(tracks=[track_id]))

    # DataFrame.append was removed in pandas 2.x; concat is the equivalent
    merged_df = pd.concat([root, catalog], ignore_index=True)

    dropped_df = merged_df.drop(columns=[
        'uri', 'track_href', 'id', 'duration_ms', 'time_signature', 'mode',
        'loudness', 'type', 'analysis_url'
    ])
    scaled_df = StandardScaler().fit_transform(dropped_df)
    trans_array = scaled_df.copy()

    trans_array[:, 0] = [u * 2.4 for u in trans_array[:, 0]]  # acousticness
    trans_array[:, 1] = [((u * u)**0.5) * u
                         for u in trans_array[:, 1]]  # danceability
    trans_array[:, 2] = [u * 1.7 for u in trans_array[:, 2]]  # energy
    trans_array[:,
                3] = [u * 1.4 for u in trans_array[:, 3]]  # instrumentalness
    trans_array[:, 4] = [u * 0.9 for u in trans_array[:, 4]]  # key
    trans_array[:, 5] = [u * 1.0 for u in trans_array[:, 5]]  # liveness
    trans_array[:, 6] = [u * 1.0 for u in trans_array[:, 6]]  # speechiness
    trans_array[:, 7] = [u * 1.1 for u in trans_array[:, 7]]  # tempo
    trans_array[:, 8] = [u * 2.5 for u in trans_array[:, 8]]  # valence

    knn = NearestNeighbors()
    knn.fit(trans_array)

    rec = knn.kneighbors(trans_array[[0]], n_neighbors=neighbors + 1)

    predict_response = []
    for n in range(1, neighbors + 1):
        if rec[0][0][n] <= max_distance:
            pred_dict = (merged_df.loc[rec[1][0][n], 'id'], rec[0][0][n])
            predict_response.append(pred_dict)

    pred = pd.DataFrame(predict_response,
                        columns=['recommendation', 'distance'])

    df_predict_tracks = pd.DataFrame()  # create dataframe

    a = [sp.track(ii)['artists'][0]['name'] for ii in pred['recommendation']]
    b = [sp.track(ii)['name'] for ii in pred['recommendation']]
    c = [sp.track(ii)['id'] for ii in pred['recommendation']]
    d = [
        sp.track(ii)['external_urls']['spotify']
        for ii in pred['recommendation']
    ]
    e = [sp.track(ii)['explicit'] for ii in pred['recommendation']]
    f = [sp.track(ii)['preview_url'] for ii in pred['recommendation']]
    g = [
        sp.track(ii)['album']['images'][1]['url']
        for ii in pred['recommendation']
    ]

    # Save the results
    df_predict_tracks['artist_name'] = a
    df_predict_tracks['song_name'] = b
    df_predict_tracks['id'] = c
    df_predict_tracks['url'] = d
    df_predict_tracks['explicit'] = e
    df_predict_tracks['preview'] = f
    df_predict_tracks['image'] = g

    df_predict_tracks['preview'] = df_predict_tracks['preview'].apply(
        get_rid_of_nulls)

    df_predict_tracks.index += 1

    return json.dumps(json.loads(df_predict_tracks.to_json(orient='index')),
                      indent=2)
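
# Hypothetical invocation (an assumption): dj_rec relies on a module-level
# authenticated spotipy.Spotify client named `sp`; the track id below is a
# placeholder, not a real id.
# print(dj_rec('<spotify-track-id>'))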

Example #8
def reduce_dim(no_of_components, U, X):
    U_red = U[:, :no_of_components]
    X = np.array(X)
    Z = np.matmul(U_red.T, X.T)
    Z = Z.T
    Z_new = pd.DataFrame(Z,
                         columns=["pc" + str(i) for i in range(Z.shape[1])])
    return Z_new


# In[4]:

U_copy = U.copy()
X_copy = X.copy()
Z_1 = reduce_dim(17, U_copy, X_copy)

U_copy = U.copy()
X_copy = X.copy()
Z_2 = reduce_dim(26, U_copy, X_copy)

U_copy = U.copy()
X_copy = X.copy()
Z_3 = reduce_dim(38, U_copy, X_copy)

# In[5]:

print(Z_1.shape)
print(Z_2.shape)
print(Z_3.shape)
Example #9
# x_data = processing.snv(x_data)
x_data = StandardScaler().fit_transform(x_data)

for i, test_model in enumerate(models_to_test):
    for j, y_transformation in enumerate(y_transformations):
        for k, x_transformation in enumerate(x_transformations):

            title = "{0}\ny transformer: {1}" \
                    "\nx transformer: {2}".format(model_names[i],
                                                  transormation_names[j],
                                                  transormation_names[k])

            if x_transformation:
                x_data_new = x_transformation(x_data)
            else:
                x_data_new = x_data.copy()

            if type(y_transformation) == tuple:
                _model = TransformedTargetRegressor(
                    regressor=clone(test_model),
                    func=y_transformation[0],
                    inverse_func=y_transformation[1])
            elif y_transformation:
                _model = TransformedTargetRegressor(
                    regressor=clone(test_model), transformer=y_transformation)
            else:
                _model = clone(test_model)

            print(title)

            try:
Example #10
    Y['AGE2'] = Y['AGE']**2
    # Y['SITE'] = Y['SITE'].astype('category').cat.codes
    # Y['PTRACCAT'] = Y['PTRACCAT'].astype('category').cat.codes
    Y['PTGENDER'] = Y['PTGENDER'].astype('category').cat.codes
    Y = Y.fillna(Y.mean())
    print(Y.shape)
    print(X.shape)
    print(icv.shape)

    W = np.linalg.inv(Y.T.dot(Y)).dot(Y.T.dot(X))

    # Subtract effect of age
    X = X - Y.dot(W)

    # Save_corrected_data: cdf
    cdf = X.copy()
    cdf.columns = cols
    cdf['label'] = df['label']
    corrected_csv = join(data_csv[:-4] + '_corrected.csv')
    cdf.to_csv(corrected_csv)
    print('CSV saved!')

    # Reduce dimensionality: X_ld
    n_comp = 5 if X.shape[1] > 5 else X.shape[1]
    X_ld = PCA(n_components=n_comp).fit_transform(X)

    # Convert to DataFrame
    cols = ['PC%d' % (c + 1) for c in range(X_ld.shape[1])]
    X_ld = pd.DataFrame(data=X_ld, columns=cols, index=df.index)
    X_ld['label'] = df['label']
    X_ld = X_ld[(X_ld['label'] == 'MCIc') | (X_ld['label'] == 'MCInc')]
Example #11
def get_subsampling_index2(data_process, standard_scale = True, cutoff_sig = 0.02, rate = 0.3, \
                           method = "pykdtree", verbose = 1):
    """
    Using Nearest-Neighbor search based algorithm, find the list of indices of the subsampled dataset
    
    
    Parameters
    -------------
    data_process: List. the list of datapoints, with selected features
    
    standard_scale [True]: Boolean. Whether to apply standard scaler to the dataset prior to subsampling
    
    cutoff_sig [0.02]: Float. cutoff significance. the cutoff distance equals to the Euclidean 
                       norm of the standard deviations in all dimensions of the data points 
    
    rate [0.3]: Float. possibility of deletion
    
    method ["pykdtree"]: String. which backend nearest neighbour model to use. 
                         possible choices: ["pykdtree", "nmslib", "sklearn", "scipy", "annoy", "flann"]
    
    verbose [1]: integer. level of verbosity
    
    
    Return
    -------------
    overall_keep_list: The list of indices of the final subsampled entries
    
    """

    if verbose >= 1:
        print("Started NN-subsampling, original length: {}".format(
            len(data_process)))

    method = method.lower()
    start = time.time()

    if method == "flann":
        if verbose >= 1:
            print("use flann backend")
    elif method == "pykdtree":
        if verbose >= 1:
            print("use pykdtree backend")
    elif method == "sklearn":
        if verbose >= 1:
            print("use slearn nearest neighbors backend")
    elif method == "scipy":
        if verbose >= 1:
            print("use scipy cKDTree backend")
    elif method == "annoy":
        if verbose >= 1:
            print("use annoy backend")
    elif method == "nmslib":
        if verbose >= 1:
            print("use nmslib backend")
    else:
        raise NotImplementedError(
            "method {} not implemented".format(method))

    # apply standard scaling
    if standard_scale:
        if verbose >= 2:
            print("Subample with standard scaled data")
        data_process = StandardScaler().fit_transform(
            np.asarray(data_process).copy())
    else:
        if verbose >= 2:
            print("Subample with original data")
        data_process = np.asarray(data_process).copy()

    # set cutoff distance
    list_of_descs = zip(*data_process)
    sum_std2 = 0.
    for descs in list_of_descs:
        temp_std = np.std(descs)
        sum_std2 += temp_std**2
    cutoff = cutoff_sig * np.sqrt(sum_std2)

    # initialize the index
    overall_keep_list = np.arange(len(data_process)).tolist()

    keep_going = True
    iter_count = 1
    while keep_going:
        if verbose >= 2:
            print('start iteration {}, total length: {}'.format(
                iter_count, len(overall_keep_list)))
        start_cycle = time.time()
        temp_data_process = get_array_based_on_index(data_process.copy(),
                                                     overall_keep_list)

        # build and query nearest-neighbour model
        if method == "flann":
            flann = FLANN()
            indices, distances = flann.nn(temp_data_process,
                                          temp_data_process,
                                          2,
                                          algorithm="kmeans")
        elif method == "scipy":
            kd_tree = cKDTree(temp_data_process)
            distances, indices = kd_tree.query(temp_data_process, k=2)
        elif method == "pykdtree":
            kd_tree = KDTree(temp_data_process, leafsize=6)
            distances, indices = kd_tree.query(temp_data_process, k=2)
        elif method == "sklearn":
            nbrs = NearestNeighbors(n_neighbors=2,
                                    algorithm='kd_tree',
                                    n_jobs=-1).fit(temp_data_process)
            distances, indices = nbrs.kneighbors(temp_data_process)
        elif method == "annoy":
            annoy = AnnoyIndex(len(temp_data_process[0]), metric='euclidean')
            for i in range(len(temp_data_process)):
                annoy.add_item(i, temp_data_process[i])
            annoy.build(1)
            distances = []
            indices = []
            for i in range(len(temp_data_process)):
                temp_index, temp_dist = annoy.get_nns_by_vector(
                    temp_data_process[i], 2, include_distances=True)
                indices.append([i, temp_index[1]])
                distances.append([0.0, temp_dist[1]])
        elif method == "nmslib":
            index = nmslib.init(method='hnsw', space='l2')
            index.addDataPointBatch(temp_data_process)
            index.createIndex(print_progress=False)

            neighbours = index.knnQueryBatch(temp_data_process, k=2)

            distances = []
            indices = []
            for item in neighbours:
                indices.append(item[0])
                distances.append(item[1])

        else:
            raise NotImplementedError

        # if the distance between a point and its nearest neighbour is below
        # the cutoff distance, add the neighbour to the candidate removal list
        remove_index_li = []
        index_li = []

        for index, distance in zip(indices, distances):
            index_li.append(index[0])
            if distance[1] <= cutoff:
                remove_index_li.append(index[1])

        # randomly select datapoints in the candidate removal list (based on rate)
        # and form the final removal list of this iteration
        # stop the cycle if the final removal list is empty
        temp_num = int(ceil(float(len(remove_index_li)) * rate))

        if temp_num == 0:
            keep_going = False
        remove_index_li = random_subsampling(remove_index_li, temp_num)

        temp_keep_list = remove_list_from_list(index_li, remove_index_li)
        overall_keep_list = [overall_keep_list[i] for i in temp_keep_list]
        if verbose >= 2:
            print('end iteration {}. length: {}\t time:{}'.format(
                iter_count, len(overall_keep_list),
                time.time() - start_cycle))
        iter_count += 1

    if verbose >= 1:
        print('end NN-subsampling. length: {}\t time:{}'.format(
            len(overall_keep_list),
            time.time() - start))
    return overall_keep_list
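
# Usage sketch (assumptions: numpy as np is imported, a supported backend such
# as pykdtree is installed, and the helpers get_array_based_on_index,
# random_subsampling and remove_list_from_list are defined in this module):
# demo_data = np.random.rand(1000, 4).tolist()
# keep_idx = get_subsampling_index2(demo_data, cutoff_sig=0.05, rate=0.3,
#                                   method="pykdtree", verbose=1)
# print("kept {} of {} points".format(len(keep_idx), len(demo_data)))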
Example #12
class Cell2Patients():
    def __init__(self,
                 clf,
                 threshold,
                 clf_transfer=None,
                 max_run=None,
                 out_dir="./semi_supervised/",
                 field_separator="\t",
                 increment_rate=10,
                 verbose=True,
                 normalize=False):
        self._clf = clone(clf)
        if clf_transfer is None:
            self._clf_transfer = clone(clf)
        else:
            self._clf_transfer = clf_transfer
        self._normalize = normalize
        self._increment_rate = increment_rate
        self._max_run = max_run
        self._out_dir = os.path.realpath(out_dir)
        os.makedirs(out_dir, exist_ok=True)
        self._Xl = []
        self._Xu = []
        self._names_l = []
        self._features_l = []
        self._names_u = []
        self._features_u = []
        self._Y = []
        self._patient_added = []
        self._verbose = verbose
        self._FS = field_separator
        self._leftover = ""
        self._thr = threshold
        if self._thr < 0.51 or self._thr > 0.99:
            raise ValueError("Threshold must be between 0.51 and 0.99")

    def _mex(self, txt, end="\n"):
        if self._verbose:
            print(txt, flush=True, end=end)
        else:
            if end == "\n":
                logger.debug("{}{}".format(self._leftover, txt))
                self._leftover = ""
            else:
                self._leftover = txt

    def import_data(self, input_file, labelled=True):
        if labelled:
            X, features, names, Y = self._Xl, self._features_l, self._names_l, self._Y
        else:
            X, features, names, Y = self._Xu, self._features_u, self._names_u, None
        if not os.path.exists(input_file):
            raise FileNotFoundError("File {} not found!".format(input_file))
        head_n = 2 if labelled else 1
        with open(input_file, "r") as f:
            while head_n != 0:
                line = [v.strip() for v in f.readline().split(self._FS)]
                if line[0] == "group" or line[0] == "label":
                    for idx in range(1, len(line)):
                        Y.append(line[idx])
                elif line[0] == "name":
                    for idx in range(1, len(line)):
                        names.append(line[idx])
                else:
                    raise ValueError(
                        "File {} is not well formatted. Found {} as first field, should be 'name' or 'group'."
                        .format(input_file, line[0]))
                head_n -= 1
            line = f.readline()
            while line:
                line = [v.strip() for v in line.split(self._FS)]
                if len(line) == len(names) + 1:
                    features.append(line[0])
                    vals = []
                    for idx in range(1, len(line)):
                        try:
                            vals.append(float(line[idx]))
                        except ValueError:
                            vals.append("NaN")
                    X.append(vals)
                line = f.readline()

    def _initData(self):
        self._class_names = sorted(list(set(self._Y)), reverse=True)
        if len(self._class_names) != 2:
            raise ValueError(
                "Cell2Patients works only with binary classification.\n{} labels found: {}"
                .format(len(self._class_names),
                        ", ".join(list(self._class_names))))
        for i in range(len(self._Y)):
            self._Y[i] = self._class_names.index(self._Y[i])
        if sorted(self._features_l) != sorted(self._features_u):
            raise ValueError(
                "Features in labelled and unlabelled are not the same!")
        self._features = self._features_l.copy()
        for i in range(len(self._features_l)):
            self._features_l[i] = self._features.index(self._features_l[i])
        for i in range(len(self._features_u)):
            self._features_u[i] = self._features.index(self._features_u[i])
        warnings.filterwarnings("ignore")
        self._Y = np.asarray(self._Y)
        self._Xl = SimpleImputer(strategy="mean").fit_transform(
            np.array(self._Xl)[self._features_l, :].T)
        self._Xu = SimpleImputer(strategy="mean").fit_transform(
            np.array(self._Xu)[self._features_u, :].T)
        if self._normalize:
            self._Xl = StandardScaler().fit_transform(self._Xl)
            self._Xu = StandardScaler().fit_transform(self._Xu)
        self._Ypatients = [-1 for _ in self._names_u]
        self._mex("Working with {} {} and {} {} cell lines".format(
            sum(self._Y), self._class_names[1],
            len(self._Y) - sum(self._Y), self._class_names[1]))

    def run(self):
        self._initData()
        self._mex(
            "Starting the transfer learning of {} cell lines to {} patients".
            format(len(self._names_l), len(self._names_u)))
        self._patients_proba = []
        self._cell_proba = []
        running = True
        gen = 0
        Xl = self._Xl.copy()
        Y = self._Y.copy()
        self._feature_importances = []
        transferred = []
        prev_novel = [1, 1]
        while running:
            gen += 1
            self._mex("-" * 20)
            self._mex("Training the generation number {}".format(gen))
            if gen == 1:
                clf = clone(self._clf)
            else:
                clf = clone(self._clf_transfer)
            clf.fit(Xl, Y)
            self._feature_importances.append(clf.feature_importances_)
            newX, newY, novel = self._evaluate(clf, transferred,
                                               gen * self._increment_rate)
            tot_pos = sum(
                [1 if v >= self._thr else 0 for v in self._patients_proba[-1]])
            if novel[0] + novel[1] == 0:
                self._mex("No patients added.")
                running = False
            elif (prev_novel[0] == 0 and novel[0] == 0 and tot_pos == 0):
                self._mex(
                    "Model is overfitting over {}. Stopping the iteration.".
                    format(self._class_names[1]))
                running = False
            elif (prev_novel[1] == 0 and novel[1] == 0
                  and tot_pos == len(self._patients_proba[-1])):
                self._mex(
                    "Model is overfitting over {}. Stopping the iteration.".
                    format(self._class_names[0]))
                running = False
            else:
                self._mex("Adding {} patients, {} novel:".format(
                    len(newY), novel[0] + novel[1]))
                self._mex(" - {} labelled as {} ( {} novel ) ".format(
                    len(newY) - sum(newY), self._class_names[0], novel[0]))
                self._mex(" - {} labelled as {} ( {} novel ) ".format(
                    sum(newY), self._class_names[1], novel[1]))
                self._patient_added.append([len(newY) - sum(newY), sum(newY)])
                Xl = np.append(self._Xl, newX, axis=0)
                Y = np.append(self._Y, newY, axis=0)
            prev_novel = novel.copy()
        self._mex("-" * 20)
        self._write_results()
        return self.get_patient_labelled_data()

    def _evaluate(self, clf, transferred, limit):
        proba = clf.predict_proba(self._Xu)[:, 0]
        self._patients_proba.append(proba.copy())
        self._cell_proba.append(clf.predict_proba(self._Xl)[:, 0])
        newX, newY = [], []
        proba = sorted(enumerate(proba), key=lambda tup: tup[1])
        novel = [0, 0]
        i = min(limit, len(proba)) - 1
        while i >= 0:
            if 1 - proba[i][1] >= self._thr:
                newX.append(list(self._Xu[proba[i][0], :]))
                newY.append(1)
                if not proba[i][0] in transferred:
                    novel[1] += 1
                transferred.append(proba[i][0])
                self._Ypatients[proba[i][0]] = 1
            i -= 1
        i = len(proba) - 1
        n = 0
        while n < limit and proba[i][1] >= self._thr:
            newX.append(list(self._Xu[proba[i][0], :]))
            newY.append(0)
            if not proba[i][0] in transferred:
                novel[0] += 1
            transferred.append(proba[i][0])
            self._Ypatients[proba[i][0]] = 0
            n += 1
            i -= 1
        return np.array(newX), np.array(newY), novel

    def _plot_importances(self, pdf):
        plt.close()
        top_10 = [[
            v[0] for v in sorted(enumerate(self._feature_importances[0]),
                                 key=lambda tup: tup[1],
                                 reverse=True)[:10]
        ],
                  [
                      v[0]
                      for v in sorted(enumerate(self._feature_importances[-1]),
                                      key=lambda tup: tup[1],
                                      reverse=True)[:10]
                  ]]
        for idx in top_10[0]:
            plt.plot(range(len(self._feature_importances)),
                     [v[idx] for v in self._feature_importances],
                     "-r",
                     label="Top10 Begin")
        for idx in top_10[1]:
            plt.plot(range(len(self._feature_importances)),
                     [v[idx] for v in self._feature_importances],
                     "--b",
                     label="Top10 End")
        plt.xlabel('Generation')
        plt.ylabel('Feature Importance')
        handles, labels = plt.gca().get_legend_handles_labels()
        newLabels, newHandles = [], []
        for handle, label in zip(handles, labels):
            if label not in newLabels:
                newLabels.append(label)
                newHandles.append(handle)
        plt.legend(newHandles, newLabels)
        plt.title("Top10 feature importance evolution")
        plt.savefig(pdf, format="pdf")

        return

    def _plot_bar(self, pdf):
        plt.close()
        plt.bar([v - 0.21 for v in range(1,
                                         len(self._patient_added) + 1)],
                [v[0] for v in self._patient_added],
                width=0.40,
                color="#F8766D",
                label=self._class_names[0])
        plt.bar([v + 0.21 for v in range(1,
                                         len(self._patient_added) + 1)],
                [v[1] for v in self._patient_added],
                width=0.40,
                color="#00BFC4",
                label=self._class_names[1])
        plt.xlabel("Round")
        plt.ylabel("Number of patients added")
        plt.title("Patients added each round")
        # build the deduplicated legend before saving, so it appears in the PDF
        handles, labels = plt.gca().get_legend_handles_labels()
        newLabels, newHandles = [], []
        for handle, label in zip(handles, labels):
            if label not in newLabels:
                newLabels.append(label)
                newHandles.append(handle)
        plt.legend(newHandles, newLabels)
        plt.savefig(pdf, format="pdf")
        plt.close()

    def _plotPCA(self, pdf):
        X_pca = PCA(n_components=2, svd_solver='full').fit_transform(self._Xl)
        plt.close()
        colors = ['#F8766D', '#00BFC4', 'grey']
        for i in range(2):
            plt.scatter(X_pca[self._Y == i, 0],
                        X_pca[self._Y == i, 1],
                        color=colors[i],
                        lw=2,
                        label="{}".format(self._class_names[i]))
        plt.legend()
        plt.title("Cell lines PCA")
        plt.savefig(pdf, format="pdf")
        plt.close()
        X_pca = PCA(n_components=2, svd_solver='full').fit_transform(self._Xu)
        class_names = self._class_names.copy()
        class_names.append("Unclassified")
        Y = np.array(self._Ypatients)
        # i == -1 wraps to the last entries: 'grey' and "Unclassified"
        for i in range(-1, 2):
            plt.scatter(X_pca[Y == i, 0],
                        X_pca[Y == i, 1],
                        color=colors[i],
                        lw=2,
                        label="{}".format(class_names[i]))
        plt.legend()
        plt.title("Patients PCA")
        plt.savefig(pdf, format="pdf")
        plt.close()

    def _plot_proba(self, pdf):
        plt.close()
        n_gen = len(self._patients_proba)
        plt.plot(range(n_gen), [self._thr for _ in range(n_gen)],
                 "--",
                 color="grey")
        plt.plot(range(n_gen), [1 - self._thr for _ in range(n_gen)],
                 "--",
                 color="grey")
        for gen in range(n_gen):
            blue, red, grey = [], [], []
            for p in self._patients_proba[gen]:
                if p >= self._thr:
                    red.append(p)
                elif p <= 1 - self._thr:
                    blue.append(p)
                else:
                    grey.append(p)
            if len(red) > 0:
                plt.plot([
                    gen + ((np.random.rand() - np.random.rand()) * 0.3)
                    for _ in range(len(red))
                ],
                         red,
                         ".",
                         color="#F8766D",
                         label=self._class_names[0])
            if len(blue) > 0:
                plt.plot([
                    gen + ((np.random.rand() - np.random.rand()) * 0.3)
                    for _ in range(len(blue))
                ],
                         blue,
                         ".",
                         color="#00BFC4",
                         label=self._class_names[1])
            if len(grey) > 0:
                plt.plot([
                    gen + ((np.random.rand() - np.random.rand()) * 0.3)
                    for _ in range(len(grey))
                ],
                         grey,
                         ".",
                         color="grey",
                         label="Unclassified")
        plt.xlabel("Generation")
        plt.ylabel("{} Probability".format(self._class_names[0]))
        plt.ylim(0, 1)
        plt.title("Patients probabilities")
        handles, labels = plt.gca().get_legend_handles_labels()
        newLabels, newHandles = [], []
        for handle, label in zip(handles, labels):
            if label not in newLabels:
                newLabels.append(label)
                newHandles.append(handle)
        plt.legend(newHandles, newLabels)
        plt.savefig(pdf, format="pdf")
        plt.close()
        return

    def _plot_proba_cells(self, pdf):
        plt.close()
        n_gen = len(self._cell_proba)
        plt.plot(range(n_gen), [0.6 for _ in range(n_gen)], "--", color="grey")
        plt.plot(range(n_gen), [0.4 for _ in range(n_gen)], "--", color="grey")
        for gen in range(n_gen):
            blue, red, grey = [], [], []
            for p in self._cell_proba[gen]:
                if p >= 0.6:
                    red.append(p)
                elif p <= 0.4:
                    blue.append(p)
                else:
                    grey.append(p)
            if len(red) > 0:
                plt.plot([
                    gen + ((np.random.rand() - np.random.rand()) * 0.3)
                    for _ in range(len(red))
                ],
                         red,
                         ".",
                         color="#F8766D")
            if len(blue) > 0:
                plt.plot([
                    gen + ((np.random.rand() - np.random.rand()) * 0.3)
                    for _ in range(len(blue))
                ],
                         blue,
                         ".",
                         color="#00BFC4")
            if len(grey) > 0:
                plt.plot([
                    gen + ((np.random.rand() - np.random.rand()) * 0.3)
                    for _ in range(len(grey))
                ],
                         grey,
                         ".",
                         color="grey")
        plt.xlabel("Generation")
        plt.ylabel("{} Probability".format(self._class_names[0]))
        plt.title("Cell lines probabilities")
        plt.ylim(0, 1)
        plt.savefig(pdf, format="pdf")
        plt.close()
        return

    def get_patient_labelled_data(self):
        X = []
        Y = []
        for p in range(len(self._names_u)):
            if self._Ypatients[p] != -1:
                Y.append(self._Ypatients[p])
                X.append(self._Xu[p, :])
        return np.array(X), np.array(Y)

    def get_patient_unlabelled_data(self):
        X = []
        for p in range(len(self._names_u)):
            if self._Ypatients[p] == -1:
                X.append(self._Xu[p, :])
        return np.array(X)

    def _write_results(self):
        pdf = PdfPages("{}/graphs.pdf".format(self._out_dir))
        self._plot_importances(pdf)
        self._plot_proba(pdf)
        self._plot_proba_cells(pdf)
        self._plot_bar(pdf)
        self._plotPCA(pdf)
        pdf.close()
        with open("{}/feature_importances.tsv".format(self._out_dir),
                  "w") as ofs:
            ofs.write("Generation\tFeatureName\tImportance\n")
            for gen in range(len(self._feature_importances)):
                for feat in range(len(self._features)):
                    ofs.write("{}\t{}\t{:.3f}\n".format(
                        gen, self._features[feat],
                        self._feature_importances[gen][feat]))
        with open("{}/patients_probabilities.tsv".format(self._out_dir),
                  "w") as ofs:
            ofs.write(
                "Generation\tPatientName\tProbability_{}\tProbability_{}\n"
                .format(self._class_names[0], self._class_names[1]))
            for gen in range(len(self._patients_proba)):
                for p in range(len(self._names_u)):
                    ofs.write("{}\t{}\t{}\t{}\n".format(
                        gen, self._names_u[p], self._patients_proba[gen][p],
                        1 - self._patients_proba[gen][p]))
        with open("{}/cells_probabilities.tsv".format(self._out_dir),
                  "w") as ofs:
            ofs.write("Generation\tCellName\tProbability_{}\tProbability_{}\n".
                      format(self._class_names[0], self._class_names[1]))
            for gen in range(len(self._cell_proba)):
                for p in range(len(self._names_l)):
                    ofs.write("{}\t{}\t{}\t{}\n".format(
                        gen, self._names_l[p], self._cell_proba[gen][p],
                        1 - self._cell_proba[gen][p]))
        with open("{}/patients_labels.tsv".format(self._out_dir), "w") as ofs:
            ofs.write("Name\tLabel\n")
            for i in range(len(self._names_u)):
                ofs.write("{}\t{}\n".format(
                    self._names_u[i], self._class_names[self._Ypatients[i]]
                    if self._Ypatients[i] != -1 else "NA"))
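
# Hedged usage sketch for Cell2Patients (the classifier choice and file names
# are assumptions; the classifier must expose feature_importances_ after fit,
# and the TSV files must follow the name/group layout parsed by import_data):
# from sklearn.ensemble import RandomForestClassifier
# c2p = Cell2Patients(RandomForestClassifier(n_estimators=200), threshold=0.8)
# c2p.import_data("cell_lines.tsv", labelled=True)
# c2p.import_data("patients.tsv", labelled=False)
# X_new, Y_new = c2p.run()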
Example #13
def process_data(regressor_name, X, normalise=None):
    if regressor_name in classical_ml:
        tmp = []
        for i in tqdm(range(len(X))):
            # 1. flatten
            # 2. fill missing values
            x = X.iloc[i, 0].reset_index(drop=True)
            x.interpolate(method='linear',
                          inplace=True,
                          limit_direction='both')
            if normalise == "standard":
                x = StandardScaler().fit_transform(x.values.reshape(-1, 1))
                x = pd.DataFrame(x)
            elif normalise == "minmax":
                x = MinMaxScaler().fit_transform(x.values.reshape(-1, 1))
                x = pd.DataFrame(x)
            tmp2 = x.values.tolist()
            for j in range(1, len(X.columns)):
                x = X.iloc[i, j].reset_index(drop=True)
                x.interpolate(method='linear',
                              inplace=True,
                              limit_direction='both')
                if normalise == "standard":
                    x = StandardScaler().fit_transform(x.values.reshape(-1, 1))
                    x = pd.DataFrame(x)
                elif normalise == "minmax":
                    x = MinMaxScaler().fit_transform(x.values.reshape(-1, 1))
                    x = pd.DataFrame(x)
                tmp2 = tmp2 + x.values.tolist()
            tmp2 = pd.DataFrame(tmp2).transpose()

            tmp.append(tmp2)
        X = pd.concat(tmp).reset_index(drop=True)
    else:
        tmp = []
        for i in tqdm(range(len(X))):
            x = X.iloc[i, :]
            _x = x.copy(deep=True)

            # 1. find the maximum length of each dimension
            all_len = [len(y) for y in _x]
            max_len = max(all_len)

            # 2. adjust the length of each dimension
            _y = []
            for y in _x:
                # 2.1 fill missing values
                if y.isnull().any():
                    y = y.interpolate(method='linear', limit_direction='both')

                # 2.2 if the dimensions have different lengths, uniformly scale the shorter ones to the max length
                if len(y) < max_len:
                    y = uniform_scaling(y, max_len)
                _y.append(y)
            _y = np.array(np.transpose(_y))
            if normalise == "standard":
                scaler = StandardScaler().fit(_y)
                _y = scaler.transform(_y)
            if normalise == "minmax":
                scaler = MinMaxScaler().fit(_y)
                _y = scaler.transform(_y)

            tmp.append(_y)
        X = np.array(tmp)
    return X
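
# Sketch of process_data on a nested (sktime-style) DataFrame; assumptions:
# pandas as pd, numpy as np and tqdm are imported, classical_ml and
# uniform_scaling are defined at module level, and 'resnet' is a stand-in
# regressor name that is NOT in classical_ml.
# mk = lambda: pd.Series(np.random.rand(50))
# X_nested = pd.DataFrame({'dim_0': [mk() for _ in range(3)],
#                          'dim_1': [mk() for _ in range(3)]})
# X_ready = process_data('resnet', X_nested, normalise='standard')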
Example #14
clf2.fit(X_train)

clf3 = AutoEncoder(hidden_neurons=[25, 15, 10, 2, 10, 15, 25])
clf3.fit(X_train)

# Predict the anomaly scores
y_test_scores = clf3.decision_function(X_test)
y_test_scores = pd.Series(y_test_scores)

# Step 2: Determine the cut point
import matplotlib.pyplot as plt
plt.hist(y_test_scores, bins='auto')  
plt.title("Histogram with Model Clf3 Anomaly Scores")
plt.show()

df_test = X_test.copy()
df_test['score'] = y_test_scores
df_test['cluster'] = np.where(df_test['score']<4, 0, 1)
df_test['cluster'].value_counts()

# Step 3: Get the summary statistics by cluster
df_test.groupby('cluster').mean()



# ensemble
# Put all the predictions in a data frame
from pyod.models.combination import aom, moa, average, maximization

# Put all the predictions in a data frame
train_scores = pd.DataFrame({'clf1': clf1.decision_scores_,
Example #15
def get_feature_patches(PARAMS, FV, patch_size, patch_shift, input_shape):
    # Removing NaN and Inf
    if any(np.array([9, 10, 21, 22, 39]) == np.shape(FV)[1]):
        FV = FV[~np.isnan(FV).any(axis=1), :]
        FV = FV[~np.isinf(FV).any(axis=1), :]
    else:
        FV = FV[:, ~np.isnan(FV).any(axis=0)]
        FV = FV[:, ~np.isinf(FV).any(axis=0)]

    FV = StandardScaler(copy=False).fit_transform(FV)
    # FV should be of the shape (nFeatures, nFrames)
    if any(np.array([9, 10, 21, 22, 39]) == np.shape(FV)[1]):
        FV = FV.T

    frmStart = 0
    frmEnd = 0
    patchNum = 0
    patches = np.empty([])

    if np.shape(FV)[1] < patch_size:
        FV1 = FV.copy()
        while np.shape(FV)[1] <= patch_size:
            FV = np.append(FV, FV1, axis=1)

    # while frmEnd<np.shape(FV)[1]:
    #     # print('get_feature_patches: ', frmStart, frmEnd, np.shape(FV))
    #     frmStart = patchNum*patch_shift
    #     frmEnd = np.min([patchNum*patch_shift+patch_size, np.shape(FV)[1]])
    #     if frmEnd-frmStart<patch_size:
    #         frmStart = frmEnd - patch_size
    #     if np.size(patches)<=1:
    #         patches = np.expand_dims(FV[:, frmStart:frmEnd], axis=0)
    #     else:
    #         patches = np.append(patches, np.expand_dims(FV[:, frmStart:frmEnd], axis=0), axis=0)
    #     patchNum += 1

    # startTime = time.clock()
    # for frmStart in range(0, np.shape(FV)[1], patch_shift):
    #     # print('get_feature_patches: ', frmStart, frmEnd, np.shape(FV))
    #     frmEnd = np.min([frmStart+patch_size, np.shape(FV)[1]])
    #     if frmEnd-frmStart<patch_size:
    #         frmStart = frmEnd - patch_size
    #     if np.size(patches)<=1:
    #         patches = np.array(FV[:, frmStart:frmEnd], ndmin=3)
    #     else:
    #         patches = np.append(patches, np.array(FV[:, frmStart:frmEnd], ndmin=3), axis=0)
    # print('My splitting: ', time.clock()-startTime, np.shape(patches))

    startTime = time.perf_counter()  # time.clock() was removed in Python 3.8
    numPatches = int(np.ceil(np.shape(FV)[1] / patch_shift))
    patches = PatchExtractor(patch_size=(np.shape(FV)[0], patch_size),
                             max_patches=numPatches).transform(
                                 np.expand_dims(FV, axis=0))
    # print('sklearn splitting: ', time.clock()-startTime, np.shape(patches))

    # print('Patches: ', np.shape(patches))
    if (np.shape(patches)[1] == 9) or (np.shape(patches)[1] == 10):
        diff_dim = input_shape[0] - np.shape(patches)[1]
        zero_padding = np.zeros(
            (np.shape(patches)[0], diff_dim, np.shape(patches)[2]))
        patches = np.append(patches, zero_padding, axis=1)
    elif np.shape(patches)[1] == 22:
        patches = patches[:, :21, :]
    elif np.shape(patches)[1] == 39:
        if not PARAMS['39_dim_CC_feat']:
            first_7_cep_dim = np.array(
                list(range(0, 7)) + list(range(13, 20)) + list(range(26, 33)))
            patches = patches[:, first_7_cep_dim, :]
    # print('Patches: ', np.shape(patches))

    return patches
Example #16
from tempfile import mkdtemp
from functools import wraps
import pytest

from sklearn import datasets

import warnings

n_clusters = 3
# X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50)
X, y = make_blobs(n_samples=200, random_state=10)
X, y = shuffle(X, y, random_state=7)
X = StandardScaler().fit_transform(X)

X_missing_data = X.copy()
X_missing_data[0] = [np.nan, 1]
X_missing_data[5] = [np.nan, np.nan]


def test_missing_data():
    """Tests if nan data are treated as infinite distance from all other points and assigned to -1 cluster"""
    model = HDBSCAN().fit(X_missing_data)
    assert model.labels_[0] == -1
    assert model.labels_[5] == -1
    assert model.probabilities_[0] == 0
    assert model.probabilities_[5] == 0
    clean_indices = list(range(1, 5)) + list(range(6, 200))
    clean_model = HDBSCAN().fit(X_missing_data[clean_indices])
    assert np.allclose(clean_model.labels_, model.labels_[clean_indices])
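
# Quick interactive check mirroring the test above (assumes the hdbscan
# package is installed): rows containing NaN come back as noise.
# labels = HDBSCAN().fit(X_missing_data).labels_
# print(labels[0], labels[5])  # both -1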
Example #17
def load_data(path, file_name):
    with open(path + file_name + '.pickle', 'rb') as handle:
        data = pickle.load(handle)
    return data


# -------------------------------------------------------------------------------------------------------------

orient = 9
pix_per_cell = 8
cell_per_block = 2
hist_bins = 32

X_scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

filename = 'svm_model.sav'

data_vehicle = load_data("./data/", "Vehicle")
data_no_vehicle = load_data("./data/", "No_Vehicle")

ones = np.ones(len(data_vehicle))
zeros = np.zeros(len(data_no_vehicle))
feature_list = [ones, zeros]

label = np.hstack(feature_list)
# for idx in range(len(data_no_vehicle)):
#     img = data_no_vehicle[idx]

Example #18
    coef[:n_relevant_features] = coef_min + rng.rand(n_relevant_features)

    # The correlation of our design: variables correlated by blocks of 3
    corr = np.zeros((n_features, n_features))
    for i in range(0, n_features, block_size):
        corr[i:i + block_size, i:i + block_size] = 1 - conditioning
    corr.flat[::n_features + 1] = 1
    corr = linalg.cholesky(corr)

    # Our design
    X = rng.normal(size=(n_samples, n_features))
    X = np.dot(X, corr)
    # Keep [Wainwright2006] (26c) constant
    X[:n_relevant_features] /= np.abs(
        linalg.svdvals(X[:n_relevant_features])).max()
    X = StandardScaler().fit_transform(X.copy())

    # The output variable
    y = np.dot(X, coef)
    y /= np.std(y)
    # We scale the added noise as a function of the average correlation
    # between the design and the output variable
    y += noise_level * rng.normal(size=n_samples)
    mi = mutual_incoherence(X[:, :n_relevant_features],
                            X[:, n_relevant_features:])

    ###########################################################################
    # Plot stability selection path, using a high eps for early stopping
    # of the path, to save computation time
    alpha_grid, scores_path = lasso_stability_path(X, y, random_state=42,
                                                   eps=0.05)
Example #19
        keras.layers.Dense(240, activation=tf.nn.relu),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(240, activation=tf.nn.relu),
        keras.layers.Dropout(0.1),
        keras.layers.Dense(240, activation=tf.nn.tanh),
        keras.layers.Dropout(0.1),
        keras.layers.Dense(5, activation=tf.nn.softmax),
    ])
    model.name = 'model' + str(cont)

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

    ### drop outliers
    tmp = pd.DataFrame(data=X.copy())
    tmp.insert(0, column='y', value=y)

    X_curr, y_curr = drop_outliers(tmp, cont)

    # model.fit(X_curr, y_curr, epochs = 50)
    model.fit(X_curr, y_curr, epochs=200)

    # results[cont] = model.evaluate(X_test, y_test)
    # print("done with ", model.name)
    # print(results[cont])

# results ={}

# # exit()