    # Assumed module-level imports (not shown in this excerpt): numpy as np,
    # scipy.sparse, and sklearn's MaxAbsScaler, MiniBatchKMeans (and DBSCAN
    # for the commented-out alternative below).
    def cluster_user(self):
        user_feature_matrix = self.__extract_user_feature()
        user_feature_matrix = user_feature_matrix.tocsr()
        # Scale each feature to [-1, 1] by its max absolute value; this
        # preserves sparsity, unlike mean-centering scalers
        user_feature_matrix = MaxAbsScaler().fit_transform(user_feature_matrix)
        # model = DBSCAN(eps=0.5, min_samples=100).fit(user_feature_matrix)
        # MiniBatchKMeans accepts sparse input directly; densifying with
        # toarray() is unnecessary on large matrices
        model = MiniBatchKMeans(n_clusters=50, max_iter=10000).fit(user_feature_matrix)
        labels = model.labels_
        # The -1 check only matters for DBSCAN, which labels noise as -1;
        # k-means never emits negative labels
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        print('Estimated number of clusters: %d' % n_clusters_)
        # Map each user id to its cluster label via the user -> row-index dict
        user_label_dict = dict()
        for user in self.__user_ix_dict:
            user_label_dict[user] = labels[self.__user_ix_dict[user]]
        return user_label_dict
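
    # Usage sketch (the enclosing class and its constructor are not shown in
    # this excerpt, so the names below are assumptions for illustration only):
    #
    #   clusterer = UserClusterer(interaction_data)   # hypothetical class
    #   user_labels = clusterer.cluster_user()        # {user_id: cluster_id}
    #   cohort = [u for u, c in user_labels.items() if c == 0]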

    # No self parameter, so this is assumed to be a @staticmethod of the
    # class that the GenotypeReader.get_float_or_zero call below suggests.
    @staticmethod
    def create_read_tabular_file(path,
                                 save_pref='_',
                                 feature_normalization=None,
                                 transpose=False,
                                 override=False):
        '''
        Parse a tab-separated genotype table into a sparse feature matrix
        and save the matrix plus the row/column name lists to disk.

        :param path: path to the tab-separated input table
        :param save_pref: prefix used for the saved output files
        :param feature_normalization: 'binary': {0,1}, '01': [0,1],
               'percent': {0,1,..,100}, 'zu': zero mean, unit variance;
               None leaves the raw values untouched
        :param transpose: set True if isolates are columns rather than rows
        :param override: if True, regenerate the outputs even if they exist
        :return: a status message string
        '''
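        # Illustrative effect of each mode on a raw feature column [0, 3, 4]
        # (max absolute value 4):
        #   'binary'  -> [0, 1, 1]        round of max-abs scaling
        #   '01'      -> [0.0, 0.75, 1.0]
        #   'percent' -> [0, 75, 100]
        #   'zu'      -> zero mean, unit variance via StandardScaler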
        print('Start creating', save_pref)
        if override or not os.path.exists('_'.join(
            [save_pref, 'feature', 'vect.npz'])):
            # Read the whole table; the context manager closes the file handle
            with codecs.open(path, 'r', 'utf-8') as table_file:
                rows = [l.strip() for l in table_file.readlines()]
            # Column 0 of each data row is a name; the remaining cells are
            # parsed as floats (non-numeric cells fall back to zero)
            tf_vec = sparse.csr_matrix([[
                GenotypeReader.get_float_or_zero(x)
                for x in entry.split('\t')[1::]
            ] for entry in rows[1::]])

            if transpose:
                # Transpose sparsely instead of densifying via toarray()
                tf_vec = tf_vec.transpose().tocsr()
                isolates = [
                    feat.replace(' ', '')
                    for feat in rows[0].rstrip().split('\t')
                ]
                feature_names = [row.split()[0] for row in rows[1::]]
            else:
                isolates = [row.split()[0] for row in rows[1::]]
                feature_names = [
                    feat.replace(' ', '')
                    for feat in rows[0].rstrip().split('\t')
                ]
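
            # Expected input layout (illustrative), with transpose=False:
            #
            #   <id>\tfeat_1\tfeat_2\t...
            #   isolate_A\t0\t3\t...
            #   isolate_B\t1\t0\t...
            #
            # rows[0] carries the feature names and column 0 of every data
            # row carries the isolate name; transpose=True swaps the roles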

            # normalizer / discretizer
            if feature_normalization:
                if feature_normalization == 'binary':
                    # Round the stored non-zeros in place so the matrix
                    # stays sparse and save_sparse_csr keeps working
                    tf_vec = MaxAbsScaler().fit_transform(tf_vec)
                    tf_vec.data = np.round(tf_vec.data)
                elif feature_normalization == '01':
                    tf_vec = MaxAbsScaler().fit_transform(tf_vec)
                elif feature_normalization == 'percent':
                    tf_vec = MaxAbsScaler().fit_transform(tf_vec)
                    tf_vec.data = np.round(tf_vec.data * 100)
                elif feature_normalization == 'zu':
                    # StandardScaler centers the data, so a dense
                    # round-trip is unavoidable here
                    tf_vec = sparse.csr_matrix(
                        preprocessing.StandardScaler().fit_transform(
                            tf_vec.toarray()))

            # Persist the (optionally normalized) matrix and the name lists;
            # this must run whether or not a normalization was requested
            FileUtility.save_sparse_csr(
                '_'.join([save_pref, 'feature', 'vect.npz']), tf_vec)
            FileUtility.save_list(
                '_'.join([save_pref, 'feature', 'list.txt']), feature_names)
            FileUtility.save_list(
                '_'.join([save_pref, 'strains', 'list.txt']), isolates)
            msg = '{} created successfully containing {} strains and {} features'.format(
                save_pref, len(isolates), len(feature_names))
            print(msg)
            return msg
        else:
            msg = save_pref + ' already exists'
            print(msg)
            return msg
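
# Minimal usage sketch, assuming these methods live on the GenotypeReader
# class referenced above and that a tab-separated table exists at the
# hypothetical path below.
if __name__ == '__main__':
    status = GenotypeReader.create_read_tabular_file(
        'genotype_table.tsv',        # hypothetical input file
        save_pref='demo',            # outputs: demo_feature_vect.npz, ...
        feature_normalization='01',  # rescale every feature into [0, 1]
        transpose=False,             # isolates are rows, not columns
        override=True)               # regenerate even if outputs exist
    print(status)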