コード例 #1
0
File: main.py  Project: sachinvarriar/ditk
 def impute(self, trained_model, input):
     """
     Impute the missing entries of a table with SoftImpute.

     The table is first biscaled (rows and columns normalized), the
     imputation runs in that normalized space, and the result is
     mapped back to the original scale before being returned.

     :param trained_model: trained model returned by train function - not used in our case
     :param input: input table which needs to be imputed
     :return:
         imputed table as a numpy array
     """
     solver = SoftImpute()
     scaler = BiScaler()
     normalized = scaler.fit_transform(input)
     completed_normalized = solver.fit_transform(normalized)
     return scaler.inverse_transform(completed_normalized)
コード例 #2
0
# k-nearest-neighbour imputation (knnImpute and X_incomplete are created
# earlier in the script, outside this excerpt)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, instead
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()

# rescale both rows and columns to have zero mean and unit variance
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

# impute in the normalized space ...
X_filled_softimpute_normalized = softImpute.fit_transform(
    X_incomplete_normalized)
# ... then undo the biscaling so the values are back on the original scale
X_filled_softimpute = biscaler.inverse_transform(
    X_filled_softimpute_normalized)

# for comparison: SoftImpute applied directly to the raw incomplete matrix
X_filled_softimpute_no_biscale = softImpute.fit_transform(X_incomplete)

# mean squared error over the held-out entries only
# (X, X_filled_mean and missing_mask are defined earlier, outside this excerpt)
meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask])**2).mean()
print("meanFill MSE: %f" % meanfill_mse)

# print mean squared error for the imputation methods above
nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask])**2).mean()
print("Nuclear norm minimization MSE: %f" % nnm_mse)
コード例 #3
0
        # Persist the incomplete peptide x allele affinity matrix if a path
        # was given (the enclosing "if" starts above this excerpt).
        df = pd.DataFrame(X, columns=allele_list, index=peptide_list)
        df.to_csv(args.save_incomplete_affinity_matrix, index_label="peptide")

    # Accumulates per-method evaluation metrics across CV folds.
    scores = ScoreSet()
    kfold = stratified_cross_validation(
        X=X,
        observed_mask=observed_mask,
        n_folds=args.n_folds)
    for fold_idx, (X_fold, ok_mesh, test_coords, X_test_vector) in enumerate(kfold):
        # Restrict to the rows/columns selected by ok_mesh before scaling.
        X_fold_reduced = X_fold[ok_mesh]
        biscaler = BiScaler(
            scale_rows=args.normalize_rows,
            center_rows=args.normalize_rows,
            scale_columns=args.normalize_columns,
            center_columns=args.normalize_columns)
        X_fold_reduced_scaled = biscaler.fit_transform(X=X_fold_reduced)
        for (method_name, solver) in sorted(imputation_methods.items()):
            print("CV fold %d/%d, running %s" % (
                fold_idx + 1,
                args.n_folds,
                method_name))
            # NOTE(review): the solver is run on the *unscaled* matrix
            # (X_fold_reduced) yet the result is inverse-transformed with the
            # biscaler below; X_fold_reduced_scaled appears unused -- confirm
            # whether complete() should receive the scaled matrix instead.
            X_completed_reduced_scaled = solver.complete(X_fold_reduced)
            X_completed_reduced = biscaler.inverse_transform(
                X_completed_reduced_scaled)
            # Scatter the completed sub-matrix back into a full-size array so
            # test_coords can index it.
            X_completed = np.zeros_like(X)
            X_completed[ok_mesh] = X_completed_reduced
            y_pred = X_completed[test_coords]
            mae, tau, auc, f1_score = evaluate_predictions(
                y_true=X_test_vector, y_pred=y_pred, max_ic50=args.max_ic50)
            scores.add_many(
                method_name,
コード例 #4
0
class ResultsTable(object):
    """Benchmark harness comparing matrix-completion solvers on images.

    Takes a dict of labeled images, removes a square patch of pixels from
    each one, and flattens the result into an (n_images, n_features)
    matrix whose missing pixels are NaN.  Each solver given to
    ``add_entry`` imputes the missing entries; masked MSE/MAE against the
    original pixels are recorded and sample reconstructions are saved as
    PNG files.  Results can be printed or written as an HTML table.
    """

    def __init__(self,
                 images_dict,
                 percent_missing=0.25,
                 saved_image_stride=25,
                 dirname="face_images",
                 scale_rows=False,
                 center_rows=False):
        """
        :param images_dict: mapping from label to image array
        :param percent_missing: fraction of each image's pixels to remove
        :param saved_image_stride: save every Nth image to disk
        :param dirname: output directory for saved images / HTML results
        :param scale_rows: forwarded to BiScaler
        :param center_rows: forwarded to BiScaler
        """
        self.images_dict = images_dict
        self.labels = list(sorted(images_dict.keys()))
        self.images_array = np.array([images_dict[k]
                                      for k in self.labels]).astype("float32")
        self.image_shape = self.images_array[0].shape
        self.width, self.height = self.image_shape[:2]
        # RGB images have a third axis of length 3
        self.color = (len(self.image_shape) == 3) and (self.image_shape[2]
                                                       == 3)
        if self.color:
            self.images_array = color_balance(self.images_array)
        self.n_pixels = self.width * self.height
        self.n_features = self.n_pixels * (3 if self.color else 1)
        self.n_images = len(self.images_array)
        print(
            "[ResultsTable] # images = %d, color=%s # features = %d, shape = %s"
            % (self.n_images, self.color, self.n_features, self.image_shape))

        self.flattened_array_shape = (self.n_images, self.n_features)

        self.flattened_images = self.images_array.reshape(
            self.flattened_array_shape)

        n_missing_pixels = int(self.n_pixels * percent_missing)

        # a single square patch whose area matches percent_missing
        missing_square_size = int(np.sqrt(n_missing_pixels))
        print(
            "[ResultsTable] n_missing_pixels = %d, missing_square_size = %d" %
            (n_missing_pixels, missing_square_size))
        self.incomplete_images = remove_pixels(
            self.images_array, missing_square_size=missing_square_size)
        print("[ResultsTable] Incomplete images shape = %s" %
              (self.incomplete_images.shape, ))
        self.flattened_incomplete_images = self.incomplete_images.reshape(
            self.flattened_array_shape)
        # NaN marks a removed pixel
        self.missing_mask = np.isnan(self.flattened_incomplete_images)
        self.normalizer = BiScaler(scale_rows=scale_rows,
                                   center_rows=center_rows,
                                   min_value=self.images_array.min(),
                                   max_value=self.images_array.max())
        self.incomplete_normalized = self.normalizer.fit_transform(
            self.flattened_incomplete_images)

        self.saved_image_indices = list(
            range(0, self.n_images, saved_image_stride))
        self.saved_images = defaultdict(dict)
        self.dirname = dirname
        self.mse_dict = {}
        self.mae_dict = {}

        self.save_images(self.images_array, "original", flattened=False)
        self.save_images(self.incomplete_images, "incomplete", flattened=False)

    def ensure_dir(self, dirname):
        """Create dirname if it does not already exist."""
        if not exists(dirname):
            print("Creating directory: %s" % dirname)
            mkdir(dirname)

    def save_images(self, images, base_filename, flattened=True):
        """Save every stride-th image as <dirname>/<label>/<base_filename>.png.

        :param images: stack of images, either flattened rows or full arrays
        :param base_filename: PNG name (without extension)
        :param flattened: if True, each row is reshaped back to image_shape
        """
        self.ensure_dir(self.dirname)
        for i in self.saved_image_indices:
            label = self.labels[i].lower().replace(" ", "_")
            image = images[i, :].copy()
            if flattened:
                image = image.reshape(self.image_shape)
            # render still-missing pixels as black
            image[np.isnan(image)] = 0
            figure = pylab.gcf()
            axes = pylab.gca()
            extra_kwargs = {}
            # BUGFIX: grayscale (2D) data needs an explicit gray colormap --
            # matplotlib otherwise renders it with its default colormap,
            # while RGB data ignores cmap entirely.  The original code set
            # cmap only for color images (condition was inverted).
            if not self.color:
                extra_kwargs["cmap"] = "gray"
            assert image.min() >= 0, "Image can't contain negative numbers"
            # values in [0, 1] are assumed normalized; rescale to 8-bit
            if image.max() <= 1:
                image *= 256
            image[image > 255] = 255
            axes.imshow(image.astype("uint8"), **extra_kwargs)
            axes.get_xaxis().set_visible(False)
            axes.get_yaxis().set_visible(False)
            filename = base_filename + ".png"
            subdir = join(self.dirname, label)
            self.ensure_dir(subdir)
            path = join(subdir, filename)
            figure.savefig(path, bbox_inches='tight')
            self.saved_images[i][base_filename] = path

    def add_entry(self, solver, name):
        """Run a solver on the incomplete matrix and record its errors.

        :param solver: object whose fit_transform(X) imputes NaN entries
        :param name: label used in the results dicts and saved image names
        """
        print("Running %s" % name)
        completed_normalized = solver.fit_transform(self.incomplete_normalized)
        completed = self.normalizer.inverse_transform(completed_normalized)

        # errors are measured only over the removed (masked) pixels
        mae = masked_mae(X_true=self.flattened_images,
                         X_pred=completed,
                         mask=self.missing_mask)
        mse = masked_mse(X_true=self.flattened_images,
                         X_pred=completed,
                         mask=self.missing_mask)
        print("==> %s: MSE=%0.4f MAE=%0.4f" % (name, mse, mae))
        self.mse_dict[name] = mse
        self.mae_dict[name] = mae
        self.save_images(completed, base_filename=name)

    def sorted_errors(self):
        """
        Generator for (rank, name, MSE, MAE) sorted by increasing MAE
        """
        for i, (name, mae) in enumerate(
                sorted(self.mae_dict.items(), key=lambda x: x[1])):
            yield (
                i + 1,
                name,
                self.mse_dict[name],
                self.mae_dict[name],
            )

    def print_sorted_errors(self):
        """Print one '<rank>) <name>: MSE=... MAE=...' line per solver."""
        for (rank, name, mse, mae) in self.sorted_errors():
            print("%d) %s: MSE=%0.4f MAE=%0.4f" % (rank, name, mse, mae))

    def save_html_table(self, filename="results_table.html"):
        """Write the ranked error table to dirname/filename and return the HTML."""
        html = """
            <table>
            <th>
                <td>Rank</td>
                <td>Name</td>
                <td>Mean Squared Error</td>
                <td>Mean Absolute Error</td>
            </th>
        """
        for (rank, name, mse, mae) in self.sorted_errors():
            html += """
            <tr>
                <td>%d</td>
                <td>%s</td>
                <td>%0.4f</td>
                <td>%0.4f</td>
            </tr>
            """ % (rank, name, mse, mae)
        html += "</table>"
        self.ensure_dir(self.dirname)
        path = join(self.dirname, filename)
        with open(path, "w") as f:
            f.write(html)
        return html
コード例 #5
0
    'silent': -1,
    'verbose': -1,
    'n_jobs': -1,
}
# Fit-time parameters (AUC metric, early stopping after 150 stale rounds,
# progress every 100 rounds) -- presumably passed to a gradient-boosting
# fit() call later in the script; confirm against the training loop.
fit_params = {
    'eval_metric': 'auc',
    'early_stopping_rounds': 150,
    'verbose': 100
}

with timer('impute missing'):
    # Stack train and test so the BiScaler sees the full value distribution.
    df = pd.concat([X_train, X_test], axis=0)
    # Drop columns that are entirely NaN.
    df = df.loc[:, df.isnull().sum() != len(df)]
    # BiScaler can only normalize numeric columns.
    cols = [f for f in df.columns if df[f].dtype != 'object']
    bi = BiScaler()
    df[cols] = bi.fit_transform(df[cols].values)
    # Any NaNs left after scaling get a sentinel value.
    df.fillna(-9999, inplace=True)
    # Split back into the original train/test partitions by row count.
    X_train = df[:len(X_train)].copy()
    X_test = df[len(X_train):].copy()
    del bi, df, cols
    gc.collect()

with timer('training'):
    # Per-fold metrics collected across the cross-validation loop.
    cv_results = []
    # Copy of the target, reused to hold out-of-fold validation predictions.
    val_series = y_train.copy()
    test_df = pd.DataFrame()
    feat_df = None
    for i, (trn_idx, val_idx) in enumerate(cv.split(X_train, y_train)):
        # Fold-local train/validation splits (loop body continues past
        # this excerpt).
        X_trn = X_train.iloc[trn_idx].copy()
        y_trn = y_train[trn_idx]
        X_val = X_train.iloc[val_idx].copy()
コード例 #6
0
class ResultsTable(object):
    """Benchmark harness comparing matrix-completion solvers on images.

    Builds an (n_images, n_features) matrix of flattened images with a
    square patch of pixels removed (NaN) from each, lets each solver
    passed to ``add_entry`` impute the missing entries, records masked
    MSE/MAE against the originals, saves sample reconstructions as PNGs,
    and can emit the ranked results as an HTML table.
    """

    def __init__(
            self,
            images_dict,
            percent_missing=0.25,
            saved_image_stride=25,
            dirname="face_images",
            scale_rows=False,
            center_rows=False):
        """
        :param images_dict: mapping from label to image array
        :param percent_missing: fraction of each image's pixels to remove
        :param saved_image_stride: save every Nth image to disk
        :param dirname: output directory for saved images / HTML results
        :param scale_rows: forwarded to BiScaler
        :param center_rows: forwarded to BiScaler
        """
        self.images_dict = images_dict
        self.labels = list(sorted(images_dict.keys()))
        self.images_array = np.array(
            [images_dict[k] for k in self.labels]).astype("float32")
        self.image_shape = self.images_array[0].shape
        self.width, self.height = self.image_shape[:2]
        # RGB images have a third axis of length 3
        self.color = (len(self.image_shape) == 3) and (self.image_shape[2] == 3)
        if self.color:
            self.images_array = color_balance(self.images_array)
        self.n_pixels = self.width * self.height
        self.n_features = self.n_pixels * (3 if self.color else 1)
        self.n_images = len(self.images_array)
        print("[ResultsTable] # images = %d, color=%s # features = %d, shape = %s" % (
            self.n_images, self.color, self.n_features, self.image_shape))

        self.flattened_array_shape = (self.n_images, self.n_features)

        self.flattened_images = self.images_array.reshape(self.flattened_array_shape)

        n_missing_pixels = int(self.n_pixels * percent_missing)

        # a single square patch whose area matches percent_missing
        missing_square_size = int(np.sqrt(n_missing_pixels))
        print("[ResultsTable] n_missing_pixels = %d, missing_square_size = %d" % (
            n_missing_pixels, missing_square_size))
        self.incomplete_images = remove_pixels(
            self.images_array,
            missing_square_size=missing_square_size)
        print("[ResultsTable] Incomplete images shape = %s" % (
            self.incomplete_images.shape,))
        self.flattened_incomplete_images = self.incomplete_images.reshape(
            self.flattened_array_shape)
        # NaN marks a removed pixel
        self.missing_mask = np.isnan(self.flattened_incomplete_images)
        self.normalizer = BiScaler(
            scale_rows=scale_rows,
            center_rows=center_rows,
            min_value=self.images_array.min(),
            max_value=self.images_array.max())
        self.incomplete_normalized = self.normalizer.fit_transform(
            self.flattened_incomplete_images)

        self.saved_image_indices = list(
            range(0, self.n_images, saved_image_stride))
        self.saved_images = defaultdict(dict)
        self.dirname = dirname
        self.mse_dict = {}
        self.mae_dict = {}

        self.save_images(self.images_array, "original", flattened=False)
        self.save_images(self.incomplete_images, "incomplete", flattened=False)

    def ensure_dir(self, dirname):
        """Create dirname if it does not already exist."""
        if not exists(dirname):
            print("Creating directory: %s" % dirname)
            mkdir(dirname)

    def save_images(self, images, base_filename, flattened=True):
        """Save every stride-th image as <dirname>/<label>/<base_filename>.png.

        :param images: stack of images, either flattened rows or full arrays
        :param base_filename: PNG name (without extension)
        :param flattened: if True, each row is reshaped back to image_shape
        """
        self.ensure_dir(self.dirname)
        for i in self.saved_image_indices:
            label = self.labels[i].lower().replace(" ", "_")
            image = images[i, :].copy()
            if flattened:
                image = image.reshape(self.image_shape)
            # render still-missing pixels as black
            image[np.isnan(image)] = 0
            figure = pylab.gcf()
            axes = pylab.gca()
            extra_kwargs = {}
            # BUGFIX: grayscale (2D) data needs an explicit gray colormap --
            # matplotlib otherwise renders it with its default colormap,
            # while RGB data ignores cmap entirely.  The original code set
            # cmap only for color images (condition was inverted).
            if not self.color:
                extra_kwargs["cmap"] = "gray"
            assert image.min() >= 0, "Image can't contain negative numbers"
            # values in [0, 1] are assumed normalized; rescale to 8-bit
            if image.max() <= 1:
                image *= 256
            image[image > 255] = 255
            axes.imshow(image.astype("uint8"), **extra_kwargs)
            axes.get_xaxis().set_visible(False)
            axes.get_yaxis().set_visible(False)
            filename = base_filename + ".png"
            subdir = join(self.dirname, label)
            self.ensure_dir(subdir)
            path = join(subdir, filename)
            figure.savefig(
                path,
                bbox_inches='tight')
            self.saved_images[i][base_filename] = path

    def add_entry(self, solver, name):
        """Run a solver on the incomplete matrix and record its errors.

        :param solver: object whose complete(X) imputes NaN entries
            (NOTE(review): this is the pre-0.4 fancyimpute API; newer
            releases renamed complete() to fit_transform())
        :param name: label used in the results dicts and saved image names
        """
        print("Running %s" % name)
        completed_normalized = solver.complete(self.incomplete_normalized)
        completed = self.normalizer.inverse_transform(completed_normalized)

        # errors are measured only over the removed (masked) pixels
        mae = masked_mae(
            X_true=self.flattened_images,
            X_pred=completed,
            mask=self.missing_mask)
        mse = masked_mse(
            X_true=self.flattened_images,
            X_pred=completed,
            mask=self.missing_mask)
        print("==> %s: MSE=%0.4f MAE=%0.4f" % (name, mse, mae))
        self.mse_dict[name] = mse
        self.mae_dict[name] = mae
        self.save_images(completed, base_filename=name)

    def sorted_errors(self):
        """
        Generator for (rank, name, MSE, MAE) sorted by increasing MAE
        """
        for i, (name, mae) in enumerate(
                sorted(self.mae_dict.items(), key=lambda x: x[1])):
            yield(i + 1, name, self.mse_dict[name], self.mae_dict[name],)

    def print_sorted_errors(self):
        """Print one '<rank>) <name>: MSE=... MAE=...' line per solver."""
        for (rank, name, mse, mae) in self.sorted_errors():
            print("%d) %s: MSE=%0.4f MAE=%0.4f" % (
                rank,
                name,
                mse,
                mae))

    def save_html_table(self, filename="results_table.html"):
        """Write the ranked error table to dirname/filename and return the HTML."""
        html = """
            <table>
            <th>
                <td>Rank</td>
                <td>Name</td>
                <td>Mean Squared Error</td>
                <td>Mean Absolute Error</td>
            </th>
        """
        for (rank, name, mse, mae) in self.sorted_errors():
            html += """
            <tr>
                <td>%d</td>
                <td>%s</td>
                <td>%0.4f</td>
                <td>%0.4f</td>
            </tr>
            """ % (rank, name, mse, mae)
        html += "</table>"
        self.ensure_dir(self.dirname)
        path = join(self.dirname, filename)
        with open(path, "w") as f:
            f.write(html)
        return html
コード例 #7
0
# Seed both RNGs so the imputation run is reproducible.
random.seed(123)
np.random.seed(123)

# read in data and transpose (input_file / outfile are defined earlier in
# the script, outside this excerpt)
# NOTE(review): error_bad_lines was removed in pandas 2.0; migrate to
# on_bad_lines="skip" when dropping support for pandas < 1.3.
data = pd.read_csv(input_file, sep='\t', header=0, index_col=0,
                   error_bad_lines=False)
new_data = data.copy()
transposed = new_data.T

# we'll need a matrix specifically for the biscaler transform, for SoftImpute
print("SoftImpute...")
# .values instead of .as_matrix(): as_matrix() was deprecated in pandas
# 0.23 and removed in 1.0; .values returns the same ndarray on every version.
transposed_mat = transposed.values
biscaler = BiScaler()

# perform the scaling appropriate for this imputation strategy
transposed_normalized = biscaler.fit_transform(transposed_mat)

# the imputation itself
imputed_softimpute = SoftImpute().fit_transform(transposed_normalized)

# we don't want the transformed values and we want samples to be columns
inverse_softimpute = biscaler.inverse_transform(imputed_softimpute)
untransposed_softimpute = inverse_softimpute.transpose()

# prepare to write to file, back to DataFrame, return indices
softimpute_df = pd.DataFrame(untransposed_softimpute)
softimpute_df.index = data.index
softimpute_df.columns = data.columns.values

# write to a tab separated values file, but we'll use the .pcl file extension
softimpute_outfile = outfile + "_softimpute.pcl"
コード例 #8
0
File: readme_example.py  Project: garner1/fancyimpute
# fancyimpute >= 0.4 renamed Solver.complete() to fit_transform(); the calls
# below use the current name, matching the other readme example in this file.
# (knnImpute, X_incomplete, X, X_filled_mean and missing_mask are defined
# earlier in the script, outside this excerpt.)
X_filled_knn = knnImpute.fit_transform(X_incomplete)

# matrix completion using convex optimization to find low-rank solution
# that still matches observed values. Slow!
X_filled_nnm = NuclearNormMinimization().fit_transform(X_incomplete)

# Instead of solving the nuclear norm objective directly, instead
# induce sparsity using singular value thresholding
softImpute = SoftImpute()

# simultaneously normalizes the rows and columns of your observed data,
# sometimes useful for low-rank imputation methods
biscaler = BiScaler()

# rescale both rows and columns to have zero mean and unit variance
X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

# impute in the normalized space, then undo the scaling
X_filled_softimpute_normalized = softImpute.fit_transform(X_incomplete_normalized)
X_filled_softimpute = biscaler.inverse_transform(X_filled_softimpute_normalized)

# for comparison: SoftImpute without the biscaling step
X_filled_softimpute_no_biscale = softImpute.fit_transform(X_incomplete)

# mean squared error over the held-out entries only
meanfill_mse = ((X_filled_mean[missing_mask] - X[missing_mask]) ** 2).mean()
print("meanFill MSE: %f" % meanfill_mse)

# print mean squared error for the three imputation methods above
nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
print("Nuclear norm minimization MSE: %f" % nnm_mse)

softImpute_mse = ((X_filled_softimpute[missing_mask] - X[missing_mask]) ** 2).mean()
print("SoftImpute MSE: %f" % softImpute_mse)