Beispiel #1
0
    def save(self, fname: str):
        """
        Write the rankings held by this database to a new feather file.

        The file must not exist yet: an existing path is never overwritten.

        :param fname: The name of the file to create.
        :raises AssertionError: If something already exists at ``fname``.
        """
        already_there = os.path.exists(fname)
        assert not already_there

        # Work on a copy so that naming and resetting the index below does not
        # alter the dataframe kept by this instance.
        frame = self._df.copy()
        frame.index.name = INDEX_NAME
        # Index is not stored in feather format, so it is turned into a regular
        # column first. https://github.com/wesm/feather/issues/200
        frame.reset_index(inplace=True)
        write_dataframe(frame, fname)
Beispiel #2
0
    def invert(cls,
               db: Type[RankingDatabase],
               fname: str,
               top_n_identifiers: int = 50000) -> None:
        """
        Create an inverted whole genome rankings database keeping only the top n genes/regions for a feature.

        Inverted design: not storing the rankings for all regions in the dataframe but instead store the identifier of the
        top n genes/regions in the dataframe introduces an enormous reduction in disk and memory size.

        Two files are written: an identifiers file (one identifier per line, its name derived from ``fname``) and
        the inverted database itself at ``fname``. Every row of the inverted database holds, for one feature, the
        column positions of the identifiers with the lowest ranking values, best first. A position refers to the
        corresponding line of the identifiers file.

        :param db: The rankings database.
        :param fname: the filename of the inverted database to be created.
        :param top_n_identifiers: The number of genes to keep in the inverted database. When the database holds
            fewer identifiers than this, all of them are kept.
        :raises AssertionError: If the identifiers file derived from ``fname`` already exists.
        """

        df_original = db.load_full()
        n_features = len(df_original)

        index_fname = InvertedRankingDatabase._derive_identifiers_fname(fname)
        assert not os.path.exists(
            index_fname), "Database index {0:s} already exists.".format(
                index_fname)

        identifiers = df_original.columns.values
        # A feature cannot contribute more positions than there are identifiers.
        # Without this clamp the row assignment below fails with a shape mismatch
        # for small databases, leaving an orphaned identifiers file behind.
        n_top = min(top_n_identifiers, len(identifiers))

        with open(index_fname, 'w') as f:
            f.write('\n'.join(identifiers))

        inverted_data = np.empty(shape=(n_features, n_top),
                                 dtype=INVERTED_DB_DTYPE)
        # Sort the raw values instead of relabelling the columns of the loaded frame:
        # the argsort of a row directly yields column positions, and the frame stays
        # unmodified. NOTE(review): the frame returned by load_full() may be shared
        # with `db`, in which case relabelling it would corrupt the database - confirm.
        rankings = df_original.values
        # Row by row on purpose: an argsort of the full matrix would need a second
        # matrix of the same shape in memory.
        for idx in tqdm(range(n_features), total=n_features):
            inverted_data[idx, :] = np.argsort(rankings[idx])[:n_top]

        df = pd.DataFrame(data=inverted_data,
                          index=df_original.index,
                          columns=list(range(n_top)))

        df.index.name = INDEX_NAME
        # Index is not stored in feather format, so it is turned into a regular
        # column first. https://github.com/wesm/feather/issues/200
        df.reset_index(inplace=True)
        write_dataframe(df, fname)
Beispiel #3
0
def convert2feather(fname: str,
                    out_folder: str,
                    name: str,
                    extension: str = "feather") -> str:
    """
    Convert a whole genome rankings database to a feather format based database.

    The new file is placed in ``out_folder`` and keeps the base name of the
    legacy file, with its extension replaced by ``extension``.

    More information on this format can be found here:
    .. feather-format: https://blog.rstudio.com/2016/03/29/feather/

    :param fname: The filename of the legacy rankings database to convert.
    :param out_folder: The name of the folder to write the new database to.
    :param name: The name of the rankings database.
    :param extension: The extension of the new database file.
    :return: The filename of the new database.
    :raises AssertionError: If ``fname`` is not an existing file, ``out_folder`` is not a directory, or the
        new database file already exists.
    """
    input_is_file = os.path.isfile(fname)
    assert input_is_file, "{} does not exist.".format(fname)
    target_is_dir = os.path.isdir(out_folder)
    assert target_is_dir, "{} is not a directory.".format(out_folder)

    # Keep the base name of the legacy file, swap its extension.
    stem, _ = os.path.splitext(os.path.basename(fname))
    new_basename = "{}.{}".format(stem, extension)
    feather_fname = os.path.join(out_folder, new_basename)
    assert not os.path.exists(feather_fname), "{} already exists.".format(
        feather_fname)

    # Load original database into memory.
    # Caveat: the original storage format of whole genome rankings does not store the metadata, i.e. name and gene
    # nomenclature.
    # To avoid having to specify the nomenclature it is set as unknown.
    legacy_db = SQLiteRankingDatabase(fname=fname,
                                      name=name,
                                      nomenclature=UNKNOWN_NOMENCLATURE)
    rankings = legacy_db.load_full()
    rankings.index.name = INDEX_NAME
    # Index is not stored in feather format, so it is turned into a regular
    # column first. https://github.com/wesm/feather/issues/200
    rankings.reset_index(inplace=True)
    write_dataframe(rankings, feather_fname)
    return feather_fname