def save(self, fname: str):
    """
    Write this database to disk as a feather file.

    :param fname: The name of the file to create. It must not exist yet.
    """
    target_taken = os.path.exists(fname)
    assert not target_taken

    # Work on a copy so that naming and resetting the index leaves self._df untouched.
    frame = self._df.copy()
    frame.index.name = INDEX_NAME
    # Index is not stored in feather format (https://github.com/wesm/feather/issues/200),
    # so it is turned into a regular column before writing.
    frame = frame.reset_index()
    write_dataframe(frame, fname)
def invert(cls, db: Type[RankingDatabase], fname: str, top_n_identifiers: int = 50000) -> None:
    """
    Create an inverted whole genome rankings database keeping only the top n genes/regions for a feature.

    Inverted design: instead of storing the rankings of all regions in the dataframe, only the identifiers
    of the top n genes/regions are stored for each feature. This introduces an enormous reduction in disk
    and memory size.

    Two files are written: the inverted database itself (`fname`) and an identifiers index file whose name
    is derived from `fname`. The index file lists all identifiers, one per line; the inverted database
    stores positions into that list.

    :param db: The rankings database.
    :param fname: The filename of the inverted database to be created.
    :param top_n_identifiers: The number of genes/regions to keep per feature in the inverted database.
        If the database holds fewer identifiers than this, all of them are kept.
    :raises AssertionError: If the identifiers index file derived from `fname` already exists.
    """
    df_original = db.load_full()
    n_features = len(df_original)

    index_fname = InvertedRankingDatabase._derive_identifiers_fname(fname)
    assert not os.path.exists(index_fname), "Database index {0:s} already exists.".format(index_fname)

    identifiers = df_original.columns.values
    # A row can never yield more top identifiers than there are columns. Without this cap the
    # sorted row would be shorter than the preallocated buffer row and the assignment in the
    # loop below would fail with a shape mismatch.
    n_top = min(top_n_identifiers, len(identifiers))

    with open(index_fname, 'w') as f:
        f.write('\n'.join(identifiers))

    identifier2idx = {identifier: idx for idx, identifier in enumerate(identifiers)}
    inverted_data = np.empty(shape=(n_features, n_top), dtype=INVERTED_DB_DTYPE)

    # Replace the column labels by their position in the identifiers index file, so that the
    # index of a sorted row directly gives the values to store.
    # NOTE: this relabels the columns of the frame returned by db.load_full() in place.
    df_original.columns = [identifier2idx[identifier] for identifier in df_original.columns]

    for idx, (_, row) in tqdm(enumerate(df_original.iterrows()), total=n_features):
        # Ascending sort puts the smallest ranking values first; keep the first n_top of them.
        inverted_data[idx, :] = np.array(
            row.sort_values(ascending=True).head(n_top).index,
            dtype=INVERTED_DB_DTYPE)

    df = pd.DataFrame(data=inverted_data, index=df_original.index, columns=list(range(n_top)))
    df.index.name = INDEX_NAME
    # Index is not stored in feather format. https://github.com/wesm/feather/issues/200
    df.reset_index(inplace=True)
    write_dataframe(df, fname)
def convert2feather(fname: str, out_folder: str, name: str, extension: str = "feather") -> str:
    """
    Convert a whole genome rankings database to a feather format based database.

    The new file is placed in `out_folder` and named after the base name of `fname`, with its
    extension replaced by `extension`.

    More information on this format can be found here:

    .. feather-format: https://blog.rstudio.com/2016/03/29/feather/

    :param fname: The filename of the legacy database to convert.
    :param out_folder: The name of the folder to write the new database to.
    :param name: The name of the rankings database.
    :param extension: The extension of the new database file.
    :return: The filename of the new database.
    """
    assert os.path.isfile(fname), "{} does not exist.".format(fname)
    assert os.path.isdir(out_folder), "{} is not a directory.".format(out_folder)

    stem = os.path.splitext(os.path.basename(fname))[0]
    feather_fname = os.path.join(out_folder, "{}.{}".format(stem, extension))
    assert not os.path.exists(feather_fname), "{} already exists.".format(feather_fname)

    # Load original database into memory.
    # Caveat: the original storage format of whole genome rankings does not store the metadata,
    # i.e. name and gene nomenclature.
    # To avoid having to specify the nomenclature, it is set as unknown.
    legacy_db = SQLiteRankingDatabase(fname=fname, name=name, nomenclature=UNKNOWN_NOMENCLATURE)
    rankings = legacy_db.load_full()

    rankings.index.name = INDEX_NAME
    # Index is not stored in feather format (https://github.com/wesm/feather/issues/200),
    # so it is turned into a regular column before writing.
    rankings = rankings.reset_index()
    write_dataframe(rankings, feather_fname)

    return feather_fname