from typing import Tuple

import numpy as np
import numpy.typing as npt
from pandas import DataFrame

# stopcodon_correction and replace_inf are helpers defined elsewhere in the
# source module.


def get_log_enrichment(
    input_lib: DataFrame,
    output_lib: DataFrame,
    input_stopcodon: DataFrame,
    output_stopcodon: DataFrame,
    min_counts: int,
    stopcodon: bool,
    infinite: float,
) -> Tuple[npt.NDArray, npt.NDArray]:
    """
    Calculate log10 enrichment scores from input and output counts.
    """
    # Copy the data and replace low counts by np.nan.
    input_lib_np: npt.NDArray = np.copy(input_lib.astype(float))
    output_lib_np: npt.NDArray = np.copy(output_lib.astype(float))
    input_lib_np[input_lib_np < min_counts] = np.nan

    # Stop codon correction.
    if stopcodon:
        output_lib_np = stopcodon_correction(
            input_lib_np,
            output_lib_np,
            np.array(input_stopcodon),
            np.array(output_stopcodon),
        )

    # log10 of the library ratio, then replace infinite values. The division
    # can hit zero, so the expected divide-by-zero warning is suppressed.
    with np.errstate(divide='ignore'):
        counts_log10_ratio: npt.NDArray = replace_inf(
            np.log10(output_lib_np / input_lib_np), infinite
        )
    return counts_log10_ratio, output_lib_np
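# A minimal usage sketch. Passing stopcodon=False sidesteps
# stopcodon_correction, but replace_inf is still required; the stand-in below,
# which clips +/-inf to the given sentinel, is an assumption for illustration
# only, not the module's real implementation.
import numpy as np
import pandas as pd


def replace_inf(arr: np.ndarray, infinite: float) -> np.ndarray:  # hypothetical stand-in
    arr = np.asarray(arr, dtype=float)
    arr[arr == np.inf] = infinite
    arr[arr == -np.inf] = -infinite
    return arr


input_counts = pd.DataFrame({"A": [100.0, 5.0], "C": [200.0, 300.0]})
output_counts = pd.DataFrame({"A": [50.0, 10.0], "C": [0.0, 600.0]})
ratio, corrected = get_log_enrichment(
    input_counts, output_counts, pd.DataFrame(), pd.DataFrame(),
    min_counts=25, stopcodon=False, infinite=4.0,
)
print(ratio)  # counts below min_counts give NaN; log10(0/200) becomes -4.0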
from pandas import DataFrame


def writeTrainTestData(name: str, train: DataFrame, test: DataFrame):
    # Mode 'w' creates the file if it is missing and truncates it otherwise,
    # so there is no need to probe with os.path.isfile and fall back to 'x'.
    train = train.astype({'102': int})
    with open(f'{name}.data', 'w') as train_file:
        train.to_csv(train_file, header=False, index=False)

    test = test.astype({'102': int})
    with open(f'{name}.test', 'w') as test_file:
        test.to_csv(test_file, header=False, index=False)
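# Hypothetical usage sketch: the '102' key suggests a label column named by
# its position; the tiny frame below is invented purely to exercise the writer.
import pandas as pd

df = pd.DataFrame({'0': [0.1, 0.2, 0.3, 0.4], '102': [1.0, 0.0, 1.0, 0.0]})
writeTrainTestData('example', df.iloc[:3], df.iloc[3:])  # writes example.data and example.test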
from numpy import array, double, int64
from pandas import DataFrame

# isochrone_cpp is the compiled extension module providing the Isochrone class.


def isochrone(network: DataFrame,
              start_vertices: array,
              distance_limits: array,
              only_minimum_cover=True) -> array:
    """
    Calculate the isochrone of a network.

    Parameters
    ----------
    network : Pandas DataFrame
        The network graph to be used.
    start_vertices : array(int64)
        The starting vertices.
    distance_limits : array(double)
        The distance limits.
    only_minimum_cover : bool (optional, default: True)
        If True, only the minimum cover is returned.

    Returns
    -------
    isochrone_gdp : GeoDataFrame
        The isochrone paths.
    """
    # astype returns a new DataFrame rather than modifying in place, so the
    # result must be assigned back to network.
    network = network.astype({
        "id": int64,
        "source": int64,
        "target": int64,
        "cost": double,
        "reverse_cost": double,
        "length": double,
    })
    start_vertices = array(start_vertices).astype(int64)
    distance_limits = array(distance_limits).astype(double)
    isochroneclass = isochrone_cpp.Isochrone()
    # TODO: Find a way to bypass the type conversion.
    result = isochroneclass.calculate(
        network["id"],
        network["source"],
        network["target"],
        network["cost"],
        network["reverse_cost"],
        network["length"],
        network["geom"],
        start_vertices,
        distance_limits,
        only_minimum_cover,
    )
    return result
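# A sketch of the edge list isochrone() expects, assuming a pgRouting-style
# schema; the toy two-edge network and the "geom" placeholders are invented,
# since the geometry type consumed by isochrone_cpp is not shown here.
import pandas as pd

network = pd.DataFrame({
    "id": [1, 2],
    "source": [1, 2],
    "target": [2, 3],
    "cost": [1.0, 2.0],
    "reverse_cost": [1.0, 2.0],
    "length": [10.0, 20.0],
    "geom": [None, None],  # placeholder geometries
})
# result = isochrone(network, start_vertices=[1], distance_limits=[15.0])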
def test_read_write_dta12(self):
    original = DataFrame([(1, 2, 3, 4, 5, 6)], columns=[
        'astringwithmorethan32characters_1',
        'astringwithmorethan32characters_2',
        '+',
        '-',
        'short',
        'delete',
    ])
    formatted = DataFrame([(1, 2, 3, 4, 5, 6)], columns=[
        'astringwithmorethan32characters_',
        '_0astringwithmorethan32character',
        '_',
        '_1_',
        '_short',
        '_delete',
    ])
    formatted.index.name = 'index'
    formatted = formatted.astype(np.int32)

    with tm.ensure_clean() as path:
        with warnings.catch_warnings(record=True) as w:
            original.to_stata(path, None)
            tm.assert_equal(len(w), 1)  # should get a warning for that format.

        written_and_read_again = self.read_dta(path)
        tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
def test_read_write_dta11(self):
    # skip_if_not_little_endian()
    original = DataFrame([(1, 2, 3, 4)], columns=[
        'good',
        compat.u('b\u00E4d'),
        '8number',
        'astringwithmorethan32characters______',
    ])
    formatted = DataFrame([(1, 2, 3, 4)], columns=[
        'good',
        'b_d',
        '_8number',
        'astringwithmorethan32characters_',
    ])
    formatted.index.name = 'index'
    formatted = formatted.astype(np.int32)

    with tm.ensure_clean() as path:
        with warnings.catch_warnings(record=True) as w:
            original.to_stata(path, None, False)
            np.testing.assert_equal(len(w), 1)  # should get a warning for that format.

        written_and_read_again = self.read_dta(path)
        tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
def get_dataset(data):
    dataset = DataFrame(data)
    # dataframe = pd.read_csv(pathname, usecols=[1], engine='python', skipfooter=3)
    # dataset = dataframe.values
    # Convert int to float.
    dataset = dataset.astype('float32')
    # Normalize.
    dataset = __scaler.fit_transform(dataset)
    train_size = int(len(dataset) * 0.65)
    train_list = dataset[:train_size]
    test_list = dataset[train_size:]
    # The training data is scarce, so look_back must not be too large.
    # Candidate values: 3, 5, 7, 9.
    look_back = 7
    train_x, train_y = create_dataset(train_list, look_back)
    test_x, test_y = create_dataset(test_list, look_back)
    train_x = numpy.reshape(train_x, (train_x.shape[0], train_x.shape[1], 1))
    test_x = numpy.reshape(test_x, (test_x.shape[0], test_x.shape[1], 1))
    return dataset, train_x, train_y, test_x, test_y
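# create_dataset is not shown above; a common implementation of this
# LSTM-style windowing (an assumption, not necessarily the author's exact
# helper) slides a look_back-sized window over the series and pairs each
# window with the value that immediately follows it:
import numpy


def create_dataset(dataset, look_back=1):
    # dataset is a 2-D array (samples, 1), as produced by the scaler above.
    data_x, data_y = [], []
    for i in range(len(dataset) - look_back):
        data_x.append(dataset[i:i + look_back, 0])  # input window
        data_y.append(dataset[i + look_back, 0])    # next value as target
    return numpy.array(data_x), numpy.array(data_y)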
def _finalize_output(self, frame: DataFrame) -> DataFrame:
    """
    Processes data read in based on kwargs.

    Parameters
    ----------
    frame : DataFrame
        The DataFrame to process.

    Returns
    -------
    DataFrame
        The processed DataFrame.
    """
    num_cols = len(frame.columns)
    multi_index_named = True
    if self.header is None:
        if self.names is None:
            if self.prefix is not None:
                self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
            elif self.header is None:
                self.names = range(num_cols)
        if len(self.names) != num_cols:
            # usecols is passed through to pyarrow, we only handle index col here
            # The only way self.names is not the same length as number of cols is
            # if we have int index_col. We should just pad the names(they will get
            # removed anyways) to expected length then.
            self.names = list(range(num_cols - len(self.names))) + self.names
            multi_index_named = False
        frame.columns = self.names
    # we only need the frame not the names
    # error: Incompatible types in assignment (expression has type
    # "Union[List[Union[Union[str, int, float, bool], Union[Period, Timestamp,
    # Timedelta, Any]]], Index]", variable has type "Index")  [assignment]
    frame.columns, frame = self._do_date_conversions(  # type: ignore[assignment]
        frame.columns, frame
    )
    if self.index_col is not None:
        for i, item in enumerate(self.index_col):
            if is_integer(item):
                self.index_col[i] = frame.columns[item]
            else:
                # String case
                if item not in frame.columns:
                    raise ValueError(f"Index {item} invalid")
        frame.set_index(self.index_col, drop=True, inplace=True)
        # Clear names if headerless and no name given
        if self.header is None and not multi_index_named:
            frame.index.names = [None] * len(frame.index.names)

    if self.kwds.get("dtype") is not None:
        try:
            frame = frame.astype(self.kwds.get("dtype"))
        except TypeError as e:
            # GH#44901 reraise to keep api consistent
            raise ValueError(e)
    return frame
def _preprocess_temporal_columns(df: DataFrame) -> DataFrame:
    """Pre-process the columns with temporal dtype to convert from numpy
    datetime objects to pandas datetime objects.

    Casting the dtype of the columns to object type results in columns of
    dtype "object" with the contents of the columns being pandas datetime
    objects, rather than numpy datetime objects.

    Args:
        df: A DataFrame with temporal columns with numpy datetime dtypes.

    Returns:
        A DataFrame without numpy datetime dtypes. The content of the
        columns with temporal dtypes are accessible as pandas datetime
        objects.
    """
    for col in df.select_dtypes(include=["datetime64[ns, UTC]"]):
        df = df.astype({col: "O"})
    for col in df.select_dtypes(include="timedelta64[ns]"):
        df = df.astype({col: "O"})
    return df
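# Quick demonstration of the cast: after the conversion both columns report
# dtype "object" while each element remains a pandas Timestamp/Timedelta.
import pandas as pd

df = pd.DataFrame({
    "when": pd.to_datetime(["2021-01-01", "2021-01-02"], utc=True),
    "span": pd.to_timedelta(["1 day", "2 days"]),
})
out = _preprocess_temporal_columns(df)
print(out.dtypes)            # "when" and "span" are now dtype object
print(type(out["when"][0]))  # <class 'pandas._libs.tslibs.timestamps.Timestamp'>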
def _finalize_output(self, frame: DataFrame) -> DataFrame:
    """
    Processes data read in based on kwargs.

    Parameters
    ----------
    frame : DataFrame
        The DataFrame to process.

    Returns
    -------
    DataFrame
        The processed DataFrame.
    """
    num_cols = len(frame.columns)
    multi_index_named = True
    if self.header is None:
        if self.names is None:
            if self.prefix is not None:
                self.names = [f"{self.prefix}{i}" for i in range(num_cols)]
            elif self.header is None:
                self.names = range(num_cols)
        if len(self.names) != num_cols:
            # usecols is passed through to pyarrow, we only handle index col here
            # The only way self.names is not the same length as number of cols is
            # if we have int index_col. We should just pad the names(they will get
            # removed anyways) to expected length then.
            self.names = list(range(num_cols - len(self.names))) + self.names
            multi_index_named = False
        frame.columns = self.names
    # we only need the frame not the names
    frame.columns, frame = self._do_date_conversions(frame.columns, frame)
    if self.index_col is not None:
        for i, item in enumerate(self.index_col):
            if is_integer(item):
                self.index_col[i] = frame.columns[item]
            else:
                # String case
                if item not in frame.columns:
                    raise ValueError(f"Index {item} invalid")
        frame.set_index(self.index_col, drop=True, inplace=True)
        # Clear names if headerless and no name given
        if self.header is None and not multi_index_named:
            frame.index.names = [None] * len(frame.index.names)

    if self.kwds.get("dtype") is not None:
        frame = frame.astype(self.kwds.get("dtype"))
    return frame
def test_read_write_dta11(self):
    original = DataFrame([(1, 2, 3, 4)], columns=[
        'good',
        compat.u('b\u00E4d'),
        '8number',
        'astringwithmorethan32characters______',
    ])
    formatted = DataFrame([(1, 2, 3, 4)], columns=[
        'good',
        'b_d',
        '_8number',
        'astringwithmorethan32characters_',
    ])
    formatted.index.name = 'index'
    formatted = formatted.astype(np.int32)

    with tm.ensure_clean() as path:
        with warnings.catch_warnings(record=True) as w:
            original.to_stata(path, None)
            # should get a warning for that format.
            tm.assert_equal(len(w), 1)

        written_and_read_again = self.read_dta(path)
        tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted)
from __future__ import annotations

from typing import Any, Generator, Iterable, Optional, Union, cast

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# DataBackend, PandasIndex, DataToken, DataType, Index, IndexAlias and the
# indexer classes come from the surrounding tanuki package.


class PandasBackend(DataBackend):
    _data: DataFrame
    _index: PandasIndex
    _loc: _LocIndexer
    _iloc: _ILocIndexer

    def __init__(
        self,
        data: Optional[Union[Series, DataFrame, dict[str, list]]] = None,
        index: Optional[PandasIndex] = None,
    ) -> None:
        if data is None:
            self._data = DataFrame(dtype="object")
        elif type(data) is Series:
            self._data = cast(Series, data).to_frame().transpose()
        elif type(data) is DataFrame:
            self._data = DataFrame(data)
        elif type(data) is dict:
            sample_value = next(iter(data.values()))
            if not isinstance(sample_value, Iterable) or isinstance(sample_value, str):
                self._data = Series(data).to_frame().transpose()
            else:
                self._data = DataFrame(data)
        else:
            raise ValueError(f"Received unexpected value type {type(data)}: {data}")
        if index is None:
            self._data.index.name = "index"
            self._index = PandasIndex(self._data.index, [])
        else:
            if not isinstance(index, PandasIndex):
                index = PandasIndex(index)
            self._data.index = index._data
            self._index = index
        self._loc = _LocIndexer(self)
        self._iloc = _ILocIndexer(self)

    def is_link(self) -> bool:
        return False

    def link_token(self) -> Optional[DataToken]:
        return None

    def to_pandas(self) -> DataFrame:
        return self._data

    @property
    def columns(self) -> list[str]:
        return self._data.columns.tolist()

    @property
    def values(self) -> np.ndarray:
        data_values = self._data.values
        shape = data_values.shape
        if shape[1] == 1:
            return np.squeeze(data_values, axis=1)
        elif shape[0] == 1:
            return np.squeeze(data_values, axis=0)
        else:
            return data_values

    @property
    def dtypes(self) -> dict[str, DataType]:
        return {col: DataType(dtype) for col, dtype in self._data.dtypes.items()}

    def cast_columns(self, column_dtypes: dict[str, type]) -> PandasBackend:
        return PandasBackend(self._data.astype(column_dtypes, errors="ignore"))

    def to_dict(self) -> dict[str, Any]:
        return self._data.to_dict("list")

    @property
    def index(self) -> Index:
        return self._index

    @property
    def index_name(self) -> Union[str, list[str]]:
        return self._data.index.name

    @property
    def loc(self: PandasBackend) -> LocIndexer[PandasBackend]:
        return self._loc

    @property
    def iloc(self: PandasBackend) -> ILocIndexer[PandasBackend]:
        return self._iloc

    def equals(self, other: PandasBackend) -> bool:
        if type(other) is not PandasBackend:
            return False
        return np.array_equal(self._data.values, other._data.values) and self._index.equals(other._index)

    def __eq__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data == other

    def __ne__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data != other

    def __gt__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data > other

    def __ge__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data >= other

    def __lt__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data < other

    def __le__(self, other: Any) -> DataFrame:
        if issubclass(type(other), PandasBackend):
            other = other._data
        return self._data <= other

    def __len__(self) -> int:
        return len(self._data)

    def __iter__(self) -> Generator[str, None, None]:
        return iter(self._data)

    def iterrows(self) -> Generator[tuple[int, PandasBackend], None, None]:
        for i, row in self._data.iterrows():
            yield (i, PandasBackend(row.to_frame().transpose()))

    def itertuples(self, ignore_index: bool = False):
        for values in self._data.itertuples(index=not ignore_index):
            yield values
    def __getitem__(self, item: str) -> Any:
        return PandasBackend(self._data[item].to_frame())

    def getitems(self, items: list[str]) -> PandasBackend:
        return PandasBackend(self._data[items])

    def getmask(self, mask: list[bool]) -> PandasBackend:
        return PandasBackend(self._data[mask])

    def query(self, query: "Query") -> PandasBackend:
        from tanuki.database.adapter.query.pandas_query_compiler import PandasQueryCompiler

        query_compiler = PandasQueryCompiler(self._data)
        query = query_compiler.compile(query)
        return PandasBackend(self._data[query])

    def __setitem__(self, items: str, value: Any) -> None:
        if isinstance(value, PandasBackend):
            value = value._data
        self._data[items] = value

    def get_index(self, index_alias: IndexAlias) -> Index:
        cols = [str(col) for col in index_alias.columns]
        new_data = self._data.set_index(cols)
        new_data.index.name = index_alias.name
        return PandasIndex(new_data.index, cols)

    def set_index(self, index: Union[Index, IndexAlias]) -> PandasBackend:
        cols = [str(col) for col in index.columns]
        new_data = self._data.set_index(cols)
        new_data.index.name = index.name
        new_index = PandasIndex(new_data.index, cols)
        return PandasBackend(new_data, new_index)

    def reset_index(self: PandasBackend) -> PandasBackend:
        new_data = self._data.reset_index(drop=True)
        new_data.index.name = "index"
        new_index = PandasIndex(new_data.index, [])
        return PandasBackend(new_data, new_index)

    def append(
        self: PandasBackend,
        new_backend: PandasBackend,
        ignore_index: bool = False,
    ) -> PandasBackend:
        return PandasBackend(
            self._data.append(new_backend._data, ignore_index=ignore_index)
        )

    def drop_indices(self: PandasBackend, indices: list[int]) -> PandasBackend:
        return PandasBackend(self._data.drop(indices))

    @classmethod
    def concat(
        cls: type[PandasBackend],
        all_backends: list[PandasBackend],
        ignore_index: bool = False,
    ) -> PandasBackend:
        all_data = [backend._data for backend in all_backends]
        return PandasBackend(pd.concat(all_data, ignore_index=ignore_index))

    def nunique(self) -> int:
        return self._data.nunique()

    def __str__(self) -> str:
        return str(self._data)

    def __repr__(self) -> str:
        return str(self)
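# A small usage sketch for PandasBackend, assuming the tanuki imports above
# resolve as in the original package. cast_columns delegates to
# DataFrame.astype with errors="ignore", so columns that cannot be cast are
# returned unchanged rather than raising.
backend = PandasBackend({"a": [1, 2, 3], "b": ["x", "y", "z"]})
casted = backend.cast_columns({"a": float})
print(casted.dtypes)                  # "a" is now a float dtype
print(len(backend), backend.columns)  # 3 ['a', 'b']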
from collections import defaultdict
from collections.abc import Hashable

import numpy as np
from pandas import DataFrame


def from_dummies(
    data: DataFrame,
    sep: None | str = None,
    default_category: None | Hashable | dict[str, Hashable] = None,
) -> DataFrame:
    """
    Create a categorical ``DataFrame`` from a ``DataFrame`` of dummy variables.

    Inverts the operation performed by :func:`~pandas.get_dummies`.

    .. versionadded:: 1.5.0

    Parameters
    ----------
    data : DataFrame
        Data which contains dummy-coded variables in form of integer columns of
        1's and 0's.
    sep : str, default None
        Separator used in the column names of the dummy categories; it is the
        character indicating the separation of the categorical names from the
        prefixes. For example, if your column names are 'prefix_A' and
        'prefix_B', you can strip the underscore by specifying sep='_'.
    default_category : None, Hashable or dict of Hashables, default None
        The default category is the implied category when a value has none of
        the listed categories specified with a one, i.e. if all dummies in a
        row are zero. Can be a single value for all variables or a dict
        directly mapping the default categories to a prefix of a variable.

    Returns
    -------
    DataFrame
        Categorical data decoded from the dummy input-data.

    Raises
    ------
    ValueError
        * When the input ``DataFrame`` ``data`` contains NA values.
        * When the input ``DataFrame`` ``data`` contains column names with
          separators that do not match the separator specified with ``sep``.
        * When a ``dict`` passed to ``default_category`` does not include an
          implied category for each prefix.
        * When a value in ``data`` has more than one category assigned to it.
        * When ``default_category=None`` and a value in ``data`` has no
          category assigned to it.
    TypeError
        * When the input ``data`` is not of type ``DataFrame``.
        * When the input ``DataFrame`` ``data`` contains non-dummy data.
        * When the passed ``sep`` is of a wrong data type.
        * When the passed ``default_category`` is of a wrong data type.

    See Also
    --------
    :func:`~pandas.get_dummies` : Convert ``Series`` or ``DataFrame`` to dummy codes.
    :class:`~pandas.Categorical` : Represent a categorical variable in classic R / S-plus fashion.

    Notes
    -----
    The columns of the passed dummy data should only include 1's and 0's, or
    boolean values.

    Examples
    --------
    >>> df = pd.DataFrame({"a": [1, 0, 0, 1], "b": [0, 1, 0, 0],
    ...                    "c": [0, 0, 1, 0]})

    >>> df
       a  b  c
    0  1  0  0
    1  0  1  0
    2  0  0  1
    3  1  0  0

    >>> pd.from_dummies(df)
    0    a
    1    b
    2    c
    3    a

    >>> df = pd.DataFrame({"col1_a": [1, 0, 1], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 1]})

    >>> df
       col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       1       0       0       0       1

    >>> pd.from_dummies(df, sep="_")
      col1 col2
    0    a    b
    1    b    a
    2    a    c

    >>> df = pd.DataFrame({"col1_a": [1, 0, 0], "col1_b": [0, 1, 0],
    ...                    "col2_a": [0, 1, 0], "col2_b": [1, 0, 0],
    ...                    "col2_c": [0, 0, 0]})

    >>> df
       col1_a  col1_b  col2_a  col2_b  col2_c
    0       1       0       0       1       0
    1       0       1       1       0       0
    2       0       0       0       0       0

    >>> pd.from_dummies(df, sep="_", default_category={"col1": "d", "col2": "e"})
      col1 col2
    0    a    b
    1    b    a
    2    d    e
    """
    from pandas.core.reshape.concat import concat

    if not isinstance(data, DataFrame):
        raise TypeError(
            "Expected 'data' to be a 'DataFrame'; "
            f"Received 'data' of type: {type(data).__name__}"
        )

    if data.isna().any().any():
        raise ValueError(
            "Dummy DataFrame contains NA value in column: "
            f"'{data.isna().any().idxmax()}'"
        )

    # index data with a list of all columns that are dummies
    try:
        data_to_decode = data.astype("boolean", copy=False)
    except TypeError:
        raise TypeError("Passed DataFrame contains non-dummy data")

    # collect prefixes and get lists to slice data for each prefix
    variables_slice = defaultdict(list)
    if sep is None:
        variables_slice[""] = list(data.columns)
    elif isinstance(sep, str):
        for col in data_to_decode.columns:
            prefix = col.split(sep)[0]
            if len(prefix) == len(col):
                raise ValueError(f"Separator not specified for column: {col}")
            variables_slice[prefix].append(col)
    else:
        raise TypeError(
            "Expected 'sep' to be of type 'str' or 'None'; "
            f"Received 'sep' of type: {type(sep).__name__}"
        )

    if default_category is not None:
        if isinstance(default_category, dict):
            if not len(default_category) == len(variables_slice):
                len_msg = (
                    f"Length of 'default_category' ({len(default_category)}) "
                    f"did not match the length of the columns being encoded "
                    f"({len(variables_slice)})"
                )
                raise ValueError(len_msg)
        elif isinstance(default_category, Hashable):
            default_category = dict(
                zip(variables_slice, [default_category] * len(variables_slice))
            )
        else:
            raise TypeError(
                "Expected 'default_category' to be of type "
                "'None', 'Hashable', or 'dict'; "
                "Received 'default_category' of type: "
                f"{type(default_category).__name__}"
            )

    cat_data = {}
    for prefix, prefix_slice in variables_slice.items():
        if sep is None:
            cats = prefix_slice.copy()
        else:
            cats = [col[len(prefix + sep):] for col in prefix_slice]
        assigned = data_to_decode.loc[:, prefix_slice].sum(axis=1)
        if any(assigned > 1):
            raise ValueError(
                "Dummy DataFrame contains multi-assignment(s); "
                f"First instance in row: {assigned.idxmax()}"
            )
        elif any(assigned == 0):
            if isinstance(default_category, dict):
                cats.append(default_category[prefix])
            else:
                raise ValueError(
                    "Dummy DataFrame contains unassigned value(s); "
                    f"First instance in row: {assigned.idxmin()}"
                )
            data_slice = concat(
                (data_to_decode.loc[:, prefix_slice], assigned == 0), axis=1
            )
        else:
            data_slice = data_to_decode.loc[:, prefix_slice]
        cats_array = np.array(cats, dtype="object")
        # get indices of True entries along axis=1
        cat_data[prefix] = cats_array[data_slice.to_numpy().nonzero()[1]]

    return DataFrame(cat_data)