def test_float64_index_difference():
    """Check ``Index.difference`` between a float index and a string index.

    See https://github.com/pandas-dev/pandas/issues/35217. The test expects
    each difference to equal its left-hand operand, in both directions.
    """
    # https://github.com/pandas-dev/pandas/issues/35217
    floats = Index([1.0, 2, 3])
    strings = Index(["1", "2", "3"])

    tm.assert_index_equal(floats.difference(strings), floats)
    tm.assert_index_equal(strings.difference(floats), strings)
def _get_common_indices(index_1: pd.Index, index_2: pd.Index):
    """Get the intersection of two indices and the labels unique to the first.

    When ``index_1`` contains labels that are absent from ``index_2``, a
    warning listing those labels is logged.

    Args:
        index_1: Index whose labels are checked against ``index_2``.
        index_2: Index to intersect with.

    Returns:
        A tuple ``(intersection, diff_to_1)``: the labels present in both
        indices, and the labels of ``index_1`` missing from the intersection.
    """
    _intersection = index_1.intersection(index_2)
    _diff_to_1 = index_1.difference(_intersection)
    if not _diff_to_1.empty:
        # Labels are not guaranteed to be strings (e.g. integer IDs), and
        # str.join raises TypeError on non-str items, so stringify them first.
        logger.warning(
            "Sample with clinical features not in demographics: {}".format(
                ", ".join(str(label) for label in _diff_to_1)))
    return _intersection, _diff_to_1
def append_new_entrants(self, existing_data: pd.Series, new_index: pd.Index,
                        getter: Callable):
    """Combine already-known rows with freshly computed rows for new labels.

    Args:
        existing_data: Series holding values for previously seen labels.
        new_index: Labels that the result should cover.
        getter: Callable invoked with the index of new entrants; its return
            value (a Series, or a list/tuple of Series) is appended to the
            rows taken from ``existing_data``.

    Returns:
        The rows of ``existing_data`` whose labels are in ``new_index``,
        followed by whatever ``getter`` returned for the new entrants.
    """
    intersection = existing_data.loc[new_index.intersection(
        existing_data.index)]
    # NOTE(review): new entrants are determined against
    # ``self.stratification_groups`` rather than ``existing_data`` -- confirm
    # the two are expected to share the same index.
    new_entrants_index = new_index.difference(
        self.stratification_groups.index)
    new_entrants_stratifications = getter(new_entrants_index)

    # Series.append was removed in pandas 2.0; pd.concat is the replacement.
    # Like Series.append, accept either a single object or a list/tuple.
    if isinstance(new_entrants_stratifications, (list, tuple)):
        pieces = [intersection, *new_entrants_stratifications]
    else:
        pieces = [intersection, new_entrants_stratifications]
    return pd.concat(pieces)
def _compare_labels(axis: str, actual: pd.Index, expected: pd.Index):
    """Raise a ``ValueError`` describing how ``actual`` differs from ``expected``.

    The message reports a length mismatch, the labels present only in
    ``expected`` and the labels present only in ``actual``. The function
    raises unconditionally, so callers presumably invoke it only after
    detecting a mismatch (TODO confirm against the caller).

    Args:
        axis: Name of the axis being compared, used in the error message.
        actual: Labels found on the data frame.
        expected: Labels the data frame should have.

    Raises:
        ValueError: Always.
    """
    # ``df_name`` comes from the enclosing scope.
    headline = f"{df_name} data frame does not have expected {axis}"
    only_expected = expected.difference(actual)
    only_actual = actual.difference(expected)

    details = []
    if len(actual) != len(expected):
        details.append(
            f"expected {len(expected)} columns but got {len(actual)}")
    if len(only_expected) > 0:
        details.append("missing columns: " + ", ".join(map(str, only_expected)))
    if len(only_actual) > 0:
        details.append("extra columns: " + ", ".join(map(str, only_actual)))

    raise ValueError(f"{headline} ({'; '.join(details)})")
def splitter(index: pd.Index, y, *args, **kwargs) -> Tuple[pd.Index, pd.Index]:
    """Split ``index`` into two parts based on the label combinations in ``y``.

    The rows of ``y`` are grouped by all of its columns and the index labels
    of each group are collected into a list. ``choose`` is then applied to
    every group under ``temp_seed(seed)``, and the first element of each
    result is passed to ``concat_indices`` to build the second part.
    (``choose``, ``concat_indices``, ``temp_seed`` and ``seed`` come from the
    surrounding scope.)

    Args:
        index: Full index to split.
        y: Data frame of labels; its index supplies the candidate labels.
        *args: Ignored.
        **kwargs: Ignored.

    Returns:
        A tuple ``(index.difference(test_idx), test_idx)``.
    """
    labelled = y.copy()
    # we want tuples in case of multi index
    labelled["index"] = labelled.index.to_list()

    class_columns = y.columns.to_list()
    indices_per_class = labelled.groupby(class_columns).agg(
        lambda members: list(members))

    with temp_seed(seed):
        chosen = indices_per_class.apply(
            choose, axis=1, result_type='reduce')
        test_idx = concat_indices([picked[0] for picked in chosen.to_list()])

    return index.difference(test_idx), test_idx
def get_oligo_non_oligos(l1_ix: pd.Index,
                         l2_ix: pd.Index) -> Tuple[pd.Index, pd.Index]:
    """
    Splits subjects into oligo-ns and non-oligo-ns using the given indices.

    Subjects that appear in the level 1 index but not in the level 2 index
    are the oligo-ns; the level 2 index is returned unchanged as the
    non-oligo-ns.

    Args:
        l1_ix: The level 1 NMF subject IDs.
        l2_ix: The level 2 NMF subject IDs.

    Returns:
        The oligo-n subject IDs and non-oligo-n subject IDs.
    """
    return l1_ix.difference(l2_ix), l2_ix
def _make_indexer(self, self_indexer: Index, other_indexer: Index):
    """Build the positional indexer mapping ``self_indexer`` onto ``other_indexer``.

    Sets ``self.flat_indexer`` and ``self.missing_indices``; when
    ``self.aggregation_required`` is true, ``self.other_grouper`` is set as
    well to the factorized codes of ``other_indexer``.
    """
    if self.aggregation_required:
        # Look up against the unique labels of ``other_indexer``; the codes
        # record which unique label each original row belongs to.
        codes, uniques = other_indexer.factorize()
        self.other_grouper = codes
        located, absent = uniques.get_indexer_non_unique(self_indexer)
        self.flat_indexer, self.missing_indices = located, absent
        return

    # Performance-tuned fast paths for constructing indexers
    if self_indexer.equals(other_indexer):
        # Indexers are identical
        self.missing_indices = np.array([], dtype=int)
        self.flat_indexer = np.arange(len(other_indexer))
        return

    # Taking the difference is faster than `all(.isin())`
    nothing_missing = len(self_indexer.difference(other_indexer)) == 0
    if nothing_missing:
        # No missing values
        self.missing_indices = np.array([], dtype=int)
        self.flat_indexer = other_indexer.get_indexer(self_indexer)
        return

    # All other cases
    located, absent = other_indexer.get_indexer_non_unique(self_indexer)
    self.flat_indexer, self.missing_indices = located, absent
def parse_csv(csv_file_stream, categories: pd.Index) -> pd.DataFrame:
    """Read and validate a CSV of per-image category probabilities.

    Args:
        csv_file_stream: File-like object or path accepted by ``pd.read_csv``.
        categories: The exact set of category columns the CSV must contain.

    Returns:
        A data frame indexed by the ``image`` column, with one float64 column
        per category, ordered as in ``categories``.

    Raises:
        ScoreException: If the ``image`` column is missing, category columns
            are missing or unexpected, any value is missing or not
            floating-point, or any value lies outside ``[0.0, 1.0]``.
        ValueError: If the ``image`` column contains duplicates (raised by
            ``set_index`` with ``verify_integrity=True``).
    """
    probabilities = pd.read_csv(csv_file_stream, header=0)

    if 'image' not in probabilities.columns:
        raise ScoreException('Missing column in CSV: "image".')

    probabilities = probabilities.set_index(
        'image', drop=True, verify_integrity=True)

    missing_columns = categories.difference(probabilities.columns)
    if not missing_columns.empty:
        raise ScoreException(f'Missing columns in CSV: {list(missing_columns)}.')

    extra_columns = probabilities.columns.difference(categories)
    if not extra_columns.empty:
        raise ScoreException(f'Extra columns in CSV: {list(extra_columns)}.')

    # sort by the order in categories
    probabilities = probabilities.reindex(categories, axis='columns')

    missing_rows = probabilities[probabilities.isnull().any(axis='columns')].index
    if not missing_rows.empty:
        raise ScoreException(
            f'Missing value(s) in CSV for images: {missing_rows.tolist()}.')

    non_float_columns = probabilities.dtypes[
        probabilities.dtypes.apply(lambda x: x != np.float64)
    ].index
    if not non_float_columns.empty:
        raise ScoreException(
            f'CSV contains non-floating-point value(s) in columns: {non_float_columns.tolist()}.'
        )
    # TODO: identify specific failed rows

    # Every column is float64 and free of NaN at this point, so a vectorized
    # comparison is equivalent to the element-wise check and avoids the
    # deprecated DataFrame.applymap.
    out_of_range = (probabilities < 0.0) | (probabilities > 1.0)
    out_of_range_rows = probabilities[out_of_range.any(axis='columns')].index
    if not out_of_range_rows.empty:
        raise ScoreException(
            f'Values in CSV are outside the interval [0.0, 1.0] for images: '
            f'{out_of_range_rows.tolist()}.'
        )

    # TODO: fail on extra columns in data rows

    return probabilities
def _condition_logic(base_idx: pd.Index, sub_select_idx: pd.Index, state_idx: pd.Index,
                     condition_idx: pd.Index, logic: str) -> pd.Index:
    """Combine ``condition_idx`` with another index according to ``logic``.

    ``logic`` is matched case-insensitively:

    * ``ALL``  - ``base_idx`` intersected with ``condition_idx``
    * ``ANY``  - ``sub_select_idx`` intersected with ``condition_idx``
    * ``AND``  - ``state_idx`` intersected with ``condition_idx``
    * ``NAND`` - ``sub_select_idx`` with the ``AND`` labels dropped
    * ``OR``   - labels in ``state_idx`` or ``condition_idx``
    * ``NOR``  - ``sub_select_idx`` with the ``OR`` labels dropped
    * ``NOT``  - labels in ``state_idx`` but not in ``condition_idx``
    * ``XOR``  - labels in exactly one of ``state_idx`` / ``condition_idx``

    Raises:
        ValueError: If ``logic`` is not one of the operators above.
    """
    operator = str(logic).upper()

    if operator == 'ALL':
        return base_idx.intersection(condition_idx).sort_values()
    if operator == 'ANY':
        return sub_select_idx.intersection(condition_idx).sort_values()
    if operator == 'AND':
        return state_idx.intersection(condition_idx).sort_values()
    if operator == 'NAND':
        overlap = state_idx.intersection(condition_idx)
        return sub_select_idx.drop(overlap).sort_values()
    if operator in ('OR', 'NOR'):
        either = state_idx.union(condition_idx)
        combined = state_idx.append(either).drop_duplicates().sort_values()
        if operator == 'OR':
            return combined
        return sub_select_idx.drop(combined)
    if operator == 'NOT':
        return state_idx.difference(condition_idx)
    if operator == 'XOR':
        either = state_idx.union(condition_idx)
        overlap = state_idx.intersection(condition_idx)
        return either.difference(overlap)

    raise ValueError(f"The logic '{logic}' must be AND, NAND, OR, NOR, NOT, XOR ANY or ALL")
def _difference(left: pd.Index, right: pd.Index) -> pd.Index:
    """Return the labels of ``left`` that are not in ``right``, without sorting the result."""
    unsorted_diff = left.difference(right, sort=False)
    return unsorted_diff
def parse_csv(csv_file_stream: TextIO, categories: pd.Index) -> pd.DataFrame:
    """Read and validate a CSV of per-image category probabilities.

    Image names are coerced to strings and a trailing ``.jpg`` extension
    (any letter case) is removed before they become the index.

    Args:
        csv_file_stream: Text stream containing the CSV, with a header row.
        categories: The exact set of category columns the CSV must contain.

    Returns:
        A data frame indexed by image name, with one float64 column per
        category, ordered as in ``categories``.

    Raises:
        ScoreException: If the CSV cannot be parsed, the ``image`` column is
            missing, image names are duplicated, category columns are missing
            or unexpected, any value is missing or not floating-point, or any
            value lies outside ``[0.0, 1.0]``.
    """
    try:
        probabilities = pd.read_csv(csv_file_stream, header=0, index_col=False)
    except pd.errors.ParserError as e:
        # TODO: Test this case
        raise ScoreException(f'Could not parse CSV: "{str(e)}"') from e

    if 'image' not in probabilities.columns:
        raise ScoreException('Missing column in CSV: "image".')

    # Pandas represents strings as 'O' (object)
    if probabilities['image'].dtype != np.dtype('O'):
        # Coercing to 'U' (unicode) ensures that even NaN values are converted;
        # however, the resulting type is still 'O'
        probabilities['image'] = probabilities['image'].astype(np.dtype('U'))

    # ``regex=True`` must be explicit: since pandas 2.0 ``str.replace``
    # defaults to literal matching, which would leave the suffix in place.
    probabilities['image'] = probabilities['image'].str.replace(
        r'\.jpg$', '', case=False, regex=True)

    if not probabilities['image'].is_unique:
        duplicate_images = probabilities['image'][
            probabilities['image'].duplicated()].unique()
        raise ScoreException(
            f'Duplicate image rows detected in CSV: {duplicate_images.tolist()}.'
        )

    # The duplicate check is the same as performed by 'verify_integrity'
    probabilities = probabilities.set_index(
        'image', drop=True, verify_integrity=False)

    missing_columns = categories.difference(probabilities.columns)
    if not missing_columns.empty:
        raise ScoreException(
            f'Missing columns in CSV: {missing_columns.tolist()}.')

    extra_columns = probabilities.columns.difference(categories)
    if not extra_columns.empty:
        raise ScoreException(
            f'Extra columns in CSV: {extra_columns.tolist()}.')

    # sort by the order in categories
    probabilities = probabilities.reindex(categories, axis='columns')

    missing_rows = probabilities[probabilities.isnull().any(
        axis='columns')].index
    if not missing_rows.empty:
        raise ScoreException(
            f'Missing value(s) in CSV for images: {missing_rows.tolist()}.')

    non_float_columns = probabilities.dtypes[probabilities.dtypes.apply(
        lambda x: x != np.float64)].index
    if not non_float_columns.empty:
        raise ScoreException(
            f'CSV contains non-floating-point value(s) in columns: {non_float_columns.tolist()}.'
        )
    # TODO: identify specific failed rows

    # Every column is float64 and free of NaN at this point, so a vectorized
    # comparison is equivalent to the element-wise check and avoids the
    # deprecated DataFrame.applymap.
    out_of_range = (probabilities < 0.0) | (probabilities > 1.0)
    out_of_range_rows = probabilities[out_of_range.any(axis='columns')].index
    if not out_of_range_rows.empty:
        raise ScoreException(
            f'Values in CSV are outside the interval [0.0, 1.0] for images: '
            f'{out_of_range_rows.tolist()}.')

    # TODO: fail on extra columns in data rows

    return probabilities