Ejemplo n.º 1
0
def test_float64_index_difference():
    # Regression test: differencing a float index against a string index
    # (and vice versa) must not coerce either side.
    # https://github.com/pandas-dev/pandas/issues/35217
    numeric = Index([1.0, 2, 3])
    strings = Index(["1", "2", "3"])

    tm.assert_index_equal(numeric.difference(strings), numeric)
    tm.assert_index_equal(strings.difference(numeric), strings)
Ejemplo n.º 2
0
 def _get_common_indices(index_1: pd.Index, index_2: pd.Index):
     """Get intersection of indices"""
     _intersection = index_1.intersection(index_2)
     _diff_to_1 = index_1.difference(_intersection)
     if len(_diff_to_1) > 0:
         logger.warning(
             "Sample with clinical features not in demographics: {}".format(
                 ", ".join(_diff_to_1)))
     return _intersection, _diff_to_1
    def append_new_entrants(self, existing_data: pd.Series,
                            new_index: pd.Index, getter: Callable):
        """Combine already-tracked entries with newly appearing ones.

        Args:
            existing_data: values indexed by entity id, possibly covering
                only part of ``new_index``.
            new_index: the full index that must be covered.
            getter: callable that produces values for a new index.

        Returns:
            A Series of the retained existing entries followed by the new
            entrants' values.
        """
        # Entries of new_index for which we already hold data.
        retained = existing_data.loc[new_index.intersection(
            existing_data.index)]

        # NOTE(review): new entrants are computed against
        # self.stratification_groups.index rather than existing_data.index —
        # presumably the two agree; confirm against callers.
        new_entrants_index = new_index.difference(
            self.stratification_groups.index)
        new_entrants = getter(new_entrants_index)
        # Series.append was removed in pandas 2.0; pd.concat is the
        # behavior-identical replacement.
        return pd.concat([retained, new_entrants])
Ejemplo n.º 4
0
 def _compare_labels(axis: str, actual: pd.Index, expected: pd.Index):
     """Raise ValueError when ``actual`` labels differ from ``expected``.

     ``df_name`` is taken from the enclosing scope.  BUG FIX: previously
     the missing/extra-column checks and the raise were nested inside the
     length check, so two indices of equal length with *different* labels
     passed silently; now any discrepancy raises.
     """
     error_message = f"{df_name} data frame does not have expected {axis}"
     missing_columns = expected.difference(actual)
     extra_columns = actual.difference(expected)
     error_detail = []
     if len(actual) != len(expected):
         error_detail.append(
             f"expected {len(expected)} columns but got {len(actual)}"
         )
     if len(missing_columns) > 0:
         error_detail.append(
             f"missing columns: "
             f"{', '.join(str(item) for item in missing_columns)}"
         )
     if len(extra_columns) > 0:
         error_detail.append(
             f"extra columns: "
             f"{', '.join(str(item) for item in extra_columns)}"
         )
     if error_detail:
         raise ValueError(f"{error_message} ({'; '.join(error_detail)})")
Ejemplo n.º 5
0
    def splitter(index: pd.Index, y, *args,
                 **kwargs) -> Tuple[pd.Index, pd.Index]:
        """Split ``index`` into (train, test) by sampling per label class.

        ``seed``, ``temp_seed``, ``choose`` and ``concat_indices`` come
        from the enclosing scope.
        """
        labels = y.copy()
        # Stash the index as a column so the groupby can collect it per
        # class; to_list() keeps tuples intact for a MultiIndex.
        labels["index"] = labels.index.to_list()
        per_class = labels.groupby(
            y.columns.to_list()).agg(lambda values: list(values))

        with temp_seed(seed):
            chosen = per_class.apply(choose, axis=1, result_type='reduce')
            test_idx = concat_indices([row[0] for row in chosen.to_list()])

        return index.difference(test_idx), test_idx
def get_oligo_non_oligos(l1_ix: pd.Index,
                         l2_ix: pd.Index) -> Tuple[pd.Index, pd.Index]:
    """
    Determines which patients are oligo-ns and non-oligo-ns from the given
    indices.

    Subjects present at level 1 but absent from level 2 are the oligo-ns;
    the level 2 subjects themselves are the non-oligo-ns.

    Args:
        l1_ix: The level 1 NMF subject IDs.
        l2_ix: The level 2 NMF subject IDs.

    Returns:
        The oligo-n subject IDs and non-oligo-n subject IDs.
    """
    return l1_ix.difference(l2_ix), l2_ix
Ejemplo n.º 7
0
 def _make_indexer(self, self_indexer: Index, other_indexer: Index):
     """Populate ``flat_indexer``/``missing_indices`` mapping other→self.

     When aggregation is required, ``other_indexer`` is factorized first
     (codes stored on ``other_grouper``); otherwise two fast paths avoid
     the more expensive get_indexer_non_unique where possible.
     """
     if self.aggregation_required:
         codes, uniques = other_indexer.factorize()
         self.other_grouper = codes
         self.flat_indexer, self.missing_indices = uniques.get_indexer_non_unique(
             self_indexer)
         return

     # Performance-tuned fast paths for constructing indexers.
     if self_indexer.equals(other_indexer):
         # Identical indexers: trivial arange mapping, nothing missing.
         self.flat_indexer = np.arange(len(other_indexer))
         self.missing_indices = np.array([], dtype=int)
     elif not len(self_indexer.difference(other_indexer)):
         # No missing values; taking the difference is faster than
         # `all(.isin())`.
         self.missing_indices = np.array([], dtype=int)
         self.flat_indexer = other_indexer.get_indexer(self_indexer)
     else:
         # All other cases.
         self.flat_indexer, self.missing_indices = other_indexer.get_indexer_non_unique(
             self_indexer)
Ejemplo n.º 8
0
def parse_csv(csv_file_stream, categories: pd.Index) -> pd.DataFrame:
    """Parse a per-image probabilities CSV into a validated DataFrame.

    The CSV must have an 'image' column plus exactly the columns named in
    ``categories``, all float values in [0.0, 1.0] with nothing missing.
    Raises ScoreException (project-defined) on any validation failure.
    """
    table = pd.read_csv(csv_file_stream, header=0)

    if 'image' not in table.columns:
        raise ScoreException('Missing column in CSV: "image".')

    table.set_index('image', drop=True, inplace=True, verify_integrity=True)

    missing_columns = categories.difference(table.columns)
    if not missing_columns.empty:
        raise ScoreException(f'Missing columns in CSV: {list(missing_columns)}.')

    extra_columns = table.columns.difference(categories)
    if not extra_columns.empty:
        raise ScoreException(f'Extra columns in CSV: {list(extra_columns)}.')

    # sort by the order in categories
    table = table.reindex(categories, axis='columns')

    missing_rows = table[table.isnull().any(axis='columns')].index
    if not missing_rows.empty:
        raise ScoreException(f'Missing value(s) in CSV for images: {missing_rows.tolist()}.')

    non_float_columns = table.dtypes[
        table.dtypes.apply(lambda dtype: dtype != np.float64)
    ].index
    if not non_float_columns.empty:
        raise ScoreException(
            f'CSV contains non-floating-point value(s) in columns: {non_float_columns.tolist()}.'
        )
    # TODO: identify specific failed rows

    out_of_range_rows = table[
        table.applymap(lambda value: value < 0.0 or value > 1.0).any(axis='columns')
    ].index
    if not out_of_range_rows.empty:
        raise ScoreException(
            f'Values in CSV are outside the interval [0.0, 1.0] for images: '
            f'{out_of_range_rows.tolist()}.'
        )

    # TODO: fail on extra columns in data rows

    return table
Ejemplo n.º 9
0
 def _condition_logic(base_idx: pd.Index, sub_select_idx: pd.Index, state_idx: pd.Index, condition_idx: pd.Index,
                      logic: str) -> pd.Index:
     if str(logic).upper() == 'ALL':
         return base_idx.intersection(condition_idx).sort_values()
     elif str(logic).upper() == 'ANY':
         return sub_select_idx.intersection(condition_idx).sort_values()
     elif str(logic).upper() == 'AND':
         return state_idx.intersection(condition_idx).sort_values()
     elif str(logic).upper() == 'NAND':
         return sub_select_idx.drop(state_idx.intersection(condition_idx)).sort_values()
     elif str(logic).upper() == 'OR':
         return state_idx.append(state_idx.union(condition_idx)).drop_duplicates().sort_values()
     elif str(logic).upper() == 'NOR':
         result = state_idx.append(state_idx.union(condition_idx)).drop_duplicates().sort_values()
         return sub_select_idx.drop(result)
     elif str(logic).upper() == 'NOT':
         return state_idx.difference(condition_idx)
     elif str(logic).upper() == 'XOR':
         return state_idx.union(condition_idx).difference(state_idx.intersection(condition_idx))
     raise ValueError(f"The logic '{logic}' must be AND, NAND, OR, NOR, NOT, XOR ANY or ALL")
Ejemplo n.º 10
0
def _difference(left: pd.Index, right: pd.Index) -> pd.Index:
    return left.difference(right, sort=False)
Ejemplo n.º 11
0
def parse_csv(csv_file_stream: TextIO, categories: pd.Index) -> pd.DataFrame:
    """Parse and validate a per-image probabilities CSV.

    The CSV must have an 'image' column plus exactly the columns named in
    ``categories``, all float values in [0.0, 1.0] with nothing missing.
    A trailing '.jpg' (any case) is stripped from image names, and
    duplicate image rows are rejected.

    Raises:
        ScoreException (project-defined) on any parse/validation failure.
    """
    try:
        probabilities = pd.read_csv(csv_file_stream, header=0, index_col=False)
    except pd.errors.ParserError as e:
        # TODO: Test this case
        raise ScoreException(f'Could not parse CSV: "{str(e)}"')

    if 'image' not in probabilities.columns:
        raise ScoreException('Missing column in CSV: "image".')

    # Pandas represents strings as 'O' (object)
    if probabilities['image'].dtype != np.dtype('O'):
        # Coercing to 'U' (unicode) ensures that even NaN values are converted;
        # however, the resulting type is still 'O'
        probabilities['image'] = probabilities['image'].astype(np.dtype('U'))

    # BUG FIX: pandas >= 2.0 defaults str.replace to regex=False, which
    # treats '\.jpg$' literally and rejects `case=`; regex=True restores
    # the intended case-insensitive suffix strip.
    probabilities['image'] = probabilities['image'].str.replace(r'\.jpg$',
                                                                '',
                                                                case=False,
                                                                regex=True)

    if not probabilities['image'].is_unique:
        duplicate_images = probabilities['image'][
            probabilities['image'].duplicated()].unique()
        raise ScoreException(
            f'Duplicate image rows detected in CSV: {duplicate_images.tolist()}.'
        )

    # The duplicate check is the same as performed by 'verify_integrity'
    probabilities.set_index('image',
                            drop=True,
                            inplace=True,
                            verify_integrity=False)

    missing_columns = categories.difference(probabilities.columns)
    if not missing_columns.empty:
        raise ScoreException(
            f'Missing columns in CSV: {missing_columns.tolist()}.')

    extra_columns = probabilities.columns.difference(categories)
    if not extra_columns.empty:
        raise ScoreException(
            f'Extra columns in CSV: {extra_columns.tolist()}.')

    # sort by the order in categories
    probabilities = probabilities.reindex(categories, axis='columns')

    missing_rows = probabilities[probabilities.isnull().any(
        axis='columns')].index
    if not missing_rows.empty:
        raise ScoreException(
            f'Missing value(s) in CSV for images: {missing_rows.tolist()}.')

    non_float_columns = probabilities.dtypes[probabilities.dtypes.apply(
        lambda x: x != np.float64)].index
    if not non_float_columns.empty:
        raise ScoreException(
            f'CSV contains non-floating-point value(s) in columns: {non_float_columns.tolist()}.'
        )
    # TODO: identify specific failed rows

    out_of_range_rows = probabilities[probabilities.applymap(
        lambda x: x < 0.0 or x > 1.0).any(axis='columns')].index
    if not out_of_range_rows.empty:
        raise ScoreException(
            f'Values in CSV are outside the interval [0.0, 1.0] for images: '
            f'{out_of_range_rows.tolist()}.')

    # TODO: fail on extra columns in data rows

    return probabilities