Example #1
    def population_project(self,
                           year_length=None,
                           method=None,
                           growth_rate=None):
        """
        Continuation of population to provide convergent present values
        
        Parameters
        ----------
        year_length : int, default None
                      Duration to continue the population projection
        method : str, default None
                 The value must be 'stable' or 'exp_growth'  
        """

        if 'pop' not in self.columns:
            raise Exception('pop is not a column of cohort')
        if year_length is None:
            raise Exception('a duration in years should be provided')
        if method is None:
            raise Exception('a method should be specified')
        years = self.index_sets['year']
        first_year = min(years)
        last_year = max(years)

        if (first_year + year_length) > last_year:
            new_last_year = first_year + year_length
        else:
            return

        if method == 'stable':
            last_pop = self.xs(last_year, level='year', axis=0)
            pop = DataFrame(self['pop'])
            years = range(last_year + 1, new_last_year + 1)
            list_df = [last_pop] * len(years)

            pop = concat(list_df, keys=years, names=['year'])
            pop = pop.reorder_levels(['age', 'sex', 'year'], axis=0)
            combined = self.combine_first(pop)
            self.__init__(data=combined, columns=['pop'])

        if method == 'exp_growth':
            if growth_rate is None:
                raise Exception(
                    'a growth rate must be provided for the method')

            last_pop = self.xs(last_year, level='year', axis=0)
            pop = DataFrame(self['pop'])
            years = range(last_year + 1, new_last_year + 1)
            list_df = [last_pop] * len(years)

            pop = concat(list_df, keys=years, names=['year'])
            pop = pop.reorder_levels(['age', 'sex', 'year'], axis=0)
            pop = Cohorts(pop)
            pop.gen_grth(growth_rate)
            pop['pop'] *= pop['grth']
            del pop['grth']

            combined = self.combine_first(pop)
            self.__init__(data=combined, columns=['pop'])
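
A minimal, self-contained sketch of the index manipulation used by the 'stable' branch above: the last observed cross-section is replicated once per projected year under a new 'year' key, and the levels are then put back into the original ('age', 'sex', 'year') order. The frame and its values are hypothetical, not taken from the Cohorts project.

import pandas as pd

# hypothetical population indexed by (age, sex, year)
idx = pd.MultiIndex.from_product([[0, 1], ['m', 'f'], [2010, 2011]],
                                 names=['age', 'sex', 'year'])
pop = pd.DataFrame({'pop': range(8)}, index=idx)

last_year = 2011
last_pop = pop.xs(last_year, level='year', axis=0)        # index: (age, sex)
new_years = list(range(last_year + 1, last_year + 4))     # project three more years

# replicate the last cross-section once per projected year, keyed by 'year',
# then restore the (age, sex, year) level order expected by the cohort
projected = pd.concat([last_pop] * len(new_years), keys=new_years, names=['year'])
projected = projected.reorder_levels(['age', 'sex', 'year'], axis=0)

extended = pop.combine_first(projected).sort_index()
print(list(extended.index.names))   # ['age', 'sex', 'year']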
Example #2
    def test_reorder_levels(self):
        index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                           labels=[[0, 0, 0, 0, 0, 0],
                                   [0, 1, 2, 0, 1, 2],
                                   [0, 1, 0, 1, 0, 1]],
                           names=['L0', 'L1', 'L2'])
        df = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, index=index)

        # no change, position
        result = df.reorder_levels([0, 1, 2])
        assert_frame_equal(df, result)

        # no change, labels
        result = df.reorder_levels(['L0', 'L1', 'L2'])
        assert_frame_equal(df, result)

        # rotate, position
        result = df.reorder_levels([1, 2, 0])
        e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']],
                           labels=[[0, 1, 2, 0, 1, 2],
                                   [0, 1, 0, 1, 0, 1],
                                   [0, 0, 0, 0, 0, 0]],
                           names=['L1', 'L2', 'L0'])
        expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)},
                             index=e_idx)
        assert_frame_equal(result, expected)
Example #3
    def test_reorder_levels(self):
        index = MultiIndex(levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
                           labels=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2],
                                   [0, 1, 0, 1, 0, 1]],
                           names=['L0', 'L1', 'L2'])
        df = DataFrame({'A': np.arange(6), 'B': np.arange(6)}, index=index)

        # no change, position
        result = df.reorder_levels([0, 1, 2])
        assert_frame_equal(df, result)

        # no change, labels
        result = df.reorder_levels(['L0', 'L1', 'L2'])
        assert_frame_equal(df, result)

        # rotate, position
        result = df.reorder_levels([1, 2, 0])
        e_idx = MultiIndex(levels=[['one', 'two', 'three'], [0, 1], ['bar']],
                           labels=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1],
                                   [0, 0, 0, 0, 0, 0]],
                           names=['L1', 'L2', 'L0'])
        expected = DataFrame({'A': np.arange(6), 'B': np.arange(6)},
                             index=e_idx)
        assert_frame_equal(result, expected)
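
Examples #2 and #3 come from an older pandas test suite: the labels= keyword of the MultiIndex constructor was renamed to codes= in pandas 0.24 and has since been removed. A sketch of the same "rotate the levels" check against current pandas, using the public pandas.testing helper:

import numpy as np
import pandas as pd
from pandas.testing import assert_frame_equal

index = pd.MultiIndex(
    levels=[['bar'], ['one', 'two', 'three'], [0, 1]],
    codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
    names=['L0', 'L1', 'L2'],
)
df = pd.DataFrame({'A': np.arange(6), 'B': np.arange(6)}, index=index)

# rotating the levels by position and by name yields the same frame
assert_frame_equal(df.reorder_levels([1, 2, 0]),
                   df.reorder_levels(['L1', 'L2', 'L0']))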
Example #4
    def population_project(self, year_length=None, method=None, growth_rate=None):
        """
        Continuation of population to provide convergent present values
        
        Parameters
        ----------
        year_length : int, default None
                      Duration to continue the population projection
        method : str, default None
                 The value must be 'stable' or 'exp_growth'  
        """

        if 'pop' not in self.columns:
            raise Exception('pop is not a column of cohort')
        if year_length is None:
            raise Exception('a duration in years should be provided')
        if method is None:
            raise Exception('a method should be specified')
        years = self.index_sets['year']
        first_year = min(years)
        last_year = max(years)
        
        if (first_year + year_length) > last_year:
            new_last_year = first_year + year_length
        else:
            return

        if method == 'stable':
            last_pop = self.xs(last_year, level='year', axis=0)
            pop = DataFrame(self['pop'])
            years = range(last_year + 1, new_last_year + 1)
            list_df = [last_pop] * len(years)

            pop = concat(list_df, keys=years, names=['year'])
            pop = pop.reorder_levels(['age', 'sex', 'year'], axis=0)
            combined = self.combine_first(pop)
            self.__init__(data=combined, columns=['pop'])

        if method == 'exp_growth':
            if growth_rate is None:
                raise Exception('a growth rate must be provided for the method')

            last_pop = self.xs(last_year, level='year', axis=0)
            pop = DataFrame(self['pop'])
            years = range(last_year + 1, new_last_year + 1)
            list_df = [last_pop] * len(years)

            pop = concat(list_df, keys=years, names=['year'])
            pop = pop.reorder_levels(['age', 'sex', 'year'], axis=0)
            pop = Cohorts(pop)
            pop.gen_grth(growth_rate)
            pop['pop'] *= pop['grth']
            del pop['grth']

            combined = self.combine_first(pop)
            self.__init__(data=combined, columns=['pop'])
Example #5
def read_data_file(fn, skiplines=1, maxlines=False):
    """  A function to read any foam data files returning data and
         index after header
    """

    # TODO check if sorting the index gives any performance benefits
    # print "opening file {}".format(fn)
    if not os.path.exists(fn):
        print("Can not open file " + fn)
        return None
    try:
        with open(fn, encoding="utf-8") as f:
            field = fn.split('/')[-1]
            content = f.readlines()
            content.append('bla')
            start, num_entries = if_header_skip(content)
            entries = len(content[start].split())
            is_a_vector = (True if entries > 1 else False)
            end = start + num_entries
            if is_a_vector:
                data = list(
                    map(lambda x: re.sub("[0-9]*\(|\)", '', x).split(),
                        content[start:end:skiplines]))
                loc, names = evaluate_names(fn, entries)
                df = DataFrame(data=data, columns=names)
                if loc:
                    df['Loc'] = loc
                else:
                    df['Loc'] = range(len(df))
                df.set_index('Loc', append=True, inplace=True)
                df.index.names = ['Id', 'Loc']
                df = df.reorder_levels(['Loc', 'Id'])
                df = df.astype(float)
                hashes = {}
                for row in df.columns:
                    hashes.update({row: hash_series(df[row])})
                return names, df, hashes
            else:
                data = [np.float32(x) for x in content[start:end:skiplines]]
                entries = 1
                df = DataFrame(data=data, columns=[field])
                df['Loc'] = "Field"
                df.set_index('Loc', append=True, inplace=True)
                df.index.names = ['Id', 'Loc']
                df = df.reorder_levels(['Loc', 'Id'])
                hashes = {
                    field:
                    int(
                        hashlib.md5(str(data).encode('utf-8')).hexdigest(), 16)
                }
                return field, df, hashes
    except Exception as e:
        if DEBUG:
            print("Error processing datafile " + fn)
            print(e)
        return None
Example #6
File: io.py Project: ALGe9/owls
def read_data_file(fn, skiplines=1, maxlines=False):
    """  A function to read any foam data files returning data and
         index after header
    """

    # TODO check if sorting the index gives any performance benefits
    # print "opening file {}".format(fn)
    if not os.path.exists(fn):
        print("Can not open file " + fn)
        return None
    try:
        with open(fn, encoding="utf-8") as f:
            field = fn.split('/')[-1]
            content = f.readlines()
            content.append('bla')
            start, num_entries = if_header_skip(content)
            entries = len(content[start].split())
            is_a_vector = (True if entries > 1 else False)
            end = start + num_entries
            if is_a_vector:
                data = list(map(lambda x: re.sub(r"[0-9]*\(|\)", '', x).split(),
                            content[start:end:skiplines]))
                loc, names = evaluate_names(fn, entries)
                df = DataFrame(data=data, columns=names)
                if loc:
                    df['Loc'] = loc
                else:
                    df['Loc'] = range(len(df))
                df.set_index('Loc', append=True, inplace=True)
                df.index.names=['Id','Loc']
                df = df.reorder_levels(['Loc','Id'])
                df = df.astype(float)
                hashes = {}
                for row in df.columns:
                    hashes.update({row: hash_series(df[row])})
                return names, df, hashes
            else:
                data = [np.float32(x) for x in content[start:end:skiplines]]
                entries = 1
                df = DataFrame(data=data, columns=[field])
                df['Loc'] = "Field"
                df.set_index('Loc', append=True, inplace=True)
                df.index.names=['Id','Loc']
                df = df.reorder_levels(['Loc','Id'])
                hashes = {field: int(hashlib.md5(str(data).encode('utf-8')).hexdigest(),16)}
                return field, df, hashes
    except Exception as e:
        if DEBUG:
            print("Error processing datafile " + fn)
            print(e)
        return None
Example #7
    def test_reorder_levels(self):
        index = MultiIndex(
            levels=[["bar"], ["one", "two", "three"], [0, 1]],
            codes=[[0, 0, 0, 0, 0, 0], [0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1]],
            names=["L0", "L1", "L2"],
        )
        df = DataFrame({"A": np.arange(6), "B": np.arange(6)}, index=index)

        # no change, position
        result = df.reorder_levels([0, 1, 2])
        tm.assert_frame_equal(df, result)

        # no change, labels
        result = df.reorder_levels(["L0", "L1", "L2"])
        tm.assert_frame_equal(df, result)

        # rotate, position
        result = df.reorder_levels([1, 2, 0])
        e_idx = MultiIndex(
            levels=[["one", "two", "three"], [0, 1], ["bar"]],
            codes=[[0, 1, 2, 0, 1, 2], [0, 1, 0, 1, 0, 1], [0, 0, 0, 0, 0, 0]],
            names=["L1", "L2", "L0"],
        )
        expected = DataFrame({"A": np.arange(6), "B": np.arange(6)},
                             index=e_idx)
        tm.assert_frame_equal(result, expected)

        result = df.reorder_levels([0, 0, 0])
        e_idx = MultiIndex(
            levels=[["bar"], ["bar"], ["bar"]],
            codes=[[0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0]],
            names=["L0", "L0", "L0"],
        )
        expected = DataFrame({"A": np.arange(6), "B": np.arange(6)},
                             index=e_idx)
        tm.assert_frame_equal(result, expected)

        result = df.reorder_levels(["L0", "L0", "L0"])
        tm.assert_frame_equal(result, expected)
Example #8
    def test_pandas_extend_index(self):
        d1 = DataFrame(data=[2, 4, 6, 8], columns=["A"], index=[1, 2, 3, 4])
        d1.index.name = "first"

        d1["second"] = "default"
        d1.set_index(["second"], append=True, inplace=True)
        self.assertEqual(d1.index.names, ["first", "second"])

        d1 = d1.reorder_levels(["second", "first"])
        self.assertEqual(d1.index.names, ["second", "first"])
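
For the two-level case shown in this test, reorder_levels(["second", "first"]) is equivalent to swaplevel(); a standalone sketch of that equivalence (not part of the original test class):

import pandas as pd

d1 = pd.DataFrame(data=[2, 4, 6, 8], columns=["A"], index=[1, 2, 3, 4])
d1.index.name = "first"
d1["second"] = "default"
d1.set_index(["second"], append=True, inplace=True)

# with exactly two index levels, swaplevel() and reorder_levels() agree
reordered = d1.reorder_levels(["second", "first"])
assert list(reordered.index.names) == ["second", "first"]
assert d1.swaplevel().index.equals(reordered.index)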
Example #9
def aggregate_chunks(mod_features_df, modality):
    without_info_df = mod_features_df.query('field != "info"')
    cnt_df = DataFrame([list(mod_features_df.loc[('info', 'count'), :].values)] * len(without_info_df),
                       index=without_info_df.index)
    agg_df = without_info_df * cnt_df
    agg_df = DataFrame(agg_df.sum(axis=1) / cnt_df.sum(axis=1), index=without_info_df.index)
    agg_df['modality'] = modality
    agg_df.set_index('modality', append=True, inplace=True)
    agg_df = agg_df.reorder_levels(['modality', 'field', 'feature'])
    return agg_df
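
aggregate_chunks multiplies each chunk by its sample count, sums across chunks, and divides by the total count, i.e. a count-weighted mean per row, then tags the result with the modality as an extra index level. A reduced sketch of that computation on a hypothetical two-chunk frame (the index structure of the real mod_features_df is assumed here, not shown in the snippet above):

import pandas as pd

# hypothetical per-chunk feature values and the number of samples in each chunk
rows = pd.MultiIndex.from_tuples([('hrv', 'mean'), ('hrv', 'std')],
                                 names=['field', 'feature'])
features = pd.DataFrame({'chunk0': [1.0, 2.0], 'chunk1': [3.0, 4.0]}, index=rows)
counts = pd.DataFrame({'chunk0': [10, 10], 'chunk1': [30, 30]}, index=rows)

# count-weighted mean across chunks, one value per (field, feature) row
agg = ((features * counts).sum(axis=1) / counts.sum(axis=1)).to_frame('weighted_mean')

# tag the rows with a modality level and move it to the front, as aggregate_chunks does
agg['modality'] = 'ecg'
agg.set_index('modality', append=True, inplace=True)
agg = agg.reorder_levels(['modality', 'field', 'feature'])
print(agg)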
Example #11
def sort_hierarchical_data(data: pd.DataFrame) -> pd.DataFrame:
    """Reorder index labels of a hierarchical index and sort in level order."""
    sort_order = [
        "location", "sex", "age_start", "age_end", "year_start", "year_end"
    ]
    sorted_data_index = [n for n in sort_order if n in data.index.names]
    sorted_data_index.extend(
        [n for n in data.index.names if n not in sorted_data_index])

    if isinstance(data.index, pd.MultiIndex):
        data = data.reorder_levels(sorted_data_index)
    data = data.sort_index()
    return data
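
reorder_levels only changes the order of the index levels; it does not resort the rows, which is why the function calls sort_index() afterwards. A small sketch with a hypothetical two-level index:

import pandas as pd

idx = pd.MultiIndex.from_tuples([(2000, 'm'), (1990, 'f')],
                                names=['year_start', 'sex'])
data = pd.DataFrame({'value': [1.0, 2.0]}, index=idx)

reordered = data.reorder_levels(['sex', 'year_start'])
print(reordered.index.tolist())               # [('m', 2000), ('f', 1990)]  (row order unchanged)
print(reordered.sort_index().index.tolist())  # [('f', 1990), ('m', 2000)]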
Example #12
def reshape(data: pd.DataFrame,
            value_cols: List = DRAW_COLUMNS) -> pd.DataFrame:
    if isinstance(data, pd.DataFrame) and not isinstance(
            data.index, pd.MultiIndex):  # push all non-val cols into index
        data = data.set_index(
            get_ordered_index_cols(data.columns.difference(value_cols)))
    elif not data.columns.difference(
            value_cols
    ).empty:  # we missed some columns that need to be in index
        data = data.set_index(list(data.columns.difference(value_cols)),
                              append=True)
        data = data.reorder_levels(
            get_ordered_index_cols(set(data.index.names)))
    else:  # we've already set the full index
        pass
    return data
Example #13
def add_elo_rating(data_frame: pd.DataFrame):
    """Add ELO rating of team prior to matches"""

    if "score" not in data_frame.columns or "oppo_score" not in data_frame.columns:
        raise ValueError(
            "To calculate ELO ratings, 'score' and 'oppo_score' must be "
            "in the data frame, but the columns given were "
            f"{list(data_frame.columns)}")

    elo_data_frame = data_frame.reorder_levels(
        [YEAR_LEVEL, ROUND_LEVEL, TEAM_LEVEL]).sort_index(ascending=True)

    elo_column = (reduce(
        partial(_calculate_match_elo_rating, elo_data_frame),
        elo_data_frame.iterrows(),
        None,
    ).reorder_levels(
        [REORDERED_TEAM_LEVEL, REORDERED_YEAR_LEVEL,
         REORDERED_ROUND_LEVEL]).sort_index())

    return data_frame.assign(elo_rating=elo_column)
Example #14
def import_foam_folder(path,
                       search,
                       files,
                       skiplines=1,
                       maxlines=0,
                       skiptimes=1,
                       exclude=None):
    """ returns a Dataframe for every file in fileList """
    #import StringIO
    from pandas import concat
    fileList = find_datafiles(path,
                              search=search,
                              files=files,
                              exclude=exclude)
    if not fileList:
        print("no files found")
        return
    p_bar = ProgressBar(n_tot=sum([len(l) for l in fileList.values()]))
    df = DataFrame()
    #df.index = MultiIndex.from_tuples(zip([],[]),names=['Loc',0])
    from collections import defaultdict
    origins = Origins()
    els = list(fileList.items())[::skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()
        for fn in files:
            #ret = read_table(StringIO.StringIO(foam_to_csv(fn)))
            ret = read_data_file(fn, skiplines, maxlines)
            p_bar.next()
            if not ret:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    # use combine first for all df at existing Loc or
                    # if not Loc is specified (Eul or Lag fields)
                    if x.index.levels[0][0] in df_tmp.index.levels[0]:
                        df_tmp = df_tmp.combine_first(x)
                        #df_tmp = concat([df_tmp, x], axis=1)
                        pass
                    else:
                        df_tmp = concat([df_tmp, x])
                except Exception as e:
                    print(x)
                    print(e)
            field_names = ([field_names]
                           if not type(field_names) == list else field_names)
            for field in field_names:
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            df = df.append(df_tmp)
    df.set_index('Time', append=True, inplace=True)
    df = df.reorder_levels(['Time', 'Loc', 'Id'])
    p_bar.done()
    return origins, df
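
The importer above accumulates one frame per time directory, stores the time in a 'Time' column, and only at the end pushes it into the index and reorders to ('Time', 'Loc', 'Id'). Note that DataFrame.append, used in the loop, was removed in pandas 2.0; pandas.concat is the replacement. A reduced sketch of the accumulation pattern with made-up data and no OpenFOAM I/O:

import pandas as pd

frames = []
for time in ("0.1", "0.2"):                      # hypothetical time directories
    idx = pd.MultiIndex.from_product([["centreLine"], range(3)],
                                     names=["Loc", "Id"])
    df_tmp = pd.DataFrame({"U": [1.0, 2.0, 3.0]}, index=idx)
    df_tmp["Time"] = time
    frames.append(df_tmp)

df = pd.concat(frames)                           # replaces the per-iteration df.append()
df.set_index("Time", append=True, inplace=True)
df = df.reorder_levels(["Time", "Loc", "Id"])
print(list(df.index.names))                      # ['Time', 'Loc', 'Id']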
Example #15
def normalize_df(target: DataFrame,
                 normer: DataFrame,
                 ind_sep: Optional[str] = "-",
                 alphas: Optional[Iterable[float]] = None,
                 cv: float = 5,
                 **RidgeCV_kws) -> DataFrame:
    """ Used to normalize a dataset by another dataset, using a linear model with regularization
    chosen through cross validation (aka sklearn's RidgeCV). This is useful for normalizing,
    for example, RNA values by CNA, or phosphopeptide values by protein abundance. If target and
    normer dataframe row IDs (index) match 1:1, pass None for ind_sep.

    Args:
        target: Dataframe of values to normalize. Row IDs (index) before the sep (or whole ID
        if no sep) must match normer IDs. Row IDs must be unique.
        normer: Dataframe of values to use for normalization. Row IDs must match all or
        pre-ind_sep portions of target row IDs. Row IDs must be unique.
        ind_sep: If multiple rows in target map to 1 row in normer, the delimiter used to split
        the unique ID that matches the normer IDs. Default "-"
        alphas: Parameters to try for regularization. If None, tries powers of 2 from -10 to 10.
        cv: Folds for cross-validation. Also the minimum number of non-null values for each
        row. Default 5
        **RidgeCV_kws: kws to pass to sklearn's RidgeCV

    Returns: normed
        The target dataframe normalized by the normer dataframe. Only includes rows with
        sufficient non-null values from both dataframes.

    """

    if not alphas:
        alphas = [2**i for i in range(-10, 10, 1)]

    normer = normer[[col for col in target.columns if col in normer.columns]]
    target = target[normer.columns]
    if (len(normer.columns) < cv) or (len(target.columns) < cv):
        raise KeyError(
            "target and normer dataframes do not have at least %s columns in common"
            % cv)

    target = target.transpose()
    target["col0"] = 0
    target.set_index("col0", append=True, inplace=True)
    target = target.reorder_levels(
        [target.index.names[-1], target.index.names[0]]).transpose()

    normer = normer.transpose()
    normer["col0"] = 1
    normer.set_index("col0", append=True, inplace=True)
    normer = normer.reorder_levels(
        [normer.index.names[-1], normer.index.names[0]]).transpose()

    target["gene"] = [i.split(ind_sep)[0] for i in target.index]
    target = target.loc[target["gene"].isin(normer.index), :]
    if len(target) == 0:
        raise KeyError("No rows in common between target and normer")
    logging.info(
        "Normalizing %s common rows and %s common samples between target and normer"
        % (len(target), len(normer.columns)))
    data = target.merge(normer, how="left", left_on="gene", right_index=True)

    model = lm.RidgeCV(alphas=alphas, cv=cv, **RidgeCV_kws)
    normed = data.apply(
        (lambda row: _convert_to_residuals(row[0], row[1], model)), axis=1)

    return normed
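
The transpose / constant "col0" column / set_index(append=True) / reorder_levels / transpose-back sequence in normalize_df is just a way to prepend a constant level to the column MultiIndex, so target and normer columns can be told apart after the later merge. The same effect on a toy frame (all names are hypothetical; giving the columns a name keeps the level lookup unambiguous):

import pandas as pd

# toy target matrix: rows are features, columns are samples
target = pd.DataFrame({"s1": [1.0, 2.0], "s2": [3.0, 4.0]},
                      index=["geneA-1", "geneB-1"])
target.columns.name = "sample"

t = target.transpose()
t["col0"] = 0                                    # constant marker for the target block
t.set_index("col0", append=True, inplace=True)
t = t.reorder_levels([t.index.names[-1], t.index.names[0]]).transpose()

print(t.columns.tolist())                        # [(0, 's1'), (0, 's2')]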
Example #16
    def __mul__(self, other: 'Conditional'):

        if not isinstance(other, Conditional):
            return other * self

        def expand_conditions(data: DataFrame,
                              new_states: Dict[str, list]) -> DataFrame:
            """
            Repeat the data for each state of the new_states dict.

            :param data: Original data.
            :param new_states: Dict mapping new variables to states.
            """
            num_additional_states = np_product(
                [len(values) for _, values in new_states.items()])
            data_width = data.shape[1]
            expanded_data = concat(
                [data for _ in range(num_additional_states)], axis=1)
            expanded_index = (list(data.columns.values) if isinstance(
                expanded_data.columns, MultiIndex) else
                              [(x, ) for x in expanded_data.columns.values])
            additional_index = list(
                chain.from_iterable(
                    repeat(x, data_width)
                    for x in list(product(*new_states.values()))))
            new_names = list(new_states.keys()) + list(data.columns.names)
            new_columns = [
                tuple(chain(ai, xi))
                for ai, xi in zip(additional_index, expanded_index)
            ]
            expanded_data.columns = MultiIndex.from_tuples(tuples=new_columns,
                                                           names=new_names)
            return expanded_data

        # for each conditional that is only in one distribution,
        # replicate the other distribution for each state in that conditional
        self_conds = set(self._conditional_variables)
        other_conds = set(other._conditional_variables)
        if len(other_conds - self_conds) > 0:
            self_data = expand_conditions(
                self._data, {
                    cond: other._states[cond]
                    for cond in other_conds if cond not in self_conds
                })
        else:
            self_data = self._data
        if len(self_conds - other_conds) > 0:
            other_data = expand_conditions(
                other._data, {
                    cond: self._states[cond]
                    for cond in self_conds if cond not in other_conds
                })
        else:
            other_data = other._data

        # multiply joint variables as if it were a joint distribution
        results = {}
        for d1_states, d1_values in self_data.iterrows():
            for d2_states, d2_values in other_data.iterrows():
                if isinstance(d1_states, tuple):
                    k1 = [x for x in d1_states]
                else:
                    k1 = [d1_states]
                if isinstance(d2_states, tuple):
                    k2 = [x for x in d2_states]
                else:
                    k2 = [d2_states]
                key = tuple(k1 + k2)
                results[key] = d1_values * d2_values
        data = DataFrame(results).T
        data.index.names = (list(self_data.index.names) +
                            list(other_data.index.names))
        data = data.reorder_levels(sorted(data.index.names), axis=0)
        data = data.reorder_levels(sorted(data.columns.names), axis=1)
        new_joints = list(data.index.names)
        new_conds = list(data.columns.names)
        new_states = {
            variable: (self._states[variable] if variable
                       in self._states.keys() else other._states[variable])
            for variable in set(new_joints + new_conds)
        }
        return Conditional(data=data,
                           joint_variables=new_joints,
                           conditional_variables=new_conds,
                           states=new_states)
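
Unlike most examples on this page, this one reorders levels on both axes: axis=0 for the joint-variable row index and axis=1 for the conditional-variable column index, so that variable names end up in a deterministic, sorted order. A minimal sketch of reorder_levels(..., axis=1) on a hypothetical two-level column index:

import pandas as pd

cols = pd.MultiIndex.from_product([["x1", "x2"], ["a", "b"]], names=["B", "A"])
df = pd.DataFrame([[1, 2, 3, 4]], columns=cols)

# sort the names of the column levels; each column keeps its position
df = df.reorder_levels(sorted(df.columns.names), axis=1)
print(list(df.columns.names))   # ['A', 'B']
print(df.columns.tolist())      # [('a', 'x1'), ('b', 'x1'), ('a', 'x2'), ('b', 'x2')]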
Example #17
def pivot_df(  # pylint: disable=too-many-locals, too-many-arguments, too-many-statements, too-many-branches
    df: pd.DataFrame,
    rows: List[str],
    columns: List[str],
    metrics: List[str],
    aggfunc: str = "Sum",
    transpose_pivot: bool = False,
    combine_metrics: bool = False,
    show_rows_total: bool = False,
    show_columns_total: bool = False,
    apply_metrics_on_rows: bool = False,
) -> pd.DataFrame:
    metric_name = f"Total ({aggfunc})"

    if transpose_pivot:
        rows, columns = columns, rows

    # to apply the metrics on the rows we pivot the dataframe, apply the
    # metrics to the columns, and pivot the dataframe back before
    # returning it
    if apply_metrics_on_rows:
        rows, columns = columns, rows
        axis = {"columns": 0, "rows": 1}
    else:
        axis = {"columns": 1, "rows": 0}

    # pivot data; we'll compute totals and subtotals later
    if rows or columns:
        # pivoting with null values will create an empty df
        df = df.fillna("NULL")
        df = df.pivot_table(
            index=rows,
            columns=columns,
            values=metrics,
            aggfunc=pivot_v2_aggfunc_map[aggfunc],
            margins=False,
        )
    else:
        # if there's no rows nor columns we have a single value; update
        # the index with the metric name so it shows up in the table
        df.index = pd.Index([*df.index[:-1], metric_name], name="metric")

    # if no rows were passed the metrics will be in the rows, so we
    # need to move them back to columns
    if columns and not rows:
        df = df.stack()
        if not isinstance(df, pd.DataFrame):
            df = df.to_frame()
        df = df.T
        df = df[metrics]
        df.index = pd.Index([*df.index[:-1], metric_name], name="metric")

    # combining metrics changes the column hierarchy, moving the metric
    # from the top to the bottom, eg:
    #
    # ('SUM(col)', 'age', 'name') => ('age', 'name', 'SUM(col)')
    if combine_metrics and isinstance(df.columns, pd.MultiIndex):
        # move metrics to the lowest level
        new_order = [*range(1, df.columns.nlevels), 0]
        df = df.reorder_levels(new_order, axis=1)

        # sort columns, combining metrics for each group
        decorated_columns = [(col, i) for i, col in enumerate(df.columns)]
        grouped_columns = sorted(decorated_columns,
                                 key=lambda t: get_column_key(t[0], metrics))
        indexes = [i for col, i in grouped_columns]
        df = df[df.columns[indexes]]
    elif rows:
        # if metrics were not combined we sort the dataframe by the list
        # of metrics defined by the user
        df = df[metrics]

    # compute fractions, if needed
    if aggfunc.endswith(" as Fraction of Total"):
        total = df.sum().sum()
        df = df.astype(total.dtypes) / total
    elif aggfunc.endswith(" as Fraction of Columns"):
        total = df.sum(axis=axis["rows"])
        df = df.astype(total.dtypes).div(total, axis=axis["columns"])
    elif aggfunc.endswith(" as Fraction of Rows"):
        total = df.sum(axis=axis["columns"])
        df = df.astype(total.dtypes).div(total, axis=axis["rows"])

    # convert to a MultiIndex to simplify logic
    if not isinstance(df.index, pd.MultiIndex):
        df.index = pd.MultiIndex.from_tuples([(str(i), ) for i in df.index])
    if not isinstance(df.columns, pd.MultiIndex):
        df.columns = pd.MultiIndex.from_tuples([(str(i), )
                                                for i in df.columns])

    if show_rows_total:
        # add subtotal for each group and overall total; we start from the
        # overall group, and iterate deeper into subgroups
        groups = df.columns
        for level in range(df.columns.nlevels):
            subgroups = {group[:level] for group in groups}
            for subgroup in subgroups:
                slice_ = df.columns.get_loc(subgroup)
                subtotal = pivot_v2_aggfunc_map[aggfunc](df.iloc[:, slice_],
                                                         axis=1)
                depth = df.columns.nlevels - len(subgroup) - 1
                total = metric_name if level == 0 else "Subtotal"
                subtotal_name = tuple([*subgroup, total, *([""] * depth)])
                # insert column after subgroup
                df.insert(int(slice_.stop), subtotal_name, subtotal)

    if rows and show_columns_total:
        # add subtotal for each group and overall total; we start from the
        # overall group, and iterate deeper into subgroups
        groups = df.index
        for level in range(df.index.nlevels):
            subgroups = {group[:level] for group in groups}
            for subgroup in subgroups:
                slice_ = df.index.get_loc(subgroup)
                subtotal = pivot_v2_aggfunc_map[aggfunc](
                    df.iloc[slice_, :].apply(pd.to_numeric), axis=0)
                depth = df.index.nlevels - len(subgroup) - 1
                total = metric_name if level == 0 else "Subtotal"
                subtotal.name = tuple([*subgroup, total, *([""] * depth)])
                # insert row after subgroup
                df = pd.concat([
                    df[:slice_.stop],
                    subtotal.to_frame().T, df[slice_.stop:]
                ])

    # if we want to apply the metrics on the rows we need to pivot the
    # dataframe back
    if apply_metrics_on_rows:
        df = df.T

    return df
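
The combine_metrics branch is the central use of reorder_levels here: after pivot_table the metric sits at the top of the column hierarchy, and reorder_levels([*range(1, nlevels), 0], axis=1) rotates it to the bottom. A self-contained sketch with made-up data:

import pandas as pd

df = pd.DataFrame({
    "name": ["x", "y", "x", "y"],
    "age": [20, 20, 30, 30],
    "SUM(col)": [1, 2, 3, 4],
})

# after pivoting, the metric is the top level of the column MultiIndex
pivoted = df.pivot_table(index=["name"], columns=["age"], values=["SUM(col)"],
                         aggfunc="sum")
print(pivoted.columns.tolist())   # [('SUM(col)', 20), ('SUM(col)', 30)]

# move the metric from the top to the bottom level, as pivot_df does
new_order = [*range(1, pivoted.columns.nlevels), 0]
pivoted = pivoted.reorder_levels(new_order, axis=1)
print(pivoted.columns.tolist())   # [(20, 'SUM(col)'), (30, 'SUM(col)')]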
Example #18
File: io.py Project: greole/owls
def import_logs(folder, search, keys):
    """
        keys = {"ExectionTime": ["ExecTime", "ClockTime"]}

        return a DataFrame

              Loc, Time KeyName1 Keyname2
                1   0.1

                    0.2
                2


    """
    def find_start(log):
        """ Fast forward through file till 'Starting time loop' """
        for i, line in enumerate(log):
            if "Starting time loop" in line:
                return i


    def extract(line, keys):
        """
            returns key and values as a list, e.g.
                "ExecutionTime": [0, 1]
        """
        import re
        for key, col_names in keys.items():
            if re.search(key, line):
                return col_names, list(
                    map(float, filter(lambda x: x,
                        re.findall(r"[0-9]+[.]?[0-9]*[e]?[\-]?[0-9]*", line))))
        return None, None

    fold, dirs, files = next(os.walk(folder))
    logs = [fold + "/" + log for log in files if search in log]
    p_bar = ProgressBar(n_tot = len(logs))
    # Let's make sure that we find timesteps in the log
    keys.update({"^Time = ": ['Time']})

    for log_number, log_name in enumerate(logs):
        with open(log_name, encoding="utf-8") as log:
            f = log.readlines()
            start = find_start(f)
            dataDict = defaultdict(list)
            df = DataFrame()
            for line in f[start:-1]:
                col_names, values = extract(line, keys)
                if not col_names:
                    continue
                if col_names[0] == 'Time':
                    # a new time step has begun:
                    # flush dataDict and concat it onto df.
                    # Very slow, but so far the only way found to keep
                    # subiterations attached to the correct time step.
                    # FIXME: still needs handling of different-length dictionaries
                    df = concat([df, DataFrame(dataDict)])
                    dataDict = defaultdict(list)
                for i, col in enumerate(col_names):
                    dataDict[col].append(values[i])
        p_bar.next()
        try:
            df.index = range(len(df))
            df.index.names = ['Id']
            df['Loc'] = log_number
            df.set_index('Time', append=True, inplace=True)
            df.set_index('Loc', append=True, inplace=True)
            df = df.reorder_levels(['Loc', 'Time', 'Id'])
            p_bar.done()
        except Exception as e:
            print(log_name)
            print("failed to process")
            print(e)
            return {}, None
    return {}, DataFrame()
Example #19
File: io.py Project: greole/owls
def read_data_file(fn, skiplines=1, maxlines=False, p_bar=None):
    """  A function to read any foam data files returning data and
         index after header
    """

    # TODO check if sorting the index gives any performance benefits
    # print "opening file {}".format(fn)
    if not os.path.exists(fn):
        print("Can not open file " + fn)
        return None
    try:
        with open(fn, encoding="utf-8") as f:
            field = fn.split('/')[-1]
            content = f.readlines()
            content.append('bla')
            start, num_entries = if_header_skip(content)
            entries = len(content[start].split())
            is_a_vector = (True if entries > 1 else False)
            end = start + num_entries
            # FIXME this fails for eulerian/lagrangian vector fields
            # since no positional entry is produced
            if isinstance(p_bar, ProgressBar):
                p_bar.next()
            if is_a_vector:
                data = list(map(lambda x: re.sub(r"[0-9]*\(|\)", '', x).split(),
                            content[start:end:skiplines]))
                loc, names = evaluate_names(fn, entries)
                df = DataFrame(data=data, columns=names)
                if loc:
                    df['Loc'] = loc
                else:
                    df['Loc'] = range(len(df))
                if "Pos" in df:
                    df.set_index('Loc', append=False, inplace=True)
                    df["Pos"] = df["Pos"].astype(float)
                    df.set_index('Pos', append=True, inplace=True)
                else:
                    # if no pos is available we have either
                    # an eulerian or lagrangian field
                    df.set_index('Loc', append=True, inplace=True)
                    df.index.names = ['Pos', 'Loc']
                    df = df.reorder_levels(['Loc', 'Pos'])
                df = df.astype(float)
                hashes = {}
                for row in df.columns:
                    hashes.update({row: hash_series(df[row])})
                return names, df, hashes
            # Data files with a single row are seen as Eulerian or Lagrangian fields
            else:
                data = [np.float32(x) for x in content[start:end:skiplines]]
                entries = 1
                df = DataFrame(data=data, columns=[field])
                df['Loc'] = "Field"
                df.set_index('Loc', append=True, inplace=True)
                df.index.names=['Pos', 'Loc']
                df = df.reorder_levels(['Loc', 'Pos'])
                if HASH_RESULTS:
                    hashes = {field: int(hashlib.md5(str(data).encode('utf-8')).hexdigest(),16)}
                else:
                    hashes = {field: 0}
                return field, df, hashes
    except Exception as e:
        if DEBUG:
            print("Error processing datafile " + fn)
            print(e)
        return None
Example #20
def import_foam_folder(
        path,
        search,
        files,
        skiplines=1,
        maxlines=0,
        skiptimes=1,
        exclude=None,
        times_slice=None,
        ):
    """ returns a Dataframe for every file in fileList """
    #import StringIO
    fileList = find_datafiles(
        path, search=search, files=files,
        exclude=exclude, times_slice=times_slice
    )
    if not fileList:
        print("no files found")
        return None, DataFrame()
    p_bar = ProgressBar(n_tot=sum([len(l) for l in fileList.values()]))
    df = DataFrame()
    #df.index = MultiIndex.from_tuples(zip([],[]),names=['Loc',0])
    from collections import defaultdict
    origins = Origins()
    els = list(fileList.items())[::skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()
        for fn in files:
            #ret = read_table(StringIO.StringIO(foam_to_csv(fn)))
            ret = read_data_file(fn, skiplines, maxlines)
            p_bar.next()
            if not ret or ret[1].empty:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    df_tmp = df_tmp.combine_first(x)
                except Exception as e:
                    print("failed to concat: ",
                            df_tmp, "and", x, "new_loc ",
                            x.index.levels[0][0], " existing_locs ",
                            df_tmp.index.levels[0] )
                    print(e)
            field_names = ([field_names] if not type(field_names) == list else field_names)
            for field in field_names:
                if field == "Pos":
                    continue
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            df = df.append(df_tmp)
    df.set_index('Time', append=True, inplace=True)
    if not "Loc" in  df.index.names:
        print(df)
        # df = df.reorder_levels(['Time', ])
    else:
        df = df.reorder_levels(['Time', 'Loc', 'Pos'])
    p_bar.done()
    return origins, df
Example #21
def reorder_pairs(pair_df: pd.DataFrame,
                  num_stars: int,
                  parameters: Dict[Union[str, int], Tuple[Union[float, str]]],
                  df: pd.DataFrame,
                  delta_h: float,
                  report_verbose: bool,
                  xmax: np.ndarray,
                  xmin: np.ndarray,
                  offline_mode: bool
                  ) -> pd.DataFrame:

    """
    Calculates the differences ('h') between the pairings of the star points, then
    bins and reorders the pair DataFrame according to the calculated 'h' values

    Parameters
    ----------
    pair_df : pd.DataFrame
        Pandas DataFrame containing the paired star points values with the model outputs
    num_stars : int
        number of star samples
    parameters : dictionary
        dictionary containing parameter names and their attributes
    df : pd.DataFrame
        Pandas DataFrame containing the star points, and model outputs
    delta_h : float
        resolution of star samples
    report_verbose : boolean
        if True will use a loading bar when generating stars, does nothing if False
    xmax : arraylike
        array containing max boundary of each parameter
    xmin : arraylike
        array containing min boundary of each parameter
    offline_mode : boolean
        if True GVARS analysis is in offline mode, if False it is in online mode

    Returns
    -------
    pair_df : pd.DataFrame
        the reordered DataFrame of paired values
    """

    # for loading bar when calculating differences in values 'h'
    if report_verbose:
        star_centres = tqdm(range(0, num_stars), desc='calculating \'h\' values')
    else:
        star_centres = range(0, num_stars)

    # gather the actual 'h' differences between each star point value for every pair
    # possibly find a faster way to do this later
    dist_list = []
    for star_centre in star_centres:
        param_num = 0
        for param in parameters.keys():
            # check for offline on online mode as index changes for df
            if offline_mode:
                pairs = pairs_h(df.loc[star_centre, param][str(param_num)].index.get_level_values(-1))
            else:
                pairs = pairs_h(df.loc[star_centre, param][param_num].index.get_level_values(-1))
            for ignore, idx in pairs.items():
                for idx_tup in idx:
                    if offline_mode:
                        dist_list.append(np.abs((df.loc[star_centre, param][str(param_num)][idx_tup[0]] -
                                                 df.loc[star_centre, param][str(param_num)][idx_tup[1]]) / (
                                                            xmax[param_num] - xmin[param_num])))
                    else:
                        dist_list.append(np.abs((df.loc[star_centre, param][param_num][idx_tup[0]] -
                                                 df.loc[star_centre, param][param_num][idx_tup[1]]) / (
                                                            xmax[param_num] - xmin[param_num])))

            param_num += 1

    # loading bar for binning and reordering pairs based on new 'h' values
    if report_verbose:
        pairs_pbar = tqdm(desc='binning and reordering pairs based on \'h\' values', total=2, dynamic_ncols=True)

    # add new distances to dataframe
    pair_df['actual h'] = dist_list

    # create bin ranges
    num_bins = int(1 / delta_h)  # the number of bins created by delta h
    bins = np.zeros(num_bins + 1)
    bins[1:] = np.arange(start=delta_h / 2, step=delta_h, stop=1)  # create middle bin ranges

    # create labels for the bin ranges which will be the actual delta h values
    labels = np.zeros(num_bins)
    labels[0] = delta_h / 4
    labels[1:] = np.arange(start=delta_h, step=delta_h, stop=1)

    # bin pair values according to their distances 'h' for each parameter at each star centre
    binned_pairs = []
    for star_centre in range(0, num_stars):
        for param in parameters.keys():
            binned_pairs.append(
                pd.cut(pair_df.loc[star_centre, param, :]['actual h'], bins=bins, labels=labels).sort_values())

    # put the binned pairs into a pandas Series
    binned_pairs = pd.concat(binned_pairs, ignore_index=False)

    if report_verbose:
        pairs_pbar.update(1)

    # reorder pair values according to the bins
    pair_df = pair_df.loc[binned_pairs.index]

    # add in the new index h, according to the bin ranges
    # e.g. h = 0.1 = [0, 0.15], h = 0.2 = [0.15, 0.25]
    h = list(binned_pairs.values)
    pair_df['h'] = h

    # format the data frame so that it works properly with the variogram analysis functions
    pair_df.set_index('h', append=True, inplace=True)
    pair_df.set_index('actual h', append=True, inplace=True)

    pair_df = pair_df.reorder_levels(['centre', 'param', 'h', 'actual h', 'pair_ind'])

    if report_verbose:
        pairs_pbar.update(1)
        pairs_pbar.close()

    return pair_df
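
The index manipulation at the end of reorder_pairs is: bin the computed distances with pd.cut, reorder the rows to follow the binned order, push both the binned and the actual 'h' values into the index, and move the levels into the order the variogram code expects. A reduced sketch with a hypothetical pair DataFrame (three pairs, one star centre, one parameter):

import numpy as np
import pandas as pd

idx = pd.MultiIndex.from_tuples(
    [(0, "p1", 0), (0, "p1", 1), (0, "p1", 2)],
    names=["centre", "param", "pair_ind"])
pair_df = pd.DataFrame({"value": [0.2, 0.5, 0.9],
                        "actual h": [0.08, 0.31, 0.12]}, index=idx)

delta_h = 0.1
num_bins = int(1 / delta_h)
bins = np.zeros(num_bins + 1)
bins[1:] = np.arange(start=delta_h / 2, step=delta_h, stop=1)
labels = np.zeros(num_bins)
labels[0] = delta_h / 4
labels[1:] = np.arange(start=delta_h, step=delta_h, stop=1)

# bin by distance, reorder the rows by bin, then index by ('h', 'actual h')
binned = pd.cut(pair_df["actual h"], bins=bins, labels=labels).sort_values()
pair_df = pair_df.loc[binned.index]
pair_df["h"] = list(binned.values)
pair_df.set_index("h", append=True, inplace=True)
pair_df.set_index("actual h", append=True, inplace=True)
pair_df = pair_df.reorder_levels(["centre", "param", "h", "actual h", "pair_ind"])
print(list(pair_df.index.names))   # ['centre', 'param', 'h', 'actual h', 'pair_ind']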
Example #23
File: io.py Project: greole/owls
def import_foam_folder(
        path,
        search,
        files,
        skiplines=1,
        maxlines=0,
        skiptimes=slice(0,None),
        exclude=None,
        times_slice=None,
        ):
    """ returns a Dataframe for every file in fileList """
    #import StringIO
    fileList = find_datafiles(
        path, search=search, files=files,
        exclude=exclude, times_slice=times_slice
    )
    if not fileList:
        print("no files found")
        return None, DataFrame()
    p_bar = ProgressBar(n_tot=sum([len(l) for l in fileList.values()]))
    df = DataFrame()
    #df.index = MultiIndex.from_tuples(zip([],[]),names=['Loc',0])
    from collections import defaultdict
    origins = Origins()
    els = list(fileList.items())[skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()

        # for fn in files:
        #     #ret = read_table(StringIO.StringIO(foam_to_csv(fn)))
        #     ret = read_data_file(fn, skiplines, maxlines)
        #     p_bar.next()

        args = [(fn, skiplines, maxlines, p_bar) for fn in files]
        if MULTIPROCESS:
            with multiprocessing.Pool(processes=MULTIPROCESS) as pool:
                rets = pool.map(read_data_file_args, args)
        else:
            rets = map(read_data_file_args, args)


        for fn, ret in zip(files, rets):
            if not ret or ret[1].empty:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    df_tmp = df_tmp.combine_first(x)
                except Exception as e:
                    print("failed to concat: ",
                            df_tmp, "and", x, "new_loc ",
                            x.index.levels[0][0], " existing_locs ",
                            df_tmp.index.levels[0] )
                    print(e)
            field_names = ([field_names] if not type(field_names) == list else field_names)
            for field in field_names:
                if field == "Pos":
                    continue
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            df = df.append(df_tmp)
    df.set_index('Time', append=True, inplace=True)
    if not "Loc" in  df.index.names:
        print(df)
        # df = df.reorder_levels(['Time', ])
    else:
        df = df.reorder_levels(['Time', 'Loc', 'Pos'])
    p_bar.done()
    return origins, df
Example #25
File: io.py Project: ALGe9/owls
def import_foam_folder(
        path,
        search,
        files,
        skiplines=1,
        maxlines=0,
        skiptimes=1,
        exclude=None
        ):
    """ returns a Dataframe for every file in fileList """
    #import StringIO
    from pandas import concat
    fileList = find_datafiles(
        path, search=search, files=files, exclude=exclude)
    if not fileList:
        print("no files found")
        return
    p_bar = ProgressBar(n_tot=sum([len(l) for l in fileList.values()]))
    df = DataFrame()
    #df.index = MultiIndex.from_tuples(zip([],[]),names=['Loc',0])
    from collections import defaultdict
    origins = Origins()
    els = list(fileList.items())[::skiptimes]
    for fullpath, files in els:
        time = strip_time(fullpath, path)
        df_tmp = DataFrame()
        for fn in files:
            #ret = read_table(StringIO.StringIO(foam_to_csv(fn)))
            ret = read_data_file(fn, skiplines, maxlines)
            p_bar.next()
            if not ret:
                continue
            field_names, x, hashes = ret
            loc = x.index.values[-1][0]
            if df_tmp.empty:
                df_tmp = x
            else:
                try:
                    # use combine first for all df at existing Loc or
                    # if not Loc is specified (Eul or Lag fields)
                    if x.index.levels[0][0] in df_tmp.index.levels[0]:
                        df_tmp = df_tmp.combine_first(x)
                        #df_tmp = concat([df_tmp, x], axis=1)
                        pass
                    else:
                        df_tmp = concat([df_tmp, x])
                except Exception as e:
                    print(x)
                    print(e)
            field_names = ([field_names] if not type(field_names) == list else field_names)
            for field in field_names:
                origins.insert(time, loc, field, fn, hashes[field])
        df_tmp['Time'] = time
        if df.empty:
            df = df_tmp
        else:
            df = df.append(df_tmp)
    df.set_index('Time', append=True, inplace=True)
    df = df.reorder_levels(['Time','Loc','Id'])
    p_bar.done()
    return origins, df