Example 1
def mk_terms_df(df, text_cols, id_cols=None, tokenizer_re=tokenizer_re):
    text_cols = util_ulist.ascertain_list(text_cols)
    if id_cols is None:
        id_cols = colloc.setdiff(df.columns, text_cols)
    else:
        id_cols = util_ulist.ascertain_list(id_cols)
        id_cols_missing = colloc.setdiff(id_cols, df.columns)
        if id_cols_missing:  # if any columns are missing, try to get them from named index
            df = df.reset_index(id_cols_missing)
    dd = pd.DataFrame()
    for c in text_cols:
        d = df[id_cols].copy()  # copy to avoid writing into a slice of df
        d['term'] = [re.findall(tokenizer_re, x) for x in df[c]]
        d = daf_manip.rollout_cols(d, cols_to_rollout='term')
        dd = pd.concat([dd, d])
    return dd
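Every example on this page funnels its column arguments through ascertain_list, which normalizes a bare value into a one-element list and passes lists through unchanged. A minimal stand-in with those assumed semantics (the real util_ulist.ascertain_list may accept more iterable types):

def ascertain_list(x):
    # minimal sketch of ut's util_ulist.ascertain_list (assumed semantics):
    # wrap a bare value in a list, pass lists through unchanged
    return x if isinstance(x, list) else [x]

assert ascertain_list('term') == ['term']
assert ascertain_list(['a', 'b']) == ['a', 'b']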
Example 2
 def __init__(self, obj):
     for k, v in obj.__dict__.iteritems():
         setattr(self, k, v)
     if not hasattr(self, 'data_dependencies'):
         self.data_dependencies = dict()
     if not hasattr(self, 'data_makers'):
         self.data_makers = dict()
     if not hasattr(self, 'data_storers'):
         self.data_storers = dict()
     # make sure values of data_dependencies are lists
     self.data_dependencies = {k: util_ulist.ascertain_list(v) for k, v in self.data_dependencies.iteritems()}
     # default data_makers to functions of the same name as data_dependencies
     missing_data_makers = set(self.data_dependencies.keys()).difference(self.data_makers.keys())
     for k in missing_data_makers:
         if hasattr(self, k):
             k_attr = getattr(self, k)
             if isinstance(k_attr, pd.HDFStore):  # if k_attr is a store
                 self.data_makers[k] = StoreDataGetter(store=k_attr, key=k)
             elif callable(k_attr):  # if k_attr is callable (method, function, ...)
                 self.data_makers[k] = CallDataGetter(fun=k_attr)
             else:  # otherwise assume there is, or will be, an attribute of that name, and have the data_maker return it
                 self.data_makers[k] = AttrDataGetter(obj=self, attr=k)
     if not hasattr(self, 'verbose_level'):
         self.verbose_level = 1
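StoreDataGetter, CallDataGetter, and AttrDataGetter are not shown on this page; minimal sketches consistent with how they are constructed above, under assumed call-to-fetch semantics:

class CallDataGetter(object):
    # assumed semantics: wraps a callable and calls it on demand
    def __init__(self, fun):
        self.fun = fun

    def __call__(self):
        return self.fun()


class AttrDataGetter(object):
    # assumed semantics: fetches obj.<attr> lazily, so the attribute
    # may be (re)assigned after construction and still be picked up
    def __init__(self, obj, attr):
        self.obj = obj
        self.attr = attr

    def __call__(self):
        return getattr(self.obj, self.attr)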
Example 3
File: pot.py Project: yz-/ut
 def project_to(self, var_list=[]):
     """
     project to a subset of variables (marginalize out other variables)
     """
     var_list = colloc.intersect(ascertain_list(var_list), self.vars())
     if var_list:  # if non-empty, marginalize out other variables
         return Pot(self.tb[var_list + ['pval']].groupby(var_list).sum().reset_index())
     else:  # if var_list is empty, return a singleton potential containing the sum of the vals of self.tb
         return Pot(pd.DataFrame({'pval': self.tb['pval'].sum()}, index=['']))
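The groupby-sum above is plain marginalization of a potential table. A self-contained sketch of the same operation in pandas alone, with a hypothetical potential over variables A and B ('pval' holds the potential values, as in the Pot class):

import pandas as pd

tb = pd.DataFrame({'A': [0, 0, 1, 1],
                   'B': [0, 1, 0, 1],
                   'pval': [0.1, 0.2, 0.3, 0.4]})
# project to A, i.e. marginalize out B: sum pval within each value of A
marginal = tb[['A', 'pval']].groupby(['A']).sum().reset_index()
print(marginal)  # pval is 0.3 for A=0 and 0.7 for A=1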
Example 4
def add_bpx_col(df, groupby_keys=[]):
    groupby_keys = ulist.ascertain_list(groupby_keys) + [
        'kw_stripped_and_lowered'
    ]
    df['kw_stripped_and_lowered'] = pstr_trans.lower(
        aw_manip.strip_kw(df['keyword']))
    dg = df.groupby(groupby_keys, group_keys=False).apply(_bpx_tag)
    del dg['kw_stripped_and_lowered']
    return dg
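strip_kw and _bpx_tag are not shown here; the point of the helper column is that keywords normalize to a common key before grouping. A self-contained sketch of that normalize-then-group step, with plain str methods standing in for pstr_trans.lower and aw_manip.strip_kw:

import pandas as pd

df = pd.DataFrame({'keyword': [' Red Shoes', 'red shoes ', 'hats']})
df['kw_stripped_and_lowered'] = df['keyword'].str.strip().str.lower()
# rows whose keywords normalize to the same string fall into the same group
print(df.groupby('kw_stripped_and_lowered').size())  # hats: 1, red shoes: 2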
Example 5
File: diagnosis.py Project: yz-/ut
def mk_fanout_score_df(df, fromVars, toVars, statVars=None, keep_statVars=False):
    fromVars = ascertain_list(fromVars)
    toVars = ascertain_list(toVars)
    if statVars is None:
        statVars = list(set(df.columns) - set(fromVars + toVars))
    statVars = ascertain_list(statVars)
    # make a dataframe with all same fromVars+toVars aggregated (summing the statVars)
    agg_df = df[fromVars + toVars + statVars].groupby(fromVars + toVars, as_index=False).sum()
    # group agg_df by fromVars, keeping only fromVars+statVars
    agg_df_gr = agg_df[fromVars+statVars].groupby(fromVars)
    # compute the sum-normalize values of every group
    agg_df_freq = agg_df_gr.transform(group_normalized_freq).add_suffix('_freq_fanout_ratio')
    # compute the inverse of the group sizes
    agg_df_count = agg_df_gr.agg(group_normalized_count).add_suffix('_count_fanout_ratio')
    d = agg_df.join(agg_df_freq)
    if not keep_statVars:
        d = d.drop(statVars, axis=1)
    d = d.join(agg_df_count, on=fromVars)
    return d
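group_normalized_freq and group_normalized_count are defined elsewhere in the module; given the '_freq_fanout_ratio' suffix, a plausible reading is sum-normalization within each fromVars group. A self-contained sketch of that transform pattern (the function body here is an assumption, not the module's definition):

import pandas as pd

def group_normalized_freq(x):
    # assumed semantics: each value divided by its group's total
    return x / float(x.sum())

df = pd.DataFrame({'frm': ['a', 'a', 'b'], 'n': [2, 6, 5]})
freq = df.groupby('frm')[['n']].transform(group_normalized_freq).add_suffix('_freq_fanout_ratio')
print(df.join(freq))  # the 'a' rows get 0.25 and 0.75, the 'b' row gets 1.0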
Example 6
File: op.py Project: yz-/ut
def rep_tags(df, rep_cols, with_cols, name_to_tag_fun=None):
    """
    Replaces tags (specified by with_cols and the name_to_tag_fun) of the strings of rep_cols with the values of the
    with_cols columns of df.
    """
    # process inputs
    df = df.copy()
    rep_cols = util_ulist.ascertain_list(rep_cols)
    with_cols = util_ulist.ascertain_list(with_cols)
    if name_to_tag_fun is None:
        name_to_tag_fun = lambda x: name_to_tag(x, tag_str_format='#{%s}')
    tag_exp_with_col = \
        [{'with': name,
          'tag_exp': re.compile(name_to_tag_fun(name))}
         for name in with_cols]
    # go through all rep_cols and replace tags with the value in the with_cols
    for r in rep_cols:
        for t in tag_exp_with_col:
            w = t['with']
            tag_exp = t['tag_exp']
            print w, tag_exp.pattern
            df[r] = [tag_exp.sub(x, y) for x, y in zip(df[w], df[r])]
    return df
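A self-contained sketch of the per-row tag substitution that the inner loop performs, using the default '#{name}' tag format from name_to_tag_fun above (data is hypothetical):

import re
import pandas as pd

df = pd.DataFrame({'greeting': ['hello #{name}!', 'bye #{name}.'],
                   'name': ['Ana', 'Bo']})
tag_exp = re.compile(re.escape('#{name}'))
# replace each row's tag with that same row's 'name' value
df['greeting'] = [tag_exp.sub(n, g) for n, g in zip(df['name'], df['greeting'])]
print(df['greeting'].tolist())  # ['hello Ana!', 'bye Bo.']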
Example 7
File: addcol.py Project: yz-/ut
def group_count(df, gr_cols=None, count_col=None, keep_order=True):
    """
    adds a column containing the count of the number of groups (defined by the gr_cols columns)
    """
    gr_cols = gr_cols or df.columns
    count_col = count_col or ut.daf.get.free_col_name(df, ['count', 'gr_count'])
    if keep_order:
        df = df.copy()
        df['column_to_keep_original_order'] = range(len(df))
    gr_cols = ulist.ascertain_list(gr_cols)
    gr_df = ut.daf.gr.group_and_count(df[gr_cols], count_col=count_col)
    df = df.merge(gr_df, left_on=gr_cols, right_on=gr_cols)
    if keep_order:
        df.sort_values(by='column_to_keep_original_order', inplace=True)
        del df['column_to_keep_original_order']
    return df
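ut's group_and_count is not shown; assuming it is a per-group row count, the merge-based pattern above can be reproduced with a single transform in plain pandas, which also preserves row order by construction:

import pandas as pd

df = pd.DataFrame({'g': ['a', 'b', 'a', 'a']})
# count of each row's group, aligned back onto the original rows
df['count'] = df.groupby('g')['g'].transform('size')
print(df['count'].tolist())  # [3, 1, 3, 3]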
Example 8
File: pstore.py Project: yz-/ut
def get_info_df(store, keys=None, info=None, cols=None):
    # process inputs
    if not keys:
        keys = store.keys()
    else:
        keys = util_ulist.ascertain_list(keys)
        keys = colloc.intersect(keys, store.keys())
    # get info_dict
    info_dict = get_info_dict(store)
    # make the df
    df = pd.DataFrame([dict(v, **{'key': k}) for k, v in info_dict.iteritems()])
    df = df[df['key'].isin(keys)]
    if 'shape' in df.columns:
        del df['shape']
    if 'ncols' not in df.columns:
        df['ncols'] = np.nan
    if 'nrows' not in df.columns:
        df['nrows'] = np.nan
    # get ncols and nrows with missing
    idx = np.flatnonzero(df['ncols'].isnull())  # ncols and nrows should both be missing when one is
    for i in idx:
        d = store[df['key'].iloc[i]]
        df.loc[df.index[i], 'nrows'] = len(d)
        df.loc[df.index[i], 'ncols'] = len(d.columns)
    # clean up and return
    df = df.set_index('key')
    df = df.sort_index()
    df = daf_manip.reorder_columns_as(df, ['nrows', 'ncols', 'isa', 'typ', 'indexers', 'dc'])
    df = df.replace(to_replace=np.nan, value='')
    if info:
        if isinstance(info, dict):
            # add as many columns as there are keys in dict, using the values of the dict as functions applied to
            # the whole stored dataframe to get the column value
            df = pd.concat([df, pd.DataFrame(columns=info.keys(), index=df.index)], axis=1)
            for key in df.index.values:
                key_data = store[key]
                for k, v in info.iteritems():
                    df.loc[key, k] = v(key_data)
        elif np.all(map(lambda x: isinstance(x, basestring), info)):
            df = daf_manip.filter_columns(df, info)
        else:
            raise ValueError('Unrecognized info format')
    # filter cols
    if cols:
        df = daf_manip.filter_columns(df, cols)
    return df
Example 9
 def mk_data_flow(self):
     # make sure values of data_dependencies are lists
     self.data_dependencies = {k: util_ulist.ascertain_list(v) for k, v in self.data_dependencies.iteritems()}
     # default data_makers to functions of the same name as data_dependencies
     missing_data_makers = set(self.data_dependencies.keys()).difference(self.data_makers.keys())
     bundles = list()
     for k in missing_data_makers:
         if hasattr(self, k):
             self.data_makers[k] = getattr(self, k)
         else:
             bundles.append(k)
     if not hasattr(self, 'verbose_level'):
         setattr(self, 'verbose_level', 1)
     if bundles:
         print("Bundles:")
         for k in bundles:
             print("  {}: \n    {}").format(k, ', '.join(self.data_dependencies[k]))
Example 10
File: manip.py Project: yz-/ut
def rollout_cols(df, cols_to_rollout=None):
    """
    rolls out the values of cols_to_rollout so that each individual list (or other iterable) element is on its own
    row, with the other (non-rollout) values aligned with them as in the original dataframe.
    Example:
    df =
        A   B
        1   [11,111]
        2   [22]
        3   [3,33,333]
    rollout_cols(df, cols_to_rollout='B') =
        A   B
        1   11
        1   111
        2   22
        3   3
        3   33
        3   333
    """
    # if no cols_to_rollout is given, (try to) rollout all columns that are iterable (lists, etc.)
    cols_to_rollout = cols_to_rollout or daf_diagnosis.cols_that_are_of_the_type(df, util_var.is_an_iter)
    # make sure cols_to_rollout is a list
    cols_to_rollout = util_ulist.ascertain_list(cols_to_rollout)
    # get non_rollout_columns
    non_rollout_columns = colloc.setdiff(df.columns, cols_to_rollout)
    # make an array with the lengths of the lists to roll out (taken from the first rollout column, assuming that
    # all cols_to_rollout have the same list lengths)
    rollout_lengths = np.array(df[cols_to_rollout[0]].apply(len))
    # create the rollout_df dataframe (this will be the output): empty, but with the right
    # number of rows, so that columns can be assigned into it below
    rollout_df = pd.DataFrame(index=np.arange(np.sum(rollout_lengths)))
    # rollout cols_to_rollout
    for c in cols_to_rollout:
        rollout_df[c] = np.concatenate(list(df[c]))
    # replicate the non-rollout columns so they align with the rolled-out rows
    for c in non_rollout_columns:
        t = [np.tile(x, (y, 1)) for (x, y) in zip(df[c], rollout_lengths)]
        try:
            rollout_df[c] = np.concatenate(t)
        except ValueError:
            rollout_df[c] = [x for x in chain(*t)]
    # put the columns in their original order
    return rollout_df[df.columns]
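The docstring example can be reproduced with numpy's repeat and concatenate, which is essentially what the two loops above do; a self-contained sketch:

import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3], 'B': [[11, 111], [22], [3, 33, 333]]})
lengths = df['B'].apply(len).values
rolled = pd.DataFrame({
    'A': np.repeat(df['A'].values, lengths),  # replicate A once per list element
    'B': np.concatenate(df['B'].tolist()),    # flatten the lists end to end
})
print(rolled)  # six rows, A aligned with each element of B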
Example 11
File: pstore.py Project: yz-/ut
def copy_data(from_store, to_store, from_keys, overwrite=False):
    '''
    Copies key contents from one store to another, overwriting or not (default), and respecting original store format.
    :param from_store: store (or path of store) to copy from
    :param to_store: store (or path of store) to copy to
    :param from_keys: list of keys to copy from from_store
    :param overwrite: if True, existing keys in to_store will be overwritten; if False, those keys are skipped (silently)
    :return: None
    '''
    # handle input formats
    if isinstance(from_store, basestring):
        from_store = MyStore(from_store)
        close_from_store = True
    else:
        close_from_store = False
    if isinstance(to_store, basestring):
        to_store = MyStore(to_store)
        close_to_store = True
    else:
        close_to_store = False
    from_keys = util_ulist.ascertain_list(from_keys)
    from_keys = map(ascertain_prefix_slash, from_keys)
    # if overwrite is False, keep only those keys that don't exist
    if not overwrite:
        from_keys = list(set(from_keys).difference(to_store.keys()))

    # get some info on the from_store
    store_info = get_info_dict(from_store)

    # do the copying
    for k in from_keys:
        store_df_respecting_given_format(to_store, k, from_store[k], key_info=store_info[k])
    to_store.flush()

    # close stores (if they were specified by paths)
    if close_from_store:
        from_store.close()
    if close_to_store:
        to_store.close()
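A minimal sketch of the same key-copying idea with pandas alone (requires PyTables; this drops ut's format-preserving store_df_respecting_given_format and just uses put):

import pandas as pd

def copy_keys(from_path, to_path, keys, overwrite=False):
    with pd.HDFStore(from_path) as src, pd.HDFStore(to_path) as dst:
        for k in keys:
            if not overwrite and k in dst:
                continue  # silently skip keys that already exist, as above
            dst.put(k, src[k])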
Example 12
File: pstore.py Project: yz-/ut
def get_col_names(store, keys=None, singular_info='index_and_columns', print_results=False, style='dict'):
    '''
    :param store: an HDFStore
    :param keys: list of keys to get info for (defaults to all keys present in the store)
    :param singular_info: if given, keep only this entry per key ('index', 'columns', or 'index_and_columns')
    :param print_results: if True, pretty-print the results
    :param style: 'dict' (default) or 'dataframe'
    :return: a cols_info dict whose keys are the keys of the store and whose values are a dict with
    'index', 'columns', and 'index_and_columns' entries containing the corresponding names
    '''
    # process inputs
    if not keys:
        keys = store.keys()
    else:
        keys = util_ulist.ascertain_list(keys)
        keys = colloc.intersect(keys, store.keys())
    # make a dict with col (and index) info
    cols_info = dict()
    for key in keys:
        cols_info[key] = dict()
        df = store[key]
        cols_info[key]['index'] = list(df.index.names)
        cols_info[key]['columns'] = list(df.columns)
        cols_info[key]['index_and_columns'] = cols_info[key]['index'] + cols_info[key]['columns']
    if singular_info:
        cols_info = {key: cols_info[key][singular_info] for key in keys}
    if print_results:
        PrettyPrinter(indent=2).pprint(cols_info)
    if style == 'dataframe':
        d = pd.DataFrame()
        for k, v in cols_info.iteritems():
            v = [x for x in v if x]
            d = pd.concat([d, pd.DataFrame(data=v, columns=[k])], axis=1)
        d = d.fillna(value='')
        cols_info = d.transpose()

    return cols_info
Example 13
File: pstore.py Project: yz-/ut
 def join_col(self, df, add_cols, join_cols=None, join_key=None, join_store=None, join_filter=None, drop_joining_duplicates=True):
     """
     This function is meant to return the input df with add_cols added.
     These columns are fetched from join_store[join_key] and aligned to df using join_cols.
     Note: At the time of this writing, only a restricted case is handled, namely:
         join_cols has only one element that must be in the index of the store
     """
     join_store = join_store or self.join_store
     join_key = join_key or self.join_key
     if isinstance(add_cols, basestring):
         if add_cols in self.add_from:
             if 'join_store' in self.add_from[add_cols]:
                 join_store = join_store or self.add_from[add_cols]['join_store']
             if 'join_key' in self.add_from[add_cols]:
                 join_key = join_key or self.add_from[add_cols]['join_key']
             if 'join_cols' in self.add_from[add_cols]:
                 join_cols = join_cols or self.add_from[add_cols]['join_cols']
     join_cols = util_ulist.ascertain_list(join_cols)
     add_cols = util_ulist.ascertain_list(add_cols)
     # get the df values to join (and see if they're in cols or index)
     if coll_op.contains(list(df.columns), join_cols):
         df_join_cols_in_columns = True
         df_join_col_values = np.unique(df[join_cols])
     else:
         df_join_cols_in_columns = False
         df_join_col_values = np.unique(list(df.index))
     # get necessary information from store
     store_key_info = self.store_info[join_store]
     join_key = ascertain_prefix_slash(join_key)
     store_key_info = store_key_info[join_key]
     if len(join_cols) == 1 and join_cols[0] == 'index':
         print "uploading only specific indices for join_df"
         join_df = self.store[join_store].select(
             key=join_key,
             where=[pd.Term('index', df_join_col_values)],
             columns=add_cols)
     elif len(join_cols) == 1 and join_cols[0] in store_key_info['column_names']:
         print "uploading only specific columns for join_df"
         join_df = self.store[join_store].select(
             key=join_key,
             where=[pd.Term(join_cols[0], df_join_col_values)],
             columns=join_cols+add_cols)
         join_df = join_df.set_index(join_cols[0])
     else:
         print "uploading the whole potential join_df"
         join_df = self.store[join_store].select(
             key=join_key,
             columns=join_cols+add_cols)
     # drop duplicates
     if drop_joining_duplicates:
         join_df = join_df.drop_duplicates()
     join_df_cols_in_cols = coll_op.contains(list(join_df.columns), join_cols)
     # join
     if df_join_cols_in_columns:
         if join_df_cols_in_cols:
             return pd.merge(df, join_df, on=join_cols)
         else:
             return pd.merge(df, join_df, left_on=join_cols, right_index=True)
     else:
         if join_df_cols_in_cols:
             return pd.merge(df, join_df, left_index=True, right_on=join_cols)
         else:
             return pd.merge(df, join_df, right_index=True, left_index=True)
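The four merge branches at the end only choose the right pd.merge keywords for where the join column lives on each side. A self-contained sketch of the column-versus-index case with hypothetical data:

import pandas as pd

df = pd.DataFrame({'user': ['u1', 'u2', 'u1'], 'clicks': [3, 5, 2]})
# lookup table keyed by its index rather than by a column
join_df = pd.DataFrame({'segment': ['a', 'b']}, index=['u1', 'u2'])
# join column among df's columns, in join_df's index:
print(pd.merge(df, join_df, left_on='user', right_index=True))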
Example 14
File: pobj.py Project: SRHerzog/ut
def has_attributes(obj, attr_list):
    attr_list = util_ulist.ascertain_list(attr_list)
    return all(x in obj.__dict__ for x in attr_list)
Example 15
File: pot.py Project: yz-/ut
 def order_vars(self, var_list, sort_pts=True):
     self.tb = reorder_columns_as(self.tb, ascertain_list(var_list))
     if sort_pts:
         self.sort_pts()
     return self
Example 16
File: dup_diag.py Project: yz-/ut
 def add_grp_info(grp):
     print [ulist.ascertain_list(my_counter.next()) for i in range(len(grp))]
Example 17
File: get.py Project: yz-/ut
def all_but(d, exclude_keys):
    return get_subdict(d, set(d.keys()).difference(ulist.ascertain_list(exclude_keys)))
Example 18
File: manip.py Project: yz-/ut
def rm_cols_if_present(df, cols):
    cols = util_ulist.ascertain_list(cols)
    return df[colloc.setdiff(df.columns, cols)]
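For reference, the same drop-if-present behavior is available directly in pandas; a one-line sketch equivalent to the setdiff approach above:

import pandas as pd

df = pd.DataFrame({'a': [1], 'b': [2]})
# errors='ignore' lets drop tolerate columns that are not present
print(df.drop(['b', 'not_there'], axis=1, errors='ignore'))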