Example #1
def inner_join(df, join: pd.DataFrame, prefix: str = '', prefix_left='', force_multi_index=False, ffill=False):
    if df is None:
        if force_multi_index:
            if isinstance(join.columns, pd.MultiIndex):
                return join
            else:
                return add_multi_index(join, prefix)
        else:
            return join

    if force_multi_index:
        if not isinstance(df.columns, pd.MultiIndex) and len(df.columns) > 0:
            if len(prefix_left) <= 0:
                raise ValueError("You need to provide a prefix_left")
            else:
                df = add_multi_index(df, prefix_left)

    if isinstance(df.columns, pd.MultiIndex) and not isinstance(join.columns, pd.MultiIndex):
        b = join.copy()
        b.columns = pd.MultiIndex.from_product([[prefix], b.columns])
        if ffill:
            return pd\
                .merge(df, b, left_index=True, right_index=True, how='outer', sort=True)\
                .ffill()\
                .dropna()
        else:
            return pd.merge(df, b, left_index=True, right_index=True, how='inner', sort=True)
    else:
        if ffill:
            return pd\
                .merge(df.add_prefix(prefix_left), join.add_prefix(prefix), left_index=True, right_index=True, how='outer', sort=True)\
                .ffill()\
                .dropna()
        else:
            return pd.merge(df.add_prefix(prefix_left), join.add_prefix(prefix), left_index=True, right_index=True, how='inner', sort=True)
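A minimal usage sketch for inner_join above (hypothetical frames; assumes pandas is imported as pd and that add_multi_index is defined elsewhere in the module):

import pandas as pd

left = pd.DataFrame({"price": [1.0, 2.0, 3.0]}, index=[0, 1, 2])
right = pd.DataFrame({"volume": [10, 20, 30]}, index=[1, 2, 3])

# Both frames have plain columns, so the final branch runs: an inner join on the
# index with the given prefixes applied to the column names.
joined = inner_join(left, right, prefix="right_", prefix_left="left_")
print(joined)  # columns: left_price, right_volume; rows: index 1 and 2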
def merge_and_displace_frames(
    substrate: pd.DataFrame,
    reference: pd.DataFrame,
    pipette: pd.DataFrame,
    experiment_duration: pd.Timedelta,
    duration_of_resampled_row: pd.Timedelta,
):
    reference = reference.add_prefix("Reference_")
    substrate = substrate.add_prefix("Substrate_")
    pipette = pipette.add_prefix("Pipette_")

    # Convert from frame numbers to the actual time through the experiment
    for df, name in [
        (reference, "Reference"),
        (substrate, "Substrate"),
        (pipette, "Pipette"),
    ]:
        df: pd.DataFrame
        name: str
        number_of_frames = df[f"{name}_Frame"].max()
        instant = df[f"{name}_Frame"] / number_of_frames * experiment_duration
        # instant is the elapsed time through the experiment, as a pd.Timedelta
        df["Instant"] = instant
        df.set_index("Instant", inplace=True)

    combined = pd.concat(
        (reference, substrate, pipette),
        # Join the tables side by side, i.e. the columns are the joining point (left and right)
        axis="columns",
    )

    # In order to compare results between experiments, we must now resample them (so that each row has a common `Instant`)
    # This is a lossy operation. We choose to take the mean
    combined: pd.DataFrame = combined.resample(
        rule=duration_of_resampled_row).mean()
    # Frame numbers are no longer valid
    combined.drop(
        columns=[col for col in combined.columns if col.endswith("Frame")],
        inplace=True,
    )
    logger.info(
        f"Resampled to buckets of {duration_of_resampled_row} ({len(combined)} rows)"
    )

    combined["X_Delta"] = (combined["Substrate_X_Position"] -
                           combined["Reference_X_Position"])
    combined["Y_Delta"] = (combined["Substrate_Y_Position"] -
                           combined["Reference_Y_Position"])

    # Make our delta lines start at 0
    x_start = combined["X_Delta"].iloc[0]
    y_start = combined["Y_Delta"].iloc[0]

    combined["X_Delta"] = combined["X_Delta"] - x_start
    combined["Y_Delta"] = combined["Y_Delta"] - y_start

    return combined
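The key step in merge_and_displace_frames is re-indexing each table by elapsed time and then bucketing with resample. A stripped-down sketch of that bucketing, with made-up values:

import pandas as pd

idx = pd.to_timedelta([0, 1, 2, 3, 4], unit="s")
df = pd.DataFrame({"Substrate_X_Position": [0.0, 1.0, 2.0, 3.0, 4.0]}, index=idx)
# Average the samples that fall into each 2-second bucket, as the function does
# with duration_of_resampled_row.
print(df.resample(rule=pd.Timedelta(seconds=2)).mean())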
Example #3
def slide_17():
    df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                    'data1': range(6)})
    print(pd.get_dummies(df['key']))

    dummies = pd.get_dummies(df['key'], prefix='key')
    print(dummies)
    df_with_dummy = df[['data1']].join(dummies)
    print(df_with_dummy)

    mnames = ['movie_id', 'title', 'genres']
    movies = pd.read_table(MOVIELENSPATH,
                           sep='::',
                           header=None,
                           engine='python',
                           names=mnames)
    print(movies[:10])

    genre_iter = (set(x.split('|')) for x in movies.genres)
    genres = sorted(set.union(*genre_iter))
    print(genres)
    dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)

    for i, gen in enumerate(movies.genres):
        dummies.loc[i, gen.split('|')] = 1

    movies_windic = movies.join(dummies.add_prefix('Genre_'))
    print(movies_windic.iloc[0])

    values = np.random.rand(10)
    print(values)
    bins = [0, 0.2, 0.4, 0.6, 0.8, 1]

    print(pd.get_dummies(pd.cut(values, bins)))
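For the multi-valued genres column, a more compact alternative to the zero-matrix-plus-loop pattern above is Series.str.get_dummies (a sketch, assuming movies has been loaded as in slide_17):

genre_dummies = movies['genres'].str.get_dummies(sep='|')
movies_windic = movies.join(genre_dummies.add_prefix('Genre_'))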
Example #4
def compute_anomalous_events(df_prices: pd.DataFrame,
                             df_bollinger: pd.DataFrame):
    """Compute anomalous (high or low) price events for a set of stocks over time."""
    df = pd.concat([df_prices, df_bollinger.add_prefix("bol_")], axis=1)
    df["event"] = pd.Series(pd.NA, index=df.index, dtype=EVENT_TYPE)
    df["event"][df["close"] > df["bol_upper"]] = "high"  # type: ignore
    df["event"][df["close"] < df["bol_lower"]] = "low"  # type: ignore
    return df[df["event"].notna()][["name", "date", "event"]].reset_index()
Example #5
    def predict(self, X: pd.DataFrame) -> pd.Series:
        self._check_is_fitted()
        if set(self.features_) != set(X.columns):
            raise ValueError(
                f"Feature sets do not match: [{self.features_}, {X.columns}]"
            )

        data = X.add_prefix(__class__._X_COL_PREFIX)
        preds = map(self._predict_row, data.iterrows())
        return pd.Series(preds, index=data.index)
def test_add_prefix():
    npr = np.array([[20.2, 2.0, 3.2, 4.3, 5.5], [10, -20, -30, -40, -50],
                    [36.2, 13.2, 16.4, 12.2, 10.8]])
    pdf = DataFrame(npr)
    ctx: CylonContext = CylonContext(config=None, distributed=False)
    cn_tb: Table = Table.from_pandas(ctx, pdf)
    prefix = "item_"
    cn_tb_with_prefix = cn_tb.add_prefix(prefix)
    pdf_with_prefix = pdf.add_prefix(prefix)

    assert pdf_with_prefix.columns.tolist() == cn_tb_with_prefix.column_names
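For reference, add_prefix only rewrites the column labels and leaves the values untouched (a quick standalone check in plain pandas):

import pandas as pd

df = pd.DataFrame([[1, 2], [3, 4]])
assert df.add_prefix('item_').columns.tolist() == ['item_0', 'item_1']
assert (df.add_prefix('item_').values == df.values).all()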
Example #7
    def combine_sewershed_polygon_sample(
            self, df: pd.DataFrame, polygon: pd.DataFrame) -> pd.DataFrame:
        if polygon.empty:
            return df
        elif df.empty:
            return polygon
        polygon = polygon.copy()
        polygon = polygon.add_prefix("Sewershed-")
        return pd.merge(df,
                        polygon,
                        how="left",
                        left_on="Site_polygonID",
                        right_on="Sewershed-Polygon_polygonID")
Example #8
    def combine_cphd_polygon_sample(self, df: pd.DataFrame,
                                    polygon: pd.DataFrame) -> pd.DataFrame:
        if polygon.empty:
            return df
        elif df.empty:
            return polygon
        polygon = polygon.copy()
        polygon = polygon.add_prefix("CPHD-")
        return pd.merge(df,
                        polygon,
                        how="left",
                        left_on="Calculated_polygonIDForCPHD",
                        right_on="CPHD-Polygon_polygonID")
def dummy02():
    mnames=['movies_id','title','genres']
    movies=pd.read_table(u'D:\study\书籍\python\pydata-book-master\pydata-book-master\ch02\movielens\movies.dat',
                         sep='::',header=None,names=mnames)
    print(movies[:10])
    genre_iter=(set(x.split('|')) for x in movies.genres)
    genres=sorted(set.union(*genre_iter))
    print(genres)
    dummies=DataFrame(np.zeros((len(movies),len(genres))),columns=genres)
    for i,gen in enumerate(movies.genres):
        dummies.loc[i,gen.split('|')]=1
    movies_windic=movies.join(dummies.add_prefix('Genre_'))
    print(movies_windic.iloc[0])
Example #10
def test(path=None):
    data = pd.read_csv('ch08/Haiti.csv')
    data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20) &
                (data.LONGITUDE > -75) & (data.LONGITUDE < -70) &
                data.CATEGORY.notnull()]

    # Get all categories
    all_cats = get_all_categories(data.CATEGORY)

    # Build a dict mapping category codes to category names
    english_mapping = dict(get_english(x) for x in all_cats)

    # Get all category codes
    all_codes = get_code(all_cats)

    # Build an Index object from the unique category codes
    code_index = pd.Index(np.unique(all_codes))

    # index is the original data's index
    # columns are the category codes
    dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                            index=data.index, columns=code_index)

    # row: index
    # cat: category
    # For each record, set a 1 for every category it contains
    for row, cat in zip(data.index, data.CATEGORY):
        codes = get_code(to_cat_list(cat))
        dummy_frame.loc[row, codes] = 1

    data = data.join(dummy_frame.add_prefix('category_'))

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    to_plot = ['2a', '1', '3c', '7a']
    lllat = 17.25
    urlat = 20.25
    lllon = -75
    urlon = -71

    for code, ax in zip(to_plot, axes.flat):
        m = basic_haiti_map(ax, lllat=lllat, urlat=urlat,
                            lllon=lllon, urlon=urlon)

        cat_data = data[data['category_%s' % code] == 1]
        x, y = m(cat_data.LONGITUDE, cat_data.LATITUDE)
        m.plot(x, y, 'k.', alpha=0.5)
        ax.set_title('%s: %s' % (code, english_mapping[code]))
def Preprocess(movies, ratings):
    print('Data preprocessing...')
    # Convert the column of "genres" into dummies
    genre_iter = (set(x.split('|')) for x in movies.genres)
    genres = sorted(set.union(*genre_iter))  # get all genres
    dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
    # iterate over each row of "genres", assign "1" to the corresponding locations
    for i, gen in enumerate(movies.genres):
        dummies.loc[i, gen.split('|')] = 1
    movies_dummies = movies.join(dummies.add_prefix('Genres_'))  # merged with movies

    data_merged = pd.merge(movies_dummies, ratings, on='movieid')  # merge two tables by 'movieid'
    data = data_merged.drop_duplicates()  # drop duplicates
    data = data.dropna()  # drop NA rows
    print('Preprocessing completed.')
    return data
Example #12
def inner_join(df, join: pd.DataFrame, prefix: str = ''):
    if isinstance(df.columns, pd.MultiIndex) and not isinstance(
            join.columns, pd.MultiIndex):
        b = join.copy()
        b.columns = pd.MultiIndex.from_product([[prefix], b.columns])
        return pd.merge(df,
                        b,
                        left_index=True,
                        right_index=True,
                        how='inner',
                        sort=True)
    else:
        return pd.merge(df,
                        join.add_prefix(prefix),
                        left_index=True,
                        right_index=True,
                        how='inner',
                        sort=True)
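A small sketch of the MultiIndex branch of this inner_join (hypothetical frames): when the left frame already has two column levels and the right frame does not, the right frame's columns are nested under the prefix as a new top level.

import pandas as pd

left = pd.DataFrame({('prices', 'close'): [1.0, 2.0, 3.0]})
right = pd.DataFrame({'volume': [10, 20, 30]})
joined = inner_join(left, right, prefix='volumes')
print(joined.columns.tolist())  # [('prices', 'close'), ('volumes', 'volume')]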
Example #13
def movie_genres(movies):
    # Build dummy variables for the genres
    genre_iter = (set(x.split('|')) for x in movies.genres)
    genres = sorted(set.union(*genre_iter))
    dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
    # Iterate over the genres column of movies, setting the matching positions to 1
    for i, gen in enumerate(movies.genres):
        dummies.loc[i, gen.split('|')] = 1
    # Get the merged movies frame
    movies_windic = movies.join(dummies.add_prefix('Genres_'))
    movies_windic = movies_windic.drop('Genres_IMAX', axis=1)
    # Count the movies of each genre and draw a pie chart
    colors = [
        'Blue', 'RoyalBlue', 'MediumBlue', 'DodgerBlue', 'CornflowerBlue',
        'DeepSkyBlue', 'SkyBlue', 'Azure', 'SlateBlue', 'LightBlue',
        'PaleTurquoise', 'DarkCyan', 'DarkSlateBlue', 'LightSkyBlue',
        'MediumTurquoise', 'Navy', 'SteelBlue', 'MidnightBlue', 'PowderBlue'
    ]
    movies_windic.iloc[:, 5:].sum().plot(kind='pie',
                                         title='The Pie of Genres',
                                         colors=colors)
    return movies_windic
Example #14
def portfolio_analyzer(weights: dict, pnl: pd.DataFrame, returns: pd.DataFrame,
                       factor_betas: pd.DataFrame, factor_alphas: pd.DataFrame,
                       n_days_delay: int):
    assert isinstance(weights,
                      dict), f"{weights} must be a dictionary of pandas Series"

    w_daily = {}
    all_factors = {}
    factors_returns = {}
    factor_exp = {}
    factor_returns_models = {}
    S_idiosyncratic_returns = pd.Series(dtype=float,
                                        name='idiosyncratic_returns')

    dates_lst = list(weights.keys())
    for i, key_str_dt in enumerate(dates_lst[:-n_days_delay]):

        key_dt = dt.datetime.strptime(key_str_dt, "%Y%m%d")
        key_tzaw_rets_dt = pd.Timestamp(
            dates_lst[i + n_days_delay]).tz_localize(tz="utc")
        # weights
        val_w = weights[key_str_dt]
        w_daily[key_dt] = val_w
        # factors
        B_alpha = factor_alphas.loc[key_dt].add_prefix('alpha_')
        B_beta = factor_betas.add_prefix('beta_')
        B_all_factors = B_alpha.join(B_beta)  # static betas
        all_factors[key_dt] = B_all_factors
        # returns
        returns_day = returns.loc[key_tzaw_rets_dt]
        returns_day.name = 'returns'
        returns_day.index.name = 'asset'

        # Compute factor returns f(t)[i]: r(t+n)[i] = b(t)[i]*f(t)[i] + s(t)[i] as reg coefs
        exog = B_all_factors.filter(regex='(^alpha|^beta)').fillna(0.0)
        endog = returns_day[exog.index].fillna(0.0)
        model = sm.OLS(endog, exog)
        res = model.fit()
        factor_returns_models[key_dt] = res
        factors_returns[key_dt] = res.params
        # Compute factor exposure e(t)[i] = w(t)[i]*f(t)[i]
        factor_exp[key_dt] = get_factor_exposures(factor_betas=exog,
                                                  weights=val_w)
        # Compute idiosyncratic returns:  s(t)[i]*w(t)[i]
        S_idiosyncratic_returns[key_dt] = partial_dot_product(v=res.resid,
                                                              w=val_w)

    w_opt_df = pd.concat(w_daily)
    w_opt_df.index.names = ['date', 'asset']
    w_opt_df.name = 'w_opt'

    B_factors_asset_df = pd.concat(all_factors)
    B_factors_asset_df.index.names = ['date', 'asset']

    f_factors_returns_df = pd.concat(factors_returns).unstack()
    f_factors_returns_df.index.name = 'date'
    E_factors_exp_df = pd.concat(factor_exp).unstack()
    E_factors_exp_df.index.name = 'date'

    S_idiosyncratic_returns.index.name = 'date'

    pnl_and_w_df = join_weights_and_pnl(w_opt_df, pnl, returns)

    return w_opt_df, pnl_and_w_df, B_factors_asset_df, f_factors_returns_df, E_factors_exp_df, S_idiosyncratic_returns
Example #15
# Read data from standard input on the command line
sys.stdin = os.fdopen(sys.stdin.fileno(), "r")
data = pd.read_csv(sys.stdin)

# Restrict to data in Haiti with categories
data = data[
    (data.LATITUDE > 18)
    & (data.LATITUDE < 20)
    & (data.LONGITUDE > -75)
    & (data.LONGITUDE < -70)
    & data.CATEGORY.notnull()
]

# Extract categorizations
all_cats = get_all_categories(data.CATEGORY)

# Add indicator columns for categories
all_codes = get_code(all_cats)
code_index = pd.Index(np.unique(all_codes))
dummy_frame = DataFrame(np.zeros((len(data), len(code_index))), index=data.index, columns=code_index)

for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.loc[row, codes] = 1

data = data.join(dummy_frame.add_prefix("category_"))

# Write data to standard output
data.to_csv(sys.stdout)
# Next, select records according to category
# Add indicator columns
# First extract the unique category codes
def get_code(seq):
    return [x.split('.')[0] for x in seq if x]
all_codes = get_code(all_cats)
# Build an Index from the codes
code_index = pd.Index(np.unique(all_codes))
# Construct a new DataFrame
dummy_frame = DataFrame(np.zeros((len(data), len(code_index))), index=data.index, columns=code_index)

# Draw the Haiti map (the Basemap library could not be imported)
for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.loc[row, codes] = 1
data = data.join(dummy_frame.add_prefix('category_'))
def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25, lllon=-75, urlon=-71):
    m = Basemap(ax=ax, projection='stere',
                lon_0=(urlon + lllon) / 2, lat_0=(urlat + lllat) / 2,
                llcrnrlat=lllat, urcrnrlat=urlat,
                llcrnrlon=lllon, urcrnrlon=urlon,
                resolution='f')
    m.drawcoastlines()
    m.drawstates()
    m.drawcountries()
    return m
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
fig.subplots_adjust(hspace=0.05, wspace=0.05)
to_plot = ['2a', '1', '3c', '7a']
lllat = 17.25
urlat = 20.25
lllon = -75
urlon = -71
for code, ax in zip(to_plot, axes.flat):
    m = basic_haiti_map(ax, lllat=lllat, urlat=urlat, lllon=lllon, urlon=urlon)
    cat_data = data[data['category_%s' % code] == 1]
    x, y = m(cat_data.LONGITUDE, cat_data.LATITUDE)
all_codes = get_code(all_cats)
code_index = pd.Index(np.unique(all_codes))
dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                        index=data.index, columns=code_index)

# <codecell>

dummy_frame.iloc[:, :6]

# <codecell>

for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.loc[row, codes] = 1

data = data.join(dummy_frame.add_prefix('category_'))

# <codecell>

data.CATEGORY.isnull().value_counts()

# <codecell>

from mpl_toolkits.basemap import Basemap
import matplotlib.pyplot as plt

def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25,
                    lllon=-75, urlon=-71):
    # create polar stereographic Basemap instance.
    m = Basemap(ax=ax, projection='stere',
                lon_0=(urlon + lllon) / 2,
Example #18
    def _compose_data(cls, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        data = X.add_prefix(cls._X_COL_PREFIX)  # makes a copy
        data[cls._Y_COL_PREFIX] = y
        return data
Example #19
    def _get_next_state(
        state: pd.DataFrame,
        seperation: float,
        cohesion: float,
        alignment: float,
        visibility: float,
        dimensions: tp.List[str],
        step: float,
    ) -> pd.DataFrame:

        # Self-cross-product Boids for all (center, neighbor) pairs.
        state["i"] = range(len(state))
        state["j"] = 0
        pairs = pd.merge(
            left=state,
            right=state.add_prefix(prefix="n"),
            left_on="j",
            right_on="nj",
            how="outer",
        )

        # Unpack columns.
        cols = [
            (
                f"p{i}",  # Positions
                f"v{i}",  # Velocitys
                f"np{i}",  # Neighbor positions.
                f"nv{i}",  # Neighbor velocitys.
                f"nd{i}",  # Neighbor distances.
            ) for i in dimensions
        ]
        p, v, np, nv, nd = map(list, zip(*cols))

        # For each dimension:
        for pi, npi, ndi in zip(p, np, nd):
            # Compute neighbor-to-center translations.
            pairs[ndi] = pairs[pi] - pairs[npi]

        # Compute neighbor-to-center distances.
        ndmag = pairs[nd].pow(2).sum(axis=1).pow(0.5)

        # Subset pairs to visible neighbors.
        pairs = pairs.loc[ndmag.le(visibility)]

        # For each dimension:
        for ndi in nd:
            # Transform neighbor-to-center translations to repulsions.
            pairs[ndi] /= ndmag.pow(2)

        # Compute neighbor velocity magnitudes.
        nvmag = pairs[nv].pow(2).sum(axis=1).pow(0.5)

        # For each dimension:
        for nvi in nv:
            # Transform neighbor velocities to (unit) neighbor directions.
            pairs[nvi] /= nvmag
            pairs[nvi].where(cond=nvmag.gt(0), other=0, inplace=True)

        # Nullify neighbors that are centers.
        pairs.loc[pairs["i"] == pairs["ni"], [*np, *nv, *nd]] = None

        # Augment repulsor behaviour.
        centers = pairs["t"].eq("repulsor")
        pairs.loc[centers, np] = None
        pairs.loc[centers, nv] = None
        pairs.loc[centers, nd] = None
        neighbors = pairs["nt"].eq("repulsor")
        pairs.loc[neighbors, np] = None
        pairs.loc[neighbors, nv] = None
        pairs.loc[neighbors, nd] *= 30

        # Aggregate neighbor information per center Boid.
        agg_last = {col: "last" for col in ("t", *p, *v)}
        agg_mean = {col: "mean" for col in (*np, *nv, *nd)}
        agg = {**agg_last, **agg_mean}
        groups = pairs.groupby(by="i", as_index=False, sort=False)
        state = groups.agg(func=agg).drop(columns="i")

        # For each dimension:
        for pi, npi in zip(p, np):
            # Transform mean-neighbor positions to center-to-mean-neighbor translations.
            state[npi] -= state[pi]

        # For each dimension:
        for pi, vi, npi, nvi, ndi in zip(p, v, np, nv, nd):
            # Compute accelerations.
            ai = 0
            ai += seperation * state.pop(ndi).where(cond=pd.notnull, other=0)
            ai += cohesion * state.pop(npi).where(cond=pd.notnull, other=0)
            ai += alignment * state.pop(nvi).where(cond=pd.notnull, other=0)
            # Update velocities and positions.
            state[vi] += ai * step**2
            state[pi] += state[vi] * step

        return state
Example #20
class DimRed:
    """
  Class for quadruple dimension reduction.
  """
    def __init__(self, x, w, p):
        """
      Dimension reduction class.

      Parameters:
          x: Input matrix (np array)
          w: Weights to adjustment for ae
          p: latent dimension for ae
      """
        self.x = x
        self.w = w
        self.p = p
        self.pca = None
        self.nmf = None
        self.ae = None
        self.__reduced = None
        self.__pcanmf = None
        self.__median = None
        self.__ael1 = None
        self.__ael2 = None
        self.__a = None
        self.__r = 15
        self.__a1 = 0.03
        self.__a2 = 0.85
        self.__scorer = metrics.explained_variance_score
        self.__run_id = name_generator(6)

    def __str__(self):
        return "Quadruple dimension reduction class"

    def __repr__(self):
        return "\n" + self.__str__()

    def __get_score(self, model, y):
        """
      Determine level of explained variance

      """
        prediction = model.inverse_transform(model.transform(y))
        return self.__scorer(y, prediction)

    def l_med(self):
        self.__median = pd.DataFrame(np.median(self.x.T, axis=1))
        self.__median = self.__median.add_prefix('MEDIAN_' + self.__run_id +
                                                 '_')
        return

    def lde(self):
        """
      Decompose with PCA and NMF

      """
        self.pca = PCA(n_components=0.95)
        self.pca.fit(self.x)
        pc_weights = pd.DataFrame(self.pca.components_.T)
        pc_weights = pc_weights.add_prefix('PCA_' + self.__run_id + '_')

        opti_rank = []
        #
        warnings.filterwarnings('ignore')
        for k in range(2, self.__r):
            nmf = NMF(n_components=k, max_iter=1000).fit(self.x)
            score_it = self.__get_score(nmf, self.x)
            opti_rank.append(score_it)
            if score_it >= 0.95:
                break

        self.nmf = NMF(n_components=len(opti_rank) + 1, max_iter=10000)
        self.nmf.fit(self.x)
        warnings.resetwarnings()
        #

        nmf_weights = pd.DataFrame(self.nmf.components_.T)
        nmf_weights = nmf_weights.add_prefix('NMF_' + self.__run_id + '_')

        self.__pcanmf = pd.concat([pc_weights, nmf_weights], axis=1)

        return

    def __de4ae(self, y):
        """
      Estimate optimal dimension for AE,
        based on Bahadur and Paffenroth 2020, IEEE

      """
        s_x = y.copy()

        for t in range(s_x.shape[0]):
            s_x.iloc[t, :] = np.sort(np.array(s_x.iloc[t, :]))[::-1]

        svp = np.sort(s_x.mean())[::-1]
        svp_sum = svp.sum()
        alg1 = sum(svp / svp_sum > self.__a1)
        alg2 = 0

        temp = (svp_sum * self.__a2) / 1
        temp2 = 0

        for i in range(len(svp)):
            temp2 += svp[i]
            alg2 += 1

            if temp2 >= temp:
                break
        return int((alg1 + alg2) / 2)

    def __aer(self, nc):
        """
      Build model structure

      """
        input_layer = Input(shape=(nc, ), name="input")
        encoder = Dense(self.__a,
                        activation="relu",
                        kernel_initializer="glorot_uniform",
                        activity_regularizer=regularizers.l1_l2(1e-16, 1e-9),
                        name="enco1")(input_layer)

        encoder = Dense(self.__a // 2, activation="relu", name="code")(encoder)

        decoder = Dense(self.__a, activation="sigmoid", name="deco1")(encoder)
        decoder = Dense(nc, activation="sigmoid", name="output")(decoder)

        self.ae = Model(inputs=input_layer, outputs=decoder)
        self.ae.compile(optimizer=optimizers.RMSprop(learning_rate=1e-3),
                        loss='mean_squared_error',
                        metrics=['mse'])
        return

    def mud_ae(self):
        """
      Model training and output.

      """
        nam = self.__run_id + "_model.h5"

        if self.p == 0:
            self.__a = self.x.shape[0] * 50 // 100
            check1 = ModelCheckpoint(filepath="est_" + nam,
                                     verbose=0,
                                     save_best_only=True)

            self.__aer(nc=self.x.shape[1])
            self.ae.fit(self.x,
                        self.x,
                        sample_weight=self.w,
                        epochs=500,
                        shuffle=True,
                        batch_size=15,
                        validation_data=(self.x, self.x),
                        callbacks=[check1],
                        verbose=0)

            estimate = load_model("est_" + nam)
            code_est = Dff(estimate.get_layer("enco1").get_weights()[0])
            self.__a = self.__de4ae(code_est)
            print("The optimal number of dimension is {}".format(self.__a))

        else:
            self.__a = self.p

        check = ModelCheckpoint(filepath=nam, verbose=0, save_best_only=True)

        self.__aer(nc=self.x.shape[1])
        self.ae.fit(self.x,
                    self.x,
                    sample_weight=self.w,
                    epochs=3000,
                    shuffle=True,
                    batch_size=15,
                    validation_data=(self.x, self.x),
                    callbacks=[check],
                    verbose=0)

        final = load_model(nam)
        self.__ael1 = Dff(final.get_layer("enco1").get_weights()[0])
        self.__ael1 = self.__ael1.add_prefix('AE_' + self.__run_id + '_')

        self.__ael2 = Dff(final.get_layer("code").get_weights()[0])
        self.__ael2["run"] = nam

        self.__ael2.to_csv("code_{}.csv".format(self.__run_id), index=False)
        return

    def fit(self):
        """
      Fit quadruple dimension reduction {Median, PCA, NMF, AE[DE]}

      """
        self.l_med()
        self.lde()
        self.mud_ae()
        self.__reduced = pd.concat([self.__ael1, self.__pcanmf, self.__median],
                                   axis=1)
        return

    def get_reduced(self):
        """
      Get reduced dimension

      """
        return self.__reduced

    def get_aede(self):
        return self.__a

    def add_reduced_row(self, y):
        self.__reduced["ID"] = y
        self.__pcanmf["ID"] = y
        self.__ael1["ID"] = y
        return
Example #21
print(pd.get_dummies(df['key']))

# Add a prefix to the DataFrame's columns to make merging easier
dummies = pd.get_dummies(df['key'], prefix='key')
df_with_dummy = df[['data1']].join(dummies)
print(df_with_dummy)

# A single row can belong to several categories
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('movies.dat', sep='::', header=None, names=mnames)
print(movies[:10])
# Data wrangling
genre_iter = (set(x.split('|')) for x in movies.genres)
# Extract the distinct values
genres = sorted(set.union(*genre_iter))
print(genre_iter)
print(genres)

# Build an all-zeros DataFrame
dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1
movies_windic = movies.join(dummies.add_prefix('Genre_'))
print(movies_windic.iloc[0])

# Combine get_dummies with a discretization function such as cut
values = np.random.rand(10)
print(values)
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
print(pd.get_dummies(pd.cut(values, bins)))
Example #22
def red_flags(test_file: pd.DataFrame):

    # Difference claims and policy effect/emission
    test_file = test_file[[
        'id_siniestro', 'fecha_diferencia_siniestro_efecto',
        'fecha_diferencia_siniestro_efecto_5',
        'fecha_diferencia_siniestro_efecto_15',
        'fecha_diferencia_siniestro_efecto_30',
        'fecha_diferencia_siniestro_emision',
        'fecha_diferencia_siniestro_emision_5',
        'fecha_diferencia_siniestro_emision_15',
        'fecha_diferencia_siniestro_emision_30', 'fecha_siniestro_ocurrencia',
        'fecha_poliza_emision', 'fecha_poliza_efecto_natural',
        'fecha_diferencia_siniestro_comunicacion'
    ]]

    policy_file = pd.read_csv(STRING.poliza_input_prediction,
                              sep=',',
                              encoding='utf-8',
                              quotechar='"')
    policy_file = policy_file[[
        'audit_siniestro_referencia', 'poliza_cod_intermediario'
    ]]
    policy_file = policy_file.rename(
        columns={
            'audit_siniestro_referencia': 'id_siniestro',
            'poliza_cod_intermediario': 'id_mediador'
        })
    policy_file['id_siniestro'] = policy_file['id_siniestro'].map(int)
    test_file['id_siniestro'] = test_file['id_siniestro'].map(int)
    test_file = pd.merge(test_file, policy_file, how='left', on='id_siniestro')

    test_file = test_file.dropna(subset=['id_siniestro'])
    test_file['id_mediador'] = test_file['id_mediador'].fillna(-1)

    # Occurrence between effect and emission
    for i in [
            'fecha_siniestro_ocurrencia', 'fecha_poliza_emision',
            'fecha_poliza_efecto_natural'
    ]:
        test_file[i] = pd.to_datetime(test_file[i],
                                      format='%Y-%m-%d',
                                      errors='coerce')

    test_file['fecha_ocurrencia_entre_efecto_emision'] = pd.Series(
        0, index=test_file.index)
    test_file.loc[(test_file['fecha_poliza_emision'] <=
                   test_file['fecha_siniestro_ocurrencia']) &
                  (test_file['fecha_siniestro_ocurrencia'] <=
                   test_file['fecha_poliza_efecto_natural']),
                  'fecha_ocurrencia_entre_efecto_emision'] = 1

    test_file.loc[(test_file['fecha_poliza_efecto_natural'] <=
                   test_file['fecha_siniestro_ocurrencia']) &
                  (test_file['fecha_siniestro_ocurrencia'] <=
                   test_file['fecha_poliza_emision']),
                  'fecha_ocurrencia_entre_efecto_emision'] = 1

    # Communication and occurrence difference
    test_file['retraso_comunicacion'] = pd.Series(0, index=test_file.index)
    test_file.loc[test_file['fecha_diferencia_siniestro_comunicacion'] >= 15,
                  'retraso_comunicacion'] = 1

    # If mediador
    test_file['mediador'] = pd.Series(0, index=test_file.index)
    test_file['id_mediador'] = test_file['id_mediador'].map(int)
    test_file.loc[test_file['id_mediador'] == 62659, 'mediador'] = 1

    # Indicator of RF
    test_file['indicator'] = pd.Series(0, index=test_file.index)

    # test_file.loc[test_file['fecha_diferencia_siniestro_efecto'] <= 30, 'indicator'] = 1
    test_file.loc[test_file['fecha_diferencia_siniestro_emision'] <= 30,
                  'indicator'] = 1
    test_file.loc[test_file['retraso_comunicacion'] == 1, 'indicator'] = 1
    test_file.loc[test_file['fecha_ocurrencia_entre_efecto_emision'] == 1,
                  'indicator'] = 1
    test_file.loc[test_file['mediador'] == 1, 'indicator'] = 1

    test_file = test_file.add_prefix('RF_')
    test_file = test_file.rename(columns={'RF_id_siniestro': 'id_siniestro'})

    return test_file
Example #23
english_mapping = dict(get_english(x) for x in all_cats)


def get_code(seq):
    return [x.split('.')[0] for x in seq if x]


all_codes = get_code(all_cats)
code_index = pd.Index(np.unique(all_codes))
dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                        index=data.index,
                        columns=code_index)
for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.loc[row, codes] = 1
data = data.join(dummy_frame.add_prefix('category_'))  # join back onto the original data


def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25, lllon=-75, urlon=-71):
    # Create a polar stereographic Basemap instance
    m = Basemap(ax=ax,
                projection='stere',
                lon_0=(urlon + lllon) / 2,
                lat_0=(urlat + lllat) / 2,
                llcrnrlat=lllat,
                urcrnrlat=urlat,
                llcrnrlon=lllon,
                urcrnrlon=urlon,
                resolution='f')
    # Draw coastlines, state and country borders, and the map boundary
    m.drawcoastlines()
def slide_14():
    data = pd.read_csv(HAICHICSVPATH)
    print(data)

    print(data[['INCIDENT DATE', 'LATITUDE', 'LONGITUDE']][:10])

    print('Data categories')
    print(data['CATEGORY'][:6])

    print('Summary of the data')
    print(data.describe())
    print('Drop out-of-range data and missing values')
    data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20) &
                (data.LONGITUDE > -75) & (data.LONGITUDE < -70)
                & (data.CATEGORY.notnull())]

    def to_cat_list(catstr):
        stripped = (x.strip() for x in catstr.split(','))
        return [x for x in stripped if x]

    def get_all_categories(cat_series):
        cat_sets = (set(to_cat_list(x)) for x in cat_series)
        return sorted(set.union(*cat_sets))

    def get_english(cat):
        code, names = cat.split('.')
        if '|' in names:
            names = names.split(' | ')[1]
        return code, names.strip()

    all_cats = get_all_categories(data.CATEGORY)
    english_mapping = dict(get_english(x) for x in all_cats)

    print(english_mapping['2a'])
    print(english_mapping['6c'])

    def get_code(seq):
        return [x.split('.')[0] for x in seq if x]

    all_codes = get_code(all_cats)
    code_index = pd.Index(np.unique(all_codes))
    dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                            index=data.index, columns=code_index)
    print(dummy_frame.iloc[:, :6])

    print(data.index)
    for row, cat in zip(data.index, data.CATEGORY):
        codes = get_code(to_cat_list(cat))
        dummy_frame.loc[row, codes] = 1
    data = data.join(dummy_frame.add_prefix('category_'))
    print(data.iloc[:, 10:15])

    from mpl_toolkits.basemap import Basemap

    def basic_haiti_map(ax=None, lllat=17.25, urlat=20.25,
                                 lllon=-75, urlon=-71):
        m = Basemap(ax=ax,
                    projection='stere',
                    lon_0=(urlon + lllon) / 2,
                    lat_0=(urlat + lllat) / 2,
                    llcrnrlat=lllat,
                    urcrnrlat=urlat,
                    llcrnrlon=lllon,
                    urcrnrlon=urlon,
                    resolution='f')
        m.drawcoastlines()
        m.drawstates()
        m.drawcountries()
        return m

    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
    fig.subplots_adjust(hspace=0.05, wspace=0.05)
    to_plot = ['2a', '1', '3c', '7a']
    lllat = 17.25
    urlat = 20.25
    lllon = -75
    urlon = -71

    for code, ax in zip(to_plot, axes.flat):
        m = basic_haiti_map(ax,
                            lllat=lllat,
                            urlat=urlat,
                            lllon=lllon,
                            urlon=urlon)
        cat_data = data[data['category_%s' % code] == 1]

        x, y = m(cat_data.LONGITUDE.values, cat_data.LATITUDE.values)
        m.plot(x, y, 'k.', alpha=0.5)
        ax.set_title('%s: %s' % (code, english_mapping[code]))

    m.readshapefile(SHAPEFILEPATH, 'roads')
Example #25
import pandas as pd
import numpy as np

### Computing indicator / dummy variables
# Convert a discrete variable into dummy-variable form
df = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'], 'data1': range(6)})
pd.get_dummies(df['key'])

dummies = pd.get_dummies(df['key'], prefix='key')  # prefix sets the name prefix of the dummy columns
df_with_dummy = df[['data1']].join(dummies)
df_with_dummy

# The case where a row belongs to more than one category
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table('d:/data/movies.dat',
                       sep='::',
                       header=None,
                       names=mnames)
movies[:10]

genre_iter = (set(x.split('|')) for x in movies.genres)
genres = sorted(set.union(*genre_iter))

dummies = DataFrame(np.zeros((len(movies), len(genres))), columns=genres)

for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1

movies_windic = movies.join(dummies.add_prefix('Genre_'))
movies_windic.iloc[0]
mnames = ['movie_id', 'title', 'genres']
movies = pd.read_table(
    '/Users/changyueh/Desktop/CodePractice/Data_Analysis/Chapt2/ml-1m/movies.dat',
    sep='::',
    header=None,
    names=mnames)
movies[:10]
genre_iter = (set(x.split('|')) for x in movies.genres)  # iterate over each genre set
genres = sorted(set.union(*genre_iter))  # list all genres, de-duplicated
dummies = DataFrame(np.zeros((len(movies.index), len(genres))),
                    columns=genres)  # build an all-zeros DataFrame with one column per genre
for i, gen in enumerate(movies.genres):
    dummies.loc[i, gen.split('|')] = 1  # explained on page 216
dummies.head()
movies_windic = movies.join(
    dummies.add_prefix('Genre_'))  # join directly, adding a prefix to the columns
movies_windic.iloc[0]

values = np.random.rand(10)
values
bins = [0, 0.2, 0.4, 0.6, 0.8, 1]
ranks = ['0<x<=0.2', '0.2<x<=0.4', '0.4<x<=0.6', '0.6<x<=0.8', '0.8<x<=1.']
pd.get_dummies(pd.cut(values, bins, labels=ranks))

# String operations
## String object methods; Table 7-3 on p. 218 lists all the built-in string methods
val = 'a,b, guido'
val.split(',')  # the built-in split can do a lot

pieces = [x.strip() for x in val.split(',')]
pieces  # strip removes the surrounding whitespace
Example #28
def get_code(seq):
    return [x.split(".")[0] for x in seq if x]


# Read data from standard input on the command line
sys.stdin = os.fdopen(sys.stdin.fileno(), "r")
data = pd.read_csv(sys.stdin)

# Restrict to data in Haiti with categories
data = data[(data.LATITUDE > 18) & (data.LATITUDE < 20) &
            (data.LONGITUDE > -75) & (data.LONGITUDE < -70)
            & data.CATEGORY.notnull()]

# Extract categorizations
all_cats = get_all_categories(data.CATEGORY)

# Add indicator columns for categories
all_codes = get_code(all_cats)
code_index = pd.Index(np.unique(all_codes))
dummy_frame = DataFrame(np.zeros((len(data), len(code_index))),
                        index=data.index,
                        columns=code_index)

for row, cat in zip(data.index, data.CATEGORY):
    codes = get_code(to_cat_list(cat))
    dummy_frame.loc[row, codes] = 1

data = data.join(dummy_frame.add_prefix("category_"))

# Write data to standard output
data.to_csv(sys.stdout)