Example no. 1
def test_date_breaks():
    # cpython
    x = [datetime(year, 1, 1) for year in [2010, 2026, 2015]]
    limits = min(x), max(x)

    breaks = date_breaks('5 Years')
    years = [d.year for d in breaks(limits)]
    npt.assert_array_equal(
        years, [2010, 2015, 2020, 2025, 2030])

    breaks = date_breaks('10 Years')
    years = [d.year for d in breaks(limits)]
    npt.assert_array_equal(years, [2010, 2020, 2030])

    # numpy
    x = [np.datetime64(i*10, 'D') for i in range(1, 10)]
    breaks = date_breaks('10 Years')
    limits = min(x), max(x)
    with pytest.raises(AttributeError):
        breaks(limits)

    # NaT
    limits = np.datetime64('NaT'), datetime(2017, 1, 1)
    breaks = date_breaks('10 Years')
    assert len(breaks(limits)) == 0
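The round trip under test: date_breaks(width) returns a function that maps a (min, max) limits tuple to a list of break datetimes. A minimal standalone sketch of the same behaviour:

from datetime import datetime
from mizani.breaks import date_breaks

# Build a breaks function for 5-year intervals, then apply it to limits.
breaks = date_breaks('5 Years')
limits = datetime(2010, 1, 1), datetime(2026, 1, 1)
print([d.year for d in breaks(limits)])  # [2010, 2015, 2020, 2025, 2030]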
Example no. 2
    def __init__(self, **kwargs):
        # Permit the use of the general parameters for
        # specifying the format strings
        with suppress(KeyError):
            breaks = kwargs['breaks']
            if isinstance(breaks, six.string_types):
                kwargs['breaks'] = date_breaks(breaks)

        with suppress(KeyError):
            minor_breaks = kwargs['minor_breaks']
            if isinstance(minor_breaks, six.string_types):
                kwargs['minor_breaks'] = date_breaks(minor_breaks)

        # The more specific parameters take precedence
        with suppress(KeyError):
            breaks_fmt = kwargs.pop('date_breaks')
            kwargs['breaks'] = date_breaks(breaks_fmt)

        with suppress(KeyError):
            labels_fmt = kwargs.pop('date_labels')
            kwargs['labels'] = date_format(labels_fmt)

        with suppress(KeyError):
            minor_breaks_fmt = kwargs.pop('date_minor_breaks')
            kwargs['minor_breaks'] = date_breaks(minor_breaks_fmt)

        scale_continuous.__init__(self, **kwargs)
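In practice this lets callers pass plain strings instead of callables. A hedged sketch, assuming this __init__ backs a datetime scale such as plotnine's scale_x_datetime, which exposes these parameters:

from mizani.breaks import date_breaks
from mizani.formatters import date_format
from plotnine import scale_x_datetime

# The two calls below are equivalent: a width string or explicit callables.
scale_x_datetime(breaks=date_breaks('1 month'), labels=date_format('%b %Y'))
scale_x_datetime(date_breaks='1 month', date_labels='%b %Y')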
Example no. 3
def one_day_graph(collect_date='20191015', gateway_id='ep18270334'):
    db = 'aihems_api_db'

    # db = 'aihems_service_db'
    conn = pymysql.connect(
        host='aihems-service-db.cnz3sewvscki.ap-northeast-2.rds.amazonaws.com',
        port=3306,
        user='******',
        passwd='******',
        db=db,
        charset='utf8')

    sql = f"""
    SELECT
        COLLECT_DATE
        , COLLECT_TIME
        , ONOFF
        , case when POWER > 20 then 1 else 0 end POWER
--        , POWER
        , ENERGY_DIFF
    FROM AH_USE_LOG_BYMINUTE
    WHERE 1=1 
    AND GATEWAY_ID = '{gateway_id}'
    AND COLLECT_DATE = '{collect_date}'
    """

    df = pd.read_sql(sql, con=conn)
    df['date'] = df.COLLECT_DATE + ' ' + df.COLLECT_TIME
    #     print(sql)
    df.date = pd.to_datetime(df.date)
    print(collect_date)
    return (ggplot(df, aes(x='date', y='POWER'))
            + geom_line()
            + scale_x_datetime(breaks=date_breaks('2 hours'), labels=date_format('%H')))
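Note that the f-string interpolation above is open to SQL injection. A safer sketch of the same query using pd.read_sql's params argument (pymysql uses %s placeholders):

sql = """
SELECT COLLECT_DATE, COLLECT_TIME, ONOFF,
       CASE WHEN POWER > 20 THEN 1 ELSE 0 END POWER,
       ENERGY_DIFF
FROM AH_USE_LOG_BYMINUTE
WHERE GATEWAY_ID = %s AND COLLECT_DATE = %s
"""
df = pd.read_sql(sql, con=conn, params=(gateway_id, collect_date))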
Example no. 4
def groupid_metricat_eval_time(metric_name, cutoff, role, engine):

    qry = """set role {}; 
            select *,
                cast(model_group_id as varchar) as model_group_id_char,
                case when null_par = 'omit' then 'labeled' 
                    else 'all' end as null_label
            from results.view_modelgroup
            where 
                subset = 'all_data' and 
                cutoff = '{cutoff}' and 
                null_par = 'all' and
                metric_name = '{metric_name}' and
                type = 'abs';
            """.format(role, cutoff=cutoff, metric_name=metric_name)

    tab = pd.read_sql_query(qry, engine)

    gg = (
        ggplot(
            tab,
            aes(x='evaluation_start_time',
                y='value',
                color='model_type',
                group='model_group_id')) + geom_line() + geom_point() +
        theme(axis_text_x=element_text(angle=90)) +
        ggtitle("{} at {}".format(metric_name.title(), cutoff)) +
        #facet_wrap('~cutoff') +
        scale_color_brewer('qual', name='Model type') +
        scale_x_datetime(breaks=date_breaks('1 months')) +
        #ylim(0,1) +
        ylab(metric_name.title()) + xlab('Evaluation start time') +
        theme(figure_size=(5, 3)))

    return gg
Example no. 5
def chart_time_series(data):

    data = data.copy()

    category_list = data["category"].value_counts().index.tolist()
    category_cat = CategoricalDtype(categories=category_list, ordered=True)

    data["category_cat"] = data["category"].astype(str).astype(category_cat)

    p1 = (
        ggplot(data) + geom_bar(
            aes(x="date", y="quantity", fill="category_cat"),
            stat="identity",
            position=position_dodge(),
        ) + scale_x_datetime(breaks=date_breaks("1 years"),
                             labels=custom_date_format1) +
        labs(y="sample size", x="years", title="LAPIG") +
        guides(fill=guide_legend(title="Legend", ))  # new
    )

    return p1 + theme(
        panel_background=element_rect(fill="gray", alpha=0.2),
        dpi=120,
        figure_size=(12, 6),  # inches
        aspect_ratio=0.3,  # height:width
    )
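custom_date_format1 is defined elsewhere in this project. A minimal stand-in built on mizani's date_format (an assumption, not the original helper):

from mizani.formatters import date_format

# Hypothetical stand-in: label each tick with the four-digit year.
custom_date_format1 = date_format('%Y')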
Example no. 6
def plot_frequency(n=200):
    """
    Draw a histogram of the distribution of n tweets by date.

    Parameters
    ----------
    n: int
        How many tweets should be analysed.

    Returns
    -------
    None. Saves the histogram as a .png file in the static folder.
    """
    from plotnine import ggplot, aes, geom_histogram, scale_x_datetime, labs, theme_minimal, ggsave
    from Mod_1_API import gather_tweets
    from mizani.breaks import date_breaks
    from mizani.formatters import date_format
    import pandas

    df = pandas.DataFrame(gather_tweets(n))
       
    plot1 = (ggplot(df, aes(x = 'Date', fill = 'Author')) +
           geom_histogram() +
           scale_x_datetime(breaks=date_breaks('1 week')) +
           labs(x = "Time in weeks", y = "Number of tweets by source") +
           theme_minimal()
           )
    ggsave(plot = plot1, filename = "test.png", path = "static/")
Example no. 7
def plot_drawdowns(cumulative_returns, benchmark_cum_returns):
    """Any time the cumulative returns dips below the current cumulative
    maximum returns, it's a drawdown. Drawdowns are measured as a percentage of
    that maximum cumulative return, in effect, measured from peak equity."""
    benchmark_drawdown = get_drawdown(benchmark_cum_returns)
    benchmark_drawdown = benchmark_drawdown.to_frame()
    benchmark_drawdown = benchmark_drawdown.rename(columns={"benchmark": "drawdown"})
    benchmark_drawdown['key'] = "benchmark"
    benchmark_drawdown.index.name = 'date'
    benchmark_drawdown.reset_index(level=0, inplace=True)
    portfolio_drawdown = get_drawdown(cumulative_returns)
    portfolio_drawdown = portfolio_drawdown.to_frame()
    portfolio_drawdown['key'] = "portfolio"
    portfolio_drawdown = portfolio_drawdown.rename(columns={"returns": "drawdown"})
    portfolio_drawdown.index.name = 'date'
    portfolio_drawdown.reset_index(level=0, inplace=True)
    mask = benchmark_drawdown.date.isin(portfolio_drawdown.date)
    benchmark_drawdown = benchmark_drawdown[mask]
    df = pd.concat([portfolio_drawdown, benchmark_drawdown])  # DataFrame.append was removed in pandas 2.0
    df.to_csv(data_path + portfolio_name + 'drawdowns.csv', header=True)
    warnings.filterwarnings('ignore')
    d = (ggplot(df)
         + aes(x = 'date', y = 'drawdown', color='key', group='key')
         + geom_line()
         + scale_x_datetime(breaks=date_breaks('1 years'), labels=date_format('%Y'))
         + theme(axis_text_x=element_text(rotation=90, hjust=1))
         + labs(title=portfolio_name + ' portfolio vs. benchmark',
                y='Drawdown % (change peak to trough)')
         )
    d.save(filename=portfolio_name + 'drawdowns.png', format="png",
           path=results_path, width=6.4, height=4.8, dpi=125)
    warnings.filterwarnings('default')
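get_drawdown is not shown here. Following the docstring, a plausible sketch (assumed, not the original) measures the percentage drop from the running peak of the equity curve:

def get_drawdown(cum_returns):
    # Equity curve from cumulative returns, its running maximum, and the
    # drop from that peak: 0 at new highs, negative during a drawdown.
    equity = 1 + cum_returns
    return equity / equity.cummax() - 1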
Example no. 8
def plot_evolution_node(df, col):
    return ggplot(df) +\
            aes(x='timestamp', y=col) +\
            geom_line() +\
            geom_point(aes(color='weird'), size=0.5) +\
            geom_point(df[df.weird == True], aes(color='weird'), size=2) +\
            scale_color_manual({
                'NA': '#AAAAAA',
                True: '#FF0000',
                False: '#00FF00'}) +\
            theme_bw() +\
            geom_ribbon(aes(ymin='low_bound', ymax='high_bound'), color='grey', alpha=0.2) +\
            facet_wrap('cpu', labeller='label_both') +\
            scale_x_datetime(breaks=date_breaks('3 months'))
Example no. 9
def _generic_overview(df, changelog, col, weird_col, grey_after_reset=True):
    cluster = select_unique(df, 'cluster')
    df = df.copy()
    df['node_cpu'] = df['node'].astype(str) + ':' + df['cpu'].astype(str)
    node_cat = df[['node', 'cpu', 'node_cpu']].drop_duplicates().sort_values(
        by=['node', 'cpu'], ascending=False)['node_cpu']
    df['node_cpu'] = pandas.Categorical(df['node_cpu'],
                                        categories=node_cat,
                                        ordered=True)
    global_changes, local_changes = get_changes_from_changelog(
        changelog[changelog['date'] >= df['timestamp'].min()], cluster)
    local_changes['ymin'] = local_changes['node'].astype(str) + ':' + str(
        df['cpu'].min())
    local_changes['ymax'] = (local_changes['node'] +
                             1).astype(str) + ':' + str(df['cpu'].min())
    local_changes[col] = 42  # not used, but otherwise plotnine complains...
    points_args = {'stroke': 0, 'size': 3}
    # Points with unknown "weird" status: grey them out after a reset,
    # otherwise colour them like the rest.
    na_points = df[df[weird_col] == 'NA']
    if grey_after_reset:
        na_layer = geom_point(na_points, fill='#AAAAAA', **points_args)
    else:
        na_layer = geom_point(na_points, aes(fill=col), **points_args)
    plot = ggplot() +\
        aes(x='timestamp', y='node_cpu') +\
        na_layer +\
        geom_point(df[df[weird_col] == 'False'], aes(fill=col, shape='outlier'), **points_args) +\
        scale_shape_manual({False: 'o', True: 'X'}, limits=[False, True]) +\
        scale_color_manual({
            'protocol': '#888888',
            'G5K': '#DD9500'},
            guide=False) +\
        labs(shape='Outlier') +\
        theme_bw() +\
        scale_x_datetime(breaks=date_breaks(get_date_breaks(df))) +\
        ylab('Node:CPU') +\
        ggtitle(f'Overview of the cluster {cluster}')
    if len(local_changes) > 0:
        plot += geom_segment(local_changes,
                             aes(x='date',
                                 xend='date',
                                 y='ymin',
                                 yend='ymax',
                                 color='type'),
                             position=position_nudge(y=0.5),
                             size=1)
    if len(global_changes) > 0:
        plot += geom_vline(global_changes,
                           aes(xintercept='date', color='type'),
                           size=1)
    weird_points = df[~df[weird_col].isin({'NA', 'False'})]
    if len(weird_points) > 0:
        plot += geom_point(weird_points, aes(fill=col, shape='outlier'),
                           **points_args)
    return plot
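get_date_breaks (used again in the next example) is a project helper that picks a break width from the span of the timestamps. A plausible sketch of its behaviour (an assumption, not the original):

def get_date_breaks(df):
    # Choose a readable break width for scale_x_datetime from the time span.
    span = df['timestamp'].max() - df['timestamp'].min()
    if span.days > 365:
        return '3 months'
    if span.days > 60:
        return '2 weeks'
    return '1 day'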
Example no. 11
def plot_evolution_node(df, col, low_col, high_col, weird_col):
    return ggplot(df) +\
            aes(x='timestamp', y=col) +\
            geom_line() +\
            geom_point(aes(fill=weird_col, shape='outlier'), size=1.5, stroke=0) +\
            geom_point(df[df[weird_col].isin({'positive', 'negative'})], aes(fill=weird_col, shape='outlier'), size=3, stroke=0) +\
            scale_shape_manual({False: 'o', True: 'X'}, limits=[False, True]) +\
            scale_fill_manual({
                'NA': '#AAAAAA',
                'positive': '#FF0000',
                'negative': '#0000FF',
                'False': '#00FF00'}, limits=['False', 'positive', 'negative']) +\
            theme_bw() +\
            labs(fill='Weird', shape='Outlier') +\
            geom_ribbon(aes(ymin=low_col, ymax=high_col), color='grey', alpha=0.2) +\
            facet_wrap('cpu', labeller='label_both') +\
            scale_x_datetime(breaks=date_breaks(get_date_breaks(df)))
Example no. 12
def performance_graph(performance_data, data_name="recall", y_label="Recall"):
    p = (
        ggplot(performance_data) + aes("date", data_name) + scale_x_datetime(
            breaks=date_breaks("1 years"),
            # date_breaks=("5 years"),
            # date_minor_breaks=("1 years"),
            # limits=["1985-01-01 T 00:00 UTC", "2018-01-01 T 00:00 UTC"],
            labels=custom_date_format1,
        ) + ylab(y_label) + xlab("Year") + geom_line(color="blue", group=1) +
        ylim(0, 1) + theme_gray(base_size=14))

    p = p + theme(
        axis_line=element_line(size=0.7, color="gray"),
        panel_background=element_rect(fill="gray", alpha=0.2),
        dpi=120,
        figure_size=(8, 6),
        aspect_ratio=0.2,
    )

    return p
Example no. 13
def plot_portfolio_vs_benchmark(cumulative_returns, benchmark_cum_returns):
    benchmark_cum_returns = benchmark_cum_returns.rename(columns={"benchmark": "returns"})
    benchmark_cum_returns['key'] = "benchmark"
    cumulative_returns['key'] = "portfolio"
    cumulative_returns["returns"] = cumulative_returns["returns"]
    df = cumulative_returns.append(benchmark_cum_returns)
    df.index.name = 'date'
    df.reset_index(level=0, inplace=True)
    df['returns'] = df['returns']*100
    warnings.filterwarnings('ignore')
    df.to_csv(data_path + portfolio_name + 'returns.csv', header=True)
    r = (ggplot(df)
         + aes(x = 'date', y = 'returns', color='key', group='key')
         + geom_line()
         + scale_x_datetime(breaks=date_breaks('1 years'), labels=date_format('%Y'))
         + theme(axis_text_x=element_text(rotation=90, hjust=1))
         + labs(title=portfolio_name + ' portfolio vs. benchmark',
                y='Returns %')
         )
    r.save(filename=portfolio_name + 'returns.png', format="png",
           path=results_path, width=6.4, height=4.8, dpi=125)
    warnings.filterwarnings('default')
Example no. 14
}  # closing brace of areaname_dict (definition truncated above)
months = ['201801', '201802', '201803', '201804', '201805', '201806', '201807']
dfs = get_dfs(areaname_dict, months, appcode)
df = pd.concat(dfs)
df

# In[ ]:

# Convert the date and aqi columns to the required types
df.time = pd.to_datetime(df.time)
df.aqi = pd.to_numeric(df.aqi)
import matplotlib.pyplot as plt

get_ipython().run_line_magic('matplotlib', 'inline')
import plotnine
from plotnine import *
from mizani.breaks import date_breaks
# Line chart comparing aqi air quality across areas
(ggplot(df, aes(x='time', y='aqi', color='factor(areaname)')) + geom_line() +
 scale_x_datetime(breaks=date_breaks('2 week')) + xlab('日期') +
 theme_matplotlib() + theme(axis_text_x=element_text(rotation=45, hjust=1)) +
 theme(text=element_text(family='Arial Unicode MS')))

# In[ ]:

# Scatter plot comparing aqiLevel across areas
(ggplot(df, aes(x='time', y='aqiLevel', color='factor(areaname)')) +
 geom_point() + scale_x_datetime(breaks=date_breaks('2 week')) + xlab('日期') +
 theme_matplotlib() + theme(axis_text_x=element_text(rotation=45, hjust=1)) +
 theme(text=element_text(family='Arial Unicode MS')))
Example no. 15
result_df['거래금액'] = pd.to_numeric(result_df['거래금액'])
result_df['도로명시군구코드'] = pd.to_numeric(result_df['도로명시군구코드'], downcast='integer')
result_df = pd.merge(left=result_df, 
                     right=gu_code_data, 
                     left_on='도로명시군구코드', 
                     right_on='코드').drop('코드', axis=1)
result_df['년월'] = result_df['년'] + result_df['월']
result_df['년월'] = result_df['년월'].map(lambda x : datetime.datetime.strptime(x, '%Y%m'))

chart_df = result_df.groupby(['년월', '구'])['거래금액'].agg('sum')
chart_df = chart_df.reset_index()
chart_df['거래금액'] = chart_df['거래금액'] * 0.0001
chart_df = chart_df.query('년월 != "2019-07-01"')   # excluded: June 2019 actual transactions are only partially recorded so far

#%%
# Draw the chart

(ggplot(data=chart_df, mapping=aes(x = '년월', y = '거래금액', color='구'))
 + geom_line()
 + scale_x_datetime(breaks=date_breaks('6 months'), labels=date_format('%Y%m'))
 + scale_y_continuous(breaks=list(range(0, 20000, 2500)),labels = comma_format())
 + labs(x='기간', y='거래금액 (단위:억원)', color='')
 + theme(text=element_text(family='Malgun Gothic'))
)

#%%
# Pivot and save to a file

trade_vol_df = chart_df.pivot(index='년월', columns='구', values='거래금액')
trade_vol_df.to_csv(file_path + 'apt_trade_vol_data.csv', encoding='euc-kr')
Example no. 16
def time_graphs(engine, id_company):
    """
    Create charts of inspections, inspection types, and infractions over time.
        engine: database engine to query with
        id_company: rutempresamask
    """

    #  Query with new variables
    qry = """set role direccion_trabajo_inspections_write;
        select rutempresamask,  
        date(agno || '-' || mesreg || '-01') as date,    
        count(rutempresamask) as inspections, 
        sum(infra)  as infractions,
        sum(num_materias) as matters,
        sum(case when solicitante = 'Por Programa' then 1 else 0 end) as proactive,
        sum(case when solicitante = 'Por Programa' then 0 else 1 end) as reactive
        from raw.inspections_complete
        where rutempresamask = {} 
        group by rutempresamask, date
        order by date;""".format("'" + id_company + "'")

    tab_summ = pd.read_sql_query(qry, engine)

    # Date format for graph
    tab_summ['date'] = pd.to_datetime(tab_summ.date)

    # New features
    tab_summ['infractions'] = tab_summ['infractions'].astype('int')
    tab_summ['matters'] = tab_summ['matters'].astype('int')
    tab_summ['prop_raw'] = tab_summ['infractions']/tab_summ['matters']
    tab_summ['prop'] = round(100*tab_summ['infractions']/tab_summ['matters'])
    tab_summ['prop'] = tab_summ['prop'].astype('int')
    tab_summ['prop'] = tab_summ['prop'].map(str) + "%"
    

    # Number of inspections
    gg1 = (ggplot(tab_summ, aes(x='date', y='inspections')) +
        geom_hline(yintercept = 0, color = 'gray') +
        geom_bar(stat = 'identity', fill = 'purple', alpha = .5) +
        geom_text(aes(y = 'inspections + 1', label = 'inspections'),
                  color = 'black', size = 10) +
        ylab('Count') +
        xlab('Month of inspection')  +
        scale_x_datetime(
             breaks=date_breaks('1 months'),
             labels=custom_date_format2) +
        theme(axis_text_x = element_text(angle = 90, size= 8),
             figure_size = (15, 3)))

    # Number of inspections by type of inspections
    tab_gg = pd.melt(tab_summ[['rutempresamask', "date", "proactive","reactive", "inspections"]],
            id_vars = ['rutempresamask', 'date', 'inspections'])

    gg2 = (ggplot(tab_gg, aes(x='date', y='value', fill='variable')) +
        geom_hline(yintercept = 0, color = 'gray') +
        geom_bar(stat = 'identity', alpha = .5) +
        #geom_text(aes(y = 'inspections + 1', label = 'inspections'),
        #          color = 'black', size = 10) +
        ylab('Count') +
        xlab('Month of inspection')  +
        scale_x_datetime(
             breaks=date_breaks('1 months'),
             labels=custom_date_format2) +
        theme(axis_text_x = element_text(angle = 90, size= 8),
             figure_size = (15, 3)))

    # Matters inspected vs matters with infractions
    gg3 = (ggplot(tab_summ, aes(x='date', y=-1)) +
        geom_hline(yintercept = -1, color = 'gray') +
        geom_linerange(aes(ymin =0, ymax = 'matters'),
                color = 'blue') +
        geom_linerange(aes(ymin =0, ymax = 'infractions'),
                color = 'red') +
        geom_point(aes(y = 'infractions'),
                color = 'red', size = 4,
                   alpha = .3, shape  = 4) +
        geom_text(aes(y = 'matters + 6', label = 'prop'),
                size = 4, color = 'gray') +
        ylab('Count') +
        xlab('Month of infractions') +
        scale_x_datetime(
             breaks=date_breaks('1 months'),
             labels=custom_date_format2) +
        theme(axis_text_x = element_text(angle = 90, size= 8),
             figure_size = (15, 3)))

    return gg1, gg2, gg3
Example no. 17
def filter_func(limits):
    breaks = date_breaks(width)(limits)  # width comes from the enclosing scope
    # keep only the breaks that fall in odd-numbered months
    return [x for x in breaks if x.month % 2]
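Since breaks may be any callable on the limits, the closure drops straight into a datetime scale:

from plotnine import scale_x_datetime

# Usage sketch: ticks only on odd-numbered months.
scale_x_datetime(breaks=filter_func)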
Example no. 18
    fontsize=9)
#plt.show()
g.savefig("PV-Channel-bar-an.jpeg.png", bbox_inches="tight", dpi=600)

####################################################################
#Evolution of page views by channel (smoothed)
####################################################################
###################################################################################
###### We'll use Plotnine: the ggplot implementation for Python ########
#https://plotnine.readthedocs.io/en/stable/generated/plotnine.stats.stat_smooth.html#plotnine.stats.stat_smooth

p = (
    ggplot(dateChannel_data) + stat_smooth(
        aes('date', 'pageviews', color='channel'), method='loess', span=0.4) +
    ylab("Pages vues") +
    scale_x_datetime(breaks=date_breaks('2 years')) +  # new
    ggtitle(
        "Le canal 'search' a augmenté jusqu'en 2015 puis a baissé fortement. \nLes autres canaux ont régulièrement baissé"
    ) + xlab(
        "Date\nTrafic Global - Evolution lissée des pages vues selon les canaux depuis 2011"
    ))

p.save("PV-Channel-smooth.png", bbox_inches="tight", dpi=600)

##Note: some display parameters are not implemented, for example caption.
#save dateChannel_data
dateChannel_data.to_csv("dateChannel_data.csv", sep=";", index=False)

##########################################################################
# For the base traffic
Example no. 19
# COMMAND ----------

from plotnine import *
from plotnine.data import meat
from mizani.breaks import date_breaks
from mizani.formatters import date_format

spkDF = spark.sql("SELECT DAYTIME,EVENT_ID, LUBE_OIL_PRESS FROM rc_data_05 WHERE RUL BETWEEN 1 AND 1440 and MM in (10,20,30,40,50) " )
spkDFPD = spkDF.toPandas()

#series = pdAll[['DAYTIME','LUBE_OIL_PRESS']]

pn = ggplot(spkDFPD, aes('DAYTIME','LUBE_OIL_PRESS')) + \
    geom_line(color='blue') + \
    scale_x_date(breaks=date_breaks('1 years'), labels=date_format('%b %Y')) + \
    scale_y_continuous() + theme_bw() + theme(figure_size=(12, 8))

# COMMAND ----------

display(pn.draw())

# COMMAND ----------

### This step cleans the data by imputing each column's simple median for missing or unknown sensor values.
from pyspark.sql.functions import when

def replaceByMedian(pySparkDF, columnList):
    for colName in columnList:
        # approxQuantile(column, probabilities, relativeError): [0.5] gives an approximate median
        med = pySparkDF.approxQuantile(colName, [0.5], 0.25)
        pySparkDF = pySparkDF.withColumn(colName, when(pySparkDF[colName].isNotNull(), pySparkDF[colName]).otherwise(med[0]))
    return pySparkDF
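A usage sketch under the assumptions above (the column name is taken from the query earlier in this example):

# Impute medians for the sensor column, then continue as before.
spkDF = replaceByMedian(spkDF, ['LUBE_OIL_PRESS'])
spkDFPD = spkDF.toPandas()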