def test_date_breaks():
    # cpython
    x = [datetime(year, 1, 1) for year in [2010, 2026, 2015]]
    limits = min(x), max(x)

    breaks = date_breaks('5 Years')
    years = [d.year for d in breaks(limits)]
    npt.assert_array_equal(years, [2010, 2015, 2020, 2025, 2030])

    breaks = date_breaks('10 Years')
    years = [d.year for d in breaks(limits)]
    npt.assert_array_equal(years, [2010, 2020, 2030])

    # numpy
    x = [np.datetime64(i * 10, 'D') for i in range(1, 10)]
    breaks = date_breaks('10 Years')
    limits = min(x), max(x)
    with pytest.raises(AttributeError):
        breaks(limits)

    # NaT
    limits = np.datetime64('NaT'), datetime(2017, 1, 1)
    breaks = date_breaks('10 Years')
    assert len(breaks(limits)) == 0
def __init__(self, **kwargs):
    # Permit the use of the general parameters for
    # specifying the format strings
    with suppress(KeyError):
        breaks = kwargs['breaks']
        if isinstance(breaks, six.string_types):
            kwargs['breaks'] = date_breaks(breaks)

    with suppress(KeyError):
        minor_breaks = kwargs['minor_breaks']
        if isinstance(minor_breaks, six.string_types):
            kwargs['minor_breaks'] = date_breaks(minor_breaks)

    # The more specific date_* parameters take precedence
    with suppress(KeyError):
        breaks_fmt = kwargs.pop('date_breaks')
        kwargs['breaks'] = date_breaks(breaks_fmt)

    with suppress(KeyError):
        labels_fmt = kwargs.pop('date_labels')
        kwargs['labels'] = date_format(labels_fmt)

    with suppress(KeyError):
        minor_breaks_fmt = kwargs.pop('date_minor_breaks')
        kwargs['minor_breaks'] = date_breaks(minor_breaks_fmt)

    scale_continuous.__init__(self, **kwargs)
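# For reference, a minimal sketch of what the translated kwargs become: both
# the generic string form (breaks='1 months') and the specific form
# (date_breaks='1 months') end up as the same mizani callables. The limits
# and format below are illustrative, not taken from the scale above.
from datetime import datetime
from mizani.breaks import date_breaks
from mizani.formatters import date_format

brk = date_breaks('1 months')   # (min, max) -> list of datetimes
fmt = date_format('%Y-%m')      # list of datetimes -> list of strings

limits = datetime(2020, 1, 1), datetime(2020, 4, 1)
print(fmt(brk(limits)))         # monthly labels, e.g. ['2020-01', ..., '2020-04']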
def one_day_graph(collect_date='20191015', gateway_id='ep18270334'):
    db = 'aihems_api_db'
    # db = 'aihems_service_db'
    conn = pymysql.connect(
        host='aihems-service-db.cnz3sewvscki.ap-northeast-2.rds.amazonaws.com',
        port=3306,
        user='******',
        passwd='#cslee1234',
        db=db,
        charset='utf8')
    sql = f"""
    SELECT COLLECT_DATE
         , COLLECT_TIME
         , ONOFF
         , case when POWER > 20 then 1 else 0 end POWER
    --   , POWER
         , ENERGY_DIFF
    FROM AH_USE_LOG_BYMINUTE
    WHERE 1=1
      AND GATEWAY_ID = '{gateway_id}'
      AND COLLECT_DATE = '{collect_date}'
    """
    df = pd.read_sql(sql, con=conn)
    df['date'] = df.COLLECT_DATE + ' ' + df.COLLECT_TIME
    # print(sql)
    df.date = pd.to_datetime(df.date)
    print(collect_date)
    return (ggplot(df, aes(x='date', y='POWER'))
            + geom_line()
            + scale_x_datetime(breaks=date_breaks('2 hours'),
                               labels=date_format('%H')))
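# The f-string interpolation above works but is injection-prone. A sketch of
# the same query with bound parameters instead (pymysql uses %s placeholders,
# and pandas.read_sql forwards `params` to the driver); conn, gateway_id and
# collect_date are the names from the function above.
sql = """
SELECT COLLECT_DATE, COLLECT_TIME, ONOFF,
       CASE WHEN POWER > 20 THEN 1 ELSE 0 END POWER,
       ENERGY_DIFF
FROM AH_USE_LOG_BYMINUTE
WHERE GATEWAY_ID = %s AND COLLECT_DATE = %s
"""
df = pd.read_sql(sql, con=conn, params=(gateway_id, collect_date))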
def groupid_metricat_eval_time(metric_name, cutoff, role, engine):
    qry = """set role {};
          select *,
              cast(model_group_id as varchar) as model_group_id_char,
              case when null_par = 'omit' then 'labeled'
                   else 'all' end as null_label
          from results.view_modelgroup
          where subset = 'all_data'
              and cutoff = '{cutoff}'
              and null_par = 'all'
              and metric_name = '{metric_name}'
              and type = 'abs';
          """.format(role, cutoff=cutoff, metric_name=metric_name)
    tab = pd.read_sql_query(qry, engine)
    gg = (
        ggplot(tab, aes(x='evaluation_start_time',
                        y='value',
                        color='model_type',
                        group='model_group_id')) +
        geom_line() +
        geom_point() +
        theme(axis_text_x=element_text(angle=90)) +
        ggtitle("{} at {}".format(metric_name.title(), cutoff)) +
        # facet_wrap('~cutoff') +
        scale_color_brewer('qual', name='Model type') +
        scale_x_datetime(breaks=date_breaks('1 months')) +
        # ylim(0, 1) +
        ylab(metric_name.title()) +
        xlab('Evaluation start time') +
        theme(figure_size=(5, 3)))
    return gg
def chart_time_series(data):
    data = data.copy()
    category_list = data["category"].value_counts().index.tolist()
    category_cat = CategoricalDtype(categories=category_list, ordered=True)
    data["category_cat"] = data["category"].astype(str).astype(category_cat)

    p1 = (
        ggplot(data) +
        geom_bar(
            aes(x="date", y="quantity", fill="category_cat"),
            stat="identity",
            position=position_dodge(),
        ) +
        scale_x_datetime(breaks=date_breaks("1 years"),
                         labels=custom_date_format1) +
        labs(y="sample size", x="years", title="LAPIG") +
        guides(fill=guide_legend(title="Legend"))  # new
    )
    return p1 + theme(
        panel_background=element_rect(fill="gray", alpha=0.2),
        dpi=120,
        figure_size=(12, 6),  # inches
        aspect_ratio=0.3,  # height:width
    )
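# custom_date_format1 is defined elsewhere in this project. Given the yearly
# breaks, it is plausibly just a mizani year formatter along these lines
# (an assumption, not the project's actual definition).
from mizani.formatters import date_format

custom_date_format1 = date_format('%Y')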
def plot_frequency(n=200):
    """
    Draws the histogram of the distribution of n tweets by date.

    Parameters
    ----------
    n: int
        An integer specifying how many tweets should be analysed.

    Returns
    -------
    It saves the histogram as a .png file in the static folder.
    """
    from plotnine import (ggplot, aes, geom_histogram, scale_x_datetime,
                          labs, theme_minimal, ggsave)
    from Mod_1_API import gather_tweets
    from mizani.breaks import date_breaks
    from mizani.formatters import date_format
    import pandas

    df = pandas.DataFrame(gather_tweets(n))
    plot1 = (ggplot(df, aes(x='Date', fill='Author')) +
             geom_histogram() +
             scale_x_datetime(breaks=date_breaks('1 week')) +
             labs(x="Time in weeks", y="Number of tweets by source") +
             theme_minimal())
    ggsave(plot=plot1, filename="test.png", path="static/")
def plot_drawdowns(cumulative_returns, benchmark_cum_returns):
    """Any time the cumulative returns dip below the running maximum of the
    cumulative returns, the portfolio is in a drawdown. Drawdowns are measured
    as a percentage of that maximum cumulative return, in effect, measured
    from peak equity."""
    benchmark_drawdown = get_drawdown(benchmark_cum_returns)
    benchmark_drawdown = benchmark_drawdown.to_frame()
    benchmark_drawdown = benchmark_drawdown.rename(columns={"benchmark": "drawdown"})
    benchmark_drawdown['key'] = "benchmark"
    benchmark_drawdown.index.name = 'date'
    benchmark_drawdown.reset_index(level=0, inplace=True)

    portfolio_drawdown = get_drawdown(cumulative_returns)
    portfolio_drawdown = portfolio_drawdown.to_frame()
    portfolio_drawdown['key'] = "portfolio"
    portfolio_drawdown = portfolio_drawdown.rename(columns={"returns": "drawdown"})
    portfolio_drawdown.index.name = 'date'
    portfolio_drawdown.reset_index(level=0, inplace=True)

    mask = benchmark_drawdown.date.isin(portfolio_drawdown.date)
    benchmark_drawdown = benchmark_drawdown[mask]

    df = portfolio_drawdown.append(benchmark_drawdown)
    df.to_csv(data_path + portfolio_name + 'drawdowns.csv', header=True)

    warnings.filterwarnings('ignore')
    d = (ggplot(df) +
         aes(x='date', y='drawdown', color='key', group='key') +
         geom_line() +
         scale_x_datetime(breaks=date_breaks('1 years'),
                          labels=date_format('%Y')) +
         theme(axis_text_x=element_text(rotation=90, hjust=1)) +
         labs(title=portfolio_name + 'portfolio vs. benchmark',
              y='Drawdown % (change peak to trough)'))
    d.save(filename=portfolio_name + 'drawdowns.png',
           format="png", path=results_path,
           width=6.4, height=4.8, dpi=125)
    warnings.filterwarnings('default')
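# get_drawdown is defined elsewhere. A minimal sketch consistent with the
# docstring above, assuming the input is a pandas Series holding the equity
# curve (cumulative growth factors); if it holds cumulative % returns,
# convert with 1 + cum_returns first.
import pandas as pd

def get_drawdown(cum_returns: pd.Series) -> pd.Series:
    running_peak = cum_returns.cummax()      # highest value seen so far
    return cum_returns / running_peak - 1    # 0 at a new peak, negative in a drawdown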
def plot_evolution_node(df, col):
    return ggplot(df) +\
        aes(x='timestamp', y=col) +\
        geom_line() +\
        geom_point(aes(color='weird'), size=0.5) +\
        geom_point(df[df.weird == True], aes(color='weird'), size=2) +\
        scale_color_manual({
            'NA': '#AAAAAA',
            True: '#FF0000',
            False: '#00FF00'}) +\
        theme_bw() +\
        geom_ribbon(aes(ymin='low_bound', ymax='high_bound'),
                    color='grey', alpha=0.2) +\
        facet_wrap('cpu', labeller='label_both') +\
        scale_x_datetime(breaks=date_breaks('3 months'))
def _generic_overview(df, changelog, col, weird_col, grey_after_reset=True):
    cluster = select_unique(df, 'cluster')
    df = df.copy()
    df['node_cpu'] = df['node'].astype(str) + ':' + df['cpu'].astype(str)
    node_cat = df[['node', 'cpu', 'node_cpu']].drop_duplicates().sort_values(
        by=['node', 'cpu'], ascending=False)['node_cpu']
    df['node_cpu'] = pandas.Categorical(df['node_cpu'], categories=node_cat,
                                        ordered=True)
    global_changes, local_changes = get_changes_from_changelog(
        changelog[changelog['date'] >= df['timestamp'].min()], cluster)
    local_changes['ymin'] = local_changes['node'].astype(str) + ':' + str(
        df['cpu'].min())
    local_changes['ymax'] = (local_changes['node'] + 1).astype(str) + ':' + str(
        df['cpu'].min())
    local_changes[col] = 42  # not used, but otherwise plotnine complains...
    points_args = {'stroke': 0, 'size': 3}
    plot = ggplot() +\
        aes(x='timestamp', y='node_cpu') +\
        geom_point(df[df[weird_col] == 'NA'],
                   *([aes(fill=col)] if not grey_after_reset else []),
                   **{**points_args,
                      **({'fill': '#AAAAAA'} if grey_after_reset else {})}) +\
        geom_point(df[df[weird_col] == 'False'],
                   aes(fill=col, shape='outlier'),
                   **points_args) +\
        scale_shape_manual({False: 'o', True: 'X'}, limits=[False, True]) +\
        scale_color_manual({
            'protocol': '#888888',
            'G5K': '#DD9500'}, guide=False) +\
        labs(shape='Outlier') +\
        theme_bw() +\
        scale_x_datetime(breaks=date_breaks(get_date_breaks(df))) +\
        ylab('Node:CPU') +\
        ggtitle(f'Overview of the cluster {cluster}')
    if len(local_changes) > 0:
        plot += geom_segment(local_changes,
                             aes(x='date', xend='date', y='ymin', yend='ymax',
                                 color='type'),
                             position=position_nudge(y=0.5), size=1)
    if len(global_changes) > 0:
        plot += geom_vline(global_changes,
                           aes(xintercept='date', color='type'), size=1)
    weird_points = df[~df[weird_col].isin({'NA', 'False'})]
    if len(weird_points) > 0:
        plot += geom_point(weird_points, aes(fill=col, shape='outlier'),
                           **points_args)
    return plot
def plot_evolution_node(df, col, low_col, high_col, weird_col):
    return ggplot(df) +\
        aes(x='timestamp', y=col) +\
        geom_line() +\
        geom_point(aes(fill=weird_col, shape='outlier'), size=1.5, stroke=0) +\
        geom_point(df[df[weird_col].isin({'positive', 'negative'})],
                   aes(fill=weird_col, shape='outlier'), size=3, stroke=0) +\
        scale_shape_manual({False: 'o', True: 'X'}, limits=[False, True]) +\
        scale_fill_manual({
            'NA': '#AAAAAA',
            'positive': '#FF0000',
            'negative': '#0000FF',
            'False': '#00FF00'}, limits=['False', 'positive', 'negative']) +\
        theme_bw() +\
        labs(fill='Weird', shape='Outlier') +\
        geom_ribbon(aes(ymin=low_col, ymax=high_col), color='grey', alpha=0.2) +\
        facet_wrap('cpu', labeller='label_both') +\
        scale_x_datetime(breaks=date_breaks(get_date_breaks(df)))
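# get_date_breaks appears in both the overview and evolution plots above; it
# evidently maps the observed time span to a mizani width string. A plausible
# sketch (the thresholds and widths are assumptions, not the real helper):
def get_date_breaks(df):
    span = df['timestamp'].max() - df['timestamp'].min()
    if span.days > 2 * 365:
        return '6 months'
    if span.days > 180:
        return '1 months'
    return '1 weeks'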
def performance_graph(performance_data, data_name="recall", y_label="Recall"):
    p = (
        ggplot(performance_data) +
        aes("date", data_name) +
        scale_x_datetime(
            breaks=date_breaks("1 years"),
            # date_breaks=("5 years"),
            # date_minor_breaks=("1 years"),
            # limits=["1985-01-01 T 00:00 UTC", "2018-01-01 T 00:00 UTC"],
            labels=custom_date_format1,
        ) +
        ylab(y_label) +
        xlab("Year") +
        geom_line(color="blue", group=1) +
        ylim(0, 1) +
        theme_gray(base_size=14))
    p = p + theme(
        axis_line=element_line(size=0.7, color="gray"),
        panel_background=element_rect(fill="gray", alpha=0.2),
        dpi=120,
        figure_size=(8, 6),
        aspect_ratio=0.2,
    )
    return p
def plot_portfolio_vs_benchmark(cumulative_returns, benchmark_cum_returns):
    benchmark_cum_returns = benchmark_cum_returns.rename(
        columns={"benchmark": "returns"})
    benchmark_cum_returns['key'] = "benchmark"
    cumulative_returns['key'] = "portfolio"
    cumulative_returns["returns"] = cumulative_returns["returns"]
    df = cumulative_returns.append(benchmark_cum_returns)
    df.index.name = 'date'
    df.reset_index(level=0, inplace=True)
    df['returns'] = df['returns'] * 100

    warnings.filterwarnings('ignore')
    df.to_csv(data_path + portfolio_name + 'returns.csv', header=True)
    r = (ggplot(df) +
         aes(x='date', y='returns', color='key', group='key') +
         geom_line() +
         scale_x_datetime(breaks=date_breaks('1 years'),
                          labels=date_format('%Y')) +
         theme(axis_text_x=element_text(rotation=90, hjust=1)) +
         labs(title=portfolio_name + 'portfolio vs. benchmark',
              y='Returns %'))
    r.save(filename=portfolio_name + 'returns.png',
           format="png", path=results_path,
           width=6.4, height=4.8, dpi=125)
    warnings.filterwarnings('default')
}

months = ['201801', '201802', '201803', '201804',
          '201805', '201806', '201807']
dfs = get_dfs(areaname_dict, months, appcode)
df = pd.concat(dfs)
df


# In[ ]:


# Convert the date and aqi columns to the required types
df.time = pd.to_datetime(df.time)
df.aqi = pd.to_numeric(df.aqi)

import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')
import plotnine
from plotnine import *
from mizani.breaks import date_breaks

# Line chart comparing aqi across areas
(ggplot(df, aes(x='time', y='aqi', color='factor(areaname)'))
 + geom_line()
 + scale_x_datetime(breaks=date_breaks('2 week'))
 + xlab('日期')
 + theme_matplotlib()
 + theme(axis_text_x=element_text(rotation=45, hjust=1))
 + theme(text=element_text(family='Arial Unicode MS')))


# In[ ]:


# Scatter plot comparing aqiLevel across areas
(ggplot(df, aes(x='time', y='aqiLevel', color='factor(areaname)'))
 + geom_point()
 + scale_x_datetime(breaks=date_breaks('2 week'))
 + xlab('日期')
 + theme_matplotlib()
 + theme(axis_text_x=element_text(rotation=45, hjust=1))
 + theme(text=element_text(family='Arial Unicode MS')))
result_df['거래금액'] = pd.to_numeric(result_df['거래금액'])
result_df['도로명시군구코드'] = pd.to_numeric(result_df['도로명시군구코드'],
                                    downcast='integer')
result_df = pd.merge(left=result_df, right=gu_code_data,
                     left_on='도로명시군구코드', right_on='코드').drop('코드', axis=1)
result_df['년월'] = result_df['년'] + result_df['월']
result_df['년월'] = result_df['년월'].map(
    lambda x: datetime.datetime.strptime(x, '%Y%m'))

chart_df = result_df.groupby(['년월', '구'])['거래금액'].agg('sum')
chart_df = chart_df.reset_index()
chart_df['거래금액'] = chart_df['거래금액'] * 0.0001
# Excluded: the June 2019 actual transactions are only partially reflected yet
chart_df = chart_df.query('년월 != "2019-07-01"')

#%%
# Draw the chart
(ggplot(data=chart_df, mapping=aes(x='년월', y='거래금액', color='구'))
 + geom_line()
 + scale_x_datetime(breaks=date_breaks('6 months'), labels=date_format('%Y%m'))
 + scale_y_continuous(breaks=list(range(0, 20000, 2500)), labels=comma_format())
 + labs(x='기간', y='거래금액 (단위:억원)', color='')
 + theme(text=element_text(family='Malgun Gothic'))
 )

#%%
# Pivot and save to a file
trade_vol_df = chart_df.pivot(index='년월', columns='구', values='거래금액')
trade_vol_df.to_csv(file_path + 'apt_trade_vol_data.csv', encoding='euc-kr')
def time_graphs(engine, id_company):
    """
    Function to create inspections, type of inspections and
    infractions over time
    engine: to query
    id_company: rutempresamask
    """
    # Query with new variables
    qry = """set role direccion_trabajo_inspections_write;
          select rutempresamask,
              date(agno || '-' || mesreg || '-01') as date,
              count(rutempresamask) as inspections,
              sum(infra) as infractions,
              sum(num_materias) as matters,
              sum(case when solicitante = 'Por Programa' then 1 else 0 end) as proactive,
              sum(case when solicitante = 'Por Programa' then 0 else 1 end) as reactive
          from raw.inspections_complete
          where rutempresamask = {}
          group by rutempresamask, date
          order by date;""".format("'" + id_company + "'")
    tab_summ = pd.read_sql_query(qry, engine)

    # Date format for graph
    tab_summ['date'] = pd.to_datetime(tab_summ.date)

    # New features
    tab_summ['infractions'] = tab_summ['infractions'].astype('int')
    tab_summ['matters'] = tab_summ['matters'].astype('int')
    tab_summ['prop_raw'] = tab_summ['infractions'] / tab_summ['matters']
    tab_summ['prop'] = round(100 * tab_summ['infractions'] / tab_summ['matters'])
    tab_summ['prop'] = tab_summ['prop'].astype('int')
    tab_summ['prop'] = tab_summ['prop'].map(str) + "%"

    # Number of inspections
    gg1 = (ggplot(tab_summ, aes(x='date', y='inspections')) +
           geom_hline(yintercept=0, color='gray') +
           geom_bar(stat='identity', fill='purple', alpha=.5) +
           geom_text(aes(y='inspections + 1', label='inspections'),
                     color='black', size=10) +
           ylab('Count') +
           xlab('Month of inspection') +
           scale_x_datetime(breaks=date_breaks('1 months'),
                            labels=custom_date_format2) +
           theme(axis_text_x=element_text(angle=90, size=8),
                 figure_size=(15, 3)))

    # Number of inspections by type of inspections
    tab_gg = pd.melt(tab_summ[['rutempresamask', 'date', 'proactive',
                               'reactive', 'inspections']],
                     id_vars=['rutempresamask', 'date', 'inspections'])
    gg2 = (ggplot(tab_gg, aes(x='date', y='value', fill='variable')) +
           geom_hline(yintercept=0, color='gray') +
           geom_bar(stat='identity', alpha=.5) +
           # geom_text(aes(y='inspections + 1', label='inspections'),
           #           color='black', size=10) +
           ylab('Count') +
           xlab('Month of inspection') +
           scale_x_datetime(breaks=date_breaks('1 months'),
                            labels=custom_date_format2) +
           theme(axis_text_x=element_text(angle=90, size=8),
                 figure_size=(15, 3)))

    # Matters inspected vs matters with infractions
    gg3 = (ggplot(tab_summ, aes(x='date', y=-1)) +
           geom_hline(yintercept=-1, color='gray') +
           geom_linerange(aes(ymin=0, ymax='matters'), color='blue') +
           geom_linerange(aes(ymin=0, ymax='infractions'), color='red') +
           geom_point(aes(y='infractions'), color='red', size=4,
                      alpha=.3, shape=4) +
           geom_text(aes(y='matters + 6', label='prop'), size=4, color='gray') +
           ylab('Count') +
           xlab('Month of infractions') +
           scale_x_datetime(breaks=date_breaks('1 months'),
                            labels=custom_date_format2) +
           theme(axis_text_x=element_text(angle=90, size=8),
                 figure_size=(15, 3)))

    return gg1, gg2, gg3
def filter_func(limits):
    breaks = date_breaks(width)(limits)
    # keep only the breaks that fall in odd-numbered months
    return [x for x in breaks if x.month % 2]
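# A quick check of what filter_func yields, assuming width was captured from
# the enclosing scope (the width and limits below are illustrative):
from datetime import datetime
from mizani.breaks import date_breaks

width = '1 months'
limits = datetime(2020, 1, 1), datetime(2020, 6, 1)
odd = [x for x in date_breaks(width)(limits) if x.month % 2]
print([d.strftime('%Y-%m') for d in odd])  # odd-numbered months only: Jan, Mar, May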
    fontsize=9)
# plt.show()
g.savefig("PV-Channel-bar-an.jpeg.png", bbox_inches="tight", dpi=600)

####################################################################
# Evolution of page views by channel (smoothed)
####################################################################
###################################################################################
###### We will use plotnine: an implementation of ggplot in Python          ######
# https://plotnine.readthedocs.io/en/stable/generated/plotnine.stats.stat_smooth.html#plotnine.stats.stat_smooth
p = (
    ggplot(dateChannel_data) +
    stat_smooth(aes('date', 'pageviews', color='channel'),
                method='loess', span=0.4) +
    ylab("Pages vues") +
    scale_x_datetime(breaks=date_breaks('2 years')) +  # new
    ggtitle(
        "Le canal 'search' a augmenté jusqu'en 2015 puis a baissé fortement. \nLes autres canaux ont régulièrement baissé"
    ) +
    xlab(
        "Date\nTrafic Global - Evolution lissée des pages vues selon les canaux depuis 2011"
    ))
p.save("PV-Channel-smooth.png", bbox_inches="tight", dpi=600)
# Note: some display parameters, such as caption, are not implemented.

# Save dateChannel_data
dateChannel_data.to_csv("dateChannel_data.csv", sep=";", index=False)

##########################################################################
# For the base traffic
# COMMAND ----------

from plotnine import *
from plotnine.data import meat
from mizani.breaks import date_breaks
from mizani.formatters import date_format

spkDF = spark.sql("SELECT DAYTIME, EVENT_ID, LUBE_OIL_PRESS FROM rc_data_05 "
                  "WHERE RUL BETWEEN 1 AND 1440 and MM in (10,20,30,40,50)")
spkDFPD = spkDF.toPandas()
# series = pdAll[['DAYTIME','LUBE_OIL_PRESS']]

pn = ggplot(spkDFPD, aes('DAYTIME', 'LUBE_OIL_PRESS')) + \
    geom_line(color='blue') + \
    scale_x_date(breaks=date_breaks('1 years'), labels=date_format('%b %Y')) + \
    scale_y_continuous() + theme_bw() + theme(figure_size=(12, 8))

# COMMAND ----------

display(pn.draw())

# COMMAND ----------

# This step cleans the data by substituting the median of each column for
# missing or unknown sensor values.
from pyspark.sql.functions import when

def replaceByMedian(pySparkDF, columnList):
    for colName in columnList:
        med = pySparkDF.approxQuantile(colName, [0.5], 0.25)
        pySparkDF = pySparkDF.withColumn(
            colName,
            when(pySparkDF[colName].isNotNull(),
                 pySparkDF[colName]).otherwise(med[0]))
    return pySparkDF
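# Hypothetical usage of replaceByMedian: 'LUBE_OIL_PRESS' comes from the query
# above, 'LUBE_OIL_TEMP' is an assumed additional sensor column. Note that the
# third argument to approxQuantile is a relative-error bound, so the value
# substituted is an approximate median.
cleanedDF = replaceByMedian(spkDF, ['LUBE_OIL_PRESS', 'LUBE_OIL_TEMP'])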