Esempio n. 1
0
def plot_trend_season(dates, ndf_domain, x, x_trend, season, my_domain):
    """Build a faceted ggplot of a volume series, its trend and its seasonality.

    Args:
        dates: Values assigned to the ``Date`` column; must line up
            row-for-row with ``ndf_domain``.
        ndf_domain: Two-column data (date, volume) used to build the frame.
            Presumably the observed, trend and seasonal series stacked in
            that order, since the row labels are generated in that order --
            confirm against the caller.
        x: Observed series; only its length is used here.
        x_trend: Trend series; only its length is used here.
        season: Seasonal series; only its length is used here.
        my_domain: Name interpolated into the plot title.

    Returns:
        The composed ggplot object (it is not rendered here).
    """
    # ---------------------- Prepare Data Frame ----------------------- #
    df_domain = pd.DataFrame(ndf_domain, columns=['Date', 'Volume'])
    df_domain['Date'] = dates

    # One label per row. List repetition gives the same lists as the old
    # xrange-based comprehensions and also runs on Python 3.
    labels = (['Observed Volume'] * len(x)
              + ['Overall Trend'] * len(x_trend)
              + ['Repeat Sending Trend'] * len(season))
    col3 = pd.DataFrame(labels)

    df_plot = pd.concat((df_domain, col3), axis=1)
    df_plot.columns = ['Date', 'Volume', 'Data']

    # ---------------------- Plot Decomposition ----------------------- #
    # One facet per label value, each with its own y scale.
    p = ggplot.ggplot(aes(x='Date', y='Volume', color='Data'), data=df_plot) + \
        ggplot.geom_line(color='blue', size=2) + \
        ggplot.scale_x_date(labels=date_format("%Y-%m-%d"), breaks="1 week") + \
        ggplot.xlab("Week (Marked on Mondays)") + \
        ggplot.ylab("Message Vol") + \
        ggplot.ggtitle("%s Message Volume by Week" % my_domain) + \
        ggplot.facet_grid('Data', scales='free_y') + \
        ggplot.theme_seaborn()

    return p
Esempio n. 2
0
def plot_matches(df_in,
                 date,
                 filename_out,
                 x_var='date_time',
                 y_var="shorthand_search_vol"):
    """
    Draw one line per match for ``y_var`` against ``x_var`` and save it.

    A ``date_time`` column is written onto ``df_in`` itself (the caller's
    frame is modified): ``date`` is joined with each row's ``time`` value and
    the result is parsed to timestamps, with unparseable values coerced.

    The frame is expected to be filtered already and to carry ``time`` and
    ``match_id`` columns. The figure is written to ``filename_out``.
    """
    # join the date with every row's time, then parse the text in one step
    stamp_text = date + " " + df_in['time'].astype(str)
    df_in['date_time'] = pd.to_datetime(stamp_text,
                                        errors="coerce",
                                        infer_datetime_format=True)

    # base plot: one coloured line per match_id
    mapping = aes(x=x_var, y=y_var, group="match_id", color="match_id")
    p = ggplot(mapping, data=df_in)
    p += geom_line(size=2)

    # axis labels and time-of-day ticks
    p += labs(x="time (gmt)", y="search volume (scaled to 100)")
    p += scale_x_date(labels=date_format("%H:%M:%S"), date_breaks="30 minutes")

    # gray theme with a small monospace font
    style = theme_gray()
    for rc_key, rc_value in (('font.size', 8), ('font.family', 'monospace')):
        style._rcParams[rc_key] = rc_value
    p += style

    # write the figure to disk
    p.save(filename_out, width=16, height=8)
Esempio n. 3
0
def main(file_path):
    """Load a pickled conversation and print a per-participant histogram.

    The pickle is expected to hold a mapping with a ``conversation_name``
    entry and a ``messages`` entry whose rows have three fields, which are
    labelled ``Timestamp``, ``Type`` and ``Participant``.

    Args:
        file_path: Path to a ``.pkl`` file.

    Returns:
        None. Problems with the path are reported through ``LOG_ERROR`` and
        the function returns early.
    """
    # Validate raw data path
    if not os.path.exists(file_path):
        LOG_ERROR('Could not find file: {}'.format(file_path))
        return

    # Validate raw data file type
    if not file_path.endswith('.pkl'):
        LOG_ERROR('File path must be a pickle file')
        return

    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only run this on pickle files from a trusted source.
    # The file is only needed for the load, so it is closed before the
    # DataFrame and plot are built.
    with open(file_path, 'rb') as f:
        LOG_INFO('Parsing pickle file: {}'.format(file_path))
        conversation = pickle.load(f)

    LOG_INFO('Found conversation: {}'.format(conversation['conversation_name']))

    df = pd.DataFrame(conversation['messages'])
    df.columns = ['Timestamp', 'Type', 'Participant']
    # Timestamps are converted to proleptic Gregorian ordinals (whole days),
    # which is the numeric form handed to the date scale below.
    df['Datetime'] = df['Timestamp'].apply(
        lambda x: datetime.datetime.fromtimestamp(float(x)).toordinal())

    histogram = ggplot.ggplot(df, ggplot.aes(x='Datetime', fill='Participant')) \
        + ggplot.geom_histogram(alpha=0.6, binwidth=2) \
        + ggplot.scale_x_date(labels='%b %Y') \
        + ggplot.ggtitle(conversation['conversation_name']) \
        + ggplot.ylab('Number of messages') \
        + ggplot.xlab('Date')

    print(histogram)
def plot_update_frequency(result):
    """Plot the weekly number of changes from an aggregated query result.

    Args:
        result: Iterable of records, each with a timestamp under
            ``res['_id']['timestamp']`` and a number under ``res['count']``.
            It is iterated exactly once.

    Returns:
        The composed ggplot object (it is not rendered here).
    """
    import pandas as pd

    # Turn the query results into a time series of change counts.
    stamps = []
    counts = []
    for res in result:
        # to_pydatetime() replaces Timestamp.to_datetime(), which newer
        # pandas releases no longer provide.
        stamps.append(pd.Timestamp(res['_id']['timestamp']).to_pydatetime())
        counts.append(res['count'])

    ts = pd.DataFrame(counts, index=stamps, columns=['changes'])
    # resample(..., how='sum') was removed from pandas; the method-chained
    # form is the supported spelling. NOTE(review): on pandas >= 0.22 a week
    # with no records sums to 0 rather than NaN.
    ts = ts.resample('W').sum()
    ts.index.names = ['date']

    import ggplot
    # Plot the time series of changes.
    p = ggplot.ggplot(ts, ggplot.aes(x=ts.index, y=ts['changes'])) + \
        ggplot.geom_point(color='blue') + \
        ggplot.xlab('Period') + \
        ggplot.ylab('Changes') + \
        ggplot.geom_smooth() + \
        ggplot.ylim(low=0) + \
        ggplot.scale_x_date(breaks=ggplot.date_breaks("12 months"),
                            labels=ggplot.date_format('%Y-%m')) + \
        ggplot.ggtitle('OpenStreetMaps Denver-Boulder\nChanges per Week')
    return p
Esempio n. 5
0
def render(data, bin_width, plot_density=False):
    """Print a per-conversation plot of message timestamps.

    With ``plot_density`` set, one density curve per conversation is drawn
    (conversation mapped to colour). Otherwise a histogram with bins of
    ``bin_width`` is drawn, with the conversation mapped to fill.
    Nothing is returned; the plot is printed.
    """
    # Both variants share the x column and differ only in which aesthetic
    # carries the conversation name.
    channel = 'color' if plot_density else 'fill'
    plot = ggplot.ggplot(
        data, ggplot.aes(x='datetime', **{channel: 'conversationWithName'}))

    if plot_density:
        layers = (
            ggplot.geom_density(),
            ggplot.scale_x_date(labels='%b %Y'),
            ggplot.ggtitle('Conversation Densities'),
            ggplot.ylab('Density'),
        )
    else:
        layers = (
            ggplot.geom_histogram(alpha=0.6, binwidth=bin_width),
            ggplot.scale_x_date(labels='%b %Y', breaks='6 months'),
            ggplot.ggtitle('Message Breakdown'),
            ggplot.ylab('Number of Messages'),
        )

    # The x-axis label is common to both variants and is added last.
    for layer in layers + (ggplot.xlab('Date'),):
        plot = plot + layer

    print(plot)
Esempio n. 6
0
def data_output(data, chart_title):
    """Show ``data`` to the user as a printed table or as a line graph.

    The user is prompted on stdin. An answer starting with "t" prints the
    data; an answer starting with "l" or "g" (any case) prints a ggplot line
    graph of the ``Value`` column against ``Month, Year``. Any other answer,
    including an empty one, does nothing.

    This block relies on ``raw_input`` and therefore targets Python 2.

    Args:
        data: Tabular data with ``Month, Year`` and ``Value`` columns.
        chart_title: Title placed on the graph.
    """
    print("Good News! You're data has been returned. I'm happy to show it to you.")
    print("Just tell me how you want it - Table or Line Graph?")

    choice = raw_input("Choose table or line > ")
    # Slicing instead of indexing keeps an empty answer from raising
    # IndexError; lower-casing once makes "L" behave like "l".
    first_letter = choice[:1].lower()

    if first_letter == "t":
        print("Ok, here's your data.")
        print(data)
    elif first_letter in ("l", "g"):
        import ggplot as gg

        # The date scale used to sit on its own line after an unterminated
        # chain, so it was built and thrown away; it is now part of the plot.
        plot = gg.ggplot(gg.aes(x='Month, Year', y='Value'), data=data) + \
            gg.geom_point(color='black') + \
            gg.geom_line(color='green') + \
            gg.ggtitle(chart_title) + \
            gg.xlab("Month, Year") + \
            gg.ylab("Value") + \
            gg.scale_x_date(breaks=gg.date_breaks('1 month'),
                            labels=gg.date_format("%B"))

        print(plot + gg.theme_xkcd())

			
Esempio n. 7
0
def plot_predictions(date_times, actual_values, predictions, match_id,
                     feature_set_in, filename):
    """
    Plot actual versus predicted search volume over time and save the figure.

    Args:
        date_times: Timestamps shared by both series; parsed with
            ``pd.to_datetime`` (unparseable entries are coerced).
        actual_values: Observed search volumes, aligned with ``date_times``.
        predictions: Predicted search volumes, aligned with ``date_times``.
        match_id: String appended to both series labels.
        feature_set_in: Identifier of the feature set behind the predictions;
            it is stringified into the predicted series label.
        filename: Path the rendered plot is saved to.
    """
    # Both series share the same x values, so parse them once.
    parsed_times = pd.to_datetime(date_times,
                                  errors="coerce",
                                  infer_datetime_format=True)

    actual_df = pd.DataFrame()
    actual_df['date_time'] = parsed_times
    actual_df['search_vol'] = actual_values
    actual_df['match_id'] = "actual" + match_id

    predict_df = pd.DataFrame()
    predict_df['date_time'] = parsed_times
    predict_df['search_vol'] = list(predictions)
    predict_df['match_id'] = "predictedby_" + str(feature_set_in) + match_id

    # Long format: the two series are stacked and told apart by match_id.
    plotting_df = pd.concat([actual_df, predict_df], axis=0, ignore_index=True)

    # build layers for plot
    p = ggplot(aes(x='date_time',
                   y='search_vol',
                   group="match_id",
                   color="match_id"),
               data=plotting_df)
    p += geom_line(size=2)

    # informative
    p += labs(x="time (gmt)", y="search volume (scaled to 100)")
    p += scale_x_date(labels=date_format("%H:%M:%S"), date_breaks="30 minutes")

    # visual
    t = theme_gray()
    t._rcParams['font.size'] = 8
    t._rcParams['font.family'] = 'monospace'
    p += t

    # done
    p.save(filename, width=16, height=8)
print("#######################################")
print("打印所挖掘的文本文件 text-movie.xls 前几行")
print(df.head())

# Single-review sentiment experiment, kept for reference. The index passed to
# iloc selects which entry is analysed, counting from 0.
#text = df.comments.iloc[0]
#s = SnowNLP(text)
#
#print(s.sentiments)


def get_sentiment_cn(text):
    """Return the SnowNLP ``sentiments`` value computed for ``text``."""
    return SnowNLP(text).sentiments


# Score every comment and keep the result in a new column.
df["sentiment"] = df["comments"].apply(get_sentiment_cn)
print("#######################################")
print("打印所挖掘的文本文件 text-movie.xls 部分影评及其情感分析值")
print(df)

# Summary statistics over all scored comments.
print("#######################################")
print("重要信息")
print("所有影评的平均值为:", df["sentiment"].mean())
print("所有影评的中位数为:", df["sentiment"].median())

# NOTE(review): the plot and the sorted frame below are built but never bound
# or printed, so they only show up in an interactive session -- confirm
# whether they were meant to be displayed.
(ggplot.ggplot(ggplot.aes(x="date", y="sentiment"), data=df)
 + ggplot.geom_point()
 + ggplot.geom_line(color='blue')
 + ggplot.scale_x_date(labels=ggplot.date_format("%Y-%m-%d")))

# The five rows with the lowest sentiment score.
df.sort_values(['sentiment'])[:5]
Esempio n. 9
0
    def render_png(self, buffer):
        """
        Render timeseries plots as PNG images.

        Reads ``self.dataframe`` (which must expose a ``time`` column) and
        ``self.bucket``, then draws the data with the renderer named in
        ``bucket.tdata['renderer']`` (default ``'matplotlib'``; ``'ggplot'``
        and ``'seaborn'`` are the other recognised values) and writes the
        image into ``buffer``.

        Returns ``None`` on success. For an unknown matplotlib style, ggplot
        theme or renderer, the error is logged and the value of
        ``self.request.error_response(bucket, error_message)`` is returned.
        """

        bucket = self.bucket

        import matplotlib.font_manager
        matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')

        import matplotlib
        # NOTE(review): the bare except swallows every failure of backend
        # selection, including KeyboardInterrupt; looks like a deliberate
        # best-effort -- consider narrowing it.
        try:
            matplotlib.use('agg')
        except:
            pass

        import matplotlib.pyplot as plt

        df = self.dataframe
        #df = df.set_index(['time'])

        # Compute datetime range boundaries
        datetime_min = min(df.time)
        datetime_max = max(df.time)
        # The span of the data drives the choice of tick locator/formatter.
        datetime_delta = datetime_max - datetime_min
        #xmin = pd.to_datetime('2016-05-01')
        #xmax = pd.to_datetime('2016-08-01')

        renderer = bucket.tdata.get('renderer', 'matplotlib')
        if renderer == 'matplotlib':

            # Bring DataFrame into appropriate format
            df = dataframe_index_and_sort(df, 'time')

            # Propagate non-null values forward or backward, otherwise
            # matplotlib would not plot the sparse data frame properly.
            # With time series data, using pad/ffill is extremely common so that the “last known value” is available at every time point.
            # http://pandas.pydata.org/pandas-docs/stable/missing_data.html#filling-missing-values-fillna
            df.fillna(method='pad', inplace=True)

            # Make plots of DataFrame using matplotlib / pylab.
            # http://matplotlib.org/
            # http://pandas.pydata.org/pandas-docs/version/0.13.1/visualization.html
            # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.plot.html
            # https://markthegraph.blogspot.de/2015/05/plotting-time-series-dataframes-in.html

            # Optional matplotlib style sheet; an unusable name is reported
            # back together with the list of available styles.
            if 'style' in bucket.tdata and bucket.tdata.style:
                try:
                    plt.style.use(bucket.tdata.style)
                except Exception:
                    error_message = u'# Unknown style "{style_name}", available styles: {available}'.format(
                        style_name=bucket.tdata.style,
                        available=plt.style.available)
                    log.error(error_message)
                    return self.request.error_response(bucket, error_message)

            # Basic plotting
            #df.plot()
            #plt.savefig(buffer)

            # Advanced plotting
            ax = df.plot()
            fig = ax.get_figure()

            # Figure heading
            title = fig.suptitle(bucket.title.human, fontsize=12)
            #fig.tight_layout(pad=1.5)

            # Axis and tick labels
            ax.set_xlabel('Time')
            ax.set_ylabel('Value')
            ax.tick_params(axis='x', labelsize='smaller')

            # Grid and legend
            # http://matplotlib.org/users/legend_guide.html
            # http://matplotlib.org/examples/pylab_examples/legend_demo3.html
            ax.grid(True)

            # Legend is anchored outside the axes, to the right of the plot.
            legend_params = dict(ncol=1,
                                 loc='center left',
                                 bbox_to_anchor=(1, 0.5),
                                 fontsize='small',
                                 shadow=True,
                                 fancybox=True)
            legend = ax.legend(**legend_params)  # title='Origin'
            #ax.legend(**legend_params) # title='Origin'

            # Sort list of legend labels
            # http://stackoverflow.com/questions/22263807/how-is-order-of-items-in-matplotlib-legend-determined/27512450#27512450

            # Axis formatting
            #ax.xaxis_date()
            #ax.autoscale_view()

            # Compute appropriate locator and formatter
            locator, formatter = matplotlib_locator_formatter(datetime_delta,
                                                              span=1)

            # Only the formatter is applied; setting the locator is disabled.
            #ax.xaxis.set_major_locator(locator)
            ax.xaxis.set_major_formatter(formatter)

            # Figure formatting
            fig.autofmt_xdate()

            # Title and legend are passed as extra artists so the tight
            # bounding box includes them.
            # http://stackoverflow.com/questions/10101700/moving-matplotlib-legend-outside-of-the-axis-makes-it-cutoff-by-the-figure-box/10154763#10154763
            fig.savefig(buffer,
                        bbox_extra_artists=(title, legend),
                        bbox_inches='tight')

            # TODO: Add annotations
            """
            # https://stackoverflow.com/questions/11067368/annotate-time-series-plot-in-matplotlib
            # https://stackoverflow.com/questions/17891493/annotating-points-from-a-pandas-dataframe-in-matplotlib-plot
            import matplotlib.dates as mdates
            fig = plot.draw()
            ax = fig.axes[0]
            ax.annotate('Test', (mdates.date2num(x[1]), y[1]), xytext=(15, 15),
                textcoords='offset points', arrowprops=dict(arrowstyle='-|>'))

            """

        elif renderer == 'ggplot':

            # https://yhat.github.io/ggplot/notebook.html?page=build/docs/examples/Multiple%20Line%20Plot.html
            # https://stackoverflow.com/questions/23541497/is-there-a-way-to-plot-a-pandas-series-in-ggplot
            # https://stackoverflow.com/questions/24478925/is-it-possible-to-plot-multiline-chart-on-python-ggplot/24479513#24479513

            # https://github.com/yhat/ggplot/blob/master/docs/how-to/Building%20Faceted%20(or%20Trellised)%20Plots.ipynb
            # https://github.com/yhat/ggplot/blob/master/docs/how-to/Annotating%20Plots%20-%20Titles%20and%20Labels.ipynb
            # https://github.com/yhat/ggplot/blob/master/docs/how-to/How%20to%20make%20xkcd%20style%20graphs.ipynb

            # The theme_* names imported here are the ones the eval() lookup
            # further down can resolve from local scope.
            from ggplot import ggplot, aes, qplot, geom_line, geom_text, ggtitle, stat_smooth, scale_x_date, date_format, date_breaks
            from ggplot import theme_538, theme_bw, theme_gray, theme_xkcd

            # https://stackoverflow.com/questions/24478925/is-it-possible-to-plot-multiline-chart-on-python-ggplot/24479513#24479513
            # https://stackoverflow.com/questions/23541497/is-there-a-way-to-plot-a-pandas-series-in-ggplot

            # Convert DataFrame from wide to long format, retaining "time" as visible column
            df = dataframe_wide_to_long_indexed(df, 'time')
            dataframe_index_to_column(df, 'time')

            # Compute appropriate locator and formatter
            locator, formatter = matplotlib_locator_formatter(datetime_delta,
                                                              span=2)

            plot = ggplot(df, aes(x='time', y='value', color='variable'))\
                   + geom_line()\
                   + scale_x_date(limits=(datetime_min, datetime_max), breaks=locator, labels=formatter)\
                   + ggtitle(bucket.title.human)

            # Axis labels
            plot.xlab = 'Time'
            plot.ylab = 'Value'

            # Labs
            #+ stat_smooth(colour='blue', span=0.2) \
            #+ geom_text(aes(x='x', y='y'), label='hello world')
            #+ scale_x_date(limits=(xmin, xmax), breaks=date_breaks('1 hour'), labels=date_format('%Y-%m-%d\n%H:%M'))

            theme_name = bucket.tdata.get('theme')
            # TODO: Switching themes will leak some matplotlib/pyplot properties, postpone to future versions
            if theme_name:
                # A numeric theme name such as 538 may arrive as a float.
                if isinstance(theme_name, float):
                    theme_name = str(int(theme_name))
                # NOTE(security): eval() runs on a string built from
                # bucket.tdata. If tdata can carry request-supplied values
                # this allows arbitrary code execution; an explicit
                # name-to-theme mapping would avoid it -- TODO confirm where
                # tdata comes from.
                try:
                    theme = eval('theme_' + theme_name)
                    plot += theme()
                except Exception:
                    error_message = u'# Unknown theme "{theme_name}"'.format(
                        theme_name=theme_name)
                    log.error(error_message)
                    return self.request.error_response(bucket, error_message)

            plot.save(buffer)

            # Attempt to reset global matplotlib parameters to get rid of xkcd theme style
            """
            import matplotlib as mpl
            #mpl.rcParams = mpl.rc_params()
            #del mpl.rcParams['path.sketch']
            #del mpl.rcParams['path.effects']
            #mpl.rcParams = mpl.defaultParams.copy()
            #mpl.rcParams.clear()
            #mpl.rcdefaults()
            #mpl.rcParams = mpl.rcParamsOrig
            if 'axes.prop_cycle' in mpl.rcParams:
                del mpl.rcParams['axes.prop_cycle']
            mpl.rcParams.update({'path.sketch': None, 'path.effects': []})
            mpl.rcParams.update(mpl.rc_params())
            """

        elif renderer == 'seaborn':

            # TODO: We don't do statistical plotting yet.

            # https://stanford.edu/~mwaskom/software/seaborn/examples/timeseries_from_dataframe.html
            # https://stanford.edu/~mwaskom/software/seaborn/generated/seaborn.tsplot.html
            import seaborn as sns
            sns.set(style="darkgrid")
            #sns.tsplot(data=gammas, time="timepoint", unit="subject", condition="ROI", value="BOLD signal")
            #print dir(df)
            #df['time'] = pandas.to_datetime(df['time'])
            #df = df.set_index(df.time)
            # NOTE(review): pprint of the whole frame looks like leftover
            # debug output.
            pprint(df)
            sns.tsplot(data=df, time="time")
            #sns.tsplot(data=df)
            plt.savefig(buffer)

        else:
            error_message = u'# Unknown renderer "{renderer_name}"'.format(
                renderer_name=renderer)
            log.error(error_message)
            return self.request.error_response(bucket, error_message)
Esempio n. 10
0
def plot_vol(dates, x, cp, my_domain):
    """Build a ggplot of weekly volume with change points and segment means.

    Args:
        dates: Indexable sequence of date objects (``toordinal`` is called on
            them), aligned with ``x``.
        x: Array of volumes; ``x.shape[0]``, ``np.min`` and ``np.max`` are
            used on it.
        cp: Sequence of change-point records. Index 0 of a record is read as
            a 1-based position into ``dates`` and index 2 as the mean drawn
            for the segment that ends there.
        my_domain: Name interpolated into the plot title.

    Returns:
        The composed ggplot object (it is not rendered here).
    """
    # -------------------- Prepare for Plotting -------------------------- #
    # Volume rows are [date, volume, 'Volume']; the constant third column is
    # the label shown in the legend.
    volume_labels = ['Volume'] * x.shape[0]
    ndf_vol = np.transpose(np.array([dates, x, volume_labels]))
    df_vol = pd.DataFrame(ndf_vol, columns=['Date', 'Volume', 'Data'])

    xmin_list = []      # left end of each segment mean (date ordinal)
    xmax_list = []      # right end of each segment mean (date ordinal)
    yint_list = []      # segment means
    cp_date_list = []   # date of each change point
    cp_value_list = []  # value plotted at each change point

    ref_idx = 0  # first index of the segment currently being described
    for change_point in cp:
        cp_idx = change_point[0] - 1  # -1 b/c 1-indexed (includes cp itself)
        xmin_list.append(dates[ref_idx].toordinal())  # ordinals match ggplot
        xmax_list.append(dates[cp_idx].toordinal())
        yint_list.append(change_point[2])
        cp_date_list.append(dates[cp_idx])
        cp_value_list.append(change_point[2])
        ref_idx = cp_idx + 1  # the next segment starts after this cp

    # Change-point rows, plus one dummy row so that 'Volume Mean' gets a
    # legend entry. The dummy is added to copies, so the lists passed to the
    # geoms below stay free of it.
    cp_labels = ['Change Point'] * len(yint_list) + ['Volume Mean']
    ndf_cp = np.transpose(np.array([cp_date_list + [dates[0]],
                                    yint_list + [x[0]],
                                    cp_labels]))
    df_cp = pd.DataFrame(ndf_cp, columns=['Date', 'Volume', 'Data'])

    df_plot = pd.concat((df_vol, df_cp), axis=0)

    # Head-room on the y axis: a full margin above, a quarter of it below.
    margin = 0.10 * (np.max(x) - np.min(x))
    p = ggplot.ggplot(aes(x='Date', y='Volume', color='Data'), data=df_plot) + \
        ggplot.geom_line(color='blue', size=2) + \
        ggplot.geom_point(x=xmax_list, y=cp_value_list, color='black',
                          shape='D', size=50) + \
        ggplot.geom_hline(xmin=xmin_list,
                          xmax=xmax_list,
                          yintercept=yint_list, color="red", size=3) + \
        ggplot.scale_x_date(labels=date_format("%Y-%m-%d"), breaks="1 week") + \
        ggplot.scale_colour_manual(values=["black", "blue", "red"]) + \
        ggplot.scale_y_continuous(labels='comma') + \
        ggplot.ylim(low=np.min(x) - margin / 4.0, high=np.max(x) + margin) + \
        ggplot.xlab("Week (Marked on Mondays)") + \
        ggplot.ylab("Message Vol") + \
        ggplot.ggtitle("%s\nMessage Volume by Week" % my_domain) + \
        ggplot.theme_seaborn()

    return p