journeys = journeys.loc[(journeys['Booking Time'] > start) &
                        (journeys['Drop-off Time'] < end)]
print(journeys.shape)

vehicles = h.convertColumns(vehicles, ['TimeStamp'], h.todatetime)
vehicles = vehicles.loc[(vehicles.TimeStamp > start) & (vehicles.TimeStamp < end)]

# map raw vehicle ids to readable names
vehicleNames = {id: "vehicle-" + str(index + 1)
                for index, id in enumerate(vehicles['Vehicle ID'].unique())}
vehicles = vehicles.replace({"Vehicle ID": vehicleNames})
vehicles = vehicles.sort_values(by=['Vehicle ID', 'TimeStamp'])

vehicles['Battery'] = -vehicles.groupby('Vehicle ID')['Battery Level'].diff().fillna(0)
vehicles = vehicles.set_index("TimeStamp")

batteryPerVehicle = vehicles.groupby('Vehicle ID')[['Battery']].agg(batteryCount)
totalBattery = batteryPerVehicle['Battery'].sum()
batteryPerVehicle.loc["Total"] = totalBattery
fig1 = h.render_mpl_table(batteryPerVehicle, header_columns=0, col_width=7.0,
                          title="Vehicle Battery Consumption")

time_grouper = pd.TimeGrouper(freq=cfg.frequencyForAnalysis)
vehicles['Battery'] = vehicles['Battery'].apply(lambda x: x if (x > 0) else 0)
battery_freq = vehicles.groupby(["Vehicle ID", time_grouper]).sum()[['Battery']]
print(vehicles.shape)

vehicles['Occupancy'] = vehicles.apply(countSeats, axis=1)
occupancy_freq = vehicles.groupby(["Vehicle ID", time_grouper]).mean()[['Occupancy']]

vehicles['LoadFactor'] = vehicles['Occupancy'] * vehicles['Battery']
vehiclesSumPerVehicle = vehicles.groupby('Vehicle ID').sum()
loadFactorPerVehicle = vehiclesSumPerVehicle['LoadFactor'] / (vehiclesSumPerVehicle['Battery'] * cfg.maxCapacity)
loadFactorPerVehicle = loadFactorPerVehicle.to_frame(name="Load Factor")
averageLoadFactor = loadFactorPerVehicle['Load Factor'].mean()
maxLoadFactor = loadFactorPerVehicle['Load Factor'].max()
def test_timegrouper_with_reg_groups(self):

    # GH 3794
    # allow combination of timegrouper/reg groups

    df_original = DataFrame({
        'Branch': 'A A A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
        'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
        'Date': [
            datetime(2013, 1, 1, 13, 0),
            datetime(2013, 1, 1, 13, 5),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 12, 2, 12, 0),
            datetime(2013, 12, 2, 14, 0),
        ]
    }).set_index('Date')

    df_sorted = df_original.sort_values(by='Quantity', ascending=False)

    for df in [df_original, df_sorted]:
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark'.split(),
            'Quantity': [10, 18, 3],
            'Date': [
                datetime(2013, 12, 31, 0, 0),
                datetime(2013, 12, 31, 0, 0),
                datetime(2013, 12, 31, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])

        result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum()
        assert_frame_equal(result, expected)

        expected = DataFrame({
            'Buyer': 'Carl Mark Carl Joe'.split(),
            'Quantity': [1, 3, 9, 18],
            'Date': [
                datetime(2013, 1, 1, 0, 0),
                datetime(2013, 1, 1, 0, 0),
                datetime(2013, 7, 1, 0, 0),
                datetime(2013, 7, 1, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])
        result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum()
        assert_frame_equal(result, expected)

    df_original = DataFrame({
        'Branch': 'A A A A A A A B'.split(),
        'Buyer': 'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
        'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
        'Date': [
            datetime(2013, 10, 1, 13, 0),
            datetime(2013, 10, 1, 13, 5),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 10, 1, 20, 0),
            datetime(2013, 10, 2, 10, 0),
            datetime(2013, 10, 2, 12, 0),
            datetime(2013, 10, 2, 14, 0),
        ]
    }).set_index('Date')

    df_sorted = df_original.sort_values(by='Quantity', ascending=False)

    for df in [df_original, df_sorted]:
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark Carl Joe'.split(),
            'Quantity': [6, 8, 3, 4, 10],
            'Date': [
                datetime(2013, 10, 1, 0, 0),
                datetime(2013, 10, 1, 0, 0),
                datetime(2013, 10, 1, 0, 0),
                datetime(2013, 10, 2, 0, 0),
                datetime(2013, 10, 2, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])

        result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum()
        assert_frame_equal(result, expected)

        result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum()
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark'.split(),
            'Quantity': [10, 18, 3],
            'Date': [
                datetime(2013, 10, 31, 0, 0),
                datetime(2013, 10, 31, 0, 0),
                datetime(2013, 10, 31, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])
        assert_frame_equal(result, expected)

        # passing the name
        df = df.reset_index()
        result = df.groupby([pd.Grouper(freq='1M', key='Date'),
                             'Buyer']).sum()
        assert_frame_equal(result, expected)

        with pytest.raises(KeyError):
            df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum()

        # passing the level
        df = df.set_index('Date')
        result = df.groupby([pd.Grouper(freq='1M', level='Date'),
                             'Buyer']).sum()
        assert_frame_equal(result, expected)
        result = df.groupby([pd.Grouper(freq='1M', level=0), 'Buyer']).sum()
        assert_frame_equal(result, expected)

        with pytest.raises(ValueError):
            df.groupby([pd.Grouper(freq='1M', level='foo'), 'Buyer']).sum()

        # multi names
        df = df.copy()
        df['Date'] = df.index + pd.offsets.MonthEnd(2)
        result = df.groupby([pd.Grouper(freq='1M', key='Date'),
                             'Buyer']).sum()
        expected = DataFrame({
            'Buyer': 'Carl Joe Mark'.split(),
            'Quantity': [10, 18, 3],
            'Date': [
                datetime(2013, 11, 30, 0, 0),
                datetime(2013, 11, 30, 0, 0),
                datetime(2013, 11, 30, 0, 0),
            ]
        }).set_index(['Date', 'Buyer'])
        assert_frame_equal(result, expected)

        # error as we have both a level and a name!
        with pytest.raises(ValueError):
            df.groupby(
                [pd.Grouper(freq='1M', key='Date', level='Date'),
                 'Buyer']).sum()

        # single groupers
        expected = DataFrame({
            'Quantity': [31],
            'Date': [datetime(2013, 10, 31, 0, 0)]
        }).set_index('Date')
        result = df.groupby(pd.Grouper(freq='1M')).sum()
        assert_frame_equal(result, expected)

        result = df.groupby([pd.Grouper(freq='1M')]).sum()
        assert_frame_equal(result, expected)

        expected = DataFrame({
            'Quantity': [31],
            'Date': [datetime(2013, 11, 30, 0, 0)]
        }).set_index('Date')
        result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum()
        assert_frame_equal(result, expected)

        result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum()
        assert_frame_equal(result, expected)

    # GH 6764 multiple grouping with/without sort
    df = DataFrame({
        'date': pd.to_datetime([
            '20121002', '20121007', '20130130', '20130202', '20130305',
            '20121002', '20121207', '20130130', '20130202', '20130305',
            '20130202', '20130305'
        ]),
        'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
        'whole_cost': [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301,
                       359, 801],
        'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
    }).set_index('date')

    for freq in ['D', 'M', 'A', 'Q-APR']:
        expected = df.groupby('user_id')['whole_cost'].resample(
            freq).sum().dropna().reorder_levels(
                ['date', 'user_id']).sort_index().astype('int64')
        expected.name = 'whole_cost'

        result1 = df.sort_index().groupby(
            [pd.TimeGrouper(freq=freq), 'user_id'])['whole_cost'].sum()
        assert_series_equal(result1, expected)

        result2 = df.groupby([pd.TimeGrouper(freq=freq),
                              'user_id'])['whole_cost'].sum()
        assert_series_equal(result2, expected)
train = pd.merge(train, coord_stats, how='left', on=gby_cols)
test = pd.merge(test, coord_stats, how='left', on=gby_cols)

group_freq = '60min'
df_all = pd.concat((train, test))[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
train.loc[:, 'pickup_datetime_group'] = train['pickup_datetime'].dt.round(group_freq)
test.loc[:, 'pickup_datetime_group'] = test['pickup_datetime'].dt.round(group_freq)

# Count trips over 60min
df_counts = df_all.set_index('pickup_datetime')[['id']].sort_index()
df_counts['count_60min'] = df_counts.isnull().rolling(group_freq).count()['id']
train = train.merge(df_counts, on='id', how='left')
test = test.merge(df_counts, on='id', how='left')

# Count how many trips are going to each cluster over time
dropoff_counts = df_all \
    .set_index('pickup_datetime') \
    .groupby([pd.TimeGrouper(group_freq), 'dropoff_cluster']) \
    .agg({'id': 'count'}) \
    .reset_index().set_index('pickup_datetime') \
    .groupby('dropoff_cluster').rolling('240min').mean() \
    .drop('dropoff_cluster', axis=1) \
    .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
    .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': 'dropoff_cluster_count'})

train['dropoff_cluster_count'] = train[['pickup_datetime_group', 'dropoff_cluster']].merge(
    dropoff_counts, on=['pickup_datetime_group', 'dropoff_cluster'],
    how='left')['dropoff_cluster_count'].fillna(0)
test['dropoff_cluster_count'] = test[['pickup_datetime_group', 'dropoff_cluster']].merge(
    dropoff_counts, on=['pickup_datetime_group', 'dropoff_cluster'],
    how='left')['dropoff_cluster_count'].fillna(0)

# Count how many trips are going from each cluster over time
df_all = pd.concat((train, test))[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
pickup_counts = df_all \
    .set_index('pickup_datetime') \
    .groupby([pd.TimeGrouper(group_freq), 'pickup_cluster']) \
    .agg({'id': 'count'}) \
    .reset_index().set_index('pickup_datetime') \
    .groupby('pickup_cluster').rolling('240min').mean() \
    .drop('pickup_cluster', axis=1) \
    .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
    .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': 'pickup_cluster_count'})

train['pickup_cluster_count'] = train[['pickup_datetime_group', 'pickup_cluster']].merge(
    pickup_counts, on=['pickup_datetime_group', 'pickup_cluster'],
    how='left')['pickup_cluster_count'].fillna(0)
test['pickup_cluster_count'] = test[['pickup_datetime_group', 'pickup_cluster']].merge(
    pickup_counts, on=['pickup_datetime_group', 'pickup_cluster'],
    how='left')['pickup_cluster_count'].fillna(0)

# For this particular problem we can add OSRM ([Open Source Routing Machine](http://project-osrm.org/
# "OSRM")) features. This data contains the fastest routes from specific starting points in NY.
fr1 = pd.read_csv('../input/new-york-city-taxi-with-osrm/fastest_routes_train_part_1.csv',
                  usecols=['id', 'total_distance', 'total_travel_time', 'number_of_steps'])
fr2 = pd.read_csv('../input/new-york-city-taxi-with-osrm/fastest_routes_train_part_2.csv',
                  usecols=['id', 'total_distance', 'total_travel_time', 'number_of_steps'])
def generatePlots(self): for key in self.hashtagdict: print key filename = key + '_sentiment.csv' path = '/home/ubuntu/cv/aerial/DeepNetsEO/DeepNetsForEO/nlp/' + self.base_folder + '/' + filename show_df = pd.read_csv(path, names=['datetime', 'username', 'tweet']) show_df.dropna(inplace=True) show_df.drop_duplicates(inplace=True) show_df['sentiment'] = show_df.tweet.apply( self.get_tweet_sentiment) show_df.datetime = show_df.datetime.apply(parser.parse) show_df.set_index(show_df.datetime, inplace=True) show_grouped = show_df.groupby([ pd.TimeGrouper(freq='D'), 'sentiment', ]).size() show_pct = show_grouped.groupby( level=0).apply(lambda x: 100 * x / float(x.sum())) show_NNP = show_pct.swaplevel( i=-1, j=-2).sort_values().unstack().transpose() if show_NNP.shape[1] < 3: continue if 'Negative' in show_NNP.columns and 'Positive' in show_NNP.columns: show_NNP['NetPositive'] = show_NNP['Positive'] - show_NNP[ 'Negative'] elif 'Negative' in show_NNP.columns: show_NNP['NetPositive'] = -show_NNP['Negative'] elif 'Positive' in show_NNP.columns: show_NNP['NetPositive'] = show_NNP['Positive'] show_grouped = show_grouped.swaplevel( i=-1, j=-2).sort_values().unstack().transpose() if 'Negative' in show_NNP.columns and 'Positive' in show_NNP.columns and 'Neutral' in show_NNP.columns: show_grouped[ 'TotalTweets'] = show_grouped['Positive'] + show_grouped[ 'Negative'] + show_grouped['Neutral'] elif 'Negative' in show_NNP.columns and 'Neutral' in show_NNP.columns: show_grouped['TotalTweets'] = show_grouped[ 'Negative'] + show_grouped['Neutral'] elif 'Positive' in show_NNP.columns and 'Neutral' in show_NNP.columns: show_grouped['TotalTweets'] = show_grouped[ 'Positive'] + show_grouped['Neutral'] elif 'Positive' in show_NNP.columns and 'Negative' in show_NNP.columns: show_grouped['TotalTweets'] = show_grouped[ 'Positive'] + show_grouped['Negative'] elif 'Negative' in show_NNP.columns: show_grouped['TotalTweets'] = show_grouped['Negative'] elif 'Positive' in show_NNP.columns: show_grouped['TotalTweets'] = show_grouped['Positive'] elif 'Neutral' in show_NNP.columns: show_grouped['TotalTweets'] = show_grouped['Neutral'] show_NNP['TotalTweets'] = show_grouped['TotalTweets'] fig_show, ax1_show = plt.subplots() ax2_show = ax1_show.twinx() if 'NetPositive' in show_NNP.columns: ax1_show.plot(show_NNP.NetPositive, 'g', linewidth=5) ax2_show.bar(show_NNP.index, show_NNP.TotalTweets, align='center', alpha=0.5) ax1_show.set_ylim([-100, 100]) ax1_show.set_xlabel('Date') ax1_show.set_ylabel('Net Positive Sentiment %') ax2_show.set_ylabel('Total Tweets') fig_show.autofmt_xdate() plt.title(key)
def api_get_league_table_summarised(leagueTableId): """ { "doc": { "title": "league table summarised help", "body": "<p> Obtain the sumarized of the league table </p>" }, "GET": { "label": "Obtain the league_table sumarized", "params":[{"name": "leagueTableId", "type":"string", "required":"true", "doc":"id of the leage_table"}, {"name": "period", "type":"string", "required":"false", "info":"the period to sumarize", "values": ["D", "W", "M", "Y"]}, {"name": "type", "type":"list", "required":"false", "info":"the field to sumarize"}] } } """ companyId = g.get("auth_value") # params from url period = request.args.get('period', 'M') type = request.args.get('type', ['savings', 'smileys']) if not isinstance(type, list): type = type.split(',') # type=savings,smileys in the url periodsAllowed = ['D', 'W', 'M', 'Y'] # Weekly means Monday to Sunday period = period[0].upper() # recupero la info de mongo query = {'companyId': companyId, 'leagueTableId': leagueTableId} doc = app.data.driver.db['league_table'].find_one(query, {'_id': 0}, timeout=False) try: reporting_Units = doc['reporting_Units'] except: reporting_Units = [] # recupero la info de mongo de baseline i creo el resultat per cadascu res_report = {} for reportingUnit in reporting_Units: query_reporting = { 'companyId': companyId, 'reportingUnitId': reportingUnit } doc_reporting = app.data.driver.db['reporting_units'].find_one( query_reporting, timeout=False) if doc_reporting: modelling_Units = doc_reporting['modelling_Units'] res_report[reportingUnit] = [] for modelUnit in modelling_Units: # update_baseline(companyId, modellingUnitId) # TO DO query_baseline = { 'companyId': companyId, 'modellingUnitId': modelUnit } doc_baseline = app.data.driver.db['baselines'].find_one( query_baseline, { 'prediction': 1, 'values': 1, 'smileys': 1, 'timestamps': 1 }, timeout=False) if doc_baseline: res_parcial = {} # creo el dataframe df = pd.DataFrame.from_records({ 'values': doc_baseline['values'], 'smileys': doc_baseline['smileys'], 'prediction': doc_baseline['prediction'], 'timestamps': doc_baseline['timestamps'] }) df = df.set_index(pd.DatetimeIndex(df['timestamps'])) if df.empty != True and period in periodsAllowed: for typ in type: if typ in doc_baseline.keys() or typ == 'savings': if typ in ['savings', 'values', 'prediction']: df_grouped = df.groupby( pd.TimeGrouper(freq=period)).sum() else: df_grouped = df.groupby( pd.TimeGrouper(freq=period)).mean() if typ == 'savings': res_parcial[typ] = df_grouped[ 'prediction'] - df_grouped['values'] else: res_parcial[typ] = df_grouped[typ] res_parcial[typ] = res_parcial[typ].where( (pd.notnull(res_parcial[typ])), None).tolist() # replacing nan by None else: res_parcial[typ] = None try: # if there is any valid type res_parcial['modellingUnitId'] = modelUnit res_parcial[ 'timestamps'] = df_grouped.index.tolist() res_parcial['number_of_elements'] = df[ 'values'].groupby(pd.TimeGrouper( freq=period)).count().tolist() except: for typ in type: res_parcial[typ] = None res_parcial['timestamps'] = None res_parcial['number_of_elements'] = None res_parcial['modellingUnitId'] = modelUnit res_report[reportingUnit].append(res_parcial) return send_response('', (res_report, None, None, 200))
    ])

data = d_311_grouped.size().to_frame('311').merge(
    d_c_grouped.size().to_frame('crime'), left_index=True, right_index=True)

plt_311 = np.array(data['311'].apply(int))
plt_crime = np.array(data['crime'].apply(int))

plt.close('all')
sns.regplot(x=plt_311, y=plt_crime)
plt.suptitle('311 v total crime 2010-2016\ngrouped by location')
plt.xlabel('Total 311 complaints')
plt.ylabel('Total reported crime')
plt.savefig('311vcrime.png', format='png')

d_311_grouped = data_311[data_311.Borough == 'MANHATTAN'].groupby(
    by=[pd.TimeGrouper(key='created_date', freq='M'),
        'Complaint Type']).size().to_frame('total')
d_c_grouped = data_crime[data_crime.BORO_NM == 'MANHATTAN'].groupby(
    by=pd.TimeGrouper(key='CMPLNT_FR_DT', freq='M')).size().to_frame('total')

d_c_grouped2 = d_c_grouped.copy()
d_c_grouped2['Complaint Type'] = 'crime'
d_c_grouped2.set_index('Complaint Type', append=True, inplace=True)
d_c_grouped2.index.rename('created_date', level=0, inplace=True)

crimecorr = d_311_grouped.unstack(level=1)['total'].corrwith(d_c_grouped2['total'])
corridx = ['Complaint Type'] + list(crimecorr[crimecorr > .5].keys()) + ['crime']
def main():
    # path to each document
    # column description file
    desc_fname = abspath(
        '../data/section_1/OPEN_DATA_FIRE_INCIDENTS_FILE_DESCRIPTION.xls')
    # fire incident data file
    data_fname = abspath(
        '../data/section_1/Incidents_Responded_to_by_Fire_Companies.csv')
    # 2010 census data file name
    census_fname = abspath('../data/section_1/census_zipcode_2010.csv')

    df_desc = pd.read_excel(desc_fname)
    df_data = pd.read_csv(data_fname)

    # translate zip code
    df_data['INCIDENT_TYPE_CODE'] = df_data['INCIDENT_TYPE_DESC'].apply(
        lambda x: x.split(' - ')[0])
    df_census = pd.read_csv(census_fname)

    print('1. Most common incident: ')
    incidents = pd.DataFrame(
        df_data.groupby('INCIDENT_TYPE_DESC')['INCIDENT_TYPE_DESC'].size())
    incidents['incident_ratio'] = incidents['INCIDENT_TYPE_DESC'] / len(df_data)
    most_common_incident = incidents.sort_values(
        'incident_ratio').incident_ratio.iloc[-1]
    print(most_common_incident)

    print('\n 2. False calls in Staten island vs Manhattan')
    df_false = df_data[df_data['INCIDENT_TYPE_CODE'] == '710']
    df_false_dest = pd.DataFrame(df_false.groupby('BOROUGH_DESC').size())
    manhattan = df_false_dest[df_false_dest.index == '1 - Manhattan'].values[0][0]
    staten = df_false_dest[df_false_dest.index == '3 - Staten Island'].values[0][0]
    false_ratio = staten / manhattan
    print(false_ratio)

    print('\n 3. Most frequent cooking fire hour ratio')
    df_cooking = df_data[[
        'INCIDENT_TYPE_DESC', 'INCIDENT_TYPE_CODE', 'INCIDENT_DATE_TIME'
    ]]
    df_cooking['Hour'] = pd.to_datetime(
        df_cooking['INCIDENT_DATE_TIME']).dt.hour
    hour_count = df_cooking.groupby('Hour')['INCIDENT_TYPE_CODE'].count()
    cooking_count = df_cooking[df_cooking.INCIDENT_TYPE_CODE == '113'].groupby(
        'Hour')['INCIDENT_TYPE_CODE'].count()
    cooking_proba = pd.DataFrame(cooking_count / hour_count)
    max_cooking_fire = cooking_proba.max()[0]
    print(max_cooking_fire)

    print('\n 4. Average number of units in 111 vs 651')
    df_111_651 = df_data[df_data.INCIDENT_TYPE_CODE.isin(['111', '651'])]
    units_111 = df_111_651.groupby(
        'INCIDENT_TYPE_CODE')['UNITS_ONSCENE'].mean()['111']
    units_651 = df_111_651.groupby(
        'INCIDENT_TYPE_CODE')['UNITS_ONSCENE'].mean()['651']
    unit_ratio = units_111 / units_651
    print(unit_ratio)

    print('\n 5. About 111 incidents...')
    df_111 = df_data[df_data.INCIDENT_TYPE_CODE == '111'][[
        'INCIDENT_DATE_TIME', 'ARRIVAL_DATE_TIME', 'ZIP_CODE'
    ]].dropna()
    for c in ['INCIDENT_DATE_TIME', 'ARRIVAL_DATE_TIME']:
        df_111[c] = pd.to_datetime(df_111[c])

    print('5-1. 3rd quartile in difference in call and arrival time')
    df_111['min_diff'] = (df_111.ARRIVAL_DATE_TIME -
                          df_111.INCIDENT_DATE_TIME) / np.timedelta64(1, 'm')
    third_quartile = np.percentile(df_111.min_diff.values, 75)
    print(third_quartile)

    print('5-2. R-square for zip code population vs incidents')
    df_zipcode = pd.DataFrame(index=df_111.ZIP_CODE.unique())
    df_census = df_census.set_index('Zip Code ZCTA')
    df_zipcode['incidents'] = df_111.groupby('ZIP_CODE').size()
    df_zipcode = df_zipcode.merge(df_census, left_index=True, right_index=True)
    df_zipcode = df_zipcode.rename(
        columns={'2010 Census Population': 'population'})
    y = df_zipcode['incidents']
    x = df_zipcode['population']
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    print(r_value)

    print('\n 6. About CO detectors...')
    df_co = df_data[~df_data.CO_DETECTOR_PRESENT_DESC.isnull()]
    df_co['duration_min'] = pd.to_timedelta(
        pd.to_timedelta(df_co['TOTAL_INCIDENT_DURATION'], unit='s') /
        np.timedelta64(1, 'm'), unit='m')

    print('6-1 Incident duration ratios')
    df_co_group = pd.DataFrame(
        df_co.groupby([
            'CO_DETECTOR_PRESENT_DESC',
            pd.TimeGrouper(key='duration_min', freq='10Min')
        ])['duration_min'].size())
    df_co_binned = pd.DataFrame(index=df_co_group.loc['No'].index)
    df_co_binned['no'] = df_co_group.loc['No'] / df_co_group.loc['No'].sum()
    df_co_binned['yes'] = df_co_group.loc['Yes'] / df_co_group.loc['Yes'].sum()
    df_co_binned['ratio'] = df_co_binned['no'] / df_co_binned['yes']
    df_co_binned = df_co_binned.iloc[2:7]
    df_co_binned['mid_bin'] = [25, 35, 45, 55, 65]
    x = df_co_binned.mid_bin
    y = df_co_binned.ratio
    a, b, r, p, stderr = stats.linregress(x, y)
    predicted = a * 39 + b
    print(predicted)

    print('6-2 Chi square statistics likelihood to be longer')
    df_co['long'] = df_co.duration_min.apply(
        lambda x: 'long' if x > pd.Timedelta(minutes=60) else 'short')
    df_co.groupby(['CO_DETECTOR_PRESENT_DESC', 'long']).size().unstack()
    df_co_time = df_co.groupby(['CO_DETECTOR_PRESENT_DESC',
                                'long']).size().unstack()
    statistic, p = stats.chisquare(
        [df_co_time['long']['No'] / df_co_time.loc['No'].sum()],
        [df_co_time['long'].sum() / df_co_time.values.sum()])
    print(statistic)
#====================================================================
# group data by weekday and/or 20min bin
#====================================================================
avg_segment_time_week = df_travel_segment.groupby(
    ['day_of_week', 'minute_block', 'link_id']).aggregate(np.average)

avg_route_time_week = df_trajectories.groupby([
    'intersection_id',
    'tollgate_id',
    'day_of_week',
    'minute_block',
]).aggregate(np.average)

avg_route_time_whole = df_trajectories.groupby([
    'intersection_id',
    'tollgate_id',
    pd.TimeGrouper(key="starting_time", freq="20min"),
]).aggregate(np.average)

#===========================================================
# Plot speed of network, averaged over whole data period
#===========================================================
g = load_graph("phase1_road_network.gt")

tmpTxt = g.new_edge_property("string")
tmpWidth = g.new_edge_property("double")
tmpColor = g.new_edge_property("vector<double>")
tmpAvgTime = avg_segment_time_week.mean(level=2)  # level=2 means average over same link_id

for e in g.edges():
    #tmpTxt[e] = "%s %.0fm %d" % (eName[e], eLen[e], eLanes[e])
# only use dates we have volume values for
data = data.loc[data.index >= '12-31-1984']

# scale data
data['LogOpen'] = data['LogOpen'] - data['LogOpen'].min()
data['dx'] = data['dx'] - data['dx'].min()

# check everything looks okay
# print(data)

# need to break into input and output data samples
#data = data.resample('B').bfill()  # fill in missing days (holidays..)
data = data.resample('B').bfill()  # fill in missing days (holidays..)

weeks_df = [g for n, g in data.groupby(pd.TimeGrouper('W'))]
months_df = [g for n, g in data.groupby(pd.TimeGrouper('M'))]

print("data ", len(data))
print("weeks ", len(weeks_df))
print("months ", len(months_df))

# see if everything looks okay
# print(weeks)
# print("mins", data['LogOpen'].min(), data['dx'].min(), data['LogVolume'].min())
# print("maxs", data['LogOpen'].max(), data['dx'].max(), data['LogVolume'].max())

# convert to numpy matrix
print(len(weeks_df))
weeks = []
for i in range(len(weeks_df)):
# trc = np.zeros(len(weight))
# for k in range(len(weight)):
#     trc[k] = weight[k]*partial_derivative(portfolioRisk, k, weight)
# rc = 0
# for p in range(len(weight)):
#     for q in range(len(weight)):
#         rc += (trc[p] - trc[q])**2
# return(rc)

indexRets = pd.read_csv('/Users/admin/Desktop/doc/finance/multifactor/data/fof/indexRets.csv')
tradeDate = indexRets['tradeDate']
indexRets['tradeDate'] = pd.to_datetime(indexRets['tradeDate'], format='%Y-%m-%d')
indexRets.set_index('tradeDate', inplace=True)
tradeDate.index = indexRets.index

indexMonthlyRets = indexRets.groupby(pd.TimeGrouper(freq='M')).sum()
indexMonthlyRets = np.exp(indexMonthlyRets) - 1  # translate the log return to normal return

nAsset = len(indexRets.columns)
weight = np.array([1/nAsset]*nAsset)
print(weight)

sess = tf.InteractiveSession()
tfWeight = tf.Variable(weight, dtype=tf.float32, name='weight')
import pdb; pdb.set_trace()
tfWeight = tf.expand_dims(tfWeight, 0)
tfCov = tf.placeholder(dtype=tf.float32, shape=(nAsset, nAsset), name='retcov')
tfRateO = tf.placeholder(dtype=tf.float32, shape=(nAsset), name='retmean')
tfRate = tf.expand_dims(tfRateO, 0)
#portfolioCov = tf.matmul(tf.matmul(tf.expand_dims(tfWeight,0), retCov), tf.expand_dims(tfWeight,-1))
portfolioCov = tf.matmul(tf.matmul(tfWeight, tfCov), tf.transpose(tfWeight), name='portcov')
def upload(): print request.form data = [] categoryquery = request.form['category'] locationed = request.form['location'] print categoryquery, locationed clinet = MongoClient('127.0.0.1', 27017) conn = clinet["ADM"]["dummy4"] brands = conn.find().distinct("Category") location = conn.find().distinct("Geo") location.append("All") if locationed == "All": T = conn.find({"Category": categoryquery}) else: T = conn.find({"Category": categoryquery, "Geo": locationed}) leg = ['Date'] colname = [] for x in T: data.append(x) clinet.close() df_all = pd.DataFrame(data) #df_all['Date']=df_all['Date'].apply(lambda x: datetime.strptime(x,"%m/%d/%Y").strftime('%m/%d/%Y')) #df_all['Date'] = df_all['Date'].apply(lambda x: datetime.strptime(x,"%m/%d/%Y %H:%M")) #df_all['Date'] = df_all['Date'].apply(lambda x: datetime.strptime(x,"%m/%d/%y")) df_all['Date'] = df_all['Date'].apply( lambda x: datetime.strptime(x, "%m/%d/%Y")) cards_result = [] graph = [] for x in df_all.Brand.unique(): df = df_all[df_all['Brand'] == x] df['predicted_sentiment'] = df['vader_sentiment'] pos = len(df[df['predicted_sentiment'] == "Positive"]) neg = len(df[df['predicted_sentiment'] == "Negative"]) neu = len(df[df['predicted_sentiment'] == "Neutral"]) price_pos = sum(df[df['predicted_sentiment'] == "Positive"]['price']) price_neg = sum(df[df['predicted_sentiment'] == "Negative"]['price']) price_neu = sum(df[df['predicted_sentiment'] == "Negative"]['price']) qualtiy_pos = sum( df[df['predicted_sentiment'] == "Positive"]['quality']) qualtiy_neg = sum( df[df['predicted_sentiment'] == "Negative"]['quality']) qualtiy_neu = sum( df[df['predicted_sentiment'] == "Negative"]['quality']) service_pos = sum( df[df['predicted_sentiment'] == "Positive"]['service']) service_neg = sum( df[df['predicted_sentiment'] == "Negative"]['service']) service_neu = sum( df[df['predicted_sentiment'] == "Negative"]['service']) cards_result.append({ "total": len(df_all), "brand": x, "pos": pos, "neg": neg, "neu": neu, "pp": pos / (pos + neg + neu + 1), "price_pos": price_pos, "price_neg": price_neg, "price_neu": price_neu, "quality_pos": qualtiy_pos, "quality_neg": qualtiy_neg, "quality_neu": qualtiy_neu, "service_pos": service_pos, "service_neg": service_neg, "service_neu": service_neu }) temp = df.set_index('Date').groupby(pd.TimeGrouper('W')).size() temp_pos = df[df['predicted_sentiment'] == "Positive"].set_index( 'Date').groupby(pd.TimeGrouper('W')).size() temp_neg = df[df['predicted_sentiment'] == "Negative"].set_index( 'Date').groupby(pd.TimeGrouper('W')).size() temp_neu = df[df['predicted_sentiment'] == "Neutral"].set_index( 'Date').groupby(pd.TimeGrouper('W')).size() temp_pp = temp_pos * 100.0 / (temp + 1) graph.append(temp) graph.append(temp_pos) graph.append(temp_neg) graph.append(temp_neu) graph.append(temp_pp) colname.append(x + "_total") colname.append(x + "_pos") colname.append(x + "_neg") colname.append(x + "_neu") colname.append(str(x + "_pp")) leg.append(str(x + "_pp")) graph = pd.concat(graph, axis=1) graph = graph.fillna(0) graph.columns = colname print graph temp = [] for index, row in graph.iterrows(): each_week = [] each_week.append(int(index.strftime("%s"))) for l in leg[1:]: each_week.append(row[l]) temp.append(each_week) return render_template('index.html', legends=leg, data=data, category=brands, location=location, temp=temp, locationtag=locationed, serchtag=categoryquery, card=cards_result)
last_value = trajectoryData_train.loc[k-1, 'travel_time']
next_value = trajectoryData_train.loc[k+1, 'travel_time']
if last_value < 600:
    trajectoryData_train.loc[k, 'travel_time'] = (last_value + next_value)/2.0
else:
    trajectoryData_train.loc[k, 'travel_time'] = last_value

# COMMAND ----------

trajectoryData_train.describe()

# COMMAND ----------

trajectoryData_train['starting_time'] = ps.to_datetime(trajectoryData_train['starting_time'],
                                                       format='%Y-%m-%d %H:%M:%S')
trajectoryData_train = trajectoryData_train.set_index(['starting_time'])
trajectoryData_train = trajectoryData_train.groupby(
    [ps.TimeGrouper('20Min'), 'intersection_id', 'tollgate_id']
).travel_time.mean().reset_index().rename(columns={'travel_time': 'averagetravltime'})

trajectoryData_test['starting_time'] = ps.to_datetime(trajectoryData_test['starting_time'],
                                                      format="%Y-%m-%d %H:%M:%S")
trajectoryData_test = trajectoryData_test.set_index(['starting_time'])
trajectoryData_test = trajectoryData_test.groupby(
    [ps.TimeGrouper('20Min'), 'intersection_id', 'tollgate_id']
).travel_time.mean().reset_index().rename(columns={'travel_time': 'averagetravltime'})

print trajectoryData_train.shape, trajectoryData_test.shape

# COMMAND ----------

trajectoryData_train.head()

# COMMAND ----------

trajectoryData_train.shape

# COMMAND ----------
appended_data = appended_data.iloc[:, :-1]
appended_data.index = pd.to_datetime(appended_data.index, dayfirst=True)

# filter the data to a specific period (NOTE: we are working in UTC)
start = datetime.now().replace(hour=5, minute=0, second=0, microsecond=0)
end = (datetime.now() + timedelta(days=1)).replace(hour=5, minute=0, second=0, microsecond=0)

filtred_data = appended_data[
    (appended_data.index.get_level_values(0) >= str(start)) &
    (appended_data.index.get_level_values(0) <= str(end))]
filtred_data.index = pd.to_datetime(filtred_data.index)

agg_10m = filtred_data.groupby(pd.TimeGrouper(freq='10Min')).aggregate(np.sum)

mat = mat[(mat.index.get_level_values(0) >= str(start)) &
          (mat.index.get_level_values(0) <= str(end))]

# compute the centre of mass following Ishizaka ()
sum_D = []
sum_V = []
for i in range(0, len(agg_10m)):
    tt = agg_10m.iloc[i].replace(0, np.nan)
    a = tt.astype(float).values.reshape(32, 32)
    sum_V.append([np.nansum(x) for x in a])
    sum_D.append([np.nansum(x) for x in zip(*a)])

df_D = pd.DataFrame(sum_D)
df_V = pd.DataFrame(sum_V)
import pandas as pd

df = pd.read_hdf('balances.h5', 'balances')
print(df.to_string())
print(df.dtypes)

df_day = df.set_index('TimeStamp').groupby(
    [pd.TimeGrouper(freq='10MIN'), 'CURRENCY', 'Source', 'USD', 'BTC']).last()
print(df_day.to_string())

df_day = df_day.groupby(['TimeStamp', 'CURRENCY']).sum()
print(df_day.to_string())
data_311 = data_311_raw[data_311_raw.created_date <
                        pd.to_datetime(dt.date(2016, 1, 1))].dropna()

minlat = data_crime.Latitude.min()
maxlat = data_crime.Latitude.max()
minlon = data_crime.Longitude.min()
maxlon = data_crime.Longitude.max()
latrange = np.arange(minlat, maxlat + 0.02, 0.02)
lonrange = np.arange(minlon, maxlon + 0.02, 0.02)

data_crime = data_crime[data_crime.LAW_CAT_CD != 'VIOLATION']

d_c_grouped = data_crime.groupby(
    by=[pd.cut(data_crime['Latitude'], latrange),
        pd.cut(data_crime['Longitude'], lonrange),
        pd.TimeGrouper(key='CMPLNT_FR_DT', freq='M')])
d_311_grouped = data_311.groupby(
    by=[pd.cut(data_311['Latitude'], latrange),
        pd.cut(data_311['Longitude'], lonrange),
        pd.TimeGrouper(key='created_date', freq='M')])

max_var_loc = d_c_grouped.size().unstack().var(axis=1).argmax()

data_crime_window = data_crime_raw[data_crime_raw.CMPLNT_FR_DT.between(
    pd.to_datetime(dt.date(2010, 1, 1)),
    pd.to_datetime(dt.date(2015, 2, 1)))].dropna()
data_311_window = data_311_raw[data_311_raw.created_date.between(
    pd.to_datetime(dt.date(2010, 1, 1)),
    pd.to_datetime(dt.date(2015, 2, 1)))].dropna()

d_c_win_grouped = data_crime_window.groupby(
    by=[pd.cut(data_crime_window['Latitude'], latrange),
def api_modelling_unit_summarised(modellingUnitId): """ { "doc": { "title": "modelling unit summarised help", "body": "<p> Obtain the sumarized of the modelling unit </p>" }, "GET": { "label": "Obtain the modelling unit sumarized", "params":[{"name": "modellingUnitId", "type":"string", "required":"true", "doc":"id of the modelling_unit"}, {"name": "period", "type":"string", "required":"false", "info":"the period to sumarize", "values": ["D", "W", "M", "Y"]}, {"name": "type", "type":"list", "required":"false", "info":"the field to sumarize"}] } } """ companyId = g.get("auth_value") # params from url period = request.args.get('period', 'M') type = request.args.get('type', ['savings', 'smileys']) if not isinstance(type, list): type = type.split(',') # type=savings,smileys in the url periodsAllowed = ['D', 'W', 'M', 'Y'] # Weekly means Monday to Sunday period = period[0].upper() modellingUnitIdList = modellingUnitId.split(';') res_final = [] for modellingUnit in modellingUnitIdList: # recupero la info de mongo # update_baseline(companyId, modellingUnitId) # TO DO query_baseline = { 'companyId': companyId, 'modellingUnitId': modellingUnit } query_fields = { 'values': 1, 'prediction': 1, 'smileys': 1, 'timestamps': 1 } doc_baseline = app.data.driver.db['baselines'].find_one( query_baseline, query_fields) res = {} if doc_baseline and 'values' in doc_baseline: n = -12 * 7 * 24 * 2 # creo el dataframe df = pd.DataFrame.from_records({ 'values': doc_baseline['values'][n:], 'smileys': doc_baseline['smileys'][n:], 'prediction': doc_baseline['prediction'][n:], 'timestamps': doc_baseline['timestamps'][n:] }) df = df.set_index(pd.DatetimeIndex(df['timestamps'])) df = df.drop('timestamps', 1) # a list is needed if not isinstance(type, list): type = [type] if df.empty != True and period in periodsAllowed: # calculo les agrupacions dels diff valors for typ in type: if typ in doc_baseline.keys() or typ == 'savings': if typ in ['savings', 'values', 'prediction']: # filtre per negatius df_grouped = df.clip(lower=0) # filtre per valors >>> # df_grouped = df_grouped[np.abs(df_grouped.prediction-df_grouped.prediction.mean())<=(10*df_grouped.prediction.std())] df_grouped = df_grouped.groupby( pd.TimeGrouper(freq=period)).sum() else: df_grouped = df.groupby( pd.TimeGrouper(freq=period)).mean() # res_parcial['groupedValues'] = df_grouped['value'].tolist() # res_parcial['groupedPrediction'] = df_grouped['prediction'].tolist() if typ == 'savings': res[typ] = df_grouped['prediction'] - df_grouped[ 'values'] else: res[typ] = df_grouped[typ] res[typ] = res[typ].where( (pd.notnull(res[typ])), None).tolist() # replacing nan by None else: res[typ] = None try: # if there is any valid type res['timestamps'] = df_grouped.index.tolist() res['number_of_elements'] = df['values'].groupby( pd.TimeGrouper(freq=period)).count().dropna().tolist() except: for typ in type: res[typ] = None res['timestamps'] = None res['number_of_elements'] = None else: # res_parcial['groupedValues'] = None # res_parcial['groupedPrediction'] = None for typ in type: res[typ] = None res['timestamps'] = None res['number_of_elements'] = None res_final.append(res) # torno una llista o un element res = res_final[0] if len(res_final) < 2 else res_final return send_response('', (res, None, None, 200))
def resample(self, freq, dim, how='mean', skipna=None, closed=None, label=None, base=0, keep_attrs=False): """Resample this object to a new temporal resolution. Handles both downsampling and upsampling. Upsampling with filling is not yet supported; if any intervals contain no values in the original object, they will be given the value ``NaN``. Parameters ---------- freq : str String in the '#offset' to specify the step-size along the resampled dimension, where '#' is an (optional) integer multipler (default 1) and 'offset' is any pandas date offset alias. Examples of valid offsets include: * 'AS': year start * 'QS-DEC': quarterly, starting on December 1 * 'MS': month start * 'D': day * 'H': hour * 'Min': minute The full list of these offset aliases is documented in pandas [1]_. dim : str Name of the dimension to resample along (e.g., 'time'). how : str or func, optional Used for downsampling. If a string, ``how`` must be a valid aggregation operation supported by xarray. Otherwise, ``how`` must be a function that can be called like ``how(values, axis)`` to reduce ndarray values along the given axis. Valid choices that can be provided as a string include all the usual Dataset/DataArray aggregations (``all``, ``any``, ``argmax``, ``argmin``, ``max``, ``mean``, ``median``, ``min``, ``prod``, ``sum``, ``std`` and ``var``), as well as ``first`` and ``last``. skipna : bool, optional Whether to skip missing values when aggregating in downsampling. closed : 'left' or 'right', optional Side of each interval to treat as closed. label : 'left or 'right', optional Side of each interval to use for labeling. base : int, optionalt For frequencies that evenly subdivide 1 day, the "origin" of the aggregated intervals. For example, for '24H' frequency, base could range from 0 through 23. keep_attrs : bool, optional If True, the object's attributes (`attrs`) will be copied from the original object to the new one. If False (default), the new object will be returned without attributes. Returns ------- resampled : same type as caller This object resampled. References ---------- .. [1] http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases """ from .dataarray import DataArray RESAMPLE_DIM = '__resample_dim__' if isinstance(dim, basestring): dim = self[dim] group = DataArray(dim, [(RESAMPLE_DIM, dim)], name=RESAMPLE_DIM) time_grouper = pd.TimeGrouper(freq=freq, how=how, closed=closed, label=label, base=base) gb = self.groupby_cls(self, group, grouper=time_grouper) if isinstance(how, basestring): f = getattr(gb, how) if how in ['first', 'last']: result = f(skipna=skipna, keep_attrs=keep_attrs) else: result = f(dim=dim.name, skipna=skipna, keep_attrs=keep_attrs) else: result = gb.reduce(how, dim=dim.name, keep_attrs=keep_attrs) result = result.rename({RESAMPLE_DIM: dim.name}) return result
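A minimal usage sketch of the legacy resample signature defined and documented above (the freq/dim/how call form that pre-dates xarray's newer resample API); the dimension name 'time' and the synthetic data are illustrative assumptions, not taken from the source.

import numpy as np
import pandas as pd
import xarray as xr

# hourly synthetic series, downsampled to daily means via the legacy call form
times = pd.date_range('2015-01-01', periods=48, freq='H')
arr = xr.DataArray(np.random.rand(48), coords=[('time', times)], name='signal')

daily_mean = arr.resample('1D', dim='time', how='mean')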
def api_get_user_modelling_units_results(userModellingUnitId, period, divider): """ { "doc": { "title": "user modelling units results help", "body": "<p> Obtain the user modelling units results </p>" }, "GET": { "label": "Obtain the user modelling units results", "params":[{"name": "userModellingUnitId", "type":"string", "required":"true", "doc":"id of the user modelling_unit"}, {"name": "period", "type":"string", "required":"false", "info":"the period to sumarize", "values": ["D", "W", "M", "Y"]}, {"name": "divider", "type":"string", "required":"false", "info":"the field to divide the information"}] } } """ companyId = g.get("auth_value") ######## start code periodsAllowed = ['D', 'W', 'M', 'Y'] # Weekly means Monday to Sunday modelling_unit_types = ['electricityConsumption', 'gasConsumption'] period = period[0].upper() # recupero la info de mongo query = { 'companyId': companyId, 'userModellingUnitId': userModellingUnitId } doc = app.data.driver.db['user_modelling_units'].find_one( query, {'_id': 0}) # timeout=False) try: building_id = doc['buildings'] except: building_id = [] buildings_docs = [] for i, building in enumerate(building_id): query_building = {'companyId': companyId, 'buildingId': building} doc_building = app.data.driver.db['buildings'].find_one( query_building, { 'buildingId': 1, 'data.' + divider: 1 }) if doc_building: buildings_docs.append(doc_building) # need to clean the building to obtain the desired dictionary clean_building_docs = [{ "buildingId": b['buildingId'], divider: b['data'][divider] if divider in b['data'] else 'unknown', } for b in buildings_docs] # create the dataframe of buildings building_df = pd.DataFrame.from_records(clean_building_docs) # get all divider and group the tables, then filter them by the different groups building_divider_values = building_df[divider].unique() building_grouped = building_df.groupby(divider) # initialize the results variable results = {} # iterate for all modelling unit types for modelling_unit_type in modelling_unit_types: baselines_by_divider = pd.DataFrame() # and all different divider groups for building_divider in building_divider_values: # get the buildings of thes divider buildings_by_divider = building_grouped.get_group(building_divider) baselines_by_divider_temp = [] for building_id in buildings_by_divider.buildingId: query_baseline = { 'companyId': companyId, 'modellingUnitId': building_id + '-' + modelling_unit_type } doc_baseline = app.data.driver.db['baselines'].find_one( query_baseline, { 'P50': 1, 'values': 1, 'timestamps': 1 }) if (doc_baseline): if len(doc_baseline['values']) == len( doc_baseline['P50']) == len( doc_baseline['timestamps']): df = pd.DataFrame.from_records({ 'values': doc_baseline['values'], 'P50': doc_baseline['P50'], 'timestamps': doc_baseline['timestamps'] }) df = df.set_index(pd.DatetimeIndex(df['timestamps'])) df_grouped = df.groupby( pd.TimeGrouper(freq=period)).sum() baselines_by_divider_temp.append(df_grouped) if baselines_by_divider_temp: baselines_by_divider_temp = pd.concat( baselines_by_divider_temp, axis=1) v = baselines_by_divider_temp # dropna try: final = pd.DataFrame.from_records({ 'P50': v['P50'].sum(axis=1), 'values': v['values'].sum(axis=1) }) except: final = v else: final = pd.DataFrame() final.rename(index=str, columns={ 'values': 'values-' + building_divider, 'P50': 'P50-' + building_divider }, inplace=True) # print(final.dropna()) baselines_by_divider = pd.concat( [baselines_by_divider, final.dropna()], axis=1) baselines_by_divider = baselines_by_divider 
results[modelling_unit_type] = { "timestamps": baselines_by_divider.index.tolist() if not v.empty else [] } for divider in building_divider_values: try: results[modelling_unit_type].update({ divider: { "values": baselines_by_divider['values-' + divider].tolist() if not v.empty else [], "P50": baselines_by_divider['P50-' + divider].tolist() if not v.empty else [] } }) except: pass return send_response('', (results, None, None, 200))
#cPickle.dump(genres, open(outputdir+'gn_genres.pkl', 'w'))
genres = cPickle.load(open(outputdir+'gn_genres.pkl'))

result = pd.DataFrame(0., index=daterange,
                      columns=genres['genre1']+genres['genre2']+genres['genre3'])
# if len(done)==0:
#     result = pd.DataFrame(0.,index=daterange,columns=genres['genre1']+genres['genre2']+genres['genre3'])
# else:
#     result = pd.read_pickle(outputdir+'genre_data')

for i, f in enumerate(files):
    user_start = time.time()
    # if f in done:
    #     continue
    df = pd.read_table(f, sep='\t', header=None,
                       names=['item_id', 'artist_id', 'scrobble_time'],
                       parse_dates=['scrobble_time']).join(gn, on='item_id', how='left')
    for level in genres:
        vars()['df_'+level] = df.set_index('scrobble_time').groupby(
            [pd.TimeGrouper(freq='D'), level]).count()['item_id'].unstack().reindex(
                daterange, columns=genres[level])
    concat = pd.concat([df_genre1, df_genre2, df_genre3], axis=1).fillna(0)
    result += concat
    rootLogger.info("{} ({}/{}, {:.1f}, {}, block {})".format(
        f, i+1, n_users, time.time()-user_start, len(df), idx))
    #time_elapsed = time.time() - start
    # if time_elapsed >= (wall_time-(time_buffer)):
    #     result.to_pickle(outputdir+'genre_data')
    #     sys.exit()

result.to_pickle(outputdir+'genre_data_'+str(idx))
"""
def estimate_intraday(returns, positions, transactions, EOD_hour=23): """ Intraday strategies will often not hold positions at the day end. This attempts to find the point in the day that best represents the activity of the strategy on that day, and effectively resamples the end-of-day positions with the positions at this point of day. The point of day is found by detecting when our exposure in the market is at its maximum point. Note that this is an estimate. Parameters ---------- returns : pd.Series Daily returns of the strategy, noncumulative. - See full explanation in create_full_tear_sheet. positions : pd.DataFrame Daily net position values. - See full explanation in create_full_tear_sheet. transactions : pd.DataFrame Prices and amounts of executed trades. One row per trade. - See full explanation in create_full_tear_sheet. Returns ------- pd.DataFrame Daily net position values, resampled for intraday behavior. """ # Construct DataFrame of transaction amounts txn_val = transactions.copy() txn_val.index.names = ['date'] txn_val['value'] = txn_val.amount * txn_val.price txn_val = txn_val.reset_index().pivot_table(index='date', values='value', columns='symbol').replace( np.nan, 0) # Cumulate transaction amounts each day txn_val['date'] = txn_val.index.date txn_val = txn_val.groupby('date').cumsum() # Calculate exposure, then take peak of exposure every day txn_val['exposure'] = txn_val.abs().sum(axis=1) condition = (txn_val['exposure'] == txn_val.groupby( pd.TimeGrouper('24H'))['exposure'].transform(max)) txn_val = txn_val[condition].drop('exposure', axis=1) # Compute cash delta txn_val['cash'] = -txn_val.sum(axis=1) # Shift EOD positions to positions at start of next trading day positions_shifted = positions.copy().shift(1).fillna(0) starting_capital = positions.iloc[0].sum() / (1 + returns[0]) positions_shifted.cash[0] = starting_capital # Format and add start positions to intraday position changes txn_val.index = txn_val.index.normalize() corrected_positions = positions_shifted.add(txn_val, fill_value=0) corrected_positions.index.name = 'period_close' corrected_positions.columns.name = 'sid' return corrected_positions
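A hedged, self-contained sketch of how estimate_intraday above might be called; the single-symbol returns, positions and transactions frames are made-up toy data in the formats the docstring describes, and it assumes a pandas version that still ships pd.TimeGrouper.

import numpy as np
import pandas as pd

# toy single-symbol inputs (illustrative values only)
txn_times = pd.DatetimeIndex(['2015-01-02 10:00', '2015-01-02 14:00',
                              '2015-01-05 11:00'])
transactions = pd.DataFrame({'amount': [10, -10, 5],
                             'price': [100.0, 101.0, 102.0],
                             'symbol': ['AAA', 'AAA', 'AAA']}, index=txn_times)

days = pd.DatetimeIndex(['2015-01-02', '2015-01-05'])
positions = pd.DataFrame({'AAA': [0.0, 510.0],
                          'cash': [10000.0, 9490.0]}, index=days)
returns = pd.Series([0.0, 0.001], index=days)

# resample end-of-day positions to each day's peak-exposure point
intraday_positions = estimate_intraday(returns, positions, transactions)
print(intraday_positions)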
def uniquelogins(sessions): """Unique logins per days/weeks/months. :return: daily, weekly, monthly 3 lists of dictionaries of the following format [{'x':epoch, 'y': value},] """ # sessions = LoginSession.query.order_by(LoginSession.started_at.asc()).all() if not sessions: return [], [], [] dates = {} for session in sessions: user = session.user # time value is discarded to aggregate on days only date = session.started_at.strftime("%Y/%m/%d") if date not in dates: dates[date] = set() # we want unique users on a given day dates[date].add(user) else: dates[date].add(user) daily = [] weekly = [] monthly = [] for date in sorted(dates.keys()): # print u"{} : {}".format(date, len(dates[date])) date_epoch = unix_time_millis(datetime.strptime(date, "%Y/%m/%d")) daily.append({'x': date_epoch, 'y': len(dates[date])}) # first_day = data[0]['x'] # last_day = data[-1]['x'] daily_serie = pd.Series(dates) # convert the index to Datetime type daily_serie.index = pd.DatetimeIndex(daily_serie.index) # calculate the values instead of users lists daily_serie = daily_serie.apply(lambda x: len(x)) # GroupBy Week/month, Thanks Panda weekly_serie = daily_serie \ .groupby(pd.TimeGrouper(freq='W')) \ .aggregate(numpysum) monthly_serie = daily_serie \ .groupby(pd.TimeGrouper(freq='M')) \ .aggregate(numpysum) for date, value in six.iteritems(weekly_serie): try: value = int(value) except ValueError: continue date_epoch = unix_time_millis(date) weekly.append({'x': date_epoch, 'y': value}) for date, value in six.iteritems(monthly_serie): try: value = int(value) except ValueError: continue date_epoch = unix_time_millis(date) monthly.append({'x': date_epoch, 'y': value}) return daily, weekly, monthly
def get_group_members_over_time(req_obj):
    # Establish database connection
    gc.connect_to_database()

    # Determine group guid
    group_guid = get_group_guid(req_obj['url'])

    # Query the database
    group_members = gc.groups.get_group_members(group_guid, cleaned=False)
    group_name = gc.groups.name_from_guid(group_guid)

    # Get mungin'
    # Convert times to datetime objects
    group_members['time_created'] = group_members['time_created'].apply(
        lambda x: pd.to_datetime(x))
    group_members.set_index('time_created', inplace=True)
    group_members = group_members[
        group_members.index > pd.to_datetime('2000-01-01')]

    # Daily
    group_members_daily = group_members['user_name'].groupby(
        pd.TimeGrouper(freq='D')).count().cumsum()
    group_members_daily = group_members_daily.reset_index()

    # If the requested start date predates the dataframe, pad with 0s
    if min(group_members_daily['time_created']) > pd.to_datetime(
            req_obj['start_date']):
        ix = pd.DatetimeIndex(start=pd.to_datetime(req_obj['start_date']),
                              end=max(group_members_daily['time_created']),
                              freq='D')
        group_members_daily = group_members_daily.set_index(
            'time_created').reindex(ix, fill_value=0).reset_index()
        group_members_daily.rename(columns={'index': 'time_created'},
                                   inplace=True)

    # If the requested end date is after the end of the dataframe, pad with last value
    if max(group_members_daily['time_created']) < pd.to_datetime(
            req_obj['end_date']):
        ix = pd.DatetimeIndex(start=pd.to_datetime(req_obj['start_date']),
                              end=pd.to_datetime(req_obj['end_date']),
                              freq='D')
        group_members_daily = group_members_daily.set_index(
            'time_created').reindex(
                ix, fill_value=max(
                    group_members_daily['user_name'])).reset_index()
        group_members_daily.rename(columns={'index': 'time_created'},
                                   inplace=True)

    # Only keep current time selection
    group_members_daily = group_members_daily[
        group_members_daily['time_created'] >= pd.to_datetime(
            req_obj['start_date'])]
    group_members_daily = group_members_daily[
        group_members_daily['time_created'] <= pd.to_datetime(
            req_obj['end_date'])]
    group_members_daily['time_created'] = group_members_daily[
        'time_created'].apply(lambda x: x.strftime('%Y%m%d'))

    # Monthly
    group_members_monthly = group_members['user_name'].groupby(
        pd.TimeGrouper(freq='M')).count().cumsum()
    group_members_monthly = group_members_monthly.reset_index()

    # (monthly) If the requested start date predates the oldest time on the dataframe, pad with 0s
    if min(group_members_monthly['time_created']) > pd.to_datetime(
            req_obj['start_date']):
        ix = pd.DatetimeIndex(start=pd.to_datetime(req_obj['start_date']),
                              end=max(group_members_monthly['time_created']),
                              freq='M')
        group_members_monthly = group_members_monthly.set_index(
            'time_created').reindex(ix, fill_value=0).reset_index()
        group_members_monthly.rename(columns={'index': 'time_created'},
                                     inplace=True)

    # If the requested end date is after the end of the dataframe, pad with last value
    if max(group_members_monthly['time_created']) < pd.to_datetime(
            req_obj['end_date']):
        ix = pd.DatetimeIndex(start=pd.to_datetime(req_obj['start_date']),
                              end=pd.to_datetime(req_obj['end_date']),
                              freq='M')
        group_members_monthly = group_members_monthly.set_index(
            'time_created').reindex(
                ix, fill_value=max(
                    group_members_monthly['user_name'])).reset_index()
        group_members_monthly.rename(columns={'index': 'time_created'},
                                     inplace=True)

    # Only keep current time selection
    group_members_monthly = group_members_monthly[
        group_members_monthly['time_created'] >= pd.to_datetime(
            req_obj['start_date'])]
    group_members_monthly = group_members_monthly[
        group_members_monthly['time_created'] <= pd.to_datetime(
            req_obj['end_date'])]
    group_members_monthly['time_created'] = group_members_monthly[
        'time_created'].apply(lambda x: x.strftime('%Y%m%d'))

    send_obj = {
        'monthly': {
            'dates': group_members_monthly['time_created'].values.tolist(),
            'users': group_members_monthly['user_name'].values.tolist()
        },
        'daily': {
            'dates': group_members_daily['time_created'].values.tolist(),
            'users': group_members_daily['user_name'].values.tolist(),
        },
        'group_name': group_name
    }

    print(json.dumps(send_obj))
def update_data_frame(self): num_lines = sum(1 for line in open(self.jmeter_results_file)) if self.start_line < num_lines - 10: read_lines = num_lines - self.start_line - 10 #if self.file_size < os.path.getsize(self.jmeter_results_file): #self.file_size = os.path.getsize(self.jmeter_results_file) df = pd.read_csv(self.jmeter_results_file, index_col=0, low_memory=False, skiprows=self.start_line, nrows=read_lines) df.columns = [ 'average', 'URL', 'responseCode', 'success', 'threadName', 'failureMessage', 'grpThreads', 'allThreads' ] df = df[~df['URL'].str.contains('exclude_')] df.index = pd.to_datetime(dateconv((df.index.values / 1000))) # update start line for the next parse self.start_line = self.start_line + read_lines group_by_response_codes = df.groupby('responseCode') add_df = pd.DataFrame() add_df['count'] = group_by_response_codes.success.count() #add_df['thread_count'] = group_by_response_codes['grpThreads'].nunique() add_df = add_df.fillna(0) add_df = add_df.reset_index() add_df.columns = ['response_code', 'count'] df1 = pd.concat([ self.response_codes_frame, add_df ]).groupby('response_code')['count'].sum().reset_index() self.response_codes_frame = df1 #create aggregate table group_by_url = df.groupby('URL') # group date by URLs add_aggregate_data = group_by_url.aggregate({ 'average': np.mean }).round(1) add_aggregate_data['maximum'] = group_by_url.average.max().round(1) add_aggregate_data['minimum'] = group_by_url.average.min().round(1) add_aggregate_data['count'] = group_by_url.success.count().round(1) add_aggregate_data['errors'] = df[( df.success == False)].groupby('URL')['success'].count() add_aggregate_data = add_aggregate_data.fillna(0) add_aggregate_data = add_aggregate_data.reset_index() add_aggregate_data.columns = [ 'URL', 'average', 'maximum', 'minimum', 'count', 'errors' ] #??? 
df1 = pd.concat([self.aggregate_frame, add_aggregate_data ]).groupby('URL')['average'].mean().reset_index() df2 = pd.concat([self.aggregate_frame, add_aggregate_data ]).groupby('URL')['count', 'errors'].sum().reset_index() df3 = pd.concat([self.aggregate_frame, add_aggregate_data ]).groupby('URL')['maximum'].max().reset_index() df4 = pd.concat([self.aggregate_frame, add_aggregate_data ]).groupby('URL')['minimum'].min().reset_index() result_df = pd.merge(df1, df2, how='inner', on='URL') result_df = pd.merge(result_df, df3, how='inner', on='URL') result_df = pd.merge(result_df, df4, how='inner', on='URL') self.aggregate_frame = result_df add_df2 = pd.DataFrame() gr_by_minute = df.groupby( pd.TimeGrouper(freq='1Min')) # group data by minute add_df2['average'] = gr_by_minute.average.mean() add_df2['median'] = gr_by_minute.average.median() add_df2['count'] = gr_by_minute.success.count() add_df2['errors_count'] = df[(df.success == False)].groupby( pd.TimeGrouper(freq='1Min'))['success'].count() #add_df2['thread_count'] = gr_by_minute['grpThreads'].nunique() #add_df2['rps'] = gr_by_minute.success.count()/60 add_df2 = add_df2.fillna(0) add_df2 = add_df2.reset_index() add_df2.columns = [ 'time', 'average', 'median', 'count', 'errors_count' ] df1 = pd.concat([self.data_frame, add_df2 ]).groupby('time')['average', 'median'].mean().reset_index() df2 = pd.concat( [self.data_frame, add_df2]).groupby('time')['count', 'errors_count'].sum().reset_index() #df3 = pd.concat([self.data_frame,add_df2]).groupby('time')['thread_count'].max().reset_index() result_df = pd.merge(df1, df2, how='inner', on='time') #result_df = pd.merge(result_df1, df3, how='inner',on='time') result_df['rps'] = result_df['count'] / 60 #print 'result_df' self.data_frame = result_df #print self.data_frame else: logger.info(".jtl file was not changed")
for junction in df_train.Junction.unique():
    fig = plt.figure(junction, figsize=(10, 2))
    df_train[df_train.Junction == junction].resample(
        "D")["Vehicles"].count().plot()
    plt.title("Existing values for Junction " + str(junction))
    plt.show()

# ** -> No missing observations for each hour, and for each junction. **

# In[16]:

# We plot vehicles observations per junction
for junc in df_train.Junction.unique():
    fig = plt.figure(junc, figsize=(20, 3))
    df_train[df_train.Junction == junc].groupby(
        pd.TimeGrouper('D')).mean().Vehicles.plot()
    plt.title('Vehicles of Junction {}'.format(junc))
    plt.show()

# ** -> The first three junctions span the same dates. The last junction (number 4) has fewer
# observations: only since Jan. 2017. The frequency per hour remains unchanged. **

# ### Stationarity Checks for each TS:

# In[48]:

# Source of code: https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
from statsmodels.tsa.stattools import adfuller


def test_stationarity(idx, timeseries):
    fig = plt.figure(idx, figsize=(20, 5))
def pageviews(self, URLs, start_date='30daysAgo', end_date='today', intervals=False): """ Return a dataframe containing views on a particular page. First argument can be a URL string or list of URLs. """ def strip_domain(url): return url.replace('https://gcconnex.gc.ca/','').replace('https://gccollab.ca/','').replace('www.gcpedia.gc.ca/','') metric = 'ga:pageviews' dimension = 'ga:date' # Strip the domain from the URL if type(URLs) == list: URLs = list(map(lambda x: strip_domain(x), URLs )) else: URLs = [strip_domain(URLs)] # Construct filter clauses for both requests filter_clause = self._construct_filter_clause(metric, 'ga:pagePath', URLs) # Should first construct report for found pagePaths. Print to ensure nothing is wonky. # Construct report for stats. response_names = self._make_report(start_date, end_date, metric, 'ga:pagePath', filter_clause, order='views') response_stats = self._make_report(start_date, end_date, metric, 'ga:date', filter_clause, order='date') df_names = self._parse_response_into_df(response_names) df = self._parse_response_into_df(response_stats) df.columns = ['date', 'pageviews'] df['date'] = df['date'].apply(lambda x: pd.to_datetime(x, format='%Y%m%d')) df.set_index('date', inplace=True) idx = pd.date_range(start_date, end_date) #code.interact(local=locals()) df = df.reindex(idx, fill_value=0) df = df[df.index.weekday < 5] # Should work now df['pageviews'] = df['pageviews'].astype(int) if intervals == True: # Create both monthly and daily df_month = df.groupby(pd.TimeGrouper(freq='M')).sum() #code.interact(local=locals()) df_month.reset_index(inplace=True) df_month.rename(columns={'index':'date'}, inplace=True) df_month['pageviews'] = df_month['pageviews'].astype(str) df_month['date'] = df_month['date'].apply(lambda x: x.strftime('%Y%m%d')) df.reset_index(inplace=True) #code.interact(local=locals()) df.rename(columns={'index':'date'}, inplace=True) df['pageviews'] = df['pageviews'].astype(str) df['date'] = df['date'].apply(lambda x: x.strftime('%Y%m%d')) # Build lists from columns for C3 timechart format if intervals == True: return { 'daily': { 'dates': df['date'].values.tolist(), 'pageviews': df['pageviews'].values.tolist() }, 'monthly': { 'dates': df_month['date'].values.tolist(), 'pageviews': df_month['pageviews'].values.tolist() } } else: return { 'dates': df['date'].values.tolist(), 'pageviews': df['pageviews'].values.tolist() }
def plotYearMonthStatsHb(data):
    #pd.groupby(b, by=[b.index.month, b.index.year])
    data.groupby(pd.TimeGrouper(freq='M')).mean().plot()
    sns.plt.show()
In [ ]: dti = pd.date_range(start='2015-01-01', end='2015-12-31', freq='B')
        s = pd.Series(np.random.rand(len(dti)), index=dti)

34. Find the sum of the values in s for every Wednesday.

In [ ]: s[s.index.weekday == 2].sum()

35. For each calendar month in s, find the mean of values.

In [ ]: s.resample('M').mean()

36. For each group of four consecutive calendar months in s, find the date on which the highest value occurred.

In [ ]: s.groupby(pd.TimeGrouper('4M')).idxmax()

37. Create a DateTimeIndex consisting of the third Thursday in each month for the years 2015 and 2016.

In [ ]: pd.date_range('2015-01-01', '2016-12-31', freq='WOM-3THU')

Cleaning Data

Making a DataFrame easier to work with

Difficulty: easy/medium

It happens all the time: someone gives you data containing malformed strings, Python, lists and missing data. How do you tidy it up so you can get on with the analysis?

Take this monstrosity as the DataFrame to use in the following puzzles:

df = pd.DataFrame({'From_To': ['LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm',
                               'Budapest_PaRis', 'Brussels_londOn'],
                   'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
def test_TimeGrouper(self):
    with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
        pd.TimeGrouper(freq='D')
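For context on the FutureWarning asserted above: pd.TimeGrouper was deprecated in pandas 0.21 as an alias of pd.Grouper, so the warning-free spelling of the same time-based grouper is a pd.Grouper with a freq argument. A minimal sketch with made-up data:

import numpy as np
import pandas as pd

idx = pd.date_range('2013-01-01', periods=6, freq='12H')
s = pd.Series(np.arange(6), index=idx)

# pd.Grouper(freq=...) is the non-deprecated equivalent of pd.TimeGrouper(freq=...)
daily_sums = s.groupby(pd.Grouper(freq='D')).sum()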
def resample_benchmark_return(self, frequence):
    return self.benchmark_return.groupby(pd.TimeGrouper(
        freq=frequence)).agg(stats.cum_returns_final).dropna()