Example #1
group_freq = '60min'
df_all = pd.concat((train,
    test))[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
train.loc[:, 'pickup_datetime_group'] = train['pickup_datetime'].dt.round(
    group_freq)
test.loc[:, 'pickup_datetime_group'] = test['pickup_datetime'].dt.round(
    group_freq)

# Count trips over 60min
df_counts = df_all.set_index('pickup_datetime')[['id']].sort_index()
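# (isnull() just produces a boolean frame with no NaNs, so rolling(group_freq).count()
#  counts every row, i.e. every trip, in the trailing 60-minute window)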
df_counts['count_60min'] = df_counts.isnull().rolling(group_freq).count()['id']
train = train.merge(df_counts, on='id', how='left')
test = test.merge(df_counts, on='id', how='left')

# Count how many trips are going to each cluster over time
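# (hourly counts per dropoff cluster, smoothed with a 240min rolling mean and then
#  shifted back 120min so the smoothing window is roughly centred on each time group)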
dropoff_counts = df_all \
    .set_index('pickup_datetime') \
    .groupby([pd.TimeGrouper(group_freq), 'dropoff_cluster']) \
    .agg({'id': 'count'}) \
    .reset_index().set_index('pickup_datetime') \
    .groupby('dropoff_cluster').rolling('240min').mean() \
    .drop('dropoff_cluster', axis=1) \
    .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
    .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': 'dropoff_cluster_count'})

train['dropoff_cluster_count'] = train[[
    'pickup_datetime_group', 'dropoff_cluster'
]].merge(dropoff_counts,
         on=['pickup_datetime_group', 'dropoff_cluster'],
         how='left')['dropoff_cluster_count'].fillna(0)
test['dropoff_cluster_count'] = test[[
    'pickup_datetime_group', 'dropoff_cluster'
]].merge(dropoff_counts,
         on=['pickup_datetime_group', 'dropoff_cluster'],
         how='left')['dropoff_cluster_count'].fillna(0)
Example #2
    journeys = journeys.loc[(journeys['Booking Time'] > start) & (journeys['Drop-off Time'] < end)]
    print(journeys.shape)
    vehicles = h.convertColumns(vehicles, ['TimeStamp'], h.todatetime)
    vehicles = vehicles.loc[(vehicles.TimeStamp > start) & (vehicles.TimeStamp < end)]
    vehicleNames = {id: "vehicle-" + str(index+1) for index, id in enumerate(vehicles['Vehicle ID'].unique())}
    vehicles = vehicles.replace({"Vehicle ID": vehicleNames})
    vehicles = vehicles.sort_values(by=['Vehicle ID', 'TimeStamp'])
    vehicles['Battery'] = - vehicles.groupby('Vehicle ID')['Battery Level'].diff().fillna(0)
    vehicles = vehicles.set_index("TimeStamp")
    batteryPerVehicle = vehicles.groupby('Vehicle ID')[['Battery']].agg(batteryCount)
    totalBattery = batteryPerVehicle['Battery'].sum()
    batteryPerVehicle.loc["Total"] = totalBattery
    fig1 = h.render_mpl_table(batteryPerVehicle, header_columns=0, col_width=7.0, title = "Vehicle Battery Consumption")

    time_grouper = pd.TimeGrouper(freq=cfg.frequencyForAnalysis)

    vehicles['Battery'] = vehicles['Battery'].apply(lambda x: x if (x > 0) else 0)
    battery_freq = vehicles.groupby(["Vehicle ID",time_grouper]).sum()[['Battery']]

    print(vehicles.shape)

    vehicles['Occupancy'] = vehicles.apply(countSeats, axis = 1)
    occupancy_freq = vehicles.groupby(["Vehicle ID",time_grouper]).mean()[['Occupancy']]

    vehicles['LoadFactor'] = vehicles['Occupancy']*vehicles['Battery']
    vehiclesSumPerVehicle = vehicles.groupby('Vehicle ID').sum()
    loadFactorPerVehicle = vehiclesSumPerVehicle['LoadFactor'] / (vehiclesSumPerVehicle['Battery']*cfg.maxCapacity)
    loadFactorPerVehicle = loadFactorPerVehicle.to_frame(name="Load Factor")
    averageLoadFactor = loadFactorPerVehicle['Load Factor'].mean()
    maxLoadFactor = loadFactorPerVehicle['Load Factor'].max()
Example #3
    def test_timegrouper_with_reg_groups(self):

        # GH 3794
        # allow combination of timegrouper/reg groups

        df_original = DataFrame({
            'Branch':
            'A A A A A A A B'.split(),
            'Buyer':
            'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 1, 1, 13, 0),
                datetime(2013, 1, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 12, 2, 12, 0),
                datetime(2013, 12, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)

        for df in [df_original, df_sorted]:
            expected = DataFrame({
                'Buyer':
                'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                    datetime(2013, 12, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='A'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            expected = DataFrame({
                'Buyer':
                'Carl Mark Carl Joe'.split(),
                'Quantity': [1, 3, 9, 18],
                'Date': [
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 1, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                    datetime(2013, 7, 1, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            result = df.groupby([pd.Grouper(freq='6MS'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

        df_original = DataFrame({
            'Branch':
            'A A A A A A A B'.split(),
            'Buyer':
            'Carl Mark Carl Carl Joe Joe Joe Carl'.split(),
            'Quantity': [1, 3, 5, 1, 8, 1, 9, 3],
            'Date': [
                datetime(2013, 10, 1, 13, 0),
                datetime(2013, 10, 1, 13, 5),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 1, 20, 0),
                datetime(2013, 10, 2, 10, 0),
                datetime(2013, 10, 2, 12, 0),
                datetime(2013, 10, 2, 14, 0),
            ]
        }).set_index('Date')

        df_sorted = df_original.sort_values(by='Quantity', ascending=False)
        for df in [df_original, df_sorted]:

            expected = DataFrame({
                'Buyer':
                'Carl Joe Mark Carl Joe'.split(),
                'Quantity': [6, 8, 3, 4, 10],
                'Date': [
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 1, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                    datetime(2013, 10, 2, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])

            result = df.groupby([pd.Grouper(freq='1D'), 'Buyer']).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M'), 'Buyer']).sum()
            expected = DataFrame({
                'Buyer':
                'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                    datetime(2013, 10, 31, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # passing the name
            df = df.reset_index()
            result = df.groupby([pd.Grouper(freq='1M', key='Date'),
                                 'Buyer']).sum()
            assert_frame_equal(result, expected)

            with pytest.raises(KeyError):
                df.groupby([pd.Grouper(freq='1M', key='foo'), 'Buyer']).sum()

            # passing the level
            df = df.set_index('Date')
            result = df.groupby([pd.Grouper(freq='1M', level='Date'),
                                 'Buyer']).sum()
            assert_frame_equal(result, expected)
            result = df.groupby([pd.Grouper(freq='1M', level=0),
                                 'Buyer']).sum()
            assert_frame_equal(result, expected)

            with pytest.raises(ValueError):
                df.groupby([pd.Grouper(freq='1M', level='foo'), 'Buyer']).sum()

            # multi names
            df = df.copy()
            df['Date'] = df.index + pd.offsets.MonthEnd(2)
            result = df.groupby([pd.Grouper(freq='1M', key='Date'),
                                 'Buyer']).sum()
            expected = DataFrame({
                'Buyer':
                'Carl Joe Mark'.split(),
                'Quantity': [10, 18, 3],
                'Date': [
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                    datetime(2013, 11, 30, 0, 0),
                ]
            }).set_index(['Date', 'Buyer'])
            assert_frame_equal(result, expected)

            # error as we have both a level and a name!
            with pytest.raises(ValueError):
                df.groupby(
                    [pd.Grouper(freq='1M', key='Date', level='Date'),
                     'Buyer']).sum()

            # single groupers
            expected = DataFrame({
                'Quantity': [31],
                'Date': [datetime(2013, 10, 31, 0, 0)]
            }).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M')).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M')]).sum()
            assert_frame_equal(result, expected)

            expected = DataFrame({
                'Quantity': [31],
                'Date': [datetime(2013, 11, 30, 0, 0)]
            }).set_index('Date')
            result = df.groupby(pd.Grouper(freq='1M', key='Date')).sum()
            assert_frame_equal(result, expected)

            result = df.groupby([pd.Grouper(freq='1M', key='Date')]).sum()
            assert_frame_equal(result, expected)

        # GH 6764 multiple grouping with/without sort
        df = DataFrame({
            'date':
            pd.to_datetime([
                '20121002', '20121007', '20130130', '20130202', '20130305',
                '20121002', '20121207', '20130130', '20130202', '20130305',
                '20130202', '20130305'
            ]),
            'user_id': [1, 1, 1, 1, 1, 3, 3, 3, 5, 5, 5, 5],
            'whole_cost':
            [1790, 364, 280, 259, 201, 623, 90, 312, 359, 301, 359, 801],
            'cost1': [12, 15, 10, 24, 39, 1, 0, 90, 45, 34, 1, 12]
        }).set_index('date')

        for freq in ['D', 'M', 'A', 'Q-APR']:
            expected = df.groupby('user_id')['whole_cost'].resample(
                freq).sum().dropna().reorder_levels(
                    ['date', 'user_id']).sort_index().astype('int64')
            expected.name = 'whole_cost'

            result1 = df.sort_index().groupby(
                [pd.TimeGrouper(freq=freq), 'user_id'])['whole_cost'].sum()
            assert_series_equal(result1, expected)

            result2 = df.groupby([pd.TimeGrouper(freq=freq),
                                  'user_id'])['whole_cost'].sum()
            assert_series_equal(result2, expected)
Example #4
    train = pd.merge(train, coord_stats, how='left', on=gby_cols)
    test = pd.merge(test, coord_stats, how='left', on=gby_cols)

group_freq = '60min'
df_all = pd.concat((train, test))[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
train.loc[:, 'pickup_datetime_group'] = train['pickup_datetime'].dt.round(group_freq)
test.loc[:, 'pickup_datetime_group'] = test['pickup_datetime'].dt.round(group_freq)

# Count trips over 60min
df_counts = df_all.set_index('pickup_datetime')[['id']].sort_index()
df_counts['count_60min'] = df_counts.isnull().rolling(group_freq).count()['id']
train = train.merge(df_counts, on='id', how='left')
test = test.merge(df_counts, on='id', how='left')

# Count how many trips are going to each cluster over time
dropoff_counts = df_all \
    .set_index('pickup_datetime') \
    .groupby([pd.TimeGrouper(group_freq), 'dropoff_cluster']) \
    .agg({'id': 'count'}) \
    .reset_index().set_index('pickup_datetime') \
    .groupby('dropoff_cluster').rolling('240min').mean() \
    .drop('dropoff_cluster', axis=1) \
    .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
    .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': 'dropoff_cluster_count'})

train['dropoff_cluster_count'] = train[['pickup_datetime_group', 'dropoff_cluster']].merge(dropoff_counts, on=['pickup_datetime_group', 'dropoff_cluster'], how='left')['dropoff_cluster_count'].fillna(0)
test['dropoff_cluster_count'] = test[['pickup_datetime_group', 'dropoff_cluster']].merge(dropoff_counts, on=['pickup_datetime_group', 'dropoff_cluster'], how='left')['dropoff_cluster_count'].fillna(0)

# Count how many trips are going from each cluster over time
df_all = pd.concat((train, test))[['id', 'pickup_datetime', 'pickup_cluster', 'dropoff_cluster']]
pickup_counts = df_all \
    .set_index('pickup_datetime') \
    .groupby([pd.TimeGrouper(group_freq), 'pickup_cluster']) \
    .agg({'id': 'count'}) \
    .reset_index().set_index('pickup_datetime') \
    .groupby('pickup_cluster').rolling('240min').mean() \
    .drop('pickup_cluster', axis=1) \
    .reset_index().set_index('pickup_datetime').shift(freq='-120min').reset_index() \
    .rename(columns={'pickup_datetime': 'pickup_datetime_group', 'id': 'pickup_cluster_count'})

train['pickup_cluster_count'] = train[['pickup_datetime_group', 'pickup_cluster']].merge(pickup_counts, on=['pickup_datetime_group', 'pickup_cluster'], how='left')['pickup_cluster_count'].fillna(0)
test['pickup_cluster_count'] = test[['pickup_datetime_group', 'pickup_cluster']].merge(pickup_counts, on=['pickup_datetime_group', 'pickup_cluster'], how='left')['pickup_cluster_count'].fillna(0)

# For this particular problem we can add OSRM ([Open Source Routing Machine](http://project-osrm.org/ 
# "OSRM")) features. This data contains the fastest routes from specific starting points in NY.
fr1 = pd.read_csv('../input/new-york-city-taxi-with-osrm/fastest_routes_train_part_1.csv', usecols=['id', 'total_distance', 'total_travel_time', 'number_of_steps'])
fr2 = pd.read_csv('../input/new-york-city-taxi-with-osrm/fastest_routes_train_part_2.csv', usecols=['id', 'total_distance', 'total_travel_time', 'number_of_steps'])
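# A minimal follow-up sketch of attaching these OSRM features to the trips by 'id'
# (the test-set filename below is an assumption based on the same dataset layout):
fr = pd.concat((fr1, fr2))
train = train.merge(fr, how='left', on='id')
fr_test = pd.read_csv('../input/new-york-city-taxi-with-osrm/fastest_routes_test.csv', usecols=['id', 'total_distance', 'total_travel_time', 'number_of_steps'])
test = test.merge(fr_test, how='left', on='id')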
Example #5
 def generatePlots(self):
     for key in self.hashtagdict:
         print key
         filename = key + '_sentiment.csv'
         path = '/home/ubuntu/cv/aerial/DeepNetsEO/DeepNetsForEO/nlp/' + self.base_folder + '/' + filename
         show_df = pd.read_csv(path,
                               names=['datetime', 'username', 'tweet'])
         show_df.dropna(inplace=True)
         show_df.drop_duplicates(inplace=True)
         show_df['sentiment'] = show_df.tweet.apply(
             self.get_tweet_sentiment)
         show_df.datetime = show_df.datetime.apply(parser.parse)
         show_df.set_index(show_df.datetime, inplace=True)
         show_grouped = show_df.groupby([
             pd.TimeGrouper(freq='D'),
             'sentiment',
         ]).size()
         show_pct = show_grouped.groupby(
             level=0).apply(lambda x: 100 * x / float(x.sum()))
         show_NNP = show_pct.swaplevel(
             i=-1, j=-2).sort_values().unstack().transpose()
         if show_NNP.shape[1] < 3:
             continue
         if 'Negative' in show_NNP.columns and 'Positive' in show_NNP.columns:
             show_NNP['NetPositive'] = show_NNP['Positive'] - show_NNP[
                 'Negative']
         elif 'Negative' in show_NNP.columns:
             show_NNP['NetPositive'] = -show_NNP['Negative']
         elif 'Positive' in show_NNP.columns:
             show_NNP['NetPositive'] = show_NNP['Positive']
         show_grouped = show_grouped.swaplevel(
             i=-1, j=-2).sort_values().unstack().transpose()
         if 'Negative' in show_NNP.columns and 'Positive' in show_NNP.columns and 'Neutral' in show_NNP.columns:
             show_grouped[
                 'TotalTweets'] = show_grouped['Positive'] + show_grouped[
                     'Negative'] + show_grouped['Neutral']
         elif 'Negative' in show_NNP.columns and 'Neutral' in show_NNP.columns:
             show_grouped['TotalTweets'] = show_grouped[
                 'Negative'] + show_grouped['Neutral']
         elif 'Positive' in show_NNP.columns and 'Neutral' in show_NNP.columns:
             show_grouped['TotalTweets'] = show_grouped[
                 'Positive'] + show_grouped['Neutral']
         elif 'Positive' in show_NNP.columns and 'Negative' in show_NNP.columns:
             show_grouped['TotalTweets'] = show_grouped[
                 'Positive'] + show_grouped['Negative']
         elif 'Negative' in show_NNP.columns:
             show_grouped['TotalTweets'] = show_grouped['Negative']
         elif 'Positive' in show_NNP.columns:
             show_grouped['TotalTweets'] = show_grouped['Positive']
         elif 'Neutral' in show_NNP.columns:
             show_grouped['TotalTweets'] = show_grouped['Neutral']
         show_NNP['TotalTweets'] = show_grouped['TotalTweets']
         fig_show, ax1_show = plt.subplots()
         ax2_show = ax1_show.twinx()
         if 'NetPositive' in show_NNP.columns:
             ax1_show.plot(show_NNP.NetPositive, 'g', linewidth=5)
         ax2_show.bar(show_NNP.index,
                      show_NNP.TotalTweets,
                      align='center',
                      alpha=0.5)
         ax1_show.set_ylim([-100, 100])
         ax1_show.set_xlabel('Date')
         ax1_show.set_ylabel('Net Positive Sentiment %')
         ax2_show.set_ylabel('Total Tweets')
         fig_show.autofmt_xdate()
         plt.title(key)
Example #6
def api_get_league_table_summarised(leagueTableId):
    """
    {
        "doc": {
            "title": "league table summarised help",
            "body": "<p> Obtain the sumarized of the league table </p>"
        },
        "GET": {
            "label": "Obtain the league_table sumarized",
            "params":[{"name": "leagueTableId", "type":"string", "required":"true", "doc":"id of the leage_table"},
                      {"name": "period", "type":"string", "required":"false", "info":"the period to sumarize", "values": ["D", "W", "M", "Y"]},
                      {"name": "type", "type":"list", "required":"false", "info":"the field to sumarize"}]
        }
    }
    """
    companyId = g.get("auth_value")
    # params from url
    period = request.args.get('period', 'M')
    type = request.args.get('type', ['savings', 'smileys'])
    if not isinstance(type, list):
        type = type.split(',')  # type=savings,smileys in the url

    periodsAllowed = ['D', 'W', 'M', 'Y']  # Weekly means Monday to Sunday
    period = period[0].upper()
    # fetch the info from mongo
    query = {'companyId': companyId, 'leagueTableId': leagueTableId}
    doc = app.data.driver.db['league_table'].find_one(query, {'_id': 0},
                                                      timeout=False)
    try:
        reporting_Units = doc['reporting_Units']
    except:
        reporting_Units = []

    # fetch the baseline info from mongo and build the result for each one
    res_report = {}
    for reportingUnit in reporting_Units:
        query_reporting = {
            'companyId': companyId,
            'reportingUnitId': reportingUnit
        }
        doc_reporting = app.data.driver.db['reporting_units'].find_one(
            query_reporting, timeout=False)
        if doc_reporting:
            modelling_Units = doc_reporting['modelling_Units']
            res_report[reportingUnit] = []
            for modelUnit in modelling_Units:
                # update_baseline(companyId, modellingUnitId)   # TO DO
                query_baseline = {
                    'companyId': companyId,
                    'modellingUnitId': modelUnit
                }
                doc_baseline = app.data.driver.db['baselines'].find_one(
                    query_baseline, {
                        'prediction': 1,
                        'values': 1,
                        'smileys': 1,
                        'timestamps': 1
                    },
                    timeout=False)
                if doc_baseline:
                    res_parcial = {}

                    # build the dataframe
                    df = pd.DataFrame.from_records({
                        'values':
                        doc_baseline['values'],
                        'smileys':
                        doc_baseline['smileys'],
                        'prediction':
                        doc_baseline['prediction'],
                        'timestamps':
                        doc_baseline['timestamps']
                    })
                    df = df.set_index(pd.DatetimeIndex(df['timestamps']))
                    if df.empty != True and period in periodsAllowed:
                        for typ in type:
                            if typ in doc_baseline.keys() or typ == 'savings':
                                if typ in ['savings', 'values', 'prediction']:
                                    df_grouped = df.groupby(
                                        pd.TimeGrouper(freq=period)).sum()
                                else:
                                    df_grouped = df.groupby(
                                        pd.TimeGrouper(freq=period)).mean()

                                if typ == 'savings':
                                    res_parcial[typ] = df_grouped[
                                        'prediction'] - df_grouped['values']
                                else:
                                    res_parcial[typ] = df_grouped[typ]
                                res_parcial[typ] = res_parcial[typ].where(
                                    (pd.notnull(res_parcial[typ])),
                                    None).tolist()  # replacing nan by None
                            else:
                                res_parcial[typ] = None
                        try:  # if there is any valid type
                            res_parcial['modellingUnitId'] = modelUnit
                            res_parcial[
                                'timestamps'] = df_grouped.index.tolist()
                            res_parcial['number_of_elements'] = df[
                                'values'].groupby(pd.TimeGrouper(
                                    freq=period)).count().tolist()
                        except:
                            for typ in type:
                                res_parcial[typ] = None
                            res_parcial['timestamps'] = None
                            res_parcial['number_of_elements'] = None
                            res_parcial['modellingUnitId'] = modelUnit

                    res_report[reportingUnit].append(res_parcial)

    return send_response('', (res_report, None, None, 200))
Example #7
])

data = d_311_grouped.size().to_frame('311').merge(
    d_c_grouped.size().to_frame('crime'), left_index=True, right_index=True)

plt_311 = np.array(data['311'].apply(int))
plt_crime = np.array(data['crime'].apply(int))
plt.close('all')
sns.regplot(x=plt_311, y=plt_crime)
plt.suptitle('311 v total crime 2010-2016\ngrouped by location')
plt.xlabel('Total 311 complaints')
plt.ylabel('Total reported crime')
plt.savefig('311vcrime.png', format='png')

d_311_grouped = data_311[data_311.Borough == 'MANHATTAN'].groupby(
    by=[pd.TimeGrouper(key='created_date', freq='M'), 'Complaint Type'
        ]).size().to_frame('total')

d_c_grouped = data_crime[data_crime.BORO_NM == 'MANHATTAN'].groupby(
    by=pd.TimeGrouper(key='CMPLNT_FR_DT', freq='M')).size().to_frame('total')

d_c_grouped2 = d_c_grouped.copy()
d_c_grouped2['Complaint Type'] = 'crime'
d_c_grouped2.set_index('Complaint Type', append=True, inplace=True)
d_c_grouped2.index.rename('created_date', level=0, inplace=True)

crimecorr = d_311_grouped.unstack(level=1)['total'].corrwith(
    d_c_grouped2['total'])
corridx = ['Complaint Type'] + list(
    crimecorr[crimecorr > .5].keys()) + ['crime']
def main():

    # path to each document
    # column description file
    desc_fname = abspath(
        '../data/section_1/OPEN_DATA_FIRE_INCIDENTS_FILE_DESCRIPTION.xls')
    # fire incident data file
    data_fname = abspath(
        '../data/section_1/Incidents_Responded_to_by_Fire_Companies.csv')
    # 2010 census data file name
    census_fname = abspath('../data/section_1/census_zipcode_2010.csv')

    df_desc = pd.read_excel(desc_fname)
    df_data = pd.read_csv(data_fname)
    # extract the incident type code from the incident description
    df_data['INCIDENT_TYPE_CODE'] = df_data['INCIDENT_TYPE_DESC'].apply(
        lambda x: x.split(' - ')[0])
    df_census = pd.read_csv(census_fname)

    print('1. Most common incident: ')
    incidents = pd.DataFrame(
        df_data.groupby('INCIDENT_TYPE_DESC')['INCIDENT_TYPE_DESC'].size())
    incidents['incident_ratio'] = incidents['INCIDENT_TYPE_DESC'] / len(
        df_data)
    most_common_incident = incidents.sort_values(
        'incident_ratio').incident_ratio.iloc[-1]
    print(most_common_incident)

    print('\n 2. False calls in Staten island vs Manhattan')
    df_false = df_data[df_data['INCIDENT_TYPE_CODE'] == '710']
    df_false_dest = pd.DataFrame(df_false.groupby('BOROUGH_DESC').size())
    manhattan = df_false_dest[df_false_dest.index ==
                              '1 - Manhattan'].values[0][0]
    staten = df_false_dest[df_false_dest.index ==
                           '3 - Staten Island'].values[0][0]
    false_ratio = staten / manhattan
    print(false_ratio)

    print('\n 3. Most frequent cooking fire hour ratio')
    df_cooking = df_data[[
        'INCIDENT_TYPE_DESC', 'INCIDENT_TYPE_CODE', 'INCIDENT_DATE_TIME'
    ]]
    df_cooking['Hour'] = pd.to_datetime(
        df_cooking['INCIDENT_DATE_TIME']).dt.hour
    hour_count = df_cooking.groupby('Hour')['INCIDENT_TYPE_CODE'].count()
    cooking_count = df_cooking[df_cooking.INCIDENT_TYPE_CODE == '113'].groupby(
        'Hour')['INCIDENT_TYPE_CODE'].count()
    cooking_proba = pd.DataFrame(cooking_count / hour_count)
    max_cooking_fire = cooking_proba.max()[0]
    print(max_cooking_fire)

    print('\n 4. Average number of units in 111 vs 651')
    df_111_651 = df_data[df_data.INCIDENT_TYPE_CODE.isin(['111', '651'])]
    units_111 = df_111_651.groupby(
        'INCIDENT_TYPE_CODE')['UNITS_ONSCENE'].mean()['111']
    units_651 = df_111_651.groupby(
        'INCIDENT_TYPE_CODE')['UNITS_ONSCENE'].mean()['651']
    unit_ratio = units_111 / units_651
    print(unit_ratio)

    print('\n 5. About 111 incidents...')
    df_111 = df_data[df_data.INCIDENT_TYPE_CODE == '111'][[
        'INCIDENT_DATE_TIME', 'ARRIVAL_DATE_TIME', 'ZIP_CODE'
    ]].dropna()
    for c in ['INCIDENT_DATE_TIME', 'ARRIVAL_DATE_TIME']:
        df_111[c] = pd.to_datetime(df_111[c])
    print('5-1. 3rd quartile in difference in call and arrival time')
    df_111['min_diff'] = (df_111.ARRIVAL_DATE_TIME -
                          df_111.INCIDENT_DATE_TIME) / np.timedelta64(1, 'm')
    third_quartile = np.percentile(df_111.min_diff.values, 75)
    print(third_quartile)
    print('5-2. R-square for zip code population vs incidents')
    df_zipcode = pd.DataFrame(index=df_111.ZIP_CODE.unique())
    df_census = df_census.set_index('Zip Code ZCTA')
    df_zipcode['incidents'] = df_111.groupby('ZIP_CODE').size()
    df_zipcode = df_zipcode.merge(df_census, left_index=True, right_index=True)
    df_zipcode = df_zipcode.rename(
        columns={'2010 Census Population': 'population'})
    y = df_zipcode['incidents']
    x = df_zipcode['population']
    slope, intercept, r_value, p_value, std_err = stats.linregress(x, y)
    print(r_value)

    print('\n 6. About CO detectors...')
    df_co = df_data[~df_data.CO_DETECTOR_PRESENT_DESC.isnull()]
    df_co['duration_min'] = pd.to_timedelta(
        pd.to_timedelta(df_co['TOTAL_INCIDENT_DURATION'], unit='s') /
        np.timedelta64(1, 'm'),
        unit='m')
    print('6-1 Incident duration ratios')
    df_co_group = pd.DataFrame(
        df_co.groupby([
            'CO_DETECTOR_PRESENT_DESC',
            pd.TimeGrouper(key='duration_min', freq='10Min')
        ])['duration_min'].size())
    df_co_binned = pd.DataFrame(index=df_co_group.loc['No'].index)
    df_co_binned['no'] = df_co_group.loc['No'] / df_co_group.loc['No'].sum()
    df_co_binned['yes'] = df_co_group.loc['Yes'] / df_co_group.loc['Yes'].sum()
    df_co_binned['ratio'] = df_co_binned['no'] / df_co_binned['yes']
    df_co_binned = df_co_binned.iloc[2:7]
    df_co_binned['mid_bin'] = [25, 35, 45, 55, 65]
    x = df_co_binned.mid_bin
    y = df_co_binned.ratio
    a, b, r, p, stderr = stats.linregress(x, y)
    predicted = a * 39 + b
    print(predicted)
    print('6-2 Chi square statistics likelihood to be longer')
    df_co['long'] = df_co.duration_min.apply(
        lambda x: 'long' if x > pd.Timedelta(minutes=60) else 'short')
    df_co.groupby(['CO_DETECTOR_PRESENT_DESC', 'long']).size().unstack()
    df_co_time = df_co.groupby(['CO_DETECTOR_PRESENT_DESC',
                                'long']).size().unstack()
    statistic, p = stats.chisquare(
        [df_co_time['long']['No'] / df_co_time.loc['No'].sum()],
        [df_co_time['long'].sum() / df_co_time.values.sum()])
    print(statistic)
Example #9
#====================================================================
# group data be weekday and/or 20min bin
#====================================================================

avg_segment_time_week = df_travel_segment.groupby(
    ['day_of_week', 'minute_block', 'link_id']).aggregate(np.average)
avg_route_time_week = df_trajectories.groupby([
    'intersection_id',
    'tollgate_id',
    'day_of_week',
    'minute_block',
]).aggregate(np.average)
avg_route_time_whole = df_trajectories.groupby([
    'intersection_id',
    'tollgate_id',
    pd.TimeGrouper(key="starting_time", freq="20min"),
]).aggregate(np.average)

#===========================================================
#Plot speed of network, averaged over whole data period
#===========================================================
g = load_graph("phase1_road_network.gt")

tmpTxt = g.new_edge_property("string")
tmpWidth = g.new_edge_property("double")
tmpColor = g.new_edge_property("vector<double>")
tmpAvgTime = avg_segment_time_week.mean(
    level=2)  #level=2 means average over same link_id

for e in g.edges():
    #tmpTxt[e] = "%s %.0fm %d" %( eName[e], eLen[e], eLanes[e])
# only use dates we have volume values for
data = data.loc[data.index >= '12-31-1984']

# scale data
data['LogOpen'] = data['LogOpen'] - data['LogOpen'].min()
data['dx'] = data['dx'] - data['dx'].min()

# check everything looks okay
# print(data)

# need to break into input and output data samples
#data = data.resample('B').bfill()       # fill in missing days ( holidays..)
data = data.resample('B').bfill()  # fill in missing days ( holidays..)

weeks_df = [g for n, g in data.groupby(pd.TimeGrouper('W'))]
months_df = [g for n, g in data.groupby(pd.TimeGrouper('M'))]

print("data ", len(data))
print("weeks ", len(weeks_df))
print("months ", len(months_df))

# see if everything looks okay
# print(weeks)
# print("mins", data['LogOpen'].min(), data['dx'].min(), data['LogVolume'].min())
# print("maxs", data['LogOpen'].max(), data['dx'].max(), data['LogVolume'].max())

# convert to numpy matrix
print(len(weeks_df))
weeks = []
for i in range(len(weeks_df)):
Example #11
#    trc = np.zeros(len(weight))
#    for k in range(len(weight)):
#        trc[k] = weight[k]*partial_derivative(portfolioRisk, k, weight)
#    rc = 0
#    for p in range(len(weight)):
#        for q in range(len(weight)):
#            rc += (trc[p] - trc[q])**2
#    return(rc)

indexRets = pd.read_csv('/Users/admin/Desktop/doc/finance/multifactor/data/fof/indexRets.csv')
tradeDate = indexRets['tradeDate']
indexRets['tradeDate'] = pd.to_datetime(indexRets['tradeDate'],format='%Y-%m-%d')
indexRets.set_index('tradeDate', inplace = True)
tradeDate.index = indexRets.index

indexMonthlyRets = indexRets.groupby(pd.TimeGrouper(freq = 'M')).sum()
indexMonthlyRets = np.exp(indexMonthlyRets) - 1  # convert log returns to simple returns

nAsset = len(indexRets.columns)
weight = np.array([1/nAsset]*nAsset)
print(weight)

sess = tf.InteractiveSession()
tfWeight =  tf.Variable(weight, dtype = tf.float32, name = 'weight')
import pdb;pdb.set_trace()
tfWeight = tf.expand_dims(tfWeight,0)
tfCov = tf.placeholder(dtype = tf.float32, shape = (nAsset, nAsset), name = 'retcov')
tfRateO = tf.placeholder(dtype = tf.float32, shape = (nAsset), name = 'retmean')
tfRate = tf.expand_dims(tfRateO,0)
#portfolioCov = tf.matmul(tf.matmul(tf.expand_dims(tfWeight,0), retCov), tf.expand_dims(tfWeight,-1))
portfolioCov = tf.matmul(tf.matmul(tfWeight, tfCov), tf.transpose(tfWeight), name = 'portcov')
Example #12
def upload():
    print request.form
    data = []
    categoryquery = request.form['category']
    locationed = request.form['location']
    print categoryquery, locationed
    clinet = MongoClient('127.0.0.1', 27017)
    conn = clinet["ADM"]["dummy4"]
    brands = conn.find().distinct("Category")
    location = conn.find().distinct("Geo")
    location.append("All")

    if locationed == "All":
        T = conn.find({"Category": categoryquery})
    else:
        T = conn.find({"Category": categoryquery, "Geo": locationed})
    leg = ['Date']
    colname = []
    for x in T:
        data.append(x)
    clinet.close()
    df_all = pd.DataFrame(data)
    #df_all['Date']=df_all['Date'].apply(lambda x: datetime.strptime(x,"%m/%d/%Y").strftime('%m/%d/%Y'))
    #df_all['Date'] = df_all['Date'].apply(lambda x: datetime.strptime(x,"%m/%d/%Y %H:%M"))
    #df_all['Date'] = df_all['Date'].apply(lambda x: datetime.strptime(x,"%m/%d/%y"))
    df_all['Date'] = df_all['Date'].apply(
        lambda x: datetime.strptime(x, "%m/%d/%Y"))
    cards_result = []
    graph = []

    for x in df_all.Brand.unique():
        df = df_all[df_all['Brand'] == x]
        df['predicted_sentiment'] = df['vader_sentiment']
        pos = len(df[df['predicted_sentiment'] == "Positive"])
        neg = len(df[df['predicted_sentiment'] == "Negative"])
        neu = len(df[df['predicted_sentiment'] == "Neutral"])
        price_pos = sum(df[df['predicted_sentiment'] == "Positive"]['price'])
        price_neg = sum(df[df['predicted_sentiment'] == "Negative"]['price'])
        price_neu = sum(df[df['predicted_sentiment'] == "Neutral"]['price'])
        quality_pos = sum(
            df[df['predicted_sentiment'] == "Positive"]['quality'])
        quality_neg = sum(
            df[df['predicted_sentiment'] == "Negative"]['quality'])
        quality_neu = sum(
            df[df['predicted_sentiment'] == "Neutral"]['quality'])
        service_pos = sum(
            df[df['predicted_sentiment'] == "Positive"]['service'])
        service_neg = sum(
            df[df['predicted_sentiment'] == "Negative"]['service'])
        service_neu = sum(
            df[df['predicted_sentiment'] == "Neutral"]['service'])
        cards_result.append({
            "total": len(df_all),
            "brand": x,
            "pos": pos,
            "neg": neg,
            "neu": neu,
            "pp": pos / (pos + neg + neu + 1),
            "price_pos": price_pos,
            "price_neg": price_neg,
            "price_neu": price_neu,
            "quality_pos": qualtiy_pos,
            "quality_neg": qualtiy_neg,
            "quality_neu": qualtiy_neu,
            "service_pos": service_pos,
            "service_neg": service_neg,
            "service_neu": service_neu
        })
        temp = df.set_index('Date').groupby(pd.TimeGrouper('W')).size()
        temp_pos = df[df['predicted_sentiment'] == "Positive"].set_index(
            'Date').groupby(pd.TimeGrouper('W')).size()
        temp_neg = df[df['predicted_sentiment'] == "Negative"].set_index(
            'Date').groupby(pd.TimeGrouper('W')).size()
        temp_neu = df[df['predicted_sentiment'] == "Neutral"].set_index(
            'Date').groupby(pd.TimeGrouper('W')).size()
        temp_pp = temp_pos * 100.0 / (temp + 1)
        graph.append(temp)
        graph.append(temp_pos)
        graph.append(temp_neg)
        graph.append(temp_neu)
        graph.append(temp_pp)
        colname.append(x + "_total")
        colname.append(x + "_pos")
        colname.append(x + "_neg")
        colname.append(x + "_neu")
        colname.append(str(x + "_pp"))
        leg.append(str(x + "_pp"))

    graph = pd.concat(graph, axis=1)
    graph = graph.fillna(0)
    graph.columns = colname
    print graph
    temp = []
    for index, row in graph.iterrows():
        each_week = []
        each_week.append(int(index.strftime("%s")))
        for l in leg[1:]:
            each_week.append(row[l])
        temp.append(each_week)

    return render_template('index.html',
                           legends=leg,
                           data=data,
                           category=brands,
                           location=location,
                           temp=temp,
                           locationtag=locationed,
                           serchtag=categoryquery,
                           card=cards_result)
Example #13
        last_value = trajectoryData_train.loc[k-1,'travel_time']
        next_value = trajectoryData_train.loc[k+1,'travel_time']
        if last_value < 600:
            trajectoryData_train.loc[k, 'travel_time'] = (last_value + next_value)/2.0
        else:
            trajectoryData_train.loc[k, 'travel_time'] = last_value

# COMMAND ----------

trajectoryData_train.describe()

# COMMAND ----------

trajectoryData_train['starting_time'] = ps.to_datetime(trajectoryData_train['starting_time'], format='%Y-%m-%d %H:%M:%S')
trajectoryData_train = trajectoryData_train.set_index(['starting_time'])
trajectoryData_train = trajectoryData_train.groupby([ps.TimeGrouper('20Min'), 'intersection_id', 'tollgate_id']).travel_time.mean().reset_index().rename(columns={'travel_time':'averagetravltime'})
trajectoryData_test['starting_time'] = ps.to_datetime(trajectoryData_test['starting_time'], format="%Y-%m-%d %H:%M:%S")
trajectoryData_test = trajectoryData_test.set_index(['starting_time'])
trajectoryData_test = trajectoryData_test.groupby([ps.TimeGrouper('20Min'), 'intersection_id', 'tollgate_id']).travel_time.mean().reset_index().rename(columns={'travel_time':'averagetravltime'})
print trajectoryData_train.shape,trajectoryData_test.shape

# COMMAND ----------

trajectoryData_train.head()

# COMMAND ----------

trajectoryData_train.shape

# COMMAND ----------
Example #14
appended_data = appended_data.iloc[:, :-1]
appended_data.index = pd.to_datetime(appended_data.index, dayfirst=True)

# filter the data over a specific period (NOTE: we are working in UTC)
start = datetime.now().replace(hour=5, minute=0, second=0, microsecond=0)
end = (datetime.now() + timedelta(days=1)).replace(hour=5,
                                                   minute=0,
                                                   second=0,
                                                   microsecond=0)

filtred_data = appended_data[
    (appended_data.index.get_level_values(0) >= str(start))
    & (appended_data.index.get_level_values(0) <= str(end))]
filtred_data.index = pd.to_datetime(filtred_data.index)

agg_10m = filtred_data.groupby(pd.TimeGrouper(freq='10Min')).aggregate(np.sum)

mat = mat[(mat.index.get_level_values(0) >= str(start))
          & (mat.index.get_level_values(0) <= str(end))]

# compute the centre of mass following Ishizaka ()
sum_D = []
sum_V = []
for i in range(0, len(agg_10m)):
    tt = agg_10m.iloc[i].replace(0, np.nan)
    a = tt.astype(float).values.reshape(32, 32)
    sum_V.append([np.nansum(x) for x in a])
    sum_D.append([np.nansum(x) for x in zip(*a)])

df_D = pd.DataFrame(sum_D)
df_V = pd.DataFrame(sum_V)
Example #15
import pandas as pd

df = pd.read_hdf('balances.h5', 'balances')

print(df.to_string())

print(df.dtypes)

df_day = df.set_index('TimeStamp').groupby(
    [pd.TimeGrouper(freq='10MIN'), 'CURRENCY', 'Source', 'USD', 'BTC']).last()
print(df_day.to_string())
df_day = df_day.groupby(['TimeStamp', 'CURRENCY']).sum()
print(df_day.to_string())
Example #16
data_311 = data_311_raw[data_311_raw.created_date < pd.to_datetime(dt.date(2016,1,1))].dropna()

minlat = data_crime.Latitude.min()
maxlat = data_crime.Latitude.max()
minlon = data_crime.Longitude.min()
maxlon = data_crime.Longitude.max()

latrange = np.arange(minlat, maxlat+0.02, 0.02)
lonrange = np.arange(minlon, maxlon+0.02, 0.02)

data_crime = data_crime[data_crime.LAW_CAT_CD != 'VIOLATION']

d_c_grouped = data_crime.groupby(
    by=[pd.cut(data_crime['Latitude'], latrange),
        pd.cut(data_crime['Longitude'], lonrange),
        pd.TimeGrouper(key='CMPLNT_FR_DT',freq='M')])

d_311_grouped = data_311.groupby(
    by=[pd.cut(data_311['Latitude'], latrange),
        pd.cut(data_311['Longitude'], lonrange),
        pd.TimeGrouper(key='created_date',freq='M')])

max_var_loc = d_c_grouped.size().unstack().var(axis=1).argmax()

data_crime_window = data_crime_raw[data_crime_raw.CMPLNT_FR_DT.between(
    pd.to_datetime(dt.date(2010,1,1)), pd.to_datetime(dt.date(2015,2,1)))].dropna()
data_311_window = data_311_raw[data_311_raw.created_date.between(
    pd.to_datetime(dt.date(2010,1,1)), pd.to_datetime(dt.date(2015,2,1)))].dropna()

d_c_win_grouped = data_crime_window.groupby(
    by=[pd.cut(data_crime_window['Latitude'], latrange),
Example #17
def api_modelling_unit_summarised(modellingUnitId):
    """
    {
        "doc": {
            "title": "modelling unit summarised help",
            "body": "<p> Obtain the sumarized of the modelling unit </p>"
        },
        "GET": {
            "label": "Obtain the modelling unit sumarized",
            "params":[{"name": "modellingUnitId", "type":"string", "required":"true", "doc":"id of the modelling_unit"},
                      {"name": "period", "type":"string", "required":"false", "info":"the period to sumarize", "values": ["D", "W", "M", "Y"]},
                      {"name": "type", "type":"list", "required":"false", "info":"the field to sumarize"}]
        }
    }
    """
    companyId = g.get("auth_value")
    # params from url
    period = request.args.get('period', 'M')
    type = request.args.get('type', ['savings', 'smileys'])
    if not isinstance(type, list):
        type = type.split(',')  # type=savings,smileys in the url

    periodsAllowed = ['D', 'W', 'M', 'Y']  # Weekly means Monday to Sunday
    period = period[0].upper()

    modellingUnitIdList = modellingUnitId.split(';')
    res_final = []
    for modellingUnit in modellingUnitIdList:
        # fetch the info from mongo
        # update_baseline(companyId, modellingUnitId)   # TO DO
        query_baseline = {
            'companyId': companyId,
            'modellingUnitId': modellingUnit
        }
        query_fields = {
            'values': 1,
            'prediction': 1,
            'smileys': 1,
            'timestamps': 1
        }
        doc_baseline = app.data.driver.db['baselines'].find_one(
            query_baseline, query_fields)
        res = {}

        if doc_baseline and 'values' in doc_baseline:
            n = -12 * 7 * 24 * 2
            # build the dataframe
            df = pd.DataFrame.from_records({
                'values':
                doc_baseline['values'][n:],
                'smileys':
                doc_baseline['smileys'][n:],
                'prediction':
                doc_baseline['prediction'][n:],
                'timestamps':
                doc_baseline['timestamps'][n:]
            })
            df = df.set_index(pd.DatetimeIndex(df['timestamps']))
            df = df.drop('timestamps', 1)
            # a list is needed
            if not isinstance(type, list):
                type = [type]
            if df.empty != True and period in periodsAllowed:
                # compute the aggregations for the different value types
                for typ in type:
                    if typ in doc_baseline.keys() or typ == 'savings':
                        if typ in ['savings', 'values', 'prediction']:
                            # filter out negative values
                            df_grouped = df.clip(lower=0)
                            # filter by values >>>
                            # df_grouped = df_grouped[np.abs(df_grouped.prediction-df_grouped.prediction.mean())<=(10*df_grouped.prediction.std())]
                            df_grouped = df_grouped.groupby(
                                pd.TimeGrouper(freq=period)).sum()
                        else:
                            df_grouped = df.groupby(
                                pd.TimeGrouper(freq=period)).mean()
                        # res_parcial['groupedValues'] = df_grouped['value'].tolist()
                        # res_parcial['groupedPrediction'] = df_grouped['prediction'].tolist()
                        if typ == 'savings':
                            res[typ] = df_grouped['prediction'] - df_grouped[
                                'values']
                        else:
                            res[typ] = df_grouped[typ]
                        res[typ] = res[typ].where(
                            (pd.notnull(res[typ])),
                            None).tolist()  # replacing nan by None
                    else:
                        res[typ] = None
                try:  # if there is any valid type
                    res['timestamps'] = df_grouped.index.tolist()
                    res['number_of_elements'] = df['values'].groupby(
                        pd.TimeGrouper(freq=period)).count().dropna().tolist()
                except:
                    for typ in type:
                        res[typ] = None
                    res['timestamps'] = None
                    res['number_of_elements'] = None
            else:
                # res_parcial['groupedValues'] =  None
                # res_parcial['groupedPrediction'] = None
                for typ in type:
                    res[typ] = None
                res['timestamps'] = None
                res['number_of_elements'] = None

        res_final.append(res)

    # return a list or a single element
    res = res_final[0] if len(res_final) < 2 else res_final

    return send_response('', (res, None, None, 200))
Example #18
    def resample(self,
                 freq,
                 dim,
                 how='mean',
                 skipna=None,
                 closed=None,
                 label=None,
                 base=0,
                 keep_attrs=False):
        """Resample this object to a new temporal resolution.

        Handles both downsampling and upsampling. Upsampling with filling is
        not yet supported; if any intervals contain no values in the original
        object, they will be given the value ``NaN``.

        Parameters
        ----------
        freq : str
            String in the '#offset' to specify the step-size along the
            resampled dimension, where '#' is an (optional) integer multipler
            (default 1) and 'offset' is any pandas date offset alias. Examples
            of valid offsets include:

            * 'AS': year start
            * 'QS-DEC': quarterly, starting on December 1
            * 'MS': month start
            * 'D': day
            * 'H': hour
            * 'Min': minute

            The full list of these offset aliases is documented in pandas [1]_.
        dim : str
            Name of the dimension to resample along (e.g., 'time').
        how : str or func, optional
            Used for downsampling. If a string, ``how`` must be a valid
            aggregation operation supported by xarray. Otherwise, ``how`` must be
            a function that can be called like ``how(values, axis)`` to reduce
            ndarray values along the given axis. Valid choices that can be
            provided as a string include all the usual Dataset/DataArray
            aggregations (``all``, ``any``, ``argmax``, ``argmin``, ``max``,
            ``mean``, ``median``, ``min``, ``prod``, ``sum``, ``std`` and
            ``var``), as well as ``first`` and ``last``.
        skipna : bool, optional
            Whether to skip missing values when aggregating in downsampling.
        closed : 'left' or 'right', optional
            Side of each interval to treat as closed.
        label : 'left' or 'right', optional
            Side of each interval to use for labeling.
        base : int, optional
            For frequencies that evenly subdivide 1 day, the "origin" of the
            aggregated intervals. For example, for '24H' frequency, base could
            range from 0 through 23.
        keep_attrs : bool, optional
            If True, the object's attributes (`attrs`) will be copied from
            the original object to the new one.  If False (default), the new
            object will be returned without attributes.

        Returns
        -------
        resampled : same type as caller
            This object resampled.

        References
        ----------

        .. [1] http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
        """
        from .dataarray import DataArray

        RESAMPLE_DIM = '__resample_dim__'
        if isinstance(dim, basestring):
            dim = self[dim]
        group = DataArray(dim, [(RESAMPLE_DIM, dim)], name=RESAMPLE_DIM)
        time_grouper = pd.TimeGrouper(freq=freq,
                                      how=how,
                                      closed=closed,
                                      label=label,
                                      base=base)
        gb = self.groupby_cls(self, group, grouper=time_grouper)
        if isinstance(how, basestring):
            f = getattr(gb, how)
            if how in ['first', 'last']:
                result = f(skipna=skipna, keep_attrs=keep_attrs)
            else:
                result = f(dim=dim.name, skipna=skipna, keep_attrs=keep_attrs)
        else:
            result = gb.reduce(how, dim=dim.name, keep_attrs=keep_attrs)
        result = result.rename({RESAMPLE_DIM: dim.name})
        return result
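# A minimal usage sketch for the method above, assuming `ds` is an xarray Dataset
# with a daily 'time' dimension. The positional (freq, dim, how) signature shown
# here is the older xarray API that routes through pd.TimeGrouper; on that version
# the call would read:
#   monthly = ds.resample('1MS', dim='time', how='mean')
# Current xarray spells the same downsampling as follows:
import numpy as np
import pandas as pd
import xarray as xr

times = pd.date_range('2013-01-01', periods=365, freq='D')
ds = xr.Dataset({'temp': ('time', np.random.rand(len(times)))}, coords={'time': times})
monthly = ds.resample(time='1MS').mean()  # month-start means, equivalent to the call above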
Example #19
def api_get_user_modelling_units_results(userModellingUnitId, period, divider):
    """
    {
        "doc": {
            "title": "user modelling units results help",
            "body": "<p> Obtain the user modelling units results </p>"
        },
        "GET": {
            "label": "Obtain the user modelling units results",
            "params":[{"name": "userModellingUnitId", "type":"string", "required":"true", "doc":"id of the user modelling_unit"},
                      {"name": "period", "type":"string", "required":"false", "info":"the period to sumarize", "values": ["D", "W", "M", "Y"]},
                      {"name": "divider", "type":"string", "required":"false", "info":"the field to divide the information"}]
        }
    }
    """
    companyId = g.get("auth_value")
    ######## start code

    periodsAllowed = ['D', 'W', 'M', 'Y']  # Weekly means Monday to Sunday
    modelling_unit_types = ['electricityConsumption', 'gasConsumption']
    period = period[0].upper()
    # fetch the info from mongo
    query = {
        'companyId': companyId,
        'userModellingUnitId': userModellingUnitId
    }
    doc = app.data.driver.db['user_modelling_units'].find_one(
        query, {'_id': 0})  # timeout=False)
    try:
        building_id = doc['buildings']
    except:
        building_id = []

    buildings_docs = []
    for i, building in enumerate(building_id):
        query_building = {'companyId': companyId, 'buildingId': building}
        doc_building = app.data.driver.db['buildings'].find_one(
            query_building, {
                'buildingId': 1,
                'data.' + divider: 1
            })
        if doc_building:
            buildings_docs.append(doc_building)
            # need to clean the building to obtain the desired dictionary

    clean_building_docs = [{
        "buildingId":
        b['buildingId'],
        divider:
        b['data'][divider] if divider in b['data'] else 'unknown',
    } for b in buildings_docs]

    # create the dataframe of buildings
    building_df = pd.DataFrame.from_records(clean_building_docs)
    # get all divider and group the tables, then filter them by the different groups
    building_divider_values = building_df[divider].unique()
    building_grouped = building_df.groupby(divider)
    # initialize the results variable
    results = {}
    # iterate for all modelling unit types
    for modelling_unit_type in modelling_unit_types:
        baselines_by_divider = pd.DataFrame()
        # and all different divider groups
        for building_divider in building_divider_values:
            # get the buildings of this divider
            buildings_by_divider = building_grouped.get_group(building_divider)
            baselines_by_divider_temp = []
            for building_id in buildings_by_divider.buildingId:
                query_baseline = {
                    'companyId': companyId,
                    'modellingUnitId': building_id + '-' + modelling_unit_type
                }
                doc_baseline = app.data.driver.db['baselines'].find_one(
                    query_baseline, {
                        'P50': 1,
                        'values': 1,
                        'timestamps': 1
                    })
                if (doc_baseline):
                    if len(doc_baseline['values']) == len(
                            doc_baseline['P50']) == len(
                                doc_baseline['timestamps']):
                        df = pd.DataFrame.from_records({
                            'values':
                            doc_baseline['values'],
                            'P50':
                            doc_baseline['P50'],
                            'timestamps':
                            doc_baseline['timestamps']
                        })
                        df = df.set_index(pd.DatetimeIndex(df['timestamps']))
                        df_grouped = df.groupby(
                            pd.TimeGrouper(freq=period)).sum()
                        baselines_by_divider_temp.append(df_grouped)
            if baselines_by_divider_temp:
                baselines_by_divider_temp = pd.concat(
                    baselines_by_divider_temp, axis=1)
                v = baselines_by_divider_temp  # dropna
                try:
                    final = pd.DataFrame.from_records({
                        'P50':
                        v['P50'].sum(axis=1),
                        'values':
                        v['values'].sum(axis=1)
                    })
                except:
                    final = v
            else:
                final = pd.DataFrame()
            final.rename(index=str,
                         columns={
                             'values': 'values-' + building_divider,
                             'P50': 'P50-' + building_divider
                         },
                         inplace=True)
            # print(final.dropna())
            baselines_by_divider = pd.concat(
                [baselines_by_divider, final.dropna()], axis=1)
            baselines_by_divider = baselines_by_divider

        results[modelling_unit_type] = {
            "timestamps":
            baselines_by_divider.index.tolist() if not v.empty else []
        }
        for divider in building_divider_values:
            try:
                results[modelling_unit_type].update({
                    divider: {
                        "values":
                        baselines_by_divider['values-' + divider].tolist()
                        if not v.empty else [],
                        "P50":
                        baselines_by_divider['P50-' + divider].tolist()
                        if not v.empty else []
                    }
                })
            except:
                pass

    return send_response('', (results, None, None, 200))
Example #20
#cPickle.dump(genres,open(outputdir+'gn_genres.pkl','w'))
genres = cPickle.load(open(outputdir+'gn_genres.pkl'))

result = pd.DataFrame(0.,index=daterange,columns=genres['genre1']+genres['genre2']+genres['genre3'])
# if len(done)==0:
#     result = pd.DataFrame(0.,index=daterange,columns=genres['genre1']+genres['genre2']+genres['genre3'])
# else:
#     result = pd.read_pickle(outputdir+'genre_data')

for i,f in enumerate(files):
    user_start = time.time()
    # if f in done:
    #     continue
    df = pd.read_table(f,sep='\t',header=None,names=['item_id','artist_id','scrobble_time'],parse_dates=['scrobble_time']).join(gn,on='item_id',how='left')
    for level in genres:
        vars()['df_'+level] = df.set_index('scrobble_time').groupby([pd.TimeGrouper(freq='D'),level]).count()['item_id'].unstack().reindex(daterange,columns=genres[level])
    concat = pd.concat([df_genre1,df_genre2,df_genre3],axis=1).fillna(0)

    result += concat

    rootLogger.info("{} ({}/{}, {:.1f}, {}, block {})".format(f,i+1,n_users,time.time()-user_start,len(df),idx))
    #time_elapsed = time.time() - start
    # if time_elapsed >= (wall_time-(time_buffer)):
    #     result.to_pickle(outputdir+'genre_data')
    #     sys.exit()

result.to_pickle(outputdir+'genre_data_'+str(idx))


"""
Example #21
0
def estimate_intraday(returns, positions, transactions, EOD_hour=23):
    """
    Intraday strategies will often not hold positions at the day end.
    This attempts to find the point in the day that best represents
    the activity of the strategy on that day, and effectively resamples
    the end-of-day positions with the positions at this point of day.
    The point of day is found by detecting when our exposure in the
    market is at its maximum point. Note that this is an estimate.

    Parameters
    ----------
    returns : pd.Series
        Daily returns of the strategy, noncumulative.
         - See full explanation in create_full_tear_sheet.
    positions : pd.DataFrame
        Daily net position values.
         - See full explanation in create_full_tear_sheet.
    transactions : pd.DataFrame
        Prices and amounts of executed trades. One row per trade.
         - See full explanation in create_full_tear_sheet.

    Returns
    -------
    pd.DataFrame
        Daily net position values, resampled for intraday behavior.
    """

    # Construct DataFrame of transaction amounts
    txn_val = transactions.copy()
    txn_val.index.names = ['date']
    txn_val['value'] = txn_val.amount * txn_val.price
    txn_val = txn_val.reset_index().pivot_table(index='date',
                                                values='value',
                                                columns='symbol').replace(
                                                    np.nan, 0)

    # Cumulate transaction amounts each day
    txn_val['date'] = txn_val.index.date
    txn_val = txn_val.groupby('date').cumsum()

    # Calculate exposure, then take peak of exposure every day
    txn_val['exposure'] = txn_val.abs().sum(axis=1)
    condition = (txn_val['exposure'] == txn_val.groupby(
        pd.TimeGrouper('24H'))['exposure'].transform(max))
    txn_val = txn_val[condition].drop('exposure', axis=1)

    # Compute cash delta
    txn_val['cash'] = -txn_val.sum(axis=1)

    # Shift EOD positions to positions at start of next trading day
    positions_shifted = positions.copy().shift(1).fillna(0)
    starting_capital = positions.iloc[0].sum() / (1 + returns[0])
    positions_shifted.cash[0] = starting_capital

    # Format and add start positions to intraday position changes
    txn_val.index = txn_val.index.normalize()
    corrected_positions = positions_shifted.add(txn_val, fill_value=0)
    corrected_positions.index.name = 'period_close'
    corrected_positions.columns.name = 'sid'

    return corrected_positions
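
The peak-exposure filter above is the one place this example needs pd.TimeGrouper('24H'); pd.Grouper gives the same per-day transform in current pandas. A sketch of just that step, assuming txn_val is the timestamp-indexed frame of per-symbol transaction values constructed above:

import pandas as pd

def keep_peak_exposure_rows(txn_val):
    # Total absolute exposure at each timestamp.
    txn_val = txn_val.copy()
    txn_val['exposure'] = txn_val.abs().sum(axis=1)
    # Within each 24-hour bucket keep the row(s) where exposure peaks;
    # pd.Grouper(freq='24H') replaces the deprecated pd.TimeGrouper('24H').
    daily_max = txn_val.groupby(pd.Grouper(freq='24H'))['exposure'].transform('max')
    return txn_val[txn_val['exposure'] == daily_max].drop('exposure', axis=1)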
Example #22
0
def uniquelogins(sessions):
    """Unique logins per days/weeks/months.

    :return: daily, weekly, monthly
    3 lists of dictionaries of the following format [{'x':epoch, 'y': value},]
    """
    # sessions = LoginSession.query.order_by(LoginSession.started_at.asc()).all()
    if not sessions:
        return [], [], []
    dates = {}
    for session in sessions:
        user = session.user
        # time value is discarded to aggregate on days only
        date = session.started_at.strftime("%Y/%m/%d")

        if date not in dates:
            dates[date] = set()  # we want unique users on a given day
            dates[date].add(user)
        else:
            dates[date].add(user)

    daily = []
    weekly = []
    monthly = []

    for date in sorted(dates.keys()):
        # print u"{} : {}".format(date, len(dates[date]))
        date_epoch = unix_time_millis(datetime.strptime(date, "%Y/%m/%d"))
        daily.append({'x': date_epoch, 'y': len(dates[date])})

    # first_day = data[0]['x']
    # last_day = data[-1]['x']

    daily_serie = pd.Series(dates)
    # convert the index to Datetime type
    daily_serie.index = pd.DatetimeIndex(daily_serie.index)
    # calculate the values instead of users lists
    daily_serie = daily_serie.apply(lambda x: len(x))

    # GroupBy Week/month, Thanks Panda
    weekly_serie = daily_serie \
        .groupby(pd.TimeGrouper(freq='W')) \
        .aggregate(numpysum)
    monthly_serie = daily_serie \
        .groupby(pd.TimeGrouper(freq='M')) \
        .aggregate(numpysum)

    for date, value in six.iteritems(weekly_serie):
        try:
            value = int(value)
        except ValueError:
            continue
        date_epoch = unix_time_millis(date)
        weekly.append({'x': date_epoch, 'y': value})

    for date, value in six.iteritems(monthly_serie):
        try:
            value = int(value)
        except ValueError:
            continue
        date_epoch = unix_time_millis(date)
        monthly.append({'x': date_epoch, 'y': value})

    return daily, weekly, monthly
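
The weekly and monthly rollups above map directly onto resample in current pandas. A sketch, assuming daily_serie is the per-day unique-login count built in the example (the example's numpysum aggregator is replaced by a plain sum here):

weekly_serie = daily_serie.resample('W').sum()
monthly_serie = daily_serie.resample('M').sum()  # newer pandas (2.2+) prefers the 'ME' alias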
Example #23
0
def get_group_members_over_time(req_obj):
    # Establish database connection
    gc.connect_to_database()

    # Determine group guid
    group_guid = get_group_guid(req_obj['url'])

    # Query the database
    group_members = gc.groups.get_group_members(group_guid, cleaned=False)
    group_name = gc.groups.name_from_guid(group_guid)

    # Get mungin'
    # Convert times to datetime objects
    group_members['time_created'] = group_members['time_created'].apply(
        lambda x: pd.to_datetime(x))

    group_members.set_index('time_created', inplace=True)

    group_members = group_members[
        group_members.index > pd.to_datetime('2000-01-01')]

    # Daily
    group_members_daily = group_members['user_name'].groupby(
        pd.TimeGrouper(freq='D')).count().cumsum()
    group_members_daily = group_members_daily.reset_index()

    # If the requested start date predates the dataframe, pad with 0s
    if min(group_members_daily['time_created']) > pd.to_datetime(
            req_obj['start_date']):
        ix = pd.DatetimeIndex(start=pd.to_datetime(req_obj['start_date']),
                              end=max(group_members_daily['time_created']),
                              freq='D')
        group_members_daily = group_members_daily.set_index(
            'time_created').reindex(ix, fill_value=0).reset_index()
        group_members_daily.rename(columns={'index': 'time_created'},
                                   inplace=True)

    # If the requested end date is after the end of the dataframe, pad with last value
    if max(group_members_daily['time_created']) < pd.to_datetime(
            req_obj['end_date']):
        ix = pd.DatetimeIndex(start=pd.to_datetime(req_obj['start_date']),
                              end=pd.to_datetime(req_obj['end_date']),
                              freq='D')
        group_members_daily = group_members_daily.set_index(
            'time_created').reindex(
                ix, fill_value=max(
                    group_members_daily['user_name'])).reset_index()
        group_members_daily.rename(columns={'index': 'time_created'},
                                   inplace=True)

    # Only keep current time selection
    group_members_daily = group_members_daily[
        group_members_daily['time_created'] >= pd.to_datetime(
            req_obj['start_date'])]
    group_members_daily = group_members_daily[
        group_members_daily['time_created'] <= pd.to_datetime(
            req_obj['end_date'])]

    group_members_daily['time_created'] = group_members_daily[
        'time_created'].apply(lambda x: x.strftime('%Y%m%d'))

    # Monthly
    group_members_monthly = group_members['user_name'].groupby(
        pd.TimeGrouper(freq='M')).count().cumsum()
    group_members_monthly = group_members_monthly.reset_index()

    # (monthly) If the requested start date predates the oldest time on the dataframe, pad with 0s
    if min(group_members_monthly['time_created']) > pd.to_datetime(
            req_obj['start_date']):
        ix = pd.DatetimeIndex(start=pd.to_datetime(req_obj['start_date']),
                              end=max(group_members_monthly['time_created']),
                              freq='M')
        group_members_monthly = group_members_monthly.set_index(
            'time_created').reindex(ix, fill_value=0).reset_index()
        group_members_monthly.rename(columns={'index': 'time_created'},
                                     inplace=True)

    # If the requested end date is after the end of the dataframe, pad with last value
    if max(group_members_monthly['time_created']) < pd.to_datetime(
            req_obj['end_date']):
        ix = pd.DatetimeIndex(start=pd.to_datetime(req_obj['start_date']),
                              end=pd.to_datetime(req_obj['end_date']),
                              freq='M')
        group_members_monthly = group_members_monthly.set_index(
            'time_created').reindex(
                ix, fill_value=max(
                    group_members_monthly['user_name'])).reset_index()
        group_members_monthly.rename(columns={'index': 'time_created'},
                                     inplace=True)

    # Only keep current time selection
    group_members_monthly = group_members_monthly[
        group_members_monthly['time_created'] >= pd.to_datetime(
            req_obj['start_date'])]
    group_members_monthly = group_members_monthly[
        group_members_monthly['time_created'] <= pd.to_datetime(
            req_obj['end_date'])]

    group_members_monthly['time_created'] = group_members_monthly[
        'time_created'].apply(lambda x: x.strftime('%Y%m%d'))

    send_obj = {
        'monthly': {
            'dates': group_members_monthly['time_created'].values.tolist(),
            'users': group_members_monthly['user_name'].values.tolist()
        },
        'daily': {
            'dates': group_members_daily['time_created'].values.tolist(),
            'users': group_members_daily['user_name'].values.tolist(),
        },
        'group_name': group_name
    }

    print(json.dumps(send_obj))
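
Two calls in Example #23 are gone from current pandas: pd.TimeGrouper, and the pd.DatetimeIndex(start=..., end=..., freq=...) constructor, whose role is now played by pd.date_range. A sketch of the daily cumulative member count with both replacements, assuming group_members is indexed by time_created as above and the start/end dates are parseable strings:

import pandas as pd

def daily_member_counts(group_members, start_date, end_date):
    # Cumulative member count per day; pd.Grouper replaces pd.TimeGrouper.
    daily = (group_members['user_name']
             .groupby(pd.Grouper(freq='D'))
             .count()
             .cumsum())
    # pd.date_range replaces the removed DatetimeIndex(start=..., end=...) form.
    ix = pd.date_range(start=start_date, end=end_date, freq='D')
    # Pad days before the data with 0 and days after it with the last known value.
    return daily.reindex(ix).ffill().fillna(0)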
Example #24
0
    def update_data_frame(self):
        num_lines = sum(1 for line in open(self.jmeter_results_file))
        if self.start_line < num_lines - 10:
            read_lines = num_lines - self.start_line - 10
            #if self.file_size < os.path.getsize(self.jmeter_results_file):
            #self.file_size = os.path.getsize(self.jmeter_results_file)
            df = pd.read_csv(self.jmeter_results_file,
                             index_col=0,
                             low_memory=False,
                             skiprows=self.start_line,
                             nrows=read_lines)
            df.columns = [
                'average', 'URL', 'responseCode', 'success', 'threadName',
                'failureMessage', 'grpThreads', 'allThreads'
            ]
            df = df[~df['URL'].str.contains('exclude_')]
            df.index = pd.to_datetime(dateconv((df.index.values / 1000)))
            # update start line for the next parse
            self.start_line = self.start_line + read_lines

            group_by_response_codes = df.groupby('responseCode')
            add_df = pd.DataFrame()
            add_df['count'] = group_by_response_codes.success.count()
            #add_df['thread_count'] = group_by_response_codes['grpThreads'].nunique()
            add_df = add_df.fillna(0)
            add_df = add_df.reset_index()
            add_df.columns = ['response_code', 'count']
            df1 = pd.concat([
                self.response_codes_frame, add_df
            ]).groupby('response_code')['count'].sum().reset_index()
            self.response_codes_frame = df1
            #create aggregate table
            group_by_url = df.groupby('URL')  # group date by URLs
            add_aggregate_data = group_by_url.aggregate({
                'average': np.mean
            }).round(1)
            add_aggregate_data['maximum'] = group_by_url.average.max().round(1)
            add_aggregate_data['minimum'] = group_by_url.average.min().round(1)
            add_aggregate_data['count'] = group_by_url.success.count().round(1)
            add_aggregate_data['errors'] = df[(
                df.success == False)].groupby('URL')['success'].count()
            add_aggregate_data = add_aggregate_data.fillna(0)
            add_aggregate_data = add_aggregate_data.reset_index()
            add_aggregate_data.columns = [
                'URL', 'average', 'maximum', 'minimum', 'count', 'errors'
            ]
            #???
            df1 = pd.concat([self.aggregate_frame, add_aggregate_data
                             ]).groupby('URL')['average'].mean().reset_index()
            df2 = pd.concat([self.aggregate_frame, add_aggregate_data
                             ]).groupby('URL')['count',
                                               'errors'].sum().reset_index()
            df3 = pd.concat([self.aggregate_frame, add_aggregate_data
                             ]).groupby('URL')['maximum'].max().reset_index()
            df4 = pd.concat([self.aggregate_frame, add_aggregate_data
                             ]).groupby('URL')['minimum'].min().reset_index()
            result_df = pd.merge(df1, df2, how='inner', on='URL')
            result_df = pd.merge(result_df, df3, how='inner', on='URL')
            result_df = pd.merge(result_df, df4, how='inner', on='URL')
            self.aggregate_frame = result_df
            add_df2 = pd.DataFrame()
            gr_by_minute = df.groupby(
                pd.TimeGrouper(freq='1Min'))  # group data by minute
            add_df2['average'] = gr_by_minute.average.mean()
            add_df2['median'] = gr_by_minute.average.median()
            add_df2['count'] = gr_by_minute.success.count()
            add_df2['errors_count'] = df[(df.success == False)].groupby(
                pd.TimeGrouper(freq='1Min'))['success'].count()
            #add_df2['thread_count'] = gr_by_minute['grpThreads'].nunique()
            #add_df2['rps'] = gr_by_minute.success.count()/60
            add_df2 = add_df2.fillna(0)
            add_df2 = add_df2.reset_index()
            add_df2.columns = [
                'time', 'average', 'median', 'count', 'errors_count'
            ]
            df1 = pd.concat([self.data_frame, add_df2
                             ]).groupby('time')['average',
                                                'median'].mean().reset_index()
            df2 = pd.concat(
                [self.data_frame,
                 add_df2]).groupby('time')['count',
                                           'errors_count'].sum().reset_index()
            #df3 = pd.concat([self.data_frame,add_df2]).groupby('time')['thread_count'].max().reset_index()
            result_df = pd.merge(df1, df2, how='inner', on='time')
            #result_df = pd.merge(result_df1, df3, how='inner',on='time')
            result_df['rps'] = result_df['count'] / 60
            #print 'result_df'
            self.data_frame = result_df
            #print self.data_frame
        else:
            logger.info(".jtl file was not changed")
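
The per-minute statistics above are built with pd.TimeGrouper(freq='1Min'); resample produces the same buckets. A sketch, assuming df is the parsed JMeter frame from the example, with a datetime index and 'average'/'success' columns:

import pandas as pd

# df is assumed to be the parsed JMeter frame from the example above.
per_minute = pd.DataFrame({
    'average': df['average'].resample('1Min').mean(),
    'median': df['average'].resample('1Min').median(),
    'count': df['success'].resample('1Min').count(),
    'errors_count': df.loc[df['success'] == False, 'success'].resample('1Min').count(),
}).fillna(0).reset_index().rename(columns={'index': 'time'})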
Example #25
0
for junction in df_train.Junction.unique():
    fig = plt.figure(junction, figsize=(10, 2))
    df_train[df_train.Junction == junction].resample(
        "D")["Vehicles"].count().plot()
    plt.title("Existing values for Junction " + str(junction))
    plt.show()

# ** -> No missing hourly observations for any junction. **

# In[16]:

# We plot vehicle observations per junction
for junc in df_train.Junction.unique():
    fig = plt.figure(junc, figsize=(20, 3))
    df_train[df_train.Junction == junc].groupby(
        pd.TimeGrouper('D')).mean().Vehicles.plot()
    plt.title('Vehicles of Junction {}'.format(junc))
plt.show()

# ** -> The first three junctions cover the same range of dates. The last junction (number 4) has fewer observations, starting only in Jan. 2017. The hourly frequency is unchanged. **
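
The per-junction daily mean above relies on pd.TimeGrouper('D'); resample('D') is the current spelling. A sketch, assuming df_train has a DatetimeIndex and the 'Junction' and 'Vehicles' columns used in this notebook:

import matplotlib.pyplot as plt

for junc in df_train.Junction.unique():
    fig = plt.figure(junc, figsize=(20, 3))
    # resample('D').mean() replaces groupby(pd.TimeGrouper('D')).mean()
    df_train.loc[df_train.Junction == junc, 'Vehicles'].resample('D').mean().plot()
    plt.title('Vehicles of Junction {}'.format(junc))
plt.show()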

# ### Stationarity Checks for each TS :

# In[48]:

# Source of code : https://www.analyticsvidhya.com/blog/2016/02/time-series-forecasting-codes-python/
from statsmodels.tsa.stattools import adfuller


def test_stationarity(idx, timeseries):
    fig = plt.figure(idx, figsize=(20, 5))
Example #26
0
    def pageviews(self, URLs, start_date='30daysAgo', end_date='today', intervals=False):
        """ Return a dataframe containing views on a particular page.
            First argument can be a URL string or list of URLs. """
        def strip_domain(url):
            return url.replace('https://gcconnex.gc.ca/','').replace('https://gccollab.ca/','').replace('www.gcpedia.gc.ca/','')
        metric = 'ga:pageviews'
        dimension = 'ga:date'
        # Strip the domain from the URL
        if type(URLs) == list:
            URLs = list(map(lambda x: strip_domain(x), URLs ))
        else:
            URLs = [strip_domain(URLs)]
        
        # Construct filter clauses for both requests
        filter_clause = self._construct_filter_clause(metric, 'ga:pagePath', URLs)
        # Should first construct report for found pagePaths. Print to ensure nothing is wonky.
        # Construct report for stats.
        response_names = self._make_report(start_date, end_date, metric, 'ga:pagePath', filter_clause, order='views')
        response_stats = self._make_report(start_date, end_date, metric, 'ga:date', filter_clause, order='date')
        
        df_names = self._parse_response_into_df(response_names)
        df = self._parse_response_into_df(response_stats)
        df.columns = ['date', 'pageviews']
        df['date'] = df['date'].apply(lambda x: pd.to_datetime(x, format='%Y%m%d'))
        
        df.set_index('date', inplace=True)
        
        
        idx = pd.date_range(start_date, end_date)
        #code.interact(local=locals())
        df = df.reindex(idx, fill_value=0)
        df = df[df.index.weekday < 5] # Should work now
        df['pageviews'] = df['pageviews'].astype(int)
        if intervals == True: # Create both monthly and daily
            df_month = df.groupby(pd.TimeGrouper(freq='M')).sum()
            #code.interact(local=locals())
            df_month.reset_index(inplace=True)
            df_month.rename(columns={'index':'date'}, inplace=True)
            df_month['pageviews'] = df_month['pageviews'].astype(str)
            df_month['date'] = df_month['date'].apply(lambda x: x.strftime('%Y%m%d'))
            
        df.reset_index(inplace=True)
        #code.interact(local=locals()) 
        df.rename(columns={'index':'date'}, inplace=True)
        df['pageviews'] = df['pageviews'].astype(str)
        df['date'] = df['date'].apply(lambda x: x.strftime('%Y%m%d'))

        # Build lists from columns for C3 timechart format
        if intervals == True:
            return {
                'daily': {
                    'dates': df['date'].values.tolist(),
                    'pageviews': df['pageviews'].values.tolist()
                },
                'monthly': {
                    'dates': df_month['date'].values.tolist(),
                    'pageviews': df_month['pageviews'].values.tolist()
                }
            }
        else:
            return {
                'dates': df['date'].values.tolist(),
                'pageviews': df['pageviews'].values.tolist()
            }
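
The monthly rollup in Example #26, df.groupby(pd.TimeGrouper(freq='M')).sum(), is a one-line resample in current pandas. A sketch, assuming df is the daily pageview frame with a DatetimeIndex built above:

df_month = df.resample('M').sum()  # newer pandas (2.2+) prefers the 'ME' alias
df_month.index.name = 'date'
df_month = df_month.reset_index()
df_month['date'] = df_month['date'].dt.strftime('%Y%m%d')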
Example #27
0
def plotYearMonthStatsHb(data):
    #pd.groupby(b,by=[b.index.month,b.index.year])
    data.groupby(pd.TimeGrouper(freq='M')).mean().plot()
    sns.plt.show()
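
Both calls in Example #27 have aged: pd.TimeGrouper is gone from pandas and sns.plt was removed from seaborn. A sketch of the same monthly-mean plot with current APIs, assuming data is a numeric Series or DataFrame with a DatetimeIndex:

import matplotlib.pyplot as plt

def plotYearMonthStatsHb(data):
    data.resample('M').mean().plot()  # or data.groupby(pd.Grouper(freq='M')).mean()
    plt.show()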
Example #28
0
In [ ]:
dti = pd.date_range(start='2015-01-01', end='2015-12-31', freq='B') 
s = pd.Series(np.random.rand(len(dti)), index=dti)
34. Find the sum of the values in s for every Wednesday.

In [ ]:
s[s.index.weekday == 2].sum()
35. For each calendar month in s, find the mean of values.

In [ ]:
s.resample('M').mean()
36. For each group of four consecutive calendar months in s, find the date on which the highest value occurred.

In [ ]:
s.groupby(pd.TimeGrouper('4M')).idxmax()
37. Create a DateTimeIndex consisting of the third Thursday in each month for the years 2015 and 2016.

In [ ]:
pd.date_range('2015-01-01', '2016-12-31', freq='WOM-3THU')
Cleaning Data
Making a DataFrame easier to work with
Difficulty: easy/medium

It happens all the time: someone gives you data containing malformed strings, Python lists and missing data. How do you tidy it up so you can get on with the analysis?

Take this monstrosity as the DataFrame to use in the following puzzles:

df = pd.DataFrame({'From_To': ['LoNDon_paris', 'MAdrid_miLAN', 'londON_StockhOlm', 
                               'Budapest_PaRis', 'Brussels_londOn'],
              'FlightNumber': [10045, np.nan, 10065, np.nan, 10085],
Example #29
0
    def test_TimeGrouper(self):
        with tm.assert_produces_warning(FutureWarning,
                                        check_stacklevel=False):
            pd.TimeGrouper(freq='D')
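
As the test above shows, pd.TimeGrouper raised only a FutureWarning in pandas 0.21+; it was removed outright in pandas 1.0. pd.Grouper(freq=...) is the drop-in replacement, and plain resample is often shorter. A sketch, reusing the business-day series s defined in the puzzles above:

import numpy as np
import pandas as pd

dti = pd.date_range(start='2015-01-01', end='2015-12-31', freq='B')
s = pd.Series(np.random.rand(len(dti)), index=dti)

s.groupby(pd.Grouper(freq='4M')).idxmax()  # date of the highest value in each 4-month block
s.groupby(pd.Grouper(freq='M')).mean()     # same result as s.resample('M').mean()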
Example #30
0
    def resample_benchmark_return(self, frequence):
        return self.benchmark_return.groupby(pd.TimeGrouper(
            freq=frequence)).agg(stats.cum_returns_final).dropna()
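
Example #30's per-period cumulative return translates the same way; a sketch as a standalone function, assuming benchmark_return is a Series of simple returns with a DatetimeIndex and that empyrical's stats.cum_returns_final is importable, as the example implies:

import pandas as pd

def resample_benchmark_return(benchmark_return, frequence):
    # pd.Grouper replaces pd.TimeGrouper; stats.cum_returns_final compounds the
    # simple returns within each period, as in the example above.
    return (benchmark_return
            .groupby(pd.Grouper(freq=frequence))
            .agg(stats.cum_returns_final)
            .dropna())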