def behavior(start_date, end_date, account_name, property_name, profile_name, max_results):
    # Let pandas fetch the data from Google Analytics; returns a generator of chunks
    df_chunks = ga.read_ga(secrets=client_secrets,
                           account_name=account_name,
                           property_name=property_name,
                           profile_name=profile_name,
                           dimensions=['date', 'hour', 'minute'],
                           metrics=['pageviews'],
                           start_date=start_date,
                           end_date=end_date,
                           index_col=0,
                           parse_dates={'datetime': ['date', 'hour', 'minute']},
                           date_parser=lambda x: datetime.strptime(x, '%Y%m%d %H %M'),
                           max_results=max_results,
                           chunksize=10000)

    # Concatenate the chunks into a DataFrame and get the number of rows
    df = pd.concat(df_chunks)
    num_rows = df.shape[0]

    # Resample into half-hour buckets
    df = df.resample('30Min', how='sum')

    # Create the behavior table (half-hour x weekday)
    grouped = df.groupby([df.index.time, df.index.weekday])
    behavior = grouped['pageviews'].aggregate(np.sum).unstack()

    # Make sure the table covers all hours and weekdays
    behavior = behavior.reindex(index=pd.date_range("00:00", "23:30", freq="30min").time,
                                columns=range(7))
    behavior.columns = ['MO', 'TU', 'WE', 'TH', 'FR', 'SA', 'SU']

    return behavior, num_rows
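A minimal usage sketch for the function above, assuming the module-level imports below and a valid OAuth client file; the account, property, and profile names here are hypothetical:

import numpy as np
import pandas as pd
import pandas.io.ga as ga
from datetime import datetime

client_secrets = 'client_secrets.json'  # hypothetical path to the OAuth client file

table, n = behavior('2015-01-01', '2015-01-31',
                    account_name='My Account',
                    property_name='My Property',
                    profile_name='All Web Site Data',
                    max_results=100000)
print(table.head())  # 48 half-hour rows x MO..SU columns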
def test_getdata(self):
    try:
        import httplib2
        from pandas.io.ga import GAnalytics, read_ga
        from pandas.io.auth import AuthenticationConfigError
    except ImportError:
        raise nose.SkipTest

    try:
        end_date = datetime.now()
        start_date = end_date - pd.offsets.Day() * 5
        end_date = end_date.strftime('%Y-%m-%d')
        start_date = start_date.strftime('%Y-%m-%d')

        reader = GAnalytics()
        df = reader.get_data(
            metrics=['avgTimeOnSite', 'visitors', 'newVisits', 'pageviewsPerVisit'],
            start_date=start_date,
            end_date=end_date,
            dimensions=['date', 'hour'],
            parse_dates={'ts': ['date', 'hour']})

        assert isinstance(df, DataFrame)
        assert isinstance(df.index, pd.DatetimeIndex)
        assert len(df) > 1
        assert 'date' not in df
        assert 'hour' not in df
        assert df.index.name == 'ts'
        assert 'avgTimeOnSite' in df
        assert 'visitors' in df
        assert 'newVisits' in df
        assert 'pageviewsPerVisit' in df

        df2 = read_ga(
            metrics=['avgTimeOnSite', 'visitors', 'newVisits', 'pageviewsPerVisit'],
            start_date=start_date,
            end_date=end_date,
            dimensions=['date', 'hour'],
            parse_dates={'ts': ['date', 'hour']})

        assert_frame_equal(df, df2)
    except AuthenticationConfigError:
        raise nose.SkipTest
    except httplib2.ServerNotFoundError:
        # Skip only if we are genuinely offline; if google.com is reachable,
        # re-raise the original error.
        try:
            h = httplib2.Http()
            response, content = h.request("http://www.google.com")
            raise
        except httplib2.ServerNotFoundError:
            raise nose.SkipTest
def test_segment(self):
    try:
        end_date = datetime.now()
        start_date = end_date - pd.offsets.Day() * 5
        end_date = end_date.strftime('%Y-%m-%d')
        start_date = start_date.strftime('%Y-%m-%d')

        reader = GAnalytics()
        df = reader.get_data(
            metrics=['avgTimeOnSite', 'visitors', 'newVisits', 'pageviewsPerVisit'],
            start_date=start_date,
            end_date=end_date,
            segment=-2,
            dimensions=['date', 'hour'],
            parse_dates={'ts': ['date', 'hour']},
            index_col=0)

        self.assertIsInstance(df, pd.DataFrame)
        self.assertIsInstance(df.index, pd.DatetimeIndex)
        self.assertGreater(len(df), 1)
        self.assertTrue('date' not in df)
        self.assertTrue('hour' not in df)
        self.assertEqual(df.index.name, 'ts')
        self.assertTrue('avgTimeOnSite' in df)
        self.assertTrue('visitors' in df)
        self.assertTrue('newVisits' in df)
        self.assertTrue('pageviewsPerVisit' in df)

        # dynamic
        df = read_ga(
            metrics=['avgTimeOnSite', 'visitors', 'newVisits', 'pageviewsPerVisit'],
            start_date=start_date,
            end_date=end_date,
            segment="source=~twitter",
            dimensions=['date', 'hour'],
            parse_dates={'ts': ['date', 'hour']},
            index_col=0)

        self.assertIsInstance(df, pd.DataFrame)
        self.assertIsInstance(df.index, pd.DatetimeIndex)
        self.assertGreater(len(df), 1)
        self.assertTrue('date' not in df)
        self.assertTrue('hour' not in df)
        self.assertEqual(df.index.name, 'ts')
        self.assertTrue('avgTimeOnSite' in df)
        self.assertTrue('visitors' in df)
        self.assertTrue('newVisits' in df)
        self.assertTrue('pageviewsPerVisit' in df)
    except AuthenticationConfigError:
        raise nose.SkipTest("authentication error")
def test_segment(self):
    try:
        end_date = datetime.now()
        start_date = end_date - pd.offsets.Day() * 5
        end_date = end_date.strftime('%Y-%m-%d')
        start_date = start_date.strftime('%Y-%m-%d')

        reader = GAnalytics()
        df = reader.get_data(
            metrics=['avgTimeOnSite', 'visitors', 'newVisits', 'pageviewsPerVisit'],
            start_date=start_date,
            end_date=end_date,
            segment=-2,
            dimensions=['date', 'hour'],
            parse_dates={'ts': ['date', 'hour']})

        assert isinstance(df, DataFrame)
        assert isinstance(df.index, pd.DatetimeIndex)
        assert len(df) > 1
        assert 'date' not in df
        assert 'hour' not in df
        assert df.index.name == 'ts'
        assert 'avgTimeOnSite' in df
        assert 'visitors' in df
        assert 'newVisits' in df
        assert 'pageviewsPerVisit' in df

        # dynamic
        df = read_ga(
            metrics=['avgTimeOnSite', 'visitors', 'newVisits', 'pageviewsPerVisit'],
            start_date=start_date,
            end_date=end_date,
            segment="source=~twitter",
            dimensions=['date', 'hour'],
            parse_dates={'ts': ['date', 'hour']})

        assert isinstance(df, DataFrame)
        assert isinstance(df.index, pd.DatetimeIndex)
        assert len(df) > 1
        assert 'date' not in df
        assert 'hour' not in df
        assert df.index.name == 'ts'
        assert 'avgTimeOnSite' in df
        assert 'visitors' in df
        assert 'newVisits' in df
        assert 'pageviewsPerVisit' in df
    except AuthenticationConfigError:
        raise nose.SkipTest
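Outside the test harness, the same two segment styles can be used directly with read_ga; a sketch assuming `import pandas.io.ga as ga` and authentication already configured (dates are arbitrary; negative integers address the API's built-in segments, with -2 believed to be 'New Visitors' in the old API, while strings are dynamic segment expressions):

df_builtin = ga.read_ga(['visitors'], dimensions=['date'],
                        start_date='2014-01-01', end_date='2014-01-07',
                        segment=-2)                 # built-in segment by numeric ID
df_dynamic = ga.read_ga(['visitors'], dimensions=['date'],
                        start_date='2014-01-01', end_date='2014-01-07',
                        segment="source=~twitter")  # dynamic segment expression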
def behavior(start_date, end_date, account_name, property_name, profile_name, max_results):
    """
    Writes a DataFrame with the number of pageviews per half-hour x weekday
    to the Range "behavior"
    """
    # Let pandas fetch the data from Google Analytics; returns a generator of chunks
    df_chunks = ga.read_ga(secrets=client_secrets,
                           account_name=account_name,
                           property_name=property_name,
                           profile_name=profile_name,
                           dimensions=['date', 'hour', 'minute'],
                           metrics=['pageviews'],
                           start_date=start_date,
                           end_date=end_date,
                           index_col=0,
                           parse_dates={'datetime': ['date', 'hour', 'minute']},
                           date_parser=lambda x: datetime.strptime(x, '%Y%m%d %H %M'),
                           max_results=max_results,
                           chunksize=10000)

    # Concatenate the chunks into a DataFrame and get the number of rows
    df = pd.concat(df_chunks)
    num_rows = df.shape[0]

    # Resample into half-hour buckets
    df = df.resample('30Min', how='sum')

    # Create the behavior table (half-hour x weekday)
    grouped = df.groupby([df.index.time, df.index.weekday])
    behavior = grouped['pageviews'].aggregate(np.sum).unstack()

    # Make sure the table covers all hours and weekdays
    behavior = behavior.reindex(index=pd.date_range("00:00", "23:30", freq="30min").time,
                                columns=range(7))
    behavior.columns = ['MO', 'TU', 'WE', 'TH', 'FR', 'SA', 'SU']

    # Write to Excel.
    # Time-only values are currently a bit of a pain on Windows, so we set index=False.
    Range(sheet_dashboard, 'behavior').options(index=False).value = behavior
    Range(sheet_dashboard, 'effective').value = num_rows
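The function above assumes several module-level names (client_secrets, sheet_dashboard, Range, ga, pd, np, datetime). A minimal setup sketch, assuming the older (pre-0.9) xlwings API where Range takes the sheet as its first argument; the paths and sheet name are hypothetical:

import numpy as np
import pandas as pd
import pandas.io.ga as ga
from datetime import datetime
from xlwings import Workbook, Range

client_secrets = 'client_secrets.json'  # hypothetical OAuth client file for read_ga
wb = Workbook.caller()                  # the workbook that triggered this macro
sheet_dashboard = 'Dashboard'           # hypothetical sheet holding the named ranges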
# Define the dimensions and metrics to use on the Analytics API
dimensions = ['pagePath']
metrics = ['pageviews', 'uniquePageviews', 'entrances', 'bounceRate', 'exitRate']
start_date = '2014-08-01'
end_date = '2014-08-31'

# DataFrame with a single column, named 'pageviews'
df1 = pd.DataFrame(columns=['pageviews'])

# Counter for the number of times the loop runs
count = 0

# Loop through the df column to pass each slug into the Analytics filter field separately
for i in df[0]:
    # API call; each call returns a DataFrame (hwy)
    hwy = ga.read_ga(metrics=metrics,
                     dimensions=dimensions,
                     start_date=start_date,
                     end_date=end_date,
                     filters=['pagePath==' + i],
                     account_id='26179049')
    # Append the per-slug result to df1 on each pass of the loop
    df1 = df1.append(hwy)
    count += 1

print count       # number of times the loop ran
print df1.head()  # check the first 5 results in the DataFrame
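Calling read_ga once per slug costs one API request per page. Since the old GA filter syntax allows OR-ing conditions with commas, a hedged single-call variant (assuming df[0] holds the slugs, as above) would be:

# Fetch all slugs in one request by OR-ing the pagePath conditions (',' = OR
# in the old Core Reporting filter syntax).
combined = ','.join('pagePath==' + slug for slug in df[0])
df1 = ga.read_ga(metrics=metrics,
                 dimensions=dimensions,
                 start_date=start_date,
                 end_date=end_date,
                 filters=[combined],
                 account_id='26179049')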
def PageData(page, mod):
    '''
    args - pagepath for filter
         - Model type - takes 'LR' for LASSO and 'RF' for Random Forests
    '''
    Store = []

    t = datetime.today()
    t2 = t - timedelta(hours=1)  # (2 hours for BST, 1 for UTC which is on Heroku server)
    delay = t2.strftime('%Y-%m-%d %H:00')
    star = t - timedelta(30)

    max_results = 5e7
    metrics = ['pageviews']
    dimensions = ['pagePath', 'hour', 'date']
    dim = ['date', 'hour']

    if page is not None:
        filters = ['pagePath==' + page]
    else:
        filters = None

    df1 = ga.read_ga(metrics,
                     dimensions=dim,
                     start_date=star,
                     end_date=delay,
                     parse_dates=[['date', 'hour']],
                     token_file_name='static/token/analytics.dat',
                     secrets='static/token/client_secrets.json',
                     account_id='26179049',
                     filters=filters)

    ##################### 48 MAX LAG ##############################
    ind = []
    for i in range(72, len(df1)):
        lag = [1, 2, 3, 4, 5, 10, 17, 24, 48, 72]
        lagx = list(i - np.array(lag))

        Train = df1.ix[lagx]
        Target = df1.ix[i]

        TT = Train.T
        TT.columns = lag
        TT['Target'] = Target['pageviews']
        ind.append(TT)

    rng = date_range(df1.index[lag[-1]], df1.index[len(df1) - 1], freq='H')
    Set = ind[0].append(ind[1:])
    Set.index = rng
    SetT = Set.ix[:delay]
    print SetT
    #############################################################

    li = []
    if mod == 'LR':
        cnt = 1
    else:
        cnt = 3

    # Prepare the train/test split once, outside the loop
    feats = len(lag)
    SetT = SetT.replace(0, 1)

    X_Train = SetT[SetT.columns[0:feats]][:-170]
    Y_Train = SetT['Target'].ix[:-170]
    X_Test = SetT[SetT.columns[0:feats]][-170:]
    Y_Test = SetT['Target'][-170:]

    Store.append(X_Train)
    Store.append(Y_Train)
    Store.append(X_Test)
    Store.append(Y_Test)

    for j in range(0, cnt):
        print j

        # Train model
        if mod == 'RF':
            print 50 * '-'
            print "Random Forest Regression"
            print 50 * '-'
            rf = RandomForestRegressor(n_estimators=500, max_features=feats)
        else:
            print 50 * '-'
            print "LASSO Regression"
            print 50 * '-'
            # Lasso with cross-validation over the regularization path;
            # (the original left these lines commented out, which made the
            # estimator undefined in this branch)
            from sklearn.linear_model import LassoCV
            rf = LassoCV()  # fitted below, same as the RF branch

        rf.fit(X_Train, Y_Train)
        PredRF = rf.predict(X_Test)
        scoreRF = r2_score(Y_Test, PredRF)
        MSE = np.mean(np.square(Y_Test - PredRF))
        print 'R2 Score = ', scoreRF
        print 'MSE = ', MSE

        Res = pd.DataFrame(columns=['res'], index=range(0, len(Y_Test)))
        resid = Y_Test - PredRF
        Res['res'] = resid.values

        # Flag residuals outside +/- 3 standard deviations
        TSDU = Res['res'].mean() + 3 * Res['res'].std()
        TSDL = Res['res'].mean() - 3 * Res['res'].std()
        tsdP = Res[Res['res'] > TSDU]
        tsdN = Res[Res['res'] < TSDL]

        Stats = pd.DataFrame(columns=['yt', 'pred', 'resid', 'TSDU', 'TSDL'],
                             index=X_Test.index)
        Stats['yt'] = Y_Test
        Stats['pred'] = PredRF
        Stats['resid'] = resid
        Stats['TSDU'] = TSDU
        Stats['TSDL'] = TSDL

        # Plotting disabled for Heroku build

        Stats.index.name = 'Time'
        Stats['time'] = Stats.index
        # The original called .astype('int') without assigning the result,
        # which is a no-op; the intent was clearly conversion
        Stats['pred'] = Stats['pred'].astype('int')
        Stats['resid'] = Stats['resid'].astype('int')
        Stats['TSDU'] = Stats['TSDU'].astype('int')
        Stats['TSDL'] = Stats['TSDL'].astype('int')
        li.append(Stats)

    cat = pd.concat([i for i in li])
    Stats = cat.groupby(cat.index).mean()
    Stats['time'] = Stats.index

    AP = len(tsdP) + len(tsdN)
    print 50 * '-'
    # print "PAGE: " + str(filters[0])  # filters may be None here
    print 50 * '-'
    print "RANDOM FOREST Number of Anomalous Points is: %i" % AP
    print "%% Anomalous points is: %.1f" % (100.0 * AP / len(Y_Test)) + '%'

    return Stats, Store, scoreRF
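A minimal call sketch for the function above, assuming the token files exist at the hardcoded paths; the page path is hypothetical:

Stats, Store, scoreRF = PageData('/jobsearch', 'RF')  # 'RF' = Random Forests, 'LR' = LASSO
print Stats[['yt', 'pred', 'resid']].head()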
def Trial():
    t = datetime.today()
    yesterday = t - timedelta(0)
    dbyy = t - timedelta(90)
    start = dbyy.strftime('%Y-%m-%d')
    today = t.strftime('%Y-%m-%d %H:00')
    # end = yesterday.strftime('%Y-%m-%d')

    # Hardcoded test window (overrides the computed start date above)
    start = "2014-06-07"
    end = "2014-06-10"

    top100 = [
        u'/jobsearch', u'/search', u'/', u'/tax-disc', u'/renew-adult-passport',
        u'/student-finance-register-login', u'/visas-immigration',
        u'/driving-transaction-finished', u'/browse/abroad/passports',
        u'/apply-uk-visa', u'/browse/driving', u'/check-uk-visa',
        u'/get-a-passport-urgently', u'/apply-renew-passport',
        u'/government/organisations/uk-visas-and-immigration',
        u'/book-practical-driving-test', u'/bank-holidays',
        u'/change-date-practical-driving-test',
        u'/government/organisations/driver-and-vehicle-licensing-agency',
        u'/contact-jobcentre-plus', u'/book-a-driving-theory-test',
        u'/browse/driving/driving-licences', u'/benefits-calculators',
        u'/check-uk-visa/y', u'/jobseekers-allowance/how-to-claim',
        u'/national-minimum-wage-rates', u'/tax-credits-calculator',
        u'/browse/benefits/tax-credits', u'/browse/benefits',
        u'/change-address-driving-licence', u'/contact-the-dvla',
        u'/browse/working/finding-job', u'/calculate-state-pension',
        u'/passport-fees', u'/browse/working',
        u'/contact/hm-revenue-customs/tax-credits-enquiries',
        u'/overseas-passports', u'/track-passport-application',
        u'/renew-driving-licence', u'/browse/abroad',
        u'/get-vehicle-information-from-dvla',
        u'/apply-first-provisional-driving-licence', u'/student-finance/overview',
        u'/browse/driving/car-tax-discs', u'/general-visit-visa',
        u'/apply-online-to-replace-a-driving-licence',
        u'/government/organisations/hm-passport-office', u'/check-mot-status',
        u'/uk-border-control', u'/get-a-child-passport',
        u'/practise-your-driving-theory-test', u'/renewing-your-tax-credits-claim',
        u'/renewtaxcredits', u'/calculate-state-pension/y', u'/student-finance',
        u'/photos-for-passports', u'/contact-student-finance-england',
        u'/visa-processing-times', u'/foreign-travel-advice',
        u'/jobseekers-allowance', u'/contact',
        u'/browse/education/student-finance', u'/calculate-vehicle-tax-rates',
        u'/find-a-visa-application-centre', u'/working-tax-credit',
        u'/renew-driving-licence-at-70', u'/passport-advice-line',
        u'/call-charges', u'/overseas-passports/y',
        u'/countersigning-passport-applications',
        u'/government/topical-events/sexual-violence-in-conflict',
        u'/how-the-post-office-check-and-send-service-works', u'/visa-fees',
        u'/government/organisations', u'/browse/driving/learning-to-drive',
        u'/browse/working/state-pension', u'/vehicle-tax-rate-tables',
        u'/get-a-child-passport/your-childs-first-passport',
        u'/calculate-state-pension/y/age', u'/make-a-sorn',
        u'/jobseekers-allowance/what-youll-get', u'/general-visit-visa/apply',
        u'/contact/govuk/anonymous-feedback/thankyou',
        u'/browse/citizenship/citizenship',
        u'/general-visit-visa/documents-you-must-provide',
        u'/jobseekers-allowance/overview',
        u'/uk-border-control/before-you-leave-for-the-uk',
        u'/government/organisations/foreign-commonwealth-office',
        u'/government/collections/national-curriculum',
        u'/government/organisations/ministry-of-defence',
        u'/ips-regional-passport-office', u'/hand-luggage-restrictions/overview',
        u'/jobseekers-allowance/eligibility', u'/register-to-vote',
        u'/disclosure-barring-service-check/overview',
        u'/browse/benefits/jobseekers-allowance', u'/dvlaforms',
        u'/tier-4-general-visa', u'/student-finance/loans-and-grants'
    ]
    # The above should not be hardcoded. Calculate each time so as to avoid page name changes etc.
    max_results = 5e7
    metrics = ['pageviews']
    dimensions = ['pagePath', 'hour', 'date']
    dim = ['date', 'hour']
    filters = ['pagePath==' + top100[97]]

    ###############
    # Find top 100 pages by pageviews - (pv fine in this case rather than upv)
    # df = ga.read_ga(metrics,
    #                 dim,
    #                 start_date=start,
    #                 end_date=end,
    #                 token_file_name='static/token/analytics.dat',
    #                 secrets='static/token/client_secrets.json',
    #                 account_id='26179049',
    #                 max_results=max_results,
    #                 chunksize=5000)
    # df1b = pd.concat([x for x in df])
    # df1c = df1b.sort(columns=['pageviews'], ascending=0)

    df1 = ga.read_ga(metrics,
                     dimensions=dim,
                     start_date=dbyy,
                     end_date=yesterday,
                     parse_dates=[['date', 'hour']],
                     token_file_name='static/token/analytics.dat',
                     secrets='static/token/client_secrets.json',
                     account_id='26179049'
                     # filters=filters
                     )

    ##################### 48 MAX LAG ##############################
    ind = []
    for i in range(48, len(df1)):
        lag = [1, 2, 3, 24, 48]
        lagx = list(i - np.array(lag))

        Train = df1.ix[lagx]
        Target = df1.ix[i]

        TT = Train.T
        TT.columns = [1, 2, 3, 24, 48]
        TT['Target'] = Target['pageviews']
        ind.append(TT)

    rng = date_range(df1.index[48], df1.index[len(df1) - 1], freq='H')
    Set = ind[0].append(ind[1:])
    Set.index = rng
    SetT = Set.ix[:today][:-1]
    print SetT

    ##################### 7 day trial ##############################
    # Same construction with lag = [1, 2, 3, 24, 48, 168] and the loop /
    # date_range starting at 168 instead of 48.
    ################################################################

    TrainSamp = 0.8
    feats = 5
    TS = int(np.round(TrainSamp * len(SetT)))

    X_Train = SetT[SetT.columns[0:feats]].head(TS)
    Y_Train = SetT['Target'].head(TS)
    X_Test = SetT[SetT.columns[0:feats]].ix[TS:]
    Y_Test = SetT['Target'].ix[TS:]

    # Avoid zeros (they break the log transforms used in the linear model below)
    X_Train = X_Train.replace(0, 1)
    X_Test = X_Test.replace(0, 1)
    Y_Train = Y_Train.replace(0, 1)
    Y_Test = Y_Test.replace(0, 1)

    print 50 * '-'
    print "Random Forest Regression"
    print 50 * '-'

    rf = RandomForestRegressor(n_estimators=500, max_features=feats)
    rf.fit(X_Train, Y_Train)
    PredRF = rf.predict(X_Test)
    scoreRF = r2_score(Y_Test, PredRF)
    MSE = np.mean(np.square(Y_Test - PredRF))
    print 'R2 Score = ', scoreRF
    print 'MSE = ', MSE

    Res = pd.DataFrame(columns=['res'], index=range(0, len(Y_Test)))
    resid = Y_Test - PredRF
    Res['res'] = resid.values

    # +/- 3 standard deviation anomaly thresholds
    TSDU = Res['res'].mean() + 3 * Res['res'].std()
    TSDL = Res['res'].mean() - 3 * Res['res'].std()
    tsdP = Res[Res['res'] > TSDU]
    tsdN = Res[Res['res'] < TSDL]

    plt.figure(2)
    plt.plot(Y_Test, Y_Test)
    plt.scatter(Y_Test, PredRF, s=40, alpha=0.5, c='r')

    ##############################
    plt.figure(1)
    plt.subplot(2, 2, 1)
    plt.plot(range(0, len(Y_Test)), Y_Test)
    plt.scatter(range(0, len(Y_Test)), PredRF, s=70, alpha=0.5, c='r')
    plt.xlim([0, len(Y_Test)])
    plt.title("Random Forest Model")

    plt.subplot(2, 2, 3)
    plt.plot(range(0, len(Y_Test)), resid)
    plt.plot(range(0, len(Y_Test)), [TSDU] * len(Y_Test), c='r')
    plt.plot(range(0, len(Y_Test)), [TSDL] * len(Y_Test), c='r')
    plt.xlim([0, len(Y_Test)])
    ###############################

    AP = len(tsdP) + len(tsdN)
    print 50 * '-'
    print "PAGE: " + str(filters[0])
    print 50 * '-'
    print "RANDOM FOREST Number of Anomalous Points is: %i" % AP
    print "%% Anomalous points is: %.1f" % (100.0 * AP / len(Y_Test)) + '%'
    #################################################################
    print 50 * '-'
    print "Linear Model"
    print 50 * '-'

    clf = linear_model.LinearRegression()
    clf.fit(np.log(X_Train), np.log(Y_Train))
    LMPred = clf.predict(np.log(X_Test))
    scoreLM = r2_score(np.log(Y_Test), LMPred)
    MSELM = np.mean(np.square(Y_Test - np.exp(LMPred)))
    print "R2 Score = ", scoreLM
    print "MSE = ", MSELM

    ###########################################################
    print 50 * '-'
    print "LASSO Regression"
    print 50 * '-'

    # Lasso with cross-validation over the regularization path
    from sklearn.linear_model import LassoCV
    lasso_cv = LassoCV()
    y_ = lasso_cv.fit(X_Train, Y_Train).predict(X_Test)

    print "Optimal regularization parameter = %s" % lasso_cv.alpha_

    # Compute explained variance on test data
    print "r^2 on test data : %f" % (1 - np.linalg.norm(Y_Test - y_) ** 2 /
                                     np.linalg.norm(Y_Test) ** 2)
    print "LASSO MSE: ", np.mean(np.square(Y_Test - y_))

    Res2 = pd.DataFrame(columns=['res'], index=range(0, len(Y_Test)))
    resid2 = Y_Test - y_
    Res2['res'] = resid2.values

    TSDU2 = Res2['res'].mean() + 3 * Res2['res'].std()
    TSDL2 = Res2['res'].mean() - 3 * Res2['res'].std()
    tsdP2 = Res2[Res2['res'] > TSDU2]
    tsdN2 = Res2[Res2['res'] < TSDL2]

    AP = len(tsdP2) + len(tsdN2)
    print 50 * '-'
    print "PAGE: " + str(filters[0])
    print 50 * '-'
    print "LASSO Number of Anomalous Points is: %i" % AP
    print "%% Anomalous points is: %.1f" % (100.0 * AP / len(Y_Test)) + '%'

    # plt.figure(3)
    #################################################
    plt.subplot(2, 2, 2)
    plt.plot(range(0, len(Y_Test)), Y_Test)
    plt.scatter(range(0, len(Y_Test)), y_, s=70, alpha=0.5, c='r')
    plt.xlim([0, len(Y_Test)])
    plt.title('LASSO Model')

    plt.subplot(2, 2, 4)
    plt.plot(range(0, len(Y_Test)), resid2)
    plt.plot(range(0, len(Y_Test)), [TSDU2] * len(Y_Test), c='r')
    plt.plot(range(0, len(Y_Test)), [TSDL2] * len(Y_Test), c='r')
    plt.xlim([0, len(Y_Test)])
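The hand-rolled lag loops in PageData and Trial can be expressed more directly with DataFrame.shift; a sketch under the same assumptions (df1 has an hourly DatetimeIndex and a 'pageviews' column):

# Build the lagged feature matrix via shift(); equivalent in spirit to the
# "48 MAX LAG" loops above.
lags = [1, 2, 3, 24, 48]
SetT = pd.concat({l: df1['pageviews'].shift(l) for l in lags}, axis=1)
SetT['Target'] = df1['pageviews']
SetT = SetT.dropna()  # drops the first max(lags) rows, like starting the loop at 48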
import pandas.io.ga as ga

# prints top 30 landing pages by pageviews in descending order
df = ga.read_ga(['pageviews', 'entrances'],
                dimensions=['date', 'landingPagePath'],
                start_date='2013-04-01')
reset_df = df.reset_index()
sorted_df = reset_df.sort(columns=['pageviews', 'entrances'], ascending=False)
print(sorted_df.head(30))
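On newer pandas (0.17+), DataFrame.sort was deprecated and later removed in favour of sort_values; the equivalent call would be:

sorted_df = reset_df.sort_values(by=['pageviews', 'entrances'], ascending=False)
print(sorted_df.head(30))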
start_date = '2016-01-01'
end_date = '2017-01-01'

# Dimensions and metrics for the request
dimensions = ['date', 'cityId', 'city']
metrics = ['sessions', 'users']

tempStartTime = start_date
count = 0

# Page through the date range: each call starts where the previous one ended
while str(tempStartTime) <= end_date:
    temp_df = ga.read_ga(account_id=account_id,
                         profile_id=view_id,
                         property_id=property_id,
                         metrics=metrics,
                         dimensions=dimensions,
                         start_date=tempStartTime,
                         end_date=end_date,  # the original used an undefined name ('today')
                         index_col=0)
    tempStartTime = temp_df.index.max()
    if count == 0:
        df = temp_df
    else:
        df = pd.concat([df, temp_df])
    count += 1
    time.sleep(.2)
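Each pass restarts at the previous chunk's last date, so rows for that boundary date land in df twice; a hedged cleanup sketch to run after the loop:

# Drop the exact duplicate rows picked up at the chunk boundaries (assumes a
# given date/city pair reports identical metrics in both calls).
df = df.reset_index().drop_duplicates().set_index('date')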
# -*- coding: utf-8 -*-
import pandas.io.ga as ga
import pandas as pd

# Goes through profiles/accounts and provides an aggregate of visits/pageviews
ids = {'123': '456'}

# In the GA filter syntax, ';' means AND; the original line was two separate
# Python statements and effectively kept only "source==Facebook"
filters = "source==Facebook;medium==Social"

all_data = pd.DataFrame()
for profile, account in ids.iteritems():
    df = ga.read_ga(['visits', 'pageviews'],
                    dimensions=['date', 'landingPagePath', 'medium', 'campaign', 'source'],
                    start_date='2013-03-01',
                    end_date='2013-04-24',
                    account_id=account,
                    profile_id=profile,
                    filters=filters,
                    chunksize=1000)
    for d in df:
        d['profile'] = profile
        d = d.reset_index()
        all_data = all_data.append(d)

all_data.to_csv('C:\\tmp\\' + '322.csv', ',', line_terminator='\n')
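For reference, the old Core Reporting filter syntax combines conditions with ';' for AND and ',' for OR, so multi-condition filters stay in a single string:

and_filter = "source==Facebook;medium==Social"   # source is Facebook AND medium is Social
or_filter = "source==Facebook,source==Twitter"   # source is Facebook OR source is Twitter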