def run(self):
    """Fetch every Instagram media post for the configured page and write a CSV.

    Pages through the Graph API ``/{page_id}/media`` edge, collecting all
    posts, then projects each post through the per-column adapter callables
    in ``self.columns``.

    Raises:
        EnvironmentError: if the ``FB_ACCESS_TOKEN`` environment variable
            is not set.
    """
    access_token = os.getenv('FB_ACCESS_TOKEN')
    if not access_token:
        raise EnvironmentError("FB Access token is not set")

    with self.input().open('r') as facts_file:
        facts = json.load(facts_file)
    page_id = facts['ids']['instagram']['pageId']

    all_media = []
    fields = ','.join(self.columns.keys())

    # use limit=100 to keep amount of requests small
    # 100 is the maximum value the Graph API will accept
    limit = 100

    media_url = (f'{API_BASE}/{page_id}/media'
                 f'?fields={fields}&limit={limit}')

    response = try_request_multiple_times(media_url)
    response_json = response.json()

    current_count = len(response_json['data'])
    all_media.extend(response_json['data'])

    logger.info("Fetching Instagram posts ...")
    # Fix: the Graph API omits the 'paging' key entirely when there are no
    # further pages, so guard the lookup instead of assuming it exists.
    while 'next' in response_json.get('paging', {}):
        next_url = response_json['paging']['next']
        response = try_request_multiple_times(next_url)
        response_json = response.json()

        current_count += len(response_json['data'])
        if sys.stdout.isatty():
            # \r keeps the progress counter on a single terminal line
            print(
                f"\rFetched {current_count} Instagram posts",
                end='',
                flush=True)
        all_media.extend(response_json['data'])

        if self.minimal_mode:
            # Drop the 'next' link so the loop terminates after one page
            logger.info("Running in minimal mode, stopping now")
            response_json['paging'].pop('next')

    if sys.stdout.isatty():
        print()  # have to manually print newline
    logger.info("Fetching of Instagram posts complete")

    df = pd.DataFrame([
        {
            column: adapter(media[column])
            for (column, adapter) in self.columns.items()
        }
        for media in all_media
    ])

    with self.output().open('w') as output_file:
        df.to_csv(output_file, index=False, header=True)
def run(self):
    """Fetch today's account-level Instagram insight metrics into a one-row CSV.

    Queries the Graph API ``/insights`` edge with ``period=day`` and writes
    a single row: the end_time of the reporting window followed by one
    value per metric, in the order of ``self.columns``.
    """
    with self.input().open('r') as facts_file:
        facts = json.load(facts_file)
    page_id = facts['ids']['instagram']['pageId']

    df = pd.DataFrame(columns=self.columns)

    metric_names = [
        'impressions',
        'reach',
        'profile_views',
        'follower_count',
        'website_clicks'
    ]
    metrics = ','.join(metric_names)
    period = 'day'
    url = f'{API_BASE}/{page_id}/insights?metric={metrics}&period={period}'
    response = try_request_multiple_times(url)
    response_data = response.json()['data']

    # Fix: look values up by metric name instead of positional index — the
    # Graph API does not guarantee the response order matches the request,
    # and positional access would silently assign values to wrong columns.
    values_by_name = {
        entry['name']: entry['values'][0]['value']
        for entry in response_data
    }
    timestamp = response_data[0]['values'][0]['end_time']

    df.loc[0] = [timestamp] + [values_by_name[name] for name in metric_names]

    with self.output().open('w') as output_file:
        df.to_csv(output_file, index=False, header=True)
def run(self):
    """Snapshot the page's current follower and media totals into a one-row CSV."""
    with self.input().open('r') as facts_file:
        facts = json.load(facts_file)
    page_id = facts['ids']['instagram']['pageId']

    frame = pd.DataFrame(columns=self.columns)

    fields = ','.join([
        'followers_count',
        'media_count'
    ])
    url = f'{API_BASE}/{page_id}?fields={fields}'
    payload = try_request_multiple_times(url).json()

    frame.loc[0] = [
        dt.datetime.now(),                 # time of this fetch, not API-reported
        payload.get('followers_count'),
        payload.get('media_count')
    ]

    with self.output().open('w') as output_file:
        frame.to_csv(output_file, index=False, header=True)
def run(self):
    """Fetch today's Instagram insight metrics and write them as a one-row CSV.

    Delegates per-metric extraction to ``self.extract_metrics`` and prefixes
    the row with the reporting window's end_time.
    """
    with self.input().open('r') as facts_file:
        facts = json.load(facts_file)
    page_id = facts['ids']['instagram']['pageId']

    df = pd.DataFrame(columns=self.columns)

    # Renamed from 'metrics' to avoid rebinding the same name to the
    # extracted-values dict below.
    metric_list = ','.join([
        'impressions',
        'reach',
        'profile_views',
        'follower_count',
        'website_clicks'
    ])
    period = 'day'
    url = f'{API_BASE}/{page_id}/insights?metric={metric_list}&period={period}'
    response = try_request_multiple_times(url)
    response_data = response.json()['data']

    timestamp = response_data[0]['values'][0]['end_time']
    extracted = self.extract_metrics(response_data)

    # Fix: DataFrame.append was deprecated in pandas 1.4 and removed in
    # pandas 2.0 — build the row via pd.concat instead.
    row = pd.DataFrame([{'timestamp': timestamp, **extracted}])
    df = pd.concat([df, row], ignore_index=True)

    with self.output().open('w') as output_file:
        df.to_csv(output_file, index=False, header=True)
def run(self):
    """Fetch per-post insight metrics for recent Instagram posts and write a CSV.

    Reads the previously fetched post list, skips posts older than
    ``self.timespan`` (Graph API only serves insights for recent posts),
    fetches impressions/reach/engagement/saved (plus video_views for
    videos), filters foreign-key violations and appends delta columns
    before writing the result.
    """
    with self.input().open('r') as input_file:
        post_df = pd.read_csv(input_file)
    if self.minimal_mode:
        post_df = post_df.head(5)

    generic_metrics = ['impressions', 'reach', 'engagement', 'saved']

    performance_df = pd.DataFrame(columns=[
        column
        for column in self.columns
        if not column.startswith('delta_')
    ])
    fetch_time = dt.datetime.now()
    for i, row in self.tqdm(
            post_df.iterrows(),
            desc="Fetching insights for instagram posts",
            total=len(post_df)):
        # Fetch only insights for less than 2 months old posts
        post_time = dtparser.parse(row['timestamp'])
        if post_time.date() < fetch_time.date() - self.timespan:
            continue

        is_video = row['media_type'] == 'VIDEO'
        metrics = ','.join(generic_metrics)
        if is_video:
            metrics += ',video_views'  # causes error if used on non-video

        url = f'{API_BASE}/{row["id"]}/insights?metric={metrics}'
        response = try_request_multiple_times(url)
        response_data = response.json()['data']

        # Fix: extract values by metric name rather than positional index —
        # the Graph API does not guarantee response order, and positional
        # access would silently swap columns if the order ever differs.
        values = {
            entry['name']: entry['values'][0]['value']
            for entry in response_data
        }

        performance_df.loc[i] = [
            str(row['id']),  # The type was lost during CSV conversion
            fetch_time,
            values['impressions'],
            values['reach'],
            values['engagement'],
            values['saved'],
            values.get('video_views', 0)  # 0 for non-video posts
        ]

    performance_df = self.filter_fkey_violations(performance_df)
    performance_df = self.condense_performance_values(
        performance_df,
        delta_function=PerformanceValueCondenser.linear_delta)

    with self.output().open('w') as output_file:
        performance_df.to_csv(output_file, index=False, header=True)
def _get_single_metric(page_id, metric, period='lifetime'):
    """Return the first reported value of one insight metric for *page_id*."""
    insights_url = f'{API_BASE}/{page_id}/insights?metric={metric}&period={period}'
    payload = try_request_multiple_times(insights_url).json()
    return payload['data'][0]['values'][0]['value']