def fetch_updated_mail(self, booking_id):
    # This would be cleaner to put into an extra function,
    # but dynamic dependencies only work when yielded from 'run()'
    logger.info(f"Fetching new mail for booking {booking_id}")

    # First step: Get customer of booking (cannot use customer_id,
    # since it has been derived from the wrong e-mail address)
    booking_html_task = FetchGomusHTML(url=f'/admin/bookings/{booking_id}')
    yield booking_html_task
    with booking_html_task.output().open('r') as booking_html_fp:
        booking_html = html.fromstring(booking_html_fp.read())
    booking_customer = booking_html.xpath(
        '//body/div[2]/div[2]/div[3]/div[4]/div[2]'
        '/div[2]/div[2]/div[1]/div[1]/div[1]/a')[0]
    gomus_id = int(booking_customer.get('href').split('/')[-1])

    # Second step: Get current e-mail address for customer
    customer_html_task = FetchGomusHTML(url=f'/admin/customers/{gomus_id}')
    yield customer_html_task
    with customer_html_task.output().open('r') as customer_html_fp:
        customer_html = html.fromstring(customer_html_fp.read())
    customer_email = self.parse_text(
        customer_html,
        '//body/div[2]/div[2]/div[3]/div/div[2]/div[1]'
        '/div/div[3]/div/div[1]/div[1]/div/dl/dd[1]')

    # Update customer ID in gomus_customer
    # and gomus_to_customer_mapping
    customer_id = hash_id(customer_email)
    old_customer = self.db_connector.query(
        query=f'SELECT customer_id FROM gomus_to_customer_mapping '
              f'WHERE gomus_id = {gomus_id}',
        only_first=True)
    if not old_customer:
        logger.warning(
            "Cannot update the e-mail address of a customer that is not "
            "in the database.\nSkipping ...")
        return
    old_customer_id = old_customer[0]

    logger.info(f"Replacing old customer ID {old_customer_id} "
                f"with new customer ID {customer_id}")
    # References in dependent tables are updated automatically
    # via ON UPDATE CASCADE on the foreign keys
    self.db_connector.execute(f'''
        UPDATE gomus_customer
        SET customer_id = {customer_id}
        WHERE customer_id = {old_customer_id}
    ''')
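
# fetch_updated_mail() relies on a hash_id helper that derives a stable
# integer customer ID from an e-mail address. Its actual definition lives
# elsewhere in the project; the following is only a minimal sketch of the
# assumed contract (a deterministic e-mail -> int mapping), not the real
# implementation.
import hashlib


def hash_id(email):
    """Derive a deterministic integer ID from an e-mail address (sketch)."""
    digest = hashlib.md5(email.lower().encode('utf-8')).hexdigest()
    # Truncate so the value fits a signed 32-bit integer column
    # (assumption about the database schema)
    return int(digest, 16) % (2 ** 31)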
def fetch_comments(self, df):
    invalid_count = 0

    # Handle each post
    for i in df.index:
        page_id, post_id = df['page_id'][i], df['post_id'][i]
        fb_post_id = f'{page_id}_{post_id}'

        post_date = self.post_date(df, i)
        if post_date < self.minimum_relevant_date:
            continue

        # Grab up to 100 comments for the post (API maximum)
        limit = 100
        # 'toplevel' or 'stream' ('toplevel' doesn't include replies).
        # Using 'toplevel' here allows us to safely set parent to None
        # for all comments returned by the first query
        filt = 'toplevel'
        # 'chronological' or 'reverse_chronological'
        order = 'chronological'
        fields = ','.join(
            ['id', 'created_time', 'comment_count', 'message', 'comments'])
        url = (f'{API_BASE}/{fb_post_id}/comments?limit={limit}'
               f'&filter={filt}&order={order}&fields={fields}')

        response = try_request_multiple_times(url)
        if response.status_code == 400:
            invalid_count += 1
            continue
        response_data = response.json().get('data')
        logger.info(f"Fetched {len(response_data)} "
                    f"comments for post {post_id}")

        # Handle each comment for the post
        for comment in response_data:
            comment_id = comment.get('id').split('_')[1]
            yield {
                'post_id': str(post_id),
                'comment_id': str(comment_id),
                'page_id': str(page_id),
                'post_date': comment.get('created_time'),
                'text': comment.get('message'),
                'is_from_museum': self.is_from_museum(comment),
                'response_to': None
            }

            if not comment.get('comment_count'):
                continue
            try:
                # Handle each reply to the comment
                for reply in comment['comments']['data']:
                    yield {
                        'comment_id': reply.get('id').split('_')[1],
                        'page_id': str(page_id),
                        'post_id': str(post_id),
                        'post_date': reply.get('created_time'),
                        'text': reply.get('message'),
                        'is_from_museum': self.is_from_museum(reply),
                        'response_to': str(comment_id)
                    }
            except KeyError:
                # Sometimes replies become unavailable. In this case,
                # the Graph API still returns the true 'comment_count',
                # but does not provide a 'comments' field
                logger.warning(f"Failed to retrieve replies for comment "
                               f"{comment.get('id')}")

    if invalid_count:
        logger.warning(f"Skipped {invalid_count} posts")
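
# Both fetch_comments() and run() below call a try_request_multiple_times
# helper that is defined elsewhere in the project. The sketch below
# illustrates the behaviour assumed by the callers: retry network errors and
# 5xx responses with backoff, but hand 4xx responses (e.g. 400 for deleted
# posts) back to the caller, which inspects status_code itself. Attempt
# counts, timeouts, and authentication handling are assumptions, not the
# project's actual implementation.
import logging
import time

import requests

logger = logging.getLogger(__name__)


def try_request_multiple_times(url, attempts=3, backoff=2.0):
    """GET the URL, retrying on network errors and 5xx responses (sketch)."""
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, timeout=30)
            if response.status_code < 500:
                # 4xx responses are returned to the caller unchanged
                return response
            logger.warning(f"Server error {response.status_code} for {url}, "
                           f"attempt {attempt}/{attempts}")
        except requests.RequestException as error:
            logger.warning(f"Request to {url} failed ({error}), "
                           f"attempt {attempt}/{attempts}")
            if attempt == attempts:
                raise
        if attempt < attempts:
            time.sleep(backoff ** attempt)
    # All attempts exhausted with server errors
    response.raise_for_status()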
def run(self):
    current_timestamp = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    performances = []
    with self.input().open('r') as csv_in:
        df = pd.read_csv(csv_in)

    if self.minimal_mode:
        df = df.head(5)

    invalid_count = 0
    pbar = self.tqdm(
        df.index, desc="Fetching performance data for facebook posts")
    for index in pbar:
        page_id, post_id = \
            str(df['page_id'][index]), str(df['post_id'][index])
        fb_post_id = f'{page_id}_{post_id}'

        post_date = self.post_date(df, index)
        if post_date < self.minimum_relevant_date:
            continue

        logger.debug(f"Loading performance data for FB post {fb_post_id}")
        metrics = ','.join([
            'post_reactions_by_type_total',
            'post_activity_by_action_type',
            'post_clicks_by_type',
            'post_negative_feedback',
            'post_impressions_paid',
            'post_impressions',
            'post_impressions_unique'  # "reach"
        ])
        url = f'{API_BASE}/{fb_post_id}/insights?metric={metrics}'

        response = try_request_multiple_times(url)
        if response.status_code == 400:
            invalid_count += 1
            continue
        response.raise_for_status()  # in case of another error
        response_content = response.json()

        post_perf = {
            'timestamp': current_timestamp,
        }

        # Reactions
        reactions = response_content['data'][0]['values'][0]['value']
        post_perf['react_like'] = int(reactions.get('like', 0))
        post_perf['react_love'] = int(reactions.get('love', 0))
        post_perf['react_wow'] = int(reactions.get('wow', 0))
        post_perf['react_haha'] = int(reactions.get('haha', 0))
        post_perf['react_sorry'] = int(reactions.get('sorry', 0))
        post_perf['react_anger'] = int(reactions.get('anger', 0))

        # Activity
        activity = response_content['data'][1]['values'][0]['value']
        post_perf['likes'] = int(activity.get('like', 0))
        post_perf['shares'] = int(activity.get('share', 0))
        post_perf['comments'] = int(activity.get('comment', 0))

        # Clicks
        clicks = response_content['data'][2]['values'][0]['value']
        post_perf['video_clicks'] = int(clicks.get('video play', 0))
        post_perf['link_clicks'] = int(clicks.get('link clicks', 0))
        post_perf['other_clicks'] = int(clicks.get('other clicks', 0))

        # Negative feedback (only one field)
        post_perf['negative_feedback'] = \
            response_content['data'][3]['values'][0]['value']

        # Number of times the post entered a person's screen through
        # paid distribution such as an ad
        post_perf['paid_impressions'] = \
            response_content['data'][4]['values'][0]['value']

        post_perf['post_impressions'] = \
            response_content['data'][5]['values'][0]['value']
        post_perf['post_impressions_unique'] = \
            response_content['data'][6]['values'][0]['value']

        post_perf.update(page_id=page_id, post_id=post_id)
        performances.append(post_perf)
    if invalid_count:
        logger.warning(f"Skipped {invalid_count} posts")

    df = pd.DataFrame(performances)

    # For some reason, all except the first set of performance
    # values get inserted twice into the performances list.
    # Investigate and fix the root cause, this is a workaround
    # TODO: Is this still up to date? Could not reproduce.
    df.drop_duplicates(subset='post_id', inplace=True, ignore_index=True)

    df = self.filter_fkey_violations(df)
    df = self.condense_performance_values(df)

    with self.output().open('w') as output_file:
        df.to_csv(output_file, index=False, header=True)
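
# run() indexes response_content['data'][0..6] positionally, which assumes
# the Graph API returns the insights metrics in exactly the requested order.
# A name-keyed lookup is more defensive against reordering; this is an
# illustrative alternative under the same assumption about the response
# shape (each entry carries 'name' and 'values'), not part of the original
# code.
def insights_by_name(response_content):
    """Map each insights metric name to its most recent value (sketch)."""
    return {
        entry['name']: entry['values'][0]['value']
        for entry in response_content['data']
    }


# Hypothetical usage inside run():
#     metrics_by_name = insights_by_name(response_content)
#     reactions = metrics_by_name['post_reactions_by_type_total']
#     post_perf['react_like'] = int(reactions.get('like', 0))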