Ejemplo n.º 1
0
    def fetch_updated_mail(self, booking_id):
        """
        Re-key a booking's customer using the e-mail address currently
        shown on the customer's gomus page.

        This is a generator: it yields the two FetchGomusHTML tasks it
        needs (booking page first, customer page second) before reading
        their outputs. It is kept inline rather than split into further
        task classes because dynamic dependencies only work when they
        are yielded from 'run()'.

        If gomus_to_customer_mapping has no row for the customer's gomus
        ID, a warning is logged and nothing is updated.
        """

        def read_html(task):
            # Parse the finished task's output into an HTML tree
            with task.output().open('r') as html_fp:
                return html.fromstring(html_fp.read())

        logger.info(f"Fetching new mail for booking {booking_id}")

        # Step 1: find the customer behind the booking. The stored
        # customer_id cannot be used for this, since it has been derived
        # from the wrong e-mail address.
        booking_task = FetchGomusHTML(url=f'/admin/bookings/{booking_id}')
        yield booking_task
        booking_page = read_html(booking_task)
        customer_links = booking_page.xpath(
            '//body/div[2]/div[2]/div[3]/div[4]/div[2]'
            '/div[2]/div[2]/div[1]/div[1]/div[1]/a')
        # The last path segment of the link target is the gomus ID
        customer_href = customer_links[0].get('href')
        gomus_id = int(customer_href.split('/')[-1])

        # Step 2: read the customer's current e-mail address
        customer_task = FetchGomusHTML(url=f'/admin/customers/{gomus_id}')
        yield customer_task
        customer_page = read_html(customer_task)
        email_xpath = (
            '//body/div[2]/div[2]/div[3]/div/div[2]/div[1]'
            '/div/div[3]/div/div[1]/div[1]/div/dl/dd[1]')
        customer_email = self.parse_text(customer_page, email_xpath)

        # Step 3: derive the new customer ID and look up the one
        # currently stored for this gomus ID
        new_customer_id = hash_id(customer_email)
        mapping_row = self.db_connector.query(
            query=f'SELECT customer_id FROM gomus_to_customer_mapping '
            f'WHERE gomus_id = {gomus_id}',
            only_first=True)
        if not mapping_row:
            logger.warning(
                "Cannot update email address of customer which is not in "
                "database.\nSkipping ...")
            return
        previous_customer_id = mapping_row[0]

        logger.info(f"Replacing old customer ID {previous_customer_id} "
                    f"with new customer ID {new_customer_id}")

        # Only gomus_customer is updated explicitly; referencing tables
        # follow through their foreign keys via ON UPDATE CASCADE
        self.db_connector.execute(f'''
            UPDATE gomus_customer
            SET customer_id = {new_customer_id}
            WHERE customer_id = {previous_customer_id}
        ''')
Ejemplo n.º 2
0
    def fetch_comments(self, df):
        """
        Yield the comments and replies of every relevant post in df.

        df is indexed row by row and must provide the columns 'page_id'
        and 'post_id'. Posts whose date (as returned by self.post_date)
        lies before self.minimum_relevant_date are ignored. Posts for
        which the API answers with status 400 are skipped and counted;
        the count is logged as a warning once all posts were handled.

        Each yielded dict has the keys 'post_id', 'comment_id',
        'page_id', 'post_date' (the 'created_time' of the comment or
        reply), 'text', 'is_from_museum' and 'response_to'.
        'response_to' is None for top-level comments and the parent's
        comment ID for replies.

        Any HTTP error status other than 400 is raised by
        response.raise_for_status().
        """

        # The query settings are identical for all posts

        # Grab up to 100 comments for the post (maximum)
        limit = 100

        # 'toplevel' or 'stream' (toplevel doesn't include replies)
        # Using 'toplevel' here allows us to safely
        # set parent to None for all comments returned
        # by the first query
        filt = 'toplevel'

        # 'chronological' or 'reverse_chronological'
        order = 'chronological'

        fields = ','.join(
            ['id', 'created_time', 'comment_count', 'message', 'comments'])

        invalid_count = 0

        # Handle each post
        for i in df.index:
            page_id, post_id = df['page_id'][i], df['post_id'][i]
            fb_post_id = f'{page_id}_{post_id}'
            post_date = self.post_date(df, i)
            if post_date < self.minimum_relevant_date:
                continue

            # Every query parameter has to be separated by '&'; without
            # the separator after 'limit', the filter would be glued
            # onto the limit value and never reach the API as 'filter'
            url = (f'{API_BASE}/{fb_post_id}/comments?limit={limit}'
                   f'&filter={filt}&order={order}&fields={fields}')

            response = try_request_multiple_times(url)
            if response.status_code == 400:
                invalid_count += 1
                continue
            # In case of another error: fail with the HTTP error instead
            # of an obscure failure while reading the payload below
            response.raise_for_status()
            response_data = response.json().get('data')

            logger.info(f"Fetched {len(response_data)} "
                        f"comments for post {post_id}")

            # Handle each comment for the post
            for comment in response_data:
                # Only the part after the first underscore of the
                # comment's 'id' is used as comment ID
                comment_id = comment.get('id').split('_')[1]

                yield {
                    'post_id': str(post_id),
                    'comment_id': str(comment_id),
                    'page_id': str(page_id),
                    'post_date': comment.get('created_time'),
                    'text': comment.get('message'),
                    'is_from_museum': self.is_from_museum(comment),
                    'response_to': None
                }

                if not comment.get('comment_count'):
                    continue
                try:
                    # Handle each reply for the comment
                    for reply in comment['comments']['data']:
                        yield {
                            'comment_id': reply.get('id').split('_')[1],
                            'page_id': str(page_id),
                            'post_id': str(post_id),
                            'post_date': reply.get('created_time'),
                            'text': reply.get('message'),
                            'is_from_museum': self.is_from_museum(reply),
                            'response_to': str(comment_id)
                        }
                except KeyError:
                    # Sometimes, replies become unavailable. In this case,
                    # the Graph API returns the true 'comment_count',
                    # but does not provide a 'comments' field anyway
                    logger.warning(f"Failed to retrieve replies for comment "
                                   f"{comment.get('id')}")

        if invalid_count:
            logger.warning(f"Skipped {invalid_count} posts")
Ejemplo n.º 3
0
    def run(self):
        """
        Fetch insights for the posts listed in the input CSV and write
        one row of performance values per post to the output CSV.

        Posts older than self.minimum_relevant_date are ignored. Posts
        for which the API answers with status 400 are skipped and
        counted; any other HTTP error is raised. In minimal mode, only
        the first five input rows are processed.
        """

        fetched_at = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        def metric_value(payload, position):
            # First value of the metric at the given response position
            return payload['data'][position]['values'][0]['value']

        # The response is read by position, in the order the metrics are
        # requested here.
        # NOTE(review): this presumes the API returns the metrics in the
        # requested order - confirm.
        metric_names = ','.join([
            'post_reactions_by_type_total',
            'post_activity_by_action_type',
            'post_clicks_by_type',
            'post_negative_feedback',
            'post_impressions_paid',
            'post_impressions',
            'post_impressions_unique'  # "reach"
        ])

        # Positions 0-2 hold breakdowns, each listed here as
        # (output column, key in the breakdown); missing keys count as 0
        breakdown_columns = (
            # Reactions
            (('react_like', 'like'),
             ('react_love', 'love'),
             ('react_wow', 'wow'),
             ('react_haha', 'haha'),
             ('react_sorry', 'sorry'),
             ('react_anger', 'anger')),
            # Activity
            (('likes', 'like'),
             ('shares', 'share'),
             ('comments', 'comment')),
            # Clicks
            (('video_clicks', 'video play'),
             ('link_clicks', 'link clicks'),
             ('other_clicks', 'other clicks')),
        )

        # Positions 3-6 hold a single value each, stored as-is.
        # 'paid_impressions' is the number of times the post entered a
        # person's screen through paid distribution such as an ad.
        scalar_columns = (
            'negative_feedback',
            'paid_impressions',
            'post_impressions',
            'post_impressions_unique',
        )

        with self.input().open('r') as csv_in:
            posts = pd.read_csv(csv_in)
        if self.minimal_mode:
            posts = posts.head(5)

        rows = []
        skipped = 0
        progress = self.tqdm(
            posts.index,
            desc="Fetching performance data for facebook posts")
        for idx in progress:
            page_id = str(posts['page_id'][idx])
            post_id = str(posts['post_id'][idx])
            fb_post_id = f'{page_id}_{post_id}'
            if self.post_date(posts, idx) < self.minimum_relevant_date:
                continue

            logger.debug(f"Loading performance data for FB post {fb_post_id}")

            response = try_request_multiple_times(
                f'{API_BASE}/{fb_post_id}/insights?metric={metric_names}')
            if response.status_code == 400:
                skipped += 1
                continue
            response.raise_for_status()  # in case of another error
            payload = response.json()

            row = {'timestamp': fetched_at}
            for position, columns in enumerate(breakdown_columns):
                breakdown = metric_value(payload, position)
                for column, key in columns:
                    row[column] = int(breakdown.get(key, 0))
            for position, column in enumerate(
                    scalar_columns, start=len(breakdown_columns)):
                row[column] = metric_value(payload, position)
            row['page_id'] = page_id
            row['post_id'] = post_id
            rows.append(row)

        if skipped:
            logger.warning(f"Skipped {skipped} posts")

        result = pd.DataFrame(rows)

        # For some reason, all except the first set of performance
        # values get inserted twice into the performances list.
        # Investigate and fix the root cause, this is a workaround
        # TODO: Is this still up to date? Could not reproduce.
        # NOTE(review): if no post produced a row, the frame has no
        # 'post_id' column for the de-duplication below - confirm
        # whether that case can occur.
        result = result.drop_duplicates(subset='post_id', ignore_index=True)

        result = self.filter_fkey_violations(result)
        result = self.condense_performance_values(result)

        with self.output().open('w') as output_file:
            result.to_csv(output_file, index=False, header=True)