Example #1
def get_request(url, sess_id):
    """Request the given URL from the gomus servers and return the results."""
    cookies = dict(_session_id=sess_id)
    response = requests.get(url, cookies=cookies)
    response.raise_for_status()
    if response.ok:
        logger.info("HTTP request successful")

    return response.content
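A minimal usage sketch, assuming a valid gomus session ID is available; the GOMUS_SESS_ID environment variable and the report ID 1 are hypothetical placeholders (the URL pattern follows Example #9):

import os

sess_id = os.environ['GOMUS_SESS_ID']  # hypothetical variable name
content = get_request(
    'https://barberini.gomus.de/admin/reports/1.xlsx', sess_id)
with open('report.xlsx', 'wb') as report_file:
    report_file.write(content)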
Example #2
    def run(self):

        dfs = []
        for n in range(self.n_min, self.n_max + 1):
            logger.info(f"Collecting n={n}-grams ...")
            ngram_file = yield QueryDb(query=self._build_query(n))
            with ngram_file.open('r') as ngram_stream:
                dfs.append(pd.read_csv(ngram_stream))

        ngrams = pd.concat(dfs)

        with self.output().open('w') as output_stream:
            ngrams.to_csv(output_stream, index=False, header=True)
Example #3
def direct_download_url(base_url, report, timespan):
    """Generate download URL for a gomus report."""
    start_time, end_time = parse_timespan(timespan)
    base_return = base_url + f'/{report}.xlsx'

    if start_time == dt.date.min:
        return base_return

    # timespan is valid
    end_time = end_time.strftime("%Y-%m-%d")
    start_time = start_time.strftime("%Y-%m-%d")
    logger.info(f"Requesting report for timespan "
                f"from {start_time} to {end_time}")
    return f'{base_return}?end_at={end_time}&start_at={start_time}'
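A minimal usage sketch, assuming parse_timespan maps an empty timespan to dt.date.min so that no date filter is appended; the 'customers' report name is a placeholder:

url = direct_download_url('https://barberini.gomus.de', 'customers', '')
# -> 'https://barberini.gomus.de/customers.xlsx'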
Example #4
    def fetch_updated_mail(self, booking_id):
        # This would be cleaner to put into an extra function,
        # but dynamic dependencies only work when yielded from 'run()'
        logger.info(f"Fetching new mail for booking {booking_id}")

        # First step: Get customer of booking (cannot use customer_id,
        # since it has been derived from the wrong e-mail address)
        booking_html_task = FetchGomusHTML(url=f'/admin/bookings/{booking_id}')
        yield booking_html_task
        with booking_html_task.output().open('r') as booking_html_fp:
            booking_html = html.fromstring(booking_html_fp.read())
        booking_customer = booking_html.xpath(
            '//body/div[2]/div[2]/div[3]/div[4]/div[2]'
            '/div[2]/div[2]/div[1]/div[1]/div[1]/a')[0]
        gomus_id = int(booking_customer.get('href').split('/')[-1])

        # Second step: Get current e-mail address for customer
        customer_html_task = FetchGomusHTML(url=f'/admin/customers/{gomus_id}')
        yield customer_html_task
        with customer_html_task.output().open('r') as customer_html_fp:
            customer_html = html.fromstring(customer_html_fp.read())
        customer_email = self.parse_text(
            customer_html, '//body/div[2]/div[2]/div[3]/div/div[2]/div[1]'
            '/div/div[3]/div/div[1]/div[1]/div/dl/dd[1]')

        # Update customer ID in gomus_customer
        # and gomus_to_customer_mapping
        customer_id = hash_id(customer_email)
        old_customer = self.db_connector.query(
            query=f'SELECT customer_id FROM gomus_to_customer_mapping '
            f'WHERE gomus_id = {gomus_id}',
            only_first=True)
        if not old_customer:
            logger.warning(
                "Cannot update the email address of a customer that is not "
                "in the database.\nSkipping ...")
            return
        old_customer_id = old_customer[0]

        logger.info(f"Replacing old customer ID {old_customer_id} "
                    f"with new customer ID {customer_id}")

        # References are updated through foreign key
        # references via ON UPDATE CASCADE
        self.db_connector.execute(f'''
            UPDATE gomus_customer
            SET customer_id = {customer_id}
            WHERE customer_id = {old_customer_id}
        ''')
Example #5
    def get_thumbnail_uri(self, permalink):
        url = self.get_thumbnail_url(permalink)
        if not url:
            return None

        permalink_match = regex.search(
            r'instagram\.com/(?P<type>p|tv)/(?P<id>[\w-]+)/', permalink)
        if permalink_match['type'] != 'p':
            # TODO: Support IGTV thumbnails as well. See #395 (comment 20498).
            logger.info(f"Skipping unsupported media type for post {url}")
            return self.empty_data_uri
        short_id = permalink_match['id']

        loader = self.create_instaloader(
            quiet=True,
            download_videos=False,
            download_geotags=False,
            download_comments=False,
            save_metadata=False
        )

        directory = f'{self.output_dir}/instagram/thumbnails'
        filepath = f'{directory}/{short_id}'
        ext = 'jpg'
        url += f'&ext={ext}'
        os.makedirs(directory, exist_ok=True)
        try:
            loader.download_pic(filepath, url, dt.datetime.now())
        except instaloader.exceptions.ConnectionException as error:
            if "404 when accessing" not in str(error):
                raise
            # Return a truthy value instead of None to avoid redundant
            # retries upon every later execution of the task.
            return self.empty_data_uri

        filepath += f'.{ext}'

        # Current width of downloaded thumbnails is 320 px. If this is
        # changed, we might want to resize it here.
        with open(filepath, 'rb') as data_file:
            data = base64.b64encode(data_file.read())
        return f'data:image/jpeg;base64,{data.decode()}'
Example #6
    def run(self) -> None:

        logger.info("loading credentials...")
        credentials = self.load_credentials()
        try:
            logger.info("creating service...")
            service = self.load_service(credentials)
            logger.info("fetching reviews...")
            raw_reviews = list(self.fetch_raw_reviews(service))
        except googleapiclient.errors.HttpError as error:
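            # Only swallow generic errors that carry no HTTP status code;
            # anything with a concrete status is re-raised.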
            if error.resp.status is not None:
                raise
            logger.error("Generic HTTPError raised by Google Maps. Aborting. "
                         "If you see this error message frequently, consider "
                         "to do something against it.")
            raw_reviews = []
        logger.info("extracting reviews...")
        reviews_df = self.extract_reviews(raw_reviews)
        logger.info("success! writing...")

        with self.output().open('w') as output_file:
            reviews_df.to_csv(output_file, index=False)
Example #7
    def run(self):
        access_token = os.getenv('FB_ACCESS_TOKEN')
        if not access_token:
            raise EnvironmentError("FB Access token is not set")

        with self.input().open('r') as facts_file:
            facts = json.load(facts_file)
        page_id = facts['ids']['instagram']['pageId']

        all_media = []

        fields = ','.join(self.columns.keys())
        # use limit=100 to keep amount of requests small
        # 100 is the maximum value the Graph API will accept
        limit = 100

        media_url = (f'{API_BASE}/{page_id}/media'
                     f'?fields={fields}&limit={limit}')

        response = try_request_multiple_times(media_url)
        response_json = response.json()

        current_count = len(response_json['data'])
        all_media.extend(response_json['data'])

        logger.info("Fetching Instagram posts ...")
        while 'next' in response_json['paging']:
            next_url = response_json['paging']['next']
            response = try_request_multiple_times(next_url)
            response_json = response.json()

            current_count += len(response_json['data'])
            if sys.stdout.isatty():
                print(
                    f"\rFetched {current_count} Instagram posts",
                    end='',
                    flush=True)
            for media in response_json['data']:
                all_media.append(media)

            if self.minimal_mode:
                logger.info("Running in minimal mode, stopping now")
                response_json['paging'].pop('next')

        if sys.stdout.isatty():
            print()  # have to manually print newline
        logger.info("Fetching of Instagram posts complete")

        df = pd.DataFrame([
            {
                column: adapter(media[column])
                for (column, adapter)
                in self.columns.items()
            }
            for media
            in all_media
        ])
        with self.output().open('w') as output_file:
            df.to_csv(output_file, index=False, header=True)
Example #8
    def fetch_posts(self, page_id):

        limit = 100
        url = f'{API_BASE}/{page_id}/published_posts?limit={limit}'

        response = try_request_multiple_times(url)
        response_content = response.json()
        yield from response_content['data']

        i = 1
        while 'next' in response_content['paging']:
            logger.info(f"Fetched approx. {i * limit} Facebook posts")
            i += 1
            url = response_content['paging']['next']

            response = try_request_multiple_times(url)
            response_content = response.json()
            yield from response_content['data']

            if self.minimal_mode:
                response_content['paging'].pop('next')

        logger.info("Fetching of facebook posts completed")
Example #9
def request_report(report_type, session_id):
    """Download a generated report from the Gomus servers."""
    base_url = 'https://barberini.gomus.de'
    report_parts = report_type.split("_")
    report_id = REPORT_IDS[report_type]

    logger.info(f"Working with report '{report_parts[0]}.xlsx'")

    if report_id > 0:
        # Work with the kind of report that is generated and maintained
        logger.info("Fetching report")
        url = f'{base_url}/admin/reports/{report_id}.xlsx'
    else:
        # Work with the kind of report that is requested directly
        logger.info("Directly downloading report")
        timespan = report_parts[1] if len(report_parts) >= 2 else ''
        url = direct_download_url(base_url, report_parts[0], timespan)

    return get_request(url, session_id)
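A minimal usage sketch; the 'customers_7days' report type and the way the session ID is obtained are hypothetical placeholders, and the report type is assumed to be a key of REPORT_IDS:

import os

session_id = os.environ['GOMUS_SESS_ID']  # hypothetical variable name
report = request_report('customers_7days', session_id)
with open('customers.xlsx', 'wb') as report_file:
    report_file.write(report)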
Example #10
import logging
import os
import subprocess as sp
import sys

from _utils import db_connector, logger
CONNECTOR = db_connector()

logging.basicConfig(level=logging.INFO)

REFERENCING_TABLES = ['fb_post_comment', 'fb_post_performance']

# Are there any existing data to preserve?
if not any(
        CONNECTOR.exists(f'SELECT * FROM {table}')
        for table in REFERENCING_TABLES):
    # Nothing to preserve, get into the fast lane
    logger.info("Truncating fb_post in the fast line")
    CONNECTOR.execute('''
        TRUNCATE TABLE fb_post CASCADE
    ''')
    sys.exit(0)

# Otherwise, to keep existing data from referencing tables, we will need to do
# some SQL acrobatics below.
logger.info("Truncating fb_post in the long line")

try:
    with CONNECTOR._create_connection() as conn:
        with conn.cursor() as cur:

            # 1. Decouple performance table from post table
            logger.info("Dropping constraints")
Example #11
    def fetch_comments(self, df):

        invalid_count = 0

        # Handle each post
        for i in df.index:
            page_id, post_id = df['page_id'][i], df['post_id'][i]
            fb_post_id = f'{page_id}_{post_id}'
            post_date = self.post_date(df, i)
            if post_date < self.minimum_relevant_date:
                continue

            # Grab up to 100 comments for the post (maximum)
            limit = 100

            # 'toplevel' or 'stream' (toplevel doesn't include replies)
            # Using 'toplevel' here allows us to safely
            # set parent to None for all comments returned
            # by the first query
            filt = 'toplevel'

            # 'chronological' or 'reverse_chronological'
            order = 'chronological'

            fields = ','.join(
                ['id', 'created_time', 'comment_count', 'message', 'comments'])

            url = (f'{API_BASE}/{fb_post_id}/comments?limit={limit}'
                   f'&filter={filt}&order={order}&fields={fields}')

            response = try_request_multiple_times(url)
            if response.status_code == 400:
                invalid_count += 1
                continue
            response_data = response.json().get('data')

            logger.info(f"Fetched {len(response_data)} "
                        f"comments for post {post_id}")

            # Handle each comment for the post
            for comment in response_data:
                comment_id = comment.get('id').split('_')[1]

                yield {
                    'post_id': str(post_id),
                    'comment_id': str(comment_id),
                    'page_id': str(page_id),
                    'post_date': comment.get('created_time'),
                    'text': comment.get('message'),
                    'is_from_museum': self.is_from_museum(comment),
                    'response_to': None
                }

                if not comment.get('comment_count'):
                    continue
                try:
                    # Handle each reply for the comment
                    for reply in comment['comments']['data']:
                        yield {
                            'comment_id': reply.get('id').split('_')[1],
                            'page_id': str(page_id),
                            'post_id': str(post_id),
                            'post_date': reply.get('created_time'),
                            'text': reply.get('message'),
                            'is_from_museum': self.is_from_museum(reply),
                            'response_to': str(comment_id)
                        }
                except KeyError:
                    # Sometimes, replies become unavailable. In this case,
                    # the Graph API returns the true 'comment_count',
                    # but does not provide a 'comments' field.
                    logger.warning(f"Failed to retrieve replies for comment "
                                   f"{comment.get('id')}")

        if invalid_count:
            logger.warning(f"Skipped {invalid_count} posts")
Example #12
    def run(self):

        reviews = self.fetch_all()
        logger.info("storing results")
        with self.output().open('w') as output_file:
            reviews.to_csv(output_file, index=False, header=True)