def get_request(url, sess_id):
    """Request the given URL from the gomus servers and return the results."""
    cookies = dict(_session_id=sess_id)
    response = requests.get(url, cookies=cookies)
    # raise_for_status raises for any 4xx/5xx response, so reaching this
    # point implies a successful request
    response.raise_for_status()
    logger.info("HTTP request successful")
    return response.content
def run(self):
    dfs = []
    for n in range(self.n_min, self.n_max + 1):
        logger.info(f"Collecting n={n}-grams ...")
        ngram_file = yield QueryDb(query=self._build_query(n))
        with ngram_file.open('r') as ngram_stream:
            dfs.append(pd.read_csv(ngram_stream))
    ngrams = pd.concat(dfs)
    with self.output().open('w') as output_stream:
        ngrams.to_csv(output_stream, index=False, header=True)
def direct_download_url(base_url, report, timespan):
    """Generate download URL for a gomus report."""
    start_time, end_time = parse_timespan(timespan)
    base_return = base_url + f'/{report}.xlsx'
    if start_time == dt.date.min:
        return base_return

    # timespan is valid
    end_time = end_time.strftime("%Y-%m-%d")
    start_time = start_time.strftime("%Y-%m-%d")
    logger.info(f"Requesting report for timespan "
                f"from {start_time} to {end_time}")
    return f'{base_return}?end_at={end_time}&start_at={start_time}'
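
# parse_timespan is used above but not shown here. The sketch below is a
# hypothetical reconstruction of its contract only: it turns a timespan
# keyword into a (start, end) date pair and returns dt.date.min as a
# sentinel start date for an empty or unknown timespan. The keyword names
# are illustrative, not the project's actual ones.
def parse_timespan(timespan):
    today = dt.date.today()
    spans = {
        '7days': (today - dt.timedelta(days=7), today),
        '1year': (today - dt.timedelta(days=365), today),
    }
    # Unknown keywords yield the dt.date.min sentinel checked above
    return spans.get(timespan, (dt.date.min, dt.date.min))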
def fetch_updated_mail(self, booking_id):
    # This would be cleaner to put into an extra function,
    # but dynamic dependencies only work when yielded from 'run()'
    logger.info(f"Fetching new mail for booking {booking_id}")

    # First step: Get customer of booking (cannot use customer_id,
    # since it has been derived from the wrong e-mail address)
    booking_html_task = FetchGomusHTML(url=f'/admin/bookings/{booking_id}')
    yield booking_html_task
    with booking_html_task.output().open('r') as booking_html_fp:
        booking_html = html.fromstring(booking_html_fp.read())
    booking_customer = booking_html.xpath(
        '//body/div[2]/div[2]/div[3]/div[4]/div[2]'
        '/div[2]/div[2]/div[1]/div[1]/div[1]/a')[0]
    gomus_id = int(booking_customer.get('href').split('/')[-1])

    # Second step: Get current e-mail address for customer
    customer_html_task = FetchGomusHTML(url=f'/admin/customers/{gomus_id}')
    yield customer_html_task
    with customer_html_task.output().open('r') as customer_html_fp:
        customer_html = html.fromstring(customer_html_fp.read())
    customer_email = self.parse_text(
        customer_html,
        '//body/div[2]/div[2]/div[3]/div/div[2]/div[1]'
        '/div/div[3]/div/div[1]/div[1]/div/dl/dd[1]')

    # Update customer ID in gomus_customer
    # and gomus_to_customer_mapping
    customer_id = hash_id(customer_email)
    old_customer = self.db_connector.query(
        query=f'SELECT customer_id FROM gomus_to_customer_mapping '
              f'WHERE gomus_id = {gomus_id}',
        only_first=True)
    if not old_customer:
        logger.warning(
            "Cannot update email address of a customer that is not in "
            "the database. Skipping ...")
        return
    old_customer_id = old_customer[0]
    logger.info(f"Replacing old customer ID {old_customer_id} "
                f"with new customer ID {customer_id}")
    # References are updated through foreign key
    # references via ON UPDATE CASCADE
    self.db_connector.execute(f'''
        UPDATE gomus_customer
        SET customer_id = {customer_id}
        WHERE customer_id = {old_customer_id}
    ''')
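
# hash_id is assumed to map an e-mail address deterministically to an
# integer customer ID, so the same address always yields the same ID.
# This is a minimal hypothetical sketch under that assumption; the real
# helper lives elsewhere in the codebase and may differ.
import hashlib

def hash_id(text):
    digest = hashlib.md5(text.encode()).hexdigest()
    # Truncate to a range that fits a postgres integer column
    return int(digest, 16) % (2 ** 31)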
def get_thumbnail_uri(self, permalink):
    url = self.get_thumbnail_url(permalink)
    if not url:
        return None
    permalink_match = regex.search(
        r'instagram\.com/(?P<type>p|tv)/(?P<id>[\w-]+)/',
        permalink)
    if permalink_match['type'] != 'p':
        # TODO: Support IGTV thumbnails as well. See #395 (comment 20498).
        logger.info(f"Skipping unsupported media type for post {url}")
        return self.empty_data_uri
    short_id = permalink_match['id']

    loader = self.create_instaloader(
        quiet=True,
        download_videos=False,
        download_geotags=False,
        download_comments=False,
        save_metadata=False
    )
    directory = f'{self.output_dir}/instagram/thumbnails'
    filepath = f'{directory}/{short_id}'
    ext = 'jpg'
    url += f'&ext={ext}'
    os.makedirs(directory, exist_ok=True)
    try:
        loader.download_pic(filepath, url, dt.datetime.now())
    except instaloader.exceptions.ConnectionException as error:
        if "404 when accessing" not in str(error):
            raise
        # Return a truthy value instead of None to avoid redundant
        # retries upon every later execution of the task.
        return self.empty_data_uri
    filepath += f'.{ext}'

    # Current width of downloaded thumbnails is 320 px. If this is
    # changed, we might want to resize the image here.
    with open(filepath, 'rb') as data_file:
        data = base64.b64encode(data_file.read())
    return f'data:image/jpeg;base64,{data.decode()}'
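
# create_instaloader presumably just forwards its keyword arguments to the
# instaloader library; a minimal sketch under that assumption (quiet,
# download_videos etc. are real Instaloader constructor parameters):
def create_instaloader(self, **kwargs):
    return instaloader.Instaloader(**kwargs)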
def run(self) -> None:
    logger.info("loading credentials...")
    credentials = self.load_credentials()
    try:
        logger.info("creating service...")
        service = self.load_service(credentials)
        logger.info("fetching reviews...")
        raw_reviews = list(self.fetch_raw_reviews(service))
    except googleapiclient.errors.HttpError as error:
        # Re-raise errors that carry a concrete HTTP status; only generic
        # errors without one are swallowed here
        if error.resp.status is not None:
            raise
        logger.error("Generic HTTPError raised by Google Maps. Aborting. "
                     "If you see this error message frequently, consider "
                     "investigating the cause.")
        raw_reviews = []
    logger.info("extracting reviews...")
    reviews_df = self.extract_reviews(raw_reviews)
    logger.info("success! writing...")
    with self.output().open('w') as output_file:
        reviews_df.to_csv(output_file, index=False)
def run(self):
    access_token = os.getenv('FB_ACCESS_TOKEN')
    if not access_token:
        raise EnvironmentError("FB Access token is not set")

    with self.input().open('r') as facts_file:
        facts = json.load(facts_file)
    page_id = facts['ids']['instagram']['pageId']

    all_media = []
    fields = ','.join(self.columns.keys())
    # Use limit=100 to keep the number of requests small;
    # 100 is the maximum value the Graph API will accept
    limit = 100
    media_url = (f'{API_BASE}/{page_id}/media'
                 f'?fields={fields}&limit={limit}')

    logger.info("Fetching Instagram posts ...")
    response = try_request_multiple_times(media_url)
    response_json = response.json()

    current_count = len(response_json['data'])
    all_media.extend(response_json['data'])

    while 'next' in response_json['paging']:
        next_url = response_json['paging']['next']
        response = try_request_multiple_times(next_url)
        response_json = response.json()

        current_count += len(response_json['data'])
        if sys.stdout.isatty():
            print(f"\rFetched {current_count} Instagram posts",
                  end='',
                  flush=True)
        all_media.extend(response_json['data'])

        if self.minimal_mode:
            logger.info("Running in minimal mode, stopping now")
            response_json['paging'].pop('next')

    if sys.stdout.isatty():
        print()  # print the trailing newline manually
    logger.info("Fetching of Instagram posts complete")

    df = pd.DataFrame([
        {
            column: adapter(media[column])
            for (column, adapter) in self.columns.items()
        }
        for media in all_media
    ])
    with self.output().open('w') as output_file:
        df.to_csv(output_file, index=False, header=True)
def fetch_posts(self, page_id):
    limit = 100
    url = f'{API_BASE}/{page_id}/published_posts?limit={limit}'
    response = try_request_multiple_times(url)
    response_content = response.json()
    yield from response_content['data']

    i = 1
    while 'next' in response_content['paging']:
        logger.info(f"Fetched approx. {i * limit} Facebook posts")
        i += 1
        url = response_content['paging']['next']
        response = try_request_multiple_times(url)
        response_content = response.json()
        yield from response_content['data']

        if self.minimal_mode:
            response_content['paging'].pop('next')

    logger.info("Fetching of Facebook posts completed")
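
# try_request_multiple_times is used by the fetchers above but not shown
# here. The sketch below illustrates the assumed retry behavior only;
# attempt count and backoff are illustrative, not the project's actual
# values.
import time

def try_request_multiple_times(url, max_attempts=3, backoff_seconds=60):
    response = None
    for attempt in range(max_attempts):
        response = requests.get(url)
        # 4xx responses are handed back to the caller, which inspects the
        # status code itself (see fetch_comments); only 5xx responses are
        # retried here
        if response.status_code < 500:
            return response
        logger.warning(f"Request to {url} failed "
                       f"(attempt {attempt + 1}/{max_attempts})")
        time.sleep(backoff_seconds)
    response.raise_for_status()
    return response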
def request_report(report_type, session_id):
    """Download a generated report from the Gomus servers."""
    base_url = 'https://barberini.gomus.de'
    report_parts = report_type.split("_")
    report_id = REPORT_IDS[report_type]
    logger.info(f"Working with report '{report_parts[0]}.xlsx'")

    if report_id > 0:
        # Work with the kind of report that is generated and maintained
        logger.info("Fetching report")
        url = f'{base_url}/admin/reports/{report_id}.xlsx'
    else:
        # Work with the kind of report that is requested directly
        logger.info("Directly downloading report")
        timespan = report_parts[1] if len(report_parts) >= 2 else ''
        url = direct_download_url(base_url, report_parts[0], timespan)

    return get_request(url, session_id)
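
# REPORT_IDS maps a report type to the ID of a pre-generated report on the
# gomus server; types mapped to a non-positive ID are requested directly
# instead (see the branch above). The entries below are illustrative
# placeholders, not the project's real report types or IDs:
REPORT_IDS = {
    'orders_7days': 0,   # direct download with timespan '7days'
    'customers': 1234,   # pre-generated report, fetched by ID
}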
import logging
import os
import subprocess as sp
import sys

from _utils import db_connector, logger

CONNECTOR = db_connector()
logging.basicConfig(level=logging.INFO)

REFERENCING_TABLES = ['fb_post_comment', 'fb_post_performance']

# Are there any existing data to preserve?
if not any(
        CONNECTOR.exists(f'SELECT * FROM {table}')
        for table in REFERENCING_TABLES):
    # Nothing to preserve, get into the fast lane
    logger.info("Truncating fb_post in the fast lane")
    CONNECTOR.execute('''
        TRUNCATE TABLE fb_post CASCADE
    ''')
    sys.exit(0)

# Otherwise, to keep existing data from referencing tables, we will need to
# do some SQL acrobatics below.
logger.info("Truncating fb_post in the slow lane")
try:
    with CONNECTOR._create_connection() as conn:
        with conn.cursor() as cur:
            # 1. Decouple performance table from post table
            logger.info("Dropping constraints")
def fetch_comments(self, df):
    invalid_count = 0

    # Handle each post
    for i in df.index:
        page_id, post_id = df['page_id'][i], df['post_id'][i]
        fb_post_id = f'{page_id}_{post_id}'
        post_date = self.post_date(df, i)
        if post_date < self.minimum_relevant_date:
            continue

        # Grab up to 100 comments for the post (the API maximum)
        limit = 100
        # 'toplevel' or 'stream' (toplevel doesn't include replies).
        # Using 'toplevel' here allows us to safely
        # set parent to None for all comments returned
        # by the first query
        filt = 'toplevel'
        # 'chronological' or 'reverse_chronological'
        order = 'chronological'
        fields = ','.join(
            ['id', 'created_time', 'comment_count', 'message', 'comments'])

        url = (f'{API_BASE}/{fb_post_id}/comments?limit={limit}'
               f'&filter={filt}&order={order}&fields={fields}')

        response = try_request_multiple_times(url)
        if response.status_code == 400:
            invalid_count += 1
            continue
        response_data = response.json().get('data')
        logger.info(f"Fetched {len(response_data)} "
                    f"comments for post {post_id}")

        # Handle each comment for the post
        for comment in response_data:
            comment_id = comment.get('id').split('_')[1]
            yield {
                'post_id': str(post_id),
                'comment_id': str(comment_id),
                'page_id': str(page_id),
                'post_date': comment.get('created_time'),
                'text': comment.get('message'),
                'is_from_museum': self.is_from_museum(comment),
                'response_to': None
            }
            if not comment.get('comment_count'):
                continue
            try:
                # Handle each reply to the comment
                for reply in comment['comments']['data']:
                    yield {
                        'comment_id': reply.get('id').split('_')[1],
                        'page_id': str(page_id),
                        'post_id': str(post_id),
                        'post_date': reply.get('created_time'),
                        'text': reply.get('message'),
                        'is_from_museum': self.is_from_museum(reply),
                        'response_to': str(comment_id)
                    }
            except KeyError:
                # Sometimes replies become unavailable. In this case,
                # the Graph API returns the true 'comment_count',
                # but does not provide a 'comments' field
                logger.warning(f"Failed to retrieve replies for comment "
                               f"{comment.get('id')}")

    if invalid_count:
        logger.warning(f"Skipped {invalid_count} posts")
def run(self):
    reviews = self.fetch_all()
    logger.info("storing results")
    with self.output().open('w') as output_file:
        reviews.to_csv(output_file, index=False, header=True)