def fetch_for_country(self, country_code):
    with self.input().open('r') as facts_file:
        facts = json.load(facts_file)
        app_id = facts['ids']['apple']['appId']
    url = (f'https://itunes.apple.com/{country_code}/rss/customerreviews/'
           f'page=1/id={app_id}/sortby=mostrecent/xml')
    data_list = []

    while url:
        try:
            data, url = self.fetch_page(url)
            data_list += data
        except requests.exceptions.HTTPError as error:
            if error.response is not None and (
                    error.response.status_code == 503
                    or (error.response.status_code in {403, 404}
                        and country_code not in {'DE', 'US', 'GB'})):
                logger.error(f"Encountered {error.response.status_code} "
                             f"server error '{error}' for country code "
                             f"'{country_code}'")
                logger.error("Continuing anyway...")
                break
            else:
                raise

    if not data_list:  # no reviews for the given country code
        logger.debug(f"Empty data for country {country_code}")

    result = pd.DataFrame(data_list)
    result['country_code'] = country_code
    result.insert(0, 'app_id', app_id)
    return result
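# The fetch_page helper used above is not shown here. Below is a minimal,
# hypothetical sketch of what such a helper might look like, assuming the
# iTunes customer-reviews feed is an Atom XML document fetched via requests;
# the tag names, fields, and the rel="next" pagination link are assumptions,
# not the project's actual implementation.
import requests
from xml.etree import ElementTree

ATOM_NS = '{http://www.w3.org/2005/Atom}'


def fetch_page_sketch(url):
    """Return (list of review dicts, URL of the next page or None)."""
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    root = ElementTree.fromstring(response.content)
    # One dict per review entry (fields trimmed for brevity)
    entries = [
        {
            'appstore_review_id': entry.findtext(f'{ATOM_NS}id'),
            'title': entry.findtext(f'{ATOM_NS}title'),
            'text': entry.findtext(f'{ATOM_NS}content'),
        }
        for entry in root.findall(f'{ATOM_NS}entry')
    ]
    # Follow the rel="next" link for pagination, if present
    next_url = next(
        (link.get('href') for link in root.findall(f'{ATOM_NS}link')
         if link.get('rel') == 'next'),
        None)
    return entries, next_url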
def fetch_all(self):
    data = []
    country_codes = sorted(self.get_country_codes())
    if self.minimal_mode:
        random_num = random.randint(0, len(country_codes) - 2)  # nosec
        country_codes = country_codes[random_num:random_num + 2]
        country_codes.append('CA')

    tbar = self.tqdm(country_codes, desc="Fetching appstore reviews")
    for country_code in tbar:
        tbar.set_description(f"Fetching appstore reviews ({country_code})")
        try:
            data_for_country = self.fetch_for_country(country_code)
            if not data_for_country.empty:
                data.append(data_for_country)
            logger.debug(f'Fetching appstore reviews for {country_code}')
        except requests.HTTPError as error:
            if error.response.status_code == 400:
                # not all countries are available
                pass
            else:
                raise

    try:
        ret = pd.concat(data)
    except ValueError:
        # pd.concat raises ValueError for an empty list; fall back to an
        # empty frame that still carries the columns drop_duplicates needs
        ret = pd.DataFrame(columns=['app_id', 'appstore_review_id'])
    return ret.drop_duplicates(subset=['app_id', 'appstore_review_id'])
def guess_language(self):
    try:
        return langdetect.detect(self.text)
    except langdetect.lang_detect_exception.LangDetectException as e:
        # langdetect cannot handle emoji-only and link-only texts
        logger.debug(f'langdetect failed for one Doc. Error: {e}')
        logger.debug(f'Failure happened for Doc {self.to_dict()}')
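# Minimal illustration of the failure mode handled above: langdetect raises a
# LangDetectException for texts without detectable language features, such as
# an emoji-only string (the sample text is made up).
import langdetect

try:
    langdetect.detect('😊🎉')
except langdetect.lang_detect_exception.LangDetectException as e:
    print(f"langdetect failed: {e}")  # e.g. "No features in text."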
def extract_capacities(self, html_path):
    with open(html_path) as file:
        src = file.read()
    dom: html.HtmlElement = html.fromstring(src)

    quota_id, min_date = self.extract_header(dom)
    logger.debug("Scraping capacities from quota_id=%s for min_date=%s",
                 quota_id, min_date)
    capacities = self.create_zero_data(min_date)

    def load_data(data):
        return pd.DataFrame(
            data,
            columns=[*capacities.index.names, *capacities.columns],
            dtype=object
        ).set_index(capacities.index.names)

    basic_capacities = load_data(self.extract_basic_capacities(dom))
    capacities.update(basic_capacities)
    detailed_capacities = load_data(
        self.extract_detailed_capacities(src, min_date))
    capacities.update(detailed_capacities)

    capacities = capacities.reset_index()
    capacities.insert(0, 'quota_id', quota_id)
    return capacities
def fetch_tweets(self, query, start_date, limit):
    """All searches are limited to German tweets (twitter lang code de)."""
    logger.debug(
        f"Querying Tweets. term \"{query}\", "
        f"limit: {limit}, start_date: {start_date}"
    )
    tweets = []  # tweets go in this list

    # set config options for twint
    c = twint.Config()
    c.Limit = limit
    c.Search = query
    c.Store_object = True
    c.Since = f'{start_date} 00:00:00'
    c.Lang = 'de'
    c.Hide_output = True
    c.Store_object_tweets_list = tweets

    # execute the twitter search
    twint.run.Search(c)

    # create dataframe from search results
    tweets_df = pd.DataFrame([
        {
            'term': query,
            'user_id': t.user_id,
            'tweet_id': t.id,
            'text': t.tweet,
            'response_to': '',
            'post_date': t.datestamp,
            'permalink': t.link,
            'likes': t.likes_count,
            'retweets': t.retweets_count,
            'replies': t.replies_count
        }
        for t in tweets
    ])

    # insert space before links to match hashtags correctly
    if not tweets_df.empty:
        tweets_df['text'] = tweets_df['text'] \
            .str.replace('pic.', ' pic.', regex=False) \
            .str.replace('https', ' https', regex=False) \
            .str.replace('http', ' http', regex=False)
    return tweets_df
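# Small demonstration of the link-spacing fix applied above: a word glued to a
# following "pic.twitter.com" link gets a separating space so hashtag matching
# works on the text (the sample text is made up).
import pandas as pd

sample = pd.DataFrame({'text': ['Besuch im Museumpic.twitter.com/xyz']})
sample['text'] = sample['text'] \
    .str.replace('pic.', ' pic.', regex=False) \
    .str.replace('https', ' https', regex=False) \
    .str.replace('http', ' http', regex=False)
print(sample['text'].tolist())
# ['Besuch im Museum pic.twitter.com/xyz']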
def main():  # noqa: D103
    for table in PERFORMANCE_TABLES:
        condenser = PerformanceValueCondenser(CONNECTOR, table)

        key_columns = condenser.get_key_columns()
        performance_columns = condenser.get_performance_columns(key_columns)
        data, header = CONNECTOR.query_with_header(f'SELECT * FROM {table}')
        df = pd.DataFrame(data, columns=header)

        # Special treatment because of multi-column key
        # (pandas unique only works on series -> 1d)
        if table == 'fb_post_performance':
            df.drop(columns='page_id', inplace=True)
            key_columns.remove('page_id')
        key_column = key_columns[0]

        before = len(df)
        to_drop = []
        unique_ids = df[key_column].unique()

        logger.debug("Condensing performance table: %s", table)
        logger.debug(f"Processing {len(unique_ids)} unique ids")
        logger.debug("Before: %s", before)
        for unique_id in unique_ids:
            ordered_entries = df.loc[df[key_column] == unique_id] \
                .sort_values(by=TIMESTAMP_COLUMN, axis='index',
                             ascending=True)
            prev_row = None
            for i, row in ordered_entries.iterrows():
                if prev_row is None:  # could be 0
                    prev_row = row
                    continue
                # if current and previous entries are equal,
                # flag current entry for deletion
                if row[performance_columns] \
                        .equals(prev_row[performance_columns]):
                    to_drop.append(i)
                prev_row = row
        logger.debug("After: %s", before - len(to_drop))

        to_drop_df = df[df.index.isin(to_drop)]
        # Note this could be optimized by using
        # cursor.copy_from and a temporary table.
        queries = []
        for _, row in to_drop_df.iterrows():
            queries.append(f'''
                DELETE FROM {table}
                WHERE {key_column} = '{row[key_column]}'
                AND {TIMESTAMP_COLUMN} = '{row[TIMESTAMP_COLUMN]}'
            ''')
        if queries:
            CONNECTOR.execute(*queries)
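# Design note: the DELETE statements above interpolate row values directly into
# SQL strings. A hedged sketch of a parameterized alternative is shown below,
# assuming direct access to a psycopg2-style connection; this helper and its
# `connection` parameter are assumptions, not part of CONNECTOR's actual API.
# Table and column names still have to come from trusted constants.
def delete_rows_parameterized(connection, table, key_column,
                              timestamp_column, rows):
    """Delete (key, timestamp) pairs from `table` using bound parameters."""
    delete_sql = (
        f'DELETE FROM {table} '
        f'WHERE {key_column} = %s AND {timestamp_column} = %s'
    )
    with connection.cursor() as cursor:
        cursor.executemany(
            delete_sql,
            [(row[key_column], row[timestamp_column])
             for _, row in rows.iterrows()])
    connection.commit()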
def run(self):
    # Approach: Sequentially fetch all quota IDs, ignoring missing ones.
    # Stop when more than max_missing_ids consecutive IDs were invalid.
    quota_id = last_confirmed_id = 0
    with self.output().open('w') as output:
        print('file_path', file=output)
        while quota_id - last_confirmed_id <= self.max_missing_ids:
            quota_id += 1
            if self.minimal_mode:
                # advance in steps of 4 instead of 1
                quota_id += -1 + 4
            html = yield FetchGomusHTML(
                url=f'/admin/quotas/{quota_id}',
                ignored_status_codes=[404])
            if html.has_error():
                logger.debug(f"Skipping invalid quota_id={quota_id}")
                continue
            last_confirmed_id = quota_id
            print(html.path, file=output)
def run(self):
    current_timestamp = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    performances = []
    with self.input().open('r') as csv_in:
        df = pd.read_csv(csv_in)

    if self.minimal_mode:
        df = df.head(5)

    invalid_count = 0
    pbar = self.tqdm(
        df.index, desc="Fetching performance data for facebook posts")
    for index in pbar:
        page_id, post_id = \
            str(df['page_id'][index]), str(df['post_id'][index])
        fb_post_id = f'{page_id}_{post_id}'
        post_date = self.post_date(df, index)
        if post_date < self.minimum_relevant_date:
            continue
        logger.debug(f"Loading performance data for FB post {fb_post_id}")

        metrics = ','.join([
            'post_reactions_by_type_total',
            'post_activity_by_action_type',
            'post_clicks_by_type',
            'post_negative_feedback',
            'post_impressions_paid',
            'post_impressions',
            'post_impressions_unique'  # "reach"
        ])
        url = f'{API_BASE}/{fb_post_id}/insights?metric={metrics}'

        response = try_request_multiple_times(url)
        if response.status_code == 400:
            invalid_count += 1
            continue
        response.raise_for_status()  # in case of another error

        response_content = response.json()

        post_perf = {
            'timestamp': current_timestamp,
        }

        # Reactions
        reactions = response_content['data'][0]['values'][0]['value']
        post_perf['react_like'] = int(reactions.get('like', 0))
        post_perf['react_love'] = int(reactions.get('love', 0))
        post_perf['react_wow'] = int(reactions.get('wow', 0))
        post_perf['react_haha'] = int(reactions.get('haha', 0))
        post_perf['react_sorry'] = int(reactions.get('sorry', 0))
        post_perf['react_anger'] = int(reactions.get('anger', 0))

        # Activity
        activity = response_content['data'][1]['values'][0]['value']
        post_perf['likes'] = int(activity.get('like', 0))
        post_perf['shares'] = int(activity.get('share', 0))
        post_perf['comments'] = int(activity.get('comment', 0))

        # Clicks
        clicks = response_content['data'][2]['values'][0]['value']
        post_perf['video_clicks'] = int(clicks.get('video play', 0))
        post_perf['link_clicks'] = int(clicks.get('link clicks', 0))
        post_perf['other_clicks'] = int(clicks.get('other clicks', 0))

        # negative feedback (only one field)
        post_perf['negative_feedback'] = \
            response_content['data'][3]['values'][0]['value']

        # number of times the post entered a person's screen through
        # paid distribution such as an ad
        post_perf['paid_impressions'] = \
            response_content['data'][4]['values'][0]['value']

        post_perf['post_impressions'] = \
            response_content['data'][5]['values'][0]['value']

        post_perf['post_impressions_unique'] = \
            response_content['data'][6]['values'][0]['value']

        post_perf.update(page_id=page_id, post_id=post_id)
        performances.append(post_perf)
    if invalid_count:
        logger.warning(f"Skipped {invalid_count} posts")

    df = pd.DataFrame(performances)

    # For some reason, all except the first set of performance
    # values get inserted twice into the performances list.
    # TODO: Investigate and fix the root cause; this is a workaround.
    # Is this still up to date? Could not reproduce.
    df.drop_duplicates(subset='post_id', inplace=True, ignore_index=True)

    df = self.filter_fkey_violations(df)
    df = self.condense_performance_values(df)

    with self.output().open('w') as output_file:
        df.to_csv(output_file, index=False, header=True)
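# The try_request_multiple_times helper used above is not shown here. A minimal
# sketch of what such a helper might look like, assuming a plain retry loop
# around requests.get; the function name, retry count, and pause length are
# assumptions, not the project's actual implementation.
import time

import requests


def try_request_multiple_times_sketch(url, attempts=3, pause_seconds=60):
    """Retry a GET request a few times before giving up."""
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, timeout=60)
        except requests.exceptions.RequestException:
            if attempt == attempts:
                raise
        else:
            if response.status_code < 500 or attempt == attempts:
                # client errors (e.g. 400) are handed back to the caller
                return response
        time.sleep(pause_seconds)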