def update_top_image_reposts(uowm: UnitOfWorkManager, reddit: Reddit) -> None:
    days = [1, 7, 30, 365]
    with uowm.start() as uow:
        uow.session.execute('TRUNCATE `stats_top_image_repost`')
        for day in days:
            result = uow.session.execute(
                'SELECT repost_of, COUNT(*) c FROM image_reposts '
                'WHERE detected_at > NOW() - INTERVAL :days DAY '
                'GROUP BY repost_of HAVING c > 1 ORDER BY c DESC LIMIT 2000',
                {'days': day})
            for chunk in chunk_list(result.fetchall(), 100):
                reddit_ids_to_lookup = []
                for post in chunk:
                    existing = uow.stats_top_image_repost.get_by_post_id_and_days(post[0], day)
                    if existing:
                        existing.repost_count = post[1]
                        continue
                    reddit_ids_to_lookup.append(f't3_{post[0]}')
                for submission in reddit.info(reddit_ids_to_lookup):
                    # default to None so a missing id is skipped instead of raising StopIteration
                    count_data = next((x for x in chunk if x[0] == submission.id), None)
                    if not count_data:
                        continue
                    uow.stats_top_image_repost.add(
                        StatsTopImageRepost(post_id=count_data[0],
                                            repost_count=count_data[1],
                                            days=day,
                                            nsfw=submission.over_18))
                uow.commit()
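
# chunk_list is used above but not defined in this excerpt. A minimal sketch of
# the helper as it is called here (splitting a result set into batches of 100,
# the most fullnames reddit.info() accepts per request) might look like this;
# the generator form is an assumption, not necessarily the original helper:
def chunk_list(items, chunk_size):
    """Yield successive chunk_size-sized slices of items."""
    for i in range(0, len(items), chunk_size):
        yield items[i:i + chunk_size]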
def filter_removed_posts(reddit: Reddit, matches: List[SearchMatch]) -> List[SearchMatch]:
    """
    Take a list of SearchMatches, fetch the corresponding submissions from Reddit,
    and drop any that have been removed
    :param reddit: Praw Reddit instance
    :param matches: List of matches
    :return: List of filtered matches
    """
    if not matches:
        return matches
    if len(matches) > 100:
        log.info('Skipping removed post check due to > 100 matches (%s)', len(matches))
        return matches
    post_ids = [f't3_{match.post.post_id}' for match in matches]
    submissions = reddit.info(post_ids)
    for sub in submissions:
        if sub.__dict__.get('removed', None):
            log.debug('Removed Post Filter Reject - %s', sub.id)
            del matches[next(i for i, x in enumerate(matches) if x.post.post_id == sub.id)]
    return matches
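
# `log` above is referenced but never defined in this excerpt; a minimal
# stand-in (an assumption, the original project likely configures logging
# elsewhere) would be:
import logging

log = logging.getLogger(__name__)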
def defuzzed_submissions_scores(connection: Reddit, submissions: List[Submission],
                                iterations: int) -> Mapping[Submission, List[int]]:
    """"De-fuzzes" multiple submissions' scores by batch-requesting each score
    from Reddit multiple times, returning the sampled scores for each so the
    caller can average them."""
    def t3_(id: str) -> str:
        if id.startswith('t3_'):
            return id
        else:
            return f't3_{id}'

    # scores is a dict mapping submission fullnames to lists of sampled scores
    ids = [t3_(submission.id) for submission in submissions]
    scores = {i: list() for i in ids}
    for _ in range(iterations):
        for submission in connection.info(ids):
            scores[t3_(submission.id)].append(submission.score)
    # map fullnames back to the given Submission objects so callers get their own instances
    idmap = {t3_(submission.id): submission for submission in submissions}
    return {idmap[i]: scores[i] for i in ids}
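
# A usage sketch (not part of the original): reducing the de-fuzzed samples to
# an average score per submission. The helper name and the default iteration
# count of 5 are assumptions for illustration.
from typing import List, Mapping

from praw import Reddit
from praw.models import Submission


def average_defuzzed_scores(connection: Reddit, submissions: List[Submission],
                            iterations: int = 5) -> Mapping[Submission, float]:
    samples = defuzzed_submissions_scores(connection, submissions, iterations)
    # skip submissions with no samples (e.g. ids missing from the info() response)
    return {sub: sum(s) / len(s) for sub, s in samples.items() if s}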
def get_request_params(client_id, redirect_uri, thing):
    scopes = ["*"]
    reddit = Reddit(
        client_id=client_id,
        client_secret=None,
        redirect_uri=redirect_uri,
        user_agent="Award fetcher by u/Lil_SpazJoekp",
    )
    state = str(random.randint(0, 65000))
    url = reddit.auth.url(scopes, state, "temporary")
    print(f"Open this url in your browser: {url}")
    sys.stdout.flush()

    client = receive_connection()
    data = client.recv(1024).decode("utf-8")
    param_tokens = data.split(" ", 2)[1].split("?", 1)[1].split("&")
    params = {
        key: value for (key, value) in [token.split("=") for token in param_tokens]
    }
    if state != params["state"]:
        send_message(
            client,
            f"State mismatch. Expected: {state} Received: {params['state']}",
        )
        return
    elif "error" in params:
        send_message(client, params["error"])
        return
    reddit.auth.authorize(params["code"])
    thing = list(reddit.info([thing]))[0]
    subreddit = thing.subreddit_id
    return reddit._authorized_core._authorizer.access_token, thing.fullname, subreddit
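
# receive_connection() and send_message() are not defined in this excerpt. The
# sketch below, adapted from the socket-based flow in PRAW's OAuth example, is
# one possible implementation consistent with how they are called above; the
# localhost:8080 address assumes the redirect_uri points there.
import socket


def receive_connection():
    """Wait for and return a single client connection on localhost:8080."""
    server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    server.bind(("localhost", 8080))
    server.listen(1)
    client = server.accept()[0]
    server.close()
    return client


def send_message(client, message):
    """Send a bare HTTP response to the browser and close the connection."""
    print(message)
    client.send(f"HTTP/1.1 200 OK\r\n\r\n{message}".encode("utf-8"))
    client.close()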
class Praw(Loader):
    def __init__(self, root, config, subreddit):
        Loader.__init__(self, 'praw', root, config, subreddit)
        self.endpoint = {
            'user_agent': Env.USER_AGENT(),
            'client_id': Env.REDDIT_CLIENT_ID(),
            'client_secret': Env.REDDIT_CLIENT_SECRET()
        }
        self.reddit = Reddit(**self.endpoint)

        # config parameters
        self.types = self.config['praw']['types']
        self.periode = self.config['praw']['periode']
        self.retrospect_time = self.config['praw']['retrospect_time']

        # initial run variables
        self.last_run = {}
        for file_type in self.types:
            self.last_run[file_type] = 0

        # saved run variables
        for file_type in self.types:
            meta = self.read_meta(file_type)
            if 'last_run' in meta:
                self.last_run[file_type] = meta['last_run']

    def run(self):
        self.runevent.set()
        try:
            # download reddit data
            while not self.stopped():
                stores = [
                    Store('crawler', self.root, self.config, self.subreddit),
                    Store('pushshift', self.root, self.config, self.subreddit)
                ]
                for file_type in self.types:
                    self.download(file_type, stores)

                # periodic run
                if self.alive():
                    self.log(f'sleep for {self.periode} seconds')
                    self.time.sleep(self.periode)
                else:
                    break
        except KeyboardInterrupt:
            self.runevent.clear()
            raise
        except Exception as e:
            self.log(f'...run error {repr(e)}')
            self.runevent.clear()

    def download(self, file_type, stores):
        now = int(datetime.now(timezone.utc).timestamp())

        # set last run from now
        self.last_run[file_type] = now

        # define columns
        columns = {
            'submission': [
                'submission', 'subreddit', 'author', 'created', 'retrieved',
                'edited', 'pinned', 'archived', 'locked', 'removed', 'deleted',
                'is_self', 'is_video', 'is_original_content', 'title',
                'link_flair_text', 'upvote_ratio', 'score', 'gilded',
                'total_awards_received', 'num_comments', 'num_crossposts',
                'selftext', 'thumbnail', 'shortlink'
            ],
            'comment': [
                'submission', 'subreddit', 'comment', 'author', 'created',
                'retrieved'
                # TODO fetch comments
            ]
        }[file_type]

        # read existing data
        df = self.read_data(file_type)
        if df.empty:
            df = pd.DataFrame(columns=columns).set_index(file_type)
        df = df.sort_values(by=['created', 'retrieved'])

        # load metadata
        idxs = list(df.index)
        for store in stores:
            df_store = store.read_data(file_type)

            # validate dataset
            if df_store.empty:
                continue
            df_store = df_store.sort_values(by=['created', 'retrieved'])

            # obtain existing items
            df_store_existing = df_store[df_store.index.isin(idxs)]
            df_store_existing = df_store_existing.sort_values(
                by=['created', 'retrieved'])

            # update last x hours based on retrospect time sliding window
            last_time = df_store.iloc[0]['created'] if df_store_existing.empty \
                else df_store_existing.iloc[-1]['created']
            update_time = last_time - (60 * 60 * self.retrospect_time)
            self.log(
                f'update data after {datetime.fromtimestamp(update_time)} from {store.name}'
            )

            # obtain fetch ids
            prefix = {'submission': 't3_', 'comment': 't1_'}
            df_store_update = df_store[df_store['created'] >= update_time]
            ids = list(prefix[file_type] + df_store_update.index)

            # process submissions
            if file_type == 'submission':
                # fetch data
                data = self.fetch(file_type, ids)

                # update submission data
                df_update = pd.DataFrame(data, columns=columns).set_index(file_type)
                df = df.combine_first(df_update)
                df.update(df_update)

                # updated data
                self.log(f'updated {df_update.shape[0]} {file_type}s')

        # convert datatypes
        df = df.convert_dtypes()
        df = df.sort_values(by=['created', 'retrieved'])

        # write data
        self.write_data(file_type, df, overwrite=True, last_run=self.last_run[file_type])
        self.log(f'exported {df.shape[0]} {file_type}s')

        # export data
        file_path = os.path.join(self.root, 'data', 'export', self.subreddit,
                                 f'{file_type}.csv')
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        df.to_csv(file_path,
                  header=True,
                  index=True,
                  doublequote=True,
                  quoting=csv.QUOTE_NONNUMERIC,
                  sep=',',
                  encoding='utf-8')

    def fetch(self, file_type, ids):
        try:
            data = []

            # chunk ids into batches of size 100, the maximum reddit.info() accepts
            self.log(f'download {len(ids)} {file_type}s')
            batches = [ids[i:i + 100] for i in range(0, len(ids), 100)]
            for fullnames in tqdm(batches, desc=self.text() + 'fetching', unit_scale=100):
                now = datetime.now(timezone.utc).timestamp()

                # process submissions
                if file_type == 'submission':
                    # request data
                    submissions = self.reddit.info(fullnames=fullnames)

                    # parse submissions
                    data += [[
                        str(x.id),
                        str(self.subreddit),
                        str(x.author.name if x.author else '[deleted]'),
                        int(x.created_utc),
                        int(now),
                        int(x.edited),
                        int(x.pinned),
                        int(x.archived),
                        int(x.locked),
                        int(x.selftext == '[removed]' or x.removed_by_category is not None),
                        int(x.selftext == '[deleted]'),
                        int(x.is_self),
                        int(x.is_video),
                        int(x.is_original_content),
                        str(x.title),
                        str(x.link_flair_text),
                        float(x.upvote_ratio),
                        int(x.score),
                        int(x.gilded),
                        int(x.total_awards_received),
                        int(x.num_comments),
                        int(x.num_crossposts),
                        str(x.selftext),
                        str(x.thumbnail),
                        str(x.shortlink)
                    ] for x in submissions]

                # wait for next request
                Sleep(0.35)
            return data
        except Exception as e:
            self.log(f'...request error {repr(e)}, retry')
            Sleep(1)
            return []
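
# Sleep() is a project helper not shown in this excerpt; a minimal stand-in
# matching how it is called above (blocking for a number of seconds between
# API requests) could be the following. This is an assumption; the original
# helper may do more, e.g. honor a stop event.
import time


def Sleep(seconds):
    """Block for the given number of seconds between API requests."""
    time.sleep(seconds)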