class Loader(Thread, Logger, Store):

    def __init__(self, name, root, config, subreddit):
        Thread.__init__(self, name=name)
        Logger.__init__(self, name=name, context=f'r/{subreddit}', plain=False)
        Store.__init__(self, name=name, root=root, config=config, subreddit=subreddit)

        # thread events
        self.runevent = Event()
        self.stopevent = Event()

        # time helpers
        self.time = Sleep(10, immediate=False)

    def running(self):
        return self.runevent.is_set()

    def stopped(self):
        return self.stopevent.is_set()

    def alive(self):
        return self.is_alive()

    def run(self):
        raise NotImplementedError()

    def stop(self, timeout=None):
        self.stopevent.set()
        self.time.wake()
        while self.running():
            Sleep(0.1)
        if self.alive():
            self.join(timeout)
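# For orientation, a minimal sketch of a concrete loader built on this base
# class. The Example class and its work loop are hypothetical; the real
# loaders (Pushshift, Crawler, Praw) follow the same event protocol but do
# more work per cycle. It assumes, as the calls above suggest, that
# constructing Sleep(n) with default arguments blocks for n seconds.
class Example(Loader):

    def run(self):
        # mark the run loop as active so stop() can wait for it to wind down
        self.runevent.set()
        try:
            while not self.stopped():
                # ... fetch and persist one batch here (hypothetical work) ...
                Sleep(1)  # assumption: blocks for one second on construction
        finally:
            # let stop() return once the loop has exited
            self.runevent.clear()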
def fetch(self, file_type, ids):
    try:
        data = []

        # chunk ids into batches of size 100
        self.log(f'download {len(ids)} {file_type}s')
        batches = [ids[i:i + 100] for i in range(0, len(ids), 100)]

        for fullnames in tqdm(batches, desc=self.text() + 'fetching', unit_scale=100):
            now = datetime.now(timezone.utc).timestamp()

            # process submissions
            if file_type == 'submission':
                # request data
                submissions = self.reddit.info(fullnames=fullnames)

                # parse submissions
                data += [[
                    str(x.id),
                    str(self.subreddit),
                    str(x.author.name if x.author else '[deleted]'),
                    int(x.created_utc),
                    int(now),
                    int(x.edited),
                    int(x.pinned),
                    int(x.archived),
                    int(x.locked),
                    int(x.selftext == '[removed]' or x.removed_by_category is not None),
                    int(x.selftext == '[deleted]'),
                    int(x.is_self),
                    int(x.is_video),
                    int(x.is_original_content),
                    str(x.title),
                    str(x.link_flair_text),
                    float(x.upvote_ratio),
                    int(x.score),
                    int(x.gilded),
                    int(x.total_awards_received),
                    int(x.num_comments),
                    int(x.num_crossposts),
                    str(x.selftext),
                    str(x.thumbnail),
                    str(x.shortlink)
                ] for x in submissions]

            # wait for next request
            Sleep(0.35)

        return data
    except Exception as e:
        self.log(f'...request error {repr(e)}, retry')
        Sleep(1)
        return []
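# Hypothetical usage of the loader above, assuming it is the Praw loader that
# the fetch() orchestrator below wires up: root and config would come from the
# surrounding setup, the subreddit and ids are made up, and
# reddit.info(fullnames=...) expects full names with the 't3_' prefix.
praw = Praw(root, config, 'python')
rows = praw.fetch('submission', ['t3_abc123', 't3_def456'])
# each row follows the column order built above:
# id, subreddit, author, created, retrieved, edited, pinned, archived, ...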
def fetch(config, subreddit):
    logger = Logger('main', 'fetch', plain=True)
    loaders = []

    try:
        # pushshift
        pushshift = Pushshift(root, config, subreddit)
        loaders.append(pushshift)

        # crawler
        crawler = Crawler(root, config, subreddit)
        loaders.append(crawler)

        # praw
        praw = Praw(root, config, subreddit)
        loaders.append(praw)

        # start loader threads
        background = False  # TODO thread implementation
        for loader in loaders:
            if background:
                loader.start()
            else:
                loader.run()

        # wait until abort
        while background:
            Sleep(1)

    except KeyboardInterrupt:
        for loader in loaders:
            loader.stop(1)
        raise KeyboardInterrupt()
    except Exception as e:
        logger.log(f'...fetch error {repr(e)}')
def fetch(self, url, file_type, data=None):
    # avoid a shared mutable default argument across calls
    data = [] if data is None else data
    try:
        now = datetime.now(timezone.utc).timestamp()

        # terminate once results reach the last run
        created = [x[3] for x in data]
        if len(created) and created[-1] <= self.last_run[file_type]:
            return [x for x in data if x[3] > self.last_run[file_type]]

        # request data
        response = requests.get(url, headers={
            'User-Agent': Env.USER_AGENT()
        }).content
        content = html.fromstring(response)

        # parse submissions
        things = content.xpath('.//div[contains(@class,"thing") and @data-fullname]')
        data += [[
            x.get('data-fullname').partition('_')[2].strip(),
            self.subreddit,
            x.get('data-author'),
            int(x.get('data-timestamp')) // 1000,
            int(now)
        ] for x in things if x.get('data-fullname').startswith('t3_')]

        # fetched data
        created = [x[3] for x in data]
        self.log(
            f'fetched {len(data)} {file_type}s after '
            f'{datetime.fromtimestamp(created[-1]).strftime("%Y-%m-%d %H:%M:%S")}'
        )

        # wait for next request
        Sleep(0.35)

        # parse next url
        url_next = content.xpath('.//a[contains(@rel,"next") and @href]/@href')
        if len(url_next):
            return self.fetch(url_next[0], file_type, data)
    except Exception as e:
        self.log(f'...request error {repr(e)}, retry')
        Sleep(1)

    return [x for x in data if x[3] > self.last_run[file_type]]
def fetch(self, url, file_type):
    try:
        # request data
        result = requests.get(url, headers={
            'User-Agent': Env.USER_AGENT()
        }).json()

        # validate result
        if 'data' not in result or not len(result['data']):
            self.log(
                f'fetched 0 {file_type}s after '
                f'{datetime.fromtimestamp(self.last_run[file_type]).strftime("%Y-%m-%d %H:%M:%S")}'
            )
            return None

        # build data
        data = []
        for x in result['data']:
            # set last run from current item
            self.last_run[file_type] = x['created_utc'] - 1

            if file_type == 'submission' and 'selftext' in x:
                # parse submissions
                data += [[
                    x['id'], self.subreddit, x['author'],
                    x['created_utc'], x['retrieved_on']
                ]]
            elif file_type == 'comment' and 'body' in x:
                # parse comments
                data += [[
                    x['parent_id'].partition('_')[2], x['id'], self.subreddit,
                    x['author'], x['created_utc'], x['retrieved_on']
                ]]

        # fetched data
        self.log(
            f'fetched {len(data)} {file_type}s after '
            f'{datetime.fromtimestamp(self.last_run[file_type]).strftime("%Y-%m-%d %H:%M:%S")}'
        )
        return data
    except Exception as e:
        self.log(f'...request error {repr(e)}, retry')
        Sleep(1)
        return []
def download(self, file_type):
    count = 0
    now = int(datetime.now(timezone.utc).timestamp())

    # set last run from now
    if self.last_run[file_type] == self.end_run[file_type]:
        self.last_run[file_type] = now
    self.log(
        f'download {file_type}s before '
        f'{datetime.fromtimestamp(self.last_run[file_type]).strftime("%Y-%m-%d %H:%M:%S")}'
    )

    # define columns
    columns = {
        'submission': ['submission', 'subreddit', 'author', 'created', 'retrieved'],
        'comment': ['submission', 'comment', 'subreddit', 'author', 'created', 'retrieved']
        # TODO fetch comments
    }[file_type]

    while True:
        # fetch data
        url = self.endpoint.format(file_type, self.subreddit,
                                   str(self.end_run[file_type]),
                                   str(self.last_run[file_type]))
        data = self.fetch(url, file_type)

        # validate data
        if data is None:
            if count == 0:
                self.log(f'exported 0 {file_type}s')
            break

        # build dataframe and sort
        df = pd.DataFrame(data, columns=columns).set_index(file_type)
        df = df.sort_values(by=['created', 'retrieved'])

        # check result
        if not df.empty:
            count += df.shape[0]

            # append data
            self.write_data(file_type, df, overwrite=False,
                            last_run=self.last_run[file_type],
                            end_run=self.end_run[file_type])
            self.log(f'exported {df.shape[0]} {file_type}s')

        # wait for next request
        Sleep(0.35)

    # set last run and end run from now
    self.last_run[file_type] = now
    if count > 0:
        self.end_run[file_type] = now

    # update state
    self.write_meta(file_type,
                    last_run=self.last_run[file_type],
                    end_run=self.end_run[file_type])
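# The endpoint template itself is supplied elsewhere (via the config / Store
# setup, not shown here). Given the positional format() call in download(), it
# needs four placeholders in the order file_type, subreddit, after, before.
# The value below is only an assumption modelled on the historical Pushshift
# search API, not the project's actual configuration:
endpoint = ('https://api.pushshift.io/reddit/search/{}/'
            '?subreddit={}&after={}&before={}&sort=desc&size=100')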
        config = json.load(f)

    # kaggle client
    kaggle = Kaggle(config=os.path.join('config', 'kaggle.json'))

    # start background tasks
    while not terminated:
        for subreddit in args.subreddits:
            # fetch data
            fetch(config, subreddit)

            # pause requests
            if args.pause:
                logger.log(f'\n{"-"*45}{"PAUSING":^15}{"-"*45}\n')
                Sleep(args.pause)
            else:
                logger.log(f'\n{"-"*105}\n')

            # check termination
            if terminated:
                break
        else:
            # publish data
            publish(args.publish, kaggle)
except KeyboardInterrupt as e:
    logger.log('...aborted')
except Exception as e:
    logger.log(f'...error {repr(e)}')
finally: