def __init__(self):
    self.api = psaw.PushshiftAPI()
    self._subreddit = ''
    self._limit = 10
    self._start = int(dt.datetime(2020, 1, 1).timestamp())
    self._end = 0
    self._query = ''
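# A minimal sketch (not in the original) of how the fields stored above might
# feed psaw; the search() method name is hypothetical, but the keyword
# arguments are standard psaw.PushshiftAPI.search_submissions parameters.
def search(self):
    kwargs = dict(subreddit=self._subreddit,
                  q=self._query,
                  after=self._start,
                  limit=self._limit)
    if self._end:  # treat 0 as "no upper bound"
        kwargs['before'] = self._end
    return self.api.search_submissions(**kwargs)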
def get_urls(subreddit='gifs', max_urls=10000, end_time=None, score_threshold=5):
    api = psaw.PushshiftAPI()
    if end_time is None:
        end_time = int(datetime.datetime.now().timestamp())
    query = api.search_submissions(
        before=end_time,
        subreddit=subreddit,  # was hardcoded to 'gifs', silently ignoring the parameter
        filter=['url', 'score', 'title', 'permalink', 'subreddit'],
        limit=max_urls,
        score='>%d' % score_threshold,
        is_self=False,
        over_18=False)
    seen = {}
    for i, subm in enumerate(tqdm.tqdm(query, total=max_urls)):
        url = subm.url
        if url in seen:
            continue
        seen[url] = True
        # Weird issue with psaw/pushshift that breaks score=">2",
        # so re-check the score client-side.
        if subm.score < score_threshold:
            continue
        entry = {
            'url': url,
            'score': subm.score,
            'title': subm.title,
            'permalink': subm.permalink,
            'subreddit': subm.subreddit
        }
        yield entry
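# A short consumption sketch (the output file name and json import are
# illustrative, not from the original). get_urls() is a generator, so entries
# can be streamed straight to disk without buffering everything in memory.
import json

with open('gif_urls.jsonl', 'w', encoding='utf8') as out:
    for entry in get_urls(subreddit='gifs', max_urls=1000):
        out.write(json.dumps(entry) + '\n')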
def get_comment_mapping(author: str, num_comments: int):
    fic_id_to_submissions, submissions_to_fics = defaultdict(set), defaultdict(set)
    bar = progressbar.ProgressBar(max_value=num_comments)
    api = psaw.PushshiftAPI()
    errors = []
    for i, comment in enumerate(
            api.search_comments(author=author,
                                filter=['score', 'id', 'link_id', 'body', 'permalink'],
                                limit=num_comments)):
        bar.update(i)
        for fic_name, fic_id in re.findall(comment_regex, comment.body):
            try:
                # Validate that all attributes exist (some of these will not
                # for removed/deleted submissions).
                _ = (comment.score, comment.id, comment.link_id,
                     comment.body, comment.permalink)
                f = FicComment(name=fic_name,
                               id=fic_id,
                               score=comment.score,
                               permalink=comment.permalink)
                fic_id_to_submissions[fic_id].add(comment.link_id)
                submissions_to_fics[comment.link_id].add(f)
            except Exception as e:
                errors.append(str(e))
                continue
    bar.finish()
    if len(errors) > 0:
        print("Errors:\n" + "\n".join(errors))
        print(f"{len(errors)} errors.")
    return fic_id_to_submissions, submissions_to_fics
def reddit_scr(keyword):
    '''
    This function scrapes Reddit data using psaw.
    :param keyword: str, keyword used for searching
    :return: str
    '''
    # use psaw's API
    api = psaw.PushshiftAPI()
    start_time = int(dt.datetime(2020, 1, 1).timestamp())
    output_raw = list(
        api.search_submissions(after=start_time, q=keyword, limit=100000000))
    # output = api.search_comments(after=start_time, q=keyword, limit=1)
    output = []
    for obj in output_raw:
        if obj.subreddit == 'Comcast_Xfinity':
            # convert the timestamp to a more convenient format (local time,
            # truncated to whole seconds)
            tf = dt.datetime.fromtimestamp(int(obj.created_utc))
            # combine the attributes to form an entry and append it to output
            output.append([tf, obj.subreddit, obj.title, obj.selftext])
    # form the csv file
    with open('reddit_data4.csv', 'a+', newline='') as file:
        write = csv.writer(file)
        write.writerows(output)
    return 'Done'
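# Hypothetical invocation (the keyword is illustrative): appends matching
# Comcast_Xfinity submissions since 2020-01-01 to reddit_data4.csv.
reddit_scr('outage')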
def __init__(self, secrets_manager: RedditSecretsManager):
    secrets = secrets_manager.get_secrets()
    reddit = praw.Reddit(
        user_agent="Comment Extraction (by /u/balindwalinstalin)",
        client_id=secrets["REDDIT_CLIENT_ID"],
        client_secret=secrets["REDDIT_CLIENT_SECRET"],
    )
    # Note: despite the name, self.reddit holds a PushshiftAPI client that
    # wraps the praw instance, so search results come back as praw objects.
    self.reddit = psaw.PushshiftAPI(reddit)
def get_comments(self, submission):
    comment_search = psaw.PushshiftAPI(r=self.reddit).search_comments(
        submission_id=submission.id, return_batch=True)
    for comment in comment_search:
        # Deleted accounts come back with author=None; skip them so
        # comment.author.name does not raise AttributeError.
        if comment.author is None:
            continue
        author = comment.author.name
        if author == 'AutoModerator':
            continue
        yield {
            'comment_author': author,
            'comment_body': comment.body,
            'comment_score': comment.score,
            'comment_created_utc': comment.created_utc,
            'comment_id': comment.id,
            'comment_parent_id': comment.parent_id
        }
def __init__(self, client_id, client_secret, username, password, bot_name):
    self.client_id = client_id
    self.client_secret = client_secret
    self.username = username
    self.password = password
    self.bot_name = bot_name
    self.reddit = praw.Reddit(
        client_id=self.client_id,
        client_secret=self.client_secret,
        user_agent=f"ChangeMeClient/0.1 by /u/{self.bot_name}",
        username=self.username,
        password=self.password)
    self.PS = psaw.PushshiftAPI()

    self.frame = wx.Frame(parent=None, title="Reddit", size=(325, 255))
    panel = wx.Panel(self.frame)
    subreddit_text = wx.StaticText(panel, label="Subreddit: ", pos=(20, 20))
    limit_text = wx.StaticText(panel, label="Limit: ", pos=(20, 50))
    directory_text = wx.StaticText(panel, label="Directory:", pos=(20, 80))
    self.download_text = wx.StaticText(panel, label="Waiting...", pos=(20, 140))
    self.progress_text = wx.StaticText(panel, label="0/0", pos=(20, 180))
    self.subreddit_textctrl = wx.TextCtrl(panel, pos=(100, 15))
    self.limit_textctrl = wx.TextCtrl(panel, pos=(100, 45))
    directory_button = wx.Button(panel, label="Select", pos=(100, 75))
    directory_button.Bind(wx.EVT_BUTTON, self._set_directory)
    scrape_button = wx.Button(panel, label="Scrape", pos=(100, 105))
    scrape_button.Bind(wx.EVT_BUTTON, self._start_scrape)
    self.progress_bar = wx.Gauge(panel, range=100, pos=(20, 160), size=(265, 15))
    self.directory = os.getcwd()
    self.scrape_thread = threading.Thread(target=self._scrape, daemon=True)
    self.frame.Show()
def main():
    reddit = praw.Reddit(client_id="jSTLDT5NQzi6LA",
                         client_secret="Q2VODbHrd_Zykjj0zcWi0z7M3MA",
                         password="******",
                         user_agent="markover",
                         username="******")
    api = psaw.PushshiftAPI(reddit)
    db = sqlite3.connect('r_france.sqlite')
    gen = api.search_comments(subreddit='france')
    for comment in progressbar.progressbar(gen):
        author = '' if comment.author is None else comment.author.name
        if not_in(db, comment.id):
            db.cursor().execute(
                'INSERT INTO comments VALUES(?, ?, ?, ?)',
                (comment.id, author, comment.body, comment.created_utc)).close()
            db.commit()
def init_watch_pushshift(subreddit: str, hours: int) -> str:
    """
    Initiate watch of subreddit using Pushshift, create CSV, return filename.
    """
    import psaw

    print(f"\nInitializing watch on {subreddit}")
    hours_ago = NOW.subtract(hours=hours)
    hours_ago_as_timestamp = hours_ago.int_timestamp
    print(f"fetching initial posts from {subreddit}")
    pushshift = psaw.PushshiftAPI()
    submissions = pushshift.search_submissions(
        after=hours_ago_as_timestamp,
        subreddit=subreddit,
        filter=["id", "subreddit", "author", "created_utc"],
    )
    submissions_d = collections.defaultdict(list)
    for submission in submissions:
        created_utc_human = pendulum.from_timestamp(
            submission.created_utc).format("YYYYMMDD HH:mm:ss")
        submissions_d["id"].append(submission.id)
        submissions_d["subreddit"].append(submission.subreddit)
        submissions_d["author_p"].append(submission.author)
        submissions_d["del_author_p"].append("FALSE")
        submissions_d["created_utc"].append(created_utc_human)
        submissions_d["found_utc"].append(NOW_STR)
        submissions_d["del_author_r"].append("FALSE")
        submissions_d["del_author_r_utc"].append("NA")
        submissions_d["del_text_r"].append("FALSE")
        submissions_d["del_text_r_utc"].append("NA")
        submissions_d["rem_text_r"].append("FALSE")
        submissions_d["rem_text_r_utc"].append("NA")
        submissions_d["removed_by_category_r"].append("FALSE")
    watch_fn = (f"{DATA_DIR}/watch-{subreddit}-{NOW.format('YYYYMMDD')}"
                f"_n{len(submissions_d['id'])}.csv")
    watch_df = pd.DataFrame.from_dict(submissions_d)
    watch_df.to_csv(watch_fn, index=True, encoding="utf-8-sig", na_rep="NA")
    return watch_fn
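# A follow-up sketch (subreddit and hours are illustrative): the returned
# filename can be read back with pandas, e.g. for later deletion/removal checks.
watch_fn = init_watch_pushshift("AskReddit", hours=24)
watch_df = pd.read_csv(watch_fn, index_col=0, encoding="utf-8-sig")
print(watch_df.head())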
def download_subreddit_comments(subreddit, start, count, destination, min_word_count=None):
    api = psaw.PushshiftAPI()
    fields = [
        'author',
        'author_flair_text',
        'body',
        'created_utc',
        'gildings',
        'id',
        'parent_id',
        'permalink',
        'score',
        'subreddit',
        'subreddit_id',
    ]
    start_epoch = int(start.timestamp())
    gen = api.search_comments(subreddit=subreddit, filter=fields, after=start_epoch)
    output_path = destination / subreddit
    if not output_path.exists():
        print("creating output directory: {}".format(output_path))
        output_path.mkdir()
    comments_saved = 0
    while comments_saved < count:
        comment = next(gen)
        if min_word_count and len(comment.body.split()) < min_word_count:
            continue
        if comments_saved % 50 == 0:
            print("comment {}: {}".format(comments_saved, comment.body))
        output_file = output_path / "{}.json".format(comment.id)
        json.dump(comment.d_, output_file.open('w', encoding='utf8'))
        comments_saved += 1
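# A minimal invocation sketch (subreddit, date, and directory are
# illustrative). `start` must be a datetime and `destination` a pathlib.Path,
# since the function calls start.timestamp() and joins paths with "/".
import datetime
import pathlib

download_subreddit_comments(subreddit='python',
                            start=datetime.datetime(2021, 1, 1),
                            count=200,
                            destination=pathlib.Path('data'),
                            min_word_count=10)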
def get_psaw_api():
    return psaw.PushshiftAPI()
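# A brief usage sketch (query parameters are illustrative): the returned
# client exposes generator-based search methods such as search_submissions.
api = get_psaw_api()
for submission in api.search_submissions(subreddit='news', limit=5):
    print(submission.title)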
def main():
    arg_parser = argparse.ArgumentParser(
        description='Reddit canned response bot')
    arg_parser.add_argument(
        dest='bot_config_file',
        type=str,
        help='json bot config file (see examples/minimal_example_bot_config.json)')
    arg_parser.add_argument(
        '--dry-run',
        dest='dry_run',
        type=int,
        const=0,
        default=None,
        nargs='?',
        help='Doesn\'t actually reply, just prints what it would\'ve sent.'
        ' A number of hours prior to "now" may also be supplied to '
        'iterate over old comments first e.g. "--dry-run=168"')
    arg_parser.add_argument('--verbose',
                            dest='verbose',
                            action='store_true',
                            help='Display additional debug messages')
    arg_parser.add_argument('--skip-tests',
                            dest='skip_tests',
                            action='store_true',
                            help='Skips tests')
    arguments = arg_parser.parse_args()

    # Reddit credentials should be supplied via praw.ini file.
    reddit = praw.Reddit()
    pushshift = psaw.PushshiftAPI(reddit)

    # Set up logging to stdout and rotating files.
    log_stream_handler = logging.StreamHandler(sys.stdout)
    log_stream_handler.setLevel(
        logging.DEBUG if arguments.verbose else logging.INFO)
    log_stream_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    log.addHandler(log_stream_handler)
    # Rotate file log through 3 * 10MiB files.
    log_file_handler = logging.handlers.RotatingFileHandler(
        pathlib.Path(reddit.config.username).with_suffix('.log'),
        maxBytes=10 * 1048576,
        backupCount=2)
    log_file_handler.setLevel(logging.DEBUG)
    log_file_handler.setFormatter(
        logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    log.addHandler(log_file_handler)

    with open(arguments.bot_config_file) as f:
        bot_config = json.load(f)  # type: Dict[str, typing.Any]
    canned_responses = [
        CannedResponse(**kwargs) for kwargs in bot_config['canned_responses']
    ]
    reply_generator = ReplyGenerator(
        canned_responses,
        bot_config.get('comment_mention_reply', None),
        bot_config['postfix'])

    tests = bot_config.get('tests', None)
    if tests and not arguments.skip_tests:
        # Run tests
        log.setLevel(logging.WARNING)  # Hide test log output
        suite = unittest.TestSuite()
        suite.addTest(BotTests(tests, reply_generator))
        unittest.TextTestRunner().run(suite)
        log.setLevel(logging.DEBUG)  # Restore log output

    max_comments_per_submission = bot_config.get(
        'max_comments_per_submission', DEFAULT_MAX_COMMENTS_PER_SUBMISSION)
    delete_unliked_comment_score = bot_config.get(
        'delete_unliked_comment_score', DEFAULT_DELETE_UNLIKED_COMMENT_SCORE)
    dry_run = arguments.dry_run is not None
    start_time_offset_hours = 0 if arguments.dry_run is None else -arguments.dry_run
    bot = Bot(pushshift,
              reply_generator,
              bot_config['subreddits'],
              max_comments_per_submission,
              delete_unliked_comment_score,
              dry_run=dry_run,
              start_time_offset_hours=start_time_offset_hours)
    bot.run()
client_id = args.client_id
client_secret = args.client_secret
if args.credentials.exists() and args.credentials.is_file():
    with open(args.credentials) as credentials_file:
        credentials = json.load(credentials_file)
        client_id = credentials.get('client_id', client_id)
        client_secret = credentials.get('client_secret', client_secret)

# Make sure that we have a client id AND a client secret.
assert client_id is not None and client_secret is not None

reddit = praw.Reddit(client_id=client_id,
                     client_secret=client_secret,
                     user_agent=args.user_agent)
api = psaw.PushshiftAPI(reddit)

SUBMISSION_SERIALIZE_ATTRIBUTES = [
    'created_utc', 'id', 'name', 'permalink', 'score', 'title',
    'upvote_ratio', 'url', 'selftext'
]
COMMENT_SERIALIZE_ATTRIBUTES = [
    'body', 'created_utc', 'id', 'is_submitter', 'link_id', 'parent_id',
    'permalink', 'score', 'subreddit_id'
]


def _serialize_reddit_object(obj, attributes, print_func=print):
    data = {attribute: getattr(obj, attribute) for attribute in attributes}
    if obj.author is not None:
import simplejson as json
import psaw
import requests
from flask import Flask
from flask import request, jsonify, abort
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
from google.cloud import pubsub_v1

app = Flask(__name__)
firebase_admin.initialize_app(credentials.ApplicationDefault())
reddit = psaw.PushshiftAPI()
db = firestore.client()
session = requests.Session()
publisher = pubsub_v1.PublisherClient()


@app.route('/r/<subreddit>')
def run(subreddit):
    before = request.args.get('before', type=int)
    submissions = reddit.search_submissions(
        before=before,
        subreddit=subreddit,
        limit=500,
        filter=[
            'id',
def __init__(self, credentials, size, fields=None):
    self.ps_api = psaw.PushshiftAPI(max_results_per_request=size)
    self.reddit = praw.Reddit(**credentials)
    if fields is None:
        self.fields = self.default_fields
    and save the urls to a txt

    Arguments:
        query {instance of api.search_submissions} -- where: api = psaw.PushshiftAPI()
        save_urls_path {str} -- path to save urls
    """
    reddit_to_db = RedditToDb(save_to_table)
    with tqdm.tqdm() as pbar:
        for subm in query:
            reddit_to_db.insert_submission_into_db(subm)
            pbar.update(1)


if __name__ == "__main__":
    api = psaw.PushshiftAPI()
    end_time = int(datetime.datetime(2019, 6, 1).timestamp())
    # start_time = int(datetime.datetime(2000, 6, 1).timestamp())
    # search = "tl & dr"
    # search = "selftext:(tl & dr)"
    # search = "selftext:tl & dr"
    # search = "selftext:tl"             0 results
    # self_text_search = "tl & dr"       bunch of meaningless results
    # self_text_search = "' tl' & 'dr'"  3 results
    # ------------- self text search
    # TLDRs in self text (=> usually summarizing the reddit data)
    # self_text_search = "'tl' & 'dr'"
    # query = api.search_submissions(
    #     # q=search,
    #     selftext=self_text_search,
def collect_comment_star_citizen(save_file, limit=None, used_saved=False, append=False,
                                 do_roadmap=True, flair_list=["OFFICIAL"]):
    if used_saved and not append:
        try:
            comments_list = load_all_comments(db_name=save_file)
            return comments_list
        except:
            print("Could not retrieve saved comments, getting comments normally")
            comments_list = []  # fall through with an empty list (was left undefined here)
    else:
        comments_list = []
    with open("credentials.json") as f:
        credentials = json.loads(f.read())
    reddit = praw.Reddit(client_id=credentials["id"],
                         client_secret=credentials["secret"],
                         user_agent="Comment Extraction")
    api = psaw.PushshiftAPI(reddit)
    title_filter = re.compile(
        ".*(Star Citizen Roadmap Update|Squadron 42 Roadmap Update).*")
    official_title_filter = re.compile(
        ".*([Ee]vocati +[Pp]atch|([Pp]atch|Release|P[Tt][Uu])[ -]+[nN]otes).*")
    initial_epoch = int(dt.datetime(2012, 10, 20).timestamp())
    raw_comments_list = []
    num_submission = 0
    try:
        if do_roadmap:
            for submission in api.search_submissions(author="Odysseus-Ithaca",
                                                     subreddit="starcitizen",
                                                     after=initial_epoch):
                if submission is None:
                    break
                if title_filter.match(submission.title) is None:
                    continue
                print("submission {} - flair {}".format(submission.title,
                                                        submission.link_flair_text))
                submission.comments.replace_more(limit=None, threshold=1)
                raw_comments_list += submission.comments.list()
                num_submission += 1
                if limit is not None and len(raw_comments_list) >= limit:
                    break
            print("{} submissions for Odysseus done".format(num_submission))
        if len(flair_list) > 0:
            raw_comments_list, num_submission = retrieve_flair(
                api, raw_comments_list, num_submission, flair_list,
                lambda s: official_title_filter.match(s.title),
                after_cond=initial_epoch)
    except KeyboardInterrupt:  # was misspelled "KeyboardInterrup", a NameError
        print("Received keyboard interrupt - stopping scraping")
    finally:
        print("{} submissions done in total".format(num_submission))
    # Retrieve all attributes
    tmp_list = []
    for c in raw_comments_list:
        attributes_raw = vars(c)
        attributes = {}
        # Filter lazy attributes
        attributes["submission_title"] = c.submission.title
        attributes["submission_name"] = c.submission.name
        attributes["submission_flair"] = c.submission.link_flair_text
        for key, value in attributes_raw.items():
            if key.startswith("_"):
                continue
            elif key == "subreddit" or key == "author":
                continue
            attributes[key] = value
        tmp_list.append(attributes)
    comments_list += tmp_list
    # Cache results
    save_comments(comments_list, append, db_name=save_file)
    return comments_list
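# Hypothetical call (file name and limit are illustrative): scrape roadmap and
# OFFICIAL-flaired submissions, caching the extracted comments in comments.db.
comments = collect_comment_star_citizen("comments.db", limit=5000)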