import pendulum
from pmb import *
from psaw import PushshiftAPI

config = get_config()
ps = PushshiftAPI()


def checkuser(args, channel):
    try:
        user = args['user']
    except KeyError:
        return "No user given."

    response = ""

    submissions = ps.search_submissions(author=user, subreddit='politics', limit=10,
                                        filter=['created_utc', 'title', 'permalink', 'id'])
    response += '*SUBMISSIONS*\n'
    for submission in submissions:
        ts = pendulum.from_timestamp(submission.created_utc, tz='UTC').to_datetime_string()
        out = ('- *{}*: "{}" <https://reddit.com{}|Post> | '
               '<http://api.pushshift.io/reddit/search/submission/?ids={}|Archive>').format(
            ts, submission.title, submission.permalink, submission.id)
        response += out + '\n'
    response += '\n'

    comments = ps.search_comments(author=user, subreddit='politics', limit=10,
                                  filter=['created_utc', 'body', 'permalink', 'id'])
    response += '*COMMENTS*\n'
    for comment in comments:
        ts = pendulum.from_timestamp(comment.created_utc, tz='UTC').to_datetime_string()
        if len(comment.body) <= 100:

def get_due_dilligence(
    ticker: str, limit: int = 5, n_days: int = 3, show_all_flairs: bool = False
) -> List[praw.models.reddit.submission.Submission]:
    """Gets due diligence posts from list of subreddits [Source: reddit]

    Parameters
    ----------
    ticker: str
        Stock ticker
    limit: int
        Number of posts to get
    n_days: int
        Number of days back to get posts
    show_all_flairs: bool
        Search through all flairs (apart from Yolo and Meme)

    Returns
    -------
    List[praw.models.reddit.submission.Submission]
        List of submissions
    """
    praw_api = praw.Reddit(
        client_id=cfg.API_REDDIT_CLIENT_ID,
        client_secret=cfg.API_REDDIT_CLIENT_SECRET,
        username=cfg.API_REDDIT_USERNAME,
        user_agent=cfg.API_REDDIT_USER_AGENT,
        password=cfg.API_REDDIT_PASSWORD,
    )
    psaw_api = PushshiftAPI()

    n_ts_after = int((datetime.today() - timedelta(days=n_days)).timestamp())
    l_flair_text = [
        "DD",
        "technical analysis",
        "Catalyst",
        "News",
        "Advice",
        "Chart",
        "Charts and Setups",
        "Fundamental Analysis",
        "forex",
        "Trade Idea",
    ]
    l_sub_reddits_dd = [
        "pennystocks",
        "RobinHoodPennyStocks",
        "Daytrading",
        "StockMarket",
        "stocks",
        "investing",
        "wallstreetbets",
        "forex",
        "Forexstrategy",
    ]

    submissions = psaw_api.search_submissions(
        after=int(n_ts_after), subreddit=l_sub_reddits_dd, q=ticker, filter=["id"]
    )

    n_flair_posts_found = 0
    subs = []
    for submission in submissions:
        # Get more information about post using PRAW api
        submission = praw_api.submission(id=submission.id)

        # Ensure that the post hasn't been removed in the meanwhile
        if not submission.removed_by_category:
            # Either focus on posts whose flair is in l_flair_text, or (when
            # show_all_flairs is True) just filter out the Yolo and Meme flairs
            if (
                submission.link_flair_text in l_flair_text,
                submission.link_flair_text not in ["Yolo", "Meme"],
            )[show_all_flairs]:
                subs.append(submission)
                # Increment count of valid posts found
                n_flair_posts_found += 1

        # Check if number of wanted posts found has been reached
        if n_flair_posts_found > limit - 1:
            break
    return subs

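# Hedged usage sketch (not part of the original function): assumes the cfg module
# referenced above holds valid Reddit API credentials. It fetches up to three
# due-diligence posts for a ticker and prints their flair and title.
for dd_post in get_due_dilligence("AAPL", limit=3, n_days=7):
    print(dd_post.link_flair_text, "-", dd_post.title)
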
def popular_tickers(l_args):
    parser = argparse.ArgumentParser(
        prog='popular',
        description="""Print latest popular tickers. [Source: Reddit] """)
    parser.add_argument('-l', "--limit",
                        action="store",
                        dest="n_limit",
                        type=check_positive,
                        default=50,
                        help='limit of posts retrieved per sub reddit.')
    parser.add_argument(
        '-s', "--sub",
        action="store",
        dest="s_subreddit",
        type=str,
        help="""subreddits to look for tickers, e.g. pennystocks,stocks.
                Default: pennystocks, RobinHoodPennyStocks, Daytrading, StockMarket,
                stocks, investing, wallstreetbets""")
    parser.add_argument('-d', "--days",
                        action="store",
                        dest="n_days",
                        type=check_positive,
                        default=1,
                        help="look for the tickers from those n past days.")

    try:
        (ns_parser, l_unknown_args) = parser.parse_known_args(l_args)

        if l_unknown_args:
            print(f"The following args couldn't be interpreted: {l_unknown_args}\n")
            return

        n_ts_after = int((datetime.today() - timedelta(days=ns_parser.n_days)).timestamp())

        if ns_parser.s_subreddit:
            if ',' in ns_parser.s_subreddit:
                l_sub_reddits = ns_parser.s_subreddit.split(',')
            else:
                l_sub_reddits = [ns_parser.s_subreddit]
        else:
            l_sub_reddits = [
                'pennystocks', 'RobinHoodPennyStocks', 'Daytrading', 'StockMarket',
                'stocks', 'investing', 'wallstreetbets'
            ]

        d_submission = {}
        d_watchlist_tickers = {}
        l_watchlist_links = list()
        l_watchlist_author = list()

        praw_api = praw.Reddit(client_id=cfg.API_REDDIT_CLIENT_ID,
                               client_secret=cfg.API_REDDIT_CLIENT_SECRET,
                               username=cfg.API_REDDIT_USERNAME,
                               user_agent=cfg.API_REDDIT_USER_AGENT,
                               password=cfg.API_REDDIT_PASSWORD)

        psaw_api = PushshiftAPI()

        for s_sub_reddit in l_sub_reddits:
            print(f"Search for latest tickers under {ns_parser.n_limit} '{s_sub_reddit}' posts")
            submissions = psaw_api.search_submissions(after=int(n_ts_after),
                                                      subreddit=s_sub_reddit,
                                                      limit=ns_parser.n_limit,
                                                      filter=['id'])

            n_tickers = 0
            while True:
                submission = next(submissions, None)
                if submission:
                    # Get more information about post using PRAW api
                    submission = praw_api.submission(id=submission.id)

                    # Ensure that the post hasn't been removed by a moderator in the meanwhile,
                    # that there is a description and it's not just an image, that the flair is
                    # meaningful, and that we aren't re-considering the same author's content
                    if not submission.removed_by_category and (submission.selftext or submission.title) \
                            and submission.author.name not in l_watchlist_author:
                        ls_text = list()
                        ls_text.append(submission.selftext)
                        ls_text.append(submission.title)

                        submission.comments.replace_more(limit=0)
                        for comment in submission.comments.list():
                            ls_text.append(comment.body)

                        l_tickers_found = list()
                        for s_text in ls_text:
                            for s_ticker in set(re.findall(r'([A-Z]{3,5} )', s_text)):
                                l_tickers_found.append(s_ticker.strip())

                        if l_tickers_found:
                            n_tickers += len(l_tickers_found)

                            # Add another author's name to the parsed watchlists
                            l_watchlist_author.append(submission.author.name)

                            # Lookup stock tickers within a watchlist
                            for key in l_tickers_found:
                                if key in d_watchlist_tickers:
                                    # Increment stock ticker found
                                    d_watchlist_tickers[key] += 1
                                else:
                                    # Initialize stock ticker found
                                    d_watchlist_tickers[key] = 1

                # Check if search_submissions didn't get any more posts
                else:
                    break

            print(f" {n_tickers} tickers found.")

        lt_watchlist_sorted = sorted(d_watchlist_tickers.items(),
                                     key=lambda item: item[1],
                                     reverse=True)
        if lt_watchlist_sorted:
            print(f"\nThe following TOP10 tickers have been mentioned in the last {ns_parser.n_days} days:")
            n_top_stocks = 0
            for t_ticker in lt_watchlist_sorted:
                if n_top_stocks > 9:
                    break
                try:
                    # If finviz.get_stock doesn't raise, the ticker exists on finviz,
                    # so we can print it
                    finviz.get_stock(t_ticker[0])
                    print(f"{t_ticker[1]} {t_ticker[0]}")
                    n_top_stocks += 1
                except Exception:
                    pass
        else:
            print("No tickers found")
        print("")

    except Exception:
        print("")

import praw
from prawcore.exceptions import RequestException
from psaw import PushshiftAPI

reddit = praw.Reddit(client_id='', client_secret='', user_agent='my user agent')
api = PushshiftAPI(reddit)

gen1 = []
try:
    gen1 = api.search_submissions(author="newtothistinderthing")
except RequestException:
    print("Failed to fetch submission")

posts_list = list(gen1)

posts = []
for post_item in posts_list:
    if len(post_item.selftext.split()) >= 100:
        post_instance = {}
        post_instance.update({"id": post_item.id})
        post_instance.update({"created_utc": post_item.created_utc})
        post_instance.update({"subreddit": str(post_item.subreddit)})
        post_instance.update({"title": post_item.title})
        post_instance.update({"selftext": post_item.selftext})
        post_instance.update({"score": post_item.score})
        post_instance.update({"num_comments": post_item.num_comments})
        posts.append(post_instance)

for post in posts:
    print(post)

def get_watchlists(
    n_to_get: int,
) -> Tuple[List[praw.models.reddit.submission.Submission], Dict, int]:
    """Get Reddit users' watchlists [Source: reddit]

    Parameters
    ----------
    n_to_get : int
        Number of posts to look through

    Returns
    -------
    List[praw.models.reddit.submission.Submission]:
        List of reddit submissions
    Dict:
        Dictionary of tickers and counts
    int
        Count of how many posts were analyzed
    """
    d_watchlist_tickers: Dict = {}
    l_watchlist_author = []
    subs = []

    praw_api = praw.Reddit(
        client_id=cfg.API_REDDIT_CLIENT_ID,
        client_secret=cfg.API_REDDIT_CLIENT_SECRET,
        username=cfg.API_REDDIT_USERNAME,
        user_agent=cfg.API_REDDIT_USER_AGENT,
        password=cfg.API_REDDIT_PASSWORD,
    )
    psaw_api = PushshiftAPI()
    submissions = psaw_api.search_submissions(
        subreddit=l_sub_reddits,
        q="WATCHLIST|Watchlist|watchlist",
        filter=["id"],
    )

    n_flair_posts_found = 0
    for sub in submissions:
        submission = praw_api.submission(id=sub.id)
        if (not submission.removed_by_category
                and submission.selftext
                and submission.link_flair_text not in ["Yolo", "Meme"]
                and submission.author.name not in l_watchlist_author):
            l_tickers_found = find_tickers(submission)

            if l_tickers_found:
                # Add another author's name to the parsed watchlists
                l_watchlist_author.append(submission.author.name)

                # Lookup stock tickers within a watchlist
                for key in l_tickers_found:
                    if key in d_watchlist_tickers:
                        # Increment stock ticker found
                        d_watchlist_tickers[key] += 1
                    else:
                        # Initialize stock ticker found
                        d_watchlist_tickers[key] = 1

                # Increment count of valid posts found
                n_flair_posts_found += 1
                subs.append(submission)

        if n_flair_posts_found > n_to_get - 1:
            break
    return subs, d_watchlist_tickers, n_flair_posts_found

import praw
import pandas as pd
import datetime as dt
import config
from psaw import PushshiftAPI
import numpy as np
import time
import sys

reddit = praw.Reddit(client_id=config.client,
                     client_secret=config.secret,
                     user_agent=config.user)
api = PushshiftAPI(reddit)

SUBREDDIT = 'wallstreetbets'
KEYWORD = 'dogecoin'

tic = dt.datetime.today()
start = int(dt.datetime(2021, 4, 26).timestamp())

posts = list(
    api.search_submissions(
        after=start,
        subreddit=SUBREDDIT,
        filter=[
            'title', 'selftext', 'created_utc', 'num_comments', 'score',
            'permalink', 'upvote_ratio'
        ],
        limit=1000000,
        q=KEYWORD,
        stop_condition=lambda x: x.created_utc < dt.datetime.now(
            dt.timezone.utc).replace(tzinfo=dt.timezone.utc).timestamp() - 60 * 60 * 24 * 300))

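# Hedged follow-up sketch (not in the original script): psaw result objects expose
# their raw Pushshift payload via the `.d_` attribute, so the collected posts could
# be flattened into a pandas DataFrame roughly like this.
df_posts = pd.DataFrame([p.d_ for p in posts])
print(df_posts.shape)
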
import datetime
import os
from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union

import pandas as pd
import praw
from loguru import logger
from psaw import PushshiftAPI
from tqdm import tqdm

from finlang.config.constants import (COMMENT_FIELDS, SUBMISSION_FIELDS,
                                      SUBMISSION_SORTS, reddit)
from finlang.nlp_utils import db_utils as dbu
from finlang.scrape_utils import data_utils as du

api = PushshiftAPI()


def get_submissions(subreddit: praw.reddit.Subreddit, sort_method: str, **kwargs) -> Iterator[Any]:
    """Gets the submissions from a specified subreddit using a given sorting option

    Args:
        subreddit: Name of subreddit
        sort_method: Method of sorting to use when pulling the submissions
        kwargs: additional keyword arguments used in the specific sort method call

    Returns:
        List of submissions retrieved from the subreddit
    """
    sort_method = sort_method.casefold()

import argparse
import collections
import copy
import itertools
import json
import logging
import os
import random
from pathlib import Path

import pandas as pd
from psaw import PushshiftAPI
from tqdm import tqdm

api = PushshiftAPI()

parser = argparse.ArgumentParser()
parser.add_argument(
    "-o",
    "--out_path",
    type=str,
    default="./data/reddit_threads/",
    help="Path or url of the dataset. If empty download from S3.",
)
parser.add_argument(
    "-s",
    "--subreddit",
    type=str,
    action="append",
    default=[

""" import logging from datetime import datetime from typing import Dict, List from psaw import PushshiftAPI from bootstrap import END_DATE, START_DATE, blossom from bootstrap.migrate_redis_data import ( CommentData, comment_filter, dict_from_comment, extract_id_from_grafeas_url, ) push = PushshiftAPI() def _get_link_id(tr: Dict) -> str: """Get the Reddit link ID of the given transcription. For https://reddit.com/r/worldnewsvideo/comments/s48db9/bronx_fire_landlord_had_history_of_neglecting_heat/hss8wz0/ it returns s48db9. """ return tr["url"].split("/")[6] def _remove_footer(transcription: str) -> str: """Remove the footer of the transcription.""" parts = transcription.split("---") if len(parts) < 3:
def __init__(self):
    self.__api = PushshiftAPI()

def make_pushshiftAPI(reddit):
    api = PushshiftAPI(reddit)
    return api

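# Hedged usage sketch (assumes a configured praw.Reddit instance named `reddit`):
# wrapping PushshiftAPI around a PRAW client like this makes search results come
# back as hydrated PRAW submission objects rather than raw Pushshift records.
api = make_pushshiftAPI(reddit)
for submission in api.search_submissions(subreddit="learnpython", limit=5):
    print(submission.id, submission.title)
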
import csv
from datetime import datetime
import time

from psaw import PushshiftAPI

api = PushshiftAPI()

# Opens the CSV file from your Reddit data request
with open('saved_posts.csv', 'r', newline='') as f:
    next(f)
    progress = 1
    reader = csv.reader(f)
    first_column = next(zip(*reader))
    data = list(first_column)

# Opens/creates the new CSV where the post info will go
with open('fixed_posts.csv', 'a', newline='') as f2:
    writer = csv.writer(f2)
    writer.writerow([
        "Date/Time", "Subreddit", "Post Title", "Link", "Is Self-Post?",
        "URL", "Post ID"
    ])  # Writes a name for each column

    for i in data:
        i = str(i)
        # Searches Pushshift for the matching post
        search = api.search_submissions(ids=i, limit=1, aggs='title')
        for post in search:
            subreddit = post.subreddit
            title = post.title
            title = title.encode("ascii", errors="ignore").decode(

import requests
import csv
import json
import pandas as pd
import pymongo
from psaw import PushshiftAPI

api = PushshiftAPI()

input = input('Enter company name: ')

news = list(
    api.search_submissions(subreddit='news',
                           filter=[
                               'title', 'url', 'num_comments', 'author',
                               'score', 'created_utc'
                           ],
                           limit=150000))

print("connecting to db")
# myclient = pymongo.MongoClient("18.219.233.150:27017")
myclient = pymongo.MongoClient("127.0.0.1:27017")
print("database connected")
database = myclient['fibstock']
collection = database['news']

coll = []
for submission in news:
    coll.append({
        'title': submission.title,
        'link': submission.url,
        'publishedAt': submission.created_utc,

class RedditScraper:
    def __init__(self, subreddit_list, ds, after, before, config):
        self.reddit: praw.Reddit = praw.Reddit(
            client_id=config["REDDIT"].get("client_id"),
            client_secret=config["REDDIT"].get("client_secret"),
            user_agent=config["REDDIT"].get("user_agent"),
        )
        self.api = PushshiftAPI(self.reddit)
        self.subreddit_list = subreddit_list
        self.ds = ds
        self.after = after
        self.before = before
        self.submissions = None
        self.engine = create_engine(config["DB"].get("db_url"))
        self.session = Session(self.engine)

    def get_submissions(self):
        self.submissions = map(
            RedditSubmission,
            self.api.search_submissions(
                after=self.after,
                before=self.before,
                subreddit=",".join(self.subreddit_list),
            ),
        )

    def upload(self, fetch_past):
        redditors = {}
        submissions = {}
        subreddits = {}
        comments = {}
        redditor_subreddit_comments = {}
        redditor_subreddit_submissions = {}

        for s in self.submissions:
            try:
                if (not s or not s.submission or not hasattr(s.submission, "id")
                        or not s.submission.id or not s.author
                        or not hasattr(s.author, "id") or not s.author.id):
                    continue
                if not fetch_past:
                    redditors[s.author.id] = parse_redditor(s.author)
                    submissions[s.submission.id] = {
                        "id": s.submission.id,
                        "score": s.submission.score,
                    }
                    subreddits[s.subreddit.id] = {
                        "id": s.subreddit.id,
                        "name": s.subreddit.display_name,
                    }
                    redditor_subreddit_submissions[s.submission.id] = {
                        "redditor_id": s.author.id,
                        "subreddit_id": s.subreddit.id,
                        "submission_id": s.submission.id,
                        "ds": self.ds,
                    }
                for c in s.comments:
                    if (not c or not hasattr(c, "id") or not c.id
                            or not c.author or not hasattr(c.author, "id")):
                        continue
                    comments[c.id] = {
                        "id": c.id,
                        "submission_id": s.submission.id,
                        "score": c.score,
                    }
                    subreddits[s.subreddit.id] = {
                        "id": s.subreddit.id,
                        "name": s.subreddit.display_name,
                    }
                    redditors[c.author.id] = parse_redditor(c.author)
                    redditor_subreddit_comments[c.id] = {
                        "redditor_id": c.author.id,
                        "subreddit_id": s.subreddit.id,
                        "comment_id": c.id,
                        "ds": self.ds,
                    }
            except Exception as e:
                logging.warning(f"An error has occurred while parsing: {e}")
                continue

        if not redditor_subreddit_comments and not redditor_subreddit_submissions:
            return len(redditor_subreddit_submissions), len(redditor_subreddit_comments)

        # PostgreSQL upsert
        # https://docs.sqlalchemy.org/en/13/dialects/postgresql.html#insert-on-conflict-upsert
        try:
            self.session.execute(
                insert(m.Redditor).values(list(redditors.values())).on_conflict_do_nothing())
            if not fetch_past:
                self.session.execute(
                    insert(m.Submission).values(list(submissions.values())).on_conflict_do_nothing())
                self.session.execute(
                    insert(m.Redditor_Subreddit_Submission).values(
                        list(redditor_subreddit_submissions.values())).on_conflict_do_nothing())
            self.session.execute(
                insert(m.Subreddit).values(list(subreddits.values())).on_conflict_do_nothing())
            self.session.execute(
                insert(m.Comment).values(list(comments.values())).on_conflict_do_nothing())
            self.session.execute(
                insert(m.Redditor_Subreddit_Comment).values(
                    list(redditor_subreddit_comments.values())).on_conflict_do_nothing())
            self.session.commit()
        except Exception as e:
            logging.warning(f"An error occurred during insertion: {e}")
            self.session.rollback()
        finally:
            self.session.close()

        return len(redditor_subreddit_submissions), len(redditor_subreddit_comments)

from psaw import PushshiftAPI
import datetime as dt

api = PushshiftAPI()
start_epoch = int(dt.datetime(2017, 1, 1).timestamp())

print(list(api.search_submissions(after=start_epoch,
                                  subreddit='politics',
                                  filter=['url', 'author', 'title', 'subreddit'],
                                  limit=10)))

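# Hedged companion sketch (not in the original snippet): psaw exposes an analogous
# search_comments generator, so the same pattern also works for comments from the
# same start date.
print(list(api.search_comments(after=start_epoch,
                               subreddit='politics',
                               filter=['author', 'body', 'subreddit'],
                               limit=10)))
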
def reddit_scrape_by_entity(entity, start_date, end_date):
    '''
    Retrieves posts relating to entity from reddit within the stipulated time frame

    Input:
        entity(string): entity name to retrieve data on
        start_date(datetime): date to begin scraping from
        end_date(datetime): date to stop scraping

    Output:
        df(dataframe): dataframe with columns = [author, url, excerpt, subreddit, title,
        article_date, type, entity, source_id, content, count, date_time_all, coin, source]
    '''
    # initialise api
    api = PushshiftAPI()

    # convert datetime to timestamp
    start_epoch = int(start_date.timestamp())
    end_epoch = int(end_date.timestamp())

    # read in list of subreddits
    subreddits = pd.read_csv(r'../scraping/data/subreddit_list.csv')['subreddit'].tolist()

    entity = entity.lower()

    ############################## Submissions ################################
    # query and generate the related information
    gen_submission = api.search_submissions(q=entity, after=start_epoch, before=end_epoch,
                                            filter=['created_utc', 'title', 'selftext',
                                                    'permalink', 'author', 'subreddit', 'id'],
                                            subreddit=subreddits)

    # generate dataframe for required data
    df_submission = pd.DataFrame([post.d_ for post in gen_submission])

    # format dataframe
    if df_submission.empty == False:
        df_submission['title'] = df_submission['title'].apply(lambda x: str(x).lower())
        df_submission['date_time'] = df_submission['created_utc'].apply(lambda x: datetime.fromtimestamp(x))
        df_submission['selftext'] = df_submission['selftext'].apply(lambda x: str(x).lower())
        df_submission['permalink'] = df_submission['permalink'].apply(lambda x: 'www.reddit.com' + x)
        df_submission['author'] = df_submission['author'].apply(lambda x: x.lower())
        df_submission['subreddit'] = df_submission['subreddit'].apply(lambda x: x.lower())
        df_submission['type'] = 'submission'

        # remove unnecessary columns of data
        df_submission = df_submission.drop(columns=['created_utc', 'created'])
        df_submission = df_submission.rename(columns={'selftext': 'excerpt', 'permalink': 'article_url'})

    ############################## Comments ################################
    # query and generate the related information
    gen_comments = api.search_comments(q=entity, after=start_epoch, before=end_epoch,
                                       filter=['created_utc', 'body', 'permalink',
                                               'author', 'subreddit', 'id'],
                                       subreddit=subreddits)

    # generate dataframe for required data
    df_comment = pd.DataFrame([comm.d_ for comm in gen_comments])

    # format dataframe
    if df_comment.empty == False:
        df_comment['date_time'] = df_comment['created_utc'].apply(lambda x: datetime.fromtimestamp(x))
        df_comment['body'] = df_comment['body'].apply(lambda x: str(x).lower())
        df_comment['permalink'] = df_comment['permalink'].apply(lambda x: 'www.reddit.com' + x)
        df_comment['author'] = df_comment['author'].apply(lambda x: x.lower())
        df_comment['subreddit'] = df_comment['subreddit'].apply(lambda x: x.lower())
        df_comment['excerpt'] = ''
        df_comment['type'] = 'comments'
        df_comment['id'] = 'comments/' + df_comment['id']

        # remove unnecessary columns of data
        df_comment = df_comment.drop(columns=['created_utc', 'created'])

        # for comments, there are no titles so the body of the comment will be used as the title
        df_comment = df_comment.rename(columns={'body': 'title', 'permalink': 'article_url'})

    # concatenate submissions and comments dataframe
    df = pd.DataFrame(columns=['author', 'article_url', 'excerpt', 'subreddit', 'title',
                               'date_time', 'type', 'entity', 'id'])
    df = df.append(df_submission)
    df = df.append(df_comment)
    df['entity'] = entity
    df = df.fillna('')
    df["text"] = df["title"] + " " + df["excerpt"]

    # filter out irrelevant data
    mask1 = list(df.apply(lambda x: filter_out(x["title"]) and filter_out(x["excerpt"]), axis=1))
    df = df[mask1]
    mask2 = list(df.apply(lambda x: filter_in(x["title"]) or filter_in(x["excerpt"]), axis=1))
    df = df[mask2]
    mask3 = list(df.apply(lambda x: filter_entity(str(x["text"]), entity), axis=1))
    df = df[mask3]

    # process duplicates
    df = process_duplicates(df)

    # find all coins that are relevant in text
    df['coin'] = df['text'].apply(lambda x: get_coins(x))

    # reset index
    df = df.reset_index(drop=True)

    # add source column
    df['source'] = 'reddit'

    # rename dataframe using naming convention in final database
    df = df.rename({'text': 'content', 'article_url': 'url',
                    'date_time': 'article_date', 'id': 'source_id'}, axis=1)

    # keep only relevant columns
    df = df[['source', 'source_id', 'article_date', 'content', 'url', 'count',
             'entity', 'author', 'coin']]

    return df

from psaw import PushshiftAPI
import praw
import pandas as pd
import datetime as dt
import os
import numpy as np

# insert Reddit credentials here
# reddit = praw.Reddit(...)

# make sure we're in read-only mode
# reddit.read_only = True

# use PRAW credentials; then PSAW returns the IDs that we can use in PRAW
api = PushshiftAPI(reddit)

# set range of dates to scrape
start_day = dt.datetime(2021, 1, 10)
date_list = [start_day + dt.timedelta(days=x) for x in range(1)]

# create empty list to hold submission ids
all_ids = list()

# iterate through the dates and pull the posts
for day in date_list:
    # set starting day for this loop
    start_epoch = int(day.timestamp())
    # add one day to start_epoch
    end_epoch = start_epoch + (24 * 60 * 60)
    # get the submission ids for a given day

except:
    print('no com')

redditPassword = config.settings['redditPassword']
redditClientSecret = config.settings['redditClientSecret']
redditClientSecret = config.client_secret
redditPassword = config.redditPassword

reddit = praw.Reddit(client_id='GCjpdb-78ljIQg',
                     client_secret=redditClientSecret,
                     password=redditPassword,
                     user_agent='testguyman',
                     username='******')
api = PushshiftAPI(reddit)

time_start = dt.datetime(2020, 2, 21)
track_time_minutes = dt.datetime.now().time()
track_time_combined = datetime.combine(time_start, track_time_minutes)
print(track_time_combined)
time_end = time_start + timedelta(days=1)

stock_ticker_tracking_array1 = {}

start_epoch = time_start
end_epoch = time_end

List1 = list(
    api.search_submissions(q='Daily Discussion Thread',
                           after=start_epoch,
                           before=end_epoch,

class ReadCommentsAll():
    def __init__(self, subreddit_name, limit):
        print("API parameters:", subreddit_name, limit)

        ranges = [(1, 1, 2019, 1, 2, 2019), (1, 2, 2019, 1, 3, 2019),
                  (1, 3, 2019, 1, 4, 2019), (1, 4, 2019, 1, 5, 2019),
                  (1, 5, 2019, 1, 6, 2019), (1, 6, 2019, 1, 7, 2019),
                  (1, 7, 2019, 1, 8, 2019), (1, 8, 2019, 1, 9, 2019),
                  (1, 9, 2019, 1, 10, 2019), (1, 10, 2019, 1, 11, 2019),
                  (1, 11, 2019, 1, 12, 2019), (1, 12, 2019, 1, 1, 2020),
                  (1, 1, 2020, 1, 2, 2020), (1, 2, 2020, 1, 3, 2020),
                  (1, 3, 2020, 1, 4, 2020), (1, 4, 2020, 1, 5, 2020),
                  (1, 5, 2020, 1, 6, 2020), (1, 6, 2020, 1, 7, 2020),
                  (1, 7, 2020, 1, 8, 2020), (1, 8, 2020, 1, 9, 2020),
                  (1, 9, 2020, 1, 10, 2020), (1, 10, 2020, 1, 11, 2020),
                  (1, 11, 2020, 1, 12, 2020), (1, 12, 2020, 1, 1, 2021),
                  (1, 1, 2021, 1, 2, 2021), (1, 2, 2021, 1, 3, 2021),
                  (1, 3, 2021, 1, 4, 2021)]

        for d1, m1, y1, d2, m2, y2 in ranges:
            posted_after = int(datetime.datetime(y1, m1, d1).timestamp())
            posted_before = int(datetime.datetime(y2, m2, d2).timestamp())

            self.api = PushshiftAPI()
            self.comBatchNo = 0
            self.outputPath = './{0}/{1}/'.format(subreddit_name, posted_after)
            Path(self.outputPath).mkdir(parents=True, exist_ok=True)
            self.getComments(subreddit_name, None,
                             ['created_utc', 'score', 'selftext', 'title',
                              'upvote_ratio', 'body'],
                             posted_after, posted_before, limit)

    def saveData(self, items: list):
        fileId = 0
        filePath = ''
        self.comBatchNo += 1
        fileId = self.comBatchNo
        filePath = self.outputPath
        filePath += 'file' + str(fileId)
        print("{0} - {1} - {2}".format(time.time(), len(items), filePath))
        data = ''
        for item in items:
            data += item + '\n'
        with codecs.open(filePath, 'w', encoding='utf-8-sig') as file:
            file.write(data)
        data = None

    def getComments(self, subreddit_name: str, query: str, fields: list,
                    after: int, before: int, limit=1000,
                    sortOrder='desc', sortType='score'):
        try:
            query = self.api.search_comments(subreddit=subreddit_name,
                                             after=after,
                                             before=before,
                                             limit=limit,
                                             filter=fields)
            submissions = list()
            for element in query:
                submissions.append(json.dumps(element.d_))
                if len(submissions) == 1000:
                    self.saveData(submissions.copy())
                    submissions = list()
            if len(submissions) > 0:
                self.saveData(submissions.copy())
                submissions = list()
        except:
            print("Unexpected error:", sys.exc_info())
        print("Done!!!")

from psaw import PushshiftAPI
import arrow
import pandas as pd

file_name_template = 'aita_posts_{}_june.csv'
cols = [
    'id', 'created_ts', 'author', 'title', 'body', 'flair',
    'was_deleted', 'was_removed'
]

api = PushshiftAPI()

data_lines = []
for day in range(5, 31):
    start_time = arrow.now()
    file_name = file_name_template.format(str(day) if day > 9 else f'0{day}')
    # file_name = 'aita_posts_small_june_sample.csv'
    print(f'Processing {file_name}...')
    df = pd.read_csv(file_name, sep='\t', header=None, names=cols)
    for index, row in df.iterrows():
        all_comments_count = 0
        aita_comments_count = 0
        all_submissions_count = 0
        aita_submissions_count = 0
        author = row['author']
        created_ts = row['created_ts']

import praw, prawcore
from psaw import PushshiftAPI
import datetime as dt
import time

api = PushshiftAPI()

# Reddit account information; you cannot delete data for an account you do not have access to.
username = ""  # Your Reddit username
password = ""  # Your Reddit password

# These two values are needed to access Reddit's API as a script application
# (see Authenticating via OAuth for other application types).
# If you don't already have a client ID and client secret, follow Reddit's First Steps Guide to create them.
# https://github.com/reddit-archive/reddit/wiki/OAuth2-Quick-Start-Example#first-steps
client_id = ""  # Reddit app client id
client_secret = ""  # Reddit app secret

# A user agent is a unique identifier that helps Reddit determine the source of network requests.
# https://github.com/reddit-archive/reddit/wiki/API
user_agent = ""

# Edit comment first before deleting it. Leave blank to leave comment unedited at time of deletion.
# Likely an unnecessary feature as the data would still be retained by Pushshift at time of deletion,
# but I included it for peace of mind anyway.
edit_value = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."

ask_before_deleting = True  # Ask before deleting every comment. False = auto delete.

# Year, Month, Day. Only comments / submissions created after the given datetime will be fetched and deleted.
start_epoch = dt.datetime(
    2005, 6, 25
)
end_epoch = dt.datetime(
    2020, 7, 24

def __init__(self, client_id, client_secret, user_agent):
    self.reddit = praw.Reddit(client_id=client_id,
                              client_secret=client_secret,
                              user_agent=user_agent)
    self.api = PushshiftAPI(self.reddit)

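# Hedged usage sketch: the class this __init__ belongs to is not shown, so the name
# `RedditClient` below is hypothetical and the credential strings are placeholders.
client = RedditClient(client_id="...", client_secret="...", user_agent="my user agent")
for submission in client.api.search_submissions(subreddit="python", limit=3):
    print(submission.title)
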
#!/usr/bin/env python
# coding: utf-8

from psaw import PushshiftAPI
import json

api = PushshiftAPI()

# Load list of users.
# The list of users associated with this campaign is based on the official release from Reddit at:
# https://www.reddit.com/r/redditsecurity/comments/e74nml/suspected_campaign_from_russia_on_reddit/
authors = []
with open("./data/userlist.csv") as f:
    authors = f.read().splitlines()

# Perform searches using Pushshift API
contributions_by_author = []
for author in authors:
    entry = {}
    entry["author"] = author
    submissions = api.search_submissions(limit=99999, author=author)
    entry["submissions"] = list(submissions)
    comments = api.search_comments(limit=99999, author=author)
    entry["comments"] = list(comments)
    contributions_by_author.append(entry)

import praw
from psaw import PushshiftAPI
import pandas as pd
import datetime as dt
import re
from pprint import pprint as print

reddit = praw.Reddit(client_id='nsitoMzM8H19pA',
                     client_secret='sx4jlxSsmL6n4NOUt080VZ1dvas',
                     user_agent='Watch Exchange Web Scrapper')
api = PushshiftAPI(reddit)

start_epoch = int(dt.datetime(2017, 1, 1).timestamp())
results = list(api.search_submissions(subreddit='Watchexchange', limit=1))

wts = re.compile('\\[WTS\\]')
price = re.compile(r'[$][\d]+')

testpost = reddit.submission(id=results[0].id)
print(testpost)

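# Hedged follow-up sketch (not in the original snippet): the two compiled patterns
# above could be applied to the fetched post, e.g. to keep only [WTS] listings and
# pull dollar amounts out of the body text.
if wts.search(testpost.title):
    print(price.findall(testpost.selftext))
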
def get_spac(
    limit: int = 5,
) -> Tuple[List[praw.models.reddit.submission.Submission], Dict, int]:
    """Get posts containing SPAC from top subreddits [Source: reddit]

    Parameters
    ----------
    limit : int, optional
        Number of posts to get for each subreddit, by default 5

    Returns
    -------
    List[praw.models.reddit.submission.Submission] :
        List of reddit submissions
    Dict :
        Dictionary of tickers and counts
    int :
        Number of posts found.
    """
    praw_api = praw.Reddit(
        client_id=cfg.API_REDDIT_CLIENT_ID,
        client_secret=cfg.API_REDDIT_CLIENT_SECRET,
        username=cfg.API_REDDIT_USERNAME,
        user_agent=cfg.API_REDDIT_USER_AGENT,
        password=cfg.API_REDDIT_PASSWORD,
    )

    d_watchlist_tickers: Dict = {}
    l_watchlist_author = []
    subs = []
    psaw_api = PushshiftAPI()
    submissions = psaw_api.search_submissions(
        subreddit=l_sub_reddits,
        q="SPAC|Spac|spac|Spacs|spacs",
        filter=["id"],
    )

    n_flair_posts_found = 0
    for submission in submissions:
        # Get more information about post using PRAW api
        submission = praw_api.submission(id=submission.id)

        # Ensure that the post hasn't been removed by moderator in the meanwhile,
        # that there is a description and it's not just an image, that the flair is
        # meaningful, and that we aren't re-considering same author's watchlist
        if (not submission.removed_by_category
                and submission.selftext
                and submission.link_flair_text not in ["Yolo", "Meme"]
                and submission.author.name not in l_watchlist_author):
            l_tickers_found = find_tickers(submission)
            subs.append(submission)
            if l_tickers_found:
                # Add another author's name to the parsed watchlists
                l_watchlist_author.append(submission.author.name)

                # Lookup stock tickers within a watchlist
                for key in l_tickers_found:
                    if key in d_watchlist_tickers:
                        # Increment stock ticker found
                        d_watchlist_tickers[key] += 1
                    else:
                        # Initialize stock ticker found
                        d_watchlist_tickers[key] = 1

            # Increment count of valid posts found
            n_flair_posts_found += 1

        # Check if number of wanted posts found has been reached
        if n_flair_posts_found > limit - 1:
            break

    return subs, d_watchlist_tickers, n_flair_posts_found

from pymongo import MongoClient
import sys
sys.path.append("../")
from configuration import configuration
import datetime as dt
from psaw import PushshiftAPI

client = MongoClient(configuration.DB_HOST, configuration.DB_PORT)
db = client[configuration.DB_NAME]
COLLECTION = "neet_covid_2"

api = PushshiftAPI()

start_epoch = int(dt.datetime(2020, 1, 1).timestamp())
end_epoch = int(dt.datetime(2020, 12, 31).timestamp())

gen = api.search_submissions(subreddit="NEET", after=start_epoch, before=end_epoch)

cache = []
for s in gen:
    to_save = s.d_
    to_save["type"] = "post"
    cache.append(to_save)

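# Hedged continuation sketch (not in the original snippet): the cached documents
# could be written to the configured collection, and the same date window could be
# reused for comments via psaw's search_comments, tagged with a different type.
if cache:
    db[COLLECTION].insert_many(cache)
for c in api.search_comments(subreddit="NEET", after=start_epoch, before=end_epoch):
    doc = c.d_
    doc["type"] = "comment"
    db[COLLECTION].insert_one(doc)
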
def get_popular_tickers(n_top: int, posts_to_look_at: int, subreddits: str = "") -> pd.DataFrame:
    """Get popular tickers from list of subreddits [Source: reddit]

    Parameters
    ----------
    n_top : int
        Number of top tickers to get
    posts_to_look_at : int
        How many posts to analyze in each subreddit
    subreddits : str, optional
        String of comma separated subreddits.

    Returns
    -------
    pd.DataFrame
        DataFrame of top tickers from supplied subreddits
    """
    if subreddits:
        sub_reddit_list = subreddits.split(",") if "," in subreddits else [subreddits]
    else:
        sub_reddit_list = l_sub_reddits
    d_watchlist_tickers: Dict = {}
    l_watchlist_author = []

    praw_api = praw.Reddit(
        client_id=cfg.API_REDDIT_CLIENT_ID,
        client_secret=cfg.API_REDDIT_CLIENT_SECRET,
        username=cfg.API_REDDIT_USERNAME,
        user_agent=cfg.API_REDDIT_USER_AGENT,
        password=cfg.API_REDDIT_PASSWORD,
    )

    psaw_api = PushshiftAPI()

    for s_sub_reddit in sub_reddit_list:
        print(f"Search for latest tickers for {posts_to_look_at} '{s_sub_reddit}' posts")
        submissions = psaw_api.search_submissions(
            subreddit=s_sub_reddit,
            limit=posts_to_look_at,
            filter=["id"],
        )

        n_tickers = 0
        for submission in submissions:
            try:
                # Get more information about post using PRAW api
                submission = praw_api.submission(id=submission.id)

                # Ensure that the post hasn't been removed by moderator in the meanwhile,
                # that there is a description and it's not just an image, that the flair is
                # meaningful, and that we aren't re-considering same author's content
                if (not submission.removed_by_category
                        and (submission.selftext or submission.title)
                        and submission.author.name not in l_watchlist_author):
                    l_tickers_found = find_tickers(submission)

                    if l_tickers_found:
                        n_tickers += len(l_tickers_found)

                        # Add another author's name to the parsed watchlists
                        l_watchlist_author.append(submission.author.name)

                        # Lookup stock tickers within a watchlist
                        for key in l_tickers_found:
                            if key in d_watchlist_tickers:
                                # Increment stock ticker found
                                d_watchlist_tickers[key] += 1
                            else:
                                # Initialize stock ticker found
                                d_watchlist_tickers[key] = 1
            except ResponseException:
                print("Received a response from Reddit with an authorization error. Check your token.\n")
                return pd.DataFrame()

        print(f" {n_tickers} potential tickers found.")

    lt_watchlist_sorted = sorted(d_watchlist_tickers.items(),
                                 key=lambda item: item[1],
                                 reverse=True)

    # pylint: disable=redefined-outer-name
    popular_tickers = []
    if lt_watchlist_sorted:
        n_top_stocks = 0
        for t_ticker in lt_watchlist_sorted:
            if n_top_stocks > n_top:
                break
            try:
                # If try doesn't trigger exception, it means that this stock exists on finviz
                # thus we can print it.
                stock_info = finviz.get_stock(t_ticker[0])
                popular_tickers.append((
                    t_ticker[1],
                    t_ticker[0],
                    stock_info["Company"],
                    stock_info["Sector"],
                    stock_info["Price"],
                    stock_info["Change"],
                    stock_info["Perf Month"],
                    f"https://finviz.com/quote.ashx?t={t_ticker[0]}",
                ))
                n_top_stocks += 1
            except HTTPError as e:
                if e.response.status_code != 404:
                    print(f"Unexpected exception from Finviz: {e}")
            except Exception as e:
                print(e, "\n")
                return

    popular_tickers_df = pd.DataFrame(
        popular_tickers,
        columns=[
            "Mentions",
            "Ticker",
            "Company",
            "Sector",
            "Price",
            "Change",
            "Perf Month",
            "URL",
        ],
    )
    return popular_tickers_df

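# Hedged usage sketch (assumes valid credentials in cfg and network access to
# finviz): rank the ten most-mentioned tickers across 50 posts per default subreddit.
df_top = get_popular_tickers(n_top=10, posts_to_look_at=50)
if df_top is not None:
    print(df_top)
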
from psaw import PushshiftAPI
import pandas as pd
import datetime as dt
import time

api = PushshiftAPI()
start_epoch = int(time.time())

listOfPosts = []
while (len(listOfPosts) < 100000):
    listOfPosts.extend(
        list(
            api.search_submissions(before=start_epoch,
                                   subreddit='toastme',
                                   filter=[
                                       'id', 'permalink', 'url', 'author',
                                       'title', 'subreddit', 'score',
                                       'num_comments'
                                   ],
                                   limit=500)))
    start_epoch = listOfPosts[-1].created_utc
    print(listOfPosts[-1])

df = pd.DataFrame(columns=[
    'id', 'title', 'url', 'author', 'score', 'num_comments', 'comments_url'
])

for post in listOfPosts:
    if post.num_comments >= 10:
        df = df.append({

def watchlist(l_args):
    parser = argparse.ArgumentParser(
        prog='watchlist',
        description="""Print other users watchlist. [Source: Reddit]""")
    parser.add_argument('-l', "--limit",
                        action="store",
                        dest="n_limit",
                        type=check_positive,
                        default=5,
                        help='limit of posts with watchlists retrieved.')

    try:
        (ns_parser, l_unknown_args) = parser.parse_known_args(l_args)

        if l_unknown_args:
            print(f"The following args couldn't be interpreted: {l_unknown_args}\n")
            return

        l_sub_reddits = [
            'pennystocks', 'RobinHoodPennyStocks', 'Daytrading', 'StockMarket',
            'stocks', 'investing', 'wallstreetbets'
        ]

        d_submission = {}
        d_watchlist_tickers = {}
        l_watchlist_links = list()
        l_watchlist_author = list()
        ls_text = list()

        praw_api = praw.Reddit(client_id=cfg.API_REDDIT_CLIENT_ID,
                               client_secret=cfg.API_REDDIT_CLIENT_SECRET,
                               username=cfg.API_REDDIT_USERNAME,
                               user_agent=cfg.API_REDDIT_USER_AGENT,
                               password=cfg.API_REDDIT_PASSWORD)

        dt_last_time_market_close = get_last_time_market_was_open(
            datetime.now() - timedelta(hours=24))
        n_ts_after = int(dt_last_time_market_close.timestamp())
        psaw_api = PushshiftAPI()
        submissions = psaw_api.search_submissions(
            after=n_ts_after,
            subreddit=l_sub_reddits,
            q='WATCHLIST|Watchlist|watchlist',
            filter=['id'])

        n_flair_posts_found = 0
        while True:
            submission = next(submissions, None)
            if submission:
                # Get more information about post using PRAW api
                submission = praw_api.submission(id=submission.id)

                # Ensure that the post hasn't been removed by a moderator in the meanwhile,
                # that there is a description and it's not just an image, that the flair is
                # meaningful, and that we aren't re-considering the same author's watchlist
                if not submission.removed_by_category and submission.selftext \
                        and submission.link_flair_text not in ['Yolo', 'Meme'] \
                        and submission.author.name not in l_watchlist_author:
                    ls_text = list()
                    ls_text.append(submission.selftext)
                    ls_text.append(submission.title)

                    submission.comments.replace_more(limit=0)
                    for comment in submission.comments.list():
                        ls_text.append(comment.body)

                    l_tickers_found = list()
                    for s_text in ls_text:
                        for s_ticker in set(re.findall(r'([A-Z]{3,5} )', s_text)):
                            l_tickers_found.append(s_ticker.strip())

                    if l_tickers_found:
                        # Add another author's name to the parsed watchlists
                        l_watchlist_author.append(submission.author.name)

                        # Lookup stock tickers within a watchlist
                        for key in l_tickers_found:
                            if key in d_watchlist_tickers:
                                # Increment stock ticker found
                                d_watchlist_tickers[key] += 1
                            else:
                                # Initialize stock ticker found
                                d_watchlist_tickers[key] = 1

                        l_watchlist_links.append(
                            f"https://www.reddit.com{submission.permalink}")
                        # delete below, not necessary I reckon. Probably just link?
                        # Refactor data
                        s_datetime = datetime.utcfromtimestamp(
                            submission.created_utc).strftime("%d/%m/%Y %H:%M:%S")
                        s_link = f"https://www.reddit.com{submission.permalink}"
                        s_all_awards = ""
                        for award in submission.all_awardings:
                            s_all_awards += f"{award['count']} {award['name']}\n"
                        s_all_awards = s_all_awards[:-2]

                        # Create dictionary with data to construct dataframe (allows saving the data)
                        d_submission[submission.id] = {
                            'created_utc': s_datetime,
                            'subreddit': submission.subreddit,
                            'link_flair_text': submission.link_flair_text,
                            'title': submission.title,
                            'score': submission.score,
                            'link': s_link,
                            'num_comments': submission.num_comments,
                            'upvote_ratio': submission.upvote_ratio,
                            'awards': s_all_awards
                        }

                        # Print post data collected so far
                        print(f"\n{s_datetime} - {submission.title}")
                        print(f"{s_link}")
                        t_post = PrettyTable([
                            'Subreddit', 'Flair', 'Score', '# Comments',
                            'Upvote %', "Awards"
                        ])
                        t_post.add_row([
                            submission.subreddit, submission.link_flair_text,
                            submission.score, submission.num_comments,
                            f"{round(100*submission.upvote_ratio)}%", s_all_awards
                        ])
                        print(t_post)
                        print("")

                        # Increment count of valid posts found
                        n_flair_posts_found += 1

                # Check if number of wanted posts found has been reached
                if n_flair_posts_found > ns_parser.n_limit - 1:
                    break

            # Check if search_submissions didn't get any more posts
            else:
                break

        if n_flair_posts_found:
            lt_watchlist_sorted = sorted(d_watchlist_tickers.items(),
                                         key=lambda item: item[1],
                                         reverse=True)
            s_watchlist_tickers = ""
            n_tickers = 0
            for t_ticker in lt_watchlist_sorted:
                try:
                    # If try doesn't trigger exception, it means that this stock exists on finviz
                    # thus we can print it.
                    finviz.get_stock(t_ticker[0])
                    if int(t_ticker[1]) > 1:
                        s_watchlist_tickers += f"{t_ticker[1]} {t_ticker[0]}, "
                    n_tickers += 1
                except Exception:
                    pass
            if n_tickers:
                print("The following stock tickers have been mentioned more than once "
                      "across the previous watchlists:")
                print(s_watchlist_tickers[:-2] + '\n')
        print("")

    except Exception:
        print("")

def subreddit_data_old():
    cache_file = config['paths']['cache_path']
    data = request.get_json()

    # return current subreddit name
    if len(data) == 1:
        if 'subredditIndex' in data:
            try:
                subreddit = reddit.subreddit(subreddit_names[data['subredditIndex']])
                widgets = subreddit.widgets
                # id_card is for reddit redesign, not old reddit
                id_card = widgets.id_card
                if data['subredditIndex'] < len(subreddit_names):
                    return jsonify(
                        {'subreddit_name': subreddit.display_name,
                         'subreddit_subscribers': subreddit.subscribers,
                         'subreddit_subscriber_text': id_card.subscribersText})
            except exceptions.Forbidden:
                # subreddit is private so skip it and return a placeholder
                return jsonify(
                    {'subreddit_name': subreddit_names[data['subredditIndex']],
                     'subreddit_subscribers': 0,
                     'subreddit_subscriber_text': 'subscribers'})

        elif 'clickedId' in data:
            # clicked posts are marked as such and will show changes in information
            try:
                with lock:
                    with open(cache_file, 'r') as f:
                        post_cache = json.load(f)
            except FileNotFoundError:
                return jsonify(), 404

            clicked_id = data['clickedId']
            print('Clicked {}'.format(clicked_id))
            clicked_post = post_cache[clicked_id]
            clicked_post['visited'] = True  # marks the clicked_post as clicked on
            clicked_post['visit_time'] = time.time()  # time on click
            clicked_post['visit_comment_count'] = clicked_post['comment_count']  # number of comments on click

            # save cache
            with lock:
                post_cache = None
                try:
                    with open(cache_file, 'r') as f:
                        post_cache = json.load(f)
                    post_cache = remove_outdated(post_cache)
                except FileNotFoundError:
                    pass
                if post_cache is not None:
                    post_cache[clicked_id].update(clicked_post)
                    with open(cache_file, 'w') as f:
                        json.dump(post_cache, f)
            return jsonify(), 200

        elif 'viewedId' in data:
            if 'viewed_post_ids' in session:
                viewed_post_ids = session['viewed_post_ids']
            else:
                viewed_post_ids = {}

            viewed_id = data['viewedId']
            if viewed_id not in viewed_post_ids:
                viewed_post_ids[viewed_id] = None

            try:
                with lock:
                    with open(cache_file, 'r') as f:
                        post_cache = json.load(f)
            except FileNotFoundError:
                return jsonify(), 404

            print('Viewed {}'.format(viewed_id))
            viewed_post = post_cache[viewed_id]
            if not config['modes'].getboolean('debug_mode'):
                viewed_post['display_count'] += 1

            # save cache
            with lock:
                post_cache = None
                try:
                    with open(cache_file, 'r') as f:
                        post_cache = json.load(f)
                    post_cache = remove_outdated(post_cache)
                except FileNotFoundError:
                    pass
                if post_cache is not None:
                    post_cache[viewed_id].update(viewed_post)
                    with open(cache_file, 'w') as f:
                        json.dump(post_cache, f)

            session['viewed_post_ids'] = viewed_post_ids
            return jsonify(), 200

        return jsonify(), 404

    # return post data
    cur_sub_num = data['subredditIndex']
    cur_post_num = data['postIndex']
    post_amount = data['postAmount']
    sort_type = data['sortType']
    if cur_sub_num >= len(subreddit_names):
        return {}
    subreddit_name = subreddit_names[cur_sub_num]
    subreddit = reddit.subreddit(subreddit_name)

    # deal with quarantined subreddits
    try:
        subreddit.quaran.opt_in()
    except exceptions.Forbidden:
        pass

    if not config['modes'].getboolean('slow_mode'):
        submissions = subreddit.top(sort_type, limit=(cur_post_num + post_amount))
        for _ in range(cur_post_num):
            try:
                next(submissions)
            except StopIteration:
                break
    else:
        # posts = get_posts(submissions, SUBMISSION_SCORE_DEGRADATION)
        # print('slow mode')

        # slow mode only shows posts older than 24 hours
        # posts which have been visited should no longer be shown because they have settled down by now
        # load cache, filter posts earlier than 24 hours
        # start_time = time.time()
        # cached_posts = get_cached_posts(subreddit.display_name, min_hours=24, max_hours=48 + 8)
        # end_time = time.time()
        # if len(cached_posts) > 0:
        #     print('using cached posts', len(cached_posts))
        #     posts = cached_posts
        # else:
        #     print('no cached posts for r/{} meet requirements'.format(subreddit.display_name))
        # print('getting cached posts took {} seconds'.format(end_time - start_time))

        # for some reason sort_type will cause duplicates to be returned
        # the submissions seemingly start duplicating / looping back
        # using a small limit such as 10 will avoid duplication
        # unknown whether using limit itself avoids duplication
        # sort_type = score is inaccurate, it is more accurate to fetch all submissions and sort them

        # PSAW API
        # this was previously initialized at the start in global scope,
        # causing Pushshift API requests to get mixed up and return results from multiple API requests at once
        # initializing a new api object for each request seems to solve this problem
        api = PushshiftAPI(reddit)
        submissions = list(api.search_submissions(subreddit=subreddit_name, after='56h', before='24h'))

        id_set = set()
        for submission in submissions:
            if submission.id in id_set:
                print('{}: "{}" is duplicated'.format(submission.id, submission.title))
            else:
                id_set.add(submission.id)
            if submission.subreddit.display_name.lower() != subreddit_name.lower():
                print('submission {} of {} != subreddit {}'.format(
                    submission.id, submission.subreddit.display_name, subreddit_name))

        submissions.sort(key=lambda item: item.score, reverse=True)
        submissions = submissions[:10]
        print(id_set)
        print(subreddit.display_name, submissions)

    posts = get_posts(submissions)
    print(f'sub #{cur_sub_num}: {subreddit.display_name}, post {cur_post_num}, {post_amount} posts, '
          f'offset {cur_post_num + post_amount}, {posts}')
    return jsonify(posts)