Example #1
import os

import pandas as pd
from pmaw import PushshiftAPI


def get_comments_from_wallstreetbets(before, after):
    """
    Return a DataFrame of r/wallstreetbets comments posted within the given time window.
    :param before: pd.Timestamp
    :param after: pd.Timestamp
    :return: pd.DataFrame
    """

    # Use several worker threads per available CPU core (the requests are I/O-bound)
    max_threads = os.cpu_count() * 5

    # Scrape comments from r/wallstreetbets
    api = PushshiftAPI()
    subreddit = "wallstreetbets"
    comments = api.search_comments(
        # PMAW parameters
        mem_safe=True,
        num_workers=max_threads,
        # Pushshift.io parameters
        subreddit=subreddit,
        after=int(after.timestamp()),
        before=int(before.timestamp())
    )

    # Clean dataframe with comments
    comments_df = pd.DataFrame(comments)
    if not comments_df.empty:
        comments_df = comments_df[['id', 'author', 'body', 'created_utc']].drop_duplicates()
        comments_df.created_utc = pd.to_datetime(comments_df.created_utc, unit='s')
        comments_df = comments_df[~comments_df.body.isin(['[removed]', '[deleted]'])]
        comments_df = comments_df.sort_values('created_utc').reset_index(drop=True)

    return comments_df
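
A minimal usage sketch for the function above; the one-day window is hypothetical, and any pair of pd.Timestamp bounds works:

after = pd.Timestamp(2021, 1, 28)
before = pd.Timestamp(2021, 1, 29)
df = get_comments_from_wallstreetbets(before=before, after=after)
print(df.head())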
Example #2
def test_asc_sort():
    with pytest.raises(NotImplementedError):
        api = PushshiftAPI()
        comments = api.search_comments(subreddit="science",
                                       limit=100,
                                       before=1629990795,
                                       sort='asc')
Example #3
def test_comment_praw_query():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    comments = api_praw.search_comments(q="quantum",
                                        subreddit="science",
                                        limit=100,
                                        before=1629990795)
    assert (len(comments) == 100)
Example #4
def test_comment_praw_mem_safe():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    comments = api_praw.search_comments(subreddit="science",
                                        limit=1000,
                                        mem_safe=True,
                                        before=1629990795)
    assert (len(comments) == 1000)
Example #5
def test_praw_ids_filter():
    def fxn(item):
        return item['ups'] > 2

    api_praw = PushshiftAPI(praw=reddit)
    comments = api_praw.search_comments(ids=comment_ids, filter_fn=fxn)
    assert (len(comments) == 4)
Example #6
import datetime as dt

import pandas as pd
from pmaw import PushshiftAPI


class pea:
    def __init__(self, start: dt.datetime, end: dt.datetime, subreddit: str):
        self.start = start
        self.end = end
        self.subreddit = subreddit

        # initializing PMAW wrapper
        self.api = PushshiftAPI()
        # empty dataframe with the columns we keep
        self.peadf = pd.DataFrame(columns=['created_utc', 'author', 'body'])

    def __str__(self):
        return "start: {}, end: {}, subreddit: {}".format(
            self.start, self.end, self.subreddit)

    # get dataframe with time in utc, author, and body of text
    def getdf(self, limit=None):
        # prints out basic info
        print(self)

        # uses PMAW to gather the data from Pushshift
        comments = self.api.search_comments(subreddit=self.subreddit,
                                            rate_limit=30,
                                            limit=limit,
                                            before=int(self.end.timestamp()),
                                            after=int(self.start.timestamp()))
        print(f'Retrieved {len(comments)} comments from Pushshift')
        comments_df = pd.DataFrame(comments)
        comments_df = comments_df.loc[:, ['created_utc', 'author', 'body']]
        self.peadf = comments_df.sort_values(by=['created_utc']).set_index(
            ['created_utc'])

        return self.peadf

    # adds another column with tickers that are present in the body
    def analyze_df(self):
        self.peadf['tickers'] = self.peadf.apply(
            lambda row: analyze(row['body']), axis=1)
        return self.peadf
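
A brief usage sketch for the class above; the window is hypothetical, and analyze (called in analyze_df) is an external helper that must be defined elsewhere:

scraper = pea(start=dt.datetime(2021, 1, 28),
              end=dt.datetime(2021, 1, 29),
              subreddit="wallstreetbets")
df = scraper.getdf(limit=500)
print(df.head())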
Example #7
import os
import datetime as dt
from os import listdir
from os.path import isfile, join

import pandas as pd
from pmaw import PushshiftAPI


class redditdb:
    def __init__(self, subreddit: str = None, start: dt.datetime = None, path: str = os.getcwd(), limit: int = None):
        self.api = PushshiftAPI() 
        self.subreddit = subreddit
        self.start = start
        self.path = path
        self.limit = limit
        self.fpath = os.path.join(self.path, self.subreddit)
        self.fpathComments = os.path.join(self.fpath, "comments")
        self.fpathPosts = os.path.join(self.fpath, "posts")

        self.dfComments_list = None
        self.dfPosts_list = None

    # refresh the cached lists of saved csv filenames
    def updateListComments(self):
        self.dfComments_list = [f for f in listdir(self.fpathComments) if isfile(join(self.fpathComments, f))]

    def updateListPosts(self):
        self.dfPosts_list = [f for f in listdir(self.fpathPosts) if isfile(join(self.fpathPosts, f))]

    # updates comments and posts
    def updateAll(self, date: dt.datetime):
        if not os.path.isdir(self.fpath):
            os.makedirs(self.fpath)

        self.updateComments(date)
        self.updatePosts(date)

    # update the saved comment csvs up to the given date
    def updateComments(self, date: dt.datetime):
        # makes the directory if it doesn't already exist
        if not os.path.isdir(self.fpathComments):
            os.makedirs(self.fpathComments)
        # starts downloading
        self.getallComments(date)
    
    def updatePosts(self, date: dt.datetime):
        # makes the directory if it doesn't already exist
        if not os.path.isdir(self.fpathPosts):
            os.makedirs(self.fpathPosts)
        # starts downloading
        self.getallPosts(date)
        
    # save all data in a time range
    def getallComments(self, end: dt.datetime):
        self.updateListComments()
        print("Retrieving comment data from {}: {} to {}".format(self.subreddit, self.start, end))

        dset = set(self.dfComments_list)

        for i in range( int( (end-self.start).days ) + 1):
            day = self.start + dt.timedelta(days = i)

            # check whether the day is already accounted for; if not, download the comment dataframe
            if '{}.csv'.format(day.date()) not in dset:
                print('\033[32m{}\033[37m comments have not been downloaded to {}/comments. Downloading...'.format(day.date(), self.subreddit))
                self.savedayComments(day=day)
            else:
                print('\033[31m{}\033[37m comments already exist in {}/comments'.format(day.date(), self.subreddit))

    def getallPosts(self, end: dt.datetime):
        self.updateListPosts()
        print("Retrieving post data from {}: {} to {}".format(self.subreddit, self.start, end))

        dset = set(self.dfPosts_list)

        for i in range( int( (end-self.start).days ) + 1):
            day = self.start + dt.timedelta(days = i)

            # check whether the day is already accounted for; if not, download the post dataframe
            if '{}.csv'.format(day.date()) not in dset:
                print('\033[32m{}\033[37m posts have not been downloaded to {}/posts. Downloading...'.format(day.date(), self.subreddit))
                self.savedayPosts(day=day)
            else:
                print('\033[31m{}\033[37m posts already exist in {}/posts'.format(day.date(), self.subreddit))

    
    # helper function to save data by day
    def savedayComments(self, day: dt.datetime):
        # gets comments from PushShift using PMAW wrapper
        comments = self.api.search_comments(subreddit=self.subreddit,
                                            rate_limit=20,
                                            limit=self.limit,
                                            before=int((day + dt.timedelta(days=1)).timestamp()),
                                            after=int(day.timestamp()))
        print(f'Retrieved {len(comments)} comments from Pushshift')

        # converts into a dataframe with utc as index
        comments_df = pd.DataFrame(comments)
        # extra check
        if not comments_df.empty:
            comments_df = comments_df.sort_values(by=['created_utc']).set_index(['created_utc'])
        
        # calls download function
        self.downloadComments(comments_df, day)
    
    # helper function to save data by day
    def savedayPosts(self, day: dt.datetime):
        # gets posts from PushShift using PMAW wrapper
        posts = self.api.search_submissions(subreddit=self.subreddit,
                                            rate_limit=20,
                                            limit=self.limit,
                                            before=int((day + dt.timedelta(days=1)).timestamp()),
                                            after=int(day.timestamp()))
        print(f'Retrieved {len(posts)} posts from Pushshift')

        # converts into a dataframe with utc as index
        posts_df = pd.DataFrame(posts)
        # extra check
        if not posts_df.empty:
            posts_df = posts_df.sort_values(by=['created_utc']).set_index(['created_utc'])
        
        # calls download function
        self.downloadPosts(posts_df, day)


    # save the dataframe as a csv
    # path/subreddit/comments/<date>.csv
    def downloadComments(self, df: pd.DataFrame, day: dt.datetime):
        # checks to see if a folder with the name of the subreddit already exists
        if not os.path.isdir(self.fpathComments):
            os.makedirs(self.fpathComments)
        
        # names the file the current day
        fname = '{}.csv'.format(day.date())

        # save the file
        df.to_csv(os.path.join(self.fpathComments, fname))
        
    # save the dataframe as a csv
    # path/subreddit/posts/<date>.csv
    def downloadPosts(self, df: pd.DataFrame, day: dt.datetime):
        # checks to see if a folder with the name of the subreddit already exists
        if not os.path.isdir(self.fpathPosts):
            os.makedirs(self.fpathPosts)
        
        # names the file the current day
        fname = '{}.csv'.format(day.date())

        # save the file
        df.to_csv(os.path.join(self.fpathPosts, fname))

    # load dataframe into memory
    def loadDayComments(self, day: dt.datetime):
        # check if folder exists
 
        if not os.path.isdir(self.fpathComments):
            print('The folder for {}/comments does not exist'.format(self.subreddit))
            return pd.DataFrame()
        else:
            # get name of file for the date
            fname = '{}.csv'.format(day.date())
            loadpath = os.path.join(self.fpathComments, fname)
            # check for file
            if os.path.exists(loadpath):
                return pd.read_csv(loadpath) 
            else:
                print("\033[31m{}\033[37m does not exist in {}/posts".format(day.date(), self.subreddit))
                return pd.DataFrame()
    
    # load dataframe into memory
    def loadDayPosts(self, day: dt.datetime):
        # check if folder exists
 
        if not os.path.isdir(self.fpathPosts):
            print('The folder for {}/posts does not exist'.format(self.subreddit))
            return pd.DataFrame()
        else:
            # get name of file for the date
            fname = '{}.csv'.format(day.date())
            loadpath = os.path.join(self.fpathPosts, fname)
            # check for file
            if os.path.exists(loadpath):
                return pd.read_csv(loadpath) 
            else:
                print("\033[31m{}\033[37m does not exist in {}/posts".format(day.date(), self.subreddit))
                return pd.DataFrame()


    def loadRangeAll(self, start: dt.datetime, end: dt.datetime):
        return self.loadRangeComments(start, end), self.loadRangePosts(start, end)

    # return the combined dataframe
    def loadRangeComments(self, start: dt.datetime, end: dt.datetime):
        dfout = pd.DataFrame()
        for i in range( int( (end-start).days ) + 1):
            day = start + dt.timedelta(days = i)
            df = self.loadDayComments(day)
            
            if not df.empty:
                dfout = pd.concat([dfout, df])
        
        return dfout

    # return the combined dataframe
    def loadRangePosts(self, start: dt.datetime, end: dt.datetime):
        dfout = pd.DataFrame()
        for i in range( int( (end-start).days ) + 1):
            day = start + dt.timedelta(days = i)
            df = self.loadDayPosts(day)
            
            if not df.empty:
                dfout = pd.concat([dfout, df])
        
        return dfout
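
A short usage sketch for the class above (the subreddit, dates, and limit are hypothetical):

db = redditdb(subreddit="wallstreetbets",
              start=dt.datetime(2021, 1, 1),
              limit=1000)
# download comments and posts day by day, skipping days already on disk
db.updateAll(dt.datetime(2021, 1, 7))
comments_df, posts_df = db.loadRangeAll(dt.datetime(2021, 1, 1), dt.datetime(2021, 1, 7))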
Example #8
import datetime as dt

import pandas as pd
from pmaw import PushshiftAPI

api = PushshiftAPI()

# before = int(dt.datetime(2021,1,1).timestamp())
# after = int(dt.datetime(2020,12,1).timestamp())

before = int(dt.datetime(2021,2,1).timestamp())
after = int(dt.datetime(2021,1,1).timestamp())

subreddit="wallstreetbets"
limit=100

comments = api.search_comments(subreddit=subreddit, limit=limit, before=before, after=after)
print(f'Retrieved {len(comments)} comments from Pushshift')

comments_df = pd.DataFrame(comments)

# preview the comments data
print(comments_df.index)
print(comments_df.head(10).loc[:, ['created_utc', 'author', 'body']])

# comments_df.to_csv('./wsb_comments.csv', header=True, index=False, columns=list(comments_df.axes[1]))
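
As in Example #1, created_utc comes back as Unix epoch seconds; one extra line (a sketch) makes the preview human-readable:

comments_df['created_utc'] = pd.to_datetime(comments_df['created_utc'], unit='s')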
Example #9
def test_safe_exit_praw():
    with pytest.raises(NotImplementedError):
        api_praw = PushshiftAPI(praw=reddit)
        comments = api_praw.search_comments(ids=comment_ids, safe_exit=True)
Example #10
                results_df = results_df[
                    results_df.removed_by_category.isnull()]

            text_list_title = results_df.title.tolist()
            text_list_content = results_df.selftext.replace(
                '', np.nan)  # replace empty text with NaN
            # drop empty rows (many submissions have no text content)
            text_list_content = text_list_content.dropna().tolist()
            test_list_data = test_list_data + text_list_title + text_list_content

    else:
        file_name = 'comments'
        results = api.search_comments(score=min_score,
                                      sort='desc',
                                      sort_type='score',
                                      subreddit=subreddit,
                                      size=size,
                                      before=before,
                                      after=after,
                                      limit=limit)
        results_df = pd.DataFrame(results)

        # Extract text to list if result is not empty
        if not results_df.empty:
            text_list = results_df.body.tolist()
            test_list_data = test_list_data + text_list

    if not results_df.empty:
        if len(total_results_df) == 0:
            total_results_df = results_df.copy(deep=True)
        else:
            total_results_df = pd.concat([total_results_df, results_df])
Example #11
def test_comment_praw_ids():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    comments = api_praw.search_comments(ids=comment_ids)
    assert (len(comments) == len(comment_ids))
Example #12
def test_comment_search_limit():
    api = PushshiftAPI(file_checkpoint=1)
    comments = api.search_comments(subreddit="science",
                                   limit=100,
                                   before=1629990795)
    assert (len(comments) == 100)
Example #13
def test_comment_search_ids():
    api = PushshiftAPI(file_checkpoint=1)
    comments = api.search_comments(ids=comment_ids)
    assert (len(comments) == len(comment_ids))
Example #14
# Scrapes Reddit comments.

import datetime as dt

import pandas as pd
from pmaw import PushshiftAPI

api = PushshiftAPI()

before = int(dt.datetime(2021, 2, 1, 0, 0).timestamp())
after = int(dt.datetime(2020, 1, 1, 0, 0).timestamp())

subreddit = "wallstreetbets"
limit = 100000
comments = api.search_comments(q="GME",  # Pushshift's text-search parameter is `q`, as in Example #3
                               subreddit=subreddit,
                               limit=limit,
                               before=before,
                               after=after)
print(f'Retrieved {len(comments)} comments from Pushshift')

comments_df = pd.DataFrame(comments)
# preview the comments data
print(comments_df.head(5))

comments_df.to_csv('./wsb_comments.csv',
                   header=True,
                   index=False,
                   columns=list(comments_df.axes[1]))
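
If the query returns nothing, the export above writes an empty file; a guarded variant (a sketch, mirroring the .empty checks in the earlier examples) skips the write:

if not comments_df.empty:
    comments_df.to_csv('./wsb_comments.csv', header=True, index=False)
else:
    print('No comments retrieved; skipping CSV export')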