Example #1
def test_submission_praw_mem_safe():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    posts = api_praw.search_submissions(subreddit="science",
                                        limit=1000,
                                        mem_safe=True,
                                        before=1629990795)
    assert (len(posts) == 1000)
Example #2
def test_submission_praw_query():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    posts = api_praw.search_submissions(q="quantum",
                                        subreddit="science",
                                        limit=100,
                                        before=1629990795)
    assert (len(posts) == 100)
Example #3
def test_filter_key_exception():
    with pytest.raises(KeyError):
        api = PushshiftAPI()

        def fxn(item):
            return item['badkeydoesntexist'] > 2

        posts = api.search_submissions(ids=post_ids, filter_fn=fxn)
Example #4
def test_filter_param_exception():
    with pytest.raises(TypeError):
        api = PushshiftAPI()

        def fxn():
            return True

        posts = api.search_submissions(ids=post_ids, filter_fn=fxn)
Example #5
def test_search_ids_filter():
    api = PushshiftAPI()

    def fxn(item):
        return item['score'] > 2

    posts = api.search_submissions(ids=post_ids, filter_fn=fxn)
    assert (len(posts) == 2)
Example #6
import os
from os import listdir
from os.path import isfile, join
import datetime as dt

import pandas as pd
from pmaw import PushshiftAPI


class redditdb:
    def __init__(self, subreddit: str = None, start: dt.datetime = None, path: str = os.getcwd(), limit: int = None):
        self.api = PushshiftAPI() 
        self.subreddit = subreddit
        self.start = start
        self.path = path
        self.limit = limit
        self.fpath = os.path.join(self.path, self.subreddit)
        self.fpathComments = os.path.join(self.fpath, "comments")
        self.fpathPosts = os.path.join(self.fpath, "posts")

        self.dfComments_list = None
        self.dfPosts_list = None

    # get list of dataframes
    def updateListComments(self):
        self.dfComments_list = [f for f in listdir(self.fpathComments) if isfile(join(self.fpathComments, f))]

    def updateListPosts(self):
        self.dfPosts_list = [f for f in listdir(self.fpathPosts) if isfile(join(self.fpathPosts, f))]

    # updates comments and posts
    def updateAll(self, date: dt.datetime):
        if not os.path.isdir(self.fpath):
            os.makedirs(self.fpath)

        self.updateComments(date)
        self.updatePosts(date)

    # update set of comment dataframes to yesterday
    def updateComments(self, date: dt.datetime):
        # makes the directory if it doesn't already exist
        if not os.path.isdir(self.fpathComments):
            os.makedirs(self.fpathComments)
        # starts downloading
        self.getallComments(date)
    
    def updatePosts(self, date: dt.datetime):
        # makes the directory if it doesn't already exist
        if not os.path.isdir(self.fpathPosts):
            os.makedirs(self.fpathPosts)
        # starts downloading
        self.getallPosts(date)
        
    # save all data in a time range
    def getallComments(self, end: dt.datetime):
        self.updateListComments()
        print("Retrieving comment data from {}: {} to {}".format(self.subreddit, self.start, end))

        dset = set(self.dfComments_list)

        for i in range( int( (end-self.start).days ) + 1):
            day = self.start + dt.timedelta(days = i)

            # check if day is already accounted for, if not download the comment dataframe
            if not '{}.csv'.format(day.date()) in dset:
                print('\033[32m{}\033[37m comments have not been downloaded for {}/comments. Downloading...'.format(day.date(), self.subreddit))
                self.savedayComments(day = day)
            else:
                print('\033[31m{}\033[37m comments already exist in {}/comments'.format(day.date(), self.subreddit))

    def getallPosts(self, end: dt.datetime):
        self.updateListPosts()
        print("Retrieving post data from {}: {} to {}".format(self.subreddit, self.start, end))

        dset = set(self.dfPosts_list)

        for i in range( int( (end-self.start).days ) + 1):
            day = self.start + dt.timedelta(days = i)

            # check if day is already accounted for, if not download the post dataframe
            if not '{}.csv'.format(day.date()) in dset:
                print('\033[32m{}\033[37m posts have not been downloaded to {}/posts. Downloading...'.format(day.date(), self.subreddit))
                self.savedayPosts(day = day)
            else:
                print('\033[31m{}\033[37m posts already exist in {}/posts'.format(day.date(), self.subreddit))

    
    # helper function to save data by day
    def savedayComments(self, day: dt.datetime):
        # gets comments from PushShift using PMAW wrapper
        comments = self.api.search_comments(subreddit=self.subreddit, rate_limit = 20, limit=self.limit, before=int((day+dt.timedelta(days=1)).timestamp()), after=int(day.timestamp()))
        print(f'Retrieved {len(comments)} comments from Pushshift')

        # converts into a dataframe with utc as index
        comments_df = pd.DataFrame(comments)
        # extra check
        if not comments_df.empty:
            comments_df = comments_df.sort_values(by=['created_utc']).set_index(['created_utc'])
        
        # calls download function
        self.downloadComments(comments_df, day)
    
    # helper function to save data by day
    def savedayPosts(self, day: dt.datetime):
        # gets posts from PushShift using PMAW wrapper
        posts = self.api.search_submissions(subreddit=self.subreddit, rate_limit = 20, limit=self.limit, before=int((day+dt.timedelta(days=1)).timestamp()), after=int(day.timestamp()))
        print(f'Retrieved {len(posts)} posts from Pushshift')

        # converts into a dataframe with utc as index
        posts_df = pd.DataFrame(posts)
        # extra check
        if not posts_df.empty:
            posts_df = posts_df.sort_values(by=['created_utc']).set_index(['created_utc'])
        
        # calls download function
        self.downloadPosts(posts_df, day)


    # save the dataframe to disk as path/subreddit/comments/<date>.csv
    def downloadComments(self, df: pd.DataFrame, day: dt.datetime):
        # checks to see if a folder with the name of the subreddit already exists
        if not os.path.isdir(self.fpathComments):
            os.makedirs(self.fpathComments)
        
        # names the file the current day
        fname = '{}.csv'.format(day.date())

        #save the file
        df.to_csv(os.path.join(self.fpathComments, fname))
        
    # save the dataframe to disk as path/subreddit/posts/<date>.csv
    def downloadPosts(self, df: pd.DataFrame, day: dt.datetime):
        # checks to see if a folder with the name of the subreddit already exists
        if not os.path.isdir(self.fpathPosts):
            os.makedirs(self.fpathPosts)
        
        # names the file the current day
        fname = '{}.csv'.format(day.date())

        #save the file
        df.to_csv(os.path.join(self.fpathPosts, fname))

    # load dataframe into memory
    def loadDayComments(self, day: dt.datetime):
        # check if folder exists
 
        if not os.path.isdir(self.fpathComments):
            print('The folder for {}/comments does not exist'.format(self.subreddit))
            return pd.DataFrame()
        else:
            # get name of file for the date
            fname = '{}.csv'.format(day.date())
            loadpath = os.path.join(self.fpathComments, fname)
            # check for file
            if os.path.exists(loadpath):
                return pd.read_csv(loadpath) 
            else:
                print("\033[31m{}\033[37m does not exist in {}/posts".format(day.date(), self.subreddit))
                return pd.DataFrame()
    
    # load dataframe into memory
    def loadDayPosts(self, day: dt.datetime):
        # check if folder exists
 
        if not os.path.isdir(self.fpathPosts):
            print('The folder for {}/posts does not exist'.format(self.subreddit))
            return pd.DataFrame()
        else:
            # get name of file for the date
            fname = '{}.csv'.format(day.date())
            loadpath = os.path.join(self.fpathPosts, fname)
            # check for file
            if os.path.exists(loadpath):
                return pd.read_csv(loadpath) 
            else:
                print("\033[31m{}\033[37m does not exist in {}/posts".format(day.date(), self.subreddit))
                return pd.DataFrame()


    def loadRangeAll(self, start: dt.datetime, end: dt.datetime):
        return self.loadRangeComments(start, end), self.loadRangePosts(start, end)

    # return the combined dataframe
    def loadRangeComments(self, start: dt.datetime, end: dt.datetime):
        dfout = pd.DataFrame()
        for i in range( int( (end-start).days ) + 1):
            day = start + dt.timedelta(days = i)
            df = self.loadDayComments(day)
            
            if not df.empty:
                dfout = pd.concat([dfout, df])
        
        return dfout

    # return the combined dataframe
    def loadRangePosts(self, start: dt.datetime, end: dt.datetime):
        dfout = pd.DataFrame()
        for i in range( int( (end-start).days ) + 1):
            day = start + dt.timedelta(days = i)
            df = self.loadDayPosts(day)
            
            if not df.empty:
                dfout = pd.concat([dfout, df])
        
        return dfout
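
A minimal usage sketch for the redditdb class above; the subreddit name, date range, and limit below are placeholder values, not part of the original code:

import datetime as dt

# Hypothetical parameters: substitute any subreddit and start date.
db = redditdb(subreddit="science", start=dt.datetime(2021, 8, 1), limit=1000)

# Download comments and posts for every day from start up to the given date,
# skipping days whose CSV files already exist on disk.
db.updateAll(dt.datetime(2021, 8, 7))

# Load the saved CSVs back into two combined dataframes for a date range.
comments_df, posts_df = db.loadRangeAll(dt.datetime(2021, 8, 1),
                                        dt.datetime(2021, 8, 7))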
Example #7
import datetime as dt
import time
from multiprocessing import Process, Queue

import pandas as pd
from pmaw import PushshiftAPI


# QueryPostsInterface is the project's own base class, assumed to be imported elsewhere.
class QueryRedditPostsV2(QueryPostsInterface):
    def __init__(self, list_filters, parallelize, log) -> None:
        super().__init__()
        self._list_filters = list_filters
        self._parallelize = parallelize
        self._log = log
        self._dict_df_posts = {}
        self._api = PushshiftAPI()

    @property
    def dict_df_posts(self):
        return self._dict_df_posts

    def set_dict_df_posts(self, key, df) -> None:
        if (len(df) > 0):
            now = dt.datetime.now()
            dt_string = now.strftime("%d_%m_%Y_%H_%M_%S")
            k = 'reddit_pmaw_' + key + '_' + dt_string
            self._dict_df_posts[k] = df

    def query(self, reddit_filter, subreddit) -> pd.DataFrame:
        df_posts = pd.DataFrame()

        try:
            posts = self._api.search_submissions(subreddit=subreddit,
                                                 limit=reddit_filter.items,
                                                 **reddit_filter.query_params)

            # create the dataframe
            df_posts = pd.DataFrame(posts)

            # standardize the name of the text column
            df_posts.rename(
                columns={'selftext': 'text'},
                inplace=True)  # change 'selftext' to 'text' for preprocessing

            # format the date
            df_posts['created_utc'] = df_posts['created_utc'].apply(
                dt.datetime.fromtimestamp)

            # create a column with the value from the 'label' filter parameter
            if (reddit_filter.label is not None):
                df_posts['label'] = reddit_filter.label

        except Exception:
            self._log.exception('Fail to query Reddit posts.')

        # return the dataframe
        return df_posts

    def query_par(self, reddit_filter, queue, subreddit) -> None:
        # call query function to query posts and create a dataframe
        df_posts = self.query(reddit_filter, subreddit)

        # put the pandas dataframe in the queue
        queue.put(df_posts)

    def query_manager(self) -> None:
        self._log.timer_message(
            'Collecting Reddit data with the pmaw package.')

        # select only the Reddit filters
        list_reddit_filters = list(
            filter(lambda x: (x.key == 'Reddit' and x.library == 'pmaw'),
                   self._list_filters))

        # both methods perform the same task using a parallel or sequential strategy
        if (self._parallelize):
            # query posts parallelized
            self.query_parallel(list_reddit_filters)
        else:
            # query posts sequentially
            self.query_sequential(list_reddit_filters)

    def query_sequential(self, list_filters) -> None:
        start_time_seq = time.time()

        # separate filters by type
        search_filters = list(
            filter(lambda x: (x.filter_type == 'search'), list_filters))

        # for each subreddit from each filter, create a query of posts
        # concatenate all dataframes of posts information
        df_search_posts = pd.DataFrame()
        for sf in search_filters:
            for subreddit in sf.subreddits:
                df_posts = self.query(sf, subreddit)
                df_search_posts = pd.concat([df_search_posts, df_posts])

        self.set_dict_df_posts('search_posts', df_search_posts)
        self._log.user_message('Reddit posts\' query finished.')

        final_time_seq = time.time() - start_time_seq
        self._log.timer_message('Sequential Query Time: ' +
                                str(final_time_seq) + ' seconds.')

    def query_parallel(self, list_filters) -> None:
        start_time_par = time.time()

        # separate filters by type
        search_filters = list(
            filter(lambda x: (x.filter_type == 'search'), list_filters))

        # configure the queue
        queue_search = Queue()

        # for each subreddit from each filter, create a query of posts
        # concatenate all dataframes of posts information
        processes_search = []
        for sf in search_filters:
            processes_search.extend([
                Process(target=self.query_par, args=(sf, queue_search, sub))
                for sub in sf.subreddits
            ])

        # start the processes
        for p in processes_search:
            p.start()

        # concatenate all dataframes of search information
        df_search_posts = pd.DataFrame()
        for _ in processes_search:
            df_process_posts = queue_search.get()
            df_search_posts = pd.concat([df_search_posts, df_process_posts])

        self.set_dict_df_posts('search_posts', df_search_posts)
        self._log.user_message('Reddit posts\' query finished.')

        # wait the processes
        for p in processes_search:
            p.join()

        final_time_par = time.time() - start_time_par
        self._log.timer_message('Parallelized Query Time: ' +
                                str(final_time_par) + ' seconds.')
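
For context, a rough sketch of how QueryRedditPostsV2 might be driven; RedditFilter and SimpleLog below are illustrative stand-ins (not part of the original project) built only from the attributes and methods the class above actually uses:

from dataclasses import dataclass, field
import logging

@dataclass
class RedditFilter:
    # Attribute names mirror what QueryRedditPostsV2 reads from each filter.
    key: str = 'Reddit'
    library: str = 'pmaw'
    filter_type: str = 'search'
    subreddits: list = field(default_factory=lambda: ['science'])
    items: int = 100
    query_params: dict = field(default_factory=lambda: {'before': 1629990795})
    label: str = None  # None means no label column is added

class SimpleLog:
    # Minimal logger exposing the three methods the class calls.
    def timer_message(self, msg): logging.info(msg)
    def user_message(self, msg): logging.info(msg)
    def exception(self, msg): logging.exception(msg)

querier = QueryRedditPostsV2([RedditFilter()], parallelize=False, log=SimpleLog())
querier.query_manager()
print(querier.dict_df_posts.keys())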
Example #8
def test_submission_praw_ids():
    api_praw = PushshiftAPI(file_checkpoint=1, praw=reddit)
    posts = api_praw.search_submissions(ids=post_ids)
    assert (len(posts) == len(post_ids))
Example #9
def test_submission_search_limit():
    api = PushshiftAPI(file_checkpoint=1)
    posts = api.search_submissions(subreddit="science",
                                   limit=100,
                                   before=1629990795)
    assert (len(posts) == 100)
Example #10
def test_submission_search_ids():
    api = PushshiftAPI(file_checkpoint=1)
    posts = api.search_submissions(ids=post_ids)
    assert (len(posts) == len(post_ids))
Example #11
import numpy as np
import pandas as pd
from tqdm import tqdm

# api (a pmaw PushshiftAPI instance), subreddit, size, limit, and the initial
# before timestamp are assumed to be defined earlier in the original script.

#------------------------------------PARAMETERS EDIT------------------------------------#
min_score = '>40'  # '>' here behaves as '>='. Submission score 40, comment score 10
time_range = 365 * 2
download_per_loop = 10  # days
IS_SUBMISSION = True

# Download in smaller steps to avoid incomplete results (results may be lost when Pushshift shards are down)
for i in tqdm(range(int(time_range / download_per_loop) + 2)):
    after = before - 60 * 60 * 24 * download_per_loop  # step the window back by download_per_loop days

    if IS_SUBMISSION:
        file_name = 'submissions'
        results = api.search_submissions(score=min_score,
                                         sort='desc',
                                         sort_type='score',
                                         subreddit=subreddit,
                                         size=size,
                                         before=before,
                                         after=after,
                                         limit=limit)
        results_df = pd.DataFrame(results)

        if not results_df.empty:
            # Remove any post that was removed (posts that were not removed have this field as NaN)
            if hasattr(results_df, 'removed_by_category'):
                results_df = results_df[
                    results_df.removed_by_category.isnull()]

            text_list_title = results_df.title.tolist()
            text_list_content = results_df.selftext.replace(
                '', np.nan)  # Replace empty text with NaN
            text_list_content = text_list_content.dropna().tolist()
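
The snippet above is truncated before the loop body ends. As a minimal sketch of the windowed-download pattern it follows (not the original continuation), each iteration queries one window and then slides before back by download_per_loop days:

import datetime as dt
from pmaw import PushshiftAPI

api = PushshiftAPI()
subreddit = 'science'      # placeholder subreddit
limit = 500                # placeholder per-window cap
download_per_loop = 10     # days per window
before = int(dt.datetime(2021, 8, 26).timestamp())

for _ in range(5):
    after = before - 60 * 60 * 24 * download_per_loop
    posts = api.search_submissions(subreddit=subreddit,
                                   before=before,
                                   after=after,
                                   limit=limit)
    print(f'{len(posts)} posts between {after} and {before}')
    # Slide the window further back in time for the next iteration.
    before = after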
Example #12
def test_filter_callable():
    with pytest.raises(ValueError):
        api = PushshiftAPI()
        posts = api.search_submissions(ids=post_ids, filter_fn='fxn')