from pmb import *
from psaw import PushshiftAPI
import pendulum  # assumed needed for pendulum.from_timestamp below; pmb may already re-export it

config = get_config()

ps = PushshiftAPI()


def checkuser(args, channel):
	try:
		user = args['user']
	except KeyError:
		return "No user given."

	response = ""

	submissions = ps.search_submissions(author=user, subreddit='politics', limit=10, filter=['created_utc', 'title', 'permalink', 'id'])
	response += '*SUBMISSIONS*\n'
	for submission in submissions:
		ts = pendulum.from_timestamp(submission.created_utc, tz='UTC').to_datetime_string()
		out = '- *{}*: "{}" <https://reddit.com{}|Post> | <http://api.pushshift.io/reddit/search/submission/?ids={}|Archive>'.format(ts, submission.title, submission.permalink, submission.id)
		response += out + '\n'

	response += '\n'

	comments = ps.search_comments(author=user, subreddit='politics', limit=10, filter=['created_utc', 'body', 'permalink', 'id'])
	response += '*COMMENTS*\n'
	for comment in comments:
		ts = pendulum.from_timestamp(comment.created_utc, tz='UTC').to_datetime_string()

		if len(comment.body) <= 100:
Example 2
def get_due_dilligence(
    ticker: str,
    limit: int = 5,
    n_days: int = 3,
    show_all_flairs: bool = False
) -> List[praw.models.reddit.submission.Submission]:
    """Gets due dilligence posts from list of subreddits [Source: reddit]

    Parameters
    ----------
    ticker: str
        Stock ticker
    limit: int
        Number of posts to get
    n_days: int
        Number of days back to get posts
    show_all_flairs: bool
        Search through all flairs (apart from Yolo and Meme)

    Returns
    -------
    List[praw.models.reddit.submission.Submission]
        List of submissions
    """
    praw_api = praw.Reddit(
        client_id=cfg.API_REDDIT_CLIENT_ID,
        client_secret=cfg.API_REDDIT_CLIENT_SECRET,
        username=cfg.API_REDDIT_USERNAME,
        user_agent=cfg.API_REDDIT_USER_AGENT,
        password=cfg.API_REDDIT_PASSWORD,
    )

    psaw_api = PushshiftAPI()

    n_ts_after = int((datetime.today() - timedelta(days=n_days)).timestamp())
    l_flair_text = [
        "DD",
        "technical analysis",
        "Catalyst",
        "News",
        "Advice",
        "Chart",
        "Charts and Setups",
        "Fundamental Analysis",
        "forex",
        "Trade Idea",
    ]
    l_sub_reddits_dd = [
        "pennystocks",
        "RobinHoodPennyStocks",
        "Daytrading",
        "StockMarket",
        "stocks",
        "investing",
        "wallstreetbets",
        "forex",
        "Forexstrategy",
    ]

    submissions = psaw_api.search_submissions(after=int(n_ts_after),
                                              subreddit=l_sub_reddits_dd,
                                              q=ticker,
                                              filter=["id"])
    n_flair_posts_found = 0
    subs = []
    for submission in submissions:
        # Get more information about post using PRAW api
        submission = praw_api.submission(id=submission.id)

        # Ensure that the post hasn't been removed in the meanwhile
        if not submission.removed_by_category:

            # Either focus on the flairs in l_flair_text, or just filter out the
            # Yolo and Meme flairs, depending on show_all_flairs
            if (
                    submission.link_flair_text not in ["Yolo", "Meme"]
                    if show_all_flairs
                    else submission.link_flair_text in l_flair_text
            ):

                subs.append(submission)
                # Increment count of valid posts found
                n_flair_posts_found += 1

        # Check if number of wanted posts found has been reached
        if n_flair_posts_found > limit - 1:
            break

    return subs
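
A minimal usage sketch, not part of the original example: it assumes the cfg Reddit credentials and the praw/psaw imports referenced above are configured, and the ticker value is purely illustrative.

# Hypothetical call: fetch up to 5 DD-flaired posts mentioning GME from the last 3 days.
dd_posts = get_due_dilligence(ticker="GME", limit=5, n_days=3)
for post in dd_posts:
    print(post.link_flair_text, "-", post.title)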
Example 3
def popular_tickers(l_args):
    parser = argparse.ArgumentParser(
        prog='popular',
        description="""Print latest popular tickers. [Source: Reddit] """)
    parser.add_argument('-l',
                        "--limit",
                        action="store",
                        dest="n_limit",
                        type=check_positive,
                        default=50,
                        help='limit of posts retrieved per sub reddit.')
    parser.add_argument(
        '-s',
        "--sub",
        action="store",
        dest="s_subreddit",
        type=str,
        help="""subreddits to look for tickers, e.g. pennystocks,stocks.
                        Default: pennystocks, RobinHoodPennyStocks, Daytrading, StockMarket, stocks, investing, wallstreetbets"""
    )
    parser.add_argument('-d',
                        "--days",
                        action="store",
                        dest="n_days",
                        type=check_positive,
                        default=1,
                        help="look for the tickers from those n past days.")

    try:
        (ns_parser, l_unknown_args) = parser.parse_known_args(l_args)

        if l_unknown_args:
            print(
                f"The following args couldn't be interpreted: {l_unknown_args}\n"
            )
            return

        n_ts_after = int(
            (datetime.today() - timedelta(days=ns_parser.n_days)).timestamp())

        if ns_parser.s_subreddit:
            if ',' in ns_parser.s_subreddit:
                l_sub_reddits = ns_parser.s_subreddit.split(',')
            else:
                l_sub_reddits = [ns_parser.s_subreddit]
        else:
            l_sub_reddits = [
                'pennystocks', 'RobinHoodPennyStocks', 'Daytrading',
                'StockMarket', 'stocks', 'investing', 'wallstreetbets'
            ]

        d_submission = {}
        d_watchlist_tickers = {}
        l_watchlist_links = list()
        l_watchlist_author = list()

        praw_api = praw.Reddit(client_id=cfg.API_REDDIT_CLIENT_ID,
                               client_secret=cfg.API_REDDIT_CLIENT_SECRET,
                               username=cfg.API_REDDIT_USERNAME,
                               user_agent=cfg.API_REDDIT_USER_AGENT,
                               password=cfg.API_REDDIT_PASSWORD)

        psaw_api = PushshiftAPI()

        for s_sub_reddit in l_sub_reddits:
            print(
                f"Search for latest tickers under {ns_parser.n_limit} '{s_sub_reddit}' posts"
            )
            submissions = psaw_api.search_submissions(after=int(n_ts_after),
                                                      subreddit=s_sub_reddit,
                                                      limit=ns_parser.n_limit,
                                                      filter=['id'])

            n_tickers = 0
            while True:
                submission = next(submissions, None)
                if submission:
                    # Get more information about post using PRAW api
                    submission = praw_api.submission(id=submission.id)

                    # Ensure that the post hasn't been removed by moderator in the meanwhile,
                    # that there is a description and it's not just an image, that the flair is
                    # meaningful, and that we aren't re-considering same author's content
                    if not submission.removed_by_category and (submission.selftext or submission.title) \
                        and submission.author.name not in l_watchlist_author:
                        ls_text = list()
                        ls_text.append(submission.selftext)
                        ls_text.append(submission.title)

                        submission.comments.replace_more(limit=0)
                        for comment in submission.comments.list():
                            ls_text.append(comment.body)

                        l_tickers_found = list()
                        for s_text in ls_text:
                            for s_ticker in set(
                                    re.findall(r'([A-Z]{3,5} )', s_text)):
                                l_tickers_found.append(s_ticker.strip())

                        if l_tickers_found:
                            n_tickers += len(l_tickers_found)

                            # Add another author's name to the parsed watchlists
                            l_watchlist_author.append(submission.author.name)

                            # Lookup stock tickers within a watchlist
                            for key in l_tickers_found:
                                if key in d_watchlist_tickers:
                                    # Increment stock ticker found
                                    d_watchlist_tickers[key] += 1
                                else:
                                    # Initialize stock ticker found
                                    d_watchlist_tickers[key] = 1

                # Check if search_submissions didn't get anymore posts
                else:
                    break

            print(f"  {n_tickers} tickers found.")

        lt_watchlist_sorted = sorted(d_watchlist_tickers.items(),
                                     key=lambda item: item[1],
                                     reverse=True)
        if lt_watchlist_sorted:
            print(
                f"\nThe following TOP10 tickers have been mentioned in the last {ns_parser.n_days} days:"
            )
            n_top_stocks = 0
            for t_ticker in lt_watchlist_sorted:
                if n_top_stocks > 9:
                    break
                try:
                    # If this doesn't raise, the ticker exists on finviz,
                    # so we can print it.
                    finviz.get_stock(t_ticker[0])
                    print(f"{t_ticker[1]} {t_ticker[0]}")
                    n_top_stocks += 1
                except Exception:
                    pass
        else:
            print("No tickers found")
        print("")

    except Exception as e:
        print(e, "\n")
Example 4
import praw
from prawcore.exceptions import RequestException
from psaw import PushshiftAPI

reddit = praw.Reddit(client_id='',
                     client_secret='',
                     user_agent='my user agent')

api = PushshiftAPI(reddit)
gen1 = []
try:
    gen1 = api.search_submissions(author="newtothistinderthing")
except RequestException:
    print("Failed to fetch submission")

posts_list = list(gen1)
posts = []
for post_item in posts_list:
    if len(post_item.selftext.split()) >= 100:
        post_instance = {}
        post_instance.update({"id": post_item.id})
        post_instance.update({"created_utc": post_item.created_utc})
        post_instance.update({"subreddit": str(post_item.subreddit)})
        post_instance.update({"title": post_item.title})
        post_instance.update({"selftext": post_item.selftext})
        post_instance.update({"score": post_item.score})
        post_instance.update({"num_comments": post_item.num_comments})
        posts.append(post_instance)

for post in posts:
    print(post)
Example 5
def get_watchlists(
    n_to_get: int,
) -> Tuple[List[praw.models.reddit.submission.Submission], Dict, int]:
    """Get reddit users watchlists [Source: reddit]

    Parameters
    ----------
    n_to_get : int
        Number of posts to look through

    Returns
    -------
    List[praw.models.reddit.submission.Submission]:
        List of reddit submissions
    Dict:
        Dictionary of tickers and counts
    int
        Count of how many posts were analyzed
    """
    d_watchlist_tickers: Dict = {}
    l_watchlist_author = []
    subs = []

    praw_api = praw.Reddit(
        client_id=cfg.API_REDDIT_CLIENT_ID,
        client_secret=cfg.API_REDDIT_CLIENT_SECRET,
        username=cfg.API_REDDIT_USERNAME,
        user_agent=cfg.API_REDDIT_USER_AGENT,
        password=cfg.API_REDDIT_PASSWORD,
    )
    psaw_api = PushshiftAPI()
    submissions = psaw_api.search_submissions(
        subreddit=l_sub_reddits,
        q="WATCHLIST|Watchlist|watchlist",
        filter=["id"],
    )
    n_flair_posts_found = 0
    for sub in submissions:
        submission = praw_api.submission(id=sub.id)
        if (not submission.removed_by_category and submission.selftext
                and submission.link_flair_text not in ["Yolo", "Meme"]
                and submission.author.name not in l_watchlist_author):
            l_tickers_found = find_tickers(submission)

            if l_tickers_found:
                # Add another author's name to the parsed watchlists
                l_watchlist_author.append(submission.author.name)

                # Lookup stock tickers within a watchlist
                for key in l_tickers_found:
                    if key in d_watchlist_tickers:
                        # Increment stock ticker found
                        d_watchlist_tickers[key] += 1
                    else:
                        # Initialize stock ticker found
                        d_watchlist_tickers[key] = 1

                # Increment count of valid posts found
                n_flair_posts_found += 1
                subs.append(submission)
        if n_flair_posts_found > n_to_get - 1:
            break
    return subs, d_watchlist_tickers, n_flair_posts_found
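
A brief usage sketch, not in the original; it assumes the module-level l_sub_reddits list, the find_tickers helper, and the cfg credentials referenced above are defined elsewhere in the module.

# Hypothetical call: look through up to 10 watchlist posts and print ticker counts.
subs, ticker_counts, n_found = get_watchlists(n_to_get=10)
for ticker, count in sorted(ticker_counts.items(), key=lambda kv: kv[1], reverse=True):
    print(count, ticker)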
Example 6
import praw
import pandas as pd
import datetime as dt
import config
from psaw import PushshiftAPI
import numpy as np
import time
import sys
reddit = praw.Reddit(client_id=config.client,
                     client_secret=config.secret,
                     user_agent=config.user)
api = PushshiftAPI(reddit)

SUBREDDIT = 'wallstreetbets'
KEYWORD = 'dogecoin'

tic = dt.datetime.today()
start = int(dt.datetime(2021, 4, 26).timestamp())
posts = list(
    api.search_submissions(
        after=start,
        subreddit=SUBREDDIT,
        filter=[
            'title', 'selftext', 'created_utc', 'num_comments', 'score',
            'permalink', 'upvote_ratio'
        ],
        limit=1000000,
        q=KEYWORD,
        stop_condition=lambda x: x.created_utc < dt.datetime.now(
            dt.timezone.utc).replace(tzinfo=dt.timezone.utc).timestamp(
            ) - 60 * 60 * 24 * 300))
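
A possible follow-up step, not in the original snippet: flatten the fetched posts into a DataFrame via each result's raw Pushshift payload, which psaw exposes as the .d_ attribute (the same pattern appears in later examples on this page).

# Hypothetical continuation using names defined above (posts, KEYWORD, tic).
df = pd.DataFrame([post.d_ for post in posts])
print(f"Fetched {len(df)} '{KEYWORD}' posts in {dt.datetime.today() - tic}")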
Example 7
import datetime
import os
from typing import Any, Iterable, Iterator, List, Optional, Tuple, Union

import pandas as pd
import praw
from loguru import logger
from psaw import PushshiftAPI
from tqdm import tqdm

from finlang.config.constants import (COMMENT_FIELDS, SUBMISSION_FIELDS,
                                      SUBMISSION_SORTS, reddit)
from finlang.nlp_utils import db_utils as dbu
from finlang.scrape_utils import data_utils as du

api = PushshiftAPI()


def get_submissions(subreddit: praw.reddit.Subreddit, sort_method: str,
                    **kwargs) -> Iterator[Any]:
    """Gets the submissions from a specified subreddit using a given sorting option33

    Args:
        subreddit: Subreddit to pull submissions from
        sort_method: Method of sorting to use when pulling the submissions
        kwargs: additional keyword arguments used in the specific sort method call

    Returns:
        List of submissions retrieved from the subreddit
    """
    sort_method = sort_method.casefold()
Example 8
import argparse
import collections
import copy
import itertools
import json
import logging
import os
import random
from pathlib import Path

import pandas as pd
from psaw import PushshiftAPI
from tqdm import tqdm

api = PushshiftAPI()

parser = argparse.ArgumentParser()
parser.add_argument(
    "-o",
    "--out_path",
    type=str,
    default="./data/reddit_threads/",
    help="Path or url of the dataset. If empty download from S3.",
)
parser.add_argument(
    "-s",
    "--subreddit",
    type=str,
    action="append",
    default=[
"""
import logging
from datetime import datetime
from typing import Dict, List

from psaw import PushshiftAPI

from bootstrap import END_DATE, START_DATE, blossom
from bootstrap.migrate_redis_data import (
    CommentData,
    comment_filter,
    dict_from_comment,
    extract_id_from_grafeas_url,
)

push = PushshiftAPI()


def _get_link_id(tr: Dict) -> str:
    """Get the Reddit link ID of the given transcription.

    For https://reddit.com/r/worldnewsvideo/comments/s48db9/bronx_fire_landlord_had_history_of_neglecting_heat/hss8wz0/
    it returns s48db9.
    """
    return tr["url"].split("/")[6]


def _remove_footer(transcription: str) -> str:
    """Remove the footer of the transcription."""
    parts = transcription.split("---")
    if len(parts) < 3:
Example 10
    def __init__(self):

        self.__api = PushshiftAPI()
Example 11
def make_pushshiftAPI(reddit):
    api = PushshiftAPI(reddit)
    return api
Example 12
import csv
from datetime import datetime
import time
from psaw import PushshiftAPI

api = PushshiftAPI()

with open(
        'saved_posts.csv', 'r',
        newline='') as f:  # Opens the CSV files from your Reddit data request
    next(f)
    progress = 1
    reader = csv.reader(f)
    first_column = next(zip(*reader))
    data = list(first_column)
    with open('fixed_posts.csv', 'a', newline=''
              ) as f2:  # Opens/creates the new CSV where the post info will go
        writer = csv.writer(f2)
        writer.writerow([
            "Date/Time", "Subreddit", "Post Title", "Link", "Is Self-Post?",
            "URL", "Post ID"
        ])  # Writes name for each column
        for i in data:
            i = str(i)
            search = api.search_submissions(
                ids=i, limit=1,
                aggs='title')  # Searches Pushshift for matching post
            for post in search:
                subreddit = post.subreddit
                title = post.title
                title = title.encode("ascii", errors="ignore").decode(
Example 13
import requests
import csv
import json
import pandas as pd
import pymongo
from psaw import PushshiftAPI

api = PushshiftAPI()

input = input('Enter company name: ')

news = list(
    api.search_submissions(subreddit='news',
                           filter=[
                               'title', 'url', 'num_comments', 'author',
                               'score', 'created_utc'
                           ],
                           limit=150000))
print("connecting to db")
# myclient = pymongo.MongoClient("18.219.233.150:27017")
myclient = pymongo.MongoClient("127.0.0.1:27017")
print("database connected")
database = myclient['fibstock']
collection = database['news']

coll = []
for submission in news:
    coll.append({
        'title': submission.title,
        'link': submission.url,
        'publishedAt': submission.created_utc,
Example 14
class RedditScraper:
    def __init__(self, subreddit_list, ds, after, before, config):
        self.reddit: praw.Reddit = praw.Reddit(
            client_id=config["REDDIT"].get("client_id"),
            client_secret=config["REDDIT"].get("client_secret"),
            user_agent=config["REDDIT"].get("user_agent"),
        )
        self.api = PushshiftAPI(self.reddit)
        self.subreddit_list = subreddit_list
        self.ds = ds
        self.after = after
        self.before = before
        self.submissions = None
        self.engine = create_engine(config["DB"].get("db_url"))
        self.session = Session(self.engine)

    def get_submissions(self):
        self.submissions = map(
            RedditSubmission,
            self.api.search_submissions(
                after=self.after,
                before=self.before,
                subreddit=",".join(self.subreddit_list),
            ),
        )

    def upload(self, fetch_past):
        redditors = {}
        submissions = {}
        subreddits = {}
        comments = {}
        redditor_subreddit_comments = {}
        redditor_subreddit_submissions = {}
        for s in self.submissions:
            try:
                if (not s or not s.submission
                        or not hasattr(s.submission, "id")
                        or not s.submission.id or not s.author
                        or not hasattr(s.author, "id") or not s.author.id):
                    continue
                if not fetch_past:
                    redditors[s.author.id] = parse_redditor(s.author)
                    submissions[s.submission.id] = {
                        "id": s.submission.id,
                        "score": s.submission.score,
                    }
                    subreddits[s.subreddit.id] = {
                        "id": s.subreddit.id,
                        "name": s.subreddit.display_name,
                    }
                    redditor_subreddit_submissions[s.submission.id] = {
                        "redditor_id": s.author.id,
                        "subreddit_id": s.subreddit.id,
                        "submission_id": s.submission.id,
                        "ds": self.ds,
                    }
                for c in s.comments:
                    if (not c or not hasattr(c, "id") or not c.id
                            or not c.author or not hasattr(c.author, "id")):
                        continue
                    comments[c.id] = {
                        "id": c.id,
                        "submission_id": s.submission.id,
                        "score": c.score,
                    }
                    subreddits[s.subreddit.id] = {
                        "id": s.subreddit.id,
                        "name": s.subreddit.display_name,
                    }
                    redditors[c.author.id] = parse_redditor(c.author)
                    redditor_subreddit_comments[c.id] = {
                        "redditor_id": c.author.id,
                        "subreddit_id": s.subreddit.id,
                        "comment_id": c.id,
                        "ds": self.ds,
                    }
            except Exception as e:
                logging.warning(f"An error has occured while parsing: {e}")
                continue

        if not redditor_subreddit_comments and not redditor_subreddit_submissions:
            return len(redditor_subreddit_submissions), len(
                redditor_subreddit_comments)

        # PostgreSQL upsert
        # https://docs.sqlalchemy.org/en/13/dialects/postgresql.html#insert-on-conflict-upsert
        try:
            self.session.execute(
                insert(m.Redditor).values(list(
                    redditors.values())).on_conflict_do_nothing())
            if not fetch_past:
                self.session.execute(
                    insert(m.Submission).values(list(
                        submissions.values())).on_conflict_do_nothing())
                self.session.execute(
                    insert(m.Redditor_Subreddit_Submission).values(
                        list(redditor_subreddit_submissions.values())).
                    on_conflict_do_nothing())
            self.session.execute(
                insert(m.Subreddit).values(list(
                    subreddits.values())).on_conflict_do_nothing())
            self.session.execute(
                insert(m.Comment).values(list(
                    comments.values())).on_conflict_do_nothing())
            self.session.execute(
                insert(m.Redditor_Subreddit_Comment).values(
                    list(redditor_subreddit_comments.values())).
                on_conflict_do_nothing())
            self.session.commit()
        except Exception as e:
            logging.warning(f"An error occured during insertion: {e}")
            self.session.rollback()
        finally:
            self.session.close()
        return len(redditor_subreddit_submissions), len(
            redditor_subreddit_comments)
Example 15
from psaw import PushshiftAPI
import datetime as dt

api = PushshiftAPI()

start_epoch=int(dt.datetime(2017, 1, 1).timestamp())

print(list(api.search_submissions(after=start_epoch,
                            subreddit='politics',
                            filter=['url','author', 'title', 'subreddit'],
                            limit=10)))
Example 16
def reddit_scrape_by_entity(entity, start_date, end_date):
    '''
    Retrieves posts relating to entity from reddit within the stipulated time frame 

    Input:
        entity(string): entity name to retrieve data on
        start_date(datetime): date to begin scraping from
        end_date(datetime): date to stop scraping
    Output:
        df(dataframe): dataframe with columns = [author, url, excerpt, subreddit, title, article_date,
                                                 type, entity, source_id, content, count, date_time_all, coin, source]
    '''

    # initialise api
    api = PushshiftAPI()

    # convert datetime to timestamp
    start_epoch = int(start_date.timestamp())
    end_epoch = int(end_date.timestamp())

    # read in list of subreddits
    subreddits = pd.read_csv(r'../scraping/data/subreddit_list.csv')['subreddit'].tolist()
    
    entity = entity.lower()

    ############################## Submissions ################################
    
    # query and generate the related information
    gen_submission = api.search_submissions(q=entity,after= start_epoch, before = end_epoch,
            filter=['created_utc', 'title', 'selftext', 'permalink', 'author', 'subreddit', 'id'],
            subreddit = subreddits)

    # generate dataframe for required data
    df_submission = pd.DataFrame([post.d_ for post in gen_submission])

    # format dataframe 
    if df_submission.empty == False:
        df_submission['title'] = df_submission['title'].apply(lambda x: str(x).lower())
        df_submission['date_time'] = df_submission['created_utc'].apply(lambda x: datetime.fromtimestamp(x))
        df_submission['selftext'] = df_submission['selftext'].apply(lambda x: str(x).lower())
        df_submission['permalink'] = df_submission['permalink'].apply(lambda x: 'www.reddit.com'+ x)
        df_submission['author'] = df_submission['author'].apply(lambda x: x.lower())
        df_submission['subreddit'] = df_submission['subreddit'].apply(lambda x: x.lower())
        df_submission['type'] = 'submission'

        # Remove unnecessary columns of data
        df_submission = df_submission.drop(columns = ['created_utc','created'])

        df_submission = df_submission.rename(columns={'selftext': 'excerpt', 'permalink':'article_url'})
    

    ############################## Comments ################################

    # query and generate the related information
    gen_comments = api.search_comments(q=entity,after= start_epoch, before = end_epoch,
            filter=['created_utc', 'body', 'permalink', 'author', 'subreddit', 'id'],
            subreddit = subreddits)


    # generate dataframe for required data
    df_comment = pd.DataFrame([comm.d_ for comm in gen_comments])

    # format dataframe 
    if df_comment.empty == False:
        df_comment['date_time'] = df_comment['created_utc'].apply(lambda x: datetime.fromtimestamp(x))
        df_comment['body'] = df_comment['body'].apply(lambda x: str(x).lower())
        df_comment['permalink'] = df_comment['permalink'].apply(lambda x: 'www.reddit.com'+ x)
        df_comment['author'] = df_comment['author'].apply(lambda x: x.lower())
        df_comment['subreddit'] = df_comment['subreddit'].apply(lambda x: x.lower())
        df_comment['excerpt'] = ''
        df_comment['type'] = 'comments'
        df_comment['id'] = 'comments/' + df_comment['id']

        # remove unnecessary columns of data
        df_comment = df_comment.drop(columns = ['created_utc','created'])

        # for comments, there are no titles so the body of the comment will be used as the title
        df_comment = df_comment.rename(columns={'body': 'title', 'permalink':'article_url'})    

    # concatenate submissions and comments dataframe
    df = pd.DataFrame(columns = ['author', 'article_url', 'excerpt', 'subreddit','title', 'date_time','type','entity','id'])
    df = df.append(df_submission)
    df = df.append(df_comment)
    
    df['entity'] = entity
    
    df = df.fillna('')
    df["text"] = df["title"] + " " + df["excerpt"]

    # filter out irrelevant data
    mask1 = list(df.apply(lambda x: filter_out(x["title"]) and filter_out(x["excerpt"]), axis=1))
    df = df[mask1]
    mask2 = list(df.apply(lambda x: filter_in(x["title"]) or filter_in(x["excerpt"]), axis=1))
    df = df[mask2]
    mask3 = list(df.apply(lambda x: filter_entity(str(x["text"]), entity), axis=1))
    df = df[mask3]

    # process duplicates
    df = process_duplicates(df)

    # find all coins that are relevant in text
    df['coin'] = df['text'].apply(lambda x: get_coins(x))

    # reset index
    df = df.reset_index(drop=True)

    # add source column
    df['source'] = 'reddit'

    # rename dataframe using naming convention in final database
    df = df.rename({'text':'content', 'article_url':'url', 'date_time':'article_date','id':'source_id'}, axis = 1)
    
    # keep only relevant columns
    df = df[['source','source_id','article_date','content', 'url','count','entity', 'author','coin']]

    return df
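
A short usage sketch, not part of the original; it assumes the helpers this function relies on (filter_out, filter_in, filter_entity, process_duplicates, get_coins), the pandas/datetime imports, and the subreddit_list.csv file are available, and the entity and dates are illustrative.

# Hypothetical call: scrape January 2021 submissions and comments mentioning "bitcoin".
df = reddit_scrape_by_entity('bitcoin', datetime(2021, 1, 1), datetime(2021, 1, 31))
print(df.head())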
Example 17
from psaw import PushshiftAPI
import praw
import pandas as pd
import datetime as dt
import os
import numpy as np

# insert Reddit credentials here
# reddit = praw.Reddit(...)

# make sure we're in read-only mode
# reddit.read_only = True

# use PRAW credentials; then PSAW returns the IDs that we can use in PRAW
api = PushshiftAPI(reddit)

# set range of dates to scrape
start_day = dt.datetime(2021, 1, 10)
date_list = [start_day + dt.timedelta(days=x) for x in range(1)]

# create empty list to hold submission ids
all_ids = list()

# iterate through the dates and pull the posts
for day in date_list:
    # set starting day for this loop
    start_epoch = int(day.timestamp())
    # add one day to start_epoch
    end_epoch = start_epoch + (24 * 60 * 60)

    # get the submission ids for a given day
Example 18
        except:
            print('no com')


redditPassword = config.settings['redditPassword']
redditClientSecret = config.settings['redditClientSecret']

redditClientSecret = config.client_secret
redditPassword = config.redditPassword

reddit = praw.Reddit(client_id='GCjpdb-78ljIQg',
                     client_secret=redditClientSecret,
                     password=redditPassword,
                     user_agent='testguyman',
                     username='******')
api = PushshiftAPI(reddit)

time_start = dt.datetime(2020, 2, 21)
track_time_minutes = dt.datetime.now().time()
track_time_combined = datetime.combine(time_start, track_time_minutes)
print(track_time_combined)
time_end = time_start + timedelta(days=1)
stock_ticker_tracking_array1 = {}
start_epoch = time_start

end_epoch = time_end

List1 = list(
    api.search_submissions(q='Daily Discussion Thread',
                           after=start_epoch,
                           before=end_epoch,
Example 19
class ReadCommentsAll():
    def __init__(self, subreddit_name, limit):
        print("API parameters:", subreddit_name, limit)

        ranges = [(1, 1, 2019, 1, 2, 2019), (1, 2, 2019, 1, 3, 2019),
                  (1, 3, 2019, 1, 4, 2019), (1, 4, 2019, 1, 5, 2019),
                  (1, 5, 2019, 1, 6, 2019), (1, 6, 2019, 1, 7, 2019),
                  (1, 7, 2019, 1, 8, 2019), (1, 8, 2019, 1, 9, 2019),
                  (1, 9, 2019, 1, 10, 2019), (1, 10, 2019, 1, 11, 2019),
                  (1, 11, 2019, 1, 12, 2019), (1, 12, 2019, 1, 1, 2020),
                  (1, 1, 2020, 1, 2, 2020), (1, 2, 2020, 1, 3, 2020),
                  (1, 3, 2020, 1, 4, 2020), (1, 4, 2020, 1, 5, 2020),
                  (1, 5, 2020, 1, 6, 2020), (1, 6, 2020, 1, 7, 2020),
                  (1, 7, 2020, 1, 8, 2020), (1, 8, 2020, 1, 9, 2020),
                  (1, 9, 2020, 1, 10, 2020), (1, 10, 2020, 1, 11, 2020),
                  (1, 11, 2020, 1, 12, 2020), (1, 12, 2020, 1, 1, 2021),
                  (1, 1, 2021, 1, 2, 2021), (1, 2, 2021, 1, 3, 2021),
                  (1, 3, 2021, 1, 4, 2021)]

        for d1, m1, y1, d2, m2, y2 in ranges:
            posted_after = int(datetime.datetime(y1, m1, d1).timestamp())
            posted_before = int(datetime.datetime(y2, m2, d2).timestamp())

            self.api = PushshiftAPI()
            self.comBatchNo = 0
            self.outputPath = './{0}/{1}/'.format(subreddit_name, posted_after)

            Path(self.outputPath).mkdir(parents=True, exist_ok=True)

            self.getComments(subreddit_name, None, [
                'created_utc', 'score', 'selftext', 'title', 'upvote_ratio',
                'body'
            ], posted_after, posted_before, limit)

    def saveData(self, items: list):
        fileId = 0
        filePath = ''

        self.comBatchNo += 1
        fileId = self.comBatchNo
        filePath = self.outputPath

        filePath += 'file' + str(fileId)

        print("{0} - {1} - {2}".format(time.time(), len(items), filePath))

        data = ''

        for item in items:
            data += item + '\n'

        with codecs.open(filePath, 'w', encoding='utf-8-sig') as file:
            file.write(data)

        data = None

    def getComments(self,
                    subreddit_name: str,
                    query: str,
                    fields: list,
                    after: int,
                    before: int,
                    limit=1000,
                    sortOrder='desc',
                    sortType='score'):
        try:
            query = self.api.search_comments(subreddit=subreddit_name,
                                             after=after,
                                             before=before,
                                             limit=limit,
                                             filter=fields)
            submissions = list()

            for element in query:
                submissions.append(json.dumps(element.d_))
                if len(submissions) == 1000:
                    self.saveData(submissions.copy())
                    submissions = list()

            if len(submissions) > 0:
                self.saveData(submissions.copy())
                submissions = list()
        except Exception:
            print("Unexpected error:", sys.exc_info())

        print("Done!!!")
Example 20
from psaw import PushshiftAPI
import arrow
import pandas as pd

file_name_template = 'aita_posts_{}_june.csv'
cols = [
    'id', 'created_ts', 'author', 'title', 'body', 'flair', 'was_deleted',
    'was_removed'
]

api = PushshiftAPI()
data_lines = []

for day in range(5, 31):

    start_time = arrow.now()
    file_name = file_name_template.format(str(day) if day > 9 else f'0{day}')
    # file_name = 'aita_posts_small_june_sample.csv'
    print(f'Processing {file_name}...')

    df = pd.read_csv(file_name, sep='\t', header=None, names=cols)

    for index, row in df.iterrows():

        all_comments_count = 0
        aita_comments_count = 0
        all_submissions_count = 0
        aita_submissions_count = 0

        author = row['author']
        created_ts = row['created_ts']
Example 21
import praw, prawcore
from psaw import PushshiftAPI
import datetime as dt
import time

api = PushshiftAPI()

# Reddit account information; you cannot delete data for an account you do not have access to.
username = ""  # Your Reddit username
password = ""  # Your Reddit password

# These two values are needed to access Reddit’s API as a script application (see Authenticating via OAuth for other application types).
# If you don’t already have a client ID and client secret, follow Reddit’s First Steps Guide to create them.
# https://github.com/reddit-archive/reddit/wiki/OAuth2-Quick-Start-Example#first-steps
client_id = ""  # Reddit app client id
client_secret = ""  # Reddit app secret

# A user agent is a unique identifier that helps Reddit determine the source of network requests.
# https://github.com/reddit-archive/reddit/wiki/API
user_agent = ""

# Edit comment first before deleting it. Leave blank to leave comment unedited at time of deletion.
# Likely an unnecessary feature as the data would still be retained by Pushshift at time of deletion, but I included it for peace of mind anyway.
edit_value = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."
ask_before_deleting = True  # Ask before deleting every comment. False = auto delete.

start_epoch = dt.datetime(
    2005, 6, 25
)  # Year, Month, Day. Only comments / submissions created after the given datetime will be fetched and deleted.
end_epoch = dt.datetime(
    2020, 7, 24
Example 22
    def __init__(self, client_id, client_secret, user_agent):

        self.reddit = praw.Reddit(client_id=client_id,
                                  client_secret=client_secret,
                                  user_agent=user_agent)
        self.api = PushshiftAPI(self.reddit)
Example 23
#!/usr/bin/env python
# coding: utf-8

from psaw import PushshiftAPI
import json

api = PushshiftAPI()

# Load list of users.
# The list of users associated with this campaign is based one the official release from Reddit at:
# https://www.reddit.com/r/redditsecurity/comments/e74nml/suspected_campaign_from_russia_on_reddit/
authors = []

with open("./data/userlist.csv") as f:
    authors = f.read().splitlines()

# Perform searches using Pushshift API
contributions_by_author = []

for author in authors:
    entry = {}
    entry["author"] = author

    submissions = api.search_submissions(limit=99999, author=author)
    entry["submissions"] = list(submissions)

    comments = api.search_comments(limit=99999, author=author)
    entry["comments"] = list(comments)

    contributions_by_author.append(entry)
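
A possible final step, not shown in the original: persist the collected contributions with the json module imported above. Each psaw result carries its raw Pushshift payload in the .d_ attribute, which is plain JSON-serializable data, whereas the result objects themselves are not.

with open("./data/contributions_by_author.json", "w") as out:
    json.dump(
        [
            {
                "author": entry["author"],
                "submissions": [s.d_ for s in entry["submissions"]],
                "comments": [c.d_ for c in entry["comments"]],
            }
            for entry in contributions_by_author
        ],
        out,
    )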
Example 24
import praw
from psaw import PushshiftAPI
import pandas as pd
import datetime as dt
import re
from pprint import pprint as print

reddit = praw.Reddit(client_id='nsitoMzM8H19pA',
                     client_secret='sx4jlxSsmL6n4NOUt080VZ1dvas',
                     user_agent='Watch Exchange Web Scrapper')
api = PushshiftAPI(reddit)

start_epoch = int(dt.datetime(2017, 1, 1).timestamp())

results = list(api.search_submissions(subreddit='Watchexchange', limit=1))
wts = re.compile('\\[WTS\\]')
price = re.compile(r'[$][\d]+')

testpost = reddit.submission(id=results[0].id)
print(testpost)
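
The two compiled patterns above are never applied in the snippet as shown; a plausible continuation (an assumption, not the original author's code) is to keep only [WTS] posts and pull quoted prices out of their titles.

for result in results:
    post = reddit.submission(id=result.id)
    if wts.search(post.title):
        # print is aliased to pprint above, so pass a single object
        print((post.title, price.findall(post.title)))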
Example 25
def get_spac(
    limit: int = 5,
) -> Tuple[List[praw.models.reddit.submission.Submission], Dict, int]:
    """Get posts containing SPAC from top subreddits [Source: reddit]

    Parameters
    ----------
    limit : int, optional
        Number of posts to get for each subreddit, by default 5

    Returns
    -------
    List[praw.models.reddit.submission.Submission] :
        List of reddit submissions
    Dict :
        Dictionary of tickers and counts
    int :
        Number of posts found.
    """
    praw_api = praw.Reddit(
        client_id=cfg.API_REDDIT_CLIENT_ID,
        client_secret=cfg.API_REDDIT_CLIENT_SECRET,
        username=cfg.API_REDDIT_USERNAME,
        user_agent=cfg.API_REDDIT_USER_AGENT,
        password=cfg.API_REDDIT_PASSWORD,
    )

    d_watchlist_tickers: Dict = {}
    l_watchlist_author = []
    subs = []
    psaw_api = PushshiftAPI()
    submissions = psaw_api.search_submissions(
        subreddit=l_sub_reddits,
        q="SPAC|Spac|spac|Spacs|spacs",
        filter=["id"],
    )
    n_flair_posts_found = 0
    for submission in submissions:

        # Get more information about post using PRAW api
        submission = praw_api.submission(id=submission.id)

        # Ensure that the post hasn't been removed  by moderator in the meanwhile,
        # that there is a description and it's not just an image, that the flair is
        # meaningful, and that we aren't re-considering same author's watchlist
        if (not submission.removed_by_category and submission.selftext
                and submission.link_flair_text not in ["Yolo", "Meme"]
                and submission.author.name not in l_watchlist_author):
            l_tickers_found = find_tickers(submission)
            subs.append(submission)
            if l_tickers_found:
                # Add another author's name to the parsed watchlists
                l_watchlist_author.append(submission.author.name)

                # Lookup stock tickers within a watchlist
                for key in l_tickers_found:
                    if key in d_watchlist_tickers:
                        # Increment stock ticker found
                        d_watchlist_tickers[key] += 1
                    else:
                        # Initialize stock ticker found
                        d_watchlist_tickers[key] = 1

                # Increment count of valid posts found
                n_flair_posts_found += 1

            # Check if number of wanted posts found has been reached
            if n_flair_posts_found > limit - 1:
                break

    return subs, d_watchlist_tickers, n_flair_posts_found
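
A minimal usage sketch, not from the original; it assumes the module-level l_sub_reddits list, the find_tickers helper, and the cfg credentials referenced above exist.

spac_subs, spac_tickers, n_found = get_spac(limit=5)
print(f"{n_found} SPAC posts found, {len(spac_tickers)} distinct tickers mentioned")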
Example 26
from pymongo import MongoClient
import sys
sys.path.append("../")
from configuration import configuration
import datetime as dt
from psaw import PushshiftAPI

client = MongoClient(configuration.DB_HOST, configuration.DB_PORT)

db = client[configuration.DB_NAME]

COLLECTION = "neet_covid_2"
api = PushshiftAPI()

start_epoch = int(dt.datetime(2020, 1, 1).timestamp())

end_epoch = int(dt.datetime(2020, 12, 31).timestamp())

gen = api.search_submissions(subreddit="NEET",
                             after=start_epoch,
                             before=end_epoch)

cache = []

for s in gen:

    to_save = s.d_
    to_save["type"] = "post"

    cache.append(to_save)
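
The snippet as shown ends before anything is written to the database; a plausible final step (an assumption on my part) is to bulk-insert the cached documents into the collection named above with pymongo's insert_many.

if cache:
    db[COLLECTION].insert_many(cache)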
Example 27
def get_popular_tickers(n_top: int,
                        posts_to_look_at: int,
                        subreddits: str = "") -> pd.DataFrame:
    """Get popular tickers from list of subreddits [Source: reddit]

    Parameters
    ----------
    n_top : int
        Number of top tickers to get
    posts_to_look_at : int
        How many posts to analyze in each subreddit
    subreddits : str, optional
        String of comma separated subreddits.

    Returns
    -------
    pd.DataFrame
        DataFrame of top tickers from supplied subreddits
    """
    if subreddits:
        sub_reddit_list = subreddits.split(",") if "," in subreddits else [
            subreddits
        ]
    else:
        sub_reddit_list = l_sub_reddits
    d_watchlist_tickers: Dict = {}
    l_watchlist_author = []

    praw_api = praw.Reddit(
        client_id=cfg.API_REDDIT_CLIENT_ID,
        client_secret=cfg.API_REDDIT_CLIENT_SECRET,
        username=cfg.API_REDDIT_USERNAME,
        user_agent=cfg.API_REDDIT_USER_AGENT,
        password=cfg.API_REDDIT_PASSWORD,
    )

    psaw_api = PushshiftAPI()
    for s_sub_reddit in sub_reddit_list:
        print(
            f"Search for latest tickers for {posts_to_look_at} '{s_sub_reddit}' posts"
        )
        submissions = psaw_api.search_submissions(
            subreddit=s_sub_reddit,
            limit=posts_to_look_at,
            filter=["id"],
        )

        n_tickers = 0
        for submission in submissions:
            try:
                # Get more information about post using PRAW api
                submission = praw_api.submission(id=submission.id)

                # Ensure that the post hasn't been removed by moderator in the meanwhile,
                # that there is a description and it's not just an image, that the flair is
                # meaningful, and that we aren't re-considering same author's content
                if (not submission.removed_by_category
                        and (submission.selftext or submission.title)
                        and submission.author.name not in l_watchlist_author):
                    l_tickers_found = find_tickers(submission)

                    if l_tickers_found:
                        n_tickers += len(l_tickers_found)

                        # Add another author's name to the parsed watchlists
                        l_watchlist_author.append(submission.author.name)

                        # Lookup stock tickers within a watchlist
                        for key in l_tickers_found:
                            if key in d_watchlist_tickers:
                                # Increment stock ticker found
                                d_watchlist_tickers[key] += 1
                            else:
                                # Initialize stock ticker found
                                d_watchlist_tickers[key] = 1

            except ResponseException:
                print(
                    "Received a response from Reddit with an authorization error. check your token.\n"
                )
                return pd.DataFrame()

        print(f"  {n_tickers} potential tickers found.")
    lt_watchlist_sorted = sorted(d_watchlist_tickers.items(),
                                 key=lambda item: item[1],
                                 reverse=True)

    popular_tickers_df = pd.DataFrame()
    if lt_watchlist_sorted:
        n_top_stocks = 0
        # pylint: disable=redefined-outer-name
        popular_tickers = []
        for t_ticker in lt_watchlist_sorted:
            if n_top_stocks > n_top:
                break
            try:
                # If try doesn't trigger exception, it means that this stock exists on finviz
                # thus we can print it.
                stock_info = finviz.get_stock(t_ticker[0])
                popular_tickers.append((
                    t_ticker[1],
                    t_ticker[0],
                    stock_info["Company"],
                    stock_info["Sector"],
                    stock_info["Price"],
                    stock_info["Change"],
                    stock_info["Perf Month"],
                    f"https://finviz.com/quote.ashx?t={t_ticker[0]}",
                ))
                n_top_stocks += 1
            except HTTPError as e:
                if e.response.status_code != 404:
                    print(f"Unexpected exception from Finviz: {e}")
            except Exception as e:
                print(e, "\n")
                return

        popular_tickers_df = pd.DataFrame(
            popular_tickers,
            columns=[
                "Mentions",
                "Ticker",
                "Company",
                "Sector",
                "Price",
                "Change",
                "Perf Month",
                "URL",
            ],
        )
    return popular_tickers_df
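
A short usage sketch, not in the original; it assumes the cfg credentials, finviz, and the module-level find_tickers helper used above are available, and the argument values are illustrative.

df_top = get_popular_tickers(n_top=10, posts_to_look_at=50, subreddits="wallstreetbets,stocks")
print(df_top.to_string(index=False))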
Example 28
from psaw import PushshiftAPI
import pandas as pd
import datetime as dt
import time

api = PushshiftAPI()
start_epoch = int(time.time())

listOfPosts = []

while (len(listOfPosts) < 100000):
    listOfPosts.extend(
        list(
            api.search_submissions(before=start_epoch,
                                   subreddit='toastme',
                                   filter=[
                                       'id', 'permalink', 'url', 'author',
                                       'title', 'subreddit', 'score',
                                       'num_comments'
                                   ],
                                   limit=500)))
    start_epoch = listOfPosts[-1].created_utc
    print(listOfPosts[-1])

df = pd.DataFrame(columns=[
    'id', 'title', 'url', 'author', 'score', 'num_comments', 'comments_url'
])
for post in listOfPosts:
    if post.num_comments >= 10:
        df = df.append(
            {
Example 29
def watchlist(l_args):
    parser = argparse.ArgumentParser(
        prog='watchlist',
        description="""Print other users watchlist. [Source: Reddit]""")
    parser.add_argument('-l',
                        "--limit",
                        action="store",
                        dest="n_limit",
                        type=check_positive,
                        default=5,
                        help='limit of posts with watchlists retrieved.')

    try:
        (ns_parser, l_unknown_args) = parser.parse_known_args(l_args)

        if l_unknown_args:
            print(
                f"The following args couldn't be interpreted: {l_unknown_args}\n"
            )
            return

        l_sub_reddits = [
            'pennystocks', 'RobinHoodPennyStocks', 'Daytrading', 'StockMarket',
            'stocks', 'investing', 'wallstreetbets'
        ]

        d_submission = {}
        d_watchlist_tickers = {}
        l_watchlist_links = list()
        l_watchlist_author = list()
        ls_text = list()

        praw_api = praw.Reddit(client_id=cfg.API_REDDIT_CLIENT_ID,
                               client_secret=cfg.API_REDDIT_CLIENT_SECRET,
                               username=cfg.API_REDDIT_USERNAME,
                               user_agent=cfg.API_REDDIT_USER_AGENT,
                               password=cfg.API_REDDIT_PASSWORD)

        dt_last_time_market_close = get_last_time_market_was_open(
            datetime.now() - timedelta(hours=24))
        n_ts_after = int(dt_last_time_market_close.timestamp())
        psaw_api = PushshiftAPI()
        submissions = psaw_api.search_submissions(
            after=n_ts_after,
            subreddit=l_sub_reddits,
            q='WATCHLIST|Watchlist|watchlist',
            filter=['id'])

        n_flair_posts_found = 0
        while True:
            submission = next(submissions, None)
            if submission:
                # Get more information about post using PRAW api
                submission = praw_api.submission(id=submission.id)

                # Ensure that the post hasn't been removed by moderator in the meanwhile,
                # that there is a description and it's not just an image, that the flair is
                # meaningful, and that we aren't re-considering same author's watchlist
                if not submission.removed_by_category and submission.selftext \
                    and submission.link_flair_text not in ['Yolo', 'Meme'] \
                    and submission.author.name not in l_watchlist_author:
                    ls_text = list()
                    ls_text.append(submission.selftext)
                    ls_text.append(submission.title)

                    submission.comments.replace_more(limit=0)
                    for comment in submission.comments.list():
                        ls_text.append(comment.body)

                    l_tickers_found = list()
                    for s_text in ls_text:
                        for s_ticker in set(
                                re.findall(r'([A-Z]{3,5} )', s_text)):
                            l_tickers_found.append(s_ticker.strip())

                    if l_tickers_found:
                        # Add another author's name to the parsed watchlists
                        l_watchlist_author.append(submission.author.name)

                        # Lookup stock tickers within a watchlist
                        for key in l_tickers_found:
                            if key in d_watchlist_tickers:
                                # Increment stock ticker found
                                d_watchlist_tickers[key] += 1
                            else:
                                # Initialize stock ticker found
                                d_watchlist_tickers[key] = 1

                        l_watchlist_links.append(
                            f"https://www.reddit.com{submission.permalink}")
                        # delete below, not necessary I reckon. Probably just link?

                        # Refactor data
                        s_datetime = datetime.utcfromtimestamp(
                            submission.created_utc).strftime(
                                "%d/%m/%Y %H:%M:%S")
                        s_link = f"https://www.reddit.com{submission.permalink}"
                        s_all_awards = ""
                        for award in submission.all_awardings:
                            s_all_awards += f"{award['count']} {award['name']}\n"
                        s_all_awards = s_all_awards[:-2]

                        # Create dictionary with data to construct dataframe allows to save data
                        d_submission[submission.id] = {
                            'created_utc': s_datetime,
                            'subreddit': submission.subreddit,
                            'link_flair_text': submission.link_flair_text,
                            'title': submission.title,
                            'score': submission.score,
                            'link': s_link,
                            'num_comments': submission.num_comments,
                            'upvote_ratio': submission.upvote_ratio,
                            'awards': s_all_awards
                        }

                        # Print post data collected so far
                        print(f"\n{s_datetime} - {submission.title}")
                        print(f"{s_link}")
                        t_post = PrettyTable([
                            'Subreddit', 'Flair', 'Score', '# Comments',
                            'Upvote %', "Awards"
                        ])
                        t_post.add_row([
                            submission.subreddit, submission.link_flair_text,
                            submission.score, submission.num_comments,
                            f"{round(100*submission.upvote_ratio)}%",
                            s_all_awards
                        ])
                        print(t_post)
                        print("")

                        # Increment count of valid posts found
                        n_flair_posts_found += 1

                # Check if number of wanted posts found has been reached
                if n_flair_posts_found > ns_parser.n_limit - 1:
                    break

            # Check if search_submissions didn't get anymore posts
            else:
                break

        if n_flair_posts_found:
            lt_watchlist_sorted = sorted(d_watchlist_tickers.items(),
                                         key=lambda item: item[1],
                                         reverse=True)
            s_watchlist_tickers = ""
            n_tickers = 0
            for t_ticker in lt_watchlist_sorted:
                try:
                    # If this doesn't raise, the ticker exists on finviz,
                    # so we can print it.
                    finviz.get_stock(t_ticker[0])
                    if int(t_ticker[1]) > 1:
                        s_watchlist_tickers += f"{t_ticker[1]} {t_ticker[0]}, "
                    n_tickers += 1
                except Exception:
                    pass
            if n_tickers:
                print(
                    "The following stock tickers have been mentioned more than once across the previous watchlists:"
                )
                print(s_watchlist_tickers[:-2] + '\n')
        print("")

    except Exception as e:
        print(e, "\n")
Example 30
def subreddit_data_old():
    cache_file = config['paths']['cache_path']
    data = request.get_json()
    # return current subreddit name
    if len(data) == 1:
        if 'subredditIndex' in data:
            try:
                subreddit = reddit.subreddit(subreddit_names[data['subredditIndex']])
                widgets = subreddit.widgets
                # id_card is for reddit redesign, not old reddit
                id_card = widgets.id_card
                if data['subredditIndex'] < len(subreddit_names):
                    return jsonify(
                        {'subreddit_name': subreddit.display_name, 'subreddit_subscribers': subreddit.subscribers,
                         'subreddit_subscriber_text': id_card.subscribersText})
            except exceptions.Forbidden:
                # subreddit is private so skip it and return a placeholder
                return jsonify(
                    {'subreddit_name': subreddit_names[data['subredditIndex']], 'subreddit_subscribers': 0,
                     'subreddit_subscriber_text': 'subscribers'})
        elif 'clickedId' in data:
            # clicked posts are marked as such and will show changes in information
            try:
                with lock:
                    with open(cache_file, 'r') as f:
                        post_cache = json.load(f)
            except FileNotFoundError:
                return jsonify(), 404

            clicked_id = data['clickedId']
            print('Clicked {}'.format(clicked_id))
            clicked_post = post_cache[clicked_id]
            clicked_post['visited'] = True  # marks the clicked_post as clicked on
            clicked_post['visit_time'] = time.time()  # time on click
            clicked_post['visit_comment_count'] = clicked_post['comment_count']  # number of comments on click

            # save cache
            with lock:
                post_cache = None
                try:
                    with open(cache_file, 'r') as f:
                        post_cache = json.load(f)
                        post_cache = remove_outdated(post_cache)
                except FileNotFoundError:
                    pass
                if post_cache is not None:
                    post_cache[clicked_id].update(clicked_post)
                with open(cache_file, 'w') as f:
                    json.dump(post_cache, f)
            return jsonify(), 200
        elif 'viewedId' in data:
            if 'viewed_post_ids' in session:
                viewed_post_ids = session['viewed_post_ids']
            else:
                viewed_post_ids = {}

            viewed_id = data['viewedId']
            if viewed_id not in viewed_post_ids:
                viewed_post_ids[viewed_id] = None

                try:
                    with lock:
                        with open(cache_file, 'r') as f:
                            post_cache = json.load(f)
                except FileNotFoundError:
                    return jsonify(), 404

                print('Viewed {}'.format(viewed_id))
                viewed_post = post_cache[viewed_id]

                if not config['modes'].getboolean('debug_mode'):
                    viewed_post['display_count'] += 1

                # save cache
                with lock:
                    post_cache = None
                    try:
                        with open(cache_file, 'r') as f:
                            post_cache = json.load(f)
                            post_cache = remove_outdated(post_cache)
                    except FileNotFoundError:
                        pass
                    if post_cache is not None:
                        post_cache[viewed_id].update(viewed_post)
                    with open(cache_file, 'w') as f:
                        json.dump(post_cache, f)
            session['viewed_post_ids'] = viewed_post_ids
            return jsonify(), 200

        return jsonify(), 404
    # return post data
    cur_sub_num = data['subredditIndex']
    cur_post_num = data['postIndex']
    post_amount = data['postAmount']
    sort_type = data['sortType']

    if cur_sub_num >= len(subreddit_names):
        return {}

    subreddit_name = subreddit_names[cur_sub_num]
    subreddit = reddit.subreddit(subreddit_name)
    # deal with quarantined subreddits
    try:
        subreddit.quaran.opt_in()
    except exceptions.Forbidden:
        pass

    if not config['modes'].getboolean('slow_mode'):
        submissions = subreddit.top(sort_type, limit=(cur_post_num + post_amount))
        for _ in range(cur_post_num):
            try:
                next(submissions)
            except StopIteration:
                break
    else:
        # posts = get_posts(submissions, SUBMISSION_SCORE_DEGRADATION)
        # print('slow mode')
        # slow mode only shows posts older than 24 hours
        # posts which have been visited should no longer be shown because they have settled down by now
        # load cache, filter posts earlier than 24 hours
        # start_time = time.time()
        # cached_posts = get_cached_posts(subreddit.display_name, min_hours=24, max_hours=48 + 8)
        # end_time = time.time()
        # if len(cached_posts) > 0:
        #     print('using cached posts', len(cached_posts))
        #     posts = cached_posts
        # else:
        #     print('no cached posts for r/{} meet requirements'.format(subreddit.display_name))
        # print('getting cached posts took {} seconds'.format(end_time - start_time))

        # for some reason sort_type will cause duplicates to be returned
        # the submissions seemingly start duplicating / looping back
        # using a small limit such as 10 will avoid duplication
        # unknown whether using limit itself avoids duplication
        # sort_type = score is inaccurate, it is more accurate to fetch all submissions and sort them

        # PSAW API
        # this was previously initialized at the start in global scope
        # causing Pushshift API requests to get mixed up and return results from multiple API requests at once
        # initializing a new api object for each request seems to solve this problem
        api = PushshiftAPI(reddit)

        submissions = list(api.search_submissions(subreddit=subreddit_name, after='56h', before='24h'))
        id_set = set()
        for submission in submissions:
            if submission.id in id_set:
                print('{}: "{}" is duplicated'.format(submission.id, submission.title))
            else:
                id_set.add(submission.id)
            if submission.subreddit.display_name.lower() != subreddit_name.lower():
                print('submission {} of {} != subreddit {}'.format(submission.id, submission.subreddit.display_name,
                                                                   subreddit_name))
        submissions.sort(key=lambda item: item.score, reverse=True)
        submissions = submissions[:10]
        print(id_set)
        print(subreddit.display_name, submissions)
    posts = get_posts(submissions)
    print(
        f'sub #{cur_sub_num}: {subreddit.display_name}, post {cur_post_num}, {post_amount} posts, offset {cur_post_num + post_amount}, {posts}')
    return jsonify(posts)