def test_load_crawled(self):
    with open("crawled.txt", "r") as f:
        saved_len = len(f.read())

    bot = RedditBot("crawled.txt")
    self.assertTrue(bot.has_crawled("384390"))
    self.assertFalse(bot.has_crawled("123456"))

    with open("crawled.txt", "r") as f:
        self.assertEqual(saved_len, len(f.read()))
def main():
    SUBREDDIT = "all"
    SEARCH_PHRASE = 'googleSearch!'
    COMMENT_LIMIT = None
    # COMMENT_LIMIT = 500000

    prev_comment_ids = load()

    logger = Logger()
    logger.setName('google bot logger')
    logger.start()

    bot = RedditBot(SEARCH_PHRASE, logger, prev_comment_ids)
    bot.login()

    thread_num = 0
    # while True:
    #     crawler = bot.crawl_comments(thread_num, SUBREDDIT, COMMENT_LIMIT)
    #     try:
    #         crawler_thread = threading.Thread(target=crawler, name="bot_thread_" + str(thread_num))
    #         crawler_thread.start()
    #     except Exception as e:
    #         if e == APIException:
    #             print("[EXCEPTION] rate limit hit")
    #             sleep(100)
    #     sleep(30)

    while True:
        try:
            thread_num += 1
            crawler = bot.crawl_comments(thread_num, SUBREDDIT, COMMENT_LIMIT)
            crawler_thread = threading.Thread(target=crawler, name="bot_thread_" + str(thread_num))
            crawler_thread.start()
            crawler_thread.join()
        except Exception as e:
            # comparing the instance to the class never matches; check the type instead
            if isinstance(e, APIException):
                print("[EXCEPTION] rate limit hit")
                sleep(100)

    # make sure the logger has finished committing logs to file
    while not logger.is_finished():
        sleep(1)
    logger.stop_thread()

    return 0
def __init__(self):
    self.busy = Value("i", 0)
    self.current_website = None
    self.current_task = None

    reddit = praw.Reddit('opendirectories-bot',
                         user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
    self.reddit_bot = RedditBot("crawled.txt", reddit)

    self.db = Database("db.sqlite3")

    scheduler = BackgroundScheduler()
    scheduler.add_job(self.check_new_task, "interval", seconds=1)
    scheduler.start()
def __init__(self, task: Task):
    super().__init__(task)

    reddit = praw.Reddit('opendirectories-bot',
                         user_agent='github.com/simon987/od-database (by /u/Hexahedr_n)')
    self.reddit_bot = RedditBot("crawled.txt", reddit)
def test_save_crawled(self):
    if os.path.isfile("crawled_empty.txt"):
        os.remove("crawled_empty.txt")
    open("crawled_empty.txt", "w").close()

    tmp_bot = RedditBot("crawled_empty.txt")
    tmp_bot.log_crawl("000000")

    # a fresh instance reading the same file should see the logged id
    bot = RedditBot("crawled_empty.txt")
    self.assertTrue(bot.has_crawled("000000"))
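The two tests above pin down a small contract for the crawl log: ids are loaded from the file when the bot is constructed, `has_crawled` checks membership without rewriting the file, and `log_crawl` appends so that a new instance sees the id. A minimal sketch of persistence logic consistent with those tests, assuming a one-id-per-line text file (the actual `reddit_bot.py` implementation may differ):

```python
class CrawlLog:
    """Hypothetical stand-in for RedditBot's crawl-log handling (one id per line)."""

    def __init__(self, log_file):
        self.log_file = log_file
        # Load previously crawled ids without modifying the file
        with open(log_file, "r") as f:
            self.crawled = [line.strip() for line in f if line.strip()]

    def has_crawled(self, reddit_id):
        return reddit_id in self.crawled

    def log_crawl(self, reddit_id):
        # Append so that a new instance reading the same file sees the id
        self.crawled.append(reddit_id)
        with open(self.log_file, "a") as f:
            f.write(reddit_id + "\n")
```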
class TaskManager:

    def __init__(self):
        self.busy = Value("i", 0)
        self.current_website = None
        self.current_task = None

        reddit = praw.Reddit('opendirectories-bot',
                             user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
        self.reddit_bot = RedditBot("crawled.txt", reddit)

        self.db = Database("db.sqlite3")

        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_new_task, "interval", seconds=1)
        scheduler.start()

    def check_new_task(self):
        if self.current_task is None:
            task = self.db.dequeue()
            if task:
                website_id, post_id, comment_id = task
                website = self.db.get_website_by_id(website_id)
                self.current_task = Process(target=self.execute_task,
                                            args=(website, self.busy, post_id, comment_id))
                self.current_website = website
                self.current_task.start()
        elif self.busy.value == 0:
            # the crawl process cleared the busy flag; clean up and allow the next task
            self.current_task.terminate()
            self.current_task = None
            self.current_website = None

    def execute_task(self, website: Website, busy: Value, post_id: str, comment_id: str):
        busy.value = 1
        if os.path.exists("data.json"):
            os.remove("data.json")
        print("Started crawling task")
        process = CrawlerProcess(get_project_settings())
        process.crawl("od_links", base_url=website.url)
        process.start()
        print("Done crawling")

        self.db.import_json("data.json", website)
        os.remove("data.json")
        print("Imported in SQLite3")

        if post_id:
            # Reply to post
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment({"": stats}, website.id)
            print(comment)
            if "total_size" in stats and stats["total_size"] > 10000000:
                post = self.reddit_bot.reddit.submission(post_id)
                self.reddit_bot.reply(post, comment)
            else:
                self.reddit_bot.log_crawl(post_id)

        elif comment_id:
            # Reply to comment
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment({"There you go!": stats}, website.id)
            print(comment)
            reddit_comment = self.reddit_bot.reddit.comment(comment_id)
            self.reddit_bot.reply(reddit_comment, comment)

        busy.value = 0
        print("Done crawling task")
from multiprocessing import Process

import praw

from reddit_bot import RedditBot
# TaskQueue and CrawTask are assumed to come from this project's task queue module
from task import TaskQueue, CrawTask

reddit = praw.Reddit('opendirectories-bot',
                     user_agent='github.com/simon987/opendirectories-bot v1.0 (by /u/Hexahedr_n)')

subreddit = reddit.subreddit("opendirectories")

subs = []
for submission in subreddit.new(limit=3):
    subs.append(submission)

bot = RedditBot("crawled.txt")
tq = TaskQueue()

for s in subs:
    if not s.is_self:
        # only queue link posts that have not been crawled or queued already
        if not bot.has_crawled(s.id) and not tq.is_queued(s.id):
            tq.push(CrawTask(s))
            print("id: " + s.id)
            print("url: " + str(s.url))
            print("title: " + str(s.title))


def execute_task(submission):
import praw
from reddit_bot import RedditBot
from database import Database, Website
import od_util
import os
import re

pattern = re.compile(r"[\[\]\\\()]+")

reddit = praw.Reddit('opendirectories-bot',
                     user_agent='github.com/simon987/od-database v1.0 (by /u/Hexahedr_n)')
db = Database("db.sqlite3")

subreddit = reddit.subreddit("opendirectories")
# subreddit = reddit.subreddit("test")

bot = RedditBot("crawled.txt", reddit)

submissions = []


def handle_exact_repost(website_id, reddit_obj):
    website = db.get_website_by_id(website_id)  # needed for last_modified below
    stats = db.get_website_stats(website_id)
    comment = bot.get_comment({"": stats}, website_id,
                              "I already scanned this website on " + website.last_modified + " UTC")
    print(comment)
    print("Exact repost!")
    bot.reply(reddit_obj, comment)


def handle_subdir_repost(website_id, reddit_obj):
    website = db.get_website_by_id(website_id)
    message = "I already scanned a parent directory of this website on " + website.last_modified + " UTC"