Example #1
    def test_load_crawled(self):

        with open("crawled.txt", "r") as f:
            saved_len = len(f.read())

        bot = RedditBot("crawled.txt")

        self.assertTrue(bot.has_crawled("384390"))
        self.assertFalse(bot.has_crawled("123456"))

        with open("crawled.txt", "r") as f:
            self.assertEqual(saved_len, len(f.read()))
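
The two tests in this example only exercise has_crawled and log_crawl against a plain text file: loading must not modify the file, and ids logged by one instance must be visible to a freshly constructed one. A minimal sketch that satisfies that contract is shown below; it is an assumption based on the test usage, not the project's actual reddit_bot.RedditBot, and the optional reddit argument only mirrors how later examples construct the bot.

class RedditBot:
    # Hypothetical sketch: keeps crawled submission ids in memory and
    # appends new ones to the log file, one id per line.
    def __init__(self, log_file, reddit=None):
        self.log_file = log_file
        self.reddit = reddit
        try:
            with open(log_file, "r") as f:
                self.crawled = set(line.strip() for line in f if line.strip())
        except FileNotFoundError:
            self.crawled = set()

    def has_crawled(self, post_id):
        return post_id in self.crawled

    def log_crawl(self, post_id):
        self.crawled.add(post_id)
        with open(self.log_file, "a") as f:
            f.write(post_id + "\n")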
Example #2
def main():
    SUBREDDIT = "all"
    SEARCH_PHRASE = 'googleSearch!'
    COMMENT_LIMIT = None
    # COMMENT_LIMIT = 500000

    prev_comment_ids = load()

    logger = Logger()
    logger.setName('google bot logger')
    logger.start()

    bot = RedditBot(SEARCH_PHRASE, logger, prev_comment_ids)
    bot.login()

    thread_num = 0
    # while True:
    # crawler = bot.crawl_comments(thread_num,SUBREDDIT,COMMENT_LIMIT)

    # try:
    # 	crawler_thread = threading.Thread(target=crawler,name="bot_thread_"+str(thread_num))
    # 	crawler_thread.start()
    # except Exception as e:
    # 	if e == APIException:
    # 		print("[EXCEPTION] rate limit hit")
    # 		sleep(100)

    # sleep(30)

    while True:
        try:
            thread_num += 1
            crawler = bot.crawl_comments(thread_num, SUBREDDIT, COMMENT_LIMIT)
            crawler_thread = threading.Thread(target=crawler,
                                              name="bot_thread_" +
                                              str(thread_num))
            crawler_thread.start()
            crawler_thread.join()
        except Exception as e:
            if isinstance(e, APIException):
                print("[EXCEPTION] rate limit hit")
                sleep(100)

    # make sure the logger has finished committing logs to file
    while not logger.is_finished():
        sleep(1)

    logger.stop_thread()
    return 0
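
Note that main() calls bot.crawl_comments(...) up front and passes its return value to threading.Thread as the target, which only works if crawl_comments returns a worker callable. If it instead performs the crawl itself, the conventional pattern is to hand the bound method and its arguments to the Thread, roughly as in this sketch (names reused from the example, the helper itself is hypothetical):

import threading

def start_crawler_thread(bot, thread_num, subreddit, comment_limit):
    # Pass the callable and its arguments to Thread instead of calling it first.
    # Assumes crawl_comments(thread_num, subreddit, comment_limit) does the work itself.
    crawler_thread = threading.Thread(target=bot.crawl_comments,
                                      args=(thread_num, subreddit, comment_limit),
                                      name="bot_thread_" + str(thread_num))
    crawler_thread.start()
    return crawler_thread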
Example #3
    def __init__(self):
        self.busy = Value("i", 0)
        self.current_website = None
        self.current_task = None

        reddit = praw.Reddit(
            'opendirectories-bot',
            user_agent=
            'github.com/simon987/od-database v1.0  (by /u/Hexahedr_n)')
        self.reddit_bot = RedditBot("crawled.txt", reddit)

        self.db = Database("db.sqlite3")
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_new_task, "interval", seconds=1)
        scheduler.start()
Example #4
    def __init__(self, task: Task):
        super().__init__(task)

        reddit = praw.Reddit(
            'opendirectories-bot',
            user_agent='github.com/simon987/od-database (by /u/Hexahedr_n)')
        self.reddit_bot = RedditBot("crawled.txt", reddit)
Example #5
    def __init__(self):
        self.busy = Value("i", 0)
        self.current_website = None
        self.current_task = None

        reddit = praw.Reddit('opendirectories-bot',
                             user_agent='github.com/simon987/od-database v1.0  (by /u/Hexahedr_n)')
        self.reddit_bot = RedditBot("crawled.txt", reddit)

        self.db = Database("db.sqlite3")
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_new_task, "interval", seconds=1)
        scheduler.start()
Example #6
    def test_save_crawled(self):

        if os.path.isfile("crawled_empty.txt"):
            os.remove("crawled_empty.txt")

        open("crawled_empty.txt", "w").close()

        tmp_bot = RedditBot("crawled_empty.txt")
        tmp_bot.log_crawl("000000")

        bot = RedditBot("crawled_empty.txt")

        self.assertTrue(bot.has_crawled("000000"))
Example #7
import praw
from reddit_bot import RedditBot
from database import Database, Website
import od_util
import os
import re

pattern = re.compile(r"[\[\]\\()]+")
reddit = praw.Reddit(
    'opendirectories-bot',
    user_agent='github.com/simon987/od-database v1.0  (by /u/Hexahedr_n)')
db = Database("db.sqlite3")
subreddit = reddit.subreddit("opendirectories")
# subreddit = reddit.subreddit("test")
bot = RedditBot("crawled.txt", reddit)

submissions = []


def handle_exact_repost(website_id, reddit_obj):
    website = db.get_website_by_id(website_id)
    stats = db.get_website_stats(website_id)
    comment = bot.get_comment({"": stats}, website_id,
                              "I already scanned this website on " +
                              website.last_modified + " UTC")
    print(comment)
    print("Exact repost!")
    bot.reply(reddit_obj, comment)


def handle_subdir_repost(website_id, reddit_obj):
Example #8
class TaskManager:
    def __init__(self):
        self.busy = Value("i", 0)
        self.current_website = None
        self.current_task = None

        reddit = praw.Reddit(
            'opendirectories-bot',
            user_agent=
            'github.com/simon987/od-database v1.0  (by /u/Hexahedr_n)')
        self.reddit_bot = RedditBot("crawled.txt", reddit)

        self.db = Database("db.sqlite3")
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_new_task, "interval", seconds=1)
        scheduler.start()

    def check_new_task(self):
        if self.current_task is None:
            task = self.db.dequeue()

            if task:
                website_id, post_id, comment_id = task
                website = self.db.get_website_by_id(website_id)
                self.current_task = Process(target=self.execute_task,
                                            args=(website, self.busy, post_id,
                                                  comment_id))
                self.current_website = website
                self.current_task.start()

        elif self.busy.value == 0:
            self.current_task.terminate()
            self.current_task = None
            self.current_website = None

    def execute_task(self, website: Website, busy: Value, post_id: str,
                     comment_id: str):
        busy.value = 1
        if os.path.exists("data.json"):
            os.remove("data.json")
        print("Started crawling task")
        process = CrawlerProcess(get_project_settings())
        process.crawl("od_links", base_url=website.url)
        process.start()
        print("Done crawling")

        self.db.import_json("data.json", website)
        os.remove("data.json")
        print("Imported in SQLite3")

        if post_id:
            # Reply to post
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment({"": stats}, website.id)
            print(comment)
            if "total_size" in stats and stats["total_size"] > 10000000:
                post = self.reddit_bot.reddit.submission(post_id)
                self.reddit_bot.reply(post, comment)
            else:
                self.reddit_bot.log_crawl(post_id)

        elif comment_id:
            # Reply to comment
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment({"There you go!": stats},
                                                  website.id)
            print(comment)
            reddit_comment = self.reddit_bot.reddit.comment(comment_id)
            self.reddit_bot.reply(reddit_comment, comment)
        busy.value = 0
        print("Done crawling task")
Example #9
from multiprocessing import Process
import praw
from reddit_bot import RedditBot

reddit = praw.Reddit(
    'opendirectories-bot',
    user_agent=
    'github.com/simon987/opendirectories-bot v1.0  (by /u/Hexahedr_n)')

subreddit = reddit.subreddit("opendirectories")

subs = []

for submission in subreddit.new(limit=3):
    subs.append(submission)

bot = RedditBot("crawled.txt")
tq = TaskQueue()

for s in subs:

    if not s.is_self:
        if not bot.has_crawled(s.id) and not tq.is_queued(s.id):
            tq.push(CrawTask(s))

            print("id: " + s.id)
            print("url: " + str(s.url))
            print("title: " + str(s.title))


def execute_task(submission):
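
This example relies on a TaskQueue with is_queued and push, and a CrawTask built from a submission, neither of which is defined in the snippet. Going only by how they are called above, a minimal in-memory stand-in could look like the following; the real project presumably persists its queue, so treat this as an illustrative assumption:

class CrawTask:
    # Hypothetical wrapper: remembers the submission it was built from
    def __init__(self, submission):
        self.submission = submission


class TaskQueue:
    # Hypothetical in-memory queue keyed by submission id
    def __init__(self):
        self.tasks = []

    def is_queued(self, submission_id):
        return any(t.submission.id == submission_id for t in self.tasks)

    def push(self, task):
        self.tasks.append(task)

    def pop(self):
        return self.tasks.pop(0) if self.tasks else None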
Example #10
class TaskManager:

    def __init__(self):
        self.busy = Value("i", 0)
        self.current_website = None
        self.current_task = None

        reddit = praw.Reddit('opendirectories-bot',
                             user_agent='github.com/simon987/od-database v1.0  (by /u/Hexahedr_n)')
        self.reddit_bot = RedditBot("crawled.txt", reddit)

        self.db = Database("db.sqlite3")
        scheduler = BackgroundScheduler()
        scheduler.add_job(self.check_new_task, "interval", seconds=1)
        scheduler.start()

    def check_new_task(self):
        if self.current_task is None:
            task = self.db.dequeue()

            if task:
                website_id, post_id, comment_id = task
                website = self.db.get_website_by_id(website_id)
                self.current_task = Process(target=self.execute_task,
                                            args=(website, self.busy, post_id, comment_id))
                self.current_website = website
                self.current_task.start()

        elif self.busy.value == 0:
            self.current_task.terminate()
            self.current_task = None
            self.current_website = None

    def execute_task(self, website: Website, busy: Value, post_id: str, comment_id: str):
        busy.value = 1
        if os.path.exists("data.json"):
            os.remove("data.json")
        print("Started crawling task")
        process = CrawlerProcess(get_project_settings())
        process.crawl("od_links", base_url=website.url)
        process.start()
        print("Done crawling")

        self.db.import_json("data.json", website)
        os.remove("data.json")
        print("Imported in SQLite3")

        if post_id:
            # Reply to post
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment({"": stats}, website.id)
            print(comment)
            if "total_size" in stats and stats["total_size"] > 10000000:
                post = self.reddit_bot.reddit.submission(post_id)
                self.reddit_bot.reply(post, comment)
            else:
                self.reddit_bot.log_crawl(post_id)

        elif comment_id:
            # Reply to comment
            stats = self.db.get_website_stats(website.id)
            comment = self.reddit_bot.get_comment({"There you go!": stats}, website.id)
            print(comment)
            reddit_comment = self.reddit_bot.reddit.comment(comment_id)
            self.reddit_bot.reply(reddit_comment, comment)
        busy.value = 0
        print("Done crawling task")
Example #11
import praw
from reddit_bot import RedditBot
from database import Database, Website
import od_util
import os
import re

pattern = re.compile(r"[\[\]\\()]+")
reddit = praw.Reddit('opendirectories-bot',
                     user_agent='github.com/simon987/od-database v1.0  (by /u/Hexahedr_n)')
db = Database("db.sqlite3")
subreddit = reddit.subreddit("opendirectories")
# subreddit = reddit.subreddit("test")
bot = RedditBot("crawled.txt", reddit)

submissions = []


def handle_exact_repost(website_id, reddit_obj):
    website = db.get_website_by_id(website_id)
    stats = db.get_website_stats(website_id)
    comment = bot.get_comment({"": stats}, website_id,
                              "I already scanned this website on " + website.last_modified + " UTC")
    print(comment)
    print("Exact repost!")
    bot.reply(reddit_obj, comment)


def handle_subdir_repost(website_id, reddit_obj):

    website = db.get_website_by_id(website_id)
    message = "I already scanned a parent directory of this website on " + website.last_modified + " UTC"