Example #1
def add_to_database(subforum, link, post):
    # post is a defaultdict that defaults to ""
    # home, con, cur, and dblib come from module scope (not shown here)
    post['home'] = home
    post['subname'] = subforum
    post['thread'] = link
    if not post['plink']:  # fall back to the thread link when the post has no permalink
        post['plink'] = link
    dblib.insert_data(con, cur, post)
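A minimal usage sketch (the post keys and the module-level home, con, cur, and dblib bindings come from the snippet above; the concrete values here are hypothetical):

from collections import defaultdict

post = defaultdict(str)          # missing keys default to ""
post['message'] = "hello world"  # hypothetical post body
add_to_database("General", "/thread/42", post)
# post['plink'] was empty, so add_to_database fell back to the thread link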
Example #2
def scrape_thread(browser, home, con, cur):
    """This function handles the main loop and parses the data obtained

  INPUTS: string (path to image directory), Selenium Browser object, string (subforum page), string (home url),
  string (thread name), int (thread page), string (subforum name), string (subforum link), 
  MySQLdb Connection Object, MySQLdb Cursor object.
  RETURNS: None"""
    forum_id = dblib.get_forum_id(con, cur, home)
    logger.info("got forum id %d", forum_id)
    restart.restore_state(forum_id)  # resume from any scrape position saved by a previous run
    try:
        browser.get(home)
    except TimeoutException:
        logger.info("Timeout: %s", home)
        sys.stderr.write("TIMEOUT")
        keypress(browser)

    # type_flag presumably selects the forum-software backend at module level
    if type_flag:
        scraper = mybb
        logger.info("using mybb backend")
    else:
        scraper = vbulletin
        logger.info("using vbulletin backend")

    main_src = browser.page_source
    main_soup = bs(main_src)
    subforums = scraper.get_subforums(main_soup)

    logger.debug("got subforums: %s", str(subforums))

    subs = 0

    restart.get_cookies(forum_id, browser)

    sys.stderr.write("REFRESH")
    restart.dump_cookies(forum_id, browser)

    ##SUBFORUMS##
    for sub in subforums[::-1]:

        print "subforums %f%% DONE" % (float(subs) / len(subforums))
        print "subforum %d of %d DONE" % (subs, len(subforums))
        logger.info("subforums %f%% DONE", float(subs) / len(subforums))
        logger.info("subforum %d of %d DONE", subs, len(subforums))
        subs += 1
        subforum_id = dblib.get_sub_id(con, cur, sub["name"], forum_id)
        logger.debug("scraping subforum %s, id #%d", sub["name"], subforum_id)

        t_done = 0
        sub_page = 0
        sub_page_count = 1
        while sub_page < sub_page_count:  # iterate through subforum pages
            sys.stderr.write("REFRESH")
            sub_page += 1
            sub_link = scraper.get_page(home + sub["link"], sub_page)
            print "sub link %s DONE" % sub_link
            logger.info("sub link %s DONE", sub_link)
            try:
                browser.get(sub_link)
            except TimeoutException:
                logger.info("Timeout: %s", home)
                sys.stderr.write("TIMEOUT")
                keypress(browser)

            sub_src = browser.page_source
            sub_soup = bs(sub_src)
            threads, (sub_page, sub_page_count) = scraper.get_threads(sub_soup)
            print "got threads"
            logger.debug(
                "got threads\nsubforum page: %d\nsubforum poge count: %d\nThreads: %s",
                sub_page,
                sub_page_count,
                str(threads),
            )

            ##THREADS##
            for thread in threads:  # iterate through threads on page
                print "threads %f%% DONE" % (float(t_done) / (len(threads) * sub_page_count))
                print "thread %d of %d DONE" % (t_done, len(threads) * sub_page_count)
                logger.info("threads %f%% DONE", float(t_done) / (len(threads) * sub_page_count))
                logger.info("thread %d of %d DONE", t_done, len(threads) * sub_page_count)
                t_done += 1
                sys.stderr.write("REFRESH")
                thread_id = dblib.get_thread_id(con, cur, thread["name"], subforum_id)

                logger.debug("scraping thread %s, id %d", thread["name"], thread_id)

                tc = dblib.get_thread_count(thread["name"], cur)
                logger.debug("posts in thread: %d\ndownloaded posts from thread: %d", tc, thread["count"])
                if (thread["count"] == tc) and (tc != 0):
                    continue  # if we have all of the posts, skip this thread

                if thread_id in restart.threads:
                    logger.debug(
                        "in thread_keys: starting thread %d  scrape at %d", thread_id, restart.threads[thread_id]
                    )
                #    restart.threads[thread_id][1] += 1
                # else: restart.threads[thread_id] = (0, 1)
                else:
                    restart.threads[thread_id] = 1
                    logger.debug("not in thread keys: starting thread %d scrape at page 1", thread_id)

                # restart.threads holds the last page scraped (1-based); back up one page so
                # the loop re-fetches it, then get_posts below reports the true page count
                thread_page = restart.threads[thread_id] - 1
                thread_page_count = thread_page + 1
                print "thread %d: page %d of %d" % (thread_id, thread_page, thread_page_count)
                print "%d\% done"
                while thread_page < thread_page_count:  # iterate through thread pages
                    sys.stderr.write("REFRESH")
                    thread_page += 1
                    thread_link = scraper.get_page(home + thread["link"], thread_page)
                    logger.info(
                        "thread %d: page %d of %d\nLink: %s", thread_id, thread_page, thread_page_count, thread_link
                    )
                    try:
                        browser.get(thread_link)
                    except TimeoutException:
                        logger.info("Timeout: %s", home)
                        sys.stderr.write("TIMEOUT")
                        keypress(browser)

                    page_src = browser.page_source
                    posts, (thread_page, thread_page_count) = scraper.get_posts(page_src)
                    print "got posts"
                    for post in posts:
                        print "iterate post"
                        user = post["user"]
                        P = dblib.post(
                            home,
                            sub["name"],
                            sub["link"],
                            sub_page,
                            thread["name"],
                            post["date"],
                            post["link"],
                            post["message"],
                            user["name"],
                            user["title"],
                            user["join"],
                            user["link"],
                            user["sig"],
                            post["edit"],
                            post["message images"],
                        )
                        (post_id, user_id) = dblib.insert_data(con, cur, P)
                        print post_id
                        print user["image"]
                        print type(user["image"])
                        if user["image"]:
                            if user["image"].find("http") == -1:
                                user["image"] = P.home + user["image"]
                        imaget.get_user_image(user_id, user["image"])
                        imaget.get_post_images(P, post["message images"], cur)

                    # restart.threads[thread_id][0] = thread_page
                    restart.threads[thread_id] = thread_page
                    restart.save_state(forum_id)

    """
def scrape_thread(browser, home, con, cur):
    """This function handles the main loop and parses the data obtained

  INPUTS: string (path to image directory), Selenium Browser object, string (subforum page), string (home url),
  string (thread name), int (thread page), string (subforum name), string (subforum link), 
  MySQLdb Connection Object, MySQLdb Cursor object.
  RETURNS: None"""
    forum_id = dblib.get_forum_id(con, cur, home)
    logger.info("got forum id %d", forum_id)
    restart.restore_state(forum_id)
    try:
        browser.get(home)
    except TimeoutException:
        logger.info("Timeout: %s", home)
        sys.stderr.write("TIMEOUT")
        keypress(browser)

    if type_flag:
        scraper = mybb
        logger.info("using mybb backend")
    else:
        scraper = vbulletin
        logger.info("using vbulletin backend")

    main_src = browser.page_source
    main_soup = bs(main_src)
    subforums = scraper.get_subforums(main_soup)

    logger.debug("got subforums: %s", str(subforums))

    subs = 0


    restart.get_cookies(forum_id, browser)

    sys.stderr.write("REFRESH")
    restart.dump_cookies(forum_id, browser)

    ##SUBFORUMS##
    for sub in subforums[::-1]:
        
        print "subforums %f%% DONE" % (float(subs)/len(subforums))
        print "subforum %d of %d DONE" % (subs, len(subforums))
        logger.info("subforums %f%% DONE", float(subs)/len(subforums))
        logger.info("subforum %d of %d DONE", subs, len(subforums))
        subs += 1
        subforum_id = dblib.get_sub_id(con, cur, sub['name'], forum_id)
        logger.debug("scraping subforum %s, id #%d", sub['name'], subforum_id)
        
        t_done = 0
        sub_page = 0
        sub_page_count = 1
        while sub_page < sub_page_count: #iterate through subforum pages
            sys.stderr.write("REFRESH")
            sub_page += 1
            sub_link = scraper.get_page(home + sub['link'], sub_page)
            print "sub link %s DONE" % sub_link
            logger.info("sub link %s DONE", sub_link)
            try:
                browser.get(sub_link)
            except TimeoutException:
                logger.info("Timeout: %s", home)
                sys.stderr.write("TIMEOUT")
                keypress(browser)
    
            sub_src = browser.page_source
            sub_soup = bs(sub_src)
            threads, (sub_page, sub_page_count) = scraper.get_threads(sub_soup)
            print "got threads"
            logger.debug("got threads\nsubforum page: %d\nsubforum poge count: %d\nThreads: %s", sub_page, sub_page_count, str(threads))

            ##THREADS##
            for thread in threads: #iterate through threads on page
                print "threads %f%% DONE" % (float(t_done)/(len(threads) * sub_page_count))
                print "thread %d of %d DONE" % (t_done, len(threads) * sub_page_count)
                logger.info("threads %f%% DONE", float(t_done)/(len(threads) * sub_page_count))
                logger.info("thread %d of %d DONE", t_done, len(threads) * sub_page_count)
                t_done += 1
                sys.stderr.write("REFRESH")
                thread_id = dblib.get_thread_id(con, cur, thread['name'], subforum_id)

                logger.debug("scraping thread %s, id %d", thread['name'], thread_id)
    
                tc = dblib.get_thread_count(thread['name'], cur)
                logger.debug("posts in thread: %d\ndownloaded posts from thread: %d", tc, thread['count'])
                if (thread['count'] == tc) and (tc != 0):  continue #if we have all of the posts, skip this thread

                if thread_id in restart.threads.keys(): logger.debug("in thread_keys: starting thread %d  scrape at %d", thread_id, restart.threads[thread_id])
                #    restart.threads[thread_id][1] += 1
                #else: restart.threads[thread_id] = (0, 1)
                else: 
                    restart.threads[thread_id] = 1
                    logger.debug("not in thread keys: starting thread %d scrape at page 1", thread_id)
                
                thread_page = restart.threads[thread_id] -1
                thread_page_count = thread_page + 1
                print "thread %d: page %d of %d" % (thread_id, thread_page, thread_page_count)
                print "%d\% done"
                while thread_page < thread_page_count: #iterate through thread pages
                    sys.stderr.write("REFRESH")
                    thread_page += 1
                    thread_link = scraper.get_page(home + thread['link'], thread_page)
                    logger.info("thread %d: page %d of %d\nLink: %s", thread_id, thread_page, thread_page_count, thread_link)
                    try:
                        browser.get(thread_link)
                    except TimeoutException:
                        logger.info("Timeout: %s", home)
                        sys.stderr.write("TIMEOUT")
                        keypress(browser)

                    page_src = browser.page_source
                    posts, (thread_page, thread_page_count) = scraper.get_posts(page_src)
                    print "got posts"
                    for post in posts:
                        print "iterate post"
                        user = post['user']
                        P = dblib.post(home, sub['name'], sub['link'], sub_page, thread['name'], post['date'], post['link'], post['message'],
                                      user['name'], user['title'], user['join'], user['link'], user['sig'], post['edit'], post['message images'])
                        (post_id, user_id) = dblib.insert_data(con, cur, P)
                        print post_id
                        print user['image']
                        print type(user['image'])
                        if user['image']:
                            if (user['image'].find('http') == -1): user['image'] = P.home + user['image']
                        imaget.get_user_image(user_id, user['image'])
                        imaget.get_post_images(P, post['message images'], cur)

                    #restart.threads[thread_id][0] = thread_page
                    restart.threads[thread_id] = thread_page
                    restart.save_state(forum_id)
                  
            
    """