Example #1
    def __init__(self):
        starttime = time.clock()
        super(crawl, self).__init__()
        print "    Welcome to the crawler version " + VERSION + "\n"
        for i in USER_AGENT.split():
            print i + "\n ",
        print "\nInitializing"

        self.quit = True
        self.quit_analyze = True
        self.request_time = 0

        #alias should be the last one
        #if not, recrawl will no longer work for alias
        self.item_names = ("level",
                           "badge",
                           "game",
                           "screenshot",
                           "video",
                           "workshop",
                           "recommendation",
                           "guide",
                           "image",
                           "greenlight",
                           "item",
                           "group",
                           "friend",
                           "alias",
                           )
        self.item_search = (r"badges/",
                            r"games/\?tab=all",
                            r"screenshots/",
                            r"videos/",
                            r"myworkshopfiles/",
                            r"recommended/",
                            r"myworkshopfiles/\?section=guides",
                            r"images/",
                            r"myworkshopfiles/\?section=greenlight",
                            r"inventory/",
                            r"groups/",
                            r"friends/",
                            )
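        #note: two fewer entries than item_names; level is parsed separately
        #and alias is appended at the end of parse()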
        self.item_important = (True,  #level
                               False, #badge
                               True,  #game
                               False, #screenshot
                               False, #video
                               False, #workshop
                               False, #recommendation
                               False, #guide
                               False, #image
                               False, #greenlight
                               False, #item
                               True,  #group
                               True,  #friend
                               False, #alias
                               )
        self.item_upload = list(self.item_important)
        self.item_upload[-1] = True #alias
        self.item_upload = tuple(self.item_upload)

        if not file_exists("mem/stats"): print "\nRUNNING THE CRAWLER FOR THE FIRST TIME\n"
        # [start time, crawls, bytes, crawl age, uptime, hi alias]
        self.alltimestats = load_queue("mem/stats", [time.time(), 0.0, 0.0, time.time(), 0.0, 0.0])
        self.queue = load_queue("mem/queue", [FIRST_USER], file_to_queue)
        self.hiscores = load_queue("mem/high", [1] * len(self.item_names), int)
        self.save_times = load_queue("mem/times", [])
        self.save_amounts = load_queue("mem/bytes", [])
        self.bg_images = load_queue("mem/backgrounds", [], file_to_bgurl)
        self.uptime = self.alltimestats[4]
        if file_exists("mem/exists"):
            with open("mem/exists", "rb") as f: self.existlist = f.read()
        else: self.existlist = ""

        self.games = load_dict("mem/games", {})
        self.games_queue = []
        for i in self.bg_images:
            game = bgurl_to_game(i)
            if game not in self.games and game not in self.games_queue: self.games_queue.append(game)

        #regexes and search strings
            #public
        self.re_name = re.compile(r'"personaname":"([^"]*)') # 1
        self.re_steamid = re.compile(r'"steamid":"([^"]*)') # 1
        self.re_customurl = re.compile(r'"url":"([^"]*)') # 1
        self.se_private = "private_profile"
        self.se_noavatar = "fef49e7fa7e1997310d705b2a6158ff8dc1cdfeb_full.jpg"
        self.se_bans = "profile_ban"
            #private
        self.se_background = "has_profile_background"
        self.re_bgimage = re.compile(r"background-image: url\(( ')?([^')]*)") # 2
        self.re_friends      = re.compile(r'steamcommunity\.com/((id|profiles)/[\w-]+)') # 1
        self.re_friend_level = re.compile(r'steamcommunity\.com/((id|profiles)/[\w-]+)[\D]*([\d]*)') # 1 + 3
        self.re_level = re.compile(r'"friendPlayerLevelNum">(\d*)') # 1
            #positions
        self.se_comments = "profile_comment_area"
        self.se_leftcol = "profile_leftcol"
        self.se_rightcol = "profile_rightcol"
        self.se_topfriends = "profile_topfriends"
            #game
        self.re_game = re.compile(r'apphub_AppName[^>]*>([^<]*)') # 1

        #performance stats
        self.crawl_times_sum = 0
        self.crawl_times_amount = 0

        self.request_handler = Request_handler(self.queue)

        self.database = Database(self.item_names, self.item_important, self.item_upload)

        self.next_backup = get_next_backup_time()

        print "  " + str(len(self.queue)) + " users in queue"
        print "  " + str(len(self.bg_images)) + " backgrounds found"
        print "  " + str(len(self.games)) + " games crawled"
        print "  " + str(round(self.uptime / 86400.0, 1)) + " days of crawling time"
        print "Next backup in " + str(round((self.next_backup - time.time()) / 3600.0, 1)) + " hours"
        print "Done initializing",
        print get_time_string(starttime)
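
The __init__ above leans on a persistence helper, load_queue(path, default, converter), whose implementation is not shown. A minimal sketch consistent with the call sites — one converted value per line, with the converter defaulting to float here because the loaded stats are used arithmetically; the names and file format are assumptions:

import os

def load_queue(path, default, converter=float):
    #nothing saved yet: hand back the caller's default
    if not os.path.isfile(path):
        return default
    #one value per line, each run through the converter
    with open(path, "rb") as f:
        return [converter(line.rstrip("\n")) for line in f]
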
Example #2
    def __init__(self):
        starttime = time.clock()
        super(crawl, self).__init__()
        print "    Welcome to the crawler version " + VERSION + "\n"
        for i in USER_AGENT.split():
            print i + "\n ",
        print "\nInitializing"

        self.quit = True
        self.quit_analyze = True
        self.request_time = 0

        #alias should be the last one
        #if not, recrawl will no longer work for alias
        self.item_names = ("level",
                           "badge",
                           "game",
                           "screenshot",
                           "video",
                           "workshop",
                           "recommendation",
                           "guide",
                           "image",
                           "greenlight",
                           "item",
                           "group",
                           "friend",
                           "alias",
                           )
        self.item_search = (r"badges/",
                            r"games/\?tab=all",
                            r"screenshots/",
                            r"videos/",
                            r"myworkshopfiles/",
                            r"recommended/",
                            r"myworkshopfiles/\?section=guides",
                            r"images/",
                            r"myworkshopfiles/\?section=greenlight",
                            r"inventory/",
                            r"groups/",
                            r"friends/",
                            )
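        #note: two fewer entries than item_names; level is parsed separately
        #and alias is appended at the end of parse()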
        self.item_important = (True,  #level
                               False, #badge
                               True,  #game
                               False, #screenshot
                               False, #video
                               False, #workshop
                               False, #recommendation
                               False, #guide
                               False, #image
                               False, #greenlight
                               False, #item
                               True,  #group
                               True,  #friend
                               False, #alias
                               )
        self.item_upload = list(self.item_important)
        self.item_upload[-1] = True #alias
        self.item_upload = tuple(self.item_upload)

        if not file_exists("mem/stats"): print "\nRUNNING THE CRAWLER FOR THE FIRST TIME\n"
        # [start time, crawls, bytes, crawl age, uptime, hi alias]
        self.alltimestats = load_queue("mem/stats", [time.time(), 0.0, 0.0, time.time(), 0.0, 0.0])
        self.queue = load_queue("mem/queue", [FIRST_USER], file_to_queue)
        self.hiscores = load_queue("mem/high", [1] * len(self.item_names), int)
        self.save_times = load_queue("mem/times", [])
        self.save_amounts = load_queue("mem/bytes", [])
        self.bg_images = load_queue("mem/backgrounds", [], file_to_bgurl)
        self.uptime = self.alltimestats[4]
        if file_exists("mem/exists"):
            with open("mem/exists", "rb") as f: self.existlist = f.read()
        else: self.existlist = ""

        self.games = load_dict("mem/games", {})
        self.games_queue = []
        for i in self.bg_images:
            game = bgurl_to_game(i)
            if game not in self.games and game not in self.games_queue: self.games_queue.append(game)

        #regexes and search strings
            #public
        self.re_name = re.compile(r'"personaname":"([^"]*)') # 1
        self.re_steamid = re.compile(r'"steamid":"([^"]*)') # 1
        self.re_customurl = re.compile(r'"url":"([^"]*)') # 1
        self.se_private = "private_profile"
        self.se_noavatar = "fef49e7fa7e1997310d705b2a6158ff8dc1cdfeb_full.jpg"
        self.se_bans = "profile_ban"
            #private
        self.se_background = "has_profile_background"
        self.re_bgimage = re.compile(r"background-image: url\(( ')?([^')]*)") # 2
        self.re_friends      = re.compile(r'steamcommunity\.com/((id|profiles)/[\w-]+)') # 1
        self.re_friend_level = re.compile(r'steamcommunity\.com/((id|profiles)/[\w-]+)[\D]*([\d]*)') # 1 + 3
        self.re_level = re.compile(r'"friendPlayerLevelNum">(\d*)') # 1
            #positions
        self.se_comments = "profile_comment_area"
        self.se_leftcol = "profile_leftcol"
        self.se_rightcol = "profile_rightcol"
        self.se_topfriends = "profile_topfriends"
            #game
        self.re_game = re.compile(r'apphub_AppName[^>]*>([^<]*)') # 1

        #performance stats
        self.crawl_times_sum = 0
        self.crawl_times_amount = 0

        self.request_handler = Request_handler(self.queue)

        self.database = Database(self.item_names, self.item_important, self.item_upload)

        self.next_backup = get_next_backup_time()

        print "  " + str(len(self.queue)) + " users in queue"
        print "  " + str(len(self.bg_images)) + " backgrounds found"
        print "  " + str(len(self.games)) + " games crawled"
        print "  " + str(round(self.uptime / 86400.0, 1)) + " days of crawling time"
        print "Next backup in " + str(round((self.next_backup - time.time()) / 3600.0, 1)) + " hours"
        print "Done initializing",
        print get_time_string(starttime)
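
The main loop in run() (Examples #3 and #4 below) unpacks plain tuples from the HTTP layer. Reconstructed from the call sites — an inference, not documentation from the source:

#request_html(label, url) -> (ok, byte_count, html_text)
#  [0] truthy on success, [1] size of the download, [2] the page itself
#Request_handler.get_html() -> -1 when no page is ready, otherwise
#  (user_url, check_existence, html1, html2), where html1 is the profile
#  page tuple and html2 the alias-history page tuple
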
Example #3
class crawl(threading.Thread):

    def __init__(self):
        starttime = time.clock()
        super(crawl, self).__init__()
        print "    Welcome to the crawler version " + VERSION + "\n"
        for i in USER_AGENT.split():
            print i + "\n ",
        print "\nInitializing"

        self.quit = True
        self.quit_analyze = True
        self.request_time = 0

        #alias should be the last one
        #if not, recrawl will no longer work for alias
        self.item_names = ("level",
                           "badge",
                           "game",
                           "screenshot",
                           "video",
                           "workshop",
                           "recommendation",
                           "guide",
                           "image",
                           "greenlight",
                           "item",
                           "group",
                           "friend",
                           "alias",
                           )
        self.item_search = (r"badges/",
                            r"games/\?tab=all",
                            r"screenshots/",
                            r"videos/",
                            r"myworkshopfiles/",
                            r"recommended/",
                            r"myworkshopfiles/\?section=guides",
                            r"images/",
                            r"myworkshopfiles/\?section=greenlight",
                            r"inventory/",
                            r"groups/",
                            r"friends/",
                            )
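        #note: two fewer entries than item_names; level is parsed separately
        #and alias is appended at the end of parse()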
        self.item_important = (True,  #level
                               False, #badge
                               True,  #game
                               False, #screenshot
                               False, #video
                               False, #workshop
                               False, #recommendation
                               False, #guide
                               False, #image
                               False, #greenlight
                               False, #item
                               True,  #group
                               True,  #friend
                               False, #alias
                               )
        self.item_upload = list(self.item_important)
        self.item_upload[-1] = True #alias
        self.item_upload = tuple(self.item_upload)

        if not file_exists("mem/stats"): print "\nRUNNING THE CRAWLER FOR THE FIRST TIME\n"
        # [start time, crawls, bytes, crawl age, uptime, hi alias]
        self.alltimestats = load_queue("mem/stats", [time.time(), 0.0, 0.0, time.time(), 0.0, 0.0])
        self.queue = load_queue("mem/queue", [FIRST_USER], file_to_queue)
        self.hiscores = load_queue("mem/high", [1] * len(self.item_names), int)
        self.save_times = load_queue("mem/times", [])
        self.save_amounts = load_queue("mem/bytes", [])
        self.bg_images = load_queue("mem/backgrounds", [], file_to_bgurl)
        self.uptime = self.alltimestats[4]
        if file_exists("mem/exists"):
            with open("mem/exists", "rb") as f: self.existlist = f.read()
        else: self.existlist = ""

        self.games = load_dict("mem/games", {})
        self.games_queue = []
        for i in self.bg_images:
            game = bgurl_to_game(i)
            if game not in self.games and game not in self.games_queue: self.games_queue.append(game)

        #regexes and search strings
            #public
        self.re_name = re.compile(r'"personaname":"([^"]*)') # 1
        self.re_steamid = re.compile(r'"steamid":"([^"]*)') # 1
        self.re_customurl = re.compile(r'"url":"([^"]*)') # 1
        self.se_private = "private_profile"
        self.se_noavatar = "fef49e7fa7e1997310d705b2a6158ff8dc1cdfeb_full.jpg"
        self.se_bans = "profile_ban"
            #private
        self.se_background = "has_profile_background"
        self.re_bgimage = re.compile(r"background-image: url\(( ')?([^')]*)") # 2
        self.re_friends      = re.compile(r'steamcommunity\.com/((id|profiles)/[\w-]+)') # 1
        self.re_friend_level = re.compile(r'steamcommunity\.com/((id|profiles)/[\w-]+)[\D]*([\d]*)') # 1 + 3
        self.re_level = re.compile(r'"friendPlayerLevelNum">(\d*)') # 1
            #positions
        self.se_comments = "profile_comment_area"
        self.se_leftcol = "profile_leftcol"
        self.se_rightcol = "profile_rightcol"
        self.se_topfriends = "profile_topfriends"
            #game
        self.re_game = re.compile(r'apphub_AppName[^>]*>([^<]*)') # 1

        #performance stats
        self.crawl_times_sum = 0
        self.crawl_times_amount = 0

        self.request_handler = Request_handler(self.queue)

        self.database = Database(self.item_names, self.item_important, self.item_upload)

        self.next_backup = get_next_backup_time()

        print "  " + str(len(self.queue)) + " users in queue"
        print "  " + str(len(self.bg_images)) + " backgrounds found"
        print "  " + str(len(self.games)) + " games crawled"
        print "  " + str(round(self.uptime / 86400.0, 1)) + " days of crawling time"
        print "Next backup in " + str(round((self.next_backup - time.time()) / 3600.0, 1)) + " hours"
        print "Done initializing",
        print get_time_string(starttime)


    def update_uptime(self, current_time):
        self.alltimestats[4] = self.uptime + current_time - self.session_starttime


    #find how many of a certain item type the profile page lists
    def find_value(self, index, html, name):
        match = re.search(r'steamcommunity\.com/' + self.current_user + r'/' + self.item_search[index] + r'"([\D]*)([\d,]+)', html, re.I)
        if not match or "steamcommunity.com/" in match.group(1): return 0
        else: return int(match.group(2).replace(',', ''))


    def parse(self, html1, html2, check_existence):
        if html1.find("<title>Steam Community") == -1:
            print "    ALERT: Got wrong language website!"
            return 0

        #user name (must be found)
        name = find_item(self.re_name, html1, "user name", self.current_user)
        if check_existence:
            if len(self.existlist) >= EXIST_LIST_SIZE: self.existlist = self.existlist[1:]
            self.existlist += "0" if name is None else "1"
        if name is None: return 0

        #steam id (must be found)
        steamid = find_item(self.re_steamid, html1, "Steam id", self.current_user)
        c_user_id = user_url_to_user(self.current_user) if self.current_user[0] == 'p' else None #profiles/...
        if steamid:
            if c_user_id and steamid != c_user_id:
                print "Mismatch: " + self.current_user + " != " + steamid
        else: steamid = c_user_id
        if not steamid: return 0

        #custom url (NOTE: current_user should be updated only when really necessary!)
        #in practice it is updated whenever the current user is a steam id and a custom url is available
        if self.current_user[0] == 'p': #profiles/...
            customurl = find_item(self.re_customurl, html1, "Custom URL", self.current_user)
            if customurl and r"\/id\/" in customurl:
                self.current_user = "******" + customurl.split("\/")[-2]

        private_profile = html1.find(self.se_private) != -1
        has_avatar = html1.find(self.se_noavatar) == -1
        has_bans = html1.find(self.se_bans) != -1

        if not private_profile:
            #parts of the page
                #left and right cols
            leftcol_index = html1.find(self.se_leftcol)
            rightcol_index = html1.find(self.se_rightcol)
                #right col
            html1_right = ""
            if rightcol_index == -1:
                print "Couldn't find right collumn for " + name
            elif leftcol_index < rightcol_index:
                html1_right = html1[rightcol_index:]
            else: html1_right = html1[rightcol_index:leftcol_index]
                #left col
            html1_left = ""
            if leftcol_index == -1:
                print "Couldn't find left collumn for " + name
            elif rightcol_index < leftcol_index:
                html1_left = html1[leftcol_index:]
            else: html1_left = html1[leftcol_index:rightcol_index]
                #comments
            html1_comments = ""
            comments_index = html1_left.find(self.se_comments)
            if comments_index == -1:
                print "Couldn't find comments for " + name
            else: html1_comments = html1_left[comments_index:]
                #top friends
            html1_topfriends = ""
            topfriend_index = html1_right.find(self.se_topfriends)
            if topfriend_index == -1:
                print "Couldn't find top friends for " + name
            else: html1_topfriends = html1_right[topfriend_index:]

            #background
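            #bg_image becomes a 1-based index into self.bg_images; 0 means no usable background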
            has_background = html1.find(self.se_background) != -1
            if has_background:
                bg_image = find_item(self.re_bgimage, html1, "background", name, 2)
                if bg_image:
                    bg_image = trim_bgurl(bg_image)
                    if bg_image in self.bg_images:
                        bg_image = self.bg_images.index(bg_image) + 1
                    else:
                        self.bg_images.append(bg_image)
                        game = bgurl_to_game(bg_image)
                        if game not in self.games: self.games_queue.append(game)
                        bg_image = len(self.bg_images)
                else: bg_image = 0
            else: bg_image = 0

            #level
            level = find_item(self.re_level, html1, "level", name) #may be None
            if level: items = [int(level)]
            else: items = [0]

            #items (these are searched only from the right column of the webpage)
            for i in range(len(self.item_search)):
                items.append(self.find_value(i, html1_right, name))

            #aliases
            items.append(0)
            if html2: items[-1] = parse_aliases(html2, name)

            #check hiscores
            for i in range(len(items)):
                if items[i] > self.hiscores[i]:
                    print name + " broke hi-score '" + self.item_names[i] + "' (" + str(self.hiscores[i]) + " -> " + str(items[i]) + ")"
                    self.hiscores[i] = items[i]

            #get friends
            with self.request_handler.queue_lock:
                get_friends = len(self.queue) < MAX_QUEUE_SIZE
                    #top friends with levels
                friends = re.findall(self.re_friend_level, html1_topfriends)
                if friends:
                    for i in friends:
                        friend = i[0]
                        if friend != self.current_user:
                            try: is_high_leveled = int(i[2]) >= QUICK_CRAWL_LEVEL
                            except ValueError: is_high_leveled = False
                            if is_high_leveled: self.database.add_high_leveled(friend)
                            if not self.database.exists(friend):
                                if is_high_leveled:
                                    try: queue_index = self.queue.index(friend)
                                    except ValueError: queue_index = None
                                    if queue_index is None or queue_index > 10:
                                        if queue_index: del self.queue[queue_index]
                                        self.queue.insert(0, friend)
                                        print "Quick crawling " + friend + " (" + i[2] + ") "
                                elif get_friends and friend not in self.queue:
                                    self.queue.append(friend)
                    #comments (get these only if there is space in the queue)
                if get_friends:
                    friends = re.findall(self.re_friends, html1_comments)
                    if friends:
                        for i in friends:
                            friend = i[0]
                            if friend != self.current_user and friend not in self.queue and not self.database.exists(friend):
                                self.queue.append(friend)

        bools = [private_profile, has_avatar, has_bans]
        numbers = None
        if not private_profile:
            bools.append(has_background)
            numbers = [bg_image] + items

        self.database.save_user(self.current_user, steamid, name, bools, numbers)


    def parse_game(self, html1, game):
        # Special cases that can't be crawled
        if game in UNKNOWN_GAMES: self.games[game] = UNKNOWN_GAMES[game]
        else:
            name = find_item(self.re_game, html1, "name", "game " + game)
            if name: self.games[game] = name
            else:
                try:
                    int(game) #numeric app id: leave it unset so it can be recrawled later
                    print "Recrawling later"
                except ValueError:
                    self.games[game] = "???"
                    print "Setting as unknown"


    def run(self):
        print "Really, no need for another Steam crawler"
        self.request_handler.start()
        health_check = True
        start_time = time.clock()
        queue_time = start_time
        dump_time = start_time - DATA_DUMP_TIME / 2 #first dump faster
        dump_time2 = start_time
        self.session_starttime = start_time
        self.dump_status(1)
        while not self.quit or not self.request_handler.done():
            if self.quit: self.request_handler.stop()

            if self.games_queue:
                game = self.games_queue.pop()
                print "Crawling game: " + game
                html = 1 #sentinel: a request was made this round, so the stats below update
                html1 = request_html("game " + game, "http://steamcommunity.com/app/" + game)
                if html1[0]: self.parse_game(html1[2], game)
                html2 = (False,) #games have no alias page; the 1-tuple keeps html2_size at 0
            else:
                html = self.request_handler.get_html()
                if html != -1:
                    html1 = html[2]
                    html2 = html[3]
                    self.current_user = html[0]
                    self.current_url = "http://steamcommunity.com/" + self.current_user
                    if html1[0]: self.parse(html1[2], html2[2] if html2[0] else None, html[1])

                #stats
            current_time = time.time()
            if html != -1 and len(html1) > 1:
                self.save_times.append(current_time)
                html2_size = html2[1] if len(html2) > 1 else 0
                self.save_amounts.append(html1[1] + html2_size)
                self.alltimestats[1] += 1
                self.alltimestats[2] += html1[1] + html2_size

                #sleep
            end_time = time.clock()
            elapsed_time = end_time - start_time
            sleep_time = SLEEP_TIME - elapsed_time
            # print time until analysis
            time_until_analysis = DATA_DUMP_TIME - end_time + dump_time
            print "Analyzing in %i:%02i \r" % (time_until_analysis // 60, time_until_analysis % 60),
            # sleep now
            if sleep_time > 0:
                time.sleep(sleep_time)
                start_time = end_time + sleep_time
            else: start_time = end_time

                #performance stats
            if elapsed_time < ERROR_TIME and html != -1 and len(html1) > 1:
                self.crawl_times_sum += elapsed_time
                self.crawl_times_amount += 1

            if health_check:
                print "Crawling succesfully"
                health_check = False

                #data save/dump and backup
            if end_time - dump_time > DATA_DUMP_TIME: #sync
                self.request_handler.stop()
                if self.request_handler.done():
                    self.dump_data()
                    queue_time = time.clock()
                    dump_time = queue_time
                    dump_time2 = queue_time
                    self.request_handler.start()
            elif end_time - dump_time2 > STATUS_DUMP_TIME: #status
                self.dump_status()
                dump_time2 = end_time
            elif current_time > self.next_backup: #backup
                self.save_queue()
                self.next_backup = backup_files()
                queue_time = time.clock()
            elif end_time - queue_time > QUEUE_SAVE_TIME: #save
                self.save_queue()
                queue_time = time.clock()

        if self.quit_analyze: self.dump_data()
        else: self.dump_status()
        self.save_queue() #do this last!


    def save_queue(self):
        starttime = time.clock()
        self.update_uptime(starttime)
        save_queue(self.alltimestats, "mem/stats")
        with self.request_handler.queue_lock:
            save_queue(self.queue, "mem/queue", queue_to_file)
        save_queue(self.hiscores, "mem/high")
        save_queue(self.save_times, "mem/times")
        save_queue(self.save_amounts, "mem/bytes")
        save_queue(self.bg_images, "mem/backgrounds", bgurl_to_file)
        save_dict(self.games, "mem/games")
        with open("mem/exists", "wb") as f: f.write(self.existlist)
        self.database.save_data()
            #background count
        print "  saved (" + str(len(self.bg_images)) + " bg)",
            #http time
        print "(" + str(round(get_req_time(), 2)) + " http)",
            #database speed
        print "(" + str(round(self.crawl_times_sum / max(self.crawl_times_amount, 1), 2)) + " db)",
            #total speed
        print "(" + str(round((starttime - self.session_starttime) / max(self.crawl_times_amount, 1), 2)) + " tot)",
            #speed of this func
        print get_time_string(starttime)


    def dump_status(self, init=0):
        starttime = time.clock()
        self.update_uptime(starttime)
            #remove data older than one hour
        if len(self.save_times):
            hour_ago = time.time() - 3600
            if self.save_times[-1] < hour_ago:
                self.save_times = []
                self.save_amounts = []
            else:
                i = 0
                while self.save_times[i] < hour_ago: i += 1
                self.save_times = self.save_times[i:]
                self.save_amounts = self.save_amounts[i:]
            #json data
        keys = ("files", "bytes", "inittime", "total_crawls", "total_bytes", "crawl_age", "uptime")
        values = (len(self.save_times),
                  sum(self.save_amounts),
                  self.alltimestats[0],
                  self.alltimestats[1],
                  self.alltimestats[2],
                  time.time() - self.alltimestats[3],
                  self.alltimestats[4])
        data = get_json(keys, values)
            #rest
        self.request_time = max(int(time.time()), self.request_time + 1)
        _hash = calc_hash(self.request_time)
        print "  " + str(request_html("status dump",
                                      DATA_SAVER,
                                      get_post_values(_hash,
                                                      self.request_time,
                                                      "s",
                                                      data,
                                                      init))[-1]),
        print get_time_string(starttime)


    def dump_data(self):
        self.save_queue()
        #do this first because the data dump may take a very long time
        #  and the server may start to think that the crawler is no longer crawling
        self.dump_status()
        #do the analysis and update some values
        self.request_time, recrawl_queue, self.alltimestats[3], new_hiscores, self.alltimestats[5] =\
            self.database.synchronize(self.request_time, self.bg_images, self.games, self.alltimestats[5], self.existlist)
        #hi-score change
        hiscore_changed = False
        for i in range(len(self.hiscores)):
            if self.hiscores[i] != new_hiscores[i]:
                hiscore_changed = True
                print "hi-score '" + self.item_names[i] + "' changed: " +\
                      str(self.hiscores[i]) + " -> " + str(new_hiscores[i])
                self.hiscores[i] = new_hiscores[i]
        if hiscore_changed: print
        #recrawl
        recrawl_len = len(recrawl_queue)
        if recrawl_len:
            if recrawl_len >= len(self.queue): self.queue = recrawl_queue
            else: self.queue = recrawl_queue + self.queue[recrawl_len:]
            #also update the new queue for the request handler
            self.request_handler.queue = self.queue
        print "Length of queue: " + str(len(self.queue)) + "\n"
Example #4
class crawl(threading.Thread):

    def __init__(self):
        starttime = time.clock()
        super(crawl, self).__init__()
        print "    Welcome to the crawler version " + VERSION + "\n"
        for i in USER_AGENT.split():
            print i + "\n ",
        print "\nInitializing"

        self.quit = True
        self.quit_analyze = True
        self.request_time = 0

        #alias should be the last one
        #if not, recrawl will no longer work for alias
        self.item_names = ("level",
                           "badge",
                           "game",
                           "screenshot",
                           "video",
                           "workshop",
                           "recommendation",
                           "guide",
                           "image",
                           "greenlight",
                           "item",
                           "group",
                           "friend",
                           "alias",
                           )
        self.item_search = (r"badges/",
                            r"games/\?tab=all",
                            r"screenshots/",
                            r"videos/",
                            r"myworkshopfiles/",
                            r"recommended/",
                            r"myworkshopfiles/\?section=guides",
                            r"images/",
                            r"myworkshopfiles/\?section=greenlight",
                            r"inventory/",
                            r"groups/",
                            r"friends/",
                            )
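        #note: two fewer entries than item_names; level is parsed separately
        #and alias is appended at the end of parse()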
        self.item_important = (True,  #level
                               False, #badge
                               True,  #game
                               False, #screenshot
                               False, #video
                               False, #workshop
                               False, #recommendation
                               False, #guide
                               False, #image
                               False, #greenlight
                               False, #item
                               True,  #group
                               True,  #friend
                               False, #alias
                               )
        self.item_upload = list(self.item_important)
        self.item_upload[-1] = True #alias
        self.item_upload = tuple(self.item_upload)

        if not file_exists("mem/stats"): print "\nRUNNING THE CRAWLER FOR THE FIRST TIME\n"
        # [start time, crawls, bytes, crawl age, uptime, hi alias]
        self.alltimestats = load_queue("mem/stats", [time.time(), 0.0, 0.0, time.time(), 0.0, 0.0])
        self.queue = load_queue("mem/queue", [FIRST_USER], file_to_queue)
        self.hiscores = load_queue("mem/high", [1] * len(self.item_names), int)
        self.save_times = load_queue("mem/times", [])
        self.save_amounts = load_queue("mem/bytes", [])
        self.bg_images = load_queue("mem/backgrounds", [], file_to_bgurl)
        self.uptime = self.alltimestats[4]
        if file_exists("mem/exists"):
            with open("mem/exists", "rb") as f: self.existlist = f.read()
        else: self.existlist = ""

        self.games = load_dict("mem/games", {})
        self.games_queue = []
        for i in self.bg_images:
            game = bgurl_to_game(i)
            if game not in self.games and game not in self.games_queue: self.games_queue.append(game)

        #regexes and search strings
            #public
        self.re_name = re.compile(r'"personaname":"([^"]*)') # 1
        self.re_steamid = re.compile(r'"steamid":"([^"]*)') # 1
        self.re_customurl = re.compile(r'"url":"([^"]*)') # 1
        self.se_private = "private_profile"
        self.se_noavatar = "fef49e7fa7e1997310d705b2a6158ff8dc1cdfeb_full.jpg"
        self.se_bans = "profile_ban"
            #private
        self.se_background = "has_profile_background"
        self.re_bgimage = re.compile(r"background-image: url\(( ')?([^')]*)") # 2
        self.re_friends      = re.compile(r'steamcommunity\.com/((id|profiles)/[\w-]+)') # 1
        self.re_friend_level = re.compile(r'steamcommunity\.com/((id|profiles)/[\w-]+)[\D]*([\d]*)') # 1 + 3
        self.re_level = re.compile(r'"friendPlayerLevelNum">(\d*)') # 1
            #positions
        self.se_comments = "profile_comment_area"
        self.se_leftcol = "profile_leftcol"
        self.se_rightcol = "profile_rightcol"
        self.se_topfriends = "profile_topfriends"
            #game
        self.re_game = re.compile(r'apphub_AppName[^>]*>([^<]*)') # 1

        #performance stats
        self.crawl_times_sum = 0
        self.crawl_times_amount = 0

        self.request_handler = Request_handler(self.queue)

        self.database = Database(self.item_names, self.item_important, self.item_upload)

        self.next_backup = get_next_backup_time()

        print "  " + str(len(self.queue)) + " users in queue"
        print "  " + str(len(self.bg_images)) + " backgrounds found"
        print "  " + str(len(self.games)) + " games crawled"
        print "  " + str(round(self.uptime / 86400.0, 1)) + " days of crawling time"
        print "Next backup in " + str(round((self.next_backup - time.time()) / 3600.0, 1)) + " hours"
        print "Done initializing",
        print get_time_string(starttime)


    def update_uptime(self, current_time):
        self.alltimestats[4] = self.uptime + current_time - self.session_starttime


    #find how many of a certain item type the profile page lists
    def find_value(self, index, html, name):
        match = re.search(r'steamcommunity\.com/' + self.current_user + r'/' + self.item_search[index] + r'"([\D]*)([\d,]+)', html, re.I)
        if not match or "steamcommunity.com/" in match.group(1): return 0
        else: return int(match.group(2).replace(',', ''))


    def parse(self, html1, html2, check_existence):
        if html1.find("<title>Steam Community") == -1:
            print "    ALERT: Got wrong language website!"
            return 0

        #user name (must be found)
        name = find_item(self.re_name, html1, "user name", self.current_user)
        if check_existence:
            if len(self.existlist) >= EXIST_LIST_SIZE: self.existlist = self.existlist[1:]
            self.existlist += "0" if name is None else "1"
        if name is None: return 0

        #steam id (must be found)
        steamid = find_item(self.re_steamid, html1, "Steam id", self.current_user)
        c_user_id = user_url_to_user(self.current_user) if self.current_user[0] == 'p' else None #profiles/...
        if steamid:
            if c_user_id and steamid != c_user_id:
                print "Mismatch: " + self.current_user + " != " + steamid
        else: steamid = c_user_id
        if not steamid: return 0

        #custom url (NOTE: current_user should be updated only when really necessary!)
        #in practice it is updated whenever the current user is a steam id and a custom url is available
        if self.current_user[0] == 'p': #profiles/...
            customurl = find_item(self.re_customurl, html1, "Custom URL", self.current_user)
            if customurl and r"\/id\/" in customurl:
                self.current_user = "******" + customurl.split("\/")[-2]

        private_profile = html1.find(self.se_private) != -1
        has_avatar = html1.find(self.se_noavatar) == -1
        has_bans = html1.find(self.se_bans) != -1

        if not private_profile:
            #parts of the page
                #left and right cols
            leftcol_index = html1.find(self.se_leftcol)
            rightcol_index = html1.find(self.se_rightcol)
                #right col
            html1_right = ""
            if rightcol_index == -1:
                print "Couldn't find right collumn for " + name
            elif leftcol_index < rightcol_index:
                html1_right = html1[rightcol_index:]
            else: html1_right = html1[rightcol_index:leftcol_index]
                #left col
            html1_left = ""
            if leftcol_index == -1:
                print "Couldn't find left collumn for " + name
            elif rightcol_index < leftcol_index:
                html1_left = html1[leftcol_index:]
            else: html1_left = html1[leftcol_index:rightcol_index]
                #comments
            html1_comments = ""
            comments_index = html1_left.find(self.se_comments)
            if comments_index == -1:
                print "Couldn't find comments for " + name
            else: html1_comments = html1_left[comments_index:]
                #top friends
            html1_topfriends = ""
            topfriend_index = html1_right.find(self.se_topfriends)
            if topfriend_index == -1:
                print "Couldn't find top friends for " + name
            else: html1_topfriends = html1_right[topfriend_index:]

            #background
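            #bg_image becomes a 1-based index into self.bg_images; 0 means no usable background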
            has_background = html1.find(self.se_background) != -1
            if has_background:
                bg_image = find_item(self.re_bgimage, html1, "background", name, 2)
                if bg_image:
                    bg_image = trim_bgurl(bg_image)
                    if bg_image in self.bg_images:
                        bg_image = self.bg_images.index(bg_image) + 1
                    else:
                        self.bg_images.append(bg_image)
                        game = bgurl_to_game(bg_image)
                        if game not in self.games: self.games_queue.append(game)
                        bg_image = len(self.bg_images)
                else: bg_image = 0
            else: bg_image = 0

            #level
            level = find_item(self.re_level, html1, "level", name) #may be None
            if level: items = [int(level)]
            else: items = [0]

            #items (these are searched only from the right column of the webpage)
            for i in range(len(self.item_search)):
                items.append(self.find_value(i, html1_right, name))

            #aliases
            items.append(0)
            if html2: items[-1] = parse_aliases(html2, name)

            #check hiscores
            for i in range(len(items)):
                if items[i] > self.hiscores[i]:
                    print name + " broke hi-score '" + self.item_names[i] + "' (" + str(self.hiscores[i]) + " -> " + str(items[i]) + ")"
                    self.hiscores[i] = items[i]

            #get friends
            with self.request_handler.queue_lock:
                get_friends = len(self.queue) < MAX_QUEUE_SIZE
                    #top friends with levels
                friends = re.findall(self.re_friend_level, html1_topfriends)
                if friends:
                    for i in friends:
                        friend = i[0]
                        if friend != self.current_user:
                            try: is_high_leveled = int(i[2]) >= QUICK_CRAWL_LEVEL
                            except ValueError: is_high_leveled = False #the level capture may be empty
                            if is_high_leveled: self.database.add_high_leveled(friend)
                            if not self.database.exists(friend):
                                if is_high_leveled:
                                    try: queue_index = self.queue.index(friend)
                                    except ValueError: queue_index = None
                                    if queue_index is None or queue_index > 10:
                                        if queue_index: del self.queue[queue_index]
                                        self.queue.insert(0, friend)
                                        print "Quick crawling " + friend + " (" + i[2] + ") "
                                elif get_friends and friend not in self.queue:
                                    self.queue.append(friend)
                    #comments (get these only if there is space in the queue)
                if get_friends:
                    friends = re.findall(self.re_friends, html1_comments)
                    if friends:
                        for i in friends:
                            friend = i[0]
                            if friend != self.current_user and friend not in self.queue and not self.database.exists(friend):
                                self.queue.append(friend)

        bools = [private_profile, has_avatar, has_bans]
        numbers = None
        if not private_profile:
            bools.append(has_background)
            numbers = [bg_image] + items

        self.database.save_user(self.current_user, steamid, name, bools, numbers)


    def parse_game(self, html1, game):
        # Special cases that can't be crawled
        if game == "267420": self.games[game] = "Holiday Sale 2013"
        elif game in UNKNOWN_GAMES: self.games[game] = "Unknown"
        else:
            name = find_item(self.re_game, html1, "name", "game " + game)
            if name: self.games[game] = name
            else:
                try:
                    int(game) #numeric app id: leave it unset so it can be recrawled later
                    print "Recrawling later"
                except ValueError:
                    self.games[game] = "Unknown"
                    print "Setting as unknown"


    def run(self):
        print "Really, no need for another Steam crawler"
        self.request_handler.start()
        health_check = True
        start_time = time.clock()
        queue_time = start_time
        dump_time = start_time - DATA_DUMP_TIME / 2 #first dump faster
        dump_time2 = start_time
        self.session_starttime = start_time
        self.dump_status(1)
        while not self.quit or not self.request_handler.done():
            if self.quit: self.request_handler.stop()

            if self.games_queue:
                game = self.games_queue.pop()
                print "Crawling game: " + game
                html = 1 #sentinel: a request was made this round, so the stats below update
                html1 = request_html("game " + game, "http://steamcommunity.com/app/" + game)
                if html1[0]: self.parse_game(html1[2], game)
                html2 = (False,) #games have no alias page; the 1-tuple keeps html2_size at 0
            else:
                html = self.request_handler.get_html()
                if html != -1:
                    html1 = html[2]
                    html2 = html[3]
                    self.current_user = html[0]
                    self.current_url = "http://steamcommunity.com/" + self.current_user
                    if html1[0]: self.parse(html1[2], html2[2] if html2[0] else None, html[1])

                #stats
            current_time = time.time()
            if html != -1 and len(html1) > 1:
                self.save_times.append(current_time)
                html2_size = html2[1] if len(html2) > 1 else 0
                self.save_amounts.append(html1[1] + html2_size)
                self.alltimestats[1] += 1
                self.alltimestats[2] += html1[1] + html2_size

                #sleep
            end_time = time.clock()
            elapsed_time = end_time - start_time
            sleep_time = SLEEP_TIME - elapsed_time
            # print time until analysis
            time_until_analysis = DATA_DUMP_TIME - end_time + dump_time
            print "Analyzing in %i:%02i \r" % (time_until_analysis // 60, time_until_analysis % 60),
            # sleep now
            if sleep_time > 0:
                time.sleep(sleep_time)
                start_time = end_time + sleep_time
            else: start_time = end_time

                #performance stats
            if elapsed_time < ERROR_TIME and html != -1 and len(html1) > 1:
                self.crawl_times_sum += elapsed_time
                self.crawl_times_amount += 1

            if health_check:
                print "Crawling succesfully"
                health_check = False

                #data save/dump and backup
            if end_time - dump_time > DATA_DUMP_TIME: #sync
                self.request_handler.stop()
                if self.request_handler.done():
                    self.dump_data()
                    queue_time = time.clock()
                    dump_time = queue_time
                    dump_time2 = queue_time
                    self.request_handler.start()
            elif end_time - dump_time2 > STATUS_DUMP_TIME: #status
                self.dump_status()
                dump_time2 = end_time
            elif current_time > self.next_backup: #backup
                self.save_queue()
                self.next_backup = backup_files()
                queue_time = time.clock()
            elif end_time - queue_time > QUEUE_SAVE_TIME: #save
                self.save_queue()
                queue_time = time.clock()

        if self.quit_analyze: self.dump_data()
        else: self.dump_status()
        self.save_queue() #do this last!


    def save_queue(self):
        starttime = time.clock()
        self.update_uptime(starttime)
        save_queue(self.alltimestats, "mem/stats")
        with self.request_handler.queue_lock:
            save_queue(self.queue, "mem/queue", queue_to_file)
        save_queue(self.hiscores, "mem/high")
        save_queue(self.save_times, "mem/times")
        save_queue(self.save_amounts, "mem/bytes")
        save_queue(self.bg_images, "mem/backgrounds", bgurl_to_file)
        save_dict(self.games, "mem/games")
        with open("mem/exists", "wb") as f: f.write(self.existlist)
        self.database.save_data()
            #background count
        print "  saved (" + str(len(self.bg_images)) + " bg)",
            #http time
        print "(" + str(round(get_req_time(), 2)) + " http)",
            #database speed
        print "(" + str(round(self.crawl_times_sum / max(self.crawl_times_amount, 1), 2)) + " db)",
            #total speed
        print "(" + str(round((starttime - self.session_starttime) / max(self.crawl_times_amount, 1), 2)) + " tot)",
            #speed of this func
        print get_time_string(starttime)


    def dump_status(self, init=0):
        starttime = time.clock()
        self.update_uptime(starttime)
            #remove data older than one hour
        if len(self.save_times):
            hour_ago = time.time() - 3600
            if self.save_times[-1] < hour_ago:
                self.save_times = []
                self.save_amounts = []
            else:
                i = 0
                while self.save_times[i] < hour_ago: i += 1
                self.save_times = self.save_times[i:]
                self.save_amounts = self.save_amounts[i:]
            #json data
        keys = ("files", "bytes", "inittime", "total_crawls", "total_bytes", "crawl_age", "uptime")
        values = (len(self.save_times),
                  sum(self.save_amounts),
                  self.alltimestats[0],
                  self.alltimestats[1],
                  self.alltimestats[2],
                  time.time() - self.alltimestats[3],
                  self.alltimestats[4])
        data = get_json(keys, values)
            #rest
        self.request_time = max(int(time.time()), self.request_time + 1)
        _hash = calc_hash(self.request_time)
        print "  " + str(request_html("status dump",
                                      DATA_SAVER,
                                      get_post_values(_hash,
                                                      self.request_time,
                                                      "s",
                                                      data,
                                                      init))[-1]),
        print get_time_string(starttime)


    def dump_data(self):
        self.save_queue()
        #do this first because the data dump may take a very long time
        #  and the server may start to think that the crawler is no longer crawling
        self.dump_status()
        #do the analysis and update some values
        self.request_time, recrawl_queue, self.alltimestats[3], new_hiscores, self.alltimestats[5] =\
            self.database.synchronize(self.request_time, self.bg_images, self.games, self.alltimestats[5], self.existlist)
        #hi-score change
        hiscore_changed = False
        for i in range(len(self.hiscores)):
            if self.hiscores[i] != new_hiscores[i]:
                hiscore_changed = True
                print "hi-score '" + self.item_names[i] + "' changed: " +\
                      str(self.hiscores[i]) + " -> " + str(new_hiscores[i])
                self.hiscores[i] = new_hiscores[i]
        if hiscore_changed: print
        #recrawl
        recrawl_len = len(recrawl_queue)
        if recrawl_len:
            if recrawl_len >= len(self.queue): self.queue = recrawl_queue
            else: self.queue = recrawl_queue + self.queue[recrawl_len:]
            #also update the new queue for the request handler
            self.request_handler.queue = self.queue
        print "Length of queue: " + str(len(self.queue)) + "\n"