Esempio n. 1
0
    def get(self):
        """Store authors found by a Twitter keyword search as new ``Tuser`` rows.

        Reads the ``q`` request parameter, parses the feed at the URL built
        from ``TWITTER_SEARCH_FORMAT`` and collects the author of every entry
        that is newer than the ``created`` time of the user returned by
        ``recentTuser()``. Authors that are not among the names returned by
        ``users_dict_by_names`` are written with a single ``db.put`` call.

        The response body reports how many users were actually stored.
        """
        search_keyword = self.request.get('q')
        # NOTE(review): the keyword is interpolated without URL-escaping;
        # confirm callers only send values that are safe inside the URL.
        recent_user_url = TWITTER_SEARCH_FORMAT % search_keyword
        feed = feedparser.parse(recent_user_url)
        last_user = recentTuser()
        last_user_created = last_user.created if last_user else None
        profile_prefix = 'http://twitter.com/'

        users_feed_dict = {}
        for entry in feed.entries:
            entry_datetime = datetime(*(entry.updated_parsed[:6]))
            # Skip if the result is already crawled.
            if last_user_created and entry_datetime <= last_user_created:
                continue
            # Only entries whose href is a twitter.com profile URL carry a
            # user name; it is the part that follows the prefix.
            if 'href' not in entry or not entry.href.startswith(profile_prefix):
                continue
            author = entry.href[len(profile_prefix):]
            # Look the icon up only for entries that are kept, so a skipped
            # entry with fewer than two links can no longer abort the request.
            icon_url = entry.links[1]['href']
            users_feed_dict[author] = {
                'uid': '0',
                'name': author,
                'profile_image': icon_url,
                # Back-dated four weeks from the entry time -- presumably so
                # the crawler treats the new user as due; verify.
                'last_crawl': entry_datetime - timedelta(weeks=4),
            }

        # NOTE(review): unlike the Favotter handler, the number of names sent
        # to users_dict_by_names is not capped here -- confirm that is safe.
        db_users_dict = users_dict_by_names(list(users_feed_dict))
        users_to_put = []
        for user_name, user_info in users_feed_dict.items():
            if user_name in db_users_dict:
                debug("%s is already in db" % user_name)
                continue
            users_to_put.append(Tuser(
                uid="0",
                name=user_name,
                profile_image=user_info['profile_image'],
                last_crawl=user_info['last_crawl'],
                priority=1
            ))
        db.put(users_to_put)
        # Report the users that were stored, not every user seen in the feed.
        self.response.out.write('added new %d users' % len(users_to_put))
Esempio n. 2
0
    def get(self):
        """Store users listed in the Favotter feed as new ``Tuser`` rows.

        Parses ``FAVOTTER_FEED`` and, in each entry summary, looks at the text
        from the first ``'fav by'`` onwards. User names and icon URLs are
        extracted from that text with two regexes and paired by position;
        an entry whose two lists differ in length is skipped. At most
        ``GQL_QUERY_IN_MAX`` distinct names are collected.

        Names not among those returned by ``users_dict_by_names`` are written
        with one ``db.put`` call, the count is passed to
        ``add_tuser_counter`` and reported in the response body.
        """
        fav_by_regex = re.compile('twitter.com/(?P<username>[^"]+)')
        icon_regex = re.compile('src="(?P<icon_image>[^"]+)')
        feed = feedparser.parse(FAVOTTER_FEED)
        # Back-dated four weeks from now -- presumably so the crawler treats
        # the new users as due; verify.
        dummy_last_crawl = datetime.now() - timedelta(weeks=4)

        users_feed_dict = {}
        for entry in feed.entries:
            # Once the cap is reached nothing more can be collected, so stop
            # scanning the feed instead of parsing the remaining entries.
            if len(users_feed_dict) >= GQL_QUERY_IN_MAX:
                break
            start_index = entry.summary.find('fav by')
            if start_index < 0:
                continue
            fav_by_string = entry.summary[start_index:]
            debug(fav_by_string)
            username_list = fav_by_regex.findall(fav_by_string)
            icon_url_list = icon_regex.findall(fav_by_string)

            # Names and icons are matched by position, so the lists must
            # have the same length to be trusted.
            if len(username_list) != len(icon_url_list):
                debug("length mismatch")
                debug(username_list)
                debug(icon_url_list)
                debug(fav_by_string)
                continue

            debug(len(username_list))
            for author, icon_url in zip(username_list, icon_url_list):
                if author in users_feed_dict:
                    continue
                # NOTE(review): the cap presumably mirrors a limit of the
                # lookup done by users_dict_by_names -- confirm.
                if len(users_feed_dict) >= GQL_QUERY_IN_MAX:
                    break
                users_feed_dict[author] = {
                    'uid': '0',
                    'name': author,
                    'profile_image': icon_url,
                    'last_crawl': dummy_last_crawl,
                }

        db_users_dict = users_dict_by_names(list(users_feed_dict))
        users_to_put = []
        for user_name, user_info in users_feed_dict.items():
            debug(user_name)
            # Do not save existing user
            if user_name in db_users_dict:
                debug("%s is already in db" % user_name)
                continue
            tuser = Tuser(
                uid="0",
                name=user_name,
                profile_image=user_info['profile_image'],
                last_crawl=user_info['last_crawl'],
                priority=1
            )
            debug(user_info)
            users_to_put.append(tuser)
        db.put(users_to_put)
        added_user_count = len(users_to_put)
        add_tuser_counter(added_user_count)
        self.response.out.write('added new %d users' % added_user_count)