Example #1
def _require_session_owned(session_id):
    """
    Require that the session is owned by the logged in user
    """
    session_r = _session.find_one({'_id': bson.ObjectId(session_id)})
    if not session_r:
        raise Exception('Session not found')    

    search_r = _search.find_one({'_id': bson.ObjectId(session_r['search_id'])})
    if not search_r:
        raise Exception('Search not found')
 
    if search_r['username'] != session.get('username', ''):
        raise Exception('You do not have permission to access this snapshot')
        
    return (search_r, session_r)
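A minimal usage sketch of the ownership check above from a Flask view; the route, the view name, and the delete behaviour are assumptions for illustration, not part of the original code.

@app.route('/session/<session_id>/delete')
def delete_session(session_id):
    # Hypothetical endpoint: verify ownership before touching the data.
    # _jsonify, _session and _tweets are the module-level helpers assumed
    # by the examples in this file.
    try:
        search_r, session_r = _require_session_owned(session_id)
        _tweets.remove({'session_id': session_id})
        _session.remove({'_id': session_r['_id']})
        return _jsonify(deleted=session_id)
    except Exception as e:
        traceback.print_exc()
        return _jsonify(error=str(e))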
Example #2
def _require_session_owned(session_id):
    """
    Require that the session is owned by the logged in user
    """
    session_r = _session.find_one({'_id': bson.ObjectId(session_id)})
    if not session_r:
        raise Exception('Session not found')

    search_r = _search.find_one({'_id': bson.ObjectId(session_r['search_id'])})
    if not search_r:
        raise Exception('Search not found')

    if search_r['username'] != session.get('username', ''):
        raise Exception('You do not have permission to access this snapshot')

    return (search_r, session_r)
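The helpers above rely on module-level objects that are not shown: pymongo collection handles (_search, _session, _tweets), Flask's request and session, and bson for ObjectId. A minimal setup sketch under those assumptions; the connection details, database name, and collection names are guesses.

import bson
import pymongo
from flask import Flask, request, session

app = Flask(__name__)

# Mongo collection handles used by the examples; database and collection
# names are guesses. Note that _session (a Mongo collection) is distinct
# from Flask's session object.
_db = pymongo.MongoClient()['twitter_analysis']
_search = _db['searches']
_session = _db['sessions']
_tweets = _db['tweets']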
Example #3
def filter(session_id):
    """
    Get histogram and tweets
    
    @filter[]: repeated query parameter of elements to filter by
        if an element starts with '#', it is a hashtag
        if it starts with 'http', it is a URL
        otherwise, it is a stem
    """
    try:
        _require_session_access(session_id)

        session_r = _session.find_one(
            {'_id': bson.ObjectId(session_id)})
        if not session_r:
            raise Exception('Session not found')
            
        search_r = _search.find_one(
            {'_id': bson.ObjectId(session_r['search_id'])})
        if not search_r:
            raise Exception('Search not found')
                        
        # Find tweets
        params = {'session_id': session_id}
        
        filter_list = request.args.getlist('filter[]')  # repeated ?filter[]= query parameter
        filter_stems = []
        filter_hashtags = []       
        filter_urls = []
        
        for element in filter_list:
            if element.startswith('#'):
                filter_hashtags.append(element)
            elif element.startswith('http'):
                filter_urls.append(element)
            else:
                filter_stems.append(element)
                
        if filter_urls:
            params['urls'] = {'$all': filter_urls}
        if filter_stems:
            params['stems'] = {'$all': filter_stems}
        if filter_hashtags:
            params['hashtags'] = {'$all': filter_hashtags}
        
        cursor = _tweets.find(params, {
                'embed': 1,
                'id_str': 1,
                'created_at': 1,
                'user.name': 1,
                'user.screen_name': 1,
                'retweeted_status.id_str': 1,
                'stems': 1,
                'hashtags': 1,
                'urls': 1
            }, sort=[('dt', pymongo.DESCENDING)])
        
        # Process tweets
        stem_counter = Counter()
        hashtag_counter = Counter()
        url_counter = Counter()
        
        tweets = []   
        retweets = 0        
        id_set = set()
        
        for tweet in cursor:  
            stem_counter.update(tweet['stems'])
            hashtag_counter.update(tweet['hashtags'])
            url_counter.update(tweet['urls'])
            
            if tweet['id_str'] in id_set:
                retweets += 1
                continue
            id_set.add(tweet['id_str'])
          
            if 'retweeted_status' in tweet:
                retweeted_id = tweet['retweeted_status']['id_str']
                if retweeted_id in id_set:
                    retweets += 1
                    continue              
                id_set.add(retweeted_id)
                    
            tweets.append({
                'text': tweet['embed'],
                'user_name': tweet['user']['name'],
                'user_screen_name': tweet['user']['screen_name'],
                'id_str': tweet['id_str'],
                'created_at': tweet['created_at']           
            })
                
        stem_counts = [x for x in stem_counter.most_common() \
            if x[0] not in filter_stems]
        hashtag_counts = [x for x in hashtag_counter.most_common() \
            if x[0] not in filter_hashtags]
        url_counts = [x for x in url_counter.most_common() \
            if x[0] not in filter_urls]
                           
        return _jsonify(
            search=search_r,
            session=session_r,
            stem_counts=stem_counts, 
            hashtag_counts=hashtag_counts,
            url_counts=url_counts,
            tweets=tweets,
            retweets=retweets
        )
    except Exception as e:
        traceback.print_exc()
        return _jsonify(error=str(e))
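A small worked example of how the filter[] elements are split and folded into the Mongo query built above; the sample values and session id are illustrative only.

elements = ['#python', 'http://example.com/a', 'data scienc']

params = {'session_id': '507f1f77bcf86cd799439011'}   # hypothetical session id
hashtags = [e for e in elements if e.startswith('#')]
urls = [e for e in elements if e.startswith('http')]
stems = [e for e in elements if not e.startswith(('#', 'http'))]

if urls:
    params['urls'] = {'$all': urls}
if stems:
    params['stems'] = {'$all': stems}
if hashtags:
    params['hashtags'] = {'$all': hashtags}

# params now matches only tweets in this session containing ALL listed elements:
# {'session_id': '507f1f77bcf86cd799439011',
#  'urls': {'$all': ['http://example.com/a']},
#  'stems': {'$all': ['data scienc']},
#  'hashtags': {'$all': ['#python']}}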
Example #4
def analyze():
    """
    Get tweets from Twitter and analyze them

    @language = language code, e.g. 'en'

    @query = query string
        OR
    @list_id = list id
    """
    try:
        language = request.args.get('language') or 'en'

        query = request.args.get('query')
        list_id = request.args.get('list_id')
         
        if query:
            query_lower = query.lower()
        elif list_id:
            list_map = _get_list_map()
        else:    
            raise Exception('No query or list specified')                  
               
        # Get api object
        api = tweepy.API(get_oauth())
                            
        # Get/create search record
        param = {
            'username': session['username'], 
            'language': language
        }
        if query:
            param['query_lower'] = query_lower
        else:
            param['list_id'] = list_id
        search_r = _search.find_one(param)
        if not search_r:
            search_r = param
            if query:
                search_r['query'] = query   
            else:
                search_r['list_name'] = list_map[list_id]    
            search_r['_id'] = _search.save(search_r, manipulate=True)
        search_id = str(search_r['_id'])
        
        # Create new search session
        session_r = {
            'search_id': search_id,
            'dt': datetime.datetime.now().isoformat(),
            'stem_counts': [],      # [[stem, post count]]
            'stem_map': {},         # {stem: [term, count]}  
        }
        session_r['_id'] = _session.save(session_r, manipulate=True)
        session_id = str(session_r['_id'])
 
        # Process tweets
        stopwords = extract.get_stopwords(language).copy()  
        stoptags = set()
        stemmer = extract.get_stemmer(language)
        stem_map = defaultdict(Counter)       
        tweet_list = []      
        
        if query:
            stoptags.update([x.lower().lstrip('#') for x in query_lower.split()])
            stopwords.update(stoptags)
            cursor = tweepy.Cursor(api.search, q=query, lang=language, \
                count=100, result_type='recent', include_entities=True) 
        else:
            cursor = tweepy.Cursor(api.list_timeline, list_id=list_id, \
                count=100, include_entities=True)

        for tweet in cursor.items(limit=settings.TWITTER_SEARCH_LIMIT):  
            tweet_dict = twutil.tweepy_model_to_dict(tweet)
                                        
            tweet_dict['session_id'] = session_id
            tweet_dict['embed'] = twutil.format_text(tweet_dict)              
            tweet_dict['tokens'] = extract.tokenize(tweet_dict['text'])
            
            # Filter out hashtags that already appear in the query
            tweet_dict['hashtags'] = list(set([
                    '#' + x['text'].lower()
                    for x in tweet_dict['entities']['hashtags']
                    if x['text'].lower() not in stoptags
                ]))
                
            tweet_dict['urls'] = list(set([x['expanded_url'] \
                for x in tweet_dict['entities']['urls']]))
            
            tweet_list.append(tweet_dict)

        # ------------------------------------------------------------
        # Process trigrams
        
        trigram_counter = Counter()
        
        for tweet in tweet_list:
            grams = [] 
                     
            for tokens in tweet['tokens']:
                for g in nltk.ngrams(tokens, 3):
                    if extract.stoplist_iter(g, stopwords):
                        continue
                    if g[0].startswith('@') or g[1].startswith('@') or g[2].startswith('@'):
                        continue
                    grams.append(g)

            stems = extract.stems_from_grams(grams, stemmer)                                                            
            for s, g in zip(stems, grams):
                stem_map[s].update([g]) 
                               
            tweet['stems_3'] = list(set(stems))
            trigram_counter.update(tweet['stems_3'])            

        # Ignore trigrams that only appear once
        for g, n in list(trigram_counter.items()):  # copy; entries are deleted while iterating
            if n < 2:
                del trigram_counter[g]
                del stem_map[g]
               
        # ------------------------------------------------------------
        # Process bigrams
        
        bigram_counter = Counter()
        
        for tweet in tweet_list:
            grams = []    
            stems = []
                    
            for tokens in tweet['tokens']:
                # materialize: ngrams() may return a generator in newer NLTK,
                # and gram_list is indexed and measured with len() below
                gram_list = list(nltk.ngrams(tokens, 2))
                stem_list = extract.stems_from_grams(gram_list, stemmer)

                last_i = len(gram_list) - 1
                
                for i, g in enumerate(gram_list):     
                    if extract.stoplist_iter(g, stopwords):
                        continue     
                    if g[0].startswith('@') or g[1].startswith('@'):
                        continue    
                    
                    # Filter by trigrams                              
                    if i > 0 and \
                    (stem_list[i-1][0], stem_list[i][0], stem_list[i][1]) in trigram_counter:
                        continue
                    if i < last_i and \
                    (stem_list[i][0], stem_list[i][1], stem_list[i+1][1]) in trigram_counter:
                        continue
                    
                    grams.append(g)
                    stems.append(stem_list[i])
                                                      
            for s, g in zip(stems, grams):
                stem_map[s].update([g]) 
                               
            tweet['stems_2'] = list(set(stems))
            bigram_counter.update(tweet['stems_2'])            
             
        # Ignore bigrams that only appear once
        for g, n in list(bigram_counter.items()):  # copy; entries are deleted while iterating
            if n < 2:
                del bigram_counter[g]
                del stem_map[g]
                 
        # ------------------------------------------------------------
        # Process unigrams              
        
        for tweet in tweet_list:
            grams = []
            stems = []
            
            for tokens in tweet['tokens']:                               
                # materialize for len() and indexing (ngrams() may be a generator)
                gram_list = list(nltk.ngrams(tokens, 1))
                stem_list = extract.stems_from_grams(gram_list, stemmer)
                
                last_i = len(gram_list) - 1
                               
                for i, g in enumerate(gram_list):
                    if extract.stoplist_iter(g, stopwords):
                        continue
                        
                    # Filter bigram terms
                    if i > 0 and \
                    (stem_list[i-1][0], stem_list[i][0]) in bigram_counter:
                        continue
                    if i < last_i and \
                    (stem_list[i][0], stem_list[i+1][0]) in bigram_counter:
                        continue                        
                        
                    # Filter trigram terms  
                    if i > 1 and \
                    (stem_list[i-2][0], stem_list[i-1][0], stem_list[i][0]) in trigram_counter:
                        continue
                    if i > 0 and i < last_i and \
                    (stem_list[i-1][0], stem_list[i][0], stem_list[i+1][0]) in trigram_counter:
                        continue                    
                    if i < (last_i - 1) and \
                    (stem_list[i][0], stem_list[i+1][0], stem_list[i+2][0]) in trigram_counter:
                        continue
                                
                    grams.append(g)
                    stems.append(stem_list[i])
                        
            for s, g in zip(stems, grams):
                stem_map[s].update([g]) 
          
            # Process stems
            tweet['stems'] = [' '.join(x) for x in set(stems)] 
            tweet['stems'].extend([' '.join(x) for x in tweet['stems_2'] if x in bigram_counter])            
            tweet['stems'].extend([' '.join(x) for x in tweet['stems_3'] if x in trigram_counter])
            
            del tweet['stems_2']
            del tweet['stems_3']
                        
        # Update session
        for stem, c in stem_map.items():
            session_r['stem_map'][' '.join(stem)] = \
                [' '.join(k) for k, v in c.most_common()]        
        
        # Save tweets
        if tweet_list:
            _tweets.insert(tweet_list)

        session_r['tweet_count'] = len(tweet_list)
        _session.save(session_r)
                
        return _jsonify(session=session_r)
    except tweepy.TweepError as e:
        traceback.print_exc()
        return _jsonify(error=e.message[0]['message'])
    except Exception as e:
        # plain exceptions (e.g. 'No query or list specified') are reported as JSON too
        traceback.print_exc()
        return _jsonify(error=str(e))
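A toy sketch of the bigram-inside-trigram containment test used in the n-gram cascade above. toy_stems stands in for extract.stems_from_grams (it only lower-cases), and all values are made up; the point is the index arithmetic, not the project's real helpers.

from collections import Counter
import nltk

def toy_stems(grams):
    # stand-in for extract.stems_from_grams: just lower-case each token
    return [tuple(t.lower() for t in g) for g in grams]

tokens = ['new', 'york', 'city', 'mayor']

# pretend ('new', 'york', 'city') survived the trigram pass with count 3
trigram_counter = Counter({('new', 'york', 'city'): 3})

gram_list = list(nltk.ngrams(tokens, 2))   # [('new','york'), ('york','city'), ('city','mayor')]
stem_list = toy_stems(gram_list)
last_i = len(stem_list) - 1

kept = []
for i, s in enumerate(stem_list):
    # drop a bigram if, together with a neighbouring token, it spells a kept trigram
    if i > 0 and (stem_list[i-1][0], s[0], s[1]) in trigram_counter:
        continue
    if i < last_i and (s[0], s[1], stem_list[i+1][1]) in trigram_counter:
        continue
    kept.append(s)

print(kept)   # [('city', 'mayor')]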
Example #5
def filter(session_id):
    """
    Get histogram and tweets

    @filter[]: repeated query parameter of elements to filter by
        if an element starts with '#', it is a hashtag
        if it starts with 'http', it is a URL
        otherwise, it is a stem
    """
    try:
        _require_session_access(session_id)

        session_r = _session.find_one(
            {'_id': bson.ObjectId(session_id)})
        if not session_r:
            raise Exception('Session not found')

        search_r = _search.find_one(
            {'_id': bson.ObjectId(session_r['search_id'])})
        if not search_r:
            raise Exception('Search not found')

        # Find tweets
        params = {'session_id': session_id}

        filter_list = request.args.getlist('filter[]')  # repeated ?filter[]= query parameter
        filter_stems = []
        filter_hashtags = []
        filter_urls = []

        for element in filter_list:
            if element.startswith('#'):
                filter_hashtags.append(element)
            elif element.startswith('http'):
                filter_urls.append(element)
            else:
                filter_stems.append(element)

        if filter_urls:
            params['urls'] = {'$all': filter_urls}
        if filter_stems:
            params['stems'] = {'$all': filter_stems}
        if filter_hashtags:
            params['hashtags'] = {'$all': filter_hashtags}

        cursor = _tweets.find(params, {
                'embed': 1,
                'id_str': 1,
                'created_at': 1,
                'user.name': 1,
                'user.screen_name': 1,
                'retweeted_status.id_str': 1,
                'stems': 1,
                'hashtags': 1,
                'urls': 1
            }, sort=[('dt', pymongo.DESCENDING)])

        # Process tweets
        stem_counter = Counter()
        hashtag_counter = Counter()
        url_counter = Counter()

        tweets = []
        retweets = 0
        id_set = set()

        for tweet in cursor:
            stem_counter.update(tweet['stems'])
            hashtag_counter.update(tweet['hashtags'])
            url_counter.update(tweet['urls'])

            if tweet['id_str'] in id_set:
                retweets += 1
                continue
            id_set.add(tweet['id_str'])

            if 'retweeted_status' in tweet:
                retweeted_id = tweet['retweeted_status']['id_str']
                if retweeted_id in id_set:
                    retweets += 1
                    continue
                id_set.add(retweeted_id)

            tweets.append({
                'text': tweet['embed'],
                'user_name': tweet['user']['name'],
                'user_screen_name': tweet['user']['screen_name'],
                'id_str': tweet['id_str'],
                'created_at': tweet['created_at']
            })

        stem_counts = [x for x in stem_counter.most_common() \
            if x[0] not in filter_stems]
        hashtag_counts = [x for x in hashtag_counter.most_common() \
            if x[0] not in filter_hashtags]
        url_counts = [x for x in url_counter.most_common() \
            if x[0] not in filter_urls]

        return _jsonify(
            search=search_r,
            session=session_r,
            stem_counts=stem_counts,
            hashtag_counts=hashtag_counts,
            url_counts=url_counts,
            tweets=tweets,
            retweets=retweets
        )
    except Exception as e:
        traceback.print_exc()
        return _jsonify(error=str(e))
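How a client might supply the repeated filter[] parameter to this endpoint; the URL rule and session id are assumptions, since the route decorator is not shown above.

with app.test_client() as client:
    resp = client.get(
        '/filter/507f1f77bcf86cd799439011',          # hypothetical route and session id
        query_string={'filter[]': ['#python', 'data']}
    )
    # inside the view, request.args.getlist('filter[]') returns ['#python', 'data']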
Example #6
def analyze():
    """
    Get tweets from Twitter and analyze them

    @language = language code, e.g. 'en'

    @query = query string
        OR
    @list_id = list id
    """
    try:
        language = request.args.get('language') or 'en'

        query = request.args.get('query')
        list_id = request.args.get('list_id')

        if query:
            query_lower = query.lower()
        elif list_id:
            list_map = _get_list_map()
        else:
            raise Exception('No query or list specified')

        # Get api object
        api = tweepy.API(get_oauth())

        # Get/create search record
        param = {
            'username': session['username'],
            'language': language
        }
        if query:
            param['query_lower'] = query_lower
        else:
            param['list_id'] = list_id
        search_r = _search.find_one(param)
        if not search_r:
            search_r = param
            if query:
                search_r['query'] = query
            else:
                search_r['list_name'] = list_map[list_id]
            search_r['_id'] = _search.save(search_r, manipulate=True)
        search_id = str(search_r['_id'])

        # Create search session
        session_r = {
            'search_id': search_id,
            'dt': datetime.datetime.now().isoformat(),
            'stem_counts': [],      # [[stem, post count]]
            'stem_map': {},         # {stem: [term, count]}
        }
        session_r['_id'] = _session.save(session_r, manipulate=True)
        session_id = str(session_r['_id'])

        # Process tweets
        stopwords = extract.get_stopwords(language).copy()
        stoptags = set()
        stemmer = extract.get_stemmer(language)
        stem_map = defaultdict(Counter)
        tweet_list = []

        if query:
            stoptags.update([x.lower().lstrip('#') for x in query_lower.split()])
            stopwords.update(stoptags)
            cursor = tweepy.Cursor(api.search, q=query, lang=language, \
                count=100, result_type='recent', include_entities=True)
        else:
            cursor = tweepy.Cursor(api.list_timeline, list_id=list_id, \
                count=100, include_entities=True)

        for tweet in cursor.items(limit=settings.TWITTER_SEARCH_LIMIT):
            tweet_dict = twutil.tweepy_model_to_dict(tweet)

            tweet_dict['session_id'] = session_id
            tweet_dict['embed'] = twutil.format_text(tweet_dict)
            tweet_dict['tokens'] = extract.tokenize(tweet_dict['text'])

            # Filter out hashtags that already appear in the query
            tweet_dict['hashtags'] = list(set([
                    '#' + x['text'].lower()
                    for x in tweet_dict['entities']['hashtags']
                    if x['text'].lower() not in stoptags
                ]))

            tweet_dict['urls'] = list(set([x['expanded_url'] \
                for x in tweet_dict['entities']['urls']]))

            tweet_list.append(tweet_dict)

        # ------------------------------------------------------------
        # Process trigrams

        trigram_counter = Counter()

        for tweet in tweet_list:
            grams = []

            for tokens in tweet['tokens']:
                for g in nltk.ngrams(tokens, 3):
                    if extract.stoplist_iter(g, stopwords):
                        continue
                    if g[0].startswith('@') or g[1].startswith('@') or g[2].startswith('@'):
                        continue
                    grams.append(g)

            stems = extract.stems_from_grams(grams, stemmer)
            for s, g in zip(stems, grams):
                stem_map[s].update([g])

            tweet['stems_3'] = list(set(stems))
            trigram_counter.update(tweet['stems_3'])

        # Ignore trigrams that only appear once
        for g, n in list(trigram_counter.items()):  # copy; entries are deleted while iterating
            if n < 2:
                del trigram_counter[g]
                del stem_map[g]

        # ------------------------------------------------------------
        # Process bigrams

        bigram_counter = Counter()

        for tweet in tweet_list:
            grams = []
            stems = []

            for tokens in tweet['tokens']:
                # materialize: ngrams() may return a generator in newer NLTK,
                # and gram_list is indexed and measured with len() below
                gram_list = list(nltk.ngrams(tokens, 2))
                stem_list = extract.stems_from_grams(gram_list, stemmer)

                last_i = len(gram_list) - 1

                for i, g in enumerate(gram_list):
                    if extract.stoplist_iter(g, stopwords):
                        continue
                    if g[0].startswith('@') or g[1].startswith('@'):
                        continue

                    # Filter by trigrams
                    if i > 0 and \
                    (stem_list[i-1][0], stem_list[i][0], stem_list[i][1]) in trigram_counter:
                        continue
                    if i < last_i and \
                    (stem_list[i][0], stem_list[i][1], stem_list[i+1][1]) in trigram_counter:
                        continue

                    grams.append(g)
                    stems.append(stem_list[i])

            for s, g in zip(stems, grams):
                stem_map[s].update([g])

            tweet['stems_2'] = list(set(stems))
            bigram_counter.update(tweet['stems_2'])

        # Ignore bigrams that only appear once
        for g, n in list(bigram_counter.items()):  # copy; entries are deleted while iterating
            if n < 2:
                del bigram_counter[g]
                del stem_map[g]

        # ------------------------------------------------------------
        # Process unigrams

        for tweet in tweet_list:
            grams = []
            stems = []

            for tokens in tweet['tokens']:
                # materialize for len() and indexing (ngrams() may be a generator)
                gram_list = list(nltk.ngrams(tokens, 1))
                stem_list = extract.stems_from_grams(gram_list, stemmer)

                last_i = len(gram_list) - 1

                for i, g in enumerate(gram_list):
                    if extract.stoplist_iter(g, stopwords):
                        continue

                    # Filter bigram terms
                    if i > 0 and \
                    (stem_list[i-1][0], stem_list[i][0]) in bigram_counter:
                        continue
                    if i < last_i and \
                    (stem_list[i][0], stem_list[i+1][0]) in bigram_counter:
                        continue

                    # Filter trigram terms
                    if i > 1 and \
                    (stem_list[i-2][0], stem_list[i-1][0], stem_list[i][0]) in trigram_counter:
                        continue
                    if i > 0 and i < last_i and \
                    (stem_list[i-1][0], stem_list[i][0], stem_list[i+1][0]) in trigram_counter:
                        continue
                    if i < (last_i - 1) and \
                    (stem_list[i][0], stem_list[i+1][0], stem_list[i+2][0]) in trigram_counter:
                        continue

                    grams.append(g)
                    stems.append(stem_list[i])

            for s, g in zip(stems, grams):
                stem_map[s].update([g])

            # Process stems
            tweet['stems'] = [' '.join(x) for x in set(stems)]
            tweet['stems'].extend([' '.join(x) for x in tweet['stems_2'] if x in bigram_counter])
            tweet['stems'].extend([' '.join(x) for x in tweet['stems_3'] if x in trigram_counter])

            del tweet['stems_2']
            del tweet['stems_3']

        # Update session
        for stem, c in stem_map.items():
            session_r['stem_map'][' '.join(stem)] = \
                [' '.join(k) for k, v in c.most_common()]

        # Save tweets
        if tweet_list:
            _tweets.insert(tweet_list)

        session_r['tweet_count'] = len(tweet_list)
        _session.save(session_r)

        return _jsonify(session=session_r)
    except tweepy.TweepError as e:
        traceback.print_exc()
        return _jsonify(error=e.message[0]['message'])
    except Exception as e:
        # plain exceptions (e.g. 'No query or list specified') are reported as JSON too
        traceback.print_exc()
        return _jsonify(error=str(e))
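An illustrative sketch of the stem_map shape that analyze() builds and of how it is folded into the session record; the stems and surface forms are made up for the example.

from collections import Counter, defaultdict

stem_map = defaultdict(Counter)
stem_map[('climat', 'chang')].update([
    ('climate', 'change'), ('climate', 'change'), ('climate', 'changes')
])

session_r = {'stem_map': {}}
for stem, c in stem_map.items():
    # surface forms observed for each stem, most common first
    session_r['stem_map'][' '.join(stem)] = [' '.join(k) for k, v in c.most_common()]

print(session_r['stem_map'])
# {'climat chang': ['climate change', 'climate changes']}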