Code Example #1
File: datagrabber.py  Project: sbilstein/twitterjelly
 def __init__(self):
     self.sql = SQLQuery()
     self.tf = TweetFetcher(sql_obj=self.sql)
     # TODO More sophisticated stopword filtering.
     self.stopwords = ["the", "lol", "yea", "haha"]
Code Example #2
File: datagrabber.py  Project: sbilstein/twitterjelly
import hashlib
import json
import urllib.request
from decimal import Decimal

import unidecode
from bs4 import UnicodeDammit

# Project-local modules, inferred from usage below; the actual import paths
# in twitterjelly may differ.
import celebmatcher
import debuglog
from sqlquery import SQLQuery
from tfidf import TfIdf
from tweetfetcher import TweetFetcher


class DataGrabber:
    ##@perftest
    def __init__(self):
        self.sql = SQLQuery()
        self.tf = TweetFetcher(sql_obj=self.sql)
        # TODO More sophisticated stopword filtering.
        self.stopwords = ["the", "lol", "yea", "haha"]

    ##@perftest
    def GetUserTweets(self, user, can_retry=True):
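        # Prefer a live timeline fetch when the fetcher allows it; otherwise
        # fall back to previously stored tweet data. If the live fetch comes
        # back without 'results', retry once, then fall back.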
        if self.tf.canFetchTimeline():
            user_data = self.tf.fetchUserTimeline(user,
                                                  format="searchapi",
                                                  use_filesystem_cache=True)
        else:
            user_data = self.tf.getUserTweetsData(user)

        if 'results' not in user_data:
            if 'error' in user_data:
                ret = {'status': 'error'}
                if user_data['error'] == 'Not authorized':
                    ret['error'] = 'protected'
                return ret
            if can_retry:
                return self.GetUserTweets(user, can_retry=False)
            else:
                user_data = self.tf.getUserTweetsData(user)
        elif not len(user_data['results']):
            ret = {'status': 'error', 'error': 'no_tweets'}
            return ret

        return user_data

    ##@perftest
    def GetUserTFIDFs(self, user_data):
        tfidf_obj = TfIdf()

        # GET TERM COUNTS AND BUILD DICTS
        terms = {}
        token_mapping = {}
        user_tweets = {}
        tokens_count = 0

        for tweet in user_data['results']:
            user_tweets[tweet['id']] = tweet

            tokens = list(tfidf_obj.get_tokens(tweet['text'],
                                               tagtypes=False,
                                               wordsonly=True,
                                               excludeUrls=True,
                                               minLength=3))

            tokens_count += len(tokens)

            for token in tokens:
                terms[token] = terms.get(token, 0) + 1
                token_mapping.setdefault(token, []).append(tweet['id'])

        # CALCULATE TF-IDF
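        # tf = term count / total tokens across the user's tweets; the IDF
        # for each term comes from the remote service, and tfidf = tf * idf.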
        idfs = self.GetTermIDFs(terms.keys())
        scores = {}

        for term in terms.keys():
            if term in idfs['idfs']:
                tf = Decimal(terms[term]) / Decimal(tokens_count)
                this_tfidf = tf * Decimal(idfs['idfs'][term])
                scores[term] = float(this_tfidf)

        sorted_scores = sorted(scores.items(), key=lambda x: -x[1])

        user_scores = dict(sorted_scores)

        return {
            'scores_dic': user_scores,
            'scores_list': sorted_scores,
            'tweets': user_tweets,
            'token_mapping': token_mapping
        }

    ##@perftest
    def GetTermIDFs(self, terms):
        if not terms:
            # Nothing to look up; mirror the service's response shape.
            # (json.loads on a dict would raise a TypeError here.)
            return {"idfs": {}}

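        # The IDF values come from a remote CGI endpoint that returns JSON
        # shaped like {"idfs": {term: idf, ...}}.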
        url = 'http://50.56.221.228/cgi-bin/idf.php?'
        # TODO: HTML entity encoding (?)
        # TODO: Enhanced encoding detection - first term's encoding may not be always appropriate.
        data = ('terms=' + ','.join(terms).replace("#", "%23")).encode("utf-8")
        debuglog.msg(data)

        txt_unicode = UnicodeDammit(urllib.request.urlopen(url, data).read())
        txt = txt_unicode.unicode_markup
        txt = txt.replace(",null:", ',"null":')  # workaround: the service emits an unquoted null key
        data = json.loads(txt)  # txt is already a decoded str; json.loads takes no encoding argument
        return data

    ##@perftest
    def GetCelebTFIDFsForTerms(self, terms):
        # An empty term list would build invalid SQL ("IN()"), so bail early.
        if not terms:
            return []

        q = "SELECT * FROM celeb_tfidf WHERE token IN("
        count = 0
        vals = {}
        for term in terms:
            vals['token' + str(count)] = unidecode.unidecode(term)
            q += "%(token" + str(count) + ")s,"

            count += 1

        q = q[:-1]  # remove last comma
        q += ") ORDER BY score DESC"
        results = self.sql.q(q, vals)

        return results

    ##@perftest
    def GetCelebTweetStats(self):
        q = "SELECT * FROM celeb_stats WHERE tr_day > -1"
        results = self.sql.q(q)

        return results

    def GetCelebMatchesForUser(self, user):
        """
        Generate object with information about user and matches with celeb (including matching tweets) to pass to the
        UI.

        TODO: Break this into smaller functions, it's way too big.
        """

        results = {
            'user': {
                'screen_name': user,
                'name': '',
                'pic_url': '',
                'personality': ''
            },
            'celeb_matches': [],
            'celeb_matches_pers': []
        }

        # GET USER TWEETS
        user_data = self.GetUserTweets(user)

        # Return an error if the lookup failed or the user has no tweets;
        # GetUserTweets can return a bare {'status': 'error'} dict with no
        # 'results' key.
        if (user_data is None or 'results' not in user_data
                or not len(user_data['results'])):
            results['status'] = 'error'
            return results

        results['user']['name'] = user_data['results'][0]['user']['name']
        results['user']['pic_url'] = user_data['results'][0]['user'][
            'profile_image_url']

        #Pass user_data and celeb_stats to get celeb matches
        celeb_stats = self.GetCelebTweetStats()
        celeb_matches = celebmatcher.getCelebMatches(user_data, celeb_stats)

        celebs = celeb_matches[1]
        results['user']['personality'] = celeb_matches[0]

        #get pic urls for celeb pers matches
        if len(celebs) > 0:
            q = "SELECT from_user,profile_image_url FROM tweets WHERE from_user="******"%(token" + str(count) + ")s OR from_user="******" GROUP BY from_user"

            q_results = self.sql.q(q, vals)
            celeb_match_pers_array = []
            for res in q_results:
                celeb_match_pers_array.append([res[0], res[1]])
            results['celeb_matches_pers'] = celeb_match_pers_array

        # GET USER TFIDF
        user_tfidf = self.GetUserTFIDFs(user_data)
        user_scores = user_tfidf['scores_dic']

        debuglog.msg("top user terms are", user_tfidf['scores_list'][:15])

        # GET CELEBS TFIDF
        celeb_scores = self.GetCelebTFIDFsForTerms(
            [term[0] for term in user_tfidf['scores_list']][:15])

        # CALCULATE MATCH SCORES
        cumulative_celeb_scores = {}
        celeb_words = {}
        for entry in celeb_scores:
            celeb = entry[0]

            if celeb.lower() == user.lower():
                continue

            token = unidecode.unidecode(entry[1])
            score = float(entry[2])

            # celeb_tfidf tokens are unidecoded and may not exactly match the
            # user's original tokens, so default missing scores to 0.
            if celeb in cumulative_celeb_scores:
                celeb_words[celeb][token] = score
                cumulative_celeb_scores[celeb] += user_scores.get(token, 0.0) * score
            else:
                celeb_words[celeb] = {token: score}
                cumulative_celeb_scores[celeb] = user_scores.get(token, 0.0) * score

        matches = [(celeb, cumulative_celeb_scores[celeb], celeb_words[celeb])
                   for celeb in cumulative_celeb_scores]
        matches.sort(key=lambda x: -cumulative_celeb_scores[x[0]])

        # FIND MATCHING TWEETS FOR TOP 10 CELEBS
        for top_10_celeb_index in range(min(10, len(matches))):
            celeb_match = {
                'screen_name':
                matches[top_10_celeb_index][0],
                'name':
                '',
                'pic_url':
                '',
                'match_score':
                cumulative_celeb_scores[matches[top_10_celeb_index][0]],
                #'top_words' : matches[top_10_celeb_index][2],
                'top_words': {},
                'tweets': []
            }

            #vals = {'celeb':matches[top_10_celeb_index][0], 'tokens': ' '.join(matches[top_10_celeb_index][2])}
            #q = "SELECT text, id, from_user_name, profile_image_url FROM tweets WHERE from_user=%(celeb)s AND MATCH(text) AGAINST(%(tokens)s)"

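            # Build a parameterized IN(...) clause over this celeb's top
            # tokens; each token gets its own %(tokenN)s placeholder so the
            # SQL driver handles quoting and escaping.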
            q = "SELECT text, id, from_user_name, profile_image_url FROM tweets, (SELECT tweet_id FROM token_user_mapping WHERE user=%(celeb)s AND token IN ("
            vals = {'celeb': matches[top_10_celeb_index][0]}

            count = 0
            for token in list(matches[top_10_celeb_index][2].keys()):
                # TODO Clean up stopword filtering
                if token.lower() not in self.stopwords:
                    vals['token' + str(count)] = token
                    q += '%(token' + str(count) + ')s, '
                    count += 1

            # trim last comma and space.
            if count:
                q = q[:-2]

            q += ")) as t WHERE tweets.id=t.tweet_id;"
            q_results = self.sql.q(q, vals)

            # skip if we don't have any matching celeb tweets.
            if not q_results or not len(q_results):
                continue

            celeb_match['name'] = q_results[0][2]
            celeb_match['pic_url'] = q_results[0][3]
            matching_celeb_tweets = [{
                'text': result[0],
                'id': result[1]
            } for result in q_results]

            matches[top_10_celeb_index] = list(matches[top_10_celeb_index])

            # ADD TWEETS THAT MATCH ON TOKENS
            sorted_tokens = sorted(
                matches[top_10_celeb_index][2].keys(),
                key=lambda x: -matches[top_10_celeb_index][2][x])
            # TODO Clean up stopword filtering
            for token in list(
                    filter(lambda x: x not in self.stopwords, sorted_tokens)):
                celeb_tweets_for_token = list(
                    filter(
                        lambda x: x['text'].lower().count(token.lower()) > 0,
                        matching_celeb_tweets))
                user_tweets_for_token = [
                    user_tfidf['tweets'][tweet_id]
                    for tweet_id in user_tfidf['token_mapping'][token]
                ]

                if len(celeb_tweets_for_token) or len(user_tweets_for_token):
                    celeb_match['top_words'][token] = matches[
                        top_10_celeb_index][2][token]

                for matching_tweets_for_token_index in range(
                        min(len(celeb_tweets_for_token),
                            len(user_tweets_for_token))):
                    celeb_match['tweets'].append({
                        'word': token,
                        'user_tweet': {
                            'url':
                            'http://twitter.com/' + user_tweets_for_token[
                                matching_tweets_for_token_index]['user']
                            ['screen_name'] + '/status/' +
                            str(user_tweets_for_token[
                                matching_tweets_for_token_index]['id']),
                            'text':
                            user_tweets_for_token[
                                matching_tweets_for_token_index]['text']
                        },
                        'celeb_tweet': {
                            'url':
                            'http://twitter.com/' +
                            celeb_match['screen_name'] + '/status/' +
                            str(celeb_tweets_for_token[
                                matching_tweets_for_token_index]['id']),
                            'text':
                            celeb_tweets_for_token[
                                matching_tweets_for_token_index]['text']
                        }
                    })

            #matches[top_10_celeb_index].append({matches[top_10_celeb_index][0]:matching_celeb_tweets,user:matching_user_tweets})

            if len(celeb_match['tweets']):
                results['celeb_matches'].append(celeb_match)

        results['status'] = 'ok'
        results['permalink_id'] = self.StorePermalink(results)
        return results

    def StorePermalink(self, results_obj):
        # Store the results JSON keyed by its md5 digest; the digest doubles
        # as the permalink id returned to the caller. ("digest" avoids
        # shadowing the built-in hash().)
        json_txt = json.dumps(results_obj)
        digest = hashlib.md5(json_txt.encode('utf-8')).hexdigest()
        q = "INSERT INTO stored_matches_json (hashed, user, json) VALUES(%(hash)s, %(user)s, %(json)s);"
        vals = {
            'user': results_obj['user']['screen_name'],
            'json': json_txt,
            'hash': digest
        }

        self.sql.q(q, vals)
        return digest
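
Usage sketch (hypothetical, not part of the project): it assumes the
project-local SQLQuery, TweetFetcher, TfIdf, and celebmatcher dependencies
are importable and configured, and uses a made-up screen name:

grabber = DataGrabber()
results = grabber.GetCelebMatchesForUser("some_user")

if results["status"] == "ok":
    for match in results["celeb_matches"]:
        print(match["screen_name"], match["match_score"])
        for pair in match["tweets"][:3]:
            print("  shared word:", pair["word"])
else:
    print("lookup failed:", results.get("error", "unknown"))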