from flask import render_template, request
import spotipy

import songBase
import tweets


def index():
    token = request.args.get('token', '')
    twitter = request.args.get('twitter', '')
    # Map the Twitter handle to a color, if one was supplied.
    if twitter:
        t = tweets.Tweets()
        color = t.user_to_color(twitter)
    else:
        color = None
    # With a Spotify token, look up the user's profile and saved tracks,
    # then pick a song matching the color, country, and age.
    if token:
        sp = spotipy.Spotify(auth=token)
        results = sp.current_user_saved_tracks()
        profile = sp.current_user()
        base = songBase.Songbase()
        country = profile['country']
        age = calculate_age(profile['birthdate'])
        song = base.get_song(color, country, age, [])
    else:
        results = None
        profile = None
        song = None
    return render_template('index.html', token=token, results=results,
                           profile=profile, twitter=twitter, color=color,
                           colorcode=color_to_code(color), song=song)
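# `calculate_age` and `color_to_code` are referenced above but not defined in
# this snippet. Below is a minimal sketch of calculate_age, assuming the
# Spotify profile's 'birthdate' arrives as a 'YYYY-MM-DD' string; the body is
# an assumption, not the original implementation.
from datetime import date, datetime


def calculate_age(birthdate):
    # Hypothetical helper: count whole years elapsed since the birthdate.
    born = datetime.strptime(birthdate, '%Y-%m-%d').date()
    today = date.today()
    return today.year - born.year - ((today.month, today.day) < (born.month, born.day))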
import sys
from urllib2 import URLError  # Python 2 module

import tweets


def init_api():
    api = tweets.Tweets()
    container.add('api', api)
    try:
        api.authentication()
    except URLError as e:
        print('error: %s' % e)
        sys.exit(1)
def authenticate(self):
    tyrs.init_conf()
    self.api = tweets.Tweets()
    self.api.authentication()
import json
import sys

import cluster
import news
import processor
import tweets


def main():
    E = float(sys.argv[2])
    news_api = news.News()
    word_processor = processor.Processor()
    tweets_api = tweets.Tweets(int(sys.argv[4]))
    articles = news_api.process_news(news_api.retrieve_everything())

    # Load the tweet dataset (one JSON object per line).
    data = []
    for line in open(sys.argv[1]):
        data.append(json.loads(line))
    all_tweets = tweets_api.process_tweets(data)

    # Tokenize every tweet, dropping tweets with no usable text.
    all_tokens = []
    copied_tweets = list(all_tweets)
    for tweet in copied_tweets:
        tokens = word_processor.tweet_tokenize(tweet[0])
        if tokens["text"] == []:
            all_tweets.remove(tweet)
            continue
        all_tokens.append(tokens)

    all_clusters = []
    # We compute the similarity of one tweet against all existing clusters,
    # not against the other tweets.
    cluster_id = 0
    for i in range(len(all_tweets)):
        tweet = all_tweets[i]
        token = all_tokens[i]
        # First cluster
        if all_clusters == []:
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)
            continue
        clustered = False
        # Alternative (commented-out) approach: push to the single most
        # similar cluster rather than the first one above the threshold.
        # max_cluster_similarity = 0
        # max_cluster_index = -1
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            vector = single_cluster.get_vector(False)
            # No common words between the tweet and the cluster: skip.
            common_text_vector = intersection(vector["text"], token["text"])
            common_hashtag_vector = intersection(vector["hashtag"], token["hashtag"])
            common_url_vector = intersection(vector["url"], token["url"])
            if common_text_vector == [] and \
                    common_hashtag_vector == [] and \
                    common_url_vector == []:
                continue
            vector = single_cluster.get_vector(True)
            new_token = {}
            new_token["text"] = " ".join(token["text"])
            new_token["hashtag"] = token["hashtag"]
            new_token["url"] = token["url"]
            similarity = word_processor.new_triple_similarity(new_token, vector)
            # similarity = word_processor.modified_similarity(
            #     similarity, tweet[1], single_cluster)
            # if similarity >= E and similarity > max_cluster_similarity:
            if similarity >= E:
                # max_cluster_similarity = similarity
                # max_cluster_index = j
                single_cluster.push(tweet[0], tweet[1], token)
                clustered = True
                # TODO: when one tweet is similar to multiple clusters,
                # decide which cluster it should be pushed to.
                break
        # if max_cluster_index != -1:
        #     all_clusters[max_cluster_index].push(tweet[0], tweet[1], token)
        #     clustered = True
        if not clustered:
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)

    print("Total number of clusters generated: %d" % (len(all_clusters)))
    cluster_sizes = [x.get_size() for x in all_clusters]
    print("The sizes of all clusters generated:")
    print(cluster_sizes)
    max_cluster_size = max(cluster_sizes)
    print("The max cluster size is: %d" % (max_cluster_size))

    # The similarity score is better the higher it is: close to 1 means the
    # documents are very similar, close to 0 very different.
    # TODO: after clustering finishes, compute the similarity between each
    # cluster and each news article we retrieved.
    F = float(sys.argv[3])
    related_news_clusters = []
    for i in range(len(articles)):
        news_cluster_group = {}
        article = articles[i]
        text = article["title"] + article["description"]
        publish_time = article["publish_time"]
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            # Skip outlier (small) clusters.
            if single_cluster.get_size() <= 10:
                continue
            cluster_vector = single_cluster.get_vector(True)["text"]
            similarity = word_processor.docs_similarity(text, cluster_vector)
            similarity = word_processor.modified_similarity(
                similarity, publish_time, single_cluster, True)
            if similarity >= F:
                # The news article is related to this cluster.
                news_cluster_group["article"] = i
                news_cluster_group["cluster"] = single_cluster.get_id()
                related_news_clusters.append(news_cluster_group)
                # Stop comparing with other clusters.
                break

    # Count how many articles were matched to each cluster.
    counter = {}
    for item in related_news_clusters:
        if item["cluster"] not in counter:
            counter[item["cluster"]] = 1
        else:
            counter[item["cluster"]] += 1
    # most_related_cluster = max(counter.items(), key=operator.itemgetter(1))[0]

    print("Number of pairs generated in total: %d" % (len(related_news_clusters)))
    print("All generated pairs:")
    print(related_news_clusters)
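# The clustering loops above and below rely on an `intersection` helper that
# is not shown in these snippets. A minimal sketch of the assumed behavior
# (elements common to both token sequences, preserving order):
def intersection(a, b):
    # Assumed semantics: keep each element of `a` that also appears in `b`.
    return [item for item in a if item in b]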
from sys import argv  # for passing arguments from the command line
import time           # for tracking run time

import tweets         # contains the Tweets class

# Program that calculates the median number of unique words per tweet
# Author: Jason Keung
# Created: July 4, 2015

if __name__ == "__main__":
    script, infile, outfile = argv
    start_time = time.time()
    print("Starting median_unique.py...")
    mytweet = tweets.Tweets(infile, outfile)
    while mytweet.read_tweet():
        mytweet.get_num_unique_words()
        mytweet.write_median()  # write the median after EACH line is read
    mytweet.close()
    print("Output is saved to %s" % outfile)
    print("median_unique.py ran successfully!")
    print("--- %s seconds ---\n" % (time.time() - start_time))
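# Example invocation (file paths are hypothetical):
#     python median_unique.py tweet_input/tweets.txt tweet_output/median_unique.txt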
import tweets
import songBase
import webbrowser
import fb

t = tweets.Tweets()
base = songBase.Songbase()


def init():
    global my_name
    global my_color
    my_name = raw_input('Enter your twitter name: ')
    my_color = t.user_to_color(my_name)
    print "Song base size: " + str(base.get_size())
    print "=========================="
    print "Your color is: " + my_color


def next_song():
    global my_color
    global my_url
    my_url = base.get_song(my_color, "SG", 22, [])
    print "Current song: " + my_url
    print "1. Next song"
    print "2. Like this song"
    webbrowser.open(my_url)


def like_song():
    global my_color
    global my_url
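# Hypothetical driver for the interactive flow above (not part of the
# original script): prompt for a Twitter name, then open the first song.
if __name__ == '__main__':
    init()
    next_song()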
import sys

import cluster
import news
import processor
import tweets


def main():
    if int(sys.argv[5]) == 1:
        enable_time_relevancy = True
    else:
        enable_time_relevancy = False
    if int(sys.argv[6]) == 1:
        enable_hashtag_similarity = True
    else:
        enable_hashtag_similarity = False
    E = float(sys.argv[2])
    word_processor = processor.Processor(enable_hashtag_similarity)
    tweets_api = tweets.Tweets(int(sys.argv[4]))
    all_tweets = tweets_api.process_tweets(sys.argv[1])

    # Tokenize every tweet, dropping tweets with no usable text.
    all_tokens = []
    copied_tweets = list(all_tweets)
    for tweet in copied_tweets:
        tokens = word_processor.tweet_tokenize(tweet[0])
        if tokens["text"] == []:
            all_tweets.remove(tweet)
            continue
        all_tokens.append(tokens)

    all_clusters = []
    # We compute the similarity of one tweet against all existing clusters,
    # not against the other tweets.
    cluster_id = 0
    for i in range(len(all_tweets)):
        # First cluster
        if all_clusters == []:
            tweet = all_tweets[i]
            token = all_tokens[i]
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)
            continue
        clustered = False
        token = all_tokens[i]
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            vector = single_cluster.get_vector(False)
            # No common words between the tweet and the cluster: skip.
            if not intersection(vector["text"], token["text"]) and \
                    not intersection(vector["hashtag"], token["hashtag"]):
                continue
            new_token = {}
            new_token["text"] = " ".join(token["text"])
            new_token["hashtag"] = token["hashtag"]
            # Commented-out shortcut: treat the tweet as identical when all
            # of its words already appear in the cluster.
            # if all_text_in_cluster(new_token["text"], vector["text"]):
            #     similarity = 1
            try:
                vector = single_cluster.get_vector(True)
                similarity = word_processor.new_triple_similarity(new_token, vector)
            except Exception:
                # Skip clusters whose similarity cannot be computed.
                continue
            if enable_time_relevancy:
                similarity = word_processor.modified_similarity(
                    similarity, all_tweets[i][1], single_cluster)
            if similarity >= E:
                tweet = all_tweets[i]
                single_cluster.push(tweet[0], tweet[1], token)
                clustered = True
                break
        if not clustered:
            tweet = all_tweets[i]
            token = all_tokens[i]
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)

    print("Total number of clusters generated: %d" % (len(all_clusters)))
    cluster_sizes = [x.get_size() for x in all_clusters]
    print("The sizes of all clusters generated:")
    print(cluster_sizes)
    max_cluster_size = max(cluster_sizes)
    print("The max cluster size is: %d" % (max_cluster_size))

    # The similarity score is better the higher it is: close to 1 means the
    # documents are very similar, close to 0 very different.
    # TODO: after clustering finishes, compute the similarity between each
    # cluster and each news article we retrieved.
    news_api = news.News()
    articles = news_api.process_news(news_api.retrieve_everything())
    F = float(sys.argv[3])
    related_news_clusters = []
    for i in range(len(articles)):
        news_cluster_group = {}
        article = articles[i]
        text = article["title"] + article["description"]
        publish_time = article["publish_time"]
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            # Skip outlier (small) clusters.
            if single_cluster.get_size() < 10:
                continue
            cluster_vector = single_cluster.get_vector(True)["text"]
            if not intersection(cluster_vector, text):
                continue
            similarity = word_processor.docs_similarity(text, cluster_vector)
            # if enable_time_relevancy:
            #     similarity = word_processor.modified_similarity(
            #         similarity, publish_time, single_cluster, True)
            if similarity >= F:
                # The news article is related to this cluster.
                news_cluster_group["article"] = i
                news_cluster_group["cluster"] = single_cluster.get_id()
                news_cluster_group["similarity"] = similarity
                related_news_clusters.append(news_cluster_group)
                # Stop comparing with other clusters.
                break

    print("Number of pairs generated in total: %d" % (len(related_news_clusters)))
    print("All generated pairs:")
    print(related_news_clusters)
    for related_pair in related_news_clusters:
        print("News below")
        article_id = related_pair["article"]
        print(articles[article_id])
        cluster_id = related_pair["cluster"]
        print("Tweets below:")
        for k in range(len(all_clusters[cluster_id].get_all_tweets())):
            print("[%d]: %s: " % (k, all_clusters[cluster_id].get_all_tweets()[k]))
        print("----------------------------------------------------")
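# word_processor.docs_similarity is project-internal and not shown in these
# snippets. As a rough illustration only, a document-to-cluster similarity of
# this shape could be computed with TF-IDF cosine similarity; this is an
# assumption about the approach, not the project's actual implementation.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def docs_similarity_sketch(doc_a, doc_b):
    # Returns a value in [0, 1]; closer to 1 means more similar, matching
    # how the threshold F is applied above.
    matrix = TfidfVectorizer().fit_transform([doc_a, doc_b])
    return cosine_similarity(matrix[0:1], matrix[1:2])[0][0]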
import json
import sys

from gensim.models import KeyedVectors

import cluster
import news
import processor
import tweets


def main():
    E = float(sys.argv[2])
    # One-time conversion of the GloVe vectors to word2vec format:
    # glove_file = datapath('glove.twitter.27B/glove.twitter.27B.200d.txt')
    # tmp_file = get_tmpfile("tweets_word2vec.txt")
    # _ = glove2word2vec(glove_file, tmp_file)
    # model = KeyedVectors.load_word2vec_format(tmp_file)
    # model.save("tweets_word2vec.model")
    # print("model completed")
    model = KeyedVectors.load("glove.twitter.27B/tweets_word2vec.model")
    news_api = news.News()
    word_processor = processor.Processor()
    tweets_api = tweets.Tweets()
    articles = news_api.process_news(news_api.retrieve_everything())

    # Load the tweet dataset (one JSON object per line).
    data = []
    for line in open(sys.argv[1]):
        data.append(json.loads(line))
    all_tweets = tweets_api.process_tweets(data)

    # Tokenize every tweet, dropping tweets with no usable text.
    all_tokens = []
    copied_tweets = list(all_tweets)
    for tweet in copied_tweets:
        tokens = word_processor.tweet_tokenize(tweet[0])
        if tokens == []:
            all_tweets.remove(tweet)
            continue
        all_tokens.append(tokens)

    all_clusters = []
    cluster_id = 0
    for i in range(len(all_tweets)):
        tweet = all_tweets[i]
        token = all_tokens[i]
        # First cluster
        if all_clusters == []:
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)
            continue
        clustered = False
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            vector = single_cluster.get_vector(False)
            # No common words between the tweet and the cluster: skip.
            common_text_vector = intersection(vector["text"], token["text"])
            common_hashtag_vector = intersection(vector["hashtag"], token["hashtag"])
            common_url_vector = intersection(vector["url"], token["url"])
            if common_text_vector == [] and \
                    common_hashtag_vector == [] and \
                    common_url_vector == []:
                continue
            new_token = {}
            new_token["text"] = token["text"]
            new_token["hashtag"] = token["hashtag"]
            # TODO: words missing from the pre-trained model can be detected
            # with something like:
            # for word in new_token["text"]:
            #     if word in model.vocab:
            #         print(word)
            # KeyedVectors exposes n_similarity directly (the original called
            # it through a deprecated .wv attribute).
            similarity = model.n_similarity(new_token["text"], vector["text"])
            print(similarity)
            if similarity >= E:
                single_cluster.push(tweet[0], tweet[1], token)
                clustered = True
                break
        if not clustered:
            new_cluster = cluster.Cluster(tweet[0], tweet[1], token, cluster_id)
            cluster_id += 1
            all_clusters.append(new_cluster)

    print("Total number of clusters generated: %d" % (len(all_clusters)))
    cluster_sizes = [x.get_size() for x in all_clusters]
    print("The sizes of all clusters generated:")
    print(cluster_sizes)
    max_cluster_size = max(cluster_sizes)
    print("The max cluster size is: %d" % (max_cluster_size))

    F = float(sys.argv[3])
    related_news_clusters = []
    for i in range(len(articles)):
        news_cluster_group = {}
        article = articles[i]
        text = article["title"] + article["description"]
        publish_time = article["publish_time"]
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            # Skip outlier (small) clusters.
            if single_cluster.get_size() <= 10:
                continue
            cluster_vector = single_cluster.get_vector(True)["text"]
            similarity = word_processor.docs_similarity(text, cluster_vector)
            similarity = word_processor.modified_similarity(
                similarity, publish_time, single_cluster, True)
            if similarity >= F:
                # The news article is related to this cluster.
                news_cluster_group["article"] = i
                news_cluster_group["cluster"] = single_cluster.get_id()
                related_news_clusters.append(news_cluster_group)
                # Stop comparing with other clusters.
                break

    # Count how many articles were matched to each cluster.
    counter = {}
    for item in related_news_clusters:
        if item["cluster"] not in counter:
            counter[item["cluster"]] = 1
        else:
            counter[item["cluster"]] += 1
    # most_related_cluster = max(counter.items(), key=operator.itemgetter(1))[0]

    print("Number of pairs generated in total: %d" % (len(related_news_clusters)))
    print("All generated pairs:")
    print(related_news_clusters)
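# The manual counting loop above could equivalently use collections.Counter;
# a drop-in sketch:
from collections import Counter

counter = Counter(item["cluster"] for item in related_news_clusters)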
import sys

import cluster
import news
import processor
import tweets
# Doc2VecModel is a project-internal wrapper around gensim's Doc2Vec;
# its import path is not shown in the original.


def main():
    # Settings
    if int(sys.argv[5]) == 1:
        enable_time_relevancy = True
    else:
        enable_time_relevancy = False
    if int(sys.argv[6]) == 1:
        enable_hashtag_similarity = True
    else:
        enable_hashtag_similarity = False
    E = float(sys.argv[2])
    # model = Doc2Vec.load("./enwiki_dbow/doc2vec.bin")
    doc2vec_model = Doc2VecModel()
    model = doc2vec_model.get_model()
    word_processor = processor.Processor(enable_hashtag_similarity)
    tweets_api = tweets.Tweets(int(sys.argv[4]))
    all_tweets = tweets_api.process_tweets(sys.argv[1])

    # Tokenize every tweet, dropping tweets with no usable text.
    all_tokens = []
    copied_tweets = list(all_tweets)
    for tweet in copied_tweets:
        tokens = word_processor.tweet_tokenize(tweet[0])
        if tokens == []:
            all_tweets.remove(tweet)
            continue
        all_tokens.append(tokens)

    all_clusters = []
    cluster_id = 0
    for i in range(len(all_tweets)):
        # First cluster
        if all_clusters == []:
            tweet = all_tweets[i]
            token = all_tokens[i]
            new_cluster = cluster.Cluster(
                tweet[0], tweet[1], token, cluster_id, True, model)
            cluster_id += 1
            all_clusters.append(new_cluster)
            continue
        clustered = False
        token = all_tokens[i]
        for j in range(len(all_clusters)):
            vector = all_clusters[j].get_vector(False)
            single_cluster = all_clusters[j]
            # No common words between the tweet and the cluster: skip.
            if not intersection(vector["text"], token["text"]) and \
                    not intersection(vector["hashtag"], token["hashtag"]):
                continue
            new_token = {}
            new_token["text"] = token["text"]
            new_token["hashtag"] = token["hashtag"]
            # Alternative (commented-out): plain cosine over inferred DBOW
            # vectors instead of the combined similarity below.
            # cluster_dbow_vector = model.infer_vector(vector["text"])
            # similarity = 1 - spatial.distance.cosine(tweet_dbow_vector,
            #                                          cluster_dbow_vector)
            tweet_dbow_vector = model.infer_vector(new_token["text"])
            similarity = word_processor.doc2vec_double_similarity(
                new_token, vector, tweet_dbow_vector, all_clusters[j])
            if enable_time_relevancy:
                similarity = word_processor.modified_similarity(
                    similarity, all_tweets[i][1], single_cluster)
            if similarity >= E:
                tweet = all_tweets[i]
                all_clusters[j].push(tweet[0], tweet[1], token)
                clustered = True
                break
        if not clustered:
            tweet = all_tweets[i]
            token = all_tokens[i]
            new_cluster = cluster.Cluster(
                tweet[0], tweet[1], token, cluster_id, True, model)
            cluster_id += 1
            all_clusters.append(new_cluster)

    print("Total number of clusters generated: %d" % (len(all_clusters)))
    cluster_sizes = [x.get_size() for x in all_clusters]
    print("The sizes of all clusters generated:")
    print(cluster_sizes)
    max_cluster_size = max(cluster_sizes)
    print("The max cluster size is: %d" % (max_cluster_size))

    news_api = news.News()
    articles = news_api.process_news(news_api.retrieve_everything())
    F = float(sys.argv[3])
    related_news_clusters = []
    for i in range(len(articles)):
        news_cluster_group = {}
        article = articles[i]
        text = article["title"] + article["description"]
        publish_time = article["publish_time"]
        for j in range(len(all_clusters)):
            single_cluster = all_clusters[j]
            # Skip outlier (small) clusters.
            if single_cluster.get_size() < 10:
                continue
            cluster_vector = single_cluster.get_vector(True)["text"]
            if not intersection(cluster_vector, text):
                continue
            similarity = word_processor.docs_similarity(text, cluster_vector)
            # if enable_time_relevancy:
            #     similarity = word_processor.modified_similarity(
            #         similarity, publish_time, single_cluster, True)
            if similarity >= F:
                # The news article is related to this cluster.
                news_cluster_group = {}
                news_cluster_group["article"] = i
                news_cluster_group["cluster"] = single_cluster.get_id()
                news_cluster_group["similarity"] = similarity
                related_news_clusters.append(news_cluster_group)
                # Stop comparing with other clusters.
                break

    print("Number of pairs generated in total: %d" % (len(related_news_clusters)))
    print("All generated pairs:")
    print(related_news_clusters)
    for related_pair in related_news_clusters:
        print("News below")
        article_id = related_pair["article"]
        print(articles[article_id])
        cluster_id = related_pair["cluster"]
        print("Tweets below:")
        for k in range(len(all_clusters[cluster_id].get_all_tweets())):
            print("[%d]: %s: " % (k, all_clusters[cluster_id].get_all_tweets()[k]))
        print("----------------------------------------------------")
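# The commented-out alternative above scores a tweet against a cluster with a
# plain cosine over inferred DBOW vectors. A self-contained sketch of that
# variant, assuming a loaded gensim Doc2Vec model:
from scipy import spatial


def doc2vec_cosine(model, tokens_a, tokens_b):
    # Infer a DBOW vector for each token list and return cosine similarity;
    # 1.0 means same direction, 0.0 unrelated.
    vec_a = model.infer_vector(tokens_a)
    vec_b = model.infer_vector(tokens_b)
    return 1 - spatial.distance.cosine(vec_a, vec_b)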