# Standard-library / third-party imports used by the functions below.
# The db_*, word_*, sentiment(), fetch_parallel(), clas_clas_text() and
# setup_graph() helpers are defined elsewhere in this project.
import os
import re
import time
from operator import itemgetter

import numpy as np
import matplotlib
import matplotlib.pyplot as plt


def make_url_hist(cursor, party=""):
    # Build a histogram of the web domains linked to by MPs of a given party
    # (party="" matches everyone) over the last max_days days.
    max_days = 365
    words_delete_all()
    users = db_get_cols_from_table(cursor, "user_names", ["user_id", "job_type1"])
    for i in range(0, 10):          # NOTE: only the first 10 users are scanned
        u = users[i][0]
        p = users[i][1]
        print(u, p)
        if p.startswith(party):
            cur_time = time.time()
            tweets = db_get_cols_from_table(cursor, u, ["date", "url"])
            for ii in range(0, len(tweets)):
                date = int(tweets[ii][0])
                url = tweets[ii][1]
                if url != "None" and url != "error":
                    if ((cur_time - date) / 60 / 60 / 24) < max_days:
                        address = url.split("/")[0]     # keep just the domain
                        if address.startswith("www."):
                            address = address[4:]
                        word_add(address)
    word_clean()
    return words_ret_hist()

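# Usage sketch for make_url_hist(). Assumptions (hedged): the cursor comes from
# the project's own DB connection code, and words_ret_hist() returns parallel
# (names, counts) lists sorted most-common first, as noun_anal() below relies on.
def print_top_domains_sketch(cursor, party="lab", top_n=15):
    names, counts = make_url_hist(cursor, party=party)
    for domain, n in list(zip(names, counts))[:top_n]:
        print(domain, n)
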
def sen_run_over_db(cursor):
    # Score every unscored tweet from the last two years and store the result
    # in a per-user "sen" column.
    users = db_get_all_users(cursor)
    cur_time = time.time()
    for i in range(0, len(users)):
        u = users[i]
        print(u, i, len(users))
        if not db_is_col(cursor, u, "sen"):
            db_add_col(cursor, u, "sen")
            print("adding")
        tweets = db_get_cols_from_table(cursor, u, ["tweet_id", "tweet", "sen", "date"])
        for t in tweets:
            delta = int((cur_time - int(t[3])) / 60 / 60 / 24)
            if delta < 365 * 2:
                if t[2] == "None":          # only tweets not yet scored
                    st = t[1].replace("\\\\", "")
                    s = sentiment(st)
                    print(".", end='', flush=True)
                    db_update_record(cursor, u, "tweet_id", t[0], "sen", str(s))
        print("")
        db_commit()

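# sen_run_over_db() relies on a sentiment() helper defined elsewhere in this
# project. The sketch below is NOT that helper; it is a hedged stand-in showing
# one way such a scorer could be built with the vaderSentiment package
# (pip install vaderSentiment), returning a compound score in [-1, 1].
def sentiment_vader_sketch(text):
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
    analyzer = SentimentIntensityAnalyzer()
    # Compound polarity: -1 (strongly negative) .. +1 (strongly positive).
    return analyzer.polarity_scores(text)["compound"]
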
def short_to_long_over_db(cursor):
    # Pull http(s) links out of each tweet, resolve the shortened URLs in
    # batches via fetch_parallel(), and store the expanded URL in a "url" column.
    max_days = 365
    users = db_get_all_users(cursor)
    for i in range(0, len(users)):
        u = users[i]
        print(u)
        if not db_is_col(cursor, u, "url"):
            db_add_col(cursor, u, "url", length="1000")
            print("adding url")
        cur_time = time.time()
        tweets = db_get_cols_from_table(cursor, u, ["date", "tweet", "url", "tweet_id"])
        urls = []
        for ii in range(0, len(tweets)):
            date = int(tweets[ii][0])
            t = tweets[ii][1]
            url = tweets[ii][2]
            tweet_id = tweets[ii][3]
            if url == "None":
                if ((cur_time - date) / 60 / 60 / 24) < max_days:
                    ret = re.search(r"(?P<url>https?://[^\s]+)", t)
                    if ret is not None:
                        ret = ret.group("url")
                        if ret.count("\\u2026") == 0:   # skip links truncated with an ellipsis
                            urls.append([ret, tweet_id])
            # Flush in batches of ~40, and once more at the end of the table.
            if len(urls) > 40 or (len(urls) > 0 and ii == len(tweets) - 1):
                urls_out = fetch_parallel(urls)
                for iii in range(0, len(urls_out)):
                    if urls_out[iii][0] != "error":
                        if urls_out[iii][0].startswith('https://'):
                            urls_out[iii][0] = urls_out[iii][0][8:]
                        if urls_out[iii][0].startswith('http://'):
                            urls_out[iii][0] = urls_out[iii][0][7:]
                        if len(urls_out[iii][0]) > 1000:
                            urls_out[iii][0] = urls_out[iii][0][:1000]
                        db_update_record(cursor, u, "tweet_id", urls_out[iii][1], "url", urls_out[iii][0])
                db_commit()
                urls = []

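# fetch_parallel() is provided elsewhere in the project. Below is a minimal
# sketch of the assumed behaviour, based on how short_to_long_over_db() uses its
# output ([resolved_url_or_"error", tweet_id] pairs): resolve each shortened
# link by following redirects, several requests at a time. The helper name,
# worker count and timeout are illustrative assumptions, not the project's
# actual implementation.
def fetch_parallel_sketch(url_id_pairs, max_workers=8, timeout=10):
    import concurrent.futures
    import requests

    def resolve(pair):
        url, tweet_id = pair
        try:
            # HEAD with redirects enabled returns the final (expanded) URL.
            r = requests.head(url, allow_redirects=True, timeout=timeout)
            return [r.url, tweet_id]
        except requests.RequestException:
            return ["error", tweet_id]

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(resolve, url_id_pairs))
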
def time_on_twitter(cursor):
    # Rough estimate of the total hours MPs spent tweeting over the last year,
    # assuming ~30 s per original tweet and ~10 s per retweet.
    users = db_get_all_users(cursor)
    rt = 0
    original = 0
    for i in range(0, len(users)):
        u = users[i]
        print(u)
        cur_time = time.time()
        tweets = db_get_cols_from_table(cursor, u, ["date", "tweet"])
        for ii in range(0, len(tweets)):
            t = tweets[ii][1]
            delta = (cur_time - int(tweets[ii][0])) / 60 / 60 / 24
            if delta < 365:
                if t.startswith("RT "):
                    rt = rt + 1
                else:
                    original = original + 1
    print(original, rt, len(users), (original * 30.0 + rt * 10.0) / 60 / 60)

def retweet_anal_to_flat_files(cursor):
    # For every MP, write a flat file listing each time another MP retweeted
    # them: "<epoch>:<first 100 chars of the RT>".
    users = db_get_all_users(cursor)
    if not os.path.isdir("retweet_anal"):
        os.mkdir("retweet_anal")
    # Truncate any existing per-user files.
    for i in range(0, len(users)):
        open(os.path.join("retweet_anal", users[i]), 'w').close()
    for i in range(0, len(users)):
        u = users[i]
        print(u)
        tweets = db_get_cols_from_table(cursor, u, ["date", "tweet"])
        for ii in range(0, len(tweets)):
            t = tweets[ii][1]
            d = int(tweets[ii][0])
            if t.startswith("RT @"):
                user = t.split(":")[0][4:]      # the account being retweeted
                if user in users:
                    short = t
                    if len(short) > 100:
                        short = short[:100]
                    with open(os.path.join("retweet_anal", user), 'a') as f:
                        f.write(str(d) + ":" + short + "\n")

def get_tweet_time(cursor, u, st):
    # Return the timestamp of the first tweet by user u that starts with st,
    # or -1 if no such tweet exists.
    tweets = db_get_cols_from_table(cursor, u, ["date", "tweet"])
    for ii in range(0, len(tweets)):
        t = tweets[ii][1]
        d = int(tweets[ii][0])
        if t.startswith(st):
            return d
    return -1

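# Usage sketch: combine the retweet_anal flat files with get_tweet_time() to
# estimate how quickly MPs get retweeted. The line format
# "<epoch>:<RT @user: first 100 chars>" follows retweet_anal_to_flat_files()
# above; the parsing below is an assumption about that format, not project code.
def retweet_lag_sketch(cursor, user):
    lags = []
    with open(os.path.join("retweet_anal", user)) as f:
        for line in f:
            stamp, _, body = line.rstrip("\n").partition(":")
            # Strip the "RT @user: " prefix to recover the start of the original tweet.
            original = body.partition(": ")[2]
            if original:
                t0 = get_tweet_time(cursor, user, original[:40])
                if t0 > 0:
                    lags.append(int(stamp) - t0)    # seconds between post and retweet
    return lags
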
def clas_stats(cursor):
    # Summarise each MP's per-tweet topic labels into a single
    # "topic=count;topic=count;..." string stored in user_names.clas.
    users = db_get_all_users(cursor)
    for u in users:
        print(u)
        words_delete_all()
        c = db_get_cols_from_table(cursor, u, ["clas"])
        for w in c:
            word_add(w[0])
        words = words_ret_hist()
        summary = ""
        for i in range(0, len(words[0])):
            summary = summary + words[0][i] + "=" + str(words[1][i]) + ";"
        summary = summary[:-1]      # drop the trailing ";"
        db_update_record(cursor, "user_names", "user_id", u, "clas", summary)
        db_commit()
        print(summary)

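# clas_stats() stores each MP's topic counts in user_names.clas as a single
# "topic=count;topic=count;..." string. A small sketch (assumed format, per the
# code above) for turning that string back into a dict:
def parse_clas_summary_sketch(summary):
    counts = {}
    for pair in summary.split(";"):
        if "=" in pair:
            topic, _, n = pair.partition("=")
            counts[topic] = int(n)
    return counts

# e.g. parse_clas_summary_sketch("brexit=120;health=45;party=30")
#      -> {'brexit': 120, 'health': 45, 'party': 30}
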
def noun_anal(cursor):
    # Build (optionally) a word histogram over the last 100 days of tweets,
    # save it to noun_hist.dat, then plot the most common entries.
    words_delete_all()
    users = db_get_all_users(cursor)
    update = False
    if update == True:
        print(len(users))
        for i in range(0, len(users)):
            u = users[i]
            print(i, u)
            cur_time = time.time()
            tweets = db_get_cols_from_table(cursor, u, ["tweet", "date"])
            for ii in range(0, len(tweets)):
                t = tweets[ii][0]
                date = int(tweets[ii][1])
                if ((cur_time - date) / 60 / 60 / 24) < 100.0:
                    word_add_array(t)
        names, values = words_ret_hist()
        f = open('noun_hist.dat', 'w')
        for i in range(0, len(names)):
            f.write(names[i] + " " + str(values[i]) + "\n")
        f.close()
    lines = open('noun_hist.dat').read().splitlines()
    http = []
    times = []
    for i in range(0, len(lines)):
        a = lines[i].split()
        http.append(a[0])
        times.append(int(a[1]))
        if i > 15:
            break
    http.reverse()
    times.reverse()
    y_pos = np.arange(len(http))
    for_web = False
    if for_web == True:
        plt.figure(figsize=(25.0, 16.0), dpi=300)
        plt.bar(y_pos, times, color="blue")
        plt.xticks(y_pos, http, fontsize=35)
        plt.legend(loc='best', fontsize=30)
        plt.ylabel('Usage (Tweets)', fontsize=30)
        plt.yticks(fontsize=30)
        plt.xticks(rotation=45, rotation_mode="anchor", ha="right")
        plt.tight_layout()
        plt.savefig('/var/www/html/graphs/nouns.png')
    else:
        plt.figure(figsize=(10.0, 10.0), dpi=300)
        ax = plt.subplot(111)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        plt.barh(y_pos, times, color="#36845b")
        ax.tick_params(axis='x', labelsize=25)      # replaces the deprecated tick.label access
        plt.yticks(y_pos, http, fontsize=25)
        plt.xticks(rotation=45, rotation_mode="anchor", ha="right")
        plt.legend(loc='best', fontsize=25)
        plt.xlabel('Usage (Tweets)', fontsize=25)
        plt.tight_layout()
        plt.savefig('nouns.png')

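# The word_* helpers (words_delete_all, word_add, word_add_array, word_clean,
# words_ret_hist) live in another module of this project. Based on how they are
# used above (a global counter returning parallel name/count lists, most common
# first), a minimal stand-in might look like the class below. This is a sketch
# under those assumptions, not the project's implementation; word_clean()
# presumably prunes rare or stop words and is omitted here.
from collections import Counter

class WordHistSketch:
    def __init__(self):
        self.counts = Counter()

    def delete_all(self):              # cf. words_delete_all()
        self.counts.clear()

    def add(self, word):               # cf. word_add(word)
        self.counts[word] += 1

    def add_array(self, text):         # cf. word_add_array(text): count each token in a tweet
        for token in text.split():
            self.counts[token] += 1

    def ret_hist(self):                # cf. words_ret_hist(): parallel lists, most frequent first
        pairs = self.counts.most_common()
        names = [p[0] for p in pairs]
        values = [p[1] for p in pairs]
        return names, values
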
def sen(cursor):
    # Average the stored per-tweet sentiment scores into a daily time series
    # (all MPs, Conservative, Labour) and write it to a flat file.
    users_party = db_get_cols_from_table(cursor, "user_names", ["user_id", "job_type1"])
    users = []
    party = []
    for i in range(0, len(users_party)):
        users.append(users_party[i][0])
        party.append(users_party[i][1])
    update = True
    days_in_past = 365 * 2
    if update == True:
        all = [0] * days_in_past
        all_tot = [0] * days_in_past
        con = [0] * days_in_past
        con_tot = [0] * days_in_past
        lab = [0] * days_in_past
        lab_tot = [0] * days_in_past
        all_out = [0] * days_in_past
        con_out = [0] * days_in_past
        lab_out = [0] * days_in_past
        for i in range(0, len(users)):
            u = users[i]
            p = party[i]
            print(u, p, i, len(users))
            cur_time = time.time()
            tweets = db_get_cols_from_table(cursor, u, ["date", "tweet", "sen"])
            for ii in range(0, len(tweets)):
                delta = int((cur_time - int(tweets[ii][0])) / 60 / 60 / 24)
                if delta < days_in_past:
                    string_s = tweets[ii][2]
                    if string_s != "None":
                        s = float(string_s)
                        all[delta] += s
                        all_tot[delta] += 1
                        if p.startswith("con"):
                            con[delta] += s
                            con_tot[delta] += 1
                        if p.startswith("lab"):
                            lab[delta] += s
                            lab_tot[delta] += 1
        # Convert the daily sums into daily means.
        for i in range(0, len(all)):
            div = all_tot[i]
            if div != 0:
                all_out[i] = all[i] / div
            div = con_tot[i]
            if div != 0:
                con_out[i] = con[i] / div
            div = lab_tot[i]
            if div != 0:
                lab_out[i] = lab[i] / div
        f = open("sen_" + str(days_in_past) + "_days.txt", 'w')
        for i in range(0, len(all)):
            f.write(str(all_out[i]) + " " + str(con_out[i]) + " " + str(lab_out[i]) + "\n")
        f.close()

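# Usage sketch: plot the rolling sentiment file written by sen(). Each line
# holds three columns (all MPs, Conservative, Labour) indexed by days in the
# past; the default file name assumes days_in_past = 365*2 as set above.
def plot_sen_file_sketch(fname="sen_730_days.txt"):
    data = np.loadtxt(fname)                 # shape: (days, 3)
    days = np.arange(len(data))
    plt.figure(figsize=(10.0, 4.0), dpi=150)
    plt.plot(days, data[:, 0], label="All")
    plt.plot(days, data[:, 1], label="Con")
    plt.plot(days, data[:, 2], label="Lab")
    plt.xlabel("Days in past")
    plt.ylabel("Mean sentiment")
    plt.legend(loc="best")
    plt.tight_layout()
    plt.savefig("sen_example.png")
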
def cal_retweets(cursor):
    # Compute the percentage of each MP's tweets (last 100 days) that are
    # retweets, store it in user_names.retweets, then plot the distribution.
    users = db_get_all_users(cursor)
    v = []
    update = False
    if update == True:
        for i in range(0, len(users)):
            u = users[i]
            print(u)
            cur_time = time.time()
            tweets = db_get_cols_from_table(cursor, u, ["date", "tweet"])
            rt = 0
            original = 0
            for ii in range(0, len(tweets)):
                t = tweets[ii][1]
                delta = (cur_time - int(tweets[ii][0])) / 60 / 60 / 24
                if delta < 100.0:
                    if t.startswith("RT "):
                        rt = rt + 1
                    else:
                        original = original + 1
            if rt + original != 0:
                frac = 100.0 * rt / (rt + original)
            else:
                frac = 0.0
            db_update_record(cursor, "user_names", "user_id", u, "retweets", str(frac))
            db_commit()
    tweets_per_day = db_get_cols_from_table(cursor, "user_names", ["retweets", "job_type1"])
    con = []
    lab = []
    lib = []
    snp = []
    for i in range(0, len(tweets_per_day)):
        party = tweets_per_day[i][1]
        frac = float(tweets_per_day[i][0])      # stored as a float string, e.g. "23.5"
        if party.startswith("con"):
            con.append(frac)
        if party.startswith("lab"):
            lab.append(frac)
        if party.startswith("lib"):
            lib.append(frac)
        if party.startswith("snp"):
            snp.append(frac)
        if frac != 0:
            v.append(frac)
    m = 100
    dx = 1.0
    x = 0.0
    xbins = []
    while x < m:
        xbins.append(x)
        x = x + dx
    con[0] = 0.0
    lab[0] = 0.0
    snp[0] = 0.0
    # lib[0] = 0.0
    for_web = False
    if for_web == True:
        plt.figure(figsize=(25.0, 6.0), dpi=300)
        plt.title("Re-tweets from MPs as % over last 100 days", fontsize=30)
        plt.gcf().subplots_adjust(bottom=0.15)
        plt.hist(v, bins=xbins, alpha=0.5, color='green')
        plt.hist(con, bins=xbins, alpha=0.8, color='blue')
        plt.hist(lab, bins=xbins, alpha=0.8, color='red')
        plt.hist(snp, bins=xbins, alpha=0.8, color='purple')
        plt.hist(lib, bins=xbins, alpha=0.8, color='yellow')
        plt.legend(('All', 'Con', 'Lab', 'SNP', "Lib"), fontsize=25)
        plt.ylabel('Number of MPs', fontsize=25)
        plt.xlabel('Percentage of tweets that are retweets', fontsize=25)
        plt.savefig("/var/www/html/graphs/retweets.png", bbox_inches='tight')
    else:
        matplotlib.rcParams['font.family'] = 'Open Sans'
        plt.figure(figsize=(6.0, 6.0), dpi=300)
        ax = plt.subplot(111)
        plt.gcf().subplots_adjust(bottom=0.15)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        plt.hist(v, bins=xbins, alpha=1.0, color='#36845b')
        plt.ylabel('Number of MPs', fontsize=25)
        plt.xlabel('Percentage retweets', fontsize=25)
        plt.savefig("retweet_dist.png", bbox_inches='tight')

def clas(cursor, delta=172800 / 2):
    # Classify each tweet's topic with clas_clas_text() and store it in a
    # per-user "clas" column, then write a per-user topic histogram to the web
    # "interests" directory. (delta is currently unused.)
    users = db_get_all_users(cursor)
    # users = ["JoJohnsonUK"]
    for i in range(271, 562):       # NOTE: resumes part-way through the user list
        u = users[i]
        print(u, i, len(users))
        if not db_is_col(cursor, u, "clas"):
            db_add_col(cursor, u, "clas")
            print("adding")
        tweets = db_get_cols_from_table(cursor, u, ["tweet_id", "tweet"])
        for t in tweets:
            clas_label, tot, res = clas_clas_text(t[1])
            db_update_record(cursor, u, "tweet_id", t[0], "clas", clas_label)
        db_commit()
        # Re-classify up to 500 tweets to accumulate a topic histogram for this user.
        types = None
        tot_sum = 0
        clas_count = 0
        m = len(tweets)
        if m > 500:
            m = 500
        for ii in range(0, m):
            out, tot, res = clas_clas_text(tweets[ii][1])
            tot_sum = tot_sum + tot
            if types is None:
                types = res
            else:
                for iii in range(0, len(res)):
                    types[iii][1] = types[iii][1] + res[iii][1]
            if out != "unknown":
                clas_count = clas_count + 1
        path = "/var/www/html/interests/" + u + ".txt"
        if clas_count != 0 and tot_sum != 0:
            types = sorted(types, key=itemgetter(1), reverse=True)
            sum_ids = 0
            for ii in range(0, len(types)):
                sum_ids = sum_ids + types[ii][1]
            clas_percent = 100.0 * (clas_count / tot_sum)
            print(u, types, clas_percent)
            if sum_ids != 0:
                f = open(path, 'w')
                for ii in range(0, len(types)):
                    f.write(types[ii][0] + " " + str(int((types[ii][1] / sum_ids) * 100.0)) + "\n")
                f.close()

def cal_tweets_per_day(cursor):
    # Compute each MP's average tweets/day over the last 100 days, store it in
    # user_names.tweets_per_day, then plot the distributions per party.
    users = db_get_all_users(cursor)
    v = []
    update = False
    if update == True:
        for i in range(0, len(users)):
            u = users[i]
            print(u)
            cur_time = time.time()
            tweets = db_get_cols_from_table(cursor, u, ["date"])
            count = 0
            for ii in range(0, len(tweets)):
                delta = (cur_time - int(tweets[ii][0])) / 60 / 60 / 24
                if delta < 100.0:
                    count = count + 1
            db_update_record(cursor, "user_names", "user_id", u, "tweets_per_day", str(count / 100.0))
            db_commit()
    tweets_per_day = db_get_cols_from_table(cursor, "user_names", ["tweets_per_day", "job_type1"])
    con = []
    lab = []
    lib = []
    snp = []
    for i in range(0, len(tweets_per_day)):
        party = tweets_per_day[i][1]
        rate = float(tweets_per_day[i][0])      # stored as a float string, e.g. "1.23"
        if party.startswith("con"):
            con.append(rate)
        if party.startswith("lab"):
            lab.append(rate)
        if party.startswith("lib"):
            lib.append(rate)
        if party.startswith("snp"):
            snp.append(rate)
        v.append(rate)
    m = 60
    dx = 1.0
    x = 0.0
    xbins = []
    while x < m:
        xbins.append(x)
        x = x + dx
    for_web = False
    if for_web == True:
        plt.figure(figsize=(25.0, 6.0), dpi=300)
        plt.title("Tweets from MPs per day", fontsize=30)
        plt.gcf().subplots_adjust(bottom=0.15)
        plt.hist(v, bins=xbins, alpha=0.5, color='green')
        plt.hist(con, bins=xbins, alpha=0.8, color='blue')
        plt.hist(lab, bins=xbins, alpha=0.8, color='red')
        plt.hist(snp, bins=xbins, alpha=0.8, color='purple')
        plt.hist(lib, bins=xbins, alpha=0.8, color='yellow')
        plt.legend(('All', 'Con', 'Lab', 'SNP', "Lib"), fontsize=25)
        plt.ylabel('Number of MPs', fontsize=25)
        plt.xlabel('Number of tweets/day', fontsize=25)
        plt.savefig("/var/www/html/graphs/tweets_per_day.png", bbox_inches='tight')
    else:
        matplotlib.rcParams['font.family'] = 'Open Sans'
        # All MPs
        plt.clf()
        plt.figure(figsize=(6.0, 6.0), dpi=300)
        ax = plt.subplot(111)
        plt.gcf().subplots_adjust(bottom=0.15)
        ax.spines['right'].set_visible(False)
        ax.spines['top'].set_visible(False)
        ax.hist(v, bins=xbins, alpha=1.0, color='#36845b')
        plt.ylabel('Number of MPs', fontsize=20)
        plt.xlabel('Number of tweets/day', fontsize=20)
        plt.savefig("./tweets_per_day/tweets_per_day_all.png", bbox_inches='tight')
        # Con
        ax = setup_graph()
        axes = plt.gca()
        axes.set_ylim([0, 40])
        plt.hist(con, bins=xbins, alpha=0.8, color="#00539f")
        plt.savefig("./tweets_per_day/tweets_per_day_all_con.png", bbox_inches='tight')
        # Lab
        ax = setup_graph()
        axes = plt.gca()
        axes.set_ylim([0, 40])
        plt.hist(lab, bins=xbins, alpha=0.8, color="#d50000")
        plt.savefig("./tweets_per_day/tweets_per_day_all_lab.png", bbox_inches='tight')
        # SNP
        ax = setup_graph()
        axes = plt.gca()
        axes.set_ylim([0, 40])
        plt.hist(snp, bins=xbins, alpha=0.8, color="#fff685")
        plt.savefig("./tweets_per_day/tweets_per_day_all_snp.png", bbox_inches='tight')
        # Lib
        ax = setup_graph()
        plt.hist(lib, bins=xbins, alpha=0.8, color='#faa01a')
        plt.savefig("./tweets_per_day/tweets_per_day_all_lib.png", bbox_inches='tight')

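# setup_graph() is defined elsewhere in this project. Judging from the "All MPs"
# panel above (square figure, hidden top/right spines), a plausible stand-in is
# sketched below; this is an assumption, not the project's actual helper.
def setup_graph_sketch():
    plt.clf()
    plt.figure(figsize=(6.0, 6.0), dpi=300)
    ax = plt.subplot(111)
    plt.gcf().subplots_adjust(bottom=0.15)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    return ax
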
def topics(cursor):
    # Count tweets per topic per day (from the stored "clas" labels) and plot
    # the time series for a subset of topics. Only some of the counters below
    # are currently tallied and plotted.
    users = db_get_all_users(cursor)
    update = True
    days_in_past = 200
    cur_time = time.time() - 7 * 24 * 60 * 60    # reference time shifted one week back
    if update == True:
        brexit = [0] * days_in_past
        defence = [0] * days_in_past
        econ = [0] * days_in_past
        education = [0] * days_in_past
        health = [0] * days_in_past
        home_aff = [0] * days_in_past
        international = [0] * days_in_past
        misc = [0] * days_in_past
        party = [0] * days_in_past
        sci_env = [0] * days_in_past
        sport = [0] * days_in_past
        terror = [0] * days_in_past
        transport = [0] * days_in_past
        for i in range(0, len(users)):
            u = users[i]
            print(u)
            tweets = db_get_cols_from_table(cursor, u, ["date", "clas"])
            for ii in range(0, len(tweets)):
                c = tweets[ii][1]
                delta = int((cur_time - int(tweets[ii][0])) / 60 / 60 / 24)
                # Guard against negative deltas from tweets newer than the
                # shifted reference time, which would index from the list's end.
                if 0 <= delta < days_in_past:
                    if c == "health":
                        health[delta] += 1
                    elif c == "transport":
                        transport[delta] += 1
                    elif c == "education":
                        education[delta] += 1
                    elif c == "sci_env" or c == "science":
                        sci_env[delta] += 1
                    elif c == "party":
                        party[delta] += 1
                    elif c == "brexit":
                        brexit[delta] += 1
    m = days_in_past
    dx = 1.0
    x = 0.0
    xbins = []
    while x < m:
        xbins.append(x)
        x = x + dx
    matplotlib.rcParams['font.family'] = 'Open Sans'
    plt.figure(figsize=(25.0, 6.0), dpi=300)
    ax = plt.subplot(111)
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.semilogy(xbins, health, alpha=1.0, color="#36845b")
    plt.semilogy(xbins, transport, alpha=1.0, color="#a3d9bc")
    plt.semilogy(xbins, education, alpha=1.0, color="#808080")
    plt.semilogy(xbins, sci_env, alpha=1.0, color="#305496")
    plt.semilogy(xbins, party, alpha=1.0, color="#ffc000")
    plt.tick_params(axis='y', labelsize=25)
    plt.tick_params(axis='x', labelsize=25)
    plt.legend(('Health', "Transport", "Education", "Science/Env", "Political"), fontsize=25)
    plt.xlim((0, 225))
    plt.ylabel('Number of tweets', fontsize=25)
    plt.xlabel('Days in past', fontsize=25)
    plt.savefig("topics.png", bbox_inches='tight')

def http_retweets(cursor):
    # Compute the percentage of each MP's tweets that contain a web link, store
    # it in user_names.http_tweets, then plot the distribution per party.
    users = db_get_all_users(cursor)
    v = []
    update = False
    if update == True:
        for i in range(0, len(users)):
            u = users[i]
            print(u)
            cur_time = time.time()
            tweets = db_get_cols_from_table(cursor, u, ["date", "tweet"])
            http = 0
            for ii in range(0, len(tweets)):
                t = tweets[ii][1]
                delta = (cur_time - int(tweets[ii][0])) / 60 / 60 / 24
                if delta < 100.0:
                    if t.count("http"):
                        http = http + 1
            if http != 0:
                # Note: the denominator is all stored tweets, not just the last 100 days.
                frac = 100.0 * http / len(tweets)
            else:
                frac = 0.0
            db_update_record(cursor, "user_names", "user_id", u, "http_tweets", str(frac))
            db_commit()
    tweets_per_day = db_get_cols_from_table(cursor, "user_names", ["http_tweets", "job_type1"])
    print(tweets_per_day)
    con = []
    lab = []
    lib = []
    snp = []
    for i in range(0, len(tweets_per_day)):
        party = tweets_per_day[i][1]
        frac = float(tweets_per_day[i][0])      # stored as a float string
        if party.startswith("con"):
            con.append(frac)
        if party.startswith("lab"):
            lab.append(frac)
        if party.startswith("lib"):
            lib.append(frac)
        if party.startswith("snp"):
            snp.append(frac)
        v.append(frac)
    m = 100
    dx = 1.0
    x = 0.0
    xbins = []
    while x < m:
        xbins.append(x)
        x = x + dx
    plt.figure(figsize=(25.0, 6.0), dpi=300)
    plt.title("Percentage of tweets which contain web links", fontsize=30)
    plt.gcf().subplots_adjust(bottom=0.15)
    plt.hist(v, bins=xbins, alpha=0.5, color='green')
    plt.hist(con, bins=xbins, alpha=0.8, color='blue')
    plt.hist(lab, bins=xbins, alpha=0.8, color='red')
    plt.hist(snp, bins=xbins, alpha=0.8, color='purple')
    plt.hist(lib, bins=xbins, alpha=0.8, color='yellow')
    plt.legend(('All', 'Con', 'Lab', 'SNP', "Lib"), fontsize=25)
    plt.ylabel('Number of MPs', fontsize=25)
    plt.xlabel('Percent', fontsize=25)
    plt.savefig("/var/www/html/graphs/http_tweets.png", bbox_inches='tight')