def __init__(self):
    json_data = {}
    get = gd.get_data()
    mod = md.modify_data()
    print('Triangulating tweets...')
    senators = get.senators()
    concerns = get.concerns()
    # Load the raw tweets gathered per senator and concern.
    with open('raw/gathered_tweets.json', 'r') as json_file:
        data = json.load(json_file)
    for sen in senators:
        json_data[sen] = {}
        for con in concerns:
            json_data[sen][con] = []
            for i in range(len(data[sen][con])):
                tweet = data[sen][con][i]['tweet_text2']
                tweet = mod.remove_stopwords(tweet)
                # Keep only tweets whose text passes triangulation against their recorded location.
                if self.triangulate(tweet, data[sen][con][i]['tweet_loc']):
                    json_data[sen][con].append(data[sen][con][i])
    with open('clean/final_tweets.json', 'w') as json_file:
        json.dump(json_data, json_file, indent=4, sort_keys=True)
def delete_local_files(self):
    get = gd.get_data()
    f_name, f_path = get.file_data()
    for fpath in f_path:
        os.remove(fpath)
    os.remove('raw/raw_rss.txt')
    os.remove('clean/clean_rss.txt')
def create_tables(self):
    get = gd.get_data()
    f_name, f_path = get.file_data(True)
    conn = sqlite3.connect('policalc.db')
    db_con = conn.cursor()
    # One table per tracked file, each row holding a timestamped blob copy of that file.
    for name in f_name:
        query = """CREATE TABLE {} (id INTEGER PRIMARY KEY AUTOINCREMENT, date datetime, file blob)""".format(name)
        db_con.execute(query)
    conn.close()
def get_all_file(self):
    conn = sqlite3.connect('policalc.db')
    db_con = conn.cursor()
    get = gd.get_data()
    f_name, f_path = get.file_data()
    for i in range(len(f_path)):
        db_con.execute("SELECT * FROM {} ORDER BY id DESC LIMIT 1;".format(f_name[i]))
        db_data = db_con.fetchone()
        with open(f_path[i], 'wb') as file:
            file.write(db_data[2])
    conn.close()
def main():
    n, m, clients, graph = get_data('in_files/first_in')
    min_max_latency = None
    # Try every non-client vertex as the server location and keep the one whose
    # worst-case (maximum) latency to any client is smallest.
    for vertex_id in graph.connections:
        if vertex_id not in clients:
            current_latencies = dejkstra_algorithm(graph, vertex_id)
            current_max_latency = max(current_latencies[client] for client in clients)
            if min_max_latency is None or current_max_latency < min_max_latency:
                min_max_latency = current_max_latency
    write_data('out_files/first_out', min_max_latency)
    return min_max_latency
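# A minimal sketch of the dejkstra_algorithm helper assumed by main() above:
# heap-based Dijkstra returning {vertex_id: shortest latency from start_id}.
# The graph.connections layout used here ({vertex: {neighbour: weight}}) is an
# assumption, not necessarily the project's actual graph class.
import heapq

def dejkstra_algorithm_sketch(graph, start_id):
    latencies = {vertex: float('inf') for vertex in graph.connections}
    latencies[start_id] = 0
    heap = [(0, start_id)]
    while heap:
        dist, vertex = heapq.heappop(heap)
        if dist > latencies[vertex]:
            continue  # stale heap entry; a shorter path was already found
        for neighbour, weight in graph.connections[vertex].items():
            candidate = dist + weight
            if candidate < latencies[neighbour]:
                latencies[neighbour] = candidate
                heapq.heappush(heap, (candidate, neighbour))
    return latencies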
def insert_all_file(self):
    get = gd.get_data()
    f_name, f_path = get.file_data(True)
    conn = sqlite3.connect('policalc.db')
    db_con = conn.cursor()
    for i in range(len(f_path)):
        with open(f_path[i], 'rb') as file:
            blob_file = file.read()
        db_con.execute("INSERT INTO {} VALUES (:id, :date, :file)".format(f_name[i]),
                       {'id': None, 'date': dt.now(), 'file': blob_file})
    conn.commit()
    conn.close()
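# Usage sketch for the sqlite helpers above. It assumes create_tables,
# insert_all_file, and get_all_file are methods of the access_db class that
# other modules obtain via dbase.access_db(); that placement is inferred from
# those call sites, not confirmed here.
def sync_files_sketch():
    dbs = dbase.access_db()
    dbs.create_tables()      # first run only: one blob table per tracked file
    dbs.insert_all_file()    # push the current local files into policalc.db
    dbs.get_all_file()       # later: restore the newest stored copy of each file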
def __init__(self):
    get = gd.get_data()
    concerns = get.concerns()
    final_concerns = []
    limit = 0
    # Start with the first three predefined concerns.
    for con in concerns:
        if limit < 3:
            final_concerns.append(con)
            limit += 1
    dbs = dbase.access_db()
    # Merge the stored (influencer) concern counts with the newly gathered ones.
    dbs.get_file('twitter_concerns_inf', 'DB/twitter_concerns_inf.json')
    with open('DB/twitter_concerns_inf.json', 'r') as db_file:
        db_data = json.load(db_file)
    with open('raw/twitter_concerns.json', 'r') as tc_file:
        tc_data = json.load(tc_file)
    with open('raw/twitter_concerns_inf.json', 'w') as js_file:
        js_data = {}
        for i in db_data:
            js_data[i] = db_data[i] + tc_data[i]
        # Add up to three of the highest-scoring concerns that are not already listed.
        top_list = sorted(js_data.items(), key=lambda kv: kv[1], reverse=True)
        limit = 0
        for i in range(len(top_list)):
            if limit < 3:
                print(top_list[i][0], final_concerns)
                if top_list[i][0] not in final_concerns:
                    final_concerns.append(top_list[i][0])
                    limit += 1
        json.dump(js_data, js_file, indent=4, sort_keys=True)
    with open('clean/final_concerns_inf.txt', 'a') as final:
        for final_con in final_concerns:
            final.write(final_con + '\n')
    os.remove('DB/twitter_concerns_inf.json')
def count_response(self, con_list):
    get = gd.get_data()
    mod = md.modify_data()
    tso = ts.TwitterSearchOrder()
    tso.arguments.update({'tweet_mode': 'extended'})
    api = get.api()
    coordinates = get.coordinates()
    con_count = 0
    respo_list = []
    respo_loc = []
    for con in con_list:
        print('\tCounting ' + con + '...')
        tso.set_keywords([con])
        for coordinate in coordinates:
            tso.set_geocode(coordinate['lat'], coordinate['long'], 5, False)
            for tweet in api.search_tweets_iterable(tso, callback=self.avoid_rate_limit):
                try:
                    tweet_text = tweet['retweeted_status']['full_text']
                except KeyError:
                    tweet_text = tweet['full_text']
                cleaned_tweet = mod.clean_tweet(tweet_text)
                temp_res = cleaned_tweet + ' --- ' + tweet['id_str']
                # Count each unique tweet only once across keywords and locations.
                if temp_res not in respo_list:
                    respo_list.append(temp_res)
                    respo_loc.append(coordinate['city'])
                    con_count += 1
    with open('raw/response.txt', 'a') as res:
        print('Total: ' + str(con_count))
        res.write(con_list[0] + ': ' + str(con_count) + '\n')
        for i in range(con_count):
            response = respo_list[i] + ' (' + respo_loc[i] + ')'
            res.write(response + '\n')
        res.write('\n')
    return con_count
def __init__(self):
    get = gd.get_data()
    mod = md.modify_data()
    dbs = dbase.access_db()
    json_data = {}
    # Pull the running (influencer) score file from the database so new scores can be merged into it.
    dbs.get_file('tweet_scores_inf', 'DB/clean/tweet_scores_inf.json')
    with open('DB/clean/tweet_scores_inf.json', 'r') as json_file:
        dbs_data = json.load(json_file)
    with open('clean/final_tweets.json', 'r') as json_file:
        data = json.load(json_file)
    senators = get.senators()
    concerns = get.concerns()
    for sen in senators:
        for con in concerns:
            json_data[sen + ' - ' + con] = []
            total_tweets = len(data[sen][con])
            pos = 0
            neg = 0
            neu = 0
            pos_tweets = []
            neg_tweets = []
            neu_tweets = []
            for i in range(total_tweets):
                tweet = data[sen][con][i]['tweet_text2']
                text = TextBlob(tweet)
                # Weight each tweet by account credibility before adding it to a sentiment bucket.
                score = self.check_score(data[sen][con][i]['user_verified'],
                                         data[sen][con][i]['user_created'],
                                         data[sen][con][i]['user_follower'],
                                         data[sen][con][i]['is_retweet'])
                if text.sentiment.polarity >= 0.1:
                    pos += score
                    pos_tweets.append(tweet)
                    print('POSITIVE', text.sentiment.polarity, tweet)
                elif text.sentiment.polarity <= -0.1:
                    neg += score
                    neg_tweets.append(tweet)
                    print('NEGATIVE', text.sentiment.polarity, tweet)
                else:
                    neu += score
                    neu_tweets.append(tweet)
                    print('NEUTRAL', text.sentiment.polarity, tweet)
                # Collect nouns, verbs, and adjectives for the keyword summary.
                with open('common_words.txt', 'a') as common_words:
                    tweet = mod.translate(tweet)
                    tweet = mod.remove_stopwords(tweet)
                    text = nltk.word_tokenize(tweet)
                    posTagged = pos_tag(text)
                    result = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in posTagged]
                    for res in result:
                        if res[1] == 'NOUN' or res[1] == 'VERB' or res[1] == 'ADJ':
                            if res[0] != sen and res[0] not in con:
                                text = res[0] + ' '
                                common_words.write(text)
            total = pos + neg + neu
            json_data[sen + ' - ' + con].append({
                'pos': pos,
                'neg': neg,
                'neu': neu,
                'total': total,
                'num_tweets': total_tweets,
                'pos_tweets': pos_tweets,
                'neg_tweets': neg_tweets,
                'neu_tweets': neu_tweets
            })
            # Merge the new buckets into the stored influencer scores; if the key is new, keep the fresh entry.
            try:
                for pt in pos_tweets:
                    dbs_data[sen + ' - ' + con][0]['pos_tweets'].append(pt)
                for nt in neg_tweets:
                    dbs_data[sen + ' - ' + con][0]['neg_tweets'].append(nt)
                for nt in neu_tweets:
                    dbs_data[sen + ' - ' + con][0]['neu_tweets'].append(nt)
                dbs_data[sen + ' - ' + con][0]['pos'] += pos
                dbs_data[sen + ' - ' + con][0]['neg'] += neg
                dbs_data[sen + ' - ' + con][0]['neu'] += neu
            except KeyError:
                json_data[sen + ' - ' + con] = []
                json_data[sen + ' - ' + con].append({
                    'pos': pos,
                    'neg': neg,
                    'neu': neu,
                    'total': total,
                    'num_tweets': total_tweets,
                    'pos_tweets': pos_tweets,
                    'neg_tweets': neg_tweets,
                    'neu_tweets': neu_tweets
                })
            if total != 0:
                print(sen + ' - ' + con)
                print('Positive: ' + str(round(pos / total * 100, 2)) + '%\nNegative: '
                      + str(round(neg / total * 100, 2)) + '%\nNeutral: '
                      + str(round(neu / total * 100, 2)) + '%')
                with open('common_words.txt') as common_words:
                    words = re.findall(r'\w+', common_words.read().lower())
                count = Counter(words).most_common(3)
                common = ''
                for cnt in count:
                    common = common + cnt[0] + ' '
                print('General Keywords: ' + common)
                os.remove("common_words.txt")
                print('From ' + str(total_tweets) + ' tweets.\n')
    with open('clean/tweet_scores.json', 'w') as json_file:
        json.dump(json_data, json_file, indent=4, sort_keys=True)
    with open('clean/tweet_scores_inf.json', 'w') as json_file:
        json.dump(dbs_data, json_file, indent=4, sort_keys=True)
    os.remove("DB/clean/tweet_scores_inf.json")
def __init__(self):
    print('Gathering tweets with political context...')
    get = gd.get_data()
    mod = md.modify_data()
    api = get.api()
    tso = ts.TwitterSearchOrder()
    tso.arguments.update({'tweet_mode': 'extended'})
    res_list = []
    res_dict = {}
    json_data = {}
    senators = get.senators()
    concerns = get.concerns()
    coordinates = get.coordinates()
    for senator in senators:
        json_data[senator] = {}
        print('Gathering tweets mentioning ' + senator + '...')
        for concern in concerns:
            json_data[senator][concern] = []
            # Each concern string holds its English, Tagalog, and (optionally) Cebuano forms.
            con_en = concern.split(',')[0]
            try:
                con_tl = concern.split(', ')[1]
                con_cb = concern.split(', ')[2]
                con_list = [con_en, con_tl, con_cb]
            except IndexError:
                con_tl = concern.split(', ')[1]
                con_cb = None
                con_list = [con_en, con_tl]
            print('\t' + concern + '...')
            for con_item in con_list:
                tso.set_keywords([senator, con_item])
                for coordinate in coordinates:
                    tso.set_geocode(coordinate['lat'], coordinate['long'], 5, False)
                    for tweet in api.search_tweets_iterable(tso, callback=self.avoid_rate_limit):
                        try:
                            tweet_text = tweet['retweeted_status']['full_text']
                            is_retweet = True
                        except KeyError:
                            tweet_text = tweet['full_text']
                            is_retweet = False
                        res_text = tweet['id_str'] + ': ' + tweet_text
                        # Skip tweets already collected for another keyword or location.
                        if res_text not in res_list:
                            res_list.append(res_text)
                            if tweet['is_quote_status']:
                                if is_retweet:
                                    quote_text = tweet['retweeted_status']['quoted_status']['full_text']
                                else:
                                    quote_text = tweet['quoted_status']['full_text']
                            else:
                                quote_text = None
                            tweet_text2 = mod.clean_tweet(tweet_text)
                            tweet_text2 = mod.translate(tweet_text2)
                            if tweet_text2 is None:
                                continue
                            if quote_text is not None:
                                quote_text2 = mod.clean_tweet(quote_text)
                                quote_text2 = mod.translate(quote_text2)
                            else:
                                quote_text2 = None
                            json_data[senator][concern].append({
                                'tweet_text': tweet_text,
                                'tweet_text2': tweet_text2,
                                'is_retweet': is_retweet,
                                'quote_text': quote_text,
                                'quote_text2': quote_text2,
                                'tweet_id': tweet['id'],
                                'rt_count': tweet['retweet_count'],
                                'tweet_created': tweet['created_at'],
                                'tweet_loc': coordinate['city'],
                                'user_id': tweet['user']['id'],
                                'user_created': tweet['user']['created_at'],
                                'user_verified': tweet['user']['verified'],
                                'user_follower': tweet['user']['followers_count'],
                                'user_total_tweet': tweet['user']['statuses_count'],
                                'user_loc': tweet['user']['location']
                            })
                            res_tweet = mod.remove_stopwords(tweet_text2)
                            if quote_text2 is not None:
                                res_dict = self.initialize_triangulation(
                                    res_dict, res_tweet + ' ' + quote_text2 + ' ' + coordinate['city'])
                            else:
                                res_dict = self.initialize_triangulation(
                                    res_dict, res_tweet + ' ' + coordinate['city'])
    print('Saving collected tweets into "gathered_tweets.json" file...')
    self.save_tweet(json_data)
    self.save_cleaned_tweet(res_dict)
    print('Finished gathering tweets with political context...')
from modules.get_data import get_data
import os
from glob import glob
import pandas as pd
from unidecode import unidecode

# Download HTML data
url = "https://www.ligue1.fr/listejoueurs?seasonId="
urls = get_urls(url, 100)
download_urls(urls)

# Extract and format data (1 file per season)
path = "html/LIGUE1/seasonId="
paths = get_paths(path, 100)
get_data(paths)

# Append all CSV files into one file
extension = "csv"
fichiers = glob(os.path.join("csv/LIGUE1/sources", "*"))
combined_csv = pd.concat([pd.read_csv(f) for f in fichiers])
os.makedirs("csv/LIGUE1", exist_ok=True)
combined_csv.to_csv("csv/LIGUE1/liste_joueurs.csv", index=False, encoding='utf-8-sig')
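# Hedged sketch only: get_urls, download_urls, and get_paths come from the
# project's own modules and are not shown in this section. One plausible shape
# for get_urls is appending consecutive seasonId values to the base listing URL;
# the real implementation may differ.
def get_urls_sketch(base_url, count):
    # e.g. https://www.ligue1.fr/listejoueurs?seasonId=0 ... seasonId=<count-1>
    return [base_url + str(season_id) for season_id in range(count)]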