Code Example #1
File: triangulation.py Project: rjaragon53/Policalc
    def __init__(self):
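        """Triangulate each gathered tweet against its recorded location and
        keep only the tweets that pass, writing them to clean/final_tweets.json."""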

        json_data = {}
        get = gd.get_data()
        mod = md.modify_data()
        print('Triangulating tweets...')
        senators = get.senators()
        concerns = get.concerns()

        with open('raw/gathered_tweets.json', 'r') as json_file:
            data = json.load(json_file)

            for sen in senators:
                json_data[sen] = {}

                for con in concerns:
                    json_data[sen][con] = []

                    for i in range(len(data[sen][con])):
                        tweet = data[sen][con][i]['tweet_text2']
                        tweet = mod.remove_stopwords(tweet)

                        if self.triangulate(tweet,
                                            data[sen][con][i]['tweet_loc']):
                            json_data[sen][con].append(data[sen][con][i])

        with open('clean/final_tweets.json', 'w') as json_file:
            json.dump(json_data, json_file, indent=4, sort_keys=True)
Code Example #2
    def delete_local_files(self):
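        """Delete the locally generated data files, then the raw and cleaned
        RSS dumps."""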
        get = gd.get_data()
        _, f_path = get.file_data()  # only the file paths are needed here
        for fpath in f_path:
            os.remove(fpath)

        os.remove('raw/raw_rss.txt')
        os.remove('clean/clean_rss.txt')
Code Example #3
    def create_tables(self):
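        """Create one blob-storage table per tracked file. Assumes the tables
        do not exist yet; CREATE TABLE fails otherwise."""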

        get = gd.get_data()
        f_name, _ = get.file_data(True)  # only the table names are needed here
        conn = sqlite3.connect('policalc.db')
        db_con = conn.cursor()

        for name in f_name:
            # Table names cannot be bound as SQL parameters; they come from
            # the application's own file list, not from user input.
            query = ("CREATE TABLE {} (id INTEGER PRIMARY KEY AUTOINCREMENT, "
                     "date datetime, file blob)").format(name)
            db_con.execute(query)

        conn.close()
Code Example #4
    def get_all_file(self):
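        """Fetch the newest row from each table and restore its blob to the
        matching local file path."""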

        conn = sqlite3.connect('policalc.db')
        db_con = conn.cursor()
        get = gd.get_data()
        f_name, f_path = get.file_data()

        for i in range(len(f_path)):
            db_con.execute("SELECT * FROM {} ORDER BY id DESC LIMIT 1;".format(f_name[i]))
            db_data = db_con.fetchone()

            with open(f_path[i], 'wb') as file:
                file.write(db_data[2])  # column index 2 holds the file blob

        conn.close()
Code Example #5
File: app.py Project: Apollo1207/algo-lab-3
def main():
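    """For every non-client vertex, run dejkstra_algorithm and take the largest
    shortest-path latency to any client; keep and write out the smallest such
    maximum."""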
    n, m, clients, graph = get_data('in_files/first_in')
    min_max_latency = None
    for vertex_id in graph.connections:
        if vertex_id not in clients:
            current_latencies = dejkstra_algorithm(graph, vertex_id)
            current_max_latency = max(
                [current_latencies[client] for client in clients])
            if min_max_latency is None or current_max_latency < min_max_latency:
                min_max_latency = current_max_latency

    write_data('out_files/first_out', min_max_latency)

    return min_max_latency
Code Example #6
    def insert_all_file(self):
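        """Read each local file as bytes and insert it as a new row; passing
        None for id lets the AUTOINCREMENT column assign one."""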

        get = gd.get_data()
        f_name, f_path = get.file_data(True)
        conn = sqlite3.connect('policalc.db')
        db_con = conn.cursor()

        for i in range(len(f_path)):

            with open(f_path[i], 'rb') as file:
                blob_file = file.read()

            db_con.execute("INSERT INTO {} VALUES (:id, :date, :file)".format(f_name[i]), {'id': None, 'date': dt.now(), 'file': blob_file})
            conn.commit()

        conn.close()
Code Example #7
File: get_fc_inf.py Project: rjaragon53/Policalc
    def __init__(self):
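        """Seed the final concern list with the first three configured
        concerns, merge stored and freshly gathered concern scores, add the
        top three scoring concerns not already listed, and persist both the
        merged scores and the final list."""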
        get = gd.get_data()
        concerns = get.concerns()
        # Seed the final list with the first three configured concerns.
        final_concerns = list(concerns[:3])

        dbs = dbase.access_db()
        dbs.get_file('twitter_concerns_inf', 'DB/twitter_concerns_inf.json')
        with open('DB/twitter_concerns_inf.json', 'r') as db_file:
            db_data = json.load(db_file)

            with open('raw/twitter_concerns.json', 'r') as tc_file:
                tc_data = json.load(tc_file)

                with open('raw/twitter_concerns_inf.json', 'w') as js_file:
                    js_data = {}

                    for i in db_data:
                        js_data[i] = db_data[i] + tc_data[i]

                    top_list = sorted(js_data.items(),
                                      key=lambda kv: kv[1],
                                      reverse=True)

                    # Keep the three highest-scoring merged concerns.
                    for name, _score in top_list[:3]:
                        print(name, final_concerns)
                        if name not in final_concerns:
                            final_concerns.append(name)

                    json.dump(js_data, js_file, indent=4, sort_keys=True)

        with open('clean/final_concerns_inf.txt', 'a') as final:
            for final_con in final_concerns:
                final.write(final_con + '\n')

        os.remove('DB/twitter_concerns_inf.json')
Code Example #8
    def count_response(self, con_list):
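        """Search Twitter for each variant of a concern across all configured
        coordinates, de-duplicate hits by cleaned text and tweet id, log them
        to raw/response.txt, and return the total count."""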

        get = gd.get_data()
        mod = md.modify_data()
        tso = ts.TwitterSearchOrder()
        tso.arguments.update({'tweet_mode': 'extended'})
        api = get.api()
        coordinates = get.coordinates()
        con_count = 0
        respo_list = []
        respo_loc = []

        for con in con_list:
            print('\tCounting ' + con + '...')
            tso.set_keywords([con])

            for coordinate in coordinates:
                tso.set_geocode(coordinate['lat'], coordinate['long'], 5, False)

                for tweet in api.search_tweets_iterable(tso, callback=self.avoid_rate_limit):
                    try:
                        tweet_text = tweet['retweeted_status']['full_text']
                    except KeyError:
                        tweet_text = tweet['full_text']

                    cleaned_tweet = mod.clean_tweet(tweet_text)
                    temp_res = cleaned_tweet + ' --- ' + tweet['id_str']
                    if temp_res not in respo_list:
                        respo_list.append(temp_res)
                        respo_loc.append(coordinate['city'])
                        con_count += 1

        with open('raw/response.txt', 'a') as res:
            print('Total: ' + str(con_count))
            res.write(con_list[0] + ': ' + str(con_count) + '\n')
            for resp_text, loc in zip(respo_list, respo_loc):
                res.write(resp_text + ' (' + loc + ')\n')
            res.write('\n')

        return con_count
Code Example #9
    def __init__(self):
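        """Score every filtered tweet's sentiment with TextBlob, weight it by
        author credibility via check_score, accumulate totals per
        senator/concern pair, merge them into the stored influence scores,
        and print the most common keywords for each pair."""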

        get = gd.get_data()
        mod = md.modify_data()
        dbs = dbase.access_db()
        json_data = {}

        dbs.get_file('tweet_scores_inf', 'DB/clean/tweet_scores_inf.json')
        with open('DB/clean/tweet_scores_inf.json', 'r') as json_file:
            dbs_data = json.load(json_file)

        with open('clean/final_tweets.json', 'r') as json_file:
            data = json.load(json_file)

            senators = get.senators()
            concerns = get.concerns()

            for sen in senators:
                for con in concerns:
                    json_data[sen + ' - ' + con] = []
                    total_tweets = len(data[sen][con])
                    pos = 0
                    neg = 0
                    neu = 0
                    pos_tweets = []
                    neg_tweets = []
                    neu_tweets = []

                    for i in range(total_tweets):
                        tweet = data[sen][con][i]['tweet_text2']
                        text = TextBlob(tweet)
                        polarity = text.sentiment.polarity
                        score = self.check_score(data[sen][con][i]['user_verified'],
                                                 data[sen][con][i]['user_created'],
                                                 data[sen][con][i]['user_follower'],
                                                 data[sen][con][i]['is_retweet'])

                        if polarity >= 0.1:
                            pos += score
                            pos_tweets.append(tweet)
                            print('POSITIVE', polarity, tweet)
                        elif polarity <= -0.1:
                            neg += score
                            neg_tweets.append(tweet)
                            print('NEGATIVE', polarity, tweet)
                        else:
                            neu += score
                            neu_tweets.append(tweet)
                            print('NEUTRAL', polarity, tweet)

                        # Collect content words (nouns, verbs, adjectives) for
                        # the keyword summary printed below.
                        with open('common_words.txt', 'a') as common_words:
                            tweet = mod.translate(tweet)
                            tweet = mod.remove_stopwords(tweet)
                            text = nltk.word_tokenize(tweet)
                            posTagged = pos_tag(text)
                            result = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in posTagged]

                            for res in result:
                                if res[1] == 'NOUN' or res[1] == 'VERB' or res[1] == 'ADJ':
                                    if res[0] != sen and res[0] not in con:
                                        text = res[0] + ' '
                                        common_words.write(text)

                    total = pos + neg + neu

                    json_data[sen + ' - ' + con].append({
                        'pos': pos, 'neg': neg, 'neu': neu, 'total': total, 'num_tweets': total_tweets,
                        'pos_tweets': pos_tweets, 'neg_tweets': neg_tweets, 'neu_tweets': neu_tweets
                    })
                    try:
                        for pt in pos_tweets:
                            dbs_data[sen + ' - ' + con][0]['pos_tweets'].append(pt)
                        for nt in neg_tweets:
                            dbs_data[sen + ' - ' + con][0]['neg_tweets'].append(nt)
                        for nt in neu_tweets:
                            dbs_data[sen + ' - ' + con][0]['neu_tweets'].append(nt)

                        dbs_data[sen + ' - ' + con][0]['pos'] += pos
                        dbs_data[sen + ' - ' + con][0]['neg'] += neg
                        dbs_data[sen + ' - ' + con][0]['neu'] += neu

                    except KeyError:
                        # Pair not in the stored scores yet: create it so the
                        # merged influence file still covers this pair.
                        dbs_data[sen + ' - ' + con] = [{
                            'pos': pos, 'neg': neg, 'neu': neu, 'total': total, 'num_tweets': total_tweets,
                            'pos_tweets': pos_tweets, 'neg_tweets': neg_tweets, 'neu_tweets': neu_tweets
                        }]

                    if total != 0:
                        print(sen + ' - ' + con)
                        print('Positive: ' + str(round(pos/total*100, 2)) +
                              '%\nNegative: ' + str(round(neg/total*100, 2)) +
                              '%\nNeutral: ' + str(round(neu/total*100, 2)) + '%')

                        with open('common_words.txt') as cw_file:
                            words = re.findall(r'\w+', cw_file.read().lower())
                        count = Counter(words).most_common(3)
                        common = ''
                        for cnt in count:
                            common = common + cnt[0] + ' '
                        print('General Keywords: ' + common)
                        os.remove("common_words.txt")

                        print('From ' + str(total_tweets) + ' tweets.\n')

        with open('clean/tweet_scores.json', 'w') as json_file:
            json.dump(json_data, json_file, indent=4, sort_keys=True)

        with open('clean/tweet_scores_inf.json', 'w') as json_file:
            json.dump(dbs_data, json_file, indent=4, sort_keys=True)

        os.remove("DB/clean/tweet_scores_inf.json")
Code Example #10
    def __init__(self):
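        """Search Twitter for every senator/concern keyword pair across all
        configured coordinates, de-duplicate tweets, store the raw and cleaned
        records, and build the cleaned-text dictionary used for triangulation."""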

        print('Gathering tweets with political context...')
        get = gd.get_data()
        mod = md.modify_data()
        api = get.api()
        tso = ts.TwitterSearchOrder()
        tso.arguments.update({'tweet_mode': 'extended'})
        res_list = []
        res_dict = {}
        json_data = {}
        senators = get.senators()
        concerns = get.concerns()
        coordinates = get.coordinates()

        for senator in senators:
            json_data[senator] = {}
            print('Gathering tweets mentioning ' + senator + '...')

            for concern in concerns:
                json_data[senator][concern] = []
                # A concern holds up to three ", "-separated keyword variants;
                # search on each of them.
                con_list = concern.split(', ')[:3]
                print('\t' + concern + '...')

                for con_item in con_list:
                    tso.set_keywords([senator, con_item])

                    for coordinate in coordinates:
                        tso.set_geocode(coordinate['lat'], coordinate['long'], 5, False)

                        for tweet in api.search_tweets_iterable(tso, callback=self.avoid_rate_limit):
                            try:
                                tweet_text = tweet['retweeted_status']['full_text']
                                is_retweet = True
                            except KeyError:
                                tweet_text = tweet['full_text']
                                is_retweet = False

                            res_text = tweet['id_str'] + ': ' + tweet_text
                            if res_text not in res_list:
                                res_list.append(res_text)

                                if tweet['is_quote_status']:
                                    if is_retweet:
                                        quote_text = tweet['retweeted_status']['quoted_status']['full_text']
                                    else:
                                        quote_text = tweet['quoted_status']['full_text']
                                else:
                                    quote_text = None

                                tweet_text2 = mod.clean_tweet(tweet_text)
                                tweet_text2 = mod.translate(tweet_text2)

                                if tweet_text2 is None:
                                    continue

                                if quote_text is not None:
                                    quote_text2 = mod.clean_tweet(quote_text)
                                    quote_text2 = mod.translate(quote_text2)
                                else:
                                    quote_text2 = None

                                json_data[senator][concern].append({
                                    'tweet_text': tweet_text,
                                    'tweet_text2': tweet_text2,
                                    'is_retweet': is_retweet,
                                    'quote_text': quote_text,
                                    'quote_text2': quote_text2,
                                    'tweet_id': tweet['id'],
                                    'rt_count': tweet['retweet_count'],
                                    'tweet_created': tweet['created_at'],
                                    'tweet_loc': coordinate['city'],
                                    'user_id': tweet['user']['id'],
                                    'user_created': tweet['user']['created_at'],
                                    'user_verified': tweet['user']['verified'],
                                    'user_follower': tweet['user']['followers_count'],
                                    'user_total_tweet': tweet['user']['statuses_count'],
                                    'user_loc': tweet['user']['location']
                                })

                                res_tweet = mod.remove_stopwords(tweet_text2)
                                if quote_text2 is not None:
                                    res_dict = self.initialize_triangulation(
                                        res_dict, res_tweet + ' ' + quote_text2 + ' ' + coordinate['city'])
                                else:
                                    res_dict = self.initialize_triangulation(
                                        res_dict, res_tweet + ' ' + coordinate['city'])

        print('Saving collected tweets into "gathered_tweets.json" file...')
        self.save_tweet(json_data)
        self.save_cleaned_tweet(res_dict)
        print('Finished gathering tweets with political context...')
Code Example #11
File: app.py Project: hapoh/football_scraping
from modules.get_data import get_data
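# get_urls, download_urls and get_paths are used below; in the full project
# they come from its other modules (their imports are not part of this excerpt).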

import os
from glob import glob
import pandas as pd
from unidecode import unidecode

# Download HTML data
url = "https://www.ligue1.fr/listejoueurs?seasonId="
urls = get_urls(url, 100)
download_urls(urls)

# Extract and format data (1 file per season)
path = "html/LIGUE1/seasonId="
paths = get_paths(path, 100)
get_data(paths)

# Append all csv files into one file
extension = "csv"
fichiers = glob(os.path.join("csv/LIGUE1/sources", "*." + extension))
combined_csv = pd.concat([pd.read_csv(f) for f in fichiers])

# Ensure the output directory exists, then write the combined file once.
os.makedirs("csv/LIGUE1", exist_ok=True)
combined_csv.to_csv("csv/LIGUE1/liste_joueurs.csv",
                    index=False,
                    encoding='utf-8-sig')