import covid19_data_downloader
from db_utils.data_inserter import DataInserter  # path as in Example #7


def insert_get_countries(dataset):
    """Collect each unique country from the dataset and insert the batch
    into the database via DataInserter.insert_country."""
    parsed_countries = []
    countries = []

    # Skip the header row; the country name sits in column 1.
    for row in dataset[1:]:
        country = row[1]

        if country not in parsed_countries:
            parsed_countries.append(country)

            # Resolve the ISO 3166-1 alpha-3 code; state and population are
            # unknown at the country level.
            iso3 = covid19_data_downloader.get_iso3_country(country)
            state = ''
            population = None

            countries.append((country, iso3, state, population))

    inserter = DataInserter()

    return inserter.insert_country(countries)
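# Usage sketch (an assumption, not part of the original snippet): the input
# is a parsed CSV, i.e. a list of rows whose second column holds the country
# name. The file name below is hypothetical.
import csv

with open('time_series_covid19_confirmed_global.csv', newline='') as f:
    dataset = list(csv.reader(f))

result = insert_get_countries(dataset)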
Example #2
# get_iso3_country, us_states and DataInserter are imported elsewhere in the
# original module and are not shown in this excerpt.
def insert_get_countries(global_conf, us_deaths):
    """Collect unique countries from the global dataset and unique US states
    from the US deaths dataset, then insert them into the database."""
    parsed_countries = []
    parsed_states = []
    countries = []

    # Global file: skip the header; the country name sits in column 1.
    for row in global_conf[1:]:
        country = row[1]

        if country not in parsed_countries:
            parsed_countries.append(country)

            iso3 = get_iso3_country(country)
            state = ''
            population = None

            countries.append((country, iso3, state, population))

    # US file: skip the header; the columns hold the state (6), country (7),
    # ISO3 code (2) and population (11).
    for row in us_deaths[1:]:
        state = row[6]

        if state not in parsed_states:
            parsed_states.append(state)

            state_code = us_states.state_to_code(state)

            country = row[7]
            iso3 = row[2]
            population = row[11]

            # Fall back to the full state name when no two-letter code exists.
            if state_code is None:
                state_code = state
            countries.append((country, iso3, state_code, population))

    data_inserter = DataInserter()

    return data_inserter.insert_country(countries)
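# us_states.state_to_code is not shown in these examples; this stand-in
# sketches its assumed behavior (full state name -> two-letter USPS code,
# None when the name is unknown, matching the None check above). The mapping
# is truncated for brevity.
_STATE_CODES = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'California': 'CA',
    'Wyoming': 'WY',
}

def state_to_code(state_name):
    # Returns the two-letter code, or None for names the table does not know.
    return _STATE_CODES.get(state_name)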
                    input_data = (country_id, date, 0, sum_deaths, 0)
                    parsed_data.append(input_data)

    # Normalize each date string in place; enumerate avoids the
    # duplicate-unsafe parsed_data.index() lookup.
    for i, d in enumerate(parsed_data):
        parsed_date = parse_date(d[1])
        parsed_data[i] = (d[0], parsed_date, d[2], d[3], d[4])

    return parsed_data


if __name__ == "__main__":

    data_inserter = DataInserter()

    directory = '../data/Covid-19-data/csse_covid_19_data/csse_covid_19_time_series'
    path_global_conf_file = directory + '/' + 'time_series_covid19_confirmed_global.csv'
    path_global_deaths_file = directory + '/' + 'time_series_covid19_deaths_global.csv'
    path_us_conf_file = directory + '/' + 'time_series_covid19_confirmed_US.csv'
    path_us_deaths_file = directory + '/' + 'time_series_covid19_deaths_US.csv'
    path_global_recov_file = directory + '/' + 'time_series_covid19_recovered_global.csv'

    data = parse_csv_files(
        path_global_conf_file, path_global_deaths_file, path_us_conf_file,
        path_us_deaths_file, path_global_recov_file
    )  # no start date given, so all rows are parsed; dates are formatted
       # 'M/D/YY' (no leading zeros, two-digit year)
    # print(data)
    data_inserter.insert_covid19_data(data)
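# parse_date is not shown in this excerpt; judging by the comment above, the
# raw dates look like '3/15/20'. A minimal compatible sketch, assuming the
# parser should return a datetime.date:
from datetime import datetime

def parse_date(raw):
    # strptime's %m/%d/%y also accepts values without leading zeros, so
    # '3/15/20' becomes datetime.date(2020, 3, 15).
    return datetime.strptime(raw, '%m/%d/%y').date()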
            # Collapse rows that share a (country, date) key by summing
            # their counts into the first matching entry.
            sum_deaths = sum(deaths)
            sum_recovs = sum(recovs)

            index = parsed_data.index(old_data[0])
            parsed_data[index] = (old_data[0][0], old_data[0][1], sum_confs,
                                  sum_deaths, sum_recovs)

        else:

            parsed_data.append((country_id, date, conf, death, recov))

    # Normalize each date string in place; enumerate avoids the
    # duplicate-unsafe parsed_data.index() lookup.
    for i, row in enumerate(parsed_data):
        parsed_date = parse_date(row[1])
        parsed_data[i] = (row[0], parsed_date, row[2], row[3], row[4])

    return parsed_data


if __name__ == "__main__":

    data_inserter = DataInserter()

    directory = '../data/SARS-03-data'
    file_path = directory + '/' + 'sars_2003_complete_dataset_clean.csv'

    data = parse_csv_files(file_path)
    # print(data)
    # data_inserter.insert_sars_data(data)
Example #6
import queue
import threading

from db_utils.data_inserter import DataInserter  # path as in Example #7
# DataSelector and DBManager come from the same project; their import paths
# are not shown in these examples.


class Pipe(threading.Thread):
    def __init__(self,
                 select_fun,
                 insert_fun,
                 worker_class,
                 lock=None,
                 db_m=None):
        threading.Thread.__init__(self)
        # Build the defaults per instance; Lock()/DBManager() in the
        # signature would be evaluated only once, at class definition time.
        self.new_data = queue.Queue()
        self.done_data = queue.Queue()
        self.db_m = db_m if db_m is not None else DBManager()
        self._db_access = lock if lock is not None else threading.Lock()
        self.db_insert = DataInserter(self.db_m)
        self.db_select = DataSelector(self.db_m)
        self.num_loc_threads = 2
        self.max_threads = 5  # 5 worked..
        self.batch_size = 10
        self.quota_exceeded = False
        # Only the names are stored; the bound methods are looked up on the
        # selector/inserter with getattr() when needed.
        self.select_fun = select_fun.__name__
        self.insert_fun = insert_fun.__name__
        self.worker_class = worker_class

    def get_new_data(self):
        # Drain up to batch_size pending items from the incoming queue.
        batch = []
        it = 0
        while it < self.batch_size and not self.new_data.empty():
            batch.append(self.new_data.get())
            it += 1
        return batch

    def get_done_data(self):
        # Drain up to batch_size processed items awaiting insertion.
        batch = []
        it = 0
        while it < self.batch_size and not self.done_data.empty():
            batch.append(self.done_data.get())
            it += 1
        return batch

    def put_new_data(self, data):
        for d in data:
            self.new_data.put(d)

    def put_done_data(self, data):
        for d in data:
            self.done_data.put(d)

    def stop(self):
        self.quota_exceeded = True

    def run(self):
        worker_threads = []
        epoch_count = 0
        select_scale = 5

        # Select an oversized first batch so the workers start with a backlog.
        with self._db_access:
            new_data = getattr(self.db_select, self.select_fun)(
                self.batch_size * select_scale)
        print('Data selected')
        self.put_new_data(new_data)
        thread_id = 0

        while not self.new_data.empty() and not self.quota_exceeded:
            print("----- Beginning " + str(epoch_count) + " epoch -----")
            worker_threads = [t for t in worker_threads if t.is_alive()]
            print("Active threads: " + str(len(worker_threads)))
            print("Data to process: " + str(self.new_data.qsize()))
            # Spawn a fixed number of local workers, each taking one batch.
            for i in range(self.num_loc_threads):
                thread = self.worker_class(thread_id, self.get_new_data(),
                                           self)
                thread.start()
                worker_threads.append(thread)
                thread_id += 1

            print("Processing started")

            if len(worker_threads) > self.max_threads:
                print('Too many to process, waiting..')
                # Join the older threads, keeping roughly the newest half
                # of them running.
                for t in worker_threads[:-self.max_threads // 2]:
                    t.join()
                print('Resuming...')

            print("Inserting started")
            print("Data to insert: " + str(self.done_data.qsize()))
            if not self.done_data.empty():
                with self._db_access:
                    while not self.done_data.empty():
                        getattr(self.db_insert, self.insert_fun)(
                            self.get_done_data())

            with self._db_access:
                new_data = getattr(self.db_select, self.select_fun)(
                    self.batch_size * select_scale)
            print('New data selected')
            self.put_new_data(new_data)
            epoch_count += 1

        print('--- No more data ---')
        print('Joining threads')
        for t in worker_threads:
            t.join()
        print("All thread finished, inserting last..")

        if not self.done_data.empty():
            with self._db_access:
                while not self.done_data.empty():
                    getattr(self.db_insert, self.insert_fun)(
                        self.get_done_data())
        print('--- Everything added ---')

    def run_one(self):
        epoch_count = 0

        with self._db_access:
            new_data = getattr(self.db_select, self.select_fun)(
                self.batch_size)
        print('Data selected')
        self.put_new_data(new_data)

        while not self.new_data.empty() and not self.quota_exceeded:
            print("----- Beginning " + str(epoch_count) + " epoch -----")
            print(self.quota_exceeded)

            worker = self.worker_class(1, self.get_new_data(), self)
            worker.start()
            print("Processing started")
            worker.join()
            print("Data to insert: " + str(self.done_data.qsize()))
            with self._db_access:
                getattr(self.db_insert, self.insert_fun)(
                    self.get_done_data())
                new_data = getattr(self.db_select, self.select_fun)(
                    self.batch_size)
            print('New data selected')
            self.put_new_data(new_data)
            epoch_count += 1

        if not self.done_data.empty():
            with self._db_access:
                while not self.done_data.empty():
                    getattr(self.db_insert, self.insert_fun)(
                        self.get_done_data())
        print('--- Everything added ---')
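
# Worker contract implied by Pipe.run() (a sketch under assumptions, not the
# project's actual worker): worker_class is constructed as
# worker_class(thread_id, batch, pipe), runs as a thread, and hands finished
# items back through pipe.put_done_data().
class EchoWorker(threading.Thread):
    def __init__(self, thread_id, batch, pipe):
        threading.Thread.__init__(self)
        self.thread_id = thread_id
        self.batch = batch
        self.pipe = pipe

    def run(self):
        # Real workers would transform the batch here (API calls, parsing...).
        self.pipe.put_done_data(list(self.batch))

# Hypothetical wiring; select_users / insert_users are placeholder method
# names, so this stays commented out:
# pipe = Pipe(DataSelector.select_users, DataInserter.insert_users, EchoWorker)
# pipe.start()
# pipe.join()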
Example #7
import json
import os
from db_utils.data_inserter import DataInserter

tw_data_dir = '../../data/tweets/'
rtw_data_dir = '../../data/retweets/'
usr_data_dir = '../../data/users/'

# Point data_dir at whichever of the three datasets should be loaded.
data_dir = usr_data_dir

if __name__ == "__main__":
    db = DataInserter()
    filenames = []
    # Skip macOS .DS_Store artifacts and files whose name contains '_in'.
    for filename in os.listdir(data_dir):
        if '.DS' not in filename and '_in' not in filename:
            filenames.append(filename)

    for filename in filenames:
        print(filename)
        with open(data_dir + filename) as f:
            data = json.load(f)
            # Dispatch on the directory name selected above.
            if 'retweets' in data_dir:
                db.insert_retweets(data)
            elif 'users' in data_dir:
                db.fast_insert_users(data)
            else:
                db.fast_insert_tweets(data)