def process_single_db(db_file):
    db = Database(db_file)
    users_fips_tb = db.create_table(USER_FIPS_TBNAME, USER_FIPS_COLUMNS)
    user_chunks = chunkify(
        [user[0] for user in get_users_in_db(db_file)], n=10000)
    process_user_chunks(user_chunks, db)
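# `chunkify` is used throughout these scripts but is not defined in this
# section. A minimal sketch of its assumed behavior -- yielding successive
# slices of at most `n` items from a sequence (hypothetical implementation,
# not the original helper):
def chunkify_sketch(items, n):
    """Yield successive chunks of at most n items from `items`."""
    for start in range(0, len(items), n):
        yield items[start:start + n]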
    if TEST_SIZE:
        unique_users = [
            user[0] for user in chronology_db.select(
                'SELECT DISTINCT user_id FROM user_fips LIMIT {}'.format(
                    TEST_SIZE))
        ]
    else:
        unique_users = [
            user[0] for user in chronology_db.select(
                'SELECT DISTINCT user_id FROM user_fips')
        ]
    print "Number of Unique Users: {}".format(len(unique_users))

    chronology_tb = chronology_db.create_table('user_chronology',
                                               USER_CHRONOLOGY_COLUMNS)
    user_chunks = list(chunkify(unique_users, n=100))
    for i, user_chunk in enumerate(user_chunks):
        # Report progress around each quarter of the chunks and on the final
        # chunk.
        if (int(ceil((float(i) / len(user_chunks)) * 100)) % 25 == 0
                or i == len(user_chunks) - 1):
            print "\nProcessing chunk {} out of {}".format(
                i + 1, len(user_chunks))
        process_user_chunk_chronology(user_chunk, chronology_db, centroids)

    print '\nElapsed Time: {}s\n'.format(round(time.time() - s, 2))
    print 'Size: {}\n'.format(TEST_SIZE if TEST_SIZE else 'All')
        if not tweet_stats_db_file:
            raise Exception('\nNo database selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    s = time.time()
    tweet_stats_db = Database(tweet_stats_db_file)
    unique_fips = sorted([
        str(fips[0]).zfill(5) for fips in tweet_stats_db.select(
            'SELECT DISTINCT fips FROM statistics')
    ])
    fips_chunks = list(chunkify(unique_fips, n=NCHUNKS_FIPS))

    pool = mp.Pool(processes=mp.cpu_count())
    processes = [
        pool.apply_async(process_fips_chunk, args=(chunk, tweet_stats_db_file))
        for chunk in fips_chunks
    ]

    total_time = 0
    for i, p in enumerate(processes):
        print '{} out of {} chunks'.format(i + 1, len(processes))
        loop_t = time.time()
        p.get()
        elapsed = round(time.time() - loop_t, 2)
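# `process_fips_chunk` is not shown in this section. Note that the workers
# above are handed the database *path* rather than the open `Database`
# object, since SQLite connections cannot be shared across processes. The
# skeleton below is a hypothetical sketch of the assumed shape of such a
# worker, not the actual implementation (the per-FIPS processing is elided):
def process_fips_chunk_sketch(fips_chunk, db_file):
    db = Database(db_file)  # each worker process opens its own connection
    for fips in fips_chunk:
        db.cursor.execute('SELECT * FROM statistics WHERE fips = ?', (fips, ))
        rows = db.cursor.fetchall()
        # ... compute and store per-FIPS results from `rows` here ...
    db.connection.close()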
    except Exception as e:
        print e
        sys.exit()

    s = time.time()
    chronology_db = Database(chronology_db_file)
    if TEST_SIZE:
        unique_users = [
            user[0] for user in chronology_db.select(
                'SELECT DISTINCT user_id FROM user_chronology LIMIT {}'.format(
                    TEST_SIZE))
        ]
    else:
        unique_users = [
            user[0] for user in chronology_db.select(
                'SELECT DISTINCT user_id FROM user_chronology')
        ]
    user_chunks = list(chunkify(unique_users, n=USER_CHUNK_SIZE))

    for i, tweet_db_file in enumerate(tweets_db_files):
        print "\nProcessing tweets db {} out of {}".format(
            i + 1, len(tweets_db_files))
        process_db_file(user_chunks, tweet_db_file)

    print '\nElapsed Time: {}s\n'.format(round(time.time() - s, 2))
    print 'Size: {}\tUser Chunks: {}\n'.format(
        TEST_SIZE if TEST_SIZE else 'All', USER_CHUNK_SIZE)
        if not tweet_stats_db_file:
            raise Exception('\nNo database selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    s = time.time()
    tweet_stats_db = Database(tweet_stats_db_file)
    unique_users = sorted([
        uid[0] for uid in tweet_stats_db.select(
            'SELECT DISTINCT user_id FROM statistics')
    ])
    user_chunks = list(chunkify(unique_users, n=NCHUNKS_USERS))

    pool = mp.Pool(processes=mp.cpu_count())
    processes = [
        pool.apply_async(process_users_chunk, args=(chunk, tweet_stats_db_file))
        for chunk in user_chunks
    ]

    total_time = 0
    for i, p in enumerate(processes):
        print '{} out of {} chunks'.format(i + 1, len(processes))
        loop_t = time.time()
        p.get()
        elapsed = round(time.time() - loop_t, 2)
    except Exception as e:
        print e
        sys.exit()

    s = time.time()
    aggregated_db = Database(aggregated_data_db)
    potential_movers_tb = aggregated_db.create_table(POTENTIAL_MOVERS_TBNAME,
                                                     POTENTIAL_MOVERS_COLUMNS)
    unique_users = [
        user[0] for user in aggregated_db.select(
            'SELECT DISTINCT user_id FROM {}'.format(USER_FIPS_TBNAME))
    ][:TEST_SIZE]
    user_chunks = list(chunkify(unique_users, n=10000))

    for i, user_chunk in enumerate(user_chunks):
        print "User chunk {} out of {}".format(i + 1, len(user_chunks))
        # Wrap each chunk's inserts in a single transaction so there is one
        # commit per chunk rather than one per row.
        aggregated_db.cursor.execute('BEGIN')
        for user in user_chunk:
            if is_potential_mover(aggregated_db, user):
                aggregated_db.insert(
                    'INSERT INTO {tbn} VALUES(?)'.format(
                        tbn=potential_movers_tb), (user, ))
        aggregated_db.connection.commit()
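# `is_potential_mover` is not defined in this section. A plausible sketch,
# assuming a user qualifies as a potential mover when they have tweeted from
# more than one distinct FIPS code in the user_fips table (hypothetical
# criterion, not confirmed by the original code):
def is_potential_mover_sketch(db, user_id):
    db.cursor.execute(
        'SELECT COUNT(DISTINCT fips) FROM {} WHERE user_id = ?'.format(
            USER_FIPS_TBNAME), (user_id, ))
    (n_distinct_fips, ) = db.cursor.fetchone()
    return n_distinct_fips > 1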
        if not aggregated_data_db:
            raise Exception('\nNo database selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    s = time.time()
    aggregated_db = Database(aggregated_data_db)
    movers_db = Database(MOVERS_DB_NAME)
    movers_tb = movers_db.create_table(MOVERS_TBNAME, MOVERS_COLUMNS)
    potential_movers = [
        user[0] for user in aggregated_db.select(
            'SELECT DISTINCT user_id FROM {}'.format(POTENTIAL_MOVERS_TBNAME))
    ][:TEST_SIZE]
    user_chunks = chunkify(potential_movers, n=10000)

    for user_chunk in user_chunks:
        movers_db.cursor.execute('BEGIN')
        for user in user_chunk:
            # `actual_moves` is expected to be an iterable of 5-tuples, one
            # per detected move, matching the five placeholders below.
            actual_moves = user_fips_compare(user, aggregated_data_db,
                                             centroids)
            if actual_moves:
                movers_db.insert(
                    'INSERT INTO {tbn} VALUES(?, ?, ?, ?, ?)'.format(
                        tbn=movers_tb), actual_moves, many=True)
        movers_db.connection.commit()

    print '\nElapsed Time: {}s\n'.format(round(time.time() - s, 2))
    print 'Size: {}\n'.format(TEST_SIZE if TEST_SIZE else 'All')
if __name__ == '__main__':
    try:
        tweet_stats_db_file = fd.askopenfilename(
            title='Choose database with tweet statistics')
        if not tweet_stats_db_file:
            raise Exception('\nNo database selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    s = time.time()
    tweet_stats_db = Database(tweet_stats_db_file)
    unique_fips = sorted([
        str(fips[0]).zfill(5) for fips in tweet_stats_db.select(
            'SELECT DISTINCT fips FROM statistics')
    ])
    fips_chunks = chunkify(unique_fips, n=NCHUNKS_FIPS)

    for fips_chunk in fips_chunks:
        # Process each FIPS chunk here. This loop can be parallelized because
        # each chunk writes to a different database.
        pass

    tweet_stats_db.connection.close()
    print '\nTotal Elapsed Time: {}s\n'.format(round(time.time() - s, 2))
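# A sketch of how the chunk loop above could be parallelized, reusing the
# mp.Pool / apply_async pattern from the earlier scripts in this section.
# The worker name `process_fips_chunk` and its (chunk, db_file) signature are
# assumptions carried over from that pattern, not confirmed for this script:
def run_fips_chunks_in_parallel(fips_chunks, db_file):
    pool = mp.Pool(processes=mp.cpu_count())
    results = [
        pool.apply_async(process_fips_chunk, args=(chunk, db_file))
        for chunk in fips_chunks
    ]
    for r in results:
        r.get()  # blocks until the worker finishes; re-raises its exceptions
    pool.close()
    pool.join()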