Beispiel #1
0
def process_single_db(db_file):
    """Prepare one database's user/FIPS table and process its users.

    Opens *db_file*, creates (or ensures) the user-FIPS table, then feeds
    the first column of every row from get_users_in_db through
    process_user_chunks in chunks of 10000.
    """
    database = Database(db_file)
    users_fips_tb = database.create_table(USER_FIPS_TBNAME, USER_FIPS_COLUMNS)

    user_ids = [row[0] for row in get_users_in_db(db_file)]
    chunks = chunkify(user_ids, n=10000)
    process_user_chunks(chunks, database)
Beispiel #2
0
    # NOTE(review): fragment — the enclosing function/script header (and the
    # definitions of TEST_SIZE, chronology_db, centroids and the start time
    # 's') are outside this view.
    if TEST_SIZE:
        # Test run: cap the workload at TEST_SIZE distinct users.
        unique_users = [
            user[0] for user in chronology_db.select(
                'SELECT DISTINCT user_id FROM user_fips LIMIT {}'.format(
                    TEST_SIZE))
        ]
    else:
        # Full run: every distinct user in user_fips.
        unique_users = [
            user[0] for user in chronology_db.select(
                'SELECT DISTINCT user_id FROM user_fips')
        ]

    print "Number of Unique Users: {}".format(len(unique_users))

    chronology_tb = chronology_db.create_table('user_chronology',
                                               USER_CHRONOLOGY_COLUMNS)

    user_chunks = list(chunkify(unique_users, n=100))

    for i, user_chunk in enumerate(user_chunks):
        # Print progress roughly every 25% of the chunks, and always for
        # the final chunk.
        if int(ceil((float(i) / len(user_chunks)) *
                    100)) % 25 == 0 or i == len(user_chunks) - 1:
            print "\nProcessing chunk {} out of {}".format(
                i + 1, len(user_chunks))

        process_user_chunk_chronology(user_chunk, chronology_db, centroids)

    # 's' is presumably a time.time() start stamp set before this fragment —
    # not visible here; confirm.
    print '\nElapsed Time: {}s\n'.format(round(time.time() - s, 2))
    print 'Size: {}\n'.format(TEST_SIZE if TEST_SIZE else 'All')
Beispiel #3
0
        # NOTE(review): fragment — the opening try: and the file-dialog call
        # that sets tweet_stats_db_file are outside this view.
        if not tweet_stats_db_file:
            raise Exception('\nNo database selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    s = time.time()  # overall start time

    tweet_stats_db = Database(tweet_stats_db_file)

    # Distinct FIPS codes, zero-padded to 5 characters and sorted so chunk
    # assignment is deterministic across runs.
    unique_fips = sorted([
        str(fips[0]).zfill(5) for fips in tweet_stats_db.select(
            'SELECT DISTINCT fips FROM statistics')
    ])

    fips_chunks = list(chunkify(unique_fips, n=NCHUNKS_FIPS))

    # One worker per CPU. Workers receive the database *path*, presumably
    # reopening the database inside each child process — confirm in
    # process_fips_chunk.
    pool = mp.Pool(processes=mp.cpu_count())
    processes = [
        pool.apply_async(process_fips_chunk, args=(chunk, tweet_stats_db_file))
        for chunk in fips_chunks
    ]

    total_time = 0

    for i, p in enumerate(processes):
        print '{} out of {} chunks'.format(i + 1, len(processes))
        loop_t = time.time()
        p.get()  # block until this worker finishes; re-raises its exception
        elapsed = round(time.time() - loop_t, 2)
Beispiel #4
0
    except Exception as e:
        # NOTE(review): fragment — the matching try: (which sets
        # chronology_db_file and, presumably, tweets_db_files) is outside
        # this view.
        print e
        sys.exit()

    s = time.time()  # overall start time

    chronology_db = Database(chronology_db_file)

    if TEST_SIZE:
        # Test run: cap the workload at TEST_SIZE distinct users.
        unique_users = [
            user[0] for user in chronology_db.select(
                'SELECT DISTINCT user_id FROM user_chronology LIMIT {}'.format(
                    TEST_SIZE))
        ]
    else:
        # Full run: every distinct user in user_chronology.
        unique_users = [
            user[0] for user in chronology_db.select(
                'SELECT DISTINCT user_id FROM user_chronology')
        ]

    user_chunks = list(chunkify(unique_users, n=USER_CHUNK_SIZE))

    # Every tweets database is handed the full set of user chunks; see
    # process_db_file for what it does with them.
    for i, tweet_db_file in enumerate(tweets_db_files):
        print "\nProcessing tweets db {} out of {}".format(
            i + 1, len(tweets_db_files))
        process_db_file(user_chunks, tweet_db_file)

    print '\nElapsed Time: {}s\n'.format(round(time.time() - s, 2))
    print 'Size: {}\tUser Chunks: {}\n'.format(
        TEST_SIZE if TEST_SIZE else 'All', USER_CHUNK_SIZE)
Beispiel #5
0
        # NOTE(review): fragment — the opening try: and the file-dialog call
        # that sets tweet_stats_db_file are outside this view.
        if not tweet_stats_db_file:
            raise Exception('\nNo database selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    s = time.time()  # overall start time

    tweet_stats_db = Database(tweet_stats_db_file)

    # Distinct user ids, sorted so chunk assignment is deterministic.
    unique_users = sorted([
        uid[0] for uid in tweet_stats_db.select(
            'SELECT DISTINCT user_id FROM statistics')
    ])

    user_chunks = list(chunkify(unique_users, n=NCHUNKS_USERS))

    # One worker per CPU. Workers receive the database *path*, presumably
    # reopening the database inside each child process — confirm in
    # process_users_chunk.
    pool = mp.Pool(processes=mp.cpu_count())
    processes = [
        pool.apply_async(process_users_chunk,
                         args=(chunk, tweet_stats_db_file))
        for chunk in user_chunks
    ]

    total_time = 0

    for i, p in enumerate(processes):
        print '{} out of {} chunks'.format(i + 1, len(processes))
        loop_t = time.time()
        p.get()  # block until this worker finishes; re-raises its exception
        elapsed = round(time.time() - loop_t, 2)
Beispiel #6
0
    except Exception as e:
        # NOTE(review): fragment — the matching try: (which sets
        # aggregated_data_db) is outside this view.
        print e
        sys.exit()

    s = time.time()  # overall start time

    aggregated_db = Database(aggregated_data_db)
    potential_movers_tb = aggregated_db.create_table(POTENTIAL_MOVERS_TBNAME,
                                                     POTENTIAL_MOVERS_COLUMNS)

    # [:TEST_SIZE] caps the run for testing; if TEST_SIZE is None
    # (presumably the full-run setting — confirm) the slice keeps the
    # whole list.
    unique_users = [
        user[0] for user in aggregated_db.select(
            'SELECT DISTINCT user_id FROM {}'.format(USER_FIPS_TBNAME))
    ][:TEST_SIZE]

    user_chunks = [chunk for chunk in chunkify(unique_users, n=10000)]

    for i, user_chunk in enumerate(user_chunks):
        print "User chunk {} out of {}".format(i + 1, len(user_chunks))

        # Explicit transaction per chunk: batch the inserts, commit once.
        aggregated_db.cursor.execute('BEGIN')

        for user in user_chunk:
            if is_potential_mover(aggregated_db, user):
                aggregated_db.insert(
                    'INSERT INTO {tbn} VALUES(?)'.format(
                        tbn=potential_movers_tb), (user, ))

        aggregated_db.connection.commit()
Beispiel #7
0
        # NOTE(review): fragment — the opening try: and the selection of
        # aggregated_data_db (and the definition of centroids) are outside
        # this view.
        if not aggregated_data_db:
            raise Exception('\nNo database selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    s = time.time()  # overall start time

    aggregated_db = Database(aggregated_data_db)

    movers_db = Database(MOVERS_DB_NAME)
    movers_tb = movers_db.create_table(MOVERS_TBNAME, MOVERS_COLUMNS)

    # [:TEST_SIZE] caps the run for testing; a None TEST_SIZE (presumably
    # the full-run setting — confirm) keeps the whole list.
    potential_movers = [user[0] for user in aggregated_db.select('SELECT DISTINCT user_id FROM {}'.format(POTENTIAL_MOVERS_TBNAME))][:TEST_SIZE]
    user_chunks = chunkify(potential_movers, n=10000)

    for user_chunk in user_chunks:
        # Explicit transaction per chunk: batch the inserts, commit once.
        movers_db.cursor.execute('BEGIN')

        for user in user_chunk:
            # NOTE(review): user_fips_compare is given the database *path*
            # (aggregated_data_db), not the open aggregated_db handle —
            # presumably it reopens the database; confirm.
            actual_moves = user_fips_compare(user, aggregated_data_db, centroids)

            if actual_moves:
                # Five placeholders with many=True: actual_moves appears to
                # be a sequence of 5-tuples — confirm against MOVERS_COLUMNS.
                movers_db.insert('INSERT INTO {tbn} VALUES(?, ?, ?, ?, ?)'.format(tbn=movers_tb), actual_moves, many=True)

        movers_db.connection.commit()

    print '\nElapsed Time: {}s\n'.format(round(time.time() - s, 2))
    print 'Size: {}\n'.format(TEST_SIZE if TEST_SIZE else 'All')
Beispiel #8
0
if __name__ == '__main__':
    try:
        tweet_stats_db_file = fd.askopenfilenames(
            title='Choose database with tweet statistics')

        if not tweet_stats_db_file:
            raise Exception('\nNo database selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    s = time.time()

    tweet_stats_db = Database(tweet_stats_db_file)

    unique_fips = sorted([
        str(fips[0]).zfill(5) for fips in tweet_stats_db.select(
            'SELECT DISTINCT fips FROM statistics')
    ])

    fips_chunks = chunkify(unique_fips, n=NCHUNKS_FIPS)

    for fips_chunk in fips_chunks:
        # process fips chunk
        # This can be parallelized because it's writing to different databases.
        pass

    tweet_stats_db.connection.close()

    print '\nTotal Elapsed Time: {}s\n'.format(round(time.time() - s, 2))