Example no. 1
def parallel_migration(coastal_counties_db, tweet_db_file):
    current_coastal_counties_db_file = os.path.join(
        RESULTS_PATH,
        os.path.split(tweet_db_file)[1])
    current_coastal_counties_db = Database(current_coastal_counties_db_file)

    coastal_counties_tweets_table = current_coastal_counties_db.create_table(
        *COASTAL_COUNTIES_TWEETS_TABLE)

    joined_rows, other_db_name = coastal_counties_db.ijoin(
        (tweet_db_file, 'other_db', 'tweets'),
        FIELDS_TO_SELECT_FOR_JOIN + ',counties.fips', MATCH_CRITERIA_FOR_JOIN)
    current_coastal_counties_db.cursor.execute('BEGIN')

    current_coastal_counties_db.insert("""INSERT INTO {} 
        VALUES(?,?,?,?,?)""".format(coastal_counties_tweets_table),
                                       joined_rows,
                                       many=True)

    current_coastal_counties_db.connection.commit()

    print "\tDetaching database..."
    coastal_counties_db.cursor.execute(
        """DETACH DATABASE '{}'""".format(other_db_name))

    current_coastal_counties_db.connection.close()
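
None of these snippets define the Database class they all construct. Below is a minimal sketch of the sqlite3 wrapper they appear to assume; every signature is inferred from the call sites, and ijoin in particular (including its hard-coded 'counties' join target) is a guess, so treat this as an illustration rather than the actual implementation.

import sqlite3

class Database(object):
    """Minimal sketch of the sqlite3 wrapper assumed by these examples."""

    def __init__(self, db_file):
        # isolation_level=None keeps sqlite3 in autocommit mode, which is
        # what lets the snippets issue BEGIN and COMMIT by hand.
        self.connection = sqlite3.connect(db_file, isolation_level=None)
        self.cursor = self.connection.cursor()

    def create_table(self, name, columns):
        # Create the table if needed; return the name, since callers
        # interpolate the return value into later SQL.
        self.cursor.execute(
            'CREATE TABLE IF NOT EXISTS {} ({})'.format(name, columns))
        return name

    def select(self, query):
        # Run a query and return the cursor so callers can iterate or fetch.
        return self.cursor.execute(query)

    def insert(self, query, rows, many=False):
        # Insert one parameterized row, or many rows via executemany.
        if many:
            self.cursor.executemany(query, rows)
        else:
            self.cursor.execute(query, rows)

    def ijoin(self, other, fields, criteria):
        # ATTACH another database and inner-join one of its tables against
        # this one. The local join target ('counties') and the return shape
        # are assumptions based on the call sites in Examples 1 and 15.
        db_file, schema_name, table = other
        self.cursor.execute(
            'ATTACH DATABASE ? AS {}'.format(schema_name), (db_file,))
        rows = self.cursor.execute(
            'SELECT {} FROM counties INNER JOIN {}.{} ON {}'.format(
                fields, schema_name, table, criteria)).fetchall()
        return rows, schema_name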
Example no. 2
def process_db_file(user_chunks, tweet_db_file):
    mover_tweets_db = Database(
        os.path.join(MOVERS_TWEETS_DB_PATH,
                     tweet_db_file.split('/')[-1]))
    mover_tweets_tb = mover_tweets_db.create_table(TWEETS_TBNAME,
                                                   TWEETS_COLUMNS)

    mover_tweets_db.cursor.execute('BEGIN')

    current_tweets_db = Database(tweet_db_file)
    s = time.time()

    # Print a progress line at roughly every 25% of the chunks and on the
    # final chunk.
    for i, user_chunk in enumerate(user_chunks):
        if int(ceil((float(i) / len(user_chunks)) *
                    100)) % 25 == 0 or i == len(user_chunks) - 1:
            print "\n\tProcessing chunk {} out of {}".format(
                i + 1, len(user_chunks))
        process_user_chunk(user_chunk, current_tweets_db, mover_tweets_db)

    current_tweets_db.connection.close()
    mover_tweets_db.connection.commit()
    mover_tweets_db.connection.close()

    print '\n\tElapsed Time for db file: {}s\n'.format(
        round(time.time() - s, 2))
Example no. 3
def process_single_db(db_file):
    db = Database(db_file)
    users_fips_tb = db.create_table(USER_FIPS_TBNAME, USER_FIPS_COLUMNS)

    user_chunks = chunkify([user[0] for user in get_users_in_db(db_file)],
                           n=10000)
    process_user_chunks(user_chunks, db)
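
chunkify is another helper the snippets take for granted (Examples 3, 6, 12 and 13). A minimal sketch consistent with how it is called, both iterated directly and wrapped in list():

def chunkify(items, n):
    # Yield successive slices of at most n items each; assumed behaviour of
    # the helper these snippets import.
    for i in range(0, len(items), n):
        yield items[i:i + n]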
Example no. 4
def process_users_chunk(chunk, stats_db_file):
    db = Database(stats_db_file)

    chunk_dbfile = os.path.join(DISTRIBUTED_TWEET_STATS_USER_PATH,
                                'users_{}_{}.db'.format(chunk[0], chunk[-1]))
    chunk_db = Database(chunk_dbfile)

    stats_tb = chunk_db.create_table('statistics', TWEET_STATS_COLUMNS)

    chunk_db.cursor.execute('BEGIN')

    for uid in chunk:
        # The -shm file only exists while SQLite holds the database open in
        # WAL mode, so check that it is there before reading its size.
        shm_file = chunk_dbfile.replace('.db', '.db-shm')
        if os.path.exists(shm_file) and os.path.getsize(shm_file) >= 1e8:
            print "{} is larger than 100MB".format(shm_file)
            break

        results = db.select(
            'SELECT * FROM statistics WHERE user_id={}'.format(uid))
        chunk_db.insert(
            'INSERT INTO {} VALUES (?, ?, ?, ?, ?)'.format(stats_tb),
            results,
            many=True)

    chunk_db.connection.commit()
    chunk_db.connection.close()
    db.connection.close()
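
The SELECT above interpolates uid into the SQL with str.format. Since the wrapper exposes the raw sqlite3 cursor, a parameterized query is the safer equivalent; a minimal sketch of the same lookup:

results = db.cursor.execute(
    'SELECT * FROM statistics WHERE user_id = ?', (uid,)).fetchall()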
Example no. 5
def process_fips_chunk(chunk, stats_db_file):
    db = Database(stats_db_file)

    chunk_dbfile = os.path.join(DISTRIBUTED_TWEET_STATS_PATH,
                                'fips_{}_{}.db'.format(chunk[0], chunk[-1]))
    chunk_db = Database(chunk_dbfile)

    stats_tb = chunk_db.create_table('statistics', TWEET_STATS_COLUMNS)

    chunk_db.cursor.execute('BEGIN')

    for fips in chunk:
        results = db.select(
            'SELECT * FROM statistics WHERE fips={}'.format(fips))
        chunk_db.insert(
            'INSERT INTO {} VALUES (?, ?, ?, ?, ?)'.format(stats_tb),
            results,
            many=True)

    chunk_db.connection.commit()
    chunk_db.connection.close()
    db.connection.close()
Example no. 6
    user_query = 'SELECT DISTINCT user_id FROM user_fips'
    if TEST_SIZE:
        user_query += ' LIMIT {}'.format(TEST_SIZE)

    unique_users = [user[0] for user in chronology_db.select(user_query)]

    print "Number of Unique Users: {}".format(len(unique_users))

    chronology_tb = chronology_db.create_table('user_chronology',
                                               USER_CHRONOLOGY_COLUMNS)

    user_chunks = list(chunkify(unique_users, n=100))

    for i, user_chunk in enumerate(user_chunks):
        if int(ceil((float(i) / len(user_chunks)) *
                    100)) % 25 == 0 or i == len(user_chunks) - 1:
            print "\nProcessing chunk {} out of {}".format(
                i + 1, len(user_chunks))

        process_user_chunk_chronology(user_chunk, chronology_db, centroids)

    print '\nElapsed Time: {}s\n'.format(round(time.time() - s, 2))
    print 'Size: {}\n'.format(TEST_SIZE if TEST_SIZE else 'All')
Example no. 7
        tweet_stats_db_files = fd.askopenfilenames(
            title='Choose databases with tweet statistics')
        county_weather_db_file = fd.askopenfilename(
            title='Choose database with county weather')

        if not tweet_stats_db_files or not county_weather_db_file:
            raise Exception('\nNo databases selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    s = time.time()

    for i, stats_dbfile in enumerate(tweet_stats_db_files):
        print "{} out of {} dbs".format(i + 1, len(tweet_stats_db_files))
        stats_db = Database(stats_dbfile)
        stats_db.create_table('aggregated_tweet_data',
                              AGGREGATED_TWEET_DATA_COLUMNS)

        stats_db.cursor.execute(
            'ATTACH "{}" as wdb'.format(county_weather_db_file))

        stats_db.cursor.execute('BEGIN')

        #  'user_id INT, date_text TEXT, fips INT, tmax REAL, prcp REAL, humidity REAL, total_tweets INT, weather_tweets INT,'
        stats_db.cursor.execute("""
            INSERT INTO aggregated_tweet_data
            SELECT 
            statistics.user_id, statistics.date_text, statistics.fips, 
            weather.tmax, weather.prcp, weather.hmd,
            statistics.total_tweets, statistics.weather_tweets
            FROM statistics
            INNER JOIN
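
This snippet is cut off inside the join. For reference, here is a self-contained sketch of the ATTACH + INSERT INTO ... SELECT pattern it uses; the weather table name and the ON clause are assumptions, since the source breaks off before the join condition.

import sqlite3

# Illustrative only: wdb.weather and the ON clause below are guesses.
conn = sqlite3.connect('stats.db', isolation_level=None)  # manual BEGIN/COMMIT
cur = conn.cursor()
cur.execute('ATTACH DATABASE ? AS wdb', ('county_weather.db',))
cur.execute('BEGIN')
cur.execute("""
    INSERT INTO aggregated_tweet_data
    SELECT statistics.user_id, statistics.date_text, statistics.fips,
           weather.tmax, weather.prcp, weather.hmd,
           statistics.total_tweets, statistics.weather_tweets
    FROM statistics
    INNER JOIN wdb.weather AS weather
        ON statistics.fips = weather.fips
        AND statistics.date_text = weather.date_text""")
conn.commit()
cur.execute('DETACH DATABASE wdb')
conn.close()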
Example no. 8
    s = time.time()

    user_chronology_db = Database(user_chronology_db_file)

    print

    for i, mover_tweets_db_file in enumerate(mover_tweets_db_files):
        print "Processing db file {} out of {}".format(
            i + 1, len(mover_tweets_db_files))

        mover_tweets_db = Database(mover_tweets_db_file)

        weather_info_db = Database(
            os.path.join(WEATHER_INFO_DB_PATH,
                         mover_tweets_db_file.split('/')[-1]))
        weather_info_tb = weather_info_db.create_table('tweets',
                                                       WEATHER_INFO_COLUMNS)

        ntweets = mover_tweets_db.select(
            'SELECT COUNT(user_id) FROM tweets').fetchone()[0]

        mover_tweets_db.select('SELECT * FROM tweets')

        weather_info_db.cursor.execute('BEGIN')

        processed_tweets = 0

        s = time.time()

        while True:
            if int(ceil((float(processed_tweets) / ntweets) *
                        100)) % 25 == 0 or processed_tweets == ntweets - 1:
Example no. 9
if __name__ == '__main__':
    try:
        weather_info_db_files = fd.askopenfilenames(
            title='Choose databases with weather info tweets')

        if not weather_info_db_files:
            raise Exception('\nNo database selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    s = time.time()

    stats_db = Database(TWEET_STATS_DB)
    stats_tb = stats_db.create_table(TWEET_STATS_TB, TWEET_STATS_COLUMNS)

    total_time = 0
    # stash, stash_path = load_processed_stash()

    # get db_files that haven't been processed yet and that are complete
    # u_weather_info_db_files = filter(lambda f: not is_processed_file(f, stash), weather_info_db_files)

    for i, weather_info_db_file in enumerate(weather_info_db_files):
        print "Processing db file {} out of {}".format(
            i + 1, len(weather_info_db_files))

        weather_info_db = Database(weather_info_db_file)

        unique_users = [
Example no. 10

if __name__ == "__main__":
    try:
        dbs = fd.askopenfilenames(title='Get databases')

        if not dbs:
            raise Exception('\nNo databases selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    aggregated_data_db_path = os.path.join(RESULTS_DIR_PATH, USER_FIPS_DB)
    aggregated_data_db = Database(aggregated_data_db_path)

    user_fips_tb = aggregated_data_db.create_table(USER_FIPS_TBNAME,
                                                   USER_FIPS_COLUMNS_UNIQUE)

    s = time.time()

    for db_file in dbs:
        db = Database(db_file)

        db.select('SELECT * FROM {tbn}'.format(tbn=USER_FIPS_TBNAME))

        current_user_fips = [(user_id, fips, get_date_from_path(db_file))
                             for user_id, fips in db.cursor]

        aggregated_data_db.cursor.execute('BEGIN')
        aggregated_data_db.insert(
            'INSERT OR IGNORE INTO {tbn} VALUES (?, ?, ?)'.format(
                tbn=user_fips_tb),
Example no. 11
        if not coastal_counties_tweets_db_files: 
            raise Exception('\nNo databases selected! Goodbye.\n')
    except Exception as e: 
        print e
        sys.exit()

    total_time = 0 

    for i, tweet_db_file in enumerate(coastal_counties_tweets_db_files):
        print '{} out of {} databases'.format(
            i + 1, len(coastal_counties_tweets_db_files))

        start = time.time()

        current_tweet_db = Database(tweet_db_file)

        stats_tb = current_tweet_db.create_table('statistics', STATS_TB_COLUMNS)

        current_tweet_db.cursor.execute('BEGIN')
        # Populate the statistics table with a GROUP BY selection.
        current_tweet_db.cursor.executescript(
            INSERT_GROUP_BY_COMMAND.format(tbn=stats_tb))
        current_tweet_db.connection.commit()
    
        current_tweet_db.connection.close()

        end = time.time() - start
        total_time += end

        print "\tElapsed: {}s\tAverage: {}s".format(round(total_time, 2), round(total_time / (i+1), 2))

    print "\nTotal time: {}s".format(round(total_time, 2))

Example no. 12
if __name__ == '__main__':
    try:
        aggregated_data_db = fd.askopenfilename(
            title='Choose database with aggregated data')

        if not aggregated_data_db:
            raise Exception('\nNo database selected! Goodbye.\n')
    except Exception as e:
        print e
        sys.exit()

    s = time.time()

    aggregated_db = Database(aggregated_data_db)
    potential_movers_tb = aggregated_db.create_table(POTENTIAL_MOVERS_TBNAME,
                                                     POTENTIAL_MOVERS_COLUMNS)

    unique_users = [
        user[0] for user in aggregated_db.select(
            'SELECT DISTINCT user_id FROM {}'.format(USER_FIPS_TBNAME))
    ][:TEST_SIZE]

    user_chunks = list(chunkify(unique_users, n=10000))


    for i, user_chunk in enumerate(user_chunks):
        print "User chunk {} out of {}".format(i + 1, len(user_chunks))

        aggregated_db.cursor.execute('BEGIN')
Example no. 13
    try:
        aggregated_data_db = fd.askopenfilename(
            title='Choose database with aggregated data')
        centroids = get_centroids()

        if not aggregated_data_db: 
            raise Exception('\nNo database selected! Goodbye.\n')
    except Exception as e: 
        print e
        sys.exit()

    s = time.time()
    
    aggregated_db = Database(aggregated_data_db)

    movers_db = Database(MOVERS_DB_NAME)
    movers_tb = movers_db.create_table(MOVERS_TBNAME, MOVERS_COLUMNS)

    potential_movers = [
        user[0] for user in aggregated_db.select(
            'SELECT DISTINCT user_id FROM {}'.format(POTENTIAL_MOVERS_TBNAME))
    ][:TEST_SIZE]
    user_chunks = chunkify(potential_movers, n=10000)
 
    for user_chunk in user_chunks:
        movers_db.cursor.execute('BEGIN')

        for user in user_chunk:
            actual_moves = user_fips_compare(
                user, aggregated_data_db, centroids)

            if actual_moves:
                movers_db.insert(
                    'INSERT INTO {tbn} VALUES(?, ?, ?, ?, ?)'.format(
                        tbn=movers_tb),
                    actual_moves,
                    many=True)

        movers_db.connection.commit()
            
Example no. 14
if not rwls_path:
    print "\nNo directory selected. Goodbye!\n"
    sys.exit()

if not os.path.exists('./results'):
    os.makedirs('results')

# Connect / Create results database

db = Database('results/tree_ring_data.db')

# Create all tables

species_tb = db.create_table(
    "species",
    "species_id NVARCHAR(4) PRIMARY KEY NOT NULL, species_name NVARCHAR(8) NULL"
)

sites_tb = db.create_table(
    "sites", """
    site_id NVARCHAR(7) PRIMARY KEY NOT NULL, 
    site_name NVARCHAR(52) NULL,     
    elevation NVARCHAR(5) NULL,
    latitude REAL NULL,
    longitude REAL NULL
    """)

# from sites_tb: location NVARCHAR(13) NULL,

trees_tb = db.create_table(
    "trees", """
Example no. 15
    except Exception as e:
        print e
        sys.exit()

    coastal_counties_db = Database(coastal_counties_db_file)

    for i, tweet_db_file in enumerate(tweet_db_files):
        print '{} out of {} databases'.format(i + 1, len(tweet_db_files))

        current_coastal_counties_db_file = os.path.join(
            RESULTS_PATH,
            os.path.split(tweet_db_file)[1])
        current_coastal_counties_db = Database(
            current_coastal_counties_db_file)

        coastal_counties_tweets_table = current_coastal_counties_db.create_table(
            *COASTAL_COUNTIES_TWEETS_TABLE)

        joined_rows, other_db_name = coastal_counties_db.ijoin(
            (tweet_db_file, 'other_db', 'tweets'), FIELDS_TO_SELECT_FOR_JOIN +
            ',{}.fips'.format(CURRENT_COUNTIES_TABLE), MATCH_CRITERIA_FOR_JOIN)
        current_coastal_counties_db.cursor.execute('BEGIN')

        current_coastal_counties_db.insert("""INSERT INTO {} 
            VALUES(?,?,?,?,?)""".format(coastal_counties_tweets_table),
                                           joined_rows,
                                           many=True)

        current_coastal_counties_db.connection.commit()
        coastal_counties_db.cursor.execute(
            """DETACH DATABASE '{}'""".format(other_db_name))
        current_coastal_counties_db.connection.close()