def test_check_db(self):
    from xml.etree import ElementTree
    from db_utils import connect_to_db
    from weather_data import TABLE_NAME
    from collect_data import get_weather_data

    weather_data = get_weather_data()
    main_element = ElementTree.fromstring(weather_data.content)

    connection = connect_to_db()
    if connection:
        try:
            query = (
                "SELECT "
                "    datetime, cloudiness, wind_speed, wind_direction, "
                "    temperature, humidity, pressure "
                "FROM " + TABLE_NAME + " "
                "WHERE "
                "    datetime = '{0}'")
            cursor = connection.cursor()
            check_results(self, cursor, main_element.findall("metData"), query)
            print()
        finally:
            connection.close()
def worker_insert_tweets_2017(record_queue, m_q, num_workers):
    table = connect_to_db('drug_viz_data', 'tweets_2017_filter')
    none_count = 0
    while True:
        joined_gdf = record_queue.get()
        if joined_gdf is None:
            none_count += 1
            if none_count >= num_workers:
                m_q.put('Records finished, time {}'.format(datetime.now()))
                m_q.put(None)
                break
            continue
        # process fields
        joined_gdf = joined_gdf.rename(
            {
                'POP100': 'Pop_2010',
                'GEOID': 'GeoID_2010',
                'TRACT': 'TractID_2010',
                'STATE': 'StateID',
                'COUNTY': 'CountyID_2010'
            },
            axis='columns')
        joined_gdf = joined_gdf[[
            'Followers', 'Friends', 'Statuses', 'category', 'drugCategory',
            'fakeLocalTime', 'geo', 'textNorm', 'textRaw', 'textTokenCounts',
            'textTokens', 'timezone', 'tweetID', 'utcTime', 'Pop_2010',
            'GeoID_2010', 'TractID_2010', 'StateID', 'CountyID_2010'
        ]]
        # turn gdf to list of dicts
        tweet_dicts = joined_gdf.to_dict('records')
        # insert
        m_q.put('Inserting, time {}, size {}'.format(datetime.now(),
                                                     len(tweet_dicts)))
        table.insert_many(tweet_dicts)
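
# connect_to_db() is not shown in this snippet; the worker above treats its
# return value as a pymongo collection (it supports insert_many and, later,
# create_index). A minimal sketch under that assumption -- the helper name,
# host, and port below are illustrative stand-ins, not the project's code:
import pymongo


def connect_to_db_sketch(db_name, collection_name=None):
    """Hypothetical stand-in for the project's connect_to_db helper."""
    client = pymongo.MongoClient('localhost', 27017)  # assumed local MongoDB
    db = client[db_name]
    return db[collection_name] if collection_name is not None else db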
def process_records_mp():
    # load shpfiles
    all_gdf, main_crs = load_shpfiles()
    # load census data
    all_census_df = load_census_concat()
    # join shpfiles and census data
    pop_gdf = join_census_shpfile(all_gdf, all_census_df)

    # set up processes and queues
    manager = Manager()
    in_q = manager.Queue(20)
    record_q = manager.Queue(3)
    m_q = manager.Queue(1000)
    num_workers = 2

    # start processes
    process_pool = []
    for i in range(num_workers):
        process_p = Process(target=worker_prepare_records_positive_tweets,
                            args=(main_crs, pop_gdf, in_q, record_q, m_q))
        process_p.start()
        process_pool.append(process_p)
    out_p = Process(target=worker_insert_tweets_2017,
                    args=(record_q, m_q, num_workers))
    out_p.start()

    # put file paths into the queue
    # change this path
    path_temp = 'D:/TEMPFILE/2017_filter/2017_positive_filtered_tweets_{}.json'
    for i in range(2):
        p = path_temp.format(i)
        in_q.put(p)
    for i in range(num_workers):
        in_q.put(None)
    print('all in')

    # wait till finish
    while True:
        m = m_q.get()
        if m is None:
            print('all done')
            break
        print(m)

    # make indexes
    table = connect_to_db('drug_viz_data', 'tweets_2017_filter')
    table.create_index([('textNorm', pymongo.TEXT)])
    table.create_index([('geo', pymongo.GEOSPHERE)])
    table.create_index("utcTime")
    table.create_index("fakeLocalTime")
    table.create_index("textTokens")
    table.create_index("category")
    table.create_index("drugCategory")
    table.create_index("StateID")
    table.create_index("GeoID_2010")
    table.create_index("Pop_2010")

    # done
    for p in process_pool:
        p.join()
    out_p.join()
    print('all process ends')
    return
def prepare():
    from db_utils import connect_to_db, create_tables

    print("Preparing environment ...")
    connection = connect_to_db()
    if connection:
        try:
            create_tables(connection)
        finally:
            connection.close()
        print("Environment successfully prepared.")
def collect():
    from xml.etree import ElementTree
    from db_utils import connect_to_db

    connection = connect_to_db()
    if connection:
        try:
            weather_data = get_weather_data()
            main_element = ElementTree.fromstring(weather_data.content)
            data_elements = main_element.findall("metData")
            for data_element in data_elements:
                weather_data = convert_xml_to_weather_record(data_element)
                save_to_db(connection, weather_data)
                print("Weather data collected for {0}.".format(
                    weather_data.datetime))
            connection.commit()
        finally:
            connection.close()
def user_info_extraction_cycle_db(driver, users_list, is_from_scratch):
    """
    Extract the user data for each name in users_list and save it to the DB.

    :param driver: The instantiated web driver
    :param users_list: List of user names
    :param is_from_scratch: boolean, whether to extract the info from scratch
    :return: None
    """
    connection = db_utils.connect_to_db(config.DB_HOST, config.DB_USER,
                                        config.DB_PWD, config.DB_NAME)
    if not is_from_scratch:
        existing_users = db_utils.select_users(connection, users_list)
        users_list = get_new_users_db(users_list, existing_users)
    if users_list:
        users_dict = get_users_info(driver, users_list)
        if users_dict:
            db_utils.save_user_info(connection, users_dict)
            logger.info(config.MSG_DICT["SAVING_DB_USERS_COUNT"].format(
                len(users_dict)))
            # print(config.MSG_DICT["SAVING_DB_USERS_COUNT"].format(len(users_dict)))
    connection.close()
for table in tables:
    if table not in tables_existing:
        tdef = tables[table]
        sql = """
            CREATE TABLE policing.""" + table + """ (
                """ + ', '.join([
                    col[0] + ' ' + col[1] for col in tdef
                    if col[0] not in ['primary_key', 'index']
                ]) + """
                """ + ''.join([
                    ', PRIMARY KEY (' + col[1] + ')' for col in tdef
                    if col[0] == 'primary_key'
                ]) + """
                """ + ''.join([
                    ', INDEX (' + col[1] + ')' for col in tdef
                    if col[0] == 'index'
                ]) + """
            )
        """
        cur.execute(sql)
        con.commit()
        print('created table: ' + table)


############
##  MAIN  ##
############

con, cur = db_utils.connect_to_db()

tables_existing = check_tables(con, cur)

create_tables(con, cur, tables_existing)
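
# The table-creation loop above assumes each entry in `tables` maps a table
# name to a list of (name, value) pairs: ordinary pairs are
# (column_name, column_type), while ('primary_key', column) and
# ('index', column) rows become PRIMARY KEY and INDEX clauses. A hypothetical
# definition illustrating that assumed shape (the table and column names here
# are made up, not taken from the project):
tables_example = {
    'stops': [
        ('stop_id', 'INT NOT NULL'),
        ('stop_date', 'DATE'),
        ('officer_id', 'INT'),
        ('primary_key', 'stop_id'),
        ('index', 'stop_date'),
    ],
}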
# filemode='w',
# format='%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s',
# datefmt='%H:%M:%S',
# level=logging.DEBUG)
root_logger = logging.getLogger()
root_logger.setLevel(logging.DEBUG)
handler = logging.FileHandler('./log/query_log.log', 'a', 'utf-8')
formatter = logging.Formatter(
    '%(asctime)s,%(msecs)d %(name)s %(levelname)s %(message)s')
handler.setFormatter(formatter)
# Pass handler as a parameter, not assign
root_logger.addHandler(handler)

census_years = ['2010']
utc = pytz.utc
state_dict = get_state_fips_dict()

db_map = connect_to_db('drug_viz')
db_data = connect_to_db('drug_viz_data')

MAX_KEYWORDS = 100
MAX_SAMPLES = 10

# by default, each query field is joined with "AND" logic
# to use text search and geo query, only "geowithin" can be used
# text search is always performed first, then filter conditions are considered
# the query should return the following fields:
#     the mapping data: geojson string of census tracts
#     the tweet data: each matched tweet with selected fields, include:
#         id (not the tweet id for privacy)
#         local time stamp (for time display)
#         token count (for keyword display)
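
# A minimal sketch of how a query following the rules above could be assembled
# for pymongo -- the function name, field values, and polygon argument are
# illustrative assumptions, not the module's actual query builder:
def build_tweet_query_sketch(keywords=None, drug_category=None,
                             tract_polygon=None):
    """Hypothetical example: combine text search, filters, and $geoWithin."""
    query = {}
    if keywords:
        # text search runs against the text index on textNorm
        query['$text'] = {'$search': ' '.join(keywords)}
    if drug_category:
        # plain filter fields are implicitly joined with AND
        query['drugCategory'] = drug_category
    if tract_polygon is not None:
        # when combined with a text search, only $geoWithin (not $near) works
        query['geo'] = {'$geoWithin': {'$geometry': tract_polygon}}
    return query
# e.g. (assumed collection name):
# db_data['tweets_2017_filter'].find(build_tweet_query_sketch(['opioid']))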