def _create_table_if_not_exist(self, table_name: str) -> None:
    """
    Checks if the given table exists and creates it if it does not.
    If the table already exists, the function does nothing.

    :param table_name: name of the table to look for or create. Expected values:
                       "fire_history", "fire", "fire_merged"
                       (plus "new_table_testing", used only for testing).
    """
    # check if the table exists;
    # if the pipeline is run for the first time,
    # the fire_history table will not exist yet
    is_table_exist = table_name in self.existing_tables
    if is_table_exist:
        logger.info(f"Found the {table_name} table, continue ...")
    else:
        # if the table does not exist, create it
        logger.info(f"No {table_name} exists. Creating a new one.")
        # choose the corresponding CREATE statement to execute
        table_name_to_create_query = {
            "fire_history": FireDumper.SQL_CREATE_HISTORY_TABLE,
            "fire": FireDumper.SQL_CREATE_FIRE_TABLE,
            "fire_merged": FireDumper.SQL_CREATE_FIRE_MERGED_TABLE,
            "new_table_testing": "CREATE TABLE new_table_testing(tid int)"
        }[table_name]
        # execute the query to create the table in the database
        Connection.sql_execute_commit(table_name_to_create_query)
        # record the table name so the in-memory set stays consistent with the database
        self.existing_tables.add(table_name)
        logger.info(f"Table {table_name} created.")
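# A minimal usage sketch (hedged): `dumper` is assumed to be a FireDumper whose
# existing_tables set reflects the database at startup; the no-arg constructor is
# an assumption for illustration. The second call shows the idempotent behaviour.
dumper = FireDumper()                                 # hypothetical construction
dumper._create_table_if_not_exist("fire_history")     # issues CREATE TABLE once
dumper._create_table_if_not_exist("fire_history")     # no-op: name already in existing_tables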
def insert(self, model_type: str, image_url: str, data: Tuple[float, float]) -> None:
    """
    Inserts an image prediction result into the images table.

    :param model_type: "vgg" or "resnet", the image classification model used.
    :param image_url: the image's URL, which identifies each distinct image.
    :param data: image prediction result -- probabilities of not being and being wildfire.
    """
    # extract the data to be dumped
    prob_not_wildfire, prob_wildfire = data
    # pick the target columns for the given model
    if model_type == ImageClassifier.VGG_MODEL:
        columns = ("not_wildfire_prob", "wildfire_prob")
    elif model_type == ImageClassifier.RESNET_MODEL:
        columns = ("resnet_not_wildfire", "resnet_wildfire")
    else:
        logger.error("Insertion failed. Please specify the model type to be vgg or resnet.")
        return
    try:
        Connection().sql_execute_commit(
            f"UPDATE images SET {columns[0]} = {prob_not_wildfire}, {columns[1]} = {prob_wildfire} "
            f"WHERE image_url = {repr(image_url)}")
    except Exception:
        logger.error("error: " + traceback.format_exc())
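# The UPDATE above interpolates values into the SQL string (repr() quotes the URL).
# A sketch of the parameterized alternative, assuming direct access to the raw
# psycopg2 cursor (the `with Connection() as conn` pattern is used this way
# elsewhere in this repo):
with Connection() as conn:
    cur = conn.cursor()
    cur.execute(
        "UPDATE images SET not_wildfire_prob = %s, wildfire_prob = %s WHERE image_url = %s",
        (prob_not_wildfire, prob_wildfire, image_url))
    conn.commit()
    cur.close()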
def send_fire_tweet_data2():
    """
    Returns historical fire tweet objects within the requested date range.
    If the map viewport is unchanged, the whole polygon is queried; otherwise
    only the newly exposed area (new bounds minus old bounds) is queried.

    :returns: a list of tweet objects, each with time, lat, long, id
    """
    request_json = flask_request.get_json(force=True)
    old_north = request_json['oldBound']['_northEast']['lat']
    old_east = request_json['oldBound']['_northEast']['lng']
    old_south = request_json['oldBound']['_southWest']['lat']
    old_west = request_json['oldBound']['_southWest']['lng']
    north = request_json['newBound']['_northEast']['lat']
    east = request_json['newBound']['_northEast']['lng']
    south = request_json['newBound']['_southWest']['lat']
    west = request_json['newBound']['_southWest']['lng']
    start_date_float = request_json['startDate']
    end_date_float = request_json['endDate']
    old_poly = 'polygon(({0} {1}, {0} {2}, {3} {2}, {3} {1}, {0} {1}))'.format(
        old_east, old_south, old_north, old_west)
    poly = 'polygon(({0} {1}, {0} {2}, {3} {2}, {3} {1}, {0} {1}))'.format(
        east, south, north, west)
    if old_poly == poly:
        # unchanged viewport: query the whole polygon
        geom_expr = f"st_geomfromtext({repr(poly)})"
    else:
        # moved viewport: query only the area the old polygon did not cover
        geom_expr = f"st_difference(st_geomfromtext({repr(poly)}), st_geomfromtext({repr(old_poly)}))"
    return make_response(
        jsonify([{
            "createAt": t.timestamp(),
            "lng": long,
            "lat": lat,
            "id": str(id)
        } for t, id, long, lat in Connection.sql_execute(f"""
            SELECT r.create_at, r.id, st_x(location), st_y(location) FROM records r
            where r.create_at BETWEEN to_timestamp({start_date_float / 1000})
                AND to_timestamp({end_date_float / 1000})
            and ST_CONTAINS({geom_expr}, location)""")]))
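# Worked example of the WKT template above: with east=-117, south=33, north=34,
# west=-118 the ring closes on its first point, as st_geomfromtext requires.
demo_poly = 'polygon(({0} {1}, {0} {2}, {3} {2}, {3} {1}, {0} {1}))'.format(-117, 33, 34, -118)
assert demo_poly == 'polygon((-117 33, -117 34, -118 34, -118 33, -117 33))'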
def search_administrative_boundaries():
    """
    Searches administrative boundaries.
    Supports region_id as an integer, a full name as a string, and an
    abbreviated state name as a 2-letter string.

    :return: a list of GeoJSON geometries matching the keyword
    """
    keyword = flask_request.args.get('keyword')
    if keyword.isdigit():
        # the keyword is a region_id: fetch the geometry directly
        region_id = int(keyword)
        query = f'''
            SELECT st_asgeojson(t.geom) as geojson from us_states t where state_id = {region_id}
            union
            SELECT st_asgeojson(t.geom) as geojson from us_counties t where county_id = {region_id}
            union
            SELECT st_asgeojson(t.geom) as geojson from us_cities t where city_id = {region_id}
        '''
        with Connection() as conn:
            cur = conn.cursor()
            cur.execute(query)
            resp = make_response(
                jsonify([json.loads(geom) for geom, in cur.fetchall()]))
            cur.close()
    else:
        # resolve a state abbreviation to its full name;
        # defaults to the keyword itself if not found in the dict
        keyword = us_states_abbr.get(keyword, keyword)
        search_state = "SELECT st_asgeojson(t.geom) from us_states t where lower(state_name)=lower(%s)"
        search_county = "SELECT st_asgeojson(t.geom) from us_counties t where lower(county_name)=lower(%s) limit 1"
        search_city = "SELECT st_asgeojson(t.geom) from us_cities t where lower(city_name)=lower(%s) limit 1"
        with Connection() as conn:
            cur = conn.cursor()
            # try states, then counties, then cities, stopping at the first hit
            cur.execute(search_state, (keyword, ))
            results = [json.loads(geom) for geom, in cur.fetchall()]
            if not results:
                cur.execute(search_county, (keyword, ))
                results = [json.loads(geom) for geom, in cur.fetchall()]
            if not results:
                cur.execute(search_city, (keyword, ))
                results = [json.loads(geom) for geom, in cur.fetchall()]
            resp = make_response(jsonify(results))
            cur.close()
    return resp
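# A hedged usage sketch through Flask's test client; `app` is assumed to be the
# Flask application this view is registered on, and the URL path is hypothetical.
with app.test_client() as client:
    by_id = client.get('/search?keyword=06037').get_json()   # region_id lookup
    by_abbr = client.get('/search?keyword=CA').get_json()    # abbreviation resolved to "California"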
def send_temperature_data():
    """
    Returns the second-latest temperature data within a rectangle around the US,
    since the latest data is still being updated (not complete).

    :returns: a list of temperature objects, each with lat, long, and temp value
    """
    temperature_fetch = Connection.sql_execute(
        "select t.lat, t.long, t.temperature from recent_temperature t "
        "where t.endtime = (select max(t.endtime) from recent_temperature t"
        " where t.endtime < (select max(t.endtime) from recent_temperature t))")
    temperature_data_celsius = []
    # format the temperature data into a list of dictionaries
    for row in temperature_fetch:
        temperature_object = {
            "lat": row[0],
            "long": row[1] % (-360),  # convert longitude from [0, 360) into (-360, 0]
            "temp": row[2] - 273.15,  # convert Kelvin to Celsius
        }
        temperature_data_celsius.append(temperature_object)
    # restrict the data to points within the US boundary
    temperature_data_us = points_in_us(temperature_data_celsius)
    resp = make_response(jsonify(temperature_data_us))
    # allow cross-origin access to the temperature data
    resp.headers['Access-Control-Allow-Origin'] = '*'
    return resp
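# Worked example of the longitude conversion above: Python's % takes the sign of
# the divisor, so values in [0, 360) map into (-360, 0], which places US
# longitudes west of Greenwich as negative degrees.
assert 242.5 % -360 == -117.5   # 242.5 deg E -> -117.5 deg (Southern California)
assert 180.0 % -360 == -180.0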
def run(self, model_type: str = ImageClassifier.RESNET_MODEL):
    """
    Gets image id and image_url from the database and dumps prediction results
    back into the database.

    :param model_type: a different model will be used according to the model type.
    :return: None
    """
    # set up the image classifier
    image_classifier = ImageClassifier(model_type)
    image_classifier.set_model()
    # set up the image classification dumper
    img_classification_dumper = ImgClassificationDumper()
    # loop over every image in the database
    try:
        for id, image_url in Connection().sql_execute(
                "select id, image_url from images"):
            # get the prediction result for the image
            prediction_tuple = image_classifier.predict(image_url)
            # dump the prediction result into the database
            img_classification_dumper.insert(model_type, image_url, prediction_tuple)
            logger.info("id " + str(id) + " is done!")
    except Exception:
        logger.error('error: ' + traceback.format_exc())
def run(self,
        keywords: list = None,
        batch_num: int = 100,
        using_filter_api: bool = False,
        fetch_from_db: bool = False,
        time_interval: int = 2):
    if keywords is None:
        keywords = ['wildfire']
    logger.info('Crawler Starting')
    if fetch_from_db:
        return self._run_fetch_from_db_mode(time_interval)
    # crawling ids:
    if using_filter_api and not isinstance(self.crawler, TweetFilterAPICrawler):
        self.crawler = TweetFilterAPICrawler()
    while True:
        # add fires that happened in the past month to the search keywords
        fire_names = {
            fire_name.lower()
            for fire_name, in Connection.sql_execute(
                "select name from fire_merged where start_time > now() - interval '1 month'")
        }
        keywords_with_fire_names = fire_names | set(keywords)
        logger.info(f'Full keyword set: {keywords_with_fire_names}')
        logger.info('Running ID mode')
        ids = self.crawler.crawl(list(keywords_with_fire_names), batch_num)
        self.dumper.insert(ids, id_mode=True)
        # prevent the API from being banned
        time.sleep(time_interval)
def fire():
    # return a json of all fire names, fire times, and fire geometries inside the bounding box
    request_json = flask_request.get_json(force=True)
    north = request_json['northEast']['lat']
    east = request_json['northEast']['lon']
    south = request_json['southWest']['lat']
    west = request_json['southWest']['lon']
    size = request_json['size']
    start_date = request_json['startDate'][:10]
    end_date = request_json['endDate'][:10]
    # map the requested zoom size to a geometry column of matching resolution
    size_getters = {0: "geom_full", 1: "geom_1e4", 2: "geom_1e3", 3: "geom_1e2", 4: "geom_center"}
    poly = 'polygon(({0} {1}, {0} {2}, {3} {2}, {3} {1}, {0} {1}))'.format(east, south, north, west)
    # a fire matches when its [start_time, end_time] interval overlaps [start_date, end_date]
    query = f"SELECT id, name, agency, start_time, end_time, st_asgeojson({size_getters[size]}) as geom, max_area FROM " \
            f"fire_merged f WHERE ((('{start_date}'::date <= f.end_time::date) AND " \
            f"('{start_date}'::date >= f.start_time::date)) OR (('{end_date}'::date >= f.start_time::date) " \
            f"AND ('{end_date}'::date <= f.end_time::date)) OR (('{start_date}'::date <= f.start_time::date) " \
            f"AND ('{end_date}'::date >= f.end_time::date))) " \
            f"AND (st_contains(ST_GeomFromText('{poly}'), f.{size_getters[size]}) " \
            f"OR st_overlaps(ST_GeomFromText('{poly}'), f.{size_getters[size]}))"
    return make_response(jsonify([{
        "type": "Feature",
        "id": fid,
        "properties": {
            "name": name,
            "agency": agency,
            "starttime": start_time,
            "endtime": end_time,
            "density": 520,
            "area": max_area
        },
        "geometry": json.loads(geom)
    } for fid, name, agency, start_time, end_time, geom, max_area in Connection.sql_execute(query)]))
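# Side note (hedged): the three OR-ed date conditions in the query above encode
# interval overlap between [start_date, end_date] and [f.start_time, f.end_time].
# For valid intervals this is equivalent to the compact predicate below, shown
# only as an illustration; the route keeps the explicit three-case form.
#   '{start_date}'::date <= f.end_time::date AND '{end_date}'::date >= f.start_time::date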
def send_recent_tweet_data():
    """
    Returns recent tweet objects that have an image attached;
    here the interval is 10 months.

    :returns: a list of tweet objects, each with time, lat, long, text, id
    """
    with Connection() as conn:
        cur = conn.cursor()
        livetweet_query = \
            "select it.create_at, it.top_left_long, it.top_left_lat, it.bottom_right_long, it.bottom_right_lat, it.id, it.text, i.image_url, it.profile_pic, it.user_name " \
            "from (select r.create_at, l.top_left_long, l.top_left_lat, l.bottom_right_long, l.bottom_right_lat, l.id, r.text, r.profile_pic, r.user_name " \
            "from records r, locations l where r.id=l.id and r.profile_pic is not null and r.create_at between (SELECT current_timestamp - interval '10 month') and current_timestamp) AS it " \
            "LEFT JOIN images i on i.id = it.id where i.image_url is not null"
        cur.execute(livetweet_query)
        resp = make_response(
            jsonify([{
                "create_at": time.isoformat(),
                "long": long,
                "lat": lat,
                "id": id,
                "text": text,
                "image": image,
                "profilePic": profilePic,
                "user": user
            } for time, long, lat, _, _, id, text, image, profilePic, user in cur.fetchall()]))
        cur.close()
    return resp
def send_boundaries_data():
    """
    Gets administrative boundaries within a specific bounding box
    via DB stored procedures (boundaries.sql).

    :return: a GeoJSON list of the requested boundary types
    """
    request_json = flask_request.get_json(force=True)
    states = request_json['states']
    cities = request_json['cities']
    counties = request_json['counties']
    north = request_json['northEast']['lat']
    east = request_json['northEast']['lon']
    south = request_json['southWest']['lat']
    west = request_json['southWest']['lon']
    select_states = "SELECT * from boundaries_states(%s)"
    select_counties = "SELECT * from boundaries_counties(%s)"
    select_cities = "SELECT * from boundaries_cities(%s)"
    poly = 'polygon(({1} {0}, {2} {0}, {2} {3}, {1} {3}, {1} {0}))'.format(
        north, west, east, south)  # lon lat, +-180
    with Connection() as conn:
        cur = conn.cursor()
        result_list = list()
        if states:
            result_list.extend(_get_geometry(cur, select_states, poly))
        if counties:
            result_list.extend(_get_geometry(cur, select_counties, poly))
        if cities:
            result_list.extend(_get_geometry(cur, select_cities, poly))
        cur.close()
    return make_response(jsonify(result_list))
def send_wildfire():
    """
    Provides predicted wildfires.

    :return: a list of objects with long, lat, an nlp flag, and tweet text
    """
    # TODO: update the where clause
    request_json = flask_request.get_json(force=True)
    north = request_json['northEast']['lat']
    east = request_json['northEast']['lon']
    south = request_json['southWest']['lat']
    west = request_json['southWest']['lon']
    start = request_json['startDate']
    end = request_json['endDate']
    start = time.mktime(time.strptime(start, "%Y-%m-%dT%H:%M:%S.%fZ"))
    end = time.mktime(time.strptime(end, "%Y-%m-%dT%H:%M:%S.%fZ"))
    # note: r.text is selected twice; the first copy (nlp_text) is currently
    # unused and "nlp" is hard-coded to True (see the TODO above)
    return make_response(
        jsonify([{
            "long": lon,
            "lat": lat,
            "nlp": True,
            "text": text
        } for lon, lat, nlp_text, text in Connection.sql_execute(
            f"select l.top_left_long, l.top_left_lat, r.text, r.text from locations l, images i, records r "
            f"where l.id = i.id and r.id = l.id and i.wildfire_prob>0.9 and "
            f"l.top_left_long>{west} and l.top_left_lat<{north} "
            f"and l.bottom_right_long<{east} and l.bottom_right_lat>{south} "
            f"and extract(epoch from r.create_at) >{start} and extract(epoch from r.create_at) <{end}"
        )]))
def run(self):
    """Gets tweets from the database and dumps prediction results back into the database."""
    try:
        # set up the text classifier
        text_classifier = TextClassifier()
        # use the pre-trained model
        text_classifier.set_model(TEXT_CNN_MODEL_PATH)
        # set up the text dumper
        text_dumper = TextClassificationDumper()
        # loop over the not-yet-classified text in the database
        for tweet_id, text in Connection().sql_execute(
                "select id, text from records where text_cnn_wildfire_prob is null"):
            # preprocess the text
            processed_text = text_classifier.preprocess(text)
            # get the prediction result, e.g. tensor([[0.8321, 0.1679]])
            prediction_tuple = text_classifier.predict(processed_text)
            # dump the prediction result into the database
            text_dumper.insert(tweet_id,
                               prediction_tuple[0][0].item(),
                               prediction_tuple[0][1].item())
            logger.info("id " + str(tweet_id) + " is done!")
        logger.info("Total affected records: " + str(text_dumper.inserted_count))
    except Exception:
        logger.error('error: ' + traceback.format_exc())
def insert(self, date_str: str, weekly_soil_mois: np.ndarray) -> None:
    """
    Inserts one week of soil-moisture grid values, one row per grid cell.

    :param date_str: the data's date, formatted as "%Y%m%d"
    :param weekly_soil_mois: the soil-moisture grid for that week
    :return: None
    """
    flattened_data = weekly_soil_mois.flatten()
    with Connection() as conn:
        cur = conn.cursor()
        for gid, val in enumerate(flattened_data.tolist()):
            # -999 / -9999 are the dataset's missing-value sentinels
            val = float('NaN') if val in [-999, -9999] else val
            try:
                cur.execute(
                    self.INSERT_SOIL_MOISTURE,
                    (gid, datetime.datetime.strptime(date_str, "%Y%m%d"), val))
                self.inserted_count += cur.rowcount
                conn.commit()
            except Exception:
                logger.error("error: " + traceback.format_exc())
        logger.info(f'{date_str} finished, total inserted {self.inserted_count}')
        cur.close()
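# How the flat gid relates to the 2-D grid (a sketch; the actual grid shape
# depends on the dataset): flatten() walks the array in row-major order, so
# gid = row * n_cols + col, and the inverse is row, col = divmod(gid, n_cols).
import numpy as np
grid = np.arange(6).reshape(2, 3)                # hypothetical 2x3 grid
assert grid.flatten()[1 * 3 + 2] == grid[1, 2]   # gid = row * n_cols + col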
def tweet_by_date():
    """
    Tweet count within a specific date range.

    :param start-date: epoch-milliseconds string
    :param end-date: epoch-milliseconds string
    :return: [ {create_at, id, lat, long}, ... ]
    """
    # strip any fractional part and the last 3 digits (milliseconds -> seconds)
    start_date_str = flask_request.args.get('start-date').split('.')[0][:-3]
    end_date_str = flask_request.args.get('end-date').split('.')[0][:-3]
    query = f'''
        select r.create_at, r.id, top_left_long, top_left_lat, bottom_right_long, bottom_right_lat
        from records r, locations l
        where r.id = l.id
          and r.create_at < to_timestamp({end_date_str})
          and r.create_at > to_timestamp({start_date_str})
    '''
    resp = make_response(
        jsonify([{
            'create_at': create_at,
            'id': id,
            'lat': (top_left_lat + bottom_right_lat) / 2,
            'long': (top_left_long + bottom_right_long) / 2
        } for create_at, id, top_left_long, top_left_lat, bottom_right_long,
        bottom_right_lat in Connection.sql_execute(query)]))
    return resp
def batch_traverse_tokens(self, token_type: str, probability_type: str,
                          table_name: str, data_list: list,
                          record_id_list: list, page_size: int):
    """
    Traverses each token in the data dictionaries for the whole batch, and calls
    the respective inserting functions to dump them into the database.

    :param token_type: which one of X's reaction, Y's reaction and X's intent.
    :param probability_type: type of the respective probability.
    :param table_name: the target table to dump data into.
    :param data_list: list of the prediction dictionaries.
    :param record_id_list: list of tweet record ids corresponding to the prediction results.
    :param page_size: an integer page size for batch insertion into the database.
    """
    # list of all the record ids
    rids = []
    # list of all the probabilities in data_list
    all_probabilities = []
    # list of all the tokens in data_list
    all_tokens = []
    for j in range(len(data_list)):
        # loop over each dictionary in data_list
        data = data_list[j]
        tokens = data[token_type]
        probabilities = data[probability_type]
        for i in range(len(tokens)):
            all_tokens.append(' '.join(tokens[i]))
            all_probabilities.append(probabilities[i])
            rids.append(record_id_list[j])
    with Connection() as conn:
        # eids is a list containing the ids of all the tokens
        eids = self.batch_insert_into_tokens(all_tokens, token_type, page_size, conn)
        self.batch_insert_into_pairs(rids, eids, all_probabilities, table_name, page_size, conn)
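# A sketch of the expected input shape (the key names are hypothetical, for
# illustration only): each entry of data_list holds parallel lists of token
# sequences and probabilities, and record_id_list[j] names the tweet that the
# j-th entry was predicted from.
data_list = [{
    "xReact": [["scared"], ["worried", "anxious"]],   # token_type
    "xReact_prob": [0.91, 0.73],                      # probability_type
}]
record_id_list = [1234567890]
# after flattening: all_tokens == ["scared", "worried anxious"],
# all_probabilities == [0.91, 0.73], rids == [1234567890, 1234567890]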
def get_aggregated_fire_with_id(self, year: int, name: str, state: str, id: int) -> List[Tuple]:
    """
    Merges fire events with this id from the fire table into several big fire events.
    Some pages may contain fires from multiple fire events, so the return value is a
    list of tuples; if there is more than one fire event, the list holds multiple tuples.

    :param year: int e.g. 1999
    :param name: str e.g. "FireA"
    :param state: str e.g. "California"
    :param id: int e.g. 9999
    :return: list of tuples representing fire events e.g. [(1999, "FireA", ...), (1999, "FireB", ...)]
    """
    aggregated_fire_records_with_id = list(
        Connection.sql_execute(self.SQL_GET_LATEST_AGGREGATION.format(id)))
    if not aggregated_fire_records_with_id:
        # if this id is an empty record, there are no aggregated fire records;
        # in this situation, we only mark the url as crawled by inserting it into fire_history
        self.insert_history(FireEvent(year, state, name, id))
        raise InvalidFireRecordException(
            f"Record {id} is an empty record. Skipping...")
    logger.info(
        f"Successfully fetched Record #{id}; there are {len(aggregated_fire_records_with_id)} "
        f"aggregated records in this id")
    return aggregated_fire_records_with_id
def _insert_ids(ids: List[Tuple[int]]):
    """Inserts the given id list into the database."""
    logger.info("Inserting ids")
    with Connection() as connection:
        cur = connection.cursor()
        extras.execute_values(cur, TweetDumper.INSERT_LOCATION_QUERY, ids)
        connection.commit()
        cur.close()
def _get_length_of_select_query_result(query: str) -> int:
    """
    Takes a SELECT query and returns the number of rows the query returns.

    :param query: str. e.g. 'SELECT year,state,name FROM fire_history'
    :return: int
    """
    return sum(1 for _ in Connection.sql_execute(query))
def get_exists(self):
    """Gets how far we went last time."""
    with Connection() as conn:
        cur = conn.cursor()
        cur.execute(self.select_exists)
        exists_list = cur.fetchall()
        cur.close()
    return exists_list
def f1_score(self):
    """Trains the classifier and evaluates precision, recall and F1 on the labeled records."""
    # total number of positives in the labeled data
    total_positive, = next(Connection().sql_execute(
        "select count(*) from records where (label1 = label2 and label1= true) or judge=True"
    ))
    print(total_positive)
    self.train()
    true_positive = 0
    predicted_positive_ids = []
    # ground truth follows the same rule as get_labeled_data:
    # the agreed label when the two labels match, otherwise the judge's label
    for id, text, label1, label2, judge in Connection().sql_execute(
            "select id, text, label1, label2, judge from records where label1 is not null"):
        print(text)
        if self.predict(text):
            predicted_positive_ids.append(id)
            if label1 if label1 == label2 else judge:
                true_positive += 1
    print(predicted_positive_ids)
    print(len(predicted_positive_ids))
    precision = true_positive / len(predicted_positive_ids) if predicted_positive_ids else 0.0
    recall = true_positive / total_positive if total_positive else 0.0
    f1 = 2 * precision * recall / (precision + recall) if precision + recall else 0.0
    print(f"precision={precision:.3f}, recall={recall:.3f}, f1={f1:.3f}")
    return f1
def run(self, batch_num: int = 100):
    try:
        self.dumper.insert({
            id: self.extractor.extract(text)
            for id, text in Connection().sql_execute(
                f"select id, text from records r WHERE NOT EXISTS "
                f"(select distinct id from images i where i.id = r.id) limit {batch_num}")
        })
    except Exception:
        logger.error('error: ' + traceback.format_exc())
def send_tweet_count_data():
    """
    Returns the number of located tweets per day.

    :returns: a dict mapping ISO date strings to tweet counts
    """
    return make_response(
        jsonify({
            date.isoformat(): count
            for date, count in Connection().sql_execute("""
                select r.create_at::timestamp::date, count(*) from records r
                where r.location is not null and r.create_at is not null
                group by r.create_at::timestamp::date""")
        }))
def _generate_sql_statement_and_execute(sql_statement: str, data) -> None:
    """
    Executes a SQL statement with the given data dictionary bound to its
    named placeholders, and commits the changes.

    :param sql_statement: string e.g. "SELECT * FROM fire_history WHERE id=%(id)s"
    :param data: data as a dict. e.g. {'year': 2019, 'firename': 'TRESTLE', 'agency': 'USFS', 'datetime': .....}
    """
    with Connection() as conn:
        cur = conn.cursor()
        cur.execute(sql_statement, data)
        conn.commit()
        cur.close()
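# A minimal usage sketch: psycopg2's %(name)s placeholders pull values from the
# dict by key (the statement and column names here are illustrative, not from the repo).
_generate_sql_statement_and_execute(
    "INSERT INTO fire_history (year, firename, agency) "
    "VALUES (%(year)s, %(firename)s, %(agency)s)",
    {'year': 2019, 'firename': 'TRESTLE', 'agency': 'USFS'})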
def _fetch_id_from_db(self):
    """A generator which yields a list of up to 100 ids at a time."""
    result = list()
    for id, in Connection.sql_execute(
            "SELECT id FROM records WHERE user_id IS NULL or text is null ORDER BY create_at DESC"):
        if id not in self.cache:
            self.cache.add(id)
            result.append(id)
            if len(result) == 100:
                yield result
                # start a fresh list instead of clearing, so the batch already
                # handed to the consumer is not mutated
                result = list()
    pickle.dump(self.cache, open(TWITTER_TEXT_CACHE, 'wb+'))
    yield result
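# A hedged consumption sketch (as it might appear inside another method of the
# same class): each yielded batch holds up to 100 ids, a size that suits APIs
# capping bulk lookups at 100 ids per call; `process_batch` is hypothetical.
for id_batch in self._fetch_id_from_db():
    if id_batch:                  # the final batch may be empty
        process_batch(id_batch)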
def run(self, end_clause: int = 210):
    """
    Crawling routine.

    :param end_clause: number of days back we want to crawl, default=210
    :return: None
    """
    current_date = datetime.now(timezone.utc).date()
    end_date = current_date - timedelta(days=end_clause)
    # TODO: stop and continue
    with Connection() as conn:
        cur = conn.cursor()
        cur.execute('select date from usgs_info')
        exist_list = cur.fetchall()
        cur.close()
    date = current_date - timedelta(days=7)  # website updates weekly
    # align to the website's weekly release cycle, anchored on 2019-07-30
    diff_date = date - date_parser.parse('20190730').date()
    date = date - timedelta(days=diff_date.days % 7)
    while date >= end_date:
        logger.info(f'[fetch]{date}')
        # skip dates that are already in the database
        if (date,) in exist_list:
            logger.info(f'skip: {date}')
            date = date - timedelta(days=7)
            continue
        saved_zip_path = self.crawler.crawl(date)
        if saved_zip_path is None:
            logger.info(f'{date} not found, skipped')
        else:
            tif_file_name = None
            zf = zipfile.ZipFile(saved_zip_path)
            for file in zf.namelist():
                if file.split('.')[-4] == 'VI_NDVI' and file.split('.')[-1] == 'tif':
                    zf.extract(file, os.path.split(saved_zip_path)[0])
                    tif_file_name = file
            zf.close()
            if tif_file_name is not None:
                tif_path = os.path.join(os.path.split(saved_zip_path)[0], tif_file_name)
                unflattened = self.extractor.extract(tif_path)
                if unflattened is not None:
                    self.dumper.insert(date, unflattened, 'usgs')
                # clean up
                os.remove(tif_path)
            os.remove(saved_zip_path)
        # finished crawling a week; step back to the previous release
        date = date - timedelta(days=7)
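# Worked example of the weekly alignment above (dates are illustrative): with the
# anchor release 2019-07-30 and a starting date of 2019-08-09, diff_date.days is
# 10 and 10 % 7 == 3, so the date moves back 3 days to 2019-08-06, the nearest
# date on the same weekly cycle as the anchor.
from datetime import date as _date, timedelta as _timedelta
_demo = _date(2019, 8, 9)
_demo -= _timedelta(days=(_demo - _date(2019, 7, 30)).days % 7)
assert _demo == _date(2019, 8, 6)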
def get_latest_fire_id() -> int:
    """
    Gets the latest fire id to pass to the counter in data_from_fire.

    :return: int e.g. 299
    """
    # execute the select statement;
    # the latest fire id is the first and only entry of the result
    with Connection() as conn:
        cur = conn.cursor()
        cur.execute(FireDumper.SQL_GET_LATEST_ID)
        result = cur.fetchone()[0]
        cur.close()
    return result
def drop_box():
    """
    Auto-completion relies on this API. The frontend sends what the user has
    typed via userInput, and this API performs a DB query through a stored
    procedure (autocomplete.sql).

    :return: a list/array: [ (city, county, state, id), ... ]
    """
    user_input = flask_request.args.get('userInput')
    # request_json = flask_request.get_json(force=True)
    # user_input = request_json['userInput']
    name_list_query = f"select * from fuzzy_search('{user_input + '%'}')"
    return make_response(jsonify(list(Connection.sql_execute(name_list_query))))
def fetch_status_id_from_db():
    """A generator which yields a list of 100 ids at a time."""
    count = 0
    result = list()
    for id, in Connection.sql_execute(
            "SELECT id FROM records WHERE user_id IS NULL order by create_at desc"):
        count += 1
        result.append(id)
        if count >= 100:
            yield result
            # sleep between batches to prevent the twitter api from being banned
            time.sleep(20)
            # start a fresh list instead of clearing, so the yielded batch is not mutated
            result = list()
            count = 0
class Dumper:
    RECORD = 0
    LOCATION = 1

    def __init__(self, json_filename):
        with open(json_filename, 'rb') as file:
            self.tweets = json.load(file)
        self.target_fields = ["id", "bounding_box"]
        # Connection()() is assumed to return a raw psycopg2 connection
        # (cursor() and commit() are called on it below)
        self.conn = Connection()()

    def iter_rows(self, target):
        # generator over rows of the requested target (LOCATION or RECORD)
        if target == Dumper.LOCATION:
            for tweet in self.tweets:
                (a, b), (c, d) = tweet['place']["bounding_box"]
                yield tuple([tweet['id'], a, b, c, d])
        elif target == Dumper.RECORD:
            for tweet in self.tweets:
                yield tuple(
                    [tweet[field] for field in ["id", "create_at", "text"]])

    def dump_all(self, table, value_count, target):
        try:
            # placeholders must be comma-separated for psycopg2
            sql = f'INSERT INTO {table} VALUES({", ".join(["%s"] * value_count)});'
            for record in self.iter_rows(target):
                print(record)
                cur = self.conn.cursor()
                cur.execute(sql, record)
                cur.close()
            self.conn.commit()
        except psycopg2.DatabaseError as error:
            print(error)

    def get_location(self):
        pass
def get_labeled_data(self):
    self.labeled_data = list()
    # ground truth is the agreed label when the two labels match, otherwise the judge's label
    for id, text, label1, label2, judge in Connection().sql_execute(
            "select id, text, label1, label2, judge from records where label1 is not null and label2 is not null"):
        self.labeled_data.append(
            (self.pre_process(text), label1 if label1 == label2 else judge))
    # drop low-frequency words (fewer than 35 occurrences), keeping the special placeholder tokens
    for text_dict, label in self.labeled_data:
        for word in text_dict.copy():
            if self.low_fre_words[word] < 35 and word not in [
                    "THISISALINK", "THISISAUSER"
            ]:
                del text_dict[word]
    random.shuffle(self.labeled_data)