def run(self, ):
    """Download the latest TRMM landslide pages and store their contents.

    Each page is fetched, parsed with ``self.getTRMMItems``, written through
    ``self.insertTRMM`` and its locations are forwarded to
    ``self.updateRedisLocations``. A URL that cannot be fetched (network
    error, timeout or HTTP error status) is reported on ``self.messageQueue``
    and skipped. Any other failure is printed and reported on
    ``self.errorQueue`` as ``(self.root_name, message)``.

    The database connection is closed once every URL has been handled.
    """
    # Seconds to wait for the server before giving up on one URL. Without a
    # timeout ``requests.get`` may block indefinitely on a stalled server.
    request_timeout = 60
    try:
        url_list = [
            'https://trmm.gsfc.nasa.gov/trmm_rain/Events/latest_1_day_landslide.html',
            'https://trmm.gsfc.nasa.gov/trmm_rain/Events/latest_3_day_landslide.html',
            'https://trmm.gsfc.nasa.gov/trmm_rain/Events/latest_7_day_landslide.html'
        ]
        for trmm_url in url_list:
            self.messageQueue.put("Obtained TRMM url: %s" % trmm_url)
            try:
                response = requests.get(trmm_url, timeout=request_timeout)
                # Treat 4xx/5xx answers as failures so that an error page is
                # never parsed as if it were TRMM data.
                response.raise_for_status()
            except Exception as e:
                self.messageQueue.put(
                    "TRMM URL %s failed with error: %s" % (trmm_url, repr(e)))
                continue
            trmm_items, trmm_locations = self.getTRMMItems(response.text)
            self.insertTRMM(trmm_items)
            self.updateRedisLocations(trmm_locations)
        self.DB_CONN.close()
        self.messageQueue.put(
            "Completed TRMM successfully at %s." % readable_time())
    except Exception as e:
        traceback.print_exc()
        self.errorQueue.put((self.root_name, str(e)))
def run(self):
    """Run - Launches the streamer itself.

    Announces the start on ``self.messageQueue`` and then starts the stream
    filter on ``self.keywords``. If the stream raises, the crash is reported
    on ``self.messageQueue`` and on ``self.errorQueue`` as
    ``('unstructured', ('twitter',), message)``.
    """
    try:
        # Announce before starting: the filter call is presumably blocking
        # (tweepy-style stream -- confirm), so a message queued after it
        # would only appear once the stream had already stopped.
        self.messageQueue.put(" ".join([
            "Running unstructured streamer", "with PID",
            str(os.getpid()), "at",
            readable_time()
        ]))
        self.stream.filter(track=self.keywords)
    except Exception as e:
        self.messageQueue.put(" ".join([
            "Crashed unstructured stream", "at",
            readable_time(), "with error",
            str(e)
        ]))
        self.errorQueue.put(('unstructured', ("twitter", ), str(e)))
def run(self, ):
    """Download news articles for every valid high-confidence topic.

    For each topic in ``self.config["topic_names"]`` whose
    ``high_confidence.valid`` flag is set, every high-confidence keyword is
    queried through ``self.client.get_everything``; the collected articles
    are processed by ``self.getArticleDetails``, stored with
    ``self.insertNews`` and their locations passed to
    ``self.updateRedisLocations``. A keyword whose query fails is reported on
    ``self.messageQueue`` and skipped. Any other failure is printed and
    reported on ``self.errorQueue`` as ``(self.root_name, message)``.
    """
    try:
        topics = self.config["topic_names"]
        for event_topic in topics:
            topic_cfg = topics[event_topic]
            high_confidence = topic_cfg["high_confidence"]
            if not high_confidence["valid"]:
                continue

            self.messageQueue.put(
                "News downloader - working on %s" % event_topic)
            event_topic_key = str(topic_cfg["index"])
            self.cached_list = self.getCachedList(event_topic_key)
            stopwords = topic_cfg["stopwords"]

            # Gather the articles of all keywords before processing them.
            articles = []
            for keyword in high_confidence["keywords"]:
                try:
                    response = self.client.get_everything(q=keyword,
                                                          page_size=100)
                    articles.extend(response["articles"])
                except Exception as e:
                    self.messageQueue.put(
                        "NewsAPI for %s-%s failed with error: %s" %
                        (event_topic, keyword, repr(e)))

            article_content, article_location = self.getArticleDetails(
                articles, stopwords)
            self.insertNews(article_content, event_topic_key)
            self.updateRedisLocations(article_location)

        self.DB_CONN.close()
        self.messageQueue.put(
            "Completed News download successfully at %s." % readable_time())
    except Exception as e:
        traceback.print_exc()
        self.errorQueue.put((self.root_name, str(e)))
def run(self,):
    """Download the USGS earthquake feed and store its contents.

    The GeoJSON feed is fetched, parsed with ``self.getUSGSItems``, written
    through ``self.insertUSGS`` and its locations are forwarded to
    ``self.updateRedisLocations``. A URL that cannot be fetched (network
    error, timeout or HTTP error status) is reported on ``self.messageQueue``
    and skipped. Any other failure is printed and reported on
    ``self.errorQueue`` as ``(self.root_name, message)``.

    The database connection is closed once every URL has been handled.
    """
    # Seconds to wait for the server before giving up on one URL. Without a
    # timeout ``requests.get`` may block indefinitely on a stalled server.
    request_timeout = 60
    try:
        url_list = ['http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_month.geojson']
        for usgs_url in url_list:
            self.messageQueue.put("Obtaining USGS url: %s"%usgs_url)
            try:
                response = requests.get(usgs_url, timeout=request_timeout)
                # Treat 4xx/5xx answers as failures so that an error page is
                # skipped instead of aborting the run in ``response.json()``.
                response.raise_for_status()
            except Exception as e:
                self.messageQueue.put(
                    "USGS URL %s failed with error: %s" % (usgs_url, repr(e)))
                continue
            usgs_items, usgs_locations = self.getUSGSItems(response.json())
            self.insertUSGS(usgs_items)
            self.updateRedisLocations(usgs_locations)
        self.DB_CONN.close()
        self.messageQueue.put("Completed USGS successfully at %s."%readable_time())
    except Exception as e:
        traceback.print_exc()
        self.errorQueue.put((self.root_name, str(e)))
def main(logdir, importkey, exportkey, dataprocessor, dataprocessorscriptdir, pidname):
    """Replay input files through a data processor and publish to Kafka.

    The processor class ``dataprocessor`` is imported from
    ``pipelines.<dataprocessorscriptdir>.<dataprocessor>`` and instantiated.
    Input files are located with ``DataProcessor.getInputPath(time)``; every
    JSON line is passed to ``DataProcessor.process`` and the result is sent
    to the Kafka topic derived from ``exportkey``. The ``timestamp_ms`` of
    the last published item is stored in redis under ``importkey`` so a
    restart resumes where the previous run stopped. This function never
    returns.

    Args:
        logdir: Directory handed to ``helper_utils.setup_pid``.
        importkey: Redis key holding the last processed ``timestamp_ms``.
        exportkey: Kafka topic name; ``:`` is replaced by ``_``.
        dataprocessor: Name of the processor module and class.
        dataprocessorscriptdir: Package below ``pipelines`` with the module.
        pidname: Process name handed to ``helper_utils.setup_pid``.
    """
    helper_utils.setup_pid(pidname, logdir=logdir)

    # Import processscript
    helper_utils.std_flush("[%s] -- Initializing ASSED-input-buffer %s" %
                           (helper_utils.readable_time(), pidname))
    moduleImport = __import__("pipelines.%s.%s" %
                              (dataprocessorscriptdir, dataprocessor),
                              fromlist=[dataprocessor])
    DataProcessor = getattr(moduleImport, dataprocessor)()
    helper_utils.std_flush("[%s] -- Imported Data processor %s" %
                           (helper_utils.readable_time(), dataprocessor))

    # Set up connections
    pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
    r = redis.Redis(connection_pool=pool)
    kafka_key = exportkey.replace(":", "_")
    kafka_producer = kafka.KafkaProducer()

    message_refresh = 7200  # seconds between two statistics messages
    skip_count = 0
    process_count = 0
    time_slept = 0
    message_timer = time.time()

    # Get earliest file to parse...
    helper_utils.std_flush("[%s] -- Searching for files" %
                           helper_utils.readable_time())
    stored_checkpoint = r.get(importkey)
    checkpoint_ms = 0 if stored_checkpoint is None else int(
        stored_checkpoint.decode())

    if checkpoint_ms == 0:
        # TODO CHANGE TO 7 days after setup is complete...
        helper_utils.std_flush(
            "[%s] -- No value for previous stop. Starting from 7 days prior" %
            helper_utils.readable_time())
        currentTime = datetime.now() - timedelta(days=7)
        finishedUpToTime = None
        while finishedUpToTime is None:
            if os.path.exists(DataProcessor.getInputPath(currentTime)):
                # Earliest existing file inside the search window.
                finishedUpToTime = currentTime
            else:
                currentTime += TIME_DELTA_MINIMAL
                # Stop as soon as the search reaches the present. Using the
                # signed total avoids stepping over a narrow window and
                # searching into the future forever.
                if (datetime.now() - currentTime).total_seconds() < 2:
                    # No file exists yet: track from now and let the main
                    # loop wait for the first file to appear.
                    finishedUpToTime = datetime.now()
        # Start from the beginning of the 'first' file...
        finishedUpToTime -= timedelta(seconds=finishedUpToTime.second)
        granularTime = 0
    else:
        # We already have a timestamp from a previous execution: re-read the
        # files from one minute earlier and skip items until the stored
        # granular time is reached.
        helper_utils.std_flush(
            "[%s] -- Starting File tracking at %s" %
            (helper_utils.readable_time(),
             str(datetime.fromtimestamp(checkpoint_ms / 1000.0))))
        granularTime = checkpoint_ms
        finishedUpToTime = datetime.fromtimestamp(
            granularTime / 1000.0) - timedelta(seconds=60)

    prevGranular = granularTime
    helper_utils.std_flush("[%s] -- Starting Stream Tracking for %s" %
                           (helper_utils.readable_time(), importkey))

    while True:
        if time.time() - message_timer > message_refresh:
            message_timer = time.time()
            helper_utils.std_flush(
                "[%s] -- Processed %i items, with %i items skipped and %i seconds slept in the last %i seconds"
                % (helper_utils.readable_time(), process_count, skip_count,
                   time_slept, message_refresh))
            process_count, skip_count, time_slept = 0, 0, 0

        # Signed age of the file slot being tracked; negative if it lies in
        # the future, which simply lengthens the wait below.
        elapsed = (datetime.now() - finishedUpToTime).total_seconds()
        if elapsed < 60:
            # The slot is less than a minute old: wait until it is two
            # minutes old before looking for its file.
            waitTime = 120 - elapsed
            time.sleep(waitTime)
            time_slept += waitTime
            continue

        filePath = DataProcessor.getInputPath(finishedUpToTime)
        if not os.path.exists(filePath):
            if elapsed < 120:
                # Difference is less than two minutes: give the file time
                # to show up.
                waitTime = 120 - elapsed
                time.sleep(waitTime)
                time_slept += waitTime
            else:
                # Difference is more than two minutes: move on to the next
                # slot.
                finishedUpToTime += TIME_DELTA_MINIMAL
            continue

        # Now we have a file
        with open(filePath, 'r') as fileRead:
            for line in fileRead:
                try:
                    jsonVersion = json.loads(line)
                except ValueError as e:
                    helper_utils.std_flush(
                        "[%s] -- WARNING -- Possible warning for %s file for %s with error %s"
                        % (helper_utils.readable_time(), filePath, importkey,
                           str(e)))
                    continue

                if "timestamp_ms" not in jsonVersion:
                    jsonVersion["timestamp_ms"] = int(jsonVersion["timestamp"])
                item_time = int(jsonVersion["timestamp_ms"])

                if granularTime > item_time:
                    # skip, already finished this...
                    skip_count += 1
                    continue

                # Have not done this item yet: process and publish it, then
                # persist the checkpoint.
                processed_data = DataProcessor.process(jsonVersion)
                byted = bytes(json.dumps(processed_data), encoding="utf-8")
                kafka_producer.send(kafka_key, byted)
                kafka_producer.flush()
                granularTime = item_time
                r.set(importkey, granularTime)
                process_count += 1

                # Log progress once more than 86400000 ms (one day) of data
                # has been covered since the last progress message.
                if granularTime - prevGranular > 86400000:
                    helper_utils.std_flush(
                        "[%s] -- Finished with %s" %
                        (helper_utils.readable_time(),
                         str(datetime.fromtimestamp(granularTime / 1000.0))))
                    prevGranular = granularTime
        finishedUpToTime += TIME_DELTA_MINIMAL
# StreamerManager now holds one entry, still without instances, for each
# streamer to launch. Launch all of them and go on from there...
for _streamer_ in StreamerManager:
    _entry_ = StreamerManager[_streamer_]
    if _entry_["type"] == "unstructured":
        # A single unstructured streamer: fetch one key and start it.
        _entry_["apikey"] = _entry_["keyserver"].get_key()
        _entry_["instance"] = _entry_["executor"](
            _entry_["keywords"], _entry_["apikey"][1], errorQueue,
            messageQueue)
        _entry_["instance"].start()
        std_flush(
            "Deployed unstructured streamer : %s\tat %s\twith key %s" %
            (_entry_["name"], readable_time(), _entry_["apikey"][0]))
    elif _entry_["type"] == "structured":
        # One streamer per instance key (eventlangtuple), each with its own
        # API key.
        for _instance_ in _entry_["instances"]:
            _slot_ = _entry_["instances"][_instance_]
            _slot_["apikey"] = _entry_["keyserver"].get_key()
            _slot_["instance"] = _entry_["executor"](
                _instance_[0], _instance_[1], _slot_["keywords"],
                _slot_["apikey"][1], errorQueue, messageQueue)
            _slot_["instance"].start()
def main(logdir, importkey, exportkey, processscript, processscriptdir, pidname, debug, seekval):
    """Consume a Kafka topic, run a message processor and re-publish.

    The processor class ``processscript`` is imported from
    ``pipelines.<processscriptdir>.<processscript>`` and instantiated. Every
    message from the import topic is decoded as JSON and handed to
    ``MessageProcessor.process``, which must return a tuple
    ``(forward, message)``. When ``forward`` is truthy the message is sent to
    the export topic. The partition, offset and timestamp of the last handled
    message are stored in redis below ``exportkey`` so a restart resumes
    after it.

    In debug mode no pid file is set up, nothing is published to Kafka and no
    position is written to redis.

    Args:
        logdir: Directory handed to ``helper_utils.setup_pid``.
        importkey: Kafka topic to read; ``:`` is replaced by ``_``.
        exportkey: Kafka topic to write and prefix of the redis keys.
        processscript: Name of the processor module and class.
        processscriptdir: Package below ``pipelines`` with the module.
        pidname: Process name handed to ``helper_utils.setup_pid``.
        debug: Truthy to enable debug mode; ``None`` counts as off.
        seekval: Offset overriding the stored one (debug mode only).

    Raises:
        ValueError: If the processor does not return a tuple.
    """
    if debug is None:
        debug = 0
    if debug:
        helper_utils.std_flush("[%s] -- DEBUG_MODE -- Active" %
                               helper_utils.readable_time())
    if not debug:
        helper_utils.setup_pid(pidname, logdir=logdir)

    # Import processscript
    helper_utils.std_flush("[%s] -- Initializing ASSED-Process %s" %
                           (helper_utils.readable_time(), pidname))
    moduleImport = __import__("pipelines.%s.%s" %
                              (processscriptdir, processscript),
                              fromlist=[processscript])
    processor_class = getattr(moduleImport, processscript)
    if debug:
        MessageProcessor = processor_class(debug=True)
    else:
        MessageProcessor = processor_class()
    helper_utils.std_flush("[%s] -- Imported Module %s" %
                           (helper_utils.readable_time(), processscript))

    kafka_import = importkey.replace(":", "_")
    helper_utils.std_flush("[%s] -- Generated kafka import key %s" %
                           (helper_utils.readable_time(), kafka_import))
    kafka_export = exportkey.replace(":", "_")
    helper_utils.std_flush("[%s] -- Generated kafka export key %s" %
                           (helper_utils.readable_time(), kafka_export))

    pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
    r = redis.Redis(connection_pool=pool)
    helper_utils.std_flush(
        "[%s] -- Connected to redis with ConnectionPool on port 6379" %
        helper_utils.readable_time())

    # Resume after the last handled message: the stored offset is the one
    # already processed, hence the +1.
    seek_partition = r.get(exportkey + ":partition")
    seek_offset = r.get(exportkey + ":offset")
    seek_partition = 0 if seek_partition is None else int(seek_partition)
    seek_offset = 0 if seek_offset is None else int(seek_offset) + 1
    helper_utils.std_flush(
        "[%s] -- Obtained seek partition for kafka at Partition %i -- Offset %i"
        % (helper_utils.readable_time(), seek_partition, seek_offset))

    # replace seek value
    if debug and seekval is not None:
        # Coerce so that a textual value (e.g. from a command line) works
        # with the %i format below and with the consumer seek.
        seek_offset = int(seekval)
        helper_utils.std_flush(
            "[%s] -- DEBUG -- Replaced seek offset for kafka at Partition %i -- Offset %i"
            % (helper_utils.readable_time(), seek_partition, seek_offset))

    kafka_producer = kafka.KafkaProducer()
    helper_utils.std_flush("[%s] -- Generated kafka producer" %
                           helper_utils.readable_time())
    kafka_consumer = kafka.KafkaConsumer()
    helper_utils.std_flush("[%s] -- Generated kafka consumer" %
                           helper_utils.readable_time())
    topic_partition = kafka.TopicPartition(kafka_import, seek_partition)
    kafka_consumer.assign([topic_partition])
    kafka_consumer.seek(topic_partition, seek_offset)
    helper_utils.std_flush("[%s] -- Set kafka consumer seek" %
                           helper_utils.readable_time())

    message_correct_counter = 0
    message_fail_counter = 0
    message_counter = 0

    for message in kafka_consumer:
        item = json.loads(message.value.decode())
        processedMessage = MessageProcessor.process(item)

        if not isinstance(processedMessage, tuple):
            raise ValueError(
                "[%s] -- ERROR -- Invalid type %s for processedMessage. MessageProcessor.process() must return tuple of (bool,message)."
                % (helper_utils.readable_time(), str(type(processedMessage))))

        # Push the message to kafka...if true
        if not processedMessage[0]:
            message_fail_counter += 1
        else:
            if not debug:
                byted = bytes(json.dumps(processedMessage[1]),
                              encoding="utf-8")
                kafka_producer.send(kafka_export, byted)
                kafka_producer.flush()
            message_correct_counter += 1
        message_counter += 1

        if not debug:
            r.set(exportkey + ":partition", message.partition)
            r.set(exportkey + ":offset", message.offset)
            r.set(exportkey + ":timestamp", message.timestamp)

        if message_counter % 1000 == 0:
            helper_utils.std_flush(
                "[%s] -- Processed %i messages with %i failures and %i successes"
                % (helper_utils.readable_time(), message_counter,
                   message_fail_counter, message_correct_counter))
# Load the keyword configuration and create the queues shared with every
# processor.
keywordConfig = load_config(CONSTANTS.TOPIC_CONFIG_PATH)
errorQueue = multiprocessing.Queue()
messageQueue = multiprocessing.Queue()
keyStreamConfig = {}

# Launch one StreamFilesProcessor per (event, language) pair.
for physicalEvent in keywordConfig['topic_names']:
    for language in keywordConfig['topic_names'][physicalEvent]["languages"]:
        eventLangTuple = (physicalEvent, language)
        keyStreamConfig[eventLangTuple] = {
            'name': physicalEvent,
            'lang': language,
            'keywords': keywordConfig['topic_names'][physicalEvent]["languages"][language],
            'postpone': False,
        }
        std_flush(" ".join(["Deploying",str(eventLangTuple), "at", readable_time()]))
        try:
            keyStreamConfig[eventLangTuple]['processor'] = StreamFilesProcessor(
                None,
                keyStreamConfig[eventLangTuple]['keywords'],
                "_".join(eventLangTuple),
                errorQueue,
                messageQueue)
        except RuntimeError:
            # The processor could not be created; remember when that
            # happened so the launch can be retried later.
            std_flush(" ".join([str(eventLangTuple), " does not have files to start. Posponing launch 2 hr at", readable_time()]))
            keyStreamConfig[eventLangTuple]['postpone'] = True
            keyStreamConfig[eventLangTuple]['launchTime'] = datetime.now()
        else:
            keyStreamConfig[eventLangTuple]['processor'].start()

configCheckTimer = time.time()
def main():
    """Periodically rebuild the multi-source event cache and push it to redis.

    Every ``refresh_timer`` seconds the social streamers listed in the
    configuration as well as the TRMM, USGS and News sources are queried,
    the rows are folded into a per-cell score dictionary and the result is
    written to redis:

    * the cell list goes to ``assed:event:detection:multisource:list:<v>``,
    * each cell's scores go to
      ``assed:event:detection:multisource:info:<cell parts>:<v>``,
    * ``assed:event:detection:multisource:listkey`` is then switched to
      ``<v>``.

    ``<v>`` alternates between ``v1`` and ``v2`` so the version named by the
    tracker key is never the one being rewritten. Between rebuilds the
    function sleeps ``sleep_timer`` seconds at a time. It never returns.
    """
    refresh_timer = 7200
    sleep_timer = 300

    # (label, query generator, score weight) of the sources that deliver one
    # value per cell, in the order in which they are queried and cached.
    # Scoring -- Twitter-Social: 0.3 Twitter-HDI - 1 News: 3 USGS: 5 TRMM: 1
    weighted_sources = (
        ("TRMM", generate_trmm_query, 1),
        ("USGS", generate_usgs_query, 5),
        ("News", generate_news_query, 3),
    )

    # list_tracker_key tracks where the data is (either v1 or v2)
    # list_push_key contains the list of cells
    list_tracker_key = "assed:event:detection:multisource:listkey"
    list_push_key = "assed:event:detection:multisource:list"
    list_info_key = "assed:event:detection:multisource:info"

    local_timer = 0
    while True:
        if time.time() - local_timer <= refresh_timer:
            time.sleep(sleep_timer)
            continue

        local_timer = time.time()
        helper_utils.std_flush("[%s] -- Initializing EventDetection" %
                               helper_utils.readable_time())
        cell_cache = {}
        assed_config = file_utils.load_config("./config/assed_config.json")
        helper_utils.std_flush("[%s] -- Obtained DB Connection" %
                               helper_utils.readable_time())

        streamer_results = {}
        weighted_results = {}
        DB_CONN = db_utils.get_db_connection(assed_config)
        try:
            cursor = DB_CONN.cursor()
            try:
                available_streamers = list(assed_config["SocialStreamers"])
                helper_utils.std_flush(
                    "[%s] -- Available streamers: %s" %
                    (helper_utils.readable_time(), str(available_streamers)))
                for _streamer_ in available_streamers:
                    helper_utils.std_flush(
                        "[%s] -- Generating query for: %s" %
                        (helper_utils.readable_time(), _streamer_))
                    _query_ = generate_social_query(_streamer_=_streamer_,
                                                    _topic_="landslide")
                    cursor.execute(_query_)
                    streamer_results[_streamer_] = cursor.fetchall()
                    helper_utils.std_flush(
                        "[%s] -- Obtained results for : %s" %
                        (helper_utils.readable_time(), _streamer_))

                for label, generate_query, _ in weighted_sources:
                    helper_utils.std_flush(
                        "[%s] -- Generating query for: %s" %
                        (helper_utils.readable_time(), label))
                    cursor.execute(generate_query())
                    weighted_results[label] = cursor.fetchall()
                    helper_utils.std_flush(
                        "[%s] -- Obtained resuts for: %s" %
                        (helper_utils.readable_time(), label))
            finally:
                cursor.close()
        finally:
            # A new connection is opened on every refresh; close it so the
            # long-running process does not accumulate open connections.
            DB_CONN.close()

        helper_utils.std_flush(
            "[%s] -- Generating local cache with scoring:\tSocial-ML - 0.3\tSocial-HDI - 1\tNews - 3\tUSGS - 5\tTRMM - 1"
            % helper_utils.readable_time())

        # Social rows carry the cell in column 0, the HDI value in column 1
        # and the ML value in column 2. Each cache entry is a pair
        # (count, score).
        for _streamer_ in streamer_results:
            helper_utils.std_flush(
                "[%s] -- Local caching for %s" %
                (helper_utils.readable_time(), _streamer_))
            for tuple_cell_ in streamer_results[_streamer_]:
                scores = cell_cache.setdefault(tuple_cell_[0], {})
                hdi_value = float(tuple_cell_[1])
                if int(hdi_value) > 0:
                    scores[_streamer_ + "-hdi"] = (int(hdi_value), hdi_value)
                ml_value = float(tuple_cell_[2])
                if int(ml_value / 0.34) > 0:
                    scores[_streamer_ + "-ml"] = (int(ml_value / 0.34),
                                                  ml_value)

        # Weighted rows carry the cell in column 0 and the value in column 1.
        for label, _, weight in weighted_sources:
            helper_utils.std_flush("[%s] -- Local caching for %s" %
                                   (helper_utils.readable_time(), label))
            for tuple_cell_ in weighted_results[label]:
                scores = cell_cache.setdefault(tuple_cell_[0], {})
                scores[label] = (float(tuple_cell_[1]),
                                 float(tuple_cell_[1] * weight))

        helper_utils.std_flush(
            "[%s] -- Local cache score total generation" %
            helper_utils.readable_time())
        for scores in cell_cache.values():
            # Sum before storing so "total" is not part of its own sum.
            total = sum(entry[1] for entry in scores.values())
            scores["total"] = total

        pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
        r = redis.Redis(connection_pool=pool)
        helper_utils.std_flush("[%s] -- Connected to Redis" %
                               helper_utils.readable_time())

        # Write to the version that is NOT named by the tracker key.
        key_version = r.get(list_tracker_key)
        key_version = "v2" if key_version is None else key_version.decode()
        if key_version == 'v1':
            helper_utils.std_flush(
                "[%s] -- v1 key already in effect. Pushing to v2" %
                helper_utils.readable_time())
            push_key = 'v2'
        else:
            helper_utils.std_flush(
                "[%s] -- v2 key already in effect. Pushing to v1" %
                helper_utils.readable_time())
            push_key = 'v1'

        cell_list = list(cell_cache)
        true_list_push_key = list_push_key + ":" + push_key
        helper_utils.std_flush(
            "[%s] -- Deleting existing %s, if any" %
            (helper_utils.readable_time(), true_list_push_key))
        r.delete(true_list_push_key)
        if cell_list:
            # LPUSH needs at least one value; with no cells the list simply
            # stays deleted.
            r.lpush(true_list_push_key, *cell_list)
        helper_utils.std_flush(
            "[%s] -- Pushed cell list to %s" %
            (helper_utils.readable_time(), true_list_push_key))

        helper_utils.std_flush("[%s] -- Pushing individual cell results" %
                               helper_utils.readable_time())
        cell_counter = 0
        for _cell_ in cell_cache:
            cell_push_contents = json.dumps(cell_cache[_cell_])
            # "a_b" becomes the key segment "a:b".
            cell_specific_suffix = ":".join(_cell_.split("_"))
            cell_push_key = ":".join(
                [list_info_key, cell_specific_suffix, push_key])
            r.set(cell_push_key, cell_push_contents)
            if cell_counter == 0:
                helper_utils.std_flush(
                    "[%s] -- First push: %s --- %s" %
                    (helper_utils.readable_time(), cell_push_key,
                     cell_push_contents))
            cell_counter += 1
        helper_utils.std_flush(
            "[%s] -- Completed individual cell pushes with %s cells" %
            (helper_utils.readable_time(), str(cell_counter)))

        # Switch the tracker only after all data has been written.
        r.set(list_tracker_key, push_key)
        helper_utils.std_flush(
            "[%s] -- Setting versioning in %s to %s" %
            (helper_utils.readable_time(), list_tracker_key, push_key))
        helper_utils.std_flush("-------- COMPLETE AT %s ----------\n" %
                               helper_utils.readable_time())
def process(self, message):
    """Match a social message against news events in its cell (HDI check).

    The message is verified with ``self.verify_message``, its cell is
    computed from latitude/longitude and ``HCS_News`` is searched for rows in
    the same cell within a time window around the message timestamp. If at
    least one row exists, the message is inserted into
    ``ASSED_Social_Events`` with source ``hdi`` (skipped in debug mode).

    Args:
        message: Dict with at least ``streamtype``, ``latitude``,
            ``longitude``, ``timestamp``, ``id_str``, ``link``, ``text`` and
            ``location``. A ``cell`` entry is added in place.

    Returns:
        ``(False, message)`` when a matching news row was found and the
        insert did not fail; ``(True, message)`` otherwise, including when
        the insert failed with a non-connection error.

    Raises:
        RuntimeError: If the insert fails with MySQL error 2006 or 2013.
    """
    streamtype = message["streamtype"]
    if streamtype not in self.stream_tracker:
        self.stream_tracker[streamtype] = {
            "hdi": 0,
            "non_hdi": 0,
            "totalcounter": 0,
        }
    tracker = self.stream_tracker[streamtype]
    tracker["totalcounter"] += 1

    if time.time() - self.cursor_timer > self.cursor_refresh:
        # Periodically replace the cursor, then report and reset the
        # counters of every stream type.
        self.cursor.close()
        self.cursor = self.DB_CONN.cursor()
        self.cursor_timer = time.time()
        for _streamtype in self.stream_tracker:
            utils.helper_utils.std_flush(
                "[%s] -- Processed %i elements from %s with %i HDI and %i NONHDI"
                % (helper_utils.readable_time(),
                   self.stream_tracker[_streamtype]["totalcounter"],
                   _streamtype, self.stream_tracker[_streamtype]["hdi"],
                   self.stream_tracker[_streamtype]["non_hdi"]))
            self.stream_tracker[_streamtype]["totalcounter"] = 0
            self.stream_tracker[_streamtype]["non_hdi"] = 0
            self.stream_tracker[_streamtype]["hdi"] = 0

    if self.debug:
        utils.helper_utils.std_flush(
            "Processed %i elements from %s with %i HDI and %i NONHDI" %
            (tracker["totalcounter"], streamtype, tracker["hdi"],
             tracker["non_hdi"]))

    # Check item
    self.verify_message(message)
    message["cell"] = utils.helper_utils.generate_cell(
        float(message["latitude"]), float(message["longitude"]))

    # NOTE(review): _time_ is the timestamp divided by 1000 while the offsets
    # use self.MS_IN_DAYS -- confirm both are in the same unit.
    _time_ = int(int(message["timestamp"]) / 1000)
    _time_minus = self.time_convert(_time_ - 6 * self.MS_IN_DAYS)
    _time_plus = self.time_convert(_time_ + 3 * self.MS_IN_DAYS)
    select_s = 'SELECT location from HCS_News where cell = %s and timestamp > %s and timestamp < %s'
    params = (message["cell"], _time_minus, _time_plus)
    self.cursor.execute(select_s, params)
    results = self.cursor.fetchall()

    if results:
        self.true_counter += 1
        # Push into landslide events...
        insert = (
            'INSERT INTO ASSED_Social_Events ('
            'social_id, cell, latitude, longitude, timestamp, link, text, '
            'location, topic_name, source, valid, streamtype) '
            'VALUES (%s,%s,%s,%s,%s,%s, %s, %s,%s, %s, %s, %s)')
        # NOTE(review): [2:-2] on the bytes repr drops the b' prefix, the
        # closing quote AND the last character of the text -- confirm that
        # this truncation is intended.
        params = (str(message["id_str"]), message["cell"],
                  str(message['latitude']), str(message['longitude']),
                  self.ms_time_convert(message['timestamp']),
                  message["link"],
                  str(message["text"].encode("utf-8"))[2:-2],
                  message["location"], "landslide", "hdi", "1", streamtype)
        try:
            if not self.debug:
                self.cursor.execute(insert, params)
                self.DB_CONN.commit()
            helper_utils.std_flush(
                "[%s] -- Possible landslide event at %s detected at time %s using HDI (current time: %s)"
                % (helper_utils.readable_time(), message["location"],
                   self.ms_time_convert(message["timestamp"]),
                   self.time_convert(time.time())))
            tracker["hdi"] += 1
            return (False, message)
        except mdb._exceptions.Error as mdb_error:
            traceback.print_exc()
            # The error code is the first exception argument; read it
            # directly instead of evaluating the error's string form.
            error_code = mdb_error.args[0] if mdb_error.args else None
            if error_code in (2013, 2006):
                # This is database connection error
                raise RuntimeError(
                    "[%s] -- ERROR -- Cannot connect to MySQL Database. Shutting down."
                    % helper_utils.readable_time())
            helper_utils.std_flush(
                '[%s] -- ERROR -- Failed to insert %s with error %s' %
                (helper_utils.readable_time(), message["id_str"],
                 repr(mdb_error)))

    # No matching HDI (or the insert failed).
    # TODO also perform event detection on other data (just news data
    # (already exists), combination of earthquake AND TRMM (???))
    tracker["non_hdi"] += 1
    return (True, message)
def run(self):
    """Run - Launches the streamer itself.

    Loops forever. At most once per calendar day it queries the custom
    search service for facebook results matching ``self.keywords``
    (restricted to the last day), writes one JSON line per result to
    ``self.output`` and records the time of the last query and the number of
    API accesses in redis. Afterwards it sleeps for four hours.

    Errors while writing results are reported on ``self.errorQueue`` as
    ``("structured", ("facebook", event, lang), message)``.
    """
    timestamp_key = "social:streamer:facebook:%s:%s:timestamp"%(self.event, self.lang)
    count_key = "social:streamer:facebook:%s:%s:count"%(self.event, self.lang)

    while True:
        # Perform work ONLY if it hasn't been performed today...
        previousTimestamp = self.r.get(timestamp_key)
        previousApiAccesses = self.r.get(count_key)
        previousApiAccesses = 0 if previousApiAccesses is None else int(previousApiAccesses)

        if previousTimestamp is None:
            download_due = True
        else:
            # Compare full dates: the day-of-month alone would treat the
            # same day number of a different month as "already done".
            last_download = datetime.fromtimestamp(int(float(previousTimestamp)))
            download_due = last_download.date() != datetime.now().date()

        if download_due:
            self.messageQueue.put("Initiating facebook download of %s-%s at %s"%(self.event, self.lang, readable_time()))
            max_results = 10
            for page_get in range(5):
                start_param = page_get*10 + 1
                if start_param > max_results:
                    # Later pages start even further out, so stop here.
                    break
                # The response is read below for
                # searchInformation.totalResults and, per entry of "items",
                # for "link" and "snippet".
                results = self.service.cse().list(q=self.keywords,cx=self.cx,dateRestrict='d1',siteSearch='www.facebook.com',siteSearchFilter='i', start=start_param).execute()
                currentTimeStamp = time.time()
                self.r.set(timestamp_key, currentTimeStamp)
                previousApiAccesses+=1
                self.r.set(count_key, previousApiAccesses)
                max_results = int(results["searchInformation"]["totalResults"])
                try:
                    if time.time() - self.local_time > self.TIMER:
                        self.path_setup()
                    # A response without results carries no "items" entry;
                    # that is an empty page, not an error.
                    for _item_ in results.get("items", []):
                        _data_ = {
                            # -20 for the varchar limit...
                            "id_str": _item_["link"][-20:],
                            "text": _item_["snippet"],
                            "location": "",
                            "latitude": None,
                            "longitude": None,
                            "streamtype": "facebook",
                            "timestamp_ms": int(time.time()*1000),
                            "link": _item_["link"],
                        }
                        self.output.write(json.dumps(_data_)+"\n")
                except Exception as e:
                    self.errorQueue.put(("structured",("facebook", self.event, self.lang), str(e)))
                self.messageQueue.put("Completed facebook download of %s-%s part %i of 10 at %s"%(self.event, self.lang, page_get+1, readable_time()))
            self.messageQueue.put("Completed facebook download of %s-%s at time %s"%(self.event, self.lang, readable_time()))
        else:
            # we are already done...
            self.messageQueue.put("Facebook download of %s-%s at %s is already complete for day %s"%(self.event, self.lang, readable_time(), str(datetime.now().day)))
        # Perform sleep for four hours until next check.
        time.sleep(14400)
fromlist=[_cfg["source_file"]]) Executor = getattr(moduleImport, _cfg["source_file"]) try: HCS_configuration[hcs_type]['processor'] = Executor( assed_config, root_name=hcs_type, errorQueue=errorQueue, messageQueue=messageQueue, **kwargs) except Exception as e: traceback.print_exc() std_flush("Failed to launch %s with error %s" % (hcs_type, repr(e))) std_flush("Launch complete for ", hcs_type, "HighConfigurationStreamer at ", readable_time()) HCS_configuration[hcs_type]['processor'].start() HCS_configuration[hcs_type]['timestamp'] = time.time() configCheckTimer = time.time() while True: if time.time() - configCheckTimer > CONSTANTS.HCS_CONFIG_TIME_CHECK: configCheckTimer = time.time() std_flush(" ".join(["Checking configuration at", readable_time()])) configReload = load_config(CONSTANTS.HIGH_CONFIDENCE_CONFIG_PATH) configCheckTimer = time.time() # TODO handle config changes... pass # Rerun scheduled ones
def process(self, message):
    """Classify a social message with the ML model and store the result.

    The message text is encoded with ``self.encode`` and passed to
    ``self.model.predict``; the index of the highest output is the
    prediction. Predictions 1 (positive) and 0 (negative) are inserted with
    ``self.db_insert`` using source ``ml`` and validity ``"1"``/``"0"``
    (skipped in debug mode). Any other prediction only triggers a warning
    and nothing is inserted.

    Args:
        message: Dict with at least ``streamtype``, ``text``, ``id_str``,
            ``cell``, ``latitude``, ``longitude``, ``timestamp``, ``link``
            and ``location``.

    Returns:
        Always ``(False, message)``.

    Raises:
        RuntimeError: If the insert fails with MySQL error 2006 or 2013.
    """
    streamtype = message["streamtype"]
    if streamtype not in self.stream_tracker:
        self.stream_tracker[streamtype] = {
            "positive": 0,
            "negative": 0,
            "totalcounter": 0,
        }
    tracker = self.stream_tracker[streamtype]
    tracker["totalcounter"] += 1

    if time.time() - self.cursor_timer > self.cursor_refresh:
        # Periodically replace the cursor, then report and reset all
        # counters.
        self.cursor.close()
        self.cursor = self.DB_CONN.cursor()
        self.cursor_timer = time.time()
        self.total_counter, self.true_counter, self.false_counter = 0, 0, 0
        for _streamtype in self.stream_tracker:
            utils.helper_utils.std_flush(
                "[%s] -- Processed %i elements from %s with %i positive and %i negative"
                % (helper_utils.readable_time(),
                   self.stream_tracker[_streamtype]["totalcounter"],
                   _streamtype,
                   self.stream_tracker[_streamtype]["positive"],
                   self.stream_tracker[_streamtype]["negative"]))
            self.stream_tracker[_streamtype]["totalcounter"] = 0
            self.stream_tracker[_streamtype]["positive"] = 0
            self.stream_tracker[_streamtype]["negative"] = 0

    if self.debug:
        utils.helper_utils.std_flush(
            "Processed %i elements from %s with %i positive and %i negative"
            % (tracker["totalcounter"], streamtype, tracker["positive"],
               tracker["negative"]))

    # Get message text
    # NOTE(review): [2:-2] on the bytes repr drops the b' prefix, the closing
    # quote AND the last character of the text. Kept as is because the model
    # input depends on it -- confirm that this truncation is intended.
    cleaned_message = str(message["text"].encode("utf-8"))[2:-2]
    encoded_message = self.encode(cleaned_message)
    prediction = np.argmax(
        self.model.predict(np.array([encoded_message]))[0])

    if prediction == 1:
        self.true_counter += 1
        valid_flag, outcome = "1", "positive"
    elif prediction == 0:
        self.false_counter += 1
        valid_flag, outcome = "0", "negative"
    else:
        warnings.warn(
            "[%s] -- WARNING -- Prediction value of %i is not one of valid predictions [0, 1]"
            % (helper_utils.readable_time(), prediction))
        # There is nothing to insert for an unknown prediction; running the
        # insert without parameters would only produce a database error.
        return (False, message)

    params = (message["id_str"], message["cell"], str(message['latitude']),
              str(message['longitude']),
              self.ms_time_convert(message['timestamp']), message["link"],
              cleaned_message, message["location"], "landslide", "ml",
              valid_flag, streamtype)
    tracker[outcome] += 1

    try:
        if not self.debug:
            self.cursor.execute(self.db_insert, params)
            self.DB_CONN.commit()
    except mdb._exceptions.Error as mdb_error:
        traceback.print_exc()
        # The error code is the first exception argument; read it directly
        # instead of evaluating the error's string form.
        error_code = mdb_error.args[0] if mdb_error.args else None
        if error_code in (2013, 2006):
            # This is database connection error
            raise RuntimeError(
                "[%s] -- ERROR -- Cannot connect to MySQL Database. Shutting down"
                % helper_utils.readable_time())
        helper_utils.std_flush(
            '[%s] -- ERROR -- Failed to insert %s with error %s' %
            (helper_utils.readable_time(), message["id_str"],
             repr(mdb_error)))
        return (False, message)

    self.total_counter += 1
    return (False, message)