Example #1
    def run(self):
        try:
            url_list = [
                'https://trmm.gsfc.nasa.gov/trmm_rain/Events/latest_1_day_landslide.html',
                'https://trmm.gsfc.nasa.gov/trmm_rain/Events/latest_3_day_landslide.html',
                'https://trmm.gsfc.nasa.gov/trmm_rain/Events/latest_7_day_landslide.html'
            ]

            for trmm_url in url_list:
                self.messageQueue.put("Obtained TRMM url: %s" % trmm_url)
                try:
                    response = requests.get(trmm_url)
                except Exception as e:
                    self.messageQueue.put("TRMM URL %s failed with error: %s" %
                                          (trmm_url, repr(e)))
                    continue
                html = response.text
                trmm_items, trmm_locations = self.getTRMMItems(html)

                self.insertTRMM(trmm_items)
                self.updateRedisLocations(trmm_locations)

            self.DB_CONN.close()
            self.messageQueue.put("Completed TRMM successfully at %s." %
                                  readable_time())
        except Exception as e:
            traceback.print_exc()
            self.errorQueue.put((self.root_name, str(e)))
Example #2
    def run(self):
        """Run - Launches the sreamer itself.

        """
        try:
            self.stream.filter(track=self.keywords)
            self.messageQueue.put(" ".join([
                "Running unstructured streamer", "with PID",
                str(os.getpid()), "at",
                readable_time()
            ]))
        except Exception as e:
            self.messageQueue.put(" ".join([
                "Crashed unstructured stream", "at",
                readable_time(), "with error",
                str(e)
            ]))
            self.errorQueue.put(('unstructured', ("twitter", ), str(e)))
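
All of these run() workers report progress on messageQueue and fatal conditions on errorQueue (both multiprocessing.Queue objects created by the launcher code later in this collection). A minimal sketch of a supervisor loop that drains both queues is shown below; the polling interval and the print-based logging are assumptions, not part of the original code.

import queue

def drain_worker_queues(messageQueue, errorQueue, poll_interval=1.0):
    """Hypothetical supervisor: print progress messages and surface worker errors."""
    while True:
        try:
            print(messageQueue.get(timeout=poll_interval))
        except queue.Empty:
            pass
        try:
            error = errorQueue.get_nowait()
        except queue.Empty:
            continue
        # Error tuples vary in arity across workers (compare Examples #1 and #2),
        # so only the first element (source name) and last element (error text) are used.
        print("Worker %s reported: %s" % (error[0], error[-1]))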
Example #3
    def run(self):

        try:
            for event_topic in self.config["topic_names"]:
                topic_config = self.config["topic_names"][event_topic]
                if not topic_config["high_confidence"]["valid"]:
                    continue
                self.messageQueue.put("News downloader - working on %s" %
                                      event_topic)
                event_topic_key = str(topic_config["index"])
                self.cached_list = self.getCachedList(event_topic_key)
                stopwords = topic_config["stopwords"]
                keyword_set = topic_config["high_confidence"]["keywords"]
                articles = []
                for keyword in keyword_set:
                    try:
                        response = self.client.get_everything(q=keyword,
                                                              page_size=100)
                        articles += response["articles"]
                    except Exception as e:
                        self.messageQueue.put(
                            "NewsAPI for %s-%s failed with error: %s" %
                            (event_topic, keyword, repr(e)))

                article_content, article_location = self.getArticleDetails(
                    articles, stopwords)

                self.insertNews(article_content, event_topic_key)
                self.updateRedisLocations(article_location)

            self.DB_CONN.close()
            self.messageQueue.put(
                "Completed News download successfully at %s." %
                readable_time())

        except Exception as e:
            traceback.print_exc()
            self.errorQueue.put((self.root_name, str(e)))
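
The downloader above assumes a nested configuration under "topic_names". A purely illustrative sketch of the shape it expects (key names are taken from the code above; the values are made up):

# Illustrative configuration shape only; values are invented for this sketch.
config = {
    "topic_names": {
        "landslide": {
            "index": 1,
            "stopwords": ["election", "victory"],
            "high_confidence": {
                "valid": True,
                "keywords": ["landslide", "mudslide"],
            },
        },
    },
}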
Example #4
    def run(self):
        try:
            url_list = [
                'http://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/2.5_month.geojson'
            ]

            for usgs_url in url_list:
                self.messageQueue.put("Obtaining USGS url: %s" % usgs_url)
                try:
                    response = requests.get(usgs_url)
                except Exception as e:
                    self.messageQueue.put("USGS URL %s failed with error: %s" %
                                          (usgs_url, repr(e)))
                    continue
                jsonData = response.json()
                usgs_items, usgs_locations = self.getUSGSItems(jsonData)
                self.insertUSGS(usgs_items)
                self.updateRedisLocations(usgs_locations)

            self.DB_CONN.close()
            self.messageQueue.put("Completed USGS successfully at %s." %
                                  readable_time())
        except Exception as e:
            traceback.print_exc()
            self.errorQueue.put((self.root_name, str(e)))
Example #5
def main(logdir, importkey, exportkey, dataprocessor, dataprocessorscriptdir,
         pidname):
    TOP_OF_FILE_START = True
    pid_name = pidname
    helper_utils.setup_pid(pid_name, logdir=logdir)

    # Import processscript
    helper_utils.std_flush("[%s] -- Initializing ASSED-input-buffer %s" %
                           (helper_utils.readable_time(), pidname))
    moduleImport = __import__("pipelines.%s.%s" %
                              (dataprocessorscriptdir, dataprocessor),
                              fromlist=[dataprocessor])
    DataProcessor = getattr(moduleImport, dataprocessor)
    DataProcessor = DataProcessor()
    helper_utils.std_flush("[%s] -- Imported Data processor %s" %
                           (helper_utils.readable_time(), dataprocessor))

    # Set up connections
    pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
    r = redis.Redis(connection_pool=pool)
    kafka_key = exportkey.replace(":", "_")
    kafka_producer = kafka.KafkaProducer()

    message_refresh = 7200
    skip_count = 0
    process_count = 0
    time_slept = 0
    message_timer = time.time()

    # Get earliest file to parse...
    helper_utils.std_flush("[%s] -- Searching for files" %
                           helper_utils.readable_time())
    finishedUpToTime = r.get(importkey)
    granularTime = 0
    if finishedUpToTime is None:
        finishedUpToTime = 0
    else:
        finishedUpToTime = int(finishedUpToTime.decode())

    if finishedUpToTime == 0:
        # TODO CHANGE TO 7 days after setup is complete...
        helper_utils.std_flush(
            "[%s] -- No value for previous stop. Starting from 7 days prior" %
            helper_utils.readable_time())
        currentTime = datetime.now() - timedelta(days=7)
        foundFlag = 0
        while foundFlag == 0:
            filePath = DataProcessor.getInputPath(currentTime)
            if os.path.exists(filePath):
                # We found the most recent file, and increment our counter
                finishedUpToTime = currentTime
                foundFlag = 1
            else:
                # Give up if we scan all the way up to the present without finding a file
                currentTime += TIME_DELTA_MINIMAL
                timeDeltaOutputStream = (datetime.now() - currentTime)
                if timeDeltaOutputStream.days == 0 and timeDeltaOutputStream.seconds <= 1:
                    foundFlag = -1
    else:
        # I.e. if we already have a timestamp from a previous execution, read files starting a minute behind it and catch up to the granular time
        helper_utils.std_flush(
            "[%s] -- Starting File tracking at %s" %
            (helper_utils.readable_time(),
             str(datetime.fromtimestamp(finishedUpToTime / 1000.0))))
        granularTime = finishedUpToTime
        finishedUpToTime = datetime.fromtimestamp(
            granularTime / 1000.0) - timedelta(seconds=60)
        TOP_OF_FILE_START = False
    if TOP_OF_FILE_START:
        # Otherwise, we start from the beginning of the 'first' file...
        finishedUpToTime -= timedelta(seconds=finishedUpToTime.second)
        granularTime = 0

    prevGranular = granularTime

    helper_utils.std_flush("[%s] -- Starting Stream Tracking for %s" %
                           (helper_utils.readable_time(), importkey))
    while True:
        if time.time() - message_timer > message_refresh:
            message_timer = time.time()
            helper_utils.std_flush(
                "[%s] -- Processed %i items, with %i items skipped and %i seconds slept in the last %i seconds"
                % (helper_utils.readable_time(), process_count, skip_count,
                   time_slept, message_refresh))
            process_count, skip_count, time_slept = 0, 0, 0

        if (datetime.now() - finishedUpToTime).total_seconds() < 60:
            waitTime = 120 - (datetime.now() - finishedUpToTime).seconds
            time.sleep(waitTime)
            time_slept += waitTime
        else:
            filePath = DataProcessor.getInputPath(finishedUpToTime)
            if not os.path.exists(filePath):
                waitTime = (datetime.now() - finishedUpToTime).total_seconds()
                # Difference is less than two minutes
                if waitTime < 120:
                    waitTime = 120 - waitTime
                    time.sleep(waitTime)
                    time_slept += waitTime
                else:
                    # Difference is more than two minutes - advance the file pointer by one minute
                    finishedUpToTime += TIME_DELTA_MINIMAL
            # Now we have the file
            else:
                with open(filePath, 'r') as fileRead:
                    for line in fileRead:
                        try:
                            jsonVersion = json.loads(line)
                        except ValueError as e:
                            helper_utils.std_flush(
                                "[%s] -- WARNING -- Possible warning for %s file for %s with error %s"
                                % (helper_utils.readable_time(), filePath,
                                   importkey, str(e)))
                            continue

                        if "timestamp_ms" not in jsonVersion:
                            jsonVersion["timestamp_ms"] = int(
                                jsonVersion["timestamp"])

                        if granularTime > int(jsonVersion["timestamp_ms"]):
                            # skip already finished this...
                            skip_count += 1
                            continue

                        else:
                            # Have not done this item yet...
                            # process
                            processed_data = DataProcessor.process(jsonVersion)
                            byted = bytes(json.dumps(processed_data),
                                          encoding="utf-8")
                            kafka_producer.send(kafka_key, byted)
                            kafka_producer.flush()

                            granularTime = int(jsonVersion["timestamp_ms"])
                            r.set(importkey, granularTime)
                            process_count += 1
                            if granularTime - prevGranular > 86400000:
                                helper_utils.std_flush(
                                    "[%s] -- Finished with %s" %
                                    (helper_utils.readable_time(),
                                     str(
                                         datetime.fromtimestamp(
                                             granularTime / 1000.0))))
                                prevGranular = granularTime
                finishedUpToTime += TIME_DELTA_MINIMAL
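
The input buffer above touches the imported DataProcessor through only two methods: getInputPath(), which maps a datetime to the minute-file that should be read, and process(), which transforms each JSON record before it is sent to Kafka. A minimal stand-in satisfying that interface is sketched below; the directory layout and the retained fields are assumptions.

import os

class MinuteFileProcessor(object):
    """Hypothetical DataProcessor: one newline-delimited JSON file per minute."""

    def getInputPath(self, timestamp):
        # Assumed layout: downloads/YYYY/MM/DD/HH_MM.json (timestamp is a datetime)
        return os.path.join("downloads", timestamp.strftime("%Y/%m/%d/%H_%M.json"))

    def process(self, record):
        # Keep only the fields assumed to matter downstream; pass timestamps through.
        return {"timestamp_ms": record["timestamp_ms"], "text": record.get("text", "")}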
Example #6
# Now we have a StreamerManager with empty instances for each streamer we are going to launch.
# We will launch all of them, and go on from there...
for _streamer_ in StreamerManager:
    streamer = StreamerManager[_streamer_]
    if streamer["type"] == "unstructured":
        # Launch the single unstructured streamer...
        streamer["apikey"] = streamer["keyserver"].get_key()
        streamer["instance"] = streamer["executor"](streamer["keywords"],
                                                    streamer["apikey"][1],
                                                    errorQueue, messageQueue)
        streamer["instance"].start()
        std_flush("Deployed unstructured streamer : %s\tat %s\twith key %s" %
                  (streamer["name"], readable_time(), streamer["apikey"][0]))
    elif streamer["type"] == "structured":
        # Launch each instance (event-language tuple)...
        for _instance_ in streamer["instances"]:
            instance = streamer["instances"][_instance_]
            instance["apikey"] = streamer["keyserver"].get_key()
            instance["instance"] = streamer["executor"](_instance_[0],
                                                        _instance_[1],
                                                        instance["keywords"],
                                                        instance["apikey"][1],
                                                        errorQueue,
                                                        messageQueue)
            instance["instance"].start()
Example #7
def main(logdir, importkey, exportkey, processscript, processscriptdir,
         pidname, debug, seekval):
    if debug is None:
        debug = 0
    if debug:
        helper_utils.std_flush("[%s] -- DEBUG_MODE -- Active" %
                               helper_utils.readable_time())
    pid_name = pidname
    if not debug:
        helper_utils.setup_pid(pid_name, logdir=logdir)

    # Import processscript
    helper_utils.std_flush("[%s] -- Initializing ASSED-Process %s" %
                           (helper_utils.readable_time(), pidname))
    moduleImport = __import__("pipelines.%s.%s" %
                              (processscriptdir, processscript),
                              fromlist=[processscript])
    MessageProcessor = getattr(moduleImport, processscript)
    if debug:
        MessageProcessor = MessageProcessor(debug=True)
    else:
        MessageProcessor = MessageProcessor()
    helper_utils.std_flush("[%s] -- Imported Module %s" %
                           (helper_utils.readable_time(), processscript))

    kafka_import = importkey.replace(":", "_")
    helper_utils.std_flush("[%s] -- Generated kafka import key %s" %
                           (helper_utils.readable_time(), kafka_import))
    kafka_export = exportkey.replace(":", "_")
    helper_utils.std_flush("[%s] -- Generated kafka export key %s" %
                           (helper_utils.readable_time(), kafka_export))
    pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
    r = redis.Redis(connection_pool=pool)
    helper_utils.std_flush(
        "[%s] -- Connected to redis with ConnectionPool on port 6379" %
        helper_utils.readable_time())

    seek_partition = r.get(exportkey + ":partition")
    seek_offset = r.get(exportkey + ":offset")
    seek_partition = 0 if seek_partition is None else int(seek_partition)
    seek_offset = 0 if seek_offset is None else int(seek_offset) + 1
    helper_utils.std_flush(
        "[%s] -- Obtained seek partition for kafka at Partition %i -- Offset %i"
        % (helper_utils.readable_time(), seek_partition, seek_offset))

    # replace seek value
    if debug:
        if seekval is not None:
            seek_offset = seekval
            helper_utils.std_flush(
                "[%s] -- DEBUG -- Replaced seek offset for kafka at Partition %i -- Offset %i"
                % (helper_utils.readable_time(), seek_partition, seek_offset))

    kafka_producer = kafka.KafkaProducer()
    helper_utils.std_flush("[%s] -- Generated kafka producer" %
                           helper_utils.readable_time())
    kafka_consumer = kafka.KafkaConsumer()
    helper_utils.std_flush("[%s] -- Generated kafka consumer" %
                           helper_utils.readable_time())
    TopicPartition = kafka.TopicPartition(kafka_import, seek_partition)
    kafka_consumer.assign([TopicPartition])
    kafka_consumer.seek(TopicPartition, seek_offset)
    helper_utils.std_flush("[%s] -- Set kafka consumer seek" %
                           helper_utils.readable_time())

    message_correct_counter = 0
    message_fail_counter = 0
    message_counter = 0

    for message in kafka_consumer:
        item = json.loads(message.value.decode())
        processedMessage = MessageProcessor.process(item)
        # Push the message to kafka...if true
        if not isinstance(processedMessage, tuple):
            raise ValueError(
                "[%s] -- ERROR -- Invalid type %s for processedMessage. MessageProcessor.process() must return tuple of (bool,message)."
                % (helper_utils.readable_time(), str(type(processedMessage))))
        if not processedMessage[0]:
            message_fail_counter += 1
        else:
            if not debug:
                byted = bytes(json.dumps(processedMessage[1]),
                              encoding="utf-8")
                kafka_producer.send(kafka_export, byted)
                kafka_producer.flush()
            message_correct_counter += 1
        message_counter += 1

        if not debug:
            r.set(exportkey + ":partition", message.partition)
            r.set(exportkey + ":offset", message.offset)
            r.set(exportkey + ":timestamp", message.timestamp)

        if message_counter % 1000 == 0:
            helper_utils.std_flush(
                "[%s] -- Processed %i messages with %i failures and %i successes"
                % (helper_utils.readable_time(), message_counter,
                   message_fail_counter, message_correct_counter))
Example #8
    # Load the keywords
    keywordConfig = load_config(CONSTANTS.TOPIC_CONFIG_PATH)
    errorQueue = multiprocessing.Queue()
    messageQueue = multiprocessing.Queue()

    keyStreamConfig = {}
    # for each keyword-lang pair type, launch a StreamFilesProcessor
    for physicalEvent in keywordConfig['topic_names'].keys():
        for language in keywordConfig['topic_names'][physicalEvent]["languages"]:
            eventLangTuple = (physicalEvent,language)
            keyStreamConfig[eventLangTuple] = {}
            keyStreamConfig[eventLangTuple]['name'] = physicalEvent
            keyStreamConfig[eventLangTuple]['lang'] = language
            keyStreamConfig[eventLangTuple]['keywords'] = keywordConfig['topic_names'][physicalEvent]["languages"][language]
            keyStreamConfig[eventLangTuple]['postpone'] = False
            std_flush(" ".join(["Deploying",str(eventLangTuple), "at", readable_time()]))
            try:
                keyStreamConfig[eventLangTuple]['processor'] = StreamFilesProcessor(  None, 
                                                                                keyStreamConfig[eventLangTuple]['keywords'], 
                                                                                "_".join([eventLangTuple[0],eventLangTuple[1]]), 
                                                                                errorQueue,
                                                                                messageQueue)
            except RuntimeError:
                std_flush(" ".join([str(eventLangTuple), " does not have files to start. Posponing launch 2 hr at", readable_time()]))
                keyStreamConfig[eventLangTuple]['postpone'] = True
            keyStreamConfig[eventLangTuple]['launchTime'] = datetime.now()

            if not keyStreamConfig[eventLangTuple]['postpone']:
                keyStreamConfig[eventLangTuple]['processor'].start()

    configCheckTimer = time.time()
Example #9
def main():

    local_timer = 0
    refresh_timer = 7200
    sleep_timer = 300
    while True:
        if time.time() - local_timer > refresh_timer:

            local_timer = time.time()

            helper_utils.std_flush("[%s] -- Initializing EventDetection" %
                                   helper_utils.readable_time())
            cell_cache = {}

            assed_config = file_utils.load_config("./config/assed_config.json")

            helper_utils.std_flush("[%s] -- Obtained DB Connection" %
                                   helper_utils.readable_time())
            DB_CONN = db_utils.get_db_connection(assed_config)
            cursor = DB_CONN.cursor()

            available_streamers = [
                item for item in assed_config["SocialStreamers"]
            ]
            streamer_results = {}
            helper_utils.std_flush(
                "[%s] -- Available streamers: %s" %
                (helper_utils.readable_time(), str(available_streamers)))

            for _streamer_ in available_streamers:
                helper_utils.std_flush(
                    "[%s] -- Generating query for: %s" %
                    (helper_utils.readable_time(), _streamer_))
                _query_ = generate_social_query(_streamer_=_streamer_,
                                                _topic_="landslide")
                cursor.execute(_query_)
                streamer_results[_streamer_] = cursor.fetchall()
                helper_utils.std_flush(
                    "[%s] -- Obtained results for : %s" %
                    (helper_utils.readable_time(), _streamer_))

            helper_utils.std_flush("[%s] -- Generating query for: %s" %
                                   (helper_utils.readable_time(), "TRMM"))
            _query_ = generate_trmm_query()
            cursor.execute(_query_)
            trmm_results = cursor.fetchall()
            helper_utils.std_flush("[%s] -- Obtained resuts for: %s" %
                                   (helper_utils.readable_time(), "TRMM"))

            helper_utils.std_flush("[%s] -- Generating query for: %s" %
                                   (helper_utils.readable_time(), "USGS"))
            _query_ = generate_usgs_query()
            cursor.execute(_query_)
            usgs_results = cursor.fetchall()
            helper_utils.std_flush("[%s] -- Obtained resuts for: %s" %
                                   (helper_utils.readable_time(), "USGS"))

            helper_utils.std_flush("[%s] -- Generating query for: %s" %
                                   (helper_utils.readable_time(), "News"))
            _query_ = generate_news_query()
            cursor.execute(_query_)
            news_results = cursor.fetchall()
            helper_utils.std_flush("[%s] -- Obtained resuts for: %s" %
                                   (helper_utils.readable_time(), "News"))
            cursor.close()

            helper_utils.std_flush(
                "[%s] -- Generating local cache with scoring:\tSocial-ML - 0.3\tSocial-HDI - 1\tNews - 3\tUSGS - 5\tTRMM - 1"
                % helper_utils.readable_time())
            # Scoring -- Twitter-Social: 0.3    Twitter-HDI - 1     News:       3       USGS:   5       TRMM:   1
            for _streamer_ in streamer_results:
                helper_utils.std_flush(
                    "[%s] -- Local caching for %s" %
                    (helper_utils.readable_time(), _streamer_))
                for tuple_cell_ in streamer_results[_streamer_]:
                    _cell_ = tuple_cell_[0]
                    if _cell_ not in cell_cache:
                        cell_cache[_cell_] = {}
                    if int(float(tuple_cell_[1])) > 0:
                        cell_cache[_cell_][_streamer_ + "-hdi"] = (int(
                            float(tuple_cell_[1])), float(tuple_cell_[1]))
                    if int(float(tuple_cell_[2]) / 0.34) > 0:
                        cell_cache[_cell_][_streamer_ + "-ml"] = (int(
                            float(tuple_cell_[2]) / 0.34), float(
                                tuple_cell_[2]))

            helper_utils.std_flush("[%s] -- Local caching for %s" %
                                   (helper_utils.readable_time(), "TRMM"))
            for tuple_cell_ in trmm_results:
                _cell_ = tuple_cell_[0]
                if _cell_ not in cell_cache:
                    cell_cache[_cell_] = {}
                cell_cache[_cell_]["TRMM"] = (float(tuple_cell_[1]),
                                              float(tuple_cell_[1] * 1)
                                              )  # 1 <-- TRMM score

            helper_utils.std_flush("[%s] -- Local caching for %s" %
                                   (helper_utils.readable_time(), "USGS"))
            for tuple_cell_ in usgs_results:
                _cell_ = tuple_cell_[0]
                if _cell_ not in cell_cache:
                    cell_cache[_cell_] = {}
                cell_cache[_cell_]["USGS"] = (float(tuple_cell_[1]),
                                              float(tuple_cell_[1] * 5))

            helper_utils.std_flush("[%s] -- Local caching for %s" %
                                   (helper_utils.readable_time(), "News"))
            for tuple_cell_ in news_results:
                _cell_ = tuple_cell_[0]
                if _cell_ not in cell_cache:
                    cell_cache[_cell_] = {}
                cell_cache[_cell_]["News"] = (float(tuple_cell_[1]),
                                              float(tuple_cell_[1] * 3))

            helper_utils.std_flush(
                "[%s] -- Local cache score total generation" %
                helper_utils.readable_time())
            for _cell_ in cell_cache:
                cell_cache[_cell_]["total"] = sum([
                    cell_cache[_cell_][item][1] for item in cell_cache[_cell_]
                ])

            pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
            r = redis.Redis(connection_pool=pool)
            helper_utils.std_flush("[%s] -- Connected to Redis" %
                                   helper_utils.readable_time())

            # Double-buffered key push: data lives under either a v1 or v2 suffix
            # list_tracker_key tracks which version currently holds the data (v1 or v2)
            # list_push_key holds the list of cells
            # list_info_key prefixes the per-cell score payloads
            list_tracker_key = "assed:event:detection:multisource:listkey"
            list_push_key = "assed:event:detection:multisource:list"
            list_info_key = "assed:event:detection:multisource:info"
            key_version = r.get(list_tracker_key)
            if key_version is None:
                key_version = "v2"
            else:
                key_version = key_version.decode()
            push_key = 'v1'
            if key_version == 'v1':
                helper_utils.std_flush(
                    "[%s] -- v1 key already in effect. Pushing to v2" %
                    helper_utils.readable_time())
                push_key = 'v2'
            else:
                helper_utils.std_flush(
                    "[%s] -- v2 key already in effect. Pushing to v1" %
                    helper_utils.readable_time())

            cell_list = [item for item in cell_cache]
            true_list_push_key = list_push_key + ":" + push_key
            helper_utils.std_flush(
                "[%s] -- Deleting existing %s, if any" %
                (helper_utils.readable_time(), true_list_push_key))
            r.delete(true_list_push_key)

            r.lpush(true_list_push_key, *cell_list)
            helper_utils.std_flush(
                "[%s] -- Pushed cell list to %s" %
                (helper_utils.readable_time(), true_list_push_key))

            helper_utils.std_flush("[%s] -- Pushing individual cell results" %
                                   helper_utils.readable_time())
            cell_counter = 0
            for _cell_ in cell_cache:
                cell_push_contents = json.dumps(cell_cache[_cell_])
                cell_specific_suffix = ":".join(_cell_.split("_"))
                cell_push_key = ":".join(
                    [list_info_key, cell_specific_suffix, push_key])
                r.set(cell_push_key, cell_push_contents)
                if cell_counter == 0:
                    helper_utils.std_flush("[%s] -- First push: %s --- %s" %
                                           (helper_utils.readable_time(),
                                            cell_push_key, cell_push_contents))
                cell_counter += 1

            helper_utils.std_flush(
                "[%s] -- Completed individual cell pushes with %s cells" %
                (helper_utils.readable_time(), str(cell_counter)))

            r.set(list_tracker_key, push_key)
            helper_utils.std_flush(
                "[%s] -- Setting versioning in %s to %s" %
                (helper_utils.readable_time(), list_tracker_key, push_key))

            helper_utils.std_flush("--------   COMPLETE AT  %s ----------\n" %
                                   helper_utils.readable_time())
        else:
            #helper_utils.std_flush("Sleeping for %s"%sleep_timer)
            time.sleep(sleep_timer)
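
Because the writer alternates between the v1 and v2 suffixes and only updates list_tracker_key after a complete push, a reader can always fetch a consistent snapshot. A minimal reader sketch that mirrors the key construction above (the connection parameters and the v1 fallback are assumptions):

import json
import redis

def read_current_cells():
    r = redis.Redis(host="localhost", port=6379, db=0)
    # The tracker key names whichever version was written last ("v1" or "v2").
    version = r.get("assed:event:detection:multisource:listkey")
    version = "v1" if version is None else version.decode()
    cells = [c.decode() for c in r.lrange("assed:event:detection:multisource:list:" + version, 0, -1)]
    results = {}
    for cell in cells:
        info_key = ":".join(["assed:event:detection:multisource:info", ":".join(cell.split("_")), version])
        raw = r.get(info_key)
        if raw is not None:
            results[cell] = json.loads(raw)
    return results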
Example #10
    def process(self, message):
        if message["streamtype"] not in self.stream_tracker:
            self.stream_tracker[message["streamtype"]] = {}
            self.stream_tracker[message["streamtype"]]["hdi"] = 0
            self.stream_tracker[message["streamtype"]]["non_hdi"] = 0
            self.stream_tracker[message["streamtype"]]["totalcounter"] = 0
        self.stream_tracker[message["streamtype"]]["totalcounter"] += 1

        if time.time() - self.cursor_timer > self.cursor_refresh:
            self.cursor.close()
            self.cursor = self.DB_CONN.cursor()
            self.cursor_timer = time.time()
            for _streamtype in self.stream_tracker:
                utils.helper_utils.std_flush(
                    "[%s] -- Processed %i elements from %s with %i HDI  and %i NONHDI"
                    % (helper_utils.readable_time(),
                       self.stream_tracker[_streamtype]["totalcounter"],
                       _streamtype, self.stream_tracker[_streamtype]["hdi"],
                       self.stream_tracker[_streamtype]["non_hdi"]))
                self.stream_tracker[_streamtype]["totalcounter"] = 0
                self.stream_tracker[_streamtype]["non_hdi"] = 0
                self.stream_tracker[_streamtype]["hdi"] = 0
        if self.debug:
            utils.helper_utils.std_flush(
                "Processed %i elements from %s with %i HDI and %i NONHDI" %
                (self.stream_tracker[message["streamtype"]]["totalcounter"],
                 message["streamtype"],
                 self.stream_tracker[message["streamtype"]]["hdi"],
                 self.stream_tracker[message["streamtype"]]["non_hdi"]))
        # Check the incoming item and attach its grid cell
        self.verify_message(message)
        message["cell"] = utils.helper_utils.generate_cell(
            float(message["latitude"]), float(message["longitude"]))
        _time_ = int(int(message["timestamp"]) / 1000)
        _time_minus = self.time_convert(_time_ - 6 * self.MS_IN_DAYS)
        _time_plus = self.time_convert(_time_ + 3 * self.MS_IN_DAYS)
        select_s = 'SELECT location from HCS_News where cell = %s and timestamp > %s and timestamp < %s'
        params = (message["cell"], _time_minus, _time_plus)
        self.cursor.execute(select_s, params)
        results = self.cursor.fetchall()
        if len(results) > 0:
            #helper_utils.std_flush("True Event found for %s"%str(message["text"].encode("utf-8"))[2:-2])
            self.true_counter += 1
            # Push into landslide events...
            insert = ('INSERT INTO ASSED_Social_Events '
                      '(social_id, cell, latitude, longitude, timestamp, link, text, '
                      'location, topic_name, source, valid, streamtype) '
                      'VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)')
            params = (str(message["id_str"]), message["cell"],
                      str(message['latitude']), str(message['longitude']),
                      self.ms_time_convert(message['timestamp']), message["link"],
                      str(message["text"].encode("utf-8"))[2:-2],
                      message["location"], "landslide", "hdi", "1",
                      message["streamtype"])

            #helper_utils.std_flush(insert%params)

            try:
                if not self.debug:
                    self.cursor.execute(insert, params)
                    self.DB_CONN.commit()
                else:
                    #helper_utils.std_flush(insert%params)
                    pass
                helper_utils.std_flush(
                    "[%s] -- Possible landslide event at %s detected at time %s using HDI (current time: %s)"
                    % (helper_utils.readable_time(), message["location"],
                       self.ms_time_convert(message["timestamp"]),
                       self.time_convert(time.time())))
                self.stream_tracker[message["streamtype"]]["hdi"] += 1
                return (False, message)
            except mdb._exceptions.Error as mdb_error:
                traceback.print_exc()
                true_mdb_error = eval(str(mdb_error))
                if true_mdb_error[0] == 2013 or true_mdb_error[0] == 2006:
                    # 2013/2006 are MySQL lost-connection / server-gone-away errors
                    raise RuntimeError(
                        "[%s] -- ERROR -- Cannot connect to MySQL Database. Shutting down."
                        % helper_utils.readable_time())
                helper_utils.std_flush(
                    '[%s] -- ERROR -- Failed to insert %s with error %s' %
                    (helper_utils.readable_time(), message["id_str"],
                     repr(mdb_error)))
        else:
            # No matching HDI
            pass
        """
        tODO
        also perform event detection on other data (just news data (already exists), combination of earthquake AND TRMM (???))

        """

        if self.debug:
            #helper_utils.std_flush("No HDI detected for %s - %s - %s"%(str(message["id_str"]),str(message["text"].encode("utf-8"))[2:-2], message["cell"] ))
            pass
        self.stream_tracker[message["streamtype"]]["non_hdi"] += 1
        return (True, message)
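
Everything in the HDI check is keyed on utils.helper_utils.generate_cell(latitude, longitude), and Example #9 later splits that cell id on underscores to build Redis keys. A plausible stand-in consistent with that contract is sketched below; the 0.25-degree grid size is an assumption, not the project's actual resolution.

def generate_cell(latitude, longitude, cell_size=0.25):
    """Hypothetical grid helper: quantize a coordinate pair into a 'lat_lng' cell id."""
    lat_cell = int(latitude // cell_size)
    lng_cell = int(longitude // cell_size)
    return "%d_%d" % (lat_cell, lng_cell)

# Example: generate_cell(14.6, 121.0) returns "58_484".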
Example #11
    def run(self):
        """Run - Launches the sreamer itself.

        """
        while True:
            # Perform work ONLY if it hasn't been performed today...
            time_reset = False
            previousTimestamp = self.r.get("social:streamer:facebook:%s:%s:timestamp"%(self.event, self.lang))
            previousApiAccesses = self.r.get("social:streamer:facebook:%s:%s:count"%(self.event, self.lang))
            if previousApiAccesses is None:
                previousApiAccesses = 0
            else:
                previousApiAccesses = int(previousApiAccesses)
            if previousTimestamp is None:
                previousTimestamp = time.time()
                time_reset = True
            else:
                previousTimestamp = int(float(previousTimestamp))
            if time_reset or datetime.fromtimestamp(previousTimestamp).day != datetime.fromtimestamp(time.time()).day:
                self.messageQueue.put("Initiating facebook download of %s-%s at %s"%(self.event, self.lang, readable_time()))
                max_results = 10
                for page_get in range(5):
                    start_param = page_get*10 + 1
                    if start_param > max_results:
                        continue
                    results = self.service.cse().list(
                        q=self.keywords, cx=self.cx, dateRestrict='d1',
                        siteSearch='www.facebook.com', siteSearchFilter='i',
                        start=start_param).execute()
                    # (Large sample CSE response omitted here; the relevant structure is
                    #  results["searchInformation"]["totalResults"] and results["items"],
                    #  where each item carries "link" and "snippet".)
                    currentTimeStamp = time.time()
                    self.r.set("social:streamer:facebook:%s:%s:timestamp"%(self.event, self.lang), currentTimeStamp)
                    previousApiAccesses+=1
                    self.r.set("social:streamer:facebook:%s:%s:count"%(self.event, self.lang), previousApiAccesses)

                    max_results = int(results["searchInformation"]["totalResults"])

                    

                    try:
                        if time.time() - self.local_time > self.TIMER:
                            self.path_setup()
                        
                        for _item_ in results["items"]:
                            _data_ = {}
                            #-20 for the varchar limit...
                            _data_["id_str"] = _item_["link"][-20:]
                            _data_["text"] = _item_["snippet"]
                            _data_["location"] = ""
                            _data_["latitude"] = None
                            _data_["longitude"] = None
                            _data_["streamtype"] = "facebook"
                            _data_["timestamp_ms"] = int(time.time()*1000)
                            _data_["link"] = _item_["link"]
                            self.output.write(json.dumps(_data_)+"\n")

                    except Exception as e:
                        self.errorQueue.put(("structured",("facebook", self.event, self.lang), str(e)))
                    self.messageQueue.put("Completed facebook download of %s-%s part %i of 10 at %s"%(self.event, self.lang, page_get+1, readable_time()))
                self.messageQueue.put("Completed facebook download of %s-%s at time %s"%(self.event, self.lang, readable_time()))

            else:
                # we already done...
                self.messageQueue.put("Facebook download of %s-%s at %s is already complete for day %s"%(self.event, self.lang, readable_time(), str(datetime.fromtimestamp(time.time()).day)))
                # Perform sleep for four hours until next check.
                time.sleep(14400)
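
The streamer gates its work on a Redis timestamp so that each (event, language) pair downloads at most once per calendar day; note that, as written, only the day-of-month is compared. A condensed sketch of that gate as a standalone helper (the helper name and its use as a separate function are assumptions):

import time
from datetime import datetime

def already_ran_today(r, event, lang):
    """Hypothetical helper mirroring the gate above: True when the stored run
    timestamp falls on the same day-of-month as now."""
    raw = r.get("social:streamer:facebook:%s:%s:timestamp" % (event, lang))
    if raw is None:
        return False
    last_day = datetime.fromtimestamp(int(float(raw))).day
    return last_day == datetime.fromtimestamp(time.time()).day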
Example #12
                                  fromlist=[_cfg["source_file"]])
        Executor = getattr(moduleImport, _cfg["source_file"])

        try:
            HCS_configuration[hcs_type]['processor'] = Executor(
                assed_config,
                root_name=hcs_type,
                errorQueue=errorQueue,
                messageQueue=messageQueue,
                **kwargs)
        except Exception as e:
            traceback.print_exc()
            std_flush("Failed to launch %s with error %s" %
                      (hcs_type, repr(e)))
        std_flush("Launch complete for ", hcs_type,
                  "HighConfigurationStreamer at ", readable_time())
        HCS_configuration[hcs_type]['processor'].start()
        HCS_configuration[hcs_type]['timestamp'] = time.time()

    configCheckTimer = time.time()

    while True:
        if time.time() - configCheckTimer > CONSTANTS.HCS_CONFIG_TIME_CHECK:
            configCheckTimer = time.time()
            std_flush(" ".join(["Checking configuration at", readable_time()]))
            configReload = load_config(CONSTANTS.HIGH_CONFIDENCE_CONFIG_PATH)
            # TODO handle config changes...

        # Rerun scheduled ones
Example #13
    def process(self, message):
        if message["streamtype"] not in self.stream_tracker:
            self.stream_tracker[message["streamtype"]] = {}
            self.stream_tracker[message["streamtype"]]["positive"] = 0
            self.stream_tracker[message["streamtype"]]["negative"] = 0
            self.stream_tracker[message["streamtype"]]["totalcounter"] = 0
        self.stream_tracker[message["streamtype"]]["totalcounter"] += 1

        if time.time() - self.cursor_timer > self.cursor_refresh:
            self.cursor.close()
            self.cursor = self.DB_CONN.cursor()
            self.cursor_timer = time.time()
            #helper_utils.std_flush("TRUE: %i\t\tFALSE: %i out of total of %i"%(self.true_counter, self.false_counter, self.total_counter))
            self.total_counter, self.true_counter, self.false_counter = 0, 0, 0
            for _streamtype in self.stream_tracker:
                utils.helper_utils.std_flush(
                    "[%s] -- Processed %i elements from %s with %i positive and %i negative"
                    % (helper_utils.readable_time(),
                       self.stream_tracker[_streamtype]["totalcounter"],
                       _streamtype, self.stream_tracker[_streamtype]["positive"],
                       self.stream_tracker[_streamtype]["negative"]))
                self.stream_tracker[_streamtype]["totalcounter"] = 0
                self.stream_tracker[_streamtype]["positive"] = 0
                self.stream_tracker[_streamtype]["negative"] = 0
        if self.debug:
            utils.helper_utils.std_flush(
                "Processed %i elements from %s with %i positive and %i negative"
                % (self.stream_tracker[message["streamtype"]]["totalcounter"],
                   message["streamtype"],
                   self.stream_tracker[message["streamtype"]]["positive"],
                   self.stream_tracker[message["streamtype"]]["negative"]))

        # Get message text
        cleaned_message = str(message["text"].encode("utf-8"))[2:-2]
        encoded_message = self.encode(cleaned_message)

        prediction = np.argmax(
            self.model.predict(np.array([encoded_message]))[0])
        params = None
        if prediction == 1:
            # push to db
            self.true_counter += 1
            params = (message["id_str"], message["cell"], str(message['latitude']), \
                    str(message['longitude']), self.ms_time_convert(message['timestamp']), message["link"], str(message["text"].encode("utf-8"))[2:-2], message["location"], "landslide", "ml", "1", message["streamtype"])
            self.stream_tracker[message["streamtype"]]["positive"] += 1
        elif prediction == 0:
            # push to db, with false? push to different db?
            self.false_counter += 1
            params = (message["id_str"], message["cell"], str(message['latitude']), \
                    str(message['longitude']), self.ms_time_convert(message['timestamp']), message["link"], str(message["text"].encode("utf-8"))[2:-2], message["location"], "landslide", "ml", "0", message["streamtype"])
            self.stream_tracker[message["streamtype"]]["negative"] += 1
        else:
            warnings.warn(
                "[%s] -- WARNING -- Prediction value of %i is not one of valid predictions [0, 1]"
                % (helper_utils.readable_time(), prediction))
            # params is still None here, so there is nothing valid to insert.
            return (False, message)
        try:
            if not self.debug:
                self.cursor.execute(self.db_insert, params)
                self.DB_CONN.commit()
            else:
                #helper_utils.std_flush(self.db_insert%params)
                pass
        except mdb._exceptions.Error as mdb_error:
            traceback.print_exc()
            true_mdb_error = eval(str(mdb_error))
            if true_mdb_error[0] == 2013 or true_mdb_error[0] == 2006:
                # 2013/2006 are MySQL lost-connection / server-gone-away errors
                raise RuntimeError(
                    "[%s] -- ERROR -- Cannot connect to MySQL Database. Shutting down"
                    % helper_utils.readable_time())
            helper_utils.std_flush(
                '[%s] -- ERROR -- Failed to insert %s with error %s' %
                (helper_utils.readable_time(), message["id_str"],
                 repr(mdb_error)))
            return (False, message)

        self.total_counter += 1
        return (False, message)
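
The only contract this classifier places on self.model is Keras-style: predict() takes a batch array and returns one score vector per row, and argmax over that vector yields the label (0 or 1). A toy stand-in that satisfies the same contract, using a made-up keyword heuristic purely for illustration:

import numpy as np

class KeywordStubModel(object):
    """Hypothetical stand-in for self.model: returns [score_negative, score_positive] per row."""

    def predict(self, batch):
        scores = []
        for encoded in batch:
            # Pretend any non-zero encoded feature marks the text as landslide-related.
            positive = 1.0 if np.sum(encoded) > 0 else 0.0
            scores.append([1.0 - positive, positive])
        return np.array(scores)

# prediction = np.argmax(KeywordStubModel().predict(np.array([[0, 3, 1]]))[0])  # -> 1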