Example #1
    def createInputBufferScript(self):
        # For each input script type (text, or image, when we get to it...)
        for _bufferscript in self.input_scripts:
            # get the input streams for this...
            for _inputsource in self.inverted_buffer_index[_bufferscript]:
                bufferscriptname = self.config[_bufferscript]["script"]
                bufferlogname = self.config["configuration"]["input-streams"][
                    _inputsource]["name"]
                exportkey = self.config[_bufferscript]["export-key"]
                dataprocessor = self.config["configuration"]["input-streams"][
                    _inputsource]["processor_script"]
                importkey = self.config["configuration"]["input-streams"][
                    _inputsource]["import-key"]


                bufferStr = \
                '''#!/bin/sh
cd {homedir}
if ps up `cat {logdir}/{bufferlogname}.pid ` > /dev/null
then
    printf "{bufferscriptname}.py is already running\\n" >> {logdir}/{bufferlogname}.out
else
    printf "{bufferlogname} is no longer running. Deleting PID file.\\n" >> {logdir}/{bufferlogname}.out
    rm {logdir}/{bufferlogname}.pid >> {logdir}/{bufferlogname}.out
    printf "Deleted file\\n" >> {logdir}/{bufferlogname}.out
    printf "Starting {bufferscriptname}.py\\n" >> {logdir}/{bufferlogname}.out
    nohup ./assed_env/bin/python {assedscript}/{bufferscriptname}.py {logdir} {importkey} {exportkey} {dataprocessor} {dataprocessorscriptdir} {pidname} >> {logdir}/{bufferlogname}.log 2>&1 &
fi'''.format(homedir=self.home_dir,
             logdir=self.log_dir,
             bufferscriptname=bufferscriptname,
             bufferlogname=bufferlogname,
             assedscript=self.assed_sript_dir,
             importkey=importkey,
             exportkey=exportkey,
             dataprocessor=dataprocessor,
             dataprocessorscriptdir=self.script_dir_importname,
             pidname=bufferlogname)

                self.inputBufferScriptFile = os.path.join(
                    self.sh_dir, bufferlogname + ".sh")
                self.writeScript(self.inputBufferScriptFile, bufferStr)
                helper_utils.std_flush(
                    "Generated script for Input Buffer at %s" %
                    self.inputBufferScriptFile)
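
This generator only reads a handful of configuration fields. Below is a minimal sketch of the configuration shape createInputBufferScript() appears to assume; the top-level entry name and stream name are hypothetical, and only the nested field names are taken from the code above.

# Hypothetical configuration shape for createInputBufferScript().
# "input-buffer-twitter" and "twitter-stream" are made-up names; the nested
# keys ("script", "export-key", "name", "import-key", "processor_script")
# are the ones the snippet above actually accesses.
example_config = {
    "input-buffer-twitter": {              # an entry listed in self.input_scripts
        "script": "input_buffer",          # -> {bufferscriptname}.py
        "export-key": "assed:twitter",     # Kafka export key (":" becomes "_")
    },
    "configuration": {
        "input-streams": {
            "twitter-stream": {            # an entry in self.inverted_buffer_index[...]
                "name": "twitter_buffer",  # -> base name for .sh, .pid, .out, .log files
                "import-key": "assed:raw:twitter",
                "processor_script": "TwitterProcessor",
            }
        }
    },
}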
Example #2
    def createProcessScripts(self):
        for _processscript in self.process_scripts:
            scriptname = self.config[_processscript]["script"]
            processname = self.config[_processscript]["name"]
            importkey = self.config[_processscript]["import-key"]
            exportkey = self.config[_processscript]["export-key"]
            bufferStr = \
            '''#!/bin/sh
cd {homedir}
if ps up `cat {logdir}/{processname}.pid ` > /dev/null
then
    printf "{processscriptname}.py is already running\\n" >> {logdir}/{processname}.out
else
    printf "{processname} is no longer running. Deleting PID file.\\n" >> {logdir}/{processname}.out
    rm {logdir}/{processname}.pid >> {logdir}/{processname}.out
    printf "Deleted file\\n" >> {logdir}/{processname}.out
    printf "Starting {processname}.py\\n" >> {logdir}/{processname}.out
    nohup ./assed_env/bin/python {assedscript}/assed_process.py {logdir} {importkey} {exportkey} {processscriptname} {processscriptdir} {pidname} >> {logdir}/{processname}.log 2>&1 &
fi'''.format(homedir=self.home_dir,
             logdir=self.log_dir,
             processscriptname=scriptname,
             processname=processname,
             assedscript=self.assed_sript_dir,
             exportkey=exportkey,
             importkey=importkey,
             processscriptdir=self.script_dir_importname,
             pidname=processname)

            self.inputBufferScriptFile = os.path.join(self.sh_dir,
                                                      scriptname + ".sh")
            self.writeScript(self.inputBufferScriptFile, bufferStr)
            helper_utils.std_flush(
                "Generated script for %s  at %s" %
                (_processscript, self.inputBufferScriptFile))
Example #3
    def initializeKafka(self):
        admin = kafka.admin.KafkaAdminClient()
        for _scriptref in self.input_scripts + self.output_scripts + self.process_scripts:
            kafka_key = self.config[_scriptref]["export-key"].replace(":", "_")
            try:
                admin.create_topics(
                    new_topics=[
                        kafka.admin.NewTopic(name=kafka_key,
                                             num_partitions=1,
                                             replication_factor=1)
                    ],
                    validate_only=False)
                helper_utils.std_flush(
                    "Created %s export key in kafka broker" % kafka_key)
            except kafka.errors.TopicAlreadyExistsError:
                helper_utils.std_flush(
                    "%s export key already exists in Kafka broker" % kafka_key)
Example #4
def main(logdir, importkey, exportkey, dataprocessor, dataprocessorscriptdir,
         pidname):
    TOP_OF_FILE_START = True
    pid_name = pidname
    helper_utils.setup_pid(pid_name, logdir=logdir)

    # Import processscript
    helper_utils.std_flush("[%s] -- Initializing ASSED-input-buffer %s" %
                           (helper_utils.readable_time(), pidname))
    moduleImport = __import__("pipelines.%s.%s" %
                              (dataprocessorscriptdir, dataprocessor),
                              fromlist=[dataprocessor])
    DataProcessor = getattr(moduleImport, dataprocessor)
    DataProcessor = DataProcessor()
    helper_utils.std_flush("[%s] -- Imported Data processor %s" %
                           (helper_utils.readable_time(), dataprocessor))

    # Set up connections
    pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
    r = redis.Redis(connection_pool=pool)
    kafka_key = exportkey.replace(":", "_")
    kafka_producer = kafka.KafkaProducer()

    message_refresh = 7200
    skip_count = 0
    process_count = 0
    time_slept = 0
    message_timer = time.time()

    # Get earliest file to parse...
    helper_utils.std_flush("[%s] -- Searching for files" %
                           helper_utils.readable_time())
    finishedUpToTime = r.get(importkey)
    granularTime = 0
    if finishedUpToTime is None:
        finishedUpToTime = 0
    else:
        finishedUpToTime = int(finishedUpToTime.decode())

    if finishedUpToTime == 0:
        # TODO CHANGE TO 7 days after setup is complete...
        helper_utils.std_flush(
            "[%s] -- No value for previous stop. Starting from 7 days prior" %
            helper_utils.readable_time())
        currentTime = datetime.now() - timedelta(days=7)
        foundFlag = 0
        while foundFlag == 0:
            filePath = DataProcessor.getInputPath(currentTime)
            if os.path.exists(filePath):
                # We found the most recent file, and increment our counter
                finishedUpToTime = currentTime
                foundFlag = 1
            else:
                # Stop once we have scanned forward to the present without finding a file
                currentTime += TIME_DELTA_MINIMAL
                timeDeltaOutputStream = (datetime.now() - currentTime)
                if timeDeltaOutputStream.days == 0 and timeDeltaOutputStream.seconds <= 1:
                    foundFlag = -1
    else:
        # i.e. if we already have a timestamp from a previous execution, we read files starting a minute behind and catch up to the granular time
        helper_utils.std_flush(
            "[%s] -- Starting File tracking at %s" %
            (helper_utils.readable_time(),
             str(datetime.fromtimestamp(finishedUpToTime / 1000.0))))
        granularTime = finishedUpToTime
        finishedUpToTime = datetime.fromtimestamp(
            granularTime / 1000.0) - timedelta(seconds=60)
        TOP_OF_FILE_START = False
    if TOP_OF_FILE_START:
        # Otherwise, we start from the beginning of the 'first' file...
        finishedUpToTime -= timedelta(seconds=finishedUpToTime.second)
        granularTime = 0

    prevGranular = granularTime

    helper_utils.std_flush("[%s] -- Starting Stream Tracking for %s" %
                           (helper_utils.readable_time(), importkey))
    while True:
        if time.time() - message_timer > message_refresh:
            message_timer = time.time()
            helper_utils.std_flush(
                "[%s] -- Processed %i items, with %i items skipped and %i seconds slept in the last %i seconds"
                % (helper_utils.readable_time(), process_count, skip_count,
                   time_slept, message_refresh))
            process_count, skip_count, time_slept = 0, 0, 0

        if (datetime.now() - finishedUpToTime).total_seconds() < 60:
            waitTime = 120 - (datetime.now() - finishedUpToTime).seconds
            time.sleep(waitTime)
            time_slept += waitTime
        else:
            filePath = DataProcessor.getInputPath(finishedUpToTime)
            if not os.path.exists(filePath):
                waitTime = (datetime.now() - finishedUpToTime).total_seconds()
                # Difference is less than two minutes
                if waitTime < 120:
                    waitTime = 120 - waitTime
                    time.sleep(waitTime)
                    time_slept += waitTime
                else:
                    # Difference is more than two minutes - we can increment by one minute for the next file
                    finishedUpToTime += TIME_DELTA_MINIMAL
            # Now we have the file
            else:
                with open(filePath, 'r') as fileRead:
                    for line in fileRead:
                        try:
                            jsonVersion = json.loads(line)
                        except ValueError as e:
                            helper_utils.std_flush(
                                "[%s] -- WARNING -- Possible malformed line in %s for %s with error %s"
                                % (helper_utils.readable_time(), filePath,
                                   importkey, str(e)))
                            continue

                        if "timestamp_ms" not in jsonVersion:
                            jsonVersion["timestamp_ms"] = int(
                                jsonVersion["timestamp"])

                        if granularTime > int(jsonVersion["timestamp_ms"]):
                            # skip already finished this...
                            skip_count += 1
                            continue

                        else:
                            # Have not done this item yet...
                            # process
                            processed_data = DataProcessor.process(jsonVersion)
                            byted = bytes(json.dumps(processed_data),
                                          encoding="utf-8")
                            kafka_producer.send(kafka_key, byted)
                            kafka_producer.flush()

                            granularTime = int(jsonVersion["timestamp_ms"])
                            r.set(importkey, granularTime)
                            process_count += 1
                            if granularTime - prevGranular > 86400000:
                                helper_utils.std_flush(
                                    "[%s] -- Finished with %s" %
                                    (helper_utils.readable_time(),
                                     str(
                                         datetime.fromtimestamp(
                                             granularTime / 1000.0))))
                                prevGranular = granularTime
                finishedUpToTime += TIME_DELTA_MINIMAL
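
main() treats the data processor as a plug-in: it imports it from pipelines.<dataprocessorscriptdir>.<dataprocessor> and only ever calls getInputPath(datetime) and process(dict). Below is a minimal sketch of a class satisfying that contract; the directory layout and the fields kept in process() are assumptions for illustration, not part of the original pipeline.

import os


class ExampleDataProcessor:
    """Hypothetical processor for the input buffer above. The driver only
    relies on getInputPath(datetime) returning the path of a per-minute
    NDJSON file, and process(dict) returning a dict to push to Kafka."""

    def getInputPath(self, file_time):
        # One file per minute, e.g. ./downloads/2019-07-04/13/05.json (assumed layout)
        return os.path.join("./downloads",
                            file_time.strftime("%Y-%m-%d"),
                            file_time.strftime("%H"),
                            file_time.strftime("%M") + ".json")

    def process(self, raw_item):
        # Keep only the fields downstream consumers need; purely illustrative.
        return {
            "timestamp_ms": int(raw_item.get("timestamp_ms", raw_item.get("timestamp", 0))),
            "text": raw_item.get("text", ""),
        }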
Example #5
    # Now we have a StreamerManager with empty instances for each streamer we are going to launch.
    # We will launch all of them, and go on from there...
    for _streamer_ in StreamerManager:
        streamer = StreamerManager[_streamer_]
        if streamer["type"] == "unstructured":
            # launch single unstructured streamer...
            streamer["apikey"] = streamer["keyserver"].get_key()
            streamer["instance"] = streamer["executor"](
                streamer["keywords"], streamer["apikey"][1], errorQueue,
                messageQueue)
            streamer["instance"].start()
            std_flush(
                "Deployed unstructured streamer : %s\tat %s\twith key %s" %
                (streamer["name"], readable_time(), streamer["apikey"][0]))
        elif streamer["type"] == "structured":
            # Launch each instance (eventlangtuple)...
            for _instance_ in streamer["instances"]:
                instance = streamer["instances"][_instance_]
                instance["apikey"] = streamer["keyserver"].get_key()
                instance["instance"] = streamer["executor"](
                    _instance_[0], _instance_[1], instance["keywords"],
                    instance["apikey"][1], errorQueue, messageQueue)
                instance["instance"].start()
Example #6
def main(logdir, importkey, exportkey, processscript, processscriptdir,
         pidname, debug, seekval):
    if debug is None:
        debug = 0
    if debug:
        helper_utils.std_flush("[%s] -- DEBUG_MODE -- Active" %
                               helper_utils.readable_time())
    pid_name = pidname
    if not debug:
        helper_utils.setup_pid(pid_name, logdir=logdir)

    # Import processscript
    helper_utils.std_flush("[%s] -- Initializing ASSED-Process %s" %
                           (helper_utils.readable_time(), pidname))
    moduleImport = __import__("pipelines.%s.%s" %
                              (processscriptdir, processscript),
                              fromlist=[processscript])
    MessageProcessor = getattr(moduleImport, processscript)
    if debug:
        MessageProcessor = MessageProcessor(debug=True)
    else:
        MessageProcessor = MessageProcessor()
    helper_utils.std_flush("[%s] -- Imported Module %s" %
                           (helper_utils.readable_time(), processscript))

    kafka_import = importkey.replace(":", "_")
    helper_utils.std_flush("[%s] -- Generated kafka import key %s" %
                           (helper_utils.readable_time(), kafka_import))
    kafka_export = exportkey.replace(":", "_")
    helper_utils.std_flush("[%s] -- Generated kafka export key %s" %
                           (helper_utils.readable_time(), kafka_export))
    pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
    r = redis.Redis(connection_pool=pool)
    helper_utils.std_flush(
        "[%s] -- Connected to redis with ConnectionPool on port 6379" %
        helper_utils.readable_time())

    seek_partition = r.get(exportkey + ":partition")
    seek_offset = r.get(exportkey + ":offset")
    seek_partition = 0 if seek_partition is None else int(seek_partition)
    seek_offset = 0 if seek_offset is None else int(seek_offset) + 1
    helper_utils.std_flush(
        "[%s] -- Obtained seek partition for kafka at Partition %i -- Offset %i"
        % (helper_utils.readable_time(), seek_partition, seek_offset))

    # replace seek value
    if debug:
        if seekval is not None:
            seek_offset = seekval
            helper_utils.std_flush(
                "[%s] -- DEBUG -- Replaced seek offset for kafka at Partition %i -- Offset %i"
                % (helper_utils.readable_time(), seek_partition, seek_offset))

    kafka_producer = kafka.KafkaProducer()
    helper_utils.std_flush("[%s] -- Generated kafka producer" %
                           helper_utils.readable_time())
    kafka_consumer = kafka.KafkaConsumer()
    helper_utils.std_flush("[%s] -- Generated kafka consumer" %
                           helper_utils.readable_time())
    TopicPartition = kafka.TopicPartition(kafka_import, seek_partition)
    kafka_consumer.assign([TopicPartition])
    kafka_consumer.seek(TopicPartition, seek_offset)
    helper_utils.std_flush("[%s] -- Set kafka consumer seek" %
                           helper_utils.readable_time())

    message_correct_counter = 0
    message_fail_counter = 0
    message_counter = 0

    for message in kafka_consumer:
        item = json.loads(message.value.decode())
        processedMessage = MessageProcessor.process(item)
        # Push the message to kafka...if true
        if not isinstance(processedMessage, tuple):
            raise ValueError(
                "[%s] -- ERROR -- Invalid type %s for processedMessage. MessageProcessor.process() must return tuple of (bool,message)."
                % (helper_utils.readable_time(), str(type(processedMessage))))
        if not processedMessage[0]:
            message_fail_counter += 1
        else:
            if not debug:
                byted = bytes(json.dumps(processedMessage[1]),
                              encoding="utf-8")
                kafka_producer.send(kafka_export, byted)
                kafka_producer.flush()
            message_correct_counter += 1
        message_counter += 1

        if not debug:
            r.set(exportkey + ":partition", message.partition)
            r.set(exportkey + ":offset", message.offset)
            r.set(exportkey + ":timestamp", message.timestamp)

        if message_counter % 1000 == 0:
            helper_utils.std_flush(
                "[%s] -- Processed %i messages with %i failures and %i successes"
                % (helper_utils.readable_time(), message_counter,
                   message_fail_counter, message_correct_counter))
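
The driver above only requires the dynamically imported class to accept an optional debug keyword and to return a (bool, message) tuple from process(); a True flag forwards the (possibly modified) message to the export topic. A minimal sketch of such a processor, with purely illustrative filtering logic:

class ExampleMessageProcessor:
    """Hypothetical processor for assed_process.py above. Only the
    constructor signature and the (bool, message) return contract of
    process() are taken from the driver code."""

    def __init__(self, debug=False):
        self.debug = debug

    def process(self, item):
        # Drop items without text; forward everything else with a derived field.
        if not item.get("text"):
            return (False, item)
        item["text_length"] = len(item["text"])
        return (True, item)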
Example #7
def main(importkey, exportkey, seekval):

    kafka_import = importkey.replace(":", "_")
    helper_utils.std_flush("Generated kafka import key %s" % kafka_import)
    kafka_export = exportkey.replace(":", "_")
    helper_utils.std_flush("Generated kafka export key %s" % kafka_export)
    pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
    r = redis.Redis(connection_pool=pool)
    helper_utils.std_flush("Connected to redis")

    seek_partition = r.get(exportkey + ":partition")
    seek_offset = r.get(exportkey + ":offset")
    seek_partition = 0 if seek_partition is None else int(seek_partition)
    seek_offset = 0 if seek_offset is None else int(seek_offset) + 1
    helper_utils.std_flush(
        "Obtained seek partition for kafka at Partition %i -- Offset %i" %
        (seek_partition, seek_offset))

    if seekval is not None:
        seek_offset = seekval
        helper_utils.std_flush(
            "Replaced seek offset for kafka at Partition %i -- Offset %i" %
            (seek_partition, seek_offset))
    helper_utils.std_flush("\n\n")

    kafka_consumer = kafka.KafkaConsumer()
    helper_utils.std_flush("Generated kafka consumer")
    TopicPartition = kafka.TopicPartition(kafka_import, seek_partition)
    kafka_consumer.assign([TopicPartition])
    kafka_consumer.seek(TopicPartition, seek_offset)
    helper_utils.std_flush("Set kafka consumer seek")

    count = 0
    for message in kafka_consumer:
        #pdb.set_trace()
        count += 1
        jsval = json.loads(message.value.decode())
        helper_utils.std_flush(jsval["streamtype"], str(count))
Example #8
    def __init__(self, startTime, keywords, rootName, errorQueue, messageQueue):
        multiprocessing.Process.__init__(self)
        
        ''' Message queue for passing back errors and current times '''
        self.errorQueue = errorQueue
        self.messageQueue = messageQueue

        ''' set up relevant details '''
        self.keywords = keywords
        self.rootName = rootName
        self.DOWNLOAD_PREPEND = './downloads/'
        self.STREAM_FILES_PROCESSOR_MAX_SECOND_DELAY = CONSTANTS.STREAM_FILES_PROCESSOR_MAX_SECOND_DELAY
        self.BACK_CHECK_FILES_DAYS = 10
        self.timeDelta = timedelta(seconds=CONSTANTS.STREAMING_GRANULARITY_SECONDS)

        ''' Set up the time counter 
            Note the finishedUpToTime MUST be a datetime object '''
        
        if startTime is None:
            self.finishedUpToTime = None
            # First attempt to get most recent output file
            currentTime = datetime.now()
            foundFlag = 0
            while foundFlag == 0:
                filePath = self.getOutputPath(currentTime)
                if os.path.exists(filePath):
                    # We found the most recent file, and increment our counter
                    self.finishedUpToTime = currentTime+self.timeDelta
                    std_flush(" ".join([self.rootName, "Found output-stream file at",(str(filePath))]))
                    foundFlag = 1
                else:
                    # If our search goes back more than BACK_CHECK_FILES_DAYS days without finding a file, give up
                    currentTime -= self.timeDelta
                    if (datetime.now() - currentTime).days > self.BACK_CHECK_FILES_DAYS:
                        foundFlag = -1
            
            #If not exists, attempt to get earliest download file
            if foundFlag == -1:
                std_flush(" ".join([self.rootName, "Did not find any output-stream files."]))
                currentTime = datetime.now() - timedelta(days=self.BACK_CHECK_FILES_DAYS)

                foundFlag = 0
                while foundFlag == 0:
                    filePath = self.getInputPath(currentTime)
                    if os.path.exists(filePath):
                        #we found the most recent file, and increment our counter
                        self.finishedUpToTime = currentTime
                        std_flush(" ".join([self.rootName, "Found input-stream file at",(str(filePath))]))
                        foundFlag = 1
                    else:
                        # Stop once we have scanned forward to the present without finding a file
                        currentTime += self.timeDelta
                        timeDeltaOutputStream = (datetime.now() - currentTime)
                        if timeDeltaOutputStream.days == 0 and timeDeltaOutputStream.seconds <= 1:
                            foundFlag = -1

            if foundFlag == -1:
                #So nothing is there
                std_flush(" ".join([self.rootName, "Did not find any input-stream files."]))
                #raise(self.NoStartTimeGivenAndNoFilesExist)
                raise RuntimeError()
            #If not, crash???????

        else:
            self.finishedUpToTime = startTime
        #reset seconds to 0
        self.finishedUpToTime -= timedelta(seconds=self.finishedUpToTime.second)
        self.previousMessageTime = self.finishedUpToTime
Example #9
    #Load the keywords
    keywordConfig = load_config(CONSTANTS.TOPIC_CONFIG_PATH)
    errorQueue = multiprocessing.Queue()
    messageQueue = multiprocessing.Queue()

    keyStreamConfig = {}
    # for each keyword-lang pair type, launch a StreamFilesProcessor
    for physicalEvent in keywordConfig['topic_names'].keys():
        for language in keywordConfig['topic_names'][physicalEvent]["languages"]:
            eventLangTuple = (physicalEvent,language)
            keyStreamConfig[eventLangTuple] = {}
            keyStreamConfig[eventLangTuple]['name'] = physicalEvent
            keyStreamConfig[eventLangTuple]['lang'] = language
            keyStreamConfig[eventLangTuple]['keywords'] = keywordConfig['topic_names'][physicalEvent]["languages"][language]
            keyStreamConfig[eventLangTuple]['postpone'] = False
            std_flush(" ".join(["Deploying",str(eventLangTuple), "at", readable_time()]))
            try:
                keyStreamConfig[eventLangTuple]['processor'] = StreamFilesProcessor(
                    None,
                    keyStreamConfig[eventLangTuple]['keywords'],
                    "_".join([eventLangTuple[0], eventLangTuple[1]]),
                    errorQueue,
                    messageQueue)
            except RuntimeError:
                std_flush(" ".join([str(eventLangTuple), " does not have files to start. Posponing launch 2 hr at", readable_time()]))
                keyStreamConfig[eventLangTuple]['postpone'] = True
            keyStreamConfig[eventLangTuple]['launchTime'] = datetime.now()

            if not keyStreamConfig[eventLangTuple]['postpone']:
                keyStreamConfig[eventLangTuple]['processor'].start()

    configCheckTimer = time.time()
Example #10
def main():

    local_timer = 0
    refresh_timer = 7200
    sleep_timer = 300
    while True:
        if time.time() - local_timer > refresh_timer:

            local_timer = time.time()

            helper_utils.std_flush("[%s] -- Initializing EventDetection" %
                                   helper_utils.readable_time())
            cell_cache = {}

            assed_config = file_utils.load_config("./config/assed_config.json")

            helper_utils.std_flush("[%s] -- Obtained DB Connection" %
                                   helper_utils.readable_time())
            DB_CONN = db_utils.get_db_connection(assed_config)
            cursor = DB_CONN.cursor()

            available_streamers = [
                item for item in assed_config["SocialStreamers"]
            ]
            streamer_results = {}
            helper_utils.std_flush(
                "[%s] -- Available streamers: %s" %
                (helper_utils.readable_time(), str(available_streamers)))

            for _streamer_ in available_streamers:
                helper_utils.std_flush(
                    "[%s] -- Generating query for: %s" %
                    (helper_utils.readable_time(), _streamer_))
                _query_ = generate_social_query(_streamer_=_streamer_,
                                                _topic_="landslide")
                cursor.execute(_query_)
                streamer_results[_streamer_] = cursor.fetchall()
                helper_utils.std_flush(
                    "[%s] -- Obtained results for : %s" %
                    (helper_utils.readable_time(), _streamer_))

            helper_utils.std_flush("[%s] -- Generating query for: %s" %
                                   (helper_utils.readable_time(), "TRMM"))
            _query_ = generate_trmm_query()
            cursor.execute(_query_)
            trmm_results = cursor.fetchall()
            helper_utils.std_flush("[%s] -- Obtained resuts for: %s" %
                                   (helper_utils.readable_time(), "TRMM"))

            helper_utils.std_flush("[%s] -- Generating query for: %s" %
                                   (helper_utils.readable_time(), "USGS"))
            _query_ = generate_usgs_query()
            cursor.execute(_query_)
            usgs_results = cursor.fetchall()
            helper_utils.std_flush("[%s] -- Obtained resuts for: %s" %
                                   (helper_utils.readable_time(), "USGS"))

            helper_utils.std_flush("[%s] -- Generating query for: %s" %
                                   (helper_utils.readable_time(), "News"))
            _query_ = generate_news_query()
            cursor.execute(_query_)
            news_results = cursor.fetchall()
            helper_utils.std_flush("[%s] -- Obtained resuts for: %s" %
                                   (helper_utils.readable_time(), "News"))
            cursor.close()

            helper_utils.std_flush(
                "[%s] -- Generating local cache with scoring:\tSocial-ML - 0.3\tSocial-HDI - 1\tNews - 3\tUSGS - 5\tTRMM - 1"
                % helper_utils.readable_time())
            # Scoring -- Twitter-Social: 0.3    Twitter-HDI - 1     News:       3       USGS:   5       TRMM:   1
            for _streamer_ in streamer_results:
                helper_utils.std_flush(
                    "[%s] -- Local caching for %s" %
                    (helper_utils.readable_time(), _streamer_))
                for tuple_cell_ in streamer_results[_streamer_]:
                    _cell_ = tuple_cell_[0]
                    if _cell_ not in cell_cache:
                        cell_cache[_cell_] = {}
                    if int(float(tuple_cell_[1])) > 0:
                        cell_cache[_cell_][_streamer_ + "-hdi"] = (int(
                            float(tuple_cell_[1])), float(tuple_cell_[1]))
                    if int(float(tuple_cell_[2]) / 0.34) > 0:
                        cell_cache[_cell_][_streamer_ + "-ml"] = (int(
                            float(tuple_cell_[2]) / 0.34), float(
                                tuple_cell_[2]))

            helper_utils.std_flush("[%s] -- Local caching for %s" %
                                   (helper_utils.readable_time(), "TRMM"))
            for tuple_cell_ in trmm_results:
                _cell_ = tuple_cell_[0]
                if _cell_ not in cell_cache:
                    cell_cache[_cell_] = {}
                cell_cache[_cell_]["TRMM"] = (float(tuple_cell_[1]),
                                              float(tuple_cell_[1] * 1)
                                              )  # 1 <-- TRMM score

            helper_utils.std_flush("[%s] -- Local caching for %s" %
                                   (helper_utils.readable_time(), "USGS"))
            for tuple_cell_ in usgs_results:
                _cell_ = tuple_cell_[0]
                if _cell_ not in cell_cache:
                    cell_cache[_cell_] = {}
                cell_cache[_cell_]["USGS"] = (float(tuple_cell_[1]),
                                              float(tuple_cell_[1] * 5))

            helper_utils.std_flush("[%s] -- Local caching for %s" %
                                   (helper_utils.readable_time(), "News"))
            for tuple_cell_ in news_results:
                _cell_ = tuple_cell_[0]
                if _cell_ not in cell_cache:
                    cell_cache[_cell_] = {}
                cell_cache[_cell_]["News"] = (float(tuple_cell_[1]),
                                              float(tuple_cell_[1] * 3))

            helper_utils.std_flush(
                "[%s] -- Local cache score total generation" %
                helper_utils.readable_time())
            for _cell_ in cell_cache:
                cell_cache[_cell_]["total"] = sum([
                    cell_cache[_cell_][item][1] for item in cell_cache[_cell_]
                ])

            pool = redis.ConnectionPool(host='localhost', port=6379, db=0)
            r = redis.Redis(connection_pool=pool)
            helper_utils.std_flush("[%s] -- Connected to Redis" %
                                   helper_utils.readable_time())

            # Correct-key -- v1 or v2
            # Key Push
            # Actual keys...
            # list_tracker_key tracks where the data is (either v1 or v2)
            # list_push_key contains the list of cells
            list_tracker_key = "assed:event:detection:multisource:listkey"
            list_push_key = "assed:event:detection:multisource:list"
            list_info_key = "assed:event:detection:multisource:info"
            key_version = r.get(list_tracker_key)
            if key_version is None:
                key_version = "v2"
            else:
                key_version = key_version.decode()
            push_key = 'v1'
            if key_version == 'v1':
                helper_utils.std_flush(
                    "[%s] -- v1 key already in effect. Pushing to v2" %
                    helper_utils.readable_time())
                push_key = 'v2'
            else:
                helper_utils.std_flush(
                    "[%s] -- v2 key already in effect. Pushing to v1" %
                    helper_utils.readable_time())

            cell_list = [item for item in cell_cache]
            true_list_push_key = list_push_key + ":" + push_key
            helper_utils.std_flush(
                "[%s] -- Deleting existing %s, if any" %
                (helper_utils.readable_time(), true_list_push_key))
            r.delete(true_list_push_key)

            r.lpush(true_list_push_key, *cell_list)
            helper_utils.std_flush(
                "[%s] -- Pushed cell list to %s" %
                (helper_utils.readable_time(), true_list_push_key))

            helper_utils.std_flush("[%s] -- Pushing individual cell results" %
                                   helper_utils.readable_time())
            cell_counter = 0
            for _cell_ in cell_cache:
                cell_push_contents = json.dumps(cell_cache[_cell_])
                cell_specific_suffix = ":".join(_cell_.split("_"))
                cell_push_key = ":".join(
                    [list_info_key, cell_specific_suffix, push_key])
                r.set(cell_push_key, cell_push_contents)
                if cell_counter == 0:
                    helper_utils.std_flush("[%s] -- First push: %s --- %s" %
                                           (helper_utils.readable_time(),
                                            cell_push_key, cell_push_contents))
                cell_counter += 1

            helper_utils.std_flush(
                "[%s] -- Completed individual cell pushes with %s cells" %
                (helper_utils.readable_time(), str(cell_counter)))

            r.set(list_tracker_key, push_key)
            helper_utils.std_flush(
                "[%s] -- Setting versioning in %s to %s" %
                (helper_utils.readable_time(), list_tracker_key, push_key))

            helper_utils.std_flush("--------   COMPLETE AT  %s ----------\n" %
                                   helper_utils.readable_time())
        else:
            #helper_utils.std_flush("Sleeping for %s"%sleep_timer)
            time.sleep(sleep_timer)
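
The detector alternates between :v1 and :v2 key suffixes so readers never observe a half-written result set: list_tracker_key names the version written last, list_push_key:<version> holds the cell list, and list_info_key:<cell>:<version> (with the cell's underscores replaced by colons) holds each cell's scores. A minimal sketch of a reader for this scheme; the key names are copied from the code above, while the connection settings, fallback version, and printing are illustrative.

import json

import redis

list_tracker_key = "assed:event:detection:multisource:listkey"
list_push_key = "assed:event:detection:multisource:list"
list_info_key = "assed:event:detection:multisource:info"

r = redis.Redis(host="localhost", port=6379, db=0)

# Which version the writer completed last; fall back to v1 if the tracker is missing (illustrative).
version = (r.get(list_tracker_key) or b"v1").decode()
cells = [c.decode() for c in r.lrange(list_push_key + ":" + version, 0, -1)]
for cell in cells:
    info_key = ":".join([list_info_key, ":".join(cell.split("_")), version])
    scores = json.loads(r.get(info_key))
    print(cell, scores.get("total"))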
Example #11
    def process(self, message):
        if message["streamtype"] not in self.stream_tracker:
            self.stream_tracker[message["streamtype"]] = {}
            self.stream_tracker[message["streamtype"]]["hdi"] = 0
            self.stream_tracker[message["streamtype"]]["non_hdi"] = 0
            self.stream_tracker[message["streamtype"]]["totalcounter"] = 0
        self.stream_tracker[message["streamtype"]]["totalcounter"] += 1

        if time.time() - self.cursor_timer > self.cursor_refresh:
            self.cursor.close()
            self.cursor = self.DB_CONN.cursor()
            self.cursor_timer = time.time()
            for _streamtype in self.stream_tracker:
                utils.helper_utils.std_flush(
                    "[%s] -- Processed %i elements from %s with %i HDI  and %i NONHDI"
                    % (helper_utils.readable_time(),
                       self.stream_tracker[_streamtype]["totalcounter"],
                       _streamtype, self.stream_tracker[_streamtype]["hdi"],
                       self.stream_tracker[_streamtype]["non_hdi"]))
                self.stream_tracker[_streamtype]["totalcounter"] = 0
                self.stream_tracker[_streamtype]["non_hdi"] = 0
                self.stream_tracker[_streamtype]["hdi"] = 0
        if self.debug:
            utils.helper_utils.std_flush(
                "Processed %i elements from %s with %i HDI and %i NONHDI" %
                (self.stream_tracker[message["streamtype"]]["totalcounter"],
                 message["streamtype"],
                 self.stream_tracker[message["streamtype"]]["hdi"],
                 self.stream_tracker[message["streamtype"]]["non_hdi"]))
        # Check item
        self.verify_message(message)
        message["cell"] = utils.helper_utils.generate_cell(
            float(message["latitude"]), float(message["longitude"]))
        _time_ = int(int(message["timestamp"]) / 1000)
        _time_minus = self.time_convert(_time_ - 6 * self.MS_IN_DAYS)
        _time_plus = self.time_convert(_time_ + 3 * self.MS_IN_DAYS)
        select_s = 'SELECT location from HCS_News where cell = %s and timestamp > %s and timestamp < %s'
        params = (message["cell"], _time_minus, _time_plus)
        self.cursor.execute(select_s, params)
        results = self.cursor.fetchall()
        if len(results) > 0:
            #helper_utils.std_flush("True Event found for %s"%str(message["text"].encode("utf-8"))[2:-2])
            self.true_counter += 1
            # Push into landslide events...
            insert = 'INSERT INTO ASSED_Social_Events ( \
                        social_id, cell, \
                        latitude, longitude, timestamp, link, text, location, topic_name, source, valid, streamtype) \
                        VALUES (%s,%s,%s,%s,%s,%s, %s, %s,%s, %s, %s, %s)'
            params = (str(message["id_str"]), message["cell"], str(message['latitude']), \
                    str(message['longitude']), self.ms_time_convert(message['timestamp']), message["link"], str(message["text"].encode("utf-8"))[2:-2], message["location"], "landslide", "hdi", "1", message["streamtype"])

            #helper_utils.std_flush(insert%params)

            try:
                if not self.debug:
                    self.cursor.execute(insert, params)
                    self.DB_CONN.commit()
                else:
                    #helper_utils.std_flush(insert%params)
                    pass
                helper_utils.std_flush(
                    "[%s] -- Possible landslide event at %s detected at time %s using HDI (current time: %s)"
                    % (helper_utils.readable_time(), message["location"],
                       self.ms_time_convert(message["timestamp"]),
                       self.time_convert(time.time())))
                self.stream_tracker[message["streamtype"]]["hdi"] += 1
                return (False, message)
            except mdb._exceptions.Error as mdb_error:
                traceback.print_exc()
                true_mdb_error = eval(str(mdb_error))
                if true_mdb_error[0] in (2013, 2006):  # This is a database connection error
                    raise RuntimeError(
                        "[%s] -- ERROR -- Cannot connect to MySQL Database. Shutting down."
                        % helper_utils.readable_time())
                helper_utils.std_flush(
                    '[%s] -- ERROR -- Failed to insert %s with error %s' %
                    (helper_utils.readable_time(), message["id_str"],
                     repr(mdb_error)))
        else:
            # No matching HDI
            pass
        """
        tODO
        also perform event detection on other data (just news data (already exists), combination of earthquake AND TRMM (???))

        """

        if self.debug:
            #helper_utils.std_flush("No HDI detected for %s - %s - %s"%(str(message["id_str"]),str(message["text"].encode("utf-8"))[2:-2], message["cell"] ))
            pass
        self.stream_tracker[message["streamtype"]]["non_hdi"] += 1
        return (True, message)
Example #12
    def createIfNotExists(self, dir_):
        if not os.path.exists(dir_):
            helper_utils.std_flush("%s directory does not exist. Creating it" % dir_)
            os.makedirs(dir_)
        helper_utils.std_flush("Finished verifying directory %s" % dir_)
Example #13
        # Perform the import, then execute
        moduleImport = __import__("HighConfidenceStreamerSrc.%s" %
                                  _cfg["source_file"],
                                  fromlist=[_cfg["source_file"]])
        Executor = getattr(moduleImport, _cfg["source_file"])

        try:
            HCS_configuration[hcs_type]['processor'] = Executor(
                assed_config,
                root_name=hcs_type,
                errorQueue=errorQueue,
                messageQueue=messageQueue,
                **kwargs)
        except Exception as e:
            traceback.print_exc()
            std_flush("Failed to launch %s with error %s" %
                      (hcs_type, repr(e)))
        std_flush("Launch complete for ", hcs_type,
                  "HighConfigurationStreamer at ", readable_time())
        HCS_configuration[hcs_type]['processor'].start()
        HCS_configuration[hcs_type]['timestamp'] = time.time()

    configCheckTimer = time.time()

    while True:
        if time.time() - configCheckTimer > CONSTANTS.HCS_CONFIG_TIME_CHECK:
            configCheckTimer = time.time()
            std_flush(" ".join(["Checking configuration at", readable_time()]))
            configReload = load_config(CONSTANTS.HIGH_CONFIDENCE_CONFIG_PATH)
            configCheckTimer = time.time()
            # TODO handle config changes...
            pass
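
The launcher above only requires each HighConfidenceStreamerSrc module to expose a class of the same name that accepts the arguments used in the Executor(...) call and provides a start() method; the __init__ in Example #8 suggests these sources are multiprocessing.Process subclasses. A hypothetical minimal source under those assumptions:

import multiprocessing


class ExampleHCSSource(multiprocessing.Process):
    """Hypothetical HighConfidenceStreamerSrc source. Only the constructor
    signature and start() are taken from the launcher code; everything in
    run() is illustrative."""

    def __init__(self, assed_config, root_name, errorQueue, messageQueue, **kwargs):
        multiprocessing.Process.__init__(self)
        self.assed_config = assed_config
        self.root_name = root_name
        self.errorQueue = errorQueue
        self.messageQueue = messageQueue
        self.kwargs = kwargs

    def run(self):
        # A real source would poll its upstream API here and report progress
        # or failures back through messageQueue / errorQueue.
        self.messageQueue.put((self.root_name, "started"))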
Example #14
    def process(self, message):
        if message["streamtype"] not in self.stream_tracker:
            self.stream_tracker[message["streamtype"]] = {}
            self.stream_tracker[message["streamtype"]]["positive"] = 0
            self.stream_tracker[message["streamtype"]]["negative"] = 0
            self.stream_tracker[message["streamtype"]]["totalcounter"] = 0
        self.stream_tracker[message["streamtype"]]["totalcounter"] += 1

        if time.time() - self.cursor_timer > self.cursor_refresh:
            self.cursor.close()
            self.cursor = self.DB_CONN.cursor()
            self.cursor_timer = time.time()
            #helper_utils.std_flush("TRUE: %i\t\tFALSE: %i out of total of %i"%(self.true_counter, self.false_counter, self.total_counter))
            self.total_counter, self.true_counter, self.false_counter = 0, 0, 0
            for _streamtype in self.stream_tracker:
                utils.helper_utils.std_flush(
                    "[%s] -- Processed %i elements from %s with %i positive  and %i negative"
                    %
                    (helper_utils.readable_time(),
                     self.stream_tracker[_streamtype]["totalcounter"],
                     _streamtype, self.stream_tracker[_streamtype]["positive"],
                     self.stream_tracker[_streamtype]["negative"]))
                self.stream_tracker[_streamtype]["totalcounter"] = 0
                self.stream_tracker[_streamtype]["positive"] = 0
                self.stream_tracker[_streamtype]["negative"] = 0
        if self.debug:
            utils.helper_utils.std_flush(
                "Processed %i elements from %s with %i positive and %i negative"
                % (self.stream_tracker[message["streamtype"]]["totalcounter"],
                   message["streamtype"],
                   self.stream_tracker[message["streamtype"]]["positive"],
                   self.stream_tracker[message["streamtype"]]["negative"]))

        # Get message text
        cleaned_message = str(message["text"].encode("utf-8"))[2:-2]
        encoded_message = self.encode(cleaned_message)

        prediction = np.argmax(
            self.model.predict(np.array([encoded_message]))[0])
        params = None
        if prediction == 1:
            # push to db
            self.true_counter += 1
            params = (message["id_str"], message["cell"], str(message['latitude']), \
                    str(message['longitude']), self.ms_time_convert(message['timestamp']), message["link"], str(message["text"].encode("utf-8"))[2:-2], message["location"], "landslide", "ml", "1", message["streamtype"])
            self.stream_tracker[message["streamtype"]]["positive"] += 1
        elif prediction == 0:
            # push to db, with false? push to different db?
            self.false_counter += 1
            params = (message["id_str"], message["cell"], str(message['latitude']), \
                    str(message['longitude']), self.ms_time_convert(message['timestamp']), message["link"], str(message["text"].encode("utf-8"))[2:-2], message["location"], "landslide", "ml", "0", message["streamtype"])
            self.stream_tracker[message["streamtype"]]["negative"] += 1
        else:
            warnings.warn(
                "[%s] -- WARNING -- Prediction value of %i is not one of valid predictions [0, 1]"
                % (helper_utils.readable_time(), prediction))
        try:
            if not self.debug:
                self.cursor.execute(self.db_insert, params)
                self.DB_CONN.commit()
            else:
                #helper_utils.std_flush(self.db_insert%params)
                pass
        except mdb._exceptions.Error as mdb_error:
            traceback.print_exc()
            true_mdb_error = eval(str(mdb_error))
            if true_mdb_error[0] in (2013, 2006):  # This is a database connection error
                raise RuntimeError(
                    "[%s] -- ERROR -- Cannot connect to MySQL Database. Shutting down"
                    % helper_utils.readable_time())
            helper_utils.std_flush(
                '[%s] -- ERROR -- Failed to insert %s with error %s' %
                (helper_utils.readable_time(), message["id_str"],
                 repr(mdb_error)))
            return (False, message)

        self.total_counter += 1
        return (False, message)