Example #1
    def loadInitialTweets(self, tweetFile, numInitTweets):
        self._initialTweets = TweetDatabase()
        self._reader = open(tweetFile)

        for i in range(0, numInitTweets):
            line = self._reader.readline()
            tweet = GeoTweet.geoTweetFromAString(tweetString=line)
            self._initialTweets.add(tweet)
        print("There are %d tweets are loaded as initial tweets."%len(self.getInitialTweets().getTweets()))
Example #2
    def __init__(self, start, end, refWindowSize, minSup, updateWindow):
        super().__init__(start, end, refWindowSize, minSup)
        self._queryTD = TweetDatabase()
        self._deleteTD = TweetDatabase()
        self._insertTD = TweetDatabase()
        self._startDeleteTS = self._startTS
        self._endDeleteTS = self._startTS + updateWindow
        self._startInsertTS = self._endTS
        self._endInsertTS = self._endTS + updateWindow
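
This constructor sets up a sliding query window: assuming the base class stores start and end as _startTS and _endTS, tweets with timestamps in [start, start + updateWindow) are staged for deletion and tweets in [end, end + updateWindow) for insertion. A tiny illustration of the window arithmetic, with made-up timestamps:

# Illustrative window arithmetic only; the numbers are made up.
start, end, updateWindow = 1000, 4600, 600
startDeleteTS, endDeleteTS = start, start + updateWindow   # 1000, 1600
startInsertTS, endInsertTS = end, end + updateWindow       # 4600, 5200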
Example #3
    def update(self, startNewTimes):
        deletedTweets = self._tdb.deleteFromHeadByTime(startNewTimes)
        print("There are %d tweets after removing"%self._tdb.size())
        self._tdb.setNewStartTimestamp(startNewTimes)

        for tweet in deletedTweets:
            self._delete.add(tweet)
        print("There are %d tweets are removed"%self._delete.size())

        for tweet in self._insert.getTweets():
            self._tdb.add(tweet)
        print("There are %d tweets after adding" %self._tdb.size())
        print("----------------------------------------")

        onDetector = self.runOnline(self._tdb, self._delete, self._insert)
        self._insert = TweetDatabase()
        self._delete = TweetDatabase()
Example #4
    def rangeQueryTweetDB(self, startTS, endTS):
        # query all tweets whose timestamp lies in [startTS, endTS)
        cursor = self._tweetCol.find(
            {"timestamp": {
                "$gte": startTS,
                "$lt": endTS
            }})
        td = TweetDatabase()
        for data in cursor:
            tweetId = data["id"]
            timestamp = data["timestamp"]
            lng = data["lng"]
            lat = data["lat"]
            loc = Location(lng, lat)
            entities = data["entities"]
            tweet = GeoTweet(tweetId, timestamp, loc, entities)
            td.append(tweet)
        return td
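
rangeQueryTweetDB issues a plain MongoDB range filter ($gte/$lt) on the timestamp field and wraps each document in a GeoTweet. A minimal usage sketch, assuming self._tweetCol is a pymongo collection; the connection string, database and collection names below are assumptions, not taken from the examples:

from pymongo import MongoClient

# Hypothetical collection setup; the names are placeholders.
client = MongoClient("mongodb://localhost:27017")
tweetCol = client["tweetdb"]["tweets"]

# Same query shape as rangeQueryTweetDB above.
cursor = tweetCol.find({"timestamp": {"$gte": 1504000000, "$lt": 1504003600}})
for doc in cursor:
    print(doc["id"], doc["timestamp"], doc["lng"], doc["lat"])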
Example #5
    def update_normal(self, startNewTimes):
        deletedTweets = self._tdb.deleteFromHeadByTime(startNewTimes)
        print("There are %d tweets after removing" % self._tdb.size())

        for tweet in deletedTweets:
            self._delete.add(tweet)
        print("There are %d tweets are removed"%self._delete.size())

        for tweet in self._insert.getTweets():
            self._tdb.add(tweet)
        print("There are %d tweets after adding" %self._tdb.size())
        print("----------------------------------------")

        epsilon = float(self._config['hubseek']['epsilon'])
        errorBound = float(self._config['clustream']['errorBound'])
        pRestart = float(self._config['clustream']['pRestart'])
        self._db.generateEntityGraph(self._tdb.getTweets(), epsilon, errorBound, pRestart)
        print("Create graph done!")
        hubseek = self.runHubSeek2(self._tdb)

        self._insert = TweetDatabase()
        self._delete = TweetDatabase()
Example #6
    def __init__(self):
        self._db = None
        self._tdb = TweetDatabase()
        self._insert = TweetDatabase()
        self._delete = TweetDatabase()
        self._clustream = None
        self._config = None

        self._checkInit = False
        self._checkBatchRun = False
        self._tokenizer = None
        self._model = None
        self._detector = None
        print('The program begins')
Example #7
    def __init__(self, config):
        self._config = config
        self._reader = None
        self._initialTweets = TweetDatabase()
        self._graph = None
Example #8
class Database():

    def __init__(self, config):
        self._config = config
        self._reader = None
        self._initialTweets = TweetDatabase()
        self._graph = None

    def getEntityGraph(self):
        return self._graph

    def getInitialTweets(self):
        return self._initialTweets

    def loadInitialTweets(self, tweetFile, numInitTweets):
        self._initialTweets = TweetDatabase()
        self._reader = open(tweetFile)

        for i in range(0, numInitTweets):
            line = self._reader.readline()
            tweet = GeoTweet.geoTweetFromAString(tweetString=line)
            self._initialTweets.add(tweet)
        print("There are %d tweets are loaded as initial tweets."%len(self.getInitialTweets().getTweets()))

    def nextTweet(self):
        tweet = None
        while tweet is None or tweet.numEntity() == 0:
            line = self._reader.readline()
            if line == '':
                return None
            tweet = GeoTweet.geoTweetFromAString(line)
        return tweet

    def generateEntityGraph(self, tdb, epsilon, errorBound, pRestart):
        start = time.time()
        # 1. init Graph
        self._graph = Graph()
        self._graph.generateNodes(tdb.getTweets())
        self._graph.generateEdges(tdb.getTweets(), False)
        self._graph.calcVicinity(epsilon, errorBound, pRestart)
        end = time.time()
        duration = end - start
        self._graph.setCreateTime(duration)

        bGraphTime = tdb.getStartTimestamp()
        eGraphTime = tdb.getEndTimestamp()

        self.createFolder(bGraphTime, eGraphTime)
        self.writeNode(self._graph._mNodes, bGraphTime, eGraphTime)
        self.writeEdge(self._graph._mEdges, bGraphTime, eGraphTime)
        self.writeVicinity(self._graph._vicinity, bGraphTime, eGraphTime)

    def createFolder(self, bGraphTime, eGraphTime):
        filePath = "../graphsData/Graph"+str(bGraphTime)+"_"+str(eGraphTime)
        # if the directory does not exist, create new one
        if not os.path.exists(filePath):
            print("Creating a new graph folder:"+filePath)
            result = False
            try:
                os.makedirs(filePath)
                result = True
            except OSError as e:
                if e.errno != errno.EEXIST:
                    raise
            if result:
                print("Graph Folder created!")

    def writeNode(self, mNodes, bGraphTime, eGraphTime):
        filePath = "../graphsData/Graph"+str(bGraphTime)+"_"+str(eGraphTime)+"/Nodes.txt"
        with open(filePath, "wb") as output:
            pickle.dump(mNodes, output, pickle.HIGHEST_PROTOCOL)

    def writeEdge(self, mEdges, bGraphTime, eGraphTime):
        filePath = "../graphsData/Graph"+str(bGraphTime)+"_"+str(eGraphTime)+"/Edges.txt"
        with open(filePath, "wb") as output:
            pickle.dump(mEdges, output, pickle.HIGHEST_PROTOCOL)

    def writeVicinity(self, vicinity, bGraphTime, eGraphTime):
        filePath = "../graphsData/Graph"+str(bGraphTime)+"_"+str(eGraphTime)+"/Vicinity.txt"
        with open(filePath, "wb") as output:
            pickle.dump(vicinity, output, pickle.HIGHEST_PROTOCOL)

    def setGraph(self, eGraph):
        self._graph = eGraph

    def loadEntityGraph(self, nodeFile, edgeFile, vicinityFile):
        self._graph = Graph()
        if not Path(nodeFile).is_file() or not Path(edgeFile).is_file() or not Path(vicinityFile).is_file():
            return False
        self._graph.loadNodes(nodeFile)
        self._graph.loadEdges(edgeFile)
        self._graph.loadVicinity(vicinityFile)
        if self._graph.getNodeCnt() > 0 and self._graph.getEdgeCnt() > 0:
            print("Loading graph completed!")
            return True
        else:
            return False


    def updateGraph(self,currentTd, deleteTd, insertTd):
        # sets for marking graph nodes to delete or add
        markForDel = set()
        markForAdd = set()

        # A temporary vicinity that stores the unchanged nodes
        tempVicinity = self._graph._vicinity
        print("There are %d nodes remaining in the old vicinity at the beginning" % len(tempVicinity))

        # search for nodes that would be affected by the deletion
        for d in deleteTd.getTweets():
            entities = d.getEntities()
            for k in range(0, len(entities)-1):
                for j in range(k+1, len(entities)):
                    if entities[k] == entities[j]:
                        continue

                    node1 = entities[k]
                    node2 = entities[j]

                    for key, values in tempVicinity.items():
                        vicinity = values.keys()
                        if node1 in vicinity or node2 in vicinity:
                            markForDel.add(key)

        # delete the affected keywords from the old vicinity
        for keyword in markForDel:
            del tempVicinity[keyword]
        print("There are %d nodes remaining in the old vicinity right after deleting" % len(tempVicinity))

        # # generate the new graph
        # buff = TweetDatabase()
        # buff = self._td
        # buff.deleteFromHead(deleteTd.size())
        # buff.addAll(insertTd)

        # generate the new graph here
        self._graph = Graph()
        self._graph.generateNodes(currentTd.getTweets())
        self._graph.generateEdges(currentTd.getTweets(), False)

        # List nodename for new graph
        listNodeNameOfGraph = set()
        for node in self._graph._mNodes:
            listNodeNameOfGraph.add(node.getName())

        self._graph._vicinity = tempVicinity
        print("There are total %d nodes that remin in old vicinity at beging of insertion"%len(self._graph._vicinity))
        for d in insertTd.getTweets():
            entities = d.getEntities()
            for k in range(0, len(entities) -1):
                for j in range(k+1, len(entities)):
                    if entities[k] == entities[j]:
                        continue

                    node1 = entities[k]
                    node2 = entities[j]

                    for key, values in tempVicinity.items():
                        vicinity = values.keys()
                        if node1 in vicinity or node2 in vicinity:
                            markForAdd.add(key)

        # delete keywords affected by the insertion from the old vicinity
        for keyword in markForAdd:
            if keyword in self._graph._vicinity:
                del self._graph._vicinity[keyword]

        print("There are %d nodes remaining in the old vicinity after inserting" % len(self._graph._vicinity))

        epsilon = self._config["hubseek"]["epsilon"]
        errorBound = self._config["clustream"]["errorBound"]
        pRestart = self._config["clustream"]["pRestart"]

        searcher = Propagator(self._graph)
        #recompute
        cnt = 0
        for nodeName in listNodeNameOfGraph:
            if not (nodeName in self._graph._vicinity):
                neighbors = searcher.search(nodeName, epsilon, pRestart, errorBound)
                self._graph._vicinity[nodeName] = neighbors
                cnt += 1
                if cnt % 100 == 0:
                    print("Finished re-computing vicinity for %d nodes."%cnt)
Example #9
class Demo(object):
    def __init__(self):
        self._db = None
        self._tdb = TweetDatabase()
        self._insert = TweetDatabase()
        self._delete = TweetDatabase()
        self._clustream = None
        self._config = None

        self._checkInit = False
        self._checkBatchRun = False
        self._tokenizer = None
        self._model = None
        self._detector = None
        print('The program begins')

    def init(self, paraFile):
        start = time.time()
        # Read YAML file
        self._config = Config.load(paraFile)

        numInitClusTweets = int(self._config['clustream']['numInitTweets'])
        queryFrameDuration = int(self._config['timespan']['init'])
        refWindowSize = int(self._config['query']['refWindowSize'])
        updateWindow = int(self._config['timespan']['update'])
        minSup = int(self._config['hubseek']['minSup'])

        # check whether the model already exists
        self._model = load_model("../classifier/lstm_T6_best_weights.02-0.9507.hdf5")
        self._model.summary()

        # load tokenizer file
        file = open("../classifier/tokenizer.pickle", 'rb')
        self._tokenizer = pickle.load(file)
        file.close()

        # # training phase
        # cnnTraining = CNNTraining()
        # cnnTraining.training(self._config)
        # self._tokenizer = cnnTraining._tokenizer
        # self._model = cnnTraining._model

        # process raw tweets if the cleaned file has not been created yet
        cleaned_tweets_file = self._config['file']['input']['cleaned_tweets']
        if not os.path.isfile(cleaned_tweets_file):
            CleanRawTweets.procesingRawTweet(self._config)
        else:
            print("The cleaned file was created before!")

        # classification process
        classified_tweets = self._config['file']['input']['classified_tweets']  # storing classified tweets
        writer = open(classified_tweets, 'w')

        initClusTweets = list()  # List of tweets for initClustream

        processedTweets = 0     # number of tweets processed
        checkRunBatch = False
        startRunBatchFrame = 0
        endRunBatchFrame = 0
        startUpdateFrame = 0
        endUpdateFrame = 0

        with open(cleaned_tweets_file, encoding='utf8') as ctf:                 #read the tweet file line by line
            # read header
            line = ctf.readline()
            line = line.replace('\n', '')
            column_names = line.split(',')

            # collect tweets to init clustream
            for i in range(0, numInitClusTweets):
                line = ctf.readline()
                data = line.split(',')
                data_string = self.processLineData(data)
                geo_tweet = GeoTweet.geoTweetFromAString(data_string)
                initClusTweets.append(geo_tweet)
                startRunBatchFrame = geo_tweet.getTimestamp()
            endRunBatchFrame = startRunBatchFrame + queryFrameDuration      # set startTS and endTS for the Batchrun
            print('Clustream is starting to init....')
            self.initClustream(initClusTweets)
            print('Clustream init is done!')

            dtf = None
            batch = []
            while True :
                line = ctf.readline()
                if line == '':
                    break
                line = line.replace('\n', '')
                data = line.split(',')
                batch.append(data)

                if len(batch) == 100:
                    dtf = pd.DataFrame(batch, columns=column_names)
                    texts = dtf.text
                    sequences_test = self._tokenizer.texts_to_sequences(texts)
                    x_test_seq = pad_sequences(sequences_test, maxlen=45)
                    y = self._model.predict(x_test_seq)
                    dtf['class'] = y
                    for index, row in dtf.iterrows():
                        list_string = []
                        list_string.append(row['id'])
                        list_string.append(row['timestamp'])
                        list_string.append(row['lng'])
                        list_string.append(row['lat'])
                        cleaned_text = CleanRawTweets.tweet_cleaner(row['text'])
                        tokenized = tokenizing(cleaned_text)
                        list_string.append(tokenized)
                        temp = '\t'.join(list_string)
                        geo_tweet = GeoTweet.geoTweetFromAString(temp)
                        self._clustream.update(geo_tweet)               #update clustream
                        if row['class'] > 0.65:                         # the tweet is predicted as a disaster tweet
                            if not checkRunBatch:
                                self._tdb.add(geo_tweet)                # add the tweet into the tweet database
                                if (self._tdb.getEndTimestamp() >= endRunBatchFrame):
                                    self.trigger2(self._tdb)
                                    checkRunBatch = True
                                    startUpdateFrame = self._tdb.getStartTimestamp() + updateWindow
                                    endUpdateFrame = self._tdb.getEndTimestamp() + updateWindow
                            else:
                                if geo_tweet.getTimestamp() < endUpdateFrame:
                                    self._insert.add(geo_tweet)
                                else:
                                    print("There are %d tweets in insert set"%self._insert.size())
                                    startUpdateFrame = startUpdateFrame + updateWindow
                                    endUpdateFrame = endUpdateFrame + updateWindow
                                    self.update(startUpdateFrame)
                                    #self.update_normal(startUpdateFrame)
                            writer.write(temp +'\n')
                    batch = []
                    dtf = None

                processedTweets += 1
                # if processedTweets == 50000:
                #     writer.close()
                #     break
                if processedTweets % 5000 == 0:
                    ttime = time.time() - start
                    print("- %d Tweets consumed %f seconds."%(processedTweets, ttime))
                    self._clustream.printStats()


        writer.close()

        end = time.time()

        consumed_time = end-start
        print("Time consumed is %f"%consumed_time)

    def processLineData(self, data):
        list_string = []
        list_string.append(data[0]) #id
        list_string.append(data[4]) #timestamp
        list_string.append(data[2]) #longitude
        list_string.append(data[3]) #latitude
        cleaned_text = CleanRawTweets.tweet_cleaner(data[1])    #text
        tokenized = tokenizing(cleaned_text)
        list_string.append(tokenized)
        result = '\t'.join(list_string)
        return result

    def trigger2(self, tdb):
        print("In the trigger function")
        self._db = Database(self._config)
        epsilon = float(self._config['hubseek']['epsilon'])
        errorBound = float(self._config['clustream']['errorBound'])
        pRestart = float(self._config['clustream']['pRestart'])
        self._db.generateEntityGraph(self._tdb, epsilon, errorBound, pRestart)
        print("Create graph done!")
        self._detector = self.runHubSeek2(self._tdb)

    def runHubSeek2(self, tdb):
        detector = BatchDetector(self._clustream, self._db.getEntityGraph(), self._config)
        if self._config["hubseek"]["run"]:
            bandwidth = self._config["hubseek"]["bandwidth"][0]
            epsilon = self._config["hubseek"]["epsilon"]
            eta = self._config["hubseek"]["eta"][0]
            minSup = int(self._config['hubseek']['minSup'])
            print("Starting Hubseek process.....")
            refSpanTime = self._config["query"]["refWindowSize"]
            detector.detect(tdb, bandwidth, epsilon, minSup, refSpanTime, eta)
            print("THE INTERFACE IS CREATED, PLEASE RUN THE INTEFACE MODULE!!!")
            time.sleep(60)
            detector.printStats()
        return detector

    def update_normal(self, startNewTimes):
        deletedTweets = self._tdb.deleteFromHeadByTime(startNewTimes)
        print("There are %d tweets after removing" % self._tdb.size())

        for tweet in deletedTweets:
            self._delete.add(tweet)
        print("There are %d tweets are removed"%self._delete.size())

        for tweet in self._insert.getTweets():
            self._tdb.add(tweet)
        print("There are %d tweets after adding" %self._tdb.size())
        print("----------------------------------------")

        epsilon = float(self._config['hubseek']['epsilon'])
        errorBound = float(self._config['clustream']['errorBound'])
        pRestart = float(self._config['clustream']['pRestart'])
        self._db.generateEntityGraph(self._tdb.getTweets(), epsilon, errorBound, pRestart)
        print("Create graph done!")
        hubseek = self.runHubSeek2(self._tdb)

        self._insert = TweetDatabase()
        self._delete = TweetDatabase()

    def update(self, startNewTimes):
        deletedTweets = self._tdb.deleteFromHeadByTime(startNewTimes)
        print("There are %d tweets after removing"%self._tdb.size())
        self._tdb.setNewStartTimestamp(startNewTimes)

        for tweet in deletedTweets:
            self._delete.add(tweet)
        print("There are %d tweets are removed"%self._delete.size())

        for tweet in self._insert.getTweets():
            self._tdb.add(tweet)
        print("There are %d tweets after adding" %self._tdb.size())
        print("----------------------------------------")

        onDetector = self.runOnline(self._tdb, self._delete, self._insert)
        self._insert = TweetDatabase()
        self._delete = TweetDatabase()

    def runOnline(self, currentTd, deleteTd, insertTd):
        self._detector._hubSeek.delete(deleteTd.getTweets())
        self._detector._hubSeek.insert(insertTd.getTweets())

        minSup = int(self._config['hubseek']['minSup'])
        clusters = self._detector._hubSeek.genClusters(minSup)

        # This part updates the ranking process
        eta = self._config["hubseek"]["eta"][0]
        bandwidth = self._config["hubseek"]["bandwidth"][0]
        refSpanTime = self._config["query"]["refWindowSize"]

        self.updateRanker(self._tdb, eta)
        self._detector.setTD(currentTd)
        events = self._detector.rank(clusters, bandwidth, refSpanTime)
        print("There are %d events in online step"%len(events))
        
        output = '../output/live/output.json'
        #output2 = '../output/output_' + str(currentTd.getStartTimestamp())+'_'+str(currentTd.getEndTimestamp())+'.json'
        data = []
        for clus in events:
            sub = clus.toJson()
            data.append(sub)

        with open(output, 'w') as f:
            json.dump(data, f)

        for clus in events:
            print(clus.__str__())
            print("################################")
        print("Online Hubseek generating candidates done!!!")
        return clusters

    def updateRanker(self, tdb, eta):
        weighter = IDFWeighter(tdb.getTweets())
        self._detector._ranker = Ranker(self._clustream, weighter, eta)


    def initClustream(self, initClusTweets):
        numInitClusters = self._config['clustream']['numInitClusters']
        numMaxClusters = self._config['clustream']['numMaxClusters']
        numTweetPeriod = self._config['clustream']['numTweetPeriod']
        outdatedThreshold = self._config['clustream']['outdatedThreshold']
        self._clustream = Clustream(numMaxClusters, numTweetPeriod, outdatedThreshold)
        self._clustream.init(initClusTweets, numInitClusters)
        print("The Clustream is also initiated................Done")

    def updateTweetDatabase(self, delete, insert):
        self._tdb.deleteFromHead(delete.size())
        self._tdb.addAll(insert)

    def main(self):
        paraFile = "../config/config.yml"
        self.init(paraFile)


class Demo(object):
    def __init__(self):
        self._db = None
        self._tdb = TweetDatabase()
        self._insert = TweetDatabase()
        self._delete = TweetDatabase()
        self._clustream = None
        self._config = None
        #self._mongo = None

        self._checkInit = False
        self._checkBatchRun = False
        self._tokenizer = None
        self._model = None
        self._detector = None
        print('The program begins')

    def init(self, paraFile):
        start = time.time()
        # Read YAML file
        self._config = Config.load(paraFile)
        print(self._config)

        init_duration = int(self._config['timespan']['init'])
        update_span = int(self._config['timespan']['update'])

        # # training phase
        # cnnTraining = CNNTraining()
        # cnnTraining.training(self._config)
        # self._tokenizer = cnnTraining._tokenizer
        # self._model = cnnTraining._model

        # Processing raw tweets if the cleaned file is not created yet
        cleaned_tweets_file = self._config['file']['input']['cleaned_tweets']
        if not os.path.isfile(cleaned_tweets_file):
            CleanRawTweets.procesingRawTweet(self._config)
        else:
            print("The cleaned file was created before!")

        countInit = 0
        checkInit = False
        startUpdateFrame = 0
        endUpdateFrame = 0

        with open(cleaned_tweets_file,
                  encoding='utf8') as ctf:  #read the tweet file line by line
            line = ctf.readline()  #read header
            line = line.replace('\n', '')
            column_names = line.split(',')
            dtf = None
            batch = []
            while True:
                line = ctf.readline()
                if line == '':
                    break
                line = line.replace('\n', '')
                data = line.split(',')
                batch.append(data)
                if len(batch) == 100:
                    dtf = pd.DataFrame(batch, columns=column_names)
                    for index, row in dtf.iterrows():
                        list_string = []
                        list_string.append(row['id'])
                        list_string.append(row['timestamp'])
                        list_string.append(row['lng'])
                        list_string.append(row['lat'])
                        cleaned_text = CleanRawTweets.tweet_cleaner(
                            row['text'])
                        tokenized = tokenizing(cleaned_text)
                        list_string.append(tokenized)
                        temp = '\t'.join(list_string)
                        geo_tweet = GeoTweet.geoTweetFromAString(temp)

                        if not checkInit:
                            self._tdb.add(geo_tweet)
                            if (self._tdb.getEndTimestamp() -
                                    self._tdb.getStartTimestamp() >=
                                    init_duration):
                                self.trigger2(self._tdb)
                                checkInit = True
                                startUpdateFrame = self._tdb.getStartTimestamp(
                                ) + update_span
                                endUpdateFrame = self._tdb.getEndTimestamp(
                                ) + update_span
                        else:
                            self._clustream.update(
                                geo_tweet)  # update clustream with the tweet
                            if geo_tweet.getTimestamp() < endUpdateFrame:
                                self._insert.add(geo_tweet)

                            else:
                                print("There are %d tweets in insert set" %
                                      self._insert.size())
                                startUpdateFrame = startUpdateFrame + update_span
                                endUpdateFrame = endUpdateFrame + update_span
                                self.update(startUpdateFrame)
                    batch = []
                    dtf = None

                countInit += 1
                if countInit % 1000 == 0:
                    ttime = time.time() - start
                    print("- %d Tweets consumed %f seconds." %
                          (countInit, ttime))
                if countInit == 20000:
                    break
        end = time.time()

        consumed_time = end - start
        print("Time consumed is %f" % consumed_time)

        # # Load mongo config               This part is under consideration
        # self._mongo = Mongo(self._config)

        # initialize database
        #self.initDatabase()
        # self.initClustream()

    def trigger2(self, tdb):
        print('Initializing clustream....')
        self.initClustream()
        print("In the trigger function")
        self._db = Database(self._config)
        epsilon = float(self._config['hubseek']['epsilon'])
        errorBound = float(self._config['clustream']['errorBound'])
        pRestart = float(self._config['clustream']['pRestart'])
        self._db.generateEntityGraph(self._tdb.getTweets(), epsilon,
                                     errorBound, pRestart)
        print("Create graph done!")
        self._detector = self.runHubSeek2(self._tdb)

    def runHubSeek2(self, tdb):
        detector = BatchDetector(self._clustream, self._db.getEntityGraph(),
                                 self._config)
        if self._config["hubseek"]["run"]:
            bandwidth = self._config["hubseek"]["bandwidth"][0]
            epsilon = self._config["hubseek"]["epsilon"]
            eta = self._config["hubseek"]["eta"][0]
            minSup = int(self._config['hubseek']['minSup'])
            print("Starting Hubseek process.....")
            refSpanTime = self._config["query"]["refWindowSize"]
            detector.detect(tdb, bandwidth, epsilon, minSup, refSpanTime, eta)
            detector.printStats()
        return detector

    def update(self, startNewTimes):
        deletedTweets = self._tdb.deleteFromHeadByTime(startNewTimes)
        print("There are %d tweets after removing" % self._tdb.size())

        for tweet in deletedTweets:
            self._delete.add(tweet)
        print("There are %d tweets are removed" % self._delete.size())

        for tweet in self._insert.getTweets():
            self._tdb.add(tweet)
        print("There are %d tweets after adding" % self._tdb.size())
        print("----------------------------------------")

        #self._db.updateGraph(self._tdb, self._delete, self._insert)
        onDetector = self.runOnline(self._tdb, self._delete, self._insert)

        # epsilon = float(self._config['hubseek']['epsilon'])
        # errorBound = float(self._config['clustream']['errorBound'])
        # pRestart = float(self._config['clustream']['pRestart'])
        # self._db.generateEntityGraph(self._tdb.getTweets(), epsilon, errorBound, pRestart)
        # hubseek = self.runHubSeek2(self._tdb)

        self._insert = TweetDatabase()
        self._delete = TweetDatabase()

    def runOnline(self, currentTd, deleteTd, insertTd):
        self._detector._hubSeek.delete(deleteTd.getTweets())
        self._detector._hubSeek.insert(insertTd.getTweets())
        minSup = int(self._config['hubseek']['minSup'])
        clusters = self._detector._hubSeek.genClusters(minSup)

        for clus in clusters:
            print(clus.__str__())
            print("################################")
        print("Online Hubseek generating candidates done!!!")
        return clusters

    def initClustream(self):
        numInitClusters = self._config['clustream']['numInitClusters']
        numMaxClusters = self._config['clustream']['numMaxClusters']
        numTweetPeriod = self._config['clustream']['numTweetPeriod']
        outdatedThreshold = self._config['clustream']['outdatedThreshold']
        self._clustream = Clustream(numMaxClusters, numTweetPeriod,
                                    outdatedThreshold)
        self._clustream.init(self._tdb.getTweets(), numInitClusters)
        print("The Clustream is also initiated................Done")


#-------------------------------------------------------------------------------------------

    def initDatabase(self):
        tweetFile = self._config['file']['input']['tweets']
        numInitTweets = self._config['clustream']['numInitTweets']
        self._db = Database(self._config)
        self._db.loadInitialTweets(tweetFile, numInitTweets)
        print("Load Inital tweets.................Done")

    def runBatch(self):
        epsilon = self._config['hubseek']['epsilon'][0]
        #numInitTweets = self._config['clustream']['numInitTweets']
        pRestart = self._config['clustream']['pRestart']
        errorBound = self._config['clustream']['errorBound']

        queryDB = TweetDatabase()
        refDB = TweetDatabase()

        queries = self._mongo.loadBatchQueries(self._config)
        # # print all queries
        # for qr in queries:
        #     print("StartQueryTS:%d, EndQueryTS:%d"%(qr.getStartTS(), qr.getEndTS()))
        queryIndex = 0
        clustream_path = "../clustream/clustream_data/clustream_" + str(
            queryIndex) + ".pickle"
        myClustream_file = Path(clustream_path)

        query = queries[queryIndex]
        if myClustream_file.is_file():
            self.load_clustream(clustream_path)
        else:  # update the cluster
            while True:
                tweet = self._db.nextTweet()
                if tweet is None:
                    break
                self.addTweet(query, queryDB, refDB, tweet)
                self._clustream.update(tweet)
                if tweet.getTimestamp() > query.getEndTS():
                    print("There are: %d Tweets in query: %d" %
                          (len(queryDB.getTweets()), queryIndex))
                    print("There are: %d Tweets in refDB: %d" %
                          (len(refDB.getTweets()), queryIndex))

                    #saving current clustream
                    self._clustream.printStats()
                    with open(clustream_path, "wb") as output:
                        pickle.dump(self._clustream, output,
                                    pickle.HIGHEST_PROTOCOL)
                    print("The Current _clustream is stored!")

                    graphTime = self._config["query"]["querypoints"]
                    entityFile = "../graphsData/Graph" + str(
                        graphTime[0]) + "_" + str(graphTime[1]) + "/Nodes.txt"
                    entityEdgeFile = "../graphsData/Graph" + str(
                        graphTime[0]) + "_" + str(graphTime[1]) + "/Edges.txt"
                    vicinityFile = "../graphsData/Graph" + str(
                        graphTime[0]) + "_" + str(
                            graphTime[1]) + "/Vicinity.txt"
                    #check whether the entitygraph can be loaded or not
                    check = self._db.loadEntityGraph(entityFile,
                                                     entityEdgeFile,
                                                     vicinityFile)
                    if not check:
                        self._db.generateEntityGraph(queryDB.getTweets(),
                                                     epsilon, errorBound,
                                                     pRestart)

                    self.trigger(query, queryDB)
                    queryIndex += 1
                    if queryIndex < len(queries):
                        query = queries[queryIndex]
                        queryDB = TweetDatabase()
                        refDB.deleteFromHead(query.getRefStartTS())
                    else:
                        break
            print("Batch mode ..............Done!")

    def load_clustream(self, filePath):
        with open(filePath, "rb") as ip:
            self._clustream = pickle.load(ip)
        print("The clustream is loaded")

    def addTweet(self, query, queryDB, refDB, tweet):
        ts = tweet.getTimestamp()
        if ts > query.getStartTS() and ts <= query.getEndTS():
            queryDB.add(tweet)
        if ts > query.getRefStartTS() and ts <= query.getRefEndTS():
            refDB.add(tweet)

    def trigger(self, query, queryDB):
        if queryDB.size() == 0:
            return
        hubseek = self.runHubSeek(query, queryDB)

    def runHubSeek(self, query, queryDB):
        detector = BatchDetector(self._clustream, self._db.getEntityGraph(),
                                 self._config)
        if self._config["hubseek"]["run"]:
            bandwidth = self._config["hubseek"]["bandwidth"][0]
            epsilon = self._config["hubseek"]["epsilon"][0]
            eta = self._config["hubseek"]["eta"][0]
            refTimeSpan = query.getRefEndTS() - query.getRefStartTS()
            minSup = query.getMinSup()
            print("Starting Hubseek process.....")
            detector.detect(queryDB, query.getQueryInterval(), bandwidth,
                            epsilon, minSup, refTimeSpan, eta)
            detector.printStats()
        return detector

    def main(self):
        paraFile = "../config/config.yml"
        self.init(paraFile)