Code Example #1
File: database.py Project: SiThuc/EventDetector
    def nextTweet(self):
        # Return the next tweet that carries at least one entity;
        # return None when the end of the file is reached.
        tweet = None
        while tweet is None or tweet.numEntity() == 0:
            line = self._reader.readline()
            if line == '':
                return None
            tweet = GeoTweet.geoTweetFromAString(line)
        return tweet
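
A minimal driver sketch for nextTweet (the Database wrapper name, the input path, and the process() handler below are assumptions for illustration, not part of the project):

    # Hypothetical usage: drain the stream until nextTweet() returns None.
    db = Database()                   # assumed owner of self._reader
    db._reader = open("tweets.txt")   # placeholder input file
    while True:
        tweet = db.nextTweet()
        if tweet is None:             # end of file reached
            break
        process(tweet)                # hypothetical downstream handler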
Code Example #2
File: database.py Project: SiThuc/EventDetector
    def loadInitialTweets(self, tweetFile, numInitTweets):
        self._initialTweets = TweetDatabase()
        self._reader = open(tweetFile)

        for i in range(0, numInitTweets):
            line = self._reader.readline()
            tweet = GeoTweet.geoTweetFromAString(tweetString=line)
            self._initialTweets.add(tweet)
        print("There are %d tweets are loaded as initial tweets."%len(self.getInitialTweets().getTweets()))
Code Example #3
    def load(self, tweetFile):
        # Read the whole file; stop at the first blank line.
        # (Python strings have no isNull(); an empty/blank check is used instead,
        # and the redundant close() after the with-block is dropped.)
        with open(tweetFile) as f:
            for line in f:
                if not line.strip():
                    break
                gt = GeoTweet(line)
                self._tweets.append(gt)
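
A short usage sketch, assuming load belongs to the TweetDatabase class and using a placeholder file path:

    # Hypothetical call; tweets.txt is a placeholder path.
    db = TweetDatabase()
    db.load("tweets.txt")
    print("Loaded %d tweets." % len(db._tweets))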
Code Example #4
File: mongo.py Project: SiThuc/EventDetector
    def loadTweetsIntoQueryDB(self, query, queryDB, refDB):
        # Load for queryDB
        cursor = self._tweetCol.find({
            "timestamp": {
                "$gte": query.getStartTS(),
                "$lt": query.getEndTS()
            }
        })
        for data in cursor:
            tweetId = data["id"]
            timestamp = data["timestamp"]
            lng = data["lng"]
            lat = data["lat"]
            loc = Location(lng, lat)
            entities = data["entities"]
            tweet = GeoTweet(tweetId, timestamp, loc, entities)
            queryDB.append(tweet)

        # Load for refDB
        cursor = self._tweetCol.find({
            "timestamp": {
                "$gte": query.getRefStartTS(),
                "$lt": query.getRefEndTS()
            }
        })
        for data in cursor:
            tweetId = data["id"]
            timestamp = data["timestamp"]
            lng = data["lng"]
            lat = data["lat"]
            loc = Location(lng, lat)
            entities = data["entities"]
            tweet = GeoTweet(tweetId, timestamp, loc, entities)
            refDB.append(tweet)


# if __name__ == "__main__":
#     client = MongoClient("127.0.0.1", 27017)
#     database = client.get_database("EmployeeData")
#     col = database.get_collection("Employees")
#     doc = col.find({"id":{"$gt":2}, "age": {"$lt":29,"$gt":2}})
#     for x in doc:
#         print(x['name'])
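
A hedged call-site sketch: the mongo handle and the query object are assumed to expose the accessors used above (getStartTS, getEndTS, getRefStartTS, getRefEndTS), and TweetDatabase is assumed to support append as in Code Example #5:

    # Hypothetical call site; mongo and query are assumed to exist.
    queryDB = TweetDatabase()
    refDB = TweetDatabase()
    mongo.loadTweetsIntoQueryDB(query, queryDB, refDB)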
Code Example #5
File: mongo.py Project: SiThuc/EventDetector
    def rangeQueryTweetDB(self, startTS, endTS):
        # Fetch every tweet with startTS <= timestamp < endTS into a TweetDatabase.
        cursor = self._tweetCol.find({
            "timestamp": {
                "$gte": startTS,
                "$lt": endTS
            }
        })
        td = TweetDatabase()
        for data in cursor:
            tweetId = data["id"]
            timestamp = data["timestamp"]
            lng = data["lng"]
            lat = data["lat"]
            loc = Location(lng, lat)
            entities = data["entities"]
            tweet = GeoTweet(tweetId, timestamp, loc, entities)
            td.append(tweet)
        return td
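
Under the same assumptions, the two loads in Code Example #4 could be written in terms of this method; a sketch, not the project's actual code:

    # Hypothetical rewrite of loadTweetsIntoQueryDB on top of rangeQueryTweetDB.
    queryDB = mongo.rangeQueryTweetDB(query.getStartTS(), query.getEndTS())
    refDB = mongo.rangeQueryTweetDB(query.getRefStartTS(), query.getRefEndTS())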
Code Example #6
File: demo.py Project: SiThuc/EventDetector
    def init(self, paraFile):
        start = time.time()
        # Read YAML file
        self._config = Config.load(paraFile)

        numInitClusTweets = int(self._config['clustream']['numInitTweets'])
        queryFrameDuration = int(self._config['timespan']['init'])
        refWindowSize = int(self._config['query']['refWindowSize'])
        updateWindow = int(self._config['timespan']['update'])
        minSup = int(self._config['hubseek']['minSup'])

        # load the pre-trained classifier model
        self._model = load_model("../classifier/lstm_T6_best_weights.02-0.9507.hdf5")
        self._model.summary()

        # load tokenizer file
        with open("../classifier/tokenizer.pickle", 'rb') as file:
            self._tokenizer = pickle.load(file)

        # # training phase
        # cnnTraining = CNNTraining()
        # cnnTraining.training(self._config)
        # self._tokenizer = cnnTraining._tokenizer
        # self._model = cnnTraining._model

        # Process the raw tweets if the cleaned file has not been created yet
        cleaned_tweets_file = self._config['file']['input']['cleaned_tweets']
        if not os.path.isfile(cleaned_tweets_file):
            CleanRawTweets.procesingRawTweet(self._config)
        else:
            print("The cleaned file was created before!")

        # classification process
        classified_tweets = self._config['file']['input']['classified_tweets']  # storing classified tweets
        writer = open(classified_tweets, 'w')

        initClusTweets = list()  # List of tweets for initClustream

        processedTweets = 0     # number of tweets processed
        checkRunBatch = False
        startRunBatchFrame = 0
        endRunBatchFrame = 0
        startUpdateFrame = 0
        endUpdateFrame = 0

        with open(cleaned_tweets_file, encoding='utf8') as ctf:                 #read the tweet file line by line
            # read header
            line = ctf.readline()
            line = line.replace('\n', '')
            column_names = line.split(',')

            # collect tweets to init clustream
            for i in range(0, numInitClusTweets):
                line = ctf.readline()
                data = line.split(',')
                data_string = self.processLineData(data)
                geo_tweet = GeoTweet.geoTweetFromAString(data_string)
                initClusTweets.append(geo_tweet)
                startRunBatchFrame = geo_tweet.getTimestamp()
            endRunBatchFrame = startRunBatchFrame + queryFrameDuration      # set the end of the initial batch frame
            print('Clustream is starting to init....')
            self.initClustream(initClusTweets)
            print('Clustream init is done!')

            dtf = None
            batch = []
            while True :
                line = ctf.readline()
                if line == '':
                    break
                line = line.replace('\n', '')
                data = line.split(',')
                batch.append(data)

                if len(batch) == 100:
                    dtf = pd.DataFrame(batch, columns=column_names)
                    texts = dtf.text  # avoid shadowing the built-in input()
                    sequences_test = self._tokenizer.texts_to_sequences(texts)
                    x_test_seq = pad_sequences(sequences_test, maxlen=45)
                    y = self._model.predict(x_test_seq)
                    dtf['class'] = y
                    for index, row in dtf.iterrows():
                        list_string = []
                        list_string.append(row['id'])
                        list_string.append(row['timestamp'])
                        list_string.append(row['lng'])
                        list_string.append(row['lat'])
                        cleaned_text = CleanRawTweets.tweet_cleaner(row['text'])
                        tokenized = tokenizing(cleaned_text)
                        list_string.append(tokenized)
                        temp = '\t'.join(list_string)
                        geo_tweet = GeoTweet.geoTweetFromAString(temp)
                        self._clustream.update(geo_tweet)               #update clustream
                        if row['class'] > 0.65:                     # the tweet is predicted as a disaster tweet
                            if not checkRunBatch:                   # still filling the initial batch frame
                                self._tdb.add(geo_tweet)            # add tweet into the tweet database
                                if (self._tdb.getEndTimestamp() >= endRunBatchFrame):
                                    self.trigger2(self._tdb)
                                    checkRunBatch = True
                                    startUpdateFrame = self._tdb.getStartTimestamp() + updateWindow
                                    endUpdateFrame = self._tdb.getEndTimestamp() + updateWindow
                            else:
                                if geo_tweet.getTimestamp() < endUpdateFrame:
                                    self._insert.add(geo_tweet)
                                else:
                                    print("There are %d tweets in insert set"%self._insert.size())
                                    startUpdateFrame = startUpdateFrame + updateWindow
                                    endUpdateFrame = endUpdateFrame + updateWindow
                                    self.update(startUpdateFrame)
                                    #self.update_normal(startUpdateFrame)
                            writer.write(temp +'\n')
                    batch = []
                    dtf = None

                processedTweets += 1
                # if processedTweets == 50000:
                #     writer.close()
                #     break
                if processedTweets % 5000 == 0:
                    ttime = time.time() - start
                    print("- %d Tweets consumed %f seconds."%(processedTweets, ttime))
                    self._clustream.printStats()


        writer.close()

        end = time.time()

        consumed_time = end-start
        print("Time consumed is %f"%consumed_time)
Code Example #7
    def init(self, paraFile):
        start = time.time()
        # Read YAML file
        self._config = Config.load(paraFile)
        print(self._config)

        init_duration = int(self._config['timespan']['init'])
        update_span = int(self._config['timespan']['update'])

        # # training phase
        # cnnTraining = CNNTraining()
        # cnnTraining.training(self._config)
        # self._tokenizer = cnnTraining._tokenizer
        # self._model = cnnTraining._model

        # Process the raw tweets if the cleaned file has not been created yet
        cleaned_tweets_file = self._config['file']['input']['cleaned_tweets']
        if not os.path.isfile(cleaned_tweets_file):
            CleanRawTweets.procesingRawTweet(self._config)
        else:
            print("The cleaned file was created before!")

        countInit = 0
        checkInit = False
        startUpdateFrame = 0
        endUpdateFrame = 0

        with open(cleaned_tweets_file, encoding='utf8') as ctf:  # read the tweet file line by line
            line = ctf.readline()  # read header
            line = line.replace('\n', '')
            column_names = line.split(',')
            dtf = None
            batch = []
            while True:
                line = ctf.readline()
                if line == '':
                    break
                line = line.replace('\n', '')
                data = line.split(',')
                batch.append(data)
                if len(batch) == 100:
                    dtf = pd.DataFrame(batch, columns=column_names)
                    for index, row in dtf.iterrows():
                        list_string = []
                        list_string.append(row['id'])
                        list_string.append(row['timestamp'])
                        list_string.append(row['lng'])
                        list_string.append(row['lat'])
                        cleaned_text = CleanRawTweets.tweet_cleaner(row['text'])
                        tokenized = tokenizing(cleaned_text)
                        list_string.append(tokenized)
                        temp = '\t'.join(list_string)
                        geo_tweet = GeoTweet.geoTweetFromAString(temp)

                        if not checkInit:
                            self._tdb.add(geo_tweet)
                            if (self._tdb.getEndTimestamp() -
                                    self._tdb.getStartTimestamp() >= init_duration):
                                self.trigger2(self._tdb)
                                checkInit = True
                                startUpdateFrame = self._tdb.getStartTimestamp() + update_span
                                endUpdateFrame = self._tdb.getEndTimestamp() + update_span
                        else:
                            self._clustream.update(geo_tweet)  # update clustream
                            if geo_tweet.getTimestamp() < endUpdateFrame:
                                self._insert.add(geo_tweet)
                            else:
                                print("There are %d tweets in the insert set" % self._insert.size())
                                startUpdateFrame = startUpdateFrame + update_span
                                endUpdateFrame = endUpdateFrame + update_span
                                self.update(startUpdateFrame)
                    batch = []
                    dtf = None

                countInit += 1
                if countInit % 1000 == 0:
                    ttime = time.time() - start
                    print("- %d Tweets consumed %f seconds." %
                          (countInit, ttime))
                if countInit == 20000:
                    break
        end = time.time()

        consumed_time = end - start
        print("Time consumed is %f" % consumed_time)