def nextTweet(self):
    tweet = None
    while tweet is None or tweet.numEntity() == 0:
        line = self._reader.readline()
        if line == '':
            return None
        tweet = GeoTweet.geoTweetFromAString(line)
    return tweet
def loadInitialTweets(self, tweetFile, numInitTweets):
    self._initialTweets = TweetDatabase()
    self._reader = open(tweetFile)
    for i in range(0, numInitTweets):
        line = self._reader.readline()
        tweet = GeoTweet.geoTweetFromAString(tweetString=line)
        self._initialTweets.add(tweet)
    print("There are %d tweets loaded as initial tweets." % len(self.getInitialTweets().getTweets()))
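# Usage sketch (illustrative only): the loader argument and file path are assumptions,
# not names from the original code; it shows how loadInitialTweets() and nextTweet()
# above can be combined to stream tweets that carry at least one entity.
def _example_stream_tweets(loader, tweet_file="data/tweets.txt", num_init=1000):
    loader.loadInitialTweets(tweet_file, num_init)  # fill the initial TweetDatabase
    streamed = []
    while True:
        tweet = loader.nextTweet()  # returns None once the file is exhausted
        if tweet is None:
            break
        streamed.append(tweet)
    return streamed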
def load(self, tweetFile):
    with open(tweetFile) as f:
        for line in f:
            if not line.strip():  # stop at an empty line
                break
            gt = GeoTweet(line)
            self._tweets.append(gt)
def loadTweetsIntoQueryDB(self, query, queryDB, refDB):
    # Load tweets in the query window into queryDB
    cursor = self._tweetCol.find({
        "timestamp": {
            "$gte": query.getStartTS(),
            "$lt": query.getEndTS()
        }
    })
    for data in cursor:
        tweetId = data["id"]
        timestamp = data["timestamp"]
        lng = data["lng"]
        lat = data["lat"]
        loc = Location(lng, lat)
        entities = data["entities"]
        tweet = GeoTweet(tweetId, timestamp, loc, entities)
        queryDB.append(tweet)
    # Load tweets in the reference window into refDB
    cursor = self._tweetCol.find({
        "timestamp": {
            "$gte": query.getRefStartTS(),
            "$lt": query.getRefEndTS()
        }
    })
    for data in cursor:
        tweetId = data["id"]
        timestamp = data["timestamp"]
        lng = data["lng"]
        lat = data["lat"]
        loc = Location(lng, lat)
        entities = data["entities"]
        tweet = GeoTweet(tweetId, timestamp, loc, entities)
        refDB.append(tweet)
def rangeQueryTweetDB(self, startTS, endTS):
    cursor = self._tweetCol.find(
        {"timestamp": {"$gte": startTS, "$lt": endTS}})
    td = TweetDatabase()
    for data in cursor:
        tweetId = data["id"]
        timestamp = data["timestamp"]
        lng = data["lng"]
        lat = data["lat"]
        loc = Location(lng, lat)
        entities = data["entities"]
        tweet = GeoTweet(tweetId, timestamp, loc, entities)
        td.append(tweet)
    return td
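# Usage sketch (illustrative only): the MongoDB connection details, database and
# collection names, and the attribute wiring in the comments are assumptions; only
# rangeQueryTweetDB() above is from the original code.
def _example_range_query(loader, start_ts, end_ts):
    # loader is expected to expose self._tweetCol, e.g. set up as:
    #   client = MongoClient("127.0.0.1", 27017)
    #   loader._tweetCol = client.get_database("TweetData").get_collection("tweets")
    td = loader.rangeQueryTweetDB(start_ts, end_ts)
    print("Retrieved %d tweets in [%d, %d)." % (len(td.getTweets()), start_ts, end_ts))
    return td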
def init(self, paraFile):
    start = time.time()
    # Read YAML file
    self._config = Config.load(paraFile)
    numInitClusTweets = int(self._config['clustream']['numInitTweets'])
    queryFrameDuration = int(self._config['timespan']['init'])
    refWindowSize = int(self._config['query']['refWindowSize'])
    updateWindow = int(self._config['timespan']['update'])
    minSup = int(self._config['hubseek']['minSup'])
    # load the pre-trained classifier weights
    self._model = load_model("../classifier/lstm_T6_best_weights.02-0.9507.hdf5")
    self._model.summary()
    # load tokenizer file
    with open("../classifier/tokenizer.pickle", 'rb') as file:
        self._tokenizer = pickle.load(file)
    # # training phase
    # cnnTraining = CNNTraining()
    # cnnTraining.training(self._config)
    # self._tokenizer = cnnTraining._tokenizer
    # self._model = cnnTraining._model
    # Process raw tweets if the cleaned file has not been created yet
    cleaned_tweets_file = self._config['file']['input']['cleaned_tweets']
    if not os.path.isfile(cleaned_tweets_file):
        CleanRawTweets.procesingRawTweet(self._config)
    else:
        print("The cleaned file was created before!")
    # classification process: classified tweets are stored in this file
    classified_tweets = self._config['file']['input']['classified_tweets']
    writer = open(classified_tweets, 'w')
    initClusTweets = list()  # list of tweets used to initialize clustream
    processedTweets = 0      # number of tweets processed so far
    checkRunBatch = False
    startRunBatchFrame = 0
    endRunBatchFrame = 0
    startUpdateFrame = 0
    endUpdateFrame = 0
    with open(cleaned_tweets_file, encoding='utf8') as ctf:  # read the tweet file line by line
        # read header
        line = ctf.readline()
        line = line.replace('\n', '')
        column_names = line.split(',')
        # collect tweets to init clustream
        for i in range(0, numInitClusTweets):
            line = ctf.readline()
            data = line.split(',')
            data_string = self.processLineData(data)
            geo_tweet = GeoTweet.geoTweetFromAString(data_string)
            initClusTweets.append(geo_tweet)
            startRunBatchFrame = geo_tweet.getTimestamp()
        # set startTS and endTS for the batch run
        endRunBatchFrame = startRunBatchFrame + queryFrameDuration
        print('Clustream is starting to init...')
        self.initClustream(initClusTweets)
        print('Clustream init is done!')
        dtf = None
        batch = []
        while True:
            line = ctf.readline()
            if line == '':
                break
            line = line.replace('\n', '')
            data = line.split(',')
            batch.append(data)
            if len(batch) == 100:
                # classify the current batch with the pre-trained model
                dtf = pd.DataFrame(batch, columns=column_names)
                texts = dtf.text
                sequences_test = self._tokenizer.texts_to_sequences(texts)
                x_test_seq = pad_sequences(sequences_test, maxlen=45)
                y = self._model.predict(x_test_seq)
                dtf['class'] = y
                for index, row in dtf.iterrows():
                    list_string = []
                    list_string.append(row['id'])
                    list_string.append(row['timestamp'])
                    list_string.append(row['lng'])
                    list_string.append(row['lat'])
                    cleaned_text = CleanRawTweets.tweet_cleaner(row['text'])
                    tokenized = tokenizing(cleaned_text)
                    list_string.append(tokenized)
                    temp = '\t'.join(list_string)
                    geo_tweet = GeoTweet.geoTweetFromAString(temp)
                    self._clustream.update(geo_tweet)  # update clustream
                    if row['class'] > 0.65:  # the tweet is predicted as a disaster tweet
                        if not checkRunBatch:
                            self._tdb.add(geo_tweet)  # add the tweet into the tweet database
                            if self._tdb.getEndTimestamp() >= endRunBatchFrame:
                                self.trigger2(self._tdb)
                                checkRunBatch = True
                                startUpdateFrame = self._tdb.getStartTimestamp() + updateWindow
                                endUpdateFrame = self._tdb.getEndTimestamp() + updateWindow
                        else:
                            if geo_tweet.getTimestamp() < endUpdateFrame:
                                self._insert.add(geo_tweet)
                            else:
                                print("There are %d tweets in the insert set" % self._insert.size())
                                startUpdateFrame = startUpdateFrame + updateWindow
                                endUpdateFrame = endUpdateFrame + updateWindow
                                self.update(startUpdateFrame)
                                # self.update_normal(startUpdateFrame)
                    writer.write(temp + '\n')
                batch = []
                dtf = None
            processedTweets += 1
            # if processedTweets == 50000:
            #     writer.close()
            #     break
            if processedTweets % 5000 == 0:
                ttime = time.time() - start
                print("- %d Tweets consumed %f seconds." % (processedTweets, ttime))
                self._clustream.printStats()
    writer.close()
    end = time.time()
    consumed_time = end - start
    print("Time consumed is %f" % consumed_time)
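# Configuration sketch (illustrative values): a dict mirroring the YAML keys that
# init() above actually reads from self._config; the numbers and paths are assumptions,
# only the key layout comes from the code.
_example_config = {
    'clustream': {'numInitTweets': 10000},
    'timespan': {'init': 3600, 'update': 600},   # durations, unit assumed to be seconds
    'query': {'refWindowSize': 3600},
    'hubseek': {'minSup': 5},
    'file': {
        'input': {
            'cleaned_tweets': 'data/cleaned_tweets.csv',
            'classified_tweets': 'data/classified_tweets.csv',
        }
    },
}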
def init(self, paraFile):
    start = time.time()
    # Read YAML file
    self._config = Config.load(paraFile)
    print(self._config)
    init_duration = int(self._config['timespan']['init'])
    update_span = int(self._config['timespan']['update'])
    # # training phase
    # cnnTraining = CNNTraining()
    # cnnTraining.training(self._config)
    # self._tokenizer = cnnTraining._tokenizer
    # self._model = cnnTraining._model
    # Processing raw tweets if the cleaned file is not created yet
    cleaned_tweets_file = self._config['file']['input']['cleaned_tweets']
    if not os.path.isfile(cleaned_tweets_file):
        CleanRawTweets.procesingRawTweet(self._config)
    else:
        print("The cleaned file was created before!")
    countInit = 0
    checkInit = False
    startUpdateFrame = 0
    endUpdateFrame = 0
    with open(cleaned_tweets_file, encoding='utf8') as ctf:  # read the tweet file line by line
        line = ctf.readline()  # read header
        line = line.replace('\n', '')
        column_names = line.split(',')
        dtf = None
        batch = []
        while True:
            line = ctf.readline()
            if line == '':
                break
            line = line.replace('\n', '')
            data = line.split(',')
            batch.append(data)
            if len(batch) == 100:
                dtf = pd.DataFrame(batch, columns=column_names)
                for index, row in dtf.iterrows():
                    list_string = []
                    list_string.append(row['id'])
                    list_string.append(row['timestamp'])
                    list_string.append(row['lng'])
                    list_string.append(row['lat'])
                    cleaned_text = CleanRawTweets.tweet_cleaner(row['text'])
                    tokenized = tokenizing(cleaned_text)
                    list_string.append(tokenized)
                    temp = '\t'.join(list_string)
                    geo_tweet = GeoTweet.geoTweetFromAString(temp)
                    if not checkInit:
                        self._tdb.add(geo_tweet)
                        if (self._tdb.getEndTimestamp() - self._tdb.getStartTimestamp()) >= init_duration:
                            self.trigger2(self._tdb)
                            checkInit = True
                            startUpdateFrame = self._tdb.getStartTimestamp() + update_span
                            endUpdateFrame = self._tdb.getEndTimestamp() + update_span
                    else:
                        self._clustream.update(geo_tweet)  # update geotweet
                        if geo_tweet.getTimestamp() < endUpdateFrame:
                            self._insert.add(geo_tweet)
                        else:
                            print("There are %d tweets in insert set" % self._insert.size())
                            startUpdateFrame = startUpdateFrame + update_span
                            endUpdateFrame = endUpdateFrame + update_span
                            self.update(startUpdateFrame)
                batch = []
                dtf = None
            countInit += 1
            if countInit % 1000 == 0:
                ttime = time.time() - start
                print("- %d Tweets consumed %f seconds." % (countInit, ttime))
            if countInit == 20000:
                break
    end = time.time()
    consumed_time = end - start
    print("Time consumed is %f" % consumed_time)