def query():
    """Route handler: per-grid tweet volume by distinct user.

    Query params:
        start_time / end_time -- ISO-8601 UTC timestamps ('%Y-%m-%dT%H:%M:%SZ')
        grid_id [optional]    -- restrict the aggregation to a single grid

    Returns:
        JSON array: [{grid_id, volume}, ...] where volume is the number of
        distinct users who tweeted in that grid during the time window.
    """
    start_time = datetime.strptime(request.query.get('start_time'), '%Y-%m-%dT%H:%M:%SZ')
    end_time = datetime.strptime(request.query.get('end_time'), '%Y-%m-%dT%H:%M:%SZ')
    grid_id = request.query.get('grid_id')

    mg = MongoDB()
    mg.connect()
    print('querying grid volumes...')

    # BUG FIX: grid_id was read from the request but never used in the
    # aggregation (only a commented-out variant applied it), so the optional
    # filter silently did nothing. Apply it to $match when present.
    match = {'created_at': {'$gt': start_time, '$lt': end_time}}
    if grid_id:
        match['grid_id'] = grid_id

    # Group and count distinct users: the first $group collapses to one
    # document per (grid, user) pair; the second counts those pairs per grid.
    results = mg.group_by([
        {'$match': match},
        {'$group': {'_id': {'grid_id': '$grid_id', 'user_id': '$user_id'},
                    'count': {'$sum': 1}}},
        {'$group': {'_id': '$_id.grid_id', 'count': {'$sum': 1}}},
    ])

    ret = [{'grid_id': r['_id'], 'volume': r['count']} for r in results]
    response.content_type = 'application/json'
    return json_dumps(ret, indent=2)
def load_tweets_to_grids():
    """Pull every tweet from MongoDB and index the lot into a new GridDB."""
    store = MongoDB()
    store.connect()
    grid_db = GridDB()
    grid_db.add(store.find())
    return grid_db
def all_grids(start_time='2012-10-15T20:00:02Z', end_time='2012-11-15T20:00:02Z'):
    """Build a daily time series over all grids and run STL decomposition.

    Args:
        start_time / end_time: ISO-8601 UTC bounds ('%Y-%m-%dT%H:%M:%SZ').
            Defaults preserve the previously hard-coded Sandy window.

    Returns:
        The STL seasonal decomposition of the daily series (previously the
        result was computed and discarded).
    """
    mg = MongoDB()
    mg.connect()
    griddb = GridDB()
    print('querying grid volumes...')

    # Generalized: the date window used to be hard-coded inline; it is now a
    # backward-compatible pair of keyword parameters.
    fmt = '%Y-%m-%dT%H:%M:%SZ'
    results = mg.group_by([
        {'$match': {'created_at': {'$gt': datetime.strptime(start_time, fmt),
                                   '$lt': datetime.strptime(end_time, fmt)}}},
    ])
    griddb.add(results)

    # 'D' -> daily resampling of the raw tweet stream.
    ret = Grid.get_raw_pandas_ts(results, 'D')
    return STL.seasonal_decomposition(ret)
def single_grid(grid_id):
    """Print the hourly STL seasonal decomposition of one grid's tweet volume.

    Args:
        grid_id: identifier of the grid cell to analyze.

    Side effects: prints the raw query results and the decomposition; no
    return value.
    """
    mg = MongoDB()
    mg.connect()

    # NOTE(review): the original body also built an unused 20:00:02-based
    # window (dead locals shadowing these names); the midnight bounds below
    # are the ones the query actually used, and are kept.
    fmt = '%Y-%m-%dT%H:%M:%SZ'
    start_time = datetime.strptime('2012-10-15T00:00:00Z', fmt)
    end_time = datetime.strptime('2012-11-15T00:00:00Z', fmt)

    print('querying grid volumes...')
    results = mg.group_by([
        {'$match': {'created_at': {'$gt': start_time, '$lt': end_time},
                    'grid_id': grid_id}},
    ])
    print(results)

    # 'H' -> hourly resampling of the raw tweet stream for this grid.
    ret = Grid.get_raw_pandas_ts(results, 'H')
    print(STL.seasonal_decomposition(ret))
def write_tweets_to_mongo():
    """Load raw tweets, assign each to a grid cell, and bulk-insert into MongoDB.

    This is the initial step of the pipeline: every later db-specific
    operation reads the dataset inserted here. Intended to run only once;
    it drops the existing collection before inserting.
    """
    data_file = "data/sandy_all.txt"
    kml_file = ["data/nj_ct.kml", "data/nyc_ct_sea.kml"]

    tweets = load_tweets(data_file)

    # Build the grid index from both KML shapefiles, then tag each tweet
    # with the grid cell that contains it.
    grid_db = GridDB()
    grid_db.load_grid_from_file(kml_file[0])
    grid_db.load_grid_from_file(kml_file[1])
    grid_db.check_and_add(tweets)

    # Index the grid-tagged tweets into MongoDB.
    mg = MongoDB()
    mg.connect()
    print("connected...")
    mg.drop()  # start from a clean collection

    print("inserting...")
    # Serialize each tweet to a plain dict before insertion
    # (comprehension replaces the original manual append loop).
    mg.insert_tweets([t.to_dict() for t in grid_db.get_tweets()])
def query():
    """Route handler: tweet frequency time series over the whole dataset.

    Query params:
        start_time / end_time -- ISO-8601 UTC timestamps ('%Y-%m-%dT%H:%M:%SZ')
        aggregation           -- resampling rule passed to Grid.get_ts (e.g. 'H', 'D')

    Returns:
        JSON array: [{start_time, frequency}, ...]
    """
    fmt = '%Y-%m-%dT%H:%M:%SZ'
    start_time = datetime.strptime(request.query.get("start_time"), fmt)
    end_time = datetime.strptime(request.query.get("end_time"), fmt)
    aggregation = request.query.get("aggregation")

    mg = MongoDB()
    mg.connect()
    print("querying time series...")

    results = mg.find(
        {'created_at': {'$gt': start_time, '$lt': end_time}}
    )
    results = Grid.get_ts(results, aggregation)

    # NOTE(review): Timestamp.to_datetime() is deprecated/removed in modern
    # pandas (.to_pydatetime() is the replacement) — confirm the pinned
    # pandas version before upgrading.
    ret = [
        {'start_time': bucket.to_datetime().strftime('%Y-%m-%dT%H:%M:%SZ'),
         'frequency': freq.item()}
        for bucket, freq in results
    ]
    response.content_type = 'application/json'
    return json_dumps(ret, indent=2)
# NOTE(review): this bare `return` sits at module level, which is a
# SyntaxError if executed as-is — it is almost certainly the tail of a
# function whose body was mangled onto this line. Confirm against VCS
# before removing or re-attaching it.
return grid_db


if __name__ == "__main__":
    # One-time initial load (see write_tweets_to_mongo); left disabled.
    # write_tweets_to_mongo()

    # Load all tweets into an in-memory GridDB and report outlier grids
    # on an hourly ('H') aggregation.
    grid_db = load_tweets_to_grids()
    print(grid_db.get_outlier_grid_ids("H"))
    print("tweets loaded")
    # for grid_id in grid_db.grid_cache:
    #     print(grid_id, grid_db.grid_cache[grid_id].get_ts(grid_db.get_tweets(), 'H'))

    # NOTE(review): exit() here makes everything below unreachable dead
    # code — presumably kept for ad-hoc debugging; confirm before deleting.
    exit()
    mg = MongoDB()
    mg.connect()
    print("querying grid volumes...")
    results = mg.group_by(
        [
            {
                "$match": {
                    "created_at": {
                        "$gt": datetime.strptime("2012-10-15T20:00:02Z", "%Y-%m-%dT%H:%M:%SZ"),
                        "$lt": datetime.strptime("2012-10-22T20:00:02Z", "%Y-%m-%dT%H:%M:%SZ"),
                    }
                }
            }
        ]
    )
    # print(results)
def wrapper(func, *args, **kwargs):
    # NOTE(review): this wrapper opens a Mongo connection using the app config
    # but never invokes `func` and never returns — the body is likely
    # truncated at the end of this chunk; confirm against the full file.
    mongo = MongoDB()
    mongo.connect(app.config['app.mongohost'])