/
transfer_data.py
63 lines (49 loc) · 2.04 KB
/
transfer_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
__author__ = 'eddiexie'
from open_gis import get_all_parks
import pymongo
import copy
from shapely.geometry import Point
import time
def store_into_db(parks, data, data_type='photo'):
    """Assign each data point (photo or tweet) to the park polygon that
    contains it and persist it in that park's MongoDB collection.

    Args:
        parks: iterable of park objects exposing get_name(), get_id() and
            get_poly() (a shapely polygon).
        data: list of dicts carrying 'location' ({'longitude', 'latitude'})
            and 'id' keys, as read from the citybeat_production collections.
        data_type: label written to each stored document ('photo' or 'tweet').
    """
    conn = pymongo.MongoClient(host='grande.rutgers.edu')
    taken = [0] * len(data)
    for park in parks:
        print('Processing for Park ', park.get_name(), park.get_id())
        collection = conn['parks_plazas'][park.get_id()]
        # create_index is idempotent and replaces the deprecated ensure_index.
        collection.create_index([('created_time', pymongo.ASCENDING),
                                 ('data_type', pymongo.ASCENDING)])
        for n, data_point in enumerate(data):
            point = Point(data_point['location']['longitude'],
                          data_point['location']['latitude'])
            if point.within(park.get_poly()):
                data_point['datatype'] = data_type
                # Reuse the source document id as _id so re-runs overwrite
                # instead of duplicating documents.
                data_point['_id'] = data_point['id']
                print('saving data point...')
                # replace_one(..., upsert=True) is the modern equivalent of
                # the deprecated Collection.save().
                collection.replace_one({'_id': data_point['_id']},
                                       data_point, upsert=True)
                taken[n] = 1
        # Drop points already assigned so subsequent parks scan fewer items.
        # NOTE(review): original indentation was lost; this filtering is
        # assumed to run once per park (a point belongs to at most one
        # park) — as written flat it was dead code. Confirm intent.
        data = [d for n, d in enumerate(data) if taken[n] == 0]
        taken = [0] * len(data)
def transfer(start, end):
    """Copy photos and tweets created in the open interval (start, end)
    from the production collections into the per-park collections.

    Args:
        start: lower creation-time bound, exclusive, as a string of unix
            seconds (compared with MongoDB $gt against 'created_time').
        end: upper creation-time bound, exclusive, same format ($lt).

    NOTE(review): bounds are strings, so MongoDB compares them
    lexicographically — assumes 'created_time' is stored as a same-width
    integer string; confirm against the production schema.
    """
    parks = get_all_parks()
    conn = pymongo.MongoClient(host='grande.rutgers.edu')
    photos_collection = conn['citybeat_production']['photos']
    tweets_collection = conn['citybeat_production']['tweets']
    print(start, end)
    condition = {'created_time': {'$lt': end, '$gt': start}}
    print('fetching data...')
    # Materialize the cursors once; list(cursor) avoids a redundant
    # element-by-element comprehension.
    all_photos = list(photos_collection.find(condition))
    all_tweets = list(tweets_collection.find(condition))
    print(len(all_photos))
    print(len(all_tweets))
    print('fetching data done')
    store_into_db(parks, all_photos, 'photo')
    store_into_db(parks, all_tweets, 'tweet')
def run(days=0):
    """Transfer the most recent 1.5 hours of data. Intended to run hourly;
    the extra half hour of overlap avoids missing items at the boundary
    (duplicates are harmless because documents are upserted by _id).

    Args:
        days: unused; kept for backward compatibility with existing callers.
    """
    current_time = int(time.time())
    # Use an integer offset: 3600 * 1.5 is a float, so the original
    # str(current_time - 3600*1.5) produced e.g. '1385000000.5', which
    # breaks the string comparison against 'created_time' in the DB.
    transfer(str(current_time - 5400), str(current_time))
    # Backfill loop kept for reference:
    # for h in range(72):
    #     transfer(str(current_time - (h+1)*3600), str(current_time - h*3600))


if __name__ == '__main__':
    # Guard the entry point so importing this module does not trigger a
    # transfer as a side effect.
    run()