-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawl-projects.py
77 lines (67 loc) · 2.31 KB
/
crawl-projects.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import os, pymongo, sys, csv, datetime, threading, json, random, time
from behance_python.api import API
from behance_python.exceptions \
import TooManyRequests, InternalServerError, BehanceException
from behance_python.behance import Behance
from behance_python.user import User
def remove_dot_key(obj):
    """Strip every "." from the keys of ``obj`` in place and return ``obj``.

    This script passes the function to ``json.loads`` as ``object_hook``,
    so it is called with each decoded JSON object (a dict).

    Args:
        obj: dict with string keys; it is modified in place.

    Returns:
        The same dict object, with each key containing "." replaced by the
        same key without the dots.  If the stripped key already exists in
        ``obj``, its value is overwritten by the value of the dotted key.
    """
    # Iterate over a snapshot of the keys: the loop body inserts and deletes
    # entries, which is an error when iterating a live key view.
    for key in list(obj.keys()):
        new_key = key.replace(".", "")
        if new_key != key:
            obj[new_key] = obj.pop(key)
    return obj
# Behance API client, built from a key typed in at the prompt.
key = raw_input('Input your Behance API key: ')
behance = API(key)

# MongoDB on localhost:27017 -- the "behance" database and its two collections.
localClient = pymongo.MongoClient('localhost', 27017)
db = localClient.behance
dbusers, dbprojects = db.users, db.projects
dbprojects.remove({})  # start from an empty projects collection

# Crawl state: projects already stored (keyed by project id) and running totals.
visitedProjects = {}
numUser = numProj = 0

# Read every stored user document into memory before crawling starts.
users = list(dbusers.find())
for user in users:
numUser = numUser + 1
print "Retrieving projects from user id: ", user["user_id"], " (", numUser, ")"
while True:
try:
u = User(user["user_id"], user["auth_key"])
break
except TooManyRequests as e:
print "Maximum Request Reached! Wating for Next Hour..."
time.sleep(60) # retry after 1 min
continue
except BehanceException as e:
print "BehanceException: ", str(e)
break
pageNum = 1
while True:
try:
projects = u.get_projects(page=pageNum)
if len(projects)==0:
break
for project in projects:
# avoid duplicate
if visitedProjects.has_key(project["id"]):
continue
visitedProjects[project["id"]] = project
dbprojects.insert(json.loads(json.dumps(project), object_hook=remove_dot_key))
numProj +=1
print "PageNum (Total Projects) = ", pageNum, ", ", numProj
pageNum +=1
if pageNum>=400:
break
except TooManyRequests as e:
print "Maximum Request Reached! Wating for Next Hour..."
time.sleep(60) # retry after 1 min
continue
except BehanceException as e:
print "BehanceException: ", str(e)
break
print "Total Projects, Users = ", numProj, ", ", numUser