forked from jude90/gk_mining
/
mapredu.py
143 lines (110 loc) · 3.61 KB
/
mapredu.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from parse import get_ukeys
import requests
import json
from model import Base,DB_Session,Article,Comment,User
from collections import defaultdict
from multiprocessing.dummy import Pool
import multiprocessing
import time
import sys
from datetime import datetime
from random import random
from pipe import *
# Desktop-browser User-Agent so the Guokr site/API serves requests as if
# they came from a regular browser.
header={"User-Agent":
'''Mozilla/5.0 (Windows NT 5.1)
AppleWebKit/537.36 (KHTML, like Gecko)
Chrome/30.0.1599.101 Safari/537.36'''
}
# Thread pool (multiprocessing.dummy wraps threading): appropriate for this
# I/O-bound crawl; 3 workers per CPU core.
pool =Pool(multiprocessing.cpu_count()*3)
#@profile
def partition(pages):
    """Persist comment rows and invert pages into a user -> articles map.

    pages -- iterable of (article, ukey_lst) pairs as produced by foo_map.
    Side effect: bulk-inserts one Comment row per (article, user) pair.
    Returns a dict mapping each ukey to the list of articles that user
    commented on.
    """
    user_dct = defaultdict(list)
    for article, ukey_lst in pages:
        # One short-lived session per page keeps each transaction small;
        # try/finally fixes the session leak when execute/commit raises.
        session = DB_Session()
        try:
            session.execute(
                Comment.__table__.insert(),
                [{'article': article, 'user': ukey} for ukey in ukey_lst]
            )
            session.commit()
        finally:
            session.close()
        for ukey in ukey_lst:
            # user_dct is a defaultdict(list), so the original bare
            # try/except fallback was dead code and is removed.
            user_dct[ukey].append(article)
    return user_dct
#@profile
def foo_map(article):
    """Fetch the commenter ukeys for one article.

    Returns (article, unique_ukey_list) on success, or None on any failure
    so the downstream pipe `where` filter can drop it. Failures are written
    to the module-level `log` file.
    """
    try:
        ukey_lst = list(set(get_ukeys(article)))
        # Random jitter between requests so the crawl does not hammer the site.
        time.sleep(random())
        sys.stdout.write("get article %s\n" % article)
        return article, ukey_lst
    except Exception as e:  # forward-compatible form of `except Exception, e`
        log.write("%s failure in article:%s\n" % (e, article))
        # Long back-off after a failure before this worker takes more work.
        time.sleep(100)
        return None
#@profile
def foo_reduce(ukey):
    """Fetch one user profile from the Guokr API and flatten its counters.

    Returns a dict of counter fields suitable for a User bulk insert, or
    None on any failure (network error, bad JSON, missing key), which the
    downstream pipe `where` filter drops. Failures are written to the
    module-level `log` file.
    """
    url = 'http://apis.guokr.com/community/user/%s.json' % ukey
    try:
        # timeout added so a hung connection cannot stall a worker forever;
        # requests.Timeout is caught below like any other failure.
        user_json = requests.get(url, headers=header, timeout=30).content
        # Random jitter between requests so the crawl does not hammer the API.
        time.sleep(random())
        sys.stdout.write("get user %s\n" % ukey)
        profile = json.loads(user_json)["result"]
        user = dict(
            ukey=profile["ukey"],
            blogs=profile["blogs_count"],
            posts=profile["posts_count"],
            answers=profile["answers_count"],
            questions=profile["questions_count"],
            followers=profile["followers_count"],
            followings=profile["followings_count"],
            activities=profile["activities_count"],
            answer_supports=profile["answer_supports_count"],
            date_created=profile["date_created"],
        )
        return user
    except Exception as e:  # forward-compatible form of `except Exception, e`
        log.write("%s failure user :%s\n" % (e, ukey))
        # Long back-off after a failure before this worker takes more work.
        time.sleep(100)
        return None
def chunks(arr, n):
    """Split *arr* into consecutive slices of at most *n* elements each."""
    pieces = []
    start = 0
    total = len(arr)
    while start < total:
        pieces.append(arr[start:start + n])
        start += n
    return pieces
#@profile
def main():
    # Pipeline: map article URLs -> (article, ukeys) pairs, persist comment
    # rows, then map each distinct ukey -> user profile and bulk-insert users.
    # urls.txt holds a JSON array; each element is fed to foo_map as an
    # article identifier (presumably a URL path — confirm against parse.get_ukeys).
    urls = json.load(open("urls.txt",'r'))
    # `|where(...)` is the `pipe` module's lazy filter; it drops the None
    # results foo_map returns on failure before partition sees them.
    page_set = pool.imap(foo_map, urls)|where(lambda x:x)
    ukeys = partition(page_set)
    '''
    with open("ukeys.txt",'w') as fuk:
        json.dump(ukeys,fuk,indent=4)
    '''
    print "finish articles !"
    # `as_tuple` forces the lazy pipe, so every profile fetch happens here;
    # failed fetches (None) are filtered out first.
    users = pool.map(foo_reduce,ukeys.iterkeys())| where(lambda x :x)|as_tuple
    # Insert in batches of 1000 rows, committing after each batch to keep
    # individual transactions bounded.
    users_lst = chunks(users,1000)
    session = DB_Session()
    for us in users_lst:
        session.execute(
            User.__table__.insert(),
            us
            #pool.map(foo_reduce,ukeys.iterkeys())
        )
        session.commit()
    session.close()
if __name__ == '__main__':
    # DB_CONNECT_STRING ='mysql+mysqldb://root:@localhost/gk?charset=utf8'
    # engine = create_engine(DB_CONNECT_STRING,echo=True)
    # DB_Session = sessionmaker(bind= engine)
    print("start at  %s" % datetime.now())
    # `log` is read as a module-level global by foo_map and foo_reduce, so
    # it must be bound before main() runs. The with statement fixes the
    # original resource leak where the handle was opened but never closed,
    # guaranteeing the crawl log is flushed even if main() raises.
    with open("crawlog.txt", 'a') as log:
        main()
'''
print foo_reduce('za4yxz')
print foo_reduce('toxrf4')
print foo_reduce('ifv1x3')
print foo_map("/article/21/")
print foo_map("/article/49/")
print foo_map("/article/56/")
'''