/
nw.py
152 lines (122 loc) · 4.4 KB
/
nw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import with_statement
import os
import re
import sys
import time
import urllib2
from threading import currentThread, Thread
import cPickle as pickle
import random
import socket
socket.setdefaulttimeout(10)
from contextlib import closing
from cStringIO import StringIO
from httplib import HTTP, HTTPConnection
from threadpool import ThreadPool, Queue, makeRequests
from lxml import etree
import re
HOST = "http://zh.wiktionary.org"
ct_match = re.compile("Category:(.+)")
item_match = re.compile("wiki/([^:]+)")
def worker(cq, qcs, qns, cs, ns):
#qcs : Queue Category list
#qns : Queue Node list
#cs : Categorey set s
#ns : Node set s
crt = currentThread()
h = HTTPConnection("zh.wiktionary.org") #HTTP()
h.connect()
while not cq.empty():
try:
task = cq.get(timeout=5)
except Queue.Empty:
break
print crt.getName(), task['url']
h.putrequest('GET', task['url'])
h.putheader("User-agent", "Mozilla/5.0")
h.endheaders()
with closing(h.getresponse()) as resp:
f = resp.fp
ret = f.read()
cur_node = etree.HTML(ret)
for node in cur_node.xpath("//div[@id='bodyContent']//a[starts-with(@href,'/wiki/Category:')]"):
href, text = node.get("href"), node.text
if "#" in href:
href = href[:href.index("#")]
if not href or href[:6] != "/wiki/":
continue
if href not in cs:
#cts_will.append({"url": HOST + href, "alias": text})
qcs.put({"url": "%s%s" % (HOST, href), "alias": text})
cs[href] = [text]
else:
cs[href].append(text)
#print node.get("href"), node.text
for node in cur_node.xpath("//div[@id='bodyContent']//a[not(contains(@href,':'))]"):
href, text = node.get("href"), node.text
if "#" in href:
href = href[:href.index("#")]
if not href or href[:6] != "/wiki/":
continue
if href not in ns:
qns.put({"url": "%s%s" % (HOST, href), "alias": text})
ns[href] = [text]
else:
ns[href].append(text)
def main():
cur_url = "http://zh.wikipedia.org/wiki/Category:%E9%A0%81%E9%9D%A2%E5%88%86%E9%A1%9E" #"http://zh.wikipedia.org/wiki/Wikipedia:%E5%88%86%E9%A1%9E%E7%B4%A2%E5%BC%95"
cur_alias = "分类"
cts = {}
its = {} # ItemName(Wiki Item URL) : ([Alias1, Alias2, Alias3, ...], [InItemName1, InItemName, ..], [OutItemName1, OutItemName2, ..])
h = HTTP()
h.connect("zh.wikipedia.org")
qcs = Queue.Queue()
qns = Queue.Queue()
qcs.put({"url": cur_url, "alias": cur_alias})
#tp = ThreadPool(10)
tds = [Thread(target=worker, args=(qcs if i % 2 else qns, qcs, qns, cts, its))
for i in xrange(10)]
t_start = time.time()
for i in tds:
i.start()
time.sleep(5)
for i in tds:
i.join()
#worker_c(qcs, qns, cts, its)
#rqs = makeRequests(worker_c, args)
#[tp.putRequest(req) for req in rqs]
#try:
# tp.joinAllDismissedWorkers()
#except KeyboardInterrupt:
# tp.joinAllDismissedWorkers()
with open("CTS.db", "wb") as f:
pickle.dump(cts, f)
with open("ITS.db", "wb") as f:
pickle.dump(its, f)
print "CTS:", cts
print "___________________________"
print "ITEMS:", its
print "Coust Time: %s" % (time.time() - t_start)
def analysis_db():
#DBNAME = "CTS.db"
#with open(DBNAME) as f:
# cts = pickle.load(f)
#cts = dict([(k, list(set(v))) for k, v in cts.iteritems()])
#cts = dict([(k, list(set([ct_match.match(v).group(1) if ct_match.match(v) else v for v in v]))) for k, v in cts.items() if len(v) > 1])
#for k, v in cts.iteritems():
# for i in v:
# print i
#with open(DBNAME, "w") as f:
# pickle.dump(cts, f)
DBNAME = "ITS.db"
with open(DBNAME) as f:
its = pickle.load(f)
#its = dict([(k, list(set([item_match.match(i.encode("utf-8")).group(1) if item_match.match(i.encode("utf-8")) else i for i in v]))) for k, v in its.items() if len(v) > 1])
for k, v in its.iteritems():
for i in v:
print i.encode("u8")
if __name__ == "__main__":
#main()
analysis_db()