# test_indexer.py
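#
# Sanity-check script for the search indexer: pulls every doc id for the
# given wiki, fetches the indexed heading and category fields for each in
# parallel batches, and pretty-prints the results.
#
# Usage: python test_indexer.py <wiki_id>
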
import requests
import sys
import traceback
from multiprocessing import Pool
from nlp_services.document_access import ListDocIdsService
from pprint import pprint

wid = sys.argv[1]  # wiki id, passed on the command line
step = 50  # number of doc ids per batch request


def chunks(array, n):
    """Yield successive n-sized chunks from array."""
    for i in xrange(0, len(array), n):
        yield array[i:i + n]
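# e.g. list(chunks(range(5), 2)) == [[0, 1], [2, 3], [4]]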


def get_fields(doc_ids):
    """Fetch indexed heading and category fields for a batch of doc ids.

    Expects the controller to return JSON shaped like (inferred from the
    parsing below):
      {'contents': [{'id': ..., 'headings_mv_<lang>': {'set': [...]},
                     'categories_mv_<lang>': {'set': [...]}}, ...]}
    Relies on the module-level `url` and `lang` globals, which are set
    before the worker pool is created.
    """
    print 'Getting fields for %s' % doc_ids
    array = []
    r = requests.get(
        '%swikia.php' % url,
        params={'controller': 'WikiaSearchIndexer',
                'method': 'get',
                'service': 'All',
                'ids': '|'.join(doc_ids)})
    try:
        indexer = r.json().get('contents', [])
    except KeyboardInterrupt:
        sys.exit(0)
    except Exception:
        print traceback.format_exc()
        indexer = []
    for doc in indexer:
        if doc.get('id') is not None:
            array.append(
                (doc['id'],
                 doc.get('headings_mv_%s' % lang, {}).get('set', []) +
                 doc.get('categories_mv_%s' % lang, {}).get('set', [])))
    return array


# Look up the wiki's base URL and content language from the Details API.
details = requests.get(
    'http://www.wikia.com/api/v1/Wikis/Details/',
    params={'ids': wid}).json().get('items', {}).get(wid)

if details is not None:
    url = details.get('url')
    lang = details.get('lang')
    # Doc ids contain an underscore (e.g. '<wiki_id>_<page_id>');
    # keep only the trailing page id.
    doc_ids = map(lambda x: x.split('_')[1],
                  filter(lambda y: '_' in y,
                         ListDocIdsService().get_value(wid)))
    # Fan the batches out across eight worker processes; each call to
    # get_fields returns a list of (doc_id, fields) pairs.
    r = Pool(processes=8).map_async(get_fields, chunks(doc_ids, step))
    r.wait()
    pprint(r.get())
    print '*' * 80
    # Flatten the per-batch results into a single {doc_id: fields} dict.
    fields = []
    for batch in r.get():
        fields.extend(batch)
    indexed = dict(fields)
    pprint(indexed)

    # TODO (dead code kept as a sketch): merge the indexed fields with
    # entity titles, redirects, and heads once those services are wired in.
    # for doc_id in doc_ids_to_heads:
    #     entity_response = doc_ids_to_entities.get(
    #         doc_id, {'titles': [], 'redirects': {}})
    #     doc_ids_combined[doc_id] = map(preprocess,
    #                                    indexed.get(doc_id, []) +
    #                                    entity_response['titles'] +
    #                                    entity_response['redirects'].keys() +
    #                                    entity_response['redirects'].values() +
    #                                    list(set(doc_ids_to_heads.get(doc_id, []))))