# spotlight_field_enhancer.py
import pprint
import time
import spotlight
from requests.exceptions import HTTPError as HTTPErrorRequests
from requests.exceptions import ConnectionError
####annotations=spotlight.annotate('http://spotlight.dbpedia.org/rest/annotate', 'Hugh Hefner')
####pprint.pprint(annotations)
####print '\n\n'
####annotations=spotlight.annotate('http://spotlight.dbpedia.org/rest/annotate', 'Albert Einstein')
####pprint.pprint(annotations)
####print '\n\n'
####annotations=spotlight.annotate('http://spotlight.dbpedia.org/rest/annotate', 'California')
####pprint.pprint(annotations)
from urllib2 import HTTPError
import os
import solr
from solr import SolrException
# Base URL of the Solr collection to enhance; the URL_SOLR environment
# variable overrides the built-in default.
URL_SOLR = os.environ.get('URL_SOLR', 'http://107.21.228.130:8080/solr/dc-collection/')
solr_db = solr.Solr(URL_SOLR)

# Query that selects the documents to process: those with no value in
# spotlighted_b. Alternative queries are kept commented out; exactly one
# assignment should be live.
#query = '-entity_ss:[* TO *]'
query = '-spotlighted_b:[* TO *]'
#query = 'collection_name:"Calisphere - Santa Clara University: Digital Objects"'
#query = 'collection_name:"Calisphere - The Ruth and Sherman Lee Institute of Japanese Art"'

# Filter query; only referenced by the commented-out select() below, not by
# any live code visible in this file.
fq='-entity_ss:[* TO *]'
#resp = solr_db.select(query, fq=fq)#, sort='id asc')

# Name of the document field whose values are sent to Spotlight. Alternative
# fields are kept commented out; exactly one assignment should be live.
#FIELD_TO_ENHANCE = 'creator'
FIELD_TO_ENHANCE = 'description'
#FIELD_TO_ENHANCE = 'title'
#FIELD_TO_ENHANCE = 'subject'
def get_entity_refs_from_annotations(annotations, min_similarity=.1):
    '''Return the URIs of the annotations that score high enough to keep.

    annotations -- iterable of dicts; each is read for its "similarityScore"
        key and, when accepted, its "URI" key.
    min_similarity -- lowest "similarityScore" (inclusive) that is accepted.
        Defaults to .1.

    Returns a list of the accepted "URI" values in input order; repeated
    URIs are not collapsed. Each accepted URI is also printed to stdout.

    Raises KeyError if an annotation has no "similarityScore", or if an
    accepted annotation has no "URI".
    '''
    uri_entities = []
    for ann in annotations:
        if ann['similarityScore'] >= min_similarity:
            # One pre-formatted argument, so the output is the same whether
            # print is the Python 2 statement or the Python 3 function.
            print("FOUND ENTITY %s" % ann['URI'])
            uri_entities.append(ann['URI'])
    return uri_entities
# Module-level counters, both initialised to zero. main() keeps its running
# counts in its own local variables, so these two values are not updated by it.
DOCS_RETRIEVED = 0
DOCS_PREVIOUSLY_ENHANCED = 0
# Spotlight exceptions collected while annotating; main() reports how many
# were gathered at the end of a run.
exception_resp = list()
# Spotlight REST endpoint used for every annotate call.
_SPOTLIGHT_ANNOTATE_URL = 'http://spotlight.dbpedia.org/rest/annotate'
# Seconds to wait after a connection error before moving on (30 minutes).
_CONNECTION_ERROR_PAUSE_SECS = 1800


def _solr_add_tolerating_400(doc_up):
    '''Send doc_up to Solr; swallow HTTP 400 responses, re-raise the rest.'''
    try:
        solr_db.add(doc_up)
    except SolrException as e:
        if e.httpcode != 400:
            raise


def _annotate_or_none(fvalue, docs_retrieved):
    '''Return Spotlight annotations for fvalue, or None if it must be skipped.

    fvalue -- the text passed to spotlight.annotate().
    docs_retrieved -- running document count, used only in the error output.

    None is returned when Spotlight reports "No Resources found", after a
    connection error (following a long pause), and after an HTTP error from
    either urllib2 or requests. Every SpotlightException is appended to the
    module-level exception_resp list; one whose message is not
    "No Resources found" is printed and re-raised.
    '''
    try:
        return spotlight.annotate(_SPOTLIGHT_ANNOTATE_URL, fvalue)
    except spotlight.SpotlightException as e:
        exception_resp.append(e)
        # BaseException.message is deprecated since Python 2.6 and absent in
        # Python 3, so fall back to str(e) when it is missing or empty.
        message = getattr(e, 'message', None) or str(e)
        if "No Resources found" not in message:
            print("NUM: %s  EEEE-> %s" % (docs_retrieved, e))
            print("%s %s" % (e.args, message))
            raise
    except ConnectionError:
        # Back off, then skip this value; it is not retried in this run.
        time.sleep(_CONNECTION_ERROR_PAUSE_SECS)
    except (HTTPError, HTTPErrorRequests):
        #TODO: logger
        pass
    return None


def main():
    '''Annotate FIELD_TO_ENHANCE of the selected Solr docs and store entities.

    Pages through the results of the module-level query. A document that
    already has "entity_ss" is only flagged through "spotlighted_b" and
    skipped. For every other document, each value of FIELD_TO_ENHANCE is sent
    to Spotlight and every accepted entity URI is added to the document's
    "entity_ss" field, rewritten from a dbpedia.org resource URL to a
    wikipedia.org wiki URL.

    Any exception raised while paging or updating is printed with its
    traceback instead of propagating, and in that case the final
    solr_db.commit() is not reached. Summary counts are always printed.
    Takes no arguments and returns None.
    '''
    entities_recognized = []
    resp = solr_db.select(query)
    docs_retrieved = 0
    docs_previously_enhanced = 0
    n_enhance_attempts = 0
    try:
        while resp:
            for doc in resp.results:
                docs_retrieved += 1
                if 'entity_ss' in doc:
                    docs_previously_enhanced += 1
                    # NOTE(review): Solr's documented atomic-update modifiers
                    # are set/add/inc/remove; 'update' is not among them, and
                    # a resulting HTTP 400 would be swallowed here. Confirm
                    # whether 'set' was intended.
                    _solr_add_tolerating_400(
                        {'id': doc['id'], 'spotlighted_b': {'update': 'true'}})
                    continue
                if FIELD_TO_ENHANCE not in doc:
                    continue
                for fvalue in doc[FIELD_TO_ENHANCE]:
                    n_enhance_attempts += 1
                    if n_enhance_attempts % 100 == 0:
                        # Progress line every 100 annotate attempts.
                        print("NDOCS: %s  ->  %s %s" % (
                            docs_retrieved, FIELD_TO_ENHANCE,
                            fvalue.encode('utf-8')))
                    #TODO: run each enhancer, get entity data then set the
                    # the entity_ss using update syntax
                    # A skipped value yields None, so annotations from an
                    # earlier value can never be applied to this one.
                    annotations = _annotate_or_none(fvalue, docs_retrieved)
                    if not annotations:
                        continue
                    entity_refs = get_entity_refs_from_annotations(annotations)
                    entities_recognized.extend(entity_refs)
                    for entity in entity_refs:
                        doc_up = {
                            'id': doc['id'],
                            'spotlighted_b': 'true',
                            'entity_ss': {'add': entity.replace(
                                'http://dbpedia.org/resource/',
                                'http://wikipedia.org/wiki/')},
                        }
                        print('TRY UPDATE ' + str(doc_up))
                        _solr_add_tolerating_400(doc_up)
            # Next page starts after every document seen so far.
            resp = solr_db.select(query, start=docs_retrieved)
            #resp = resp.next_batch()
        solr_db.commit()
    except Exception as e:
        # Top-level boundary: report the failure and still print the summary.
        import traceback
        print("EXCEPTION TYPE: %s" % (type(e),))
        print(traceback.format_exc())
    #print "ENTITIES", entities_recognized
    print("NUMBER OF DOCS %s" % docs_retrieved)
    print("NUMBER OF DOCS ENHANCED %s" % (docs_retrieved - docs_previously_enhanced))
    print("NUMBER OF RECOGNIZED ENTITIES %s" % len(entities_recognized))
    print("RESPONSE EXCEPTIONS: %s" % len(exception_resp)) #, ' ---- ', str(exception_resp)
# Run the enhancement pass only when this file is executed as a script.
if __name__ == '__main__':
    main()