#!/usr/bin/env python
"""
Rui Pereira <rui.pereira@gmail.com>
November 2009
--
Based on the ADS to BibDesk Automator action by
Jonathan Sick, jonathansick@mac.com, August 2007
Input may be one of the following:
- ADS abstract page URL
- ADS bibcode
- arXiv abstract page
- arXiv identifier
"""
import cgi
import re
import subprocess
import sys
import tempfile
import urllib
import urlparse
from HTMLParser import HTMLParser
from htmlentitydefs import name2codepoint
# Define these variables to use your account on a remote machine
# for fetching (refereed) PDFs that you cannot access locally.
ssh_user = None
ssh_server = None
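# For example (hypothetical values):
#   ssh_user = 'jdoe'
#   ssh_server = 'login.example.edu'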
class BibTex:
def __init__(self, url):
"""
Create BibTex instance from ADS BibTex URL
"""
bibtex = urllib.urlopen(url).readlines()
bibtex = ' '.join([l.strip() for l in bibtex]).strip()
bibtex = bibtex[re.search('@[A-Z]+{', bibtex).start():]
self.type, self.bibcode, self.info = self.parsebib(bibtex)
def __str__(self):
return ','.join(['@' + self.type + '{' + self.bibcode] + ['%s=%s' % (i, j) for i, j in self.info.items()]) + '}'
def parsebib(self, bibtex):
"""
Parse bibtex code into dictionary
"""
r = re.search('(?<=^@)(?P<type>[A-Z]+){(?P<bibcode>\S+)(?P<info>,.+)}$', bibtex)
s = re.split('(,\s\w+\s=\s)', r.group('info'))
info = dict([(i[1:].replace('=', '').strip(), j.strip()) for i, j in zip(s[1::2], s[2::2])])
return r.group('type'), r.group('bibcode'), info
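# Rough usage sketch of BibTex (this mirrors how ADSHTMLParser.parse() below
# uses it; the attribute values shown are only illustrative):
#   bib = BibTex(bibtex_url)       # URL of the ADS BibTeX page for one paper
#   bib.type, bib.bibcode          # e.g. 'ARTICLE' and the ADS bibcode
#   bib.info['title'], bib.info['author']
#   str(bib)                       # the entry reassembled as one BibTeX line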
class ADSHTMLParser(HTMLParser):
def __init__(self, *args, **kwargs):
HTMLParser.__init__(self)
self.links = {}
self.tag = ''
self.get_abs = False
self.entities = {}
self.bibtex = None
self.abstract = None
self.title = ''
self.author = []
def mathml(self):
"""
Generate dictionary with MathML -> unicode conversion from
http://www.w3.org/Math/characters/byalpha.html
"""
w3 = 'http://www.w3.org/Math/characters/byalpha.html'
mathml = re.search('(?<=\<pre\>).+(?=</pre>)', urllib.urlopen(w3).read(), re.DOTALL).group()
entities = {}
for l in mathml[:-1].split('\n'):
s = l.split(',')
#ignore double hex values like 'U02266-00338'
            if '-' not in s[1]:
#hexadecimal -> int values, for unichr
entities[s[0].strip()] = int(s[1].strip()[1:], 16)
return entities
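    # The mathml() table maps entity names from the W3C page to integer code
    # points, e.g. (presumably) entities['alpha'] -> 0x03B1, which unichr()
    # then turns into the corresponding UTF-8 character in handle_entityref().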
def parse(self, url):
"""
Feed url into our own HTMLParser and parse found bibtex
"""
self.feed(urllib.urlopen(url).read())
#parse bibtex
if 'bibtex' in self.links:
self.bibtex = BibTex(self.links['bibtex'])
self.title = re.search('(?<={).+(?=})', self.bibtex.info['title']).group().replace('{','').replace('}','')
self.author = [a.strip() for a in
re.search('(?<={).+(?=})', self.bibtex.info['author']).group().split(' and ')]
def handle_starttag(self, tag, attrs):
#abstract
if tag.lower() == 'hr' and self.get_abs:
self.abstract = self.tag.strip().decode('utf-8')
self.get_abs = False
self.tag = ''
#handle old scanned articles abstracts
elif tag.lower() == 'img' and self.get_abs:
self.tag += dict(attrs)['src']
#links
elif tag.lower() == 'a':
if 'href' in dict(attrs):
href = dict(attrs)['href']
query = cgi.parse_qs(urlparse.urlparse(href).query)
if 'bibcode' in query:
if 'link_type' in query:
self.links[query['link_type'][0].lower()] = href
elif 'data_type' in query:
self.links[query['data_type'][0].lower()] = href
def handle_data(self, data):
if self.get_abs:
self.tag += data.replace('\n', ' ')
#beginning of abstract found
if data.strip() == 'Abstract':
self.get_abs = True
#handle html entities
def handle_entityref(self, name):
if self.get_abs:
if name in name2codepoint:
c = name2codepoint[name]
self.tag += unichr(c).encode('utf-8')
else:
#fetch mathml
if not self.entities:
#cache dict
self.entities = self.mathml()
if name in self.entities:
c = self.entities[name]
self.tag += unichr(c).encode('utf-8')
else:
#nothing worked, leave it as-is
self.tag += '&' + name + ';'
#handle unicode chars in utf-8
def handle_charref(self, name):
if self.get_abs:
self.tag += unichr(int(name)).encode('utf-8')
def getPDF(self):
"""
Fetch PDF and save it locally in a temporary file.
        Tries, in order:
- refereed article
- refereed article using another machine (set ssh_user & ssh_server)
- arXiv preprint
- electronic journal link
"""
if not self.links:
return 'failed'
def filetype(filename):
return subprocess.Popen('file %s' % filename, shell=True,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE).stdout.read()
#refereed
if 'article' in self.links:
url = self.links['article']
#try locally
pdf = tempfile.mktemp() + '.pdf'
urllib.urlretrieve(url, pdf)
if 'PDF document' in filetype(pdf):
return pdf
            #try on a remote server
# you need to set SSH public key authentication
# for this to work!
elif ssh_user is not None:
pdf = tempfile.mktemp() + '.pdf'
cmd = 'ssh %s@%s \"touch toto.pdf; wget -O toto.pdf \\"%s\\"\"' % (ssh_user, ssh_server, url)
cmd2 = 'scp -q %s@%s:toto.pdf %s' % (ssh_user, ssh_server, pdf)
subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
subprocess.Popen(cmd2, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE).communicate()
if 'PDF document' in filetype(pdf):
return pdf
#arXiv
if 'preprint' in self.links:
#arXiv page
url = self.links['preprint']
for line in urllib.urlopen(url).readlines():
if 'dc:identifier' in line:
begin = re.search('dc:identifier="', line).end()
                    url = line[begin:-2].replace('&amp;', '&').lower()
#get arXiv PDF
pdf = tempfile.mktemp() + '.pdf'
urllib.urlretrieve(url.replace('abs', 'pdf'), pdf)
if 'PDF document' in filetype(pdf):
return pdf
else:
return url
#electronic journal
if 'ejournal' in self.links:
return self.links['ejournal']
return 'failed'
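# Note: getPDF() returns either a local temporary-file path, a bare URL (when
# only a link could be recovered), or the string 'failed'; the __main__ block
# below simply prints whatever it gets as the first '|||'-separated field.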
if __name__ == '__main__':
    # ADS mirrors - the first one in the list will be used as your mirror
adsmirrors = ['adsabs.harvard.edu',
'cdsads.u-strasbg.fr',
'ukads.nottingham.ac.uk',
'esoads.eso.org',
'ads.ari.uni-heidelberg.de',
'ads.inasan.ru',
'ads.mao.kiev.ua',
'ads.astro.puc.cl',
'ads.on.br',
'ads.nao.ac.jp',
'ads.bao.ac.cn',
'ads.iucaa.ernet.in',
'www.ads.lipi.go.id']
    # Determine what we're dealing with; the goal is to end up with an ADS abstract page URL
url = urlparse.urlparse(sys.argv[1])
if url.scheme == '':
# ADS bibcode?
adsURL = urlparse.urlunparse(('http', adsmirrors[0], 'abs/%s' % sys.argv[1], '', '', ''))
# arXiv identifier?
if 'No bibcodes' in urllib.urlopen(adsURL).read():
adsURL = urlparse.urlunparse(('http', 'arxiv.org', 'abs/%s' % sys.argv[1], '', '', ''))
if 'not recognized' in urllib.urlopen(adsURL).read():
# something's wrong
sys.exit()
else:
adsURL = urlparse.urlunparse(('http', adsmirrors[0], 'cgi-bin/bib_query',
'', 'arXiv:%s' % sys.argv[1], ''))
# arXiv URL
elif 'arxiv' in url.netloc:
# get paper identifier from URL and inject into ADS query
        arxivid = '/'.join(url.path.split('/')[2:])
adsURL = urlparse.urlunparse(('http', adsmirrors[0], 'cgi-bin/bib_query',
'', 'arXiv:%s' % arxivid, ''))
elif url.netloc in adsmirrors:
# we have a nice ADS abstract page entry
adsURL = sys.argv[1]
else:
# we're in trouble here
sys.exit()
# parse the ADS HTML file
ads = ADSHTMLParser()
ads.parse(adsURL)
#pdf local file, title, first author, abstract, bibtex code
#UTF-8 encoded
print ''.join(map(lambda x: x.encode('utf-8'), [ads.getPDF(), '|||',
ads.title, '|||',
ads.author[0], '|||',
ads.abstract, '|||',
ads.bibtex.__str__()]))