# adstex.py (forked from yymao/adstex)
"""
adstex: Automated generation of NASA ADS bibtex entries
from citation keys (identifiers, author+year) in your TeX source files.
Project website: https://github.com/yymao/adstex
The MIT License (MIT)
Copyright (c) 2015-2019 Yao-Yuan Mao (yymao)
http://opensource.org/licenses/MIT
"""
from __future__ import absolute_import, print_function
import os
import re
from argparse import ArgumentParser
from builtins import input
from collections import defaultdict
from datetime import date
from distutils.version import StrictVersion
from shutil import copyfile
import ads
import bibtexparser
import requests
try:
from urllib.parse import unquote
except ImportError:
from urllib import unquote
# Version of this script; main() compares it with the version published on PyPI.
__version__ = "0.3.8"
# Two-digit year and century of today's date, used by _y2toy4() to expand
# two-digit years found in citation keys.
_this_year = date.today().year % 100
_this_cent = date.today().year // 100
# An unescaped "%" and the rest of its line. The lookahead requires a line
# break after the comment, so the line break itself is not consumed.
_re_comment = re.compile(r"(?<!\\)%.*(?=[\r\n])")
# \bibliography{...} or \nobibliography{...} (optionally starred); group 1 is
# the brace content. The (?!\n{2,}) guards stop the match at a blank line.
_re_bib = re.compile(r"\\(?:no)?bibliography\*?(?:(?!\n{2,})\s)*{((?:(?!\n{2,})[^{}])+)}")
# \bibentry or \cite/\Cite followed by up to seven lowercase letters, an
# optional "*", any number of [...] optional arguments, and the {...} key list
# (group 1). As above, a match never spans a blank line.
_re_cite = re.compile(
r"\\(?:bibentry|[cC]ite[a-z]{0,7})\*?(?:(?!\n{2,})\s)*(?:(?<!\\)\[(?:(?!\n{2,}).)*?(?<!\\)\](?:(?!\n{2,})\s)*)*{((?:(?!\n{2,})[^{}])+)}",
re.S,
)
# Citation key of the form <letters/hyphens><optional separator run><2 or 4 digits>;
# group 1 is the name part and group 2 the year digits.
_re_fayear = re.compile(r"([A-Za-z-]+)(?:(?=[\W_])[^\s\d,]+)?((?:\d{2})?\d{2})")
# Patterns used to spot identifiers inside arbitrary text, keyed by identifier type.
_re_id = {}
# "10.<4+ digits>[.<digits>...]/<suffix>"; the suffix stops at whitespace or any of ' " & < >.
_re_id["doi"] = re.compile(r"\b10\.\d{4,}(?:\.\d+)*\/(?:(?!['\"&<>])\S)+\b")
# Four digits, one non-digit, thirteen non-space characters, then one of A-Z . :
_re_id["bibcode"] = re.compile(r"\b\d{4}\D\S{13}[A-Z.:]\b")
# Either "NNNN.NNNN" / "NNNN.NNNNN" or "<archive>[.<subject>]/NNNNNNN".
_re_id["arxiv"] = re.compile(r"\b(?:\d{4}\.\d{4,5}|[a-z-]+(?:\.[A-Za-z-]+)?\/\d{7})\b")
# Name particles that _match_name_prefix() tries to split off a run-together name.
_name_prefix = (
"van",
"di",
"de",
"den",
"der",
"van de",
"van den",
"van der",
"von der",
)
# Longest first, so that e.g. "van der" is tried before "van".
_name_prefix = sorted(_name_prefix, key=len, reverse=True)
# Value placed after "database:" in the author/year query; main() replaces it
# when --include-physics is given.
_database = "astronomy"
# pylint: disable=missing-docstring
def fixedAdsSearchQuery(*args, **kwargs):
    """Create an ``ads.SearchQuery`` and strip ``Content-Type`` from its session headers.

    All positional and keyword arguments are forwarded unchanged to
    ``ads.SearchQuery``. The query object is returned.
    """
    query = ads.SearchQuery(*args, **kwargs)
    # The attribute is read only for its effect; presumably this is what
    # populates ``_session`` before it is inspected below -- TODO confirm
    # against the ads package.
    query.session  # pylint: disable=pointless-statement
    headers = query._session.headers  # pylint: disable=protected-access
    if "Content-Type" in headers:
        del headers["Content-Type"]
    return query
def get_bparser():
    """Return a fresh ``BibTexParser``.

    The parser is first requested with ``common_strings=True`` and gets a
    ``june`` string definition added. If that raises ``TypeError``, a parser
    built with no arguments is returned instead.
    """
    try:
        parser = bibtexparser.bparser.BibTexParser(common_strings=True)
        parser.bib_database.strings["june"] = "June"
        return parser
    except TypeError:
        return bibtexparser.bparser.BibTexParser()
def _match_name_prefix(name):
    """Split a known name particle off the front of a run-together name.

    Each entry of ``_name_prefix`` is compared, with its spaces removed and
    case-insensitively, against the start of *name*. On the first hit the
    particle (with its original spacing) and the remainder of *name* are
    returned joined by a single space. Returns ``None`` when nothing matches.
    """
    lowered = name.lower()
    for prefix in _name_prefix:
        squashed = prefix.replace(" ", "")
        if not lowered.startswith(squashed):
            continue
        remainder = name[len(squashed):]
        return " ".join((prefix, remainder))
    return None
def _y2toy4(y2):
    """Expand a two-digit year to a four-digit year string.

    A value greater than the current two-digit year is placed in the previous
    century; anything else is placed in the current century.
    """
    two_digit = int(y2)
    if two_digit > _this_year:
        century = _this_cent - 1
    else:
        century = _this_cent
    return str(century * 100 + two_digit)
def _is_like_string(s):
try:
s + ""
except TypeError:
return False
return True
def _headerize(msg, extraline=True):
return "{2}{0}\n{1}\n{0}".format("-" * 60, msg, "\n" if extraline else "")
def search_keys(files, find_bib=False):
    """Collect citation keys (and optionally bibliography files) from TeX sources.

    Parameters
    ----------
    files : str or iterable of str
        Path, or paths, of the TeX files to scan. A single string is treated
        as one path.
    find_bib : bool
        When true, the first ``\\bibliography{...}`` / ``\\nobibliography{...}``
        command that names at least one file is used to build a list of bib
        file paths.

    Returns
    -------
    (keys, bib)
        ``keys`` is the set of citation keys found in all files. ``bib`` is
        ``None`` unless *find_bib* is true and a bibliography command was
        found; otherwise it is a list of paths, each resolved against the
        directory of the TeX file that contained the command and given a
        ``.bib`` suffix when it lacks one.

    Empty items produced by stray commas (``\\cite{a,,b}``, ``\\cite{a,}``) are
    skipped, so neither an empty key nor a bare ``.bib`` path is returned.
    """
    if _is_like_string(files):
        files = [files]
    bib = None
    keys = set()
    for f in files:
        with open(f) as fp:
            text = fp.read()
        # Drop TeX comments first so commented-out citations are ignored.
        text = _re_comment.sub("", text)
        if find_bib and not bib:
            m = _re_bib.search(text)
            if m:
                dirpath = os.path.dirname(f)
                bib = []
                for b in m.groups()[0].split(","):
                    b = b.strip()
                    if not b:
                        continue  # stray comma, nothing to record
                    if not b.lower().endswith(".bib"):
                        b += ".bib"
                    bib.append(os.path.join(dirpath, b))
        for m in _re_cite.finditer(text):
            for k in m.groups()[0].split(","):
                k = k.strip()
                if k:
                    keys.add(k)
    return keys, bib
def format_author(authors, max_char):
    """Join author names with ``"; "`` for display, abbreviating with "et al.".

    Parameters
    ----------
    authors : sequence of str, or None
        Author names in order.
    max_char : int
        Width budget. Another author is appended only while the current text
        plus that author plus two characters stays below ``max_char - 7``.

    Returns
    -------
    str
        All authors joined by ``"; "`` when every one fits, otherwise the
        authors that fit followed by ``" et al."``. The first author is always
        kept whole. ``"<no author>"`` is returned when *authors* is ``None`` or
        empty, instead of raising on ``authors[0]``.
    """
    if not authors:
        return u"<no author>"
    s = authors[0]
    for author in authors[1:]:
        if len(s) + len(author) + 2 >= max_char - 7:
            # The next name would not fit: stop here and mark the truncation.
            return s + u" et al."
        s = u"{}; {}".format(s, author)
    return s
def format_ads_entry(i, entry, max_char=78):
    """Render one search result as a three-line text block.

    The first line shows the index *i*, the entry's bibcode and its citation
    count; the second the author list from ``format_author``; the third the
    first title, cut to ``max_char - 4`` characters, or ``<no title>`` when the
    entry has no title.
    """
    width = max_char - 4
    if entry.title:
        title = entry.title[0][:width]
    else:
        title = "<no title>"
    template = u"[{}] {} (cited {} times)\n {}\n {}"
    return template.format(
        i,
        entry.bibcode,
        entry.citation_count,
        format_author(entry.author, width),
        title,
    )
def id2bibcode(id_this, possible_id_types=("bibcode", "doi", "arxiv")):
    """Look for an identifier inside *id_this* and resolve it to a bibcode.

    *possible_id_types* is one key of ``_re_id`` or a sequence of such keys.
    They are tried in order: when the matching pattern finds something in
    *id_this*, an ``identifier:"..."`` query is issued and the bibcode of the
    first result is returned. Types that do not match, or whose query yields no
    result, are skipped. Returns ``None`` when nothing resolves.
    """
    if _is_like_string(possible_id_types):
        id_types = [possible_id_types]
    else:
        id_types = possible_id_types
    for id_type in id_types:
        found = _re_id[id_type].search(id_this)
        if not found:
            continue
        query = fixedAdsSearchQuery(q="identifier:\"{}\"".format(found.group()), fl=["bibcode"])
        try:
            return next(query).bibcode
        except StopIteration:
            # No result for this identifier; fall through to the next type.
            continue
    return None
def authoryear2bibcode(author, year, key):
    """Let the user pick a bibcode from a first-author/year search.

    Parameters
    ----------
    author : str
        First-author name placed in the ``author:"^..."`` query term.
    year : str or int
        Publication year placed in the ``year:`` query term.
    key : str
        The citation key being resolved; only used in the prompts.

    Returns
    -------
    str or None
        The chosen bibcode, or ``None`` when the user enters 0 / nothing or
        when the search returns no entries.

    When the search has results they are listed in reverse order, so entry 1
    (the top of the ``citation_count desc`` sort) is printed last, and the user
    is prompted until they enter a listed number, 0, an empty line, or text
    that ``id2bibcode`` resolves. When the search is empty and *author*
    contains no space, one retry is made with a name particle split off by
    ``_match_name_prefix``.
    """
    q = 'author:"^{}" year:{} database:{}'.format(author, year, _database)
    entries = list(
        fixedAdsSearchQuery(
            q=q,
            fl=["id", "author", "bibcode", "title", "citation_count"],
            sort="citation_count desc",
            rows=20,
            max_pages=0,
        )
    )

    if entries:
        total = len(entries)
        print(
            _headerize(
                "Choose one entry from below for <{}> (most cited at the end)".format(
                    key
                )
            )
        )
        print(
            u"\n\n".join(
                format_ads_entry(total - i, e) for i, e in enumerate(reversed(entries))
            )
        )
        # extraline belongs to _headerize, not to str.format, where it was
        # silently ignored as an unused keyword argument.
        print(
            _headerize(
                "Choose one entry from above for <{}>".format(key), extraline=False
            )
        )
        choices = list(range(0, total + 1))
        c = -1
        while c not in choices:
            c = input(
                "ENTER choice (if no matches, ENTER 0 to skip or ENTER an identifier): "
            )
            # The answer may itself be an identifier rather than a list number.
            bibcode = id2bibcode(c)
            if bibcode:
                return bibcode
            try:
                c = int(c)
            except (TypeError, ValueError):
                pass
        if not c:
            return None
        return entries[c - 1].bibcode

    if " " not in author:
        new_author = _match_name_prefix(author)
        if new_author:
            return authoryear2bibcode(new_author, year, key)
    return None
def find_bibcode(key):
    """Resolve a citation key to a bibcode, asking the user when necessary.

    Three attempts are made in order: treat *key* as an identifier; parse it
    as author plus year and run ``authoryear2bibcode``; finally prompt for an
    identifier until one resolves or an empty line is entered. Returns the
    bibcode, or ``None`` when the user skips.
    """
    direct = id2bibcode(key)
    if direct:
        return direct

    parsed = _re_fayear.match(key)
    if parsed:
        first_author, year = parsed.groups()
        if len(year) == 2:
            year = _y2toy4(year)
        chosen = authoryear2bibcode(first_author, year, key)
        if chosen:
            return chosen

    print(_headerize("ENTER an identifier (bibcode, arxiv, doi) for <{}>".format(key)))
    while True:
        answer = input("Identifier (or press ENTER to skip): ")
        resolved = id2bibcode(answer)
        if resolved:
            return resolved
        if not answer:
            # Empty input means the user chose to skip this key.
            return None
def extract_bibcode(entry):
    """Return the bibcode found in the entry's URL-decoded ``adsurl`` field.

    Returns ``None`` when the field is missing or holds no bibcode-shaped text.
    """
    adsurl = unquote(entry.get("adsurl", ""))
    found = _re_id["bibcode"].search(adsurl)
    if found:
        return found.group()
    return None
def entry2bibcode(entry):
    """Resolve a bibtex entry to a bibcode via its identifier-bearing fields.

    The fields ``adsurl``, ``doi``, ``eprint``, ``url`` and ``pages`` are tried
    in that order, each with the identifier types listed next to it. The first
    field whose URL-decoded value resolves through ``id2bibcode`` wins. Returns
    ``None`` when no field resolves.
    """
    lookup_plan = (
        ("adsurl", "bibcode"),
        ("doi", "doi"),
        ("eprint", "arxiv"),
        ("url", ("bibcode", "doi", "arxiv")),
        ("pages", "arxiv"),
    )
    for field_name, possible_id_types in lookup_plan:
        if field_name not in entry:
            continue
        resolved = id2bibcode(unquote(entry[field_name]), possible_id_types)
        if resolved:
            return resolved
    return None
def update_bib(b1, b2):
    """Merge the entries of *b2* into *b1* and return *b1*.

    Entries of *b2* replace entries of *b1* that share a key. *b1* is modified
    in place: its ``entries`` list is rebuilt from the merged dictionary.
    """
    # pylint: disable=protected-access
    # Both private dictionaries are emptied before the public ``entries_dict``
    # attributes are read; presumably this forces them to be rebuilt from
    # ``entries`` -- TODO confirm against bibtexparser.
    for database in (b1, b2):
        database._entries_dict.clear()
    b1.entries_dict.update(b2.entries_dict)
    merged_entries = list(b1.entries_dict.values())
    b1.entries = merged_entries
    return b1
def main():
    """Command-line entry point.

    Three modes are selected from the arguments:

    * a single ``.bib`` input: every key of that file is checked for updates
      and the file itself is the output;
    * ``--output`` given: citation keys are read from the TeX files and the
      named bib file is the output;
    * otherwise the bib files are taken from the ``\\bibliography`` command in
      the TeX sources; the first one is the output, the rest are read-only.

    Each key is then classified (update, existing, merged, ignored, new, not
    found) with one line printed per key. Entries that need retrieving are
    exported from ADS in one request and merged into the output file, which is
    backed up to ``<output>.bak`` first unless ``--no-backup`` is given.
    Finally PyPI is queried, on a best-effort basis, for a newer release.
    """
    global _database  # pylint: disable=global-statement

    parser = ArgumentParser()
    parser.add_argument(
        "files", metavar="TEX", nargs="+", help="tex files to search citation keys"
    )
    parser.add_argument(
        "-o",
        "--output",
        metavar="BIB",
        help="main bibtex file; new entries will be added to this file, existing entries may be updated",
    )
    parser.add_argument(
        "-r",
        "--other",
        nargs="+",
        metavar="BIB",
        help="other bibtex files that contain existing references (read-only)",
    )
    parser.add_argument(
        "--no-update",
        dest="update",
        action="store_false",
        help="for existing entries, do not check ADS for updates",
    )
    parser.add_argument(
        "--force-regenerate",
        action="store_true",
        help="for all existing entries, regenerate the bibtex with the latest version from ADS if found",
    )
    parser.add_argument(
        "--merge-other",
        action="store_true",
        help="merge the entries from other bibtex files",
    )  # thanks to syrte for adding this option
    parser.add_argument(
        "--include-physics",
        action="store_true",
        help="include physics database when searching ADS",
    )
    # store_false: giving the flag turns the backup OFF, so the help text must
    # describe the negative.
    parser.add_argument(
        "--no-backup",
        dest="backup",
        action="store_false",
        help="do not back up output file if being overwritten",
    )
    parser.add_argument(
        "--version",
        action="version",
        version="%(prog)s {version}".format(version=__version__),
    )
    args = parser.parse_args()

    if args.include_physics:
        _database = '("astronomy" OR "physics")'

    if len(args.files) == 1 and args.files[0].lower().endswith(
        ".bib"
    ):  # bib update mode
        if args.output or args.other:
            parser.error(
                "Input file is a bib file, not tex file. This will enter bib update mode. Do not specify `--output` and `--other` together in this mode."
            )
        if not args.update:
            parser.error(
                "Input file is a bib file, not tex file. This will enter bib update mode. Must not specify --no-update"
            )
        if not os.path.isfile(args.files[0]):
            parser.error("Cannot locate input bib file {}".format(args.files[0]))
        keys = None
        args.output = args.files[0]

    elif args.output:  # bib output is specified
        keys, _ = search_keys(args.files, find_bib=False)

    else:  # bib output is missing, auto-identify
        keys, bib = search_keys(args.files, find_bib=True)
        if not bib:
            parser.error(
                "Cannot identify bibtex file from the tex source. Use -o to specify a bibtex file as output."
            )
        # The first file named in the TeX source becomes the output; the rest
        # are treated like --other files.
        args.output = bib.pop(0)
        if args.other:
            args.other.extend(bib)
        else:
            args.other = bib

        msg = "Auto-identifying bibtex files...\n"
        msg += "Main bibtex source (output file): {}\n".format(args.output)
        if args.other:
            msg += "Additional bibtex sources: {}\n".format(", ".join(args.other))
        print(_headerize(msg))

    if os.path.isfile(args.output):
        with open(args.output) as fp:
            bib = bibtexparser.load(fp, parser=get_bparser())
    else:
        bib = bibtexparser.loads(" ", parser=get_bparser())

    bib_other = bibtexparser.loads(" ", parser=get_bparser())
    if args.other:
        for f in args.other:
            with open(f) as fp:
                bib_other = update_bib(
                    bib_other, bibtexparser.load(fp, parser=get_bparser())
                )

    if keys is None:  # bib update mode
        keys = list(bib.entries_dict)

    not_found = set()
    to_retrieve = set()
    # bibcode -> citation keys that resolved to it
    all_entries = defaultdict(list)

    for key in keys:
        key_exists = key in bib.entries_dict
        key_exists_in_others = key in bib_other.entries_dict

        if args.update:
            if key_exists:
                bibcode = extract_bibcode(bib.entries_dict[key])
                bibcode_new = entry2bibcode(bib.entries_dict[key])
            elif key_exists_in_others and args.merge_other:
                bibcode = extract_bibcode(bib_other.entries_dict[key])
                bibcode_new = entry2bibcode(bib_other.entries_dict[key])
            else:
                bibcode_new = None

            if bibcode_new:
                all_entries[bibcode_new].append(key)
                if bibcode_new != bibcode or args.force_regenerate:
                    to_retrieve.add(bibcode_new)
                    print(
                        "{}:{} UPDATE => {}".format(
                            key,
                            "" if key_exists else " FOUND IN SECONDARY BIB SOURCES,",
                            bibcode_new,
                        )
                    )
                    continue

        if key_exists:
            print("{}: EXISTING".format(key))
            continue

        if key_exists_in_others and args.merge_other:
            bib.entries_dict[key] = bib_other.entries_dict[key]
            bib.entries = list(bib.entries_dict.values())
            print("{}: FOUND IN OTHER BIB SOURCE, MERGED".format(key))
            continue

        if key_exists_in_others:
            print("{}: FOUND IN OTHER BIB SOURCE, IGNORED".format(key))
            continue

        bibcode = find_bibcode(key)
        if bibcode:
            to_retrieve.add(bibcode)
            all_entries[bibcode].append(key)
            print("{}: NEW ENTRY => {}".format(key, bibcode))
        else:
            not_found.add(key)
            print("{}: NOT FOUND".format(key))

    if not_found:
        print(_headerize("Please check the following keys"))
        for key in not_found:
            print(key)

    repeated_keys = [t for t in all_entries.items() if len(t[1]) > 1]
    if repeated_keys:
        print(_headerize("The following keys refer to the same entry"))
        for b, k in repeated_keys:
            print(
                "{1} has been referred as the following keys; please keep only one:\n{0}\n".format(
                    " ".join(k), b
                )
            )

    # NOTE(review): dump/backup/write below are assumed to sit inside this
    # branch, so entries merged from other sources are saved only when
    # something is also retrieved -- confirm against the upstream file.
    if to_retrieve:
        print(_headerize("Building new bibtex file, please wait..."))
        bib_new = bibtexparser.loads(
            ads.ExportQuery(list(to_retrieve), "bibtex").execute(), parser=get_bparser()
        )
        for entry in bib_new.entries:
            # .get() avoids both the IndexError on an unrecorded ID and the
            # empty list a defaultdict lookup would insert.
            keys_for_entry = all_entries.get(entry["ID"])
            if keys_for_entry:
                entry["ID"] = keys_for_entry[0]
            else:
                print(
                    "{}: NO MATCHING CITATION KEY, KEPT UNDER ITS ADS IDENTIFIER".format(
                        entry["ID"]
                    )
                )
        bib = update_bib(bib, bib_new)
        bib_dump_str = bibtexparser.dumps(bib).encode("utf8")
        if args.backup and os.path.isfile(args.output):
            copyfile(args.output, args.output + ".bak")
        with open(args.output, "wb") as fp:
            fp.write(bib_dump_str)

    print(_headerize("Done!"))

    # check version (best effort: any network or parsing problem is ignored)
    try:
        latest_version = StrictVersion(
            requests.get(
                "https://pypi.python.org/pypi/adstex/json", timeout=0.1,
            ).json()["info"]["version"]
        )
    except (requests.RequestException, KeyError, ValueError):
        pass
    else:
        if latest_version > StrictVersion(__version__):
            msg = "A newer version of adstex (v{}) is now available!\n".format(
                latest_version
            )
            msg += "Please consider updating it by running:\n\n"
            msg += "pip install adstex=={}".format(latest_version)
            print(_headerize(msg))
# Script entry point: run main() and turn Ctrl-C into a short message instead
# of a traceback.
if __name__ == "__main__":
    try:
        main()
    except KeyboardInterrupt:
        print(_headerize("Abort! adstex interrupted by a keyboard signal!"))