-
Notifications
You must be signed in to change notification settings - Fork 1
/
bibtexsanitizer.py
executable file
·651 lines (564 loc) · 23.6 KB
/
bibtexsanitizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
import copy
import logging
import os
import re
import shutil
from tqdm import tqdm
from collections import OrderedDict
from pprint import pprint
import arxiv
import bibtexparser
import pyparsing as pp
import sh
import utils
# Logger shared by every function in this file; it is created by the project's
# `utils.initialize_logging` helper under the name 'pybib'.
logger = utils.initialize_logging('pybib')
# Lowercase three-letter month abbreviations; index 0 is January.
MONTH_STRINGS = [
    'jan', 'feb', 'mar',
    'apr', 'may', 'jun',
    'jul', 'aug', 'sep',
    'oct', 'nov', 'dec',
]
class DOIError(Exception):
    """Error raised when a bib entry cannot be built from a DOI."""
def load_bibtex_database(path):
    """Open the UTF-8 encoded .bib file at `path` and parse it.

    Returns whatever `bibtexparser.load` produces for that file.
    """
    with open(path, encoding='utf-8') as bib_file:
        return bibtexparser.load(bib_file)
def save_bibtex_database_to_file(path, db):
    """Write the bibtexparser database `db` to `path` (UTF-8, overwriting)."""
    bib_writer = bibtexparser.bwriter.BibTexWriter()
    bib_writer.indent = ' '
    with open(path, 'w', encoding='utf-8') as out_file:
        out_file.write(bib_writer.write(db))
def _load_or_use(path_or_db):
    """Resolve the input into a ``(path, db)`` pair.

    A string is treated as the path of a bib file: the file is first copied
    to ``<path>.bak`` and then parsed. Anything else is assumed to already be
    a database object and is handed back untouched, with `None` as the path.
    """
    if not isinstance(path_or_db, str):
        return (None, path_or_db)
    # keep a safety copy next to the original before touching it
    shutil.copyfile(path_or_db, path_or_db + '.bak')
    return (path_or_db, load_bibtex_database(path_or_db))
def _save_or_return(path_or_db, new_db):
    """Deliver `new_db` the same way the input was given.

    When `path_or_db` is a string, `new_db` is written to that path, the
    ``<path>.bak`` file is deleted, and `None` is returned. Otherwise
    `new_db` itself is returned.
    """
    if not isinstance(path_or_db, str):
        return new_db
    save_bibtex_database_to_file(path_or_db, new_db)
    # the save went through, so the safety copy is no longer needed
    os.remove(path_or_db + '.bak')
    return None
def _fix_month_fields(text, action='strtoint'):
    """Convert all month fields from string to numeric format.

    Every field of the form ``month = {jan}`` (three-letter abbreviation in
    curly braces, any letter casing, any spacing around ``=``) is rewritten
    with the month number in its place, e.g. ``month = {1}``.

    Parameters
    ----------
    text : str
        Content of a bib file.
    action : str
        Conversion to perform. Only ``'strtoint'`` is supported.

    Returns
    -------
    str
        The text with the month fields converted.

    Raises
    ------
    ValueError
        If `action` is not ``'strtoint'``.
    """
    if action != 'strtoint':
        raise ValueError("Unsupported action '{}'.".format(action))
    # kept local so that the position in the list gives the month number
    months_strings = ['jan', 'feb', 'mar', 'apr', 'may', 'jun',
                      'jul', 'aug', 'sep', 'oct', 'nov', 'dec']
    for month_number, month_str in enumerate(months_strings, start=1):
        # group 1 keeps the original `month =` prefix (casing and spacing)
        patt = r'(month\s*=\s*)\{' + month_str + r'\}'
        newpatt = r'\g<1>{' + str(month_number) + '}'
        text = re.sub(patt, newpatt, text, flags=re.I)
    return text
def fix_bibtex_syntax(path, make_backup=True, method='all'):
    """Clean up the bib file at `path` in place.

    The file is read, optionally copied to ``<path>.old``, and rewritten
    after two regex passes: values not wrapped in curly braces get wrapped,
    and titles wrapped in double curly braces are reduced to single braces.
    With ``method='all'`` a fixed list of fields is then removed from every
    entry and the arXiv-related fields are checked (and fixed).
    """
    with open(path, encoding='utf-8') as bib_file:
        original_text = bib_file.read()

    if make_backup:
        backup_file = path + '.old'
        logger.info('Saving backup file at `{}`...'.format(backup_file))
        shutil.copyfile(path, backup_file)

    # pass 1: wrap bare values in curly braces
    logger.info('Ensuring all fields have curly braces...')
    bare_value = re.compile(r'([a-z]*)\s*=\s*([^\s{].*)(,\n|\s*})')
    cleaned_text = bare_value.sub(r'\1 = {\2}\3', original_text)

    # pass 2: collapse {{title}} into {title}
    logger.info('Ensuring titles dont\'t have *double* curly braces (because that\'s just horrible)...')
    double_braced_title = re.compile(r'title\s*=\s*{{([^}]*)}}(,\s*\n|\s*})')
    cleaned_text = double_braced_title.sub(r'title = {\1}\2', cleaned_text)

    with open(path, 'w', encoding='utf-8') as bib_file:
        bib_file.write(cleaned_text)

    # optional extra house cleaning
    if method == 'all':
        fields_to_remove = [
            'file', 'abstract', 'arxivid', 'bdsk-url-1', 'bdsk-url-2',
            'keywords', 'mendeley-tags', 'date-modified', 'date-added',
            'link', 'numpages']
        logger.info('Removing unnecessary fields: {}...'.format(', '.join(fields_to_remove)))
        remove_field_from_all_entries(path, fields_to_remove)
        logger.info('Checking fields consistency...')
        check_arxiv_fields_consistency(path, fix=True)
    logger.info('All done! You\'re good to go.')
def find_entries_without_field(path_or_db, field):
    """Return the list of entries that do not contain `field`.

    `path_or_db` is either the path of a bib file (a string) or an already
    loaded database object exposing an `entries` list. Nothing is modified:
    the file is only read, so no backup copy is created for it.
    """
    # read-only query: load the file directly instead of creating a `.bak`
    # copy that nothing would ever clean up
    if isinstance(path_or_db, str):
        db = load_bibtex_database(path_or_db)
    else:
        db = path_or_db
    return [entry for entry in db.entries if field not in entry]
def remove_field_from_all_entries(path_or_db, fields):
    """Drop the given field(s) from every entry of a bib database.

    `path_or_db` is either the path of a bib file (a string) or a database
    object. `fields` is a field name or a list/tuple of field names; any
    other type raises `ValueError`. When a path was given the cleaned
    database is written back to it and `None` is returned, otherwise the
    (mutated) database object is returned.
    """
    given_path = isinstance(path_or_db, str)
    database = load_bibtex_database(path_or_db) if given_path else path_or_db

    # accept a single field name as shorthand for a one-element list
    if isinstance(fields, str):
        fields = [fields]
    if not isinstance(fields, (list, tuple)):
        raise ValueError('`fields` should be a list or tuple')

    for entry in database.entries:
        for unwanted in fields:
            entry.pop(unwanted, None)

    if given_path:
        save_bibtex_database_to_file(path_or_db, database)
        return None
    return database
def _is_newstyle_arxiv_id(arxiv_number):
    """Check whether arxiv id is old or new style (or invalid).

    Returns `True` for new-style identifiers (``0902.0885``, ``1706.03762``)
    and `False` for old-style ones (``quant-ph/0401062``, ``math/0309136``,
    ``math.GT/0309136``). Raises `ValueError` for anything else.
    """
    # new style: four digits, a dot, then a 4- or 5-digit sequence number
    if re.match(r'^[0-9]{4}\.[0-9]{4,5}$', arxiv_number):
        return True
    # old style: archive name (the hyphen is optional, e.g. `math`), an
    # optional two-letter subject class, a slash and the paper number
    if re.match(r'^[a-z-]+(\.[A-Z]{2})?/[0-9]+$', arxiv_number):
        return False
    raise ValueError('Not a proper arxiv id')
def pull_info_from_doi(doi, accepted_fields=None):
    """Fetch the bibtex record of `doi` and return its fields as a dict.

    The record is requested with `curl` from ``http://dx.doi.org/<doi>``
    using a bibtex `Accept` header. Only the fields named in
    `accepted_fields` (a default list is used when it is `None`) are
    extracted, and only when found in the response. The returned dict always
    carries the given `doi` under the ``'doi'`` key.
    """
    response = sh.curl('-LH', r'Accept: text/bibliography; style=bibtex',
                       r'http://dx.doi.org/' + doi)
    bibtex_text = response.stdout.decode('UTF-8')

    if accepted_fields is None:
        accepted_fields = ['title', 'year', 'volume', 'number', 'pages', 'ISBN',
                           'journal', 'publisher', 'month', 'author', 'doi']

    details = {}
    for name in accepted_fields:
        # looks for `name={value}` in the returned text
        found = re.search(name + r'={([^}]*)}', bibtex_text)
        if found:
            details[name] = found.group(1)
    # the DOI we were asked about wins over whatever the response contained
    details['doi'] = doi
    return details
def pull_info_from_arxiv_id(arxiv_id, requested_fields=None, use_doi=True):
    """Pull down paper info from arxiv id.

    Queries the arXiv for `arxiv_id` and converts the first result into a
    dict of bib fields via `extract_fields_from_arxiv_query_result`, to
    which `requested_fields` and `use_doi` are forwarded.

    Raises
    ------
    IndexError
        If the query returns no result. The exception type matches what the
        lookup of the first result used to raise implicitly, but it now
        carries an explanatory message.
    """
    results = arxiv.query(id_list=[arxiv_id])
    if not results:
        message = 'No paper found with the ID {}.'.format(arxiv_id)
        logger.error(message)
        raise IndexError(message)
    return extract_fields_from_arxiv_query_result(
        results[0], requested_fields=requested_fields, use_doi=use_doi)
def pull_info_from_gscholar(query, accepted_fields=None):
    """Search google scholar for `query` and return fields of the first hit.

    The first bibtex string returned by `gscholar.query` is scanned for
    ``name={value}`` pairs; only the names listed in `accepted_fields` (a
    default list is used when it is `None`) are collected into the result.
    """
    import gscholar
    first_hit = gscholar.query(query)[0]

    if accepted_fields is None:
        accepted_fields = ['title', 'year', 'volume', 'number', 'pages', 'ISBN',
                           'journal', 'publisher', 'month', 'author', 'doi']

    details = {}
    for name in accepted_fields:
        found = re.search(name + r'={([^}]*)}', first_hit)
        if found:
            details[name] = found.group(1)
    return details
def authors_list_to_string(authors):
    """Build a bibtex author string out of a list of full names.

    Each name is split at its last space: what follows is taken as the last
    name and what precedes as the remaining names, giving ``Last, Others``.
    The converted names are then joined with ``' and '``.
    """
    converted = []
    for full_name in authors:
        other_names, _, last_name = full_name.rpartition(' ')
        converted.append('{}, {}'.format(last_name, other_names))
    return ' and '.join(converted)
def extract_fields_from_arxiv_query_result(result, requested_fields=None,
                                           use_doi=True):
    """Take the answer of an arxiv query and extract relevant fields from it.

    Parameters
    ----------
    result : dict-like
        A single result of an arxiv query. The keys read here are the
        requested ones plus ``'id'``, ``'published_parsed'``,
        ``'arxiv_url'`` and ``'tags'``.
    requested_fields : list of str, optional
        Names (in arxiv api terms) of the fields to extract. Defaults to
        title, authors, doi, year, month, journal_reference and url.
    use_doi : bool
        If True and a DOI was extracted, the DOI is used to pull additional
        information, which overrides the values coming from the arXiv.

    Returns
    -------
    dict
        Bib fields. ``author`` is always a single string, the ``doi`` key is
        only present when a DOI is known, and ``eprint`` / ``archiveprefix``
        (plus ``primaryclass`` for new-style ids) describe the arXiv id.

    Raises
    ------
    ValueError
        If no arXiv id can be read from ``result['id']`` or if it is not a
        proper arXiv id.
    """
    if requested_fields is None:
        requested_fields = ['title', 'authors', 'doi', 'year', 'month',
                            'journal_reference', 'url']
    fields = dict()
    # extract generic fields
    for field in requested_fields:
        # the arxiv api calls the set of authors `authors`, while bibtex uses
        # the key `author`. Similar adjustments have to be made for other fields
        if field == 'authors':
            fields['author'] = result[field]
        elif field == 'year':
            fields['year'] = str(result['published_parsed'][0])
        elif field == 'month':
            fields['month'] = str(MONTH_STRINGS[result['published_parsed'][1] - 1])
        elif field == 'journal_reference':
            fields['journal'] = result[field]
        elif field == 'url':
            fields['url'] = result['arxiv_url']
        else:
            fields[field] = result[field]

    # extract arxiv id info: everything after `/abs/`, minus a trailing
    # version suffix such as `v2`. Only the suffix is stripped, so archive
    # names containing the letter "v" (e.g. `solv-int`) are kept whole.
    id_match = re.search(r'arxiv\.org/abs/(.+?)(?:v[0-9]+)?$', result['id'])
    if id_match is None:
        raise ValueError(
            'Could not read an arxiv id from "{}".'.format(result['id']))
    arxiv_number = id_match.group(1)
    if _is_newstyle_arxiv_id(arxiv_number):
        fields.update(dict(
            archiveprefix='arXiv',
            primaryclass=result['tags'][0]['term'],
            eprint=arxiv_number
        ))
    else:
        # old-style ids already embed the class, but are still arXiv eprints
        fields.update(dict(
            archiveprefix='arXiv',
            eprint=arxiv_number
        ))

    # if requested, try to use the doi to extract additional information.
    # `doi` may be missing altogether when it was not among the requested
    # fields, hence the `.get`
    doi = fields.get('doi')
    if use_doi and doi:
        fields.update(pull_info_from_doi(doi))
    # the arXiv gives a list of authors; unless the DOI lookup replaced it
    # with a string, convert it into a single bibtex author string
    if isinstance(fields.get('author'), (list, tuple)):
        fields['author'] = authors_list_to_string(fields['author'])
    # do not carry around an empty doi field
    if not doi:
        fields.pop('doi', None)
    return fields
def arxiv_query_title(title):
    """Search the arXiv for `title`, with hyphens replaced by spaces.

    Returns the result of `arxiv.query` for a quoted ``ti:`` search.
    """
    dehyphenated = title.replace('-', ' ')
    return arxiv.query(search_query='ti:"{}"'.format(dehyphenated))
def _has_journal_arxiv_field(db_entry):
    """Extract a new-style arXiv id stored in the `journal` field, if any.

    Returns the id when the journal field reads
    ``arXiv preprint arXiv:<id>``, and `False` in every other case
    (including a missing journal field).
    """
    try:
        journal = db_entry['journal']
    except KeyError:
        return False
    # entries coming from google scholar start with `arXiv`
    if not journal.startswith('arXiv'):
        return False
    found = re.match(r'arXiv preprint arXiv:([0-9]{4}\.[0-9]*)$', journal)
    return found.group(1) if found else False
def fill_bibdatabase_arxiv_entries(db, max_processed_entries=None, force=False):
    """Use titles to try to fill in the arxiv details.

    For each entry of `db` the arXiv is queried, either by the id found in
    the entry's journal field (which is then deleted) or by the entry's
    title, and the fields extracted from the answer are merged into the
    entry. Entries are modified in place; nothing is returned.

    Parameters
    ----------
    db : database object exposing an `entries` list
    max_processed_entries : int, optional
        Stop after this many entries have been updated.
    force : bool
        If False, entries that already have an `eprint` field are skipped.
    """
    num_processed_entries = 0
    for entry in db.entries:
        # if an eprint entry already exists, don't touch anything
        if 'eprint' in entry and not force:
            continue
        arxiv_id = _has_journal_arxiv_field(entry)
        if arxiv_id:
            print('Found id {} in journal entry, using this'.format(arxiv_id))
            print('Deleting journal field from this entry')
            del entry['journal']
            answers = arxiv.query(id_list=[arxiv_id])
        elif 'title' in entry:
            answers = arxiv_query_title(entry['title'])
        else:
            # neither an id nor a title: there is nothing to search for
            logger.info('{}: No title to search for'.format(entry['ID']))
            continue

        if len(answers) == 0:
            logger.info('{}: No results'.format(entry['ID']))
            continue
        if len(answers) > 1:
            # if we get more than one answer from the arxiv, we look for one
            # with an exactly matching title. If none is found, do nothing.
            logger.info('{}: Found more than one result'.format(entry['ID']))
            exact_matches = [ans for ans in answers
                             if ans['title'] == entry.get('title')]
            if not exact_matches:
                continue
            answer = exact_matches[0]
        else:
            answer = answers[0]

        fields_to_add = extract_fields_from_arxiv_query_result(answer)
        entry.update(fields_to_add)
        logger.info('Updated {}'.format(entry['ID']))
        # abort if maximum number of entries to process has been reached
        if max_processed_entries is not None:
            num_processed_entries += 1
            if num_processed_entries == max_processed_entries:
                print('Processed {} entries. Stopping as requested.'.format(
                    num_processed_entries))
                return
def _update_entry_from_doi(entry):
    """Overwrite fields of `entry` with the data found through its DOI.

    Entries without a `doi` field are returned as they are. Otherwise a
    fixed set of fields is pulled from the DOI and merged into `entry`,
    which is modified in place and also returned.
    """
    if 'doi' in entry:
        refreshed = pull_info_from_doi(
            entry['doi'],
            ['title', 'volume', 'year', 'number', 'journal',
             'publisher', 'month', 'pages'])
        entry.update(refreshed)
    return entry
def update_entries_from_doi(path_or_db, monitor=True):
    """Refresh every entry of a bib database using its DOI.

    `path_or_db` is a bib file path or a database object; the result is
    saved back to the file or returned accordingly. With `monitor` set, the
    loop over the entries is wrapped in a `tqdm` progress bar.
    """
    _, db = _load_or_use(path_or_db)
    entries = tqdm(db.entries) if monitor else db.entries
    for entry in entries:
        _update_entry_from_doi(entry)
    return _save_or_return(path_or_db, db)
def check_id_style(path_or_db, style='gscholar'):
    """Tell whether all entry ids follow the requested style.

    Only the ``'gscholar'`` style is supported (letters, four digits,
    letters); any other value raises `ValueError`. Every offending id is
    printed. Returns True when all ids match and False otherwise.
    """
    db = load_bibtex_database(path_or_db) if isinstance(path_or_db, str) else path_or_db
    if style != 'gscholar':
        raise ValueError('Only google scholar style supported at the moment.')

    gscholar_id = re.compile(r'[a-z]*[0-9]{4}[a-z]*$')
    all_good = True
    for entry in db.entries:
        entry_id = entry['ID']
        if gscholar_id.match(entry_id) is None:
            all_good = False
            print('{} does not match the ID style.'.format(entry_id))
    return all_good
def _fix_key_casing(entry, correct_string):
    """Rename a wrongly cased key of `entry` to `correct_string`.

    If `entry` has a key equal to `correct_string` up to letter casing
    (e.g. ``archiveprefix`` for ``archivePrefix``), its value is moved under
    the correctly cased key. The entry is modified in place.

    Returns
    -------
    The same `entry` object, in all cases, so that callers can safely rebind
    their variable to the result.
    """
    # already fine: hand the entry back (never None, callers rebind to it)
    if correct_string in entry:
        return entry
    target = correct_string.lower()
    wrong_key = None
    for key in entry.keys():
        if key.lower() == target:
            wrong_key = key
    # if an incorrectly cased key was indeed found, amend the mistake
    if wrong_key is not None:
        entry[correct_string] = entry.pop(wrong_key)
    return entry
def check_arxiv_fields_consistency(path_or_db, fix=True, assume_arxiv=True, assume_quantph=True):
    """Makes sure that all arxiv entries follow the correct format.

    The correct format is here assumed to be:
    1) For new-style eprints:
        archivePrefix = {arXiv},
        eprint = {0902.0885},
        primaryClass = {quant-ph}
    2) For old-style eprints:
        archivePrefix = {arXiv},
        eprint = {quant-ph/0401062}

    Parameters
    ----------
    path_or_db : str or database object
        Path of a bib file, or an already loaded database.
    fix : bool
        If True, attempts are made to fix inconsistencies (key casing,
        missing `archivePrefix`, missing `primaryClass`). If False,
        problems are only reported and entries are left untouched.
    assume_arxiv : bool
        Has the parser assume that all eprint entries are supposed to refer
        to arxiv eprints, so that a missing `archivePrefix` can be added.
    assume_quantph : bool
        If True the `primaryClass` is automatically set to quant-ph, if
        missing.

    Returns
    -------
    None if a path was given (the file is rewritten), the database object
    otherwise.
    """
    _, db = _load_or_use(path_or_db)
    new_style = re.compile(r'[0-9]{4}\.[0-9]{4,5}$')
    old_style = re.compile(r'[a-z-]+/[0-9]+$')
    for entry in db.entries:
        if fix:
            # sometimes the 'archivePrefix' and 'primaryClass' entries exist
            # but are not properly cased. The helper works in place, so its
            # return value is deliberately not used to rebind `entry`.
            _fix_key_casing(entry, 'archivePrefix')
            _fix_key_casing(entry, 'primaryClass')

        # if there is a 'eprint' entry but not a 'archivePrefix' one, things
        # are probably wrong
        if 'eprint' in entry and 'archivePrefix' not in entry:
            if fix and assume_arxiv:
                print("Adding 'archivePrefix' field to the entry '{}'".format(
                    entry['ID']))
                entry['archivePrefix'] = 'arXiv'
            else:
                print("The entry '{}' is probably missing the 'archivePrefix' field."
                      " (has to be fixed manually).".format(entry['ID']))

        # the remaining checks only concern entries marked as arxiv ones
        if entry.get('archivePrefix') != 'arXiv':
            continue
        if 'eprint' not in entry:
            print("The entry '{}' is missing the 'eprint' field."
                  " (has to be fixed manually).".format(entry['ID']))
            continue

        # check correct format of the eprint entry
        if new_style.match(entry['eprint']):
            if 'primaryClass' not in entry:
                if fix and assume_quantph:
                    print("Adding 'quant-ph' as primaryClass for the '{}' entry.".format(
                        entry['ID']))
                    entry['primaryClass'] = 'quant-ph'
                else:
                    print("The entry '{}' is missing the 'primaryClass' field"
                          " (has to be fixed manually).".format(entry['ID']))
        elif old_style.match(entry['eprint']):
            if 'primaryClass' in entry:
                print("The entry '{}' should NOT have a 'primaryClass' field"
                      " (has to be fixed manually).".format(entry['ID']))
        else:
            print("The entry '{}' is using an incorrectly formatted 'eprint' field"
                  " (has to be fixed manually).".format(entry['ID']))
    return _save_or_return(path_or_db, db)
def check_fields(path_or_db, fields=None):
    """Prints all entries not containing one of the specified fields.

    Parameters
    ----------
    path_or_db : str or database object
        Path of a bib file, or an already loaded database.
    fields : str or iterable of str, optional
        Field name(s) every entry is expected to have. With the default
        `None` there is nothing to check and nothing is printed.
    """
    if isinstance(path_or_db, str):
        db = load_bibtex_database(path_or_db)
    else:
        db = path_or_db
    if fields is None:
        return
    # a bare string is one field name, not a sequence of characters
    if isinstance(fields, str):
        fields = [fields]
    for entry in db.entries:
        for field in fields:
            if field not in entry:
                print('{} is missing the {}'.format(entry['ID'], field))
def make_id_for_entry(entry, style='gscholar'):
    """Take entry as a dict, and return an ID to use for the bib entry.

    The id is built as ``<first author><year><first title word>``, all
    lowercase, where the title word is the first one longer than three
    characters (or, if the title has none, simply its first word).

    If author or year are missing, they are looked up on google scholar by
    title and, when found, added to `entry` (which is then modified).

    Raises
    ------
    NotImplementedError
        If `style` is not ``'gscholar'``.
    KeyError
        If the title is missing, or if author/year are missing and could
        not be retrieved from google scholar.
    """
    if style != 'gscholar':
        raise NotImplementedError('Not implemented yet.')
    if 'title' not in entry:
        logger.info('Title entry missing. Could not create id for entry.')
        logger.info(entry)
        raise KeyError('I could not find title information from the given DOI/'
                       'arXiv. This often happens with books, which for some r'
                       'easons often do no include the title information.')
    if 'author' not in entry or 'year' not in entry:
        # try to pull down additional information from google scholar
        logger.info('I could not find author/year information from DOI/arxiv,'
                    ' attempting to pull information down from gscholar.')
        gscholar_result = pull_info_from_gscholar(
            entry['title'], accepted_fields=['author', 'year'])
        if 'author' in gscholar_result and 'year' in gscholar_result:
            logger.info('Author/year information pulled from scholar.')
            entry.update(gscholar_result)
        else:
            raise KeyError("author, title and year are required.")
    title = entry['title']
    logger.info('I found the title "{}"'.format(title))
    year = entry['year']

    # extract first author: text before the first comma, stripped of one
    # surrounding brace on each side, cut at the first space.
    # `startswith`/`endswith` are used so that an empty author does not crash
    author = entry['author'].split(',')[0].lower()
    if author.startswith('{'):
        author = author[1:]
    if author.endswith('}'):
        author = author[:-1]
    if ' ' in author:
        author = author.split(' ')[0]

    # -- extract first word (looking at "words" with more than 3 chars) --
    # gather all words in the title, without punctuation
    words_in_title = [utils.remove_punctuation(w)
                      for w in re.findall(r'\S+', title)]
    long_words = [w for w in words_in_title if len(w) > 3]
    # titles made of short words only ("On the run") fall back to their
    # first non-empty word instead of failing
    candidates = long_words or [w for w in words_in_title if w]
    first_word = candidates[0].lower() if candidates else ''
    if first_word.startswith('{'):
        first_word = first_word[1:]
    if first_word.endswith('}'):
        first_word = first_word[:-1]
    if '-' in first_word:
        first_word = first_word.split('-')[0]

    # build new id
    newid = '{}{}{}'.format(author, year, first_word)
    logger.info('New id for the given entry: `{}`'.format(newid))
    return newid
def fix_ids_to_scholar_style(path):
    """Rewrite, in the bib file at `path`, the ids not in scholar format.

    Ids already made of letters, four digits and letters are kept. For the
    others a new id is generated with `make_id_for_entry`; if that id is
    already used by some entry a `ValueError` is raised. The file is
    overwritten with the result.
    """
    db = load_bibtex_database(path)
    scholar_pattern = re.compile(r'[a-z]*[0-9]{4}[a-z]*$')
    for entry in db.entries:
        current_id = entry['ID']
        if scholar_pattern.match(current_id):
            continue
        logger.info('Processing {}'.format(current_id))
        newid = make_id_for_entry(entry)
        # check that the new ID doesn't already exist
        for other in db.entries:
            if other['ID'] == newid:
                print(entry['ID'], newid)
                raise ValueError('There is already an entry with this ID, '
                                 'there may be something wrong!')
        entry['ID'] = newid
    save_bibtex_database_to_file(path, db)
def make_bibentry_from_arxiv_id(arxiv_id):
    """Return a bib entry (a dict) describing the paper with `arxiv_id`.

    Fields whose value is `None` are discarded; the entry type is set to
    ``article`` and an id is generated with `make_id_for_entry`.
    """
    pulled = pull_info_from_arxiv_id(arxiv_id)
    entry = {}
    for key, value in pulled.items():
        if value is not None:
            entry[key] = value
    entry['ENTRYTYPE'] = 'article'
    entry['ID'] = make_id_for_entry(entry)
    return entry
def make_bibentry_from_doi(doi):
    """Build bib entry from a single DOI.

    The fields are pulled with `pull_info_from_doi`, the entry type is set
    to ``article`` and an id is generated with `make_id_for_entry`.

    Raises
    ------
    DOIError
        If no id can be generated for the entry. `make_id_for_entry`
        signals missing title/author/year information with a `KeyError`,
        which is translated here (and kept as the cause).
    """
    entry = pull_info_from_doi(doi)
    entry['ENTRYTYPE'] = 'article'
    try:
        entry['ID'] = make_id_for_entry(entry)
    except (DOIError, KeyError) as err:
        logger.debug('Full entry:')
        logger.debug(entry)
        raise DOIError('Something went wrong while fetching'
                       ' the id to use for the doi "{}". Probably,'
                       ' the DOI database does not hold enough information'
                       ' about the paper (it often lacks a title).'.format(doi)) from err
    return entry
def get_bibentry_from_doi(dois):
    """Return, as a bibtex string, the entries for one or more DOIs.

    `dois` is a single DOI string or an iterable of DOI strings. Each DOI is
    logged and converted with `make_bibentry_from_doi`; the resulting
    entries are then formatted together into one string.
    """
    # a lone string is handled as a one-element list
    doi_list = [dois] if isinstance(dois, str) else dois
    entries = []
    for doi in doi_list:
        logger.info('Making bib entry for the doi "{}"'.format(doi))
        entries.append(make_bibentry_from_doi(doi))
    return _print_bibtex_string_from_entries(entries)
def _print_bibtex_string_from_entries(entries):
    """Format a list of entry dicts as a bibtex string.

    The entries are placed in a fresh `BibDatabase`, which is then
    serialized with a `BibTexWriter`.
    """
    database = bibtexparser.bibdatabase.BibDatabase()
    database.entries = entries
    bib_writer = bibtexparser.bwriter.BibTexWriter()
    bib_writer.indent = ' '
    return bib_writer.write(database)
def get_bibentry_from_arxiv_id(arxiv_ids):
    """Return, as a bibtex string, the entries for one or more arxiv ids.

    `arxiv_ids` is a single id string or an iterable of id strings.
    """
    id_list = [arxiv_ids] if isinstance(arxiv_ids, str) else arxiv_ids
    entries = [make_bibentry_from_arxiv_id(one_id) for one_id in id_list]
    return _print_bibtex_string_from_entries(entries)
def add_entry_from_arxiv_id(path_or_db, arxiv_id, force=False):
    """Build entry corresponding to arxiv id, and add it to bib database.

    Parameters
    ----------
    path_or_db : str or database object
        Path of a bib file, or an already loaded database.
    arxiv_id : str
        Identifier of the paper to add.
    force : bool
        If False, nothing is added when an entry whose `eprint` field
        equals `arxiv_id` already exists.

    Returns
    -------
    None if a path was given (the file is updated in place), the database
    object otherwise.
    """
    path, db = _load_or_use(path_or_db)
    # check whether entry already exists
    if not force:
        for entry in db.entries:
            if 'eprint' in entry and entry['eprint'] == arxiv_id:
                logger.info('An entry with arxiv id {} already exists.'.format(
                    arxiv_id))
                if path:
                    # the file is left untouched, so the safety copy made
                    # while loading it must not be left behind
                    os.remove(path + '.bak')
                    return None
                return db
    # do the thing
    newentry = make_bibentry_from_arxiv_id(arxiv_id)
    db.entries.append(newentry)
    # save or return output
    return _save_or_return(path_or_db, db)
def add_entries_from_arxiv_ids(path_or_db, arxiv_ids):
    """Take list of arxiv ids and build corresponding entries.

    Each id is added with `add_entry_from_arxiv_id` (ids already present in
    the database are skipped by it).

    Returns
    -------
    None if a path was given (the file is updated in place), the database
    object otherwise.
    """
    _, db = _load_or_use(path_or_db)
    for arxiv_id in arxiv_ids:
        # called with a database object, so the database is handed back
        db = add_entry_from_arxiv_id(db, arxiv_id)
    # save or return result (the result used to be silently dropped)
    return _save_or_return(path_or_db, db)
def add_entry_from_doi(path_or_db, doi):
    """Add an entry retrieved from a doi identifier.

    Nothing is added when an entry whose `doi` field equals `doi` already
    exists.

    Returns
    -------
    None if a path was given (the file is updated in place), the database
    object otherwise. This mirrors `add_entry_from_arxiv_id`.
    """
    path, db = _load_or_use(path_or_db)
    # check whether entry already exists
    for entry in db.entries:
        if 'doi' in entry and entry['doi'] == doi:
            logger.info('An entry with doi {} already exists.'.format(doi))
            if path:
                # the file is left untouched, so the safety copy made while
                # loading it must not be left behind
                os.remove(path + '.bak')
                return None
            return db
    # do the thing
    newentry = make_bibentry_from_doi(doi)
    db.entries.append(newentry)
    # save or return output (the result used to be silently dropped)
    return _save_or_return(path_or_db, db)