def remap_keys(old_filename, new_filename, output_folder):
    """
    Create a script to remap bibtex keys from one .bib file to another.

    This function uses the edit distance on filenames generated from a bibtex
    entry to compare entries together, and greedily matches old entries to new
    entries.
    """
    old_db = utils.read_bib_file(old_filename, homogenize=False)
    new_db = utils.read_bib_file(new_filename, homogenize=False)
    old_list = {}
    new_list = {}
    subst = {}
    for entry in new_db.entries:
        name = nomenclature.gen_filename(entry)
        new_list[name] = entry['ID']
    for entry in old_db.entries:
        name = nomenclature.gen_filename(entry)
        if name in new_list.keys():
            subst[entry['ID']] = new_list[name]
            del new_list[name]
        else:
            old_list[name] = entry['ID']
    for name, bibkey in new_list.items():
        match, score = utils.most_similar_filename(name, old_list.keys())
        if score < 0.90:
            print(termcolor.colored("Warning: potentially incorrect substitution:",
                                    'yellow', attrs=["bold"]))
            print(termcolor.colored('-', 'red') + match)
            print(termcolor.colored('+', 'green') + name)
        subst[old_list[match]] = bibkey
    utils.write_remap_script(subst, output_folder)
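# NOTE: `remap_keys` relies on `utils.most_similar_filename`, which is defined
# elsewhere in this package. The sketch below is NOT the actual implementation;
# it is a minimal stand-in (assuming a difflib-based similarity ratio) added
# only to illustrate how greedy matching on generated filenames could work.
def _most_similar_filename_sketch(target, candidates):
    """Return the candidate closest to `target` and a similarity score in [0, 1]."""
    import difflib
    best, best_score = None, 0.0
    for cand in candidates:
        score = difflib.SequenceMatcher(None, target, cand).ratio()
        if score > best_score:
            best, best_score = cand, score
    return best, best_score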
def rename_folder(folder, use_backup):
    """
    Rename the pdf files in the given folder according to the information
    found in `biblio.bib`. Note that this function will update file entries in
    `biblio.bib`, but also in `.queried.bib`.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but renames the pdfs in the given folder and updates the bib
        files.
    """
    # Read input bib files
    pretty_bib_path = os.path.join(folder, 'biblio.bib')
    pretty_db = utils.read_bib_file(pretty_bib_path)
    queried_bib_path = os.path.join(folder, '.queried.bib')
    queried_db = utils.read_bib_file(queried_bib_path)
    queried_files = utils.create_file_dict(queried_db)
    # Iterate over db entries
    need_rename = False
    for entry in pretty_db.entries:
        old_filename = utils.decode_filename_field(entry['file'])
        new_filename = nomenclature.gen_filename(entry)
        if not os.path.exists(os.path.join(folder, old_filename)):
            print(termcolor.colored('file not found: ', 'red') + old_filename)
        elif old_filename != new_filename:
            need_rename = True
            print(termcolor.colored('-', 'red') + old_filename)
            print(termcolor.colored('+', 'green') + new_filename)
    # Skip if nothing to rename
    if not need_rename:
        return
    # Ask confirmation
    cmd = input('(Y/n) ')
    if cmd == '' or cmd == 'y' or cmd == 'Y':
        for entry in pretty_db.entries:
            old_filename = utils.decode_filename_field(entry['file'])
            new_filename = nomenclature.gen_filename(entry)
            old_path = os.path.join(folder, old_filename)
            new_path = os.path.join(folder, new_filename)
            if os.path.exists(old_path):
                os.rename(old_path, new_path)
                new_val = utils.encode_filename_field(new_filename)
                if old_filename in queried_files:
                    idx = queried_files[old_filename]
                    queried_db.entries[idx]['file'] = new_val
                entry['file'] = new_val
        # Write output bibtex files
        utils.write_with_backup(pretty_bib_path,
                                utils.write_bib(pretty_db, order=False), use_backup)
        utils.write_with_backup(queried_bib_path,
                                utils.write_bib(queried_db, order=False), use_backup)
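# Usage sketch for `rename_folder`: 'papers/' is a hypothetical folder that
# contains the PDFs together with 'biblio.bib' and '.queried.bib'. The call
# previews the renames, asks for confirmation, then renames the PDFs and
# rewrites both bib files (keeping backups).
def _example_rename_folder():
    rename_folder('papers/', use_backup=True)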
def scholarly_query(authors, title):
    """
    Query Google Scholar database.

    Args:
        authors (list): a list of strings with the last names of the first
            authors.
        title (str): the title of the article.

    Returns:
        A record (dict) of the bibtex entry obtained from Google Scholar.
    """
    query = ' '.join(authors) + ' ' + title
    search_query = scholarly.search_pubs_query(query)
    try:
        res = next(search_query)
    except StopIteration:
        return None
    res.fill()
    if 'abstract' in res.bib:
        del res.bib['abstract']
    # Post-process title
    res.bib['title'] = re.sub('\\.*$', '', res.bib['title'])
    print('S: ' + nomenclature.gen_filename(res.bib))
    return res.bib
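# Usage sketch for `scholarly_query`: the author names and title below are
# hypothetical. Note that this code targets an older `scholarly` API that
# exposes `search_pubs_query`; the call returns a plain dict, or None when
# Google Scholar yields no result.
def _example_scholarly_query():
    record = scholarly_query(['Doe', 'Smith'], 'A Hypothetical Paper Title')
    if record is not None:
        print(record.get('title'), record.get('year'))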
def extract_from_file(filename, output_folder):
    """
    Extract citations from a given .bib file, and write the resulting list to
    a '.biblist' file in the given folder.
    """
    db = utils.read_bib_file(filename, homogenize=False)
    outfile = os.path.join(output_folder, ".biblist")
    if os.path.exists(outfile):
        cmd = input("overwrite existing file '{0}' (y/N) ".format(outfile))
        if cmd != 'y':
            return
    filenames = sorted([nomenclature.gen_filename(entry) for entry in db.entries])
    with open(outfile, 'w') as f:
        for name in filenames:
            f.write(name)
            f.write('\n')
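# Usage sketch for `extract_from_file`: the paths below are hypothetical. One
# generated filename per bibtex entry is written to 'notes/.biblist', asking
# before overwriting an existing file.
def _example_extract_from_file():
    extract_from_file('biblio.bib', 'notes/')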
def sync_folder(folder, use_backup):
    """
    Update the file field of bibtex entries for the given folder. When no good
    file match can be found for an entry, it is removed from the bibtex file,
    unless the user explicitly prevents it.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but updates `.queried.bib` and `biblio.bib` files.
    """
    for bib_file in ('.queried.bib', 'biblio.bib'):
        bib_path = os.path.join(folder, bib_file)
        db = utils.read_bib_file(bib_path)
        unmatched = set([os.path.basename(f) for f in utils.get_pdf_list(folder)])
        to_delete = []
        for i, entry in enumerate(db.entries):
            guess = nomenclature.gen_filename(entry)
            if 'file' in entry:
                guess = utils.decode_filename_field(entry['file'])
            match, score = utils.most_similar_filename(guess, unmatched)
            if score >= 0.90:
                unmatched.remove(match)
                entry['file'] = utils.encode_filename_field(match)
            else:
                print(termcolor.colored(bib_file, "magenta") +
                      ": ({1}) will remove '{0}'".format(
                          guess, termcolor.colored(score, "yellow")))
                to_delete.append(i)
        # Delete unmatched entries
        if to_delete:
            cmd = input('(Y/n) ')
            if cmd == '' or cmd == 'y' or cmd == 'Y':
                for i in sorted(to_delete, reverse=True):
                    del db.entries[i]
        # Write synced database
        utils.write_with_backup(bib_path, utils.write_bib(db, order=False), use_backup)
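# Usage sketch for `sync_folder`: 'papers/' is a hypothetical folder. Matched
# entries get their 'file' field updated; entries whose best filename match
# scores below 0.90 are flagged and, upon confirmation, removed from the bib
# files.
def _example_sync_folder():
    sync_folder('papers/', use_backup=True)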
def guess_manual_files(folder, queried_db, update_queried_db=True):
    """
    Tries to guess which files in a folder correspond to entries placed in the
    `.manual.bib` file. This is useful, e.g., to avoid performing online
    queries for files which we know have a manual entry. If a '.manual.bib' is
    present, override corresponding queried entries.

    The way it works is as follows:
    1. Guess the filename of each entry in `.manual.bib`
    2. Find the entry in `.queried.bib` with the closest file name in its
       'file' field
    3. Override with the manual entry
    """
    files = create_file_dict(queried_db)
    manual_bib_path = os.path.join(folder, '.manual.bib')
    if os.path.exists(manual_bib_path):
        manual_database = read_bib_file(manual_bib_path, homogenize=True)
        for entry in manual_database.entries:
            guess = nomenclature.gen_filename(entry)
            file = encode_filename_field(guess)
            best_score = 0.0
            best_idx = -1
            # Compare against other file entries
            for key, idx in sorted(files.items()):
                sc = simratio(key, file)
                if sc > best_score:
                    best_score = sc
                    best_idx = idx
            # Update 'file' field
            match, _ = most_similar_filename(guess, folder)
            entry['file'] = encode_filename_field(match)
            # If best match is good enough, override old entry
            if update_queried_db:
                if best_score > 0.95:
                    queried_db.entries[best_idx] = entry
                else:
                    queried_db.entries.append(entry)
            else:
                files[match] = -1
    return files
def crossref_query(authors, title):
    """
    Query Crossref database.

    Args:
        authors (list): a list of strings with the last names of the first
            authors.
        title (str): the title of the article.

    Returns:
        A tuple (bibtex, json, score) where the first element is the data in
        bibtex format (returned as a record/dict), the second element is the
        data returned in json format, and the third element is the score of
        the match given by Crossref.
    """
    cr = Crossref()
    query = ['+"' + name + '"' for name in authors]
    query = ' '.join(query) + ' +"' + title + '"'
    x = cr.works(query=query)
    assert x['status'] == "ok"
    # No result found
    if not x['message']['items']:
        print_score(0)
        return (None, [], 0)
    best_item = x['message']['items'][0]
    for item in x['message']['items']:
        if item['score'] < best_item['score']:
            break
        else:
            best_item = pick_best(title, best_item, item)
    # Retrieve DOI and json item
    doi = best_item['DOI']
    res_json = best_item
    # If the entry is invalid, return a score of 0
    if 'author' not in res_json or not res_json['title']:
        print_score(0)
        return (None, res_json, 0)
    # Retrieve metadata as bibtex entry
    res_bib = cn.content_negotiation(ids=doi, format="bibentry")
    res_bib = re.sub('ä', 'ä', res_bib)
    res_bib = re.sub('Ă', 'Ö', res_bib)
    res_bib = re.sub('รถ', 'ö', res_bib)
    res_bib = re.sub('Ăź', 'ü', res_bib)
    res_bib = re.sub('Ěo', 'ö', res_bib)
    res_bib = re.sub('ďż˝', 'ø', res_bib)
    res_bib = re.sub('ĂŤ', 'ë', res_bib)
    db = bibtexparser.loads(res_bib)
    assert len(db.entries) == 1
    res_bib = db.entries[0]
    # If article has subtitle(s), fix bibtex entry
    if 'subtitle' in res_json:
        # Discard subtitles that are all uppercase
        subtitles = [x for x in res_json['subtitle'] if not str.isupper(x)]
    else:
        subtitles = []
    if len(subtitles) > 0:
        title = ' '.join(res_json['title'])
        subtitle = ' '.join(subtitles)
        if title.lower().startswith(subtitle.lower()) or utils.simratio(title, subtitle) > 0.95:
            # Don't repeat the subtitle if it is too similar to the title
            new_title = title
        else:
            new_title = title + ": " + subtitle
        res_bib['title'] = new_title
    else:
        new_title = ' '.join(res_json['title'])
        res_bib['title'] = new_title
    # Post-process title
    res_bib['title'] = re.sub('\\*$', '', res_bib['title'])
    res_bib['title'] = re.sub('^[0-9]*\\. ', '', res_bib['title'])
    res_bib['title'] = re.sub('\\.*$', '', res_bib['title'])
    # If bibtex entry has a 'journal' field, then use the longest alias from the json
    if 'journal' in res_bib:
        best = ""
        for container in res_json['container-title']:
            if len(container) > len(best):
                best = container
        res_bib['journal'] = best
    # If entry is missing the year, set score to 0
    score = res_json['score']
    if 'year' not in res_bib:
        score = 0
    # Fix incorrect year in crossref entry
    if 'published-print' in res_json:
        item = res_json['published-print']
        if 'date-parts' in item and len(item['date-parts']) == 1:
            date = item['date-parts'][0]
            year = date[0]
            month = date[1] if len(date) > 1 else None
            if str(year) != res_bib['year']:
                res_bib['year'] = str(year)
                if month is None and 'month' in res_bib:
                    del res_bib['month']
                elif month is not None:
                    assert month >= 1 and month <= 12
                    month_str = utils.MONTHS[month - 1]
                    res_bib['month'] = month_str
    # Fix potentially ambiguous author entries
    msg = utils.fix_author_field(res_bib, res_json)
    print('C: ' + nomenclature.gen_filename(res_bib))
    print_score(score)
    # If score is above threshold, display msg from fix_author_field
    if score >= config.crossref_accept_threshold and msg:
        print(msg)
    # Return database entry
    return (res_bib, res_json, score)
def crossref_query(authors, title):
    """
    Query Crossref database.

    Args:
        authors (list): a list of strings with the last names of the first
            authors.
        title (str): the title of the article.

    Returns:
        A tuple (bibtex, json, score) where the first element is the data in
        bibtex format (returned as a record/dict), the second element is the
        data returned in json format, and the third element is the score of
        the match given by Crossref.
    """
    cr = Crossref()
    # works?query.title=An+Improved+Adaptive+Constraint+Aggregation+for+Integrated+Layout+and+Topology+Optimization&query.author=Gao+Zhu+Zhang+Zhou&sort=score&rows=1
    # query = ['+' + name + '' for name in authors]
    # query = 'query.title=' + urllib.parse.quote_plus(title) + '&query.author=' + urllib.parse.quote_plus(' '.join(authors)) + '&sort=score&rows=1'
    # print(query)
    if ''.join(authors):
        args = dict(
            query_title=urllib.parse.quote_plus(title),
            query_author=urllib.parse.quote_plus(' '.join(authors))
        )
    else:
        args = dict(
            query=urllib.parse.quote_plus(title),
        )
    x = cr.works(sort='score', limit=1, **args)
    # x = cr.works(query=query)
    assert x['status'] == "ok"
    # No result found
    if not x['message']['items']:
        print_score(0)
        return (None, [], 0)
    best_item = x['message']['items'][0]
    # print(json.dumps(best_item, indent=4))
    for item in x['message']['items']:
        if item['score'] < best_item['score']:
            break
        else:
            best_item = pick_best(title, best_item, item)
    # Retrieve DOI and json item
    doi = best_item['DOI']
    res_json = best_item
    # If the entry is invalid, return a score of 0
    if 'author' not in res_json or not res_json['title']:
        print_score(0)
        return (None, res_json, 0)
    # Retrieve metadata as bibtex entry
    res_bib = cn.content_negotiation(ids=doi, format="bibentry")
    res_bib = re.sub('ä', 'ä', res_bib)
    res_bib = re.sub('Ă', 'Ö', res_bib)
    res_bib = re.sub('รถ', 'ö', res_bib)
    res_bib = re.sub('Ăź', 'ü', res_bib)
    res_bib = re.sub('Ěo', 'ö', res_bib)
    res_bib = re.sub('ďż˝', 'ø', res_bib)
    res_bib = re.sub('ĂŤ', 'ë', res_bib)
    db = bibtexparser.loads(res_bib)
    assert len(db.entries) == 1
    res_bib = db.entries[0]
    # If article has subtitle(s), fix bibtex entry
    subtitles = None
    if 'subtitle' in res_json:
        # Discard subtitles that are all uppercase
        subtitles = [x for x in res_json['subtitle'] if not str.isupper(x)]
    if subtitles:
        title = ' '.join(res_json['title'])
        subtitle = ' '.join(subtitles)
        if title.lower().startswith(subtitle.lower()) or utils.simratio(title, subtitle) > 0.95:
            # Don't repeat the subtitle if it is too similar to the title
            new_title = title
        else:
            new_title = title + ": " + subtitle
        res_bib['title'] = new_title
    else:
        new_title = ' '.join(res_json['title'])
        res_bib['title'] = new_title
    # Post-process title
    res_bib['title'] = re.sub('\\*$', '', res_bib['title'])
    res_bib['title'] = re.sub('^[0-9]*\\. ', '', res_bib['title'])
    res_bib['title'] = re.sub('\\.*$', '', res_bib['title'])
    # If bibtex entry has a 'journal' field, then use the longest alias from the json
    if 'journal' in res_bib:
        best = ""
        for container in res_json['container-title']:
            if len(container) > len(best):
                best = container
        res_bib['journal'] = best
    # If entry is missing the year, set score to 0
    score = res_json['score']
    if 'year' not in res_bib:
        score = 0
    # Fix incorrect year in crossref entry
    if 'published-print' in res_json:
        item = res_json['published-print']
        if 'date-parts' in item and len(item['date-parts']) == 1:
            date = item['date-parts'][0]
            year = date[0]
            month = date[1] if len(date) > 1 else None
            if str(year) != res_bib['year']:
                res_bib['year'] = str(year)
                if month is None and 'month' in res_bib:
                    del res_bib['month']
                elif month is not None:
                    assert month >= 1 and month <= 12
                    month_str = utils.MONTHS[month - 1]
                    res_bib['month'] = month_str
    # Fix potentially ambiguous author entries
    msg = utils.fix_author_field(res_bib, res_json)
    print('C: ' + nomenclature.gen_filename(res_bib))
    print_score(score)
    # If score is above threshold, display msg from fix_author_field
    if score >= config.crossref_accept_threshold and msg:
        print(msg)
    # Return database entry
    return (res_bib, res_json, score)
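# Usage sketch for `crossref_query`: the author list and title below are
# hypothetical. The returned score is Crossref's relevance score (0 when the
# match is unusable); callers in this package compare it against
# `config.crossref_accept_threshold` before trusting the bibtex record.
def _example_crossref_query():
    bib, raw_json, score = crossref_query(['Doe'], 'A Hypothetical Paper Title')
    if bib is not None and score >= config.crossref_accept_threshold:
        print(nomenclature.gen_filename(bib))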
def guess_manual_files(folder, queried_db, update_queried_db=True):
    """
    Tries to guess which files in a folder correspond to entries placed in the
    `.manual.bib` file. This is useful, e.g., to avoid performing online
    queries for files which we know have a manual entry. If a '.manual.bib' is
    present, override corresponding queried entries.

    The way it works is as follows:
    1. Guess the filename of each entry in `.manual.bib`
    2. Find the entry in `.queried.bib` with the closest file name in its
       'file' field
    3. Override with the manual entry
    """
    files = create_file_dict(queried_db)
    manual_bib_path = os.path.join(folder, '.manual.bib')
    if os.path.exists(manual_bib_path):
        manual_database = read_bib_file(manual_bib_path, homogenize=True)
        for entry in manual_database.entries:
            guess = nomenclature.gen_filename(entry)
            logger.warning("Try to find a match for manual entry: {}", guess)
            queried_best_score = 0.0
            queried_best_idx = -1
            queried_best_key = None
            # Find the entry from .queried that is the most similar to the manual entry
            for key, idx in sorted(files.items()):
                sc = simratio(key, guess)
                if sc > queried_best_score:
                    queried_best_score = sc
                    queried_best_idx = idx
                    queried_best_key = key
            # Find most similar filename in the folder being processed
            match, match_score = most_similar_filename(guess, folder)
            if match_score < 0.9:
                logger.warning(
                    "Cannot find a file matching manual entry (simratio: {}).\n- Entry: {}\n- Match: {}",
                    match_score, guess, match)
                res = None
                while res not in ['y', 'n']:
                    res = input("Use best match for this entry? [y/n]")
                if res == 'n':
                    continue
            else:
                logger.info("Found a file matching manual entry: {}", guess)
            entry['file'] = encode_filename_field(match)
            files[match] = -1
            # If best match is good enough, override queried entry with the manual one
            if update_queried_db:
                if queried_best_idx >= 0 and queried_best_score > 0.90:
                    logger.info("Found a query matching manual entry: {}", guess)
                    queried_db.entries[queried_best_idx] = entry
                elif queried_best_idx >= 0 and queried_best_score > 0.80:
                    logger.warning(
                        "Could not find a query matching manual entry (simratio: {}).\n- Entry: {}\n- Query: {}",
                        queried_best_score, guess, queried_best_key)
                    res = None
                    while res not in ['y', 'n']:
                        res = input("Replace this query with the manual entry? [y/n]")
                    if res == 'y':
                        queried_db.entries[queried_best_idx] = entry
                    else:
                        queried_db.entries.append(entry)
                else:
                    logger.debug("Could not find a query matching manual entry: {}", guess)
                    queried_db.entries.append(entry)
    return files
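# Usage sketch for `guess_manual_files`: the paths below are hypothetical.
# Entries from '.manual.bib' override matching entries of the queried database
# in place; the returned dict (built by `create_file_dict`) maps file names to
# entry indices, with -1 marking files that are covered by a manual entry.
def _example_guess_manual_files():
    queried_db = read_bib_file(os.path.join('papers/', '.queried.bib'))
    files = guess_manual_files('papers/', queried_db, update_queried_db=True)
    print(files)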