def merge_folder_tree(folder, use_backup):
    """
    Collect every `biblio.bib` found under `folder` into a single `master.bib`.

    The 'file' field of each collected entry is rewritten so that it is
    prefixed with the path of its subfolder relative to `folder`. Folders
    containing a `.nobib` marker file are not read (their subfolders are still
    visited by the walk).

    Args:
        folder (str): relative or absolute path of the folder to process.
        use_backup (bool): forwarded to `utils.write_with_backup` when writing
            `master.bib`.

    Returns:
        Nothing, but creates a file named `master.bib` in the given folder.
    """
    root = os.path.abspath(folder)
    merged = BibDatabase()

    for current, _dirs, _files in os.walk(root):
        # A `.nobib` marker blacklists this folder
        if os.path.exists(os.path.join(current, '.nobib')):
            continue

        rel_prefix = os.path.relpath(current, root)
        local_db = utils.read_bib_file(os.path.join(current, 'biblio.bib'))
        for item in local_db.entries:
            local_name = utils.decode_filename_field(item['file'])
            item['file'] = utils.encode_filename_field(
                os.path.join(rel_prefix, local_name))
        merged.entries.extend(local_db.entries)

    # Round-trip through the entries dictionary to remove duplicated entries
    # (presumably keyed by bibtex ID -- confirm against BibDatabase).
    merged.entries = list(merged.entries_dict.values())

    # Write result
    master_path = os.path.join(folder, 'master.bib')
    master_text = utils.write_bib(merged, order=True)
    utils.write_with_backup(master_path, master_text, use_backup)
def rename_folder(folder, use_backup):
    """
    Rename the pdf files in the given folder according to the information found
    in `biblio.bib`. Note that this function will update file entries in
    `biblio.bib`, but also in `.queried.bib`.

    The planned renames are printed first and the user is asked to confirm.
    A rename is skipped (with a message) when the destination name is already
    taken by a different file, so that an existing pdf is never overwritten.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but renames the pdfs in the given folder, and update bib files.
    """
    # Read input bib files
    pretty_bib_path = os.path.join(folder, 'biblio.bib')
    pretty_db = utils.read_bib_file(pretty_bib_path)
    queried_bib_path = os.path.join(folder, '.queried.bib')
    queried_db = utils.read_bib_file(queried_bib_path)
    queried_files = utils.create_file_dict(queried_db)

    # Compute (entry, old name, new name) once, so that what is previewed is
    # exactly what gets applied after confirmation.
    plan = []
    need_rename = False
    for entry in pretty_db.entries:
        old_filename = utils.decode_filename_field(entry['file'])
        new_filename = nomenclature.gen_filename(entry)
        plan.append((entry, old_filename, new_filename))
        if not os.path.exists(os.path.join(folder, old_filename)):
            print(termcolor.colored('file not found: ', 'red') + old_filename)
        elif old_filename != new_filename:
            need_rename = True
            print(termcolor.colored('-', 'red') + old_filename)
            print(termcolor.colored('+', 'green') + new_filename)

    # Skip if nothing to rename
    if not need_rename:
        return

    # Ask confirmation
    cmd = input('(Y/n) ')
    if cmd in ('', 'y', 'Y'):
        for entry, old_filename, new_filename in plan:
            old_path = os.path.join(folder, old_filename)
            new_path = os.path.join(folder, new_filename)
            if not os.path.exists(old_path):
                continue
            if old_filename != new_filename:
                # os.rename silently replaces an existing destination on POSIX.
                # Refuse to clobber another pdf; samefile() keeps case-only
                # renames working on case-insensitive filesystems.
                if (os.path.exists(new_path)
                        and not os.path.samefile(old_path, new_path)):
                    print(termcolor.colored(
                        'target already exists, skipped: ', 'red')
                        + new_filename)
                    continue
                os.rename(old_path, new_path)
            new_val = utils.encode_filename_field(new_filename)
            if old_filename in queried_files:
                idx = queried_files[old_filename]
                queried_db.entries[idx]['file'] = new_val
            entry['file'] = new_val

    # Write output bibtex files
    utils.write_with_backup(pretty_bib_path,
                            utils.write_bib(pretty_db, order=False),
                            use_backup)
    utils.write_with_backup(queried_bib_path,
                            utils.write_bib(queried_db, order=False),
                            use_backup)
def query_crossref_folder(folder, use_backup):
    """
    Query metadata information for unmatched pdf files in the given folder.
    This function only queries Crossref.

    A pdf is queried when its filename can be parsed by
    `nomenclature.parse_filename` and it is not already listed in the set of
    known/skipped files. Results scoring at least
    `config.crossref_accept_threshold` are appended to `.queried.bib`; the
    other filenames are listed in `.rejected.txt`. `.queried.json` receives
    the JSON records of the entries accepted during this run.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but writes the queried databases in bibtex format in the given
        folder (and backup previous database if it differed).
    """
    # Create database
    bib_path = os.path.join(folder, '.queried.bib')
    db = utils.read_bib_file(bib_path)
    files = utils.guess_manual_files(folder, db, update_queried_db=False)
    utils.add_skip_files(folder, files)
    json_entries = []
    rejected = []

    # For each pdf in the folder
    for path in utils.get_pdf_list(folder):
        file = os.path.basename(path)
        parsed = nomenclature.parse_filename(file)
        if parsed is None or file in files:
            continue
        print('Q: ' + file)
        authors, title = parsed

        # Crossref
        rbib, rjson, score = providers.crossref_query(authors, title)
        if score >= config.crossref_accept_threshold:
            # Append filename and store entry
            rbib['file'] = utils.encode_filename_field(file)
            json_entries.append(rjson)
            db.entries.append(rbib)
        else:
            rejected.append(file)

    # Store results
    utils.write_with_backup(bib_path, utils.write_bib(db, order=False),
                            use_backup)
    json_path = os.path.join(folder, '.queried.json')
    json_str = json.dumps(json_entries, sort_keys=True, indent=4,
                          separators=(',', ': '))
    utils.write_with_backup(json_path, json_str, use_backup)
    # The rejection list is only (re)written when something was rejected
    if rejected:
        rejected_path = os.path.join(folder, '.rejected.txt')
        utils.write_with_backup(rejected_path, '\n'.join(rejected), use_backup)
def query_crossref_folder(folder, use_backup):
    """
    Look up Crossref metadata for the pdf files of `folder` that are not
    matched yet. Crossref is the only provider queried here.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but writes the queried databases in bibtex format in the given
        folder (and backup previous database if it differed).
    """
    # Load the existing query database and the set of files to leave alone
    queried_db = utils.read_bib_file(os.path.join(folder, '.queried.bib'))
    known = utils.guess_manual_files(folder, queried_db,
                                     update_queried_db=False)
    utils.add_skip_files(folder, known)

    accepted_json = []
    rejected_names = []

    for pdf_path in utils.get_pdf_list(folder):
        pdf_name = os.path.basename(pdf_path)
        parsed = nomenclature.parse_filename(pdf_name)
        if parsed is None or pdf_name in known:
            continue
        print('Q: ' + pdf_name)
        authors, title = parsed

        # Ask Crossref and keep the answer only when it scores high enough
        bib_entry, json_entry, score = providers.crossref_query(authors, title)
        if score >= config.crossref_accept_threshold:
            bib_entry['file'] = utils.encode_filename_field(pdf_name)
            accepted_json.append(json_entry)
            queried_db.entries.append(bib_entry)
        else:
            rejected_names.append(pdf_name)

    # Bibtex output
    bib_text = utils.write_bib(queried_db, order=False)
    utils.write_with_backup(os.path.join(folder, '.queried.bib'), bib_text,
                            use_backup)

    # JSON output (entries accepted during this run)
    json_text = json.dumps(accepted_json, sort_keys=True, indent=4,
                           separators=(',', ': '))
    utils.write_with_backup(os.path.join(folder, '.queried.json'), json_text,
                            use_backup)

    # Rejected filenames, only written when there is at least one
    rejected_path = os.path.join(folder, '.rejected.txt')
    if len(rejected_names) > 0:
        utils.write_with_backup(rejected_path, '\n'.join(rejected_names),
                                use_backup)
def sync_folder(folder, use_backup):
    """
    Re-associate the bibtex entries of `folder` with the pdf files present on
    disk, updating each entry's file field. Entries for which no sufficiently
    similar pdf name is found are listed and, after confirmation by the user,
    removed from the bibtex file.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but updates `.queried.bib` and `biblio.bib` files.
    """
    for bib_file in ('.queried.bib', 'biblio.bib'):
        bib_path = os.path.join(folder, bib_file)
        db = utils.read_bib_file(bib_path)

        # Names of the pdfs not yet claimed by an entry of this bib file
        unmatched = {os.path.basename(pdf)
                     for pdf in utils.get_pdf_list(folder)}
        to_delete = []

        for index, entry in enumerate(db.entries):
            generated = nomenclature.gen_filename(entry)
            if 'file' in entry:
                # An explicit file field takes precedence over the generated name
                guess = utils.decode_filename_field(entry['file'])
            else:
                guess = generated

            match, score = utils.most_similar_filename(guess, unmatched)
            if score >= 0.90:
                unmatched.remove(match)
                entry['file'] = utils.encode_filename_field(match)
            else:
                print(termcolor.colored(bib_file, "magenta") +
                      ": ({1}) will remove '{0}'".format(
                          guess, termcolor.colored(score, "yellow")))
                to_delete.append(index)

        # Delete unmatched entries (indices were collected in increasing
        # order, so walking them backwards keeps the remaining ones valid)
        if to_delete:
            cmd = input('(Y/n) ')
            if cmd in ('', 'y', 'Y'):
                for index in reversed(to_delete):
                    del db.entries[index]

        # Write synced database
        utils.write_with_backup(bib_path, utils.write_bib(db, order=False),
                                use_backup)
def sync_folder(folder, use_backup):
    """
    Update the file field of bibtex entries for the given folder.

    Both `.queried.bib` and `biblio.bib` are processed in turn. For each entry
    the most similar pdf filename in the folder is looked up; when the
    similarity score is below 0.90 the entry is scheduled for removal, which is
    carried out only if the user confirms.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but updates `.queried.bib` and `biblio.bib` files.
    """
    for bib_file in ('.queried.bib', 'biblio.bib'):
        bib_path = os.path.join(folder, bib_file)
        db = utils.read_bib_file(bib_path)
        pdf_names = [os.path.basename(p) for p in utils.get_pdf_list(folder)]
        unmatched = set(pdf_names)
        stale_indices = []

        for position, entry in enumerate(db.entries):
            candidate = nomenclature.gen_filename(entry)
            if 'file' in entry:
                candidate = utils.decode_filename_field(entry['file'])
            best, score = utils.most_similar_filename(candidate, unmatched)

            if score >= 0.90:
                # Claim the pdf so that no other entry can match it
                unmatched.remove(best)
                entry['file'] = utils.encode_filename_field(best)
            else:
                source_label = termcolor.colored(bib_file, "magenta")
                score_label = termcolor.colored(score, "yellow")
                print(source_label +
                      ": ({1}) will remove '{0}'".format(candidate,
                                                        score_label))
                stale_indices.append(position)

        # Delete unmatched entries, highest index first
        if stale_indices:
            answer = input('(Y/n) ')
            confirmed = answer == '' or answer == 'y' or answer == 'Y'
            if confirmed:
                for position in sorted(stale_indices, reverse=True):
                    db.entries.pop(position)

        # Write synced database
        synced_text = utils.write_bib(db, order=False)
        utils.write_with_backup(bib_path, synced_text, use_backup)
def query_google_folder(folder, use_backup):
    """
    Look up Google Scholar metadata for the pdf files of `folder` that are not
    matched yet. Google Scholar is the only provider queried here.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but writes the queried databases in bibtex format in the given
        folder (and backup previous database if it differed).
    """
    # Load the existing query database and the set of files to leave alone
    queried_path = os.path.join(folder, '.queried.bib')
    queried_db = utils.read_bib_file(queried_path)
    known = utils.guess_manual_files(folder, queried_db,
                                     update_queried_db=False)
    utils.add_skip_files(folder, known)

    for pdf_path in utils.get_pdf_list(folder):
        pdf_name = os.path.basename(pdf_path)
        parsed = nomenclature.parse_filename(pdf_name)
        if parsed is None or pdf_name in known:
            continue
        print('Q: ' + pdf_name)
        authors, title = parsed

        # Google Scholar; a None result means nothing is stored for this pdf
        found = providers.scholarly_query(authors, title)
        if found is not None:
            # Append filename and store entry
            found['file'] = utils.encode_filename_field(pdf_name)
            queried_db.entries.append(found)

    # Store results
    bib_text = utils.write_bib(queried_db, order=False)
    utils.write_with_backup(queried_path, bib_text, use_backup)