import json
import os

import termcolor

# Project-local modules; adjust the import style to the actual package layout if needed.
import config
import nomenclature
import providers
import utils


def query_crossref_folder(folder, use_backup):
    """
    Query metadata information for unmatched pdf files in the given folder.
    This function only queries Crossref.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but writes the queried databases in bibtex format in the
        given folder (and backs up the previous database if it differed).
    """
    # Read the existing database and list the files that should be skipped
    db = utils.read_bib_file(os.path.join(folder, '.queried.bib'))
    files = utils.guess_manual_files(folder, db, update_queried_db=False)
    utils.add_skip_files(folder, files)
    json_entries = []
    rejected = []

    # For each pdf in the folder
    for path in utils.get_pdf_list(folder):
        file = os.path.basename(path)
        parsed = nomenclature.parse_filename(file)
        if parsed is None or file in files:
            continue
        print('Q: ' + os.path.basename(file))
        authors, title = parsed

        # Crossref
        rbib, rjson, score = providers.crossref_query(authors, title)
        if score >= config.crossref_accept_threshold:
            # Append filename and store entry
            rbib['file'] = utils.encode_filename_field(file)
            json_entries.append(rjson)
            db.entries.append(rbib)
        else:
            rejected.append(os.path.basename(file))

    # Store results
    bib_path = os.path.join(folder, '.queried.bib')
    utils.write_with_backup(bib_path, utils.write_bib(db, order=False), use_backup)
    json_path = os.path.join(folder, '.queried.json')
    json_str = json.dumps(json_entries, sort_keys=True, indent=4, separators=(',', ': '))
    utils.write_with_backup(json_path, json_str, use_backup)
    rejected_path = os.path.join(folder, '.rejected.txt')
    if len(rejected) > 0:
        utils.write_with_backup(rejected_path, '\n'.join(rejected), use_backup)
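# providers.crossref_query above is project-local. As a rough illustration of
# what such a lookup can do, here is a minimal, self-contained sketch against
# the public Crossref REST API. It is an assumption, not the project's actual
# implementation: the real helper also builds a bibtex entry and may score
# candidates differently than the plain title similarity used here.
import difflib

import requests


def _crossref_query_sketch(authors, title):
    """Return (best_item_json, score) for the Crossref work closest to title.

    Hypothetical helper; assumes `authors` is an iterable of name strings.
    """
    params = {
        'query.bibliographic': title,
        'query.author': ' '.join(authors),
        'rows': 5,
    }
    reply = requests.get('https://api.crossref.org/works', params=params)
    reply.raise_for_status()
    best, best_score = None, 0.0
    for item in reply.json()['message']['items']:
        candidate = (item.get('title') or [''])[0]
        score = difflib.SequenceMatcher(None, title.lower(), candidate.lower()).ratio()
        if score > best_score:
            best, best_score = item, score
    return best, best_score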
def sync_folder(folder, use_backup):
    """
    Update the file field of bibtex entries for the given folder. When no good
    match can be found for an entry, the entry is removed from the bibtex file,
    unless the user explicitly prevents it.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but updates the `.queried.bib` and `biblio.bib` files.
    """
    for bib_file in ('.queried.bib', 'biblio.bib'):
        bib_path = os.path.join(folder, bib_file)
        db = utils.read_bib_file(bib_path)
        unmatched = set(os.path.basename(f) for f in utils.get_pdf_list(folder))
        to_delete = []
        for i, entry in enumerate(db.entries):
            # Prefer the stored file field, otherwise guess a filename from the entry
            guess = nomenclature.gen_filename(entry)
            if 'file' in entry:
                guess = utils.decode_filename_field(entry['file'])
            match, score = utils.most_similar_filename(guess, unmatched)
            if score >= 0.90:
                unmatched.remove(match)
                entry['file'] = utils.encode_filename_field(match)
            else:
                print(termcolor.colored(bib_file, "magenta")
                      + ": ({1}) will remove '{0}'".format(
                          guess, termcolor.colored(score, "yellow")))
                to_delete.append(i)

        # Delete unmatched entries (after asking the user for confirmation)
        if to_delete:
            cmd = input('(Y/n) ')
            if cmd in ('', 'y', 'Y'):
                for i in sorted(to_delete, reverse=True):
                    del db.entries[i]

        # Write synced database
        utils.write_with_backup(bib_path, utils.write_bib(db, order=False), use_backup)
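# The 0.90 threshold above compares the generated or stored filename against
# the pdfs actually present on disk. utils.most_similar_filename is
# project-local; a minimal sketch of such a helper, assuming a plain difflib
# ratio as the similarity measure (the real helper may normalize names or
# score differently), could look like this:
import difflib


def _most_similar_filename_sketch(guess, candidates):
    """Return (best_candidate, similarity) for the candidate closest to guess.

    Hypothetical helper, shown only for illustration.
    """
    best, best_score = None, 0.0
    for candidate in candidates:
        score = difflib.SequenceMatcher(None, guess, candidate).ratio()
        if score > best_score:
            best, best_score = candidate, score
    return best, best_score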
def query_google_folder(folder, use_backup):
    """
    Query metadata information for unmatched pdf files in the given folder.
    This function only queries Google Scholar.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but writes the queried databases in bibtex format in the
        given folder (and backs up the previous database if it differed).
    """
    # Read the existing database and list the files that should be skipped
    db = utils.read_bib_file(os.path.join(folder, '.queried.bib'))
    files = utils.guess_manual_files(folder, db, update_queried_db=False)
    utils.add_skip_files(folder, files)

    # For each pdf in the folder
    for path in utils.get_pdf_list(folder):
        file = os.path.basename(path)
        parsed = nomenclature.parse_filename(file)
        if parsed is None or file in files:
            continue
        print('Q: ' + os.path.basename(file))
        authors, title = parsed

        # Google Scholar
        rbib = providers.scholarly_query(authors, title)
        if rbib is None:
            continue

        # Append filename and store entry
        rbib['file'] = utils.encode_filename_field(file)
        db.entries.append(rbib)

    # Store results
    bib_path = os.path.join(folder, '.queried.bib')
    utils.write_with_backup(bib_path, utils.write_bib(db, order=False), use_backup)
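# A plausible way to drive the helpers above on a single folder. This is a
# hypothetical driver given only as a usage sketch (the ordering and the real
# entry point of the project may differ): query Crossref first, fall back to
# Google Scholar for anything still unmatched, then reconcile the file fields.
def _update_folder_sketch(folder, use_backup=True):
    """Hypothetical end-to-end update of a folder's bibtex databases."""
    query_crossref_folder(folder, use_backup)  # scored Crossref lookups
    query_google_folder(folder, use_backup)    # Google Scholar fallback
    sync_folder(folder, use_backup)            # match file fields / prune stale entries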