Exemple #1
0
def merge_folder_tree(folder, use_backup):
    """
    Collect every `biblio.bib` found under `folder` into a single `master.bib`.

    The 'file' field of each collected entry is rewritten so that it is
    prefixed with the path of its directory relative to `folder`.
    Directories that contain a `.nobib` entry are not read.

    Args:
        folder (str): relative or absolute path of the folder to process.
        use_backup (bool): forwarded to `utils.write_with_backup` when the
            master file is written.

    Returns:
        Nothing, but creates a file named `master.bib` in the given folder.
    """
    root = os.path.abspath(folder)
    merged = BibDatabase()
    for current, _subdirs, _names in os.walk(root):
        # A `.nobib` marker excludes this directory only; the walk still
        # descends into its children.
        marker = os.path.join(current, '.nobib')
        if not os.path.exists(marker):
            prefix = os.path.relpath(current, root)
            local_db = utils.read_bib_file(os.path.join(current, 'biblio.bib'))
            for item in local_db.entries:
                linked = utils.decode_filename_field(item['file'])
                item['file'] = utils.encode_filename_field(
                    os.path.join(prefix, linked))
            merged.entries += local_db.entries

    # Round-trip through `entries_dict` to drop duplicated entries
    # (presumably keyed by entry ID -- confirm against BibDatabase).
    merged.entries = list(merged.entries_dict.values())

    # Write result
    master_path = os.path.join(folder, 'master.bib')
    utils.write_with_backup(master_path,
                            utils.write_bib(merged, order=True),
                            use_backup)
Exemple #2
0
def merge_folder_tree(folder, use_backup):
    """
    Build `master.bib` at the root of `folder` from the `biblio.bib` files of
    the whole subtree.

    While merging, the 'file' link of every entry is re-expressed relative to
    `folder` by prepending the relative path of the directory it came from.

    Args:
        folder (str): relative or absolute path of the folder to process.
        use_backup (bool): passed through to `utils.write_with_backup`.

    Returns:
        Nothing, but creates a file named `master.bib` in the given folder.
    """
    base_dir = os.path.abspath(folder)
    master = BibDatabase()

    for walked_dir, _, _ in os.walk(base_dir):
        blacklisted = os.path.exists(os.path.join(walked_dir, '.nobib'))
        if blacklisted:
            # Only this directory is skipped; os.walk keeps descending.
            continue

        rel_prefix = os.path.relpath(walked_dir, base_dir)
        partial = utils.read_bib_file(os.path.join(walked_dir, 'biblio.bib'))
        for record in partial.entries:
            decoded = utils.decode_filename_field(record['file'])
            relocated = os.path.join(rel_prefix, decoded)
            record['file'] = utils.encode_filename_field(relocated)
        master.entries += partial.entries

    # Rebuild the entry list from the database's dict view, which removes
    # duplicated entries.
    unique = master.entries_dict
    master.entries = list(unique.values())

    # Serialize and write the merged database.
    output = utils.write_bib(master, order=True)
    utils.write_with_backup(os.path.join(folder, 'master.bib'), output,
                            use_backup)
Exemple #3
0
def rename_folder(folder, use_backup):
    """
    Rename the pdf files in the given folder according to the information found
    in `biblio.bib`. Note that this function will update file entries in
    `biblio.bib`, but also in `.queried.bib`.

    The planned renames are printed first and the user is asked to confirm.
    A rename is skipped (with a warning) when the target name is already
    taken by a different file, so that an existing pdf is never overwritten.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but renames the pdfs in the given folder, and update bib files.
    """

    # Read input bib files
    pretty_bib_path = os.path.join(folder, 'biblio.bib')
    pretty_db = utils.read_bib_file(pretty_bib_path)
    queried_bib_path = os.path.join(folder, '.queried.bib')
    queried_db = utils.read_bib_file(queried_bib_path)
    queried_files = utils.create_file_dict(queried_db)

    # Preview: list missing files and the renames that would be performed
    need_rename = False
    for entry in pretty_db.entries:
        old_filename = utils.decode_filename_field(entry['file'])
        new_filename = nomenclature.gen_filename(entry)
        if not os.path.exists(os.path.join(folder, old_filename)):
            print(termcolor.colored('file not found: ', 'red') + old_filename)
        elif old_filename != new_filename:
            need_rename = True
            print(termcolor.colored('-', 'red') + old_filename)
            print(termcolor.colored('+', 'green') + new_filename)

    # Skip if nothing to rename
    if not need_rename:
        return

    # Ask confirmation (an empty answer counts as yes)
    cmd = input('(Y/n) ')
    if cmd in ('', 'y', 'Y'):
        for entry in pretty_db.entries:
            old_filename = utils.decode_filename_field(entry['file'])
            new_filename = nomenclature.gen_filename(entry)
            old_path = os.path.join(folder, old_filename)
            new_path = os.path.join(folder, new_filename)
            if not os.path.exists(old_path):
                continue
            # os.rename silently replaces an existing destination on POSIX.
            # Refuse to clobber another file; samefile() keeps the no-op and
            # case-only renames working.
            if (os.path.exists(new_path)
                    and not os.path.samefile(old_path, new_path)):
                print(termcolor.colored('target exists, skipped: ', 'red') +
                      new_filename)
                continue
            os.rename(old_path, new_path)
            new_val = utils.encode_filename_field(new_filename)
            # Keep the matching `.queried.bib` entry in sync, if there is one
            if old_filename in queried_files:
                idx = queried_files[old_filename]
                queried_db.entries[idx]['file'] = new_val
            entry['file'] = new_val

    # Write output bibtex files
    utils.write_with_backup(pretty_bib_path,
                            utils.write_bib(pretty_db, order=False),
                            use_backup)
    utils.write_with_backup(queried_bib_path,
                            utils.write_bib(queried_db, order=False),
                            use_backup)
Exemple #4
0
def rename_folder(folder, use_backup):
    """
    Rename the pdf files in the given folder according to the information found
    in `biblio.bib`. Note that this function will update file entries in
    `biblio.bib`, but also in `.queried.bib`.

    The planned renames are printed first and the user is asked to confirm.
    A rename is skipped (with a warning) when the target name is already
    taken by a different file, so that an existing pdf is never overwritten.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but renames the pdfs in the given folder, and update bib files.
    """

    # Read input bib files
    pretty_bib_path = os.path.join(folder, 'biblio.bib')
    pretty_db = utils.read_bib_file(pretty_bib_path)
    queried_bib_path = os.path.join(folder, '.queried.bib')
    queried_db = utils.read_bib_file(queried_bib_path)
    queried_files = utils.create_file_dict(queried_db)

    # Preview: list missing files and the renames that would be performed
    need_rename = False
    for entry in pretty_db.entries:
        old_filename = utils.decode_filename_field(entry['file'])
        new_filename = nomenclature.gen_filename(entry)
        if not os.path.exists(os.path.join(folder, old_filename)):
            print(termcolor.colored('file not found: ', 'red') + old_filename)
        elif old_filename != new_filename:
            need_rename = True
            print(termcolor.colored('-', 'red') + old_filename)
            print(termcolor.colored('+', 'green') + new_filename)

    # Skip if nothing to rename
    if not need_rename:
        return

    # Ask confirmation (an empty answer counts as yes)
    cmd = input('(Y/n) ')
    if cmd in ('', 'y', 'Y'):
        for entry in pretty_db.entries:
            old_filename = utils.decode_filename_field(entry['file'])
            new_filename = nomenclature.gen_filename(entry)
            old_path = os.path.join(folder, old_filename)
            new_path = os.path.join(folder, new_filename)
            if not os.path.exists(old_path):
                continue
            # os.rename silently replaces an existing destination on POSIX.
            # Refuse to clobber another file; samefile() keeps the no-op and
            # case-only renames working.
            target_taken = os.path.exists(new_path) and not os.path.samefile(old_path, new_path)
            if target_taken:
                print(termcolor.colored('target exists, skipped: ', 'red') + new_filename)
                continue
            os.rename(old_path, new_path)
            new_val = utils.encode_filename_field(new_filename)
            # Keep the matching `.queried.bib` entry in sync, if there is one
            if old_filename in queried_files:
                idx = queried_files[old_filename]
                queried_db.entries[idx]['file'] = new_val
            entry['file'] = new_val

    # Write output bibtex files
    utils.write_with_backup(pretty_bib_path, utils.write_bib(pretty_db, order=False), use_backup)
    utils.write_with_backup(queried_bib_path, utils.write_bib(queried_db, order=False), use_backup)
Exemple #5
0
def query_crossref_folder(folder, use_backup):
    """
    Query metadata information for unmatched pdf files in the given folder.
    This function only queries Crossref.

    A result is kept only when its score reaches
    `config.crossref_accept_threshold`; the names of the other pdfs are
    written to `.rejected.txt`.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but writes the queried databases in bibtex format in the given
        folder (and backup previous database if it differed).
    """

    # Create database
    db = utils.read_bib_file(os.path.join(folder, '.queried.bib'))
    files = utils.guess_manual_files(folder, db, update_queried_db=False)
    # NOTE(review): presumably extends `files` with user-skipped pdfs --
    # confirm against utils.add_skip_files.
    utils.add_skip_files(folder, files)
    json_entries = []
    rejected = []

    # For each pdf in the folder
    for path in utils.get_pdf_list(folder):
        file = os.path.basename(path)
        parsed = nomenclature.parse_filename(file)
        # Ignore names that cannot be parsed and files already accounted for
        if parsed is None or file in files:
            continue
        print('Q: ' + file)
        authors, title = parsed

        # Crossref
        rbib, rjson, score = providers.crossref_query(authors, title)
        if score >= config.crossref_accept_threshold:
            # Append filename and store entry
            rbib['file'] = utils.encode_filename_field(file)
            json_entries.append(rjson)
            db.entries.append(rbib)
        else:
            rejected.append(file)

    # Store results
    bib_path = os.path.join(folder, '.queried.bib')
    utils.write_with_backup(bib_path, utils.write_bib(db, order=False),
                            use_backup)
    json_path = os.path.join(folder, '.queried.json')
    json_str = json.dumps(json_entries,
                          sort_keys=True,
                          indent=4,
                          separators=(',', ': '))
    utils.write_with_backup(json_path, json_str, use_backup)
    rejected_path = os.path.join(folder, '.rejected.txt')
    # The rejected list is only written when at least one pdf was rejected
    if rejected:
        utils.write_with_backup(rejected_path, '\n'.join(rejected), use_backup)
Exemple #6
0
def query_crossref_folder(folder, use_backup):
    """
    Look up Crossref metadata for the pdf files of `folder` that are not
    matched yet. No other provider is queried.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but writes `.queried.bib`, `.queried.json` and, when at least
        one pdf was rejected, `.rejected.txt` in the given folder.
    """
    queried_bib = os.path.join(folder, '.queried.bib')

    # Load the existing database and the set of files to leave alone
    db = utils.read_bib_file(queried_bib)
    known = utils.guess_manual_files(folder, db, update_queried_db=False)
    utils.add_skip_files(folder, known)

    accepted_json = []
    refused = []
    for pdf_path in utils.get_pdf_list(folder):
        pdf_name = os.path.basename(pdf_path)
        name_parts = nomenclature.parse_filename(pdf_name)
        if name_parts is None or pdf_name in known:
            continue
        print('Q: ' + pdf_name)
        authors, title = name_parts

        # Ask Crossref and keep the answer only if it scores high enough
        bib_entry, json_entry, score = providers.crossref_query(authors, title)
        if score < config.crossref_accept_threshold:
            refused.append(pdf_name)
            continue
        bib_entry['file'] = utils.encode_filename_field(pdf_name)
        accepted_json.append(json_entry)
        db.entries.append(bib_entry)

    # Persist the bibtex database
    utils.write_with_backup(queried_bib, utils.write_bib(db, order=False), use_backup)

    # Persist the raw JSON answers
    dumped = json.dumps(accepted_json, sort_keys=True, indent=4, separators=(',', ': '))
    utils.write_with_backup(os.path.join(folder, '.queried.json'), dumped, use_backup)

    # Persist the list of rejected pdfs, if any
    if refused:
        utils.write_with_backup(os.path.join(folder, '.rejected.txt'), '\n'.join(refused), use_backup)
Exemple #7
0
def sync_folder(folder, use_backup):
    """
    Re-link the bibtex entries of `folder` to the pdf files actually present.

    For `.queried.bib` and then `biblio.bib`, each entry is matched against
    the pdf names not claimed yet. Entries whose best match scores below 0.90
    are listed and deleted once the user confirms.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but updates `.queried.bib` and `biblio.bib` files.
    """
    for bib_name in ('.queried.bib', 'biblio.bib'):
        path = os.path.join(folder, bib_name)
        database = utils.read_bib_file(path)
        # Pdf names still available for matching in this database
        available = {
            os.path.basename(pdf) for pdf in utils.get_pdf_list(folder)
        }
        stale = []
        for index, item in enumerate(database.entries):
            # Prefer the recorded file name; fall back on the generated one
            candidate = nomenclature.gen_filename(item)
            if 'file' in item:
                candidate = utils.decode_filename_field(item['file'])
            best, score = utils.most_similar_filename(candidate, available)
            if score < 0.90:
                label = termcolor.colored(bib_name, "magenta")
                print(label + ": ({1}) will remove '{0}'".format(
                    candidate, termcolor.colored(score, "yellow")))
                stale.append(index)
                continue
            available.remove(best)
            item['file'] = utils.encode_filename_field(best)

        # Delete unmatched entries, highest index first so the remaining
        # indices stay valid
        if stale:
            answer = input('(Y/n) ')
            if answer in ('', 'y', 'Y'):
                for index in reversed(stale):
                    del database.entries[index]

        # Write synced database
        utils.write_with_backup(path,
                                utils.write_bib(database, order=False),
                                use_backup)
Exemple #8
0
def sync_folder(folder, use_backup):
    """
    Refresh the 'file' field of the bibtex entries stored in `folder`.

    Both `.queried.bib` and `biblio.bib` are processed in turn. An entry that
    has no sufficiently similar pdf (score below 0.90) is reported and removed
    from the database unless the user answers no at the prompt.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but updates `.queried.bib` and `biblio.bib` files.
    """
    for bib_file in ('.queried.bib', 'biblio.bib'):
        bib_path = os.path.join(folder, bib_file)
        db = utils.read_bib_file(bib_path)
        remaining = {os.path.basename(p) for p in utils.get_pdf_list(folder)}
        doomed = []
        for pos, entry in enumerate(db.entries):
            # The generated name is always computed; a recorded 'file' field overrides it
            generated = nomenclature.gen_filename(entry)
            guess = utils.decode_filename_field(entry['file']) if 'file' in entry else generated
            match, score = utils.most_similar_filename(guess, remaining)
            if score >= 0.90:
                # Claim the pdf so that no other entry can match it
                remaining.remove(match)
                entry['file'] = utils.encode_filename_field(match)
            else:
                prefix = termcolor.colored(bib_file, "magenta")
                suffix = ": ({1}) will remove '{0}'".format(guess, termcolor.colored(score, "yellow"))
                print(prefix + suffix)
                doomed.append(pos)

        # Drop unmatched entries after confirmation (empty answer means yes)
        if doomed:
            reply = input('(Y/n) ')
            if reply == '' or reply == 'y' or reply == 'Y':
                for pos in sorted(doomed, reverse=True):
                    del db.entries[pos]

        # Write synced database
        serialized = utils.write_bib(db, order=False)
        utils.write_with_backup(bib_path, serialized, use_backup)
Exemple #9
0
def query_google_folder(folder, use_backup):
    """
    Look up Google Scholar metadata for the pdf files of `folder` that are not
    matched yet. No other provider is queried.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but writes the queried databases in bibtex format in the given
        folder (and backup previous database if it differed).
    """
    queried_bib = os.path.join(folder, '.queried.bib')

    # Load the existing database and the set of files to leave alone
    db = utils.read_bib_file(queried_bib)
    known = utils.guess_manual_files(folder, db, update_queried_db=False)
    utils.add_skip_files(folder, known)

    for pdf_path in utils.get_pdf_list(folder):
        pdf_name = os.path.basename(pdf_path)
        name_parts = nomenclature.parse_filename(pdf_name)
        if name_parts is None or pdf_name in known:
            continue
        print('Q: ' + pdf_name)
        authors, title = name_parts

        # Google Scholar: keep the answer only when one was returned
        found = providers.scholarly_query(authors, title)
        if found is not None:
            found['file'] = utils.encode_filename_field(pdf_name)
            db.entries.append(found)

    # Store results
    utils.write_with_backup(queried_bib,
                            utils.write_bib(db, order=False),
                            use_backup)
Exemple #10
0
def query_google_folder(folder, use_backup):
    """
    Fetch bibtex metadata from Google Scholar for every pdf of `folder` that
    has no entry yet, and append the results to `.queried.bib`.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but writes the queried databases in bibtex format in the given
        folder (and backup previous database if it differed).
    """

    # Create database
    bib_path = os.path.join(folder, '.queried.bib')
    database = utils.read_bib_file(bib_path)
    handled = utils.guess_manual_files(folder, database, update_queried_db=False)
    utils.add_skip_files(folder, handled)

    for candidate in utils.get_pdf_list(folder):
        basename = os.path.basename(candidate)
        parsed = nomenclature.parse_filename(basename)
        # Only unparsed-name-free, not-yet-handled pdfs are queried
        needs_query = parsed is not None and basename not in handled
        if not needs_query:
            continue
        print('Q: ' + basename)
        authors, title = parsed

        # Google Scholar
        result = providers.scholarly_query(authors, title)
        if result is None:
            continue

        # Append filename and store entry
        result['file'] = utils.encode_filename_field(basename)
        database.entries.append(result)

    # Store results
    serialized = utils.write_bib(database, order=False)
    utils.write_with_backup(bib_path, serialized, use_backup)