コード例 #1
0
ファイル: autobib.py プロジェクト: jdumas/autobib
def remap_keys(old_filename, new_filename, output_folder):
    """
    Create a script to remap bibtex keys from one .bib file to another.

    The filename generated from each bibtex entry is used as a fingerprint.
    Entries whose generated filename is identical in both files are paired
    directly; every remaining new entry is then paired with the most similar
    filename among the remaining old entries.

    Args:
        old_filename (str): path to the .bib file holding the old keys.
        new_filename (str): path to the .bib file holding the new keys.
        output_folder (str): forwarded as-is to `utils.write_remap_script`.

    Returns:
        Nothing; the mapping {old key: new key} is handed to
        `utils.write_remap_script`.
    """
    old_db = utils.read_bib_file(old_filename, homogenize=False)
    new_db = utils.read_bib_file(new_filename, homogenize=False)

    # Generated filename -> bibtex key, for entries that are not paired yet
    old_list = {}
    new_list = {}
    # Old bibtex key -> new bibtex key
    subst = {}

    for entry in new_db.entries:
        name = nomenclature.gen_filename(entry)
        new_list[name] = entry['ID']

    # First pass: pair entries whose generated filenames are identical
    for entry in old_db.entries:
        name = nomenclature.gen_filename(entry)
        if name in new_list:
            subst[entry['ID']] = new_list.pop(name)
        else:
            old_list[name] = entry['ID']

    # Second pass: pair each leftover new entry with its closest old filename.
    # NOTE: a matched old entry stays in `old_list`, so two new entries can
    # resolve to the same old key; the one processed last wins.
    for name, bibkey in new_list.items():
        match, score = utils.most_similar_filename(name, old_list.keys())
        if match not in old_list:
            # No usable candidate (e.g. every old entry was already paired
            # exactly): report it instead of failing on old_list[match].
            print(termcolor.colored("Warning: no old entry left to match with:", 'yellow', attrs=["bold"]))
            print(termcolor.colored('+', 'green') + name)
            continue
        if score < 0.90:
            print(termcolor.colored("Warning: potentially incorrect substitution:", 'yellow', attrs=["bold"]))
            print(termcolor.colored('-', 'red') + match)
            print(termcolor.colored('+', 'green') + name)
        subst[old_list[match]] = bibkey
    utils.write_remap_script(subst, output_folder)
コード例 #2
0
ファイル: autobib.py プロジェクト: jdumas/autobib
def rename_folder(folder, use_backup):
    """
    Rename the pdf files in the given folder according to the information found
    in `biblio.bib`. Note that this function will update file entries in
    `biblio.bib`, but also in `.queried.bib`.

    Entries without a 'file' field are reported and left untouched. A rename
    whose target already exists as a different file is reported and skipped
    rather than overwriting that file.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but renames the pdfs in the given folder, and update bib files.
    """

    # Read input bib files
    pretty_bib_path = os.path.join(folder, 'biblio.bib')
    pretty_db = utils.read_bib_file(pretty_bib_path)
    queried_bib_path = os.path.join(folder, '.queried.bib')
    queried_db = utils.read_bib_file(queried_bib_path)
    queried_files = utils.create_file_dict(queried_db)

    # Build the rename plan once, so that the preview shown to the user and
    # the actual renaming work on the same (old, new) pairs.
    plan = []
    need_rename = False
    for entry in pretty_db.entries:
        if 'file' not in entry:
            print(termcolor.colored('no file field: ', 'red') + str(entry.get('ID', '?')))
            continue
        old_filename = utils.decode_filename_field(entry['file'])
        new_filename = nomenclature.gen_filename(entry)
        plan.append((entry, old_filename, new_filename))
        if not os.path.exists(os.path.join(folder, old_filename)):
            print(termcolor.colored('file not found: ', 'red') + old_filename)
        elif old_filename != new_filename:
            need_rename = True
            print(termcolor.colored('-', 'red') + old_filename)
            print(termcolor.colored('+', 'green') + new_filename)

    # Skip if nothing to rename
    if not need_rename:
        return

    # Ask confirmation (an empty answer counts as yes)
    cmd = input('(Y/n) ')
    if cmd in ('', 'y', 'Y'):
        for entry, old_filename, new_filename in plan:
            old_path = os.path.join(folder, old_filename)
            new_path = os.path.join(folder, new_filename)
            # Existence is re-checked here because earlier renames in this
            # loop may have changed what is on disk.
            if not os.path.exists(old_path):
                continue
            # Never clobber a different, already existing file
            if os.path.exists(new_path) and not os.path.samefile(old_path, new_path):
                print(termcolor.colored('target already exists, skipped: ', 'red') + new_filename)
                continue
            os.rename(old_path, new_path)
            new_val = utils.encode_filename_field(new_filename)
            # Keep the matching `.queried.bib` entry in sync
            if old_filename in queried_files:
                idx = queried_files[old_filename]
                queried_db.entries[idx]['file'] = new_val
            entry['file'] = new_val

    # Write output bibtex files
    utils.write_with_backup(pretty_bib_path, utils.write_bib(pretty_db, order=False), use_backup)
    utils.write_with_backup(queried_bib_path, utils.write_bib(queried_db, order=False), use_backup)
コード例 #3
0
ファイル: providers.py プロジェクト: jdumas/autobib
def scholarly_query(authors, title):
    """
    Look up an article on Google Scholar and return its bibtex record.

    Args:
        authors (list): last names (strings) of the first authors.
        title (str): the title of the article.

    Returns:
        The `bib` record (dict) of the first search result, with its
        'abstract' removed and trailing dots stripped from its title, or None
        when the search yields no result.
    """
    author_part = ' '.join(authors)
    results = scholarly.search_pubs_query(author_part + ' ' + title)

    # Only the first hit is considered
    nothing = object()
    hit = next(results, nothing)
    if hit is nothing:
        return None

    hit.fill()
    hit.bib.pop('abstract', None)

    # Post-process title: drop trailing dots
    cleaned_title = re.sub('\\.*$', '', hit.bib['title'])
    hit.bib['title'] = cleaned_title

    print('S: ' + nomenclature.gen_filename(hit.bib))
    return hit.bib
コード例 #4
0
ファイル: autobib.py プロジェクト: jdumas/autobib
def extract_from_file(filename, output_folder):
    """
    Write the sorted list of filenames generated from the entries of a .bib
    file into a '.biblist' file inside the given folder.

    If the '.biblist' file already exists, the user is asked before it is
    overwritten; any answer other than 'y' leaves it untouched.
    """
    db = utils.read_bib_file(filename, homogenize=False)
    outfile = os.path.join(output_folder, ".biblist")

    # Ask before clobbering an existing list
    if os.path.exists(outfile) and input("overwrite existing file '{0}' (y/N) ".format(outfile)) != 'y':
        return

    names = [nomenclature.gen_filename(entry) for entry in db.entries]
    names.sort()

    # One generated filename per line
    with open(outfile, 'w') as f:
        f.writelines(name + '\n' for name in names)
コード例 #5
0
ファイル: autobib.py プロジェクト: zhaochaocs/autobib
def sync_folder(folder, use_backup):
    """
    Re-link the bibtex entries of a folder to the pdf files found in it.

    For both `.queried.bib` and `biblio.bib`, every entry is matched against
    the pdf files that have not been claimed yet. An entry whose best match
    scores below 0.90 is listed for removal, and removed once the user
    confirms.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but updates `.queried.bib` and `biblio.bib` files.
    """
    for bib_file in ('.queried.bib', 'biblio.bib'):
        bib_path = os.path.join(folder, bib_file)
        db = utils.read_bib_file(bib_path)

        # Pdf basenames not yet claimed by an entry of this bib file
        unmatched = {os.path.basename(pdf) for pdf in utils.get_pdf_list(folder)}

        stale = []
        for idx, entry in enumerate(db.entries):
            # Prefer the recorded file name over the generated one
            guess = nomenclature.gen_filename(entry)
            if 'file' in entry:
                guess = utils.decode_filename_field(entry['file'])

            match, score = utils.most_similar_filename(guess, unmatched)
            if not score >= 0.90:
                label = termcolor.colored(bib_file, "magenta")
                shown_score = termcolor.colored(score, "yellow")
                print(label + ": ({1}) will remove '{0}'".format(guess, shown_score))
                stale.append(idx)
                continue

            unmatched.remove(match)
            entry['file'] = utils.encode_filename_field(match)

        # Delete unmatched entries, last index first so that the remaining
        # indices stay valid
        if stale and input('(Y/n) ') in ('', 'y', 'Y'):
            for idx in reversed(stale):
                del db.entries[idx]

        # Write synced database
        synced = utils.write_bib(db, order=False)
        utils.write_with_backup(bib_path, synced, use_backup)
コード例 #6
0
ファイル: autobib.py プロジェクト: jdumas/autobib
def sync_folder(folder, use_backup):
    """
    Update the 'file' field of the bibtex entries stored in a folder.

    Both `.queried.bib` and `biblio.bib` are processed in turn. Each entry is
    paired with the most similar pdf that is still unclaimed; entries scoring
    below 0.90 are announced and, unless the user declines, deleted.

    Args:
        folder (str): absolute or relative path to the folder to process.
        use_backup (bool): whether to backup previous files before writing.

    Returns:
        Nothing, but updates `.queried.bib` and `biblio.bib` files.
    """
    accepted_answers = {'', 'y', 'Y'}

    for bib_file in ('.queried.bib', 'biblio.bib'):
        bib_path = os.path.join(folder, bib_file)
        db = utils.read_bib_file(bib_path)
        available = set()
        for pdf_path in utils.get_pdf_list(folder):
            available.add(os.path.basename(pdf_path))

        # Indices of entries without a good match, in increasing order
        doomed = []
        for position, record in enumerate(db.entries):
            generated = nomenclature.gen_filename(record)
            guess = utils.decode_filename_field(record['file']) if 'file' in record else generated
            match, score = utils.most_similar_filename(guess, available)
            if score >= 0.90:
                # Claim the pdf so no other entry can be paired with it
                available.remove(match)
                record['file'] = utils.encode_filename_field(match)
            else:
                print(termcolor.colored(bib_file, "magenta") +
                      ": ({1}) will remove '{0}'".format(guess, termcolor.colored(score, "yellow")))
                doomed.append(position)

        # Delete unmatched entries, highest index first
        if doomed:
            cmd = input('(Y/n) ')
            if cmd in accepted_answers:
                while doomed:
                    del db.entries[doomed.pop()]

        # Write synced database
        utils.write_with_backup(bib_path, utils.write_bib(db, order=False), use_backup)
コード例 #7
0
ファイル: utils.py プロジェクト: jdumas/autobib
def guess_manual_files(folder, queried_db, update_queried_db=True):
    """
    Tries to guess which files in a folder correspond to entries placed in the
    `.manual.bib` file. This is useful e.g. to avoid performing online queries
    for files which we know have a manual entry.

    If a '.manual.bib' is present, override corresponding queried entries.
    The way it works is as follows:
      1. Guess the filename of each entry in `.manual.bib`
      2. Find entry in `.queried.bib` with the closest file name in its 'file' field
      3. Override with manual entry

    Args:
        folder (str): folder containing the pdf files and `.manual.bib`.
        queried_db: database whose `entries` list is overridden/extended.
        update_queried_db (bool): when True, manual entries replace or are
            appended to `queried_db.entries`; when False, only the returned
            dict is updated (matched files are recorded with index -1).

    Returns:
        The dict produced by `create_file_dict(queried_db)`, possibly extended
        as described above.
    """
    files = create_file_dict(queried_db)
    manual_bib_path = os.path.join(folder, '.manual.bib')
    if not os.path.exists(manual_bib_path):
        return files

    manual_database = read_bib_file(manual_bib_path, homogenize=True)
    for entry in manual_database.entries:
        guess = nomenclature.gen_filename(entry)
        best_score = 0.0
        best_idx = -1
        # Compare against the other file entries. The keys of `files` are
        # plain file names, so they are compared with the plain guess rather
        # than with its encoded 'file' field form, which would lower every
        # score.
        for key, idx in sorted(files.items()):
            sc = simratio(key, guess)
            if sc > best_score:
                best_score = sc
                best_idx = idx
        # Update 'file' field with the closest file found in the folder
        match, _ = most_similar_filename(guess, folder)
        entry['file'] = encode_filename_field(match)
        if update_queried_db:
            # If best match is good enough, override old entry
            if best_idx >= 0 and best_score > 0.95:
                queried_db.entries[best_idx] = entry
            else:
                queried_db.entries.append(entry)
        else:
            files[match] = -1
    return files
コード例 #8
0
ファイル: utils.py プロジェクト: zhaochaocs/autobib
def guess_manual_files(folder, queried_db, update_queried_db=True):
    """
    Tries to guess which files in a folder correspond to entries placed in the
    `.manual.bib` file. This is useful e.g. to avoid performing online queries
    for files which we know have a manual entry.

    If a '.manual.bib' is present, override corresponding queried entries.
    The way it works is as follows:
      1. Guess the filename of each entry in `.manual.bib`
      2. Find entry in `.queried.bib` with the closest file name in its 'file' field
      3. Override with manual entry

    Args:
        folder (str): folder containing the pdf files and `.manual.bib`.
        queried_db: database whose `entries` list is overridden/extended.
        update_queried_db (bool): if True, each manual entry either replaces
            its best matching queried entry or is appended; if False, the
            matched file is only recorded in the returned dict with index -1.

    Returns:
        The file dict built by `create_file_dict(queried_db)`, possibly
        extended as described above.
    """
    files = create_file_dict(queried_db)
    manual_bib_path = os.path.join(folder, '.manual.bib')
    if os.path.exists(manual_bib_path):
        manual_database = read_bib_file(manual_bib_path, homogenize=True)
        for entry in manual_database.entries:
            guess = nomenclature.gen_filename(entry)

            # Find the known file name closest to the guess. Both sides of the
            # comparison are plain file names; comparing against the encoded
            # 'file' field form instead would lower every score.
            best_score = 0.0
            best_idx = -1
            for key, idx in sorted(files.items()):
                sc = simratio(key, guess)
                if sc > best_score:
                    best_score, best_idx = sc, idx

            # Update 'file' field
            match, _ = most_similar_filename(guess, folder)
            entry['file'] = encode_filename_field(match)

            if not update_queried_db:
                files[match] = -1
                continue

            # If best match is good enough, override old entry
            good_match = best_idx >= 0 and best_score > 0.95
            if good_match:
                queried_db.entries[best_idx] = entry
            else:
                queried_db.entries.append(entry)
    return files
コード例 #9
0
ファイル: providers.py プロジェクト: zhaochaocs/autobib
def crossref_query(authors, title):
    """
    Query Crossref database.

    Args:
        authors (list): a list of strings for up to the first authors last names.
        title (str): the title of the article.

    Returns:
        A tuple (bibtex, json, score) where the first element is the data in
        bibtex format (returned as a record/dict), the second element is the
        data returned in json format, and the third element is the score of the
        match given by Crossref. When no item is found the tuple is
        (None, [], 0); when the best item has no author or no title it is
        (None, json, 0).
    """
    cr = Crossref()
    # Every author name and the title are sent as +"..." terms
    terms = ['+"' + name + '"' for name in authors]
    query = ' '.join(terms) + ' +"' + title + '"'
    x = cr.works(query=query)
    assert x['status'] == "ok"

    # No result found
    items = x['message']['items']
    if not items:
        print_score(0)
        return (None, [], 0)

    # Stop at the first item scoring below the current best; until then let
    # pick_best decide between the current best and the item.
    best_item = items[0]
    for item in items:
        if item['score'] < best_item['score']:
            break
        best_item = pick_best(title, best_item, item)

    # Retrieve DOI and json item
    doi = best_item['DOI']
    res_json = best_item

    # If the entry is invalid (no author, missing or empty title), return a
    # score of 0
    if 'author' not in res_json or not res_json.get('title'):
        print_score(0)
        return (None, res_json, 0)

    # Retrieve metadata as bibtex entry, then patch known character glitches
    res_bib = cn.content_negotiation(ids=doi, format="bibentry")
    res_bib = re.sub('ä', 'ä', res_bib)
    res_bib = re.sub('Ö', 'Ö', res_bib)
    res_bib = re.sub('รถ', 'ö', res_bib)
    res_bib = re.sub('Ăź', 'ü', res_bib)
    res_bib = re.sub('̈o', 'ö', res_bib)
    res_bib = re.sub('ďż˝', 'ø', res_bib)
    res_bib = re.sub('ĂŤ', 'ë', res_bib)
    db = bibtexparser.loads(res_bib)
    assert len(db.entries) == 1
    res_bib = db.entries[0]

    # If article has subtitle(s), fix bibtex entry.
    # Subtitles that are all uppercase are discarded.
    subtitles = [s for s in res_json.get('subtitle', []) if not s.isupper()]
    new_title = ' '.join(res_json['title'])
    if subtitles:
        subtitle = ' '.join(subtitles)
        too_similar = (new_title.lower().startswith(subtitle.lower())
                       or utils.simratio(new_title, subtitle) > 0.95)
        # Don't repeat title if the subtitle is too similar to the title
        if not too_similar:
            new_title = new_title + ": " + subtitle
    res_bib['title'] = new_title

    # Post-process title: strip a trailing '*', a leading "<digits>. " prefix
    # and trailing dots
    res_bib['title'] = re.sub('\\*$', '', res_bib['title'])
    res_bib['title'] = re.sub('^[0-9]*\\. ', '', res_bib['title'])
    res_bib['title'] = re.sub('\\.*$', '', res_bib['title'])

    # If bibtex entry has a 'journal' field, then use the longest alias from
    # the json (empty string when the json lists none)
    if 'journal' in res_bib:
        containers = res_json.get('container-title') or []
        res_bib['journal'] = max(containers, key=len, default="")

    # If entry is missing the year, set score to 0
    score = res_json['score']
    if 'year' not in res_bib:
        score = 0

    # Fix incorrect year in crossref entry
    if 'published-print' in res_json:
        item = res_json['published-print']
        if 'date-parts' in item and len(item['date-parts']) == 1:
            date = item['date-parts'][0]
            # Ignore empty or null date parts
            if date and date[0] is not None:
                year = date[0]
                month = date[1] if len(date) > 1 else None
                # .get(): the bibtex entry may have no year at all (see above)
                if str(year) != res_bib.get('year'):
                    res_bib['year'] = str(year)
                    if month is None and 'month' in res_bib:
                        del res_bib['month']
                    elif month is not None:
                        assert 1 <= month <= 12
                        res_bib['month'] = utils.MONTHS[month - 1]

    # Fix potential ambiguous author entries
    msg = utils.fix_author_field(res_bib, res_json)

    print('C: ' + nomenclature.gen_filename(res_bib))
    print_score(score)

    # If score is above threshold, display msg from fix_author_field
    if score >= config.crossref_accept_threshold and msg:
        print(msg)

    # Return database entry
    return (res_bib, res_json, score)
コード例 #10
0
ファイル: providers.py プロジェクト: jdumas/autobib
def crossref_query(authors, title):
    """
    Query Crossref database.

    Args:
        authors (list): a list of strings for up to the first authors last names.
        title (str): the title of the article.

    Returns:
        A tuple (bibtex, json, score) where the first element is the data in
        bibtex format (returned as a record/dict), the second element is the
        data returned in json format, and the third element is the score of the
        match given by Crossref. When no item is found the tuple is
        (None, [], 0); when the best item has no author or no title it is
        (None, json, 0).
    """
    cr = Crossref()
    # Use separate title/author query fields when author names are available,
    # and a plain query on the title otherwise.
    if ''.join(authors):
        args = dict(
            query_title=urllib.parse.quote_plus(title),
            query_author=urllib.parse.quote_plus(' '.join(authors)),
        )
    else:
        args = dict(
            query=urllib.parse.quote_plus(title),
        )
    x = cr.works(sort='score', limit=1, **args)
    assert x['status'] == "ok"

    # No result found
    items = x['message']['items']
    if not items:
        print_score(0)
        return (None, [], 0)

    # Stop at the first item scoring below the current best; until then let
    # pick_best decide between the current best and the item.
    best_item = items[0]
    for item in items:
        if item['score'] < best_item['score']:
            break
        best_item = pick_best(title, best_item, item)

    # Retrieve DOI and json item
    doi = best_item['DOI']
    res_json = best_item

    # If the entry is invalid (no author, missing or empty title), return a
    # score of 0
    if 'author' not in res_json or not res_json.get('title'):
        print_score(0)
        return (None, res_json, 0)

    # Retrieve metadata as bibtex entry, then patch known character glitches
    res_bib = cn.content_negotiation(ids=doi, format="bibentry")
    res_bib = re.sub('ä', 'ä', res_bib)
    res_bib = re.sub('Ö', 'Ö', res_bib)
    res_bib = re.sub('รถ', 'ö', res_bib)
    res_bib = re.sub('Ăź', 'ü', res_bib)
    res_bib = re.sub('̈o', 'ö', res_bib)
    res_bib = re.sub('ďż˝', 'ø', res_bib)
    res_bib = re.sub('ĂŤ', 'ë', res_bib)
    db = bibtexparser.loads(res_bib)
    assert len(db.entries) == 1
    res_bib = db.entries[0]

    # If article has subtitle(s), fix bibtex entry.
    # Subtitles that are all uppercase are discarded.
    subtitles = [s for s in res_json.get('subtitle', []) if not s.isupper()]

    main_title = ' '.join(res_json['title'])
    if subtitles:
        subtitle = ' '.join(subtitles)
        if main_title.lower().startswith(subtitle.lower()) or utils.simratio(main_title, subtitle) > 0.95:
            # Don't repeat title if the subtitle is too similar to the title
            new_title = main_title
        else:
            new_title = main_title + ": " + subtitle
    else:
        new_title = main_title
    res_bib['title'] = new_title

    # Post-process title: strip a trailing '*', a leading "<digits>. " prefix
    # and trailing dots
    res_bib['title'] = re.sub('\\*$', '', res_bib['title'])
    res_bib['title'] = re.sub('^[0-9]*\\. ', '', res_bib['title'])
    res_bib['title'] = re.sub('\\.*$', '', res_bib['title'])

    # If bibtex entry has a 'journal' field, then use the longest alias from
    # the json (empty string when the json lists none)
    if 'journal' in res_bib:
        best = ""
        for container in res_json.get('container-title') or []:
            if len(container) > len(best):
                best = container
        res_bib['journal'] = best

    # If entry is missing the year, set score to 0
    score = res_json['score']
    if 'year' not in res_bib:
        score = 0

    # Fix incorrect year in crossref entry
    if 'published-print' in res_json:
        item = res_json['published-print']
        if 'date-parts' in item and len(item['date-parts']) == 1:
            date = item['date-parts'][0]
            # Ignore empty or null date parts
            if date and date[0] is not None:
                year = date[0]
                month = date[1] if len(date) > 1 else None
                # .get(): the bibtex entry may have no year at all (see above)
                if str(year) != res_bib.get('year'):
                    res_bib['year'] = str(year)
                    if month is None and 'month' in res_bib:
                        del res_bib['month']
                    elif month is not None:
                        assert 1 <= month <= 12
                        month_str = utils.MONTHS[month - 1]
                        res_bib['month'] = month_str

    # Fix potential ambiguous author entries
    msg = utils.fix_author_field(res_bib, res_json)

    print('C: ' + nomenclature.gen_filename(res_bib))
    print_score(score)

    # If score is above threshold, display msg from fix_author_field
    if score >= config.crossref_accept_threshold and msg:
        print(msg)

    # Return database entry
    return (res_bib, res_json, score)
コード例 #11
0
def guess_manual_files(folder, queried_db, update_queried_db=True):
    """
    Pair the entries of `.manual.bib` with the files of a folder, and let them
    take precedence over queried entries.

    For every manual entry:
      1. a filename is generated from the entry;
      2. the key of the file dict closest to that filename is looked up;
      3. the closest file in the folder is looked up; below a similarity of
         0.9 the user decides whether to use it (the entry is skipped on 'n');
      4. the entry's 'file' field is set and the file is recorded with index
         -1 in the returned dict;
      5. if `update_queried_db` is set, the entry replaces the queried entry
         found in step 2 (automatically above 0.90, on user confirmation
         above 0.80) or is appended to `queried_db.entries`.

    Returns:
        The dict built by `create_file_dict(queried_db)`, extended with the
        matched files.
    """

    def ask_yes_no(prompt):
        # Keep asking until the answer is exactly 'y' or 'n'
        answer = None
        while answer not in ['y', 'n']:
            answer = input(prompt)
        return answer

    files = create_file_dict(queried_db)
    manual_bib_path = os.path.join(folder, '.manual.bib')
    if not os.path.exists(manual_bib_path):
        return files

    manual_database = read_bib_file(manual_bib_path, homogenize=True)
    for entry in manual_database.entries:
        guess = nomenclature.gen_filename(entry)
        logger.warning("Try to find a match for manual entry: {}", guess)

        # Find the entry from .queried that is the most similar to the manual entry
        top_score, top_idx, top_key = 0.0, -1, None
        for key, idx in sorted(files.items()):
            ratio = simratio(key, guess)
            if ratio > top_score:
                top_score, top_idx, top_key = ratio, idx, key

        # Find most similar filename in the folder being processed
        match, match_score = most_similar_filename(guess, folder)
        if match_score < 0.9:
            logger.warning(
                "Cannot find a file matching manual entry (simratio: {}).\n- Entry: {}\n- Match: {}",
                match_score, guess, match)
            if ask_yes_no("Use best match for this entry? [y/n]") == 'n':
                continue
        else:
            logger.info("Found a file matching manual entry: {}", guess)

        entry['file'] = encode_filename_field(match)
        files[match] = -1

        if not update_queried_db:
            continue

        # Decide whether the manual entry replaces a queried one or is added
        has_candidate = top_idx >= 0
        if has_candidate and top_score > 0.90:
            logger.info("Found a query matching manual entry: {}",
                        guess)
            replace = True
        elif has_candidate and top_score > 0.80:
            logger.warning(
                "Could not find a query matching manual entry (simratio: {}).\n- Entry: {}\n- Query: {}",
                top_score, guess, top_key)
            replace = ask_yes_no("Replace this query with the manual entry? [y/n]") == 'y'
        else:
            logger.debug(
                "Could not find a query matching manual entry: {}",
                guess)
            replace = False

        if replace:
            queried_db.entries[top_idx] = entry
        else:
            queried_db.entries.append(entry)
    return files