Example #1
0
 def test_write_common_strings(self):
     """Dump a database holding only the common strings and compare it
     with the reference file ``tests/data/common_strings.bib``.

     The writer is created with ``write_common_strings=True``.
     """
     database = BibDatabase()
     database.load_common_strings()
     dumped = bibdeskparser.dumps(
         database,
         writer=BibTexWriter(write_common_strings=True),
     )
     with io.open('tests/data/common_strings.bib') as reference:
         reference_text = reference.read()
     self.assertEqual(dumped, reference_text)
Example #2
0
 def test_ignore_common_strings_only_if_not_overloaded(self):
     """Check the default dump after redefining ``jan``.

     The common strings are loaded, ``jan`` is then reassigned, and the
     dump made without an explicit writer is expected to contain only the
     reassigned ``jan`` definition.
     """
     database = BibDatabase()
     database.load_common_strings()
     database.strings['jan'] = 'Janvier'
     expected = '@string{jan = {Janvier}}\n\n'
     self.assertEqual(bibdeskparser.dumps(database), expected)
Example #3
0
 def test_ignore_common_strings(self):
     """Check that a database holding only the loaded common strings
     dumps to an empty string when no explicit writer is given.
     """
     database = BibDatabase()
     database.load_common_strings()
     self.assertEqual(bibdeskparser.dumps(database), '')
Example #4
0
class BibTexParser:
    """
    A parser for reading BibTeX bibliographic data files.

    Example::

        from bibdeskparser.bparser import BibTexParser

        bibtex_str = ...

        parser = BibTexParser()
        parser.ignore_nonstandard_types = False
        parser.homogenize_fields = False
        parser.common_strings = False
        bib_database = bibdeskparser.loads(bibtex_str, parser)

    :param customization: function or None (default)
        Customization to apply to parsed entries.
    :param ignore_nonstandard_types: bool (default True)
        If True ignores non-standard bibtex entry types.
    :param homogenize_fields: bool (default False)
        Common field name replacements (as set in alt_dict attribute).
    :param interpolate_strings: bool (default True)
        If True, replace bibtex string by their value, else uses
        BibDataString objects.
    :param common_strings: bool (default False)
        Include common string definitions (e.g. month abbreviations) to
        the bibtex file.
    :param add_missing_from_crossref: bool (default False)
        Resolve BibTeX references set in the crossref field for BibTeX entries
        and add the fields from the referenced entry to the referencing entry.
    """

    def __new__(cls, data=None, **args):
        """
        Catch the old API structure in which creating the parser would
        immediately parse and return data.

        :param data: BibTeX data to parse right away, or None (default)
            to get a regular parser instance.
        :param args: keyword arguments forwarded to ``parse`` when `data`
            is given.
        :return: a new, not yet initialised parser when `data` is None,
            otherwise whatever ``parse(data, **args)`` returns.
        """
        if data is None:
            return super().__new__(cls)
        # For backwards compatibility: if data is given, parse
        # and return the `BibDatabase` object instead of the parser.
        # NOTE(review): `parse` is a module-level name that is not visible
        # in this class -- confirm it exists and accepts these keywords.
        return parse(data, **args)

    def __init__(
        self,
        data=None,
        customization=None,
        ignore_nonstandard_types=True,
        homogenize_fields=False,
        interpolate_strings=True,
        common_strings=False,
        add_missing_from_crossref=False,
    ):
        """
        Create a parser for reading BibTeX files.

        `data` is not used here; it is accepted because ``__new__`` takes
        the same parameter for the legacy "parse on construction" API.
        The remaining parameters are described in the class docstring.

        :return: parser
        :rtype: `BibTexParser`
        """
        # Every parse() call on this instance stores its results here.
        self.bib_database = BibDatabase()

        #: Load common strings such as months abbreviation
        #: Default: `False`.
        #: Within this class the flag is only read here, at construction.
        self.common_strings = common_strings
        if self.common_strings:
            self.bib_database.load_common_strings()

        #: Callback function to process BibTeX entries after parsing,
        #: for example to create a list from a string with multiple values.
        #: By default all BibTeX values are treated as simple strings.
        #: Default: `None`.
        self.customization = customization

        #: Ignore non-standard BibTeX types (`book`, `article`, etc).
        #: Default: `True`.
        self.ignore_nonstandard_types = ignore_nonstandard_types

        #: Sanitize BibTeX field names, for example change `url` to `link` etc.
        #: Field names are always converted to lowercase names.
        #: Default: `False`.
        self.homogenize_fields = homogenize_fields

        #: Interpolate Bibtex Strings or keep the structure.
        #: Read by `_init_expressions`, which runs at the end of `__init__`.
        self.interpolate_strings = interpolate_strings

        # On some sample data files, the character encoding detection simply
        # hangs. We are going to default to utf8, and mandate it.
        self.encoding = 'utf8'

        # Add missing field from cross-ref
        self.add_missing_from_crossref = add_missing_from_crossref

        # Pre-defined set of key changes, applied by `_clean_field_key`
        # when `homogenize_fields` is enabled.
        self.alt_dict = {
            'keyw': 'keyword',
            'keywords': 'keyword',
            'authors': 'author',
            'editors': 'editor',
            'urls': 'url',
            'link': 'url',
            'links': 'url',
            'subjects': 'subject',
            'xref': 'crossref',
        }

        # Setup the parser expression
        self._init_expressions()

    def parse(self, bibtex_str, partial=False):
        """Parse a BibTeX string into an object.

        Results are stored in ``self.bib_database``, the same object on
        every call, which is also the return value.

        :param bibtex_str: BibTeX string (``str``, or ``bytes`` encoded
            with ``self.encoding``)
        :type: str
        :param partial: If True, log errors only on parsing failures.
            If False, the parse exception is re-raised.
        :type: bool
        :return: bibliographic database
        :rtype: BibDatabase
        """
        bibtex_file_obj = self._bibtex_file_obj(bibtex_str)
        try:
            self._expr.parseFile(bibtex_file_obj)
        except self._expr.ParseException as exc:
            logger.error("Could not parse properly, starting at %s", exc.line)
            if not partial:
                raise

        if self.add_missing_from_crossref:
            self.bib_database.add_missing_from_crossref()

        return self.bib_database

    def parse_file(self, file, partial=False):
        """Parse a BibTeX file into an object.

        The whole content is read with ``file.read()`` and handed to
        :meth:`parse`.

        :param file: BibTeX file or file-like object
        :type: typing.IO
        :param partial: If True, log errors only on parsing failures.
            If False, the parse exception is re-raised.
        :type: bool
        :return: bibliographic database
        :rtype: BibDatabase
        """
        return self.parse(file.read(), partial=partial)

    def _init_expressions(self):
        """
        Define all parser expressions used internally and attach the
        parse actions that feed ``self.bib_database``.
        """
        self._expr = BibtexExpression()

        # Handle string as BibDataString object
        self._expr.set_string_name_parse_action(
            lambda s, l, t: BibDataString(self.bib_database, t[0]))
        # The choice is made once, here: changing `interpolate_strings`
        # afterwards does not affect the installed parse action.
        if self.interpolate_strings:
            maybe_interpolate = lambda expr: as_text(expr)
        else:
            maybe_interpolate = lambda expr: expr
        self._expr.set_string_expression_parse_action(
            lambda s, l, t: maybe_interpolate(
                BibDataStringExpression.expression_if_needed(t)))

        # Add notice to logger
        self._expr.add_log_function(logger.debug)

        # Set actions
        self._expr.entry.addParseAction(lambda s, l, t: self._add_entry(
            t.get('EntryType'), t.get('Key'), t.get('Fields')))
        self._expr.implicit_comment.addParseAction(
            lambda s, l, t: self._add_comment(t[0]))
        self._expr.explicit_comment.addParseAction(
            lambda s, l, t: self._add_comment(t[0]))
        self._expr.preamble_decl.addParseAction(
            lambda s, l, t: self._add_preamble(t[0]))
        self._expr.string_def.addParseAction(lambda s, l, t: self._add_string(
            t['StringName'].name, t['StringValue']))

    def _bibtex_file_obj(self, bibtex_str):
        """Wrap BibTeX data in a text stream, dropping a leading BOM.

        :param bibtex_str: BibTeX data; ``bytes`` input is decoded with
            ``self.encoding``.
        :type bibtex_str: str or bytes
        :returns: the (decoded) text as a file-like object
        :rtype: io.StringIO
        """
        # Some files have Byte-order marks inserted at the start
        bom = b'\xef\xbb\xbf'
        if isinstance(bibtex_str, str):
            # Compare against the decoded form of the mark. `startswith`
            # (rather than indexing [0]) keeps empty input from raising
            # IndexError.
            text_bom = str(bom, self.encoding, 'ignore')
            if text_bom and bibtex_str.startswith(text_bom):
                bibtex_str = bibtex_str[len(text_bom):]
        else:
            if bibtex_str.startswith(bom):
                bibtex_str = bibtex_str[len(bom):]
            bibtex_str = bibtex_str.decode(encoding=self.encoding)
        return io.StringIO(bibtex_str)

    def _clean_val(self, val):
        """ Clean instring before adding to dictionary

        Falsy values and the literal ``"{}"`` become the empty string;
        anything else is returned unchanged.

        :param val: a value
        :type val: string
        :returns: string -- value
        """
        if not val or val == "{}":
            return ''
        return val

    def _clean_key(self, key):
        """ Lowercase a key and return as str.

        :param key: a key (``bytes`` keys are decoded as UTF-8)
        :type key: str
        :returns: (str) string-value
        """
        key = key.lower()
        if isinstance(key, str):
            return key
        return str(key, 'utf-8')

    def _clean_field_key(self, key):
        """ Clean a bibtex field key and homogenize alternative forms.

        The key is lowercased; when ``homogenize_fields`` is set, keys
        listed in ``alt_dict`` are replaced by their canonical name.

        :param key: a key
        :type key: str
        :returns: string-value
        """
        key = self._clean_key(key)
        if self.homogenize_fields:
            # Direct dict lookup; unknown keys map to themselves.
            key = self.alt_dict.get(key, key)
        return key

    def _add_entry(self, entry_type, entry_id, fields):
        """ Adds a parsed entry.
        Includes checking type and fields, cleaning, applying customizations.

        The resulting dict is appended to ``self.bib_database.entries``;
        nothing is returned. Entries of a non-standard type are skipped
        with a warning when ``ignore_nonstandard_types`` is set.

        :param entry_type: the entry type
        :type entry_type: str
        :param entry_id: the entry bibid
        :type entry_id: str
        :param fields: the fields and values
        :type fields: dictionary
        """
        entry_type = self._clean_key(entry_type)
        if self.ignore_nonstandard_types and entry_type not in STANDARD_TYPES:
            logger.warning('Entry type %s not standard. Not considered.',
                           entry_type)
            return

        d = {}
        for key in fields:
            d[self._clean_field_key(key)] = self._clean_val(fields[key])
        d['ENTRYTYPE'] = entry_type
        d['ID'] = entry_id

        # Keep a copy of the crossref target under a private key before any
        # customization runs.
        crossref = d.get('crossref')
        if self.add_missing_from_crossref and crossref is not None:
            d['_crossref'] = crossref

        if self.customization is not None:
            logger.debug('Apply customizations and return dict')
            d = self.customization(d)

        self.bib_database.entries.append(d)

    def _add_comment(self, comment):
        """
        Stores a comment in the list of comment.

        :param comment: the parsed comment
        :type comment: str
        """
        # Lazy logger argument: repr() is only computed when debug logging
        # is enabled.
        logger.debug('Store comment in list of comments: %r', comment)
        self.bib_database.comments.append(comment)

    def _add_string(self, string_key, string):
        """
        Stores a new string in the string dictionary.

        An existing definition with the same key is overwritten, after a
        warning is logged.

        :param string_key: the string key
        :type string_key: str
        :param string: the string value
        :type string: str
        """
        if string_key in self.bib_database.strings:
            logger.warning('Overwritting existing string for key: %s.',
                           string_key)
        logger.debug('Store string: %s -> %s', string_key, string)
        self.bib_database.strings[string_key] = self._clean_val(string)

    def _add_preamble(self, preamble):
        """
        Stores a preamble.

        :param preamble: the parsed preamble
        :type preamble: str
        """
        logger.debug('Store preamble in list of preambles')
        self.bib_database.preambles.append(preamble)