Code example #1
0
File: bibfile_utils.py  Project: alyssaq/distil
def read_entries_from_file(fname, should_complain_about_non_ascii=True):
    """Read and parse all the BibTeX entries in the file 'fname'.

    The file is read as a Unicode string (handling possibly-screwed
    encodings).  If it contains non-ASCII characters and
    'should_complain_about_non_ascii' is true, 'complain_about_non_ascii'
    is invoked to report this.

    Raises InvalidFormat if the parser's 'checkFormat' rejects the
    (suitably massaged) content.  Returns whatever 'getEntries' returns.
    """
    # Handle possibly-screwed Unicode strings.
    # 's' will now be a Unicode string.
    (s, contains_non_ascii,
     errors) = unicode_string_utils.open_file_read_unicode(fname)
    if contains_non_ascii and should_complain_about_non_ascii:
        complain_about_non_ascii(fname)

    b = bibtex.BibtexParser()

    # The method 'checkFormat' complains about some situations (like a space
    # after "@incollection" or "@article", before the left-brace) that seem to
    # present no problems to 'getEntries'.
    #
    # It also complains about any non-ASCII characters in the string, whether
    # you've opened the file using the built-in 'open' function (which returns
    # a regular string containing UTF-8 characters) or using the 'codecs.open'
    # function with the encoding specified as either "utf-8" or "utf-8-sig"
    # (which returns a Unicode string containing Unicode code-points).
    #
    # Hence, we'll attempt to gloss over any limitations of 'checkFormat' before
    # we invoke it: if the string contains non-ASCII characters, temporarily
    # convert them to an ASCII character that's not one of the BibTeX syntax
    # characters, then check.  (Previously the check was duplicated in both
    # branches; compute the string-to-check once and check once.)
    to_check = s
    if contains_non_ascii:
        to_check = unicode_string_utils.replace_non_ascii_unicode(s)
    if not b.checkFormat(gloss_over_checkFormat_limitations(to_check)):
        raise InvalidFormat(fname)

    # The 'preprocess' method will accept either Unicode strings or regular
    # strings, and will always return a regular string (with any Unicode
    # code-points encoded as UTF-8, just as the built-in 'open' function would
    # return).  For example, u'\u2019', the Unicode code-point for "RIGHT SINGLE
    # QUOTATION MARK" will be encoded as '\xe2\x80\x99'.
    #
    # If you invoke 'preprocess' before the 'getEntries' method, 'getEntries'
    # won't have any Unicode problems.
    s = b.preprocess(s)

    # The 'getEntries' method doesn't like to accept Unicode strings, only
    # regular strings (although it's not unhappy if the regular strings contain
    # UTF-8 characters).  If you give it a UnicodeString, it raises a TypeError
    # to complain:
    #
    # File ".../bibliograph/parsing/parsers/base.py", line 134, in getEntries
    #   source = self.checkEncoding(source)
    # File ".../bibliograph/parsing/parsers/base.py", line 143, in checkEncoding
    #   source = unicode(source, 'utf-8')
    # TypeError: decoding Unicode is not supported
    #
    # Hence, we invoke the 'preprocess' method first, to ensure that 'getEntries'
    # won't have any Unicode problems.
    entries = b.getEntries(s)

    return entries
Code example #2
0
File: bibfile_utils.py  Project: alyssaq/distil
def read_entries_from_file(fname, should_complain_about_non_ascii=True):
  """Read and parse all the BibTeX entries in the file 'fname'.

  The file is read as a Unicode string (handling possibly-screwed
  encodings).  If it contains non-ASCII characters and
  'should_complain_about_non_ascii' is true, 'complain_about_non_ascii'
  is invoked to report this.

  Raises InvalidFormat if the parser's 'checkFormat' rejects the
  (suitably massaged) content.  Returns whatever 'getEntries' returns.
  """
  # Handle possibly-screwed Unicode strings.
  # 's' will now be a Unicode string.
  (s, contains_non_ascii, errors) = unicode_string_utils.open_file_read_unicode(fname)
  if contains_non_ascii and should_complain_about_non_ascii:
    complain_about_non_ascii(fname)

  b = bibtex.BibtexParser()

  # The method 'checkFormat' complains about some situations (like a space
  # after "@incollection" or "@article", before the left-brace) that seem to
  # present no problems to 'getEntries'.
  #
  # It also complains about any non-ASCII characters in the string, whether
  # you've opened the file using the built-in 'open' function (which returns
  # a regular string containing UTF-8 characters) or using the 'codecs.open'
  # function with the encoding specified as either "utf-8" or "utf-8-sig"
  # (which returns a Unicode string containing Unicode code-points).
  #
  # Hence, we'll attempt to gloss over any limitations of 'checkFormat' before
  # we invoke it: if the string contains non-ASCII characters, temporarily
  # convert them to an ASCII character that's not one of the BibTeX syntax
  # characters, then check.  (Previously the check was duplicated in both
  # branches; compute the string-to-check once and check once.)
  to_check = s
  if contains_non_ascii:
    to_check = unicode_string_utils.replace_non_ascii_unicode(s)
  if not b.checkFormat(gloss_over_checkFormat_limitations(to_check)):
    raise InvalidFormat(fname)

  # The 'preprocess' method will accept either Unicode strings or regular
  # strings, and will always return a regular string (with any Unicode
  # code-points encoded as UTF-8, just as the built-in 'open' function would
  # return).  For example, u'\u2019', the Unicode code-point for "RIGHT SINGLE
  # QUOTATION MARK" will be encoded as '\xe2\x80\x99'.
  #
  # If you invoke 'preprocess' before the 'getEntries' method, 'getEntries'
  # won't have any Unicode problems.
  s = b.preprocess(s)

  # The 'getEntries' method doesn't like to accept Unicode strings, only
  # regular strings (although it's not unhappy if the regular strings contain
  # UTF-8 characters).  If you give it a UnicodeString, it raises a TypeError
  # to complain:
  #
  # File ".../bibliograph/parsing/parsers/base.py", line 134, in getEntries
  #   source = self.checkEncoding(source)
  # File ".../bibliograph/parsing/parsers/base.py", line 143, in checkEncoding
  #   source = unicode(source, 'utf-8')
  # TypeError: decoding Unicode is not supported
  #
  # Hence, we invoke the 'preprocess' method first, to ensure that 'getEntries'
  # won't have any Unicode problems.
  entries = b.getEntries(s)

  return entries
Code example #3
0
def writeOrAppend(f, chunk):
    if os.path.exists(f):
        content, has_non_ascii_chars, errors = unicode_string_utils.open_file_read_unicode(
            f)
        content = tftp.sub(ur'\1\n TFTSUB \n\2', content)
        content = udbp.sub(ur'\1\n UDBSUB \n\2', content)
        content = ulbp.sub(ur'\1\n ULBSUB \n\2', content)
        content = (content.replace(u' TFTSUB ', chunk['tft']).replace(
            u' UDBSUB ', chunk['udb']).replace(u' ULBSUB ', chunk['ulb']))
    else:
        content = TMPL.format(**chunk)
    writeFile(f, content)
Code example #4
0
def writeOrAppend(f, chunk):
    if os.path.exists(f):
        content, has_non_ascii_chars, errors = unicode_string_utils.open_file_read_unicode(f)
        content = tftp.sub(ur'\1\n TFTSUB \n\2', content)
        content = udbp.sub(ur'\1\n UDBSUB \n\2', content)
        content = ulbp.sub(ur'\1\n ULBSUB \n\2', content)
        content = ( content.replace(u' TFTSUB ', chunk['tft'])
                           .replace(u' UDBSUB ', chunk['udb'])
                           .replace(u' ULBSUB ', chunk['ulb']) )
    else:
        content = TMPL.format(**chunk)
    writeFile(f, content)
Code example #5
0
def writeOrAppend(f, chunk):

    if os.path.exists(f):
        content, has_non_ascii_chars, errors = unicode_string_utils.open_file_read_unicode(f)
        content = tftp.sub(ur'\1\n TFTSUB \n\2', content)
        content = udbp.sub(ur'\1\n UDBSUB \n\2', content)
        content = ulbp.sub(ur'\1\n ULBSUB \n\2', content)
        # noinspection PyTypeChecker
        content = (content.replace(u' TFTSUB ', chunk['tft'])
                   .replace(u' UDBSUB ', chunk['udb'])
                   .replace(u' ULBSUB ', chunk['ulb']))
    else:
        # do not create new files automatically
        return
        # content = TMPL.format(**chunk)

    writeFile(f, content)
Code example #6
0
File: bibfile_utils.py  Project: alyssaq/distil
def replace_cite_key_in_file(new_cite_key, in_fname, out_fname=None):
  """Replace the cite-key of the single BibTeX entry in 'in_fname'.

  Assumes that 'in_fname' contains only one BibTeX entry.

  If 'out_fname' is not supplied, or is None, 'out_fname' will be the same as
  'in_fname'.

  Raises InvalidFormat if the file content does not start with a BibTeX
  entry header (e.g. "@article{oldkey,").
  """
  if not out_fname:
    out_fname = in_fname

  # Handle possibly-screwed Unicode strings.
  # 's' will now be a Unicode string.
  (s, contains_non_ascii, errors) = unicode_string_utils.open_file_read_unicode(in_fname)

  # Group 1 captures everything up to and including the entry's left-brace;
  # the rest of the match is the old cite-key plus its trailing comma.
  obj = re.compile(r'^(@[^@{]+{)[^,]+,')
  if not obj.match(s):
    raise InvalidFormat(in_fname)
  s = obj.sub(r'\1%s,' % new_cite_key, s, 1)

  # Write 's' back out in UTF-8, in case it contains any non-ASCII code-points.
  # Use a 'with' block so the stream is flushed and closed deterministically
  # (the previous open(...).write(...) relied on refcounting to close it).
  with codecs.open(out_fname, 'w', encoding="utf-8-sig") as out:
    out.write(s)
Code example #7
0
File: bibfile_utils.py  Project: alyssaq/distil
def replace_cite_key_in_file(new_cite_key, in_fname, out_fname=None):
    """Replace the cite-key of the single BibTeX entry in 'in_fname'.

    Assumes that 'in_fname' contains only one BibTeX entry.

    If 'out_fname' is not supplied, or is None, 'out_fname' will be the same
    as 'in_fname'.

    Raises InvalidFormat if the file content does not start with a BibTeX
    entry header (e.g. "@article{oldkey,").
    """
    if not out_fname:
        out_fname = in_fname

    # Handle possibly-screwed Unicode strings.
    # 's' will now be a Unicode string.
    (s, contains_non_ascii,
     errors) = unicode_string_utils.open_file_read_unicode(in_fname)

    # Group 1 captures everything up to and including the entry's left-brace;
    # the rest of the match is the old cite-key plus its trailing comma.
    obj = re.compile(r'^(@[^@{]+{)[^,]+,')
    if not obj.match(s):
        raise InvalidFormat(in_fname)
    s = obj.sub(r'\1%s,' % new_cite_key, s, 1)

    # Write 's' back out in UTF-8, in case it contains any non-ASCII code-points.
    # Use a 'with' block so the stream is flushed and closed deterministically
    # (the previous open(...).write(...) relied on refcounting to close it).
    with codecs.open(out_fname, 'w', encoding="utf-8-sig") as out:
        out.write(s)