def read_entries_from_file(fname, should_complain_about_non_ascii=True):
    """Parse the BibTeX file `fname` and return its entries.

    Reads the file tolerantly as Unicode, validates the overall BibTeX
    format (raising `InvalidFormat` on failure), then extracts and returns
    the parsed entries.
    """
    # Handle possibly-screwed Unicode strings; `text` is a Unicode string
    # after this call.
    (text, has_non_ascii, errors) = \
            unicode_string_utils.open_file_read_unicode(fname)
    if has_non_ascii and should_complain_about_non_ascii:
        complain_about_non_ascii(fname)

    parser = bibtex.BibtexParser()

    # 'checkFormat' rejects some harmless constructs (e.g. a space between
    # "@article" and the left-brace) that 'getEntries' handles fine, and it
    # also chokes on any non-ASCII characters regardless of how the file
    # was opened ('open', or codecs.open with "utf-8"/"utf-8-sig").
    # Work around both limitations before validating: temporarily map every
    # non-ASCII character to an ASCII placeholder that is not a BibTeX
    # syntax character, then gloss over the known formatting quirks.
    checkable = text
    if has_non_ascii:
        checkable = unicode_string_utils.replace_non_ascii_unicode(text)
    if not parser.checkFormat(gloss_over_checkFormat_limitations(checkable)):
        raise InvalidFormat(fname)

    # 'preprocess' accepts Unicode or byte strings and always returns a
    # regular (byte) string with Unicode code-points encoded as UTF-8
    # (e.g. u'\u2019', RIGHT SINGLE QUOTATION MARK, becomes '\xe2\x80\x99').
    # 'getEntries' raises TypeError on Unicode input (its checkEncoding
    # calls unicode(source, 'utf-8'): "decoding Unicode is not supported"),
    # so preprocessing first sidesteps that problem.
    preprocessed = parser.preprocess(text)
    return parser.getEntries(preprocessed)
def read_entries_from_file(fname, should_complain_about_non_ascii=True):
    """Return the BibTeX entries parsed from `fname`.

    NOTE(review): this re-defines read_entries_from_file, shadowing an
    identical earlier definition in this file -- confirm the duplication
    is intentional.
    """
    # Handle possibly-screwed Unicode strings.  's' will now be a Unicode
    # string.
    (s, contains_non_ascii, errors) = \
            unicode_string_utils.open_file_read_unicode(fname)
    if contains_non_ascii and should_complain_about_non_ascii:
        complain_about_non_ascii(fname)

    b = bibtex.BibtexParser()

    # 'checkFormat' complains both about harmless constructs (a space after
    # "@incollection"/"@article" before the left-brace) and about any
    # non-ASCII characters, however the file was opened.  Gloss over those
    # limitations before invoking it.
    if contains_non_ascii:
        # Temporarily convert non-ASCII characters to an ASCII character
        # that is not one of the BibTeX syntax characters, then check.
        sanitized = unicode_string_utils.replace_non_ascii_unicode(s)
        format_ok = b.checkFormat(gloss_over_checkFormat_limitations(sanitized))
    else:
        format_ok = b.checkFormat(gloss_over_checkFormat_limitations(s))
    if not format_ok:
        raise InvalidFormat(fname)

    # 'preprocess' always returns a regular string with Unicode code-points
    # encoded as UTF-8, and 'getEntries' raises
    #   TypeError: decoding Unicode is not supported
    # when handed a Unicode string (its checkEncoding does
    # unicode(source, 'utf-8')).  Preprocessing first avoids that.
    s = b.preprocess(s)
    entries = b.getEntries(s)
    return entries
def writeOrAppend(f, chunk): if os.path.exists(f): content, has_non_ascii_chars, errors = unicode_string_utils.open_file_read_unicode( f) content = tftp.sub(ur'\1\n TFTSUB \n\2', content) content = udbp.sub(ur'\1\n UDBSUB \n\2', content) content = ulbp.sub(ur'\1\n ULBSUB \n\2', content) content = (content.replace(u' TFTSUB ', chunk['tft']).replace( u' UDBSUB ', chunk['udb']).replace(u' ULBSUB ', chunk['ulb'])) else: content = TMPL.format(**chunk) writeFile(f, content)
def writeOrAppend(f, chunk): if os.path.exists(f): content, has_non_ascii_chars, errors = unicode_string_utils.open_file_read_unicode(f) content = tftp.sub(ur'\1\n TFTSUB \n\2', content) content = udbp.sub(ur'\1\n UDBSUB \n\2', content) content = ulbp.sub(ur'\1\n ULBSUB \n\2', content) content = ( content.replace(u' TFTSUB ', chunk['tft']) .replace(u' UDBSUB ', chunk['udb']) .replace(u' ULBSUB ', chunk['ulb']) ) else: content = TMPL.format(**chunk) writeFile(f, content)
def writeOrAppend(f, chunk): if os.path.exists(f): content, has_non_ascii_chars, errors = unicode_string_utils.open_file_read_unicode(f) content = tftp.sub(ur'\1\n TFTSUB \n\2', content) content = udbp.sub(ur'\1\n UDBSUB \n\2', content) content = ulbp.sub(ur'\1\n ULBSUB \n\2', content) # noinspection PyTypeChecker content = (content.replace(u' TFTSUB ', chunk['tft']) .replace(u' UDBSUB ', chunk['udb']) .replace(u' ULBSUB ', chunk['ulb'])) else: # do not create new files automatically return # content = TMPL.format(**chunk) writeFile(f, content)
def replace_cite_key_in_file(new_cite_key, in_fname, out_fname=None):
    """Replace the cite key of the single BibTeX entry in `in_fname`.

    Assumes that 'in_fname' contains only one BibTeX entry.

    If 'out_fname' is not supplied, or is None, 'out_fname' will be the
    same as 'in_fname'.

    Raises InvalidFormat if the file does not begin with a recognizable
    BibTeX entry header of the form "@entrytype{key,".
    """
    if not out_fname:
        out_fname = in_fname

    # Handle possibly-screwed Unicode strings.
    # 's' will now be a Unicode string.
    (s, contains_non_ascii, errors) = \
            unicode_string_utils.open_file_read_unicode(in_fname)

    # Match "@entrytype{oldkey," at the start of the string.
    entry_header = re.compile(r'^(@[^@{]+{)[^,]+,')
    if not entry_header.match(s):
        raise InvalidFormat(in_fname)
    # Keep the "@entrytype{" prefix (group 1), swap in the new cite key;
    # replace only the first occurrence.
    s = entry_header.sub(r'\1%s,' % new_cite_key, s, 1)

    # Write 's' back out in UTF-8, in case it contains any non-ASCII
    # code-points.  Bug fix: the original leaked the file handle
    # (codecs.open(...).write(s)) and relied on garbage collection to
    # flush and close it; a context manager guarantees deterministic
    # flush/close even if write() raises.
    with codecs.open(out_fname, 'w', encoding="utf-8-sig") as out:
        out.write(s)