def exportUrlFeeder(self, filename, urlList):
    """Export the urls into a flat file, sorted, one url per line.

    Every url is passed through ``UnicodeDammit.detwingle`` and must then
    decode as UTF-8 before it is written. A url that fails any of these
    steps is reported through ``exportFeedLogger`` and skipped, so one bad
    entry does not abort the export.

    Status: In progress - Should be moved to a separate package.
    Usage: Is used within the harvest functions as a url exporter.

    Args:
        filename: Path of the output file. Existing content is replaced.
        urlList: Iterable of urls, as byte strings or text. Text is
            encoded as UTF-8 before processing.

    Returns:
        None.
    """
    urlList = sorted(urlList)  # Sort urls so the file is easier to read.

    # The previous mode string 'wa' is not a valid mode (Python 3 rejects
    # it with ValueError). The urls are written as bytes, so open the file
    # for binary writing; the ``with`` block flushes and closes it even if
    # something unexpected propagates out of the loop.
    with open(filename, 'wb') as fobj:
        for url in urlList:
            try:
                if not isinstance(url, bytes):
                    url = url.encode("utf8")
                encodedUrl = UnicodeDammit.detwingle(url)
                # Validation only: raises if the result is not valid UTF-8,
                # which keeps the url out of the file.
                encodedUrl.decode("utf8")
                fobj.write(encodedUrl + b'\n')
            except Exception:
                # Best effort: log and carry on with the next url, but let
                # KeyboardInterrupt / SystemExit through.
                exportFeedLogger.logError(
                    "Unexpected error while open output file in exportUrlFeeder")
def test_detwingle_ignores_multibyte_characters(self):
    # Each of these characters has a UTF-8 representation ending in
    # \x93. On its own, \x93 is a smart quote when interpreted as
    # Windows-1252, but detwingle is expected to skip over multibyte
    # UTF-8 characters, so every sample must come back unchanged.
    for tricky_unicode_char, utf8_bytes in (
        (u"\N{LATIN SMALL LIGATURE OE}", b'\xc5\x93'),  # 2-byte char
        (u"\N{LATIN SUBSCRIPT SMALL LETTER X}", b'\xe2\x82\x93'),  # 3-byte char
        # 4-byte char, written as a code point escape. The earlier
        # literal u"\xf0\x90\x90\x93" was four separate Latin-1-range
        # characters, so no 4-byte sequence was actually exercised.
        (u"\U00010413", b'\xf0\x90\x90\x93'),
    ):
        encoded = tricky_unicode_char.encode("utf8")
        # Guard the test data itself: the sample really is the
        # multibyte sequence we think it is, and it ends in \x93.
        self.assertEqual(utf8_bytes, encoded)
        self.assertTrue(encoded.endswith(b'\x93'))
        output = UnicodeDammit.detwingle(encoded)
        self.assertEqual(output, encoded)
def test_detwingle_ignores_multibyte_characters(self):
    # Each of these characters has a UTF-8 representation ending in
    # \x93. On its own, \x93 is a smart quote when interpreted as
    # Windows-1252, but detwingle is expected to skip over multibyte
    # UTF-8 characters, so every sample must come back unchanged.
    for tricky_unicode_char, utf8_bytes in (
        ("\N{LATIN SMALL LIGATURE OE}", b'\xc5\x93'),  # 2-byte char
        ("\N{LATIN SUBSCRIPT SMALL LETTER X}", b'\xe2\x82\x93'),  # 3-byte char
        # 4-byte char, written as a code point escape. The earlier
        # literal "\xf0\x90\x90\x93" was four separate Latin-1-range
        # characters, so no 4-byte sequence was actually exercised.
        ("\U00010413", b'\xf0\x90\x90\x93'),
    ):
        encoded = tricky_unicode_char.encode("utf8")
        # Guard the test data itself: the sample really is the
        # multibyte sequence we think it is, and it ends in \x93.
        self.assertEqual(utf8_bytes, encoded)
        self.assertTrue(encoded.endswith(b'\x93'))
        output = UnicodeDammit.detwingle(encoded)
        self.assertEqual(output, encoded)
def test_detwingle(self):
    # Build a document out of UTF-8 text with a run of Windows-1252
    # text spliced into the middle of it.
    snowmen = (u"\N{SNOWMAN}" * 3).encode("utf8")
    quoted_greeting = (
        u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
        u"\N{RIGHT DOUBLE QUOTATION MARK}"
    )
    mixed = snowmen + quoted_greeting.encode("windows_1252") + snowmen

    # As it stands, the mixed document is not valid UTF-8.
    with self.assertRaises(UnicodeDecodeError):
        mixed.decode("utf8")

    # After detwingle, the bytes must decode as UTF-8 to the text that
    # was intended: snowmen on both sides of the curly-quoted greeting.
    repaired = UnicodeDammit.detwingle(mixed)
    self.assertEqual(
        u"☃☃☃“Hi, I like Windows!”☃☃☃", repaired.decode("utf8"))
def exportUrlFeeder(self, filename, urlList):
    """Export the urls into a flat file, sorted, one url per line.

    Every url is passed through ``UnicodeDammit.detwingle`` and must then
    decode as UTF-8 before it is written. A url that fails any of these
    steps is reported through ``exportFeedLogger`` and skipped, so one bad
    entry does not abort the export.

    Status: In progress - Should be moved to a separate package.
    Usage: Is used within the harvest functions as a url exporter.

    Args:
        filename: Path of the output file. Existing content is replaced.
        urlList: Iterable of urls, as byte strings or text. Text is
            encoded as UTF-8 before processing.

    Returns:
        None.
    """
    urlList = sorted(urlList)  # Sort urls so the file is easier to read.

    # The previous mode string 'wa' is not a valid mode (Python 3 rejects
    # it with ValueError). The urls are written as bytes, so open the file
    # for binary writing; the ``with`` block flushes and closes it even if
    # something unexpected propagates out of the loop.
    with open(filename, 'wb') as fobj:
        for url in urlList:
            try:
                if not isinstance(url, bytes):
                    url = url.encode("utf8")
                encodedUrl = UnicodeDammit.detwingle(url)
                # Validation only: raises if the result is not valid UTF-8,
                # which keeps the url out of the file.
                encodedUrl.decode("utf8")
                fobj.write(encodedUrl + b'\n')
            except Exception:
                # Best effort: log and carry on with the next url, but let
                # KeyboardInterrupt / SystemExit through.
                exportFeedLogger.logError(
                    "Unexpected error while open output file in exportUrlFeeder")
def test_detwingle(self):
    # Build a document out of UTF-8 text with a run of Windows-1252
    # text spliced into the middle of it.
    snowmen = ("\N{SNOWMAN}" * 3).encode("utf8")
    quoted_greeting = (
        "\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
        "\N{RIGHT DOUBLE QUOTATION MARK}"
    )
    mixed = snowmen + quoted_greeting.encode("windows_1252") + snowmen

    # As it stands, the mixed document is not valid UTF-8.
    with self.assertRaises(UnicodeDecodeError):
        mixed.decode("utf8")

    # After detwingle, the bytes must decode as UTF-8 to the text that
    # was intended: snowmen on both sides of the curly-quoted greeting.
    repaired = UnicodeDammit.detwingle(mixed)
    self.assertEqual(
        "☃☃☃“Hi, I like Windows!”☃☃☃", repaired.decode("utf8"))