def cmd(self, command, args, channel, **kwargs):
    if command == 'aww' or command == 'sad' or command == 'depressed' or command == 'morn':
        if self.aww_updated_at is None or (time() - self.aww_updated_at) > 600:
            url = 'https://reddit.com/r/aww/hot.json?limit=100'
            self.aww_list = json.loads(self.reddit_opener.open(url).read())
            if 'error' in self.aww_list:
                print(self.aww_list['error'])
                return [(0, channel, kwargs['from_nick'],
                         'I\'m so sorry, Reddit gave me an error. :(')]
            else:
                self.aww_updated_at = time()
        item = self.aww_list['data']['children'][
            randint(1, len(self.aww_list['data']['children']) - 1)]
        message = item['data']['url']
        message = self.redditfix_re.sub('&', message)
        nick = kwargs['from_nick']
        if args:
            args = args.split(' ')
            if len(args) >= 1 and len(args[0]) > 2:
                nick = args[0]
            if len(args) == 2 and len(args[1]) > 2:
                if args[1][0] == '#':
                    channel = args[1]
                else:
                    channel = '#' + args[1]
        if command == 'morn':
            message = choice(self.mornlist) + message
        try:
            return [(0, channel, to_bytes(to_unicode(nick)),
                     to_bytes(to_unicode(message)))]
        except Exception:
            return [(0, channel, kwargs['from_nick'],
                     'Couldn\'t convert to unicode. :(')]
def get_spec(self):
    """ Return the contents of this package's RPM spec file """
    if os.path.exists(os.path.join(self.repo_path, 'dead.package')):
        return to_unicode(
            self.repo.tree()['dead.package'].data_stream.read())
    return to_unicode(
        self.repo.tree()[self.package + '.spec'].data_stream.read())
def print_specs(specs):
    print Ut.headings("SPECIFICATIONS DATA", line=False)

    # PRINT SPECS
    for key, data in specs.items():
        if key == "target" or key == "source":
            new_line = "\n"
        else:
            new_line = ""

        if type(data) == str or type(data) == int or type(data) == unicode:
            value = to_unicode(data)  # .encode(encoding='utf-8')
        elif type(data) == float:
            value = to_unicode(data)
        else:
            value = type(data)

        print "{}\t{:22}{}".format(new_line, key,
                                   ": {}".format(to_bytes(value)))

        if type(data) == dict:
            for detail, val in data.items():
                print "\t\t{:18}: {}".format(detail, val)
    print ""
def test_exception_to_unicode_custom(self):
    # If given custom functions, then we should not mangle
    c = [lambda e: converters.to_unicode(e.args[0], encoding='euc_jp'),
         lambda e: converters.to_unicode(e, encoding='euc_jp')]
    tools.eq_(converters.exception_to_unicode(self.exceptions['euc_jpn'],
                                              converters=c),
              self.u_japanese)
    c.extend(converters.EXCEPTION_CONVERTERS)
    tools.eq_(converters.exception_to_unicode(self.exceptions['euc_jpn'],
                                              converters=c),
              self.u_japanese)

    c = [lambda e: converters.to_unicode(e.args[0], encoding='latin1'),
         lambda e: converters.to_unicode(e, encoding='latin1')]
    tools.eq_(converters.exception_to_unicode(self.exceptions['latin1_spanish'],
                                              converters=c),
              self.u_spanish)
    c.extend(converters.EXCEPTION_CONVERTERS)
    tools.eq_(converters.exception_to_unicode(self.exceptions['latin1_spanish'],
                                              converters=c),
              self.u_spanish)
def test_to_unicode_errors(self):
    tools.eq_(converters.to_unicode(self.latin1_spanish),
              self.u_mangled_spanish_latin1_as_utf8)
    tools.eq_(converters.to_unicode(self.latin1_spanish, errors='ignore'),
              self.u_spanish_ignore)
    tools.assert_raises(UnicodeDecodeError, converters.to_unicode,
                        *[self.latin1_spanish], **{'errors': 'strict'})
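# A minimal standalone sketch (assuming the converters module under test is
# kitchen.text.converters) of the three `errors` modes the test above
# exercises: the default 'replace' turns undecodable bytes into U+FFFD,
# 'ignore' drops them, and 'strict' raises UnicodeDecodeError.
def demo_to_unicode_errors():
    from kitchen.text.converters import to_unicode
    latin1_spanish = u'espa\xf1ol'.encode('latin1')  # invalid as utf-8
    print(repr(to_unicode(latin1_spanish)))                   # u'espa\ufffdol'
    print(repr(to_unicode(latin1_spanish, errors='ignore')))  # u'espaol'
    try:
        to_unicode(latin1_spanish, errors='strict')
    except UnicodeDecodeError:
        print('strict raises on bytes invalid in the target encoding')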
def test_non_string(self):
    '''Test deprecated non_string parameter'''
    # unicode
    tools.assert_raises(TypeError, converters.to_unicode, *[5],
                        **{'non_string': 'foo'})
    tools.ok_(converters.to_unicode(5, non_string='empty') == u'')
    tools.ok_(converters.to_unicode(5, non_string='passthru') == 5)
    tools.ok_(converters.to_unicode(5, non_string='simplerepr') == u'5')
    tools.ok_(converters.to_unicode(5, non_string='repr') == u'5')
    tools.assert_raises(TypeError, converters.to_unicode, *[5],
                        **{'non_string': 'strict'})
    tools.ok_(converters.to_unicode(UnicodeNoStr(), non_string='simplerepr')
              == self.u_spanish)
    tools.ok_(converters.to_unicode(StrNoUnicode(), non_string='simplerepr')
              == self.u_spanish)
    tools.ok_(converters.to_unicode(StrReturnsUnicode(), non_string='simplerepr')
              == self.u_spanish)
    tools.ok_(converters.to_unicode(UnicodeReturnsStr(), non_string='simplerepr')
              == self.u_spanish)
    tools.ok_(converters.to_unicode(UnicodeStrCrossed(), non_string='simplerepr')
              == self.u_spanish)
    obj_repr = converters.to_unicode(object, non_string='simplerepr')
    tools.ok_(obj_repr == u"<type 'object'>" and isinstance(obj_repr, unicode))

    # Bytes
    tools.ok_(converters.to_bytes(5) == '5')
    tools.ok_(converters.to_bytes(5, non_string='empty') == '')
    tools.ok_(converters.to_bytes(5, non_string='passthru') == 5)
    tools.ok_(converters.to_bytes(5, non_string='simplerepr') == '5')
    tools.ok_(converters.to_bytes(5, non_string='repr') == '5')
    # Raise a TypeError if the msg is non_string and we're set to strict
    tools.assert_raises(TypeError, converters.to_bytes, *[5],
                        **{'non_string': 'strict'})
    # Raise a TypeError if given an invalid non_string arg
    tools.assert_raises(TypeError, converters.to_bytes, *[5],
                        **{'non_string': 'INVALID'})

    # No __str__ method so this returns repr
    string = converters.to_bytes(UnicodeNoStr(), non_string='simplerepr')
    self._check_repr_bytes(string, 'UnicodeNoStr')
    # This object's __str__ returns a utf8 encoded object
    tools.ok_(converters.to_bytes(StrNoUnicode(), non_string='simplerepr')
              == self.utf8_spanish)
    # This object's __str__ returns unicode which to_bytes converts to utf8
    tools.ok_(converters.to_bytes(StrReturnsUnicode(), non_string='simplerepr')
              == self.utf8_spanish)
    # Unless we explicitly ask for something different
    tools.ok_(converters.to_bytes(StrReturnsUnicode(),
                                  non_string='simplerepr',
                                  encoding='latin1')
              == self.latin1_spanish)
    # This object has no __str__ so it returns repr
    string = converters.to_bytes(UnicodeReturnsStr(), non_string='simplerepr')
    self._check_repr_bytes(string, 'UnicodeReturnsStr')
    # This object's __str__ returns unicode which to_bytes converts to utf8
    tools.ok_(converters.to_bytes(UnicodeStrCrossed(), non_string='simplerepr')
              == self.utf8_spanish)
    # This object's __repr__ returns unicode which to_bytes converts to utf8
    tools.ok_(converters.to_bytes(ReprUnicode(), non_string='simplerepr')
              == u'ReprUnicode(El veloz murciélago saltó sobre el perro perezoso.)'.encode('utf8'))
    tools.ok_(converters.to_bytes(ReprUnicode(), non_string='repr')
              == u'ReprUnicode(El veloz murciélago saltó sobre el perro perezoso.)'.encode('utf8'))
    obj_repr = converters.to_bytes(object, non_string='simplerepr')
    tools.ok_(obj_repr == "<type 'object'>" and isinstance(obj_repr, str))
def encoding_path(s):
    s = s.strip()
    if check_os() == 'Windows':
        return to_unicode(s, 'utf-8')
    s = to_unicode(s)
    try:
        return s.encode('utf-8', 'replace')
    except Exception, e:
        log.warning(e)
        # fall back to the unicode value rather than returning None
        return s
def listen(self, msg, channel, **kwargs):
    for karmatoken in self.reg.findall(msg):
        match = [x for x in karmatoken if x != ""][0]
        if match.startswith("++") or match.endswith("++"):
            if match.strip("++") != kwargs['from_nick']:
                self.backend(channel).positiveKarma(
                    to_unicode(match.strip("++")))
        if match.startswith("--") or match.endswith("--"):
            self.backend(channel).negativeKarma(
                to_unicode(match.strip("--")))
def import_page(self, page, pages):
    title = to_unicode(page['post_title'])
    self.vprint("BEGIN Importing page '{0}'".format(to_bytes(title)), 1)
    mezz_page = self.get_or_create(RichTextPage, title=title)
    mezz_page.created = page['post_modified']
    mezz_page.updated = page['post_modified']
    mezz_page.content = to_unicode(page['post_content'])
    mezz_page.save()
    self.vprint("END Importing page '{0}'".format(to_bytes(title)), 1)
def enconding_path(s):
    if sys.platform in ["darwin"]:
        s = to_unicode(s)
        try:
            s = s.encode('utf-8', 'replace')
        except Exception:
            pass
    elif sys.platform in ["win32"]:
        s = to_unicode(s, 'utf-8')
        # s = s.encode('utf-8', 'replace')
    return s
def to_display(s):
    s = s.strip()
    info = osinfo.OSInfo()
    if info == 'Windows':
        return to_unicode(s, 'utf-8')
    s = to_unicode(s)
    try:
        return s.encode('utf-8', 'replace')
    except Exception:
        pass
    return s
def test_to_unicode_nonstring_with_objects_that_have__unicode__and__str__(self):
    '''Test that to_unicode handles objects that have __unicode__ and __str__ methods'''
    if sys.version_info < (3, 0):
        # None of these apply on python3 because python3 does not use __unicode__
        # and it enforces __str__ returning str
        tools.eq_(converters.to_unicode(UnicodeNoStr(), nonstring='simplerepr'),
                  self.u_spanish)
        tools.eq_(converters.to_unicode(StrNoUnicode(), nonstring='simplerepr'),
                  self.u_spanish)
        tools.eq_(converters.to_unicode(UnicodeReturnsStr(), nonstring='simplerepr'),
                  self.u_spanish)
        tools.eq_(converters.to_unicode(StrReturnsUnicode(), nonstring='simplerepr'),
                  self.u_spanish)
        tools.eq_(converters.to_unicode(UnicodeStrCrossed(), nonstring='simplerepr'),
                  self.u_spanish)
def test_guess_encoding_with_chardet(self):
    # We go this slightly roundabout way because multiple encodings can
    # output the same byte sequence.  What we're really interested in is
    # whether we can get the original unicode string back without knowing
    # the encoding beforehand
    tools.ok_(to_unicode(self.utf8_spanish,
                         misc.guess_encoding(self.utf8_spanish))
              == self.u_spanish)
    tools.ok_(to_unicode(self.latin1_spanish,
                         misc.guess_encoding(self.latin1_spanish))
              == self.u_spanish)
    tools.ok_(to_unicode(self.utf8_japanese,
                         misc.guess_encoding(self.utf8_japanese))
              == self.u_japanese)
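# A hedged sketch of the guess-then-decode pattern tested above, assuming
# kitchen.text.misc.guess_encoding (which consults chardet when installed)
# and kitchen.text.converters.to_unicode.
def demo_guess_encoding():
    from kitchen.text import misc
    from kitchen.text.converters import to_unicode
    raw = u'\u65e5\u672c\u8a9e'.encode('utf8')  # "Japanese" as utf-8 bytes
    encoding = misc.guess_encoding(raw)  # typically 'utf-8' here
    print(repr(to_unicode(raw, encoding)))  # round-trips to the original unicode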
def _get_factoid_interaction(self, txn, factoid_key, location, protocol,
                             channel):
    """
    Gets a factoid if it exists, otherwise raises MissingFactoidError
    :return: (factoid_name, [entry, entry, ...])
    """
    self.logger.trace(_("Getting factoid params: factoid_key = '%s', "
                        "location = '%s', protocol = '%s', "
                        "channel = '%s'"),
                      factoid_key, location, protocol, channel)
    if location is None:
        self.logger.trace(_("Location is None - getting all factoids with "
                            "key '%s'"), factoid_key)
        txn.execute("SELECT location, protocol, channel, factoid_name, "
                    "info FROM factoids WHERE factoid_key = ?",
                    (to_unicode(factoid_key),))
        results = txn.fetchall()
        if len(results) > 0:
            # Check for channel match
            for row in results:
                if (row[0] == self.CHANNEL and row[1] == protocol
                        and row[2] == channel):
                    self.logger.trace(_("Match found (channel)!"))
                    return (row[3], row[4].split("\n"))
            # Check for protocol match
            for row in results:
                if row[0] == self.PROTOCOL and row[1] == protocol:
                    self.logger.trace(_("Match found (protocol)!"))
                    return (row[3], row[4].split("\n"))
            # Check for global match
            for row in results:
                if row[0] == self.GLOBAL:
                    self.logger.trace(_("Match found (global)!"))
                    return (row[3], row[4].split("\n"))
    else:
        txn.execute("SELECT location, protocol, channel, factoid_name, "
                    "info FROM factoids WHERE factoid_key = ? AND "
                    "location = ? AND protocol = ? AND channel = ?",
                    (to_unicode(factoid_key), to_unicode(location),
                     to_unicode(protocol), to_unicode(channel)))
        results = txn.fetchall()
        if len(results) > 0:
            return (results[0][3], results[0][4].split("\n"))
    raise MissingFactoidError(_("Factoid '%s' does not exist")
                              % factoid_key)
def test_to_unicode(self):
    '''Test to_unicode when the user gives good values'''
    tools.eq_(converters.to_unicode(self.u_japanese, encoding='latin1'),
              self.u_japanese)
    tools.eq_(converters.to_unicode(self.utf8_spanish), self.u_spanish)
    tools.eq_(converters.to_unicode(self.utf8_japanese), self.u_japanese)
    tools.eq_(converters.to_unicode(self.latin1_spanish, encoding='latin1'),
              self.u_spanish)
    tools.eq_(converters.to_unicode(self.euc_jp_japanese, encoding='euc_jp'),
              self.u_japanese)
    tools.assert_raises(TypeError, converters.to_unicode, *[5],
                        **{'nonstring': 'foo'})
def _ugettext(self, message):
    if not isbasestring(message):
        return u''
    if self._fallback:
        msg = to_unicode(message, encoding=self.input_charset)
        try:
            message = self._fallback.ugettext(msg)
        except (AttributeError, UnicodeError):
            # Ignore UnicodeErrors: We'll do our own decoding later
            pass
    # Make sure we're returning unicode
    return to_unicode(message, encoding=self.input_charset)
def ugettext(self, message):
    if not isinstance(message, basestring):
        return u''
    if self._fallback:
        msg = to_unicode(message, encoding=self.input_charset)
        try:
            message = self._fallback.ugettext(msg)
        except (AttributeError, UnicodeError):
            # Ignore UnicodeErrors: We'll do our own decoding later
            pass
    # Make sure we're returning unicode
    return to_unicode(message, encoding=self.input_charset)
def assign(self, **kwargs):
    if 'id' in kwargs:
        self.id = kwargs['id']
    if 'eid' in kwargs:
        self.eid = to_unicode(kwargs['eid'])
    if 'caption' in kwargs:
        self.caption = to_unicode(kwargs['caption'])
    # if 'suggestion' in kwargs:
    #     self.suggestion = to_unicode(kwargs['suggestion'])
    if 'description' in kwargs:
        self.description = to_unicode(kwargs['description'])
    if 'alias' in kwargs:
        self.alias = to_unicode(kwargs['alias'])
    if 'creation_time' in kwargs:
        self.creation_time = datetime.fromtimestamp(
            float(kwargs['creation_time']))
    if 'creator' in kwargs:
        self.creator = kwargs['creator']
    return self
def test_normalize(self, string_in, string_out, regex):
    """ Unit testing function for GreekNormalizer().normalize() (paideia_utils). """
    print 'string in', string_in
    print 'expected string out', string_out
    actual = GreekNormalizer().normalize(string_in)
    print 'actual string out', actual
    assert actual == string_out
    assert isinstance(actual, unicode)
    regex1 = re.compile(to_unicode(regex), re.I | re.U)
    assert re.match(regex1, to_unicode(actual))
def __init__(self, entryurl, gApi=None):
    # check for google api key, request if necessary
    if gApi is not None:
        self.gApi = defineApi(gApi)
    else:
        self.gApi = defineApi()
    self.entryurl = entryurl
    table = self.getHTMLTable()
    fieldList = self.getFieldList(table)
    entryList = self.getEntryList(table)
    self.ecolex_id = self.getEntry('Legislation ID number', fieldList, entryList)
    self.name = self.getEntry('Title of tex', fieldList, entryList)
    self.country = self.getEntry('Country', fieldList, entryList)
    self.date = self.getEntry('Date of tex', fieldList, entryList)
    self.legtype = self.getEntry('Type of documen', fieldList, entryList)
    self.source = self.getEntry('Source', fieldList, entryList)
    self.fulltext = self.getUrl('Link to full tex', fieldList, entryList)
    self.abstract = self.getEntry('Abstrac', fieldList, entryList)

    # concatenate subject and keywords only if there are entries
    keywordsA = self.getEntry('Keyword(s)', fieldList, entryList)
    keywordsB = self.getEntry('Subject(s)', fieldList, entryList)
    if keywordsA is not None and keywordsB is not None:
        self.keywords = keywordsA + '; ' + keywordsB
    elif keywordsA is not None:
        self.keywords = keywordsA
    elif keywordsB is not None:
        self.keywords = keywordsB
    else:
        self.keywords = None

    # check language and translate keywords and abstract if not english
    if self.abstract is not None:
        languageSample = ' '.join(self.abstract.split(' ')[0:5])
        self.language = identify(languageSample, self.gApi)
        if self.language != 'en':
            translationAB = translate(self.abstract, self.language, 'en', self.gApi)
            self.abstractEN = to_unicode(translationAB)
            translationKW = translate(self.keywords, self.language, 'en', self.gApi)
            self.keywordsEN = to_unicode(translationKW)
        else:
            self.abstractEN = self.abstract
            self.keywordsEN = self.keywords
    else:
        self.language = None
        self.abstractEN = None
        self.keywordsEN = None
def populate(comps='comps-f16', do_dependencies=True):
    from yum.comps import Comps
    session = DBSession()
    c = Comps()
    c.add('comps/%s.xml' % comps)
    for group in c.groups:
        g = Group(id=group.groupid, name=group.name,
                  description=group.description)
        session.add(g)
        for package in group.packages:
            p = session.query(Package).filter_by(
                name=to_unicode(package)).first()
            if not p:
                p = Package(name=package)
                session.add(p)
            p.group = g
            session.flush()

    root = Root(name=u'Fedora')
    session.add(root)
    session.flush()

    for category in c.categories:
        cat = Category(id=category.categoryid, name=category.name,
                       description=category.description)
        session.add(cat)
        root.categories.append(cat)
        for group in category.groups:
            g = session.query(Group).filter_by(
                group_id=to_unicode(group)).first()
            if not g:
                print "Cannot find group: %s" % group
            else:
                g.category = cat
        session.flush()

    if do_dependencies:
        for package in session.query(Package).all():
            add_dependencies(package, session)

    session.commit()
def test_exception_to_unicode_custom(self):
    # If given custom functions, then we should not mangle
    c = [lambda e: converters.to_unicode(e, encoding='euc_jp')]
    tools.ok_(converters.exception_to_unicode(self.exceptions['euc_jpn'],
                                              converters=c)
              == self.u_japanese)
    c.extend(converters.EXCEPTION_CONVERTERS)
    tools.ok_(converters.exception_to_unicode(self.exceptions['euc_jpn'],
                                              converters=c)
              == self.u_japanese)

    c = [lambda e: converters.to_unicode(e, encoding='latin1')]
    tools.ok_(converters.exception_to_unicode(self.exceptions['latin1_spanish'],
                                              converters=c)
              == self.u_spanish)
    c.extend(converters.EXCEPTION_CONVERTERS)
    tools.ok_(converters.exception_to_unicode(self.exceptions['latin1_spanish'],
                                              converters=c)
              == self.u_spanish)
def _get_user_txn(self, txn, user, protocol):
    user = user.lower()
    user = to_unicode(user)
    txn.execute(u"SELECT * FROM users WHERE user=? AND protocol=?",
                (user, protocol))
    r = txn.fetchone()
    return r
def brute_txt(fn):
    """ Convert anything to txt """
    # if url, send there
    if not os.path.exists(fn):
        print('! No filename found')
        return ''

    # get ext
    ext = os.path.splitext(fn)[-1][1:]
    txt = ''

    # epub
    if ext in {'epub'}:
        txt = epub2txt(fn)
    elif ext in {'xml', 'html', 'htm'}:
        with open(fn) as f:
            content = f.read()
        txt = xml2txt(content, CONTENT_TAGS[ext])
    elif ext in {'txt'}:
        with open(fn, 'rb') as f:
            content = f.read()
        return to_unicode(content)
    elif ext in {'pdf'}:
        txt = pdf2txt(fn)
    else:
        import fulltext
        txt = fulltext.get(fn)

    if not txt:
        return ''

    # clean
    txt = txt.replace('\xa0', ' ')
    if 'project gutenberg ebook' in txt.lower():
        txt = clean_gutenberg(txt)
    return txt
def lngettext(self, msgid1, msgid2, n):
    if n == 1:
        tmsg = msgid1
    else:
        tmsg = msgid2
    if not isinstance(msgid1, basestring):
        return ''
    msgid1 = to_unicode(msgid1, encoding=self.input_charset)
    try:
        #pylint:disable-msg=E1101
        tmsg = self._catalog[(msgid1, self.plural(n))]
    except KeyError:
        if self._fallback:
            try:
                tmsg = self._fallback.ngettext(msgid1, msgid2, n)
            except UnicodeError:
                # Ignore UnicodeErrors: We'll do our own encoding next
                pass
    # Make sure that we're returning a str
    if self._output_charset:
        return to_bytes(tmsg, encoding=self._output_charset,
                        nonstring='empty')
    return to_bytes(tmsg, encoding=locale.getpreferredencoding(),
                    nonstring='empty')
def translate_command(self, protocol, caller, source, command, raw_args,
                      parsed_args):
    if len(parsed_args) < 2:
        caller.respond(
            "Usage: {CHARS}" + command + " <languages> <text>"
        )
        return

    langs = parsed_args[0]
    text = u" ".join([to_unicode(x) for x in parsed_args[1:]])

    if u":" in langs:
        split = langs.split(u":")
        from_lang, to_lang = split[0], split[1]
    else:
        from_lang, to_lang = u"", langs

    try:
        translation = self.goslate.translate(text, to_lang, from_lang)
        source.respond(u"[{}] {}".format(to_lang, translation))
    except Error as e:
        source.respond(u"Translation error: {}".format(e))
    except Exception as e:
        self.logger.exception("Translation error")
        source.respond(u"Translation error: {}".format(e))
def _insert_or_update_user(self, txn, user, protocol):
    user = user.lower()
    user = to_unicode(user)
    txn.execute("SELECT * FROM users WHERE user=? AND protocol=?",
                (user, protocol))
    r = txn.fetchone()
    now = time.time()
    if r is None:
        txn.execute(
            "INSERT INTO users VALUES (?, ?, ?)",
            (user, protocol, now)
        )
        return False
    else:
        txn.execute(
            "UPDATE users SET at=? WHERE user=? AND protocol=?",
            (now, user, protocol)
        )
        return True
def main(argv):
    if len(argv) == 3:
        output = open(argv[2], 'w')
    elif len(argv) == 2:
        print("Default output file used: \"output.txt\"")
        output = open("output.txt", 'w')
    else:
        print("Usage: ./srcYUML2graphViz.py [inputFile] [outputFile]")
        exit(1)

    output.write("digraph hierarchy {\nsize=\"5, 5\"\n")
    output.write("node[shape=record,style=filled,fillcolor=gray95]\n")
    output.write(
        "edge[dir=\"both\", arrowtail=\"empty\", arrowhead=\"empty\", labeldistance=\"2.0\"]\n"
    )

    file = open(argv[1], "rb")  # rb is R-read and B-binary
    input_str = to_unicode(file.read())
    file.close()

    input = InputStream(input_str)
    lexer = srcYUML2graphVizLexer(input)
    stream = CommonTokenStream(lexer)
    parser = srcYUML2graphVizParser(stream)
    tree = parser.yuml()
    relay = Relay(output)  # realization of Listener
    walker = ParseTreeWalker()
    walker.walk(relay, tree)
    # enterYuml(self, tree)
    # ok here is where I need to start learning to navigate the parse tree
    # print(Trees.toStringTree(tree, None, parser))
    output.close()
def _ugettext(self, message):
    if not isbasestring(message):
        return u''
    message = to_unicode(message, encoding=self.input_charset)
    try:
        message = self._catalog[message]  #pylint:disable-msg=E1101
    except KeyError:
        if self._fallback:
            try:
                message = self._fallback.ugettext(message)
            except (AttributeError, UnicodeError):
                # Ignore UnicodeErrors: We'll do our own encoding next
                pass
    # Make sure that we're returning unicode
    return to_unicode(message, encoding=self.input_charset)
def _lngettext(self, msgid1, msgid2, n):
    if n == 1:
        tmsg = msgid1
    else:
        tmsg = msgid2
    if not isbasestring(msgid1):
        return ''
    u_msgid1 = to_unicode(msgid1, encoding=self.input_charset)
    try:
        #pylint:disable-msg=E1101
        tmsg = self._catalog[(u_msgid1, self.plural(n))]
    except KeyError:
        if self._fallback:
            try:
                tmsg = self._fallback.lngettext(msgid1, msgid2, n)
            except (AttributeError, UnicodeError):
                # Ignore UnicodeErrors: We'll do our own encoding next
                pass
    # Next decide what encoding to use for the strings we return
    output_encoding = (self._output_charset
                       or locale.getpreferredencoding())
    return self._reencode_if_necessary(tmsg, output_encoding)
def update_details(self, bug: typing.Union[typing.Any, None],
                   bug_entity: 'models.Bug'):
    """
    Update the details on bug_entity to match what is found in Bugzilla.

    Args:
        bug: The Bugzilla Bug we will use to update our own Bug object
            from. If None, bug_entity.bug_id will be used to fetch the
            object from Bugzilla.
        bug_entity: The bug we wish to update.
    """
    if not bug:
        try:
            bug = self.bz.getbug(bug_entity.bug_id)
        except xmlrpc_client.Fault as err:
            if err.faultCode == 102:
                bug_entity.title = 'Private bug'
                bug_entity.private = True
                log.info("Marked bug #" + str(bug_entity.bug_id)
                         + " as private.")
            else:
                bug_entity.title = 'Invalid bug number'
                log.error("Got fault from Bugzilla: fault code: %d, "
                          "fault string: %s"
                          % (err.faultCode, err.faultString))
            return
        except Exception:
            log.exception("Unknown exception from Bugzilla")
            return

    if bug.product == 'Security Response':
        bug_entity.parent = True

    bug_entity.title = to_unicode(bug.short_desc)

    if isinstance(bug.keywords, str):
        keywords = bug.keywords.split()
    else:
        # python-bugzilla 0.8.0+
        keywords = bug.keywords
    if 'security' in [keyword.lower() for keyword in keywords]:
        bug_entity.security = True
def _reencode_if_necessary(self, message, output_encoding):
    '''Return a byte string that's valid in a specific charset.

    .. warning:: This method may mangle the message if the input encoding
        is not known or the message isn't representable in the chosen
        output encoding.
    '''
    valid = False
    msg = None
    try:
        valid = byte_string_valid_encoding(message, output_encoding)
    except TypeError:
        # input was unicode, so it needs to be encoded
        pass
    if valid:
        return message
    try:
        # Decode to unicode so we can re-encode to desired encoding
        msg = to_unicode(message, encoding=self.input_charset,
                         nonstring='strict')
    except TypeError:
        # Not a string; return an empty byte string
        return ''
    # Make sure that we're returning a str of the desired encoding
    return to_bytes(msg, encoding=output_encoding)
def process_input(text, stop_word, stop_symbols_string):
    try:
        temp = to_bytes(text.lower())
        # temp = str(temp).decode(encoding="utf-8")

        # REMOVE DATA IN BRACKETS: strip (....) from the value
        temp = remove_info_in_bracket(temp)

        # REMOVE STOP WORDS
        if len(stop_word) > 0:
            temp = remove_stop_words(temp, stop_word)

        # REMOVE STOP SYMBOLS OR CHARACTERS
        if stop_symbols_string is not None and len(stop_symbols_string) > 0:
            pattern = str("[{}]".format(
                str(stop_symbols_string).strip())).replace(" ", "")
            temp = re.sub(pattern, "", temp)

        return to_unicode(temp)
    except Exception as error:
        print "!!!!!!!!!!!!! PROBLEM !!!!!!!!!!!!!!!!!!!"
        print str(error.message)
        return text
def textPreprocess(text):
    # load dictionary of specialist lexicon
    global medical
    if not medical:
        file = open('./dictionary_files/medical.pkl', 'r')
        medical = pickle.load(file)
        file.close()

    # Force all the text to be of the same type, deals with accented letters
    text = to_unicode(text)
    # Convert to lower case
    text = text.lower()

    # Split text into sentences
    sentence_token = nltk.data.load('tokenizers/punkt/english.pickle')
    sentences = sentence_token.tokenize(text.strip())

    text = []
    for sentence in sentences:
        # Split on non-alphanumeric and non-hyphen characters and keep delimiter
        sentence = re.split(r"([^\w\-]+)||\b", sentence)
        # Delete whitespace tokens
        sentence = [word.replace(' ', '') for word in sentence]
        sentence = filter(None, sentence)

        # Look up variable-length sequences of words in the medical
        # dictionary, stemming them if not present
        numTokens = 5  # phrases up to 5 words long
        while numTokens > 0:
            processedText = []
            start = 0
            # Check each phrase of n tokens while there are sufficient tokens after
            while start <= (len(sentence) - numTokens):
                phrase = sentence[start]
                nextToken = 1
                while nextToken < numTokens:
                    # add the next tokens to the current one
                    phrase = phrase + " " + sentence[start + nextToken]
                    nextToken += 1
                if phrase in medical:
                    # convert tokens to one token from specialist
                    processedText.append(medical[phrase])
                    # skip the next tokens
                    start += numTokens
                elif numTokens == 1:
                    # individual tokens: stem them if not in specialist and keep
                    processedText.append(
                        stem.snowball.EnglishStemmer().stem(phrase))
                    start += 1
                else:
                    # token not part of phrase, keep
                    processedText.append(sentence[start])
                    start += 1
            # Keep remaining tokens without enough tokens after them
            while start < len(sentence):
                processedText.append(sentence[start])
                start += 1
            sentence = processedText
            numTokens -= 1
        text.append(sentence)
        # text.append(["end_rep"])
    return text
def test_guess_encoding_with_chardet_uninstalled(self):
    if chardet:
        raise SkipTest('chardet installed, euc_jp will not be mangled')
    else:
        tools.ok_(to_unicode(self.euc_jp_japanese,
                             misc.guess_encoding(self.euc_jp_japanese))
                  == self.u_mangled_euc_jp_as_latin1)
def from_files(cls, filenames, *args, **kwds):
    """Create Verbatims instance from verbfiles."""
    cls.logger.info('Getting verbatims from file(s):\t%s...', filenames)
    verbs = []
    for filename in filenames:
        with open(filename) as infile:
            incsv = csv.DictReader(infile)
            sql_id_fieldname = None
            code_fieldname = None
            text_fieldname = None
            for fieldname in incsv.fieldnames:
                if re.search(r'(verb|dc).*id', fieldname, re.I):
                    sql_id_fieldname = fieldname
                elif re.search(r'\b(code|label)', fieldname, re.I):
                    digit = re.search(r'\d+', fieldname)
                    if not digit or int(digit.group(0)) == 1:
                        code_fieldname = fieldname
                elif re.search(r'(verb)?.*(text|original)', fieldname, re.I):
                    text_fieldname = fieldname
            for i, row in enumerate(incsv):
                verb = Verbatim(
                    sql_id=row.get(sql_id_fieldname, i),
                    code=row.get(code_fieldname, None),
                    text=to_unicode(row[text_fieldname]))
                verbs.append(verb)
    verbs = cls(verbs, *args, **kwds)
    cls.logger.debug('Retrieved %d verbatims', len(verbs))
    return verbs
def update_details(self, bug, bug_entity):
    """
    Update the details on bug_entity to match what is found in Bugzilla.

    Args:
        bug (bugzilla.bug.Bug or None): The Bugzilla Bug we will use to
            update our own Bug object from. If None, bug_entity.bug_id
            will be used to fetch the object from Bugzilla.
        bug_entity (bodhi.server.models.Bug): The bug we wish to update.
    """
    if not bug:
        try:
            bug = self.bz.getbug(bug_entity.bug_id)
        except xmlrpc_client.Fault:
            bug_entity.title = 'Invalid bug number'
            log.exception("Got fault from Bugzilla")
            return
        except Exception:
            log.exception("Unknown exception from Bugzilla")
            return

    if bug.product == 'Security Response':
        bug_entity.parent = True

    bug_entity.title = to_unicode(bug.short_desc)

    if isinstance(bug.keywords, six.string_types):
        keywords = bug.keywords.split()
    else:
        # python-bugzilla 0.8.0+
        keywords = bug.keywords
    if 'security' in [keyword.lower() for keyword in keywords]:
        bug_entity.security = True
def import_koji_pkgs():
    """
    Get the latest packages from koji.

    These might not have made it into yum yet, so we won't even check for
    their summary until later.
    """
    log.info("Importing koji packages")
    import koji
    session = koji.ClientSession("https://koji.fedoraproject.org/kojihub")
    count = 0
    tagbp = 230  # id of el6-docs tag to bypass
    packages = session.listPackages()
    log.info("Looking through %i packages from koji." % len(packages))
    for package in packages:
        name = to_unicode(package['package_name'])
        pkg_tagstatus = session.getPackageConfig(tagbp, package['package_id'])
        if pkg_tagstatus is not None:
            log.info("Package %s is tagged with el6-docs and will be skipped"
                     % name)
            continue  # skipping if the package is tagged
        try:
            p = m.Package.by_name(ft.SESSION, name)
        except NoResultFound:
            log.debug(name + ' -')
            count += 1
            ft.SESSION.add(m.Package(name=name, summary=u''))
    log.info("Got %i new packages from koji (with no summaries yet)" % count)
def ugettext(self, message):
    if not isinstance(message, basestring):
        return u''
    message = to_unicode(message, encoding=self.input_charset)
    try:
        message = self._catalog[message]  #pylint:disable-msg=E1101
    except KeyError:
        if self._fallback:
            try:
                message = self._fallback.ugettext(message)
            except (AttributeError, UnicodeError):
                # Ignore UnicodeErrors: We'll do our own encoding next
                pass
    # Make sure that we're returning unicode
    return to_unicode(message, encoding=self.input_charset)
def write_string_tag(self, attribute, value, tag, last):
    """ Write a [predicate Value@tag] line """
    # Make sure attribute is okay for URI
    attribute = self.check_for_uri(attribute)
    last = u'.' if last is True else u';'
    if value is not None:
        value = value.strip()
        # print attribute + " " + value + " " + tag
        value = to_unicode(value)
        if value != "":
            to_write = to_unicode(
                self.inlineFormat.format(
                    attribute, self.triple_value_tag(value, tag), last))
            self.turtleWriter.write(to_write)
def lngettext(self, msgid1, msgid2, n):
    if n == 1:
        tmsg = msgid1
    else:
        tmsg = msgid2
    if not isinstance(msgid1, basestring):
        return ''
    u_msgid1 = to_unicode(msgid1, encoding=self.input_charset)
    try:
        #pylint:disable-msg=E1101
        tmsg = self._catalog[(u_msgid1, self.plural(n))]
    except KeyError:
        if self._fallback:
            try:
                tmsg = self._fallback.ngettext(msgid1, msgid2, n)
            except (AttributeError, UnicodeError):
                # Ignore UnicodeErrors: We'll do our own encoding next
                pass
    # Next decide what encoding to use for the strings we return
    output_encoding = (self._output_charset
                       or locale.getpreferredencoding())
    return self._reencode_if_necessary(tmsg, output_encoding)
def capitalize(letter):
    """ Convert string to upper case in a utf-8 safe way. """
    letter = to_unicode(letter, encoding='utf8')
    newletter = letter.upper()
    return to_bytes(newletter, encoding='utf8')
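# Hedged usage note for the fixed capitalize() above: decoding to unicode
# before .upper() lets accented characters fold correctly, and the result
# is re-encoded so callers keep receiving utf-8 bytes.
def demo_capitalize():
    print(capitalize(u'espa\xf1ol'.encode('utf8')))  # ESPAÑOL as utf-8 bytes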
def get_unicode(string, encoding='utf-8', errors='replace'):
    """Force a failsafe conversion to unicode."""
    # if the value is not None, try to convert it to unicode
    if string:
        try:
            RV = to_unicode(string, encoding, errors)
        except Exception:
            encoding = chardet.detect(string)["encoding"]
            RV = to_unicode(string, encoding, errors)
    # if it is None, do not convert to unicode
    else:
        RV = string
    return RV
def exceptions_search(self, uuid, page, search):
    uuid = to_unicode(uuid)
    search = urllib.unquote(search)

    try:
        page = int(page)
    except Exception:
        return abort(404, "Page not found")

    if page < 1:
        return abort(404, "Page not found")

    db = self.manager.mongo
    bots = db.get_collection("bots")
    exceptions = db.get_collection("exceptions")

    now = datetime.datetime.utcnow()
    last_online = now - datetime.timedelta(minutes=10)
    online = bots.find({
        "last_seen": {"$gt": last_online}
    }).count()

    logged_num = exceptions.find({
        "uuid": uuid,
        "traceback": {"$regex": "/%s/" % search}
    }).count()

    if logged_num < 1:
        return template(
            "templates/exceptions_form.html",
            online=online,
            error="No exceptions have been logged for the UUID '%s' "
                  "with the search string '%s'" % (uuid, search)
        )

    pages = int(logged_num) / 10
    overhang = int(logged_num) % 10
    if overhang > 0:
        pages += 1

    start = (page * 10) - 10
    limit = 10

    if page > pages:
        return abort(404, "Page not found")

    data = exceptions.find({
        "uuid": uuid,
        "traceback": {"$regex": "/%s/" % search}
    }, skip=start, limit=limit, sort=[("date", DESCENDING)])

    return template("templates/exceptions.html", online=online, error=None,
                    cur_page=page, max_page=pages, data=data, uuid=uuid,
                    search=search)
def _delete_factoid_interaction(self, txn, factoid_key, location, protocol,
                                channel):
    """
    Deletes a factoid if it exists, otherwise raises MissingFactoidError
    """
    self.logger.trace("DELETE | Key: %s | Loc: %s | Pro: %s | Cha: %s"
                      % (factoid_key, location, protocol, channel))
    if location == self.CHANNEL:
        txn.execute(
            "DELETE FROM factoids WHERE factoid_key = ? AND "
            "location = ? AND protocol = ? AND channel = ?",
            (to_unicode(factoid_key), to_unicode(location),
             to_unicode(protocol), to_unicode(channel)))
    else:
        txn.execute(
            "DELETE FROM factoids WHERE factoid_key = ? AND "
            "location = ? AND protocol = ?",
            (to_unicode(factoid_key), to_unicode(location),
             to_unicode(protocol)))
    if txn.rowcount == 0:
        raise MissingFactoidError(
            _("Factoid '%s' does not exist") % factoid_key)
    e = FactoidDeletedEvent(self, factoid_key)
    self.events.run_callback("Factoids/Deleted", e, from_thread=True)
def test_guess_encoding_with_chardet_installed(self):
    if chardet:
        tools.ok_(to_unicode(self.euc_jp_japanese,
                             misc.guess_encoding(self.euc_jp_japanese))
                  == self.u_japanese)
    else:
        raise SkipTest(
            'chardet not installed, euc_jp will not be guessed correctly')