def serializeFields(data):
    """Turns every attribute of the Mambu object into a string representation.

    If the object is an iterable one, it goes down to each of its
    elements and turns its attributes too, recursively.

    The base case is when it's a MambuStruct class (this one), so it
    just 'serializes' the attr attribute. Refer to
    MambuStruct.serializeStruct pydoc.

    This is perhaps the worst way to do it, still looking for a better
    way.
    """
    if isinstance(data, MambuStruct):
        return data.serializeStruct()
    try:
        it = iter(data)
    except TypeError:
        return unicode(data)
    if type(it) == type(iter([])):
        l = []
        for e in it:
            l.append(MambuStruct.serializeFields(e))
        return l
    elif type(it) == type(iter({})):
        d = {}
        for k in it:
            d[k] = MambuStruct.serializeFields(data[k])
        return d
    # elif ... tuples? sets?
    return unicode(data)
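# A hypothetical usage sketch (the exact shape of the Mambu data is an
# assumption): nested containers come back with every leaf stringified.
#
#     MambuStruct.serializeFields({'id': 123, 'amounts': [1.5, 2]})
#     # -> {'id': u'123', 'amounts': [u'1.5', u'2']}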
def define_viable(self, value):
    try:
        if unicode(value)[-1] == u'%':
            viable = float(unicode(value)[:-1]) / 100
        else:
            viable = float(value)
    except ValueError:
        print("Invalid viable value '%s'" % value)
    else:
        self._viable = viable
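# Hypothetical calls illustrating the percent handling above: a trailing
# '%' is parsed as a fraction, anything else goes straight through float().
#
#     obj.define_viable(u'75%')   # stores self._viable = 0.75
#     obj.define_viable('0.9')    # stores self._viable = 0.9
#     obj.define_viable('n/a')    # prints "Invalid viable value 'n/a'"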
def __new__(cls, s, *args, **kwargs):
    if isinstance(s, _):
        s = unicode(s.untranslated)
    if translator:
        trans = translator(s, *args, **kwargs)
        obj = super(_, cls).__new__(cls, trans, *args, **kwargs)
    else:
        obj = super(_, cls).__new__(cls, s, *args, **kwargs)
    obj.untranslated = unicode(s)
    obj._additionals = []
    return obj
def define_variance(self, seq, value):
    seq = unicode(seq)  # ensure sequence is a string
    try:
        if unicode(value)[-1] == u'%':
            cv = float(unicode(value)[:-1]) / 100
        else:
            cv = float(value)
    except ValueError:
        print("Invalid cv value '%s'" % value)
    else:
        self._measure_data[seq]['variance'] = cv
def create_categories(connection):
    for linea in category_table_data:
        if category_table_data[linea][1] is not None:
            ac = Category(id=unicode(category_table_data[linea][0]),
                          name=linea,
                          parent=unicode(category_table_data[linea][1]),
                          balance=0)
        else:
            ac = Category(id=unicode(category_table_data[linea][0]),
                          name=linea,
                          balance=0)
        connection.add(ac)
def feed_ctags(self, tagsfile_obj):
    for l in tagsfile_obj:
        if not isPython3:
            l = builtins.unicode(l, 'utf8', 'replace')
        if l.startswith('!'):  # skip ctags metadata headers
            continue
        fields = l.split('\t')
        m = fields[0]
        fil = fields[1]
        pat = fields[2]
        # typ = fields[3]
        klass = None
        try:
            ext = fields[4]
            if ext and ext.startswith('class:'):
                klass = ext.split(':', 1)[1].strip()
            idd = self.class_id(klass)
        except IndexError:
            ext = None
            # class id 0 = function
            idd = 0
        c = self.cursor()
        fid = self.file_id(fil)
        c.execute('insert into function(class, name, searchpattern, file) '
                  'values (?, ?, ?, ?)', [idd, m, pat, fid])
    self.dbconn.commit()
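# For reference, a hypothetical ctags line in the tab-separated format
# feed_ctags expects (name, file, search pattern, kind, extension fields):
#
#     render\tviews.py\t/^    def render(self):$/;"\tm\tclass:Page
#
# fields[0] is the symbol name, fields[1] the file, fields[2] the search
# pattern, and fields[4] the optional 'class:' extension field used to
# resolve the owning class.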
def __call__(self, text):
    # preprocessing
    text = unicode(text)
    text = normalize_numbers(text)
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # tokenization
    words = hazm.word_tokenize(text)

    # steps
    prons = []
    for word in words:
        if not any(letter in word for letter in self.graphemes):
            pron = [word]
        elif word in self.tihu:  # lookup tihu dict
            pron = self.tihu[word]
        else:  # predict for oov
            pron = self.predict(word)
        prons.extend(pron)
        prons.extend([" "])
    return prons[:-1]
def __valid_ip(self, value):
    try:
        # is_global is False for private, loopback and reserved ranges
        if not ipaddress.ip_address(unicode(value)).is_global:
            return None
    except ValueError:  # not a parseable IP address
        return None
    return value
def predict(words, sess):
    '''
    Returns predicted pronunciation of `words` which do NOT exist in the dictionary.
    :param words: A list of words.
    :return: pron: A list of phonemes.
    '''
    if len(words) > hp.batch_size:
        # process one batch now, recurse on the remainder
        after = predict(words[hp.batch_size:], sess)
        words = words[:hp.batch_size]
    else:
        after = []

    x = np.zeros((len(words), hp.maxlen), np.int32)  # 0: <PAD>
    for i, w in enumerate(words):
        for j, g in enumerate((w + "E")[:hp.maxlen]):
            x[i][j] = g2idx.get(g, 2)  # 2: <UNK>

    # Autoregressive inference
    preds = np.zeros((len(x), hp.maxlen), np.int32)
    for j in range(hp.maxlen):
        _preds = sess.run(graph.preds, {graph.x: x, graph.y: preds})
        preds[:, j] = _preds[:, j]

    # convert to string
    pron = []
    for i in range(len(preds)):
        p = [u"%s" % unicode(idx2p[idx]) for idx in preds[i]]  # Make p into unicode.
        if "<EOS>" in p:
            eos = p.index("<EOS>")
            p = p[:eos]
        pron.append(p)
    return pron + after
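# Hypothetical call (hp, graph, g2idx and idx2p are module-level objects
# assumed by predict); the recursion on words[hp.batch_size:] keeps the
# output aligned with the input order:
#
#     predict(['zelda', 'xylograph'], sess)
#     # -> one phoneme list per word, e.g. [['Z', 'EH1', ...], [...]]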
def _text_preprocessing(text):
    text = unicode(text)
    # strip combining accents via NFD decomposition
    text = ''.join(char for char in unicodedata.normalize('NFD', text)
                   if unicodedata.category(char) != 'Mn')
    text = text.lower()
    text = re.sub("[^ a-z'\".,?!()\[\]:;\-]", "", text)
    return text
def define_deviation(self, seq, value):
    seq = unicode(seq)  # ensure sequence is a string
    try:
        dev = float(value)
    except ValueError:
        print("Invalid deviation value")
    else:
        self._measure_data[seq]['deviation'] = dev
def english_text_preprocessing(text, lower=True):
    text = unicode(text)
    text = ''.join(char for char in unicodedata.normalize('NFD', text)
                   if unicodedata.category(char) != 'Mn')
    text = ''.join(char if char not in SYNOGLYPH2ASCII else SYNOGLYPH2ASCII[char]
                   for char in text)
    if lower:
        text = text.lower()
    return text
def JSONResponse(obj, start_response):
    """
    JSONResponse
    """
    if isstring(obj):
        res = obj
    elif isinstance(obj, (dict, list)):
        res = unicode(json.dumps(obj))
    else:
        res = obj
    return httpResponse(res, "200 OK", start_response)
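# A sketch of how this might be wired into a WSGI handler (httpResponse
# and isstring are assumed helpers from the surrounding framework):
#
#     def application(environ, start_response):
#         return JSONResponse({"status": "ok"}, start_response)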
def define_measurement(self, seq, ptype, value):
    seq = unicode(seq)  # ensure sequence is a string
    try:
        avg = float(value)
    except ValueError:
        print("Invalid average value")
    else:
        self._measure_data[seq].update({
            'ptype': ptype,
            'value': avg,
        })
def create_accounts(connection):
    for linea in acount_table_data:
        id_acc_type = unicode(Acounttype().get_one(
            acount_table_data[linea][2]).id)
        id_currency = unicode(Currency().get_one(
            acount_table_data[linea][3]).id)
        if acount_table_data[linea][1] is not None:
            ac = Account(id=unicode(acount_table_data[linea][0]),
                         name=linea,
                         parent=unicode(acount_table_data[linea][1]),
                         id_account_type=id_acc_type,
                         id_currency=id_currency,
                         balance=0)
        else:
            ac = Account(id=unicode(acount_table_data[linea][0]),
                         name=linea,
                         id_account_type=id_acc_type,
                         id_currency=id_currency,
                         balance=0)
        connection.add(ac)
def process_django_model(app, what, name, obj, options, lines):
    # This causes import errors if left outside the function
    from django.db import models

    # Only look at objects that inherit from Django's base model class
    if inspect.isclass(obj) and issubclass(obj, models.Model):
        # Grab the field list from the meta class
        fields = obj._meta.fields
        for field in fields:
            # Decode and strip any html out of the field's help text
            help_text = strip_tags(unicode(field.help_text))
            # Decode and capitalize the verbose name, for use if there
            # isn't any help text
            verbose_name = unicode(field.verbose_name).capitalize()
            if help_text:
                # Add the model field to the end of the docstring as a
                # param, using the help text as the description
                lines.append(":param {}: {}".format(field.attname, help_text))
            else:
                # Add the model field to the end of the docstring as a
                # param, using the verbose name as the description
                lines.append(":param {}: {}".format(field.attname, verbose_name))
            # Add the field's type to the docstring
            if isinstance(field, (models.ForeignKey, models.OneToOneField,
                                  models.ManyToManyField)):
                lines.append(":type %s: %s to :class:`%s.%s`" % (
                    field.attname,
                    type(field).__name__,
                    field.related_model.__module__,
                    field.related_model.__name__,
                ))
            else:
                lines.append(":type {}: {}".format(field.attname,
                                                   type(field).__name__))
    # Return the extended docstring
    return lines
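# This signature matches Sphinx's `autodoc-process-docstring` event; a
# typical conf.py hookup would look like this (a sketch, not necessarily
# this project's code):
#
#     def setup(app):
#         app.connect('autodoc-process-docstring', process_django_model)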
def SQL_EXEC(sql, args):
    """
    SQL_EXEC - run a query on a file.sql
    """
    try:
        env = mapify(args, sep=' ', kvsep='=', strip_char=' ', glue='"')
        res = SqliteDB.ExecuteP(sql, env, outputmode='response', verbose=False)
        return unicode(json.dumps(res))
    except Exception as ex:
        manage(ex)
        return 0
def loadCss(self):
    try:
        f = open(self.cssFileInPlugin, 'r')
        css = unicode(f.read())
        f.close()
    except Exception as e:
        log(e)
        css = u''
    return css
def normalize(sentence):
    """
    Normalize English text.
    """
    # preprocessing
    sentence = unicode(sentence)
    sentence = normalize_numbers(sentence)
    sentence = ''.join(char for char in unicodedata.normalize('NFD', sentence)
                       if unicodedata.category(char) != 'Mn')  # Strip accents
    sentence = sentence.lower()
    sentence = re.sub(r"[^ a-z'.,?!\-]", "", sentence)
    sentence = sentence.replace("i.e.", "that is")
    sentence = sentence.replace("e.g.", "for example")
    return sentence
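# An example of the normalization above (assuming normalize_numbers
# spells out digits as words):
#
#     normalize(u"Dr. Smith bought 2 apples, e.g. Fujis!")
#     # -> "dr. smith bought two apples, for example fujis!"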
def postprocess(self):
    """Postprocessing.

    If any element of the addresses was converted to anything but a
    string, it gets converted back to a string (unicode). Address
    elements matter for what they say, not for what type they hold.

    .. todo:: do the same thing to the 'address' field created on
              preprocessing.
    """
    try:
        for name, item in self['addresses'][0].items():
            try:
                if name == "indexInList":
                    continue
                self['addresses'][0][name] = unicode(self['addresses'][0][name])
                self['address'][name] = unicode(self['address'][name])
            except AttributeError:
                pass
    except (KeyError, IndexError):
        pass
    super(MambuClient, self).postprocess()
def stringify(blob):
    retstr = ''
    if not blob:
        return ''  # we were passed nothing, so return nothing
    elif isinstance(blob, list):
        for e in blob:
            retstr += stringify(e)
    elif isinstance(blob, dict):
        for k, v in blob.items():
            retstr += stringify(unicode(k))
            # note: unicode(v) means nested containers under dict values
            # are flattened via their repr rather than recursively
            retstr += stringify(unicode(v))
    elif isinstance(blob, str):
        retstr += unicode(blob)
    elif isinstance(blob, bytes):
        retstr += unicode(blob)
    elif isinstance(blob, unicode):
        retstr += blob
    else:
        raise Exception("unknown type: %s" % str(type(blob)))
    return retstr
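# Hypothetical call: containers flatten to one unicode string (dict
# iteration order applies):
#
#     stringify([u'a', u'b', {u'k': u'v'}])   # -> u'abkv'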
def process_django_model(app, what, name, obj, options, lines):
    # This causes import errors if left outside the function
    from django.db import models

    # Only look at objects that inherit from Django's base model class
    if inspect.isclass(obj) and issubclass(obj, models.Model):
        # Grab the field list from the meta class
        fields = obj._meta.fields
        for field in fields:
            # Decode and strip any html out of the field's help text
            help_text = strip_tags(unicode(field.help_text))
            # Decode and capitalize the verbose name, for use if there
            # isn't any help text
            verbose_name = unicode(field.verbose_name).capitalize()
            if help_text:
                # Add the model field to the end of the docstring as a
                # param, using the help text as the description
                lines.append(u':param %s: %s' % (field.attname, help_text))
            else:
                # Add the model field to the end of the docstring as a
                # param, using the verbose name as the description
                lines.append(u':param %s: %s' % (field.attname, verbose_name))
            # Add the field's type to the docstring
            if isinstance(field, (models.ForeignKey, models.OneToOneField,
                                  models.ManyToManyField)):
                lines.append(u':type %s: %s to :class:`%s.%s`'
                             % (field.attname, type(field).__name__,
                                field.rel.to.__module__, field.rel.to.__name__))
            else:
                lines.append(u':type %s: %s' % (field.attname,
                                                type(field).__name__))
    # Return the extended docstring
    return lines
def __call__(self, text):
    # preprocessing
    text = unicode(text)
    text = normalize_numbers(text)
    text = ''.join(char for char in unicodedata.normalize('NFD', text)
                   if unicodedata.category(char) != 'Mn')  # Strip accents
    text = text.lower()
    text = re.sub("[^ a-z'.,?!\-]", "", text)
    text = text.replace("i.e.", "that is")
    text = text.replace("e.g.", "for example")

    # tokenization
    words = word_tokenize(text)
    tokens = pos_tag(words)  # tuples of (word, tag)

    # steps: build a space-separated string of per-word pronunciations
    prons2 = ""
    for word, pos in tokens:
        if re.search("[a-z]", word) is None:
            pron = [word]
        elif word in self.homograph2features:  # Check homograph
            pron1, pron2_, pos1 = self.homograph2features[word]
            if pos.startswith(pos1):
                pron = pron1
            else:
                pron = pron2_
        elif word in self.cmu:  # lookup CMU dict
            pron = self.cmu[word][0]
        else:  # predict for oov
            pron = self.predict(word)
        # concatenate this word's phonemes into one token
        new_word = ''
        for i in np.arange(np.size(pron)):
            new_word += pron[i]
        prons2 += new_word
        prons2 += ' '
    return prons2
def g2p(text):
    '''
    Returns the pronunciation of text.
    :param text: A string. A sequence of words.
    :return: A list of phonemes.
    '''
    # normalization
    text = unicode(text)
    text = ''.join(char for char in unicodedata.normalize('NFD', text)
                   if unicodedata.category(char) != 'Mn')  # Strip accents
    text = text.lower()

    # tokenization
    tokens = tokenize(text)

    # g2p: look tokens up first, remember out-of-vocabulary ones
    oovs, u_loc = [], []
    ret = []
    for token in tokens:
        pron = token2pron(token)  # list of phonemes
        if pron == []:  # oov
            oovs.append(token[0])
            u_loc.append(len(ret))
        ret.extend(pron)
        ret.extend([" "])

    if len(oovs) > 0:
        global g_sess
        if g_sess is not None:  # check global session
            prons = predict(oovs, g_sess)
            for i in range(len(oovs) - 1, -1, -1):
                ret = ret[:u_loc[i]] + prons[i] + ret[u_loc[i]:]
        else:
            # If the global session is not defined, make a new local one.
            with tf.Session(graph=g, config=config) as sess:
                saver.restore(
                    sess,
                    tf.train.latest_checkpoint(os.path.join(dirname, hp.logdir)))
                prons = predict(oovs, sess)
                for i in range(len(oovs) - 1, -1, -1):
                    ret = ret[:u_loc[i]] + prons[i] + ret[u_loc[i]:]
    return ret[:-1]
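# Hypothetical call (tokenize, token2pron and the TensorFlow session
# objects come from the surrounding module):
#
#     g2p(u"hello zxqwv")
#     # dictionary words resolve through token2pron; 'zxqwv' falls back
#     # to the seq2seq predict() path.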
def JSONResponse(obj, start_response):
    """
    JSONResponse
    """
    if isstring(obj):
        text = obj
    elif isinstance(obj, (dict, list)):
        text = unicode(json.dumps(obj))
    else:
        text = obj
    body = text.encode('utf-8')
    # Content-Length must be the byte length, not the character count.
    response_headers = [('Content-type', 'application/json'),
                        ('Content-Length', str(len(body)))]
    if start_response:
        start_response("200 OK", response_headers)
    return [body]
def __call__(self, text):
    # preprocessing
    text = unicode(text)
    text = normalize_numbers(text)
    text = ''.join(char for char in unicodedata.normalize('NFD', text)
                   if unicodedata.category(char) != 'Mn')  # Strip accents
    text = text.lower()
    text = re.sub("[^ a-z'.,?!\-;:\"]", "", text)  # mdda: also keep ; : "
    text = re.sub("([a-z])\-([a-z])", r"\1 \2", text)  # mdda: 'hot-shot' -> 'hot shot'
    text = text.replace("i.e.", "that is")
    text = text.replace("e.g.", "for example")

    # tokenization
    words2 = kaldi_tokenize(text)
    tokens = pos_tag(words2)  # tuples of (word, tag)

    # steps
    prons = []
    for word, pos in tokens:
        if re.search("[a-z]", word) is None:
            pron = [word]
        elif word in self.homograph2features:  # Check homograph
            pron1, pron2, pos1 = self.homograph2features[word]
            if pos.startswith(pos1):
                pron = pron1
            else:
                pron = pron2
        elif word in self.cmu:  # lookup CMU dict
            pron = self.cmu[word][0]
        else:  # predict for oov
            pron = self.predict(word)
        prons.append((word, pron))  # mdda: keep (word, phonemes) pairs
    return prons  # mdda
def _get(self, config, section, field, default):
    try:
        if isinstance(default, bool):
            self[field] = config.getboolean(section, field)
        elif isinstance(default, int):
            self[field] = config.getint(section, field)
        else:
            self[field] = config.get(section, field)
        if field == 'password' and self[field] != '' and len(self[field]) != 64:
            # likely not a hashed password; hash the original password
            self[field] = hashlib.sha256(self[field]).hexdigest()
    except ConfigParser.Error as e:
        logging.debug(
            "Could not parse setting '%s.%s': %s. Using default value: '%s'."
            % (section, field, unicode(e), default))
        self[field] = default
    if field in ['database', 'assetdir']:
        self[field] = str(path.join(self.home, self[field]))
def create_index_page(self):
    '''
    If there is no local html containing links to files, create one.
    '''
    if os.path.isfile(self.index_page):
        print('>>> Reading cached index page')
        index_file = open(self.index_page, 'r')
        index_contents = index_file.read()
        index_file.close()
    else:
        print('>>> Downloading index page')
        # Python 2 API; on Python 3 this would be urllib.request.urlopen
        fp = urllib.urlopen(self.kgs_url)
        data = unicode(fp.read())
        fp.close()
        index_contents = data
        index_file = open(self.index_page, 'w')
        index_file.write(index_contents)
        index_file.close()
    return index_contents
def __call__(self, text, tidy=False, secret=False):
    # preprocessing
    text = unicode(text)
    text = normalize_numbers(text)
    normalizer = hazm.Normalizer()
    text = normalizer.normalize(text)

    # tokenization
    words = hazm.word_tokenize(text)

    # steps
    prons = []
    for word in words:
        if not any(letter in word for letter in self.graphemes):
            pron = [word]
        elif word in self.tihu:  # lookup tihu dict
            pron = ([self.tihu[word].replace(' ', '')]
                    if secret else [' ', self.tihu[word], ' '])
        else:  # predict for oov
            pron = self.predict(word)
        prons.extend(pron)
        prons.extend([" "])
    result = ''.join(prons[:-1])
    if tidy:
        return Persian_g2p_converter.convert_from_native_to_good(result)
    return result
def sanitize(text, kana=True, wildcards=False):
    if kana:
        checker = isJapanese
    else:
        checker = isKanji

    if wildcards:
        # map glob-style wildcards (ASCII and full-width) to SQL LIKE ones
        text = re.sub(u'[\*＊]', u'%', text)
        text = re.sub(u'[\?？]', u'_', text)
        overrides = [u'%', u'_']
    else:
        overrides = list()

    result = unicode()
    for c in text:
        if checker(c) or c in overrides:
            result += c
    return result
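# Hypothetical calls (isJapanese/isKanji are assumed helpers):
#
#     sanitize(u'漢字です*', kana=False, wildcards=True)   # -> u'漢字%'
#     sanitize(u'かな123', kana=True)                      # -> u'かな'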
def feed_scintilla(self, apifile_obj):
    """Handle scintilla api files.

    Syntax is like:

        qt.QApplication.style?4() -> QStyle
    """
    for l in apifile_obj:
        if not isPython3:
            l = builtins.unicode(l, 'utf8', 'replace')
        parts = l.split('?')
        fullsym = parts[0].rsplit('.', 1)
        klass, func = fullsym
        if len(parts) == 2:
            desc = parts[1]
        else:
            desc = ''
        # The class is qualified like qt.QApplication; keep only the
        # actual class name.
        shortclass = klass.rsplit('.', 1)[-1]
        self.feed_function(func.strip(), shortclass.strip(), '', desc.strip())
    self.dbconn.commit()
def __call__(self, text):
    # preprocessing
    text = unicode(text)
    text = normalize_numbers(text)
    text = ''.join(char for char in unicodedata.normalize('NFD', text)
                   if unicodedata.category(char) != 'Mn')  # Strip accents
    text = text.lower()
    text = re.sub("[^ a-z'.,?!\-#~\r\t_\"\']", "", text)
    text = text.replace("i.e.", "that is")
    text = text.replace("e.g.", "for example")

    # tokenization
    words = word_tokenize(text)
    tokens = pos_tag(words)  # tuples of (word, tag)

    # steps
    prons = []
    for word, pos in tokens:
        if re.search("[a-z]", word) is None:
            pron = [word]
        elif word in self.homograph2features:  # Check homograph
            pron1, pron2, pos1 = self.homograph2features[word]
            if pos.startswith(pos1):
                pron = pron1
            else:
                pron = pron2
        elif word in self.cmu:  # lookup CMU dict
            pron = self.cmu[word][0]
        else:  # predict for oov
            pron = self.predict(word)
        prons.extend(pron)
        prons.extend([" "])
        self.word_map["".join(pron)] = word
    return prons[:-1]
def u(s):
    return builtins.unicode(s)
def _str(s, encoding="UTF-8"):
    return unicode(s, encoding=encoding)
def ue(s, encoding):
    return builtins.unicode(s, encoding)
def _str(s, encoding="UTF-8"):
    s = unicode(s, encoding=encoding)
    return unichr_escape.sub(lambda x: x.group(0).decode('unicode-escape'), s)
def toUnicode(self, s):
    # pylint: disable=no-member
    return builtins.unicode(s)
def toUnicode(self, s):
    # pylint: disable=no-member
    if g.isPython3:
        return str(s)
    else:
        return builtins.unicode(s)
def toUnicode(self, s):
    if g.isPython3:
        return str(s)
    else:
        return builtins.unicode(s)
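# The wrappers above are typical Python 2/3 compat shims. A minimal,
# self-contained sketch of the usual definition pattern (not the
# surrounding projects' actual code):

import sys

isPython3 = sys.version_info >= (3,)

if isPython3:
    def to_unicode(s, encoding='utf-8'):
        # On Python 3, str is already unicode; only bytes need decoding.
        return s if isinstance(s, str) else s.decode(encoding, 'replace')
else:
    def to_unicode(s, encoding='utf-8'):
        # On Python 2, decode byte strings and pass unicode through.
        return s if isinstance(s, unicode) else unicode(s, encoding, 'replace')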