예제 #1
0
    def serializeFields(data):
        """Turn every attribute of the Mambu object into a string representation.

        If the object is an iterable one, it goes down to each of its
        elements and turns its attributes too, recursively.

        The base case is when it's a MambuStruct class (this one) so it
        just 'serializes' the attr attribute. Refer to
        MambuStruct.serializeStruct pydoc.

        :param data: any value; MambuStructs, lists and dicts are
            serialized recursively, everything else is stringified.
        :return: a unicode string, or a (possibly nested) list/dict of
            serialized values.
        """
        if isinstance(data, MambuStruct):
            return data.serializeStruct()
        # Non-iterables (ints, floats, dates, ...) are just stringified.
        try:
            it = iter(data)
        except TypeError:
            return unicode(data)
        # isinstance against the concrete iterator types replaces the
        # fragile ``type(it) == type(iter([]))`` comparison; strings and
        # tuples still fall through to the final stringify below.
        if isinstance(it, type(iter([]))):
            return [MambuStruct.serializeFields(e) for e in it]
        elif isinstance(it, type(iter({}))):
            return {k: MambuStruct.serializeFields(data[k]) for k in it}
        # elif ... tuples? sets?
        return unicode(data)
예제 #2
0
 def define_viable(self, value):
     """Parse *value* into self._viable, accepting either a plain number
     or a percentage string such as u'42%'."""
     text = unicode(value)
     try:
         # A trailing '%' means the number is expressed as a percentage.
         viable = float(text[:-1]) / 100 if text[-1] == u'%' else float(value)
     except ValueError:
         print("Invalid viable value '%s'" % value)
     else:
         self._viable = viable
예제 #3
0
 def __new__(cls, s, *args, **kwargs):
     # Build a translated string object.  If *s* is already a translated
     # ``_`` instance, unwrap it back to its original untranslated text
     # first, so text is never translated twice.
     if isinstance(s, _):
         s = unicode(s.untranslated)
     if translator:
         # A module-level translator is installed: the translated text
         # becomes the string value of the new object.
         trans = translator(s, *args, **kwargs)
         obj = super(_, cls).__new__(cls, trans, *args, **kwargs)
     else:
         # No translator configured: keep the original text as-is.
         obj = super(_, cls).__new__(cls, s, *args, **kwargs)
     # Keep the source text around for later unwrapping (see above).
     obj.untranslated = unicode(s)
     obj._additionals = []  # extra associated strings; populated elsewhere — TODO confirm
     return obj
예제 #4
0
 def __new__(cls, s, *args, **kwargs):
     # Construct a translated string.  A ``_`` instance passed back in is
     # first unwrapped to its untranslated source text, preventing
     # double translation.
     if isinstance(s, _):
         s = unicode(s.untranslated)
     if translator:
         # Translate through the module-level translator and use the
         # result as the actual string value.
         trans = translator(s, *args, **kwargs)
         obj = super(_, cls).__new__(cls, trans, *args, **kwargs)
     else:
         # Without a translator the object simply wraps the input text.
         obj = super(_, cls).__new__(cls, s, *args, **kwargs)
     # Remember the original text so it can be unwrapped later.
     obj.untranslated = unicode(s)
     obj._additionals = []  # extra associated strings; populated elsewhere — TODO confirm
     return obj
예제 #5
0
 def define_variance(self, seq, value):
     """Store the coefficient of variation for sequence *seq*."""
     seq = unicode(seq)  # ensure sequence is a string
     text = unicode(value)
     try:
         # Accept either a bare number or a percentage like u'12%'.
         cv = float(text[:-1]) / 100 if text[-1] == u'%' else float(value)
     except ValueError:
         print("Invalid cv value '%s'" % value)
     else:
         self._measure_data[seq]['variance'] = cv
예제 #6
0
def create_categories(connection):
    """Create a Category row for every entry of category_table_data and
    register it on *connection*."""
    for name in category_table_data:
        row = category_table_data[name]
        # row[0] is the category id, row[1] the optional parent id.
        kwargs = dict(id=unicode(row[0]), name=name, balance=0)
        if row[1] is not None:
            kwargs['parent'] = unicode(row[1])
        connection.add(Category(**kwargs))
예제 #7
0
 def feed_ctags(self, tagsfile_obj):
     """Read an exuberant-ctags tags file and insert one row per tag
     into the ``function`` table.

     :param tagsfile_obj: iterable of tag lines (e.g. an open tags file).
     """
     for line in tagsfile_obj:
         if not isPython3:
             # Python 2: decode raw bytes to unicode.
             line = builtins.unicode(line, 'utf8', 'replace')
         if line.startswith('!'):
             # '!_TAG_...' metadata header lines carry no symbols.
             continue
         fields = line.split('\t')
         name = fields[0]
         fil = fields[1]
         pat = fields[2]
         # fields[3] would be the tag kind; unused here.
         # BUG FIX: ``idd`` used to be assigned only inside the 'class:'
         # branch or the IndexError handler, so a tag whose extension
         # field existed but was not 'class:...' reused the previous
         # iteration's id (or raised NameError on the first line).
         idd = 0  # class id 0 = function
         try:
             ext = fields[4]
             if ext and ext.startswith('class:'):
                 klass = ext.split(':', 1)[1].strip()
                 idd = self.class_id(klass)
         except IndexError:
             ext = None
         c = self.cursor()
         fid = self.file_id(fil)
         c.execute('insert into function(class, name, searchpattern, file) values (?, ?, ?, ?)',
                   [idd, name, pat, fid])
     self.dbconn.commit()
예제 #8
0
    def __call__(self, text):
        """Convert Persian *text* to a list of phonemes, with a single
        space token between each word's phonemes."""
        # preprocessing
        text = unicode(text)
        text = normalize_numbers(text)
        # text = ''.join(char for char in unicodedata.normalize('NFD', text)
        #                if unicodedata.category(char) != 'Mn')  # Strip accents
        # text = re.sub("[^ a-z'.,?!\-]", "", text)

        normalizer = hazm.Normalizer()
        text = normalizer.normalize(text)
        # tokenization
        words = hazm.word_tokenize(text)
        # tokens = pos_tag(words)  # tuples of (word, tag)

        # steps
        prons = []
        for word in words:
            # Words containing none of our graphemes (digits,
            # punctuation, latin text, ...) pass through unchanged.
            if not any(letter in word for letter in self.graphemes):
                pron = [word]

            # elif word in self.homograph2features:  # Check homograph
            #     pron1, pron2, pos1 = self.homograph2features[word]
            #     if pos.startswith(pos1):
            #         pron = pron1
            #     else:
            #         pron = pron2
            elif word in self.tihu:  # lookup tihu dict
                pron = self.tihu[word]
            else: # predict for oov
                pron = self.predict(word)

            prons.extend(pron)
            prons.extend([" "])

        # Drop the trailing space token.
        return prons[:-1]
예제 #9
0
 def __valid_ip(self, value):
     """Return *value* if it parses as a globally-routable IP address,
     otherwise None."""
     try:
         # ip_address() raises ValueError for malformed addresses and
         # unicode() can raise TypeError for non-stringlike input.  The
         # original bare ``except:`` also swallowed SystemExit and
         # KeyboardInterrupt, which is never what we want.
         if not ipaddress.ip_address(unicode(value)).is_global:
             return None
     except (ValueError, TypeError):
         return None
     return value
예제 #10
0
 def feed_ctags(self, tagsfile_obj):
     """Parse an exuberant-ctags tags file and store each tag as a row
     of the ``function`` table.

     :param tagsfile_obj: iterable of tag lines (e.g. an open tags file).
     """
     for line in tagsfile_obj:
         if not isPython3:
             # Python 2: decode raw bytes to unicode.
             line = builtins.unicode(line, 'utf8', 'replace')
         if line.startswith('!'):
             # '!_TAG_...' metadata header lines carry no symbols.
             continue
         fields = line.split('\t')
         name = fields[0]
         fil = fields[1]
         pat = fields[2]
         # fields[3] would be the tag kind; unused here.
         # BUG FIX: ``idd`` was previously assigned only inside the
         # 'class:' branch or the IndexError handler, so a tag whose
         # extension field existed but was not 'class:...' reused the
         # previous iteration's id (or raised NameError on line one).
         idd = 0  # class id 0 = function
         try:
             ext = fields[4]
             if ext and ext.startswith('class:'):
                 klass = ext.split(':', 1)[1].strip()
                 idd = self.class_id(klass)
         except IndexError:
             ext = None
         c = self.cursor()
         fid = self.file_id(fil)
         c.execute('insert into function(class, name, searchpattern, file) values (?, ?, ?, ?)',
                   [idd, name, pat, fid])
     self.dbconn.commit()
예제 #11
0
File: g2p.py  Project: Koomook/g2p
def predict(words, sess):
    '''
    Returns predicted pronunciation of `words` which do NOT exist in the dictionary.
    :param words: A list of words.
    :param sess: An open TensorFlow session bound to the prediction graph.
    :return: pron: A list of phonemes
    '''
    # Handle at most hp.batch_size words at once; the remainder goes
    # through a recursive call and is appended to the result at the end.
    if len(words) > hp.batch_size:
        after = predict(words[hp.batch_size:], sess)
        words = words[:hp.batch_size]
    else:
        after = []
    # Encode graphemes to integer ids, truncating to hp.maxlen.
    x = np.zeros((len(words), hp.maxlen), np.int32)  # 0: <PAD>
    for i, w in enumerate(words):
        # "E" presumably marks end-of-word — TODO confirm against training data.
        for j, g in enumerate((w + "E")[:hp.maxlen]):
            x[i][j] = g2idx.get(g, 2)  # 2:<UNK>

    ## Autoregressive inference: fill one time-step per pass, feeding the
    ## predictions so far back in as the decoder input y.
    preds = np.zeros((len(x), hp.maxlen), np.int32)
    for j in range(hp.maxlen):
        _preds = sess.run(graph.preds, {graph.x: x, graph.y: preds})
        preds[:, j] = _preds[:, j]

    # convert to string
    pron = []
    for i in range(len(preds)):
        p = [u"%s" % unicode(idx2p[idx])
             for idx in preds[i]]  # Make p into unicode.
        # Truncate at the end-of-sequence token, if one was produced.
        if "<EOS>" in p:
            eos = p.index("<EOS>")
            p = p[:eos]
        pron.append(p)

    return pron + after
예제 #12
0
File: vocabs.py  Project: stjordanis/NeMo
def _text_preprocessing(text):
    """Strip accents (combining marks) from *text*, lowercase it, and
    remove every character outside a small letter/punctuation whitelist."""
    decomposed = unicodedata.normalize('NFD', unicode(text))
    stripped = ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')
    return re.sub("[^ a-z'\".,?!()\[\]:;\-]", "", stripped.lower())
예제 #13
0
 def define_deviation(self, seq, value):
     """Record the deviation for sequence *seq*."""
     seq = unicode(seq)  # ensure sequence is a string
     try:
         deviation = float(value)
     except ValueError:
         print("Invalid deviation value")
     else:
         self._measure_data[seq]['deviation'] = deviation
예제 #14
0
def english_text_preprocessing(text, lower=True):
    """Normalize English text: strip accents (combining marks), map
    synoglyphs to their ASCII equivalents, optionally lowercase."""
    cleaned = []
    for char in unicodedata.normalize('NFD', unicode(text)):
        if unicodedata.category(char) == 'Mn':
            continue  # drop combining marks (accents)
        cleaned.append(SYNOGLYPH2ASCII.get(char, char))
    text = ''.join(cleaned)
    return text.lower() if lower else text
예제 #15
0
def JSONResponse(obj, start_response):
    """
    JSONResponse - serialize dicts/lists to JSON text and wrap the
    result in an HTTP 200 response.
    """
    if isstring(obj):
        body = obj  # already text: pass through untouched
    elif isinstance(obj, (dict, list)):
        body = unicode(json.dumps(obj))  # structured data -> JSON text
    else:
        body = obj  # anything else: hand it on unchanged
    return httpResponse(body, "200 OK", start_response)
예제 #16
0
 def define_measurement(self, seq, ptype, value):
     """Record a measurement of type *ptype* with average *value* for
     sequence *seq*."""
     seq = unicode(seq)  # ensure sequence is a string
     try:
         average = float(value)
     except ValueError:
         print("Invalid average value")
     else:
         self._measure_data[seq].update({'ptype': ptype, 'value': average})
예제 #17
0
def create_accounts(connection):
    # Create an Account row for every entry of acount_table_data and
    # register it on the given connection/session.
    for linea in acount_table_data:
        # Row layout (by position): 0 = account id, 1 = parent id or
        # None, 2 = account-type key, 3 = currency key.
        id_acc_type = unicode(Acounttype().get_one(
            acount_table_data[linea][2]).id)
        id_currency = unicode(Currency().get_one(
            acount_table_data[linea][3]).id)
        if acount_table_data[linea][1] is not None:
            # Child account: link it to its parent.
            ac = Account(id=unicode(acount_table_data[linea][0]),
                         name=linea,
                         parent=unicode(acount_table_data[linea][1]),
                         id_account_type=id_acc_type,
                         id_currency=id_currency,
                         balance=0)
        else:
            # Top-level account: no parent reference.
            ac = Account(id=unicode(acount_table_data[linea][0]),
                         name=linea,
                         id_account_type=id_acc_type,
                         id_currency=id_currency,
                         balance=0)
        connection.add(ac)
예제 #18
0
def process_django_model(app, what, name, obj, options, lines):
    """Sphinx 'autodoc-process-docstring' hook: extend a Django model's
    docstring with ``:param:`` and ``:type:`` entries for each field.

    Mutates and returns *lines*, the docstring line list Sphinx passes in.
    """
    # This causes import errors if left outside the function
    from django.db import models

    # Only look at objects that inherit from Django's base model class
    if inspect.isclass(obj) and issubclass(obj, models.Model):
        # Grab the field list from the meta class
        fields = obj._meta.fields

        for field in fields:
            # Decode and strip any html out of the field's help text
            help_text = strip_tags(unicode(field.help_text))

            # Decode and capitalize the verbose name, for use if there isn't
            # any help text
            verbose_name = unicode(field.verbose_name).capitalize()

            if help_text:
                # Add the model field to the end of the docstring as a param
                # using the help text as the description
                lines.append(":param {}: {}".format(field.attname, help_text))
            else:
                # Add the model field to the end of the docstring as a param
                # using the verbose name as the description
                lines.append(":param {}: {}".format(field.attname,
                                                    verbose_name))

            # Add the field's type to the docstring; relations also link
            # to the related model's class.
            if isinstance(field, (models.ForeignKey, models.OneToOneField,
                                  models.ManyToManyField)):
                lines.append(":type %s: %s to :class:`%s.%s`" % (
                    field.attname,
                    type(field).__name__,
                    field.related_model.__module__,
                    field.related_model.__name__,
                ))
            else:
                lines.append(":type {}: {}".format(field.attname,
                                                   type(field).__name__))
    # Return the extended docstring
    return lines
예제 #19
0
def SQL_EXEC(sql, args):
    """
    SQL_EXEC - run a query or a file.sql; returns the response as a JSON
    string, or 0 if execution failed.
    """
    try:
        # Turn 'k=v k=v ...' style args into an environment mapping.
        env = mapify(args, sep=' ', kvsep='=', strip_char=' ', glue='"')
        response = SqliteDB.ExecuteP(sql, env, outputmode='response', verbose=False)
        return unicode(json.dumps(response))
    except Exception as ex:
        manage(ex)
        return 0
예제 #20
0
 def loadCss(self):
     """Read the plugin's CSS file and return its contents as unicode.

     Logs the error and returns an empty unicode string if the file
     cannot be read.
     """
     try:
         # 'with' guarantees the handle is closed even if read() raises;
         # the original leaked the file object on error.
         with open(self.cssFileInPlugin, 'r') as f:
             css = unicode(f.read())
     except Exception as e:
         log(e)
         css = u''
     return css
예제 #21
0
def normalize(sentence):
    """ Normalize English text.

    Spells out numbers, strips accents, lowercases, removes characters
    outside a small whitelist and expands 'i.e.' / 'e.g.'.
    """
    text = normalize_numbers(unicode(sentence))
    decomposed = unicodedata.normalize('NFD', text)
    text = ''.join(c for c in decomposed
                   if unicodedata.category(c) != 'Mn')  # Strip accents
    text = re.sub(r"[^ a-z'.,?!\-]", "", text.lower())
    return text.replace("i.e.", "that is").replace("e.g.", "for example")
예제 #22
0
    def postprocess(self):
        """Postprocessing.

        Just in case some elements on the addresses was converted to
        anything but string, it gets converted back to only string
        (unicode). Things on addresses are not useful but by what they
        say, not what they are.

.. todo:: do the same thing to the 'address' field created on
          preprocessing.
        """
        try:
            for name, item in self['addresses'][0].items():
                try:
                    # 'indexInList' is bookkeeping, not address data.
                    if name == "indexInList": continue
                    self['addresses'][0][name] = unicode(self['addresses'][0][name])
                    self['address'][name] = unicode(self['address'][name])
                except AttributeError:
                    # Value with no sensible unicode conversion: leave it.
                    pass
        except (KeyError, IndexError):
            # No addresses present at all: nothing to normalize.
            pass

        super(MambuClient,self).postprocess()
예제 #23
0
File: main.py  Project: tedder/rssfilter
def stringify(blob):
  """Flatten *blob* (arbitrarily nested lists/dicts/strings/bytes) into
  a single unicode string.

  :param blob: list, dict, str, bytes or unicode value (nested freely).
  :return: the concatenation of every leaf value, stringified.
  :raises Exception: for any unsupported leaf type.
  """
  if not blob:
    return '' # we were passed nothing, so return nothing
  # Collect pieces and join once at the end instead of the original
  # quadratic ``retstr +=`` loop.
  pieces = []
  if isinstance(blob, list):
    pieces = [stringify(item) for item in blob]
  elif isinstance(blob, dict):
    for key, val in blob.items():
      pieces.append(stringify(unicode(key)))
      pieces.append(stringify(unicode(val)))
  elif isinstance(blob, (str, bytes)):
    # str and bytes were two identical branches originally: both decode
    # through unicode().
    pieces.append(unicode(blob))
  elif isinstance(blob, unicode):
    pieces.append(blob)
  else:
    raise Exception("unknown type: %s" % str(type(blob)))
  return ''.join(pieces)
예제 #24
0
def process_django_model(app, what, name, obj, options, lines):
    """Sphinx 'autodoc-process-docstring' hook: append ``:param:`` and
    ``:type:`` entries for every field of a Django model.

    NOTE(review): relies on ``field.rel.to``, which only exists on older
    Django versions (``rel`` was removed in Django 2.0) — confirm the
    project's Django version before reusing.
    """
    # This causes import errors if left outside the function
    from django.db import models

    # Only look at objects that inherit from Django's base model class
    if inspect.isclass(obj) and issubclass(obj, models.Model):
        # Grab the field list from the meta class
        fields = obj._meta.fields

        for field in fields:
            # Decode and strip any html out of the field's help text
            help_text = strip_tags(unicode(field.help_text))

            # Decode and capitalize the verbose name, for use if there isn't
            # any help text
            verbose_name = unicode(field.verbose_name).capitalize()

            if help_text:
                # Add the model field to the end of the docstring as a param
                # using the help text as the description
                lines.append(u':param %s: %s' % (field.attname, help_text))
            else:
                # Add the model field to the end of the docstring as a param
                # using the verbose name as the description
                lines.append(u':param %s: %s' % (field.attname, verbose_name))

            # Add the field's type to the docstring
            if isinstance(field, (models.ForeignKey, models.OneToOneField, models.ManyToManyField)):
                lines.append(u':type %s: %s to :class:`%s.%s`' % (field.attname,
                                                                  type(field).__name__,
                                                                  field.rel.to.__module__,
                                                                  field.rel.to.__name__))
            else:
                lines.append(u':type %s: %s' % (field.attname, type(field).__name__))
    # Return the extended docstring
    return lines
예제 #25
0
    def __call__(self, text):
        """Convert English *text* to a single string: each word's
        phonemes concatenated together, words separated by spaces."""
        # preprocessing
        text = unicode(text)
        text = normalize_numbers(text)
        text = ''.join(char for char in unicodedata.normalize('NFD', text)
                       if unicodedata.category(char) != 'Mn')  # Strip accents
        text = text.lower()
        text = re.sub("[^ a-z'.,?!\-]", "", text)
        text = text.replace("i.e.", "that is")
        text = text.replace("e.g.", "for example")

        # tokenization
        words = word_tokenize(text)
        tokens = pos_tag(words)  # tuples of (word, tag)

        # steps
        #prons = []
        # YJS added
        prons2 = ""
        for word, pos in tokens:
            # Tokens without any letters (punctuation etc.) pass through.
            if re.search("[a-z]", word) is None:
                pron = [word]

            elif word in self.homograph2features:  # Check homograph
                pron1, pron2, pos1 = self.homograph2features[word]
                if pos.startswith(pos1):
                    pron = pron1
                else:
                    pron = pron2
            elif word in self.cmu:  # lookup CMU dict
                pron = self.cmu[word][0]
            else:  # predict for oov
                pron = self.predict(word)

            #prons.extend(pron)
            #prons.extend([" "])

            #YJS CHANGED
            # Concatenate the word's phonemes with no separator, then add
            # one space between words.
            new_word = ''
            for i in np.arange(np.size(pron)):
                new_word += pron[i]

            prons2 += new_word
            prons2 += ' '


#        return prons[:-1_old_2]
        return prons2
예제 #26
0
File: g2p.py  Project: PyThaiNLP/g2p-old
def g2p(text):
    '''
    Returns the pronunciation of text.
    :param text: A string. A sequence of words.
    :return: A list of phonemes.
    '''
    # normalization
    text = unicode(text)
    #text = normalize_numbers(text)
    text = ''.join(char for char in unicodedata.normalize('NFD', text)
                   if unicodedata.category(char) != 'Mn')  # Strip accents
    text = text.lower()
    #text = re.sub("[^ a-z'.,?!\-]", "", text)
    #text = text.replace("i.e.", "that is")
    #text = text.replace("e.g.", "for example")

    # tokenization
    words = tokenize(text)
    tokens = (words)
    #tokens = pos_tag(words) # tuples of (word, tag)

    # g2p
    # oovs collects out-of-vocabulary tokens; u_loc remembers where in
    # ``ret`` each OOV's phonemes must later be spliced in.
    oovs, u_loc = [], []
    ret = []
    for token in tokens:
        pron = token2pron(token)  # list of phonemes
        if pron == []:  # oov
            oovs.append(token[0])
            u_loc.append(len(ret))
        ret.extend(pron)
        ret.extend([" "])

    if len(oovs) > 0:
        # Predict OOV pronunciations with the neural model, reusing the
        # global session when available to avoid re-loading the graph.
        global g_sess
        if g_sess is not None:  # check global session
            prons = predict(oovs, g_sess)
            # Splice back-to-front so earlier insert positions stay valid.
            for i in range(len(oovs) - 1, -1, -1):
                ret = ret[:u_loc[i]] + prons[i] + ret[u_loc[i]:]
        else:  # If global session is not defined, make new one as local.
            with tf.Session(graph=g, config=config) as sess:
                saver.restore(
                    sess,
                    tf.train.latest_checkpoint(os.path.join(
                        dirname, hp.logdir)))
                prons = predict(oovs, sess)
                for i in range(len(oovs) - 1, -1, -1):
                    ret = ret[:u_loc[i]] + prons[i] + ret[u_loc[i]:]
    # Drop the trailing space token.
    return ret[:-1]
예제 #27
0
def JSONResponse(obj, start_response):
    """
    JSONResponse - serialize *obj* to JSON text and emit it as a single
    utf-8 encoded HTTP 200 response body (WSGI style).
    """
    if isstring(obj):
        text = obj
    elif isinstance(obj, (dict, list)):
        text = unicode(json.dumps(obj))
    else:
        text = obj

    body = text.encode('utf-8')
    # Content-Length must be the BYTE length of the encoded body; the
    # original used len(text) — a character count — which is wrong for
    # any payload containing non-ASCII characters.
    response_headers = [('Content-type', 'application/json'),
                        ('Content-Length', str(len(body)))]
    if start_response:
        start_response("200 OK", response_headers)
    return [body]
예제 #28
0
    def __call__(self, text):
        """Convert English *text* to a list of (word, phoneme-list)
        pairs (mdda variant of the g2p caller)."""
        # preprocessing
        text = unicode(text)
        text = normalize_numbers(text)
        text = ''.join(char for char in unicodedata.normalize('NFD', text)
                       if unicodedata.category(char) != 'Mn')  # Strip accents
        text = text.lower()
        #text = re.sub("[^ a-z'.,?!\-]", "", text)
        text = re.sub("[^ a-z'.,?!\-;:\"]", "", text)  # mdda
        #text = re.sub("([a-z])\-([a-z])", r"\1 - \2", text)   # mdda 'hot-shot' -> 'hot - shot'
        text = re.sub("([a-z])\-([a-z])", r"\1 \2",
                      text)  # mdda    'hot-shot' -> 'hot shot'
        text = text.replace("i.e.", "that is")
        text = text.replace("e.g.", "for example")

        # tokenization
        #words1 = word_tokenize(text)
        #print( words1 )
        words2 = kaldi_tokenize(text)
        #print( words2 )

        tokens = pos_tag(words2)  # tuples of (word, tag)

        # steps
        prons = []
        for word, pos in tokens:
            # Tokens without any letters (punctuation etc.) pass through.
            if re.search("[a-z]", word) is None:
                pron = [word]

            elif word in self.homograph2features:  # Check homograph
                pron1, pron2, pos1 = self.homograph2features[word]
                if pos.startswith(pos1):
                    pron = pron1
                else:
                    pron = pron2
            elif word in self.cmu:  # lookup CMU dict
                pron = self.cmu[word][0]
            else:  # predict for oov
                pron = self.predict(word)

            #prons.extend(pron)  #mdda
            #prons.extend([" "]) #mdda
            prons.append((word, pron))  #mdda

        #return prons[:-1]   #mdda
        return prons  #mdda
예제 #29
0
 def _get(self, config, section, field, default):
     # Read ``section.field`` from *config* into self[field], coercing to
     # the type of *default*, and fall back to *default* on parse errors.
     try:
         # bool must be tested before int: bool is a subclass of int.
         if isinstance(default, bool):
             self[field] = config.getboolean(section, field)
         elif isinstance(default, int):
             self[field] = config.getint(section, field)
         else:
             self[field] = config.get(section, field)
             if field == 'password' and self[field] != '' and len(
                     self[field]) != 64:  # likely not a hashed password.
                 self[field] = hashlib.sha256(
                     self[field]).hexdigest()  # hash the original password.
     except ConfigParser.Error as e:
         logging.debug(
             "Could not parse setting '%s.%s': %s. Using default value: '%s'."
             % (section, field, unicode(e), default))
         self[field] = default
     # Path-like settings are resolved relative to the home directory.
     if field in ['database', 'assetdir']:
         self[field] = str(path.join(self.home, self[field]))
예제 #30
0
 def create_index_page(self):
     '''
     If there is no local html containing links to files, create one.

     Returns the index page contents, reading the cached copy when it
     exists and downloading (and caching) it otherwise.
     '''
     if os.path.isfile(self.index_page):
         print('>>> Reading cached index page')
         # 'with' closes the handle even if read() raises; the original
         # leaked it on error.
         with open(self.index_page, 'r') as index_file:
             index_contents = index_file.read()
     else:
         print('>>> Downloading index page')
         fp = urllib.urlopen(self.kgs_url)
         try:
             index_contents = unicode(fp.read())
         finally:
             fp.close()
         # Cache the freshly downloaded page for next time.
         with open(self.index_page, 'w') as index_file:
             index_file.write(index_contents)
     return index_contents
예제 #31
0
    def __call__(self, text, tidy=False, secret=False):
        """Convert Persian *text* to a phoneme string.

        :param text: input text (normalized with hazm before conversion).
        :param tidy: when True, post-process the result through
            Persian_g2p_converter.convert_from_native_to_good.
        :param secret: changes how tihu-dictionary hits are spaced
            (compact vs. space-padded) — see the tihu branch below.
        """
        # preprocessing
        text = unicode(text)
        text = normalize_numbers(text)
        # text = ''.join(char for char in unicodedata.normalize('NFD', text)
        #                if unicodedata.category(char) != 'Mn')  # Strip accents
        # text = re.sub("[^ a-z'.,?!\-]", "", text)

        normalizer = hazm.Normalizer()
        text = normalizer.normalize(text)
        # tokenization
        words = hazm.word_tokenize(text)
        # tokens = pos_tag(words)  # tuples of (word, tag)

        # steps
        prons = []
        for word in words:

            # Words containing none of our graphemes (digits,
            # punctuation, latin text, ...) pass through unchanged.
            if not any(letter in word for letter in self.graphemes):
                pron = [word]

            # elif word in self.homograph2features:  # Check homograph
            #     pron1, pron2, pos1 = self.homograph2features[word]
            #     if pos.startswith(pos1):
            #         pron = pron1
            #     else:
            #         pron = pron2
            elif word in self.tihu:  # lookup tihu dict
                pron = [self.tihu[word].replace(' ', '')
                        ] if secret else [' ', self.tihu[word], ' ']
            else:  # predict for oov
                pron = self.predict(word)

            prons.extend(pron)
            prons.extend([" "])

        # Join everything, dropping the trailing space token.
        result = ''.join(prons[:-1])

        if tidy:
            return Persian_g2p_converter.convert_from_native_to_good(result)

        return result
예제 #32
0
def sanitize(text, kana=True, wildcards=False):
    """Filter *text*, keeping only Japanese characters (or only kanji
    when *kana* is False).

    :param text: unicode text to filter.
    :param kana: keep all Japanese characters when True, kanji only when
        False.
    :param wildcards: when True, convert glob wildcards to SQL ones
        ('*' -> '%', '?' -> '_') and let them pass through the filter.
    :return: the filtered unicode string.
    """
    checker = isJapanese if kana else isKanji

    if wildcards:
        text = re.sub(u'[\**]', u'%', text)
        text = re.sub(u'[\??]', u'_', text)
        overrides = [u'%', u'_']
    else:
        overrides = []

    # join() one pass instead of the original quadratic ``result +=`` loop.
    return u''.join(c for c in text if checker(c) or c in overrides)
예제 #33
0
    def feed_scintilla(self, apifile_obj):
        """ handle scintilla api files

        Syntax is like:

        qt.QApplication.style?4() -> QStyle

        Each line splits into a fully-qualified symbol and an optional
        description (after '?'); the symbol's last dotted component is
        the function, the one before it the class, both handed to
        feed_function().
        """
        for l in apifile_obj:
            if not isPython3:
                # Python 2: decode raw bytes to unicode.
                l = builtins.unicode(l, 'utf8', 'replace')
            parts = l.split('?')
            fullsym = parts[0].rsplit('.', 1)
            klass, func = fullsym
            if len(parts) == 2:
                desc = parts[1]
            else:
                desc = ''
            # now our class is like qt.QApplication. We do the dirty trick and
            # remove all but actual class name
            shortclass = klass.rsplit('.', 1)[-1]
            self.feed_function(func.strip(), shortclass.strip(), '', desc.strip())
        self.dbconn.commit()
예제 #34
0
    def feed_scintilla(self, apifile_obj):
        """ handle scintilla api files

        Syntax is like:

        qt.QApplication.style?4() -> QStyle

        Lines are split into a fully-qualified symbol plus optional
        description (after '?'); the trailing dotted components become
        the class and function passed to feed_function().
        """
        for l in apifile_obj:
            if not isPython3:
                # Python 2: decode raw bytes to unicode.
                l = builtins.unicode(l, 'utf8', 'replace')
            parts = l.split('?')
            fullsym = parts[0].rsplit('.', 1)
            klass, func = fullsym
            if len(parts) == 2:
                desc = parts[1]
            else:
                desc = ''
            # now our class is like qt.QApplication. We do the dirty trick and
            # remove all but actual class name
            shortclass = klass.rsplit('.', 1)[-1]
            self.feed_function(func.strip(), shortclass.strip(), '', desc.strip())
        self.dbconn.commit()
예제 #35
0
    def __call__(self, text):
        """Convert English *text* to a list of phonemes (space tokens
        between words), also recording phonemes -> word in self.word_map."""
        # preprocessing
        text = unicode(text)
        text = normalize_numbers(text)
        text = ''.join(char for char in unicodedata.normalize('NFD', text)
                       if unicodedata.category(char) != 'Mn')  # Strip accents
        text = text.lower()
        text = re.sub("[^ a-z'.,?!\-#~\r\t_\"\']", "", text)
        text = text.replace("i.e.", "that is")
        text = text.replace("e.g.", "for example")

        # tokenization
        words = word_tokenize(text)
        tokens = pos_tag(words)  # tuples of (word, tag)

        # steps
        prons = []
        for word, pos in tokens:
            # Tokens without any letters (punctuation etc.) pass through.
            if re.search("[a-z]", word) is None:
                pron = [word]

            elif word in self.homograph2features:  # Check homograph
                pron1, pron2, pos1 = self.homograph2features[word]
                if pos.startswith(pos1):
                    pron = pron1
                else:
                    pron = pron2
            elif word in self.cmu:  # lookup CMU dict
                pron = self.cmu[word][0]
            else: # predict for oov
                pron = self.predict(word)

            prons.extend(pron)
            prons.extend([" "])
            # Remember which word produced this phoneme sequence.
            self.word_map["".join(pron)] = word
        # Drop the trailing space token.
        return prons[:-1]
예제 #36
0
 def u(s):
     """Coerce *s* to unicode (Python 2 compatibility helper)."""
     converted = builtins.unicode(s)
     return converted
예제 #37
0
 def _str(s, encoding="UTF-8"):
     """Decode the byte string *s* to unicode using *encoding*."""
     decoded = unicode(s, encoding=encoding)
     return decoded
예제 #38
0
 def ue(s, encoding):
     """Decode the byte string *s* to unicode with the given *encoding*."""
     decoded = builtins.unicode(s, encoding)
     return decoded
예제 #39
0
 def _str(s, encoding="UTF-8"):
     # Decode *s* to unicode, then expand any escape sequences matched by
     # the module-level ``unichr_escape`` regex into real characters via
     # the Python 2 'unicode-escape' codec.
     s = unicode(s, encoding=encoding)
     return unichr_escape.sub(lambda x:
                                  x.group(0).decode('unicode-escape'),
                              s)
예제 #40
0
 def _str(s, encoding="UTF-8"):
     """Return *s* decoded to unicode using *encoding*."""
     text = unicode(s, encoding=encoding)
     return text
예제 #41
0
 def toUnicode(self, s):
     """Return *s* converted to a unicode string (Python 2 only)."""
     # pylint: disable=no-member
     converted = builtins.unicode(s)
     return converted
예제 #42
0
 def toUnicode(self, s):
     """Convert *s* to the text type appropriate for this Python version."""
     # pylint: disable=no-member
     return str(s) if g.isPython3 else builtins.unicode(s)
예제 #43
0
 def toUnicode(self, s):
     """Convert *s* to the native text type for this Python version."""
     if not g.isPython3:
         return builtins.unicode(s)
     return str(s)